Adding upstream version 14.2.21.upstream/14.2.21 upstream

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 18:24:20 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-27 18:24:20 +0000
commit: 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree: e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/msg
parent: Initial commit. (diff)
download: ceph-upstream.tar.xz
ceph-upstream.zip
112 files changed, 41000 insertions, 0 deletions
diff --git a/src/msg/CMakeLists.txt b/src/msg/CMakeLists.txt
new file mode 100644
index 00000000..1ad34615
--- /dev/null
+++ b/src/msg/CMakeLists.txt
@@ -0,0 +1,76 @@
+set(msg_srcs
+  DispatchQueue.cc
+  Message.cc
+  Messenger.cc
+  QueueStrategy.cc
+  msg_types.cc
+  simple/Accepter.cc
+  simple/Pipe.cc
+  simple/PipeConnection.cc
+  simple/SimpleMessenger.cc)
+
+if(HAVE_XIO)
+  list(APPEND msg_srcs
+    xio/XioConnection.cc
+    xio/XioMsg.cc
+    xio/XioPool.cc
+    xio/XioMessenger.cc
+    xio/XioPortal.cc)
+endif(HAVE_XIO)
+
+list(APPEND msg_srcs
+  async/AsyncConnection.cc
+  async/AsyncMessenger.cc
+  async/Protocol.cc
+  async/ProtocolV1.cc
+  async/ProtocolV2.cc
+  async/Event.cc
+  async/EventSelect.cc
+  async/PosixStack.cc
+  async/Stack.cc
+  async/crypto_onwire.cc
+  async/frames_v2.cc
+  async/net_handler.cc)
+
+if(LINUX)
+  list(APPEND msg_srcs
+    async/EventEpoll.cc)
+elseif(FREEBSD OR APPLE)
+  list(APPEND msg_srcs
+    async/EventKqueue.cc)
+endif(LINUX)
+
+if(HAVE_RDMA)
+  list(APPEND msg_srcs
+    async/rdma/Infiniband.cc
+    async/rdma/RDMAConnectedSocketImpl.cc
+    async/rdma/RDMAIWARPConnectedSocketImpl.cc
+    async/rdma/RDMAServerSocketImpl.cc
+    async/rdma/RDMAIWARPServerSocketImpl.cc
+    async/rdma/RDMAStack.cc)
+endif()
+
+add_library(common-msg-objs OBJECT ${msg_srcs})
+
+if(WITH_DPDK)
+  set(async_dpdk_srcs
+    async/dpdk/ARP.cc
+    async/dpdk/DPDK.cc
+    async/dpdk/dpdk_rte.cc
+    async/dpdk/DPDKStack.cc
+    async/dpdk/EventDPDK.cc
+    async/dpdk/IP.cc
+    async/dpdk/net.cc
+    async/dpdk/IPChecksum.cc
+    async/dpdk/Packet.cc
+    async/dpdk/TCP.cc
+    async/dpdk/UserspaceEvent.cc
+    async/dpdk/ethernet.cc)
+  add_library(common_async_dpdk STATIC
+    ${async_dpdk_srcs})
+  target_link_libraries(common_async_dpdk PRIVATE
+    dpdk::dpdk)
+  # Stack.cc includes DPDKStack.h, which includes rte_config.h indirectly
+  target_include_directories(common-msg-objs PRIVATE
+    $<TARGET_PROPERTY:dpdk::dpdk,INTERFACE_INCLUDE_DIRECTORIES>)
+endif(WITH_DPDK)
diff --git a/src/msg/Connection.h b/src/msg/Connection.h
new file mode 100644
index 00000000..4eea5ff0
--- /dev/null
+++ b/src/msg/Connection.h
@@ -0,0 +1,253 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CONNECTION_H
+#define CEPH_CONNECTION_H
+
+#include <stdlib.h>
+#include <ostream>
+
+#include <boost/intrusive_ptr.hpp>
+
+#include "auth/Auth.h"
+#include "common/RefCountedObj.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "common/Mutex.h"
+#include "include/ceph_assert.h" // Because intrusive_ptr clobbers our assert...
+#include "include/buffer.h"
+#include "include/types.h"
+#include "common/item_history.h"
+#include "msg/MessageRef.h"
+
+
+// ======================================================
+
+// abstract Connection, for keeping per-connection state
+
+class Messenger;
+
+#ifdef UNIT_TESTS_BUILT
+class Interceptor;
+#endif
+
+struct Connection : public RefCountedObject {
+  mutable Mutex lock;
+  Messenger *msgr;
+  RefCountedPtr priv;
+  int peer_type;
+  int64_t peer_id = -1;  // [msgr2 only] the 0 of osd.0, the 4567 of client.4567
+  safe_item_history<entity_addrvec_t> peer_addrs;
+  utime_t last_keepalive, last_keepalive_ack;
+private:
+  uint64_t features;
+public:
+  bool failed; // true if we are a lossy connection that has failed.
+
+  int rx_buffers_version;
+  map<ceph_tid_t,pair<bufferlist,int> > rx_buffers;
+
+  // authentication state
+  // FIXME make these private after ms_handle_authorizer is removed
+public:
+  AuthCapsInfo peer_caps_info;
+  EntityName peer_name;
+  uint64_t peer_global_id = 0;
+
+#ifdef UNIT_TESTS_BUILT
+  Interceptor *interceptor;
+#endif
+
+  friend class boost::intrusive_ptr<Connection>;
+  friend class PipeConnection;
+
+public:
+  Connection(CephContext *cct, Messenger *m)
+    // we are managed exclusively by ConnectionRef; make it so you can
+    //   ConnectionRef foo = new Connection;
+    : RefCountedObject(cct, 0),
+      lock("Connection::lock"),
+      msgr(m),
+      peer_type(-1),
+      features(0),
+      failed(false),
+      rx_buffers_version(0) {
+  }
+
+  ~Connection() override {
+    //generic_dout(0) << "~Connection " << this << dendl;
+  }
+
+  void set_priv(const RefCountedPtr& o) {
+    Mutex::Locker l(lock);
+    priv = o;
+  }
+
+  RefCountedPtr get_priv() {
+    Mutex::Locker l(lock);
+    return priv;
+  }
+
+  /**
+   * Used to judge whether this connection is ready to send. Usually, the
+   * implementation needs to complete its own handshake or session setup
+   * before it is ready to send.
+   *
+   * @return true if ready to send, or false otherwise
+   */
+  virtual bool is_connected() = 0;
+
+  Messenger *get_messenger() {
+    return msgr;
+  }
+
+  /**
+   * Queue the given Message to send out on the given Connection.
+   * Success in this function does not guarantee Message delivery, only
+   * success in queueing the Message. Other guarantees may be provided based
+   * on the Connection policy.
+   *
+   * @param m The Message to send. The Messenger consumes a single reference
+   * when you pass it in.
+   *
+   * @return 0 on success, or -errno on failure.
+   */
+  virtual int send_message(Message *m) = 0;
+
+  virtual int send_message2(MessageRef m)
+  {
+    return send_message(m.detach()); /* send_message(Message *m) consumes a reference */
+  }
+
+  /**
+   * Send a "keepalive" ping along the given Connection, if it's working.
+   * If the underlying connection has broken, this function does nothing.
+   *
+   * This function returns nothing, so callers get no success/failure result.
+   */
+  virtual void send_keepalive() = 0;
+  /**
+   * Mark down the given Connection.
+   *
+   * This will cause us to discard its outgoing queue, and if reset
+   * detection is enabled in the policy and the endpoint tries to
+   * reconnect they will discard their queue when we inform them of
+   * the session reset.
+   *
+   * It does not generate any notifications to the Dispatcher.
+   */
+  virtual void mark_down() = 0;
+
+  /**
+   * Mark a Connection as "disposable", setting it to lossy
+   * (regardless of initial Policy).  This does not immediately close
+   * the Connection once Messages have been delivered, so as long as
+   * there are no errors you can continue to receive responses; but it
+   * will not attempt to reconnect for message delivery or preserve
+   * your old delivery semantics, either.
+   *
+   * TODO: There's some odd stuff going on in our SimpleMessenger
+   * implementation during connect that looks unused; is there
+   * more of a contract that that's enforcing?
+   */
+  virtual void mark_disposable() = 0;
+
+  // WARNING / FIXME: this is not populated for loopback connections
+  AuthCapsInfo& get_peer_caps_info() {
+    return peer_caps_info;
+  }
+  const EntityName& get_peer_entity_name() {
+    return peer_name;
+  }
+  uint64_t get_peer_global_id() {
+    return peer_global_id;
+  }
+
+  int get_peer_type() const { return peer_type; }
+  void set_peer_type(int t) { peer_type = t; }
+
+  // peer_id is only defined for msgr2
+  int64_t get_peer_id() const { return peer_id; }
+  void set_peer_id(int64_t t) { peer_id = t; }
+
+  bool peer_is_mon() const { return peer_type == CEPH_ENTITY_TYPE_MON; }
+  bool peer_is_mgr() const { return peer_type == CEPH_ENTITY_TYPE_MGR; }
+  bool peer_is_mds() const { return peer_type == CEPH_ENTITY_TYPE_MDS; }
+  bool peer_is_osd() const { return peer_type == CEPH_ENTITY_TYPE_OSD; }
+  bool peer_is_client() const { return peer_type == CEPH_ENTITY_TYPE_CLIENT; }
+
+  /// which of the peer's addrs is actually in use for this connection
+  virtual entity_addr_t get_peer_socket_addr() const = 0;
+
+  entity_addr_t get_peer_addr() const {
+    return peer_addrs->front();
+  }
+  const entity_addrvec_t& get_peer_addrs() const {
+    return *peer_addrs;
+  }
+  void set_peer_addr(const entity_addr_t& a) {
+    peer_addrs = entity_addrvec_t(a);
+  }
+  void set_peer_addrs(const entity_addrvec_t& av) { peer_addrs = av; }
+
+  uint64_t get_features() const { return features; }
+  bool has_feature(uint64_t f) const { return features & f; }
+  bool has_features(uint64_t f) const {
+    return (features & f) == f;
+  }
+  void set_features(uint64_t f) { features = f; }
+  void set_feature(uint64_t f) { features |= f; }
+
+  virtual int get_con_mode() const {
+    return CEPH_CON_MODE_CRC;
+  }
+
+  void post_rx_buffer(ceph_tid_t tid, bufferlist& bl) {
+#if 0
+    Mutex::Locker l(lock);
+    ++rx_buffers_version;
+    rx_buffers[tid] = pair<bufferlist,int>(bl, rx_buffers_version);
+#endif
+  }
+
+  void revoke_rx_buffer(ceph_tid_t tid) {
+#if 0
+    Mutex::Locker l(lock);
+    rx_buffers.erase(tid);
+#endif
+  }
+
+  utime_t get_last_keepalive() const {
+    Mutex::Locker l(lock);
+    return last_keepalive;
+  }
+  void set_last_keepalive(utime_t t) {
+    Mutex::Locker l(lock);
+    last_keepalive = t;
+  }
+  utime_t get_last_keepalive_ack() const {
+    Mutex::Locker l(lock);
+    return last_keepalive_ack;
+  }
+  void set_last_keepalive_ack(utime_t t) {
+    Mutex::Locker l(lock);
+    last_keepalive_ack = t;
+  }
+
+};
+
+typedef boost::intrusive_ptr<Connection> ConnectionRef;
+
+
+#endif /* CEPH_CONNECTION_H */
diff --git a/src/msg/DispatchQueue.cc b/src/msg/DispatchQueue.cc
new file mode 100644
index 00000000..587a2dbe
--- /dev/null
+++ b/src/msg/DispatchQueue.cc
@@ -0,0 +1,259 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "msg/Message.h"
+#include "DispatchQueue.h"
+#include "Messenger.h"
+#include "common/ceph_context.h"
+
+#define dout_subsys ceph_subsys_ms
+#include "common/debug.h"
+
+
+/*******************
+ * DispatchQueue
+ */
+
+#undef dout_prefix
+#define dout_prefix *_dout << "-- " << msgr->get_myaddrs() << " "
+
+double DispatchQueue::get_max_age(utime_t now) const {
+  // marrival is ordered by receive stamp, so its first entry is the oldest
+  Mutex::Locker guard(lock);
+  if (!marrival.empty())
+    return (now - marrival.begin()->first);
+  return 0;
+}
+
+// Log the incoming message, then return its dispatch-throttle size.
+uint64_t DispatchQueue::pre_dispatch(const Message::ref& m)
+{
+  ldout(cct,1) << "<== " << m->get_source_inst() << " " << m->get_seq()
+	       << " ==== " << *m
+	       << " ==== " << m->get_payload().length()
+	       << "+" << m->get_middle().length()
+	       << "+" << m->get_data().length()
+	       << " (" << ceph_con_mode_name(m->get_connection()->get_con_mode())
+	       << " " << m->get_footer().front_crc
+	       << " " << m->get_footer().middle_crc
+	       << " " << m->get_footer().data_crc << ")"
+	       << " " << m << " con " << m->get_connection()
+	       << dendl;
+  const uint64_t throttle_size = m->get_dispatch_throttle_size();
+  m->set_dispatch_throttle_size(0); // reset, in case this message gets requeued
+  return throttle_size;
+}
+
+void DispatchQueue::post_dispatch(const Message::ref& m, uint64_t msize)
+{
+  dispatch_throttle_release(msize); // hand msize back to the dispatch throttler (skipped when msize is 0)
+  ldout(cct,20) << "done calling dispatch on " << m << dendl;
+}
+
+bool DispatchQueue::can_fast_dispatch(const Message::const_ref &m) const
+{
+  return msgr->ms_can_fast_dispatch(m); // the Messenger makes the decision
+}
+
+void DispatchQueue::fast_dispatch(const Message::ref& m)
+{
+  uint64_t msize = pre_dispatch(m); // logs m and clears its throttle size, returning the old value
+  msgr->ms_fast_dispatch(m);
+  post_dispatch(m, msize); // releases msize to the throttler after the dispatch call returns
+}
+
+void DispatchQueue::fast_preprocess(const Message::ref& m)
+{
+  msgr->ms_fast_preprocess(m); // plain pass-through to the Messenger
+}
+
+// Queue m for the dispatch thread; a no-op once stop has been set.
+void DispatchQueue::enqueue(const Message::ref& m, int priority, uint64_t id)
+{
+  Mutex::Locker guard(lock);
+  if (stop)
+    return;
+  ldout(cct,20) << "queue " << m << " prio " << priority << dendl;
+  add_arrival(m);
+  const bool strict = (priority >= CEPH_MSG_PRIO_LOW);
+  if (!strict)
+    mqueue.enqueue(id, priority, m->get_cost(), QueueItem(m));
+  else
+    mqueue.enqueue_strict(id, priority, QueueItem(m));
+  cond.Signal();
+}
+
+// Stamp m with the current time and queue it for the local delivery thread.
+void DispatchQueue::local_delivery(const Message::ref& m, int priority)
+{
+  m->set_recv_stamp(ceph_clock_now());
+  Mutex::Locker guard(local_delivery_lock);
+  if (local_messages.empty())
+    local_delivery_cond.Signal(); // signalled before the emplace, but under the same lock
+  local_messages.emplace(m, priority);
+}
+
+void DispatchQueue::run_local_delivery()
+{
+  local_delivery_lock.Lock();
+  while (true) {
+    if (stop_local_delivery) // set by shutdown(); checked before looking at the queue
+      break;
+    if (local_messages.empty()) {
+      local_delivery_cond.Wait(local_delivery_lock);
+      continue; // re-check the stop flag and the queue after waking
+    }
+    auto p = std::move(local_messages.front());
+    local_messages.pop();
+    local_delivery_lock.Unlock(); // the dispatch calls below run without local_delivery_lock
+    const Message::ref& m = p.first;
+    int priority = p.second;
+    fast_preprocess(m);
+    if (can_fast_dispatch(m)) {
+      fast_dispatch(m);
+    } else {
+      enqueue(m, priority, 0); // queued under id 0
+    }
+    local_delivery_lock.Lock();
+  }
+  local_delivery_lock.Unlock();
+}
+
+void DispatchQueue::dispatch_throttle_release(uint64_t msize)
+{
+  if (!msize)
+    return; // nothing to give back
+  ldout(cct,10) << __func__ << " " << msize << " to dispatch throttler "
+		<< dispatch_throttler.get_current() << "/"
+		<< dispatch_throttler.get_max() << dendl;
+  dispatch_throttler.put(msize);
+}
+
+/*
+ * This function delivers incoming messages to the Messenger.
+ * Connections with messages are kept in queues; when beginning a message
+ * delivery the highest-priority queue is selected, the connection from the
+ * front of the queue is removed, and its message read. If the connection
+ * has remaining messages at that priority level, it is placed back at the
+ * end of the queue. If the queue is empty, it's removed.
+ * The message is then delivered and the process starts again.
+ */
+void DispatchQueue::entry()
+{
+  lock.Lock();
+  while (true) {
+    while (!mqueue.empty()) {
+      QueueItem qitem = mqueue.dequeue();
+      if (!qitem.is_code()) // only Message items are tracked in the arrival maps
+	remove_arrival(qitem.get_message());
+      lock.Unlock(); // everything up to the matching Lock() below runs without the queue lock
+
+      if (qitem.is_code()) { // connection event code rather than a Message
+	if (cct->_conf->ms_inject_internal_delays &&
+	    cct->_conf->ms_inject_delay_probability &&
+	    (rand() % 10000)/10000.0 < cct->_conf->ms_inject_delay_probability) {
+	  utime_t t;
+	  t.set_from_double(cct->_conf->ms_inject_internal_delays);
+	  ldout(cct, 1) << "DispatchQueue::entry  inject delay of " << t
+			<< dendl;
+	  t.sleep();
+	}
+	switch (qitem.get_code()) {
+	case D_BAD_REMOTE_RESET:
+	  msgr->ms_deliver_handle_remote_reset(qitem.get_connection());
+	  break;
+	case D_CONNECT:
+	  msgr->ms_deliver_handle_connect(qitem.get_connection());
+	  break;
+	case D_ACCEPT:
+	  msgr->ms_deliver_handle_accept(qitem.get_connection());
+	  break;
+	case D_BAD_RESET:
+	  msgr->ms_deliver_handle_reset(qitem.get_connection());
+	  break;
+	case D_CONN_REFUSED:
+	  msgr->ms_deliver_handle_refused(qitem.get_connection());
+	  break;
+	default:
+	  ceph_abort(); // unknown event code
+	}
+      } else {
+	const Message::ref& m = qitem.get_message();
+	if (stop) { // NOTE(review): stop is read here while lock is not held
+	  ldout(cct,10) << " stop flag set, discarding " << m << " " << *m << dendl;
+	} else {
+	  uint64_t msize = pre_dispatch(m);
+	  msgr->ms_deliver_dispatch(m);
+	  post_dispatch(m, msize);
+	}
+      }
+
+      lock.Lock();
+    }
+    if (stop) // only reached with an empty mqueue
+      break;
+
+    // wait for something to be put on queue
+    cond.Wait(lock);
+  }
+  lock.Unlock();
+}
+
+// Drop every queued message that was enqueued under the given id and
+// release the dispatch-throttle size recorded on each of them.
+void DispatchQueue::discard_queue(uint64_t id) {
+  Mutex::Locker guard(lock);
+  list<QueueItem> dropped;
+  mqueue.remove_by_class(id, &dropped);
+  for (QueueItem& item : dropped) {
+    ceph_assert(!item.is_code()); // We don't discard id 0, ever!
+    const Message::ref& msg = item.get_message();
+    remove_arrival(msg);
+    dispatch_throttle_release(msg->get_dispatch_throttle_size());
+  }
+}
+
+void DispatchQueue::start()
+{
+  ceph_assert(!stop); // shutdown() must not have been called yet
+  ceph_assert(!dispatch_thread.is_started()); // the dispatch thread must not already be running
+  dispatch_thread.create("ms_dispatch"); // thread entry point calls DispatchQueue::entry()
+  local_delivery_thread.create("ms_local"); // thread entry point calls run_local_delivery()
+}
+
+void DispatchQueue::wait()
+{
+  local_delivery_thread.join(); // joined first, then the dispatch thread
+  dispatch_thread.join();
+}
+
+void DispatchQueue::discard_local()
+{
+  decltype(local_messages)().swap(local_messages); // swap with an empty temporary; NOTE(review): local_delivery_lock is not taken here - confirm callers serialize
+}
+
+void DispatchQueue::shutdown()
+{
+  { // stop my local delivery thread
+    Mutex::Locker guard(local_delivery_lock);
+    stop_local_delivery = true;
+    local_delivery_cond.Signal();
+  }
+
+  { // stop my dispatch thread
+    Mutex::Locker guard(lock);
+    stop = true;
+    cond.Signal();
+  }
+}
diff --git a/src/msg/DispatchQueue.h b/src/msg/DispatchQueue.h
new file mode 100644
index 00000000..2d90d82c
--- /dev/null
+++ b/src/msg/DispatchQueue.h
@@ -0,0 +1,243 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_DISPATCHQUEUE_H
+#define CEPH_DISPATCHQUEUE_H
+
+#include <atomic>
+#include <map>
+#include <queue>
+#include <boost/intrusive_ptr.hpp>
+#include "include/ceph_assert.h"
+#include "common/Throttle.h"
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+#include "common/PrioritizedQueue.h"
+
+#include "Message.h"
+
+class CephContext;
+class Messenger;
+struct Connection;
+
+/**
+ * The DispatchQueue contains all the connections which have Messages
+ * they want to be dispatched, carefully organized by Message priority
+ * and permitted to deliver in a round-robin fashion.
+ * See DispatchQueue::entry() for details.
+ */
+class DispatchQueue {
+  class QueueItem {
+    int type;
+    ConnectionRef con;
+    Message::ref m;
+  public:
+    explicit QueueItem(const Message::ref& m) : type(-1), con(0), m(m) {}
+    QueueItem(int type, Connection *con) : type(type), con(con), m(0) {}
+    bool is_code() const {
+      return type != -1;
+    }
+    int get_code () const {
+      ceph_assert(is_code());
+      return type;
+    }
+    const Message::ref& get_message() {
+      ceph_assert(!is_code());
+      return m;
+    }
+    Connection *get_connection() {
+      ceph_assert(is_code());
+      return con.get();
+    }
+  };
+    
+  CephContext *cct;
+  Messenger *msgr;
+  mutable Mutex lock;
+  Cond cond;
+
+  PrioritizedQueue<QueueItem, uint64_t> mqueue;
+
+  std::set<pair<double, Message::ref>> marrival;
+  map<Message::ref, decltype(marrival)::iterator> marrival_map;
+  void add_arrival(const Message::ref& m) {
+    marrival_map.insert(
+      make_pair(
+	m,
+	marrival.insert(make_pair(m->get_recv_stamp(), m)).first
+	)
+      );
+  }
+  void remove_arrival(const Message::ref& m) {
+    auto it = marrival_map.find(m);
+    ceph_assert(it != marrival_map.end());
+    marrival.erase(it->second);
+    marrival_map.erase(it);
+  }
+
+  std::atomic<uint64_t> next_id;
+    
+  enum { D_CONNECT = 1, D_ACCEPT, D_BAD_REMOTE_RESET, D_BAD_RESET, D_CONN_REFUSED, D_NUM_CODES };
+
+  /**
+   * The DispatchThread runs DispatchQueue::entry() to empty out the mqueue.
+   */
+  class DispatchThread : public Thread {
+    DispatchQueue *dq;
+  public:
+    explicit DispatchThread(DispatchQueue *dq) : dq(dq) {}
+    void *entry() override {
+      dq->entry();
+      return 0;
+    }
+  } dispatch_thread;
+
+  Mutex local_delivery_lock;
+  Cond local_delivery_cond;
+  bool stop_local_delivery;
+  std::queue<pair<Message::ref, int>> local_messages;
+  class LocalDeliveryThread : public Thread {
+    DispatchQueue *dq;
+  public:
+    explicit LocalDeliveryThread(DispatchQueue *dq) : dq(dq) {}
+    void *entry() override {
+      dq->run_local_delivery();
+      return 0;
+    }
+  } local_delivery_thread;
+
+  uint64_t pre_dispatch(const Message::ref& m);
+  void post_dispatch(const Message::ref& m, uint64_t msize);
+
+ public:
+
+  /// Throttle preventing us from building up a big backlog waiting for dispatch
+  Throttle dispatch_throttler;
+
+  bool stop;
+  void local_delivery(const Message::ref& m, int priority);
+  void local_delivery(Message* m, int priority) {
+    return local_delivery(Message::ref(m, false), priority); /* consume ref */
+  }
+  void run_local_delivery();
+
+  double get_max_age(utime_t now) const;
+
+  int get_queue_len() const {
+    Mutex::Locker l(lock);
+    return mqueue.length();
+  }
+
+  /**
+   * Release memory accounting back to the dispatch throttler.
+   *
+   * @param msize The amount of memory to release.
+   */
+  void dispatch_throttle_release(uint64_t msize);
+
+  void queue_connect(Connection *con) {
+    Mutex::Locker l(lock);
+    if (stop)
+      return;
+    mqueue.enqueue_strict(
+      0,
+      CEPH_MSG_PRIO_HIGHEST,
+      QueueItem(D_CONNECT, con));
+    cond.Signal();
+  }
+  void queue_accept(Connection *con) {
+    Mutex::Locker l(lock);
+    if (stop)
+      return;
+    mqueue.enqueue_strict(
+      0,
+      CEPH_MSG_PRIO_HIGHEST,
+      QueueItem(D_ACCEPT, con));
+    cond.Signal();
+  }
+  void queue_remote_reset(Connection *con) {
+    Mutex::Locker l(lock);
+    if (stop)
+      return;
+    mqueue.enqueue_strict(
+      0,
+      CEPH_MSG_PRIO_HIGHEST,
+      QueueItem(D_BAD_REMOTE_RESET, con));
+    cond.Signal();
+  }
+  void queue_reset(Connection *con) {
+    Mutex::Locker l(lock);
+    if (stop)
+      return;
+    mqueue.enqueue_strict(
+      0,
+      CEPH_MSG_PRIO_HIGHEST,
+      QueueItem(D_BAD_RESET, con));
+    cond.Signal();
+  }
+  void queue_refused(Connection *con) {
+    Mutex::Locker l(lock);
+    if (stop)
+      return;
+    mqueue.enqueue_strict(
+      0,
+      CEPH_MSG_PRIO_HIGHEST,
+      QueueItem(D_CONN_REFUSED, con));
+    cond.Signal();
+  }
+
+  bool can_fast_dispatch(const Message::const_ref &m) const;
+  void fast_dispatch(const Message::ref& m);
+  void fast_dispatch(Message* m) {
+    return fast_dispatch(Message::ref(m, false)); /* consume ref */
+  }
+  void fast_preprocess(const Message::ref& m);
+  void enqueue(const Message::ref& m, int priority, uint64_t id);
+  void enqueue(Message* m, int priority, uint64_t id) {
+    return enqueue(Message::ref(m, false), priority, id); /* consume ref */
+  }
+  void discard_queue(uint64_t id);
+  void discard_local();
+  uint64_t get_id() {
+    return next_id++;
+  }
+  void start();
+  void entry();
+  void wait();
+  void shutdown();
+  bool is_started() const {return dispatch_thread.is_started();}
+
+  DispatchQueue(CephContext *cct, Messenger *msgr, string &name)
+    : cct(cct), msgr(msgr),
+      lock("Messenger::DispatchQueue::lock" + name),
+      mqueue(cct->_conf->ms_pq_max_tokens_per_priority,
+	     cct->_conf->ms_pq_min_cost),
+      next_id(1),
+      dispatch_thread(this),
+      local_delivery_lock("Messenger::DispatchQueue::local_delivery_lock" + name),
+      stop_local_delivery(false),
+      local_delivery_thread(this),
+      dispatch_throttler(cct, string("msgr_dispatch_throttler-") + name,
+                         cct->_conf->ms_dispatch_throttle_bytes),
+      stop(false)
+    {}
+  ~DispatchQueue() {
+    ceph_assert(mqueue.empty());
+    ceph_assert(marrival.empty());
+    ceph_assert(local_messages.empty());
+  }
+};
+
+#endif
diff --git a/src/msg/DispatchStrategy.h b/src/msg/DispatchStrategy.h
new file mode 100644
index 00000000..4c9726ed
--- /dev/null
+++ b/src/msg/DispatchStrategy.h
@@ -0,0 +1,37 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef DISPATCH_STRATEGY_H
+#define DISPATCH_STRATEGY_H
+
+#include "msg/Message.h"
+
+class Messenger;
+
+class DispatchStrategy
+{
+protected:
+  Messenger *msgr = nullptr; // not set until set_messenger() is called
+public:
+  DispatchStrategy() = default;
+  virtual ~DispatchStrategy() = default;
+  void set_messenger(Messenger *_msgr) { msgr = _msgr; }
+  Messenger *get_messenger() { return msgr; }
+  virtual void ds_dispatch(Message *m) = 0; // hand one message to the strategy
+  virtual void start() = 0;
+  virtual void shutdown() = 0;
+  virtual void wait() = 0;
+};
+
+#endif /* DISPATCH_STRATEGY_H */
diff --git a/src/msg/Dispatcher.h b/src/msg/Dispatcher.h
new file mode 100644
index 00000000..fef5e320
--- /dev/null
+++ b/src/msg/Dispatcher.h
@@ -0,0 +1,264 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_DISPATCHER_H
+#define CEPH_DISPATCHER_H
+
+#include <memory>
+#include "include/buffer_fwd.h"
+#include "include/ceph_assert.h"
+#include "msg/MessageRef.h"
+
+class Messenger;
+struct Connection; // declared with the struct class-key, matching its definition in msg/Connection.h
+class AuthAuthorizer;
+class CryptoKey;
+class CephContext;
+class AuthAuthorizerChallenge;
+class KeyStore;
+
+class Dispatcher {
+public:
+  explicit Dispatcher(CephContext *cct_)
+    : cct(cct_)
+  {
+  }
+  virtual ~Dispatcher() { }
+
+  /**
+   * The Messenger calls this function to query if you are capable
+   * of "fast dispatch"ing a message. Indicating that you can fast
+   * dispatch it requires that you:
+   * 1) Handle the Message quickly and without taking long-term contended
+   * locks. (This function is likely to be called in-line with message
+   * receipt.)
+   * 2) Be able to accept the Message even if you have not yet received
+   * an ms_handle_accept() notification for the Connection it is associated
+   * with, and even if you *have* called mark_down() or received an
+   * ms_handle_reset() (or similar) call on the Connection. You will
+   * not receive more than one dead "message" (and should generally be
+   * prepared for that circumstance anyway, since the normal dispatch can begin,
+   * then trigger Connection failure before it's percolated through your system).
+   * We provide ms_handle_fast_[connect|accept] calls if you need them, under
+   * similar speed and state constraints as fast_dispatch itself.
+   * 3) Be able to make a determination on fast_dispatch without relying
+   * on particular system state -- the ms_can_fast_dispatch() call might
+   * be called multiple times on a single message; the state might change between
+   * calling ms_can_fast_dispatch and ms_fast_dispatch; etc.
+   *
+   * @param m The message we want to fast dispatch.
+   * @returns True if the message can be fast dispatched; false otherwise.
+   */
+  virtual bool ms_can_fast_dispatch(const Message *m) const { return false; }
+  virtual bool ms_can_fast_dispatch2(const MessageConstRef& m) const {
+    return ms_can_fast_dispatch(m.get());
+  }
+  /**
+   * This function determines if a dispatcher is included in the
+   * list of fast-dispatch capable Dispatchers.
+   * @returns True if the Dispatcher can handle any messages via
+   * fast dispatch; false otherwise.
+   */
+  virtual bool ms_can_fast_dispatch_any() const { return false; }
+  /**
+   * Perform a "fast dispatch" on a given message. See
+   * ms_can_fast_dispatch() for the requirements.
+   *
+   * @param m The Message to fast dispatch.
+   */
+  virtual void ms_fast_dispatch(Message *m) { ceph_abort(); }
+
+  /* ms_fast_dispatch2 because otherwise the child must define both */
+  virtual void ms_fast_dispatch2(const MessageRef &m) {
+    /* allow old style dispatch handling that expects a Message * with a floating ref */
+    return ms_fast_dispatch(MessageRef(m).detach()); /* XXX N.B. always consumes ref */
+  }
+
+  /**
+   * Let the Dispatcher preview a Message before it is dispatched. This
+   * function is called on *every* Message, prior to the fast/regular dispatch
+   * decision point, but it is only used on fast-dispatch capable systems. An
+   * implementation of ms_fast_preprocess must be essentially lock-free in the
+   * same way as the ms_fast_dispatch function is (in particular, ms_fast_preprocess
+   * may be called while the Messenger holds internal locks that prevent progress from
+   * other threads, so any locks it takes must be at the very bottom of the hierarchy).
+   * Messages are delivered in receipt order within a single Connection, but there are
+   * no guarantees across Connections. This makes it useful for some limited
+   * coordination between Messages which can be fast_dispatch'ed and those which must
+   * go through normal dispatch.
+   *
+   * @param m A message which has been received
+   */
+  virtual void ms_fast_preprocess(Message *m) {}
+
+  /* ms_fast_preprocess2 because otherwise the child must define both */
+  virtual void ms_fast_preprocess2(const MessageRef &m) {
+    /* allow old style dispatch handling that expects a Message* */
+    return ms_fast_preprocess(m.get());
+  }
+
+  /**
+   * The Messenger calls this function to deliver a single message.
+   *
+   * @param m The message being delivered. You (the Dispatcher)
+   * are given a single reference count on it.
+   */
+  virtual bool ms_dispatch(Message *m) {
+    ceph_abort();
+  }
+
+  /* ms_dispatch2 because otherwise the child must define both */
+  virtual bool ms_dispatch2(const MessageRef &m) {
+    /* allow old style dispatch handling that expects a Message * with a floating ref */
+    MessageRef mr(m);
+    if (ms_dispatch(mr.get())) {
+      mr.detach(); /* dispatcher consumed ref */
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * This function will be called whenever a Connection is newly-created
+   * or reconnects in the Messenger.
+   *
+   * @param con The new Connection which has been established. You are not
+   * granted a reference to it -- take one if you need one!
+   */
+  virtual void ms_handle_connect(Connection *con) {}
+
+  /**
+   * This function will be called synchronously whenever a Connection is
+   * newly-created or reconnects in the Messenger, if you support fast
+   * dispatch. It is guaranteed to be called before any messages are
+   * dispatched.
+   *
+   * @param con The new Connection which has been established. You are not
+   * granted a reference to it -- take one if you need one!
+   */
+  virtual void ms_handle_fast_connect(Connection *con) {}
+
+  /**
+   * Callback indicating we have accepted an incoming connection.
+   *
+   * @param con The (new or existing) Connection associated with the session
+   */
+  virtual void ms_handle_accept(Connection *con) {}
+
+  /**
+   * Callback indicating we have accepted an incoming connection, if you
+   * support fast dispatch. It is guaranteed to be called before any messages
+   * are dispatched.
+   *
+   * @param con The (new or existing) Connection associated with the session
+   */
+  virtual void ms_handle_fast_accept(Connection *con) {}
+
+  /**
+   * This indicates that the ordered+reliable delivery semantics have
+   * been violated.  Messages may have been lost due to a fault
+   * in the network connection.
+   * Only called on lossy Connections.
+   *
+   * @param con The Connection which broke. You are not granted
+   * a reference to it.
+   */
+  virtual bool ms_handle_reset(Connection *con) = 0;
+
+  /**
+   * This indicates that the ordered+reliable delivery semantics
+   * have been violated because the remote somehow reset.
+   * It implies that incoming messages were dropped, and
+   * probably some of our previous outgoing messages were too.
+   *
+   * @param con The Connection which broke. You are not granted
+   * a reference to it.
+   */
+  virtual void ms_handle_remote_reset(Connection *con) = 0;
+
+  /**
+   * This indicates that the connection is both broken and further
+   * connection attempts are failing because other side refuses
+   * it.
+   *
+   * @param con The Connection which broke. You are not granted
+   * a reference to it.
+   */
+  virtual bool ms_handle_refused(Connection *con) = 0;
+
+  /**
+   * @defgroup Authentication
+   * @{
+   */
+
+  /**
+   * handle successful authentication (msgr2)
+   *
+   * Authenticated result/state will be attached to the Connection.
+   *
+   * return 1 for success
+   * return 0 for no action (let another Dispatcher handle it)
+   * return <0 for failure (failure to parse caps, for instance)
+   */
+  virtual int ms_handle_authentication(Connection *con) {
+    return 0;
+  }
+
+  /**
+   * get authentication keyring
+   *
+   * Return the keyring to use for authentication with msgr1.  Remove me
+   * someday.
+   */
+  virtual KeyStore* ms_get_auth1_authorizer_keystore() {
+    return nullptr;
+  }
+
+  /**
+   * Retrieve the AuthAuthorizer for the given peer type. It might not
+   * provide one if it knows there is no AuthAuthorizer for that type.
+   * The default implementation provided by this base class never fills
+   * in *a and always returns false.
+   *
+   * @param dest_type The peer type we want the authorizer for.
+   * @param a Double pointer to an AuthAuthorizer. The Dispatcher will fill
+   * in *a with the correct AuthAuthorizer, if it can. Make sure that you have
+   * set *a to NULL before calling in.
+   *
+   * @return True if this function call properly filled in *a, false otherwise.
+   */
+  virtual bool ms_get_authorizer(int dest_type, AuthAuthorizer **a) {
+    return false;
+  }
+  /**
+   * @} //Authentication
+   */
+
+  void ms_set_require_authorizer(bool b) {
+    require_authorizer = b;
+  }
+protected:
+  CephContext *cct;
+public:
+  // allow unauthenticated connections.  This is needed for
+  // compatibility with pre-nautilus OSDs, which do not authenticate
+  // the heartbeat sessions.
+  bool require_authorizer = true;
+private:
+  explicit Dispatcher(const Dispatcher &rhs);
+  Dispatcher& operator=(const Dispatcher &rhs);
+};
+
+#endif
diff --git a/src/msg/FastStrategy.h b/src/msg/FastStrategy.h
new file mode 100644
index 00000000..001ff400
--- /dev/null
+++ b/src/msg/FastStrategy.h
@@ -0,0 +1,35 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef FAST_STRATEGY_H
+#define FAST_STRATEGY_H
+#include "DispatchStrategy.h"
+
+/**
+ * Dispatch strategy that forwards every message to the messenger from
+ * within ds_dispatch() itself; start(), shutdown() and wait() do nothing.
+ */
+class FastStrategy : public DispatchStrategy {
+public:
+  FastStrategy() {}
+  virtual ~FastStrategy() {}
+
+  // Lifecycle hooks are intentionally empty.
+  void start() override {}
+  void shutdown() override {}
+  void wait() override {}
+
+  /**
+   * Preprocess m, then hand it to the messenger's fast-dispatch path when
+   * the messenger reports it can take it, otherwise to the regular
+   * delivery path.
+   */
+  void ds_dispatch(Message *m) override {
+    msgr->ms_fast_preprocess(m);
+    if (!msgr->ms_can_fast_dispatch(m)) {
+      msgr->ms_deliver_dispatch(m);
+      return;
+    }
+    msgr->ms_fast_dispatch(m);
+  }
+};
+#endif /* FAST_STRATEGY_H */
diff --git a/src/msg/Message.cc b/src/msg/Message.cc
new file mode 100644
index 00000000..d36a95eb
--- /dev/null
+++ b/src/msg/Message.cc
@@ -0,0 +1,1000 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+
+#ifdef ENCODE_DUMP
+# include <typeinfo>
+# include <cxxabi.h>
+#endif
+
+#include <iostream>
+
+#include "include/types.h"
+
+#include "global/global_context.h"
+
+#include "Message.h"
+
+#include "messages/MPGStats.h"
+
+#include "messages/MGenericMessage.h"
+
+#include "messages/MPGStatsAck.h"
+
+#include "messages/MStatfs.h"
+#include "messages/MStatfsReply.h"
+
+#include "messages/MGetPoolStats.h"
+#include "messages/MGetPoolStatsReply.h"
+
+
+#include "messages/MPoolOp.h"
+#include "messages/MPoolOpReply.h"
+
+#include "messages/PaxosServiceMessage.h"
+#include "messages/MMonCommand.h"
+#include "messages/MMonCommandAck.h"
+#include "messages/MMonPaxos.h"
+#include "messages/MConfig.h"
+#include "messages/MGetConfig.h"
+
+#include "messages/MMonProbe.h"
+#include "messages/MMonJoin.h"
+#include "messages/MMonElection.h"
+#include "messages/MMonSync.h"
+#include "messages/MMonScrub.h"
+
+#include "messages/MLog.h"
+#include "messages/MLogAck.h"
+
+#include "messages/MPing.h"
+
+#include "messages/MCommand.h"
+#include "messages/MCommandReply.h"
+#include "messages/MBackfillReserve.h"
+#include "messages/MRecoveryReserve.h"
+
+#include "messages/MRoute.h"
+#include "messages/MForward.h"
+
+#include "messages/MOSDBoot.h"
+#include "messages/MOSDAlive.h"
+#include "messages/MOSDBeacon.h"
+#include "messages/MOSDPGTemp.h"
+#include "messages/MOSDFailure.h"
+#include "messages/MOSDMarkMeDown.h"
+#include "messages/MOSDFull.h"
+#include "messages/MOSDPing.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "messages/MOSDRepOp.h"
+#include "messages/MOSDRepOpReply.h"
+#include "messages/MOSDMap.h"
+#include "messages/MMonGetOSDMap.h"
+
+#include "messages/MOSDPGCreated.h"
+#include "messages/MOSDPGNotify.h"
+#include "messages/MOSDPGQuery.h"
+#include "messages/MOSDPGLog.h"
+#include "messages/MOSDPGRemove.h"
+#include "messages/MOSDPGInfo.h"
+#include "messages/MOSDPGCreate.h"
+#include "messages/MOSDPGCreate2.h"
+#include "messages/MOSDPGTrim.h"
+#include "messages/MOSDScrub.h"
+#include "messages/MOSDScrub2.h"
+#include "messages/MOSDScrubReserve.h"
+#include "messages/MOSDRepScrub.h"
+#include "messages/MOSDRepScrubMap.h"
+#include "messages/MOSDForceRecovery.h"
+#include "messages/MOSDPGScan.h"
+#include "messages/MOSDPGBackfill.h"
+#include "messages/MOSDBackoff.h"
+#include "messages/MOSDPGBackfillRemove.h"
+#include "messages/MOSDPGRecoveryDelete.h"
+#include "messages/MOSDPGRecoveryDeleteReply.h"
+#include "messages/MOSDPGReadyToMerge.h"
+
+#include "messages/MRemoveSnaps.h"
+
+#include "messages/MMonMap.h"
+#include "messages/MMonGetMap.h"
+#include "messages/MMonGetVersion.h"
+#include "messages/MMonGetVersionReply.h"
+#include "messages/MMonHealth.h"
+#include "messages/MMonHealthChecks.h"
+#include "messages/MMonMetadata.h"
+#include "messages/MDataPing.h"
+#include "messages/MAuth.h"
+#include "messages/MAuthReply.h"
+#include "messages/MMonSubscribe.h"
+#include "messages/MMonSubscribeAck.h"
+#include "messages/MMonGlobalID.h"
+#include "messages/MClientSession.h"
+#include "messages/MClientReconnect.h"
+#include "messages/MClientRequest.h"
+#include "messages/MClientRequestForward.h"
+#include "messages/MClientReply.h"
+#include "messages/MClientReclaim.h"
+#include "messages/MClientReclaimReply.h"
+#include "messages/MClientCaps.h"
+#include "messages/MClientCapRelease.h"
+#include "messages/MClientLease.h"
+#include "messages/MClientSnap.h"
+#include "messages/MClientQuota.h"
+
+#include "messages/MMDSSlaveRequest.h"
+
+#include "messages/MMDSMap.h"
+#include "messages/MFSMap.h"
+#include "messages/MFSMapUser.h"
+#include "messages/MMDSBeacon.h"
+#include "messages/MMDSLoadTargets.h"
+#include "messages/MMDSResolve.h"
+#include "messages/MMDSResolveAck.h"
+#include "messages/MMDSCacheRejoin.h"
+#include "messages/MMDSFindIno.h"
+#include "messages/MMDSFindInoReply.h"
+#include "messages/MMDSOpenIno.h"
+#include "messages/MMDSOpenInoReply.h"
+#include "messages/MMDSSnapUpdate.h"
+
+#include "messages/MDirUpdate.h"
+#include "messages/MDiscover.h"
+#include "messages/MDiscoverReply.h"
+
+#include "messages/MMDSFragmentNotify.h"
+#include "messages/MMDSFragmentNotifyAck.h"
+
+#include "messages/MExportDirDiscover.h"
+#include "messages/MExportDirDiscoverAck.h"
+#include "messages/MExportDirCancel.h"
+#include "messages/MExportDirPrep.h"
+#include "messages/MExportDirPrepAck.h"
+#include "messages/MExportDir.h"
+#include "messages/MExportDirAck.h"
+#include "messages/MExportDirNotify.h"
+#include "messages/MExportDirNotifyAck.h"
+#include "messages/MExportDirFinish.h"
+
+#include "messages/MExportCaps.h"
+#include "messages/MExportCapsAck.h"
+#include "messages/MGatherCaps.h"
+
+
+#include "messages/MDentryUnlink.h"
+#include "messages/MDentryLink.h"
+
+#include "messages/MHeartbeat.h"
+
+#include "messages/MMDSTableRequest.h"
+
+//#include "messages/MInodeUpdate.h"
+#include "messages/MCacheExpire.h"
+#include "messages/MInodeFileCaps.h"
+
+#include "messages/MMgrBeacon.h"
+#include "messages/MMgrMap.h"
+#include "messages/MMgrDigest.h"
+#include "messages/MMgrReport.h"
+#include "messages/MMgrOpen.h"
+#include "messages/MMgrClose.h"
+#include "messages/MMgrConfigure.h"
+#include "messages/MMonMgrReport.h"
+#include "messages/MServiceMap.h"
+
+#include "messages/MLock.h"
+
+#include "messages/MWatchNotify.h"
+#include "messages/MTimeCheck.h"
+#include "messages/MTimeCheck2.h"
+
+#include "common/config.h"
+
+#include "messages/MOSDPGPush.h"
+#include "messages/MOSDPGPushReply.h"
+#include "messages/MOSDPGPull.h"
+
+#include "messages/MOSDECSubOpWrite.h"
+#include "messages/MOSDECSubOpWriteReply.h"
+#include "messages/MOSDECSubOpRead.h"
+#include "messages/MOSDECSubOpReadReply.h"
+
+#include "messages/MOSDPGUpdateLogMissing.h"
+#include "messages/MOSDPGUpdateLogMissingReply.h"
+
+#define DEBUGLVL  10    // debug level of output
+
+#define dout_subsys ceph_subsys_ms
+
+/**
+ * Encode this message for the wire and refresh its envelope.
+ *
+ * If the payload is still empty it is produced via encode_payload() (and
+ * charged to byte_throttler when one is set).  The header length fields
+ * are then updated and the CRCs selected by crcflags are computed.
+ *
+ * @param features feature bits handed to encode_payload()
+ * @param crcflags MSG_CRC_HEADER enables the front and header CRCs,
+ *                 MSG_CRC_DATA the data CRC; when MSG_CRC_DATA is absent
+ *                 the footer is flagged CEPH_MSG_FOOTER_NOCRC
+ */
+void Message::encode(uint64_t features, int crcflags)
+{
+  // encode and copy out of *m
+  if (empty_payload()) {
+    ceph_assert(middle.length() == 0);
+    encode_payload(features);
+
+    if (byte_throttler) {
+      byte_throttler->take(payload.length() + middle.length());
+    }
+
+    // if the encoder didn't specify past compatibility, we assume it
+    // is incompatible.
+    if (header.compat_version == 0)
+      header.compat_version = header.version;
+  }
+  if (crcflags & MSG_CRC_HEADER)
+    calc_front_crc();
+
+  // update envelope
+  header.front_len = get_payload().length();
+  header.middle_len = get_middle().length();
+  header.data_len = get_data().length();
+  if (crcflags & MSG_CRC_HEADER)
+    calc_header_crc();
+
+  footer.flags = CEPH_MSG_FOOTER_COMPLETE;
+
+  if (crcflags & MSG_CRC_DATA) {
+    calc_data_crc();
+
+#ifdef ENCODE_DUMP
+    // Inside this member the name "encode" resolves to Message::encode,
+    // which hides the free encode() overloads; bring them back into scope
+    // the same way encode_trace() does.
+    using ceph::encode;
+
+    bufferlist bl;
+    encode(get_header(), bl);
+
+    // dump the old footer format
+    ceph_msg_footer_old old_footer;
+    old_footer.front_crc = footer.front_crc;
+    old_footer.middle_crc = footer.middle_crc;
+    old_footer.data_crc = footer.data_crc;
+    old_footer.flags = footer.flags;
+    encode(old_footer, bl);
+
+    encode(get_payload(), bl);
+    encode(get_middle(), bl);
+    encode(get_data(), bl);
+
+    // this is almost an exponential backoff, except because we count
+    // bits we tend to sample things we encode later, which should be
+    // more representative.
+    static int i = 0;
+    i++;
+    int bits = 0;
+    for (unsigned t = i; t; bits++)
+      t &= t - 1;
+    if (bits <= 2) {
+      char fn[200];
+      int status;
+      // __cxa_demangle() hands back a malloc()ed buffer, or NULL when it
+      // fails: fall back to the mangled name and release the buffer once
+      // the file name has been formatted.
+      const char *mangled = typeid(*this).name();
+      char *demangled = abi::__cxa_demangle(mangled, 0, 0, &status);
+      snprintf(fn, sizeof(fn), ENCODE_STRINGIFY(ENCODE_DUMP) "/%s__%d.%x",
+               demangled ? demangled : mangled,
+               getpid(), i++);
+      free(demangled);
+      int fd = ::open(fn, O_WRONLY|O_TRUNC|O_CREAT|O_CLOEXEC, 0644);
+      if (fd >= 0) {
+        bl.write_fd(fd);
+        ::close(fd);
+      }
+    }
+#endif
+  } else {
+    footer.flags = (unsigned)footer.flags | CEPH_MSG_FOOTER_NOCRC;
+  }
+}
+
+// Emit the text produced by print() as the formatter's "summary" string.
+void Message::dump(Formatter *f) const
+{
+  stringstream summary;
+  print(summary);
+  const auto text = summary.str();
+  f->dump_string("summary", text);
+}
+
+/**
+ * Build a typed Message from an already-split wire message.
+ *
+ * The CRCs selected by crcflags are verified first, then a message object
+ * matching header.type is created, populated with the given buffers and
+ * asked to decode its payload.
+ *
+ * @param cct      context used for logging and config; may be null, in
+ *                 which case failures are silent and never abort
+ * @param crcflags MSG_CRC_HEADER checks front/middle CRCs, MSG_CRC_DATA
+ *                 checks the data CRC unless the footer carries
+ *                 CEPH_MSG_FOOTER_NOCRC
+ * @param conn     connection the message arrived on; may be null (the
+ *                 iterator-based overload passes nullptr)
+ *
+ * @return the decoded message with its reference detached to the caller,
+ *         or null on CRC mismatch, unknown type, an unsupported
+ *         compat_version, or a payload decode error.  With a cct whose
+ *         ms_die_on_bad_msg is set, the last three cases abort instead.
+ */
+Message *decode_message(CephContext *cct, int crcflags,
+                        ceph_msg_header& header,
+                        ceph_msg_footer& footer,
+                        bufferlist& front, bufferlist& middle,
+                        bufferlist& data, Connection* conn)
+{
+  // conn is allowed to be null, so the peer address can only be printed
+  // when a connection is actually present.
+  auto describe_peer = [conn](std::ostream& out) {
+    if (conn)
+      out << conn->get_peer_addr();
+    else
+      out << "(no connection)";
+  };
+
+  // verify crc
+  if (crcflags & MSG_CRC_HEADER) {
+    __u32 front_crc = front.crc32c(0);
+    __u32 middle_crc = middle.crc32c(0);
+
+    if (front_crc != footer.front_crc) {
+      if (cct) {
+        ldout(cct, 0) << "bad crc in front " << front_crc << " != exp " << footer.front_crc
+                      << " from ";
+        describe_peer(*_dout);
+        *_dout << dendl;
+        ldout(cct, 20) << " ";
+        front.hexdump(*_dout);
+        *_dout << dendl;
+      }
+      return nullptr;
+    }
+    if (middle_crc != footer.middle_crc) {
+      if (cct) {
+        ldout(cct, 0) << "bad crc in middle " << middle_crc << " != exp " << footer.middle_crc
+                      << " from ";
+        describe_peer(*_dout);
+        *_dout << dendl;
+        ldout(cct, 20) << " ";
+        middle.hexdump(*_dout);
+        *_dout << dendl;
+      }
+      return nullptr;
+    }
+  }
+  if (crcflags & MSG_CRC_DATA) {
+    if ((footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0) {
+      __u32 data_crc = data.crc32c(0);
+      if (data_crc != footer.data_crc) {
+        if (cct) {
+          ldout(cct, 0) << "bad crc in data " << data_crc << " != exp " << footer.data_crc
+                        << " from ";
+          describe_peer(*_dout);
+          *_dout << dendl;
+          ldout(cct, 20) << " ";
+          data.hexdump(*_dout);
+          *_dout << dendl;
+        }
+        return nullptr;
+      }
+    }
+  }
+
+  // make message
+  Message::ref m;
+  int type = header.type;
+  switch (type) {
+
+    // -- with payload --
+
+  case MSG_PGSTATS:
+    m = MPGStats::create();
+    break;
+  case MSG_PGSTATSACK:
+    m = MPGStatsAck::create();
+    break;
+
+  case CEPH_MSG_STATFS:
+    m = MStatfs::create();
+    break;
+  case CEPH_MSG_STATFS_REPLY:
+    m = MStatfsReply::create();
+    break;
+  case MSG_GETPOOLSTATS:
+    m = MGetPoolStats::create();
+    break;
+  case MSG_GETPOOLSTATSREPLY:
+    m = MGetPoolStatsReply::create();
+    break;
+  case CEPH_MSG_POOLOP:
+    m = MPoolOp::create();
+    break;
+  case CEPH_MSG_POOLOP_REPLY:
+    m = MPoolOpReply::create();
+    break;
+  case MSG_MON_COMMAND:
+    m = MMonCommand::create();
+    break;
+  case MSG_MON_COMMAND_ACK:
+    m = MMonCommandAck::create();
+    break;
+  case MSG_MON_PAXOS:
+    m = MMonPaxos::create();
+    break;
+  case MSG_CONFIG:
+    m = MConfig::create();
+    break;
+  case MSG_GET_CONFIG:
+    m = MGetConfig::create();
+    break;
+
+  case MSG_MON_PROBE:
+    m = MMonProbe::create();
+    break;
+  case MSG_MON_JOIN:
+    m = MMonJoin::create();
+    break;
+  case MSG_MON_ELECTION:
+    m = MMonElection::create();
+    break;
+  case MSG_MON_SYNC:
+    m = MMonSync::create();
+    break;
+  case MSG_MON_SCRUB:
+    m = MMonScrub::create();
+    break;
+
+  case MSG_LOG:
+    m = MLog::create();
+    break;
+  case MSG_LOGACK:
+    m = MLogAck::create();
+    break;
+
+  case CEPH_MSG_PING:
+    m = MPing::create();
+    break;
+  case MSG_COMMAND:
+    m = MCommand::create();
+    break;
+  case MSG_COMMAND_REPLY:
+    m = MCommandReply::create();
+    break;
+  case MSG_OSD_BACKFILL_RESERVE:
+    m = MBackfillReserve::create();
+    break;
+  case MSG_OSD_RECOVERY_RESERVE:
+    m = MRecoveryReserve::create();
+    break;
+  case MSG_OSD_FORCE_RECOVERY:
+    m = MOSDForceRecovery::create();
+    break;
+
+  case MSG_ROUTE:
+    m = MRoute::create();
+    break;
+  case MSG_FORWARD:
+    m = MForward::create();
+    break;
+
+  case CEPH_MSG_MON_MAP:
+    m = MMonMap::create();
+    break;
+  case CEPH_MSG_MON_GET_MAP:
+    m = MMonGetMap::create();
+    break;
+  case CEPH_MSG_MON_GET_OSDMAP:
+    m = MMonGetOSDMap::create();
+    break;
+  case CEPH_MSG_MON_GET_VERSION:
+    m = MMonGetVersion::create();
+    break;
+  case CEPH_MSG_MON_GET_VERSION_REPLY:
+    m = MMonGetVersionReply::create();
+    break;
+  case CEPH_MSG_MON_METADATA:
+    m = MMonMetadata::create();
+    break;
+
+  case MSG_OSD_BOOT:
+    m = MOSDBoot::create();
+    break;
+  case MSG_OSD_ALIVE:
+    m = MOSDAlive::create();
+    break;
+  case MSG_OSD_BEACON:
+    m = MOSDBeacon::create();
+    break;
+  case MSG_OSD_PGTEMP:
+    m = MOSDPGTemp::create();
+    break;
+  case MSG_OSD_FAILURE:
+    m = MOSDFailure::create();
+    break;
+  case MSG_OSD_MARK_ME_DOWN:
+    m = MOSDMarkMeDown::create();
+    break;
+  case MSG_OSD_FULL:
+    m = MOSDFull::create();
+    break;
+  case MSG_OSD_PING:
+    m = MOSDPing::create();
+    break;
+  case CEPH_MSG_OSD_OP:
+    m = MOSDOp::create();
+    break;
+  case CEPH_MSG_OSD_OPREPLY:
+    m = MOSDOpReply::create();
+    break;
+  case MSG_OSD_REPOP:
+    m = MOSDRepOp::create();
+    break;
+  case MSG_OSD_REPOPREPLY:
+    m = MOSDRepOpReply::create();
+    break;
+  case MSG_OSD_PG_CREATED:
+    m = MOSDPGCreated::create();
+    break;
+  case MSG_OSD_PG_UPDATE_LOG_MISSING:
+    m = MOSDPGUpdateLogMissing::create();
+    break;
+  case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
+    m = MOSDPGUpdateLogMissingReply::create();
+    break;
+  case CEPH_MSG_OSD_BACKOFF:
+    m = MOSDBackoff::create();
+    break;
+
+  case CEPH_MSG_OSD_MAP:
+    m = MOSDMap::create();
+    break;
+
+  case CEPH_MSG_WATCH_NOTIFY:
+    m = MWatchNotify::create();
+    break;
+
+  case MSG_OSD_PG_NOTIFY:
+    m = MOSDPGNotify::create();
+    break;
+  case MSG_OSD_PG_QUERY:
+    m = MOSDPGQuery::create();
+    break;
+  case MSG_OSD_PG_LOG:
+    m = MOSDPGLog::create();
+    break;
+  case MSG_OSD_PG_REMOVE:
+    m = MOSDPGRemove::create();
+    break;
+  case MSG_OSD_PG_INFO:
+    m = MOSDPGInfo::create();
+    break;
+  case MSG_OSD_PG_CREATE:
+    m = MOSDPGCreate::create();
+    break;
+  case MSG_OSD_PG_CREATE2:
+    m = MOSDPGCreate2::create();
+    break;
+  case MSG_OSD_PG_TRIM:
+    m = MOSDPGTrim::create();
+    break;
+
+  case MSG_OSD_SCRUB:
+    m = MOSDScrub::create();
+    break;
+  case MSG_OSD_SCRUB2:
+    m = MOSDScrub2::create();
+    break;
+  case MSG_OSD_SCRUB_RESERVE:
+    m = MOSDScrubReserve::create();
+    break;
+  case MSG_REMOVE_SNAPS:
+    m = MRemoveSnaps::create();
+    break;
+  case MSG_OSD_REP_SCRUB:
+    m = MOSDRepScrub::create();
+    break;
+  case MSG_OSD_REP_SCRUBMAP:
+    m = MOSDRepScrubMap::create();
+    break;
+  case MSG_OSD_PG_SCAN:
+    m = MOSDPGScan::create();
+    break;
+  case MSG_OSD_PG_BACKFILL:
+    m = MOSDPGBackfill::create();
+    break;
+  case MSG_OSD_PG_BACKFILL_REMOVE:
+    m = MOSDPGBackfillRemove::create();
+    break;
+  case MSG_OSD_PG_PUSH:
+    m = MOSDPGPush::create();
+    break;
+  case MSG_OSD_PG_PULL:
+    m = MOSDPGPull::create();
+    break;
+  case MSG_OSD_PG_PUSH_REPLY:
+    m = MOSDPGPushReply::create();
+    break;
+  case MSG_OSD_PG_RECOVERY_DELETE:
+    m = MOSDPGRecoveryDelete::create();
+    break;
+  case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
+    m = MOSDPGRecoveryDeleteReply::create();
+    break;
+  case MSG_OSD_PG_READY_TO_MERGE:
+    m = MOSDPGReadyToMerge::create();
+    break;
+  case MSG_OSD_EC_WRITE:
+    m = MOSDECSubOpWrite::create();
+    break;
+  case MSG_OSD_EC_WRITE_REPLY:
+    m = MOSDECSubOpWriteReply::create();
+    break;
+  case MSG_OSD_EC_READ:
+    m = MOSDECSubOpRead::create();
+    break;
+  case MSG_OSD_EC_READ_REPLY:
+    m = MOSDECSubOpReadReply::create();
+    break;
+   // auth
+  case CEPH_MSG_AUTH:
+    m = MAuth::create();
+    break;
+  case CEPH_MSG_AUTH_REPLY:
+    m = MAuthReply::create();
+    break;
+
+  case MSG_MON_GLOBAL_ID:
+    m = MMonGlobalID::create();
+    break;
+
+    // clients
+  case CEPH_MSG_MON_SUBSCRIBE:
+    m = MMonSubscribe::create();
+    break;
+  case CEPH_MSG_MON_SUBSCRIBE_ACK:
+    m = MMonSubscribeAck::create();
+    break;
+  case CEPH_MSG_CLIENT_SESSION:
+    m = MClientSession::create();
+    break;
+  case CEPH_MSG_CLIENT_RECONNECT:
+    m = MClientReconnect::create();
+    break;
+  case CEPH_MSG_CLIENT_REQUEST:
+    m = MClientRequest::create();
+    break;
+  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
+    m = MClientRequestForward::create();
+    break;
+  case CEPH_MSG_CLIENT_REPLY:
+    m = MClientReply::create();
+    break;
+  case CEPH_MSG_CLIENT_RECLAIM:
+    m = MClientReclaim::create();
+    break;
+  case CEPH_MSG_CLIENT_RECLAIM_REPLY:
+    m = MClientReclaimReply::create();
+    break;
+  case CEPH_MSG_CLIENT_CAPS:
+    m = MClientCaps::create();
+    break;
+  case CEPH_MSG_CLIENT_CAPRELEASE:
+    m = MClientCapRelease::create();
+    break;
+  case CEPH_MSG_CLIENT_LEASE:
+    m = MClientLease::create();
+    break;
+  case CEPH_MSG_CLIENT_SNAP:
+    m = MClientSnap::create();
+    break;
+  case CEPH_MSG_CLIENT_QUOTA:
+    m = MClientQuota::create();
+    break;
+
+    // mds
+  case MSG_MDS_SLAVE_REQUEST:
+    m = MMDSSlaveRequest::create();
+    break;
+
+  case CEPH_MSG_MDS_MAP:
+    m = MMDSMap::create();
+    break;
+  case CEPH_MSG_FS_MAP:
+    m = MFSMap::create();
+    break;
+  case CEPH_MSG_FS_MAP_USER:
+    m = MFSMapUser::create();
+    break;
+  case MSG_MDS_BEACON:
+    m = MMDSBeacon::create();
+    break;
+  case MSG_MDS_OFFLOAD_TARGETS:
+    m = MMDSLoadTargets::create();
+    break;
+  case MSG_MDS_RESOLVE:
+    m = MMDSResolve::create();
+    break;
+  case MSG_MDS_RESOLVEACK:
+    m = MMDSResolveAck::create();
+    break;
+  case MSG_MDS_CACHEREJOIN:
+    m = MMDSCacheRejoin::create();
+    break;
+
+  case MSG_MDS_DIRUPDATE:
+    m = MDirUpdate::create();
+    break;
+
+  case MSG_MDS_DISCOVER:
+    m = MDiscover::create();
+    break;
+  case MSG_MDS_DISCOVERREPLY:
+    m = MDiscoverReply::create();
+    break;
+
+  case MSG_MDS_FINDINO:
+    m = MMDSFindIno::create();
+    break;
+  case MSG_MDS_FINDINOREPLY:
+    m = MMDSFindInoReply::create();
+    break;
+
+  case MSG_MDS_OPENINO:
+    m = MMDSOpenIno::create();
+    break;
+  case MSG_MDS_OPENINOREPLY:
+    m = MMDSOpenInoReply::create();
+    break;
+
+  case MSG_MDS_SNAPUPDATE:
+    m = MMDSSnapUpdate::create();
+    break;
+
+  case MSG_MDS_FRAGMENTNOTIFY:
+    m = MMDSFragmentNotify::create();
+    break;
+
+  case MSG_MDS_FRAGMENTNOTIFYACK:
+    m = MMDSFragmentNotifyAck::create();
+    break;
+
+  case MSG_MDS_EXPORTDIRDISCOVER:
+    m = MExportDirDiscover::create();
+    break;
+  case MSG_MDS_EXPORTDIRDISCOVERACK:
+    m = MExportDirDiscoverAck::create();
+    break;
+  case MSG_MDS_EXPORTDIRCANCEL:
+    m = MExportDirCancel::create();
+    break;
+
+  case MSG_MDS_EXPORTDIR:
+    m = MExportDir::create();
+    break;
+  case MSG_MDS_EXPORTDIRACK:
+    m = MExportDirAck::create();
+    break;
+  case MSG_MDS_EXPORTDIRFINISH:
+    m = MExportDirFinish::create();
+    break;
+
+  case MSG_MDS_EXPORTDIRNOTIFY:
+    m = MExportDirNotify::create();
+    break;
+
+  case MSG_MDS_EXPORTDIRNOTIFYACK:
+    m = MExportDirNotifyAck::create();
+    break;
+
+  case MSG_MDS_EXPORTDIRPREP:
+    m = MExportDirPrep::create();
+    break;
+
+  case MSG_MDS_EXPORTDIRPREPACK:
+    m = MExportDirPrepAck::create();
+    break;
+
+  case MSG_MDS_EXPORTCAPS:
+    m = MExportCaps::create();
+    break;
+  case MSG_MDS_EXPORTCAPSACK:
+    m = MExportCapsAck::create();
+    break;
+  case MSG_MDS_GATHERCAPS:
+    m = MGatherCaps::create();
+    break;
+
+
+  case MSG_MDS_DENTRYUNLINK:
+    m = MDentryUnlink::create();
+    break;
+  case MSG_MDS_DENTRYLINK:
+    m = MDentryLink::create();
+    break;
+
+  case MSG_MDS_HEARTBEAT:
+    m = MHeartbeat::create();
+    break;
+
+  case MSG_MDS_CACHEEXPIRE:
+    m = MCacheExpire::create();
+    break;
+
+  case MSG_MDS_TABLE_REQUEST:
+    m = MMDSTableRequest::create();
+    break;
+
+    /*  case MSG_MDS_INODEUPDATE:
+    m = MInodeUpdate::create();
+    break;
+    */
+
+  case MSG_MDS_INODEFILECAPS:
+    m = MInodeFileCaps::create();
+    break;
+
+  case MSG_MDS_LOCK:
+    m = MLock::create();
+    break;
+
+  case MSG_MGR_BEACON:
+    m = MMgrBeacon::create();
+    break;
+
+  case MSG_MON_MGR_REPORT:
+    m = MMonMgrReport::create();
+    break;
+
+  case MSG_SERVICE_MAP:
+    m = MServiceMap::create();
+    break;
+
+  case MSG_MGR_MAP:
+    m = MMgrMap::create();
+    break;
+
+  case MSG_MGR_DIGEST:
+    m = MMgrDigest::create();
+    break;
+
+  case MSG_MGR_OPEN:
+    m = MMgrOpen::create();
+    break;
+
+  case MSG_MGR_CLOSE:
+    m = MMgrClose::create();
+    break;
+
+  case MSG_MGR_REPORT:
+    m = MMgrReport::create();
+    break;
+
+  case MSG_MGR_CONFIGURE:
+    m = MMgrConfigure::create();
+    break;
+
+  case MSG_TIMECHECK:
+    m = MTimeCheck::create();
+    break;
+  case MSG_TIMECHECK2:
+    m = MTimeCheck2::create();
+    break;
+
+  case MSG_MON_HEALTH:
+    m = MMonHealth::create();
+    break;
+
+  case MSG_MON_HEALTH_CHECKS:
+    m = MMonHealthChecks::create();
+    break;
+
+#if defined(HAVE_XIO)
+  case MSG_DATA_PING:
+    m = MDataPing::create();
+    break;
+#endif
+    // -- simple messages without payload --
+
+  case CEPH_MSG_SHUTDOWN:
+    m = MGenericMessage::create(type);
+    break;
+
+  default:
+    if (cct) {
+      ldout(cct, 0) << "can't decode unknown message type " << type << " MSG_AUTH=" << CEPH_MSG_AUTH << dendl;
+      if (cct->_conf->ms_die_on_bad_msg)
+        ceph_abort();
+    }
+    return nullptr;
+  }
+
+  m->set_cct(cct);
+
+  // m->header.version, if non-zero, should be populated with the
+  // newest version of the encoding the code supports.  If set, check
+  // it against compat_version.
+  if (m->get_header().version &&
+      m->get_header().version < header.compat_version) {
+    if (cct) {
+      ldout(cct, 0) << "will not decode message of type " << type
+                    << " version " << header.version
+                    << " because compat_version " << header.compat_version
+                    << " > supported version " << m->get_header().version << dendl;
+      if (cct->_conf->ms_die_on_bad_msg)
+        ceph_abort();
+    }
+    return nullptr;
+  }
+
+  m->set_connection(conn);
+  m->set_header(header);
+  m->set_footer(footer);
+  m->set_payload(front);
+  m->set_middle(middle);
+  m->set_data(data);
+
+  try {
+    m->decode_payload();
+  }
+  catch (const buffer::error &e) {
+    if (cct) {
+      lderr(cct) << "failed to decode message of type " << type
+                 << " v" << header.version
+                 << ": " << e.what() << dendl;
+      ldout(cct, ceph::dout::need_dynamic(
+        cct->_conf->ms_dump_corrupt_message_level)) << "dump: \n";
+      m->get_payload().hexdump(*_dout);
+      *_dout << dendl;
+      if (cct->_conf->ms_die_on_bad_msg)
+        ceph_abort();
+    }
+    return nullptr;
+  }
+
+  // done!  hand the reference held by m over to the caller
+  return m.detach();
+}
+
+// Append this message's trace info to bl; when the trace has none, an
+// all-zero blkin_trace_info is written in its place.
+void Message::encode_trace(bufferlist &bl, uint64_t features) const
+{
+  using ceph::encode;
+  static const blkin_trace_info empty = { 0, 0, 0 };
+  const auto info = trace.get_info();
+  encode(info ? *info : empty, bl);
+}
+
+// Read the trace info from p and, when built WITH_BLKIN and a connection
+// is attached, initialise this message's trace from it.
+void Message::decode_trace(bufferlist::const_iterator &p, bool create)
+{
+  // The info is consumed from the stream whether or not tracing support
+  // is compiled in.
+  blkin_trace_info info = {};
+  decode(info, p);
+
+#ifdef WITH_BLKIN
+  // The messenger (and thus the trace endpoint) is reached through the
+  // connection, so nothing more happens without one.
+  if (connection) {
+    const auto msgr = connection->get_messenger();
+    const auto endpoint = msgr->get_trace_endpoint();
+    if (info.trace_id) {
+      trace.init(get_type_name(), endpoint, &info, true);
+      trace.event("decoded trace");
+    } else {
+      const bool want_trace =
+        create || (msgr->get_myname().is_osd() &&
+                   msgr->cct->_conf->osd_blkin_trace_all);
+      if (want_trace) {
+        // create a trace even if we didn't get one on the wire
+        trace.init(get_type_name(), endpoint);
+        trace.event("created trace");
+      }
+    }
+    trace.keyval("tid", get_tid());
+    trace.keyval("entity type", get_source().type_str());
+    trace.keyval("entity num", get_source().num());
+  }
+#endif
+}
+
+
+// This routine is not used for ordinary messages, but only when encapsulating a message
+// for forwarding and routing.  It's also used in a backward compatibility test, which only
+// effectively tests backward compatibility for those functions.  To avoid backward compatibility
+// problems, we currently always encode and decode using the old footer format that doesn't
+// allow for message authentication.  Eventually we should fix that.  PLR
+
+/**
+ * Serialise a whole message into payload.
+ *
+ * msg is first encoded with all CRCs enabled; then its header, an
+ * old-format footer, and its payload, middle and data sections are
+ * appended to payload in that order.
+ *
+ * @param msg      message to encapsulate (its encode() is invoked)
+ * @param features feature bits passed through to msg->encode()
+ * @param payload  bufferlist the encoded form is appended to
+ */
+void encode_message(Message *msg, uint64_t features, bufferlist& payload)
+{
+  msg->encode(features, MSG_CRC_ALL);
+  encode(msg->get_header(), payload);
+
+  // Here's where we switch to the old footer format.  PLR
+  // Only the CRCs and flags are carried over into the old footer.
+  const ceph_msg_footer footer = msg->get_footer();
+  ceph_msg_footer_old old_footer;
+  old_footer.front_crc = footer.front_crc;
+  old_footer.middle_crc = footer.middle_crc;
+  old_footer.data_crc = footer.data_crc;
+  old_footer.flags = footer.flags;
+  encode(old_footer, payload);
+
+  encode(msg->get_payload(), payload);
+  encode(msg->get_middle(), payload);
+  encode(msg->get_data(), payload);
+}
+
+// See above for somewhat bogus use of the old message footer.  We switch to the current footer
+// after decoding the old one so the other form of decode_message() doesn't have to change.
+// We've slipped in a 0 signature at this point, so any signature checking after this will
+// fail.  PLR
+
+Message *decode_message(CephContext *cct, int crcflags, bufferlist::const_iterator& p)
+{
+  // Stream layout: header, old-format footer, then the front, middle and
+  // data bufferlists.
+  ceph_msg_header header;
+  decode(header, p);
+
+  ceph_msg_footer_old old_footer;
+  decode(old_footer, p);
+
+  // Convert to the current footer layout; the signature is set to 0.
+  ceph_msg_footer footer;
+  footer.front_crc = old_footer.front_crc;
+  footer.middle_crc = old_footer.middle_crc;
+  footer.data_crc = old_footer.data_crc;
+  footer.flags = old_footer.flags;
+  footer.sig = 0;
+
+  bufferlist front, middle, data;
+  decode(front, p);
+  decode(middle, p);
+  decode(data, p);
+
+  // No connection is associated with a message decoded this way.
+  return decode_message(cct, crcflags, header, footer, front, middle, data, nullptr);
+}
diff --git a/src/msg/Message.h b/src/msg/Message.h
new file mode 100644
index 00000000..42405ed3
--- /dev/null
+++ b/src/msg/Message.h
@@ -0,0 +1,577 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MESSAGE_H
+#define CEPH_MESSAGE_H
+ 
+#include <stdlib.h>
+#include <ostream>
+#include <string_view>
+
+#include <boost/intrusive/list.hpp>
+
+#include "include/Context.h"
+#include "common/RefCountedObj.h"
+#include "common/ThrottleInterface.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "common/zipkin_trace.h"
+#include "include/ceph_assert.h" // Because intrusive_ptr clobbers our assert...
+#include "include/buffer.h"
+#include "include/types.h"
+#include "msg/Connection.h"
+#include "msg/MessageRef.h"
+#include "msg_types.h"
+
+// monitor internal
+#define MSG_MON_SCRUB              64
+#define MSG_MON_ELECTION           65
+#define MSG_MON_PAXOS              66
+#define MSG_MON_PROBE              67
+#define MSG_MON_JOIN               68
+#define MSG_MON_SYNC		   69
+
+/* monitor <-> mon admin tool */
+#define MSG_MON_COMMAND            50
+#define MSG_MON_COMMAND_ACK        51
+#define MSG_LOG                    52
+#define MSG_LOGACK                 53
+
+#define MSG_GETPOOLSTATS           58
+#define MSG_GETPOOLSTATSREPLY      59
+
+#define MSG_MON_GLOBAL_ID          60
+
+#define MSG_ROUTE                  47
+#define MSG_FORWARD                46
+
+#define MSG_PAXOS                  40
+
+#define MSG_CONFIG           62
+#define MSG_GET_CONFIG       63
+
+
+// osd internal
+#define MSG_OSD_PING         70
+#define MSG_OSD_BOOT         71
+#define MSG_OSD_FAILURE      72
+#define MSG_OSD_ALIVE        73
+#define MSG_OSD_MARK_ME_DOWN 74
+#define MSG_OSD_FULL         75
+
+// removed right after luminous
+//#define MSG_OSD_SUBOP        76
+//#define MSG_OSD_SUBOPREPLY   77
+
+#define MSG_OSD_PGTEMP       78
+
+#define MSG_OSD_BEACON       79
+
+#define MSG_OSD_PG_NOTIFY      80
+#define MSG_OSD_PG_QUERY       81
+#define MSG_OSD_PG_LOG         83
+#define MSG_OSD_PG_REMOVE      84
+#define MSG_OSD_PG_INFO        85
+#define MSG_OSD_PG_TRIM        86
+
+#define MSG_PGSTATS            87
+#define MSG_PGSTATSACK         88
+
+#define MSG_OSD_PG_CREATE      89
+#define MSG_REMOVE_SNAPS       90
+
+#define MSG_OSD_SCRUB          91
+#define MSG_OSD_SCRUB_RESERVE  92  // previous PG_MISSING
+#define MSG_OSD_REP_SCRUB      93
+
+#define MSG_OSD_PG_SCAN        94
+#define MSG_OSD_PG_BACKFILL    95
+#define MSG_OSD_PG_BACKFILL_REMOVE 96
+
+#define MSG_COMMAND            97
+#define MSG_COMMAND_REPLY      98
+
+#define MSG_OSD_BACKFILL_RESERVE 99
+#define MSG_OSD_RECOVERY_RESERVE 150
+#define MSG_OSD_FORCE_RECOVERY 151
+
+#define MSG_OSD_PG_PUSH        105
+#define MSG_OSD_PG_PULL        106
+#define MSG_OSD_PG_PUSH_REPLY  107
+
+#define MSG_OSD_EC_WRITE       108
+#define MSG_OSD_EC_WRITE_REPLY 109
+#define MSG_OSD_EC_READ        110
+#define MSG_OSD_EC_READ_REPLY  111
+
+#define MSG_OSD_REPOP         112
+#define MSG_OSD_REPOPREPLY    113
+#define MSG_OSD_PG_UPDATE_LOG_MISSING  114
+#define MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY  115
+
+#define MSG_OSD_PG_CREATED      116
+#define MSG_OSD_REP_SCRUBMAP    117
+#define MSG_OSD_PG_RECOVERY_DELETE 118
+#define MSG_OSD_PG_RECOVERY_DELETE_REPLY 119
+#define MSG_OSD_PG_CREATE2      120
+#define MSG_OSD_SCRUB2          121
+
+#define MSG_OSD_PG_READY_TO_MERGE 122
+
+// *** MDS ***
+
+#define MSG_MDS_BEACON             100  // to monitor
+#define MSG_MDS_SLAVE_REQUEST      101
+#define MSG_MDS_TABLE_REQUEST      102
+
+                                // 150 already in use (MSG_OSD_RECOVERY_RESERVE)
+
+#define MSG_MDS_RESOLVE            0x200
+#define MSG_MDS_RESOLVEACK         0x201
+#define MSG_MDS_CACHEREJOIN        0x202
+#define MSG_MDS_DISCOVER           0x203
+#define MSG_MDS_DISCOVERREPLY      0x204
+#define MSG_MDS_INODEUPDATE        0x205
+#define MSG_MDS_DIRUPDATE          0x206
+#define MSG_MDS_CACHEEXPIRE        0x207
+#define MSG_MDS_DENTRYUNLINK       0x208
+#define MSG_MDS_FRAGMENTNOTIFY     0x209
+#define MSG_MDS_OFFLOAD_TARGETS    0x20a
+#define MSG_MDS_DENTRYLINK         0x20c
+#define MSG_MDS_FINDINO            0x20d
+#define MSG_MDS_FINDINOREPLY       0x20e
+#define MSG_MDS_OPENINO            0x20f
+#define MSG_MDS_OPENINOREPLY       0x210
+#define MSG_MDS_SNAPUPDATE         0x211
+#define MSG_MDS_FRAGMENTNOTIFYACK  0x212
+#define MSG_MDS_LOCK               0x300
+#define MSG_MDS_INODEFILECAPS      0x301
+
+#define MSG_MDS_EXPORTDIRDISCOVER     0x449
+#define MSG_MDS_EXPORTDIRDISCOVERACK  0x450
+#define MSG_MDS_EXPORTDIRCANCEL       0x451
+#define MSG_MDS_EXPORTDIRPREP         0x452
+#define MSG_MDS_EXPORTDIRPREPACK      0x453
+#define MSG_MDS_EXPORTDIRWARNING      0x454
+#define MSG_MDS_EXPORTDIRWARNINGACK   0x455
+#define MSG_MDS_EXPORTDIR             0x456
+#define MSG_MDS_EXPORTDIRACK          0x457
+#define MSG_MDS_EXPORTDIRNOTIFY       0x458
+#define MSG_MDS_EXPORTDIRNOTIFYACK    0x459
+#define MSG_MDS_EXPORTDIRFINISH       0x460
+
+#define MSG_MDS_EXPORTCAPS            0x470
+#define MSG_MDS_EXPORTCAPSACK         0x471
+#define MSG_MDS_GATHERCAPS            0x472
+
+#define MSG_MDS_HEARTBEAT          0x500  // for mds load balancer
+
+// *** generic ***
+#define MSG_TIMECHECK             0x600
+#define MSG_MON_HEALTH            0x601
+
+// *** Message::encode() crcflags bits ***
+#define MSG_CRC_DATA           (1 << 0)
+#define MSG_CRC_HEADER         (1 << 1)
+#define MSG_CRC_ALL            (MSG_CRC_DATA | MSG_CRC_HEADER)
+
+// Xio Testing
+#define MSG_DATA_PING		  0x602
+
+// Xio intends to define messages 0x603..0x606
+
+// Special
+#define MSG_NOP                   0x607
+
+#define MSG_MON_HEALTH_CHECKS     0x608
+#define MSG_TIMECHECK2            0x609
+
+// *** ceph-mgr <-> OSD/MDS daemons ***
+#define MSG_MGR_OPEN              0x700
+#define MSG_MGR_CONFIGURE         0x701
+#define MSG_MGR_REPORT            0x702
+
+// *** ceph-mgr <-> ceph-mon ***
+#define MSG_MGR_BEACON            0x703
+
+// *** ceph-mon(MgrMonitor) -> OSD/MDS daemons ***
+#define MSG_MGR_MAP               0x704
+
+// *** ceph-mon(MgrMonitor) -> ceph-mgr
+#define MSG_MGR_DIGEST               0x705
+// *** cephmgr -> ceph-mon
+#define MSG_MON_MGR_REPORT        0x706
+#define MSG_SERVICE_MAP           0x707
+
+#define MSG_MGR_CLOSE             0x708
+
+// ======================================================
+
+// abstract Message class
+
+namespace bi = boost::intrusive;
+
+// XioMessenger conditional trace flags
+#define MSG_MAGIC_XIO          0x0002
+#define MSG_MAGIC_TRACE_XCON   0x0004
+#define MSG_MAGIC_TRACE_DTOR   0x0008
+#define MSG_MAGIC_TRACE_HDR    0x0010
+#define MSG_MAGIC_TRACE_XIO    0x0020
+#define MSG_MAGIC_TRACE_XMSGR  0x0040
+#define MSG_MAGIC_TRACE_CTR    0x0080
+
+// XioMessenger diagnostic "ping pong" flag (resend msg when send completes)
+#define MSG_MAGIC_REDUPE       0x0100
+
+// Abstract base of all messages: a RefCountedObject carrying a
+// ceph_msg_header / ceph_msg_footer pair plus three bufferlists (payload,
+// middle, data).  Concrete message types must implement decode_payload(),
+// encode_payload() and get_type_name().
+class Message : public RefCountedObject {
+protected:
+  ceph_msg_header  header;      // envelope
+  ceph_msg_footer  footer;
+  bufferlist       payload;  // "front" unaligned blob
+  bufferlist       middle;   // "middle" unaligned blob
+  bufferlist       data;     // data payload (page-alignment will be preserved where possible)
+
+  /* recv_stamp is set when the Messenger starts reading the
+   * Message off the wire */
+  utime_t recv_stamp;
+  /* dispatch_stamp is set when the Messenger starts calling dispatch() on
+   * its endpoints */
+  utime_t dispatch_stamp;
+  /* throttle_stamp is the point at which we got throttle */
+  utime_t throttle_stamp;
+  /* time at which message was fully read */
+  utime_t recv_complete_stamp;
+
+  // connection associated with this message; get_source_addr() and
+  // get_source_addrs() read the peer address(es) from it when it is set
+  ConnectionRef connection;
+
+  // presumably a bitmask of the MSG_MAGIC_* flags defined above -- TODO confirm
+  uint32_t magic = 0;
+
+  // intrusive list hook used by the Queue typedef below
+  bi::list_member_hook<> dispatch_q;
+
+public:
+  using ref = MessageRef;
+  using const_ref = MessageConstRef;
+
+  // zipkin tracing
+  ZTracer::Trace trace;
+  void encode_trace(bufferlist &bl, uint64_t features) const;
+  void decode_trace(bufferlist::const_iterator &p, bool create = false);
+
+  // Context that remembers the Message it is attached to.  ~Message() calls
+  // complete(0) on the installed hook, if any.
+  class CompletionHook : public Context {
+  protected:
+    Message *m;
+    friend class Message;
+  public:
+    explicit CompletionHook(Message *_m) : m(_m) {}
+    virtual void set_message(Message *_m) { m = _m; }
+  };
+
+  // Intrusive list of Messages linked through their dispatch_q hook.
+  typedef bi::list< Message,
+		    bi::member_hook< Message,
+				     bi::list_member_hook<>,
+				     &Message::dispatch_q > > Queue;
+
+protected:
+  CompletionHook* completion_hook = nullptr; // owned by Messenger
+
+  // release our size in bytes back to this throttler when our payload
+  // is adjusted or when we are destroyed.
+  ThrottleInterface *byte_throttler = nullptr;
+
+  // release a count back to this throttler when we are destroyed
+  ThrottleInterface *msg_throttler = nullptr;
+
+  // keep track of how big this message was when we reserved space in
+  // the msgr dispatch_throttler, so that we can properly release it
+  // later.  this is necessary because messages can enter the dispatch
+  // queue locally (not via read_message()), and those are not
+  // currently throttled.
+  uint64_t dispatch_throttle_size = 0;
+
+  friend class Messenger;
+
+public:
+  // Default constructor: zero-fills both header and footer.
+  Message() {
+    memset(&header, 0, sizeof(header));
+    memset(&footer, 0, sizeof(footer));
+  }
+  // Construct a message of type `t`: header and footer are zero-filled, then
+  // type, version and compat_version are stored in the header.
+  Message(int t, int version=1, int compat_version=0) {
+    memset(&header, 0, sizeof(header));
+    header.type = t;
+    header.version = version;
+    header.compat_version = compat_version;
+    header.priority = 0;  // undef
+    header.data_off = 0;
+    memset(&footer, 0, sizeof(footer));
+  }
+
+  // Calls RefCountedObject::get() and returns the result downcast to Message*.
+  Message *get() {
+    return static_cast<Message *>(RefCountedObject::get());
+  }
+
+protected:
+  // Protected destructor.  Hands the combined length of payload, middle and
+  // data back to the byte throttler (if set), releases the message throttle,
+  // records a trace event and finally completes the completion hook (if set).
+  ~Message() override {
+    if (byte_throttler)
+      byte_throttler->put(payload.length() + middle.length() + data.length());
+    release_message_throttle();
+    trace.event("message destructed");
+    /* call completion hooks (if any) */
+    if (completion_hook)
+      completion_hook->complete(0);
+  }
+public:
+  const ConnectionRef& get_connection() const { return connection; }
+  void set_connection(const ConnectionRef& c) {
+    connection = c;
+  }
+  CompletionHook* get_completion_hook() { return completion_hook; }
+  void set_completion_hook(CompletionHook *hook) { completion_hook = hook; }
+  void set_byte_throttler(ThrottleInterface *t) {
+    byte_throttler = t;
+  }
+  void set_message_throttler(ThrottleInterface *t) {
+    msg_throttler = t;
+  }
+
+  void set_dispatch_throttle_size(uint64_t s) { dispatch_throttle_size = s; }
+  uint64_t get_dispatch_throttle_size() const { return dispatch_throttle_size; }
+
+  // header / footer accessors (const and mutable variants)
+  const ceph_msg_header &get_header() const { return header; }
+  ceph_msg_header &get_header() { return header; }
+  void set_header(const ceph_msg_header &e) { header = e; }
+  void set_footer(const ceph_msg_footer &e) { footer = e; }
+  const ceph_msg_footer &get_footer() const { return footer; }
+  ceph_msg_footer &get_footer() { return footer; }
+  void set_src(const entity_name_t& src) { header.src = src; }
+
+  uint32_t get_magic() const { return magic; }
+  void set_magic(int _magic) { magic = _magic; }
+
+  /*
+   * If you use get_[data, middle, payload] you shouldn't
+   * use it to change those bufferlists unless you KNOW
+   * there is no throttle being used. The other
+   * functions are throttling-aware as appropriate.
+   */
+
+  // Drop both the front (payload) and middle buffers, returning their combined
+  // length to the byte throttler first when one is set.
+  void clear_payload() {
+    if (byte_throttler) {
+      byte_throttler->put(payload.length() + middle.length());
+    }
+    payload.clear();
+    middle.clear();
+  }
+
+  // Hook invoked by clear_data(); the base implementation does nothing.
+  virtual void clear_buffers() {}
+  // Drop the data buffer (returning its length to the byte throttler when one
+  // is set) and then call clear_buffers().
+  void clear_data() {
+    if (byte_throttler)
+      byte_throttler->put(data.length());
+    data.clear();
+    clear_buffers(); // let subclass drop buffers as well
+  }
+  // Return one count to the message throttler (if set) and forget it, so a
+  // second call is a no-op.
+  void release_message_throttle() {
+    if (msg_throttler)
+      msg_throttler->put();
+    msg_throttler = nullptr;
+  }
+
+  bool empty_payload() const { return payload.length() == 0; }
+  bufferlist& get_payload() { return payload; }
+  const bufferlist& get_payload() const { return payload; }
+  // Replace the front buffer with the contents claimed from `bl`.  The old
+  // length is put back to the byte throttler and the new length taken from it.
+  void set_payload(bufferlist& bl) {
+    if (byte_throttler)
+      byte_throttler->put(payload.length());
+    payload.claim(bl, buffer::list::CLAIM_ALLOW_NONSHAREABLE);
+    if (byte_throttler)
+      byte_throttler->take(payload.length());
+  }
+
+  // Same as set_payload(), but for the middle buffer.
+  void set_middle(bufferlist& bl) {
+    if (byte_throttler)
+      byte_throttler->put(middle.length());
+    middle.claim(bl, buffer::list::CLAIM_ALLOW_NONSHAREABLE);
+    if (byte_throttler)
+      byte_throttler->take(middle.length());
+  }
+  bufferlist& get_middle() { return middle; }
+
+  // Replace the data buffer via data.share(bl) (`bl` is taken by const
+  // reference), adjusting the byte throttler by the old and new lengths.
+  void set_data(const bufferlist &bl) {
+    if (byte_throttler)
+      byte_throttler->put(data.length());
+    data.share(bl);
+    if (byte_throttler)
+      byte_throttler->take(data.length());
+  }
+
+  const bufferlist& get_data() const { return data; }
+  bufferlist& get_data() { return data; }
+  // Hand the data buffer over to `bl` (bl.claim(data, flags)); the data length
+  // is returned to the byte throttler beforehand when one is set.
+  void claim_data(bufferlist& bl,
+		  unsigned int flags = buffer::list::CLAIM_DEFAULT) {
+    if (byte_throttler)
+      byte_throttler->put(data.length());
+    bl.claim(data, flags);
+  }
+  off_t get_data_len() const { return data.length(); }
+
+  // timestamp accessors; see the member comments above for their meaning
+  void set_recv_stamp(utime_t t) { recv_stamp = t; }
+  const utime_t& get_recv_stamp() const { return recv_stamp; }
+  void set_dispatch_stamp(utime_t t) { dispatch_stamp = t; }
+  const utime_t& get_dispatch_stamp() const { return dispatch_stamp; }
+  void set_throttle_stamp(utime_t t) { throttle_stamp = t; }
+  const utime_t& get_throttle_stamp() const { return throttle_stamp; }
+  void set_recv_complete_stamp(utime_t t) { recv_complete_stamp = t; }
+  const utime_t& get_recv_complete_stamp() const { return recv_complete_stamp; }
+
+  // CRC32C over the first sizeof(header) - sizeof(header.crc) bytes of the
+  // header, stored in header.crc.
+  // NOTE(review): assumes crc is the trailing field of ceph_msg_header -- confirm.
+  void calc_header_crc() {
+    header.crc = ceph_crc32c(0, (unsigned char*)&header,
+			     sizeof(header) - sizeof(header.crc));
+  }
+  // Despite the name this fills in both the front and the middle CRC.
+  void calc_front_crc() {
+    footer.front_crc = payload.crc32c(0);
+    footer.middle_crc = middle.crc32c(0);
+  }
+  void calc_data_crc() {
+    footer.data_crc = data.crc32c(0);
+  }
+
+  // Cost of this message; the base implementation is the data buffer length.
+  virtual int get_cost() const {
+    return data.length();
+  }
+
+  // type
+  int get_type() const { return header.type; }
+  void set_type(int t) { header.type = t; }
+
+  uint64_t get_tid() const { return header.tid; }
+  void set_tid(uint64_t t) { header.tid = t; }
+
+  uint64_t get_seq() const { return header.seq; }
+  void set_seq(uint64_t s) { header.seq = s; }
+
+  unsigned get_priority() const { return header.priority; }
+  void set_priority(__s16 p) { header.priority = p; }
+
+  // source/dest
+  entity_inst_t get_source_inst() const {
+    return entity_inst_t(get_source(), get_source_addr());
+  }
+  entity_name_t get_source() const {
+    return entity_name_t(header.src);
+  }
+  // Peer address of the attached connection, or a default-constructed
+  // entity_addr_t when no connection is set.
+  entity_addr_t get_source_addr() const {
+    if (connection)
+      return connection->get_peer_addr();
+    return entity_addr_t();
+  }
+  // Peer address vector of the attached connection, or a default-constructed
+  // entity_addrvec_t when no connection is set.
+  entity_addrvec_t get_source_addrs() const {
+    if (connection)
+      return connection->get_peer_addrs();
+    return entity_addrvec_t();
+  }
+
+  // forwarded?
+  // In this base class the get_orig_source*() accessors simply return the
+  // corresponding get_source*() value.
+  entity_inst_t get_orig_source_inst() const {
+    return get_source_inst();
+  }
+  entity_name_t get_orig_source() const {
+    return get_source();
+  }
+  entity_addr_t get_orig_source_addr() const {
+    return get_source_addr();
+  }
+  entity_addrvec_t get_orig_source_addrs() const {
+    return get_source_addrs();
+  }
+
+  // virtual bits
+  virtual void decode_payload() = 0;
+  virtual void encode_payload(uint64_t features) = 0;
+  virtual std::string_view get_type_name() const = 0;
+  // Default textual form: "<type name> magic: <magic>".
+  virtual void print(ostream& out) const {
+    out << get_type_name() << " magic: " << magic;
+  }
+
+  virtual void dump(Formatter *f) const;
+
+  // Defined out of line; `crcflags` takes the MSG_CRC_* bits defined above.
+  void encode(uint64_t features, int crcflags);
+};
+
+// Build a Message from an already separated header, footer and
+// front/middle/data buffers.  Only declared here; the definition is not part
+// of this header.  `crcflags` presumably takes the MSG_CRC_* bits -- confirm
+// against the definition.
+extern Message *decode_message(CephContext *cct, int crcflags,
+			       ceph_msg_header &header,
+			       ceph_msg_footer& footer, bufferlist& front,
+			       bufferlist& middle, bufferlist& data,
+			       Connection* conn);
+// Stream a message: whatever print() emits, followed by " v<version>" when
+// the header version is non-zero.
+inline ostream& operator<<(ostream& out, const Message& m) {
+  m.print(out);
+  const ceph_msg_header& hdr = m.get_header();
+  if (hdr.version) {
+    out << " v" << hdr.version;
+  }
+  return out;
+}
+
+// Whole-message (de)serialization helpers: encode_message() writes a message
+// into `bl`, and this decode_message() overload reads one back from the
+// iterator `bl`.  Both are only declared in this header.
+extern void encode_message(Message *m, uint64_t features, bufferlist& bl);
+extern Message *decode_message(CephContext *cct, int crcflags,
+                               bufferlist::const_iterator& bl);
+
+// Factory for MessageType: build() forwards its arguments to the MessageType
+// constructor and wraps the new object in MessageType::ref.  The second
+// constructor argument `false` is passed to the ref so it does not add a
+// reference of its own.
+template <class MessageType>
+class MessageFactory {
+public:
+  template<typename... Args>
+  static typename MessageType::ref build(Args&&... args) {
+    using ref_type = typename MessageType::ref;
+    auto* fresh = new MessageType(std::forward<Args>(args)...);
+    return ref_type(fresh, false);
+  }
+};
+
+// Intermediate message base: derives from M (Message by default) and provides
+// the ref / const_ref typedefs plus msgref_cast() helpers that cast an M
+// reference to a reference to T, the message type the casts target.
+//
+// Constness is preserved by the casts: a mutable M::ref yields a mutable ref
+// to T, and an M::const_ref yields a const ref to T.  This matches the
+// msgref_cast() overloads of MessageInstance.
+template<class T, class M = Message>
+class MessageSubType : public M {
+public:
+  typedef boost::intrusive_ptr<T> ref;
+  typedef boost::intrusive_ptr<T const> const_ref;
+
+  // mutable reference in -> mutable reference to T out
+  static auto msgref_cast(typename M::ref const& m) {
+    return boost::static_pointer_cast<typename T::ref::element_type, typename std::remove_reference<decltype(m)>::type::element_type>(m);
+  }
+  // const reference in -> const reference to T out (a static_cast must not
+  // drop the const qualifier of the pointee)
+  static auto msgref_cast(typename M::const_ref const& m) {
+    return boost::static_pointer_cast<typename T::const_ref::element_type, typename std::remove_reference<decltype(m)>::type::element_type>(m);
+  }
+
+protected:
+  // Constructor arguments are forwarded unchanged to the base class M.
+template<typename... Args>
+  MessageSubType(Args&&... args) : M(std::forward<Args>(args)...) {}
+  virtual ~MessageSubType() override {}
+};
+
+
+// Base for message type T (on top of MessageSubType<T, M>): adds create(),
+// which builds a T through MessageFactory<T>, and msgref_cast() overloads that
+// cast a plain Message reference to a T reference of the same constness.
+template<class T, class M = Message>
+class MessageInstance : public MessageSubType<T, M> {
+public:
+  using factory = MessageFactory<T>;
+
+  // Forward the arguments to MessageFactory<T>::build().
+  template<typename... Args>
+  static auto create(Args&&... args) {
+    return factory::build(std::forward<Args>(args)...);
+  }
+  // Message::ref -> ref to T
+  static auto msgref_cast(typename Message::ref const& m) {
+    using to_type = typename T::ref::element_type;
+    using from_type = typename Message::ref::element_type;
+    return boost::static_pointer_cast<to_type, from_type>(m);
+  }
+  // Message::const_ref -> const ref to T
+  static auto msgref_cast(typename Message::const_ref const& m) {
+    using to_type = typename T::const_ref::element_type;
+    using from_type = typename Message::const_ref::element_type;
+    return boost::static_pointer_cast<to_type, from_type>(m);
+  }
+
+protected:
+  // Constructor arguments are forwarded unchanged to MessageSubType<T,M>.
+  template<typename... Args>
+  MessageInstance(Args&&... args) : MessageSubType<T,M>(std::forward<Args>(args)...) {}
+  ~MessageInstance() override {}
+};
+
+#endif
diff --git a/src/msg/MessageRef.h b/src/msg/MessageRef.h
new file mode 100644
index 00000000..c2bd3152
--- /dev/null
+++ b/src/msg/MessageRef.h
@@ -0,0 +1,186 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2018 Red Hat, Inc. <contact@redhat.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MESSAGEREF_H
+#define CEPH_MESSAGEREF_H
+ 
+#include <boost/intrusive_ptr.hpp>
+
+// boost::intrusive_ptr to a mutable message of type T.
+template<typename T>
+using MRef = boost::intrusive_ptr<T>;
+// boost::intrusive_ptr to a const message of type T.
+template<typename T>
+using MConstRef = boost::intrusive_ptr<T const>;
+
+// References to the base Message class; `class Message` doubles as a forward
+// declaration here, so Message.h does not have to be included.
+using MessageRef = MRef<class Message>;
+using MessageConstRef = MConstRef<class Message>;
+
+/* cd src/messages/ && for f in *; do printf 'class '; basename "$f" .h | tr -d '\n'; printf ';\n'; done >> ../msg/MessageRef.h */
+
+class MAuth;
+class MAuthReply;
+class MBackfillReserve;
+class MCacheExpire;
+class MClientCapRelease;
+class MClientCaps;
+class MClientLease;
+class MClientQuota;
+class MClientReclaim;
+class MClientReclaimReply;
+class MClientReconnect;
+class MClientReply;
+class MClientRequestForward;
+class MClientRequest;
+class MClientSession;
+class MClientSnap;
+class MCommand;
+class MCommandReply;
+class MConfig;
+class MDataPing;
+class MDentryLink;
+class MDentryUnlink;
+class MDirUpdate;
+class MDiscover;
+class MDiscoverReply;
+class MExportCapsAck;
+class MExportCaps;
+class MExportDirAck;
+class MExportDirCancel;
+class MExportDirDiscoverAck;
+class MExportDirDiscover;
+class MExportDirFinish;
+class MExportDir;
+class MExportDirNotifyAck;
+class MExportDirNotify;
+class MExportDirPrepAck;
+class MExportDirPrep;
+class MForward;
+class MFSMap;
+class MFSMapUser;
+class MGatherCaps;
+class MGenericMessage;
+class MGetConfig;
+class MGetPoolStats;
+class MGetPoolStatsReply;
+class MHeartbeat;
+class MInodeFileCaps;
+class MLock;
+class MLogAck;
+class MLog;
+class MMDSBeacon;
+class MMDSCacheRejoin;
+class MMDSFindIno;
+class MMDSFindInoReply;
+class MMDSFragmentNotifyAck;
+class MMDSFragmentNotify;
+class MMDSLoadTargets;
+class MMDSMap;
+class MMDSOpenIno;
+class MMDSOpenInoReply;
+class MMDSResolveAck;
+class MMDSResolve;
+class MMDSSlaveRequest;
+class MMDSSnapUpdate;
+class MMDSTableRequest;
+class MMgrBeacon;
+class MMgrClose;
+class MMgrConfigure;
+class MMgrDigest;
+class MMgrMap;
+class MMgrOpen;
+class MMgrReport;
+class MMonCommandAck;
+class MMonCommand;
+class MMonElection;
+class MMonGetMap;
+class MMonGetOSDMap;
+class MMonGetVersion;
+class MMonGetVersionReply;
+class MMonGlobalID;
+class MMonHealthChecks;
+class MMonHealth;
+class MMonJoin;
+class MMonMap;
+class MMonMetadata;
+class MMonMgrReport;
+class MMonPaxos;
+class MMonProbe;
+class MMonQuorumService;
+class MMonScrub;
+class MMonSubscribeAck;
+class MMonSubscribe;
+class MMonSync;
+class MNop;
+class MOSDAlive;
+class MOSDBackoff;
+class MOSDBeacon;
+class MOSDBoot;
+class MOSDECSubOpRead;
+class MOSDECSubOpReadReply;
+class MOSDECSubOpWrite;
+class MOSDECSubOpWriteReply;
+class MOSDFailure;
+class MOSDFastDispatchOp;
+class MOSDForceRecovery;
+class MOSDFull;
+class MOSDMap;
+class MOSDMarkMeDown;
+class MOSDOp;
+class MOSDOpReply;
+class MOSDPeeringOp;
+class MOSDPGBackfill;
+class MOSDPGBackfillRemove;
+class MOSDPGCreate2;
+class MOSDPGCreated;
+class MOSDPGCreate;
+class MOSDPGInfo;
+class MOSDPGLog;
+class MOSDPGNotify;
+class MOSDPGPull;
+class MOSDPGPush;
+class MOSDPGPushReply;
+class MOSDPGQuery;
+class MOSDPGReadyToMerge;
+class MOSDPGRecoveryDelete;
+class MOSDPGRecoveryDeleteReply;
+class MOSDPGRemove;
+class MOSDPGScan;
+class MOSDPGTemp;
+class MOSDPGTrim;
+class MOSDPGUpdateLogMissing;
+class MOSDPGUpdateLogMissingReply;
+class MOSDPing;
+class MOSDRepOp;
+class MOSDRepOpReply;
+class MOSDRepScrub;
+class MOSDRepScrubMap;
+class MOSDScrub2;
+class MOSDScrub;
+class MOSDScrubReserve;
+class MPGStatsAck;
+class MPGStats;
+class MPing;
+class MPoolOp;
+class MPoolOpReply;
+class MRecoveryReserve;
+class MRemoveSnaps;
+class MRoute;
+class MServiceMap;
+class MStatfs;
+class MStatfsReply;
+class MTimeCheck2;
+class MTimeCheck;
+class MWatchNotify;
+class PaxosServiceMessage;
+
+#endif
diff --git a/src/msg/Messenger.cc b/src/msg/Messenger.cc
new file mode 100644
index 00000000..efeab390
--- /dev/null
+++ b/src/msg/Messenger.cc
@@ -0,0 +1,166 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <netdb.h>
+
+#include "include/types.h"
+#include "include/random.h"
+
+#include "Messenger.h"
+
+#include "msg/simple/SimpleMessenger.h"
+#include "msg/async/AsyncMessenger.h"
+#ifdef HAVE_XIO
+#include "msg/xio/XioMessenger.h"
+#endif
+
+/**
+ * Create a Messenger for a client.
+ *
+ * The messenger type is ms_public_type when that option is non-empty and the
+ * "ms_type" option otherwise.  The entity name is entity_name_t::CLIENT(),
+ * the nonce is a freshly generated random 64-bit number and cflags is 0.
+ */
+Messenger *Messenger::create_client_messenger(CephContext *cct, string lname)
+{
+  std::string msgr_type;
+  if (cct->_conf->ms_public_type.empty()) {
+    msgr_type = cct->_conf.get_val<std::string>("ms_type");
+  } else {
+    msgr_type = cct->_conf->ms_public_type;
+  }
+
+  const uint64_t nonce = ceph::util::generate_random_number<uint64_t>();
+  return Messenger::create(cct, msgr_type, entity_name_t::CLIENT(),
+			   std::move(lname), nonce, 0);
+}
+
+/**
+ * Instantiate the Messenger implementation selected by @p type.
+ *
+ * "simple" selects SimpleMessenger, any type containing "async" selects
+ * AsyncMessenger, and "random" picks one of those two at random.  With
+ * HAVE_XIO, "xio" selects XioMessenger when the "ms-type-xio" experimental
+ * feature is enabled.  Any other type is logged as an error.
+ *
+ * @return the new messenger, or nullptr when @p type is not recognized
+ */
+Messenger *Messenger::create(CephContext *cct, const string &type,
+			     entity_name_t name, string lname,
+			     uint64_t nonce, uint64_t cflags)
+{
+  // 0 -> simple, 1 -> async, -1 -> no random choice requested
+  const int coin = (type == "random")
+    ? ceph::util::generate_random_number(0, 1)
+    : -1;
+
+  if (coin == 0 || type == "simple") {
+    return new SimpleMessenger(cct, name, std::move(lname), nonce);
+  }
+  if (coin == 1 || type.find("async") != std::string::npos) {
+    return new AsyncMessenger(cct, name, type, std::move(lname), nonce);
+  }
+#ifdef HAVE_XIO
+  if ((type == "xio") &&
+      cct->check_experimental_feature_enabled("ms-type-xio")) {
+    return new XioMessenger(cct, name, std::move(lname), nonce, cflags);
+  }
+#endif
+
+  lderr(cct) << "unrecognized ms_type '" << type << "'" << dendl;
+  return nullptr;
+}
+
+/**
+ * Get the default crc flags for this messenger.
+ * (Forward declaration; the definition appears later in this file.)
+ */
+static int get_default_crc_flags(const ConfigProxy&);
+
+/**
+ * Initialize the state shared by all Messenger implementations.
+ *
+ * @param cct_ context stored in cct; also the source of the configuration
+ *             used to compute crcflags and passed to the auth registry
+ * @param w    entity name stored in my_name
+ */
+Messenger::Messenger(CephContext *cct_, entity_name_t w)
+  : trace_endpoint("0.0.0.0", 0, "Messenger"),
+    my_name(w),
+    default_send_priority(CEPH_MSG_PRIO_DEFAULT),
+    started(false),
+    magic(0),
+    socket_priority(-1),
+    cct(cct_),
+    // NOTE(review): this reads the member `cct`, not the parameter `cct_`, so
+    // it relies on cct being declared before crcflags in the class -- confirm.
+    crcflags(get_default_crc_flags(cct->_conf)),
+    auth_registry(cct)
+{
+  // refresh the auth registry's configuration once construction is complete
+  auth_registry.refresh_config();
+}
+
+/**
+ * Update the tracing endpoint from address @p a.
+ *
+ * For AF_INET and AF_INET6 addresses the numeric host string is obtained
+ * with getnameinfo(NI_NUMERICHOST) and copied into trace_endpoint; for any
+ * other family the IP is left alone.  The port is always set.  The @p name
+ * argument is not used here.
+ */
+void Messenger::set_endpoint_addr(const entity_addr_t& a,
+                                  const entity_name_t &name)
+{
+  // size of the sockaddr matching the address family; 0 means "unknown family"
+  size_t hostlen = 0;
+  switch (a.get_family()) {
+  case AF_INET:
+    hostlen = sizeof(struct sockaddr_in);
+    break;
+  case AF_INET6:
+    hostlen = sizeof(struct sockaddr_in6);
+    break;
+  default:
+    break;
+  }
+
+  if (hostlen != 0) {
+    // zero-filled, so a failed lookup leaves an empty string behind
+    char buf[NI_MAXHOST] = { 0 };
+    getnameinfo(a.get_sockaddr(), hostlen, buf, sizeof(buf),
+                NULL, 0, NI_NUMERICHOST);
+
+    trace_endpoint.copy_ip(buf);
+  }
+  trace_endpoint.set_port(a.get_port());
+}
+
+/**
+ * Get the default crc flags for this messenger.
+ *
+ * Pre-calculate desired software CRC settings.  CRC computation may
+ * be disabled by default for some transports (e.g., those with strong
+ * hardware checksum support).
+ *
+ * @param conf configuration to read ms_crc_data and ms_crc_header from
+ * @return MSG_CRC_DATA and/or MSG_CRC_HEADER or'ed together, 0 if both
+ *         options are off
+ */
+int get_default_crc_flags(const ConfigProxy& conf)
+{
+  const int data_bit = conf->ms_crc_data ? MSG_CRC_DATA : 0;
+  const int header_bit = conf->ms_crc_header ? MSG_CRC_HEADER : 0;
+  return data_bit | header_bit;
+}
+
+/**
+ * Default address-vector bind: binds to the legacy address of @p addrs by
+ * delegating to bind().
+ */
+int Messenger::bindv(const entity_addrvec_t& addrs)
+{
+  const auto legacy = addrs.legacy_addr();
+  return bind(legacy);
+}
+
+/**
+ * Verify a peer's authorizer with the help of the registered dispatchers.
+ *
+ * The verdict is stored in @p isvalid; the return value is separate from it
+ * (see the individual return statements below).
+ *
+ * @param con               connection whose peer_name, peer_global_id and
+ *                          peer_caps_info are handed to the verifier
+ * @param peer_type         entity type of the peer
+ * @param protocol          auth protocol, used to look up the handler
+ * @param authorizer        authorizer blob received from the peer
+ * @param authorizer_reply  passed to the handler for its reply
+ * @param isvalid           out: result of the verification
+ * @param session_key       passed through to the handler
+ * @param connection_secret passed through to the handler
+ * @param challenge         passed through to the handler
+ */
+bool Messenger::ms_deliver_verify_authorizer(
+  Connection *con,
+  int peer_type,
+  int protocol,
+  bufferlist& authorizer,
+  bufferlist& authorizer_reply,
+  bool& isvalid,
+  CryptoKey& session_key,
+  std::string *connection_secret,
+  std::unique_ptr<AuthAuthorizerChallenge> *challenge)
+{
+  // An empty authorizer is accepted as soon as one dispatcher does not
+  // require an authorizer.
+  if (authorizer.length() == 0) {
+    for (auto dis : dispatchers) {
+      if (!dis->require_authorizer) {
+	//ldout(cct,10) << __func__ << " tolerating missing authorizer" << dendl;
+	isvalid = true;
+	return true;
+      }
+    }
+  }
+  AuthAuthorizeHandler *ah = auth_registry.get_handler(peer_type, protocol);
+  // This check runs before the null test on `ah`, so a monitor accepts
+  // non-monitor peers even when no handler was found.
+  if (get_mytype() == CEPH_ENTITY_TYPE_MON &&
+      peer_type != CEPH_ENTITY_TYPE_MON) {
+    // the monitor doesn't do authenticators for msgr1.
+    isvalid = true;
+    return true;
+  }
+  if (!ah) {
+    lderr(cct) << __func__ << " no AuthAuthorizeHandler found for protocol "
+	       << protocol << dendl;
+    isvalid = false;
+    return false;
+  }
+
+  // Only the first dispatcher that supplies a keystore is consulted.
+  for (auto dis : dispatchers) {
+    KeyStore *ks = dis->ms_get_auth1_authorizer_keystore();
+    if (ks) {
+      isvalid = ah->verify_authorizer(
+	cct,
+	ks,
+	authorizer,
+	0,
+	&authorizer_reply,
+	&con->peer_name,
+	&con->peer_global_id,
+	&con->peer_caps_info,
+	&session_key,
+	connection_secret,
+	challenge);
+      if (isvalid) {
+	// valid: succeed only if the dispatcher's handler reports >= 0
+	return dis->ms_handle_authentication(con)>=0;
+      }
+      // verification ran but failed: isvalid is false, yet we return true
+      return true;
+    }
+  }
+  // no dispatcher offered a keystore; isvalid is left untouched
+  return false;
+}
diff --git a/src/msg/Messenger.h b/src/msg/Messenger.h
new file mode 100644
index 00000000..2602765c
--- /dev/null
+++ b/src/msg/Messenger.h
@@ -0,0 +1,837 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+
+#ifndef CEPH_MESSENGER_H
+#define CEPH_MESSENGER_H
+
+#include <map>
+#include <deque>
+
+#include <errno.h>
+#include <sstream>
+#include <memory>
+
+#include "Message.h"
+#include "Dispatcher.h"
+#include "Policy.h"
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "common/Throttle.h"
+#include "include/Context.h"
+#include "include/types.h"
+#include "include/ceph_features.h"
+#include "auth/Crypto.h"
+#include "common/item_history.h"
+#include "auth/AuthRegistry.h"
+#include "include/ceph_assert.h"
+
+#include <errno.h>
+#include <sstream>
+#include <signal.h>
+
+#define SOCKET_PRIORITY_MIN_DELAY 6
+
+class Timer;
+
+class AuthClient;
+class AuthServer;
+
+#ifdef UNIT_TESTS_BUILT
+
+/// Abstract hook compiled only when UNIT_TESTS_BUILT is defined.  A
+/// Messenger carries an optional pointer to one of these.
+struct Interceptor {
+  std::mutex lock;
+  std::condition_variable cond_var;
+
+  /// Verdict returned by intercept().
+  enum ACTION : uint32_t {
+    CONTINUE = 0,
+    FAIL,
+    STOP
+  };
+
+  virtual ~Interceptor() = default;
+
+  /// Called with a connection and a step identifier; implementations pick
+  /// the ACTION to return.  NOTE(review): the meaning of the step values
+  /// is defined by the callers, not here.
+  virtual ACTION intercept(Connection *conn, uint32_t step) = 0;
+};
+
+#endif
+
+class Messenger {
+private:
+  std::deque<Dispatcher*> dispatchers;
+  std::deque<Dispatcher*> fast_dispatchers;
+  ZTracer::Endpoint trace_endpoint;
+
+protected:
+  void set_endpoint_addr(const entity_addr_t& a,
+                         const entity_name_t &name);
+
+protected:
+  /// the "name" of the local daemon. eg client.99
+  entity_name_t my_name;
+
+  /// my addr
+  safe_item_history<entity_addrvec_t> my_addrs;
+
+  int default_send_priority;
+  /// set to true once the Messenger has started, and set to false on shutdown
+  bool started;
+  uint32_t magic;
+  int socket_priority;
+
+public:
+  AuthClient *auth_client = 0;
+  AuthServer *auth_server = 0;
+
+#ifdef UNIT_TESTS_BUILT
+  Interceptor *interceptor = nullptr;
+#endif
+
+  /**
+   * Various Messenger conditional config/type flags to allow
+   * different "transport" Messengers to tune themselves
+   */
+  static const int HAS_HEAVY_TRAFFIC    = 0x0001;
+  static const int HAS_MANY_CONNECTIONS = 0x0002;
+  static const int HEARTBEAT            = 0x0004;
+
+  /**
+   *  The CephContext this Messenger uses. Many other components initialize themselves
+   *  from this value.
+   */
+  CephContext *cct;
+  int crcflags;
+
+  using Policy = ceph::net::Policy<Throttle>;
+
+protected:
+  // for authentication
+  AuthRegistry auth_registry;
+
+public:
+  /**
+   * Messenger constructor. Call this from your implementation.
+   * Messenger users should construct full implementations directly,
+   * or use the create() function.
+   */
+  Messenger(CephContext *cct_, entity_name_t w);
+  virtual ~Messenger() {}
+
+  /**
+   * create a new messenger
+   *
+   * Create a new messenger instance, with whatever implementation is
+   * available or specified via the configuration in cct.
+   *
+   * @param cct context
+   * @param type name of messenger type
+   * @param name entity name to register
+   * @param lname logical name of the messenger in this process (e.g., "client")
+   * @param nonce nonce value to uniquely identify this instance on the current host
+   * @param features bits for the local connection
+   * @param cflags general set of flags to configure transport resources
+   */
+  static Messenger *create(CephContext *cct,
+                           const string &type,
+                           entity_name_t name,
+			   string lname,
+                           uint64_t nonce,
+			   uint64_t cflags);
+
+  /**
+   * create a new messenger
+   *
+   * Create a new messenger instance.
+   * Same as the above, but a slightly simpler interface for clients:
+   * - Generate a random nonce
+   * - use the default feature bits
+   * - get the messenger type from cct
+   * - use the client entity_type
+   *
+   * @param cct context
+   * @param lname logical name of the messenger in this process (e.g., "client")
+   */
+  static Messenger *create_client_messenger(CephContext *cct, string lname);
+
+  /**
+   * @defgroup Accessors
+   * @{
+   */
+  int get_mytype() const { return my_name.type(); }
+
+  /**
+   * Retrieve the Messenger's name
+   *
+   * @return A const reference to the name this Messenger
+   * currently believes to be its own.
+   */
+  const entity_name_t& get_myname() { return my_name; }
+
+  /**
+   * Retrieve the Messenger's address.
+   *
+   * @return A const reference to the address this Messenger
+   * currently believes to be its own.
+   */
+  const entity_addrvec_t& get_myaddrs() {
+    return *my_addrs;
+  }
+
+  /**
+   * get legacy addr for myself, suitable for protocol v1
+   *
+   * Note that myaddrs might be a proper addrvec with v1 in it, or it might be an
+   * ANY addr (if i am a pure client).
+   */
+  entity_addr_t get_myaddr_legacy() {
+    return my_addrs->as_legacy_addr();
+  }
+
+
+  /**
+   * Get / set the messenger's magic value.
+   */
+  uint32_t get_magic() { return magic; }
+  void set_magic(int _magic) { magic = _magic; }
+
+  void set_auth_client(AuthClient *ac) {
+    auth_client = ac;
+  }
+  void set_auth_server(AuthServer *as) {
+    auth_server = as;
+  }
+
+protected:
+  /**
+   * set messenger's address
+   */
+  virtual void set_myaddrs(const entity_addrvec_t& a) {
+    my_addrs = a;
+    set_endpoint_addr(a.front(), my_name);
+  }
+public:
+  /**
+   * @return the zipkin trace endpoint
+   */
+  const ZTracer::Endpoint* get_trace_endpoint() const {
+    return &trace_endpoint;
+  }
+
+  /**
+   * Set the name of the local entity. The name is reported to others and
+   * can be changed while the system is running, but doing so at incorrect
+   * times may have bad results.
+   *
+   * @param m The name to set.
+   */
+  void set_myname(const entity_name_t& m) { my_name = m; }
+
+  /**
+   * Set the unknown address components for this Messenger.
+   * This is useful if the Messenger doesn't know its full address just by
+   * binding, but another Messenger on the same interface has already learned
+   * its full address. This function does not fill in known address elements,
+   * cause a rebind, or do anything of that sort.
+   *
+   * @param addrs The addresses to use as a template.
+   */
+  virtual bool set_addr_unknowns(const entity_addrvec_t &addrs) = 0;
+  /**
+   * Set the address for this Messenger. This is useful if the Messenger
+   * binds to a specific address but advertises a different address on
+   * the network.
+   *
+   * @param addr The address to use.
+   */
+  virtual void set_addrs(const entity_addrvec_t &addr) = 0;
+  /// Get the default send priority.
+  int get_default_send_priority() { return default_send_priority; }
+  /**
+   * Get the number of Messages which the Messenger has received
+   * but not yet dispatched.
+   */
+  virtual int get_dispatch_queue_len() = 0;
+
+  /**
+   * Get age of oldest undelivered message
+   * (0 if the queue is empty)
+   */
+  virtual double get_dispatch_queue_max_age(utime_t now) = 0;
+
+  /**
+   * @} // Accessors
+   */
+
+  /**
+   * @defgroup Configuration
+   * @{
+   */
+  /**
+   * Set the cluster protocol in use by this daemon.
+   * This is an init-time function and cannot be called after calling
+   * start() or bind().
+   *
+   * @param p The cluster protocol to use. Defined externally.
+   */
+  virtual void set_cluster_protocol(int p) = 0;
+  /**
+   * Set a policy which is applied to all peers who do not have a type-specific
+   * Policy.
+   * This is an init-time function and cannot be called after calling
+   * start() or bind().
+   *
+   * @param p The Policy to apply.
+   */
+  virtual void set_default_policy(Policy p) = 0;
+  /**
+   * Set a policy which is applied to all peers of the given type.
+   * This is an init-time function and cannot be called after calling
+   * start() or bind().
+   *
+   * @param type The peer type this policy applies to.
+   * @param p The policy to apply.
+   */
+  virtual void set_policy(int type, Policy p) = 0;
+  /**
+   * Get the Policy associated with a type of peer.
+   *
+   * This can be called either on initial setup, or after connections
+   * are already established.  However, the policies for existing
+   * connections will not be affected; the new policy will only apply
+   * to future connections.
+   *
+   * @param t The peer type to get the default policy for.
+   * @return The Policy for peer type @p t, returned by value.
+   */
+  virtual Policy get_policy(int t) = 0;
+  /**
+   * Get the default Policy
+   *
+   * @return The default Policy, returned by value.
+   */
+  virtual Policy get_default_policy() = 0;
+  /**
+   * Set Throttlers applied to all Messages from the given type of peer
+   *
+   * This is an init-time function and cannot be called after calling
+   * start() or bind().
+   *
+   * @param type The peer type the Throttlers will apply to.
+   * @param bytes The Throttle for the number of bytes carried by the message
+   * @param msgs The Throttle for the number of messages for this @p type
+   * @note The Messenger does not take ownership of the Throttle pointers, but
+   * you must not destroy them before you destroy the Messenger.
+   */
+  virtual void set_policy_throttlers(int type, Throttle *bytes, Throttle *msgs=NULL) = 0;
+  /**
+   * Set the default send priority
+   *
+   * This is an init-time function and must be called *before* calling
+   * start().
+   *
+   * @param p The new default send priority.
+   */
+  void set_default_send_priority(int p) {
+    ceph_assert(!started);
+    default_send_priority = p;
+  }
+  /**
+   * Set the priority(SO_PRIORITY) for all packets to be sent on this socket.
+   *
+   * Linux uses this value to order the networking queues: packets with a higher
+   * priority may be processed first depending on the selected device queueing
+   * discipline.
+   *
+   * @param prio The priority. Setting a priority outside the range 0 to 6
+   * requires the CAP_NET_ADMIN capability.
+   */
+  void set_socket_priority(int prio) {
+    socket_priority = prio;
+  }
+  /**
+   * Get the socket priority
+   *
+   * @return the socket priority
+   */
+  int get_socket_priority() {
+    return socket_priority;
+  }
+  /**
+   * Add a new Dispatcher to the front of the list. If you add
+   * a Dispatcher which is already included, it will get a duplicate
+   * entry. This will reduce efficiency but not break anything.
+   *
+   * @param d The Dispatcher to insert into the list.
+   */
+  void add_dispatcher_head(Dispatcher *d) {
+    // Sample emptiness before inserting: ready() is invoked only when the
+    // dispatcher list was empty beforehand.
+    const bool was_empty = dispatchers.empty();
+    dispatchers.push_front(d);
+    // Dispatchers that can fast-dispatch anything are additionally kept in
+    // the fast list, at the same end.
+    if (d->ms_can_fast_dispatch_any()) {
+      fast_dispatchers.push_front(d);
+    }
+    if (was_empty) {
+      ready();
+    }
+  }
+  /**
+   * Add a new Dispatcher to the end of the list. If you add
+   * a Dispatcher which is already included, it will get a duplicate
+   * entry. This will reduce efficiency but not break anything.
+   *
+   * @param d The Dispatcher to insert into the list.
+   */
+  void add_dispatcher_tail(Dispatcher *d) {
+    // Sample emptiness before inserting: ready() is invoked only when the
+    // dispatcher list was empty beforehand.
+    const bool was_empty = dispatchers.empty();
+    dispatchers.push_back(d);
+    // Dispatchers that can fast-dispatch anything are additionally kept in
+    // the fast list, at the same end.
+    if (d->ms_can_fast_dispatch_any()) {
+      fast_dispatchers.push_back(d);
+    }
+    if (was_empty) {
+      ready();
+    }
+  }
+  /**
+   * Bind the Messenger to a specific address. If bind_addr
+   * is not completely filled in the system will use the
+   * valid portions and cycle through the unset ones (eg, the port)
+   * in an unspecified order.
+   *
+   * @param bind_addr The address to bind to.
+   * @return 0 on success, or -1 on error, or -errno if
+   * we can be more specific about the failure.
+   */
+  virtual int bind(const entity_addr_t& bind_addr) = 0;
+
+  /**
+   * This function performs a full restart of the Messenger component,
+   * whatever that means.  Other entities who connect to this
+   * Messenger post-rebind() should perceive it as a new entity which
+   * they have not previously contacted, and it MUST bind to a
+   * different address than it did previously.
+   *
+   * @param avoid_ports Additional ports to avoid binding to.
+   * @return -EOPNOTSUPP unless a subclass overrides this method.
+   */
+  virtual int rebind(const set<int>& avoid_ports) { return -EOPNOTSUPP; }
+  /**
+   * Bind the 'client' Messenger to a specific address. The Messenger will
+   * bind to this address before connecting to others when the option
+   * ms_bind_before_connect is true.
+   *
+   * @param bind_addr The address to bind to.
+   * @return 0 on success, or -1 on error, or -errno if
+   * we can be more specific about the failure.
+   */
+  virtual int client_bind(const entity_addr_t& bind_addr) = 0;
+
+  virtual int bindv(const entity_addrvec_t& addrs);
+
+
+  virtual bool should_use_msgr2() {
+    return false;
+  }
+
+  /**
+   * @} // Configuration
+   */
+
+  /**
+   * @defgroup Startup/Shutdown
+   * @{
+   */
+  /**
+   * Perform any resource allocation, thread startup, etc
+   * that is required before attempting to connect to other
+   * Messengers or transmit messages.
+   * Once this function completes, started shall be set to true.
+   *
+   * @return 0 on success; -errno on failure.
+   */
+  virtual int start() { started = true; return 0; }
+
+  // shutdown
+  /**
+   * Block until the Messenger has finished shutting down (according
+   * to the shutdown() function).
+   * It is valid to call this after calling shutdown(), but it must
+   * be called before deleting the Messenger.
+   */
+  virtual void wait() = 0;
+  /**
+   * Initiate a shutdown of the Messenger.
+   *
+   * @return 0 on success, -errno otherwise.
+   */
+  virtual int shutdown() { started = false; return 0; }
+  /**
+   * @} // Startup/Shutdown
+   */
+
+  /**
+   * @defgroup Messaging
+   * @{
+   */
+  /**
+   * Queue the given Message for the given entity.
+   * Success in this function does not guarantee Message delivery, only
+   * success in queueing the Message. Other guarantees may be provided based
+   * on the Connection policy associated with the dest.
+   *
+   * @param m The Message to send. The Messenger consumes a single reference
+   * when you pass it in.
+   * @param type The entity type of the destination.
+   * @param addr The address(es) of the entity to send the Message to.
+   *
+   * DEPRECATED: please do not use this interface for any new code;
+   * use the Connection* variant.
+   *
+   * @return 0 on success, or -errno on failure.
+   */
+  virtual int send_to(
+    Message *m,
+    int type,
+    const entity_addrvec_t& addr) = 0;
+  int send_to_mon(
+    Message *m, const entity_addrvec_t& addrs) {
+    return send_to(m, CEPH_ENTITY_TYPE_MON, addrs);
+  }
+  int send_to_mds(
+    Message *m, const entity_addrvec_t& addrs) {
+    return send_to(m, CEPH_ENTITY_TYPE_MDS, addrs);
+  }
+  int send_to_osd(
+    Message *m, const entity_addrvec_t& addrs) {
+    return send_to(m, CEPH_ENTITY_TYPE_OSD, addrs);
+  }
+  int send_to_mgr(
+    Message *m, const entity_addrvec_t& addrs) {
+    return send_to(m, CEPH_ENTITY_TYPE_MGR, addrs);
+  }
+
+  /**
+   * @} // Messaging
+   */
+  /**
+   * @defgroup Connection Management
+   * @{
+   */
+  /**
+   * Get the Connection object associated with a given entity. If a
+   * Connection does not exist, create one and establish a logical connection.
+   * The caller owns a reference when this returns. Call ->put() when you're
+   * done!
+   *
+   * @param dest The entity to get a connection for.
+   */
+  virtual ConnectionRef connect_to(
+    int type, const entity_addrvec_t& dest) = 0;
+  ConnectionRef connect_to_mon(const entity_addrvec_t& dest) {
+    return connect_to(CEPH_ENTITY_TYPE_MON, dest);
+  }
+  ConnectionRef connect_to_mds(const entity_addrvec_t& dest) {
+    return connect_to(CEPH_ENTITY_TYPE_MDS, dest);
+  }
+  ConnectionRef connect_to_osd(const entity_addrvec_t& dest) {
+    return connect_to(CEPH_ENTITY_TYPE_OSD, dest);
+  }
+  ConnectionRef connect_to_mgr(const entity_addrvec_t& dest) {
+    return connect_to(CEPH_ENTITY_TYPE_MGR, dest);
+  }
+
+  /**
+   * Get the Connection object associated with ourselves.
+   */
+  virtual ConnectionRef get_loopback_connection() = 0;
+  /**
+   * Mark down a Connection to a remote.
+   *
+   * This will cause us to discard our outgoing queue for them, and if
+   * reset detection is enabled in the policy and the endpoint tries
+   * to reconnect they will discard their queue when we inform them of
+   * the session reset.
+   *
+   * If there is no Connection to the given dest, it is a no-op.
+   *
+   * This generates a RESET notification to the Dispatcher.
+   *
+   * DEPRECATED: please do not use this interface for any new code;
+   * use the Connection* variant.
+   *
+   * @param a The address to mark down.
+   */
+  virtual void mark_down(const entity_addr_t& a) = 0;
+  virtual void mark_down_addrs(const entity_addrvec_t& a) {
+    mark_down(a.legacy_addr());
+  }
+  /**
+   * Mark all the existing Connections down. This is equivalent
+   * to iterating over all Connections and calling mark_down()
+   * on each.
+   *
+   * This will generate a RESET event for each closed connection.
+   */
+  virtual void mark_down_all() = 0;
+  /**
+   * @} // Connection Management
+   */
+protected:
+  /**
+   * @defgroup Subclass Interfacing
+   * @{
+   */
+  /**
+   * A courtesy function for Messenger implementations which
+   * will be called when we receive our first Dispatcher.
+   */
+  virtual void ready() { }
+  /**
+   * @} // Subclass Interfacing
+   */
+public:
+#ifdef CEPH_USE_SIGPIPE_BLOCKER
+  /**
+   * We need to disable SIGPIPE on all platforms, and if they
+   * don't give us a better mechanism (read: are on Solaris) that
+   * means blocking the signal whenever we do a send or sendmsg...
+   * That means any implementations must invoke MSGR_SIGPIPE_STOPPER in-scope
+   * whenever doing so. On most systems that's blank, but on systems where
+   * it's needed we construct an RAII object to plug and un-plug the SIGPIPE.
+   * See http://www.microhowto.info/howto/ignore_sigpipe_without_affecting_other_threads_in_a_process.html
+   */
+  /// Scope guard around a send: the constructor blocks SIGPIPE via
+  /// pthread_sigmask() unless one is already pending, and the destructor
+  /// consumes any SIGPIPE raised meanwhile before restoring the saved mask.
+  struct sigpipe_stopper {
+    bool blocked;            ///< true if the constructor blocked SIGPIPE
+    sigset_t existing_mask;  ///< mask to restore; only valid when blocked
+    sigset_t pipe_mask;      ///< signal set containing just SIGPIPE
+    sigpipe_stopper() {
+      sigemptyset(&pipe_mask);
+      sigaddset(&pipe_mask, SIGPIPE);
+      sigset_t signals;
+      sigemptyset(&signals);
+      sigpending(&signals);
+      if (sigismember(&signals, SIGPIPE)) {
+	// A SIGPIPE is already pending: leave the mask alone so the
+	// destructor does not swallow a signal that predates this guard.
+	blocked = false;
+      } else {
+	blocked = true;
+	int r = pthread_sigmask(SIG_BLOCK, &pipe_mask, &existing_mask);
+	ceph_assert(r == 0);
+      }
+    }
+    // Copying would restore the saved mask more than once.
+    sigpipe_stopper(const sigpipe_stopper&) = delete;
+    sigpipe_stopper& operator=(const sigpipe_stopper&) = delete;
+    ~sigpipe_stopper() {
+      if (blocked) {
+	struct timespec nowait{0};
+	// sigtimedwait() returns the signal number when it consumed a
+	// pending signal, or -1 with errno == EAGAIN when none was pending
+	// within the (zero) timeout; it never returns 0 or EAGAIN itself.
+	int r = sigtimedwait(&pipe_mask, 0, &nowait);
+	ceph_assert(r == SIGPIPE || (r == -1 && errno == EAGAIN));
+	r = pthread_sigmask(SIG_SETMASK, &existing_mask, 0);
+	ceph_assert(r == 0);
+      }
+    }
+  };
+  // Note: no "()" after the variable name.  "sigpipe_stopper stopper();"
+  // would declare a function and never construct the guard.
+#  define MSGR_SIGPIPE_STOPPER Messenger::sigpipe_stopper stopper;
+#else
+#  define MSGR_SIGPIPE_STOPPER
+#endif
+  /**
+   * @defgroup Dispatcher Interfacing
+   * @{
+   */
+  /**
+   * Determine whether a message can be fast-dispatched. We will
+   * query each Dispatcher in sequence to determine if they are
+   * capable of handling a particular message via "fast dispatch".
+   *
+   * @param m The Message we are testing.
+   */
+  bool ms_can_fast_dispatch(const Message::const_ref& m) {
+    // One willing fast dispatcher is enough; stop at the first.
+    for (Dispatcher *d : fast_dispatchers) {
+      if (d->ms_can_fast_dispatch2(m)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Deliver a single Message via "fast dispatch".
+   *
+   * @param m The Message we are fast dispatching.
+   * If none of our Dispatchers can handle it, ceph_abort().
+   */
+  void ms_fast_dispatch(const Message::ref &m) {
+    m->set_dispatch_stamp(ceph_clock_now());
+    // Hand the message to the first fast dispatcher that accepts it.
+    for (auto it = fast_dispatchers.begin(); it != fast_dispatchers.end(); ++it) {
+      Dispatcher *d = *it;
+      if (!d->ms_can_fast_dispatch2(m)) {
+        continue;
+      }
+      d->ms_fast_dispatch2(m);
+      return;
+    }
+    // Callers are expected to have checked ms_can_fast_dispatch() first;
+    // reaching this point means no fast dispatcher took the message.
+    ceph_abort();
+  }
+  void ms_fast_dispatch(Message *m) {
+    // Wrap the raw pointer so the reference-taking overload can be used.
+    Message::ref owned(m, false); /* consume ref */
+    ms_fast_dispatch(owned);
+  }
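+  /**
+   * Give every fast Dispatcher a chance to preprocess a Message.
+   *
+   * @param m The Message to preprocess.
+   */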
+  void ms_fast_preprocess(const Message::ref &m) {
+    // Unlike dispatch, the loop never stops early: every fast dispatcher
+    // sees the message.
+    for (Dispatcher *d : fast_dispatchers) {
+      d->ms_fast_preprocess2(m);
+    }
+  }
+  /**
+   *  Deliver a single Message. Send it to each Dispatcher
+   *  in sequence until one of them handles it.
+   *  If none of our Dispatchers can handle it, log the unhandled message
+   *  and assert if the ms_die_on_unhandled_msg option is set.
+   *
+   *  @param m The Message to deliver.
+   */
+  void ms_deliver_dispatch(const Message::ref &m) {
+    m->set_dispatch_stamp(ceph_clock_now());
+    // Offer the message to each dispatcher in list order; the first one
+    // that reports it handled ends delivery.
+    for (Dispatcher *d : dispatchers) {
+      if (d->ms_dispatch2(m)) {
+        return;
+      }
+    }
+    // Nobody handled it: always log, and assert only when the
+    // ms_die_on_unhandled_msg option is set.
+    lsubdout(cct, ms, 0) << "ms_deliver_dispatch: unhandled message " << m << " " << *m << " from "
+                         << m->get_source_inst() << dendl;
+    ceph_assert(!cct->_conf->ms_die_on_unhandled_msg);
+  }
+  void ms_deliver_dispatch(Message *m) {
+    // Wrap the raw pointer so the reference-taking overload can be used.
+    Message::ref owned(m, false); /* consume ref */
+    ms_deliver_dispatch(owned);
+  }
+  /**
+   * Notify each Dispatcher of a new Connection. Call
+   * this function whenever a new Connection is initiated or
+   * reconnects.
+   *
+   * @param con Pointer to the new Connection.
+   */
+  void ms_deliver_handle_connect(Connection *con) {
+    // Broadcast: every dispatcher is notified, in list order.
+    for (Dispatcher *d : dispatchers)
+      d->ms_handle_connect(con);
+  }
+
+  /**
+   * Notify each fast Dispatcher of a new Connection. Call
+   * this function whenever a new Connection is initiated or
+   * reconnects.
+   *
+   * @param con Pointer to the new Connection.
+   */
+  void ms_deliver_handle_fast_connect(Connection *con) {
+    // Broadcast to the fast dispatchers only, in list order.
+    for (Dispatcher *d : fast_dispatchers)
+      d->ms_handle_fast_connect(con);
+  }
+
+  /**
+   * Notify each Dispatcher of a new incoming Connection. Call
+   * this function whenever a new Connection is accepted.
+   *
+   * @param con Pointer to the new Connection.
+   */
+  void ms_deliver_handle_accept(Connection *con) {
+    // Broadcast: every dispatcher is notified, in list order.
+    for (Dispatcher *d : dispatchers)
+      d->ms_handle_accept(con);
+  }
+
+  /**
+   * Notify each fast Dispatcher of a new incoming Connection. Call
+   * this function whenever a new Connection is accepted.
+   *
+   * @param con Pointer to the new Connection.
+   */
+  void ms_deliver_handle_fast_accept(Connection *con) {
+    // Broadcast to the fast dispatchers only, in list order.
+    for (Dispatcher *d : fast_dispatchers)
+      d->ms_handle_fast_accept(con);
+  }
+
+  /**
+   * Notify each Dispatcher of a Connection which may have lost
+   * Messages. Call this function whenever you detect that a lossy Connection
+   * has been disconnected.
+   *
+   * @param con Pointer to the broken Connection.
+   */
+  void ms_deliver_handle_reset(Connection *con) {
+    // Not a broadcast: notification stops at the first dispatcher whose
+    // ms_handle_reset() returns true.
+    for (Dispatcher *d : dispatchers) {
+      if (d->ms_handle_reset(con)) {
+        return;
+      }
+    }
+  }
+  /**
+   * Notify each Dispatcher of a Connection which has been "forgotten" about
+   * by the remote end, implying that messages have probably been lost.
+   * Call this function whenever you detect a reset.
+   *
+   * @param con Pointer to the broken Connection.
+   */
+  void ms_deliver_handle_remote_reset(Connection *con) {
+    // Broadcast: every dispatcher is notified, in list order.
+    for (Dispatcher *d : dispatchers)
+      d->ms_handle_remote_reset(con);
+  }
+
+  /**
+   * Notify each Dispatcher of a Connection for which reconnection
+   * attempts are being refused. Call this function whenever you
+   * detect that a lossy Connection has been disconnected and it's
+   * impossible to reconnect.
+   *
+   * @param con Pointer to the broken Connection.
+   */
+  void ms_deliver_handle_refused(Connection *con) {
+    // Not a broadcast: notification stops at the first dispatcher whose
+    // ms_handle_refused() returns true.
+    for (Dispatcher *d : dispatchers) {
+      if (d->ms_handle_refused(con)) {
+        return;
+      }
+    }
+  }
+
+  /**
+   * Get the AuthAuthorizer for a new outgoing Connection.
+   *
+   * @param peer_type The peer type for the new Connection
+   * @return A pointer to the AuthAuthorizer, if we have one; NULL otherwise
+   */
+  AuthAuthorizer *ms_deliver_get_authorizer(int peer_type) {
+    // Ask each dispatcher in order.  The first one that reports success
+    // decides the result: whatever it stored through the out-pointer.
+    AuthAuthorizer *authorizer = nullptr;
+    for (Dispatcher *d : dispatchers) {
+      if (d->ms_get_authorizer(peer_type, &authorizer)) {
+        return authorizer;
+      }
+    }
+    // No dispatcher produced an authorizer.
+    return nullptr;
+  }
+  /**
+   * Verify that the authorizer on a new incoming Connection is correct.
+   *
+   * @param con The new incoming Connection
+   * @param peer_type The type of the endpoint on the new Connection
+   * @param protocol The ID of the protocol in use (at time of writing, cephx or none)
+   * @param authorizer The authorization string supplied by the remote
+   * @param authorizer_reply Output param: The string we should send back to
+   * the remote to authorize ourselves. Only filled in if isvalid
+   * @param isvalid Output param: True if authorizer is valid, false otherwise
+   *
+   * @return True if we were able to prove or disprove correctness of
+   * authorizer, false otherwise.
+   */
+  bool ms_deliver_verify_authorizer(
+    Connection *con, int peer_type,
+    int protocol, bufferlist& authorizer, bufferlist& authorizer_reply,
+    bool& isvalid,
+    CryptoKey& session_key,
+    std::string *connection_secret,
+    std::unique_ptr<AuthAuthorizerChallenge> *challenge);
+
+  /**
+   * @} // Dispatcher Interfacing
+   */
+};
+
+
+
+#endif
diff --git a/src/msg/Policy.h b/src/msg/Policy.h
new file mode 100644
index 00000000..5d13ffb8
--- /dev/null
+++ b/src/msg/Policy.h
@@ -0,0 +1,117 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "include/ceph_features.h"
+
+namespace ceph::net {
+
+using peer_type_t = int;
+
+/**
+ * A Policy describes the rules of a Connection. Is there a limit on how
+ * much data this Connection can have locally? When the underlying connection
+ * experiences an error, does the Connection disappear? Can this Messenger
+ * re-establish the underlying connection?
+ */
+template<class ThrottleType>
+struct Policy {
+  /// If true, the Connection is tossed out on errors.
+  bool lossy;
+  /// If true, the underlying connection can't be re-established from this end.
+  bool server;
+  /// If true, we will standby when idle
+  bool standby;
+  /// If true, we will try to detect session resets
+  bool resetcheck;
+  /**
+   *  Throttlers limiting how much data (bytes) and how many Messages from
+   *  the associated Connection(s) may be held at once. Both start out
+   *  null; this struct only stores the pointers.
+   */
+  ThrottleType* throttler_bytes;
+  ThrottleType* throttler_messages;
+
+  /// Specify features supported locally by the endpoint.
+  uint64_t features_supported;
+  /// Specify features any remotes must have to talk to this endpoint.
+  uint64_t features_required;
+
+  /// Default policy: not lossy, not a server, no standby, reset detection
+  /// on, and no required features.
+  Policy() : Policy(false, false, false, true, 0) {}
+private:
+  /// Shared constructor behind the default constructor and the named
+  /// factories below; throttlers are always null and features_supported is
+  /// always CEPH_FEATURES_SUPPORTED_DEFAULT.
+  Policy(bool l, bool s, bool st, bool r, uint64_t req)
+    : lossy(l),
+      server(s),
+      standby(st),
+      resetcheck(r),
+      throttler_bytes(nullptr),
+      throttler_messages(nullptr),
+      features_supported(CEPH_FEATURES_SUPPORTED_DEFAULT),
+      features_required(req) {}
+
+public:
+  // Named factories.  Each takes the required-features mask and fixes the
+  // four boolean flags as annotated.
+  static Policy stateful_server(uint64_t req) {
+    return Policy(/*lossy=*/false, /*server=*/true,
+                  /*standby=*/true, /*resetcheck=*/true, req);
+  }
+  static Policy stateless_server(uint64_t req) {
+    return Policy(/*lossy=*/true, /*server=*/true,
+                  /*standby=*/false, /*resetcheck=*/false, req);
+  }
+  static Policy lossless_peer(uint64_t req) {
+    return Policy(/*lossy=*/false, /*server=*/false,
+                  /*standby=*/true, /*resetcheck=*/false, req);
+  }
+  static Policy lossless_peer_reuse(uint64_t req) {
+    return Policy(/*lossy=*/false, /*server=*/false,
+                  /*standby=*/true, /*resetcheck=*/true, req);
+  }
+  static Policy lossy_client(uint64_t req) {
+    return Policy(/*lossy=*/true, /*server=*/false,
+                  /*standby=*/false, /*resetcheck=*/false, req);
+  }
+  static Policy lossless_client(uint64_t req) {
+    return Policy(/*lossy=*/false, /*server=*/false,
+                  /*standby=*/false, /*resetcheck=*/true, req);
+  }
+};
+
+/// A default Policy plus per-peer-type overrides.
+template<class ThrottleType>
+class PolicySet {
+  using policy_t = Policy<ThrottleType>;
+  /// Policy returned for any peer type without an entry in policy_map
+  policy_t default_policy;
+  /// per-peer-type Policies (entity_name_t::type -> Policy)
+  std::map<int, policy_t> policy_map;
+
+public:
+  /// Policy registered for peer_type, or the default Policy if none is.
+  const policy_t& get(peer_type_t peer_type) const {
+    const auto it = policy_map.find(peer_type);
+    return it == policy_map.end() ? default_policy : it->second;
+  }
+  /// Mutable variant of the lookup above; never inserts into the map.
+  policy_t& get(peer_type_t peer_type) {
+    const auto it = policy_map.find(peer_type);
+    return it == policy_map.end() ? default_policy : it->second;
+  }
+  /// Register (or overwrite) the Policy for peer_type.
+  void set(peer_type_t peer_type, const policy_t& p) {
+    policy_map[peer_type] = p;
+  }
+  const policy_t& get_default() const {
+    return default_policy;
+  }
+  void set_default(const policy_t& p) {
+    default_policy = p;
+  }
+  /// Install both throttlers on the Policy that get(peer_type) resolves
+  /// to.  When peer_type has no entry of its own, that is the default
+  /// Policy, so the throttlers then apply to every type using the default.
+  void set_throttlers(peer_type_t peer_type,
+                      ThrottleType* byte_throttle,
+                      ThrottleType* msg_throttle) {
+    policy_t& target = get(peer_type);
+    target.throttler_bytes = byte_throttle;
+    target.throttler_messages = msg_throttle;
+  }
+};
+
+}
diff --git a/src/msg/QueueStrategy.cc b/src/msg/QueueStrategy.cc
new file mode 100644
index 00000000..9356e5c5
--- /dev/null
+++ b/src/msg/QueueStrategy.cc
@@ -0,0 +1,112 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include <string>
+#include "QueueStrategy.h"
+#define dout_subsys ceph_subsys_ms
+#include "common/debug.h"
+
+QueueStrategy::QueueStrategy(int _n_threads)
+  : lock("QueueStrategy::lock"),
+    n_threads(_n_threads),
+    stop(false)
+{
+  // mqueue and disp_threads are default-constructed (empty).  No worker
+  // threads exist until start() is called.
+}
+
+void QueueStrategy::ds_dispatch(Message *m) {
+  // Every message is preprocessed and offered to the fast path first.
+  msgr->ms_fast_preprocess(m);
+  if (msgr->ms_can_fast_dispatch(m)) {
+    msgr->ms_fast_dispatch(m);
+  } else {
+    // Slow path: queue the message and wake one parked worker, if any.
+    lock.Lock();
+    mqueue.push_back(*m);
+    if (!disp_threads.empty()) {
+      QSThread &waiter = disp_threads.front();
+      disp_threads.pop_front();
+      waiter.cond.Signal();
+    }
+    lock.Unlock();
+  }
+}
+
+// Worker thread body: repeatedly take one message off mqueue and deliver
+// it through the messenger.  With nothing queued the worker parks itself
+// on disp_threads and waits to be signalled by ds_dispatch() or shutdown().
+// Once stop is set, messages still queued are dequeued and released
+// without being dispatched, and the worker exits when the queue is empty.
+void QueueStrategy::entry(QSThread *thrd)
+{
+  for (;;) {
+    Message::ref m;
+    lock.Lock();
+    for (;;) {
+      if (! mqueue.empty()) {
+	// Take over the queue's reference instead of adding a new one.
+	m = Message::ref(&mqueue.front(), false);
+	mqueue.pop_front();
+	break;
+      }
+      if (stop)
+	break;
+      disp_threads.push_front(*thrd);
+      thrd->cond.Wait(lock);
+      // ds_dispatch() and shutdown() unlink a worker before signalling it.
+      // If Wait() returned without that (e.g. a spurious wakeup) we are
+      // still linked; unlink now so the next pass does not push an
+      // already-linked hook onto the intrusive list.
+      if (thrd->thread_q.is_linked())
+	disp_threads.erase(disp_threads.iterator_to(*thrd));
+    }
+    // Sample stop while the lock is still held; it is written under the
+    // lock in shutdown().
+    const bool stopping = stop;
+    lock.Unlock();
+    if (stopping) {
+      // Shutting down: drop m (if any) undelivered and keep draining.
+      if (!m) break;
+      continue;
+    }
+    get_messenger()->ms_deliver_dispatch(m);
+  }
+}
+
+void QueueStrategy::shutdown()
+{
+  lock.Lock();
+  stop = true;
+  // Wake every parked worker so each one can observe the stop flag.
+  while (!disp_threads.empty()) {
+    QSThread &waiter = disp_threads.front();
+    disp_threads.pop_front();
+    waiter.cond.Signal();
+  }
+  lock.Unlock();
+}
+
+void QueueStrategy::wait()
+{
+  lock.Lock();
+  // shutdown() must have been called before wait().
+  ceph_assert(stop);
+  auto it = threads.begin();
+  const auto last = threads.end();
+  while (it != last) {
+    // Release the lock around each join: workers take the same lock in
+    // entry() on their way out.
+    lock.Unlock();
+    (*it)->join();
+    lock.Lock();
+    ++it;
+  }
+  lock.Unlock();
+}
+
+void QueueStrategy::start()
+{
+  ceph_assert(!stop);
+  lock.Lock();
+  threads.reserve(n_threads);
+  int ix = 0;
+  while (ix < n_threads) {
+    // Worker names are "ms_xio_qs_" followed by the worker's index.
+    const std::string thread_name = "ms_xio_qs_" + std::to_string(ix);
+    auto worker = std::make_unique<QSThread>(this);
+    worker->create(thread_name.c_str());
+    threads.push_back(std::move(worker));
+    ++ix;
+  }
+  lock.Unlock();
+}
diff --git a/src/msg/QueueStrategy.h b/src/msg/QueueStrategy.h
new file mode 100644
index 00000000..a531cd77
--- /dev/null
+++ b/src/msg/QueueStrategy.h
@@ -0,0 +1,63 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef QUEUE_STRATEGY_H
+#define QUEUE_STRATEGY_H
+
+#include <vector>
+#include <memory>
+#include <boost/intrusive/list.hpp>
+#include "DispatchStrategy.h"
+#include "msg/Messenger.h"
+
+namespace bi = boost::intrusive;
+/*
+ * DispatchStrategy that queues messages which cannot be fast-dispatched
+ * and delivers them from a pool of n_threads worker threads.
+ */
+class QueueStrategy : public DispatchStrategy {
+  Mutex lock; // held by the member functions while they touch mqueue and disp_threads
+  const int n_threads; // number of workers start() creates
+  bool stop; // set by shutdown(); makes the workers leave entry()
+
+  Message::Queue mqueue; // messages waiting for a worker
+
+  class QSThread : public Thread {
+  public:
+    bi::list_member_hook<> thread_q; // hook linking this thread into disp_threads
+    QueueStrategy *dq; // strategy whose entry() this thread runs
+    Cond cond; // signalled by ds_dispatch()/shutdown() to wake this thread
+    explicit QSThread(QueueStrategy *dq) : thread_q(), dq(dq), cond() {}
+    void* entry() {
+      dq->entry(this);
+      return NULL;
+    }
+
+    typedef bi::list< QSThread,
+		      bi::member_hook< QSThread,
+				       bi::list_member_hook<>,
+				       &QSThread::thread_q > > Queue;
+  };
+
+  std::vector<std::unique_ptr<QSThread>> threads; //< all threads
+  QSThread::Queue disp_threads; //< waiting threads
+
+public:
+  explicit QueueStrategy(int n_threads);
+  void ds_dispatch(Message *m) override; // fast-dispatch m or queue it for a worker
+  void shutdown() override; // set stop and wake all waiting workers
+  void start() override; // create the worker threads
+  void wait() override; // join the worker threads; requires shutdown() first
+  void entry(QSThread *thrd); // worker loop, called from QSThread::entry()
+  virtual ~QueueStrategy() {}
+};
+#endif /* QUEUE_STRATEGY_H */
diff --git a/src/msg/SimplePolicyMessenger.h b/src/msg/SimplePolicyMessenger.h
new file mode 100644
index 00000000..2e9b84ec
--- /dev/null
+++ b/src/msg/SimplePolicyMessenger.h
@@ -0,0 +1,100 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Portions Copyright (C) 2013 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef SIMPLE_POLICY_MESSENGER_H
+#define SIMPLE_POLICY_MESSENGER_H
+
+#include "Messenger.h"
+#include "Policy.h"
+/**
+ * Messenger base class that keeps the per-peer-type Policy settings in a
+ * PolicySet and implements the policy accessors, each of which takes
+ * policy_lock for the duration of the call.
+ */
+class SimplePolicyMessenger : public Messenger
+{
+private:
+  /// lock protecting policy
+  Mutex policy_lock;
+  // entity_name_t::type -> Policy
+  ceph::net::PolicySet<Throttle> policy_set;
+
+public:
+  /**
+   * Construct the messenger. Only cct and name are forwarded to Messenger;
+   * mname and _nonce are accepted but not used by this constructor.
+   */
+  SimplePolicyMessenger(CephContext *cct, entity_name_t name,
+			string mname, uint64_t _nonce)
+    : Messenger(cct, name),
+      policy_lock("SimplePolicyMessenger::policy_lock")
+    {
+    }
+
+  /**
+   * Get the Policy associated with a type of peer.
+   * @param t The peer type to get the policy for.
+   *
+   * @return A copy of the Policy (returned by value).
+   */
+  Policy get_policy(int t) override {
+    Mutex::Locker l(policy_lock);
+    return policy_set.get(t);
+  }
+  /**
+   * Get the default Policy held by the policy set.
+   *
+   * @return A copy of the default Policy (returned by value).
+   */
+  Policy get_default_policy() override {
+    Mutex::Locker l(policy_lock);
+    return policy_set.get_default();
+  }
+
+  /**
+   * Set a policy which is applied to all peers who do not have a type-specific
+   * Policy.
+   * This is an init-time function and cannot be called after calling
+   * start() or bind().
+   *
+   * @param p The Policy to apply.
+   */
+  void set_default_policy(Policy p) override {
+    Mutex::Locker l(policy_lock);
+    policy_set.set_default(p);
+  }
+  /**
+   * Set a policy which is applied to all peers of the given type.
+   * This is an init-time function and cannot be called after calling
+   * start() or bind().
+   *
+   * @param type The peer type this policy applies to.
+   * @param p The policy to apply.
+   */
+  void set_policy(int type, Policy p) override {
+    Mutex::Locker l(policy_lock);
+    policy_set.set(type, p);
+  }
+
+  /**
+   * Set the Throttlers which are applied to all Messages from the given
+   * type of peer.
+   * This is an init-time function and cannot be called after calling
+   * start() or bind().
+   *
+   * @param type The peer type these Throttlers will apply to.
+   * @param byte_throttle The byte Throttler to apply.
+   * @param msg_throttle The message Throttler to apply.
+   * SimpleMessenger does not take ownership of these pointers, but you
+   * must not destroy them before you destroy SimpleMessenger.
+   */
+  void set_policy_throttlers(int type,
+			     Throttle* byte_throttle,
+			     Throttle* msg_throttle) override {
+    Mutex::Locker l(policy_lock);
+    policy_set.set_throttlers(type, byte_throttle, msg_throttle);
+  }
+
+}; /* SimplePolicyMessenger */
+
+#endif /* SIMPLE_POLICY_MESSENGER_H */
diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc
new file mode 100644
index 00000000..b78d84a3
--- /dev/null
+++ b/src/msg/async/AsyncConnection.cc
@@ -0,0 +1,771 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <unistd.h>
+
+#include "include/Context.h"
+#include "include/random.h"
+#include "common/errno.h"
+#include "AsyncMessenger.h"
+#include "AsyncConnection.h"
+
+#include "ProtocolV1.h"
+#include "ProtocolV2.h"
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "common/EventTrace.h"
+
+// Constant to limit starting sequence number to 2^31.  Nothing special about it, just a big number.  PLR
+#define SEQ_MASK  0x7fffffff
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix _conn_prefix(_dout)
+// Emit the per-connection log prefix; dout_prefix expands to a call of this.
+ostream& AsyncConnection::_conn_prefix(std::ostream *_dout) {
+  std::ostream &out = *_dout;
+  // local addresses >> peer addresses
+  out << "-- " << async_msgr->get_myaddrs() << " >> " << *peer_addrs;
+  // connection identity and protocol flavour
+  out << " conn(" << this << (msgr2 ? " msgr2=" : " legacy=") << protocol.get();
+  out << " " << ceph_con_mode_name(protocol->auth_meta->con_mode);
+  // port, state name and lossy flag
+  out << " :" << port << " s=" << get_state_name(state) << " l=" << policy.lossy;
+  return out << ").";
+}
+
+// Notes:
+// 1. Don't dispatch any event when closed! It may keep the AsyncConnection alive even after the AsyncMessenger is dead
+
+const uint32_t AsyncConnection::TCP_PREFETCH_MIN_SIZE = 512;
+/* Event callback that forwards to AsyncConnection::wakeup_from(), passing
+ * along the id given to do_request(). */
+class C_time_wakeup : public EventCallback {
+  AsyncConnectionRef conn;
+
+ public:
+  explicit C_time_wakeup(AsyncConnectionRef c): conn(c) {}
+  void do_request(uint64_t fd_or_id) override {
+    conn->wakeup_from(fd_or_id);
+  }
+};
+/* Event callback that runs AsyncConnection::process(); the fd/id argument
+ * is ignored. */
+class C_handle_read : public EventCallback {
+  AsyncConnectionRef conn;
+
+ public:
+  explicit C_handle_read(AsyncConnectionRef c): conn(c) {}
+  void do_request(uint64_t fd_or_id) override {
+    conn->process();
+  }
+};
+/* Event callback that runs AsyncConnection::handle_write(); the fd
+ * argument is ignored. */
+class C_handle_write : public EventCallback {
+  AsyncConnectionRef conn;
+
+ public:
+  explicit C_handle_write(AsyncConnectionRef c): conn(c) {}
+  void do_request(uint64_t fd) override {
+    conn->handle_write();
+  }
+};
+/* Event callback that runs AsyncConnection::handle_write_callback(); the
+ * fd argument is ignored. */
+class C_handle_write_callback : public EventCallback {
+  AsyncConnectionRef conn;
+
+public:
+  explicit C_handle_write_callback(AsyncConnectionRef c) : conn(c) {}
+  void do_request(uint64_t fd) override { conn->handle_write_callback(); }
+};
+/* One-shot event callback: runs AsyncConnection::cleanup() and then
+ * deletes itself. It is allocated with new in AsyncConnection::_stop(). */
+class C_clean_handler : public EventCallback {
+  AsyncConnectionRef conn;
+ public:
+  explicit C_clean_handler(AsyncConnectionRef c): conn(c) {}
+  void do_request(uint64_t id) override {
+    conn->cleanup();
+    delete this; // self-owned: nothing else frees this handler
+  }
+};
+/* Event callback that forwards to AsyncConnection::tick(), passing along
+ * the id given to do_request(). */
+class C_tick_wakeup : public EventCallback {
+  AsyncConnectionRef conn;
+
+ public:
+  explicit C_tick_wakeup(AsyncConnectionRef c): conn(c) {}
+  void do_request(uint64_t fd_or_id) override {
+    conn->tick(fd_or_id);
+  }
+};
+
+/*
+ * Build a connection attached to worker `w` and dispatch queue `q`.
+ *
+ * Allocates the five event callbacks and the prefetch buffer, then picks
+ * the protocol implementation: LoopbackProtocolV1 when `local` is set,
+ * ProtocolV2 when `m2` is set, ProtocolV1 otherwise. The callbacks are
+ * freed in cleanup() and recv_buf in the destructor.
+ */
+AsyncConnection::AsyncConnection(CephContext *cct, AsyncMessenger *m, DispatchQueue *q,
+                                 Worker *w, bool m2, bool local)
+  : Connection(cct, m), delay_state(NULL), async_msgr(m), conn_id(q->get_id()),
+    logger(w->get_perf_counter()),
+    state(STATE_NONE), port(-1),
+    dispatch_queue(q), recv_buf(NULL),
+    recv_max_prefetch(std::max<int64_t>(msgr->cct->_conf->ms_tcp_prefetch_max_size, TCP_PREFETCH_MIN_SIZE)), // never below TCP_PREFETCH_MIN_SIZE
+    recv_start(0), recv_end(0),
+    last_active(ceph::coarse_mono_clock::now()),
+    connect_timeout_us(cct->_conf->ms_connection_ready_timeout*1000*1000), // presumably seconds converted to microseconds -- TODO confirm option unit
+    inactive_timeout_us(cct->_conf->ms_connection_idle_timeout*1000*1000),
+    msgr2(m2), state_offset(0),
+    worker(w), center(&w->center),read_buffer(nullptr)
+{
+#ifdef UNIT_TESTS_BUILT
+  this->interceptor = m->interceptor;
+#endif
+  read_handler = new C_handle_read(this);
+  write_handler = new C_handle_write(this);
+  write_callback_handler = new C_handle_write_callback(this);
+  wakeup_handler = new C_time_wakeup(this);
+  tick_handler = new C_tick_wakeup(this);
+  // double recv_max_prefetch see "read_until"
+  recv_buf = new char[2*recv_max_prefetch];
+  if (local) {
+    protocol = std::unique_ptr<Protocol>(new LoopbackProtocolV1(this));
+  } else if (m2) {
+    protocol = std::unique_ptr<Protocol>(new ProtocolV2(this));
+  } else {
+    protocol = std::unique_ptr<Protocol>(new ProtocolV1(this));
+  }
+  logger->inc(l_msgr_created_connections);
+}
+
+AsyncConnection::~AsyncConnection()
+{
+  // delete[] on a null pointer is a no-op, so no guard is required.
+  delete[] recv_buf;
+  // The delay queue must have been released before destruction.
+  ceph_assert(!delay_state);
+}
+/* Return the connection mode reported by the protocol object. */
+int AsyncConnection::get_con_mode() const {
+  return protocol->get_con_mode();
+}
+
+void AsyncConnection::maybe_start_delay_thread()
+{
+  // A delay queue already exists: nothing to set up.
+  if (delay_state)
+    return;
+
+  // Create the delay queue only when the ms_inject_delay_type option
+  // mentions the name of our peer type.
+  async_msgr->cct->_conf.with_val<std::string>(
+    "ms_inject_delay_type",
+    [this](const string& s) {
+      const bool selected =
+        s.find(ceph_entity_type_name(peer_type)) != string::npos;
+      if (!selected)
+        return;
+      ldout(msgr->cct, 1) << __func__ << " setting up a delay queue"
+                          << dendl;
+      delay_state = new DelayedDelivery(async_msgr, center, dispatch_queue,
+                                        conn_id);
+    });
+}
+
+
+// Read `len` bytes into `buffer`. When the read cannot complete right away
+// the request (length, buffer, callback) is recorded so that process() can
+// continue it later. Returns what read_until() returns.
+ssize_t AsyncConnection::read(unsigned len, char *buffer,
+                              std::function<void(char *, ssize_t)> callback) {
+  ldout(async_msgr->cct, 20) << __func__
+                             << (pendingReadLen ? " continue" : " start")
+                             << " len=" << len << dendl;
+  const ssize_t remaining = read_until(len, buffer);
+  if (remaining <= 0) {
+    // finished or failed: nothing to remember
+    return remaining;
+  }
+  readCallback = callback;
+  pendingReadLen = len;
+  read_buffer = buffer;
+  return remaining;
+}
+
+// Fill `p` with exactly `len` bytes, possibly across several calls.
+// Progress is kept in state_offset, so every call for the same read
+// must pass the same `p` and `len` until 0 is returned.
+//
+// Reads that fit in recv_max_prefetch go through the read-ahead buffer
+// "recv_buf" to reduce small socket reads; larger reads go straight to `p`.
+//
+// Returns the number of bytes still missing; 0 means the buffer is complete.
+// A return value < 0 means error.
+ssize_t AsyncConnection::read_until(unsigned len, char *p)
+{
+  ldout(async_msgr->cct, 25) << __func__ << " len is " << len << " state_offset is "
+                             << state_offset << dendl;
+
+  if (async_msgr->cct->_conf->ms_inject_socket_failures && cs) {
+    if (rand() % async_msgr->cct->_conf->ms_inject_socket_failures == 0) {
+      ldout(async_msgr->cct, 0) << __func__ << " injecting socket failure" << dendl;
+      cs.shutdown();
+    }
+  }
+
+  ssize_t r = 0;
+  uint64_t left = len - state_offset;
+  if (recv_end > recv_start) { // serve from previously prefetched data first
+    uint64_t to_read = std::min<uint64_t>(recv_end - recv_start, left);
+    memcpy(p, recv_buf+recv_start, to_read); // NOTE(review): copies to p, not p+state_offset -- assumes state_offset is 0 whenever recv_buf holds data; confirm
+    recv_start += to_read;
+    left -= to_read;
+    ldout(async_msgr->cct, 25) << __func__ << " got " << to_read << " in buffer "
+                               << " left is " << left << " buffer still has "
+                               << recv_end - recv_start << dendl;
+    if (left == 0) {
+      return 0;
+    }
+    state_offset += to_read;
+  }
+
+  recv_end = recv_start = 0;
+  /* nothing left in the prefetch buffer */
+  if (left > (uint64_t)recv_max_prefetch) {
+    /* this was a large read, we don't prefetch for these */
+    do {
+      r = read_bulk(p+state_offset, left);
+      ldout(async_msgr->cct, 25) << __func__ << " read_bulk left is " << left << " got " << r << dendl;
+      if (r < 0) {
+        ldout(async_msgr->cct, 1) << __func__ << " read failed" << dendl;
+        return -1;
+      } else if (r == static_cast<int>(left)) {
+        state_offset = 0;
+        return 0;
+      }
+      state_offset += r;
+      left -= r;
+    } while (r > 0); // r == 0: read_bulk had nothing more for now
+  } else {
+    do {
+      r = read_bulk(recv_buf+recv_end, recv_max_prefetch);
+      ldout(async_msgr->cct, 25) << __func__ << " read_bulk recv_end is " << recv_end
+                                 << " left is " << left << " got " << r << dendl;
+      if (r < 0) {
+        ldout(async_msgr->cct, 1) << __func__ << " read failed" << dendl;
+        return -1;
+      }
+      recv_end += r;
+      if (r >= static_cast<int>(left)) {
+        recv_start = len - state_offset; // bytes beyond this stay buffered for the next read
+        memcpy(p+state_offset, recv_buf, recv_start);
+        state_offset = 0;
+        return 0;
+      }
+      left -= r;
+    } while (r > 0);
+    memcpy(p+state_offset, recv_buf, recv_end-recv_start); // hand over the partial data and empty the buffer
+    state_offset += (recv_end - recv_start);
+    recv_end = recv_start = 0;
+  }
+  ldout(async_msgr->cct, 25) << __func__ << " need len " << len << " remaining "
+                             << len - state_offset << " bytes" << dendl;
+  return len - state_offset;
+}
+
+/* Read up to `len` bytes from the socket into `buf`, retrying on EINTR.
+ *
+ * return > 0 is the number of bytes read
+ * return 0 means EAGAIN (nothing available right now)
+ * return -1 means `fd` hit an error or was closed by the peer; it should
+ * be closed */
+ssize_t AsyncConnection::read_bulk(char *buf, unsigned len)
+{
+  ssize_t nread;
+  do {
+    nread = cs.read(buf, len);
+  } while (nread == -EINTR);
+
+  if (nread == -EAGAIN) {
+    return 0;
+  }
+  if (nread < 0) {
+    // nread is a negative errno here. Format it with cpp_strerror(), as
+    // _try_send() does for its negative result; strerror() would be handed
+    // a negative number.
+    ldout(async_msgr->cct, 1) << __func__ << " reading from fd=" << cs.fd()
+                          << " : "<< cpp_strerror(nread) << dendl;
+    return -1;
+  }
+  if (nread == 0) {
+    ldout(async_msgr->cct, 1) << __func__ << " peer close file descriptor "
+                              << cs.fd() << dendl;
+    return -1;
+  }
+  return nread;
+}
+
+// Queue `bl` for sending and try to flush it immediately. The callback is
+// stored only when bytes remain queued after the attempt. Returns what
+// _try_send() returns.
+ssize_t AsyncConnection::write(bufferlist &bl,
+                               std::function<void(ssize_t)> callback,
+                               bool more) {
+  std::lock_guard<std::mutex> guard(write_lock);
+  outgoing_bl.claim_append(bl);
+  const ssize_t remaining = _try_send(more);
+  if (remaining > 0)
+    writeCallback = callback;
+  return remaining;
+}
+
+// Try to send outgoing_bl on the socket and keep the EVENT_WRITABLE registration in step with whether data is still queued.
+// Returns the number of bytes still queued (it may be larger than the length of ptr); a return value < 0 means error.
+ssize_t AsyncConnection::_try_send(bool more)
+{
+  if (async_msgr->cct->_conf->ms_inject_socket_failures && cs) {
+    if (rand() % async_msgr->cct->_conf->ms_inject_socket_failures == 0) {
+      ldout(async_msgr->cct, 0) << __func__ << " injecting socket failure" << dendl;
+      cs.shutdown();
+    }
+  }
+
+  ceph_assert(center->in_thread());
+  ldout(async_msgr->cct, 25) << __func__ << " cs.send " << outgoing_bl.length()
+                             << " bytes" << dendl;
+  ssize_t r = cs.send(outgoing_bl, more);
+  if (r < 0) {
+    ldout(async_msgr->cct, 1) << __func__ << " send error: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  ldout(async_msgr->cct, 10) << __func__ << " sent bytes " << r
+                             << " remaining bytes " << outgoing_bl.length() << dendl;
+
+  if (!open_write && is_queued()) { // data left over: ask for writable events
+    center->create_file_event(cs.fd(), EVENT_WRITABLE, write_handler);
+    open_write = true;
+  }
+
+  if (open_write && !is_queued()) { // everything sent: stop watching for writability
+    center->delete_file_event(cs.fd(), EVENT_WRITABLE);
+    open_write = false;
+    if (writeCallback) {
+      center->dispatch_event_external(write_callback_handler);
+    }
+  }
+
+  return outgoing_bl.length();
+}
+
+// Sleep for ms_inject_internal_delays when that option is non-zero.
+void AsyncConnection::inject_delay() {
+  if (!async_msgr->cct->_conf->ms_inject_internal_delays)
+    return;
+
+  ldout(async_msgr->cct, 10) << __func__ << " sleep for "
+                             << async_msgr->cct->_conf->ms_inject_internal_delays
+                             << dendl;
+  utime_t delay;
+  delay.set_from_double(async_msgr->cct->_conf->ms_inject_internal_delays);
+  delay.sleep();
+}
+/*
+ * Read-side event handler; runs with `lock` held for the whole call.
+ *
+ * Drives the connection state machine: starts or continues a non-blocking
+ * connect, registers the read handler for an accepted socket, and, once
+ * the connection is established, continues a pending read() before
+ * handing control to the protocol's read_event(). Returns early, without
+ * calling read_event(), when the state is NONE or CLOSED, when the
+ * connect fails or is still in progress, and when a pending read was
+ * serviced.
+ */
+void AsyncConnection::process() {
+  std::lock_guard<std::mutex> l(lock);
+  last_active = ceph::coarse_mono_clock::now();
+  recv_start_time = ceph::mono_clock::now();
+
+  ldout(async_msgr->cct, 20) << __func__ << dendl;
+
+  switch (state) {
+    case STATE_NONE: {
+      ldout(async_msgr->cct, 20) << __func__ << " enter none state" << dendl;
+      return;
+    }
+    case STATE_CLOSED: {
+      ldout(async_msgr->cct, 20) << __func__ << " socket closed" << dendl;
+      return;
+    }
+    case STATE_CONNECTING: {
+      ceph_assert(!policy.server);
+
+      // clear timer (if any) since we are connecting/re-connecting
+      if (last_tick_id) {
+        center->delete_time_event(last_tick_id);
+        last_tick_id = 0;
+      }
+
+      if (cs) {
+        center->delete_file_event(cs.fd(), EVENT_READABLE | EVENT_WRITABLE);
+        cs.close();
+      }
+
+      SocketOptions opts;
+      opts.priority = async_msgr->get_socket_priority();
+      opts.connect_bind_addr = msgr->get_myaddrs().front();
+      ssize_t r = worker->connect(target_addr, opts, &cs);
+      if (r < 0) {
+        protocol->fault();
+        return;
+      }
+
+      center->create_file_event(cs.fd(), EVENT_READABLE, read_handler);
+      state = STATE_CONNECTING_RE;
+    } // no break: falls through into STATE_CONNECTING_RE
+    case STATE_CONNECTING_RE: {
+      ssize_t r = cs.is_connected();
+      if (r < 0) {
+        ldout(async_msgr->cct, 1) << __func__ << " reconnect failed to "
+                                  << target_addr << dendl;
+        if (r == -ECONNREFUSED) {
+          ldout(async_msgr->cct, 2)
+              << __func__ << " connection refused!" << dendl;
+          dispatch_queue->queue_refused(this);
+        }
+        protocol->fault();
+        return;
+      } else if (r == 0) {
+        ldout(async_msgr->cct, 10)
+            << __func__ << " nonblock connect inprogress" << dendl;
+        if (async_msgr->get_stack()->nonblock_connect_need_writable_event()) {
+          center->create_file_event(cs.fd(), EVENT_WRITABLE,
+                                    read_handler);
+        }
+        logger->tinc(l_msgr_running_recv_time,
+               ceph::mono_clock::now() - recv_start_time);
+        return;
+      }
+
+      center->delete_file_event(cs.fd(), EVENT_WRITABLE);
+      ldout(async_msgr->cct, 10)
+          << __func__ << " connect successfully, ready to send banner" << dendl;
+      state = STATE_CONNECTION_ESTABLISHED;
+      ceph_assert(last_tick_id == 0);
+      // exclude TCP nonblock connect time
+      last_connect_started = ceph::coarse_mono_clock::now();
+      last_tick_id = center->create_time_event(
+        connect_timeout_us, tick_handler);
+      break;
+    }
+
+    case STATE_ACCEPTING: {
+      center->create_file_event(cs.fd(), EVENT_READABLE, read_handler);
+      state = STATE_CONNECTION_ESTABLISHED;
+
+      break;
+    }
+
+    case STATE_CONNECTION_ESTABLISHED: {
+      if (pendingReadLen) { // a read() is still waiting for data: continue it
+        ssize_t r = read(*pendingReadLen, read_buffer, readCallback);
+        if (r <= 0) { // read all bytes, or an error occurred
+          pendingReadLen.reset();
+          char *buf_tmp = read_buffer;
+          read_buffer = nullptr;
+          readCallback(buf_tmp, r);
+        }
+        return;
+      }
+      break;
+    }
+  }
+
+  protocol->read_event();
+
+  logger->tinc(l_msgr_running_recv_time,
+               ceph::mono_clock::now() - recv_start_time);
+}
+/* Report whether the protocol object considers the connection connected. */
+bool AsyncConnection::is_connected() {
+  return protocol->is_connected();
+}
+/*
+ * Start an outgoing connection. Under `lock`, records the peer type and
+ * addresses, loads the messenger's policy for that peer type, stores
+ * `target` as target_addr and calls _connect().
+ */
+void AsyncConnection::connect(const entity_addrvec_t &addrs, int type,
+                              entity_addr_t &target) {
+
+  std::lock_guard<std::mutex> l(lock);
+  set_peer_type(type);
+  set_peer_addrs(addrs);
+  policy = msgr->get_policy(type);
+  target_addr = target;
+  _connect();
+}
+/*
+ * Move to STATE_CONNECTING, notify the protocol and queue read_handler on
+ * the event center; the connect itself then happens in process().
+ */
+void AsyncConnection::_connect()
+{
+  ldout(async_msgr->cct, 10) << __func__ << dendl;
+
+  state = STATE_CONNECTING;
+  protocol->connect();
+  // reschedule the connection in order to avoid a lock dependency;
+  // this may be called by an external thread (send_message)
+  center->dispatch_event_external(read_handler);
+}
+/*
+ * Take over an accepted socket. Asserts that the socket has a valid fd,
+ * then, under `lock`, stores it with the listen and peer addresses, moves
+ * to STATE_ACCEPTING, notifies the protocol and queues read_handler.
+ */
+void AsyncConnection::accept(ConnectedSocket socket,
+			     const entity_addr_t &listen_addr,
+			     const entity_addr_t &peer_addr)
+{
+  ldout(async_msgr->cct, 10) << __func__ << " sd=" << socket.fd()
+			     << " listen_addr " << listen_addr
+			     << " peer_addr " << peer_addr << dendl;
+  ceph_assert(socket.fd() >= 0);
+
+  std::lock_guard<std::mutex> l(lock);
+  cs = std::move(socket);
+  socket_addr = listen_addr;
+  target_addr = peer_addr; // until we know better
+  state = STATE_ACCEPTING;
+  protocol->accept();
+  // reschedule the connection in order to avoid a lock dependency
+  center->dispatch_event_external(read_handler);
+}
+/*
+ * Send message `m` on this connection; always returns 0.
+ *
+ * Fills in a default priority when none is set, stamps the source name
+ * and the connection on the message, then either delivers it locally
+ * (when the peer addresses equal our own) or passes it to the protocol.
+ * On the loopback path the message is dropped with put() when the
+ * protocol is not connected.
+ */
+int AsyncConnection::send_message(Message *m)
+{
+  FUNCTRACE(async_msgr->cct);
+  lgeneric_subdout(async_msgr->cct, ms,
+		   1) << "-- " << async_msgr->get_myaddrs() << " --> "
+		      << get_peer_addrs() << " -- "
+		      << *m << " -- " << m << " con "
+		      << this
+		      << dendl;
+
+  // optimistically assume it is fine to encode here (this may actually be broken now)
+  if (!m->get_priority())
+    m->set_priority(async_msgr->get_default_send_priority());
+
+  m->get_header().src = async_msgr->get_myname();
+  m->set_connection(this);
+
+  if (m->get_type() == CEPH_MSG_OSD_OP)
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OP_BEGIN", true);
+  else if (m->get_type() == CEPH_MSG_OSD_OPREPLY)
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OPREPLY_BEGIN", true);
+
+  if (async_msgr->get_myaddrs() == get_peer_addrs()) { //loopback connection
+    ldout(async_msgr->cct, 20) << __func__ << " " << *m << " local" << dendl;
+    std::lock_guard<std::mutex> l(write_lock);
+    if (protocol->is_connected()) {
+      dispatch_queue->local_delivery(m, m->get_priority());
+    } else {
+      ldout(async_msgr->cct, 10) << __func__ << " loopback connection closed."
+                                 << " Drop message " << m << dendl;
+      m->put();
+    }
+    return 0;
+  }
+
+  // local messages are not counted here: they are too lightweight and
+  // counting them may confuse users
+  logger->inc(l_msgr_send_messages);
+
+  protocol->send_message(m);
+  return 0;
+}
+
+// Choose, from `av`, the first non-legacy address whose family matches
+// socket_addr. It may be an any: or v2: address; v1 (legacy) addresses are
+// skipped. Returns a default-constructed address when nothing matches.
+entity_addr_t AsyncConnection::_infer_target_addr(const entity_addrvec_t& av)
+{
+  for (const auto& candidate : av.v) {
+    const bool usable = !candidate.is_legacy() &&
+                        candidate.get_family() == socket_addr.get_family();
+    if (usable) {
+      ldout(async_msgr->cct,10) << __func__ << " " << av << " -> " << candidate << dendl;
+      return candidate;
+    }
+  }
+
+  ldout(async_msgr->cct,10) << __func__ << " " << av << " -> nothing to match "
+                            << socket_addr << dendl;
+  return {};
+}
+/*
+ * Reset the transport after a failure: shuts the socket down, flushes the
+ * delay queue (if any) and clears the receive bookkeeping and the pending
+ * output buffer.
+ */
+void AsyncConnection::fault()
+{
+  shutdown_socket();
+  open_write = false;
+
+  // queue delayed items immediately
+  if (delay_state)
+    delay_state->flush();
+
+  recv_start = recv_end = 0;
+  state_offset = 0;
+  outgoing_bl.clear();
+}
+/*
+ * Close the connection: drops the write callback, discards this
+ * connection's entries in the dispatch queue, unregisters it from the
+ * messenger, releases the worker, moves to STATE_CLOSED and queues a
+ * C_clean_handler that will run cleanup().
+ */
+void AsyncConnection::_stop() {
+  writeCallback.reset();
+  dispatch_queue->discard_queue(conn_id);
+  async_msgr->unregister_conn(this);
+  worker->release_worker();
+
+  state = STATE_CLOSED;
+  open_write = false;
+
+  state_offset = 0;
+  // Make sure in-queue events will have been processed
+  center->dispatch_event_external(EventCallbackRef(new C_clean_handler(this)));
+}
+
+// True while outgoing_bl still holds bytes waiting to be sent.
+bool AsyncConnection::is_queued() const {
+  return outgoing_bl.length() != 0;
+}
+
+// Cancel this connection's timers and tear down its socket, if one is open.
+void AsyncConnection::shutdown_socket() {
+  // registered wakeup timers
+  for (const auto &event_id : register_time_events) {
+    center->delete_time_event(event_id);
+  }
+  register_time_events.clear();
+
+  // tick timer
+  if (last_tick_id) {
+    center->delete_time_event(last_tick_id);
+    last_tick_id = 0;
+  }
+
+  if (!cs)
+    return;
+  center->delete_file_event(cs.fd(), EVENT_READABLE | EVENT_WRITABLE);
+  cs.shutdown();
+  cs.close();
+}
+/*
+ * Timer callback for delayed delivery. Under delay_lock, forgets timer
+ * `id` and takes the oldest message off delay_queue unless stop_dispatch
+ * is set or the queue is empty; the message is then fast-dispatched or
+ * enqueued outside the lock.
+ */
+void AsyncConnection::DelayedDelivery::do_request(uint64_t id)
+{
+  Message *m = nullptr;
+  {
+    std::lock_guard<std::mutex> l(delay_lock);
+    register_time_events.erase(id);
+    if (stop_dispatch)
+      return ;
+    if (delay_queue.empty())
+      return ;
+    m = delay_queue.front();
+    delay_queue.pop_front();
+  }
+  if (msgr->ms_can_fast_dispatch(m)) {
+    dispatch_queue->fast_dispatch(m);
+  } else {
+    dispatch_queue->enqueue(m, m->get_priority(), conn_id);
+  }
+}
+/*
+ * Drop every delayed message. Sets stop_dispatch, then submits a task to
+ * the event center that, under delay_lock, releases each message's
+ * dispatch throttle and reference, cancels the registered timers and
+ * clears stop_dispatch again.
+ * NOTE(review): the meaning of the trailing `true` passed to submit_to()
+ * is not visible here -- confirm against EventCenter.
+ */
+void AsyncConnection::DelayedDelivery::discard() {
+  stop_dispatch = true;
+  center->submit_to(center->get_id(),
+                    [this]() mutable {
+                      std::lock_guard<std::mutex> l(delay_lock);
+                      while (!delay_queue.empty()) {
+                        Message *m = delay_queue.front();
+                        dispatch_queue->dispatch_throttle_release(
+                            m->get_dispatch_throttle_size());
+                        m->put();
+                        delay_queue.pop_front();
+                      }
+                      for (auto i : register_time_events)
+                        center->delete_time_event(i);
+                      register_time_events.clear();
+                      stop_dispatch = false;
+                    },
+                    true);
+}
+/*
+ * Deliver every delayed message now. Sets stop_dispatch, then submits a
+ * task to the event center that, under delay_lock, fast-dispatches or
+ * enqueues each queued message, cancels the registered timers and clears
+ * stop_dispatch again.
+ * NOTE(review): the meaning of the trailing `true` passed to submit_to()
+ * is not visible here -- confirm against EventCenter.
+ */
+void AsyncConnection::DelayedDelivery::flush() {
+  stop_dispatch = true;
+  center->submit_to(
+      center->get_id(), [this] () mutable {
+    std::lock_guard<std::mutex> l(delay_lock);
+    while (!delay_queue.empty()) {
+      Message *m = delay_queue.front();
+      if (msgr->ms_can_fast_dispatch(m)) {
+        dispatch_queue->fast_dispatch(m);
+      } else {
+        dispatch_queue->enqueue(m, m->get_priority(), conn_id);
+      }
+      delay_queue.pop_front();
+    }
+    for (auto i : register_time_events)
+      center->delete_time_event(i);
+    register_time_events.clear();
+    stop_dispatch = false;
+  }, true);
+}
+/* Ask the protocol object to send a keepalive. */
+void AsyncConnection::send_keepalive()
+{
+  protocol->send_keepalive();
+}
+/* Stop the protocol while holding `lock`. Unlike stop(), this never
+ * queues a reset on the dispatch queue. */
+void AsyncConnection::mark_down()
+{
+  ldout(async_msgr->cct, 1) << __func__ << dendl;
+  std::lock_guard<std::mutex> l(lock);
+  protocol->stop();
+}
+/* Write-side event handler (invoked via C_handle_write): forwards to the
+ * protocol's write_event(). */
+void AsyncConnection::handle_write()
+{
+  ldout(async_msgr->cct, 10) << __func__ << dendl;
+  protocol->write_event();
+}
+
+// Run the stored write-completion callback, if any, with argument 0.
+//
+// Holds `lock` for the whole call. write_lock is held only while the
+// callback is copied out and cleared, and is released before the callback
+// runs. The unique_lock releases write_lock on every exit path, including
+// an exception thrown while copying the callback.
+void AsyncConnection::handle_write_callback() {
+  std::lock_guard<std::mutex> l(lock);
+  last_active = ceph::coarse_mono_clock::now();
+  recv_start_time = ceph::mono_clock::now();
+
+  std::unique_lock<std::mutex> wl(write_lock);
+  if (!writeCallback) {
+    return;
+  }
+  auto callback = *writeCallback;
+  writeCallback.reset();
+  wl.unlock();  // do not hold write_lock while the callback runs
+  callback(0);
+}
+
+// Stop the protocol under `lock`; afterwards, with the lock released, queue
+// a reset on the dispatch queue if `queue_reset` was requested and the
+// connection was not already closed.
+void AsyncConnection::stop(bool queue_reset) {
+  bool need_queue_reset = false;
+  {
+    std::lock_guard<std::mutex> guard(lock);
+    need_queue_reset = queue_reset && (state != STATE_CLOSED);
+    protocol->stop();
+  }
+  if (need_queue_reset) {
+    dispatch_queue->queue_reset(this);
+  }
+}
+
+// Final teardown: shut the socket down, then free the event callbacks and
+// the delay queue.
+void AsyncConnection::cleanup() {
+  shutdown_socket();
+
+  delete read_handler;
+  delete write_handler;
+  delete write_callback_handler;
+  delete wakeup_handler;
+  delete tick_handler;
+
+  // deleting a null delay_state is a no-op
+  delete delay_state;
+  delay_state = nullptr;
+}
+
+// Timer callback: forget timer `id`, then run process().
+void AsyncConnection::wakeup_from(uint64_t id)
+{
+  {
+    // process() takes `lock` itself, so release it before calling.
+    std::lock_guard<std::mutex> guard(lock);
+    register_time_events.erase(id);
+  }
+  process();
+}
+/*
+ * Timer callback enforcing the connect and idle timeouts; `id` is unused.
+ *
+ * While not connected: faults the protocol once connect_timeout_us has
+ * passed since last_connect_started, otherwise re-arms the timer. While
+ * connected: faults the protocol once the time since last_active exceeds
+ * inactive_timeout_us, otherwise re-arms the timer.
+ */
+void AsyncConnection::tick(uint64_t id)
+{
+  auto now = ceph::coarse_mono_clock::now();
+  ldout(async_msgr->cct, 20) << __func__ << " last_id=" << last_tick_id
+                             << " last_active=" << last_active << dendl;
+  std::lock_guard<std::mutex> l(lock);
+  last_tick_id = 0; // this timer has fired; it is re-armed below unless we fault
+  if (!is_connected()) {
+    if (connect_timeout_us <=
+        (uint64_t)std::chrono::duration_cast<std::chrono::microseconds>
+          (now - last_connect_started).count()) {
+      ldout(async_msgr->cct, 1) << __func__ << " see no progress in more than "
+                                << connect_timeout_us
+                                << " us during connecting, fault."
+                                << dendl;
+      protocol->fault();
+    } else {
+      last_tick_id = center->create_time_event(connect_timeout_us, tick_handler);
+    }
+  } else {
+    auto idle_period = std::chrono::duration_cast<std::chrono::microseconds>
+      (now - last_active).count();
+    if (inactive_timeout_us < (uint64_t)idle_period) {
+      ldout(async_msgr->cct, 1) << __func__ << " idle (" << idle_period
+                                << ") for more than " << inactive_timeout_us
+                                << " us, fault."
+                                << dendl;
+      protocol->fault();
+    } else {
+      last_tick_id = center->create_time_event(inactive_timeout_us, tick_handler);
+    }
+  }
+}
diff --git a/src/msg/async/AsyncConnection.h b/src/msg/async/AsyncConnection.h
new file mode 100644
index 00000000..0c2512c8
--- /dev/null
+++ b/src/msg/async/AsyncConnection.h
@@ -0,0 +1,238 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_ASYNCCONNECTION_H
+#define CEPH_MSG_ASYNCCONNECTION_H
+
+#include <atomic>
+#include <pthread.h>
+#include <climits>
+#include <list>
+#include <mutex>
+#include <map>
+#include <functional>
+#include <optional>
+
+#include "auth/AuthSessionHandler.h"
+#include "common/ceph_time.h"
+#include "common/perf_counters.h"
+#include "include/buffer.h"
+#include "msg/Connection.h"
+#include "msg/Messenger.h"
+
+#include "Event.h"
+#include "Stack.h"
+
+class AsyncMessenger;
+class DispatchQueue;
+class Worker;
+class Protocol;
+
+static const int ASYNC_IOV_MAX = (IOV_MAX < 1024) ? IOV_MAX : (IOV_MAX / 4);  // a quarter of IOV_MAX once it reaches 1024
+
+/*
+ * AsyncConnection maintains a logical session between two endpoints. In other
+ * words, a pair of addresses maps to exactly one AsyncConnection. AsyncConnection
+ * handles network faults and read/write transactions. If a file
+ * descriptor breaks, AsyncConnection keeps the message queue and
+ * sequence, and tries to reconnect to the peer endpoint.
+ */
+class AsyncConnection : public Connection {
+  // --- private helpers (class-default access) ---
+  ssize_t read(unsigned len, char *buffer,
+               std::function<void(char *, ssize_t)> callback);
+  ssize_t read_until(unsigned needed, char *p);
+  ssize_t read_bulk(char *buf, unsigned len);
+
+  ssize_t write(bufferlist &bl, std::function<void(ssize_t)> callback,
+                bool more=false);
+  ssize_t _try_send(bool more=false);
+
+  void _connect();
+  void _stop();
+  void fault();
+  void inject_delay();
+
+  bool is_queued() const;
+  void shutdown_socket();
+
+   /**
+   * The DelayedDelivery is for injecting delays into Message delivery off
+   * the socket. It is only enabled if delays are requested, and if they
+   * are then it pulls Messages off the DelayQueue and puts them into the
+   * AsyncMessenger event queue.
+   */
+  class DelayedDelivery : public EventCallback {
+    std::set<uint64_t> register_time_events; // need to delete it if stop
+    std::deque<Message*> delay_queue;
+    std::mutex delay_lock;  // taken by queue() around delay_queue and register_time_events
+    AsyncMessenger *msgr;
+    EventCenter *center;
+    DispatchQueue *dispatch_queue;
+    uint64_t conn_id;
+    std::atomic_bool stop_dispatch;
+
+   public:
+    explicit DelayedDelivery(AsyncMessenger *omsgr, EventCenter *c,
+                             DispatchQueue *q, uint64_t cid)
+      : msgr(omsgr), center(c), dispatch_queue(q), conn_id(cid),
+        stop_dispatch(false) { }
+    ~DelayedDelivery() override {  // both containers must already be empty
+      ceph_assert(register_time_events.empty());
+      ceph_assert(delay_queue.empty());
+    }
+    void set_center(EventCenter *c) { center = c; }
+    void do_request(uint64_t id) override;
+    void queue(double delay_period, Message *m) {  // delay_period is scaled by 1e6 (presumably seconds -> us; confirm)
+      std::lock_guard<std::mutex> l(delay_lock);
+      delay_queue.push_back(m);
+      register_time_events.insert(center->create_time_event(delay_period*1000000, this));
+    }
+    void discard();
+    bool ready() const { return !stop_dispatch && delay_queue.empty() && register_time_events.empty(); }
+    void flush();
+  } *delay_state;  // may be null; cleanup() deletes it only when set
+
+ public:
+  AsyncConnection(CephContext *cct, AsyncMessenger *m, DispatchQueue *q,
+		  Worker *w, bool is_msgr2, bool local);
+  ~AsyncConnection() override;
+  void maybe_start_delay_thread();
+
+  ostream& _conn_prefix(std::ostream *_dout);
+
+  bool is_connected() override;
+
+  // Only call when AsyncConnection first construct
+  void connect(const entity_addrvec_t& addrs, int type, entity_addr_t& target);
+
+  // Only call when AsyncConnection first construct
+  void accept(ConnectedSocket socket,
+	      const entity_addr_t &listen_addr,
+	      const entity_addr_t &peer_addr);
+  int send_message(Message *m) override;
+
+  void send_keepalive() override;
+  void mark_down() override;
+  void mark_disposable() override {  // flips the policy to lossy under `lock`
+    std::lock_guard<std::mutex> l(lock);
+    policy.lossy = true;
+  }
+
+  entity_addr_t get_peer_socket_addr() const override {
+    return target_addr;
+  }
+
+  int get_con_mode() const override;
+
+ private:
+  enum {  // values double as indices into get_state_name()'s table
+    STATE_NONE,
+    STATE_CONNECTING,
+    STATE_CONNECTING_RE,
+    STATE_ACCEPTING,
+    STATE_CONNECTION_ESTABLISHED,
+    STATE_CLOSED
+  };
+
+  static const uint32_t TCP_PREFETCH_MIN_SIZE;
+  static const char *get_state_name(int state) {  // printable name; "STATE_UNKNOWN" when out of range
+      const char* const statenames[] = {"STATE_NONE",
+                                        "STATE_CONNECTING",
+                                        "STATE_CONNECTING_RE",
+                                        "STATE_ACCEPTING",
+                                        "STATE_CONNECTION_ESTABLISHED",
+                                        "STATE_CLOSED"};
+      return (state >= STATE_NONE && state <= STATE_CLOSED) ? statenames[state] : "STATE_UNKNOWN";
+  }
+
+  AsyncMessenger *async_msgr;
+  uint64_t conn_id;
+  PerfCounters *logger;
+  int state;  // one of the STATE_* values above
+  ConnectedSocket cs;
+  int port;
+  Messenger::Policy policy;
+
+  DispatchQueue *dispatch_queue;
+
+  // lockfree, only used in own thread
+  bufferlist outgoing_bl;
+  bool open_write = false;
+
+  std::mutex write_lock;
+
+  std::mutex lock;
+  EventCallbackRef read_handler;            // these five callbacks are deleted by cleanup()
+  EventCallbackRef write_handler;
+  EventCallbackRef write_callback_handler;
+  EventCallbackRef wakeup_handler;
+  EventCallbackRef tick_handler;
+  char *recv_buf;
+  uint32_t recv_max_prefetch;
+  uint32_t recv_start;
+  uint32_t recv_end;
+  set<uint64_t> register_time_events; // need to delete it if stop
+  ceph::coarse_mono_clock::time_point last_connect_started;
+  ceph::coarse_mono_clock::time_point last_active;
+  ceph::mono_clock::time_point recv_start_time;
+  uint64_t last_tick_id = 0;  // id of the pending tick timer; tick() zeroes it when it fires
+  const uint64_t connect_timeout_us;   // tick() faults a connection that stays unconnected this long
+  const uint64_t inactive_timeout_us;  // tick() faults a connected session idle for longer than this
+
+  // This section holds temporary variables used by state transitions
+
+  // Accepting state
+  bool msgr2 = false;
+  entity_addr_t socket_addr;  ///< local socket addr
+  entity_addr_t target_addr;  ///< which of the peer_addrs we're connecting to (as client) or should reconnect to (as peer)
+
+  entity_addr_t _infer_target_addr(const entity_addrvec_t& av);
+
+  // used only by "read_until"
+  uint64_t state_offset;
+  Worker *worker;
+  EventCenter *center;
+
+  std::unique_ptr<Protocol> protocol;
+
+  std::optional<std::function<void(ssize_t)>> writeCallback;
+  std::function<void(char *, ssize_t)> readCallback;
+  std::optional<unsigned> pendingReadLen;
+  char *read_buffer;
+
+ public:
+  // used by eventcallback
+  void handle_write();
+  void handle_write_callback();
+  void process();
+  void wakeup_from(uint64_t id);
+  void tick(uint64_t id);
+  void local_deliver();
+  void stop(bool queue_reset);
+  void cleanup();
+  PerfCounters *get_perf_counter() {
+    return logger;
+  }
+
+  friend class Protocol;
+  friend class ProtocolV1;
+  friend class ProtocolV2;
+}; /* AsyncConnection */
+
+using AsyncConnectionRef = boost::intrusive_ptr<AsyncConnection>;  // ref-counting handle to an AsyncConnection
+
+#endif
diff --git a/src/msg/async/AsyncMessenger.cc b/src/msg/async/AsyncMessenger.cc
new file mode 100644
index 00000000..2b1488c4
--- /dev/null
+++ b/src/msg/async/AsyncMessenger.cc
@@ -0,0 +1,949 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "acconfig.h"
+
+#include <iostream>
+#include <fstream>
+
+#include "AsyncMessenger.h"
+
+#include "common/config.h"
+#include "common/Timer.h"
+#include "common/errno.h"
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "common/EventTrace.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+static ostream& _prefix(std::ostream *_dout, AsyncMessenger *m) {
+  return (*_dout) << "-- " << m->get_myaddrs() << " ";  // log prefix carrying the messenger's addresses
+}
+
+static ostream& _prefix(std::ostream *_dout, Processor *p) {
+  return (*_dout) << " Processor -- ";  // `p` is unused; it only selects this overload
+}
+
+
+/*******************
+ * Processor
+ */
+
+// Event callback that forwards to Processor::accept().
+class Processor::C_processor_accept : public EventCallback {
+  Processor *owner;  // processor whose accept() is invoked
+ public:
+  explicit C_processor_accept(Processor *p): owner(p) {}
+  void do_request(uint64_t id) override {
+    owner->accept();  // the event id is not used
+  }
+};
+
+Processor::Processor(AsyncMessenger *r, Worker *w, CephContext *c)
+  : msgr(r), net(c), worker(w), listen_handler(new C_processor_accept(this))
+{}  // listen_handler is heap-allocated here, one per Processor
+
+int Processor::bind(const entity_addrvec_t &bind_addrs,
+		    const set<int>& avoid_ports,
+		    entity_addrvec_t* bound_addrs)
+{
+  const auto& conf = msgr->cct->_conf;
+  // Bind one listen socket per entry of bind_addrs.  *bound_addrs starts as a
+  // copy of bind_addrs and receives the port chosen for each entry whose
+  // port was 0.  Returns 0 on success, otherwise the last error in r.
+  ldout(msgr->cct, 10) << __func__ << " " << bind_addrs << dendl;
+  // socket options handed to every listen() call below
+  SocketOptions opts;
+  opts.nodelay = msgr->cct->_conf->ms_tcp_nodelay;
+  opts.rcbuf_size = msgr->cct->_conf->ms_tcp_rcvbuf;
+
+  listen_sockets.resize(bind_addrs.v.size());
+  *bound_addrs = bind_addrs;
+
+  for (unsigned k = 0; k < bind_addrs.v.size(); ++k) {
+    auto& listen_addr = bound_addrs->v[k];  // edited in place when a port is picked
+
+    /* bind to port */
+    int r = -1;  // stays -1 if no listen() attempt is ever made
+
+    for (int i = 0; i < conf->ms_bind_retry_count; i++) {
+      if (i > 0) {
+	lderr(msgr->cct) << __func__ << " was unable to bind. Trying again in "
+			 << conf->ms_bind_retry_delay << " seconds " << dendl;
+	sleep(conf->ms_bind_retry_delay);
+      }
+
+      if (listen_addr.get_port()) {  // caller asked for a specific port
+	// NOTE(review): r is read right after submit_to(), so the lambda must
+	// have run by then - presumably submit_to(..., false) waits; confirm.
+	worker->center.submit_to(
+	  worker->center.get_id(),
+	  [this, k, &listen_addr, &opts, &r]() {
+	    r = worker->listen(listen_addr, k, opts, &listen_sockets[k]);
+	  }, false);
+	if (r < 0) {
+	  lderr(msgr->cct) << __func__ << " unable to bind to " << listen_addr
+			   << ": " << cpp_strerror(r) << dendl;
+	  continue;  // next retry round
+	}
+      } else {
+	// try a range of ports
+	for (int port = msgr->cct->_conf->ms_bind_port_min;
+	     port <= msgr->cct->_conf->ms_bind_port_max;
+	     port++) {
+	  if (avoid_ports.count(port))
+	    continue;
+
+	  listen_addr.set_port(port);
+	  worker->center.submit_to(
+	    worker->center.get_id(),
+	    [this, k, &listen_addr, &opts, &r]() {
+	      r = worker->listen(listen_addr, k, opts, &listen_sockets[k]);
+	    }, false);
+	  if (r == 0)
+	    break;  // first port that listens wins
+	}
+	if (r < 0) {
+	  lderr(msgr->cct) << __func__ << " unable to bind to " << listen_addr
+			   << " on any port in range "
+			   << msgr->cct->_conf->ms_bind_port_min
+			   << "-" << msgr->cct->_conf->ms_bind_port_max << ": "
+			   << cpp_strerror(r) << dendl;
+	  listen_addr.set_port(0); // Clear port before retry, otherwise we shall fail again.
+	  continue;
+	}
+	ldout(msgr->cct, 10) << __func__ << " bound on random port "
+			     << listen_addr << dendl;
+      }
+      if (r == 0) {
+	break;  // bound; leave the retry loop
+      }
+    }
+
+    // It seems that binding completely failed, return with that exit status
+    if (r < 0) {
+      lderr(msgr->cct) << __func__ << " was unable to bind after "
+		       << conf->ms_bind_retry_count
+		       << " attempts: " << cpp_strerror(r) << dendl;
+      for (unsigned j = 0; j < k; ++j) {
+	// clean up previous bind
+	listen_sockets[j].abort_accept();
+      }
+      return r;
+    }
+  }
+
+  ldout(msgr->cct, 10) << __func__ << " bound to " << *bound_addrs << dendl;
+  return 0;
+}
+
+void Processor::start()
+{
+  ldout(msgr->cct, 1) << __func__ << dendl;
+
+  // on the worker's event center, watch each open listen socket for readability
+  worker->center.submit_to(worker->center.get_id(), [this]() {
+      for (auto& sock : listen_sockets) {
+	if (!sock)
+	  continue;  // this slot has no open socket
+	worker->center.create_file_event(sock.fd(), EVENT_READABLE, listen_handler);
+      }
+    }, false);
+}
+
+void Processor::accept()
+{
+  SocketOptions opts;  // options applied to every accepted socket
+  opts.nodelay = msgr->cct->_conf->ms_tcp_nodelay;
+  opts.rcbuf_size = msgr->cct->_conf->ms_tcp_rcvbuf;
+  opts.priority = msgr->get_socket_priority();
+  // drain each listen socket: keep accepting until it reports -EAGAIN
+  for (auto& listen_socket : listen_sockets) {
+    ldout(msgr->cct, 10) << __func__ << " listen_fd=" << listen_socket.fd()
+			 << dendl;
+    unsigned accept_error_num = 0;  // failures counted against ms_max_accept_failures; reset on success
+
+    while (true) {
+      entity_addr_t addr;
+      ConnectedSocket cli_socket;
+      Worker *w = worker;
+      // NOTE(review): only the else branch bumps w->references here, yet the
+      // failure path always decrements - presumably get_worker() takes the
+      // reference itself; confirm in NetworkStack.
+      if (!msgr->get_stack()->support_local_listen_table())
+	w = msgr->get_stack()->get_worker();
+      else
+	++w->references;
+      int r = listen_socket.accept(&cli_socket, opts, &addr, w);
+      if (r == 0) {
+	ldout(msgr->cct, 10) << __func__ << " accepted incoming on sd "
+			     << cli_socket.fd() << dendl;
+
+	msgr->add_accept(
+	  w, std::move(cli_socket),
+	  msgr->get_myaddrs().v[listen_socket.get_addr_slot()],
+	  addr);
+	accept_error_num = 0;
+	continue;
+      } else {
+	--w->references;  // nothing was accepted: drop the reference again
+	if (r == -EINTR) {
+	  continue;  // interrupted: retry immediately
+	} else if (r == -EAGAIN) {
+	  break;  // nothing more pending on this socket
+	} else if (r == -EMFILE || r == -ENFILE) {
+	  lderr(msgr->cct) << __func__ << " open file descriptions limit reached sd = " << listen_socket.fd()
+			   << " errno " << r << " " << cpp_strerror(r) << dendl;
+	  if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) {
+	    lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl;
+	    ceph_abort();
+	  }
+	  continue;
+	} else if (r == -ECONNABORTED) {
+	  ldout(msgr->cct, 0) << __func__ << " it was closed because of rst arrived sd = " << listen_socket.fd()
+			      << " errno " << r << " " << cpp_strerror(r) << dendl;
+	  continue;  // not counted as an accept failure
+	} else {
+	  lderr(msgr->cct) << __func__ << " no incoming connection?"
+			   << " errno " << r << " " << cpp_strerror(r) << dendl;
+	  if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) {
+	    lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl;
+	    ceph_abort();
+	  }
+	  continue;
+	}
+      }
+    }
+  }
+}
+
+void Processor::stop()
+{
+  ldout(msgr->cct,10) << __func__ << dendl;
+  // on the worker's event center: unregister and abort every open listen socket
+  worker->center.submit_to(worker->center.get_id(), [this]() {
+      for (auto& sock : listen_sockets) {
+	if (!sock)
+	  continue;  // nothing is open in this slot
+	worker->center.delete_file_event(sock.fd(), EVENT_READABLE);
+	sock.abort_accept();
+      }
+    }, false);
+}
+
+
+// Holder for a lazily created NetworkStack; stops the stack on destruction.
+struct StackSingleton {
+  CephContext *cct;
+  std::shared_ptr<NetworkStack> stack;  // null until ready() has been called
+  explicit StackSingleton(CephContext *c): cct(c) {}
+  void ready(std::string &type) {  // creates the stack on first use only
+    if (!stack)
+      stack = NetworkStack::create(cct, type);
+  }
+  ~StackSingleton() {
+    if (stack) stack->stop();  // guard: ready() may never have produced a stack
+  }
+};
+
+
+// Event callback that asks the messenger to reap its deleted connections.
+class C_handle_reap : public EventCallback {
+  AsyncMessenger *messenger;
+ public:
+  explicit C_handle_reap(AsyncMessenger *m): messenger(m) {}
+  void do_request(uint64_t id) override {
+    // the event id is ignored and reap_dead()'s count is discarded
+    messenger->reap_dead();
+  }
+};
+
+/*******************
+ * AsyncMessenger
+ */
+
+AsyncMessenger::AsyncMessenger(CephContext *cct, entity_name_t name,
+                               const std::string &type, string mname, uint64_t _nonce)
+  : SimplePolicyMessenger(cct, name,mname, _nonce),
+    dispatch_queue(cct, this, mname),
+    lock("AsyncMessenger::lock"),
+    nonce(_nonce), need_addr(true), did_bind(false),
+    global_seq(0), deleted_lock("AsyncMessenger::deleted_lock"),
+    cluster_protocol(0), stopped(true)
+{
+  // choose the transport from the messenger type string; posix is the fallback
+  const bool want_rdma = type.find("rdma") != std::string::npos;
+  const bool want_dpdk = !want_rdma && type.find("dpdk") != std::string::npos;
+  std::string transport_type = want_rdma ? "rdma" : (want_dpdk ? "dpdk" : "posix");
+
+  // the NetworkStack lives in a CephContext singleton keyed by transport type
+  StackSingleton *holder = &cct->lookup_or_create_singleton_object<StackSingleton>(
+    "AsyncMessenger::NetworkStack::" + transport_type, true, cct);
+  holder->ready(transport_type);
+  stack = holder->stack.get();
+  stack->start();
+  local_worker = stack->get_worker();
+  local_connection = new AsyncConnection(cct, this, &dispatch_queue,
+					 local_worker, true, true);
+  init_local_connection();
+  reap_handler = new C_handle_reap(this);
+  // one Processor per worker if the stack has local listen tables, else just one
+  const unsigned processor_num =
+    stack->support_local_listen_table() ? stack->get_num_worker() : 1;
+  for (unsigned i = 0; i < processor_num; ++i)
+    processors.push_back(new Processor(this, stack->get_worker(i), cct));
+}
+
+/**
+ * Destructor: frees the reap callback and every Processor. The messenger
+ * must no longer be bound at this point (did_bind is asserted false).
+ */
+AsyncMessenger::~AsyncMessenger()
+{
+  delete reap_handler;
+  ceph_assert(!did_bind); // never bound, or shutdown() already cleared the flag
+  for (auto *proc : processors)
+    delete proc;
+}
+
+void AsyncMessenger::ready()
+{
+  ldout(cct,10) << __func__ << " " << get_myaddrs() << dendl;
+
+  stack->ready();
+  // a bind that bindv() postponed (stack was not ready) is replayed now;
+  // failure of that replay is fatal
+  if (pending_bind && bindv(pending_bind_addrs) != 0) {
+    lderr(cct) << __func__ << " postponed bind failed" << dendl;
+    ceph_abort();
+  }
+
+  // with the lock held: start every processor, then the dispatch queue
+  Mutex::Locker guard(lock);
+  for (auto *proc : processors)
+    proc->start();
+  dispatch_queue.start();
+}
+
+int AsyncMessenger::shutdown()
+{
+  ldout(cct,10) << __func__ << " " << get_myaddrs() << dendl;
+  // stop the listeners first, then take every connection down
+  for (auto *proc : processors)
+    proc->stop();
+  mark_down_all();
+  // break ref cycles on the loopback connection
+  local_connection->set_priv(NULL);
+  local_connection->mark_down();
+  did_bind = false;
+  {  // under the lock: signal stop_cond and record the stopped state
+    Mutex::Locker guard(lock);
+    stop_cond.Signal();
+    stopped = true;
+  }
+  stack->drain();
+  return 0;
+}
+
+int AsyncMessenger::bind(const entity_addr_t &bind_addr)
+{
+  ldout(cct,10) << __func__ << " " << bind_addr << dendl;
+  // Legacy callers may pass a default-constructed entity_addr_t, whereas
+  // bindv() needs at least the type and the address family filled in.
+  // Patch a private copy in that case: legacy type, and IPv6 or IPv4
+  // according to the ms_bind_ipv6 option.
+  entity_addr_t effective = bind_addr;
+  const bool unspecified = (effective == entity_addr_t());
+  if (unspecified) {
+    effective.set_type(entity_addr_t::TYPE_LEGACY);
+    effective.set_family(cct->_conf->ms_bind_ipv6 ? AF_INET6 : AF_INET);
+  }
+
+  return bindv(entity_addrvec_t(effective));
+}
+
+int AsyncMessenger::bindv(const entity_addrvec_t &bind_addrs)
+{
+  {
+    // the state checks run under the lock; it is released before binding
+    Mutex::Locker guard(lock);
+
+    if (!pending_bind && started) {
+      ldout(cct,10) << __func__ << " already started" << dendl;
+      return -1;
+    }
+
+    ldout(cct,10) << __func__ << " " << bind_addrs << dendl;
+
+    if (!stack->is_ready()) {
+      ldout(cct, 10) << __func__ << " Network Stack is not ready for bind yet - postponed" << dendl;
+      // remember the request so that it can be replayed later
+      pending_bind_addrs = bind_addrs;
+      pending_bind = true;
+      return 0;
+    }
+  }
+
+  // bind to a socket
+  set<int> avoid_ports;
+  entity_addrvec_t bound_addrs;
+  unsigned i = 0;
+  for (auto *proc : processors) {
+    const int r = proc->bind(bind_addrs, avoid_ports, &bound_addrs);
+    if (r) {
+      // Note: this is related to the local tcp listen table problem.
+      // The posix (default kernel) backend shares one listen table in the
+      // kernel, so all threads use it naturally and only one thread needs
+      // to bind.  Other backends (like dpdk) keep a listen table per
+      // worker, so the tcp port has to be bound/listened on each worker.
+      // A failure on the first worker is therefore treated as an ordinary
+      // error (for example the port is in use) and returned to the caller,
+      // but a failure on a later worker after the first one succeeded is
+      // not expected, hence the assert.
+      ceph_assert(i == 0);
+      return r;
+    }
+    ++i;
+  }
+  _finish_bind(bind_addrs, bound_addrs);
+  return 0;
+}
+
+int AsyncMessenger::rebind(const set<int>& avoid_ports)
+{
+  ldout(cct,1) << __func__ << " rebind avoid " << avoid_ports << dendl;
+  ceph_assert(did_bind);
+
+  for (auto &&p : processors)
+    p->stop();
+  mark_down_all();
+
+  // adjust the nonce; we want our entity_addr_t to be truly unique.
+  nonce += 1000000;
+  ldout(cct, 10) << __func__ << " new nonce " << nonce
+		 << " and addr " << get_myaddrs() << dendl;
+
+  entity_addrvec_t bound_addrs;
+  entity_addrvec_t bind_addrs = get_myaddrs();
+  set<int> new_avoid(avoid_ports);
+  for (auto& a : bind_addrs.v) {
+    new_avoid.insert(a.get_port());
+    a.set_port(0);
+  }
+  ldout(cct, 10) << __func__ << " will try " << bind_addrs
+		 << " and avoid ports " << new_avoid << dendl;
+  unsigned i = 0;
+  for (auto &&p : processors) {
+    int r = p->bind(bind_addrs, avoid_ports, &bound_addrs);
+    if (r) {
+      ceph_assert(i == 0);
+      return r;
+    }
+    ++i;
+  }
+  _finish_bind(bind_addrs, bound_addrs);
+  for (auto &&p : processors) {
+    p->start();
+  }
+  return 0;
+}
+
+int AsyncMessenger::client_bind(const entity_addr_t &bind_addr)
+{
+  // nothing to do unless ms_bind_before_connect is enabled
+  if (!cct->_conf->ms_bind_before_connect)
+    return 0;
+
+  Mutex::Locker guard(lock);
+  if (did_bind)
+    return 0;  // already bound: leave the addresses alone
+  if (started) {
+    ldout(cct, 10) << __func__ << " already started" << dendl;
+    return -1;
+  }
+  ldout(cct, 10) << __func__ << " " << bind_addr << dendl;
+  set_myaddrs(entity_addrvec_t(bind_addr));  // record the address as ours
+  return 0;
+}
+
+void AsyncMessenger::_finish_bind(const entity_addrvec_t& bind_addrs,
+				  const entity_addrvec_t& listen_addrs)
+{
+  set_myaddrs(bind_addrs);
+  // every requested address with a non-blank IP is offered to learned_addr()
+  for (auto& requested : bind_addrs.v) {
+    if (requested.is_blank_ip())
+      continue;
+    learned_addr(requested);
+  }
+  // a first address without a port means the port was picked while
+  // binding, so adopt the addresses that were actually listened on
+  if (get_myaddrs().front().get_port() == 0)
+    set_myaddrs(listen_addrs);
+  // stamp the nonce on a copy of every address and publish the copy
+  entity_addrvec_t stamped = *my_addrs;
+  for (auto& addr : stamped.v)
+    addr.set_nonce(nonce);
+  set_myaddrs(stamped);
+  init_local_connection();
+
+  ldout(cct,1) << __func__ << " bind my_addrs is " << get_myaddrs() << dendl;
+  did_bind = true;
+}
+
+int AsyncMessenger::start()
+{
+  Mutex::Locker guard(lock);  // held until the function returns
+  ldout(cct,1) << __func__ << " start" << dendl;
+
+  // register at least one entity, first!
+  ceph_assert(my_name.type() >= 0);
+
+  // flip the lifecycle flags; starting twice trips the assert
+  ceph_assert(!started);
+  started = true;
+  stopped = false;
+
+  // without a bind, stamp the nonce on a copy of our addresses,
+  // publish the copy and re-initialise the local connection
+  if (!did_bind) {
+    entity_addrvec_t stamped = *my_addrs;
+    for (auto& addr : stamped.v)
+      addr.nonce = nonce;
+    set_myaddrs(stamped);
+    _init_local_connection();
+  }
+  return 0;
+}
+
+void AsyncMessenger::wait()
+{
+  // block (under the lock) until the messenger is marked stopped
+  {
+    Mutex::Locker guard(lock);
+    if (!started)
+      return;  // never started: nothing to wait for
+    if (!stopped)
+      stop_cond.Wait(lock);  // stop_cond is signalled by shutdown()
+  }
+
+  // the lock is no longer held from here on
+  dispatch_queue.shutdown();
+  if (dispatch_queue.is_started()) {
+    ldout(cct, 10) << __func__ << ": waiting for dispatch queue" << dendl;
+    dispatch_queue.wait();
+    dispatch_queue.discard_local();
+    ldout(cct, 10) << __func__ << ": dispatch queue is stopped" << dendl;
+  }
+
+  // close all connections
+  shutdown_connections(false);
+  stack->drain();
+
+  ldout(cct, 10) << __func__ << ": done." << dendl;
+  ldout(cct, 1) << __func__ << " complete." << dendl;
+  started = false;
+}
+
+void AsyncMessenger::add_accept(Worker *w, ConnectedSocket cli_socket,
+				const entity_addr_t &listen_addr,
+				const entity_addr_t &peer_addr)
+{
+  lock.Lock();
+  const bool use_msgr2 = listen_addr.is_msgr2();  // protocol follows the listen address
+  AsyncConnectionRef accepted = new AsyncConnection(cct, this, &dispatch_queue, w, use_msgr2, false);
+  accepted->accept(std::move(cli_socket), listen_addr, peer_addr);
+  accepting_conns.insert(accepted);  // tracked here until accept_conn() registers it
+  lock.Unlock();
+}
+
+AsyncConnectionRef AsyncMessenger::create_connect(
+  const entity_addrvec_t& addrs, int type)
+{
+  ceph_assert(lock.is_locked());
+
+  ldout(cct, 10) << __func__ << " " << addrs
+      << ", creating connection and registering" << dendl;
+
+  // Choose which of the peer's addresses to dial: the first one that is
+  // msgr2 or legacy wins; target stays default-constructed if none is.
+  // FIXME: for ipv4 vs ipv6, check whether local host can handle ipv6 before
+  // trying it?  for now, just pick whichever is listed first.
+  entity_addr_t target;
+  for (auto& candidate : addrs.v) {
+    if (candidate.is_msgr2() || candidate.is_legacy()) {
+      target = candidate;
+      break;
+    }
+  }
+
+  // create connection
+  Worker *chosen_worker = stack->get_worker();
+  AsyncConnectionRef created = new AsyncConnection(cct, this, &dispatch_queue, chosen_worker,
+						   target.is_msgr2(), false);
+  created->connect(addrs, type, target);
+  // register under the full address vector; it must not be present yet
+  ceph_assert(!conns.count(addrs));
+  ldout(cct, 10) << __func__ << " " << created << " " << addrs << " "
+		 << *created->peer_addrs << dendl;
+  conns[addrs] = created;
+  chosen_worker->get_perf_counter()->inc(l_msgr_active_connections);
+
+  return created;
+}
+
+
+ConnectionRef AsyncMessenger::get_loopback_connection()
+{ // hands out a reference to the messenger's own local connection
+  return ConnectionRef(local_connection);
+}
+
+bool AsyncMessenger::should_use_msgr2()
+{
+  // An unbound messenger may always use v2.  A messenger bound to v1 only
+  // must not dial a peer's v2 address: the peer would have to reach us over
+  // v1 while we reach it over v2, which breaks connection race detection
+  // (among other things), and the peer would see a v1-only advertised
+  // address speaking protocol v2.
+  if (!did_bind)
+    return true;
+  return get_myaddrs().has_msgr2();
+}
+
+entity_addrvec_t AsyncMessenger::_filter_addrs(int type,
+					       const entity_addrvec_t& addrs)
+{
+  // nothing to filter when we are allowed to speak msgr2
+  if (should_use_msgr2())
+    return addrs;
+
+  ldout(cct, 10) << __func__ << " " << addrs << " type " << type
+		 << " limiting to v1 ()" << dendl;
+  // keep everything except the msgr2 entries, preserving their order
+  entity_addrvec_t v1_only;
+  for (auto& addr : addrs.v) {
+    if (!addr.is_msgr2())
+      v1_only.v.push_back(addr);
+  }
+
+  return v1_only;
+}
+
+int AsyncMessenger::send_to(Message *m, int type, const entity_addrvec_t& addrs)
+{
+  Mutex::Locker guard(lock);
+  FUNCTRACE(cct);
+  ceph_assert(m);
+  // emit an event-trace record for OSD ops and their replies
+  const int msg_type = m->get_type();
+  if (msg_type == CEPH_MSG_OSD_OP)
+    OID_EVENT_TRACE(((MOSDOp *)m)->get_oid().name.c_str(), "SEND_MSG_OSD_OP");
+  else if (msg_type == CEPH_MSG_OSD_OPREPLY)
+    OID_EVENT_TRACE(((MOSDOpReply *)m)->get_oid().name.c_str(), "SEND_MSG_OSD_OP_REPLY");
+
+  ldout(cct, 1) << __func__ << "--> " << ceph_entity_type_name(type) << " "
+      << addrs << " -- " << *m << " -- ?+"
+      << m->get_data().length() << " " << m << dendl;
+  // a message without a destination is released and rejected
+  if (addrs.empty()) {
+    ldout(cct,0) << __func__ <<  " message " << *m
+        << " with empty dest " << addrs << dendl;
+    m->put();
+    return -EINVAL;
+  }
+
+  auto usable = _filter_addrs(type, addrs);
+  AsyncConnectionRef existing = _lookup_conn(usable);
+  submit_message(m, existing, usable, type);
+  return 0;
+}
+
+ConnectionRef AsyncMessenger::connect_to(int type, const entity_addrvec_t& addrs)
+{
+  Mutex::Locker guard(lock);
+  // a request for our own address set, or for a single address that is
+  // one of ours, is served by the local connection
+  const bool targets_self = (*my_addrs == addrs) ||
+    (addrs.v.size() == 1 && my_addrs->contains(addrs.front()));
+  if (targets_self)
+    return local_connection;
+
+  auto usable = _filter_addrs(type, addrs);
+
+  // reuse a registered connection when there is one, otherwise create it
+  AsyncConnectionRef conn = _lookup_conn(usable);
+  if (!conn) {
+    conn = create_connect(usable, type);
+    ldout(cct, 10) << __func__ << " " << usable << " new " << conn << dendl;
+  } else {
+    ldout(cct, 10) << __func__ << " " << usable << " existing " << conn << dendl;
+  }
+  return conn;
+}
+
+void AsyncMessenger::submit_message(Message *m, AsyncConnectionRef con,
+                                    const entity_addrvec_t& dest_addrs,
+				    int dest_type)
+{
+  if (cct->_conf->ms_dump_on_send) {  // debug aid: encode, hexdump, then clear the payload again
+    m->encode(-1, MSG_CRC_ALL);
+    ldout(cct, 0) << __func__ << " submit_message " << *m << "\n";
+    m->get_payload().hexdump(*_dout);
+    if (m->get_data().length() > 0) {
+      *_dout << " data:\n";
+      m->get_data().hexdump(*_dout);
+    }
+    *_dout << dendl;
+    m->clear_payload();
+  }
+
+  // case 1: the caller already found a connection
+  if (con) {
+    con->send_message(m);
+    return;
+  }
+
+  // case 2: the destination is this messenger itself
+  const bool to_self = (*my_addrs == dest_addrs) ||
+    (dest_addrs.v.size() == 1 && my_addrs->contains(dest_addrs.front()));
+  if (to_self) {
+    local_connection->send_message(m);
+    return;
+  }
+
+  // case 3: remote, no existing connection.
+  const Policy& policy = get_policy(dest_type);
+  if (!policy.server) {
+    ldout(cct,20) << __func__ << " " << *m << " remote, " << dest_addrs
+		  << ", new connection." << dendl;
+    con = create_connect(dest_addrs, dest_type);
+    con->send_message(m);
+    return;
+  }
+  // server policy and no session: the message is dropped
+  ldout(cct, 20) << __func__ << " " << *m << " remote, " << dest_addrs
+      << ", lossy server for target type "
+      << ceph_entity_type_name(dest_type) << ", no session, dropping." << dendl;
+  m->put();
+}
+
+/**
+ * Fill in the IP of each of our addresses that is still blank, using the
+ * first address of the same family from addrs. Returns true if any changed.
+ */
+bool AsyncMessenger::set_addr_unknowns(const entity_addrvec_t &addrs)
+{
+  ldout(cct,1) << __func__ << " " << addrs << dendl;
+  bool changed = false;
+  Mutex::Locker guard(lock);
+
+  entity_addrvec_t updated = *my_addrs;
+  for (auto& mine : updated.v) {
+    if (!mine.is_blank_ip())
+      continue;
+    // type, port and nonce of our entry survive the replacement
+    const int kept_type = mine.get_type();
+    const int kept_port = mine.get_port();
+    const uint32_t kept_nonce = mine.get_nonce();
+    for (auto& offered : addrs.v) {
+      if (mine.get_family() != offered.get_family())
+	continue;
+      ldout(cct,1) << __func__ << " assuming my addr " << mine
+		   << " matches provided addr " << offered << dendl;
+      mine = offered;
+      mine.set_nonce(kept_nonce);
+      mine.set_type(kept_type);
+      mine.set_port(kept_port);
+      changed = true;
+      break;
+    }
+  }
+  set_myaddrs(updated);
+  if (changed)
+    _init_local_connection();
+  ldout(cct,1) << __func__ << " now " << *my_addrs << dendl;
+  return changed;
+}
+
+void AsyncMessenger::set_addrs(const entity_addrvec_t &addrs)
+{
+  Mutex::Locker guard(lock);
+  // publish a copy of addrs carrying our nonce, then re-init the local connection
+  entity_addrvec_t stamped = addrs;
+  for (auto& addr : stamped.v)
+    addr.set_nonce(nonce);
+  set_myaddrs(stamped);
+  _init_local_connection();
+}
+
+void AsyncMessenger::shutdown_connections(bool queue_reset)
+{
+  ldout(cct,1) << __func__ << " " << dendl;
+  lock.Lock();
+  // connections that are still being accepted
+  for (AsyncConnectionRef accepting : accepting_conns) {
+    ldout(cct, 5) << __func__ << " accepting_conn " << accepting.get() << dendl;
+    accepting->stop(queue_reset);
+  }
+  accepting_conns.clear();
+
+  // registered connections: unregister first, then stop
+  while (!conns.empty()) {
+    auto first = conns.begin();
+    AsyncConnectionRef victim = first->second;
+    ldout(cct, 5) << __func__ << " mark down " << first->first << " " << victim << dendl;
+    conns.erase(first);
+    victim->get_perf_counter()->dec(l_msgr_active_connections);
+    victim->stop(queue_reset);
+  }
+  {
+    // entries of deleted_conns are only logged and erased
+    Mutex::Locker deleted_guard(deleted_lock);
+    while (!deleted_conns.empty()) {
+      auto first = deleted_conns.begin();
+      AsyncConnectionRef doomed = *first;
+      ldout(cct, 5) << __func__ << " delete " << doomed << dendl;
+      deleted_conns.erase(first);
+    }
+  }
+  lock.Unlock();
+}
+
+void AsyncMessenger::mark_down_addrs(const entity_addrvec_t& addrs)
+{
+  lock.Lock();
+  AsyncConnectionRef found = _lookup_conn(addrs);
+  if (!found) {
+    ldout(cct, 1) << __func__ << " " << addrs << " -- connection dne" << dendl;
+  } else {
+    ldout(cct, 1) << __func__ << " " << addrs << " -- " << found << dendl;
+    found->stop(true);  // argument is stop()'s queue_reset flag
+  }
+  lock.Unlock();
+}
+
+int AsyncMessenger::get_proto_version(int peer_type, bool connect) const
+{
+  const int my_type = my_name.type();
+
+  // a peer of our own entity type gets the internal cluster protocol
+  if (peer_type == my_type)
+    return cluster_protocol;
+
+  // public case: the deciding type is the peer's when we connect,
+  // our own otherwise
+  const int deciding_type = connect ? peer_type : my_type;
+
+  if (deciding_type == CEPH_ENTITY_TYPE_OSD) return CEPH_OSDC_PROTOCOL;
+  if (deciding_type == CEPH_ENTITY_TYPE_MDS) return CEPH_MDSC_PROTOCOL;
+  if (deciding_type == CEPH_ENTITY_TYPE_MON) return CEPH_MONC_PROTOCOL;
+
+  return 0;  // any other type
+}
+
+int AsyncMessenger::accept_conn(AsyncConnectionRef conn)
+{
+  Mutex::Locker guard(lock);
+  auto registered = conns.find(*conn->peer_addrs);
+  if (registered != conns.end()) {
+    AsyncConnectionRef existing = registered->second;
+    // Deletion is lazy (see "deleted_conns"): an entry already marked
+    // deleted is replaced; re-accepting the registered conn returns 0.
+    Mutex::Locker deleted_guard(deleted_lock);
+    const bool was_deleted = deleted_conns.erase(existing) > 0;
+    if (was_deleted) {
+      conns.erase(registered);
+    } else if (conn != existing) {
+      return -1;  // a different, live connection owns these addresses
+    }
+  }
+  ldout(cct, 10) << __func__ << " " << conn << " " << *conn->peer_addrs << dendl;
+  conns[*conn->peer_addrs] = conn;
+  conn->get_perf_counter()->inc(l_msgr_active_connections);
+  accepting_conns.erase(conn);
+  return 0;
+}
+
+
+bool AsyncMessenger::learned_addr(const entity_addr_t &peer_addr_for_me)
+{
+  // be careful here: multiple threads may block here, and readers of
+  // my_addr do NOT hold any lock.
+  // Returns true only for the call that actually updates my_addrs.
+  // this always goes from true -> false under the protection of the
+  // mutex.  if it is already false, we need not retake the mutex at
+  // all.
+  if (!need_addr)
+    return false;
+  std::lock_guard l(lock);
+  if (need_addr) {  // re-checked now that the lock is held
+    if (my_addrs->empty()) {
+      auto a = peer_addr_for_me;  // no addresses yet: adopt the peer's view of us
+      a.set_type(entity_addr_t::TYPE_ANY);
+      a.set_nonce(nonce);
+      if (!did_bind) {
+	a.set_port(0);
+      }
+      set_myaddrs(entity_addrvec_t(a));
+      ldout(cct,10) << __func__ << " had no addrs" << dendl;
+    } else {
+      // fix all addrs of the same family, regardless of type (msgr2 vs legacy)
+      entity_addrvec_t newaddrs = *my_addrs;
+      for (auto& a : newaddrs.v) {
+	if (a.is_blank_ip() &&
+	    a.get_family() == peer_addr_for_me.get_family()) {
+	  entity_addr_t t = peer_addr_for_me;
+	  if (!did_bind) {
+	    t.set_type(entity_addr_t::TYPE_ANY);
+	    t.set_port(0);
+	  } else {	  // bound: keep the type and port of our own entry
+	    t.set_type(a.get_type());
+	    t.set_port(a.get_port());
+	  }
+	  t.set_nonce(a.get_nonce());
+	  ldout(cct,10) << __func__ << " " << a << " -> " << t << dendl;
+	  a = t;
+	}
+      }
+      set_myaddrs(newaddrs);
+    }
+    ldout(cct, 1) << __func__ << " learned my addr " << *my_addrs
+		  << " (peer_addr_for_me " << peer_addr_for_me << ")" << dendl;
+    _init_local_connection();
+    need_addr = false;  // later calls return early without locking
+    return true;
+  }
+  return false;
+}
+
+int AsyncMessenger::reap_dead()
+{
+  ldout(cct, 1) << __func__ << " start" << dendl;
+  int reaped = 0;
+
+  // both locks are needed: lock first, then deleted_lock
+  Mutex::Locker guard(lock);
+  Mutex::Locker deleted_guard(deleted_lock);
+
+  for (; !deleted_conns.empty(); ++reaped) {
+    auto first = deleted_conns.begin();
+    AsyncConnectionRef dead = *first;
+    ldout(cct, 5) << __func__ << " delete " << dead << dendl;
+    // unregister only if the map entry still refers to this connection
+    auto registered = conns.find(*dead->peer_addrs);
+    if (registered != conns.end() && registered->second == dead)
+      conns.erase(registered);
+    accepting_conns.erase(dead);
+    deleted_conns.erase(first);
+  }
+  return reaped;
+}
diff --git a/src/msg/async/AsyncMessenger.h b/src/msg/async/AsyncMessenger.h
new file mode 100644
index 00000000..98bf9d52
--- /dev/null
+++ b/src/msg/async/AsyncMessenger.h
@@ -0,0 +1,426 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ASYNCMESSENGER_H
+#define CEPH_ASYNCMESSENGER_H
+
+#include <map>
+#include <mutex>
+
+#include "include/types.h"
+#include "include/xlist.h"
+#include "include/spinlock.h"
+#include "include/unordered_map.h"
+#include "include/unordered_set.h"
+
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+
+#include "msg/SimplePolicyMessenger.h"
+#include "msg/DispatchQueue.h"
+#include "AsyncConnection.h"
+#include "Event.h"
+
+#include "include/ceph_assert.h"
+
+class AsyncMessenger;
+
+/**
+ * Listening side of a bound Messenger: when the Messenger binds to a
+ * specific address, the Processor runs and listens for incoming connections.
+ */
+class Processor {
+  AsyncMessenger *msgr;
+  NetHandler net;
+  Worker *worker;
+  std::vector<ServerSocket> listen_sockets;
+  EventCallbackRef listen_handler;  // owned: released by the destructor
+
+  class C_processor_accept;
+
+ public:
+  Processor(AsyncMessenger *r, Worker *w, CephContext *c);
+  ~Processor() { delete listen_handler; }
+
+  void stop();
+  int bind(const entity_addrvec_t &bind_addrs,
+	   const std::set<int>& avoid_ports,
+	   entity_addrvec_t* bound_addrs);
+  void start();
+  void accept();
+};
+
+/*
+ * AsyncMessenger maintains a set of asynchronous connections. It may own a
+ * bind address, in which case the accepted connections are managed by the
+ * AsyncMessenger as well.
+ *
+ */
+
+class AsyncMessenger : public SimplePolicyMessenger {
+  // First we have the public Messenger interface implementation...
+public:
+  /**
+   * Initialize the AsyncMessenger!
+   *
+   * @param cct The CephContext to use
+   * @param name The name to assign ourselves
+   * @param _nonce A unique ID to use for this AsyncMessenger. It should not
+   * be a value that will be repeated if the daemon restarts.
+   */
+  AsyncMessenger(CephContext *cct, entity_name_t name, const std::string &type,
+                 string mname, uint64_t _nonce);
+
+  /**
+   * Destroy the AsyncMessenger. Pretty simple since all the work is done
+   * elsewhere.
+   */
+  ~AsyncMessenger() override;
+
+  /** @defgroup Accessors
+   * @{
+   */
+  bool set_addr_unknowns(const entity_addrvec_t &addr) override;
+  void set_addrs(const entity_addrvec_t &addrs) override;
+
+  int get_dispatch_queue_len() override {
+    return dispatch_queue.get_queue_len();
+  }
+
+  double get_dispatch_queue_max_age(utime_t now) override {
+    return dispatch_queue.get_max_age(now);
+  }
+  /** @} Accessors */
+
+  /**
+   * @defgroup Configuration functions
+   * @{
+   */
+  void set_cluster_protocol(int p) override {
+    ceph_assert(!started && !did_bind);
+    cluster_protocol = p;
+  }
+
+  int bind(const entity_addr_t& bind_addr) override;
+  int rebind(const set<int>& avoid_ports) override;
+  int client_bind(const entity_addr_t& bind_addr) override;
+
+  int bindv(const entity_addrvec_t& bind_addrs) override;
+
+  bool should_use_msgr2() override;
+
+  /** @} Configuration functions */
+
+  /**
+   * @defgroup Startup/Shutdown
+   * @{
+   */
+  int start() override;
+  void wait() override;
+  int shutdown() override;
+
+  /** @} // Startup/Shutdown */
+
+  /**
+   * @defgroup Messaging
+   * @{
+   */
+  int send_to(Message *m, int type, const entity_addrvec_t& addrs) override;
+
+  /** @} // Messaging */
+
+  /**
+   * @defgroup Connection Management
+   * @{
+   */
+  ConnectionRef connect_to(int type,
+			   const entity_addrvec_t& addrs) override;
+  ConnectionRef get_loopback_connection() override;
+  void mark_down(const entity_addr_t& addr) override {
+    mark_down_addrs(entity_addrvec_t(addr));
+  }
+  void mark_down_addrs(const entity_addrvec_t& addrs) override;
+  void mark_down_all() override {
+    shutdown_connections(true);
+  }
+  /** @} // Connection Management */
+
+  /**
+   * @defgroup Inner classes
+   * @{
+   */
+
+  /**
+   * @} // Inner classes
+   */
+
+protected:
+  /**
+   * @defgroup Messenger Interfaces
+   * @{
+   */
+  /**
+   * Start up the DispatchQueue thread once we have somebody to dispatch to.
+   */
+  void ready() override;
+  /** @} // Messenger Interfaces */
+
+private:
+
+  /**
+   * @defgroup Utility functions
+   * @{
+   */
+
+  /**
+   * Create a connection associated with the given entity (of the given type).
+   * Initiate the connection. (This function returning does not guarantee
+   * connection success.)
+   *
+   * @param addrs The address(es) of the entity to connect to.
+   * @param type The peer type of the entity at the address.
+   *
+   * @return a pointer to the newly-created connection. Caller does not own a
+   * reference; take one if you need it.
+   */
+  AsyncConnectionRef create_connect(const entity_addrvec_t& addrs, int type);
+
+  /**
+   * Queue up a Message for delivery to the entity specified
+   * by dest_addrs and dest_type.
+   * submit_message() is responsible for creating
+   * new AsyncConnection (and closing old ones) as necessary.
+   *
+   * @param m The Message to queue up. This function eats a reference.
+   * @param con The existing Connection to use, or NULL if you don't know of one.
+   * @param dest_addrs The address(es) to send the Message to.
+   * @param dest_type The peer type of the address we're sending to
+   * NOTE(review): presumably the message is dropped silently on failure -- confirm.
+   */
+  void submit_message(Message *m, AsyncConnectionRef con,
+                      const entity_addrvec_t& dest_addrs, int dest_type);
+
+  void _finish_bind(const entity_addrvec_t& bind_addrs,
+		    const entity_addrvec_t& listen_addrs);
+
+  entity_addrvec_t _filter_addrs(int type,
+				 const entity_addrvec_t& addrs);
+
+ private:
+  static const uint64_t ReapDeadConnectionThreshold = 5;
+
+  NetworkStack *stack;
+  std::vector<Processor*> processors;
+  friend class Processor;
+  DispatchQueue dispatch_queue;
+
+  // the worker that runs the messenger's cron jobs
+  Worker *local_worker;
+
+  std::string ms_type;
+
+  /// overall lock used for AsyncMessenger data structures
+  Mutex lock;
+  // AsyncMessenger stuff
+  /// approximately unique ID set by the Constructor for use in entity_addr_t
+  uint64_t nonce;
+
+  /// true, specifying we haven't learned our addr; set false when we find it.
+  // maybe this should be protected by the lock?
+  bool need_addr;
+
+  /**
+   * set to bind addresses if bind was called before NetworkStack was ready to
+   * bind
+   */
+  entity_addrvec_t pending_bind_addrs;
+
+  /**
+   * false; set to true if a pending bind exists
+   */
+  bool pending_bind = false;
+
+  /**
+   *  The following aren't lock-protected since you shouldn't be able to race
+   *  the only writers.
+   */
+
+  /**
+   *  false; set to true once the AsyncMessenger is bound to a specific address.
+   *  NOTE(review): said to be reset by "Accepter::stop()" -- likely stale, confirm.
+   */
+  bool did_bind;
+  /// counter for the global seq our connection protocol uses
+  __u32 global_seq;
+  /// lock to protect the global_seq
+  ceph::spinlock global_seq_lock;
+
+  /**
+   * hash map of addresses to AsyncConnection
+   *
+   * NOTE: an AsyncConnection with state CLOSED may still be in the map but is considered
+   * invalid and can be replaced by anyone holding the msgr lock
+   */
+  ceph::unordered_map<entity_addrvec_t, AsyncConnectionRef> conns;
+
+  /**
+   * set of connections that are in the process of accepting
+   *
+   * These are not yet in the conns map.
+   */
+  set<AsyncConnectionRef> accepting_conns;
+
+  /**
+   * set of closed connections that still need to be cleaned up
+   *
+   * AsyncMessenger and AsyncConnection follow a lock-ordering rule:
+   * AsyncMessenger::lock may be taken before AsyncConnection::lock, but
+   * never the other way around. The rule exists to avoid deadlock.
+   * So when an AsyncConnection wants to unregister itself from the
+   * AsyncMessenger, it just queues itself in this set and is deleted
+   * lazily. "_lookup_conn" must never return an AsyncConnection that is
+   * in this set.
+   */
+  Mutex deleted_lock;
+  set<AsyncConnectionRef> deleted_conns;
+
+  EventCallbackRef reap_handler;
+
+  /// internal cluster protocol version, if any, for talking to entities of the same type.
+  int cluster_protocol;
+
+  Cond  stop_cond;
+  bool stopped;
+
+  AsyncConnectionRef _lookup_conn(const entity_addrvec_t& k) {
+    ceph_assert(lock.is_locked());
+    auto p = conns.find(k);
+    if (p == conns.end())
+      return NULL;
+
+    // lazy delete, see "deleted_conns"
+    Mutex::Locker l(deleted_lock);
+    if (deleted_conns.erase(p->second)) {
+      conns.erase(p);
+      return NULL;
+    }
+
+    return p->second;
+  }
+
+  void _init_local_connection() {
+    ceph_assert(lock.is_locked());
+    local_connection->peer_addrs = *my_addrs;
+    local_connection->peer_type = my_name.type();
+    local_connection->set_features(CEPH_FEATURES_ALL);
+    ms_deliver_handle_fast_connect(local_connection.get());
+  }
+
+  void shutdown_connections(bool queue_reset);
+
+public:
+
+  /// con used for sending messages to ourselves
+  AsyncConnectionRef local_connection;
+
+  /**
+   * @defgroup AsyncMessenger internals
+   * @{
+   */
+  /**
+   * Takes the messenger lock, then calls _lookup_conn.
+   */
+  AsyncConnectionRef lookup_conn(const entity_addrvec_t& k) {
+    Mutex::Locker l(lock);
+    return _lookup_conn(k);
+  }
+
+  int accept_conn(AsyncConnectionRef conn);
+  bool learned_addr(const entity_addr_t &peer_addr_for_me);
+  void add_accept(Worker *w, ConnectedSocket cli_socket,
+		  const entity_addr_t &listen_addr,
+		  const entity_addr_t &peer_addr);
+  NetworkStack *get_stack() {
+    return stack;
+  }
+
+  uint64_t get_nonce() const {
+    return nonce;
+  }
+
+  /**
+   * Increment the global sequence for this AsyncMessenger and return it.
+   * This is for the connect protocol, although it doesn't hurt if somebody
+   * else calls it.
+   *
+   * @return a global sequence ID that nobody else has seen.
+   */
+  __u32 get_global_seq(__u32 old=0) {
+    std::lock_guard<ceph::spinlock> lg(global_seq_lock);
+
+    if (old > global_seq)
+      global_seq = old;
+    __u32 ret = ++global_seq;
+
+    return ret;
+  }
+  /**
+   * Get the protocol version we support for the given peer type: either
+   * a peer protocol (if it matches our own), the protocol version for the
+   * peer (if we're connecting), or our protocol version (if we're accepting).
+   */
+  int get_proto_version(int peer_type, bool connect) const;
+
+  /**
+   * Fill in the address and peer type for the local connection, which
+   * is used for delivering messages back to ourself.
+   */
+  void init_local_connection() {
+    Mutex::Locker l(lock);
+    _init_local_connection();
+  }
+
+  /**
+   * Unregister connection from `conns`
+   *
+   * See "deleted_conns"
+   */
+  void unregister_conn(AsyncConnectionRef conn) {
+    Mutex::Locker l(deleted_lock);
+    conn->get_perf_counter()->dec(l_msgr_active_connections);
+    deleted_conns.emplace(std::move(conn));
+
+    if (deleted_conns.size() >= ReapDeadConnectionThreshold) {
+      local_worker->center.dispatch_event_external(reap_handler);
+    }
+  }
+
+  /**
+   * Reap dead connection from `deleted_conns`
+   *
+   * @return the number of dead connections
+   *
+   * See "deleted_conns"
+   */
+  int reap_dead();
+
+  /**
+   * @} // AsyncMessenger Internals
+   */
+} ;
+
+#endif /* CEPH_ASYNCMESSENGER_H */
diff --git a/src/msg/async/Event.cc b/src/msg/async/Event.cc
new file mode 100644
index 00000000..6b5e4c7c
--- /dev/null
+++ b/src/msg/async/Event.cc
@@ -0,0 +1,471 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "common/errno.h"
+#include "Event.h"
+
+#ifdef HAVE_DPDK
+#include "dpdk/EventDPDK.h"
+#endif
+
+#ifdef HAVE_EPOLL
+#include "EventEpoll.h"
+#else
+#ifdef HAVE_KQUEUE
+#include "EventKqueue.h"
+#else
+#include "EventSelect.h"
+#endif
+#endif
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "EventCallback "
+class C_handle_notify : public EventCallback {
+  EventCenter *center;
+  CephContext *cct;
+
+ public:
+  C_handle_notify(EventCenter *c, CephContext *cc): center(c), cct(cc) {}
+  // Drain the notify pipe: keep reading until read() returns 0 or fails.
+  void do_request(uint64_t fd_or_id) override {
+    char scratch[256];
+    for (;;) {
+      const int got = read(fd_or_id, scratch, sizeof(scratch));
+      if (got < 0 && errno != EAGAIN)
+        ldout(cct, 1) << __func__ << " read notify pipe failed: " << cpp_strerror(errno) << dendl;
+      if (got <= 0)
+        break;
+    }
+  }
+};
+
+#undef dout_prefix
+#define dout_prefix _event_prefix(_dout)
+
+/**
+ * Construct a Poller and append it to the owning center's poller list.
+ *
+ * \param center
+ *      EventCenter object through which the poller will be invoked; the
+ *      new poller registers itself in center->pollers.
+ * \param name
+ *      Human readable name that can be printed out in debugging messages
+ *      about the poller. The name of the superclass is probably sufficient
+ *      for most cases.
+ */
+EventCenter::Poller::Poller(EventCenter* center, const string& name)
+    : owner(center), poller_name(name), slot(center->pollers.size())
+{
+  center->pollers.push_back(this);
+}
+
+/**
+ * Destroy a Poller, unlinking it from its owner's poller list.
+ */
+EventCenter::Poller::~Poller()
+{
+  // Overwrite-with-last removal: the poller currently at the back of the
+  // vector takes over this poller's slot, then the back is popped.
+  //
+  // This stays reentrant: a poller may be deleted from inside a poller
+  // callback while poll is scanning the list; at worst the poller that
+  // got moved is not invoked during the current scan.
+  auto& registered = owner->pollers;
+  Poller* const tail = registered.back();
+  registered[slot] = tail;
+  tail->slot = slot;
+  registered.pop_back();
+  slot = -1;
+}
+
+ostream& EventCenter::_event_prefix(std::ostream *_dout)
+{
+  *_dout << "Event(" << this << " nevent=" << nevent;  // log-line prefix
+  return *_dout << " time_id=" << time_event_next_id << ").";
+}
+
+// Choose the event driver, size the fd table for n descriptors and, if the
+// driver asks for wakeups, create the notify pipe.  Negative on failure.
+int EventCenter::init(int n, unsigned i, const std::string &t)
+{
+  // can't init multi times
+  ceph_assert(nevent == 0);
+
+  type = t;
+  idx = i;
+
+  const bool use_dpdk = (t == "dpdk");
+#ifdef HAVE_DPDK
+  if (use_dpdk)
+    driver = new DPDKDriver(cct);
+#endif
+  // without DPDK support a "dpdk" request leaves driver unset (rejected below)
+  if (!use_dpdk) {
+#if defined(HAVE_EPOLL)
+    driver = new EpollDriver(cct);
+#elif defined(HAVE_KQUEUE)
+    driver = new KqueueDriver(cct);
+#else
+    driver = new SelectDriver(cct);
+#endif
+  }
+
+  if (!driver) {
+    lderr(cct) << __func__ << " failed to create event driver " << dendl;
+    return -1;
+  }
+
+  int r = driver->init(this, n);
+  if (r < 0) {
+    lderr(cct) << __func__ << " failed to init event driver." << dendl;
+    return r;
+  }
+
+  file_events.resize(n);
+  nevent = n;
+
+  // no notify pipe unless the driver asks for wakeups
+  if (!driver->need_wakeup())
+    return 0;
+
+  int pipe_fds[2];
+  if (pipe_cloexec(pipe_fds) < 0) {
+    int e = errno;
+    lderr(cct) << __func__ << " can't create notify pipe: " << cpp_strerror(e) << dendl;
+    return -e;
+  }
+
+  notify_receive_fd = pipe_fds[0];
+  notify_send_fd = pipe_fds[1];
+
+  // both ends are made non-blocking; the first failure is returned as-is
+  r = net.set_nonblock(notify_receive_fd);
+  if (r < 0)
+    return r;
+
+  return net.set_nonblock(notify_send_fd);
+}
+
+// Run any still-queued external events, then release the notify pipe,
+// the driver and the notify handler.
+EventCenter::~EventCenter()
+{
+  {
+    std::lock_guard<std::mutex> l(external_lock);
+    for (; !external_events.empty(); external_events.pop_front()) {
+      EventCallbackRef pending = external_events.front();
+      if (pending != nullptr)
+        pending->do_request(0);
+    }
+  }
+  time_events.clear();
+  //assert(time_events.empty());
+
+  // close whichever ends of the notify pipe were opened (-1 == never opened)
+  if (notify_receive_fd >= 0)
+    ::close(notify_receive_fd);
+  if (notify_send_fd >= 0)
+    ::close(notify_send_fd);
+  delete driver;
+  delete notify_handler;  // deleting a null pointer is a no-op
+}
+
+
+void EventCenter::set_owner()
+{
+  // the calling thread becomes the one in_thread() recognises as owner
+  owner = pthread_self();
+  ldout(cct, 2) << __func__ << " idx=" << idx << " owner=" << owner << dendl;
+  if (global_centers) return;  // the shared table was already set up
+  global_centers = &cct->lookup_or_create_singleton_object<
+    EventCenter::AssociatedCenters>(
+	"AsyncMessenger::EventCenter::global_center::" + type, true);
+  ceph_assert(global_centers);
+  global_centers->centers[idx] = this;
+  if (!driver->need_wakeup())
+    return;
+  notify_handler = new C_handle_notify(this, cct);
+  const int rc = create_file_event(notify_receive_fd, EVENT_READABLE, notify_handler);
+  ceph_assert(rc == 0);
+}
+
+// Register ctxt for the mask bits on fd, growing the event table when fd
+// does not fit yet.  Must run on the owning thread.
+int EventCenter::create_file_event(int fd, int mask, EventCallbackRef ctxt)
+{
+  ceph_assert(in_thread());
+  if (fd >= nevent) {
+    // grow by factors of four until fd fits
+    int wanted = nevent << 2;
+    while (fd >= wanted)
+      wanted <<= 2;
+    ldout(cct, 20) << __func__ << " event count exceed " << nevent << ", expand to " << wanted << dendl;
+    if (driver->resize_events(wanted) < 0) {
+      lderr(cct) << __func__ << " event count is exceed." << dendl;
+      return -ERANGE;
+    }
+    file_events.resize(wanted);
+    nevent = wanted;
+  }
+
+  EventCenter::FileEvent *entry = _get_file_event(fd);
+  ldout(cct, 20) << __func__ << " create event started fd=" << fd << " mask=" << mask
+                 << " original mask is " << entry->mask << dendl;
+  if (entry->mask == mask)
+    return 0;
+
+  const int rc = driver->add_event(fd, entry->mask, mask);
+  if (rc < 0) {
+    // Actually we don't allow any failed error code, caller doesn't prepare to
+    // handle error status. So now we need to assert failure here. In practice,
+    // add_event shouldn't report error, otherwise it must be a innermost bug!
+    lderr(cct) << __func__ << " add event failed, ret=" << rc << " fd=" << fd
+               << " mask=" << mask << " original mask is " << entry->mask << dendl;
+    ceph_abort_msg("BUG!");
+    return rc;
+  }
+
+  entry->mask |= mask;
+  if (mask & EVENT_READABLE)
+    entry->read_cb = ctxt;
+  if (mask & EVENT_WRITABLE)
+    entry->write_cb = ctxt;
+
+  ldout(cct, 20) << __func__ << " create event end fd=" << fd << " mask=" << mask
+                 << " original mask is " << entry->mask << dendl;
+  return 0;
+}
+
+// Stop watching the mask bits on fd and forget the matching callbacks.
+// Must run on the owning thread; an fd beyond the table is only logged.
+void EventCenter::delete_file_event(int fd, int mask)
+{
+  ceph_assert(in_thread() && fd >= 0);
+  if (fd >= nevent) {
+    ldout(cct, 1) << __func__ << " delete event fd=" << fd << " is equal or greater than nevent=" << nevent
+                  << " mask=" << mask << dendl;
+    return ;
+  }
+  EventCenter::FileEvent *event = _get_file_event(fd);
+  ldout(cct, 30) << __func__ << " delete event started fd=" << fd << " mask=" << mask
+                 << " original mask is " << event->mask << dendl;
+  if (!event->mask)
+    return ;
+
+  int r = driver->del_event(fd, event->mask, mask);
+  if (r < 0) {
+    // see create_file_event
+    ceph_abort_msg("BUG!");
+  }
+
+  if ((mask & EVENT_READABLE) && event->read_cb)
+    event->read_cb = nullptr;
+  if ((mask & EVENT_WRITABLE) && event->write_cb)
+    event->write_cb = nullptr;
+
+  event->mask = event->mask & (~mask);
+  ldout(cct, 30) << __func__ << " delete event end fd=" << fd << " mask=" << mask
+                 << " original mask is " << event->mask << dendl;
+}
+
+// Schedule ctxt to run `microseconds` from now; returns the event id that
+// delete_time_event() accepts.  Must run on the owning thread.
+uint64_t EventCenter::create_time_event(uint64_t microseconds, EventCallbackRef ctxt)
+{
+  ceph_assert(in_thread());
+  const uint64_t id = time_event_next_id++;
+
+  ldout(cct, 30) << __func__ << " id=" << id << " trigger after " << microseconds << "us"<< dendl;
+  const clock_type::time_point deadline = clock_type::now() + std::chrono::microseconds(microseconds);
+  EventCenter::TimeEvent entry;
+  entry.id = id;
+  entry.time_cb = ctxt;
+  // remember the multimap position so delete_time_event can erase by id
+  event_map[id] = time_events.emplace(deadline, entry);
+  return id;
+}
+
+// Cancel a pending time event; ids that were never issued or are no longer
+// pending are ignored.  Must run on the owning thread.
+void EventCenter::delete_time_event(uint64_t id)
+{
+  ceph_assert(in_thread());
+  ldout(cct, 30) << __func__ << " id=" << id << dendl;
+  if (id == 0 || id >= time_event_next_id)
+    return ;
+  auto pos = event_map.find(id);
+  if (pos != event_map.end()) {
+    time_events.erase(pos->second);
+    event_map.erase(pos);
+    return ;
+  }
+  ldout(cct, 10) << __func__ << " id=" << id << " not found" << dendl;
+}
+
+// Write one byte to the notify pipe to wake up "event_wait".
+void EventCenter::wakeup()
+{
+  // No need to wake up since we never sleep
+  const bool never_sleeps = !pollers.empty() || !driver->need_wakeup();
+  if (never_sleeps)
+    return ;
+
+  ldout(cct, 20) << __func__ << dendl;
+  const char token = 'c';
+  const int written = write(notify_send_fd, &token, sizeof(token));
+  // EAGAIN is tolerated; any other write error is fatal
+  if (written >= 0 || errno == EAGAIN)
+    return ;
+  ldout(cct, 1) << __func__ << " write notify pipe failed: " << cpp_strerror(errno) << dendl;
+  ceph_abort();
+}
+
+// Fire every time event whose deadline has passed (relative to one clock
+// reading taken on entry); returns the number of callbacks run.
+int EventCenter::process_time_events()
+{
+  const clock_type::time_point now = clock_type::now();
+  ldout(cct, 30) << __func__ << " cur time is " << now << dendl;
+
+  int fired = 0;
+  while (!time_events.empty()) {
+    auto earliest = time_events.begin();
+    if (now < earliest->first)
+      break;
+    // copy the id and callback out first: the entry is unlinked from both
+    // containers before its callback runs
+    const uint64_t id = earliest->second.id;
+    EventCallbackRef cb = earliest->second.time_cb;
+    time_events.erase(earliest);
+    event_map.erase(id);
+    ldout(cct, 30) << __func__ << " process time event: id=" << id << dendl;
+    ++fired;
+    cb->do_request(id);
+  }
+  return fired;
+}
+
+// Wait for fd events (without sleeping while pollers or external events are
+// pending), then run the fd, time, external-event and poller callbacks.
+int EventCenter::process_events(unsigned timeout_microseconds,  ceph::timespan *working_dur)
+{
+  struct timeval tv;
+  int numevents;
+  bool trigger_time = false;
+  auto now = clock_type::now();
+
+  auto it = time_events.begin();
+  bool blocking = pollers.empty() && !external_num_events.load();
+  // If exists external events or poller, don't block
+  if (!blocking) {
+    if (it != time_events.end() && now >= it->first)
+      trigger_time = true;
+    tv.tv_sec = 0;
+    tv.tv_usec = 0;
+  } else {
+    clock_type::time_point shortest = now + std::chrono::microseconds(timeout_microseconds);
+    if (it != time_events.end() && shortest >= it->first) {
+      ldout(cct, 30) << __func__ << " shortest is " << shortest << " it->first is " << it->first << dendl;
+      shortest = it->first;
+      trigger_time = true;
+      if (shortest > now) {
+        timeout_microseconds = std::chrono::duration_cast<std::chrono::microseconds>(
+            shortest - now).count();
+      } else {
+        timeout_microseconds = 0;
+      }
+    }
+    tv.tv_sec = timeout_microseconds / 1000000;
+    tv.tv_usec = timeout_microseconds % 1000000;
+  }
+
+  ldout(cct, 30) << __func__ << " wait second " << tv.tv_sec << " usec " << tv.tv_usec << dendl;
+  vector<FiredFileEvent> fired_events;
+  numevents = driver->event_wait(fired_events, &tv);
+  auto working_start = ceph::mono_clock::now();
+  for (int j = 0; j < numevents; j++) {
+    int rfired = 0;
+    EventCallbackRef cb;
+    FileEvent *event = _get_file_event(fired_events[j].fd);
+
+    /* note the event->mask & mask & ... code: maybe an already processed
+    * event removed an element that fired and we still didn't
+    * processed, so we check if the event is still valid. */
+    if (event->mask & fired_events[j].mask & EVENT_READABLE) {
+      rfired = 1;
+      cb = event->read_cb;
+      cb->do_request(fired_events[j].fd);
+      // create_file_event() may have resized file_events: look the slot up again
+      event = _get_file_event(fired_events[j].fd);
+    }
+
+    if (event->mask & fired_events[j].mask & EVENT_WRITABLE) {
+      if (!rfired || event->read_cb != event->write_cb) {
+        cb = event->write_cb;
+        cb->do_request(fired_events[j].fd);
+      }
+    }
+
+    ldout(cct, 30) << __func__ << " event_wq process is " << fired_events[j].fd << " mask is " << fired_events[j].mask << dendl;
+  }
+
+  if (trigger_time)
+    numevents += process_time_events();
+
+  if (external_num_events.load()) {
+    external_lock.lock();
+    deque<EventCallbackRef> cur_process;
+    cur_process.swap(external_events);
+    external_num_events.store(0);
+    external_lock.unlock();
+    numevents += cur_process.size();
+    while (!cur_process.empty()) {
+      EventCallbackRef e = cur_process.front();
+      ldout(cct, 30) << __func__ << " do " << e << dendl;
+      e->do_request(0);
+      cur_process.pop_front();
+    }
+  }
+
+  if (!numevents && !blocking) {
+    for (uint32_t i = 0; i < pollers.size(); i++)
+      numevents += pollers[i]->poll();
+  }
+
+  if (working_dur)
+    *working_dur = ceph::mono_clock::now() - working_start;
+  return numevents;
+}
+
+// Queue e as an external event; a repeat of the newest queued callback is
+// dropped.  A caller on another thread wakes the center for the first entry.
+void EventCenter::dispatch_event_external(EventCallbackRef e)
+{
+  uint64_t pending = 0;
+  {
+    std::lock_guard lock{external_lock};
+    if (external_num_events > 0 && external_events.back() == e)
+      return;
+    external_events.push_back(e);
+    pending = ++external_num_events;
+  }
+  if (pending == 1 && !in_thread())
+    wakeup();
+  ldout(cct, 30) << __func__ << " " << e << " pending " << pending << dendl;
+}
diff --git a/src/msg/async/Event.h b/src/msg/async/Event.h
new file mode 100644
index 00000000..6736060e
--- /dev/null
+++ b/src/msg/async/Event.h
@@ -0,0 +1,266 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_EVENT_H
+#define CEPH_MSG_EVENT_H
+
+#ifdef __APPLE__
+#include <AvailabilityMacros.h>
+#endif
+
+// We use epoll, kqueue, evport, select in descending order by performance.
+#if defined(__linux__)
+#define HAVE_EPOLL 1
+#endif
+
+#if (defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined (__NetBSD__)
+#define HAVE_KQUEUE 1
+#endif
+
+#ifdef __sun
+#include <sys/feature_tests.h>
+#ifdef _DTRACE_VERSION
+#define HAVE_EVPORT 1
+#endif
+#endif
+
+#include <atomic>
+#include <mutex>
+#include <condition_variable>
+
+#include "common/ceph_time.h"
+#include "common/dout.h"
+#include "net_handler.h"
+
+#define EVENT_NONE 0
+#define EVENT_READABLE 1
+#define EVENT_WRITABLE 2
+
+class EventCenter;
+
+// Callback interface: do_request() receives the fd or event id concerned.
+class EventCallback {
+ public:
+  virtual void do_request(uint64_t fd_or_id) = 0;
+  // deleted through base pointers (EventCallbackRef), hence virtual
+  virtual ~EventCallback() = default;
+};
+using EventCallbackRef = EventCallback*;
+
+struct FiredFileEvent {
+  int fd;    // descriptor the driver reported
+  int mask;  // EVENT_READABLE / EVENT_WRITABLE bits that fired for fd
+};
+
+/*
+ * EventDriver wraps the event mechanism offered by each OS.
+ * For example, Linux will use epoll(2), BSD will use kqueue(2) and select
+ * is the fallback when neither is available.
+ */
+class EventDriver {
+ public:
+  virtual ~EventDriver() = default;  // drivers are deleted via EventDriver*
+  virtual int init(EventCenter *center, int nevent) = 0;
+  virtual int add_event(int fd, int cur_mask, int mask) = 0;
+  virtual int del_event(int fd, int cur_mask, int del_mask) = 0;
+  virtual int event_wait(std::vector<FiredFileEvent> &fired_events, struct timeval *tp) = 0;
+  virtual int resize_events(int newsize) = 0;
+  virtual bool need_wakeup() { return true; }
+};
+
+/*
+ * EventCenter maintains a set of file descriptors and handles registered events.
+ */
+class EventCenter {
+ public:
+  // should be enough;
+  static const int MAX_EVENTCENTER = 24;
+
+ private:
+  using clock_type = ceph::coarse_mono_clock;
+
+  struct AssociatedCenters {
+    EventCenter *centers[MAX_EVENTCENTER];
+    AssociatedCenters() {
+      // FIPS zeroization audit 20191115: this memset is not security related.
+      memset(centers, 0, MAX_EVENTCENTER * sizeof(EventCenter*));
+    }
+  };
+
+  struct FileEvent {
+    int mask;
+    EventCallbackRef read_cb;
+    EventCallbackRef write_cb;
+    FileEvent(): mask(0), read_cb(NULL), write_cb(NULL) {}
+  };
+
+  struct TimeEvent {
+    uint64_t id;
+    EventCallbackRef time_cb;
+
+    TimeEvent(): id(0), time_cb(NULL) {}
+  };
+
+ public:
+  /**
+     * A Poller object is invoked once each time through the dispatcher's
+     * inner polling loop.
+     */
+  class Poller {
+   public:
+    explicit Poller(EventCenter* center, const string& pollerName);
+    virtual ~Poller();
+
+    /**
+     * This method is defined by a subclass and invoked once by the
+     * center during each pass through its inner polling loop.
+     *
+     * \return
+     *      1 means that this poller did useful work during this call.
+     *      0 means that the poller found no work to do.
+     */
+    virtual int poll() = 0;
+
+   private:
+    /// The EventCenter object that owns this Poller.  NULL means the
+    /// EventCenter has been deleted.
+    EventCenter* owner;
+
+    /// Human-readable string name given to the poller to make it
+    /// easy to identify for debugging. For most pollers just passing
+    /// in the subclass name probably makes sense.
+    string poller_name;
+
+    /// Index of this Poller in EventCenter::pollers.  Allows deletion
+    /// without having to scan all the entries in pollers. -1 means
+    /// this poller isn't currently in EventCenter::pollers (~Poller
+    /// sets it after unlinking the poller).
+    int slot;
+  };
+
+ private:
+  CephContext *cct;
+  std::string type;
+  int nevent;
+  // Owning thread (set by set_owner); in_thread() compares against it
+  pthread_t owner = 0;
+  std::mutex external_lock;
+  std::atomic_ulong external_num_events;
+  deque<EventCallbackRef> external_events;
+  vector<FileEvent> file_events;
+  EventDriver *driver;
+  std::multimap<clock_type::time_point, TimeEvent> time_events;
+  // Keeps track of all of the pollers currently defined.  We don't
+  // use an intrusive list here because it isn't reentrant: we need
+  // to add/remove elements while the center is traversing the list.
+  std::vector<Poller*> pollers;
+  std::map<uint64_t, std::multimap<clock_type::time_point, TimeEvent>::iterator> event_map;
+  uint64_t time_event_next_id;
+  int notify_receive_fd;
+  int notify_send_fd;
+  NetHandler net;
+  EventCallbackRef notify_handler;
+  unsigned idx;
+  AssociatedCenters *global_centers = nullptr;
+
+  int process_time_events();
+  FileEvent *_get_file_event(int fd) {
+    ceph_assert(fd < nevent);
+    return &file_events[fd];
+  }
+
+ public:
+  explicit EventCenter(CephContext *c):
+    cct(c), nevent(0),
+    external_num_events(0),
+    driver(NULL), time_event_next_id(1),
+    notify_receive_fd(-1), notify_send_fd(-1), net(c),
+    notify_handler(NULL), idx(0) { }
+  ~EventCenter();
+  ostream& _event_prefix(std::ostream *_dout);
+
+  int init(int nevent, unsigned idx, const std::string &t);
+  void set_owner();
+  pthread_t get_owner() const { return owner; }
+  unsigned get_id() const { return idx; }
+
+  EventDriver *get_driver() { return driver; }
+
+  // Used by internal thread
+  int create_file_event(int fd, int mask, EventCallbackRef ctxt);
+  uint64_t create_time_event(uint64_t microseconds, EventCallbackRef ctxt);
+  void delete_file_event(int fd, int mask);
+  void delete_time_event(uint64_t id);
+  int process_events(unsigned timeout_microseconds, ceph::timespan *working_dur = nullptr);
+  void wakeup();
+
+  // Used by external thread
+  void dispatch_event_external(EventCallbackRef e);
+  inline bool in_thread() const {
+    return pthread_equal(pthread_self(), owner);
+  }
+
+ private:
+  template <typename func>
+  class C_submit_event : public EventCallback {
+    std::mutex lock;
+    std::condition_variable cond;
+    bool done = false;
+    func f;
+    bool nonwait;  // true: nobody waits, the event deletes itself after running
+   public:
+    C_submit_event(func &&_f, bool nw)
+      : f(std::move(_f)), nonwait(nw) {}
+    void do_request(uint64_t id) override {
+      f();
+      lock.lock();
+      cond.notify_all();
+      done = true;
+      bool del = nonwait;  // read before unlock: a waiter may destroy *this
+      lock.unlock();
+      if (del)
+        delete this;
+    }
+    void wait() {
+      ceph_assert(!nonwait);
+      std::unique_lock<std::mutex> l(lock);
+      while (!done)
+        cond.wait(l);
+    }
+  };
+
+ public:
+  template <typename func>
+  void submit_to(int i, func &&f, bool nowait = false) {
+    ceph_assert(i < MAX_EVENTCENTER && global_centers);
+    EventCenter *c = global_centers->centers[i];
+    ceph_assert(c);
+    if (!nowait && c->in_thread()) {
+      f();  // already on the target center's thread: run inline
+      return ;
+    }
+    if (nowait) {
+      C_submit_event<func> *event = new C_submit_event<func>(std::move(f), true);
+      c->dispatch_event_external(event);
+    } else {
+      C_submit_event<func> event(std::move(f), false);
+      c->dispatch_event_external(&event);
+      event.wait();  // block until the target thread has run f
+    }
+  }
+};
+
+#endif
diff --git a/src/msg/async/EventEpoll.cc b/src/msg/async/EventEpoll.cc
new file mode 100644
index 00000000..37b46973
--- /dev/null
+++ b/src/msg/async/EventEpoll.cc
@@ -0,0 +1,142 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include <fcntl.h>
+#include "EventEpoll.h"
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "EpollDriver."
+
+int EpollDriver::init(EventCenter *c, int nevent)
+{
+  // Result buffer for epoll_wait(): room for nevent entries, freed in the destructor.
+  events = (struct epoll_event*)malloc(sizeof(struct epoll_event)*nevent);
+  if (!events) {
+    lderr(cct) << __func__ << " unable to malloc memory. " << dendl;
+    return -ENOMEM;
+  }
+  memset(events, 0, sizeof(struct epoll_event)*nevent);
+
+  epfd = epoll_create(1024); /* 1024 is just an hint for the kernel */
+  if (epfd == -1) {
+    int e = errno; /* capture now: the logging below may overwrite errno */
+    lderr(cct) << __func__ << " unable to do epoll_create: " << cpp_strerror(e) << dendl;
+    return -e;
+  }
+  if (::fcntl(epfd, F_SETFD, FD_CLOEXEC) == -1) {
+    int e = errno;
+    ::close(epfd);
+    epfd = -1; /* otherwise the destructor would close this descriptor a second time */
+    lderr(cct) << __func__ << " unable to set cloexec: "
+                       << cpp_strerror(e) << dendl;
+    return -e;
+  }
+
+  size = nevent;
+  return 0;
+}
+
+int EpollDriver::add_event(int fd, int cur_mask, int add_mask)
+{
+  ldout(cct, 20) << __func__ << " add event fd=" << fd << " cur_mask=" << cur_mask
+                 << " add_mask=" << add_mask << " to " << epfd << dendl;
+  struct epoll_event ee;
+  /* If the fd was already monitored for some event, we need a MOD
+   * operation. Otherwise we need an ADD operation. */
+  int op = cur_mask == EVENT_NONE ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
+
+  ee.events = EPOLLET; /* every registration is edge-triggered */
+  add_mask |= cur_mask; /* Merge old events */
+  if (add_mask & EVENT_READABLE)
+    ee.events |= EPOLLIN;
+  if (add_mask & EVENT_WRITABLE)
+    ee.events |= EPOLLOUT;
+  ee.data.u64 = 0; /* avoid valgrind warning */
+  ee.data.fd = fd;
+  if (epoll_ctl(epfd, op, fd, &ee) == -1) {
+    int e = errno; /* capture now: the logging below may overwrite errno */
+    lderr(cct) << __func__ << " epoll_ctl: add fd=" << fd << " failed. "
+               << cpp_strerror(e) << dendl;
+    return -e;
+  }
+
+  return 0;
+}
+
+int EpollDriver::del_event(int fd, int cur_mask, int delmask)
+{
+  ldout(cct, 20) << __func__ << " del event fd=" << fd << " cur_mask=" << cur_mask
+                 << " delmask=" << delmask << " to " << epfd << dendl;
+  struct epoll_event ee;
+  int mask = cur_mask & (~delmask); /* events that remain registered */
+  ee.events = EPOLLET; /* keep edge-triggering on MOD, matching add_event(); ignored by DEL */
+  if (mask & EVENT_READABLE) ee.events |= EPOLLIN;
+  if (mask & EVENT_WRITABLE) ee.events |= EPOLLOUT;
+  ee.data.u64 = 0; /* avoid valgrind warning */
+  ee.data.fd = fd;
+  if (mask != EVENT_NONE) {
+    if (epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ee) < 0) {
+      int e = errno; /* capture now: the logging below may overwrite errno */
+      lderr(cct) << __func__ << " epoll_ctl: modify fd=" << fd << " mask=" << mask
+                 << " failed." << cpp_strerror(e) << dendl;
+      return -e;
+    }
+  } else {
+    /* Note, Kernel < 2.6.9 requires a non null event pointer even for
+     * EPOLL_CTL_DEL. */
+    if (epoll_ctl(epfd, EPOLL_CTL_DEL, fd, &ee) < 0) {
+      int e = errno; /* capture now: the logging below may overwrite errno */
+      lderr(cct) << __func__ << " epoll_ctl: delete fd=" << fd
+                 << " failed." << cpp_strerror(e) << dendl;
+      return -e;
+    }
+  }
+  return 0;
+}
+
+int EpollDriver::resize_events(int newsize)
+{
+  return 0; // no-op for this driver: `newsize` is ignored and nothing is reallocated
+}
+
+int EpollDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp)
+{
+  // Convert the timeval to whole milliseconds; a null tvp means wait without limit (-1).
+  const int timeout_ms = tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1;
+  const int nready = epoll_wait(epfd, events, size, timeout_ms);
+  if (nready <= 0)
+    return 0; // timeout or failure: fired_events is left untouched
+
+  fired_events.resize(nready);
+  for (int i = 0; i < nready; ++i) {
+    const struct epoll_event &ev = events[i];
+    int mask = 0;
+
+    if (ev.events & EPOLLIN)
+      mask |= EVENT_READABLE;
+    if (ev.events & EPOLLOUT)
+      mask |= EVENT_WRITABLE;
+    // Errors and hang-ups are reported as both readable and writable.
+    if (ev.events & (EPOLLERR | EPOLLHUP))
+      mask |= EVENT_READABLE|EVENT_WRITABLE;
+    fired_events[i].fd = ev.data.fd;
+    fired_events[i].mask = mask;
+  }
+  return nready;
+}
diff --git a/src/msg/async/EventEpoll.h b/src/msg/async/EventEpoll.h
new file mode 100644
index 00000000..abc4b8bb
--- /dev/null
+++ b/src/msg/async/EventEpoll.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_EVENTEPOLL_H
+#define CEPH_MSG_EVENTEPOLL_H
+
+#include <unistd.h>
+#include <sys/epoll.h>
+
+#include "Event.h"
+
+class EpollDriver : public EventDriver {
+  int epfd; // epoll instance descriptor; -1 while none is open
+  struct epoll_event *events; // buffer passed to epoll_wait(); malloc'ed by init()
+  CephContext *cct;
+  int size; // number of entries in `events`
+
+ public:
+  explicit EpollDriver(CephContext *c): epfd(-1), events(NULL), cct(c), size(0) {}
+  ~EpollDriver() override { // closes the descriptor and frees the event buffer
+    if (epfd != -1)
+      close(epfd);
+
+    if (events)
+      free(events);
+  }
+
+  int init(EventCenter *c, int nevent) override;
+  int add_event(int fd, int cur_mask, int add_mask) override;
+  int del_event(int fd, int cur_mask, int del_mask) override;
+  int resize_events(int newsize) override;
+  int event_wait(vector<FiredFileEvent> &fired_events,
+		 struct timeval *tp) override;
+};
+
+#endif
diff --git a/src/msg/async/EventKqueue.cc b/src/msg/async/EventKqueue.cc
new file mode 100644
index 00000000..d6ba4a3d
--- /dev/null
+++ b/src/msg/async/EventKqueue.cc
@@ -0,0 +1,267 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include "EventKqueue.h"
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "KqueueDriver."
+
+#define KEVENT_NOWAIT 0
+
+int KqueueDriver::test_kqfd() { // probes kqfd with an empty kevent() call
+  struct kevent ke[1];
+  if (kevent(kqfd, ke, 0, NULL, 0, KEVENT_NOWAIT) == -1) {
+    int e = errno; // capture now: the logging below may overwrite errno
+    ldout(cct,0) << __func__ << " invalid kqfd = " << kqfd << ": " << cpp_strerror(e) << dendl;
+    return -e;
+  }
+  return kqfd;
+}
+
+int KqueueDriver::restore_events() {
+  struct kevent ke[2];
+  // Re-register every saved mask on the current kqfd; sav_events is indexed by fd.
+  ldout(cct,30) << __func__ << " on kqfd = " << kqfd << dendl;
+  for (int i = 0; i < sav_max; i++) { // whole table: it may have grown past `size`
+    int num = 0;
+    if (sav_events[i].mask == 0 )
+      continue;
+    ldout(cct,30) << __func__ << " restore kqfd = " << kqfd 
+                  << " fd = " << i << " mask " << sav_events[i].mask << dendl;
+    if (sav_events[i].mask & EVENT_READABLE)
+      EV_SET(&ke[num++], i, EVFILT_READ, EV_ADD|EV_CLEAR, 0, 0, NULL); // flags as in add_event()
+    if (sav_events[i].mask & EVENT_WRITABLE)
+      EV_SET(&ke[num++], i, EVFILT_WRITE, EV_ADD|EV_CLEAR, 0, 0, NULL);
+    if (num) {
+      if (kevent(kqfd, ke, num, NULL, 0, KEVENT_NOWAIT) == -1) {
+        int e = errno; // capture now: the logging below may overwrite errno
+        ldout(cct,0) << __func__ << " unable to add event: "
+                     << cpp_strerror(e) << dendl;
+        return -e;
+      }
+    }
+  }
+  return 0;
+}
+
+int KqueueDriver::test_thread_change(const char* funcname) {
+  // check to see if we changed thread, because that invalidates
+  // the kqfd and we need to restore that
+  int oldkqfd = kqfd;
+  if (!pthread_equal(mythread, pthread_self())) {
+    ldout(cct,20) << funcname << " We changed thread from " << mythread
+                  << " to " << pthread_self() << dendl;
+    mythread = pthread_self();
+    kqfd = -1;
+  } else if ((kqfd != -1) && (test_kqfd() < 0)) {
+    // should this ever happen?
+    // It would be strange to change kqfd with thread change.
+    // Might need to change this into a ceph_assert() in the future.
+    ldout(cct,0) << funcname << " Warning: Recreating old kqfd. "
+                 << "This should not happen!!!"  << dendl;
+    kqfd = -1;
+  }
+  if (kqfd == -1) {
+    kqfd = kqueue();
+    int e = errno; // only meaningful if kqueue() failed; saved before any logging
+    ldout(cct,30) << funcname << " kqueue: new kqfd = " << kqfd
+                  << " (was: " << oldkqfd << ")" << dendl;
+    if (kqfd < 0) {
+      lderr(cct) << funcname << " unable to do kqueue: "
+                             << cpp_strerror(e) << dendl;
+      return -e;
+    }
+    int r = restore_events();
+    if (r < 0) {
+      lderr(cct) << funcname << " unable restore all events "
+                             << cpp_strerror(r) << dendl;
+      return r; // propagate the code restore_events() reported, not a stale errno
+    }
+  }
+  return 0;
+}
+
+int KqueueDriver::init(EventCenter *c, int nevent)
+{
+  // keep track of possible changes of our thread
+  // because change of thread kills the kqfd
+  mythread = pthread_self();
+
+  // Reserve the space to accept the kevent return events.
+  res_events = (struct kevent*)malloc(sizeof(struct kevent)*nevent);
+  if (!res_events) {
+    lderr(cct) << __func__ << " unable to malloc memory: "
+                           << cpp_strerror(errno) << dendl;
+    return -ENOMEM;
+  }
+  memset(res_events, 0, sizeof(struct kevent)*nevent);
+  size = nevent; // capacity of res_events
+
+  // Reserve the space to keep all of the events set, so it can be redone
+  // when we change thread ID.
+  sav_events = (struct SaveEvent*)malloc(sizeof(struct SaveEvent)*nevent);
+  if (!sav_events) {
+    lderr(cct) << __func__ << " unable to malloc memory: "
+                           << cpp_strerror(errno) << dendl;
+    return -ENOMEM;
+  }
+  memset(sav_events, 0, sizeof(struct SaveEvent)*nevent);
+  sav_max = nevent; // capacity of sav_events
+
+  // Delay assigning a descriptor until it is really needed.
+  // kqfd = kqueue();
+  kqfd = -1; // test_thread_change() creates the kqueue on first use
+  return 0;
+}
+
+int KqueueDriver::add_event(int fd, int cur_mask, int add_mask)
+{
+  struct kevent ke[2];
+  int num = 0;
+
+  ldout(cct,30) << __func__ << " add event kqfd = " << kqfd << " fd = " << fd 
+	<< " cur_mask = " << cur_mask << " add_mask = " << add_mask 
+	<< dendl;
+
+  int r = test_thread_change(__func__);
+  if ( r < 0 )
+    return r;
+
+  if (add_mask & EVENT_READABLE)
+    EV_SET(&ke[num++], fd, EVFILT_READ, EV_ADD|EV_CLEAR, 0, 0, NULL);
+  if (add_mask & EVENT_WRITABLE)
+    EV_SET(&ke[num++], fd, EVFILT_WRITE, EV_ADD|EV_CLEAR, 0, 0, NULL);
+  if (num) {
+    if (kevent(kqfd, ke, num, NULL, 0, KEVENT_NOWAIT) == -1) {
+      int e = errno; // capture now: the logging below may overwrite errno
+      lderr(cct) << __func__ << " unable to add event: "
+                             << cpp_strerror(e) << dendl;
+      return -e;
+    }
+  }
+  // keep what we set; size the table from fd so that sav_events[fd] is always in range
+  if (fd >= sav_max && (r = resize_events(fd + 5000)) < 0)
+    return r;
+  sav_events[fd].mask = cur_mask | add_mask;
+  return 0;
+}
+
+int KqueueDriver::del_event(int fd, int cur_mask, int del_mask)
+{
+  struct kevent ke[2];
+  int num = 0;
+  int mask = cur_mask & del_mask; // only filters that are actually registered
+
+  ldout(cct,30) << __func__ << " delete event kqfd = " << kqfd 
+	<< " fd = " << fd << " cur_mask = " << cur_mask 
+	<< " del_mask = " << del_mask << dendl;
+
+  int r = test_thread_change(__func__);
+  if ( r < 0 )
+    return r;
+
+  if (mask & EVENT_READABLE)
+    EV_SET(&ke[num++], fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
+  if (mask & EVENT_WRITABLE)
+    EV_SET(&ke[num++], fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL);
+  if (num) {
+    if (kevent(kqfd, ke, num, NULL, 0, KEVENT_NOWAIT) < 0) {
+      int e = errno; // capture now: the logging below may overwrite errno
+      lderr(cct) << __func__ << " kevent: delete fd=" << fd << " mask=" << mask
+                 << " failed." << cpp_strerror(e) << dendl;
+      return -e;
+    }
+  }
+  // keep the administration
+  if (fd >= 0 && fd < sav_max) // never write outside the saved-event table
+    sav_events[fd].mask = cur_mask & ~del_mask;
+  return 0;
+}
+
+int KqueueDriver::resize_events(int newsize)
+{
+  ldout(cct,30) << __func__ << " kqfd = " << kqfd << "newsize = " << newsize << dendl;
+  if (newsize > sav_max) {
+    sav_events = (struct SaveEvent*)realloc(sav_events, sizeof(struct SaveEvent)*newsize);
+    if (!sav_events) {
+      lderr(cct) << __func__ << " unable to realloc memory: "
+                             << cpp_strerror(errno) << dendl;
+      ceph_assert(sav_events);
+      return -ENOMEM;
+    }
+    // Zero only the new tail [sav_max, newsize); entries below sav_max hold live masks.
+    memset(&sav_events[sav_max], 0, sizeof(struct SaveEvent)*(newsize-sav_max));
+    sav_max = newsize;
+  }
+  return 0;
+}
+
+int KqueueDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp)
+{
+  int retval, numevents = 0;
+  struct timespec timeout = {0, 0}; // initialised: it is logged below even when tvp is NULL
+
+  ldout(cct,10) << __func__ << " kqfd = " << kqfd << dendl;
+
+  int r = test_thread_change(__func__);
+  if ( r < 0 )
+    return r;
+
+  if (tvp != NULL) {
+      timeout.tv_sec = tvp->tv_sec;
+      timeout.tv_nsec = tvp->tv_usec * 1000;
+      ldout(cct,20) << __func__ << " "
+		<< timeout.tv_sec << " sec "
+		<< timeout.tv_nsec << " nsec"
+		<< dendl;
+      retval = kevent(kqfd, NULL, 0, res_events, size, &timeout);
+  } else {
+      ldout(cct,30) << __func__ << " event_wait: " << " NULL" << dendl;
+      retval = kevent(kqfd, NULL, 0, res_events, size, KEVENT_NOWAIT);
+  }
+  const int kev_errno = errno; // saved before logging; only used when retval < 0
+  ldout(cct,25) << __func__ << " kevent retval: " << retval << dendl;
+  if (retval < 0) {
+    lderr(cct) << __func__ << " kqueue error: "
+                           << cpp_strerror(kev_errno) << dendl;
+    return -kev_errno;
+  } else if (retval == 0) {
+    ldout(cct,5) << __func__ << " Hit timeout("
+                 << timeout.tv_sec << " sec "
+                 << timeout.tv_nsec << " nsec"
+		 << ")." << dendl;
+  } else {
+    int j;
+
+    numevents = retval;
+    fired_events.resize(numevents);
+    for (j = 0; j < numevents; j++) {
+      int mask = 0;
+      struct kevent *e = res_events + j;
+
+      if (e->filter == EVFILT_READ) mask |= EVENT_READABLE;
+      if (e->filter == EVFILT_WRITE) mask |= EVENT_WRITABLE;
+      if (e->flags & EV_ERROR) mask |= EVENT_READABLE|EVENT_WRITABLE;
+      fired_events[j].fd = (int)e->ident;
+      fired_events[j].mask = mask;
+      // each kevent entry reports one filter, so one fd may appear in two entries
+    }
+  }
+  return numevents;
+}
diff --git a/src/msg/async/EventKqueue.h b/src/msg/async/EventKqueue.h
new file mode 100644
index 00000000..24863a93
--- /dev/null
+++ b/src/msg/async/EventKqueue.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_EVENTKQUEUE_H
+#define CEPH_MSG_EVENTKQUEUE_H
+
+#include <sys/types.h>
+#include <sys/event.h>
+#include <unistd.h>
+
+#include "Event.h"
+
+class KqueueDriver : public EventDriver {
+  int kqfd; // kqueue descriptor; -1 until one has been created
+  pthread_t mythread; // thread that last used kqfd, set by init()
+  struct kevent *res_events; // buffer that receives events from kevent()
+  CephContext *cct;
+  int size; // number of entries in res_events
+
+  // Keep what we set on the kqfd
+  struct SaveEvent{
+    int fd;
+    int mask;
+  };
+  struct SaveEvent *sav_events; // indexed by fd; NULL until init() allocates it
+  int sav_max; // number of entries in sav_events
+  int restore_events();
+  int test_kqfd();
+  int test_thread_change(const char* funcname);
+
+ public:
+  explicit KqueueDriver(CephContext *c): kqfd(-1), res_events(NULL), cct(c),
+		size(0), sav_events(NULL), sav_max(0) {} // sav_events must be NULL: the dtor tests it
+  virtual ~KqueueDriver() {
+    if (kqfd != -1)
+      close(kqfd);
+
+    if (res_events)
+      free(res_events);
+    size = 0;
+    if (sav_events)
+      free(sav_events);
+    sav_max = 0;
+  }
+
+  int init(EventCenter *c, int nevent) override;
+  int add_event(int fd, int cur_mask, int add_mask) override;
+  int del_event(int fd, int cur_mask, int del_mask) override;
+  int resize_events(int newsize) override;
+  int event_wait(vector<FiredFileEvent> &fired_events,
+		 struct timeval *tp) override;
+};
+
+#endif
diff --git a/src/msg/async/EventSelect.cc b/src/msg/async/EventSelect.cc
new file mode 100644
index 00000000..fdee6ebc
--- /dev/null
+++ b/src/msg/async/EventSelect.cc
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include "EventSelect.h"
+
+#include <unistd.h>
+#include <sys/select.h>
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "SelectDriver."
+
+int SelectDriver::init(EventCenter *c, int nevent)
+{
+  ldout(cct, 0) << "Select isn't suitable for production env, just avoid "
+                << "compiling error or special purpose" << dendl;
+  FD_ZERO(&rfds); // start with empty read and write sets
+  FD_ZERO(&wfds);
+  max_fd = 0;
+  return 0; // `c` and `nevent` are not used by this driver
+}
+
+int SelectDriver::add_event(int fd, int cur_mask, int add_mask)
+{
+  ldout(cct, 10) << __func__ << " add event to fd=" << fd << " mask=" << add_mask
+                 << dendl;
+  if (fd < 0 || fd >= FD_SETSIZE) // FD_SET() outside this range is undefined behaviour
+    return -EINVAL;
+  int mask = cur_mask | add_mask;
+  if (mask & EVENT_READABLE)
+    FD_SET(fd, &rfds);
+  if (mask & EVENT_WRITABLE)
+    FD_SET(fd, &wfds);
+  if (fd > max_fd) // select() needs the highest registered descriptor
+      max_fd = fd;
+  return 0;
+}
+
+int SelectDriver::del_event(int fd, int cur_mask, int delmask)
+{
+  ldout(cct, 10) << __func__ << " del event fd=" << fd << " cur mask=" << cur_mask
+                 << dendl;
+
+  if (delmask & EVENT_READABLE) // only the sets named in delmask are cleared
+    FD_CLR(fd, &rfds);
+  if (delmask & EVENT_WRITABLE)
+    FD_CLR(fd, &wfds);
+  return 0; // max_fd is not lowered here
+}
+
+int SelectDriver::resize_events(int newsize)
+{
+  return 0; // no-op for this driver: `newsize` is ignored
+}
+
+int SelectDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp)
+{
+  // select() overwrites the sets it is given, so work on scratch copies.
+  memcpy(&_rfds, &rfds, sizeof(fd_set));
+  memcpy(&_wfds, &wfds, sizeof(fd_set));
+
+  if (select(max_fd+1, &_rfds, &_wfds, NULL, tvp) <= 0)
+    return 0; // timeout or failure: nothing is appended
+
+  int fired = 0;
+  for (int fd = 0; fd <= max_fd; ++fd) {
+    int mask = 0;
+    if (FD_ISSET(fd, &_rfds))
+      mask |= EVENT_READABLE;
+    if (FD_ISSET(fd, &_wfds))
+      mask |= EVENT_WRITABLE;
+    if (!mask)
+      continue;
+    struct FiredFileEvent fe;
+    fe.fd = fd;
+    fe.mask = mask;
+    fired_events.push_back(fe);
+    ++fired;
+  }
+  return fired;
+}
diff --git a/src/msg/async/EventSelect.h b/src/msg/async/EventSelect.h
new file mode 100644
index 00000000..1b75da0b
--- /dev/null
+++ b/src/msg/async/EventSelect.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_EVENTSELECT_H
+#define CEPH_MSG_EVENTSELECT_H
+
+#include "Event.h"
+
+class SelectDriver : public EventDriver {
+  fd_set rfds, wfds; // registered read / write descriptors
+  /* We need to have a copy of the fd sets as it's not safe to reuse
+   * FD sets after select(). */
+  fd_set _rfds, _wfds; // scratch copies passed to select() by event_wait()
+  int max_fd; // highest descriptor ever added
+  CephContext *cct;
+
+ public:
+  explicit SelectDriver(CephContext *c): max_fd(0), cct(c) {} // the fd sets are cleared in init()
+  ~SelectDriver() override {}
+
+  int init(EventCenter *c, int nevent) override;
+  int add_event(int fd, int cur_mask, int add_mask) override;
+  int del_event(int fd, int cur_mask, int del_mask) override;
+  int resize_events(int newsize) override;
+  int event_wait(vector<FiredFileEvent> &fired_events,
+		 struct timeval *tp) override;
+};
+
+#endif
diff --git a/src/msg/async/PosixStack.cc b/src/msg/async/PosixStack.cc
new file mode 100644
index 00000000..e9c8d404
--- /dev/null
+++ b/src/msg/async/PosixStack.cc
@@ -0,0 +1,293 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <sys/socket.h>
+#include <netinet/tcp.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <errno.h>
+
+#include <algorithm>
+
+#include "PosixStack.h"
+
+#include "include/buffer.h"
+#include "include/str_list.h"
+#include "common/errno.h"
+#include "common/strtol.h"
+#include "common/dout.h"
+#include "msg/Messenger.h"
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "PosixStack "
+
+class PosixConnectedSocketImpl final : public ConnectedSocketImpl {
+  NetHandler &handler;
+  int _fd; // socket descriptor this object reads from and writes to
+  entity_addr_t sa; // peer address, passed to handler.reconnect()
+  bool connected; // false while a connect is still being completed
+
+ public:
+  explicit PosixConnectedSocketImpl(NetHandler &h, const entity_addr_t &sa, int f, bool connected)
+      : handler(h), _fd(f), sa(sa), connected(connected) {}
+
+  int is_connected() override { // 1: connected, 0: not yet, <0: error from reconnect()
+    if (connected)
+      return 1;
+
+    int r = handler.reconnect(sa, _fd);
+    if (r == 0) {
+      connected = true; // remembered so reconnect() is not called again
+      return 1;
+    } else if (r < 0) {
+      return r;
+    } else {
+      return 0;
+    }
+  }
+
+  ssize_t zero_copy_read(bufferptr&) override {
+    return -EOPNOTSUPP; // not implemented for this socket type
+  }
+
+  ssize_t read(char *buf, size_t len) override { // bytes read, or -errno on failure
+    ssize_t r = ::read(_fd, buf, len);
+    if (r < 0)
+      r = -errno;
+    return r;
+  }
+
+  // return the sent length
+  // < 0 means error occurred
+  static ssize_t do_sendmsg(int fd, struct msghdr &msg, unsigned len, bool more)
+  {
+    size_t sent = 0;
+    while (1) {
+      MSGR_SIGPIPE_STOPPER;
+      ssize_t r;
+      r = ::sendmsg(fd, &msg, MSG_NOSIGNAL | (more ? MSG_MORE : 0));
+      if (r < 0) {
+        if (errno == EINTR) {
+          continue; // interrupted: retry the same sendmsg
+        } else if (errno == EAGAIN) {
+          break; // would block: report what has been sent so far
+        }
+        return -errno;
+      }
+
+      sent += r;
+      if (len == sent) break;
+
+      while (r > 0) { // partial send: advance msg past the r bytes just written
+        if (msg.msg_iov[0].iov_len <= (size_t)r) {
+          // drain this whole item
+          r -= msg.msg_iov[0].iov_len;
+          msg.msg_iov++;
+          msg.msg_iovlen--;
+        } else {
+          msg.msg_iov[0].iov_base = (char *)msg.msg_iov[0].iov_base + r;
+          msg.msg_iov[0].iov_len -= r;
+          break;
+        }
+      }
+    }
+    return (ssize_t)sent;
+  }
+
+  ssize_t send(bufferlist &bl, bool more) override { // sends from bl and drops what was sent
+    size_t sent_bytes = 0;
+    auto pb = std::cbegin(bl.buffers());
+    uint64_t left_pbrs = std::size(bl.buffers());
+    while (left_pbrs) { // at most IOV_MAX buffers go into one sendmsg batch
+      struct msghdr msg;
+      struct iovec msgvec[IOV_MAX];
+      uint64_t size = std::min<uint64_t>(left_pbrs, IOV_MAX);
+      left_pbrs -= size;
+      // FIPS zeroization audit 20191115: this memset is not security related.
+      memset(&msg, 0, sizeof(msg));
+      msg.msg_iovlen = size;
+      msg.msg_iov = msgvec;
+      unsigned msglen = 0;
+      for (auto iov = msgvec; iov != msgvec + size; iov++) {
+	iov->iov_base = (void*)(pb->c_str());
+	iov->iov_len = pb->length();
+	msglen += pb->length();
+	++pb;
+      }
+      ssize_t r = do_sendmsg(_fd, msg, msglen, left_pbrs || more);
+      if (r < 0)
+        return r;
+
+      // "r" is the number of bytes do_sendmsg() sent from this batch
+      sent_bytes += r;
+      if (static_cast<unsigned>(r) < msglen)
+        break;
+      // the whole batch was sent: continue with the next batch, if any
+    }
+
+    if (sent_bytes) { // leave only the unsent tail in bl
+      bufferlist swapped;
+      if (sent_bytes < bl.length()) {
+        bl.splice(sent_bytes, bl.length()-sent_bytes, &swapped);
+        bl.swap(swapped);
+      } else {
+        bl.clear();
+      }
+    }
+
+    return static_cast<ssize_t>(sent_bytes);
+  }
+  void shutdown() override {
+    ::shutdown(_fd, SHUT_RDWR);
+  }
+  void close() override {
+    ::close(_fd); // _fd itself is left unchanged
+  }
+  int fd() const override {
+    return _fd;
+  }
+  int socket_fd() const override {
+    return _fd;
+  }
+  friend class PosixServerSocketImpl;
+  friend class PosixNetworkStack;
+};
+
+class PosixServerSocketImpl : public ServerSocketImpl {
+  NetHandler &handler;
+  int _fd; // listening socket descriptor
+
+ public:
+  explicit PosixServerSocketImpl(NetHandler &h, int f,
+				 const entity_addr_t& listen_addr, unsigned slot)
+    : ServerSocketImpl(listen_addr.get_type(), slot),
+      handler(h), _fd(f) {}
+  int accept(ConnectedSocket *sock, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+  void abort_accept() override {
+    ::close(_fd); // closes the listening descriptor
+  }
+  int fd() const override {
+    return _fd;
+  }
+};
+
+int PosixServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w) {
+  ceph_assert(sock);
+  sockaddr_storage ss;
+  socklen_t slen = sizeof(ss);
+  int sd = accept_cloexec(_fd, (sockaddr*)&ss, &slen);
+  if (sd < 0) {
+    return -errno;
+  }
+
+  int r = handler.set_nonblock(sd);
+  if (r < 0) {
+    ::close(sd);
+    return r; // r is already negative; errno may have been overwritten by ::close()
+  }
+
+  r = handler.set_socket_options(sd, opt.nodelay, opt.rcbuf_size);
+  if (r < 0) {
+    ::close(sd);
+    return r; // same reasoning: never report a stale (possibly zero) errno
+  }
+
+  ceph_assert(NULL != out); //out should not be NULL in accept connection
+
+  out->set_type(addr_type);
+  out->set_sockaddr((sockaddr*)&ss);
+  handler.set_priority(sd, opt.priority, out->get_family());
+
+  std::unique_ptr<PosixConnectedSocketImpl> csi(new PosixConnectedSocketImpl(handler, *out, sd, true));
+  *sock = ConnectedSocket(std::move(csi)); // ownership of sd passes to the returned socket
+  return 0;
+}
+
+void PosixWorker::initialize()
+{ // intentionally empty: this worker performs no setup here
+}
+
+int PosixWorker::listen(entity_addr_t &sa,
+			unsigned addr_slot,
+			const SocketOptions &opt,
+                        ServerSocket *sock)
+{
+  int listen_sd = net.create_socket(sa.get_family(), true);
+  if (listen_sd < 0) {
+    return listen_sd; // already a negative error code; errno may be stale by now
+  }
+
+  int r = net.set_nonblock(listen_sd);
+  if (r < 0) {
+    ::close(listen_sd);
+    return r; // not -errno: ::close() may have overwritten it
+  }
+
+  r = net.set_socket_options(listen_sd, opt.nodelay, opt.rcbuf_size);
+  if (r < 0) {
+    ::close(listen_sd);
+    return r;
+  }
+
+  r = ::bind(listen_sd, sa.get_sockaddr(), sa.get_sockaddr_len());
+  if (r < 0) {
+    r = -errno;
+    ldout(cct, 10) << __func__ << " unable to bind to " << sa // the address, not a pointer
+                   << ": " << cpp_strerror(r) << dendl;
+    ::close(listen_sd);
+    return r;
+  }
+
+  r = ::listen(listen_sd, cct->_conf->ms_tcp_listen_backlog);
+  if (r < 0) {
+    r = -errno;
+    lderr(cct) << __func__ << " unable to listen on " << sa << ": " << cpp_strerror(r) << dendl;
+    ::close(listen_sd);
+    return r;
+  }
+
+  *sock = ServerSocket(
+          std::unique_ptr<PosixServerSocketImpl>(
+	    new PosixServerSocketImpl(net, listen_sd, sa, addr_slot)));
+  return 0;
+}
+
+int PosixWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) {
+  int sd;
+
+  if (opts.nonblock) {
+    sd = net.nonblock_connect(addr, opts.connect_bind_addr);
+  } else {
+    sd = net.connect(addr, opts.connect_bind_addr);
+  }
+
+  if (sd < 0) {
+    return sd; // already a negative error code; errno may be stale by now
+  }
+
+  net.set_priority(sd, opts.priority, addr.get_family());
+  *socket = ConnectedSocket( // a non-blocking connect is not yet established: connected=false
+      std::unique_ptr<PosixConnectedSocketImpl>(new PosixConnectedSocketImpl(net, addr, sd, !opts.nonblock)));
+  return 0;
+}
+
+PosixNetworkStack::PosixNetworkStack(CephContext *c, const string &t)
+    : NetworkStack(c, t) // both arguments are forwarded unchanged to the base class
+{
+}
diff --git a/src/msg/async/PosixStack.h b/src/msg/async/PosixStack.h
new file mode 100644
index 00000000..f1aaccd4
--- /dev/null
+++ b/src/msg/async/PosixStack.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_ASYNC_POSIXSTACK_H
+#define CEPH_MSG_ASYNC_POSIXSTACK_H
+
+#include <thread>
+
+#include "msg/msg_types.h"
+#include "msg/async/net_handler.h"
+
+#include "Stack.h"
+
+class PosixWorker : public Worker {
+  NetHandler net; // socket helper used by listen() and connect()
+  void initialize() override;
+ public:
+  PosixWorker(CephContext *c, unsigned i)
+      : Worker(c, i), net(c) {}
+  int listen(entity_addr_t &sa,
+	     unsigned addr_slot,
+	     const SocketOptions &opt,
+	     ServerSocket *socks) override; // 0 on success, negative on failure
+  int connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) override;
+};
+
+class PosixNetworkStack : public NetworkStack {
+  vector<std::thread> threads; // one slot per worker, indexed by worker id
+ public:
+  explicit PosixNetworkStack(CephContext *c, const string &t);
+
+  void spawn_worker(unsigned i, std::function<void ()> &&func) override {
+    if (threads.size() <= i) // grow only: shrinking would destroy joinable threads
+      threads.resize(i+1);
+    threads[i] = std::thread(std::move(func)); // move the callable instead of copying it
+  }
+  void join_worker(unsigned i) override {
+    ceph_assert(threads.size() > i && threads[i].joinable());
+    threads[i].join();
+  }
+};
+
+#endif //CEPH_MSG_ASYNC_POSIXSTACK_H
diff --git a/src/msg/async/Protocol.cc b/src/msg/async/Protocol.cc
new file mode 100644
index 00000000..4bdc065e
--- /dev/null
+++ b/src/msg/async/Protocol.cc
@@ -0,0 +1,14 @@
+#include "Protocol.h"
+
+#include "AsyncConnection.h"
+#include "AsyncMessenger.h"
+
+Protocol::Protocol(int type, AsyncConnection *connection)
+  : proto_type(type),
+    connection(connection),
+    messenger(connection->async_msgr), // `connection` is dereferenced here, so it must not be null
+    cct(connection->async_msgr->cct) {
+  auth_meta.reset(new AuthConnectionMeta()); // start with a newly allocated AuthConnectionMeta
+}
+
+Protocol::~Protocol() {} // empty body: no explicit cleanup is done here
diff --git a/src/msg/async/Protocol.h b/src/msg/async/Protocol.h
new file mode 100644
index 00000000..cccba183
--- /dev/null
+++ b/src/msg/async/Protocol.h
@@ -0,0 +1,140 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef _MSG_ASYNC_PROTOCOL_
+#define _MSG_ASYNC_PROTOCOL_
+
+#include <list>
+#include <map>
+
+#include "AsyncConnection.h"
+#include "include/buffer.h"
+#include "include/msgr.h"
+
+/*
+ * Continuation Helper Classes
+ */
+
+#include <memory>
+#include <tuple>
+
+// One step of a continuation chain that runs against an object of type C.
+// Invoking a step yields the next step to run, or nullptr when the chain is
+// finished.
+template <class C>
+class Ct {
+public:
+  virtual ~Ct() = default;
+  virtual Ct<C> *call(C *target) const = 0;
+};
+
+// Continuation step bound to a member function of C that takes Args....
+// The arguments are stored by setParams() and replayed by every call().
+template <class C, typename... Args>
+class CtFun : public Ct<C> {
+private:
+  using fn_t = Ct<C> *(C::*)(Args...);
+  fn_t _f;
+  std::tuple<Args...> _params;
+
+  // Unpack the stored tuple into the argument list of _f.
+  template <std::size_t... Idx>
+  inline Ct<C> *_invoke(C *target, std::index_sequence<Idx...>) const {
+    return (target->*_f)(std::get<Idx>(_params)...);
+  }
+
+public:
+  CtFun(fn_t f) : _f(f) {}
+
+  inline void setParams(Args... args) { _params = std::make_tuple(args...); }
+  inline Ct<C> *call(C *target) const override {
+    return _invoke(target, std::make_index_sequence<sizeof...(Args)>{});
+  }
+};
+
+// Owning handle for a buffer::ptr_node; the node is released through
+// buffer::ptr_node::disposer when the handle goes away.
+using rx_buffer_t =
+    std::unique_ptr<buffer::ptr_node, buffer::ptr_node::disposer>;
+
+// Continuation step for handlers that take an rx_buffer_t plus a result
+// code.  The buffer is moved into the handler when the step runs, which is
+// why node is mutable: call() is const in the Ct interface.
+template <class C>
+class CtRxNode : public Ct<C> {
+  using fn_t = Ct<C> *(C::*)(rx_buffer_t&&, int r);
+  fn_t _f;
+
+public:
+  mutable rx_buffer_t node;
+  // Zero-initialised so a call() issued before setParams() never reads an
+  // indeterminate value; a default-constructed std::tuple<int>, as held by
+  // CtFun<C, int>, starts out the same way.
+  int r = 0;
+
+  CtRxNode(fn_t f) : _f(f) {}
+  // Store the buffer and result code that the next call() will forward.
+  void setParams(rx_buffer_t &&node, int r) {
+    this->node = std::move(node);
+    this->r = r;
+  }
+  // Hand the stored buffer (by move) and result code to the bound handler.
+  inline Ct<C> *call(C *foo) const override {
+    return (foo->*_f)(std::move(node), r);
+  }
+};
+
+// Shorthand for the continuation flavours used by protocol classes:
+//   CONTINUATION_TYPE        - no stored arguments
+//   CONTINUATION_TX_TYPE     - stores an int
+//   CONTINUATION_RX_TYPE     - stores a char* and an int
+//   CONTINUATION_RXBPTR_TYPE - stores an rx_buffer_t and an int
+template <class C> using CONTINUATION_TYPE = CtFun<C>;
+template <class C> using CONTINUATION_TX_TYPE = CtFun<C, int>;
+template <class C> using CONTINUATION_RX_TYPE = CtFun<C, char*, int>;
+template <class C> using CONTINUATION_RXBPTR_TYPE = CtRxNode<C>;
+
+// Declare a CtFun object named F_cont bound to the member function C::F.
+// Any extra macro arguments become the argument types stored by the CtFun.
+#define CONTINUATION_DECL(C, F, ...)                    \
+  CtFun<C, ##__VA_ARGS__> F##_cont { (&C::F) };
+
+// Name the continuation object that belongs to handler F.
+#define CONTINUATION(F) F##_cont
+// Store the given arguments in F's continuation object and yield a pointer
+// to it, ready to be returned as the next step.
+#define CONTINUE(F, ...) (F##_cont.setParams(__VA_ARGS__), &F##_cont)
+
+// Trampoline: starting at CT, keep calling the step returned by the previous
+// one against `this` until a step returns a null pointer.
+#define CONTINUATION_RUN(CT)                                      \
+  {                                                               \
+    Ct<std::remove_reference<decltype(*this)>::type> *_cont = &CT;\
+    do {                                                          \
+      _cont = _cont->call(this);                                  \
+    } while (_cont);                                              \
+  }
+
+// Continuation for a handler with signature (char *, int).
+#define READ_HANDLER_CONTINUATION_DECL(C, F) \
+  CONTINUATION_DECL(C, F, char *, int)
+
+// Continuation for a handler with signature (rx_buffer_t &&, int).
+#define READ_BPTR_HANDLER_CONTINUATION_DECL(C, F) \
+  CtRxNode<C> F##_cont { (&C::F) };
+
+// Continuation for a handler with signature (int).
+#define WRITE_HANDLER_CONTINUATION_DECL(C, F) CONTINUATION_DECL(C, F, int)
+
+//////////////////////////////////////////////////////////////////////
+
+class AsyncMessenger;
+
+// Abstract base of the messenger wire-protocol implementations.  An instance
+// is tied to one AsyncConnection and caches the AsyncMessenger and
+// CephContext reached through it (see the constructor in Protocol.cc).
+class Protocol {
+public:
+  // protocol identifier supplied by the concrete subclass at construction
+  const int proto_type;
+protected:
+  AsyncConnection *connection;
+  AsyncMessenger *messenger;
+  CephContext *cct;
+public:
+  // authentication metadata for this connection; allocated by the constructor
+  std::shared_ptr<AuthConnectionMeta> auth_meta;
+
+public:
+  Protocol(int type, AsyncConnection *connection);
+  virtual ~Protocol();
+
+  // prepare protocol for connecting to peer
+  virtual void connect() = 0;
+  // prepare protocol for accepting peer connections
+  virtual void accept() = 0;
+  // true -> protocol is ready for sending messages
+  virtual bool is_connected() = 0;
+  // stop connection
+  virtual void stop() = 0;
+  // signal and handle connection failure
+  virtual void fault() = 0;
+  // send message
+  virtual void send_message(Message *m) = 0;
+  // send keepalive
+  virtual void send_keepalive() = 0;
+
+  // NOTE(review): presumably invoked by the connection when the socket is
+  // readable / writable -- confirm against AsyncConnection.
+  virtual void read_event() = 0;
+  virtual void write_event() = 0;
+  // true -> output is still waiting to be sent
+  virtual bool is_queued() = 0;
+
+  // connection mode recorded in the authentication metadata
+  int get_con_mode() const {
+    return auth_meta->con_mode;
+  }
+};
+
+#endif /* _MSG_ASYNC_PROTOCOL_ */
diff --git a/src/msg/async/ProtocolV1.cc b/src/msg/async/ProtocolV1.cc
new file mode 100644
index 00000000..9a7ab9d4
--- /dev/null
+++ b/src/msg/async/ProtocolV1.cc
@@ -0,0 +1,2547 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ProtocolV1.h"
+
+#include "common/errno.h"
+
+#include "AsyncConnection.h"
+#include "AsyncMessenger.h"
+#include "common/EventTrace.h"
+#include "include/random.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix _conn_prefix(_dout)
+// Log-line prefix: local addresses, peer addresses, then the connection and
+// protocol pointers followed by port, state name, peer global seq, connect
+// seq and the lossy flag.
+ostream &ProtocolV1::_conn_prefix(std::ostream *_dout) {
+  std::ostream &out = *_dout;
+  out << "--1- " << messenger->get_myaddrs() << " >> "
+      << *connection->peer_addrs;
+  out << " conn(" << connection << " " << this;
+  out << " :" << connection->port << " s=" << get_state_name(state);
+  out << " pgs=" << peer_global_seq << " cs=" << connect_seq;
+  out << " l=" << connection->policy.lossy << ").";
+  return out;
+}
+
+// Write bufferlist B, then continue with handler C (called with the result).
+#define WRITE(B, C) write(CONTINUATION(C), B)
+
+// Read L bytes without naming a destination (read() falls back to
+// temp_buffer when no buffer is supplied), then continue with handler C.
+#define READ(L, C) read(CONTINUATION(C), L)
+
+// Read L bytes into buffer B, then continue with handler C.
+#define READB(L, B, C) read(CONTINUATION(C), L, B)
+
+// Constant to limit starting sequence number to 2^31.  Nothing special about
+// it, just a big number.  PLR
+#define SEQ_MASK 0x7fffffff
+
+const int ASYNC_COALESCE_THRESHOLD = 256;
+
+using namespace std;
+
+// Append to `data` a single page-aligned allocation for `len` bytes of
+// payload.  When `off` is not page aligned, an extra leading page is
+// allocated and the pointer is offset so that the first `head` bytes end
+// exactly on that page's boundary.
+static void alloc_aligned_buffer(bufferlist &data, unsigned len, unsigned off) {
+  // position of the payload within its page; zero when already aligned
+  const auto in_page = off & ~CEPH_PAGE_MASK;
+  unsigned head = 0;
+  unsigned alloc_len = len;
+  if (in_page) {
+    // bytes that fit before the next page boundary, capped at len
+    head = std::min<uint64_t>(CEPH_PAGE_SIZE - in_page, len);
+    alloc_len = CEPH_PAGE_SIZE + (len - head);
+  }
+  bufferptr ptr(buffer::create_small_page_aligned(alloc_len));
+  if (head) {
+    ptr.set_offset(CEPH_PAGE_SIZE - head);
+  }
+  data.push_back(std::move(ptr));
+}
+
+/**
+ * Protocol V1
+ **/
+
+// Construct a v1 protocol instance in state NONE with writing disabled.
+// temp_buffer is a 4096-byte scratch area released by the destructor.
+ProtocolV1::ProtocolV1(AsyncConnection *connection)
+    : Protocol(1, connection),
+      temp_buffer(new char[4096]),
+      can_write(WriteStatus::NOWRITE),
+      keepalive(false),
+      connect_seq(0),
+      peer_global_seq(0),
+      msg_left(0),
+      cur_msg_size(0),
+      replacing(false),
+      is_reset_from_peer(false),
+      once_ready(false),
+      state(NONE),
+      global_seq(0),
+      authorizer(nullptr),
+      wait_for_seq(false) {}
+
+ProtocolV1::~ProtocolV1() {
+  // nothing may still be queued or waiting for an ack at teardown
+  ceph_assert(out_q.empty());
+  ceph_assert(sent.empty());
+
+  // deleting a null authorizer is a no-op, so no guard is needed
+  delete authorizer;
+  delete[] temp_buffer;
+}
+
+// Enter START_CONNECT with a clean slate: drop any previous authorizer and
+// its buffer, zero the connect/reply structures and take a new global
+// sequence number from the messenger.
+void ProtocolV1::connect() {
+  this->state = START_CONNECT;
+
+  // `delete` on a null pointer is a no-op
+  delete authorizer;
+  authorizer = nullptr;
+  authorizer_buf.clear();
+
+  // FIPS zeroization audit 20191115: these memsets are not security related.
+  memset(&connect_reply, 0, sizeof(connect_reply));
+  memset(&connect_msg, 0, sizeof(connect_msg));
+
+  global_seq = messenger->get_global_seq();
+}
+
+// Enter START_ACCEPT; the accept handshake itself starts from read_event().
+void ProtocolV1::accept() {
+  state = START_ACCEPT;
+}
+
+// True exactly when the write status is CANWRITE.
+bool ProtocolV1::is_connected() {
+  const auto status = can_write.load();
+  return status == WriteStatus::CANWRITE;
+}
+
+// Close the connection for good.  Does nothing when the state is already
+// CLOSED.  Otherwise flushes connection->delay_state (if any) and then, with
+// write_lock held, calls reset_recv_state(), discard_out_queue() and
+// connection->_stop() before marking both the write status and the protocol
+// state CLOSED.
+void ProtocolV1::stop() {
+  ldout(cct, 20) << __func__ << dendl;
+  if (state == CLOSED) {
+    return;
+  }
+
+  // flushed before write_lock is taken
+  if (connection->delay_state) connection->delay_state->flush();
+
+  ldout(cct, 2) << __func__ << dendl;
+  std::lock_guard<std::mutex> l(connection->write_lock);
+
+  reset_recv_state();
+  discard_out_queue();
+
+  connection->_stop();
+
+  // state changes last, still under write_lock
+  can_write = WriteStatus::CLOSED;
+  state = CLOSED;
+}
+
+// Handle a connection failure.  Depending on the policy and the current
+// state this
+//   - closes the connection and queues a reset (lossy policy outside the
+//     initial connect, or a half-accepted connection with nothing to send),
+//   - parks the connection in STANDBY, or
+//   - moves back to START_CONNECT, either after a backoff delay or
+//     immediately.
+// NOTE(review): write_event() takes connection->lock before calling this --
+// confirm whether holding that lock is a requirement for every caller.
+void ProtocolV1::fault() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  if (state == CLOSED || state == NONE) {
+    ldout(cct, 10) << __func__ << " connection is already closed" << dendl;
+    return;
+  }
+
+  // lossy connections are never retried once past the initial connect phase
+  if (connection->policy.lossy && state != START_CONNECT &&
+      state != CONNECTING) {
+    ldout(cct, 1) << __func__ << " on lossy channel, failing" << dendl;
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return;
+  }
+
+  // write_lock is managed by hand from here on; every exit path below
+  // releases it exactly once
+  connection->write_lock.lock();
+  can_write = WriteStatus::NOWRITE;
+  is_reset_from_peer = false;
+
+  // requeue sent items
+  requeue_sent();
+
+  // an accept that never became ready and has nothing to send is closed
+  if (!once_ready && out_q.empty() && state >= START_ACCEPT &&
+      state <= ACCEPTING_WAIT_CONNECT_MSG_AUTH && !replacing) {
+    ldout(cct, 10) << __func__ << " with nothing to send and in the half "
+                   << " accept state just closed" << dendl;
+    // stop() takes write_lock itself, so release it first
+    connection->write_lock.unlock();
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return;
+  }
+  replacing = false;
+
+  connection->fault();
+
+  reset_recv_state();
+
+  // nothing queued and no keepalive pending: wait in STANDBY
+  if (connection->policy.standby && out_q.empty() && !keepalive &&
+      state != WAIT) {
+    ldout(cct, 10) << __func__ << " with nothing to send, going to standby"
+                   << dendl;
+    state = STANDBY;
+    connection->write_lock.unlock();
+    return;
+  }
+
+  connection->write_lock.unlock();
+
+  if ((state >= START_CONNECT && state <= CONNECTING_SEND_CONNECT_MSG) ||
+      state == WAIT) {
+    // backoff!
+    // WAIT uses the maximum delay; otherwise start at ms_initial_backoff and
+    // double on every further failure, capped at ms_max_backoff
+    if (state == WAIT) {
+      backoff.set_from_double(cct->_conf->ms_max_backoff);
+    } else if (backoff == utime_t()) {
+      backoff.set_from_double(cct->_conf->ms_initial_backoff);
+    } else {
+      backoff += backoff;
+      if (backoff > cct->_conf->ms_max_backoff)
+        backoff.set_from_double(cct->_conf->ms_max_backoff);
+    }
+
+    global_seq = messenger->get_global_seq();
+    state = START_CONNECT;
+    connection->state = AsyncConnection::STATE_CONNECTING;
+    ldout(cct, 10) << __func__ << " waiting " << backoff << dendl;
+    // woke up again;
+    // the timer takes microseconds, hence nanoseconds / 1000
+    connection->register_time_events.insert(
+        connection->center->create_time_event(backoff.to_nsec() / 1000,
+                                              connection->wakeup_handler));
+  } else {
+    // policy maybe empty when state is in accept
+    if (connection->policy.server) {
+      ldout(cct, 0) << __func__ << " server, going to standby" << dendl;
+      state = STANDBY;
+    } else {
+      ldout(cct, 0) << __func__ << " initiating reconnect" << dendl;
+      connect_seq++;
+      global_seq = messenger->get_global_seq();
+      state = START_CONNECT;
+      connection->state = AsyncConnection::STATE_CONNECTING;
+    }
+    // no delay on this path: clear the backoff and kick the read handler
+    backoff = utime_t();
+    connection->center->dispatch_event_external(connection->read_handler);
+  }
+}
+
+// Queue message m for sending.  Messages that can be fast-dispatched are
+// encoded before write_lock is taken; that encoding is thrown away again if
+// writing is currently disabled or the connection features changed in the
+// meantime.  On a closed connection the message reference is dropped.
+void ProtocolV1::send_message(Message *m) {
+  bufferlist bl;
+  uint64_t f = connection->get_features();
+
+  // TODO: Currently not all messages supports reencode like MOSDMap, so here
+  // only let fast dispatch support messages prepare message
+  const bool preencode = messenger->ms_can_fast_dispatch(m);
+  if (preencode) {
+    prepare_send_message(f, m, bl);
+  }
+
+  std::lock_guard<std::mutex> l(connection->write_lock);
+  // "features" changes will change the payload encoding
+  if (preencode &&
+      (can_write == WriteStatus::NOWRITE || connection->get_features() != f)) {
+    // ensure the correctness of message encoding
+    bl.clear();
+    m->clear_payload();
+    ldout(cct, 5) << __func__ << " clear encoded buffer previous " << f
+                  << " != " << connection->get_features() << dendl;
+  }
+
+  if (can_write == WriteStatus::CLOSED) {
+    ldout(cct, 10) << __func__ << " connection closed."
+                   << " Drop message " << m << dendl;
+    m->put();
+    return;
+  }
+
+  m->trace.event("async enqueueing message");
+  out_q[m->get_priority()].emplace_back(std::move(bl), m);
+  ldout(cct, 15) << __func__ << " inline write is denied, reschedule m=" << m
+                 << dendl;
+  // schedule the write handler unless one is already pending or the
+  // connection is being replaced
+  if (can_write != WriteStatus::REPLACING && !write_in_progress) {
+    write_in_progress = true;
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+}
+
+// Encode m for the given feature set and append its payload, middle and data
+// sections, in that order, to bl.
+void ProtocolV1::prepare_send_message(uint64_t features, Message *m,
+                                      bufferlist &bl) {
+  ldout(cct, 20) << __func__ << " m " << *m << dendl;
+
+  // purely informational: say whether a payload already exists
+  if (!m->empty_payload()) {
+    ldout(cct, 20) << __func__ << " half-reencoding features " << features
+                   << " " << m << " " << *m << dendl;
+  } else {
+    ldout(cct, 20) << __func__ << " encoding features " << features << " " << m
+                   << " " << *m << dendl;
+  }
+
+  // encode, then copy the three sections out of *m
+  m->encode(features, messenger->crcflags);
+
+  bl.append(m->get_payload());
+  bl.append(m->get_middle());
+  bl.append(m->get_data());
+}
+
+// Request a keepalive: set the flag under write_lock and kick the write
+// handler.  Nothing happens on a closed connection.
+void ProtocolV1::send_keepalive() {
+  ldout(cct, 10) << __func__ << dendl;
+  std::lock_guard<std::mutex> l(connection->write_lock);
+  if (can_write == WriteStatus::CLOSED) {
+    return;
+  }
+  keepalive = true;
+  connection->center->dispatch_event_external(connection->write_handler);
+}
+
+// Resume the state machine at the step that matches the current state.
+// States without an entry below are left alone.
+void ProtocolV1::read_event() {
+  ldout(cct, 20) << __func__ << dendl;
+  if (state == START_CONNECT) {
+    CONTINUATION_RUN(CONTINUATION(send_client_banner));
+  } else if (state == START_ACCEPT) {
+    CONTINUATION_RUN(CONTINUATION(send_server_banner));
+  } else if (state == OPENED) {
+    CONTINUATION_RUN(CONTINUATION(wait_message));
+  } else if (state == THROTTLE_MESSAGE) {
+    CONTINUATION_RUN(CONTINUATION(throttle_message));
+  } else if (state == THROTTLE_BYTES) {
+    CONTINUATION_RUN(CONTINUATION(throttle_bytes));
+  } else if (state == THROTTLE_DISPATCH_QUEUE) {
+    CONTINUATION_RUN(CONTINUATION(throttle_dispatch_queue));
+  }
+}
+
+// Write-side event handler.
+//
+// While the status is CANWRITE: append a pending keepalive, then drain
+// out_q one message at a time (write_lock is dropped around encoding and
+// write_message()), and finally send an ack for received messages or flush
+// whatever is still queued.  A negative result ends in fault() with
+// connection->lock held.
+//
+// In any other status: reconnect from STANDBY when this side is not the
+// server and output is queued, otherwise try to flush the connection's
+// pending output if there is a socket.
+void ProtocolV1::write_event() {
+  ldout(cct, 10) << __func__ << dendl;
+  ssize_t r = 0;
+
+  connection->write_lock.lock();
+  if (can_write == WriteStatus::CANWRITE) {
+    if (keepalive) {
+      append_keepalive_or_ack();
+      keepalive = false;
+    }
+
+    auto start = ceph::mono_clock::now();
+    bool more;
+    do {
+      bufferlist data;
+      Message *m = _get_next_outgoing(&data);
+      if (!m) {
+        break;
+      }
+
+      if (!connection->policy.lossy) {
+        // put on sent list
+        sent.push_back(m);
+        m->get();
+      }
+      more = !out_q.empty();
+      // encoding and the socket write happen without write_lock
+      connection->write_lock.unlock();
+
+      // send_message or requeue messages may not encode message
+      if (!data.length()) {
+        prepare_send_message(connection->get_features(), m, data);
+      }
+
+      r = write_message(m, data, more);
+
+      connection->write_lock.lock();
+      // r == 0: keep draining; any other value leaves the loop
+      if (r == 0) {
+        ;
+      } else if (r < 0) {
+        ldout(cct, 1) << __func__ << " send msg failed" << dendl;
+        break;
+      } else if (r > 0)
+        break;
+    } while (can_write == WriteStatus::CANWRITE);
+    write_in_progress = false;
+    connection->write_lock.unlock();
+
+    // r > 0 means data is still left to send, so there is no need to call
+    // _try_send here.
+    if (r == 0) {
+      uint64_t left = ack_left;
+      if (left) {
+        // acknowledge everything received so far with one ACK tag + in_seq
+        ceph_le64 s;
+        s = in_seq;
+        connection->outgoing_bl.append(CEPH_MSGR_TAG_ACK);
+        connection->outgoing_bl.append((char *)&s, sizeof(s));
+        ldout(cct, 10) << __func__ << " try send msg ack, acked " << left
+                       << " messages" << dendl;
+        ack_left -= left;
+        left = ack_left;
+        r = connection->_try_send(left);
+      } else if (is_queued()) {
+        r = connection->_try_send();
+      }
+    }
+
+    connection->logger->tinc(l_msgr_running_send_time,
+                             ceph::mono_clock::now() - start);
+    if (r < 0) {
+      ldout(cct, 1) << __func__ << " send msg failed" << dendl;
+      connection->lock.lock();
+      fault();
+      connection->lock.unlock();
+      return;
+    }
+  } else {
+    write_in_progress = false;
+    // write_lock is released and re-taken so that connection->lock is
+    // acquired first
+    connection->write_lock.unlock();
+    connection->lock.lock();
+    connection->write_lock.lock();
+    if (state == STANDBY && !connection->policy.server && is_queued()) {
+      ldout(cct, 10) << __func__ << " policy.server is false" << dendl;
+      connection->_connect();
+    } else if (connection->cs && state != NONE && state != CLOSED &&
+               state != START_CONNECT) {
+      r = connection->_try_send();
+      if (r < 0) {
+        ldout(cct, 1) << __func__ << " send outcoming bl failed" << dendl;
+        // fault() takes write_lock itself, so release it first
+        connection->write_lock.unlock();
+        fault();
+        connection->lock.unlock();
+        return;
+      }
+    }
+    connection->write_lock.unlock();
+    connection->lock.unlock();
+  }
+}
+
+// True when messages wait in out_q or the connection itself reports queued
+// output.
+bool ProtocolV1::is_queued() {
+  if (!out_q.empty()) {
+    return true;
+  }
+  return connection->is_queued();
+}
+
+// Run the chain that starts at pcontinuation; a null pointer is ignored.
+void ProtocolV1::run_continuation(CtPtr pcontinuation) {
+  if (!pcontinuation) {
+    return;
+  }
+  CONTINUATION_RUN(*pcontinuation);
+}
+
+// Read len bytes into buffer (temp_buffer when buffer is null) and continue
+// with `next`.  When connection->read() returns a value <= 0 the step is
+// returned so the caller runs it right away; otherwise nullptr is returned
+// and the callback passed to connection->read() runs the step.
+CtPtr ProtocolV1::read(CONTINUATION_RX_TYPE<ProtocolV1> &next,
+                       int len, char *buffer) {
+  if (buffer == nullptr) {
+    buffer = temp_buffer;
+  }
+  ssize_t r = connection->read(len, buffer,
+                               [&next, this](char *buf, int res) {
+                                 next.setParams(buf, res);
+                                 CONTINUATION_RUN(next);
+                               });
+  if (r > 0) {
+    return nullptr;
+  }
+
+  next.setParams(buffer, r);
+  return &next;
+}
+
+// Write `buffer` and continue with `next`.  When connection->write() returns
+// a value <= 0 the step is returned so the caller runs it right away;
+// otherwise nullptr is returned and the callback passed to
+// connection->write() runs the step.
+CtPtr ProtocolV1::write(CONTINUATION_TX_TYPE<ProtocolV1> &next,
+                        bufferlist &buffer) {
+  ssize_t r = connection->write(buffer, [&next, this](int res) {
+    next.setParams(res);
+    CONTINUATION_RUN(next);
+  });
+  if (r > 0) {
+    return nullptr;
+  }
+
+  next.setParams(r);
+  return &next;
+}
+
+// Switch the connection into its open state: re-arm the inactivity tick
+// timer, enable writing (kicking the write handler if output is already
+// queued), then set the state to OPENED and start waiting for the first tag.
+CtPtr ProtocolV1::ready() {
+  ldout(cct, 25) << __func__ << dendl;
+
+  // make sure no pending tick timer
+  if (connection->last_tick_id) {
+    connection->center->delete_time_event(connection->last_tick_id);
+  }
+  connection->last_tick_id = connection->center->create_time_event(
+      connection->inactive_timeout_us, connection->tick_handler);
+
+  // can_write flips to CANWRITE under write_lock
+  connection->write_lock.lock();
+  can_write = WriteStatus::CANWRITE;
+  if (is_queued()) {
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+  connection->write_lock.unlock();
+  connection->maybe_start_delay_thread();
+
+  // wait_message() only proceeds in state OPENED, so set it first
+  state = OPENED;
+  return wait_message();
+}
+
+// Read the next one-byte tag, but only while the state is still OPENED.
+CtPtr ProtocolV1::wait_message() {
+  if (state == OPENED) {
+    ldout(cct, 20) << __func__ << dendl;
+
+    return READ(sizeof(char), handle_message);
+  }
+
+  // must have changed due to a replace
+  return nullptr;
+}
+
+// Dispatch on the tag byte that was just read.  Tags that carry a body
+// schedule the read of that body; KEEPALIVE and CLOSE are handled in place
+// and end the chain; an unknown tag is treated as a fault.
+CtPtr ProtocolV1::handle_message(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read tag failed" << dendl;
+    return _fault();
+  }
+
+  char tag = buffer[0];
+  ldout(cct, 20) << __func__ << " process tag " << (int)tag << dendl;
+
+  switch (tag) {
+    case CEPH_MSGR_TAG_KEEPALIVE:
+      ldout(cct, 20) << __func__ << " got KEEPALIVE" << dendl;
+      connection->set_last_keepalive(ceph_clock_now());
+      break;
+    case CEPH_MSGR_TAG_KEEPALIVE2:
+      return READ(sizeof(ceph_timespec), handle_keepalive2);
+    case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
+      return READ(sizeof(ceph_timespec), handle_keepalive2_ack);
+    case CEPH_MSGR_TAG_ACK:
+      return READ(sizeof(ceph_le64), handle_tag_ack);
+    case CEPH_MSGR_TAG_MSG:
+#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
+      ltt_recv_stamp = ceph_clock_now();
+#endif
+      recv_stamp = ceph_clock_now();
+      ldout(cct, 20) << __func__ << " begin MSG" << dendl;
+      return READ(sizeof(ceph_msg_header), handle_message_header);
+    case CEPH_MSGR_TAG_CLOSE:
+      ldout(cct, 20) << __func__ << " got CLOSE" << dendl;
+      stop();
+      break;
+    default:
+      ldout(cct, 0) << __func__ << " bad tag " << (int)tag << dendl;
+      return _fault();
+  }
+  return nullptr;
+}
+
+// Body of a KEEPALIVE2 tag: queue a KEEPALIVE2_ACK that echoes the peer's
+// timestamp, record the keepalive time and, when connected, kick the write
+// handler so the ack goes out.
+CtPtr ProtocolV1::handle_keepalive2(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read keeplive timespec failed" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 30) << __func__ << " got KEEPALIVE2 tag ..." << dendl;
+
+  utime_t kp_t = utime_t(*reinterpret_cast<ceph_timespec *>(buffer));
+  {
+    // the outgoing buffer is only touched with write_lock held
+    std::lock_guard<std::mutex> guard(connection->write_lock);
+    append_keepalive_or_ack(true, &kp_t);
+  }
+
+  ldout(cct, 20) << __func__ << " got KEEPALIVE2 " << kp_t << dendl;
+  connection->set_last_keepalive(ceph_clock_now());
+
+  if (is_connected()) {
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+
+  return CONTINUE(wait_message);
+}
+
+// Append a keepalive-related tag to the connection's outgoing buffer:
+//   ack            -> KEEPALIVE2_ACK followed by the timestamp in *tp
+//   peer supports
+//   KEEPALIVE2     -> KEEPALIVE2 followed by the current time
+//   otherwise      -> bare KEEPALIVE tag
+void ProtocolV1::append_keepalive_or_ack(bool ack, utime_t *tp) {
+  ldout(cct, 10) << __func__ << dendl;
+  auto &out = connection->outgoing_bl;
+
+  if (ack) {
+    ceph_assert(tp);
+    struct ceph_timespec ts;
+    tp->encode_timeval(&ts);
+    out.append(CEPH_MSGR_TAG_KEEPALIVE2_ACK);
+    out.append((char *)&ts, sizeof(ts));
+    return;
+  }
+
+  if (!connection->has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) {
+    out.append(CEPH_MSGR_TAG_KEEPALIVE);
+    return;
+  }
+
+  struct ceph_timespec ts;
+  utime_t now = ceph_clock_now();
+  now.encode_timeval(&ts);
+  out.append(CEPH_MSGR_TAG_KEEPALIVE2);
+  out.append((char *)&ts, sizeof(ts));
+}
+
+// Body of a KEEPALIVE2_ACK tag: record the echoed timestamp on the
+// connection and go back to waiting for the next tag.
+CtPtr ProtocolV1::handle_keepalive2_ack(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read keeplive timespec failed" << dendl;
+    return _fault();
+  }
+
+  ceph_timespec *echoed = reinterpret_cast<ceph_timespec *>(buffer);
+  connection->set_last_keepalive_ack(utime_t(*echoed));
+  ldout(cct, 20) << __func__ << " got KEEPALIVE_ACK" << dendl;
+
+  return CONTINUE(wait_message);
+}
+
+// Body of an ACK tag: drop messages from the front of `sent` whose sequence
+// number is covered by the acked one.  At most max_pending messages are
+// trimmed per ack; their references are released after write_lock is
+// dropped.
+CtPtr ProtocolV1::handle_tag_ack(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read ack seq failed" << dendl;
+    return _fault();
+  }
+
+  ceph_le64 seq;
+  seq = *reinterpret_cast<ceph_le64 *>(buffer);
+  ldout(cct, 20) << __func__ << " got ACK" << dendl;
+
+  ldout(cct, 15) << __func__ << " got ack seq " << seq << dendl;
+  // trim sent list
+  static const int max_pending = 128;
+  Message *acked[max_pending];
+  int count = 0;
+  {
+    std::lock_guard<std::mutex> guard(connection->write_lock);
+    for (; count < max_pending && !sent.empty(); ++count) {
+      Message *m = sent.front();
+      if (!(m->get_seq() <= seq)) {
+        break;
+      }
+      sent.pop_front();
+      acked[count] = m;
+      ldout(cct, 10) << __func__ << " got ack seq " << seq
+                     << " >= " << m->get_seq() << " on " << m << " " << *m
+                     << dendl;
+    }
+  }
+  for (int k = 0; k < count; ++k) {
+    acked[k]->put();
+  }
+
+  return CONTINUE(wait_message);
+}
+
+// A message header has been read: keep a copy, verify its crc when header
+// crcs are enabled, clear the per-message buffers and move on to the
+// throttling steps.
+CtPtr ProtocolV1::handle_message_header(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read message header failed" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " got MSG header" << dendl;
+
+  current_header = *reinterpret_cast<ceph_msg_header *>(buffer);
+
+  ldout(cct, 20) << __func__ << " got envelope type=" << current_header.type << " src "
+                 << entity_name_t(current_header.src) << " front=" << current_header.front_len
+                 << " data=" << current_header.data_len << " off " << current_header.data_off
+                 << dendl;
+
+  if (messenger->crcflags & MSG_CRC_HEADER) {
+    // the crc covers the header minus its trailing crc field
+    const __u32 computed =
+        ceph_crc32c(0, (unsigned char *)&current_header,
+                    sizeof(current_header) - sizeof(current_header.crc));
+    if (computed != current_header.crc) {
+      ldout(cct, 0) << __func__ << " got bad header crc " << computed
+                    << " != " << current_header.crc << dendl;
+      return _fault();
+    }
+  }
+
+  // start the new message with empty buffers
+  data_buf.clear();
+  front.clear();
+  middle.clear();
+  data.clear();
+
+  state = THROTTLE_MESSAGE;
+  return CONTINUE(throttle_message);
+}
+
+// Take one slot from the policy's message throttler, if there is one.  When
+// the throttler is full the chain stops and a wakeup timer (1000us) is armed
+// unless a timer is already registered.
+CtPtr ProtocolV1::throttle_message() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const auto &throttler = connection->policy.throttler_messages;
+  if (throttler) {
+    ldout(cct, 10) << __func__ << " wants " << 1
+                   << " message from policy throttler "
+                   << throttler->get_current()
+                   << "/" << throttler->get_max()
+                   << dendl;
+    if (!throttler->get_or_fail()) {
+      ldout(cct, 10) << __func__ << " wants 1 message from policy throttle "
+                     << throttler->get_current()
+                     << "/" << throttler->get_max()
+                     << " failed, just wait." << dendl;
+      // the thread pool will not drain a full message queue quickly, so
+      // waiting a millisecond before retrying is fine
+      if (connection->register_time_events.empty()) {
+        connection->register_time_events.insert(
+            connection->center->create_time_event(1000,
+                                                  connection->wakeup_handler));
+      }
+      return nullptr;
+    }
+  }
+
+  state = THROTTLE_BYTES;
+  return CONTINUE(throttle_bytes);
+}
+
+// Reserve front + middle + data bytes from the policy's byte throttler, if
+// there is one and the message is not empty.  When the throttler is full the
+// chain stops and a wakeup timer (1000us) is armed unless a timer is already
+// registered.
+CtPtr ProtocolV1::throttle_bytes() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  cur_msg_size = current_header.front_len + current_header.middle_len +
+                 current_header.data_len;
+  const auto &throttler = connection->policy.throttler_bytes;
+  if (cur_msg_size && throttler) {
+    ldout(cct, 10) << __func__ << " wants " << cur_msg_size
+                   << " bytes from policy throttler "
+                   << throttler->get_current() << "/"
+                   << throttler->get_max() << dendl;
+    if (!throttler->get_or_fail(cur_msg_size)) {
+      ldout(cct, 10) << __func__ << " wants " << cur_msg_size
+                     << " bytes from policy throttler "
+                     << throttler->get_current()
+                     << "/" << throttler->get_max()
+                     << " failed, just wait." << dendl;
+      // the thread pool will not drain a full message queue quickly, so
+      // waiting a millisecond before retrying is fine
+      if (connection->register_time_events.empty()) {
+        connection->register_time_events.insert(
+            connection->center->create_time_event(
+                1000, connection->wakeup_handler));
+      }
+      return nullptr;
+    }
+  }
+
+  state = THROTTLE_DISPATCH_QUEUE;
+  return CONTINUE(throttle_dispatch_queue);
+}
+
+// Reserve cur_msg_size bytes from the dispatch queue throttler (skipped for
+// empty messages).  When the throttler is full the chain stops and a wakeup
+// timer (1000us) is armed unless a timer is already registered.  On success
+// the throttle timestamp is taken and the front section is read.
+CtPtr ProtocolV1::throttle_dispatch_queue() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  auto &throttler = connection->dispatch_queue->dispatch_throttler;
+  if (cur_msg_size && !throttler.get_or_fail(cur_msg_size)) {
+    ldout(cct, 10)
+        << __func__ << " wants " << cur_msg_size
+        << " bytes from dispatch throttle "
+        << throttler.get_current() << "/"
+        << throttler.get_max()
+        << " failed, just wait." << dendl;
+    // the thread pool will not drain a full message queue quickly, so
+    // waiting a millisecond before retrying is fine
+    if (connection->register_time_events.empty()) {
+      connection->register_time_events.insert(
+          connection->center->create_time_event(1000,
+                                                connection->wakeup_handler));
+    }
+    return nullptr;
+  }
+
+  throttle_stamp = ceph_clock_now();
+
+  state = READ_MESSAGE_FRONT;
+  return read_message_front();
+}
+
+// Read the front section when the header announces one, allocating its
+// buffer on first use; otherwise go straight to the middle section.
+CtPtr ProtocolV1::read_message_front() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const unsigned front_len = current_header.front_len;
+  if (!front_len) {
+    return read_message_middle();
+  }
+
+  if (!front.length()) {
+    front.push_back(buffer::create(front_len));
+  }
+  return READB(front_len, front.c_str(), handle_message_front);
+}
+
+// Front section read finished: fault on error, otherwise continue with the
+// middle section.
+CtPtr ProtocolV1::handle_message_front(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r >= 0) {
+    ldout(cct, 20) << __func__ << " got front " << front.length() << dendl;
+
+    return read_message_middle();
+  }
+
+  ldout(cct, 1) << __func__ << " read message front failed" << dendl;
+  return _fault();
+}
+
+// Read the middle section when the header announces one, allocating its
+// buffer on first use; otherwise go straight to preparing the data section.
+CtPtr ProtocolV1::read_message_middle() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  if (!current_header.middle_len) {
+    return read_message_data_prepare();
+  }
+
+  if (!middle.length()) {
+    middle.push_back(buffer::create(current_header.middle_len));
+  }
+  return READB(current_header.middle_len, middle.c_str(),
+               handle_message_middle);
+}
+
+// Middle section read finished: fault on error, otherwise continue with
+// preparing the data section.
+CtPtr ProtocolV1::handle_message_middle(char *buffer, int r) {
+  // " r=" (with the '=') matches the format every sibling handler logs
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read message middle failed" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " got middle " << middle.length() << dendl;
+
+  return read_message_data_prepare();
+}
+
+// Prepare to read the data section: when the header announces data,
+// allocate a receive buffer aligned to the header's data offset and point
+// data_blp at its start.  msg_left is set to the number of data bytes still
+// to be read.
+CtPtr ProtocolV1::read_message_data_prepare() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  unsigned data_len = le32_to_cpu(current_header.data_len);
+  unsigned data_off = le32_to_cpu(current_header.data_off);
+
+  if (data_len) {
+    // get a buffer
+    // The first branch below is compiled out; only the #else branch is live.
+#if 0
+    // rx_buffers is broken by design... see
+    //  http://tracker.ceph.com/issues/22480
+    map<ceph_tid_t, pair<bufferlist, int> >::iterator p =
+        connection->rx_buffers.find(current_header.tid);
+    if (p != connection->rx_buffers.end()) {
+      ldout(cct, 10) << __func__ << " seleting rx buffer v " << p->second.second
+                     << " at offset " << data_off << " len "
+                     << p->second.first.length() << dendl;
+      data_buf = p->second.first;
+      // make sure it's big enough
+      if (data_buf.length() < data_len)
+        data_buf.push_back(buffer::create(data_len - data_buf.length()));
+      data_blp = data_buf.begin();
+    } else {
+      ldout(cct, 20) << __func__ << " allocating new rx buffer at offset "
+                     << data_off << dendl;
+      alloc_aligned_buffer(data_buf, data_len, data_off);
+      data_blp = data_buf.begin();
+    }
+#else
+    ldout(cct, 20) << __func__ << " allocating new rx buffer at offset "
+		   << data_off << dendl;
+    alloc_aligned_buffer(data_buf, data_len, data_off);
+    data_blp = data_buf.begin();
+#endif
+  }
+
+  // zero when there is no data section, so read_message_data() goes straight
+  // to the footer
+  msg_left = data_len;
+
+  return CONTINUE(read_message_data);
+}
+
+// Read the next piece of the data section into the current receive buffer
+// segment; once nothing is left, move on to the footer.
+CtPtr ProtocolV1::read_message_data() {
+  ldout(cct, 20) << __func__ << " msg_left=" << msg_left << dendl;
+
+  if (msg_left > 0) {
+    // never read past the current segment or past the remaining data
+    bufferptr segment = data_blp.get_current_ptr();
+    unsigned chunk = std::min(segment.length(), msg_left);
+
+    return READB(chunk, segment.c_str(), handle_message_data);
+  }
+
+  return read_message_footer();
+}
+
+// One piece of the data section has arrived: account for it, append it to
+// `data` and go back to read_message_data() for the rest.
+CtPtr ProtocolV1::handle_message_data(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read data error " << dendl;
+    return _fault();
+  }
+
+  // same segment and length that read_message_data() asked for
+  bufferptr segment = data_blp.get_current_ptr();
+  unsigned chunk = std::min(segment.length(), msg_left);
+  ceph_assert(chunk < std::numeric_limits<int>::max());
+  data_blp.advance(chunk);
+  data.append(segment, 0, chunk);
+  msg_left -= chunk;
+
+  return CONTINUE(read_message_data);
+}
+
+// Read the message footer; its size depends on whether the peer supports
+// CEPH_FEATURE_MSG_AUTH (new footer) or not (old footer).
+CtPtr ProtocolV1::read_message_footer() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  state = READ_FOOTER_AND_DISPATCH;
+
+  const bool has_auth = connection->has_feature(CEPH_FEATURE_MSG_AUTH);
+  unsigned len = has_auth ? sizeof(ceph_msg_footer)
+                          : sizeof(ceph_msg_footer_old);
+
+  return READ(len, handle_message_footer);
+}
+
+// Completion handler for the footer read issued by read_message_footer().
+//
+// Decodes the collected front/middle/data sections into a Message, checks
+// its signature (when session security is set) and its sequence number,
+// stamps and accounts for it, and hands it to the delay queue, the
+// fast-dispatch path or the regular dispatch queue. On success it clears
+// the local buffers and continues with wait_message.
+//
+// @param buffer  raw footer bytes: ceph_msg_footer, or ceph_msg_footer_old
+//                when the connection lacks CEPH_FEATURE_MSG_AUTH
+// @param r       read result; a negative value faults the connection
+CtPtr ProtocolV1::handle_message_footer(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read footer data error " << dendl;
+    return _fault();
+  }
+
+  ceph_msg_footer footer;
+  ceph_msg_footer_old old_footer;
+
+  // Normalize to ceph_msg_footer; the old layout carries no signature, so
+  // sig is set to 0 for it.
+  if (connection->has_feature(CEPH_FEATURE_MSG_AUTH)) {
+    footer = *((ceph_msg_footer *)buffer);
+  } else {
+    old_footer = *((ceph_msg_footer_old *)buffer);
+    footer.front_crc = old_footer.front_crc;
+    footer.middle_crc = old_footer.middle_crc;
+    footer.data_crc = old_footer.data_crc;
+    footer.sig = 0;
+    footer.flags = old_footer.flags;
+  }
+
+  // A footer without CEPH_MSG_FOOTER_COMPLETE is treated as an aborted
+  // message and faults the connection.
+  int aborted = (footer.flags & CEPH_MSG_FOOTER_COMPLETE) == 0;
+  ldout(cct, 10) << __func__ << " aborted = " << aborted << dendl;
+  if (aborted) {
+    ldout(cct, 0) << __func__ << " got " << front.length() << " + "
+                  << middle.length() << " + " << data.length()
+                  << " byte message.. ABORTED" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " got " << front.length() << " + "
+                 << middle.length() << " + " << data.length() << " byte message"
+                 << dendl;
+  Message *message = decode_message(cct, messenger->crcflags, current_header,
+                                    footer, front, middle, data, connection);
+  if (!message) {
+    ldout(cct, 1) << __func__ << " decode message failed " << dendl;
+    return _fault();
+  }
+
+  //
+  //  Check the signature if one should be present.  A zero return indicates
+  //  success. PLR
+  //
+
+  if (session_security.get() == NULL) {
+    ldout(cct, 10) << __func__ << " no session security set" << dendl;
+  } else {
+    if (session_security->check_message_signature(message)) {
+      ldout(cct, 0) << __func__ << " Signature check failed" << dendl;
+      message->put();
+      return _fault();
+    }
+  }
+  message->set_byte_throttler(connection->policy.throttler_bytes);
+  message->set_message_throttler(connection->policy.throttler_messages);
+
+  // store reservation size in message, so we don't get confused
+  // by messages entering the dispatch queue through other paths.
+  message->set_dispatch_throttle_size(cur_msg_size);
+
+  message->set_recv_stamp(recv_stamp);
+  message->set_throttle_stamp(throttle_stamp);
+  message->set_recv_complete_stamp(ceph_clock_now());
+
+  // check received seq#.  if it is old, drop the message.
+  // note that incoming messages may skip ahead.  this is convenient for the
+  // client side queueing because messages can't be renumbered, but the (kernel)
+  // client will occasionally pull a message out of the sent queue to send
+  // elsewhere.  in that case it doesn't matter if we "got" it or not.
+  uint64_t cur_seq = in_seq;
+  if (message->get_seq() <= cur_seq) {
+    ldout(cct, 0) << __func__ << " got old message " << message->get_seq()
+                  << " <= " << cur_seq << " " << message << " " << *message
+                  << ", discarding" << dendl;
+    message->put();
+    if (connection->has_feature(CEPH_FEATURE_RECONNECT_SEQ) &&
+        cct->_conf->ms_die_on_old_message) {
+      ceph_assert(0 == "old msgs despite reconnect_seq feature");
+    }
+    // NOTE(review): unlike the success path below, this returns without
+    // CONTINUE(wait_message), without clearing data_buf/front/middle/data
+    // and without moving `state` off READ_FOOTER_AND_DISPATCH -- confirm
+    // this is the intended way to drop an old message.
+    return nullptr;
+  }
+  // A gap in the sequence is only logged (and optionally asserted on); the
+  // message is still processed.
+  if (message->get_seq() > cur_seq + 1) {
+    ldout(cct, 0) << __func__ << " missed message?  skipped from seq "
+                  << cur_seq << " to " << message->get_seq() << dendl;
+    if (cct->_conf->ms_die_on_skipped_message) {
+      ceph_assert(0 == "skipped incoming seq");
+    }
+  }
+
+#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
+  if (message->get_type() == CEPH_MSG_OSD_OP ||
+      message->get_type() == CEPH_MSG_OSD_OPREPLY) {
+    utime_t ltt_processed_stamp = ceph_clock_now();
+    double usecs_elapsed =
+        (ltt_processed_stamp.to_nsec() - ltt_recv_stamp.to_nsec()) / 1000;
+    ostringstream buf;
+    if (message->get_type() == CEPH_MSG_OSD_OP)
+      OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OP",
+                           false);
+    else
+      OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OPREPLY",
+                           false);
+  }
+#endif
+
+  // note last received message.
+  in_seq = message->get_seq();
+  ldout(cct, 5) << " rx " << message->get_source() << " seq "
+                << message->get_seq() << " " << message << " " << *message
+                << dendl;
+
+  // On a non-lossy policy, count one more pending ack (presumably acks owed
+  // to the peer) and remember to schedule the write handler at the end.
+  bool need_dispatch_writer = false;
+  if (!connection->policy.lossy) {
+    ack_left++;
+    need_dispatch_writer = true;
+  }
+
+  state = OPENED;
+
+  connection->logger->inc(l_msgr_recv_messages);
+  connection->logger->inc(
+      l_msgr_recv_bytes,
+      cur_msg_size + sizeof(ceph_msg_header) + sizeof(ceph_msg_footer));
+
+  messenger->ms_fast_preprocess(message);
+  auto fast_dispatch_time = ceph::mono_clock::now();
+  connection->logger->tinc(l_msgr_running_recv_time,
+                           fast_dispatch_time - connection->recv_start_time);
+  // Three delivery paths: the delay queue when delay_state is set (with a
+  // randomly injected delay), fast dispatch, or the regular dispatch queue.
+  if (connection->delay_state) {
+    double delay_period = 0;
+    if (rand() % 10000 < cct->_conf->ms_inject_delay_probability * 10000.0) {
+      delay_period =
+          cct->_conf->ms_inject_delay_max * (double)(rand() % 10000) / 10000.0;
+      ldout(cct, 1) << "queue_received will delay after "
+                    << (ceph_clock_now() + delay_period) << " on " << message
+                    << " " << *message << dendl;
+    }
+    connection->delay_state->queue(delay_period, message);
+  } else if (messenger->ms_can_fast_dispatch(message)) {
+    // connection->lock is released around fast_dispatch() and re-acquired
+    // afterwards.
+    connection->lock.unlock();
+    connection->dispatch_queue->fast_dispatch(message);
+    connection->recv_start_time = ceph::mono_clock::now();
+    connection->logger->tinc(l_msgr_running_fast_dispatch_time,
+                             connection->recv_start_time - fast_dispatch_time);
+    connection->lock.lock();
+  } else {
+    connection->dispatch_queue->enqueue(message, message->get_priority(),
+                                        connection->conn_id);
+  }
+
+  // clean up local buffer references
+  data_buf.clear();
+  front.clear();
+  middle.clear();
+  data.clear();
+
+  if (need_dispatch_writer && connection->is_connected()) {
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+
+  return CONTINUE(wait_message);
+}
+
+// Reset the per-session protocol state.
+//
+// Under connection->write_lock: discards delayed messages, this
+// connection's entries in the dispatch queue and the outgoing queues, queues
+// a remote-reset notification, re-seeds out_seq and zeroes the sequence and
+// ack counters. Writing is disabled (WriteStatus::NOWRITE) on return.
+// connection->outgoing_bl is deliberately NOT cleared here; see the note
+// below.
+void ProtocolV1::session_reset() {
+  ldout(cct, 10) << __func__ << " started" << dendl;
+
+  std::lock_guard<std::mutex> l(connection->write_lock);
+  if (connection->delay_state) {
+    connection->delay_state->discard();
+  }
+
+  connection->dispatch_queue->discard_queue(connection->conn_id);
+  discard_out_queue();
+  // note: we need to clear outgoing_bl here, but session_reset may be
+  // called by other thread, so let caller clear this itself!
+  // outgoing_bl.clear();
+
+  connection->dispatch_queue->queue_remote_reset(connection);
+
+  randomize_out_seq();
+
+  in_seq = 0;
+  connect_seq = 0;
+  // it's safe to directly set 0, double locked
+  ack_left = 0;
+  once_ready = false;
+  can_write = WriteStatus::NOWRITE;
+}
+
+/**
+ * Pick the starting value of out_seq for a (new) session.
+ *
+ * With CEPH_FEATURE_MSG_AUTH negotiated, out_seq starts at a random value
+ * in [0, SEQ_MASK] so the CRC is not predictable; otherwise it starts at 0,
+ * as sequence numbers always used to.
+ */
+void ProtocolV1::randomize_out_seq() {
+  const bool msg_auth =
+      (connection->get_features() & CEPH_FEATURE_MSG_AUTH) != 0;
+  if (!msg_auth) {
+    // previously, seq #'s always started at 0.
+    out_seq = 0;
+    return;
+  }
+
+  // Set out_seq to a random value, so CRC won't be predictable.
+  const uint64_t start_seq =
+      ceph::util::generate_random_number<uint64_t>(0, SEQ_MASK);
+  ldout(cct, 10) << __func__ << " randomize_out_seq " << start_seq << dendl;
+  out_seq = start_seq;
+}
+
+/**
+ * Serialize one message onto connection->outgoing_bl and try to send it.
+ *
+ * Must be called from the connection's event-center thread (asserted).
+ * Assigns the next out_seq to the message, optionally computes the header
+ * CRC, signs the message when session security is set, then appends
+ * tag + header + payload + footer to the outgoing buffer. Peers without
+ * CEPH_FEATURE_MSG_AUTH get the old footer layout. The message reference
+ * is dropped (m->put()) before returning, whether or not the send worked.
+ *
+ * @param m     message to send; its reference is consumed
+ * @param bl    encoded payload of `m`; small multi-segment payloads are
+ *              copied, anything else is claimed (moved) into the outgoing
+ *              buffer
+ * @param more  passed through to connection->_try_send()
+ * @return the result of connection->_try_send(); negative on error
+ */
+ssize_t ProtocolV1::write_message(Message *m, bufferlist &bl, bool more) {
+  FUNCTRACE(cct);
+  ceph_assert(connection->center->in_thread());
+  m->set_seq(++out_seq);
+
+  if (messenger->crcflags & MSG_CRC_HEADER) {
+    m->calc_header_crc();
+  }
+
+  ceph_msg_header &header = m->get_header();
+  ceph_msg_footer &footer = m->get_footer();
+
+  // TODO: let sign_message could be reentry?
+  // Now that we have all the crcs calculated, handle the
+  // digital signature for the message, if the AsyncConnection has session
+  // security set up.  Some session security options do not
+  // actually calculate and check the signature, but they should
+  // handle the calls to sign_message and check_signature.  PLR
+  if (!session_security) {
+    ldout(cct, 20) << __func__ << " no session security" << dendl;
+  } else if (session_security->sign_message(m)) {
+    // non-zero return means signing failed; the message is sent anyway
+    ldout(cct, 20) << __func__ << " failed to sign m=" << m
+                   << "): sig = " << footer.sig << dendl;
+  } else {
+    ldout(cct, 20) << __func__ << " signed m=" << m
+                   << "): sig = " << footer.sig << dendl;
+  }
+
+  connection->outgoing_bl.append(CEPH_MSGR_TAG_MSG);
+  connection->outgoing_bl.append((char *)&header, sizeof(header));
+
+  ldout(cct, 20) << __func__ << " sending message type=" << header.type
+                 << " src " << entity_name_t(header.src)
+                 << " front=" << header.front_len << " data=" << header.data_len
+                 << " off " << header.data_off << dendl;
+
+  // Small payloads made of several segments are copied into the outgoing
+  // buffer; everything else is claimed without copying.
+  if ((bl.length() <= ASYNC_COALESCE_THRESHOLD) && (bl.buffers().size() > 1)) {
+    for (const auto &pb : bl.buffers()) {
+      connection->outgoing_bl.append((char *)pb.c_str(), pb.length());
+    }
+  } else {
+    connection->outgoing_bl.claim_append(bl);
+  }
+
+  // send footer; if receiver doesn't support signatures, use the old footer
+  // format
+  if (connection->has_feature(CEPH_FEATURE_MSG_AUTH)) {
+    connection->outgoing_bl.append((char *)&footer, sizeof(footer));
+  } else {
+    ceph_msg_footer_old old_footer;
+    if (messenger->crcflags & MSG_CRC_HEADER) {
+      old_footer.front_crc = footer.front_crc;
+      old_footer.middle_crc = footer.middle_crc;
+    } else {
+      old_footer.front_crc = old_footer.middle_crc = 0;
+    }
+    // data_crc depends only on MSG_CRC_DATA, so it is set exactly once here.
+    old_footer.data_crc =
+        messenger->crcflags & MSG_CRC_DATA ? footer.data_crc : 0;
+    old_footer.flags = footer.flags;
+    connection->outgoing_bl.append((char *)&old_footer, sizeof(old_footer));
+  }
+
+  m->trace.event("async writing message");
+  ldout(cct, 20) << __func__ << " sending " << m->get_seq() << " " << m
+                 << dendl;
+  // Bytes actually sent = buffered length before minus what is left after.
+  ssize_t total_send_size = connection->outgoing_bl.length();
+  ssize_t rc = connection->_try_send(more);
+  if (rc < 0) {
+    ldout(cct, 1) << __func__ << " error sending " << m << ", "
+                  << cpp_strerror(rc) << dendl;
+  } else {
+    connection->logger->inc(
+        l_msgr_send_bytes, total_send_size - connection->outgoing_bl.length());
+    ldout(cct, 10) << __func__ << " sending " << m
+                   << (rc ? " continuely." : " done.") << dendl;
+  }
+  if (m->get_type() == CEPH_MSG_OSD_OP) {
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OP_END", false);
+  } else if (m->get_type() == CEPH_MSG_OSD_OPREPLY) {
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OPREPLY_END", false);
+  }
+  m->put();
+
+  return rc;
+}
+
+/**
+ * Move every message in `sent` back to the front of the highest-priority
+ * outgoing queue, preserving their order, so they are sent again.
+ *
+ * out_seq is rewound by the number of requeued messages. Always clears
+ * write_in_progress, even when there is nothing to requeue.
+ */
+void ProtocolV1::requeue_sent() {
+  write_in_progress = false;
+  if (sent.empty()) {
+    return;
+  }
+
+  auto &resend_q = out_q[CEPH_MSG_PRIO_HIGHEST];
+  out_seq -= sent.size();
+
+  // Walk `sent` newest-first and push each message to the front, which
+  // leaves the oldest message at the head of the queue.
+  for (auto it = sent.rbegin(); it != sent.rend(); ++it) {
+    Message *msg = *it;
+    ldout(cct, 10) << __func__ << " " << *msg << " for resend "
+                   << " (" << msg->get_seq() << ")" << dendl;
+    resend_q.push_front(std::make_pair(bufferlist(), msg));
+  }
+  sent.clear();
+}
+
+/**
+ * Drop requeued messages the peer has already received.
+ *
+ * Takes connection->write_lock. Pops messages from the front of the
+ * highest-priority outgoing queue while their seq is non-zero and <= `seq`,
+ * releasing each one, and removes the queue entry when it becomes empty.
+ *
+ * @param out_seq  sequence number to start counting from (this parameter
+ *                 shadows the member of the same name)
+ * @param seq      highest sequence number acknowledged by the peer
+ * @return `out_seq` plus the number of discarded messages; when there is no
+ *         highest-priority queue at all, `seq` is returned instead
+ */
+uint64_t ProtocolV1::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) {
+  ldout(cct, 10) << __func__ << " " << seq << dendl;
+  std::lock_guard<std::mutex> l(connection->write_lock);
+
+  // Single lookup instead of count() followed by operator[].
+  auto qit = out_q.find(CEPH_MSG_PRIO_HIGHEST);
+  if (qit == out_q.end()) {
+    return seq;
+  }
+
+  auto &rq = qit->second;
+  uint64_t count = out_seq;
+  while (!rq.empty()) {
+    // Only the message pointer is needed; do not copy the queued
+    // pair (and its bufferlist) just to inspect it.
+    Message *m = rq.front().second;
+    if (m->get_seq() == 0 || m->get_seq() > seq) break;
+    ldout(cct, 10) << __func__ << " " << *m << " for resend seq "
+                   << m->get_seq() << " <= " << seq << ", discarding"
+                   << dendl;
+    m->put();
+    rq.pop_front();
+    count++;
+  }
+  if (rq.empty()) out_q.erase(qit);
+  return count;
+}
+
+/*
+ * Release every message still held for sending: first the already-sent
+ * (`sent`) list, then every priority queue in `out_q`. Both containers are
+ * emptied and write_in_progress is cleared.
+ *
+ * Must hold write_lock prior to calling.
+ */
+void ProtocolV1::discard_out_queue() {
+  ldout(cct, 10) << __func__ << " started" << dendl;
+
+  for (Message *msg : sent) {
+    ldout(cct, 20) << __func__ << " discard " << msg << dendl;
+    msg->put();
+  }
+  sent.clear();
+
+  for (auto &prio_queue : out_q) {
+    for (auto &entry : prio_queue.second) {
+      ldout(cct, 20) << __func__ << " discard " << entry.second << dendl;
+      entry.second->put();
+    }
+  }
+  out_q.clear();
+
+  write_in_progress = false;
+}
+
+/**
+ * Drop the authorizer, but only while the connect message is being sent
+ * (state == CONNECTING_SEND_CONNECT_MSG). In any other state this only
+ * logs.
+ */
+void ProtocolV1::reset_security()
+{
+  ldout(cct, 5) << __func__ << dendl;
+
+  // clean up state internal variables and states
+  if (state != CONNECTING_SEND_CONNECT_MSG) {
+    return;
+  }
+
+  // deleting a null pointer is a no-op, so no separate check is needed
+  delete authorizer;
+  authorizer = nullptr;
+}
+
+// Undo whatever the receive path had in flight.
+//
+// Resets the security state (on the event-center thread, see below), drops
+// the pending read/write callbacks, and gives back any throttle budget that
+// was taken for a message that has not been dispatched yet. Which budgets
+// are returned is decided by comparing `state` against the THROTTLE_* /
+// READ_FOOTER_AND_DISPATCH states, i.e. it relies on the ordering of the
+// state enum (declared outside this view).
+void ProtocolV1::reset_recv_state() {
+  ldout(cct, 5) << __func__ << dendl;
+
+  // execute in the same thread that uses the `session_security`.
+  // We need to do the warp because holding `write_lock` is not
+  // enough as `write_event()` releases it just before calling
+  // `write_message()`. `submit_to()` here is NOT blocking.
+  if (!connection->center->in_thread()) {
+    connection->center->submit_to(connection->center->get_id(), [this] {
+      ldout(cct, 5) << "reset_recv_state (warped) reseting security handlers"
+                    << dendl;
+      // Possibly unnecessary. See the comment in `deactivate_existing`.
+      std::lock_guard<std::mutex> l(connection->lock);
+      std::lock_guard<std::mutex> wl(connection->write_lock);
+      reset_security();
+    }, /* nowait = */true);
+  } else {
+    reset_security();
+  }
+
+  // clean read and write callbacks
+  connection->pendingReadLen.reset();
+  connection->writeCallback.reset();
+
+  // Past THROTTLE_MESSAGE: one message slot was taken from the policy
+  // throttler (if there is one); give it back.
+  if (state > THROTTLE_MESSAGE && state <= READ_FOOTER_AND_DISPATCH &&
+      connection->policy.throttler_messages) {
+    ldout(cct, 10) << __func__ << " releasing " << 1
+                   << " message to policy throttler "
+                   << connection->policy.throttler_messages->get_current()
+                   << "/" << connection->policy.throttler_messages->get_max()
+                   << dendl;
+    connection->policy.throttler_messages->put();
+  }
+  // Past THROTTLE_BYTES: return cur_msg_size bytes to the policy byte
+  // throttler (if there is one).
+  if (state > THROTTLE_BYTES && state <= READ_FOOTER_AND_DISPATCH) {
+    if (connection->policy.throttler_bytes) {
+      ldout(cct, 10) << __func__ << " releasing " << cur_msg_size
+                     << " bytes to policy throttler "
+                     << connection->policy.throttler_bytes->get_current() << "/"
+                     << connection->policy.throttler_bytes->get_max() << dendl;
+      connection->policy.throttler_bytes->put(cur_msg_size);
+    }
+  }
+  // Past THROTTLE_DISPATCH_QUEUE: return cur_msg_size bytes to the dispatch
+  // queue throttler.
+  if (state > THROTTLE_DISPATCH_QUEUE && state <= READ_FOOTER_AND_DISPATCH) {
+    ldout(cct, 10)
+        << __func__ << " releasing " << cur_msg_size
+        << " bytes to dispatch_queue throttler "
+        << connection->dispatch_queue->dispatch_throttler.get_current() << "/"
+        << connection->dispatch_queue->dispatch_throttler.get_max() << dendl;
+    connection->dispatch_queue->dispatch_throttle_release(cur_msg_size);
+  }
+}
+
+/**
+ * Pop the next message to send: the first entry of the highest-priority
+ * non-empty queue in out_q.
+ *
+ * @param bl  if non-null, receives (by swap) the bufferlist queued with
+ *            the message
+ * @return the message, or a null pointer when out_q is empty
+ */
+Message *ProtocolV1::_get_next_outgoing(bufferlist *bl) {
+  if (out_q.empty()) {
+    return nullptr;
+  }
+
+  // out_q is ordered by priority, so the last entry is the highest one.
+  auto top = out_q.rbegin();
+  auto &pending = top->second;
+  ceph_assert(!pending.empty());
+
+  auto &head = pending.front();
+  Message *next = head.second;
+  if (bl) {
+    bl->swap(head.first);
+  }
+  pending.pop_front();
+
+  // never leave an empty per-priority list behind
+  if (pending.empty()) {
+    out_q.erase(top->first);
+  }
+  return next;
+}
+
+/**
+ * Client Protocol V1
+ **/
+
+/**
+ * First step of the client handshake: enter CONNECTING and write
+ * CEPH_BANNER to the peer. Continues in handle_client_banner_write().
+ */
+CtPtr ProtocolV1::send_client_banner() {
+  ldout(cct, 20) << __func__ << dendl;
+  state = CONNECTING;
+
+  bufferlist banner;
+  banner.append(CEPH_BANNER, strlen(CEPH_BANNER));
+
+  return WRITE(banner, handle_client_banner_write);
+}
+
+/**
+ * Completion handler for the client banner write.
+ *
+ * @param r  write result; negative faults the connection, otherwise the
+ *           handshake proceeds to wait_server_banner()
+ */
+CtPtr ProtocolV1::handle_client_banner_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r >= 0) {
+    ldout(cct, 10) << __func__ << " connect write banner done: "
+                   << connection->get_peer_addr() << dendl;
+    return wait_server_banner();
+  }
+
+  ldout(cct, 1) << __func__ << " write client banner failed" << dendl;
+  return _fault();
+}
+
+/**
+ * Wait for the server's half of the banner exchange.
+ *
+ * Enters CONNECTING_WAIT_BANNER_AND_IDENTIFY and reads CEPH_BANNER followed
+ * by two ceph_entity_addr structures in one go. Continues in
+ * handle_server_banner_and_identify().
+ */
+CtPtr ProtocolV1::wait_server_banner() {
+  state = CONNECTING_WAIT_BANNER_AND_IDENTIFY;
+
+  ldout(cct, 20) << __func__ << dendl;
+
+  unsigned banner_len = strlen(CEPH_BANNER);
+  unsigned need_len = banner_len + sizeof(ceph_entity_addr) * 2;
+  return READ(need_len, handle_server_banner_and_identify);
+}
+
+/**
+ * Completion handler for the server banner read issued by
+ * wait_server_banner().
+ *
+ * Verifies the banner, decodes the two addresses that follow it (the
+ * peer's own address and the address the peer sees us as), checks the
+ * peer's claimed address against the one we connected to, learns our own
+ * address when the messenger does not have one yet, and finally writes our
+ * legacy address back to the peer.
+ *
+ * @param buffer  banner bytes followed by two encoded ceph_entity_addr
+ * @param r       read result; a negative value faults the connection
+ * @return continuation for the address write, the result of _fault(), or a
+ *         null pointer when the connection state changed while
+ *         connection->lock was dropped
+ */
+CtPtr ProtocolV1::handle_server_banner_and_identify(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read banner and identify addresses failed"
+                  << dendl;
+    return _fault();
+  }
+
+  unsigned banner_len = strlen(CEPH_BANNER);
+  if (memcmp(buffer, CEPH_BANNER, banner_len)) {
+    ldout(cct, 0) << __func__ << " connect protocol error (bad banner) on peer "
+                  << connection->get_peer_addr() << dendl;
+    return _fault();
+  }
+
+  bufferlist bl;
+  entity_addr_t paddr, peer_addr_for_me;
+
+  bl.append(buffer + banner_len, sizeof(ceph_entity_addr) * 2);
+  auto p = bl.cbegin();
+  try {
+    decode(paddr, p);
+    decode(peer_addr_for_me, p);
+  } catch (const buffer::error &e) {
+    lderr(cct) << __func__ << " decode peer addr failed " << dendl;
+    return _fault();
+  }
+  ldout(cct, 20) << __func__ << " connect read peer addr " << paddr
+                 << " on socket " << connection->cs.fd() << dendl;
+
+  // The peer must identify as the address we dialed. A blank IP with a
+  // matching port and nonce is tolerated (and only logged).
+  entity_addr_t peer_addr = connection->peer_addrs->legacy_addr();
+  if (peer_addr != paddr) {
+    if (paddr.is_blank_ip() && peer_addr.get_port() == paddr.get_port() &&
+        peer_addr.get_nonce() == paddr.get_nonce()) {
+      ldout(cct, 0) << __func__ << " connect claims to be " << paddr << " not "
+                    << peer_addr << " - presumably this is the same node!"
+                    << dendl;
+    } else {
+      ldout(cct, 10) << __func__ << " connect claims to be " << paddr << " not "
+                     << peer_addr << dendl;
+      return _fault();
+    }
+  }
+
+  ldout(cct, 20) << __func__ << " connect peer addr for me is "
+                 << peer_addr_for_me << dendl;
+  if (messenger->get_myaddrs().empty() ||
+      messenger->get_myaddrs().front().is_blank_ip()) {
+    // Zero-initialize so that a failing getsockname() leaves a well-defined
+    // (empty) address instead of indeterminate stack contents.
+    sockaddr_storage ss = {};
+    socklen_t len = sizeof(ss);
+    if (getsockname(connection->cs.fd(), (sockaddr *)&ss, &len) < 0) {
+      ldout(cct, 1) << __func__ << " getsockname failed on socket "
+                    << connection->cs.fd() << dendl;
+    }
+    entity_addr_t a;
+    if (cct->_conf->ms_learn_addr_from_peer) {
+      ldout(cct, 1) << __func__ << " peer " << connection->target_addr
+                    << " says I am " << peer_addr_for_me << " (socket says "
+                    << (sockaddr*)&ss << ")" << dendl;
+      a = peer_addr_for_me;
+    } else {
+      ldout(cct, 1) << __func__ << " socket to  " << connection->target_addr
+                    << " says I am " << (sockaddr*)&ss
+                    << " (peer says " << peer_addr_for_me << ")" << dendl;
+      a.set_sockaddr((sockaddr *)&ss);
+    }
+    a.set_type(entity_addr_t::TYPE_LEGACY); // anything but NONE; learned_addr ignores this
+    a.set_port(0);
+    // connection->lock is dropped around learned_addr() (and the optional
+    // injected sleep) and re-acquired afterwards.
+    connection->lock.unlock();
+    messenger->learned_addr(a);
+    if (cct->_conf->ms_inject_internal_delays &&
+        cct->_conf->ms_inject_socket_failures) {
+      if (rand() % cct->_conf->ms_inject_socket_failures == 0) {
+        ldout(cct, 10) << __func__ << " sleep for "
+                       << cct->_conf->ms_inject_internal_delays << dendl;
+        utime_t t;
+        t.set_from_double(cct->_conf->ms_inject_internal_delays);
+        t.sleep();
+      }
+    }
+    connection->lock.lock();
+    // The state may have changed while the lock was not held; if so, stop
+    // driving this handshake.
+    if (state != CONNECTING_WAIT_BANNER_AND_IDENTIFY) {
+      ldout(cct, 1) << __func__
+                    << " state changed while learned_addr, mark_down or "
+                    << " replacing must be happened just now" << dendl;
+      return nullptr;
+    }
+  }
+
+  bufferlist myaddrbl;
+  encode(messenger->get_myaddr_legacy(), myaddrbl, 0);  // legacy
+  return WRITE(myaddrbl, handle_my_addr_write);
+}
+
+/**
+ * Completion handler for the write of our own address.
+ *
+ * @param r  write result; negative faults the connection, otherwise the
+ *           handshake continues with send_connect_message
+ */
+CtPtr ProtocolV1::handle_my_addr_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r >= 0) {
+    ldout(cct, 10) << __func__ << " connect sent my addr "
+                   << messenger->get_myaddr_legacy() << dendl;
+    return CONTINUE(send_connect_message);
+  }
+
+  ldout(cct, 2) << __func__ << " connect couldn't write my addr, "
+                << cpp_strerror(r) << dendl;
+  return _fault();
+}
+
+/**
+ * Build and send the ceph_msg_connect request, followed by the authorizer
+ * payload when there is one.
+ *
+ * Enters CONNECTING_SEND_CONNECT_MSG and fetches an authorizer from the
+ * messenger if none is held yet. Continues in
+ * handle_connect_message_write().
+ */
+CtPtr ProtocolV1::send_connect_message() {
+  state = CONNECTING_SEND_CONNECT_MSG;
+
+  ldout(cct, 20) << __func__ << dendl;
+
+  if (!authorizer) {
+    authorizer = messenger->ms_deliver_get_authorizer(connection->peer_type);
+  }
+
+  ceph_msg_connect connect;
+  connect.features = connection->policy.features_supported;
+  connect.host_type = messenger->get_myname().type();
+  connect.global_seq = global_seq;
+  connect.connect_seq = connect_seq;
+  connect.protocol_version =
+      messenger->get_proto_version(connection->peer_type, true);
+
+  if (authorizer) {
+    connect.authorizer_protocol = authorizer->protocol;
+    connect.authorizer_len = authorizer->bl.length();
+    ldout(cct, 10) << __func__
+                   << " connect_msg.authorizer_len=" << connect.authorizer_len
+                   << " protocol=" << connect.authorizer_protocol << dendl;
+  } else {
+    connect.authorizer_protocol = 0;
+    connect.authorizer_len = 0;
+  }
+
+  // this is fyi, actually, server decides!
+  connect.flags = connection->policy.lossy ? CEPH_MSG_CONNECT_LOSSY : 0;
+
+  // wire format: fixed-size connect struct, then the raw authorizer bytes
+  bufferlist request;
+  request.append((char *)&connect, sizeof(connect));
+  if (authorizer) {
+    request.append(authorizer->bl.c_str(), authorizer->bl.length());
+  }
+
+  ldout(cct, 10) << __func__ << " connect sending gseq=" << global_seq
+                 << " cseq=" << connect_seq
+                 << " proto=" << connect.protocol_version << dendl;
+
+  return WRITE(request, handle_connect_message_write);
+}
+
+/**
+ * Completion handler for the connect message write.
+ *
+ * @param r  write result; negative faults the connection, otherwise we go
+ *           on to wait_connect_reply()
+ */
+CtPtr ProtocolV1::handle_connect_message_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r >= 0) {
+    ldout(cct, 20) << __func__
+                   << " connect wrote (self +) cseq, waiting for reply"
+                   << dendl;
+    return wait_connect_reply();
+  }
+
+  ldout(cct, 2) << __func__ << " connect couldn't send reply "
+                << cpp_strerror(r) << dendl;
+  return _fault();
+}
+
+/**
+ * Clear connect_reply and read a fresh ceph_msg_connect_reply from the
+ * peer. Continues in handle_connect_reply_1().
+ */
+CtPtr ProtocolV1::wait_connect_reply() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&connect_reply, 0, sizeof(connect_reply));
+
+  const unsigned reply_len = sizeof(connect_reply);
+  return READ(reply_len, handle_connect_reply_1);
+}
+
+/**
+ * Completion handler for the fixed-size part of the connect reply.
+ *
+ * Copies the reply into connect_reply. When the reply announces an
+ * authorizer payload, that is read next (wait_connect_reply_auth());
+ * otherwise the reply is evaluated right away (handle_connect_reply_2()).
+ *
+ * @param buffer  raw ceph_msg_connect_reply bytes
+ * @param r       read result; a negative value faults the connection
+ */
+CtPtr ProtocolV1::handle_connect_reply_1(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read connect reply failed" << dendl;
+    return _fault();
+  }
+
+  connect_reply = *reinterpret_cast<ceph_msg_connect_reply *>(buffer);
+
+  ldout(cct, 20) << __func__ << " connect got reply tag "
+                 << (int)connect_reply.tag << " connect_seq "
+                 << connect_reply.connect_seq << " global_seq "
+                 << connect_reply.global_seq << " proto "
+                 << connect_reply.protocol_version << " flags "
+                 << (int)connect_reply.flags << " features "
+                 << connect_reply.features << dendl;
+
+  return connect_reply.authorizer_len ? wait_connect_reply_auth()
+                                      : handle_connect_reply_2();
+}
+
+/**
+ * Read the authorizer payload announced by the connect reply.
+ *
+ * connect_reply.authorizer_len comes straight from the peer, so an
+ * out-of-range value faults this connection rather than aborting the whole
+ * process. Continues in handle_connect_reply_auth().
+ */
+CtPtr ProtocolV1::wait_connect_reply_auth() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  ldout(cct, 10) << __func__
+                 << " reply.authorizer_len=" << connect_reply.authorizer_len
+                 << dendl;
+
+  // Same 4096-byte bound the code has always required, enforced as a
+  // connection error instead of an assertion on peer-supplied data.
+  if (!(connect_reply.authorizer_len < 4096)) {
+    ldout(cct, 0) << __func__ << " authorizer reply too large ("
+                  << connect_reply.authorizer_len << " bytes)" << dendl;
+    return _fault();
+  }
+
+  return READ(connect_reply.authorizer_len, handle_connect_reply_auth);
+}
+
+/**
+ * Completion handler for the authorizer payload of the connect reply.
+ *
+ * For a CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER reply the challenge is added to
+ * our authorizer and the connect message is sent again. For any other tag
+ * the payload is verified against our authorizer (when we have one) before
+ * the reply is evaluated in handle_connect_reply_2().
+ *
+ * @param buffer  connect_reply.authorizer_len bytes of authorizer reply
+ * @param r       read result; a negative value faults the connection
+ */
+CtPtr ProtocolV1::handle_connect_reply_auth(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read connect reply authorizer failed"
+                  << dendl;
+    return _fault();
+  }
+
+  bufferlist authorizer_reply;
+  authorizer_reply.append(buffer, connect_reply.authorizer_len);
+
+  if (connect_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
+    ldout(cct, 10) << __func__ << " connect got auth challenge" << dendl;
+    // The peer chooses the tag; without an authorizer of our own there is
+    // nothing to add the challenge to, so fail instead of dereferencing
+    // a null pointer.
+    if (!authorizer) {
+      ldout(cct, 0) << __func__
+                    << " got auth challenge but have no authorizer" << dendl;
+      return _fault();
+    }
+    authorizer->add_challenge(cct, authorizer_reply);
+    return CONTINUE(send_connect_message);
+  }
+
+  auto iter = authorizer_reply.cbegin();
+  if (authorizer && !authorizer->verify_reply(iter,
+                                              nullptr /* connection_secret */)) {
+    ldout(cct, 0) << __func__ << " failed verifying authorize reply" << dendl;
+    return _fault();
+  }
+
+  return handle_connect_reply_2();
+}
+
+/**
+ * Evaluate the tag of the connect reply stored in connect_reply.
+ *
+ * Error tags fault the connection. RESETSESSION, RETRY_GLOBAL and
+ * RETRY_SESSION adjust local state and resend the connect message. Any
+ * other tag is first checked against our required features; after that
+ * CEPH_MSGR_TAG_SEQ goes through the sequence exchange (wait_ack_seq()),
+ * and everything else finishes in client_ready().
+ */
+CtPtr ProtocolV1::handle_connect_reply_2() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  switch (connect_reply.tag) {
+    case CEPH_MSGR_TAG_FEATURES:
+      ldout(cct, 0) << __func__ << " connect protocol feature mismatch, my "
+                    << std::hex << connection->policy.features_supported
+                    << " < peer " << connect_reply.features << " missing "
+                    << (connect_reply.features &
+                        ~connection->policy.features_supported)
+                    << std::dec << dendl;
+      return _fault();
+
+    case CEPH_MSGR_TAG_BADPROTOVER:
+      ldout(cct, 0) << __func__ << " connect protocol version mismatch, my "
+                    << messenger->get_proto_version(connection->peer_type, true)
+                    << " != " << connect_reply.protocol_version << dendl;
+      return _fault();
+
+    case CEPH_MSGR_TAG_BADAUTHORIZER:
+      ldout(cct, 0) << __func__ << " connect got BADAUTHORIZER" << dendl;
+      return _fault();
+
+    case CEPH_MSGR_TAG_RESETSESSION:
+      ldout(cct, 0) << __func__ << " connect got RESETSESSION" << dendl;
+      session_reset();
+      connect_seq = 0;
+
+      // see session_reset
+      connection->outgoing_bl.clear();
+
+      return CONTINUE(send_connect_message);
+
+    case CEPH_MSGR_TAG_RETRY_GLOBAL:
+      global_seq = messenger->get_global_seq(connect_reply.global_seq);
+      ldout(cct, 5) << __func__ << " connect got RETRY_GLOBAL "
+                    << connect_reply.global_seq << " chose new " << global_seq
+                    << dendl;
+      return CONTINUE(send_connect_message);
+
+    case CEPH_MSGR_TAG_RETRY_SESSION:
+      ceph_assert(connect_reply.connect_seq > connect_seq);
+      ldout(cct, 5) << __func__ << " connect got RETRY_SESSION " << connect_seq
+                    << " -> " << connect_reply.connect_seq << dendl;
+      connect_seq = connect_reply.connect_seq;
+      return CONTINUE(send_connect_message);
+
+    case CEPH_MSGR_TAG_WAIT:
+      ldout(cct, 1) << __func__ << " connect got WAIT (connection race)"
+                    << dendl;
+      state = WAIT;
+      return _fault();
+
+    default:
+      break;
+  }
+
+  // The peer must offer every feature our policy requires.
+  const uint64_t feat_missing =
+      connection->policy.features_required & ~(uint64_t)connect_reply.features;
+  if (feat_missing) {
+    ldout(cct, 1) << __func__ << " missing required features " << std::hex
+                  << feat_missing << std::dec << dendl;
+    return _fault();
+  }
+
+  if (connect_reply.tag == CEPH_MSGR_TAG_SEQ) {
+    ldout(cct, 10)
+        << __func__
+        << " got CEPH_MSGR_TAG_SEQ, reading acked_seq and writing in_seq"
+        << dendl;
+
+    return wait_ack_seq();
+  }
+
+  if (connect_reply.tag == CEPH_MSGR_TAG_READY) {
+    ldout(cct, 10) << __func__ << " got CEPH_MSGR_TAG_READY " << dendl;
+  }
+
+  return client_ready();
+}
+
+/**
+ * Read the peer's 64-bit acknowledged sequence number. Continues in
+ * handle_ack_seq().
+ */
+CtPtr ProtocolV1::wait_ack_seq() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const unsigned seq_len = sizeof(uint64_t);
+  return READ(seq_len, handle_ack_seq);
+}
+
+/**
+ * Completion handler for the acknowledged-sequence read.
+ *
+ * Discards requeued messages up to the sequence number the peer reports
+ * (updating out_seq with the result), then sends our own in_seq back.
+ * Continues in handle_in_seq_write().
+ *
+ * @param buffer  sizeof(uint64_t) bytes holding the peer's acked sequence,
+ *                taken as-is (no byte-order conversion is applied)
+ * @param r       read result; a negative value faults the connection
+ */
+CtPtr ProtocolV1::handle_ack_seq(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read connect ack seq failed" << dendl;
+    return _fault();
+  }
+
+  // Copy the bytes out rather than dereferencing a cast pointer: `buffer`
+  // carries no alignment guarantee for uint64_t.
+  uint64_t newly_acked_seq = 0;
+  memcpy(&newly_acked_seq, buffer, sizeof(newly_acked_seq));
+  ldout(cct, 2) << __func__ << " got newly_acked_seq " << newly_acked_seq
+                << " vs out_seq " << out_seq << dendl;
+  out_seq = discard_requeued_up_to(out_seq, newly_acked_seq);
+
+  bufferlist bl;
+  uint64_t s = in_seq;
+  bl.append((char *)&s, sizeof(s));
+
+  return WRITE(bl, handle_in_seq_write);
+}
+
+/**
+ * Completion handler for the in_seq write.
+ *
+ * @param r  write result; negative faults the connection, otherwise the
+ *           handshake finishes in client_ready()
+ */
+CtPtr ProtocolV1::handle_in_seq_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r >= 0) {
+    ldout(cct, 10) << __func__ << " send in_seq done " << dendl;
+    return client_ready();
+  }
+
+  ldout(cct, 10) << __func__ << " failed to send in_seq " << dendl;
+  return _fault();
+}
+
+/**
+ * Final step of the client handshake.
+ *
+ * Records what the connect reply negotiated (peer global seq, lossy flag,
+ * common feature set), bumps connect_seq, clears the backoff, installs or
+ * clears session security depending on whether an authorizer exists,
+ * notifies the dispatch queue and the messenger of the new connection and
+ * then calls ready().
+ */
+CtPtr ProtocolV1::client_ready() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  // hooray!
+  peer_global_seq = connect_reply.global_seq;
+  connection->policy.lossy = connect_reply.flags & CEPH_MSG_CONNECT_LOSSY;
+
+  once_ready = true;
+  ++connect_seq;
+  ceph_assert(connect_seq == connect_reply.connect_seq);
+  backoff = utime_t();
+
+  // only features both sides support are enabled on the connection
+  connection->set_features((uint64_t)connect_reply.features &
+                           (uint64_t)connection->policy.features_supported);
+  ldout(cct, 10) << __func__ << " connect success " << connect_seq
+                 << ", lossy = " << connection->policy.lossy << ", features "
+                 << connection->get_features() << dendl;
+
+  if (authorizer == nullptr) {
+    // We have no authorizer, so we shouldn't be applying security to messages
+    // in this AsyncConnection.  PLR
+    ldout(cct, 10) << __func__ << " no authorizer, clearing session_security"
+                   << dendl;
+    session_security.reset();
+  } else {
+    // If we have an authorizer, get a new AuthSessionHandler to deal with
+    // ongoing security of the connection.  PLR
+    ldout(cct, 10) << __func__ << " setting up session_security with auth "
+                   << authorizer << dendl;
+    session_security.reset(get_auth_session_handler(
+        cct, authorizer->protocol, authorizer->session_key,
+        connection->get_features()));
+  }
+
+  if (connection->delay_state) {
+    ceph_assert(connection->delay_state->ready());
+  }
+  connection->dispatch_queue->queue_connect(connection);
+  messenger->ms_deliver_handle_fast_connect(connection);
+
+  return ready();
+}
+
+/**
+ * Server Protocol V1
+ **/
+
+/**
+ * First step of the server handshake.
+ *
+ * Enters ACCEPTING and writes CEPH_BANNER, our legacy address and the
+ * address we see the peer as (connection->target_addr). Also records our
+ * legacy port in connection->port. Continues in
+ * handle_server_banner_write().
+ */
+CtPtr ProtocolV1::send_server_banner() {
+  ldout(cct, 20) << __func__ << dendl;
+  state = ACCEPTING;
+
+  bufferlist greeting;
+  greeting.append(CEPH_BANNER, strlen(CEPH_BANNER));
+
+  // as a server, we should have a legacy addr if we accepted this connection.
+  auto my_legacy = messenger->get_myaddrs().legacy_addr();
+  encode(my_legacy, greeting, 0);  // legacy
+  connection->port = my_legacy.get_port();
+  encode(connection->target_addr, greeting, 0);  // legacy
+
+  ldout(cct, 1) << __func__ << " sd=" << connection->cs.fd()
+                << " legacy " << my_legacy
+                << " socket_addr " << connection->socket_addr
+                << " target_addr " << connection->target_addr
+                << dendl;
+
+  return WRITE(greeting, handle_server_banner_write);
+}
+
+/**
+ * Completion handler for the server banner write.
+ *
+ * @param r  write result; negative faults the connection, otherwise we go
+ *           on to wait_client_banner()
+ */
+CtPtr ProtocolV1::handle_server_banner_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r >= 0) {
+    ldout(cct, 10) << __func__ << " write banner and addr done: "
+                   << connection->get_peer_addr() << dendl;
+    return wait_client_banner();
+  }
+
+  ldout(cct, 1) << " write server banner failed" << dendl;
+  return _fault();
+}
+
+/**
+ * Read the client's banner followed by one ceph_entity_addr. Continues in
+ * handle_client_banner().
+ */
+CtPtr ProtocolV1::wait_client_banner() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const unsigned need_len = strlen(CEPH_BANNER) + sizeof(ceph_entity_addr);
+  return READ(need_len, handle_client_banner);
+}
+
+/**
+ * Accept side: check the client's banner and decode the address it sent.
+ *
+ * @param buffer  bytes read by wait_client_banner(): strlen(CEPH_BANNER)
+ *                banner bytes followed by sizeof(ceph_entity_addr) bytes
+ * @param r       read result; negative means the read failed
+ *
+ * Faults the connection on read failure, banner mismatch or an undecodable
+ * address. If the decoded address has a blank IP, the IP is taken from
+ * connection->target_addr while the peer-supplied port is kept. On success
+ * the address becomes both the connection's peer addr and its target_addr,
+ * and we continue with wait_connect_message.
+ */
+CtPtr ProtocolV1::handle_client_banner(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read peer banner and addr failed" << dendl;
+    return _fault();
+  }
+
+  const size_t banner_len = strlen(CEPH_BANNER);
+  if (memcmp(buffer, CEPH_BANNER, banner_len)) {
+    // buffer holds raw bytes from the peer and nothing here guarantees a NUL
+    // terminator, so never stream it as a C string directly: copy at most
+    // banner_len bytes into a (terminated) std::string first.
+    ldout(cct, 1) << __func__ << " accept peer sent bad banner '"
+                  << std::string(buffer, banner_len).c_str()
+                  << "' (should be '" << CEPH_BANNER << "')" << dendl;
+    return _fault();
+  }
+
+  bufferlist addr_bl;
+  entity_addr_t peer_addr;
+
+  // the address follows the banner directly in the same read buffer
+  addr_bl.append(buffer + banner_len, sizeof(ceph_entity_addr));
+  try {
+    auto ti = addr_bl.cbegin();
+    decode(peer_addr, ti);
+  } catch (const buffer::error &e) {
+    lderr(cct) << __func__ << " decode peer_addr failed " << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 10) << __func__ << " accept peer addr is " << peer_addr << dendl;
+  if (peer_addr.is_blank_ip()) {
+    // peer apparently doesn't know what ip they have; figure it out for them.
+    int port = peer_addr.get_port();
+    peer_addr.set_sockaddr(connection->target_addr.get_sockaddr());
+    peer_addr.set_port(port);
+
+    ldout(cct, 0) << __func__ << " accept peer addr is really " << peer_addr
+                  << " (socket is " << connection->target_addr << ")" << dendl;
+  }
+  connection->set_peer_addr(peer_addr);  // so that connection_state gets set up
+  connection->target_addr = peer_addr;
+
+  return CONTINUE(wait_connect_message);
+}
+
+// Clear any previous connect message and start reading a fresh one from the
+// peer; handle_connect_message_1 receives the result.
+CtPtr ProtocolV1::wait_connect_message() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const size_t msg_len = sizeof(connect_msg);
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&connect_msg, 0, msg_len);
+  return READ(msg_len, handle_connect_message_1);
+}
+
+// Store the connect message that was just read and decide whether an
+// authorizer payload has to be read before the message can be processed.
+// A negative r (failed read) faults the connection.
+CtPtr ProtocolV1::handle_connect_message_1(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read connect msg failed" << dendl;
+    return _fault();
+  }
+
+  connect_msg = *reinterpret_cast<ceph_msg_connect *>(buffer);
+  state = ACCEPTING_WAIT_CONNECT_MSG_AUTH;
+
+  // a non-zero authorizer_len means authorizer bytes follow on the wire
+  return connect_msg.authorizer_len ? wait_connect_message_auth()
+                                    : handle_connect_message_2();
+}
+
+// Replace authorizer_buf with a buffer of authorizer_len bytes (as announced
+// in the connect message) and start reading the authorizer into it.
+CtPtr ProtocolV1::wait_connect_message_auth() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const unsigned auth_len = connect_msg.authorizer_len;
+  authorizer_buf.clear();
+  authorizer_buf.push_back(buffer::create(auth_len));
+  return READB(auth_len, authorizer_buf.c_str(), handle_connect_message_auth);
+}
+
+// Completion handler for the authorizer read started by
+// wait_connect_message_auth(): on success continue processing the connect
+// message, otherwise fault the connection.
+CtPtr ProtocolV1::handle_connect_message_auth(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r >= 0) {
+    return handle_connect_message_2();
+  }
+
+  ldout(cct, 1) << __func__ << " read connect authorizer failed" << dendl;
+  return _fault();
+}
+
+/**
+ * Accept side: act on a complete connect message (and its authorizer, if
+ * one was read into authorizer_buf).
+ *
+ * In order, this:
+ *  - applies the messenger policy for the peer's host_type,
+ *  - rejects a protocol version mismatch (BADPROTOVER),
+ *  - for cephx peers, raises policy.features_required according to the
+ *    cephx_*require* config options, then rejects peers that lack a
+ *    required feature (FEATURES),
+ *  - drops connection->lock while ms_deliver_verify_authorizer() runs and
+ *    retakes it afterwards; on failure replies CHALLENGE_AUTHORIZER or
+ *    BADAUTHORIZER,
+ *  - looks up an existing connection for the same peer addrs and, based on
+ *    its state, global_seq and connect_seq, replies RETRY_GLOBAL,
+ *    RETRY_SESSION, WAIT or RESETSESSION, hands over to replace(), or
+ *    open()s a new session.
+ *
+ * Every path that locked existing->lock either unlocks it before returning
+ * or passes the still-locked connection to replace().
+ *
+ * NOTE(review): the unconditional connection->lock.unlock() below implies
+ * the caller holds connection->lock on entry -- confirm against callers.
+ */
+CtPtr ProtocolV1::handle_connect_message_2() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  ldout(cct, 20) << __func__ << " accept got peer connect_seq "
+                 << connect_msg.connect_seq << " global_seq "
+                 << connect_msg.global_seq << dendl;
+
+  connection->set_peer_type(connect_msg.host_type);
+  connection->policy = messenger->get_policy(connect_msg.host_type);
+
+  ldout(cct, 10) << __func__ << " accept of host_type " << connect_msg.host_type
+                 << ", policy.lossy=" << connection->policy.lossy
+                 << " policy.server=" << connection->policy.server
+                 << " policy.standby=" << connection->policy.standby
+                 << " policy.resetcheck=" << connection->policy.resetcheck
+		 << " features 0x" << std::hex << (uint64_t)connect_msg.features
+		 << std::dec
+                 << dendl;
+
+  ceph_msg_connect_reply reply;
+  bufferlist authorizer_reply;
+
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&reply, 0, sizeof(reply));
+  reply.protocol_version =
+      messenger->get_proto_version(connection->peer_type, false);
+
+  // mismatch?
+  ldout(cct, 10) << __func__ << " accept my proto " << reply.protocol_version
+                 << ", their proto " << connect_msg.protocol_version << dendl;
+
+  if (connect_msg.protocol_version != reply.protocol_version) {
+    return send_connect_message_reply(CEPH_MSGR_TAG_BADPROTOVER, reply,
+                                      authorizer_reply);
+  }
+
+  // require signatures for cephx?
+  // OSD/MDS/MGR peers are checked against the "cluster" options, every other
+  // peer type against the "service" options.
+  if (connect_msg.authorizer_protocol == CEPH_AUTH_CEPHX) {
+    if (connection->peer_type == CEPH_ENTITY_TYPE_OSD ||
+        connection->peer_type == CEPH_ENTITY_TYPE_MDS ||
+        connection->peer_type == CEPH_ENTITY_TYPE_MGR) {
+      if (cct->_conf->cephx_require_signatures ||
+          cct->_conf->cephx_cluster_require_signatures) {
+        ldout(cct, 10)
+            << __func__
+            << " using cephx, requiring MSG_AUTH feature bit for cluster"
+            << dendl;
+        connection->policy.features_required |= CEPH_FEATURE_MSG_AUTH;
+      }
+      if (cct->_conf->cephx_require_version >= 2 ||
+          cct->_conf->cephx_cluster_require_version >= 2) {
+        ldout(cct, 10)
+            << __func__
+            << " using cephx, requiring cephx v2 feature bit for cluster"
+            << dendl;
+        connection->policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2;
+      }
+    } else {
+      if (cct->_conf->cephx_require_signatures ||
+          cct->_conf->cephx_service_require_signatures) {
+        ldout(cct, 10)
+            << __func__
+            << " using cephx, requiring MSG_AUTH feature bit for service"
+            << dendl;
+        connection->policy.features_required |= CEPH_FEATURE_MSG_AUTH;
+      }
+      if (cct->_conf->cephx_require_version >= 2 ||
+          cct->_conf->cephx_service_require_version >= 2) {
+        ldout(cct, 10)
+            << __func__
+            << " using cephx, requiring cephx v2 feature bit for service"
+            << dendl;
+        connection->policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2;
+      }
+    }
+  }
+
+  // required feature bits that the peer did not advertise
+  uint64_t feat_missing =
+      connection->policy.features_required & ~(uint64_t)connect_msg.features;
+  if (feat_missing) {
+    ldout(cct, 1) << __func__ << " peer missing required features " << std::hex
+                  << feat_missing << std::dec << dendl;
+    return send_connect_message_reply(CEPH_MSGR_TAG_FEATURES, reply,
+                                      authorizer_reply);
+  }
+
+  // Verify the authorizer on a copy, with connection->lock released for the
+  // duration of the call; the lock is retaken on both outcomes below and the
+  // state is re-checked because it may have changed in the meantime.
+  bufferlist auth_bl_copy = authorizer_buf;
+  connection->lock.unlock();
+  ldout(cct,10) << __func__ << " authorizor_protocol "
+		<< connect_msg.authorizer_protocol
+		<< " len " << auth_bl_copy.length()
+		<< dendl;
+  bool authorizer_valid;
+  bool need_challenge = HAVE_FEATURE(connect_msg.features, CEPHX_V2);
+  // remember whether a challenge already existed before this verification
+  bool had_challenge = (bool)authorizer_challenge;
+  if (!messenger->ms_deliver_verify_authorizer(
+          connection, connection->peer_type, connect_msg.authorizer_protocol,
+          auth_bl_copy, authorizer_reply, authorizer_valid, session_key,
+	  nullptr /* connection_secret */,
+          need_challenge ? &authorizer_challenge : nullptr) ||
+      !authorizer_valid) {
+    connection->lock.lock();
+    if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) {
+      ldout(cct, 1) << __func__
+		    << " state changed while accept, it must be mark_down"
+		    << dendl;
+      ceph_assert(state == CLOSED);
+      return _fault();
+    }
+
+    // A challenge that was created by this very verification attempt is sent
+    // back to the peer; any other failure is a bad authorizer.
+    if (need_challenge && !had_challenge && authorizer_challenge) {
+      ldout(cct, 10) << __func__ << ": challenging authorizer" << dendl;
+      ceph_assert(authorizer_reply.length());
+      return send_connect_message_reply(CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER,
+                                        reply, authorizer_reply);
+    } else {
+      ldout(cct, 0) << __func__ << ": got bad authorizer, auth_reply_len="
+                    << authorizer_reply.length() << dendl;
+      session_security.reset();
+      return send_connect_message_reply(CEPH_MSGR_TAG_BADAUTHORIZER, reply,
+                                        authorizer_reply);
+    }
+  }
+
+  // We've verified the authorizer for this AsyncConnection, so set up the
+  // session security structure.  PLR
+  ldout(cct, 10) << __func__ << " accept setting up session_security." << dendl;
+
+  // existing?
+  // (looked up while connection->lock is still released)
+  AsyncConnectionRef existing = messenger->lookup_conn(*connection->peer_addrs);
+
+  connection->inject_delay();
+
+  connection->lock.lock();
+  if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) {
+    ldout(cct, 1) << __func__
+                  << " state changed while accept, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED);
+    return _fault();
+  }
+
+  // finding ourselves is the same as finding nothing
+  if (existing == connection) {
+    existing = nullptr;
+  }
+  // an existing connection that is not speaking protocol v1 is marked down
+  // and then ignored
+  if (existing && existing->protocol->proto_type != 1) {
+    ldout(cct,1) << __func__ << " existing " << existing << " proto "
+		 << existing->protocol.get() << " version is "
+		 << existing->protocol->proto_type << ", marking down" << dendl;
+    existing->mark_down();
+    existing = nullptr;
+  }
+
+  if (existing) {
+    // It is not possible for the existing connection to acquire this
+    // connection's lock
+    existing->lock.lock();  // skip lockdep check (we are locking a second
+                            // AsyncConnection here)
+
+    ldout(cct,10) << __func__ << " existing=" << existing << " exproto="
+		  << existing->protocol.get() << dendl;
+    ProtocolV1 *exproto = dynamic_cast<ProtocolV1 *>(existing->protocol.get());
+    ceph_assert(exproto);
+    ceph_assert(exproto->proto_type == 1);
+
+    // existing is already closed: nothing to resolve, open a new session
+    if (exproto->state == CLOSED) {
+      ldout(cct, 1) << __func__ << " existing " << existing
+		    << " already closed." << dendl;
+      existing->lock.unlock();
+      existing = nullptr;
+
+      return open(reply, authorizer_reply);
+    }
+
+    // existing is itself in the middle of a replace: ask the peer to retry
+    // with existing's peer_global_seq
+    if (exproto->replacing) {
+      ldout(cct, 1) << __func__
+                    << " existing racing replace happened while replacing."
+                    << " existing_state="
+                    << connection->get_state_name(existing->state) << dendl;
+      reply.global_seq = exproto->peer_global_seq;
+      existing->lock.unlock();
+      return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_GLOBAL, reply,
+                                        authorizer_reply);
+    }
+
+    // peer's global_seq is older than the one existing has recorded
+    if (connect_msg.global_seq < exproto->peer_global_seq) {
+      ldout(cct, 10) << __func__ << " accept existing " << existing << ".gseq "
+                     << exproto->peer_global_seq << " > "
+                     << connect_msg.global_seq << ", RETRY_GLOBAL" << dendl;
+      reply.global_seq = exproto->peer_global_seq;  // so we can send it below..
+      existing->lock.unlock();
+      return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_GLOBAL, reply,
+                                        authorizer_reply);
+    } else {
+      ldout(cct, 10) << __func__ << " accept existing " << existing << ".gseq "
+                     << exproto->peer_global_seq
+                     << " <= " << connect_msg.global_seq << ", looks ok"
+                     << dendl;
+    }
+
+    // a lossy existing connection is always reset and replaced
+    if (existing->policy.lossy) {
+      ldout(cct, 0)
+          << __func__
+          << " accept replacing existing (lossy) channel (new one lossy="
+          << connection->policy.lossy << ")" << dendl;
+      exproto->session_reset();
+      return replace(existing, reply, authorizer_reply);
+    }
+
+    ldout(cct, 1) << __func__ << " accept connect_seq "
+                  << connect_msg.connect_seq
+                  << " vs existing csq=" << exproto->connect_seq
+                  << " existing_state="
+                  << connection->get_state_name(existing->state) << dendl;
+
+    if (connect_msg.connect_seq == 0 && exproto->connect_seq > 0) {
+      ldout(cct, 0)
+          << __func__
+          << " accept peer reset, then tried to connect to us, replacing"
+          << dendl;
+      // this is a hard reset from peer
+      is_reset_from_peer = true;
+      if (connection->policy.resetcheck) {
+        exproto->session_reset();  // this resets out_queue, msg_ and
+                                   // connect_seq #'s
+      }
+      return replace(existing, reply, authorizer_reply);
+    }
+
+    if (connect_msg.connect_seq < exproto->connect_seq) {
+      // old attempt, or we sent READY but they didn't get it.
+      ldout(cct, 10) << __func__ << " accept existing " << existing << ".cseq "
+                     << exproto->connect_seq << " > " << connect_msg.connect_seq
+                     << ", RETRY_SESSION" << dendl;
+      reply.connect_seq = exproto->connect_seq + 1;
+      existing->lock.unlock();
+      return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_SESSION, reply,
+                                        authorizer_reply);
+    }
+
+    if (connect_msg.connect_seq == exproto->connect_seq) {
+      // if the existing connection successfully opened, and/or
+      // subsequently went to standby, then the peer should bump
+      // their connect_seq and retry: this is not a connection race
+      // we need to resolve here.
+      if (exproto->state == OPENED || exproto->state == STANDBY) {
+        ldout(cct, 10) << __func__ << " accept connection race, existing "
+                       << existing << ".cseq " << exproto->connect_seq
+                       << " == " << connect_msg.connect_seq
+                       << ", OPEN|STANDBY, RETRY_SESSION " << dendl;
+        // if both connect_seqs are zero, don't get stuck in a deadlock; it's
+        // ok to replace
+        if (connection->policy.resetcheck && exproto->connect_seq == 0) {
+          return replace(existing, reply, authorizer_reply);
+        }
+
+        reply.connect_seq = exproto->connect_seq + 1;
+        existing->lock.unlock();
+        return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_SESSION, reply,
+                                          authorizer_reply);
+      }
+
+      // connection race?
+      // tie-break on the legacy addresses, unless existing's policy says we
+      // are the server
+      if (connection->peer_addrs->legacy_addr() < messenger->get_myaddr_legacy() ||
+          existing->policy.server) {
+        // incoming wins
+        ldout(cct, 10) << __func__ << " accept connection race, existing "
+                       << existing << ".cseq " << exproto->connect_seq
+                       << " == " << connect_msg.connect_seq
+                       << ", or we are server, replacing my attempt" << dendl;
+        return replace(existing, reply, authorizer_reply);
+      } else {
+        // our existing outgoing wins
+        ldout(messenger->cct, 10)
+            << __func__ << " accept connection race, existing " << existing
+            << ".cseq " << exproto->connect_seq
+            << " == " << connect_msg.connect_seq << ", sending WAIT" << dendl;
+        ceph_assert(connection->peer_addrs->legacy_addr() >
+                    messenger->get_myaddr_legacy());
+        existing->lock.unlock();
+	// make sure we follow through with opening the existing
+	// connection (if it isn't yet open) since we know the peer
+	// has something to send to us.
+	existing->send_keepalive();
+        return send_connect_message_reply(CEPH_MSGR_TAG_WAIT, reply,
+                                          authorizer_reply);
+      }
+    }
+
+    // all smaller-or-equal connect_seq cases returned above
+    ceph_assert(connect_msg.connect_seq > exproto->connect_seq);
+    ceph_assert(connect_msg.global_seq >= exproto->peer_global_seq);
+    if (connection->policy.resetcheck &&  // RESETSESSION only used by servers;
+                                          // peers do not reset each other
+        exproto->connect_seq == 0) {
+      ldout(cct, 0) << __func__ << " accept we reset (peer sent cseq "
+                    << connect_msg.connect_seq << ", " << existing
+                    << ".cseq = " << exproto->connect_seq
+                    << "), sending RESETSESSION " << dendl;
+      existing->lock.unlock();
+      return send_connect_message_reply(CEPH_MSGR_TAG_RESETSESSION, reply,
+                                        authorizer_reply);
+    }
+
+    // reconnect
+    ldout(cct, 10) << __func__ << " accept peer sent cseq "
+                   << connect_msg.connect_seq << " > " << exproto->connect_seq
+                   << dendl;
+    return replace(existing, reply, authorizer_reply);
+  }  // existing
+  else if (!replacing && connect_msg.connect_seq > 0) {
+    // we reset, and they are opening a new session
+    ldout(cct, 0) << __func__ << " accept we reset (peer sent cseq "
+                  << connect_msg.connect_seq << "), sending RESETSESSION"
+                  << dendl;
+    return send_connect_message_reply(CEPH_MSGR_TAG_RESETSESSION, reply,
+                                      authorizer_reply);
+  } else {
+    // new session
+    ldout(cct, 10) << __func__ << " accept new session" << dendl;
+    existing = nullptr;
+    return open(reply, authorizer_reply);
+  }
+}
+
+/**
+ * Fill in the common fields of a (non-READY) connect reply and queue it for
+ * writing, followed by the authorizer reply bytes if there are any.
+ *
+ * @param tag               reply tag to send
+ * @param reply             reply header; tag, features and authorizer_len
+ *                          are overwritten here
+ * @param authorizer_reply  appended after the header when non-empty, and
+ *                          cleared once it has been appended
+ */
+CtPtr ProtocolV1::send_connect_message_reply(char tag,
+                                             ceph_msg_connect_reply &reply,
+                                             bufferlist &authorizer_reply) {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const auto &policy = connection->policy;
+  reply.tag = tag;
+  reply.features =
+      ((uint64_t)connect_msg.features & policy.features_supported) |
+      policy.features_required;
+  reply.authorizer_len = authorizer_reply.length();
+
+  ldout(cct, 10) << __func__ << " reply features 0x" << std::hex
+                 << reply.features << " = (policy sup 0x"
+                 << policy.features_supported
+                 << " & connect 0x" << (uint64_t)connect_msg.features
+                 << ") | policy req 0x"
+                 << policy.features_required
+                 << dendl;
+
+  // wire layout: raw reply header, then the authorizer reply (if any)
+  bufferlist out_bl;
+  out_bl.append((char *)&reply, sizeof(reply));
+  if (reply.authorizer_len) {
+    out_bl.append(authorizer_reply.c_str(), authorizer_reply.length());
+    authorizer_reply.clear();
+  }
+
+  return WRITE(out_bl, handle_connect_message_reply_write);
+}
+
+// Completion handler for a reply queued by send_connect_message_reply().
+// A failed write faults the connection (after the optional injected delay);
+// otherwise we wait for the peer's next connect message.
+CtPtr ProtocolV1::handle_connect_message_reply_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    // prefix with __func__ like the other handlers so the line is greppable
+    ldout(cct, 1) << __func__ << " write connect message reply failed"
+                  << dendl;
+    connection->inject_delay();
+    return _fault();
+  }
+
+  return CONTINUE(wait_connect_message);
+}
+
+/**
+ * Accept side: resolve a clash between this newly accepted connection and
+ * `existing`, an older connection to the same peer.
+ *
+ * Lossy existing policy: existing's protocol is stopped and a reset is
+ * queued for it, its lock is released, and this connection goes on to
+ * open().
+ *
+ * Otherwise the roles are swapped: this connection's socket
+ * (connection->cs) is moved out and handed to `existing`, this connection
+ * is stopped with a reset queued for it, and the remaining handover steps
+ * are submitted to existing->center as deactivate_existing. Once adopted,
+ * existing answers the peer with RETRY_GLOBAL. nullptr is returned in this
+ * case, i.e. there is no continuation for this connection.
+ *
+ * existing->lock is unlocked on both paths before returning; the callers in
+ * handle_connect_message_2() invoke this with that lock held.
+ *
+ * @param existing          the older connection to the same peer
+ * @param reply             reply header under construction
+ * @param authorizer_reply  authorizer reply to send along with `reply`
+ */
+CtPtr ProtocolV1::replace(AsyncConnectionRef existing,
+                          ceph_msg_connect_reply &reply,
+                          bufferlist &authorizer_reply) {
+  ldout(cct, 10) << __func__ << " accept replacing " << existing << dendl;
+
+  connection->inject_delay();
+  if (existing->policy.lossy) {
+    // disconnect from the Connection
+    ldout(cct, 1) << __func__ << " replacing on lossy channel, failing existing"
+                  << dendl;
+    existing->protocol->stop();
+    existing->dispatch_queue->queue_reset(existing.get());
+  } else {
+    ceph_assert(can_write == WriteStatus::NOWRITE);
+    existing->write_lock.lock();
+
+    ProtocolV1 *exproto = dynamic_cast<ProtocolV1 *>(existing->protocol.get());
+
+    // reset the in_seq if this is a hard reset from peer,
+    // otherwise we respect our original connection's value
+    if (is_reset_from_peer) {
+      exproto->is_reset_from_peer = true;
+    }
+
+    // stop watching our own socket before it is handed over
+    connection->center->delete_file_event(connection->cs.fd(),
+                                          EVENT_READABLE | EVENT_WRITABLE);
+
+    if (existing->delay_state) {
+      existing->delay_state->flush();
+      ceph_assert(!connection->delay_state);
+    }
+    exproto->reset_recv_state();
+
+    // existing continues the handshake with the features the peer just sent
+    exproto->connect_msg.features = connect_msg.features;
+
+    // take what existing will need from this connection before stop()
+    auto temp_cs = std::move(connection->cs);
+    EventCenter *new_center = connection->center;
+    Worker *new_worker = connection->worker;
+    // avoid _stop shutdown replacing socket
+    // queue a reset on the new connection, which we're dumping for the old
+    stop();
+
+    connection->dispatch_queue->queue_reset(connection);
+    ldout(messenger->cct, 1)
+        << __func__ << " stop myself to swap existing" << dendl;
+    exproto->can_write = WriteStatus::REPLACING;
+    exproto->replacing = true;
+    exproto->write_in_progress = false;
+    existing->state_offset = 0;
+    // avoid previous thread modify event
+    exproto->state = NONE;
+    existing->state = AsyncConnection::STATE_NONE;
+    // Discard existing prefetch buffer in `recv_buf`
+    existing->recv_start = existing->recv_end = 0;
+    // there shouldn't exist any buffer
+    ceph_assert(connection->recv_start == connection->recv_end);
+
+    exproto->authorizer_challenge.reset();
+
+    // Step 1, submitted to existing's current center: requeue sent messages,
+    // drop pending output, and (if existing is still in state NONE) adopt
+    // the new socket, worker and center. The moved-out socket is bound as
+    // the callable's argument; reply and authorizer_reply are captured by
+    // copy.
+    auto deactivate_existing = std::bind(
+        [existing, new_worker, new_center, exproto, reply,
+         authorizer_reply](ConnectedSocket &cs) mutable {
+          // we need to delete time event in original thread
+          {
+            std::lock_guard<std::mutex> l(existing->lock);
+            existing->write_lock.lock();
+            exproto->requeue_sent();
+            existing->outgoing_bl.clear();
+            existing->open_write = false;
+            existing->write_lock.unlock();
+            if (exproto->state == NONE) {
+              existing->shutdown_socket();
+              existing->cs = std::move(cs);
+              existing->worker->references--;
+              new_worker->references++;
+              existing->logger = new_worker->get_perf_counter();
+              existing->worker = new_worker;
+              existing->center = new_center;
+              if (existing->delay_state)
+                existing->delay_state->set_center(new_center);
+            } else if (exproto->state == CLOSED) {
+              // existing was closed in the meantime: have the new center
+              // close the socket instead of adopting it
+              auto back_to_close =
+                  std::bind([](ConnectedSocket &cs) mutable { cs.close(); },
+                            std::move(cs));
+              new_center->submit_to(new_center->get_id(),
+                                    std::move(back_to_close), true);
+              return;
+            } else {
+              ceph_abort();
+            }
+          }
+
+          // Before changing existing->center, some events may already be
+          // queued on existing->center. If `existing` is then marked down,
+          // that runs in another thread and cleans up the connection, and a
+          // previously queued event would cause a segmentation fault.
+          // Step 2, run on existing's (new) center: restart the connect
+          // timer, start watching the adopted socket and send RETRY_GLOBAL.
+          auto transfer_existing = [existing, exproto, reply,
+                                    authorizer_reply]() mutable {
+            std::lock_guard<std::mutex> l(existing->lock);
+            if (exproto->state == CLOSED) return;
+            ceph_assert(exproto->state == NONE);
+
+            // we have called shutdown_socket above
+            ceph_assert(existing->last_tick_id == 0);
+            // restart timer since we are going to re-build connection
+            existing->last_connect_started = ceph::coarse_mono_clock::now();
+            existing->last_tick_id = existing->center->create_time_event(
+              existing->connect_timeout_us, existing->tick_handler);
+            existing->state = AsyncConnection::STATE_CONNECTION_ESTABLISHED;
+            exproto->state = ACCEPTING;
+
+            existing->center->create_file_event(
+                existing->cs.fd(), EVENT_READABLE, existing->read_handler);
+            reply.global_seq = exproto->peer_global_seq;
+            exproto->run_continuation(exproto->send_connect_message_reply(
+                CEPH_MSGR_TAG_RETRY_GLOBAL, reply, authorizer_reply));
+          };
+          if (existing->center->in_thread())
+            transfer_existing();
+          else
+            existing->center->submit_to(existing->center->get_id(),
+                                        std::move(transfer_existing), true);
+        },
+        std::move(temp_cs));
+
+    existing->center->submit_to(existing->center->get_id(),
+                                std::move(deactivate_existing), true);
+    existing->write_lock.unlock();
+    existing->lock.unlock();
+    return nullptr;
+  }
+  // lossy path only: release existing and continue as a fresh session
+  existing->lock.unlock();
+
+  return open(reply, authorizer_reply);
+}
+
+/**
+ * Accept side: finalize the session and queue the READY (or SEQ) reply.
+ *
+ * Records connect_seq and peer_global_seq from the connect message, picks
+ * the reply tag, fills in the reply header, sets the negotiated features,
+ * (re)creates session_security and builds the reply payload. It then
+ * releases connection->lock around messenger->accept_conn() and retakes it;
+ * if accept_conn() fails or the state changed while unlocked, the
+ * connection faults instead of sending the reply.
+ *
+ * @param reply             reply header to complete and send
+ * @param authorizer_reply  appended after the header when non-empty
+ */
+CtPtr ProtocolV1::open(ceph_msg_connect_reply &reply,
+                       bufferlist &authorizer_reply) {
+  ldout(cct, 20) << __func__ << dendl;
+
+  connect_seq = connect_msg.connect_seq + 1;
+  peer_global_seq = connect_msg.global_seq;
+  ldout(cct, 10) << __func__ << " accept success, connect_seq = " << connect_seq
+                 << " in_seq=" << in_seq << ", sending READY" << dendl;
+
+  // if it is a hard reset from peer, we don't need a round-trip to negotiate
+  // in/out sequence
+  if ((connect_msg.features & CEPH_FEATURE_RECONNECT_SEQ) &&
+      !is_reset_from_peer) {
+    reply.tag = CEPH_MSGR_TAG_SEQ;
+    wait_for_seq = true;
+  } else {
+    // no sequence exchange: start over with in_seq 0
+    reply.tag = CEPH_MSGR_TAG_READY;
+    wait_for_seq = false;
+    out_seq = discard_requeued_up_to(out_seq, 0);
+    is_reset_from_peer = false;
+    in_seq = 0;
+  }
+
+  // send READY reply
+  reply.features = connection->policy.features_supported;
+  reply.global_seq = messenger->get_global_seq();
+  reply.connect_seq = connect_seq;
+  reply.flags = 0;
+  reply.authorizer_len = authorizer_reply.length();
+  if (connection->policy.lossy) {
+    reply.flags = reply.flags | CEPH_MSG_CONNECT_LOSSY;
+  }
+
+  // the connection uses the intersection of our and the peer's features
+  connection->set_features((uint64_t)reply.features &
+                           (uint64_t)connect_msg.features);
+  ldout(cct, 10) << __func__ << " accept features "
+                 << connection->get_features()
+		 << " authorizer_protocol "
+		 << connect_msg.authorizer_protocol << dendl;
+
+  session_security.reset(
+      get_auth_session_handler(cct, connect_msg.authorizer_protocol,
+                               session_key,
+			       connection->get_features()));
+
+  // wire layout: raw reply header, authorizer reply (if any), and for a SEQ
+  // reply our in_seq as a raw uint64_t
+  bufferlist reply_bl;
+  reply_bl.append((char *)&reply, sizeof(reply));
+
+  if (reply.authorizer_len) {
+    reply_bl.append(authorizer_reply.c_str(), authorizer_reply.length());
+  }
+
+  if (reply.tag == CEPH_MSGR_TAG_SEQ) {
+    uint64_t s = in_seq;
+    reply_bl.append((char *)&s, sizeof(s));
+  }
+
+  connection->lock.unlock();
+  // Because "replacing" prevents other connections from preempting this
+  // addr, it is safe not to hold the Connection's lock here
+  ssize_t r = messenger->accept_conn(connection);
+
+  connection->inject_delay();
+
+  connection->lock.lock();
+  replacing = false;
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " existing race replacing process for addr = "
+                  << connection->peer_addrs->legacy_addr()
+                  << " just fail later one(this)" << dendl;
+    ldout(cct, 10) << "accept fault after register" << dendl;
+    connection->inject_delay();
+    return _fault();
+  }
+  // the state may have changed while the lock was released; in that case
+  // undo the registration done by accept_conn() before faulting
+  if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) {
+    ldout(cct, 1) << __func__
+                  << " state changed while accept_conn, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED || state == NONE);
+    ldout(cct, 10) << "accept fault after register" << dendl;
+    messenger->unregister_conn(connection);
+    connection->inject_delay();
+    return _fault();
+  }
+
+  return WRITE(reply_bl, handle_ready_connect_message_reply_write);
+}
+
+// Completion handler for the READY/SEQ reply queued by open(). On success
+// the accept is announced to the dispatch queue and the messenger, and the
+// handshake either waits for the peer's acked sequence number or finishes.
+CtPtr ProtocolV1::handle_ready_connect_message_reply_write(int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " write ready connect message reply failed"
+                  << dendl;
+    return _fault();
+  }
+
+  // notify
+  connection->dispatch_queue->queue_accept(connection);
+  messenger->ms_deliver_handle_fast_accept(connection);
+  once_ready = true;
+  state = ACCEPTING_HANDLED_CONNECT_MSG;
+
+  // open() set wait_for_seq when it replied with the SEQ tag
+  return wait_for_seq ? wait_seq() : server_ready();
+}
+
+// Start reading the 64-bit sequence number that handle_seq() consumes.
+CtPtr ProtocolV1::wait_seq() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const size_t seq_len = sizeof(uint64_t);
+  return READ(seq_len, handle_seq);
+}
+
+/**
+ * Completion handler for the read started by wait_seq().
+ *
+ * @param buffer  sizeof(uint64_t) raw bytes holding the sequence number the
+ *                peer reports as acked
+ * @param r       read result; negative means the read failed
+ *
+ * Passes the acked sequence number to discard_requeued_up_to(), stores its
+ * result in out_seq, and finishes the accept via server_ready().
+ */
+CtPtr ProtocolV1::handle_seq(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read ack seq failed" << dendl;
+    return _fault();
+  }
+
+  // Copy the bytes out rather than dereferencing a casted pointer: the char
+  // buffer carries no uint64_t alignment guarantee, and memcpy yields the
+  // same value without a type-punned read.
+  uint64_t newly_acked_seq = 0;
+  memcpy(&newly_acked_seq, buffer, sizeof(newly_acked_seq));
+  ldout(cct, 2) << __func__ << " accept get newly_acked_seq " << newly_acked_seq
+                << dendl;
+  out_seq = discard_requeued_up_to(out_seq, newly_acked_seq);
+
+  return server_ready();
+}
+
+// Last step of the accept handshake: log, wipe the stored connect message,
+// check that any delay queue is ready, and enter the ready state.
+CtPtr ProtocolV1::server_ready() {
+  ldout(cct, 20) << __func__ << " session_security is " << session_security
+                 << dendl;
+  ldout(cct, 20) << __func__ << " accept done" << dendl;
+
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&connect_msg, 0, sizeof(connect_msg));
+
+  const auto &delay = connection->delay_state;
+  if (delay) {
+    ceph_assert(delay->ready());
+  }
+
+  return ready();
+}
diff --git a/src/msg/async/ProtocolV1.h b/src/msg/async/ProtocolV1.h
new file mode 100644
index 00000000..070ce73f
--- /dev/null
+++ b/src/msg/async/ProtocolV1.h
@@ -0,0 +1,305 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef _MSG_ASYNC_PROTOCOL_V1_
+#define _MSG_ASYNC_PROTOCOL_V1_
+
+#include "Protocol.h"
+
+class ProtocolV1;
+using CtPtr = Ct<ProtocolV1>*;
+
+// Implementation of the Protocol interface for the v1 wire protocol. The
+// class is written as a set of continuations (see the *_CONTINUATION_DECL
+// declarations below); the diagram shows how they chain together.
+class ProtocolV1 : public Protocol {
+/*
+ *  ProtocolV1 State Machine
+ *
+
+    send_server_banner                             send_client_banner
+            |                                              |
+            v                                              v
+    wait_client_banner                              wait_server_banner
+            |                                              |
+            |                                              v
+            v                                 handle_server_banner_and_identify
+    wait_connect_message <---------\                       |
+      |     |                      |                       v
+      |  wait_connect_message_auth |           send_connect_message <----------\
+      |     |                      |                       |                   |
+      v     v                      |                       |                   |
+handle_connect_message_2           |                       v                   |
+        |           |              |            wait_connect_reply             |
+        v           v              |              |        |                   |
+     replace -> send_connect_message_reply        |        V                   |
+        |                                         |   wait_connect_reply_auth  |
+        |                                         |        |                   |
+        v                                         v        v                   |
+      open ---\                                 handle_connect_reply_2 --------/
+        |     |                                            |
+        |     v                                            v
+        |   wait_seq                                  wait_ack_seq
+        |     |                                            |
+        v     v                                            v
+    server_ready                                      client_ready
+            |                                              |
+            \------------------> wait_message <------------/
+                                 |  ^   |  ^
+        /------------------------/  |   |  |
+        |                           |   |  \----------------- ------------\
+        v                /----------/   v                                 |
+handle_keepalive2        |        handle_message_header      read_message_footer
+handle_keepalive2_ack    |              |                                 ^
+handle_tag_ack           |              v                                 |
+        |                |        throttle_message             read_message_data
+        \----------------/              |                                 ^
+                                        v                                 |
+                             read_message_front --> read_message_middle --/
+*/
+
+protected:
+
+  // Connection states. The order must match the name table in
+  // get_state_name(), which is indexed by these values.
+  enum State {
+    NONE = 0,
+    START_CONNECT,
+    CONNECTING,
+    CONNECTING_WAIT_BANNER_AND_IDENTIFY,
+    CONNECTING_SEND_CONNECT_MSG,
+    START_ACCEPT,
+    ACCEPTING,
+    ACCEPTING_WAIT_CONNECT_MSG_AUTH,
+    ACCEPTING_HANDLED_CONNECT_MSG,
+    OPENED,
+    THROTTLE_MESSAGE,
+    THROTTLE_BYTES,
+    THROTTLE_DISPATCH_QUEUE,
+    READ_MESSAGE_FRONT,
+    READ_FOOTER_AND_DISPATCH,
+    CLOSED,
+    WAIT,
+    STANDBY
+  };
+
+  // Returns the printable name of `state` (a State value). Values outside
+  // the enum yield "UNKNOWN" instead of reading past the end of the table.
+  static const char *get_state_name(int state) {
+    static const char *const statenames[] = {"NONE",
+                                             "START_CONNECT",
+                                             "CONNECTING",
+                                             "CONNECTING_WAIT_BANNER_AND_IDENTIFY",
+                                             "CONNECTING_SEND_CONNECT_MSG",
+                                             "START_ACCEPT",
+                                             "ACCEPTING",
+                                             "ACCEPTING_WAIT_CONNECT_MSG_AUTH",
+                                             "ACCEPTING_HANDLED_CONNECT_MSG",
+                                             "OPENED",
+                                             "THROTTLE_MESSAGE",
+                                             "THROTTLE_BYTES",
+                                             "THROTTLE_DISPATCH_QUEUE",
+                                             "READ_MESSAGE_FRONT",
+                                             "READ_FOOTER_AND_DISPATCH",
+                                             "CLOSED",
+                                             "WAIT",
+                                             "STANDBY"};
+    constexpr int num_states = sizeof(statenames) / sizeof(statenames[0]);
+    static_assert(num_states == STANDBY + 1,
+                  "statenames must have one entry per State value");
+    if (state < 0 || state >= num_states) {
+      return "UNKNOWN";
+    }
+    return statenames[state];
+  }
+
+  char *temp_buffer;
+
+  enum class WriteStatus { NOWRITE, REPLACING, CANWRITE, CLOSED };
+  std::atomic<WriteStatus> can_write;
+  std::list<Message *> sent;  // the first bufferlist need to inject seq
+  // priority queue for outbound msgs
+  std::map<int, std::list<std::pair<bufferlist, Message *>>> out_q;
+  bool keepalive;
+  bool write_in_progress = false;
+
+  __u32 connect_seq, peer_global_seq;
+  std::atomic<uint64_t> in_seq{0};
+  std::atomic<uint64_t> out_seq{0};
+  std::atomic<uint64_t> ack_left{0};
+
+  CryptoKey session_key;
+  std::shared_ptr<AuthSessionHandler> session_security;
+  std::unique_ptr<AuthAuthorizerChallenge> authorizer_challenge;  // accept side
+
+  // Open state
+  ceph_msg_connect connect_msg;
+  ceph_msg_connect_reply connect_reply;
+  bufferlist authorizer_buf;
+
+  utime_t backoff;  // backoff time
+  utime_t recv_stamp;
+  utime_t throttle_stamp;
+  unsigned msg_left;
+  uint64_t cur_msg_size;
+  ceph_msg_header current_header;
+  bufferlist data_buf;
+  bufferlist::iterator data_blp;
+  bufferlist front, middle, data;
+
+  bool replacing;  // when replacing process happened, we will reply connect
+                   // side with RETRY tag and accept side will clear replaced
+                   // connection. So when connect side reissue connect_msg,
+                   // there won't exists conflicting connection so we use
+                   // "replacing" to skip RESETSESSION to avoid detect wrong
+                   // presentation
+  bool is_reset_from_peer;
+  bool once_ready;
+
+  State state;
+
+  void run_continuation(CtPtr pcontinuation);
+  CtPtr read(CONTINUATION_RX_TYPE<ProtocolV1> &next, int len,
+             char *buffer = nullptr);
+  CtPtr write(CONTINUATION_TX_TYPE<ProtocolV1> &next,bufferlist &bl);
+  inline CtPtr _fault() {  // helper fault method that stops continuation
+    fault();
+    return nullptr;
+  }
+
+  CONTINUATION_DECL(ProtocolV1, wait_message);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_keepalive2);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_keepalive2_ack);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_tag_ack);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_header);
+  CONTINUATION_DECL(ProtocolV1, throttle_message);
+  CONTINUATION_DECL(ProtocolV1, throttle_bytes);
+  CONTINUATION_DECL(ProtocolV1, throttle_dispatch_queue);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_front);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_middle);
+  CONTINUATION_DECL(ProtocolV1, read_message_data);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_data);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_footer);
+
+  CtPtr ready();
+  CtPtr wait_message();
+  CtPtr handle_message(char *buffer, int r);
+
+  CtPtr handle_keepalive2(char *buffer, int r);
+  void append_keepalive_or_ack(bool ack = false, utime_t *t = nullptr);
+  CtPtr handle_keepalive2_ack(char *buffer, int r);
+  CtPtr handle_tag_ack(char *buffer, int r);
+
+  CtPtr handle_message_header(char *buffer, int r);
+  CtPtr throttle_message();
+  CtPtr throttle_bytes();
+  CtPtr throttle_dispatch_queue();
+  CtPtr read_message_front();
+  CtPtr handle_message_front(char *buffer, int r);
+  CtPtr read_message_middle();
+  CtPtr handle_message_middle(char *buffer, int r);
+  CtPtr read_message_data_prepare();
+  CtPtr read_message_data();
+  CtPtr handle_message_data(char *buffer, int r);
+  CtPtr read_message_footer();
+  CtPtr handle_message_footer(char *buffer, int r);
+
+  void session_reset();
+  void randomize_out_seq();
+
+  Message *_get_next_outgoing(bufferlist *bl);
+
+  void prepare_send_message(uint64_t features, Message *m, bufferlist &bl);
+  ssize_t write_message(Message *m, bufferlist &bl, bool more);
+
+  void requeue_sent();
+  uint64_t discard_requeued_up_to(uint64_t out_seq, uint64_t seq);
+  void discard_out_queue();
+
+  void reset_recv_state();
+  void reset_security();
+
+  ostream &_conn_prefix(std::ostream *_dout);
+
+public:
+  ProtocolV1(AsyncConnection *connection);
+  virtual ~ProtocolV1();
+
+  virtual void connect() override;
+  virtual void accept() override;
+  virtual bool is_connected() override;
+  virtual void stop() override;
+  virtual void fault() override;
+  virtual void send_message(Message *m) override;
+  virtual void send_keepalive() override;
+
+  virtual void read_event() override;
+  virtual void write_event() override;
+  virtual bool is_queued() override;
+
+  // Client Protocol
+private:
+  int global_seq;
+  AuthAuthorizer *authorizer;
+
+  CONTINUATION_DECL(ProtocolV1, send_client_banner);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_client_banner_write);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_server_banner_and_identify);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_my_addr_write);
+  CONTINUATION_DECL(ProtocolV1, send_connect_message);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_message_write);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_reply_1);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_reply_auth);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_ack_seq);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_in_seq_write);
+
+  CtPtr send_client_banner();
+  CtPtr handle_client_banner_write(int r);
+  CtPtr wait_server_banner();
+  CtPtr handle_server_banner_and_identify(char *buffer, int r);
+  CtPtr handle_my_addr_write(int r);
+  CtPtr send_connect_message();
+  CtPtr handle_connect_message_write(int r);
+  CtPtr wait_connect_reply();
+  CtPtr handle_connect_reply_1(char *buffer, int r);
+  CtPtr wait_connect_reply_auth();
+  CtPtr handle_connect_reply_auth(char *buffer, int r);
+  CtPtr handle_connect_reply_2();
+  CtPtr wait_ack_seq();
+  CtPtr handle_ack_seq(char *buffer, int r);
+  CtPtr handle_in_seq_write(int r);
+  CtPtr client_ready();
+
+  // Server Protocol
+protected:
+  bool wait_for_seq;
+
+  CONTINUATION_DECL(ProtocolV1, send_server_banner);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_server_banner_write);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_client_banner);
+  CONTINUATION_DECL(ProtocolV1, wait_connect_message);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_message_1);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_message_auth);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1,
+                                  handle_connect_message_reply_write);
+  WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1,
+                                  handle_ready_connect_message_reply_write);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_seq);
+
+  CtPtr send_server_banner();
+  CtPtr handle_server_banner_write(int r);
+  CtPtr wait_client_banner();
+  CtPtr handle_client_banner(char *buffer, int r);
+  CtPtr wait_connect_message();
+  CtPtr handle_connect_message_1(char *buffer, int r);
+  CtPtr wait_connect_message_auth();
+  CtPtr handle_connect_message_auth(char *buffer, int r);
+  CtPtr handle_connect_message_2();
+  CtPtr send_connect_message_reply(char tag, ceph_msg_connect_reply &reply,
+                                   bufferlist &authorizer_reply);
+  CtPtr handle_connect_message_reply_write(int r);
+  CtPtr replace(AsyncConnectionRef existing, ceph_msg_connect_reply &reply,
+                bufferlist &authorizer_reply);
+  CtPtr open(ceph_msg_connect_reply &reply, bufferlist &authorizer_reply);
+  CtPtr handle_ready_connect_message_reply_write(int r);
+  CtPtr wait_seq();
+  CtPtr handle_seq(char *buffer, int r);
+  CtPtr server_ready();
+};
+
+// ProtocolV1 variant whose write status is set to CANWRITE as soon as it is
+// constructed.
+class LoopbackProtocolV1 : public ProtocolV1 {
+public:
+  LoopbackProtocolV1(AsyncConnection *connection) : ProtocolV1(connection) {
+    can_write.store(WriteStatus::CANWRITE);
+  }
+};
+
+#endif /* _MSG_ASYNC_PROTOCOL_V1_ */
diff --git a/src/msg/async/ProtocolV2.cc b/src/msg/async/ProtocolV2.cc
new file mode 100644
index 00000000..381d42c3
--- /dev/null
+++ b/src/msg/async/ProtocolV2.cc
@@ -0,0 +1,2870 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <type_traits>
+
+#include "ProtocolV2.h"
+#include "AsyncMessenger.h"
+
+#include "common/EventTrace.h"
+#include "common/ceph_crypto.h"
+#include "common/errno.h"
+#include "include/random.h"
+#include "auth/AuthClient.h"
+#include "auth/AuthServer.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix _conn_prefix(_dout)
+// Writes the per-connection log prefix to *_dout and returns that stream.
+// The prefix carries our addresses, the peer addresses, the connection and
+// protocol pointers, the connection mode, port, state name, sequence
+// numbers, the lossy flag, whether the peer advertises REVISION_1, and the
+// rx/tx stream handler pointers.
+ostream &ProtocolV2::_conn_prefix(std::ostream *_dout) {
+  std::ostream &out = *_dout;
+  out << "--2- " << messenger->get_myaddrs() << " >> "
+      << *connection->peer_addrs;
+  out << " conn(" << connection << " " << this;
+  out << " " << ceph_con_mode_name(auth_meta->con_mode)
+      << " :" << connection->port;
+  out << " s=" << get_state_name(state) << " pgs=" << peer_global_seq
+      << " cs=" << connect_seq << " l=" << connection->policy.lossy;
+  out << " rev1=" << HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1);
+  out << " rx=" << session_stream_handlers.rx.get()
+      << " tx=" << session_stream_handlers.tx.get()
+      << ").";
+  return out;
+}
+
+using namespace ceph::msgr::v2;
+
+using CtPtr = Ct<ProtocolV2> *;
+using CtRef = Ct<ProtocolV2> &;
+
+// Pointer overload: a null continuation means there is nothing to run.
+void ProtocolV2::run_continuation(CtPtr pcontinuation) {
+  if (!pcontinuation) {
+    return;
+  }
+  run_continuation(*pcontinuation);
+}
+
+// Runs `continuation` via CONTINUATION_RUN and converts the exceptions that
+// frame processing can raise into a connection fault. Every handler logs the
+// cause and then calls _fault(), so that a failed frame never leaves the
+// connection without a continuation scheduled.
+void ProtocolV2::run_continuation(CtRef continuation) {
+  try {
+    CONTINUATION_RUN(continuation)
+  } catch (const buffer::error &e) {
+    lderr(cct) << __func__ << " failed decoding of frame header: " << e
+               << dendl;
+    _fault();
+  } catch (const ceph::crypto::onwire::MsgAuthError &e) {
+    lderr(cct) << __func__ << " " << e.what() << dendl;
+    _fault();
+  } catch (const DecryptionError &) {
+    lderr(cct) << __func__ << " failed to decrypt frame payload" << dendl;
+    // previously this case only logged; fault like the other handlers so the
+    // connection is torn down or retried instead of being left idle
+    _fault();
+  }
+}
+
+// WRITE(B, D, C): call write() with D, the continuation named C, and B, in
+// that argument order.
+#define WRITE(B, D, C) write(D, CONTINUATION(C), B)
+
+// READ(L, C): call read() with the continuation named C and a freshly created
+// buffer node of length L.
+#define READ(L, C) read(CONTINUATION(C), buffer::ptr_node::create(buffer::create(L)))
+
+// READ_RXBUF(B, C): call read() with the continuation named C and the
+// caller-supplied buffer B.
+#define READ_RXBUF(B, C) read(CONTINUATION(C), B)
+
+#ifdef UNIT_TESTS_BUILT
+
+// INTERCEPT(S): when the connection has an interceptor, ask it what to do at
+// step S. FAIL returns from the enclosing function through _fault(); STOP
+// stops the protocol, queues a reset and returns nullptr. Compiled to nothing
+// unless UNIT_TESTS_BUILT is defined.
+#define INTERCEPT(S) { \
+if(connection->interceptor) { \
+  auto a = connection->interceptor->intercept(connection, (S)); \
+  if (a == Interceptor::ACTION::FAIL) { \
+    return _fault(); \
+  } else if (a == Interceptor::ACTION::STOP) { \
+    stop(); \
+    connection->dispatch_queue->queue_reset(connection); \
+    return nullptr; \
+  }}}
+  
+#else
+#define INTERCEPT(S)
+#endif
+
+// Constructs the protocol for `connection`, passing 2 to the Protocol base.
+// The state starts as NONE; cookies, sequence numbers and flags are zeroed or
+// cleared; both frame assemblers are bound to session_stream_handlers.
+ProtocolV2::ProtocolV2(AsyncConnection *connection)
+    : Protocol(2, connection),
+      state(NONE),
+      peer_supported_features(0),
+      client_cookie(0),
+      server_cookie(0),
+      global_seq(0),
+      connect_seq(0),
+      peer_global_seq(0),
+      message_seq(0),
+      reconnecting(false),
+      replacing(false),
+      can_write(false),
+      bannerExchangeCallback(nullptr),
+      tx_frame_asm(&session_stream_handlers, false),
+      rx_frame_asm(&session_stream_handlers, false),
+      next_tag(static_cast<Tag>(0)),
+      keepalive(false) {
+}
+
+// Nothing is released explicitly here; members clean up through their own
+// destructors.
+ProtocolV2::~ProtocolV2() {
+}
+
+// Marks the protocol as a connecting side: sets START_CONNECT (picked up by
+// read_event()) and enables pre_auth.
+void ProtocolV2::connect() {
+  ldout(cct, 1) << __func__ << dendl;
+  state = START_CONNECT;
+  pre_auth.enabled = true;
+}
+
+// Marks the protocol as an accepting side: sets START_ACCEPT, which
+// read_event() turns into the server banner exchange.
+void ProtocolV2::accept() {
+  ldout(cct, 1) << __func__ << dendl;
+  state = START_ACCEPT;
+}
+
+// Reports the current value of can_write.
+bool ProtocolV2::is_connected() { return can_write; }
+
+/*
+ * Drops every message still referenced by this protocol: the ones on the
+ * `sent` list and the ones waiting in `out_queue`. Each gets a put(), both
+ * containers are emptied and write_in_progress is cleared.
+ * Must hold write_lock prior to calling.
+ */
+void ProtocolV2::discard_out_queue() {
+  ldout(cct, 10) << __func__ << " started" << dendl;
+
+  for (Message *m : sent) {
+    ldout(cct, 20) << __func__ << " discard " << m << dendl;
+    m->put();
+  }
+  sent.clear();
+
+  for (auto& prio_and_entries : out_queue) {
+    for (auto& entry : prio_and_entries.second) {
+      ldout(cct, 20) << __func__ << " discard " << *entry.m << dendl;
+      entry.m->put();
+    }
+  }
+  out_queue.clear();
+
+  write_in_progress = false;
+}
+
+// Forgets everything tied to the current session: discards delayed and
+// queued messages (incoming dispatch queue and outgoing queue), clears the
+// pending output buffer, queues a remote-reset notification and zeroes the
+// sequence numbers and cookies. Takes write_lock for the whole operation.
+void ProtocolV2::reset_session() {
+  ldout(cct, 1) << __func__ << dendl;
+
+  std::lock_guard<std::mutex> l(connection->write_lock);
+  if (connection->delay_state) {
+    connection->delay_state->discard();
+  }
+
+  connection->dispatch_queue->discard_queue(connection->conn_id);
+  discard_out_queue();
+  connection->outgoing_bl.clear();
+
+  connection->dispatch_queue->queue_remote_reset(connection);
+
+  // back to the values of a freshly constructed protocol
+  out_seq = 0;
+  in_seq = 0;
+  client_cookie = 0;
+  server_cookie = 0;
+  connect_seq = 0;
+  peer_global_seq = 0;
+  message_seq = 0;
+  ack_left = 0;
+  can_write = false;
+}
+
+// Shuts the protocol down. A no-op when the state is already CLOSED;
+// otherwise flushes the delay state (if any), then, under write_lock, resets
+// the receive state, discards the outgoing queues, stops the connection and
+// marks the state CLOSED.
+void ProtocolV2::stop() {
+  ldout(cct, 1) << __func__ << dendl;
+  if (state == CLOSED) {
+    return;
+  }
+
+  // flushed before write_lock is taken
+  if (connection->delay_state) connection->delay_state->flush();
+
+  std::lock_guard<std::mutex> l(connection->write_lock);
+
+  reset_recv_state();
+  discard_out_queue();
+
+  connection->_stop();
+
+  can_write = false;
+  state = CLOSED;
+}
+
+// Public entry point for fault handling; forwards to _fault() and drops its
+// return value.
+void ProtocolV2::fault() { _fault(); }
+
+// Moves every message on the `sent` list back to the front of the
+// CEPH_MSG_PRIO_HIGHEST queue, keeping the original order, and rewinds
+// out_seq by the number of messages moved. write_in_progress is always
+// cleared; the queue is only touched when there is something to requeue.
+void ProtocolV2::requeue_sent() {
+  write_in_progress = false;
+  if (sent.empty()) {
+    return;
+  }
+
+  auto& requeue = out_queue[CEPH_MSG_PRIO_HIGHEST];
+  out_seq -= sent.size();
+  // newest first: each emplace_front() then leaves the oldest at the head
+  for (auto it = sent.rbegin(); it != sent.rend(); ++it) {
+    Message *m = *it;
+    ldout(cct, 5) << __func__ << " requeueing message m=" << m
+                  << " seq=" << m->get_seq() << " type=" << m->get_type() << " "
+                  << *m << dendl;
+    requeue.emplace_front(out_queue_entry_t{false, m});
+  }
+  sent.clear();
+}
+
+// Drops messages from the head of the CEPH_MSG_PRIO_HIGHEST queue whose seq
+// is non-zero and not greater than `seq`. Returns `out_seq` plus the number
+// of messages dropped, or `seq` itself when no such queue exists. Takes
+// write_lock.
+uint64_t ProtocolV2::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) {
+  ldout(cct, 10) << __func__ << " " << seq << dendl;
+  std::lock_guard<std::mutex> l(connection->write_lock);
+
+  const auto highest = out_queue.find(CEPH_MSG_PRIO_HIGHEST);
+  if (highest == out_queue.end()) {
+    return seq;
+  }
+
+  auto& rq = highest->second;
+  uint64_t count = out_seq;
+  for (; !rq.empty(); ++count) {
+    Message* const m = rq.front().m;
+    const bool keep = (m->get_seq() == 0 || m->get_seq() > seq);
+    if (keep) {
+      break;
+    }
+    ldout(cct, 5) << __func__ << " discarding message m=" << m
+                  << " seq=" << m->get_seq() << " ack_seq=" << seq << " "
+                  << *m << dendl;
+    m->put();
+    rq.pop_front();
+  }
+
+  if (rq.empty()) {
+    out_queue.erase(highest);
+  }
+  return count;
+}
+
+// Replaces auth_meta with a fresh AuthConnectionMeta, drops the rx/tx stream
+// handlers and clears the pre-auth rx/tx buffers.
+void ProtocolV2::reset_security() {
+  ldout(cct, 5) << __func__ << dendl;
+
+  auth_meta.reset(new AuthConnectionMeta);
+  session_stream_handlers.rx.reset(nullptr);
+  session_stream_handlers.tx.reset(nullptr);
+  pre_auth.rxbuf.clear();
+  pre_auth.txbuf.clear();
+}
+
+// Resets everything tied to receiving: the security state (directly, or
+// deferred to the connection's event center thread when called from another
+// thread), the pending read/write callbacks, next_tag and the throttles.
+// it's expected the `write_lock` is held while calling this method.
+void ProtocolV2::reset_recv_state() {
+  ldout(cct, 5) << __func__ << dendl;
+
+  if (!connection->center->in_thread()) {
+    // execute in the same thread that uses the rx/tx handlers. We need
+    // to do the warp because holding `write_lock` is not enough as
+    // `write_event()` unlocks it just before calling `write_message()`.
+    // `submit_to()` here is NOT blocking.
+    connection->center->submit_to(connection->center->get_id(), [this] {
+      ldout(cct, 5) << "reset_recv_state (warped) reseting crypto handlers"
+                    << dendl;
+      // Possibly unnecessary. See the comment in `deactivate_existing`.
+      std::lock_guard<std::mutex> l(connection->lock);
+      std::lock_guard<std::mutex> wl(connection->write_lock);
+      reset_security();
+    }, /* nowait = */true);
+  } else {
+    reset_security();
+  }
+
+  // clean read and write callbacks
+  connection->pendingReadLen.reset();
+  connection->writeCallback.reset();
+
+  next_tag = static_cast<Tag>(0);
+
+  // must run before the caller changes `state`: reset_throttle() decides what
+  // to release from the current state
+  reset_throttle();
+}
+
+// Sums the logical lengths of all rx frame segments except segment 0.
+// Asserts that the frame has at least one segment.
+size_t ProtocolV2::get_current_msg_size() const {
+  const auto num_segments = rx_frame_asm.get_num_segments();
+  ceph_assert(num_segments > 0);
+
+  // we don't include SegmentIndex::Msg::HEADER.
+  size_t total = 0;
+  for (size_t idx = 1; idx < num_segments; ++idx) {
+    total += rx_frame_asm.get_segment_logical_len(idx);
+  }
+  return total;
+}
+
+// Gives back whatever throttle budget the message currently being received
+// has already taken. The three checks rely on the enum order of the
+// THROTTLE_* states: a state past THROTTLE_X (and up to THROTTLE_DONE) means
+// throttle X was acquired and has to be released.
+void ProtocolV2::reset_throttle() {
+  // message-count throttle
+  if (state > THROTTLE_MESSAGE && state <= THROTTLE_DONE &&
+      connection->policy.throttler_messages) {
+    ldout(cct, 10) << __func__ << " releasing " << 1
+                   << " message to policy throttler "
+                   << connection->policy.throttler_messages->get_current()
+                   << "/" << connection->policy.throttler_messages->get_max()
+                   << dendl;
+    connection->policy.throttler_messages->put();
+  }
+  // byte throttle
+  if (state > THROTTLE_BYTES && state <= THROTTLE_DONE) {
+    if (connection->policy.throttler_bytes) {
+      const size_t cur_msg_size = get_current_msg_size();
+      ldout(cct, 10) << __func__ << " releasing " << cur_msg_size
+                     << " bytes to policy throttler "
+                     << connection->policy.throttler_bytes->get_current() << "/"
+                     << connection->policy.throttler_bytes->get_max() << dendl;
+      connection->policy.throttler_bytes->put(cur_msg_size);
+    }
+  }
+  // dispatch queue throttle
+  if (state > THROTTLE_DISPATCH_QUEUE && state <= THROTTLE_DONE) {
+    const size_t cur_msg_size = get_current_msg_size();
+    ldout(cct, 10)
+        << __func__ << " releasing " << cur_msg_size
+        << " bytes to dispatch_queue throttler "
+        << connection->dispatch_queue->dispatch_throttler.get_current() << "/"
+        << connection->dispatch_queue->dispatch_throttler.get_max() << dendl;
+    connection->dispatch_queue->dispatch_throttle_release(cur_msg_size);
+  }
+}
+
+// Central failure handler. Always returns nullptr, so callers can write
+// `return _fault();` to end the current continuation chain. Depending on the
+// state and the connection policy it closes the connection, parks it in
+// STANDBY, or schedules a reconnect (immediately or after a backoff).
+CtPtr ProtocolV2::_fault() {
+  ldout(cct, 10) << __func__ << dendl;
+
+  if (state == CLOSED || state == NONE) {
+    ldout(cct, 10) << __func__ << " connection is already closed" << dendl;
+    return nullptr;
+  }
+
+  // lossy connections are not retried once they are past the connecting
+  // states: stop and report a reset
+  if (connection->policy.lossy &&
+      !(state >= START_CONNECT && state <= SESSION_RECONNECTING)) {
+    ldout(cct, 2) << __func__ << " on lossy channel, failing" << dendl;
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;
+  }
+
+  // write_lock is locked/unlocked by hand from here on; every return path
+  // below releases it first
+  connection->write_lock.lock();
+
+  can_write = false;
+  // requeue sent items
+  requeue_sent();
+
+  // failed while accepting, with nothing queued: just close
+  if (out_queue.empty() && state >= START_ACCEPT &&
+      state <= SESSION_ACCEPTING && !replacing) {
+    ldout(cct, 2) << __func__ << " with nothing to send and in the half "
+                   << " accept state just closed" << dendl;
+    connection->write_lock.unlock();
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;
+  }
+
+  replacing = false;
+  connection->fault();
+  reset_recv_state();
+
+  reconnecting = false;
+
+  if (connection->policy.standby && out_queue.empty() && !keepalive &&
+      state != WAIT) {
+    ldout(cct, 1) << __func__ << " with nothing to send, going to standby"
+                  << dendl;
+    state = STANDBY;
+    connection->write_lock.unlock();
+    return nullptr;
+  }
+  if (connection->policy.server) {
+    ldout(cct, 1) << __func__ << " server, going to standby, even though i have stuff queued" << dendl;
+    state = STANDBY;
+    connection->write_lock.unlock();
+    return nullptr;
+  }
+
+  connection->write_lock.unlock();
+
+  if (!(state >= START_CONNECT && state <= SESSION_RECONNECTING) &&
+      state != WAIT &&
+      state != SESSION_ACCEPTING /* due to connection race */) {
+    // policy maybe empty when state is in accept
+    // NOTE(review): policy.server already returned above, so this branch
+    // looks unreachable unless the policy changes in between -- confirm.
+    if (connection->policy.server) {
+      ldout(cct, 1) << __func__ << " server, going to standby" << dendl;
+      state = STANDBY;
+    } else {
+      ldout(cct, 1) << __func__ << " initiating reconnect" << dendl;
+      connect_seq++;
+      global_seq = messenger->get_global_seq();
+      state = START_CONNECT;
+      pre_auth.enabled = true;
+      connection->state = AsyncConnection::STATE_CONNECTING;
+    }
+    // no backoff on this path: dispatch the read handler right away
+    backoff = utime_t();
+    connection->center->dispatch_event_external(connection->read_handler);
+  } else {
+    // backoff: ms_max_backoff when in WAIT, ms_initial_backoff on the first
+    // failure, otherwise doubled and capped at ms_max_backoff
+    if (state == WAIT) {
+      backoff.set_from_double(cct->_conf->ms_max_backoff);
+    } else if (backoff == utime_t()) {
+      backoff.set_from_double(cct->_conf->ms_initial_backoff);
+    } else {
+      backoff += backoff;
+      if (backoff > cct->_conf->ms_max_backoff)
+        backoff.set_from_double(cct->_conf->ms_max_backoff);
+    }
+
+    if (server_cookie) {
+      connect_seq++;
+    }
+
+    global_seq = messenger->get_global_seq();
+    state = START_CONNECT;
+    pre_auth.enabled = true;
+    connection->state = AsyncConnection::STATE_CONNECTING;
+    ldout(cct, 1) << __func__ << " waiting " << backoff << dendl;
+    // woke up again;
+    // create_time_event() is given backoff.to_nsec() / 1000
+    connection->register_time_events.insert(
+        connection->center->create_time_event(backoff.to_nsec() / 1000,
+                                              connection->wakeup_handler));
+  }
+  return nullptr;
+}
+
+// Encodes message `m` for a peer with the given `features` by calling
+// m->encode(features, 0). The branch below only selects the log text.
+void ProtocolV2::prepare_send_message(uint64_t features,
+				      Message *m) {
+  ldout(cct, 20) << __func__ << " m=" << *m << dendl;
+
+  // log whether the payload is encoded from scratch or already present
+  // NOTE(review): an earlier comment here spoke of associating the message
+  // with the Connection; nothing in this function does that.
+  if (m->empty_payload()) {
+    ldout(cct, 20) << __func__ << " encoding features " << features << " " << m
+                   << " " << *m << dendl;
+  } else {
+    ldout(cct, 20) << __func__ << " half-reencoding features " << features
+                   << " " << m << " " << *m << dendl;
+  }
+
+  // encode and copy out of *m
+  m->encode(features, 0);
+}
+
+// Queues `m` for sending. Messages that support fast dispatch are encoded up
+// front, outside write_lock; that encoding is thrown away again if the
+// connection cannot write yet or its features changed in the meantime. On a
+// CLOSED connection the message is dropped (put()); otherwise it is appended
+// to the queue of its priority and a write event is scheduled when allowed.
+void ProtocolV2::send_message(Message *m) {
+  uint64_t f = connection->get_features();
+
+  // TODO: Currently not all messages supports reencode like MOSDMap, so here
+  // only let fast dispatch support messages prepare message
+  const bool can_fast_prepare = messenger->ms_can_fast_dispatch(m);
+  if (can_fast_prepare) {
+    prepare_send_message(f, m);
+  }
+
+  std::lock_guard<std::mutex> l(connection->write_lock);
+  bool is_prepared = can_fast_prepare;
+  // "features" changes will change the payload encoding
+  if (can_fast_prepare && (!can_write || connection->get_features() != f)) {
+    // ensure the correctness of message encoding
+    m->clear_payload();
+    is_prepared = false;
+    // note: this is also logged when only !can_write triggered the reset
+    ldout(cct, 10) << __func__ << " clear encoded buffer previous " << f
+                   << " != " << connection->get_features() << dendl;
+  }
+  if (state == CLOSED) {
+    ldout(cct, 10) << __func__ << " connection closed."
+                   << " Drop message " << m << dendl;
+    m->put();
+  } else {
+    ldout(cct, 5) << __func__ << " enqueueing message m=" << m
+                  << " type=" << m->get_type() << " " << *m << dendl;
+    m->trace.event("async enqueueing message");
+    out_queue[m->get_priority()].emplace_back(
+      out_queue_entry_t{is_prepared, m});
+    // logged unconditionally; the message is never written inline here
+    ldout(cct, 15) << __func__ << " inline write is denied, reschedule m=" << m
+                   << dendl;
+    // schedule at most one write event at a time
+    if (((!replacing && can_write) || state == STANDBY) && !write_in_progress) {
+      write_in_progress = true;
+      connection->center->dispatch_event_external(connection->write_handler);
+    }
+  }
+}
+
+// Requests a keepalive: under write_lock, sets the keepalive flag and
+// dispatches the write handler. Does nothing once the state is CLOSED.
+void ProtocolV2::send_keepalive() {
+  ldout(cct, 10) << __func__ << dendl;
+  std::lock_guard<std::mutex> l(connection->write_lock);
+  if (state == CLOSED) {
+    return;
+  }
+  keepalive = true;
+  connection->center->dispatch_event_external(connection->write_handler);
+}
+
+// Read-side event entry point: runs the continuation that matches the
+// current state. States not listed here are ignored.
+void ProtocolV2::read_event() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  switch (state) {
+    // set by connect() and by the reconnect paths in _fault()
+    case START_CONNECT:
+      run_continuation(CONTINUATION(start_client_banner_exchange));
+      break;
+    // set by accept()
+    case START_ACCEPT:
+      run_continuation(CONTINUATION(start_server_banner_exchange));
+      break;
+    case READY:
+      run_continuation(CONTINUATION(read_frame));
+      break;
+    // the three throttle states re-run their own throttle step
+    case THROTTLE_MESSAGE:
+      run_continuation(CONTINUATION(throttle_message));
+      break;
+    case THROTTLE_BYTES:
+      run_continuation(CONTINUATION(throttle_bytes));
+      break;
+    case THROTTLE_DISPATCH_QUEUE:
+      run_continuation(CONTINUATION(throttle_dispatch_queue));
+      break;
+    default:
+      break;
+  }
+}
+
+// Pops and returns the first entry of the last queue in out_queue (the last
+// key in the container's ordering -- presumably the highest priority). A
+// queue that becomes empty is removed from out_queue. When out_queue is
+// empty a default-constructed entry is returned.
+ProtocolV2::out_queue_entry_t ProtocolV2::_get_next_outgoing() {
+  out_queue_entry_t next;
+  if (out_queue.empty()) {
+    return next;
+  }
+
+  const auto last = out_queue.rbegin();
+  auto& pending = last->second;
+  // empty per-priority queues are never left behind in out_queue
+  ceph_assert(!pending.empty());
+  next = pending.front();
+  pending.pop_front();
+  if (pending.empty()) {
+    out_queue.erase(last->first);
+  }
+  return next;
+}
+
+// Assigns the next out_seq to `m`, wraps it in a message frame, appends the
+// frame to the outgoing buffer and tries to send. `more` is passed through
+// to _try_send(). Returns the result of _try_send(), or -EILSEQ when the
+// frame could not be assembled. Releases one reference on `m` (put()) on
+// every path. Must run in the connection's event center thread (asserted).
+ssize_t ProtocolV2::write_message(Message *m, bool more) {
+  FUNCTRACE(cct);
+  ceph_assert(connection->center->in_thread());
+  m->set_seq(++out_seq);
+
+  // snapshot in_seq for the header's ack field and clear ack_left, both
+  // under connection->lock
+  connection->lock.lock();
+  uint64_t ack_seq = in_seq;
+  ack_left = 0;
+  connection->lock.unlock();
+
+  ceph_msg_header &header = m->get_header();
+  ceph_msg_footer &footer = m->get_footer();
+
+  // build the v2 header from the message's header and footer fields
+  ceph_msg_header2 header2{header.seq,        header.tid,
+                           header.type,       header.priority,
+                           header.version,
+                           init_le32(0),      header.data_off,
+                           init_le64(ack_seq),
+                           footer.flags,      header.compat_version,
+                           header.reserved};
+
+  auto message = MessageFrame::Encode(
+			     header2,
+			     m->get_payload(),
+			     m->get_middle(),
+			     m->get_data());
+  if (!append_frame(message)) {
+    m->put();
+    return -EILSEQ;
+  }
+
+  ldout(cct, 5) << __func__ << " sending message m=" << m
+                << " seq=" << m->get_seq() << " " << *m << dendl;
+
+  m->trace.event("async writing message");
+  ldout(cct, 20) << __func__ << " sending m=" << m << " seq=" << m->get_seq()
+                 << " src=" << entity_name_t(messenger->get_myname())
+                 << " off=" << header2.data_off
+                 << dendl;
+  // remember the buffer length so the bytes actually sent can be counted
+  ssize_t total_send_size = connection->outgoing_bl.length();
+  ssize_t rc = connection->_try_send(more);
+  if (rc < 0) {
+    ldout(cct, 1) << __func__ << " error sending " << m << ", "
+                  << cpp_strerror(rc) << dendl;
+  } else {
+    connection->logger->inc(
+        l_msgr_send_bytes, total_send_size - connection->outgoing_bl.length());
+    ldout(cct, 10) << __func__ << " sending " << m
+                   << (rc ? " continuely." : " done.") << dendl;
+  }
+  if (m->get_type() == CEPH_MSG_OSD_OP)
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OP_END", false);
+  else if (m->get_type() == CEPH_MSG_OSD_OPREPLY)
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OPREPLY_END", false);
+  m->put();
+
+  return rc;
+}
+
+// Assembles `frame` through tx_frame_asm and appends the resulting bytes to
+// the connection's outgoing buffer. Returns false, leaving the buffer
+// untouched, when assembling raises a TxHandlerError; true otherwise.
+template <class F>
+bool ProtocolV2::append_frame(F& frame) {
+  ceph::bufferlist bl;
+  try {
+    bl = frame.get_buffer(tx_frame_asm);
+  } catch (const ceph::crypto::onwire::TxHandlerError &e) {
+    // caught by const reference: the handler only reads what()
+    ldout(cct, 1) << __func__ << " " << e.what() << dendl;
+    return false;
+  }
+
+  ldout(cct, 25) << __func__ << " assembled frame " << bl.length()
+                 << " bytes " << tx_frame_asm << dendl;
+  connection->outgoing_bl.append(bl);
+  return true;
+}
+
+// Handles an ack for everything up to and including `seq`: messages at the
+// head of the `sent` list with a seq not greater than `seq` are removed and
+// released. At most max_pending messages are trimmed per call; any further
+// ones stay on the list. Does nothing on lossy connections.
+void ProtocolV2::handle_message_ack(uint64_t seq) {
+  if (connection->policy.lossy) {  // lossy connections don't keep sent messages
+    return;
+  }
+
+  ldout(cct, 15) << __func__ << " seq=" << seq << dendl;
+
+  // trim sent list
+  static const int max_pending = 128;
+  int i = 0;
+  Message *pending[max_pending];
+  {
+    // scoped guard instead of manual lock()/unlock(): the lock is released
+    // even if something inside the loop throws
+    std::lock_guard<std::mutex> l(connection->write_lock);
+    while (!sent.empty() && sent.front()->get_seq() <= seq && i < max_pending) {
+      Message *m = sent.front();
+      sent.pop_front();
+      pending[i++] = m;
+      ldout(cct, 10) << __func__ << " got ack seq " << seq
+                     << " >= " << m->get_seq() << " on " << m << " " << *m
+                     << dendl;
+    }
+  }
+  // drop the references outside the lock
+  for (int k = 0; k < i; k++) {
+    pending[k]->put();
+  }
+}
+
+// Write-side event handler: flushes pending output for this connection.
+//
+// When writing is allowed (can_write) it appends a keepalive frame if one was
+// requested, then sends queued messages one at a time until the queue is
+// drained, can_write is cleared, or a send returns non-zero.  If every send
+// completed (r == 0) it also emits a pending ACK frame or flushes leftover
+// buffered bytes.  When writing is not allowed it either starts a reconnect
+// (STANDBY, non-server policy, queued data) or tries to flush what is already
+// buffered.  Any negative result ends in fault().
+//
+// Locking: write_lock is dropped around prepare_send_message() and
+// write_message(); fault() is always called with connection->lock held and
+// write_lock released.
+void ProtocolV2::write_event() {
+  ldout(cct, 10) << __func__ << dendl;
+  ssize_t r = 0;
+
+  connection->write_lock.lock();
+  if (can_write) {
+    if (keepalive) {
+      ldout(cct, 10) << __func__ << " appending keepalive" << dendl;
+      auto keepalive_frame = KeepAliveFrame::Encode();
+      if (!append_frame(keepalive_frame)) {
+        connection->write_lock.unlock();
+        connection->lock.lock();
+        fault();
+        connection->lock.unlock();
+        return;
+      }
+      keepalive = false;
+    }
+
+    auto start = ceph::mono_clock::now();
+    bool more;
+    do {
+      const auto out_entry = _get_next_outgoing();
+      if (!out_entry.m) {
+        break;
+      }
+
+      if (!connection->policy.lossy) {
+        // put on sent list
+        sent.push_back(out_entry.m);
+        out_entry.m->get();
+      }
+      more = !out_queue.empty();
+      connection->write_lock.unlock();
+
+      // send_message or requeue messages may not encode message
+      if (!out_entry.is_prepared) {
+        prepare_send_message(connection->get_features(), out_entry.m);
+      }
+
+      r = write_message(out_entry.m, more);
+
+      connection->write_lock.lock();
+      if (r < 0) {
+        ldout(cct, 1) << __func__ << " send msg failed" << dendl;
+        break;
+      } else if (r > 0) {
+        // data is still left to send; stop pulling messages for now
+        break;
+      }
+    } while (can_write);
+    write_in_progress = false;
+
+    // if r > 0 mean data still lefted, so no need _try_send.
+    if (r == 0) {
+      uint64_t left = ack_left;
+      if (left) {
+        ldout(cct, 10) << __func__ << " try send msg ack, acked " << left
+                       << " messages" << dendl;
+        auto ack_frame = AckFrame::Encode(in_seq);
+        if (append_frame(ack_frame)) {
+          // only subtract what this frame covers; ack_left may have grown
+          ack_left -= left;
+          left = ack_left;
+          r = connection->_try_send(left);
+        } else {
+          r = -EILSEQ;
+        }
+      } else if (is_queued()) {
+        r = connection->_try_send();
+      }
+    }
+    connection->write_lock.unlock();
+
+    connection->logger->tinc(l_msgr_running_send_time,
+                             ceph::mono_clock::now() - start);
+    if (r < 0) {
+      ldout(cct, 1) << __func__ << " send msg failed" << dendl;
+      connection->lock.lock();
+      fault();
+      connection->lock.unlock();
+      return;
+    }
+  } else {
+    write_in_progress = false;
+    // re-acquire in the order connection->lock, then write_lock
+    connection->write_lock.unlock();
+    connection->lock.lock();
+    connection->write_lock.lock();
+    if (state == STANDBY && !connection->policy.server && is_queued()) {
+      ldout(cct, 10) << __func__ << " policy.server is false" << dendl;
+      if (server_cookie) {  // only increment connect_seq if there is a session
+        connect_seq++;
+      }
+      connection->_connect();
+    } else if (connection->cs && state != NONE && state != CLOSED &&
+               state != START_CONNECT) {
+      r = connection->_try_send();
+      if (r < 0) {
+        ldout(cct, 1) << __func__ << " send outcoming bl failed" << dendl;
+        connection->write_lock.unlock();
+        fault();
+        connection->lock.unlock();
+        return;
+      }
+    }
+    connection->write_lock.unlock();
+    connection->lock.unlock();
+  }
+}
+
+// Reports whether anything is still waiting to be written: either a message
+// in out_queue or whatever connection->is_queued() accounts for.
+bool ProtocolV2::is_queued() {
+  if (!out_queue.empty()) {
+    return true;
+  }
+  return connection->is_queued();
+}
+
+// Reads buffer->length() bytes from the connection into `buffer` and hands
+// the result to the continuation `next`.
+//
+// Ownership of `buffer` moves into next.node.  When connection->read()
+// returns <= 0 the result is stored in next.r and &next is returned so the
+// caller runs it; otherwise nullptr is returned and the callback runs the
+// continuation itself.  While pre_auth is enabled, successfully read bytes
+// are also appended to pre_auth.rxbuf.
+CtPtr ProtocolV2::read(CONTINUATION_RXBPTR_TYPE<ProtocolV2> &next,
+                       rx_buffer_t &&buffer) {
+  const auto want = buffer->length();
+  const auto dest = buffer->c_str();
+  next.node = std::move(buffer);
+
+  const ssize_t rc = connection->read(want, dest,
+    [&next, this](char *, int result) {
+      if (unlikely(pre_auth.enabled) && result >= 0) {
+        pre_auth.rxbuf.append(*next.node);
+        ceph_assert(!cct->_conf->ms_die_on_bug ||
+                    pre_auth.rxbuf.length() < 20000000);
+      }
+      next.r = result;
+      run_continuation(next);
+    });
+
+  if (rc > 0) {
+    // not finished here; the callback above resumes the continuation
+    return nullptr;
+  }
+
+  // error or done synchronously
+  if (unlikely(pre_auth.enabled) && rc == 0) {
+    pre_auth.rxbuf.append(*next.node);
+    ceph_assert(!cct->_conf->ms_die_on_bug ||
+                pre_auth.rxbuf.length() < 20000000);
+  }
+  next.r = rc;
+  return &next;
+}
+
+// Frame-level write: assembles `frame` with the transmit frame assembler and
+// forwards the resulting bytes to the bufferlist overload of write().
+//
+// `desc` is only used for log messages; `next` is the continuation to run
+// after the write.  Faults the connection if assembling throws
+// TxHandlerError.
+template <class F>
+CtPtr ProtocolV2::write(const std::string &desc,
+                        CONTINUATION_TYPE<ProtocolV2> &next,
+                        F &frame) {
+  ceph::bufferlist assembled;
+  try {
+    assembled = frame.get_buffer(tx_frame_asm);
+  } catch (const ceph::crypto::onwire::TxHandlerError &err) {
+    ldout(cct, 1) << __func__ << " " << err.what() << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 25) << __func__ << " assembled frame " << assembled.length()
+                 << " bytes " << tx_frame_asm << dendl;
+  return write(desc, next, assembled);
+}
+
+// Writes `buffer` to the connection and arranges for `next` to run afterwards.
+//
+// While pre_auth is enabled the outgoing bytes are also appended to
+// pre_auth.txbuf.  Return value:
+//   - _fault() result when connection->write() fails immediately,
+//   - &next when it returns 0, so the caller runs the continuation,
+//   - nullptr when it returns > 0; the callback then runs the continuation
+//     (after logging and faulting if the write reported an error).
+// `desc` only labels the log messages.
+CtPtr ProtocolV2::write(const std::string &desc,
+                        CONTINUATION_TYPE<ProtocolV2> &next,
+                        bufferlist &buffer) {
+  if (unlikely(pre_auth.enabled)) {
+    pre_auth.txbuf.append(buffer);
+    ceph_assert(!cct->_conf->ms_die_on_bug ||
+                pre_auth.txbuf.length() < 20000000);
+  }
+
+  const ssize_t rc =
+      connection->write(buffer, [&next, desc, this](int result) {
+        if (result < 0) {
+          ldout(cct, 1) << __func__ << " " << desc
+                        << " write failed r=" << result
+                        << " (" << cpp_strerror(result) << ")" << dendl;
+          connection->inject_delay();
+          _fault();
+        }
+        run_continuation(next);
+      });
+
+  if (rc < 0) {
+    ldout(cct, 1) << __func__ << " " << desc << " write failed r=" << rc
+                  << " (" << cpp_strerror(rc) << ")" << dendl;
+    return _fault();
+  }
+  if (rc > 0) {
+    // not finished here; the callback above resumes the continuation
+    return nullptr;
+  }
+
+  next.setParams();
+  return &next;
+}
+
+// Starts the banner exchange: remembers `callback` in bannerExchangeCallback,
+// then writes our banner and continues with _wait_for_peer_banner.
+//
+// The banner is the v2 prefix string, a 16-bit payload length, and a payload
+// holding our supported and required feature masks (64 bits each).
+CtPtr ProtocolV2::_banner_exchange(CtRef callback) {
+  ldout(cct, 20) << __func__ << dendl;
+  bannerExchangeCallback = &callback;
+
+  // payload: supported features followed by required features
+  bufferlist features_bl;
+  encode(static_cast<uint64_t>(CEPH_MSGR2_SUPPORTED_FEATURES), features_bl, 0);
+  encode(static_cast<uint64_t>(CEPH_MSGR2_REQUIRED_FEATURES), features_bl, 0);
+
+  // banner: prefix, payload length, payload
+  bufferlist banner;
+  banner.append(CEPH_BANNER_V2_PREFIX, strlen(CEPH_BANNER_V2_PREFIX));
+  encode(static_cast<uint16_t>(features_bl.length()), banner, 0);
+  banner.claim_append(features_bl);
+
+  INTERCEPT(state == BANNER_CONNECTING ? 3 : 4);
+
+  return WRITE(banner, "banner", _wait_for_peer_banner);
+}
+
+// Issues the read for the fixed-size head of the peer's banner (the v2 prefix
+// string plus a __le16 length field); continues in _handle_peer_banner.
+CtPtr ProtocolV2::_wait_for_peer_banner() {
+  const unsigned head_len = sizeof(__le16) + strlen(CEPH_BANNER_V2_PREFIX);
+  return READ(head_len, _handle_peer_banner);
+}
+
+// Continuation for the banner head read.
+//
+// Faults on a read error or when the bytes do not start with the v2 banner
+// prefix (logging specifically when the peer sent the msgr V1 banner).
+// Otherwise decodes the 16-bit payload length that follows the prefix and
+// reads that many bytes, continuing in _handle_peer_banner_payload.
+CtPtr ProtocolV2::_handle_peer_banner(rx_buffer_t &&buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read peer banner failed r=" << r << " ("
+                  << cpp_strerror(r) << ")" << dendl;
+    return _fault();
+  }
+
+  const unsigned prefix_len = strlen(CEPH_BANNER_V2_PREFIX);
+  const bool prefix_ok =
+      memcmp(buffer->c_str(), CEPH_BANNER_V2_PREFIX, prefix_len) == 0;
+
+  if (!prefix_ok) {
+    const bool is_v1_banner =
+        memcmp(buffer->c_str(), CEPH_BANNER, strlen(CEPH_BANNER)) == 0;
+    if (is_v1_banner) {
+      lderr(cct) << __func__ << " peer " << *connection->peer_addrs
+                 << " is using msgr V1 protocol" << dendl;
+    } else {
+      ldout(cct, 1) << __func__ << " accept peer sent bad banner" << dendl;
+    }
+    return _fault();
+  }
+
+  // narrow the buffer to the length field that follows the prefix
+  buffer->set_offset(prefix_len);
+  buffer->set_length(sizeof(__le16));
+
+  bufferlist len_bl;
+  len_bl.push_back(std::move(buffer));
+
+  uint16_t payload_len;
+  auto it = len_bl.cbegin();
+  try {
+    decode(payload_len, it);
+  } catch (const buffer::error &) {
+    lderr(cct) << __func__ << " decode banner payload len failed " << dendl;
+    return _fault();
+  }
+
+  INTERCEPT(state == BANNER_CONNECTING ? 5 : 6);
+
+  return READ(payload_len, _handle_peer_banner_payload);
+}
+
+// Continuation for the banner payload read.
+//
+// Decodes the peer's supported/required feature masks and checks them against
+// ours in both directions; on a mismatch the connection is stopped and a
+// reset is queued.  On success it records the peer's supported features,
+// switches both frame assemblers to revision 1 when the peer supports it,
+// advances the state to HELLO_CONNECTING / HELLO_ACCEPTING and writes our
+// hello frame, continuing in read_frame.
+CtPtr ProtocolV2::_handle_peer_banner_payload(rx_buffer_t &&buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read peer banner payload failed r=" << r
+                  << " (" << cpp_strerror(r) << ")" << dendl;
+    return _fault();
+  }
+
+  uint64_t peer_supported;
+  uint64_t peer_required;
+
+  bufferlist payload;
+  payload.push_back(std::move(buffer));
+  auto it = payload.cbegin();
+  try {
+    decode(peer_supported, it);
+    decode(peer_required, it);
+  } catch (const buffer::error &) {
+    lderr(cct) << __func__ << " decode banner payload failed " << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 1) << __func__ << " supported=" << std::hex
+                << peer_supported << " required=" << std::hex
+                << peer_required << std::dec << dendl;
+
+  // Check feature bit compatibility
+
+  const uint64_t our_supported = CEPH_MSGR2_SUPPORTED_FEATURES;
+  const uint64_t our_required = CEPH_MSGR2_REQUIRED_FEATURES;
+
+  const bool peer_lacks_ours =
+      (our_required & peer_supported) != our_required;
+  if (peer_lacks_ours) {
+    ldout(cct, 1) << __func__ << " peer does not support all required features"
+                  << " required=" << std::hex << our_required
+                  << " supported=" << std::hex << peer_supported
+                  << std::dec << dendl;
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;
+  }
+
+  const bool we_lack_peers =
+      (our_supported & peer_required) != peer_required;
+  if (we_lack_peers) {
+    ldout(cct, 1) << __func__ << " we do not support all peer required features"
+                  << " required=" << std::hex << peer_required
+                  << " supported=" << our_supported << std::dec << dendl;
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;
+  }
+
+  this->peer_supported_features = peer_supported;
+  if (peer_required == 0) {
+    this->connection_features = msgr2_required;
+  }
+
+  // if the peer supports msgr2.1, switch to it
+  const bool use_rev1 = HAVE_MSGR2_FEATURE(peer_supported, REVISION_1);
+  tx_frame_asm.set_is_rev1(use_rev1);
+  rx_frame_asm.set_is_rev1(use_rev1);
+
+  if (state == BANNER_CONNECTING) {
+    state = HELLO_CONNECTING;
+  } else {
+    ceph_assert(state == BANNER_ACCEPTING);
+    state = HELLO_ACCEPTING;
+  }
+
+  auto hello_frame = HelloFrame::Encode(messenger->get_mytype(),
+                                        connection->target_addr);
+
+  INTERCEPT(state == HELLO_CONNECTING ? 7 : 8);
+
+  return WRITE(hello_frame, "hello frame", read_frame);
+}
+
+// Handles the peer's HELLO frame during the hello exchange.
+//
+// Valid only in HELLO_CONNECTING / HELLO_ACCEPTING; otherwise faults.
+// When the peer type is still unknown (accepting side) it records the peer's
+// entity type and picks the matching policy; otherwise (connecting side) it
+// verifies the advertised type and resets the connection on mismatch.
+// If the messenger has no usable address yet, it learns one either from what
+// the peer reports or from the local socket name, dropping connection->lock
+// around learned_addr().  Finally it hands control to the callback stored in
+// bannerExchangeCallback.
+//
+// Returns the next continuation, or nullptr when processing stops here.
+CtPtr ProtocolV2::handle_hello(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != HELLO_CONNECTING && state != HELLO_ACCEPTING) {
+    lderr(cct) << __func__ << " not in hello exchange state!" << dendl;
+    return _fault();
+  }
+
+  auto hello = HelloFrame::Decode(payload);
+
+  ldout(cct, 5) << __func__ << " received hello:"
+                << " peer_type=" << (int)hello.entity_type()
+                << " peer_addr_for_me=" << hello.peer_addr() << dendl;
+
+  // Zero-initialized so that a failing getsockname() leaves a well-defined
+  // (unspecified-family) address behind instead of indeterminate bytes.
+  sockaddr_storage ss = {};
+  socklen_t len = sizeof(ss);
+  if (getsockname(connection->cs.fd(), (sockaddr *)&ss, &len) < 0) {
+    const int err = errno;  // capture before logging can disturb errno
+    ldout(cct, 1) << __func__ << " getsockname failed: " << cpp_strerror(err)
+                  << dendl;
+  }
+  ldout(cct, 5) << __func__ << " getsockname says I am " << (sockaddr *)&ss
+                << " when talking to " << connection->target_addr << dendl;
+
+  if (connection->get_peer_type() == -1) {
+    connection->set_peer_type(hello.entity_type());
+
+    ceph_assert(state == HELLO_ACCEPTING);
+    connection->policy = messenger->get_policy(hello.entity_type());
+    ldout(cct, 10) << __func__ << " accept of host_type "
+                   << (int)hello.entity_type()
+                   << ", policy.lossy=" << connection->policy.lossy
+                   << " policy.server=" << connection->policy.server
+                   << " policy.standby=" << connection->policy.standby
+                   << " policy.resetcheck=" << connection->policy.resetcheck
+                   << dendl;
+  } else {
+    ceph_assert(state == HELLO_CONNECTING);
+    if (connection->get_peer_type() != hello.entity_type()) {
+      ldout(cct, 1) << __func__ << " connection peer type does not match what"
+                    << " peer advertises " << connection->get_peer_type()
+                    << " != " << (int)hello.entity_type() << dendl;
+      stop();
+      connection->dispatch_queue->queue_reset(connection);
+      return nullptr;
+    }
+  }
+
+  if (messenger->get_myaddrs().empty() ||
+      messenger->get_myaddrs().front().is_blank_ip()) {
+    entity_addr_t a;
+    if (cct->_conf->ms_learn_addr_from_peer) {
+      ldout(cct, 1) << __func__ << " peer " << connection->target_addr
+                    << " says I am " << hello.peer_addr() << " (socket says "
+                    << (sockaddr*)&ss << ")" << dendl;
+      a = hello.peer_addr();
+    } else {
+      ldout(cct, 1) << __func__ << " socket to  " << connection->target_addr
+                    << " says I am " << (sockaddr*)&ss
+                    << " (peer says " << hello.peer_addr() << ")" << dendl;
+      a.set_sockaddr((sockaddr *)&ss);
+    }
+    a.set_type(entity_addr_t::TYPE_MSGR2); // anything but NONE; learned_addr ignores this
+    a.set_port(0);
+    // learned_addr() is called without connection->lock held
+    connection->lock.unlock();
+    messenger->learned_addr(a);
+    if (cct->_conf->ms_inject_internal_delays &&
+        cct->_conf->ms_inject_socket_failures) {
+      if (rand() % cct->_conf->ms_inject_socket_failures == 0) {
+        ldout(cct, 10) << __func__ << " sleep for "
+                       << cct->_conf->ms_inject_internal_delays << dendl;
+        utime_t t;
+        t.set_from_double(cct->_conf->ms_inject_internal_delays);
+        t.sleep();
+      }
+    }
+    connection->lock.lock();
+    // the lock was dropped above, so the state may have moved on meanwhile
+    if (state != HELLO_CONNECTING) {
+      ldout(cct, 1) << __func__
+                    << " state changed while learned_addr, mark_down or "
+                    << " replacing must be happened just now" << dendl;
+      return nullptr;
+    }
+  }
+
+  CtPtr callback;
+  callback = bannerExchangeCallback;
+  bannerExchangeCallback = nullptr;
+  ceph_assert(callback);
+  return callback;
+}
+
+// Begins reading the next frame: resets the per-frame receive buffers and
+// issues a read for the preamble, continuing in
+// handle_read_frame_preamble_main.  Does nothing once the state is CLOSED.
+CtPtr ProtocolV2::read_frame() {
+  if (state == CLOSED) {
+    return nullptr;
+  }
+
+  ldout(cct, 20) << __func__ << dendl;
+
+  // drop whatever is left over from the previous frame
+  rx_segments_data.clear();
+  rx_epilogue.clear();
+  rx_preamble.clear();
+
+  return READ(rx_frame_asm.get_preamble_onwire_len(),
+              handle_read_frame_preamble_main);
+}
+
+// Continuation for the frame preamble read.
+//
+// Stores the preamble bytes, lets the receive frame assembler disassemble
+// them to obtain the frame tag (next_tag), and faults on a read error, a
+// FrameError or an authentication-tag failure.  MESSAGE frames are only
+// accepted in READY state and go through the throttle chain first; every
+// other tag proceeds straight to reading the frame's segments.
+CtPtr ProtocolV2::handle_read_frame_preamble_main(rx_buffer_t &&buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read frame preamble failed r=" << r
+                  << " (" << cpp_strerror(r) << ")" << dendl;
+    return _fault();
+  }
+
+  rx_preamble.push_back(std::move(buffer));
+
+  ldout(cct, 30) << __func__ << " preamble\n";
+  rx_preamble.hexdump(*_dout);
+  *_dout << dendl;
+
+  try {
+    next_tag = rx_frame_asm.disassemble_preamble(rx_preamble);
+  } catch (const FrameError& e) {
+    ldout(cct, 1) << __func__ << " " << e.what() << dendl;
+    return _fault();
+  } catch (const ceph::crypto::onwire::MsgAuthError&) {
+    // leading space keeps the text from running into the function name
+    ldout(cct, 1) << __func__ << " bad auth tag" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 25) << __func__ << " disassembled preamble " << rx_frame_asm
+                 << dendl;
+
+  if (session_stream_handlers.rx) {
+    ldout(cct, 30) << __func__ << " preamble after decrypt\n";
+    rx_preamble.hexdump(*_dout);
+    *_dout << dendl;
+  }
+
+  // does it need throttle?
+  if (next_tag == Tag::MESSAGE) {
+    if (state != READY) {
+      lderr(cct) << __func__ << " not in ready state!" << dendl;
+      return _fault();
+    }
+    state = THROTTLE_MESSAGE;
+    return CONTINUE(throttle_message);
+  } else {
+    return read_frame_segment();
+  }
+}
+
+// Routes a fully received frame by its tag: MESSAGE frames go to
+// handle_message(), every other known tag to handle_frame_payload(), and an
+// unknown tag faults the connection.
+CtPtr ProtocolV2::handle_read_frame_dispatch() {
+  ldout(cct, 10) << __func__
+                 << " tag=" << static_cast<uint32_t>(next_tag) << dendl;
+
+  if (next_tag == Tag::MESSAGE) {
+    return handle_message();
+  }
+
+  switch (next_tag) {
+    case Tag::HELLO:
+    case Tag::AUTH_REQUEST:
+    case Tag::AUTH_BAD_METHOD:
+    case Tag::AUTH_REPLY_MORE:
+    case Tag::AUTH_REQUEST_MORE:
+    case Tag::AUTH_DONE:
+    case Tag::AUTH_SIGNATURE:
+    case Tag::CLIENT_IDENT:
+    case Tag::SERVER_IDENT:
+    case Tag::IDENT_MISSING_FEATURES:
+    case Tag::SESSION_RECONNECT:
+    case Tag::SESSION_RESET:
+    case Tag::SESSION_RETRY:
+    case Tag::SESSION_RETRY_GLOBAL:
+    case Tag::SESSION_RECONNECT_OK:
+    case Tag::KEEPALIVE2:
+    case Tag::KEEPALIVE2_ACK:
+    case Tag::ACK:
+    case Tag::WAIT:
+      return handle_frame_payload();
+    default:
+      break;
+  }
+
+  lderr(cct) << __func__
+             << " received unknown tag=" << static_cast<uint32_t>(next_tag)
+             << dendl;
+  return _fault();
+}
+
+// Reads the next segment of the current frame.
+//
+// Appends an empty slot to rx_segments_data for the segment.  A segment with
+// zero on-wire length needs no read and goes straight to
+// _handle_read_frame_segment(); otherwise an aligned receive buffer is
+// allocated and filled, continuing in handle_read_frame_segment.  Faults if
+// the aligned allocation throws std::bad_alloc.
+CtPtr ProtocolV2::read_frame_segment() {
+  const size_t idx = rx_segments_data.size();
+  ldout(cct, 20) << __func__ << " seg_idx=" << idx << dendl;
+  rx_segments_data.emplace_back();
+
+  const uint32_t onwire_len = rx_frame_asm.get_segment_onwire_len(idx);
+  if (!onwire_len) {
+    return _handle_read_frame_segment();
+  }
+
+  const uint16_t align = rx_frame_asm.get_segment_align(idx);
+  rx_buffer_t segment_buf;
+  try {
+    segment_buf = buffer::ptr_node::create(
+        buffer::create_aligned(onwire_len, align));
+  } catch (const std::bad_alloc&) {
+    // Catching because of potential issues with satisfying alignment.
+    ldout(cct, 1) << __func__ << " can't allocate aligned rx_buffer"
+                  << " len=" << onwire_len
+                  << " align=" << align
+                  << dendl;
+    return _fault();
+  }
+
+  return READ_RXBUF(std::move(segment_buf), handle_read_frame_segment);
+}
+
+// Continuation for a segment read: on success stores the received buffer in
+// the most recent rx_segments_data slot and moves on; on a read error logs
+// and faults.
+CtPtr ProtocolV2::handle_read_frame_segment(rx_buffer_t &&rx_buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r >= 0) {
+    rx_segments_data.back().push_back(std::move(rx_buffer));
+    return _handle_read_frame_segment();
+  }
+
+  ldout(cct, 1) << __func__ << " read frame segment failed r=" << r << " ("
+                << cpp_strerror(r) << ")" << dendl;
+  return _fault();
+}
+
+// Decides what follows a segment: keep reading segments until as many as the
+// frame assembler expects have been collected, then read the epilogue (or
+// skip the read entirely when the epilogue has zero on-wire length).
+CtPtr ProtocolV2::_handle_read_frame_segment() {
+  if (rx_segments_data.size() != rx_frame_asm.get_num_segments()) {
+    // TODO: for makeshift only. This will be more generic and throttled
+    return read_frame_segment();
+  }
+
+  // OK, all segments planned to read are read. Can go with epilogue.
+  const uint32_t epilogue_len = rx_frame_asm.get_epilogue_onwire_len();
+  if (!epilogue_len) {
+    return _handle_read_frame_epilogue_main();
+  }
+  return READ(epilogue_len, handle_read_frame_epilogue_main);
+}
+
+// Dispatches a non-MESSAGE frame to the handler for its tag, passing the
+// last entry of rx_segments_data as the payload.  Aborts on a tag that has no
+// handler here.
+CtPtr ProtocolV2::handle_frame_payload() {
+  ceph_assert(!rx_segments_data.empty());
+  auto& frame_payload = rx_segments_data.back();
+
+  ldout(cct, 30) << __func__ << "\n";
+  frame_payload.hexdump(*_dout);
+  *_dout << dendl;
+
+  switch (next_tag) {
+    // hello exchange
+    case Tag::HELLO: return handle_hello(frame_payload);
+
+    // authentication
+    case Tag::AUTH_REQUEST: return handle_auth_request(frame_payload);
+    case Tag::AUTH_BAD_METHOD: return handle_auth_bad_method(frame_payload);
+    case Tag::AUTH_REPLY_MORE: return handle_auth_reply_more(frame_payload);
+    case Tag::AUTH_REQUEST_MORE:
+      return handle_auth_request_more(frame_payload);
+    case Tag::AUTH_DONE: return handle_auth_done(frame_payload);
+    case Tag::AUTH_SIGNATURE: return handle_auth_signature(frame_payload);
+
+    // identification and session handling
+    case Tag::CLIENT_IDENT: return handle_client_ident(frame_payload);
+    case Tag::SERVER_IDENT: return handle_server_ident(frame_payload);
+    case Tag::IDENT_MISSING_FEATURES:
+      return handle_ident_missing_features(frame_payload);
+    case Tag::SESSION_RECONNECT: return handle_reconnect(frame_payload);
+    case Tag::SESSION_RESET: return handle_session_reset(frame_payload);
+    case Tag::SESSION_RETRY: return handle_session_retry(frame_payload);
+    case Tag::SESSION_RETRY_GLOBAL:
+      return handle_session_retry_global(frame_payload);
+    case Tag::SESSION_RECONNECT_OK:
+      return handle_reconnect_ok(frame_payload);
+
+    // established-session control frames
+    case Tag::KEEPALIVE2: return handle_keepalive2(frame_payload);
+    case Tag::KEEPALIVE2_ACK: return handle_keepalive2_ack(frame_payload);
+    case Tag::ACK: return handle_message_ack(frame_payload);
+    case Tag::WAIT: return handle_wait(frame_payload);
+
+    default:
+      ceph_abort();
+  }
+  return nullptr;
+}
+
+// Moves the connection into READY.
+//
+// Clears the reconnecting/replacing flags, re-arms the inactivity tick
+// timer, enables writing (waking the write handler when messages are already
+// queued), asks the connection to start its delay thread if needed, sets the
+// state to READY and continues with read_frame.
+CtPtr ProtocolV2::ready() {
+  ldout(cct, 25) << __func__ << dendl;
+
+  replacing = false;
+  reconnecting = false;
+
+  // make sure no pending tick timer
+  if (connection->last_tick_id) {
+    connection->center->delete_time_event(connection->last_tick_id);
+  }
+  connection->last_tick_id = connection->center->create_time_event(
+      connection->inactive_timeout_us, connection->tick_handler);
+
+  {
+    std::lock_guard<std::mutex> write_guard(connection->write_lock);
+    can_write = true;
+    const bool have_queued = !out_queue.empty();
+    if (have_queued) {
+      connection->center->dispatch_event_external(connection->write_handler);
+    }
+  }
+
+  connection->maybe_start_delay_thread();
+
+  state = READY;
+  ldout(cct, 1) << __func__ << " entity=" << peer_name << " client_cookie="
+                << std::hex << client_cookie << " server_cookie="
+                << server_cookie << std::dec << " in_seq=" << in_seq
+                << " out_seq=" << out_seq << dendl;
+
+  INTERCEPT(15);
+
+  return CONTINUE(read_frame);
+}
+
+// Continuation for the epilogue read: on success stores the epilogue bytes
+// and proceeds to _handle_read_frame_epilogue_main(); on a read error logs
+// and faults.
+CtPtr ProtocolV2::handle_read_frame_epilogue_main(rx_buffer_t &&buffer, int r)
+{
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r >= 0) {
+    rx_epilogue.push_back(std::move(buffer));
+    return _handle_read_frame_epilogue_main();
+  }
+
+  ldout(cct, 1) << __func__ << " read frame epilogue failed r=" << r
+                << " (" << cpp_strerror(r) << ")" << dendl;
+  return _fault();
+}
+
+// Finishes reception of a frame once all segments and the epilogue are in.
+//
+// Lets the receive frame assembler disassemble the first and the remaining
+// segments; a FrameError or an authentication-tag failure faults the
+// connection.  A frame the assembler reports as aborted is dropped: the
+// throttle is reset, the state returns to READY and reading resumes.
+// Otherwise the frame is dispatched by tag.
+CtPtr ProtocolV2::_handle_read_frame_epilogue_main() {
+  bool aborted;
+  try {
+    rx_frame_asm.disassemble_first_segment(rx_preamble, rx_segments_data[0]);
+    aborted = !rx_frame_asm.disassemble_remaining_segments(
+        rx_segments_data.data(), rx_epilogue);
+  } catch (const FrameError& e) {
+    ldout(cct, 1) << __func__ << " " << e.what() << dendl;
+    return _fault();
+  } catch (const ceph::crypto::onwire::MsgAuthError&) {
+    // leading space keeps the text from running into the function name
+    ldout(cct, 1) << __func__ << " bad auth tag" << dendl;
+    return _fault();
+  }
+
+  // we do have a mechanism that allows transmitter to start sending message
+  // and abort after putting entire data field on wire. This will be used by
+  // the kernel client to avoid unnecessary buffering.
+  if (aborted) {
+    reset_throttle();
+    state = READY;
+    return CONTINUE(read_frame);
+  }
+  return handle_read_frame_dispatch();
+}
+
+CtPtr ProtocolV2::handle_message() {  /*
+   * Turns a fully received MESSAGE frame into a Message and delivers it.
+   *
+   * Expects state == THROTTLE_DONE.  Decodes the frame via decode_message()
+   * (faulting if that fails), stamps throttle and timing information on the
+   * message, discards it when its seq is not newer than in_seq, and logs
+   * (optionally asserts) when sequence numbers were skipped.  An accepted
+   * message updates in_seq, counts towards ack_left on non-lossy
+   * connections, and is delivered through the delay queue, fast dispatch, or
+   * the dispatch queue.  The ack_seq carried in the header is then applied
+   * to the sent list, and reading continues with read_frame unless the state
+   * changed in the meantime.
+   */
+  ldout(cct, 20) << __func__ << dendl;
+  ceph_assert(state == THROTTLE_DONE);
+
+#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
+  ltt_recv_stamp = ceph_clock_now();
+#endif
+  recv_stamp = ceph_clock_now();
+
+  const size_t cur_msg_size = get_current_msg_size();
+  auto msg_frame = MessageFrame::Decode(rx_segments_data);
+
+  // XXX: paranoid copy just to avoid oops
+  ceph_msg_header2 current_header = msg_frame.header();
+
+  ldout(cct, 5) << __func__
+		<< " got " << msg_frame.front_len()
+		<< " + " << msg_frame.middle_len()
+		<< " + " << msg_frame.data_len()
+		<< " byte message."
+		<< " envelope type=" << current_header.type
+		<< " src " << peer_name
+		<< " off " << current_header.data_off
+                << dendl;
+  /* Build a ceph_msg_header / ceph_msg_footer pair from the v2 header fields
+   * and the frame's segment lengths; both are passed to decode_message(). */
+  INTERCEPT(16);
+  ceph_msg_header header{current_header.seq,
+                         current_header.tid,
+                         current_header.type,
+                         current_header.priority,
+                         current_header.version,
+                         init_le32(msg_frame.front_len()),
+                         init_le32(msg_frame.middle_len()),
+                         init_le32(msg_frame.data_len()),
+                         current_header.data_off,
+                         peer_name,
+                         current_header.compat_version,
+                         current_header.reserved,
+                         init_le32(0)};
+  ceph_msg_footer footer{init_le32(0), init_le32(0),
+	                 init_le32(0), init_le64(0), current_header.flags};
+
+  Message *message = decode_message(cct, 0, header, footer,
+      msg_frame.front(),
+      msg_frame.middle(),
+      msg_frame.data(),
+      connection);
+  if (!message) {
+    ldout(cct, 1) << __func__ << " decode message failed " << dendl;
+    return _fault();
+  } else {
+    state = READ_MESSAGE_COMPLETE;
+  }
+
+  INTERCEPT(17);
+  // attach the policy throttlers to the message
+  message->set_byte_throttler(connection->policy.throttler_bytes);
+  message->set_message_throttler(connection->policy.throttler_messages);
+
+  // store reservation size in message, so we don't get confused
+  // by messages entering the dispatch queue through other paths.
+  message->set_dispatch_throttle_size(cur_msg_size);
+
+  message->set_recv_stamp(recv_stamp);
+  message->set_throttle_stamp(throttle_stamp);
+  message->set_recv_complete_stamp(ceph_clock_now());
+
+  // check received seq#.  if it is old, drop the message.
+  // note that incoming messages may skip ahead.  this is convenient for the
+  // client side queueing because messages can't be renumbered, but the (kernel)
+  // client will occasionally pull a message out of the sent queue to send
+  // elsewhere.  in that case it doesn't matter if we "got" it or not.
+  uint64_t cur_seq = in_seq;
+  if (message->get_seq() <= cur_seq) {
+    ldout(cct, 0) << __func__ << " got old message " << message->get_seq()
+                  << " <= " << cur_seq << " " << message << " " << *message
+                  << ", discarding" << dendl;
+    message->put();
+    if (connection->has_feature(CEPH_FEATURE_RECONNECT_SEQ) &&
+        cct->_conf->ms_die_on_old_message) {
+      ceph_assert(0 == "old msgs despite reconnect_seq feature");
+    }
+    return nullptr;  // NOTE(review): state is still READ_MESSAGE_COMPLETE here — confirm reading resumes
+  }
+  if (message->get_seq() > cur_seq + 1) {
+    ldout(cct, 0) << __func__ << " missed message?  skipped from seq "
+                  << cur_seq << " to " << message->get_seq() << dendl;
+    if (cct->_conf->ms_die_on_skipped_message) {
+      ceph_assert(0 == "skipped incoming seq");
+    }
+  }
+
+#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
+  if (message->get_type() == CEPH_MSG_OSD_OP ||
+      message->get_type() == CEPH_MSG_OSD_OPREPLY) {
+    utime_t ltt_processed_stamp = ceph_clock_now();
+    double usecs_elapsed =
+        (ltt_processed_stamp.to_nsec() - ltt_recv_stamp.to_nsec()) / 1000;
+    ostringstream buf;
+    if (message->get_type() == CEPH_MSG_OSD_OP)
+      OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OP",
+                           false);
+    else
+      OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OPREPLY",
+                           false);
+  }
+#endif
+
+  // note last received message.
+  in_seq = message->get_seq();
+  ldout(cct, 5) << __func__ << " received message m=" << message
+                << " seq=" << message->get_seq()
+                << " from=" << message->get_source() << " type=" << header.type
+                << " " << *message << dendl;
+  // on non-lossy connections count the message in ack_left and wake the writer below
+  bool need_dispatch_writer = false;
+  if (!connection->policy.lossy) {
+    ack_left++;
+    need_dispatch_writer = true;
+  }
+
+  state = READY;
+
+  connection->logger->inc(l_msgr_recv_messages);
+  connection->logger->inc(l_msgr_recv_bytes,
+                          rx_frame_asm.get_frame_onwire_len());
+
+  messenger->ms_fast_preprocess(message);
+  auto fast_dispatch_time = ceph::mono_clock::now();
+  connection->logger->tinc(l_msgr_running_recv_time,
+                           fast_dispatch_time - connection->recv_start_time);
+  if (connection->delay_state) {  // a delay_state exists: queue there with the chosen delay
+    double delay_period = 0;
+    if (rand() % 10000 < cct->_conf->ms_inject_delay_probability * 10000.0) {
+      delay_period =
+          cct->_conf->ms_inject_delay_max * (double)(rand() % 10000) / 10000.0;
+      ldout(cct, 1) << "queue_received will delay after "
+                    << (ceph_clock_now() + delay_period) << " on " << message
+                    << " " << *message << dendl;
+    }
+    connection->delay_state->queue(delay_period, message);
+  } else if (messenger->ms_can_fast_dispatch(message)) {
+    connection->lock.unlock();  // fast dispatch runs without connection->lock held
+    connection->dispatch_queue->fast_dispatch(message);
+    connection->recv_start_time = ceph::mono_clock::now();
+    connection->logger->tinc(l_msgr_running_fast_dispatch_time,
+                             connection->recv_start_time - fast_dispatch_time);
+    connection->lock.lock();
+  } else {
+    connection->dispatch_queue->enqueue(message, message->get_priority(),
+                                        connection->conn_id);
+  }
+  // apply the ack_seq carried in this message's header to our sent list
+  handle_message_ack(current_header.ack_seq);
+
+  // we might have been reused by another connection
+  // let's check if that is the case
+  if (state != READY) {
+    // yes, that was the case, let's do nothing
+    return nullptr;
+  }
+
+  if (need_dispatch_writer && connection->is_connected()) {
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+
+  return CONTINUE(read_frame);
+}
+
+
+// First throttle stage for an incoming message: takes one slot from the
+// policy's message throttler, if there is one.
+//
+// When the slot cannot be taken, a 1000-unit wakeup time event is registered
+// (unless one is already pending) and processing stops by returning nullptr.
+// Otherwise the state advances to THROTTLE_BYTES.
+CtPtr ProtocolV2::throttle_message() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const auto &msg_throttler = connection->policy.throttler_messages;
+  if (msg_throttler) {
+    ldout(cct, 10) << __func__ << " wants " << 1
+                   << " message from policy throttler "
+                   << msg_throttler->get_current()
+                   << "/" << msg_throttler->get_max()
+                   << dendl;
+    if (!msg_throttler->get_or_fail()) {
+      ldout(cct, 10) << __func__ << " wants 1 message from policy throttle "
+                     << msg_throttler->get_current()
+                     << "/" << msg_throttler->get_max()
+                     << " failed, just wait." << dendl;
+      // following thread pool deal with th full message queue isn't a
+      // short time, so we can wait a ms.
+      if (connection->register_time_events.empty()) {
+        const auto wakeup_id = connection->center->create_time_event(
+            1000, connection->wakeup_handler);
+        connection->register_time_events.insert(wakeup_id);
+      }
+      return nullptr;
+    }
+  }
+
+  state = THROTTLE_BYTES;
+  return CONTINUE(throttle_bytes);
+}
+
+// Second throttle stage: reserves the current message's size from the
+// policy's byte throttler, if there is one and the size is non-zero.
+//
+// When the reservation fails, a 1000-unit wakeup time event is registered
+// (unless one is already pending) and processing stops by returning nullptr.
+// Otherwise the state advances to THROTTLE_DISPATCH_QUEUE.
+CtPtr ProtocolV2::throttle_bytes() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const size_t msg_size = get_current_msg_size();
+  if (msg_size && connection->policy.throttler_bytes) {
+    const auto &byte_throttler = connection->policy.throttler_bytes;
+    ldout(cct, 10) << __func__ << " wants " << msg_size
+                   << " bytes from policy throttler "
+                   << byte_throttler->get_current() << "/"
+                   << byte_throttler->get_max() << dendl;
+    if (!byte_throttler->get_or_fail(msg_size)) {
+      ldout(cct, 10) << __func__ << " wants " << msg_size
+                     << " bytes from policy throttler "
+                     << byte_throttler->get_current()
+                     << "/" << byte_throttler->get_max()
+                     << " failed, just wait." << dendl;
+      // following thread pool deal with th full message queue isn't a
+      // short time, so we can wait a ms.
+      if (connection->register_time_events.empty()) {
+        const auto wakeup_id = connection->center->create_time_event(
+            1000, connection->wakeup_handler);
+        connection->register_time_events.insert(wakeup_id);
+      }
+      return nullptr;
+    }
+  }
+
+  state = THROTTLE_DISPATCH_QUEUE;
+  return CONTINUE(throttle_dispatch_queue);
+}
+
+// Throttle step for the dispatch queue: reserve the current message's size
+// from the dispatch queue's throttler.  On failure a wakeup timer is armed
+// (unless one is already registered) and nullptr is returned.  On success
+// the throttle timestamp is taken, the state becomes THROTTLE_DONE and the
+// frame segment is read.
+CtPtr ProtocolV2::throttle_dispatch_queue() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const size_t wanted = get_current_msg_size();
+  const bool throttled =
+      wanted != 0 &&
+      !connection->dispatch_queue->dispatch_throttler.get_or_fail(wanted);
+  if (throttled) {
+    ldout(cct, 10)
+        << __func__ << " wants " << wanted
+        << " bytes from dispatch throttle "
+        << connection->dispatch_queue->dispatch_throttler.get_current() << "/"
+        << connection->dispatch_queue->dispatch_throttler.get_max()
+        << " failed, just wait." << dendl;
+    // The thread pool working through a full message queue will not be done
+    // quickly, so it is fine to wait a ms before trying again.
+    if (connection->register_time_events.empty()) {
+      const auto timer_id = connection->center->create_time_event(
+          1000, connection->wakeup_handler);
+      connection->register_time_events.insert(timer_id);
+    }
+    return nullptr;
+  }
+
+  throttle_stamp = ceph_clock_now();
+  state = THROTTLE_DONE;
+
+  return read_frame_segment();
+}
+
+// Handles an incoming KEEPALIVE2 frame.  An ack frame echoing the peer's
+// timestamp is appended under write_lock, the arrival time is recorded via
+// set_last_keepalive(), and the write handler is scheduled if the connection
+// is connected.  Faults when not in READY state or when the ack cannot be
+// appended.
+CtPtr ProtocolV2::handle_keepalive2(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != READY) {
+    lderr(cct) << __func__ << " not in ready state!" << dendl;
+    return _fault();
+  }
+
+  auto ka_frame = KeepAliveFrame::Decode(payload);
+
+  ldout(cct, 30) << __func__ << " got KEEPALIVE2 tag ..." << dendl;
+
+  // Encode and queue the ack with write_lock held; the lock is always
+  // released before _fault() is invoked.
+  connection->write_lock.lock();
+  auto ack_frame = KeepAliveFrameAck::Encode(ka_frame.timestamp());
+  const bool ack_queued = append_frame(ack_frame);
+  connection->write_lock.unlock();
+  if (!ack_queued) {
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " got KEEPALIVE2 "
+                 << ka_frame.timestamp() << dendl;
+  connection->set_last_keepalive(ceph_clock_now());
+
+  if (is_connected()) {
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+
+  return CONTINUE(read_frame);
+}
+
+// Handles a KEEPALIVE2_ACK frame: the timestamp it carries is stored as the
+// connection's last keepalive ack and reading continues with the next frame.
+// Faults when the protocol is not in READY state.
+CtPtr ProtocolV2::handle_keepalive2_ack(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state == READY) {
+    auto ack = KeepAliveFrameAck::Decode(payload);
+    connection->set_last_keepalive_ack(ack.timestamp());
+    ldout(cct, 20) << __func__ << " got KEEPALIVE_ACK" << dendl;
+
+    return CONTINUE(read_frame);
+  }
+
+  lderr(cct) << __func__ << " not in ready state!" << dendl;
+  return _fault();
+}
+
+// Handles an ACK frame: decodes it and forwards the acknowledged sequence
+// number to the seq-based handle_message_ack() overload, then continues
+// reading frames.  Faults when the protocol is not in READY state.
+CtPtr ProtocolV2::handle_message_ack(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state == READY) {
+    auto ack_frame = AckFrame::Decode(payload);
+    handle_message_ack(ack_frame.seq());
+    return CONTINUE(read_frame);
+  }
+
+  lderr(cct) << __func__ << " not in ready state!" << dendl;
+  return _fault();
+}
+
+/* Client Protocol Methods */
+
+// Client side: enters BANNER_CONNECTING, obtains a global sequence number
+// from the messenger and starts the banner exchange, which continues in
+// post_client_banner_exchange().
+CtPtr ProtocolV2::start_client_banner_exchange()
+{
+  ldout(cct, 20) << __func__ << dendl;
+
+  // Test hook; may return from this function early.
+  INTERCEPT(1);
+
+  state = BANNER_CONNECTING;
+  global_seq = messenger->get_global_seq();
+
+  return _banner_exchange(CONTINUATION(post_client_banner_exchange));
+}
+
+// Continuation handed to _banner_exchange() by the client: switches to
+// AUTH_CONNECTING and sends the auth request.
+CtPtr ProtocolV2::post_client_banner_exchange()
+{
+  ldout(cct, 20) << __func__ << dendl;
+  state = AUTH_CONNECTING;
+  return send_auth_request();
+}
+
+// Asks the messenger's auth client for an auth request payload and sends it
+// to the peer as an AuthRequestFrame.  connection->lock is released while
+// the auth client runs, so the state is validated again once the lock has
+// been re-acquired.  A negative result from the auth client stops the
+// protocol and queues a reset for the connection.
+// Note: allowed_methods is not consulted by this function body.
+CtPtr ProtocolV2::send_auth_request(std::vector<uint32_t> &allowed_methods) {
+  ldout(cct, 20) << __func__ << " peer_type " << (int)connection->peer_type
+                 << " auth_client " << messenger->auth_client << dendl;
+  ceph_assert(messenger->auth_client);
+
+  bufferlist request_payload;
+  std::vector<uint32_t> modes;
+  // Keep our own reference to the auth metadata while the lock is dropped.
+  auto meta = auth_meta;
+
+  connection->lock.unlock();
+  const int rc = messenger->auth_client->get_auth_request(
+      connection, meta.get(), &meta->auth_method, &modes, &request_payload);
+  connection->lock.lock();
+
+  if (state != AUTH_CONNECTING) {
+    ldout(cct, 1) << __func__ << " state changed!" << dendl;
+    return _fault();
+  }
+
+  if (rc < 0) {
+    ldout(cct, 0) << __func__ << " get_initial_auth_request returned " << rc
+                  << dendl;
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;
+  }
+
+  INTERCEPT(9);
+
+  auto frame = AuthRequestFrame::Encode(auth_meta->auth_method, modes,
+                                        request_payload);
+  return WRITE(frame, "auth request", read_frame);
+}
+
+// Client side: the server rejected our auth method.  The rejection details
+// are passed to the auth client (with connection->lock released) and, if
+// that succeeds and we are still in AUTH_CONNECTING, a new auth request is
+// sent.  Any other outcome faults the connection.
+CtPtr ProtocolV2::handle_auth_bad_method(ceph::bufferlist &payload) {
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != AUTH_CONNECTING) {
+    lderr(cct) << __func__ << " not in auth connect state!" << dendl;
+    return _fault();
+  }
+
+  auto rejection = AuthBadMethodFrame::Decode(payload);
+  ldout(cct, 1) << __func__ << " method=" << rejection.method()
+                << " result " << cpp_strerror(rejection.result())
+                << ", allowed methods=" << rejection.allowed_methods()
+                << ", allowed modes=" << rejection.allowed_modes()
+                << dendl;
+  ceph_assert(messenger->auth_client);
+
+  // Keep our own reference to the auth metadata while the lock is dropped.
+  auto meta = auth_meta;
+  connection->lock.unlock();
+  const int rc = messenger->auth_client->handle_auth_bad_method(
+      connection, meta.get(),
+      rejection.method(), rejection.result(),
+      rejection.allowed_methods(), rejection.allowed_modes());
+  connection->lock.lock();
+
+  if (state != AUTH_CONNECTING) {
+    return _fault();
+  }
+  if (rc < 0) {
+    return _fault();
+  }
+  return send_auth_request(rejection.allowed_methods());
+}
+
+// Client side: the server asked for another authentication round.  The
+// received payload is handed to the auth client (with connection->lock
+// released) and its reply is sent back as an AuthRequestMoreFrame.  Faults
+// on a wrong state, a state change while unlocked, or an auth client error.
+CtPtr ProtocolV2::handle_auth_reply_more(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != AUTH_CONNECTING) {
+    lderr(cct) << __func__ << " not in auth connect state!" << dendl;
+    return _fault();
+  }
+
+  auto challenge = AuthReplyMoreFrame::Decode(payload);
+  ldout(cct, 5) << __func__
+                << " auth reply more len=" << challenge.auth_payload().length()
+                << dendl;
+  ceph_assert(messenger->auth_client);
+
+  ceph::bufferlist response;
+  // Keep our own reference to the auth metadata while the lock is dropped.
+  auto meta = auth_meta;
+  connection->lock.unlock();
+  const int rc = messenger->auth_client->handle_auth_reply_more(
+      connection, meta.get(), challenge.auth_payload(), &response);
+  connection->lock.lock();
+
+  if (state != AUTH_CONNECTING) {
+    ldout(cct, 1) << __func__ << " state changed!" << dendl;
+    return _fault();
+  }
+  if (rc < 0) {
+    lderr(cct) << __func__ << " auth_client handle_auth_reply_more returned "
+               << rc << dendl;
+    return _fault();
+  }
+
+  auto response_frame = AuthRequestMoreFrame::Encode(response);
+  return WRITE(response_frame, "auth request more", read_frame);
+}
+
+// Client side: the server reports that authentication is complete.  The
+// auth client validates the frame (with connection->lock released) and
+// fills in the session key and connection secret.  Afterwards the on-wire
+// stream handlers are created, the state becomes AUTH_CONNECTING_SIGN and
+// our pre-auth signature (computed over pre_auth.rxbuf) is sent.
+CtPtr ProtocolV2::handle_auth_done(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != AUTH_CONNECTING) {
+    lderr(cct) << __func__ << " not in auth connect state!" << dendl;
+    return _fault();
+  }
+
+  auto done_frame = AuthDoneFrame::Decode(payload);
+
+  ceph_assert(messenger->auth_client);
+  // Keep our own reference to the auth metadata while the lock is dropped.
+  auto meta = auth_meta;
+  connection->lock.unlock();
+  const int rc = messenger->auth_client->handle_auth_done(
+      connection, meta.get(),
+      done_frame.global_id(), done_frame.con_mode(), done_frame.auth_payload(),
+      &meta->session_key, &meta->connection_secret);
+  connection->lock.lock();
+
+  if (state != AUTH_CONNECTING) {
+    ldout(cct, 1) << __func__ << " state changed!" << dendl;
+    return _fault();
+  }
+  if (rc < 0) {
+    return _fault();
+  }
+
+  auth_meta->con_mode = done_frame.con_mode();
+  const bool peer_is_rev1 =
+      HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1);
+  session_stream_handlers = ceph::crypto::onwire::rxtx_t::create_handler_pair(
+      cct, *auth_meta, /*new_nonce_format=*/peer_is_rev1, /*crossed=*/false);
+
+  state = AUTH_CONNECTING_SIGN;
+
+  // The signature must be computed before the pre-auth receive buffer is
+  // cleared below; an empty session key yields a default-constructed digest.
+  const auto signature = auth_meta->session_key.empty()
+      ? sha256_digest_t()
+      : auth_meta->session_key.hmac_sha256(cct, pre_auth.rxbuf);
+  auto signature_frame = AuthSignatureFrame::Encode(signature);
+  pre_auth.enabled = false;
+  pre_auth.rxbuf.clear();
+  return WRITE(signature_frame, "auth signature", read_frame);
+}
+
+// Client side, after authentication: with a server cookie we reconnect to
+// the previous session, without one we start a new session by sending our
+// client identification.
+CtPtr ProtocolV2::finish_client_auth() {
+  if (server_cookie) {
+    // reconnecting to previous session
+    state = SESSION_RECONNECTING;
+    ceph_assert(connect_seq > 0);
+    return send_reconnect();
+  }
+
+  // no server cookie: brand new session
+  ceph_assert(connect_seq == 0);
+  state = SESSION_CONNECTING;
+  return send_client_ident();
+}
+
+// Client side: builds and sends the ClientIdentFrame that opens a new
+// session.  For non-lossy policies a random non-zero client cookie is
+// generated if none is set yet; lossy policies set CEPH_MSG_CONNECT_LOSSY in
+// the flags.
+CtPtr ProtocolV2::send_client_ident() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const bool lossy = connection->policy.lossy;
+
+  if (!lossy && !client_cookie) {
+    client_cookie = ceph::util::generate_random_number<uint64_t>(1, -1ll);
+  }
+
+  uint64_t flags = 0;
+  if (lossy) {
+    flags |= CEPH_MSG_CONNECT_LOSSY;
+  }
+
+  auto ident_frame = ClientIdentFrame::Encode(
+      messenger->get_myaddrs(),
+      connection->target_addr,
+      messenger->get_myname().num(),
+      global_seq,
+      connection->policy.features_supported,
+      connection->policy.features_required | msgr2_required,
+      flags,
+      client_cookie);
+
+  // Everything from features_supported through cookie is logged in hex.
+  ldout(cct, 5) << __func__ << " sending identification: "
+                << "addrs=" << messenger->get_myaddrs()
+                << " target=" << connection->target_addr
+                << " gid=" << messenger->get_myname().num()
+                << " global_seq=" << global_seq
+                << " features_supported=" << std::hex
+                << connection->policy.features_supported
+                << " features_required="
+                << (connection->policy.features_required | msgr2_required)
+                << " flags=" << flags
+                << " cookie=" << client_cookie << std::dec << dendl;
+
+  INTERCEPT(11);
+
+  return WRITE(ident_frame, "client ident", read_frame);
+}
+
+// Client side: sends a ReconnectFrame carrying our addresses, both session
+// cookies, the global and connect sequence numbers and in_seq.
+CtPtr ProtocolV2::send_reconnect()
+{
+  ldout(cct, 20) << __func__ << dendl;
+
+  auto frame = ReconnectFrame::Encode(
+      messenger->get_myaddrs(),
+      client_cookie,
+      server_cookie,
+      global_seq,
+      connect_seq,
+      in_seq);
+
+  // Cookies are logged in hex, sequence numbers in decimal.
+  ldout(cct, 5) << __func__ << " reconnect to session: client_cookie="
+                << std::hex << client_cookie << " server_cookie="
+                << server_cookie << std::dec
+                << " gs=" << global_seq << " cs=" << connect_seq
+                << " ms=" << in_seq << dendl;
+
+  INTERCEPT(13);
+
+  return WRITE(frame, "reconnect", read_frame);
+}
+
+// Client side: the server answered our client ident with the set of
+// features we are missing.  The feature mask is logged and the connection
+// faults in every case.
+CtPtr ProtocolV2::handle_ident_missing_features(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state == SESSION_CONNECTING) {
+    auto missing = IdentMissingFeaturesFrame::Decode(payload);
+    lderr(cct) << __func__
+               << " client does not support all server features: " << std::hex
+               << missing.features() << std::dec << dendl;
+  } else {
+    lderr(cct) << __func__ << " not in session connect state!" << dendl;
+  }
+
+  return _fault();
+}
+
+// Client side: the server refused our reconnect and asked for a session
+// reset.  A full reset goes through reset_session(); a partial one only
+// clears the server cookie, connect_seq and in_seq.  Either way a fresh
+// client ident is sent from SESSION_CONNECTING state.
+CtPtr ProtocolV2::handle_session_reset(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != SESSION_RECONNECTING) {
+    lderr(cct) << __func__ << " not in session reconnect state!" << dendl;
+    return _fault();
+  }
+
+  auto reset_frame = ResetFrame::Decode(payload);
+
+  ldout(cct, 1) << __func__ << " received session reset full="
+                << reset_frame.full() << dendl;
+
+  if (!reset_frame.full()) {
+    server_cookie = 0;
+    connect_seq = 0;
+    in_seq = 0;
+  } else {
+    reset_session();
+  }
+
+  state = SESSION_CONNECTING;
+  return send_client_ident();
+}
+
+// Client side: the server asked us to retry the reconnect.  Our connect_seq
+// is set to one past the value carried by the frame and the reconnect is
+// sent again.
+CtPtr ProtocolV2::handle_session_retry(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state == SESSION_RECONNECTING) {
+    auto retry_frame = RetryFrame::Decode(payload);
+    connect_seq = retry_frame.connect_seq() + 1;
+
+    ldout(cct, 1) << __func__
+                  << " received session retry connect_seq="
+                  << retry_frame.connect_seq()
+                  << ", inc to cs=" << connect_seq << dendl;
+
+    return send_reconnect();
+  }
+
+  lderr(cct) << __func__ << " not in session reconnect state!" << dendl;
+  return _fault();
+}
+
+// Client side: the server asked us to retry the reconnect with a different
+// global_seq.  The value from the frame is passed to the messenger's
+// get_global_seq() to choose a new one, then the reconnect is sent again.
+CtPtr ProtocolV2::handle_session_retry_global(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state == SESSION_RECONNECTING) {
+    auto retry_frame = RetryGlobalFrame::Decode(payload);
+    global_seq = messenger->get_global_seq(retry_frame.global_seq());
+
+    ldout(cct, 1) << __func__ << " received session retry global global_seq="
+                  << retry_frame.global_seq()
+                  << ", choose new gs=" << global_seq
+                  << dendl;
+
+    return send_reconnect();
+  }
+
+  lderr(cct) << __func__ << " not in session reconnect state!" << dendl;
+  return _fault();
+}
+
+// Client side: the server sent WAIT (connection race).  When received while
+// (re)connecting a session, the state is set to WAIT and the frame is
+// decoded before the connection is faulted; in any other state it faults
+// straight away.
+CtPtr ProtocolV2::handle_wait(ceph::bufferlist &payload) {
+  ldout(cct, 20) << __func__
+                 << " received WAIT (connection race)"
+                 << " payload.length()=" << payload.length()
+                 << dendl;
+
+  const bool session_connecting =
+      state == SESSION_CONNECTING || state == SESSION_RECONNECTING;
+  if (!session_connecting) {
+    lderr(cct) << __func__ << " not in session (re)connect state!" << dendl;
+    return _fault();
+  }
+
+  state = WAIT;
+  WaitFrame::Decode(payload);
+  return _fault();
+}
+
+// Client side: the server accepted our reconnect.  Requeued messages up to
+// the sequence number reported by the server are discarded, the backoff is
+// reset, the connect event is queued/delivered and the protocol moves to
+// ready().
+CtPtr ProtocolV2::handle_reconnect_ok(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != SESSION_RECONNECTING) {
+    lderr(cct) << __func__ << " not in session reconnect state!" << dendl;
+    return _fault();
+  }
+
+  auto ok_frame = ReconnectOkFrame::Decode(payload);
+  ldout(cct, 5) << __func__
+                << " reconnect accepted: sms=" << ok_frame.msg_seq()
+                << dendl;
+
+  out_seq = discard_requeued_up_to(out_seq, ok_frame.msg_seq());
+
+  // A successful reconnect clears any accumulated backoff.
+  backoff = utime_t();
+  ldout(cct, 10) << __func__ << " reconnect success " << connect_seq
+                 << ", lossy = " << connection->policy.lossy << ", features "
+                 << connection->get_features() << dendl;
+
+  if (connection->delay_state) {
+    ceph_assert(connection->delay_state->ready());
+  }
+
+  connection->dispatch_queue->queue_connect(connection);
+  messenger->ms_deliver_handle_fast_connect(connection);
+
+  return ready();
+}
+
+// Client side: the server answered our client ident with its own
+// identification.  After checking that the advertised addresses include the
+// address we targeted, the server's cookie, addresses, name, features,
+// global_seq and lossy flag are recorded, the connect event is
+// queued/delivered and the protocol moves to ready().
+CtPtr ProtocolV2::handle_server_ident(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != SESSION_CONNECTING) {
+    lderr(cct) << __func__ << " not in session connect state!" << dendl;
+    return _fault();
+  }
+
+  auto ident = ServerIdentFrame::Decode(payload);
+  ldout(cct, 5) << __func__ << " received server identification:"
+                << " addrs=" << ident.addrs()
+                << " gid=" << ident.gid()
+                << " global_seq=" << ident.global_seq()
+                << " features_supported=" << std::hex
+                << ident.supported_features()
+                << " features_required=" << ident.required_features()
+                << " flags=" << ident.flags()
+                << " cookie=" << ident.cookie() << std::dec << dendl;
+
+  // Is this the peer we meant to reach?  The check is deliberately lenient
+  // (membership in the advertised address vector), since the target address
+  // may have been parsed out of mon_host or something similar.
+  const bool is_intended_peer =
+      ident.addrs().contains(connection->target_addr);
+  if (!is_intended_peer) {
+    ldout(cct,1) << __func__ << " peer identifies as " << ident.addrs()
+                 << ", does not include " << connection->target_addr << dendl;
+    return _fault();
+  }
+
+  server_cookie = ident.cookie();
+
+  connection->set_peer_addrs(ident.addrs());
+  peer_name = entity_name_t(connection->get_peer_type(), ident.gid());
+  // Only features both sides support are enabled on the connection.
+  connection->set_features(ident.supported_features() &
+                           connection->policy.features_supported);
+  peer_global_seq = ident.global_seq();
+
+  connection->policy.lossy = ident.flags() & CEPH_MSG_CONNECT_LOSSY;
+
+  backoff = utime_t();
+  ldout(cct, 10) << __func__ << " connect success " << connect_seq
+                 << ", lossy = " << connection->policy.lossy << ", features "
+                 << connection->get_features() << dendl;
+
+  if (connection->delay_state) {
+    ceph_assert(connection->delay_state->ready());
+  }
+
+  connection->dispatch_queue->queue_connect(connection);
+  messenger->ms_deliver_handle_fast_connect(connection);
+
+  return ready();
+}
+
+/* Server Protocol Methods */
+
+// Server side: enters BANNER_ACCEPTING and starts the banner exchange, which
+// continues in post_server_banner_exchange().
+CtPtr ProtocolV2::start_server_banner_exchange()
+{
+  ldout(cct, 20) << __func__ << dendl;
+
+  // Test hook; may return from this function early.
+  INTERCEPT(2);
+
+  state = BANNER_ACCEPTING;
+  return _banner_exchange(CONTINUATION(post_server_banner_exchange));
+}
+
+// Continuation handed to _banner_exchange() by the server: switches to
+// AUTH_ACCEPTING and goes back to reading frames.
+CtPtr ProtocolV2::post_server_banner_exchange()
+{
+  ldout(cct, 20) << __func__ << dendl;
+  state = AUTH_ACCEPTING;
+  return CONTINUE(read_frame);
+}
+
+// Server side: handles the client's initial AuthRequestFrame.  The requested
+// auth method is recorded, a connection mode is picked by the auth server
+// from the client's preferred modes, and the auth payload is passed on to
+// _handle_auth_request().  If no acceptable mode exists the client is told
+// via _auth_bad_method(-EOPNOTSUPP).
+//
+// Faults when not in AUTH_ACCEPTING state or when the messenger has no auth
+// server (the same condition _handle_auth_request() checks, but verified
+// here before the pointer is first dereferenced).
+CtPtr ProtocolV2::handle_auth_request(ceph::bufferlist &payload) {
+  ldout(cct, 20) << __func__ << " payload.length()=" << payload.length()
+                 << dendl;
+
+  if (state != AUTH_ACCEPTING) {
+    lderr(cct) << __func__ << " not in auth accept state!" << dendl;
+    return _fault();
+  }
+
+  auto request = AuthRequestFrame::Decode(payload);
+  ldout(cct, 10) << __func__ << " AuthRequest(method=" << request.method()
+                 << ", preferred_modes=" << request.preferred_modes()
+                 << ", payload_len=" << request.auth_payload().length() << ")"
+                 << dendl;
+
+  // pick_con_mode() below is called through auth_server, so a missing auth
+  // server must be rejected here rather than in _handle_auth_request().
+  if (!messenger->auth_server) {
+    lderr(cct) << __func__ << " no auth_server available" << dendl;
+    return _fault();
+  }
+
+  auth_meta->auth_method = request.method();
+  auth_meta->con_mode = messenger->auth_server->pick_con_mode(
+    connection->get_peer_type(), auth_meta->auth_method,
+    request.preferred_modes());
+  if (auth_meta->con_mode == CEPH_CON_MODE_UNKNOWN) {
+    return _auth_bad_method(-EOPNOTSUPP);
+  }
+  return _handle_auth_request(request.auth_payload(), false);
+}
+
+// Server side: tells the client that its auth method was not accepted.  The
+// reply carries the rejected method, the (negative) error code r, and the
+// methods and modes the auth server reports as supported for this peer type.
+CtPtr ProtocolV2::_auth_bad_method(int r)
+{
+  ceph_assert(r < 0);
+
+  std::vector<uint32_t> methods;
+  std::vector<uint32_t> modes;
+  messenger->auth_server->get_supported_auth_methods(
+      connection->get_peer_type(), &methods, &modes);
+
+  ldout(cct, 1) << __func__ << " auth_method " << auth_meta->auth_method
+                << " r " << cpp_strerror(r)
+                << ", allowed_methods " << methods
+                << ", allowed_modes " << modes
+                << dendl;
+
+  auto reply_frame = AuthBadMethodFrame::Encode(auth_meta->auth_method, r,
+                                                methods, modes);
+  return WRITE(reply_frame, "bad auth method", read_frame);
+}
+
+// Server side: runs one authentication round through the auth server (with
+// connection->lock released) and acts on its result:
+//   1       -> authentication finished: send AuthDone, continue in
+//              finish_auth()
+//   0       -> another round is needed: send AuthReplyMore
+//   -EBUSY  -> fault the connection
+//   other   -> report a bad auth method to the client
+// `more` is forwarded to the auth server unchanged.
+CtPtr ProtocolV2::_handle_auth_request(bufferlist& auth_payload, bool more)
+{
+  if (!messenger->auth_server) {
+    return _fault();
+  }
+
+  bufferlist reply;
+  // Keep our own reference to the auth metadata while the lock is dropped.
+  auto meta = auth_meta;
+  connection->lock.unlock();
+  const int result = messenger->auth_server->handle_auth_request(
+      connection, meta.get(), more, meta->auth_method, auth_payload, &reply);
+  connection->lock.lock();
+
+  const bool still_accepting =
+      state == AUTH_ACCEPTING || state == AUTH_ACCEPTING_MORE;
+  if (!still_accepting) {
+    ldout(cct, 1) << __func__
+                  << " state changed while accept, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED);
+    return _fault();
+  }
+
+  if (result == 1) {
+    INTERCEPT(10);
+    state = AUTH_ACCEPTING_SIGN;
+
+    auto done_frame = AuthDoneFrame::Encode(connection->peer_global_id,
+                                            auth_meta->con_mode,
+                                            reply);
+    return WRITE(done_frame, "auth done", finish_auth);
+  }
+
+  if (result == 0) {
+    state = AUTH_ACCEPTING_MORE;
+
+    auto more_frame = AuthReplyMoreFrame::Encode(reply);
+    return WRITE(more_frame, "auth reply more", read_frame);
+  }
+
+  if (result == -EBUSY) {
+    // kick the client and maybe they'll come back later
+    return _fault();
+  }
+
+  return _auth_bad_method(result);
+}
+
+// Server side continuation after AuthDone has been written: creates the
+// on-wire stream handlers (crossed=true here) and sends our pre-auth
+// signature, computed over pre_auth.rxbuf, before clearing that buffer.
+CtPtr ProtocolV2::finish_auth()
+{
+  ceph_assert(auth_meta);
+  // TODO: having a possibility to check whether we're server or client could
+  // allow reusing finish_auth().
+  const bool peer_is_rev1 =
+      HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1);
+  session_stream_handlers = ceph::crypto::onwire::rxtx_t::create_handler_pair(
+      cct, *auth_meta, /*new_nonce_format=*/peer_is_rev1, /*crossed=*/true);
+
+  // Compute the signature first; the receive buffer is wiped right after.
+  const auto signature = auth_meta->session_key.empty()
+      ? sha256_digest_t()
+      : auth_meta->session_key.hmac_sha256(cct, pre_auth.rxbuf);
+  auto signature_frame = AuthSignatureFrame::Encode(signature);
+  pre_auth.enabled = false;
+  pre_auth.rxbuf.clear();
+  return WRITE(signature_frame, "auth signature", read_frame);
+}
+
+// Server side: handles a follow-up AuthRequestMoreFrame by passing its
+// payload to _handle_auth_request() with more=true.  Only valid in
+// AUTH_ACCEPTING_MORE state.
+CtPtr ProtocolV2::handle_auth_request_more(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state == AUTH_ACCEPTING_MORE) {
+    auto followup = AuthRequestMoreFrame::Decode(payload);
+    return _handle_auth_request(followup.auth_payload(), true);
+  }
+
+  lderr(cct) << __func__ << " not in auth accept more state!" << dendl;
+  return _fault();
+}
+
+// Both sides: verifies the peer's pre-auth signature against the HMAC we
+// compute over pre_auth.txbuf (or a default digest when there is no session
+// key).  On a match the server moves to SESSION_ACCEPTING and keeps reading,
+// while the client proceeds with finish_client_auth(); a mismatch faults.
+CtPtr ProtocolV2::handle_auth_signature(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != AUTH_ACCEPTING_SIGN && state != AUTH_CONNECTING_SIGN) {
+    lderr(cct) << __func__
+               << " pre-auth verification signature seen in wrong state!"
+               << dendl;
+    return _fault();
+  }
+
+  auto received = AuthSignatureFrame::Decode(payload);
+
+  const auto expected_sig = auth_meta->session_key.empty()
+      ? sha256_digest_t()
+      : auth_meta->session_key.hmac_sha256(cct, pre_auth.txbuf);
+
+  if (received.signature() != expected_sig) {
+    ldout(cct, 2) << __func__ << " pre-auth signature mismatch"
+                  << " actual_tx_sig=" << expected_sig
+                  << " sig_frame.signature()=" << received.signature()
+                  << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " pre-auth signature success"
+                 << " sig_frame.signature()=" << received.signature()
+                 << dendl;
+  pre_auth.txbuf.clear();
+
+  if (state == AUTH_ACCEPTING_SIGN) {
+    // server had sent AuthDone and client responded with correct pre-auth
+    // signature. we can start accepting new sessions/reconnects.
+    state = SESSION_ACCEPTING;
+    return CONTINUE(read_frame);
+  }
+
+  if (state == AUTH_CONNECTING_SIGN) {
+    // this happened at client side
+    return finish_client_auth();
+  }
+
+  // Not reachable given the state check at the top of the function.
+  ceph_assert_always("state corruption" == nullptr);
+}
+
+// Server side: handles the ClientIdentFrame a peer sends to open a new
+// session.  The advertised addresses and target address are validated, the
+// peer's identity (addresses, name, id, cookie, global_seq) is recorded, and
+// the client's feature set is checked against what we require.  Finally an
+// existing connection to the same peer is looked up: if there is one,
+// handle_existing_connection() decides what to do; otherwise we answer with
+// our server identification.
+CtPtr ProtocolV2::handle_client_ident(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+		 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != SESSION_ACCEPTING) {
+    lderr(cct) << __func__ << " not in session accept state!" << dendl;
+    return _fault();
+  }
+
+  auto client_ident = ClientIdentFrame::Decode(payload);
+
+  ldout(cct, 5) << __func__ << " received client identification:"
+                << " addrs=" << client_ident.addrs()
+		            << " target=" << client_ident.target_addr()
+                << " gid=" << client_ident.gid()
+                << " global_seq=" << client_ident.global_seq()
+                << " features_supported=" << std::hex
+                << client_ident.supported_features()
+                << " features_required=" << client_ident.required_features()
+                << " flags=" << client_ident.flags()
+                << " cookie=" << client_ident.cookie() << std::dec << dendl;
+
+  // Reject an empty address vector or one whose first entry is a
+  // default-constructed (blank) address.
+  if (client_ident.addrs().empty() ||
+      client_ident.addrs().front() == entity_addr_t()) {
+    ldout(cct,5) << __func__ << " oops, client_ident.addrs() is empty" << dendl;
+    return _fault();  // a v2 peer should never do this
+  }
+  // The address the client believes it is talking to must be one of ours.
+  if (!messenger->get_myaddrs().contains(client_ident.target_addr())) {
+    ldout(cct,5) << __func__ << " peer is trying to reach "
+		 << client_ident.target_addr()
+		 << " which is not us (" << messenger->get_myaddrs() << ")"
+		 << dendl;
+    return _fault();
+  }
+
+  connection->set_peer_addrs(client_ident.addrs());
+  connection->target_addr = connection->_infer_target_addr(client_ident.addrs());
+
+  peer_name = entity_name_t(connection->get_peer_type(), client_ident.gid());
+  connection->set_peer_id(client_ident.gid());
+
+  client_cookie = client_ident.cookie();
+
+  // Features we require (policy plus msgr2 baseline) that the client did not
+  // advertise as supported.
+  uint64_t feat_missing =
+    (connection->policy.features_required | msgr2_required) &
+    ~(uint64_t)client_ident.supported_features();
+  if (feat_missing) {
+    ldout(cct, 1) << __func__ << " peer missing required features " << std::hex
+                  << feat_missing << std::dec << dendl;
+    auto ident_missing_features =
+        IdentMissingFeaturesFrame::Encode(feat_missing);
+
+    return WRITE(ident_missing_features, "ident missing features", read_frame);
+  }
+
+  // Only features supported by both sides are used on this connection.
+  connection_features =
+      client_ident.supported_features() & connection->policy.features_supported;
+
+  peer_global_seq = client_ident.global_seq();
+
+  // Looks good so far, let's check if there is already an existing connection
+  // to this peer.
+
+  // connection->lock is released for the lookup, the possible mark_down() of
+  // the other connection and the injected delay; it is re-acquired below.
+  connection->lock.unlock();
+  AsyncConnectionRef existing = messenger->lookup_conn(*connection->peer_addrs);
+
+  // An existing connection that is not speaking protocol type 2 is marked
+  // down and then ignored.
+  if (existing &&
+      existing->protocol->proto_type != 2) {
+    ldout(cct,1) << __func__ << " existing " << existing << " proto "
+		 << existing->protocol.get() << " version is "
+		 << existing->protocol->proto_type << ", marking down" << dendl;
+    existing->mark_down();
+    existing = nullptr;
+  }
+
+  connection->inject_delay();
+
+  connection->lock.lock();
+  // The state may have changed while the lock was not held; the only change
+  // expected here is to CLOSED.
+  if (state != SESSION_ACCEPTING) {
+    ldout(cct, 1) << __func__
+                  << " state changed while accept, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED);
+    return _fault();
+  }
+
+  if (existing) {
+    return handle_existing_connection(existing);
+  }
+
+  // if everything is OK reply with server identification
+  return send_server_ident();
+}
+
+// Server side: handles a ReconnectFrame from a peer that wants to resume a
+// previous session.  The matching existing connection is looked up and a
+// series of checks decides the answer:
+//   - no existing connection, or it is CLOSED      -> full session reset
+//   - existing connection is being replaced        -> retry global
+//   - client cookie mismatch                       -> session reset
+//   - existing connection has no server cookie     -> partial session reset
+//   - stale global_seq / stale connect_seq         -> retry global / retry
+//   - equal connect_seq (reconnect race)           -> wait, or we win
+// If all checks pass, the existing connection's connect_seq and message_seq
+// are updated from the frame and the existing connection is reused.
+CtPtr ProtocolV2::handle_reconnect(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+		 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != SESSION_ACCEPTING) {
+    lderr(cct) << __func__ << " not in session accept state!" << dendl;
+    return _fault();
+  }
+
+  auto reconnect = ReconnectFrame::Decode(payload);
+
+  ldout(cct, 5) << __func__
+                << " received reconnect:" 
+                << " client_cookie=" << std::hex << reconnect.client_cookie()
+                << " server_cookie=" << reconnect.server_cookie() << std::dec
+                << " gs=" << reconnect.global_seq()
+                << " cs=" << reconnect.connect_seq()
+                << " ms=" << reconnect.msg_seq()
+		            << dendl;
+
+  // Should we check if one of the ident.addrs match connection->target_addr
+  // as we do in ProtocolV1?
+  connection->set_peer_addrs(reconnect.addrs());
+  connection->target_addr = connection->_infer_target_addr(reconnect.addrs());
+  peer_global_seq = reconnect.global_seq();
+
+  // connection->lock is released for the lookup, the possible mark_down() of
+  // the other connection and the injected delay; it is re-acquired below.
+  connection->lock.unlock();
+  AsyncConnectionRef existing = messenger->lookup_conn(*connection->peer_addrs);
+
+  // An existing connection that is not speaking protocol type 2 is marked
+  // down and then ignored.
+  if (existing &&
+      existing->protocol->proto_type != 2) {
+    ldout(cct,1) << __func__ << " existing " << existing << " proto "
+		 << existing->protocol.get() << " version is "
+		 << existing->protocol->proto_type << ", marking down" << dendl;
+    existing->mark_down();
+    existing = nullptr;
+  }
+
+  connection->inject_delay();
+
+  connection->lock.lock();
+  // The state may have changed while the lock was not held; the only change
+  // expected here is to CLOSED.
+  if (state != SESSION_ACCEPTING) {
+    ldout(cct, 1) << __func__
+                  << " state changed while accept, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED);
+    return _fault();
+  }
+
+  if (!existing) {
+    // there is no existing connection therefore cannot reconnect to previous
+    // session
+    ldout(cct, 0) << __func__
+                  << " no existing connection exists, reseting client" << dendl;
+    auto reset = ResetFrame::Encode(true);
+    return WRITE(reset, "session reset", read_frame);
+  }
+
+  // From here on the existing connection's lock is held (in addition to our
+  // own) until this function returns.
+  std::lock_guard<std::mutex> l(existing->lock);
+
+  ProtocolV2 *exproto = dynamic_cast<ProtocolV2 *>(existing->protocol.get());
+  if (!exproto) {
+    ldout(cct, 1) << __func__ << " existing=" << existing << dendl;
+    ceph_assert(false);
+  }
+
+  // The previous session is gone; ask the client for a full reset.
+  if (exproto->state == CLOSED) {
+    ldout(cct, 5) << __func__ << " existing " << existing
+                  << " already closed. Reseting client" << dendl;
+    auto reset = ResetFrame::Encode(true);
+    return WRITE(reset, "session reset", read_frame);
+  }
+
+  // Another replacement of the existing connection is in progress; have the
+  // client retry with the existing connection's peer_global_seq.
+  if (exproto->replacing) {
+    ldout(cct, 1) << __func__
+                  << " existing racing replace happened while replacing."
+                  << " existing=" << existing << dendl;
+    auto retry = RetryGlobalFrame::Encode(exproto->peer_global_seq);
+    return WRITE(retry, "session retry", read_frame);
+  }
+
+  if (exproto->client_cookie != reconnect.client_cookie()) {
+    // The reset is "full" only if the policy asks for reset checking.
+    ldout(cct, 1) << __func__ << " existing=" << existing
+                  << " client cookie mismatch, I must have reseted:"
+                  << " cc=" << std::hex << exproto->client_cookie
+                  << " rcc=" << reconnect.client_cookie()
+                  << ", reseting client." << std::dec
+                  << dendl;
+    auto reset = ResetFrame::Encode(connection->policy.resetcheck);
+    return WRITE(reset, "session reset", read_frame);
+  } else if (exproto->server_cookie == 0) {
+    // this happens when:
+    //   - a connects to b
+    //   - a sends client_ident
+    //   - b gets client_ident, sends server_ident and sets cookie X
+    //   - connection fault
+    //   - b reconnects to a with cookie X, connect_seq=1
+    //   - a has cookie==0
+    ldout(cct, 1) << __func__ << " I was a client and didn't received the"
+                  << " server_ident. Asking peer to resume session"
+                  << " establishment" << dendl;
+    auto reset = ResetFrame::Encode(false);
+    return WRITE(reset, "session reset", read_frame);
+  }
+
+  // The client's global_seq is older than the one recorded on the existing
+  // connection.
+  if (exproto->peer_global_seq > reconnect.global_seq()) {
+    ldout(cct, 5) << __func__
+                  << " stale global_seq: sgs=" << exproto->peer_global_seq
+                  << " cgs=" << reconnect.global_seq()
+                  << ", ask client to retry global" << dendl;
+    auto retry = RetryGlobalFrame::Encode(exproto->peer_global_seq);
+
+    INTERCEPT(18);
+
+    return WRITE(retry, "session retry", read_frame);
+  }
+
+  // The client's connect_seq is older than the one recorded on the existing
+  // connection.
+  if (exproto->connect_seq > reconnect.connect_seq()) {
+    ldout(cct, 5) << __func__
+                  << " stale connect_seq scs=" << exproto->connect_seq
+                  << " ccs=" << reconnect.connect_seq()
+                  << " , ask client to retry" << dendl;
+    auto retry = RetryFrame::Encode(exproto->connect_seq);
+    return WRITE(retry, "session retry", read_frame);
+  }
+
+  if (exproto->connect_seq == reconnect.connect_seq()) {
+    // reconnect race: both peers are sending reconnect messages
+    // Tie-break: the existing connection wins when the peer's msgr2 address
+    // compares greater than ours and the existing connection's policy is not
+    // a server policy.
+    if (existing->peer_addrs->msgr2_addr() >
+            messenger->get_myaddrs().msgr2_addr() &&
+        !existing->policy.server) {
+      // the existing connection wins
+      ldout(cct, 1)
+          << __func__
+          << " reconnect race detected, this connection loses to existing="
+          << existing << dendl;
+
+      auto wait = WaitFrame::Encode();
+      return WRITE(wait, "wait", read_frame);
+    } else {
+      // this connection wins
+      ldout(cct, 1) << __func__
+                    << " reconnect race detected, replacing existing="
+                    << existing << " socket by this connection's socket"
+                    << dendl;
+    }
+  }
+
+  ldout(cct, 1) << __func__ << " reconnect to existing=" << existing << dendl;
+
+  reconnecting = true;
+
+  // everything looks good
+  exproto->connect_seq = reconnect.connect_seq();
+  exproto->message_seq = reconnect.msg_seq();
+
+  return reuse_connection(existing, exproto);
+}
+
+CtPtr ProtocolV2::handle_existing_connection(AsyncConnectionRef existing) {
+  ldout(cct, 20) << __func__ << " existing=" << existing << dendl;
+  // Arbitrates between this connection and `existing`; the first matching check below decides.
+  std::lock_guard<std::mutex> l(existing->lock);  // held until this function returns
+
+  ProtocolV2 *exproto = dynamic_cast<ProtocolV2 *>(existing->protocol.get());
+  if (!exproto) {
+    ldout(cct, 1) << __func__ << " existing=" << existing << dendl;
+    ceph_assert(false);  // existing is expected to be driven by a ProtocolV2
+  }
+
+  if (exproto->state == CLOSED) {
+    ldout(cct, 1) << __func__ << " existing " << existing << " already closed."
+                  << dendl;
+    return send_server_ident();  // nothing to take over: carry on with this connection
+  }
+
+  if (exproto->replacing) {  // reuse_connection() sets this while a swap is in flight
+    ldout(cct, 1) << __func__
+                  << " existing racing replace happened while replacing."
+                  << " existing=" << existing << dendl;
+    auto wait = WaitFrame::Encode();
+    return WRITE(wait, "wait", read_frame);
+  }
+
+  if (exproto->peer_global_seq > peer_global_seq) {
+    ldout(cct, 1) << __func__ << " this is a stale connection, peer_global_seq="
+                  << peer_global_seq
+                  << " existing->peer_global_seq=" << exproto->peer_global_seq
+                  << ", stopping this connection." << dendl;
+    stop();  // existing already saw a newer global_seq from this peer, so we give up
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;
+  }
+
+  if (existing->policy.lossy) {
+    // existing connection can be thrown out in favor of this one
+    ldout(cct, 1)
+        << __func__ << " existing=" << existing
+        << " is a lossy channel. Stopping existing in favor of this connection"
+        << dendl;
+    existing->protocol->stop();
+    existing->dispatch_queue->queue_reset(existing.get());
+    return send_server_ident();
+  }
+
+  if (exproto->server_cookie && exproto->client_cookie &&
+      exproto->client_cookie != client_cookie) {
+    // Found previous session
+    // peer has reseted and we're going to reuse the existing connection
+    // by replacing the communication socket
+    ldout(cct, 1) << __func__ << " found previous session existing=" << existing
+                  << ", peer must have reseted." << dendl;
+    if (connection->policy.resetcheck) {
+      exproto->reset_session();  // only done when the policy asks for reset checking
+    }
+    return reuse_connection(existing, exproto);
+  }
+
+  if (exproto->client_cookie == client_cookie) {
+    // session establishment interrupted between client_ident and server_ident,
+    // continuing...
+    ldout(cct, 1) << __func__ << " found previous session existing=" << existing
+                  << ", continuing session establishment." << dendl;
+    return reuse_connection(existing, exproto);
+  }
+
+  if (exproto->state == READY || exproto->state == STANDBY) {
+    ldout(cct, 1) << __func__ << " existing=" << existing
+                  << " is READY/STANDBY, lets reuse it" << dendl;
+    return reuse_connection(existing, exproto);
+  }
+
+  // Looks like a connection race: server and client are both connecting to
+  // each other at the same time.
+  if (connection->peer_addrs->msgr2_addr() <
+          messenger->get_myaddrs().msgr2_addr() ||
+      existing->policy.server) {
+    // this connection wins
+    ldout(cct, 1) << __func__
+                  << " connection race detected, replacing existing="
+                  << existing << " socket by this connection's socket" << dendl;
+    return reuse_connection(existing, exproto);
+  } else {
+    // the existing connection wins
+    ldout(cct, 1)
+        << __func__
+        << " connection race detected, this connection loses to existing="
+        << existing << dendl;
+    ceph_assert(connection->peer_addrs->msgr2_addr() >  // '<' was excluded above, so equal addrs trip this
+                messenger->get_myaddrs().msgr2_addr());
+
+    // make sure we follow through with opening the existing
+    // connection (if it isn't yet open) since we know the peer
+    // has something to send to us.
+    existing->send_keepalive();
+    auto wait = WaitFrame::Encode();
+    return WRITE(wait, "wait", read_frame);
+  }
+}
+
+CtPtr ProtocolV2::reuse_connection(AsyncConnectionRef existing,
+                                   ProtocolV2 *exproto) {
+  ldout(cct, 20) << __func__ << " existing=" << existing
+                 << " reconnect=" << reconnecting << dendl;
+  // Hands this connection's socket, worker and event center to `existing`, then stops this connection.
+  connection->inject_delay();
+
+  std::lock_guard<std::mutex> l(existing->write_lock);  // held until this function returns
+
+  connection->center->delete_file_event(connection->cs.fd(),  // stop watching our socket before it is moved
+                                        EVENT_READABLE | EVENT_WRITABLE);
+
+  if (existing->delay_state) {
+    existing->delay_state->flush();
+    ceph_assert(!connection->delay_state);
+  }
+  exproto->reset_recv_state();
+  exproto->pre_auth.enabled = false;
+
+  if (!reconnecting) {  // new session: copy what this handshake negotiated
+    exproto->peer_supported_features = peer_supported_features;
+    exproto->tx_frame_asm.set_is_rev1(tx_frame_asm.get_is_rev1());
+    exproto->rx_frame_asm.set_is_rev1(rx_frame_asm.get_is_rev1());
+
+    exproto->client_cookie = client_cookie;
+    exproto->peer_name = peer_name;
+    exproto->connection_features = connection_features;
+    existing->set_features(connection_features);
+  }
+  exproto->peer_global_seq = peer_global_seq;  // copied for new sessions and reconnects alike
+
+  ceph_assert(connection->center->in_thread());
+  auto temp_cs = std::move(connection->cs);  // the socket that existing will adopt
+  EventCenter *new_center = connection->center;
+  Worker *new_worker = connection->worker;
+  // we can steal the session_stream_handlers under the assumption
+  // this happens in the event center's thread as there should be
+  // no user outside its boundaries (similarly to e.g. outgoing_bl).
+  auto temp_stream_handlers = std::move(session_stream_handlers);
+  exproto->auth_meta = auth_meta;
+
+  ldout(messenger->cct, 5) << __func__ << " stop myself to swap existing"
+                           << dendl;
+
+  // avoid _stop shutdown replacing socket
+  // queue a reset on the new connection, which we're dumping for the old
+  stop();
+
+  connection->dispatch_queue->queue_reset(connection);
+
+  exproto->can_write = false;
+  exproto->write_in_progress = false;
+  exproto->reconnecting = reconnecting;
+  exproto->replacing = true;  // handle_existing_connection() answers WAIT while this is set
+  existing->state_offset = 0;
+  // avoid previous thread modify event
+  exproto->state = NONE;
+  existing->state = AsyncConnection::STATE_NONE;
+  // Discard existing prefetch buffer in `recv_buf`
+  existing->recv_start = existing->recv_end = 0;
+  // there shouldn't exist any buffer
+  ceph_assert(connection->recv_start == connection->recv_end);
+
+  auto deactivate_existing = std::bind(
+      [ existing,
+        new_worker,
+        new_center,
+        exproto,
+        temp_stream_handlers=std::move(temp_stream_handlers)
+      ](ConnectedSocket &cs) mutable {  // cs is temp_cs, bound at the end of std::bind
+        // we need to delete time event in original thread
+        {
+          std::lock_guard<std::mutex> l(existing->lock);
+          existing->write_lock.lock();
+          exproto->requeue_sent();
+          // XXX: do we really need the locking for `outgoing_bl`? There is
+          // a comment just above its definition saying "lockfree, only used
+          // in own thread". I'm following lockfull schema just in the case.
+          // From performance point of view it should be fine – this happens
+          // far away from hot paths.
+          existing->outgoing_bl.clear();
+          existing->open_write = false;
+          exproto->session_stream_handlers = std::move(temp_stream_handlers);
+          existing->write_lock.unlock();
+          if (exproto->state == NONE) {  // still as reuse_connection() left it: do the swap
+            existing->shutdown_socket();
+            existing->cs = std::move(cs);
+            existing->worker->references--;
+            new_worker->references++;
+            existing->logger = new_worker->get_perf_counter();
+            existing->worker = new_worker;
+            existing->center = new_center;
+            if (existing->delay_state)
+              existing->delay_state->set_center(new_center);
+          } else if (exproto->state == CLOSED) {  // closed meanwhile: just close the new socket on new_center
+            auto back_to_close = std::bind(
+                [](ConnectedSocket &cs) mutable { cs.close(); }, std::move(cs));
+            new_center->submit_to(new_center->get_id(),
+                                  std::move(back_to_close), true);
+            return;
+          } else {
+            ceph_abort();  // no other state is expected while replacing
+          }
+        }
+
+        // Before existing->center was changed, some events may already
+        // exist in the old center's queue. Then if we mark down
+        // `existing`, it will execute in another thread and clean up
+        // connection. Previous event will result in segment fault
+        auto transfer_existing = [existing, exproto]() mutable {
+          std::lock_guard<std::mutex> l(existing->lock);
+          if (exproto->state == CLOSED) return;
+          ceph_assert(exproto->state == NONE);
+
+          exproto->state = SESSION_ACCEPTING;
+          // we have called shutdown_socket above
+          ceph_assert(existing->last_tick_id == 0);
+          // restart timer since we are going to re-build connection
+          existing->last_connect_started = ceph::coarse_mono_clock::now();
+          existing->last_tick_id = existing->center->create_time_event(
+            existing->connect_timeout_us, existing->tick_handler);
+          existing->state = AsyncConnection::STATE_CONNECTION_ESTABLISHED;
+          existing->center->create_file_event(existing->cs.fd(), EVENT_READABLE,
+                                              existing->read_handler);
+          if (!exproto->reconnecting) {
+            exproto->run_continuation(exproto->send_server_ident());
+          } else {
+            exproto->run_continuation(exproto->send_reconnect_ok());
+          }
+        };
+        if (existing->center->in_thread())  // existing->center is new_center by now
+          transfer_existing();
+        else
+          existing->center->submit_to(existing->center->get_id(),
+                                      std::move(transfer_existing), true);
+      },
+      std::move(temp_cs));
+
+  existing->center->submit_to(existing->center->get_id(),  // existing's center before the swap
+                              std::move(deactivate_existing), true);
+  return nullptr;
+}
+
+CtPtr ProtocolV2::send_server_ident() {
+  ldout(cct, 20) << __func__ << dendl;
+  // Builds the server_ident frame, registers the connection with the messenger, then writes the frame.
+  // this is required for the case when this connection is being replaced
+  out_seq = discard_requeued_up_to(out_seq, 0);
+  in_seq = 0;
+
+  if (!connection->policy.lossy) {
+    server_cookie = ceph::util::generate_random_number<uint64_t>(1, -1ll);  // lower bound 1: never 0
+  }
+
+  uint64_t flags = 0;
+  if (connection->policy.lossy) {
+    flags = flags | CEPH_MSG_CONNECT_LOSSY;
+  }
+
+  uint64_t gs = messenger->get_global_seq();
+  auto server_ident = ServerIdentFrame::Encode(
+          messenger->get_myaddrs(),
+          messenger->get_myname().num(),
+          gs,
+          connection->policy.features_supported,
+          connection->policy.features_required | msgr2_required,
+          flags,
+          server_cookie);
+
+  ldout(cct, 5) << __func__ << " sending identification:"
+                << " addrs=" << messenger->get_myaddrs()
+                << " gid=" << messenger->get_myname().num()
+                << " global_seq=" << gs << " features_supported=" << std::hex
+                << connection->policy.features_supported
+                << " features_required="
+		            << (connection->policy.features_required | msgr2_required)
+                << " flags=" << flags
+                << " cookie=" << server_cookie << std::dec << dendl;
+
+  connection->lock.unlock();  // the caller holds connection->lock; dropped around accept_conn()
+  // Because "replacing" will prevent other connections preempt this addr,
+  // it's safe that here we don't acquire Connection's lock
+  ssize_t r = messenger->accept_conn(connection);
+
+  connection->inject_delay();
+
+  connection->lock.lock();  // re-taken before state is inspected again
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " existing race replacing process for addr = "
+                  << connection->peer_addrs->msgr2_addr()
+                  << " just fail later one(this)" << dendl;
+    connection->inject_delay();
+    return _fault();
+  }
+  if (state != SESSION_ACCEPTING) {  // state may have changed while the lock was dropped
+    ldout(cct, 1) << __func__
+                  << " state changed while accept_conn, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED || state == NONE);
+    messenger->unregister_conn(connection);  // undo the accept_conn() registration
+    connection->inject_delay();
+    return _fault();
+  }
+
+  connection->set_features(connection_features);
+
+  // notify
+  connection->dispatch_queue->queue_accept(connection);
+  messenger->ms_deliver_handle_fast_accept(connection);
+
+  INTERCEPT(12);
+
+  return WRITE(server_ident, "server ident", server_ready);  // server_ready is the follow-up continuation
+}
+
+CtPtr ProtocolV2::server_ready() {  // continuation named by the WRITE() of server_ident / reconnect_ok
+  ldout(cct, 20) << __func__ << dendl;
+
+  if (connection->delay_state) {
+    ceph_assert(connection->delay_state->ready());  // checked only when a delay_state exists
+  }
+
+  return ready();  // hand over to the shared ready() step
+}
+
+CtPtr ProtocolV2::send_reconnect_ok() {
+  ldout(cct, 20) << __func__ << dendl;
+  // Builds the reconnect_ok frame, registers the connection with the messenger, then writes the frame.
+  out_seq = discard_requeued_up_to(out_seq, message_seq);  // message_seq was stored from the peer's reconnect
+
+  uint64_t ms = in_seq;  // our in_seq is reported to the peer as msg_seq
+  auto reconnect_ok = ReconnectOkFrame::Encode(ms);
+
+  ldout(cct, 5) << __func__ << " sending reconnect_ok: msg_seq=" << ms << dendl;
+
+  connection->lock.unlock();  // the caller holds connection->lock; dropped around accept_conn()
+  // Because "replacing" will prevent other connections preempt this addr,
+  // it's safe that here we don't acquire Connection's lock
+  ssize_t r = messenger->accept_conn(connection);
+
+  connection->inject_delay();
+
+  connection->lock.lock();  // re-taken before state is inspected again
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " existing race replacing process for addr = "
+                  << connection->peer_addrs->msgr2_addr()
+                  << " just fail later one(this)" << dendl;
+    connection->inject_delay();
+    return _fault();
+  }
+  if (state != SESSION_ACCEPTING) {  // state may have changed while the lock was dropped
+    ldout(cct, 1) << __func__
+                  << " state changed while accept_conn, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED || state == NONE);
+    messenger->unregister_conn(connection);  // undo the accept_conn() registration
+    connection->inject_delay();
+    return _fault();
+  }
+
+  // notify
+  connection->dispatch_queue->queue_accept(connection);
+  messenger->ms_deliver_handle_fast_accept(connection);
+
+  INTERCEPT(14);
+
+  return WRITE(reconnect_ok, "reconnect ok", server_ready);  // server_ready is the follow-up continuation
+}
diff --git a/src/msg/async/ProtocolV2.h b/src/msg/async/ProtocolV2.h
new file mode 100644
index 00000000..4941cea5
--- /dev/null
+++ b/src/msg/async/ProtocolV2.h
@@ -0,0 +1,259 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef _MSG_ASYNC_PROTOCOL_V2_
+#define _MSG_ASYNC_PROTOCOL_V2_
+
+#include "Protocol.h"
+#include "crypto_onwire.h"
+#include "frames_v2.h"
+
+class ProtocolV2 : public Protocol {  // Protocol implementation bound to one AsyncConnection; progress is tracked in `state`
+private:
+  enum State {  // enumerator order must stay in sync with statenames[] in get_state_name()
+    NONE,
+    START_CONNECT,
+    BANNER_CONNECTING,
+    HELLO_CONNECTING,
+    AUTH_CONNECTING,
+    AUTH_CONNECTING_SIGN,
+    SESSION_CONNECTING,
+    SESSION_RECONNECTING,
+    START_ACCEPT,
+    BANNER_ACCEPTING,
+    HELLO_ACCEPTING,
+    AUTH_ACCEPTING,
+    AUTH_ACCEPTING_MORE,
+    AUTH_ACCEPTING_SIGN,
+    SESSION_ACCEPTING,
+    READY,
+    THROTTLE_MESSAGE,
+    THROTTLE_BYTES,
+    THROTTLE_DISPATCH_QUEUE,
+    THROTTLE_DONE,
+    READ_MESSAGE_COMPLETE,
+    STANDBY,
+    WAIT,
+    CLOSED
+  };
+
+  static const char *get_state_name(int state) {  // maps a State value to its printable name
+    const char *const statenames[] = {"NONE",
+                                      "START_CONNECT",
+                                      "BANNER_CONNECTING",
+                                      "HELLO_CONNECTING",
+                                      "AUTH_CONNECTING",
+                                      "AUTH_CONNECTING_SIGN",
+                                      "SESSION_CONNECTING",
+                                      "SESSION_RECONNECTING",
+                                      "START_ACCEPT",
+                                      "BANNER_ACCEPTING",
+                                      "HELLO_ACCEPTING",
+                                      "AUTH_ACCEPTING",
+                                      "AUTH_ACCEPTING_MORE",
+                                      "AUTH_ACCEPTING_SIGN",
+                                      "SESSION_ACCEPTING",
+                                      "READY",
+                                      "THROTTLE_MESSAGE",
+                                      "THROTTLE_BYTES",
+                                      "THROTTLE_DISPATCH_QUEUE",
+                                      "THROTTLE_DONE",
+                                      "READ_MESSAGE_COMPLETE",
+                                      "STANDBY",
+                                      "WAIT",
+                                      "CLOSED"};
+    return statenames[state];  // no range check: state must be a valid State value
+  }
+
+  // TODO: move into auth_meta?
+  ceph::crypto::onwire::rxtx_t session_stream_handlers;
+
+  entity_name_t peer_name;
+  State state;
+  uint64_t peer_supported_features;  // CEPH_MSGR2_FEATURE_*
+
+  uint64_t client_cookie;  // compared against the peer's cookie when a connection is replaced or resumed
+  uint64_t server_cookie;  // non-zero random value, generated by send_server_ident() for non-lossy policy
+  uint64_t global_seq;
+  uint64_t connect_seq;  // compared with the peer's connect_seq on reconnect
+  uint64_t peer_global_seq;  // a new connection with a lower value is treated as stale
+  uint64_t message_seq;  // peer's msg_seq from a reconnect; consumed by send_reconnect_ok()
+  bool reconnecting;  // selects reconnect_ok instead of server_ident after a socket swap
+  bool replacing;  // set by reuse_connection() on the connection being taken over
+  bool can_write;
+  struct out_queue_entry_t {
+    bool is_prepared {false};
+    Message* m {nullptr};
+  };
+  std::map<int, std::list<out_queue_entry_t>> out_queue;
+  std::list<Message *> sent;
+  std::atomic<uint64_t> out_seq{0};
+  std::atomic<uint64_t> in_seq{0};
+  std::atomic<uint64_t> ack_left{0};
+
+  using ProtFuncPtr = void (ProtocolV2::*)();
+  Ct<ProtocolV2> *bannerExchangeCallback;
+
+  ceph::msgr::v2::FrameAssembler tx_frame_asm;
+  ceph::msgr::v2::FrameAssembler rx_frame_asm;
+
+  ceph::bufferlist rx_preamble;
+  ceph::bufferlist rx_epilogue;
+  ceph::msgr::v2::segment_bls_t rx_segments_data;
+  ceph::msgr::v2::Tag next_tag;
+  utime_t backoff;  // backoff time
+  utime_t recv_stamp;
+  utime_t throttle_stamp;
+
+  struct {
+    ceph::bufferlist rxbuf;
+    ceph::bufferlist txbuf;
+    bool enabled {true};  // reuse_connection() switches this off on the connection it takes over
+  } pre_auth;
+
+  bool keepalive;
+  bool write_in_progress = false;
+
+  ostream &_conn_prefix(std::ostream *_dout);
+  void run_continuation(Ct<ProtocolV2> *pcontinuation);
+  void run_continuation(Ct<ProtocolV2> &continuation);
+
+  Ct<ProtocolV2> *read(CONTINUATION_RXBPTR_TYPE<ProtocolV2> &next,
+                       rx_buffer_t&& buffer);
+  template <class F>
+  Ct<ProtocolV2> *write(const std::string &desc,
+                        CONTINUATION_TYPE<ProtocolV2> &next,
+			F &frame);
+  Ct<ProtocolV2> *write(const std::string &desc,
+                        CONTINUATION_TYPE<ProtocolV2> &next,
+                        bufferlist &buffer);
+
+  template <class F>
+  bool append_frame(F& frame);
+
+  void requeue_sent();
+  uint64_t discard_requeued_up_to(uint64_t out_seq, uint64_t seq);
+  void reset_recv_state();
+  void reset_security();
+  void reset_throttle();
+  Ct<ProtocolV2> *_fault();
+  void discard_out_queue();
+  void reset_session();
+  void prepare_send_message(uint64_t features, Message *m);
+  out_queue_entry_t _get_next_outgoing();
+  ssize_t write_message(Message *m, bool more);
+  void handle_message_ack(uint64_t seq);
+
+  CONTINUATION_DECL(ProtocolV2, _wait_for_peer_banner);
+  READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, _handle_peer_banner);
+  READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, _handle_peer_banner_payload);
+
+  Ct<ProtocolV2> *_banner_exchange(Ct<ProtocolV2> &callback);
+  Ct<ProtocolV2> *_wait_for_peer_banner();
+  Ct<ProtocolV2> *_handle_peer_banner(rx_buffer_t &&buffer, int r);
+  Ct<ProtocolV2> *_handle_peer_banner_payload(rx_buffer_t &&buffer, int r);
+  Ct<ProtocolV2> *handle_hello(ceph::bufferlist &payload);
+
+  CONTINUATION_DECL(ProtocolV2, read_frame);
+  CONTINUATION_DECL(ProtocolV2, finish_auth);
+  READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, handle_read_frame_preamble_main);
+  READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, handle_read_frame_segment);
+  READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, handle_read_frame_epilogue_main);
+  CONTINUATION_DECL(ProtocolV2, throttle_message);
+  CONTINUATION_DECL(ProtocolV2, throttle_bytes);
+  CONTINUATION_DECL(ProtocolV2, throttle_dispatch_queue);
+
+  Ct<ProtocolV2> *read_frame();
+  Ct<ProtocolV2> *finish_auth();
+  Ct<ProtocolV2> *finish_client_auth();
+  Ct<ProtocolV2> *handle_read_frame_preamble_main(rx_buffer_t &&buffer, int r);
+  Ct<ProtocolV2> *read_frame_segment();
+  Ct<ProtocolV2> *handle_read_frame_segment(rx_buffer_t &&rx_buffer, int r);
+  Ct<ProtocolV2> *_handle_read_frame_segment();
+  Ct<ProtocolV2> *handle_read_frame_epilogue_main(rx_buffer_t &&buffer, int r);
+  Ct<ProtocolV2> *_handle_read_frame_epilogue_main();
+  Ct<ProtocolV2> *handle_read_frame_dispatch();
+  Ct<ProtocolV2> *handle_frame_payload();
+
+  Ct<ProtocolV2> *ready();
+
+  Ct<ProtocolV2> *handle_message();
+  Ct<ProtocolV2> *throttle_message();
+  Ct<ProtocolV2> *throttle_bytes();
+  Ct<ProtocolV2> *throttle_dispatch_queue();
+  Ct<ProtocolV2> *read_message_data_prepare();
+
+  Ct<ProtocolV2> *handle_keepalive2(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_keepalive2_ack(ceph::bufferlist &payload);
+
+  Ct<ProtocolV2> *handle_message_ack(ceph::bufferlist &payload);
+
+public:
+  uint64_t connection_features;
+
+  ProtocolV2(AsyncConnection *connection);
+  virtual ~ProtocolV2();
+
+  virtual void connect() override;
+  virtual void accept() override;
+  virtual bool is_connected() override;
+  virtual void stop() override;
+  virtual void fault() override;
+  virtual void send_message(Message *m) override;
+  virtual void send_keepalive() override;
+
+  virtual void read_event() override;
+  virtual void write_event() override;
+  virtual bool is_queued() override;
+
+private:
+  // Client Protocol
+  CONTINUATION_DECL(ProtocolV2, start_client_banner_exchange);
+  CONTINUATION_DECL(ProtocolV2, post_client_banner_exchange);
+
+  Ct<ProtocolV2> *start_client_banner_exchange();
+  Ct<ProtocolV2> *post_client_banner_exchange();
+  inline Ct<ProtocolV2> *send_auth_request() {  // convenience overload: empty allowed-methods list
+    std::vector<uint32_t> empty;
+    return send_auth_request(empty);
+  }
+  Ct<ProtocolV2> *send_auth_request(std::vector<uint32_t> &allowed_methods);
+  Ct<ProtocolV2> *handle_auth_bad_method(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_auth_reply_more(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_auth_done(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_auth_signature(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *send_client_ident();
+  Ct<ProtocolV2> *send_reconnect();
+  Ct<ProtocolV2> *handle_ident_missing_features(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_session_reset(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_session_retry(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_session_retry_global(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_wait(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_reconnect_ok(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_server_ident(ceph::bufferlist &payload);
+
+  // Server Protocol
+  CONTINUATION_DECL(ProtocolV2, start_server_banner_exchange);
+  CONTINUATION_DECL(ProtocolV2, post_server_banner_exchange);
+  CONTINUATION_DECL(ProtocolV2, server_ready);
+
+  Ct<ProtocolV2> *start_server_banner_exchange();
+  Ct<ProtocolV2> *post_server_banner_exchange();
+  Ct<ProtocolV2> *handle_auth_request(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_auth_request_more(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *_handle_auth_request(bufferlist& auth_payload, bool more);
+  Ct<ProtocolV2> *_auth_bad_method(int r);
+  Ct<ProtocolV2> *handle_client_ident(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_ident_missing_features_write(int r);
+  Ct<ProtocolV2> *handle_reconnect(ceph::bufferlist &payload);
+  Ct<ProtocolV2> *handle_existing_connection(AsyncConnectionRef existing);
+  Ct<ProtocolV2> *reuse_connection(AsyncConnectionRef existing,
+                                   ProtocolV2 *exproto);
+  Ct<ProtocolV2> *send_server_ident();
+  Ct<ProtocolV2> *send_reconnect_ok();
+  Ct<ProtocolV2> *server_ready();
+
+  size_t get_current_msg_size() const;
+};
+
+#endif /* _MSG_ASYNC_PROTOCOL_V2_ */
diff --git a/src/msg/async/Stack.cc b/src/msg/async/Stack.cc
new file mode 100644
index 00000000..8976c3cc
--- /dev/null
+++ b/src/msg/async/Stack.cc
@@ -0,0 +1,217 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <mutex>
+
+#include "include/compat.h"
+#include "common/Cond.h"
+#include "common/errno.h"
+#include "PosixStack.h"
+#ifdef HAVE_RDMA
+#include "rdma/RDMAStack.h"
+#endif
+#ifdef HAVE_DPDK
+#include "dpdk/DPDKStack.h"
+#endif
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "stack "
+
+std::function<void ()> NetworkStack::add_thread(unsigned i)
+{  // Returns the thread body for workers[i]; start() passes it on to spawn_worker().
+  Worker *w = workers[i];
+  return [this, w]() {
+      char tp_name[16];
+      snprintf(tp_name, sizeof(tp_name), "msgr-worker-%u", w->id);  // bounded: cannot overrun tp_name
+      ceph_pthread_setname(pthread_self(), tp_name);
+      const unsigned EventMaxWaitUs = 30000000;  // wait bound handed to each process_events() call
+      w->center.set_owner();
+      ldout(cct, 10) << __func__ << " starting" << dendl;
+      w->initialize();
+      w->init_done();
+      while (!w->done) {  // stop() sets w->done and wakes the center
+        ldout(cct, 30) << __func__ << " calling event process" << dendl;
+
+        ceph::timespan dur;
+        int r = w->center.process_events(EventMaxWaitUs, &dur);
+        if (r < 0) {
+          ldout(cct, 20) << __func__ << " process events failed: "
+                         << cpp_strerror(errno) << dendl;
+          // TODO do something?
+        }
+        w->perf_logger->tinc(l_msgr_running_total_time, dur);  // dur was filled in by process_events()
+      }
+      w->reset();
+      w->destroy();
+  };
+}
+
+std::shared_ptr<NetworkStack> NetworkStack::create(CephContext *c, const string &t)
+{
+  // Factory: map the transport name onto the matching stack implementation.
+  // Every recognised name returns right away, so no else-chain is needed.
+  if (t == "posix") return std::make_shared<PosixNetworkStack>(c, t);
+#ifdef HAVE_RDMA
+  if (t == "rdma") return std::make_shared<RDMAStack>(c, t);
+#endif
+#ifdef HAVE_DPDK
+  if (t == "dpdk") return std::make_shared<DPDKStack>(c, t);
+#endif
+
+  // Unknown name, or a transport that was compiled out.
+  lderr(c) << __func__ << " ms_async_transport_type " << t
+           << " is not supported! " << dendl;
+  ceph_abort();
+  return nullptr;
+}
+
+Worker* NetworkStack::create_worker(CephContext *c, const string &type, unsigned i)
+{
+  // Factory: allocate worker number i of the kind selected by `type`.
+  // The result comes straight from `new`; each known name returns at once.
+  if (type == "posix") return new PosixWorker(c, i);
+#ifdef HAVE_RDMA
+  if (type == "rdma") return new RDMAWorker(c, i);
+#endif
+#ifdef HAVE_DPDK
+  if (type == "dpdk") return new DPDKWorker(c, i);
+#endif
+
+  // Unknown name, or a transport that was compiled out.
+  lderr(c) << __func__ << " ms_async_transport_type " << type
+           << " is not supported! " << dendl;
+  ceph_abort();
+  return nullptr;
+}
+
+NetworkStack::NetworkStack(CephContext *c, const string &t): type(t), started(false), cct(c)
+{
+  ceph_assert(cct->_conf->ms_async_op_threads > 0);
+
+  num_workers = cct->_conf->ms_async_op_threads;  // configured count, capped just below
+  if (num_workers >= EventCenter::MAX_EVENTCENTER) {
+    ldout(cct, 0) << __func__ << " max thread limit is "
+                  << EventCenter::MAX_EVENTCENTER << ", switching to this now. "
+                  << "Higher thread values are unnecessary and currently unsupported."
+                  << dendl;
+    num_workers = EventCenter::MAX_EVENTCENTER;
+  }
+
+  const int initial_event_slots = 5000;  // first argument of every center.init() call
+  for (unsigned idx = 0; idx < num_workers; ++idx) {
+    Worker *worker = create_worker(cct, type, idx);
+    worker->center.init(initial_event_slots, idx, type);
+    workers.push_back(worker);
+  }
+}
+
+void NetworkStack::start()
+{
+  std::unique_lock<decltype(pool_spin)> guard(pool_spin);
+
+  // A stack that is already running has nothing left to spawn.
+  if (started)
+    return;
+
+  for (unsigned idx = 0; idx < num_workers; ++idx) {
+    if (!workers[idx]->is_init()) {
+      // add_thread() builds the thread body; spawn_worker() consumes it.
+      spawn_worker(idx, add_thread(idx));
+    }
+  }
+  started = true;
+  guard.unlock();  // the waits below run without pool_spin held
+
+  for (unsigned idx = 0; idx < num_workers; ++idx)
+    workers[idx]->wait_for_init();
+}
+
+Worker* NetworkStack::get_worker()
+{
+  ldout(cct, 30) << __func__ << dendl;
+
+  // Sentinel load: any worker whose reference count is below it can win.
+  unsigned lowest = std::numeric_limits<int>::max();
+  Worker* pick = nullptr;
+
+  {
+    // Scan under pool_spin for the worker with the fewest references. A shortcut
+    // for references == 0 would fire too rarely to deserve a special case.
+    std::lock_guard<decltype(pool_spin)> guard(pool_spin);
+    for (unsigned idx = 0; idx < num_workers; ++idx) {
+      Worker* candidate = workers[idx];
+      const unsigned load = candidate->references.load();
+      if (load >= lowest)
+        continue;
+      pick = candidate;
+      lowest = load;
+    }
+  }
+  ceph_assert(pick);
+  ++pick->references;
+  return pick;
+}
+
+void NetworkStack::stop()
+{  // Flags every worker as done, wakes its event center and joins it, one worker at a time.
+  std::lock_guard<decltype(pool_spin)> lk(pool_spin);  // held for the whole loop, joins included
+  for (unsigned i = 0; i < num_workers; ++i) {
+    workers[i]->done = true;  // the worker loop in add_thread() tests this flag
+    workers[i]->center.wakeup();  // presumably interrupts a blocked process_events() - TODO confirm
+    join_worker(i);
+  }
+  started = false;  // start() checks this before spawning threads
+}
+
+class C_drain : public EventCallback {
+  Mutex drain_lock;
+  Cond drain_cond;
+  unsigned drain_count;  // callbacks still outstanding
+
+ public:
+  explicit C_drain(size_t c)
+      : drain_lock("C_drain::drain_lock"),
+        drain_count(c) {}
+  // Counts one callback down and signals the waiter on reaching zero.
+  void do_request(uint64_t id) override {
+    Mutex::Locker guard(drain_lock);
+    if (--drain_count == 0) drain_cond.Signal();
+  }
+  // Blocks until drain_count has dropped to zero.
+  void wait() {
+    Mutex::Locker guard(drain_lock);
+    while (drain_count != 0) drain_cond.Wait(drain_lock);
+  }
+};
+
+void NetworkStack::drain()
+{
+  ldout(cct, 30) << __func__ << " started." << dendl;
+  pool_spin.lock();
+  C_drain barrier(num_workers);  // counts down once per worker center
+  for (unsigned idx = 0; idx < num_workers; ++idx) {
+    EventCenter &center = workers[idx]->center;
+    ceph_assert(pthread_self() != center.get_owner());  // the caller must not own this center
+    center.dispatch_event_external(EventCallbackRef(&barrier));
+  }
+  pool_spin.unlock();
+  barrier.wait();  // blocks until the count set above reaches zero
+  ldout(cct, 30) << __func__ << " end." << dendl;
+}
diff --git a/src/msg/async/Stack.h b/src/msg/async/Stack.h
new file mode 100644
index 00000000..a093dadb
--- /dev/null
+++ b/src/msg/async/Stack.h
@@ -0,0 +1,356 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_ASYNC_STACK_H
+#define CEPH_MSG_ASYNC_STACK_H
+
+#include "include/spinlock.h"
+#include "common/perf_counters.h"
+#include "msg/msg_types.h"
+#include "msg/async/Event.h"
+
+class Worker;
+/// Backend interface behind \c ConnectedSocket.  \c ConnectedSocket owns one
+/// instance and forwards each of its member functions to the method of the
+/// same name declared here.
+class ConnectedSocketImpl {
+ public:
+  virtual ~ConnectedSocketImpl() {}
+  virtual int is_connected() = 0;
+  /// Called by ConnectedSocket::read() with a caller buffer and its length.
+  virtual ssize_t read(char*, size_t) = 0;
+  virtual ssize_t zero_copy_read(bufferptr&) = 0;
+  virtual ssize_t send(bufferlist &bl, bool more) = 0;
+  virtual void shutdown() = 0;
+  /// Also invoked from ConnectedSocket's destructor when an implementation
+  /// is still attached.
+  virtual void close() = 0;
+  virtual int fd() const = 0;
+  virtual int socket_fd() const = 0;
+};
+
+class ConnectedSocket;
+/// Per-socket settings handed to Worker::listen(), Worker::connect() and
+/// ServerSocketImpl::accept().  The defaults below are what a
+/// default-constructed instance carries.
+struct SocketOptions {
+  bool nonblock = true;
+  bool nodelay = true;
+  // NOTE(review): presumably a receive-buffer size with 0 meaning "leave
+  // unchanged" -- confirm against the backends that consume it.
+  int rcbuf_size = 0;
+  // NOTE(review): presumably -1 means "do not set a priority" -- confirm.
+  int priority = -1;
+  entity_addr_t connect_bind_addr;
+};
+
+/// \cond internal
+/// Backend interface behind \c ServerSocket.  Stores the address type and
+/// slot it was created for; \c ServerSocket forwards accept(),
+/// abort_accept() and fd() to it and reads \c addr_slot directly.
+class ServerSocketImpl {
+ public:
+  unsigned addr_type; ///< entity_addr_t::TYPE_*
+  unsigned addr_slot; ///< position of our addr in myaddrs().v
+  ServerSocketImpl(unsigned type, unsigned slot)
+    : addr_type(type), addr_slot(slot) {}
+  virtual ~ServerSocketImpl() {}
+  /// Called by ServerSocket::accept() with the same arguments.
+  virtual int accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w) = 0;
+  /// Also invoked from ServerSocket's destructor when an implementation is
+  /// still attached.
+  virtual void abort_accept() = 0;
+  /// Get file descriptor
+  virtual int fd() const = 0;
+};
+/// \endcond
+
+/// \addtogroup networking-module
+/// @{
+
+/// A TCP (or other stream-based protocol) connection.
+///
+/// A \c ConnectedSocket represents a full-duplex stream between
+/// two endpoints, a local endpoint and a remote endpoint.
+class ConnectedSocket {
+  std::unique_ptr<ConnectedSocketImpl> _csi;
+
+ public:
+  /// Constructs a \c ConnectedSocket not corresponding to a connection
+  ConnectedSocket() {}
+  /// \cond internal
+  explicit ConnectedSocket(std::unique_ptr<ConnectedSocketImpl> csi)
+      : _csi(std::move(csi)) {}
+  /// \endcond
+  /// Closes the implementation if one is still attached.
+  ~ConnectedSocket() {
+    if (_csi) {
+      _csi->close();
+    }
+  }
+  /// Moves a \c ConnectedSocket object.
+  ConnectedSocket(ConnectedSocket&& cs) = default;
+  /// Move-assigns a \c ConnectedSocket object.
+  ConnectedSocket& operator=(ConnectedSocket&& cs) = default;
+
+  // Every member below except operator bool dereferences the implementation
+  // without a null check; call them only on a socket for which
+  // operator bool yields true.
+
+  /// Forwards to the implementation's is_connected().
+  int is_connected() {
+    return _csi->is_connected();
+  }
+  /// Reads with copy.
+  ///
+  /// Forwards \c buf and \c len to the implementation's read() and
+  /// returns its result.
+  ssize_t read(char* buf, size_t len) {
+    return _csi->read(buf, len);
+  }
+  /// Reads without copy.
+  ///
+  /// Forwards \c data to the implementation's zero_copy_read() and
+  /// returns its result.
+  ssize_t zero_copy_read(bufferptr &data) {
+    return _csi->zero_copy_read(data);
+  }
+  /// Sends data to the remote endpoint.
+  ///
+  /// Forwards \c bl and \c more to the implementation's send() and
+  /// returns its result.
+  ssize_t send(bufferlist &bl, bool more) {
+    return _csi->send(bl, more);
+  }
+  /// Forwards to the implementation's shutdown(); the socket keeps its
+  /// implementation afterwards.
+  void shutdown() {
+    _csi->shutdown();
+  }
+  /// Closes the implementation and releases it.
+  ///
+  /// Afterwards this object no longer corresponds to a connection and
+  /// operator bool yields false.
+  void close() {
+    _csi->close();
+    _csi.reset();
+  }
+
+  /// Get file descriptor
+  int fd() const {
+    return _csi->fd();
+  }
+  /// Forwards to the implementation's socket_fd().
+  int socket_fd() const {
+    return _csi->socket_fd();
+  }
+
+  /// True while an implementation is attached.
+  explicit operator bool() const {
+    return _csi != nullptr;
+  }
+};
+/// @}
+
+/// \addtogroup networking-module
+/// @{
+
+/// A listening socket, waiting to accept incoming network connections.
+class ServerSocket {
+  std::unique_ptr<ServerSocketImpl> _ssi;
+ public:
+  /// Constructs a \c ServerSocket not corresponding to a connection
+  ServerSocket() {}
+  /// \cond internal
+  explicit ServerSocket(std::unique_ptr<ServerSocketImpl> ssi)
+      : _ssi(std::move(ssi)) {}
+  /// Aborts accepting on the implementation if one is still attached.
+  ~ServerSocket() {
+    if (_ssi) {
+      _ssi->abort_accept();
+    }
+  }
+  /// \endcond
+  /// Moves a \c ServerSocket object.
+  ServerSocket(ServerSocket&& ss) = default;
+  /// Move-assigns a \c ServerSocket object.
+  ServerSocket& operator=(ServerSocket&& cs) = default;
+
+  // Every member below except operator bool dereferences the implementation
+  // without a null check; call them only on a socket for which
+  // operator bool yields true.
+
+  /// Accepts the next connection to successfully connect to this socket.
+  ///
+  /// Forwards all arguments to the implementation's accept() and returns
+  /// its result: \c sock receives the \ref ConnectedSocket and \c out the
+  /// \ref entity_addr_t of the remote endpoint.
+  int accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w) {
+    return _ssi->accept(sock, opt, out, w);
+  }
+
+  /// Stops any \ref accept() in progress.
+  ///
+  /// Calls the implementation's abort_accept() and then releases the
+  /// implementation, so operator bool yields false afterwards.
+  void abort_accept() {
+    _ssi->abort_accept();
+    _ssi.reset();
+  }
+
+  /// Get file descriptor
+  int fd() const {
+    return _ssi->fd();
+  }
+
+  /// Slot of the listen/bind address (the implementation's \c addr_slot).
+  unsigned get_addr_slot() {
+    return _ssi->addr_slot;
+  }
+
+  /// True while an implementation is attached.
+  explicit operator bool() const {
+    return _ssi != nullptr;
+  }
+};
+/// @}
+
+class NetworkStack;
+
+// Perf counter indices for the per-Worker counter set.  l_msgr_first and
+// l_msgr_last are the exclusive bounds passed to PerfCountersBuilder in
+// Worker's constructor; everything in between is registered there.
+enum {
+  l_msgr_first = 94000,
+  // u64 counters
+  l_msgr_recv_messages,
+  l_msgr_send_messages,
+  l_msgr_recv_bytes,
+  l_msgr_send_bytes,
+  l_msgr_created_connections,
+  l_msgr_active_connections,
+
+  // time counters
+  l_msgr_running_total_time,
+  l_msgr_running_send_time,
+  l_msgr_running_recv_time,
+  l_msgr_running_fast_dispatch_time,
+
+  l_msgr_last,
+};
+
+/// One messenger worker: owns an EventCenter and a perf-counter set, and
+/// carries a reference count that NetworkStack uses to pick the least
+/// loaded worker.  Backends derive from it and implement listen()/connect().
+class Worker {
+  std::mutex init_lock;
+  std::condition_variable init_cond;
+  bool init = false;  // guarded by init_lock
+
+ public:
+  bool done = false;
+
+  CephContext *cct;
+  PerfCounters *perf_logger;
+  unsigned id;
+
+  std::atomic_uint references;
+  EventCenter center;
+
+  Worker(const Worker&) = delete;
+  Worker& operator=(const Worker&) = delete;
+
+  /// Builds the perf-counter set "AsyncMessenger::Worker-<i>" and registers
+  /// it with the context's perf-counter collection.
+  Worker(CephContext *c, unsigned i)
+    : cct(c), perf_logger(nullptr), id(i), references(0), center(c) {
+    char name[128];
+    // bounded formatting: cannot write past the end of name
+    snprintf(name, sizeof(name), "AsyncMessenger::Worker-%u", id);
+    // initialize perf_logger
+    PerfCountersBuilder plb(cct, name, l_msgr_first, l_msgr_last);
+
+    plb.add_u64_counter(l_msgr_recv_messages, "msgr_recv_messages", "Network received messages");
+    plb.add_u64_counter(l_msgr_send_messages, "msgr_send_messages", "Network sent messages");
+    plb.add_u64_counter(l_msgr_recv_bytes, "msgr_recv_bytes", "Network received bytes", nullptr, 0, unit_t(UNIT_BYTES));
+    plb.add_u64_counter(l_msgr_send_bytes, "msgr_send_bytes", "Network sent bytes", nullptr, 0, unit_t(UNIT_BYTES));
+    plb.add_u64_counter(l_msgr_active_connections, "msgr_active_connections", "Active connection number");
+    plb.add_u64_counter(l_msgr_created_connections, "msgr_created_connections", "Created connection number");
+
+    plb.add_time(l_msgr_running_total_time, "msgr_running_total_time", "The total time of thread running");
+    plb.add_time(l_msgr_running_send_time, "msgr_running_send_time", "The total time of message sending");
+    plb.add_time(l_msgr_running_recv_time, "msgr_running_recv_time", "The total time of message receiving");
+    plb.add_time(l_msgr_running_fast_dispatch_time, "msgr_running_fast_dispatch_time", "The total time of fast dispatch");
+
+    perf_logger = plb.create_perf_counters();
+    cct->get_perfcounters_collection()->add(perf_logger);
+  }
+  /// Unregisters and frees the perf-counter set.
+  virtual ~Worker() {
+    if (perf_logger) {
+      cct->get_perfcounters_collection()->remove(perf_logger);
+      delete perf_logger;
+    }
+  }
+
+  virtual int listen(entity_addr_t &addr, unsigned addr_slot,
+                     const SocketOptions &opts, ServerSocket *) = 0;
+  virtual int connect(const entity_addr_t &addr,
+                      const SocketOptions &opts, ConnectedSocket *socket) = 0;
+  virtual void destroy() {}
+
+  virtual void initialize() {}
+  PerfCounters *get_perf_counter() { return perf_logger; }
+  /// Drops one reference; asserts that at least one was held.
+  void release_worker() {
+    int oldref = references.fetch_sub(1);
+    ceph_assert(oldref > 0);
+  }
+  /// Marks the worker initialized and wakes every wait_for_init() caller.
+  void init_done() {
+    std::lock_guard<std::mutex> l(init_lock);
+    init = true;
+    init_cond.notify_all();
+  }
+  bool is_init() {
+    std::lock_guard<std::mutex> l(init_lock);
+    return init;
+  }
+  /// Blocks until init_done() has been called.
+  void wait_for_init() {
+    std::unique_lock<std::mutex> l(init_lock);
+    while (!init)
+      init_cond.wait(l);
+  }
+  /// Clears both the init flag (under init_lock) and the done flag (outside
+  /// the lock, as before).
+  void reset() {
+    {
+      std::lock_guard<std::mutex> l(init_lock);
+      init = false;
+      init_cond.notify_all();
+    }
+    done = false;
+  }
+};
+
+/// Owns the pool of Worker objects for one messenger backend and hands
+/// them out.  Backends derive from it and provide the thread handling in
+/// spawn_worker()/join_worker().
+class NetworkStack {
+  std::string type;
+  unsigned num_workers = 0;
+  ceph::spinlock pool_spin;
+  bool started = false;
+
+  std::function<void ()> add_thread(unsigned i);
+
+ protected:
+  CephContext *cct;
+  std::vector<Worker*> workers;
+
+  explicit NetworkStack(CephContext *c, const std::string &t);
+ public:
+  NetworkStack(const NetworkStack &) = delete;
+  NetworkStack& operator=(const NetworkStack &) = delete;
+  /// Deletes every worker held in the pool.
+  virtual ~NetworkStack() {
+    for (Worker *w : workers) {
+      delete w;
+    }
+  }
+
+  static std::shared_ptr<NetworkStack> create(
+          CephContext *c, const std::string &type);
+
+  static Worker* create_worker(
+          CephContext *c, const std::string &t, unsigned i);
+  // A backend overrides this when it supports zero-copy reads.
+  virtual bool support_zero_copy_read() const { return false; }
+  // A backend overrides this when it does not share one listen table
+  // between threads.
+  // For example, the posix backend has a global listen table in the
+  // kernel: once one thread binds a port, the other threads see it too.
+  // The dpdk backend keeps a listen table per thread, so every thread has
+  // to bind the port itself.
+  virtual bool support_local_listen_table() const { return false; }
+  virtual bool nonblock_connect_need_writable_event() const { return true; }
+
+  void start();
+  void stop();
+  virtual Worker *get_worker();
+  /// Returns the worker at index \c i; the index is not range-checked.
+  Worker *get_worker(unsigned i) {
+    return workers[i];
+  }
+  void drain();
+  unsigned get_num_worker() const {
+    return num_workers;
+  }
+
+  // direct is used in tests only
+  virtual void spawn_worker(unsigned i, std::function<void ()> &&) = 0;
+  virtual void join_worker(unsigned i) = 0;
+
+  virtual bool is_ready() { return true; }
+  virtual void ready() { }
+};
+
+#endif //CEPH_MSG_ASYNC_STACK_H
diff --git a/src/msg/async/crypto_onwire.cc b/src/msg/async/crypto_onwire.cc
new file mode 100644
index 00000000..4e423406
--- /dev/null
+++ b/src/msg/async/crypto_onwire.cc
@@ -0,0 +1,311 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <array>
+#include <openssl/evp.h>
+
+#include "crypto_onwire.h"
+
+#include "common/debug.h"
+#include "common/ceph_crypto.h"
+#include "include/types.h"
+
+#define dout_subsys ceph_subsys_ms
+
+namespace ceph::crypto::onwire {
+
+static constexpr const std::size_t AESGCM_KEY_LEN{16};
+static constexpr const std::size_t AESGCM_IV_LEN{12};
+static constexpr const std::size_t AESGCM_TAG_LEN{16};
+static constexpr const std::size_t AESGCM_BLOCK_LEN{16};
+
+// 12-byte AES-GCM IV, laid out as a packed 32-bit field followed by a
+// packed 64-bit field (both little-endian wrapper types).
+struct nonce_t {
+  ceph_le32 fixed;
+  ceph_le64 counter;
+
+  // Bytewise comparison over the whole packed struct.
+  bool operator==(const nonce_t& rhs) const {
+    return memcmp(this, &rhs, sizeof(nonce_t)) == 0;
+  }
+} __attribute__((packed));
+static_assert(sizeof(nonce_t) == AESGCM_IV_LEN);
+
+using key_t = std::array<std::uint8_t, AESGCM_KEY_LEN>;
+
+// http://www.mindspring.com/~dmcgrew/gcm-nist-6.pdf
+// https://www.openssl.org/docs/man1.0.2/crypto/EVP_aes_128_gcm.html#GCM-mode
+// https://wiki.openssl.org/index.php/EVP_Authenticated_Encryption_and_Decryption
+// https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pdf
+// AES-128-GCM transmit side.  Ciphertext for one message is accumulated in
+// an internal bufferlist and handed out, with the tag appended, by
+// authenticated_encrypt_final().
+class AES128GCM_OnWireTxHandler : public ceph::crypto::onwire::TxHandler {
+  CephContext* const cct;
+  std::unique_ptr<EVP_CIPHER_CTX, decltype(&::EVP_CIPHER_CTX_free)> ectx;
+  ceph::bufferlist buffer;
+  nonce_t nonce;
+  nonce_t initial_nonce;
+  bool used_initial_nonce;
+  bool new_nonce_format;  // 64-bit counter?
+  static_assert(sizeof(nonce) == AESGCM_IV_LEN);
+
+public:
+  // Sets the cipher context up for AES-128-GCM encryption with `key`.
+  // Throws std::runtime_error when OpenSSL rejects either init step.
+  AES128GCM_OnWireTxHandler(CephContext* const cct,
+                            const key_t& key,
+                            const nonce_t& nonce,
+                            bool new_nonce_format)
+    : cct(cct),
+      ectx(EVP_CIPHER_CTX_new(), EVP_CIPHER_CTX_free),
+      nonce(nonce),
+      initial_nonce(nonce),
+      used_initial_nonce(false),
+      new_nonce_format(new_nonce_format) {
+    ceph_assert_always(ectx);
+    ceph_assert_always(key.size() * CHAR_BIT == 128);
+
+    // The first call selects the cipher, the second installs the key.
+    // No IV is passed here; reset_tx_handler() supplies it per message.
+    EVP_CIPHER_CTX* const ctx = ectx.get();
+    if (EVP_EncryptInit_ex(ctx, EVP_aes_128_gcm(),
+                           nullptr, nullptr, nullptr) != 1) {
+      throw std::runtime_error("EVP_EncryptInit_ex failed");
+    }
+    if (EVP_EncryptInit_ex(ctx, nullptr, nullptr,
+                           key.data(), nullptr) != 1) {
+      throw std::runtime_error("EVP_EncryptInit_ex failed");
+    }
+  }
+
+  // Wipes both nonce copies before the object goes away.
+  ~AES128GCM_OnWireTxHandler() override {
+    ::ceph::crypto::zeroize_for_security(&initial_nonce, sizeof(initial_nonce));
+    ::ceph::crypto::zeroize_for_security(&nonce, sizeof(nonce));
+  }
+
+  void reset_tx_handler(const uint32_t* first, const uint32_t* last) override;
+
+  void authenticated_encrypt_update(const ceph::bufferlist& plaintext) override;
+  ceph::bufferlist authenticated_encrypt_final() override;
+};
+
+// Start a new message: install the current nonce as IV, reserve room for
+// the announced fragment sizes plus the tag, then advance the nonce.
+// [first, last) lists the sizes of the upcoming update calls.
+// Throws TxHandlerError once the nonce has come back to its initial value.
+void AES128GCM_OnWireTxHandler::reset_tx_handler(const uint32_t* first,
+                                                 const uint32_t* last)
+{
+  // The initial nonce may be used exactly once; seeing it a second time
+  // means the counter has wrapped around.
+  const bool at_initial_nonce = (nonce == initial_nonce);
+  if (at_initial_nonce && used_initial_nonce) {
+    throw ceph::crypto::onwire::TxHandlerError("out of nonces");
+  }
+  if (at_initial_nonce) {
+    used_initial_nonce = true;
+  }
+
+  const auto* const iv = reinterpret_cast<const unsigned char*>(&nonce);
+  if (EVP_EncryptInit_ex(ectx.get(), nullptr, nullptr, nullptr, iv) != 1) {
+    throw std::runtime_error("EVP_EncryptInit_ex failed");
+  }
+
+  ceph_assert(buffer.get_append_buffer_unused_tail_length() == 0);
+  // sum of all fragment sizes, starting from the tag length
+  const auto ciphertext_len = std::accumulate(first, last, AESGCM_TAG_LEN);
+  buffer.reserve(ciphertext_len);
+
+  if (new_nonce_format) {
+    nonce.counter = nonce.counter + 1;
+  } else {
+    // msgr2.0: 32-bit counter followed by 64-bit fixed field,
+    // susceptible to overflow!
+    nonce.fixed = nonce.fixed + 1;
+  }
+}
+
+// Encrypt every segment of `plaintext` into the space reserved by
+// reset_tx_handler().  Throws std::runtime_error if OpenSSL fails.
+void AES128GCM_OnWireTxHandler::authenticated_encrypt_update(
+  const ceph::bufferlist& plaintext)
+{
+  ceph_assert(buffer.get_append_buffer_unused_tail_length() >=
+              plaintext.length());
+  // one hole for the whole input; filled segment by segment below
+  auto filler = buffer.append_hole(plaintext.length());
+
+  for (const auto& plainbuf : plaintext.buffers()) {
+    auto* const dst = reinterpret_cast<unsigned char*>(filler.c_str());
+    const auto* const src =
+      reinterpret_cast<const unsigned char*>(plainbuf.c_str());
+    int update_len = 0;
+
+    if (EVP_EncryptUpdate(ectx.get(), dst, &update_len,
+                          src, plainbuf.length()) != 1) {
+      throw std::runtime_error("EVP_EncryptUpdate failed");
+    }
+    // output length must match input length exactly
+    ceph_assert_always(update_len >= 0);
+    ceph_assert(static_cast<unsigned>(update_len) == plainbuf.length());
+    filler.advance(update_len);
+  }
+
+  ldout(cct, 15) << __func__
+                 << " plaintext.length()=" << plaintext.length()
+                 << " buffer.length()=" << buffer.length()
+                 << dendl;
+}
+
+// Finish the current message: finalize the cipher, append the GCM tag to
+// the accumulated ciphertext and move the whole buffer out to the caller.
+// Throws std::runtime_error if OpenSSL fails.
+ceph::bufferlist AES128GCM_OnWireTxHandler::authenticated_encrypt_final()
+{
+  // exactly the tag-sized tail reserved in reset_tx_handler() must be left
+  ceph_assert(buffer.get_append_buffer_unused_tail_length() ==
+              AESGCM_BLOCK_LEN);
+  auto filler = buffer.append_hole(AESGCM_BLOCK_LEN);
+
+  int final_len = 0;
+  auto* const tail = reinterpret_cast<unsigned char*>(filler.c_str());
+  if (EVP_EncryptFinal_ex(ectx.get(), tail, &final_len) != 1) {
+    throw std::runtime_error("EVP_EncryptFinal_ex failed");
+  }
+  // the final step is expected to emit no ciphertext of its own
+  ceph_assert_always(final_len == 0);
+
+  // the hole is one block long, which has to equal the tag length
+  static_assert(AESGCM_BLOCK_LEN == AESGCM_TAG_LEN);
+  if (EVP_CIPHER_CTX_ctrl(ectx.get(), EVP_CTRL_GCM_GET_TAG,
+                          AESGCM_TAG_LEN, filler.c_str()) != 1) {
+    throw std::runtime_error("EVP_CIPHER_CTX_ctrl failed");
+  }
+
+  ldout(cct, 15) << __func__
+                 << " buffer.length()=" << buffer.length()
+                 << " final_len=" << final_len
+                 << dendl;
+  // buffer is a member, so it has to be moved out explicitly
+  return std::move(buffer);
+}
+
+// RX PART
+// AES-128-GCM receive side.  Decrypts in place and verifies the tag in
+// authenticated_decrypt_update_final().
+class AES128GCM_OnWireRxHandler : public ceph::crypto::onwire::RxHandler {
+  CephContext* const cct;
+  std::unique_ptr<EVP_CIPHER_CTX, decltype(&::EVP_CIPHER_CTX_free)> ectx;
+  nonce_t nonce;
+  bool new_nonce_format;  // 64-bit counter?
+  static_assert(sizeof(nonce) == AESGCM_IV_LEN);
+
+public:
+  // Sets the cipher context up for AES-128-GCM decryption with `key`.
+  // Throws std::runtime_error when OpenSSL rejects either init step.
+  AES128GCM_OnWireRxHandler(CephContext* const cct,
+                            const key_t& key,
+                            const nonce_t& nonce,
+                            bool new_nonce_format)
+    : cct(cct),
+      ectx(EVP_CIPHER_CTX_new(), EVP_CIPHER_CTX_free),
+      nonce(nonce),
+      new_nonce_format(new_nonce_format) {
+    ceph_assert_always(ectx);
+    ceph_assert_always(key.size() * CHAR_BIT == 128);
+
+    // The first call selects the cipher, the second installs the key.
+    // No IV is passed here; reset_rx_handler() supplies it per message.
+    EVP_CIPHER_CTX* const ctx = ectx.get();
+    if (EVP_DecryptInit_ex(ctx, EVP_aes_128_gcm(),
+                           nullptr, nullptr, nullptr) != 1) {
+      throw std::runtime_error("EVP_DecryptInit_ex failed");
+    }
+    if (EVP_DecryptInit_ex(ctx, nullptr, nullptr,
+                           key.data(), nullptr) != 1) {
+      throw std::runtime_error("EVP_DecryptInit_ex failed");
+    }
+  }
+
+  // Wipes the nonce before the object goes away.
+  ~AES128GCM_OnWireRxHandler() override {
+    ::ceph::crypto::zeroize_for_security(&nonce, sizeof(nonce));
+  }
+
+  // The transmitter appends exactly one tag at the final step.
+  std::uint32_t get_extra_size_at_final() override {
+    return AESGCM_TAG_LEN;
+  }
+  void reset_rx_handler() override;
+  void authenticated_decrypt_update(ceph::bufferlist& bl) override;
+  void authenticated_decrypt_update_final(ceph::bufferlist& bl) override;
+};
+
+// Start a new message: install the current nonce as IV, then advance the
+// nonce for the next message.  Throws std::runtime_error if OpenSSL fails.
+void AES128GCM_OnWireRxHandler::reset_rx_handler()
+{
+  const auto* const iv = reinterpret_cast<const unsigned char*>(&nonce);
+  if (EVP_DecryptInit_ex(ectx.get(), nullptr, nullptr, nullptr, iv) != 1) {
+    throw std::runtime_error("EVP_DecryptInit_ex failed");
+  }
+
+  if (new_nonce_format) {
+    nonce.counter = nonce.counter + 1;
+  } else {
+    // msgr2.0: 32-bit counter followed by 64-bit fixed field,
+    // susceptible to overflow!
+    nonce.fixed = nonce.fixed + 1;
+  }
+}
+
+// Decrypt every segment of `bl` in place (input and output pointers are
+// the same).  Throws std::runtime_error if OpenSSL fails.
+void AES128GCM_OnWireRxHandler::authenticated_decrypt_update(
+  ceph::bufferlist& bl)
+{
+  // discard cached crcs as we will be writing through c_str()
+  bl.invalidate_crc();
+  for (auto& buf : bl.buffers()) {
+    // c_str() is const here; the cast gives us the writable view needed
+    // for the in-place update
+    char* const raw = const_cast<char*>(buf.c_str());
+    auto* const data = reinterpret_cast<unsigned char*>(raw);
+    int update_len = 0;
+
+    if (EVP_DecryptUpdate(ectx.get(), data, &update_len,
+                          data, buf.length()) != 1) {
+      throw std::runtime_error("EVP_DecryptUpdate failed");
+    }
+    // output length must match input length exactly
+    ceph_assert_always(update_len >= 0);
+    ceph_assert(static_cast<unsigned>(update_len) == buf.length());
+  }
+}
+
+// Decrypt the last portion of a message and verify its tag.  On return
+// `bl` holds only plaintext; the trailing tag has been removed.
+// Throws MsgAuthError when verification fails and std::runtime_error when
+// the tag cannot be installed.
+void AES128GCM_OnWireRxHandler::authenticated_decrypt_update_final(
+  ceph::bufferlist& bl)
+{
+  unsigned orig_len = bl.length();
+  ceph_assert(orig_len >= AESGCM_TAG_LEN);
+
+  // The caller has to supply at least the tag, but may put trailing
+  // ciphertext in front of it.  Split the tag off and run any remaining
+  // ciphertext through the regular update step, so this call combines
+  // update and final.
+  ceph::bufferlist auth_tag;
+  bl.splice(orig_len - AESGCM_TAG_LEN, AESGCM_TAG_LEN, &auth_tag);
+  if (bl.length() != 0) {
+    authenticated_decrypt_update(bl);
+  }
+
+  // we need to ensure the tag is stored in continuous memory.
+  if (EVP_CIPHER_CTX_ctrl(ectx.get(), EVP_CTRL_GCM_SET_TAG,
+                          AESGCM_TAG_LEN, auth_tag.c_str()) != 1) {
+    throw std::runtime_error("EVP_CIPHER_CTX_ctrl failed");
+  }
+
+  // The final step is expected to produce no plaintext; it is called
+  // solely to authenticate the message.
+  int final_len = 0;
+  if (EVP_DecryptFinal_ex(ectx.get(), nullptr, &final_len) <= 0) {
+    throw MsgAuthError();
+  }
+  ceph_assert_always(final_len == 0);
+  ceph_assert(bl.length() + AESGCM_TAG_LEN == orig_len);
+}
+
+// Build the rx/tx handler pair for a connection.
+//
+// In secure mode the connection secret is read as: AES key, then one
+// nonce, then a second nonce.  `crossed` swaps which nonce goes to which
+// direction.  In any other mode both handlers are null.
+//
+// Throws whatever the handler constructors throw (std::runtime_error on
+// OpenSSL init failure).
+ceph::crypto::onwire::rxtx_t ceph::crypto::onwire::rxtx_t::create_handler_pair(
+  CephContext* cct,
+  const AuthConnectionMeta& auth_meta,
+  bool new_nonce_format,
+  bool crossed)
+{
+  if (!auth_meta.is_mode_secure()) {
+    return { nullptr, nullptr };
+  }
+
+  ceph_assert_always(auth_meta.connection_secret.length() >=
+    sizeof(key_t) + 2 * sizeof(nonce_t));
+
+  // Wipes a stack copy of secret material on every exit path, including a
+  // throwing handler constructor.
+  struct wipe_on_exit {
+    void* ptr;
+    std::size_t len;
+    ~wipe_on_exit() {
+      ::ceph::crypto::zeroize_for_security(ptr, len);
+    }
+  };
+
+  key_t key;
+  nonce_t rx_nonce;
+  nonce_t tx_nonce;
+  const wipe_on_exit wipe_key{key.data(), sizeof(key)};
+  const wipe_on_exit wipe_rx_nonce{&rx_nonce, sizeof(rx_nonce)};
+  const wipe_on_exit wipe_tx_nonce{&tx_nonce, sizeof(tx_nonce)};
+
+  // carve the three fields out of the secret, in order
+  const char* secbuf = auth_meta.connection_secret.c_str();
+  ::memcpy(key.data(), secbuf, sizeof(key));
+  secbuf += sizeof(key);
+  ::memcpy(&rx_nonce, secbuf, sizeof(rx_nonce));
+  secbuf += sizeof(rx_nonce);
+  ::memcpy(&tx_nonce, secbuf, sizeof(tx_nonce));
+
+  // The handlers copy what they need; the return value is built before
+  // the guards above run.
+  return {
+    std::make_unique<AES128GCM_OnWireRxHandler>(
+      cct, key, crossed ? tx_nonce : rx_nonce, new_nonce_format),
+    std::make_unique<AES128GCM_OnWireTxHandler>(
+      cct, key, crossed ? rx_nonce : tx_nonce, new_nonce_format)
+  };
+}
+
+} // namespace ceph::crypto::onwire
diff --git a/src/msg/async/crypto_onwire.h b/src/msg/async/crypto_onwire.h
new file mode 100644
index 00000000..55f75508
--- /dev/null
+++ b/src/msg/async/crypto_onwire.h
@@ -0,0 +1,130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2009 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_CRYPTO_ONWIRE_H
+#define CEPH_CRYPTO_ONWIRE_H
+
+#include <cstdint>
+#include <memory>
+
+#include "auth/Auth.h"
+#include "include/buffer.h"
+
+namespace ceph::math {
+
+// TODO
+// Placeholder wrapper that holds a single T and forwards constructor
+// arguments to it.  Nothing in this header uses it yet.
+// NOTE(review): the constructor sits in the class's default (private)
+// section, so as written the type cannot be constructed from outside.
+template <typename T>
+class always_aligned_t {
+  T val;
+
+  template <class... Args>
+  always_aligned_t(Args&&... args)
+    : val(std::forward<Args>(args)...) {
+  }
+};
+
+} // namespace ceph::math
+
+namespace ceph::crypto::onwire {
+
+// Exception carrying the fixed message "message signature mismatch".
+struct MsgAuthError : public std::runtime_error {
+  MsgAuthError()
+    : runtime_error("message signature mismatch") {
+  }
+};
+
+// Exception for transmit-side failures; what() is "tx handler error: "
+// followed by the text passed in.
+struct TxHandlerError : public std::runtime_error {
+  TxHandlerError(const char* what)
+    : std::runtime_error(std::string("tx handler error: ") + what) {}
+};
+
+// Transmit-side interface of the on-wire crypto: reset, then any number of
+// update calls, then final.
+struct TxHandler {
+  virtual ~TxHandler() = default;
+
+  // An instance of TxHandler must be reset before doing any encrypt-update
+  // step. This also applies when encrypt-final was already called and
+  // another round of update-...-update-final is about to take place.
+  //
+  // The input range [first, last) tells the implementation how the -update
+  // sequence is fragmented and lets it make a conscious decision about
+  // allocating or reusing memory. One implementation could encrypt
+  // in place while another might prefer one huge output buffer.
+  //
+  // It's undefined what will happen if the client doesn't follow the order.
+  //
+  // TODO: switch to always_aligned_t
+  virtual void reset_tx_handler(const uint32_t* first,
+                                const uint32_t* last) = 0;
+
+  // Convenience overload: forwards the list as a pointer range, or as
+  // (nullptr, nullptr) when the list is empty.
+  void reset_tx_handler(std::initializer_list<uint32_t> update_size_sequence) {
+    if (update_size_sequence.size() > 0) {
+      const uint32_t* first = &*update_size_sequence.begin();
+      reset_tx_handler(first, first + update_size_sequence.size());
+    } else {
+      reset_tx_handler(nullptr, nullptr);
+    }
+  }
+
+  // Perform encryption. The method MUST NOT be called after _final() if
+  // there was no call to _reset().
+  // NOTE(review): the comment used to say the client gives up ownership of
+  // the bufferlist, but the parameter is a const reference -- confirm the
+  // intended contract.
+  virtual void authenticated_encrypt_update(
+    const ceph::bufferlist& plaintext) = 0;
+
+  // Generates the authentication signature and returns the bufferlist
+  // built from the plaintext of the preceding _update() calls.
+  virtual ceph::bufferlist authenticated_encrypt_final() = 0;
+};
+
+// Receive-side interface of the on-wire crypto: reset, then any number of
+// update calls, then update_final.
+class RxHandler {
+public:
+  virtual ~RxHandler() = default;
+
+  // The transmitter can append extra bytes of ciphertext at the -final step.
+  // This method returns how much was added, and thus lets the client
+  // translate plaintext size into the ciphertext size to grab from the wire.
+  virtual std::uint32_t get_extra_size_at_final() = 0;
+
+  // An instance of RxHandler must be reset before doing any decrypt-update
+  // step. This also applies when decrypt-final was already called and
+  // another round of update-...-update-final is about to take place.
+  virtual void reset_rx_handler() = 0;
+
+  // Perform decryption. The ciphertext must ALWAYS be aligned to 16 bytes.
+  virtual void authenticated_decrypt_update(ceph::bufferlist& bl) = 0;
+
+  // Perform decryption of the last portion of ciphertext and verify the
+  // signature for the overall decryption sequence.
+  // Throws when the integrity/authenticity check fails.
+  virtual void authenticated_decrypt_update_final(ceph::bufferlist& bl) = 0;
+};
+
+// Owning pair of receive and transmit handlers for one connection.
+struct rxtx_t {
+  //rxtx_t(rxtx_t&& r) : rx(std::move(rx)), tx(std::move(tx)) {}
+  // Each peer can use different handlers.
+  // Hmm, isn't that too much flexibility?
+  std::unique_ptr<RxHandler> rx;
+  std::unique_ptr<TxHandler> tx;
+
+  // Factory for the pair; defined in crypto_onwire.cc.
+  static rxtx_t create_handler_pair(
+    CephContext* ctx,
+    const class AuthConnectionMeta& auth_meta,
+    bool new_nonce_format,
+    bool crossed);
+};
+
+} // namespace ceph::crypto::onwire
+
+#endif // CEPH_CRYPTO_ONWIRE_H
diff --git a/src/msg/async/dpdk/ARP.cc b/src/msg/async/dpdk/ARP.cc
new file mode 100644
index 00000000..dedc9e3c
--- /dev/null
+++ b/src/msg/async/dpdk/ARP.cc
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include "ARP.h"
+
+// Remember the owning arp instance and protocol number, and register this
+// handler with the arp instance under that number.
+arp_for_protocol::arp_for_protocol(arp& a, uint16_t proto_num)
+    : _arp(a), _proto_num(proto_num)
+{
+  _arp.add(proto_num, this);
+}
+
+// Unregister this handler's protocol number from the owning arp instance.
+arp_for_protocol::~arp_for_protocol()
+{
+  _arp.del(_proto_num);
+}
+
+// Wire the ARP layer to an interface: _proto is built for the ARP
+// ethertype with get_packet() as its packet source, and _rx_packets holds
+// the subscription whose callbacks route into process_packet() and
+// forward().
+arp::arp(interface* netif):
+    _netif(netif),
+    _proto(netif, eth_protocol_num::arp, [this] { return get_packet(); }),
+    _rx_packets(
+        _proto.receive(
+            [this] (Packet p, ethernet_address ea) {
+              return process_packet(std::move(p), ea);
+            },
+            [this](forward_hash& out_hash_data, Packet& p, size_t off) {
+              return forward(out_hash_data, p, off);
+            }
+        )
+    )
+{}
+
+// Pop the oldest queued outgoing packet, or return an empty Tub when the
+// queue has nothing in it.
+Tub<l3_protocol::l3packet> arp::get_packet()
+{
+  Tub<l3_protocol::l3packet> next;
+  if (_packetq.empty()) {
+    return next;
+  }
+  next = std::move(_packetq.front());
+  _packetq.pop_front();
+  return next;
+}
+
+// Delegate the forwarding decision to the handler registered for the
+// packet's protocol type; returns false when no handler is registered.
+// NOTE(review): the header pointer is dereferenced without a null check --
+// confirm get_header() cannot return null for a short packet.
+bool arp::forward(forward_hash& out_hash_data, Packet& p, size_t off)
+{
+  auto hdr = p.get_header<arp_hdr>(off);
+  auto handler = _arp_for_protocol.find(ntoh(hdr->ptype));
+  if (handler == _arp_for_protocol.end()) {
+    return false;
+  }
+  return handler->second->forward(out_hash_data, p, off);
+}
+
+// Register (or replace) the handler for a protocol number.
+void arp::add(uint16_t proto_num, arp_for_protocol* afp)
+{
+  _arp_for_protocol[proto_num] = afp;
+}
+
+// Remove the handler registered for a protocol number, if any.
+void arp::del(uint16_t proto_num)
+{
+  _arp_for_protocol.erase(proto_num);
+}
+
+// Hand a received ARP packet to the handler registered for its protocol
+// type; packets without a registered handler are dropped.  Always
+// returns 0, and `from` is not used.
+// NOTE(review): the header pointer is dereferenced without a null check --
+// confirm get_header() cannot return null for a short packet.
+int arp::process_packet(Packet p, ethernet_address from)
+{
+  auto hdr = p.get_header<arp_hdr>()->ntoh();
+  auto handler = _arp_for_protocol.find(hdr.ptype);
+  if (handler == _arp_for_protocol.end()) {
+    return 0;
+  }
+  handler->second->received(std::move(p));
+  return 0;
+}
diff --git a/src/msg/async/dpdk/ARP.h b/src/msg/async/dpdk/ARP.h
new file mode 100644
index 00000000..54569564
--- /dev/null
+++ b/src/msg/async/dpdk/ARP.h
@@ -0,0 +1,301 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+
+#ifndef CEPH_MSG_ARP_H_
+#define CEPH_MSG_ARP_H_
+
+#include <errno.h>
+
+#include <unordered_map>
+#include <functional>
+
+#include "msg/async/Event.h"
+
+#include "ethernet.h"
+#include "circular_buffer.h"
+#include "ip_types.h"
+#include "net.h"
+#include "Packet.h"
+
+class arp;
+template <typename L3>
+class arp_for;
+
+class arp_for_protocol {  // base class of per-protocol ARP handlers kept in arp's protocol-number map
+ protected:
+  arp& _arp;  // the arp instance this handler is registered with
+  uint16_t _proto_num;  // key passed to arp::add()/arp::del()
+ public:
+  arp_for_protocol(arp& a, uint16_t proto_num);
+  virtual ~arp_for_protocol();
+  virtual int received(Packet p) = 0;  // invoked by arp::process_packet() for frames whose ptype matches
+  virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) { return false; }  // default: arguments ignored, returns false
+};
+
+class interface;
+
+class arp {  // per-interface ARP front end: dispatches frames to arp_for_protocol handlers by ptype
+  interface* _netif;  // interface supplied at construction; used for hw_address()
+  l3_protocol _proto;
+  subscription<Packet, ethernet_address> _rx_packets;  // result of _proto.receive(), held for the lifetime of this object
+  std::unordered_map<uint16_t, arp_for_protocol*> _arp_for_protocol;  // protocol number -> handler (raw, non-owning pointers)
+  circular_buffer<l3_protocol::l3packet> _packetq;  // outgoing frames, filled by arp_for::send(), drained by get_packet()
+ private:
+  struct arp_hdr {  // only the leading two header fields, enough to pick a handler
+    uint16_t htype;
+    uint16_t ptype;
+    arp_hdr ntoh() {  // copy with both fields passed through ::ntoh
+      arp_hdr hdr = *this;
+      hdr.htype = ::ntoh(htype);
+      hdr.ptype = ::ntoh(ptype);
+      return hdr;
+    }
+    arp_hdr hton() {  // copy with both fields passed through ::hton
+      arp_hdr hdr = *this;
+      hdr.htype = ::hton(htype);
+      hdr.ptype = ::hton(ptype);
+      return hdr;
+    }
+  };
+ public:
+  explicit arp(interface* netif);
+  void add(uint16_t proto_num, arp_for_protocol* afp);  // register (or replace) the handler for proto_num
+  void del(uint16_t proto_num);  // remove the handler for proto_num
+ private:
+  ethernet_address l2self() { return _netif->hw_address(); }  // our own hardware address
+  int process_packet(Packet p, ethernet_address from);
+  bool forward(forward_hash& out_hash_data, Packet& p, size_t off);
+  Tub<l3_protocol::l3packet> get_packet();
+  template <class l3_proto>
+  friend class arp_for;  // arp_for<L3> reaches into _packetq, _netif and l2self()
+};
+
+template <typename L3>
+class arp_for : public arp_for_protocol {  // ARP table and resolver for one L3 protocol type
+ public:
+  using l2addr = ethernet_address;
+  using l3addr = typename L3::address_type;
+ private:
+  static constexpr auto max_waiters = 512;  // wait() rejects with -EBUSY once this many callers queue on one address
+  enum oper {
+    op_request = 1,
+    op_reply = 2,
+  };
+  struct arp_hdr {  // complete header; read from and written to packets via reinterpret_cast
+    uint16_t htype;
+    uint16_t ptype;
+    uint8_t hlen;
+    uint8_t plen;
+    uint16_t oper;
+    l2addr sender_hwaddr;
+    l3addr sender_paddr;
+    l2addr target_hwaddr;
+    l3addr target_paddr;
+
+    arp_hdr ntoh() {  // copy with multi-byte fields and addresses run through their ntoh conversions
+      arp_hdr hdr = *this;
+      hdr.htype = ::ntoh(htype);
+      hdr.ptype = ::ntoh(ptype);
+      hdr.oper = ::ntoh(oper);
+      hdr.sender_hwaddr = sender_hwaddr.ntoh();
+      hdr.sender_paddr = sender_paddr.ntoh();
+      hdr.target_hwaddr = target_hwaddr.ntoh();
+      hdr.target_paddr = target_paddr.ntoh();
+      return hdr;
+    }
+
+    arp_hdr hton() {  // inverse of ntoh(): copy converted with the hton conversions
+      arp_hdr hdr = *this;
+      hdr.htype = ::hton(htype);
+      hdr.ptype = ::hton(ptype);
+      hdr.oper = ::hton(oper);
+      hdr.sender_hwaddr = sender_hwaddr.hton();
+      hdr.sender_paddr = sender_paddr.hton();
+      hdr.target_hwaddr = target_hwaddr.hton();
+      hdr.target_paddr = target_paddr.hton();
+      return hdr;
+    }
+  };
+  struct resolution {  // bookkeeping for one address that is still being resolved
+    std::vector<std::pair<resolution_cb, Packet>> _waiters;  // pending callbacks, each with the packet it was queued with
+    uint64_t timeout_fd;  // value returned by create_time_event(); passed to delete_time_event()
+  };
+  class C_handle_arp_timeout : public EventCallback {  // time-event callback: resend the query and fail current waiters
+    arp_for *arp;
+    l3addr paddr;
+    bool first_request;  // NOTE(review): stored by the constructor but never read in this class
+
+   public:
+    C_handle_arp_timeout(arp_for *a, l3addr addr, bool first):
+        arp(a), paddr(addr), first_request(first) {}
+    void do_request(uint64_t r) {  // r is not used
+      arp->send_query(paddr);  // ask again
+      auto &res = arp->_in_progress[paddr];  // operator[]: would create the entry if it were missing
+
+      for (auto& p : res._waiters) {
+        p.first(ethernet_address(), std::move(p.second), -ETIMEDOUT);  // tell every queued waiter it timed out
+      }
+      res._waiters.clear();
+      res.timeout_fd = arp->center->create_time_event(  // re-arm, reusing this same callback object
+          1*1000*1000, this);
+    }
+  };
+  friend class C_handle_arp_timeout;
+
+ private:
+  CephContext *cct;
+  EventCenter *center;  // used to create and delete the timeout events
+  l3addr _l3self = L3::broadcast_address();  // broadcast acts as "own address not set yet" (see handle_request)
+  std::unordered_map<l3addr, l2addr> _table;  // resolved L3 -> L2 mappings
+  std::unordered_map<l3addr, resolution> _in_progress;  // lookups that have been sent but not answered
+ private:
+  Packet make_query_packet(l3addr paddr);
+  virtual int received(Packet p) override;
+  int handle_request(arp_hdr* ah);
+  l2addr l2self() { return _arp.l2self(); }
+  void send(l2addr to, Packet &&p);
+ public:
+  void send_query(const l3addr& paddr);
+  explicit arp_for(CephContext *c, arp& a, EventCenter *cen)
+      : arp_for_protocol(a, L3::arp_protocol_type()), cct(c), center(cen) {
+    _table[L3::broadcast_address()] = ethernet::broadcast_address();  // L3 broadcast always maps to L2 broadcast
+  }
+  ~arp_for() {
+    for (auto && p : _in_progress)
+      center->delete_time_event(p.second.timeout_fd);  // cancel timers; NOTE(review): callback objects from wait() are not deleted here — confirm ownership
+  }
+  void wait(const l3addr& addr, Packet p, resolution_cb cb);
+  void learn(l2addr l2, l3addr l3);
+  void run();
+  void set_self_addr(l3addr addr) {  // replace our own table entry and remember the new address
+    _table.erase(_l3self);
+    _table[addr] = l2self();
+    _l3self = addr;
+  }
+  friend class arp;
+};
+
+template <typename L3>
+void arp_for<L3>::send(l2addr to, Packet &&p) {
+  _arp._packetq.push_back(l3_protocol::l3packet{eth_protocol_num::arp, to, std::move(p)});  // only queues the frame; arp::get_packet() pops it later
+}
+
+template <typename L3>
+Packet arp_for<L3>::make_query_packet(l3addr paddr) {
+  arp_hdr query;  // every field is assigned below before the header is serialised
+  query.oper = op_request;
+  query.htype = ethernet::arp_hardware_type();
+  query.hlen = sizeof(l2addr);
+  query.ptype = L3::arp_protocol_type();
+  query.plen = sizeof(l3addr);
+  query.sender_paddr = _l3self;
+  query.sender_hwaddr = l2self();
+  query.target_paddr = paddr;  // the address being asked about
+  query.target_hwaddr = ethernet::broadcast_address();
+  query = query.hton();  // convert the whole header with its hton() helper
+  return Packet(reinterpret_cast<char*>(&query), sizeof(query));
+}
+
+template <typename L3>
+void arp_for<L3>::send_query(const l3addr& paddr) {
+  send(ethernet::broadcast_address(), make_query_packet(paddr));  // build a request for paddr and queue it to the L2 broadcast address
+}
+
+template <typename L3>
+void arp_for<L3>::learn(l2addr hwaddr, l3addr paddr) {
+  _table[paddr] = hwaddr;  // record (or overwrite) the mapping before notifying anyone
+  auto pending = _in_progress.find(paddr);
+  if (pending == _in_progress.end())
+    return;  // no lookup was outstanding for this address
+  resolution& r = pending->second;
+  center->delete_time_event(r.timeout_fd);  // cancel the timeout event for this lookup
+  for (auto& waiter : r._waiters) {
+    waiter.first(hwaddr, std::move(waiter.second), 0);  // complete each waiter with status 0
+  }
+  _in_progress.erase(pending);
+}
+
+template <typename L3>
+void arp_for<L3>::wait(const l3addr& paddr, Packet p, resolution_cb cb) {
+  // Already resolved: complete the caller immediately with status 0.
+  auto known = _table.find(paddr);
+  if (known != _table.end()) {
+    cb(known->second, std::move(p), 0);
+    return;
+  }
+
+  auto pending = _in_progress.find(paddr);
+  const bool first_request = (pending == _in_progress.end());
+  resolution& res = first_request ? _in_progress[paddr] : pending->second;
+
+  if (first_request) {
+    // New lookup: arm the timeout event first, then send the query.
+    res.timeout_fd = center->create_time_event(
+        1*1000*1000, new C_handle_arp_timeout(this, paddr, first_request));
+    send_query(paddr);
+  }
+
+  if (res._waiters.size() < max_waiters) {
+    res._waiters.emplace_back(cb, std::move(p));  // park the caller until learn() or the timeout fires
+  } else {
+    cb(ethernet_address(), std::move(p), -EBUSY);  // waiter list is full: reject right away
+  }
+}
+
+template <typename L3>
+int arp_for<L3>::received(Packet p) {
+  auto raw = p.get_header<arp_hdr>();
+  if (!raw) {
+    return 0;  // no header could be obtained from the packet
+  }
+  auto h = raw->ntoh();  // converted copy; the packet's own bytes are not modified
+  const bool sizes_ok = h.hlen == sizeof(l2addr) && h.plen == sizeof(l3addr);
+  if (!sizes_ok) {
+    return 0;  // address lengths do not match this l2addr/l3addr pair
+  }
+  if (h.oper == op_request) {
+    return handle_request(&h);
+  }
+  if (h.oper == op_reply) {
+    // Pass the sender's hardware/protocol address pair to the interface.
+    _arp._netif->arp_learn(h.sender_hwaddr, h.sender_paddr);
+  }
+  return 0;  // replies and unrecognised opcodes both end here
+}
+
+template <typename L3>
+int arp_for<L3>::handle_request(arp_hdr* ah) {
+  const bool addressed_to_us = ah->target_paddr == _l3self
+      && _l3self != L3::broadcast_address();
+  if (!addressed_to_us) return 0;  // not for us, or _l3self still holds its broadcast default
+  ah->oper = op_reply;  // rewrite the request into a reply in place
+  ah->target_hwaddr = ah->sender_hwaddr;
+  ah->target_paddr = ah->sender_paddr;
+  ah->sender_hwaddr = l2self();
+  ah->sender_paddr = _l3self;
+  *ah = ah->hton();  // convert with the header's hton() helper before sending
+  send(ah->target_hwaddr, Packet(reinterpret_cast<char*>(ah), sizeof(*ah)));
+  return 0;
+}
+
+#endif /* CEPH_MSG_ARP_H_ */
diff --git a/src/msg/async/dpdk/DPDK.cc b/src/msg/async/dpdk/DPDK.cc
new file mode 100644
index 00000000..278efe9e
--- /dev/null
+++ b/src/msg/async/dpdk/DPDK.cc
@@ -0,0 +1,1267 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include <atomic>
+#include <vector>
+#include <queue>
+
+#include <rte_config.h>
+#include <rte_common.h>
+#include <rte_eal.h>
+#include <rte_pci.h>
+#include <rte_ethdev.h>
+#include <rte_cycles.h>
+#include <rte_memzone.h>
+
+#include "include/page.h"
+#include "align.h"
+#include "IP.h"
+#include "const.h"
+#include "dpdk_rte.h"
+#include "DPDK.h"
+#include "toeplitz.h"
+
+#include "common/Cycles.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdk "
+
+
+void* as_cookie(struct rte_pktmbuf_pool_private& p) {  // address of p as void*, used as the pool-init argument of rte_mempool_create()
+  return &p;
+}
+
+#ifndef MARKER
+typedef void    *MARKER[0];   /**< generic marker for a point in a structure */
+#endif
+
+/******************* Net device related constants *****************************/
+static constexpr uint16_t default_ring_size      = 512;
+
+//
+// We need 2 times the ring size of buffers because of the way PMDs
+// refill the ring.
+//
+static constexpr uint16_t mbufs_per_queue_rx     = 2 * default_ring_size;
+static constexpr uint16_t rx_gc_thresh           = 64;
+
+//
+// No need to keep more descriptors in the air than can be sent in a single
+// rte_eth_tx_burst() call.
+//
+static constexpr uint16_t mbufs_per_queue_tx     = 2 * default_ring_size;
+
+static constexpr uint16_t mbuf_cache_size        = 512;
+//
+// Size of the data buffer in the non-inline case.
+//
+// We may want to change (increase) this value in future, while the
+// inline_mbuf_data_size value is unlikely to change for the reasons described
+// below.
+//
+static constexpr size_t mbuf_data_size = 4096;
+
+static constexpr uint16_t mbuf_overhead          =
+                          sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM;
+//
+// We'll allocate 2K data buffers for an inline case because this would require
+// a single page per mbuf. If we used 4K data buffers here it would require 2
+// pages for a single buffer (due to "mbuf_overhead") and this is a much more
+// demanding memory constraint.
+//
+static constexpr size_t inline_mbuf_data_size = 2048;
+
+
+// (INLINE_MBUF_DATA_SIZE(2K)*32 = 64K = Max TSO/LRO size) + 1 mbuf for headers
+static constexpr uint8_t max_frags = 32 + 1;
+
+//
+// Intel's 40G NIC HW limit for a number of fragments in an xmit segment.
+//
+// See Chapter 8.4.1 "Transmit Packet in System Memory" of the xl710 devices
+// spec. for more details.
+//
+static constexpr uint8_t i40e_max_xmit_segment_frags = 8;
+
+//
+// VMWare's virtual NIC limit for a number of fragments in an xmit segment.
+//
+// see drivers/net/vmxnet3/base/vmxnet3_defs.h VMXNET3_MAX_TXD_PER_PKT
+//
+static constexpr uint8_t vmxnet3_max_xmit_segment_frags = 16;
+
+static constexpr uint16_t inline_mbuf_size = inline_mbuf_data_size + mbuf_overhead;
+
+static size_t huge_page_size = 512 * CEPH_PAGE_SIZE;  // alignment unit used by qp_mempool_obj_size()
+
+uint32_t qp_mempool_obj_size()
+{
+  struct rte_mempool_objsz objsz = {};
+
+  //
+  // Both terms are rounded up to the huge page size because DPDK allocates
+  // a physically contiguous memory region for each pool object.
+  //
+
+  // Rx
+  const auto rx_bytes =
+      align_up(rte_mempool_calc_obj_size(mbuf_overhead, 0, &objsz) +
+               sizeof(struct rte_pktmbuf_pool_private), huge_page_size);
+
+  // Tx (the scratch struct is zeroed again before it is reused)
+  std::memset(&objsz, 0, sizeof(objsz));
+  const auto tx_bytes =
+      align_up(rte_mempool_calc_obj_size(inline_mbuf_size, 0, &objsz) +
+               sizeof(struct rte_pktmbuf_pool_private), huge_page_size);
+
+  // The sum is narrowed to the 32-bit return type.
+  return static_cast<uint32_t>(rx_bytes + tx_bytes);
+}
+
+static constexpr const char* pktmbuf_pool_name   = "dpdk_net_pktmbuf_pool";  // prefix; init_rx_mbuf_pool() appends the queue id and "_rx"
+
+/*
+ * When doing reads from the NIC queues, use this batch size
+ */
+static constexpr uint8_t packet_read_size        = 32;
+/******************************************************************************/
+
+int DPDKDevice::init_port_start()  // reads device capabilities, derives queue/RSS/offload settings, configures the port; returns 0 or rte_eth_dev_configure()'s error
+{
+  ceph_assert(_port_idx < rte_eth_dev_count());  // the port index must name an existing device
+
+  rte_eth_dev_info_get(_port_idx, &_dev_info);
+
+  //
+  // This is a workaround for a missing handling of a HW limitation in the
+  // DPDK i40e driver. This and all related to _is_i40e_device code should be
+  // removed once this handling is added.
+  //
+  if (std::string("rte_i40evf_pmd") == _dev_info.driver_name ||
+      std::string("rte_i40e_pmd") == _dev_info.driver_name) {
+    ldout(cct, 1) << __func__ << " Device is an Intel's 40G NIC. Enabling 8 fragments hack!" << dendl;
+    _is_i40e_device = true;
+  }
+
+  if (std::string("rte_vmxnet3_pmd") == _dev_info.driver_name) {
+    ldout(cct, 1) << __func__ << " Device is a VMWare Virtual NIC. Enabling 16 fragments hack!" << dendl;
+    _is_vmxnet3_device = true;
+  }
+
+  //
+  // Another workaround: this time for a lack of number of RSS bits.
+  // ixgbe PF NICs support up to 16 RSS queues.
+  // ixgbe VF NICs support up to 4 RSS queues.
+  // i40e PF NICs support up to 64 RSS queues.
+  // i40e VF NICs support up to 16 RSS queues.
+  //
+  if (std::string("rte_ixgbe_pmd") == _dev_info.driver_name) {
+    _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)16);
+  } else if (std::string("rte_ixgbevf_pmd") == _dev_info.driver_name) {
+    _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)4);
+  } else if (std::string("rte_i40e_pmd") == _dev_info.driver_name) {
+    _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)64);
+  } else if (std::string("rte_i40evf_pmd") == _dev_info.driver_name) {
+    _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)16);
+  }
+
+  // Clear txq_flags - we want to support all available offload features
+  // except for multi-mempool and refcnt'ing which we don't need
+  _dev_info.default_txconf.txq_flags =
+      ETH_TXQ_FLAGS_NOMULTMEMP | ETH_TXQ_FLAGS_NOREFCOUNT;
+
+  //
+  // Disable features that are not supported by port's HW
+  //
+  if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
+    _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
+  }
+
+  if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
+    _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
+  }
+
+  if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
+    _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
+  }
+
+  if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
+    _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
+  }
+
+  if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {  // NOTE(review): exact repeat of the VLAN_INSERT check directly above
+    _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
+  }
+
+  if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO)) {
+    _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
+  }
+
+  /* for port configuration all features are off by default */
+  rte_eth_conf port_conf = { 0 };
+
+  ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": max_rx_queues "
+                << _dev_info.max_rx_queues << "  max_tx_queues "
+                << _dev_info.max_tx_queues << dendl;
+
+  _num_queues = std::min({_num_queues, _dev_info.max_rx_queues, _dev_info.max_tx_queues});  // clamp to what the device offers in both directions
+
+  ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": using "
+                << _num_queues << " queues" << dendl;;
+
+  // Set RSS mode: RSS is enabled only when more than one queue is in use.
+  // NOTE(review): an earlier remark here said RSS should stay available even
+  // with a single queue; the code below selects ETH_MQ_RX_NONE in that case.
+  if (_num_queues > 1) {
+    if (_dev_info.hash_key_size == 40) {
+      _rss_key = default_rsskey_40bytes;
+    } else if (_dev_info.hash_key_size == 52) {
+      _rss_key = default_rsskey_52bytes;
+    } else if (_dev_info.hash_key_size != 0) {
+      // Any other non-zero key size is unsupported: terminate via rte_exit().
+      rte_exit(EXIT_FAILURE,
+               "Port %d: We support only 40 or 52 bytes RSS hash keys, %d bytes key requested",
+               _port_idx, _dev_info.hash_key_size);
+    } else {
+      _rss_key = default_rsskey_40bytes;  // device reported size 0: fall back to the 40-byte key
+      _dev_info.hash_key_size = 40;
+    }
+
+    port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
+    port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
+    if (_dev_info.hash_key_size) {
+      port_conf.rx_adv_conf.rss_conf.rss_key = const_cast<uint8_t *>(_rss_key.data());
+      port_conf.rx_adv_conf.rss_conf.rss_key_len = _dev_info.hash_key_size;
+    }
+  } else {
+    port_conf.rxmode.mq_mode = ETH_MQ_RX_NONE;
+  }
+
+  if (_num_queues > 1) {
+    if (_dev_info.reta_size) {
+      // RETA size should be a power of 2
+      ceph_assert((_dev_info.reta_size & (_dev_info.reta_size - 1)) == 0);
+
+      // Set the RSS table to the correct size
+      _redir_table.resize(_dev_info.reta_size);
+      _rss_table_bits = std::lround(std::log2(_dev_info.reta_size));
+      ldout(cct, 5) << __func__ << " Port " << int(_port_idx)
+                    << ": RSS table size is " << _dev_info.reta_size << dendl;
+    } else {
+      // FIXME: same with sw_reta
+      _redir_table.resize(128);
+      _rss_table_bits = std::lround(std::log2(128));
+    }
+  } else {
+    _redir_table.push_back(0);  // single queue: one redirection entry pointing at queue 0
+  }
+
+  // Set Rx VLAN stripping
+  if (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
+    port_conf.rxmode.hw_vlan_strip = 1;
+  }
+
+  // Enable HW CRC stripping
+  port_conf.rxmode.hw_strip_crc = 1;
+
+#ifdef RTE_ETHDEV_HAS_LRO_SUPPORT
+  // Enable LRO
+  if (_use_lro && (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO)) {
+    ldout(cct, 1) << __func__ << " LRO is on" << dendl;
+    port_conf.rxmode.enable_lro = 1;
+    _hw_features.rx_lro = true;
+  } else
+#endif
+    ldout(cct, 1) << __func__ << " LRO is off" << dendl;  // else-branch when LRO support is compiled in, unconditional otherwise
+
+  // Check that all CSUM features are either all set all together or not set
+  // all together. If this assumption breaks we need to rework the below logic
+  // by splitting the csum offload feature bit into separate bits for IPv4,
+  // TCP.
+  ceph_assert(((_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
+          (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) ||
+         (!(_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
+          !(_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)));
+
+  // Set Rx checksum checking
+  if ((_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
+      (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
+    ldout(cct, 1) << __func__ << " RX checksum offload supported" << dendl;
+    port_conf.rxmode.hw_ip_checksum = 1;
+    _hw_features.rx_csum_offload = 1;
+  }
+
+  if ((_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
+    ldout(cct, 1) << __func__ << " TX ip checksum offload supported" << dendl;
+    _hw_features.tx_csum_ip_offload = 1;
+  }
+
+  // TSO is supported starting from DPDK v1.8
+  if (_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
+    ldout(cct, 1) << __func__ << " TSO is supported" << dendl;
+    _hw_features.tx_tso = 1;
+  }
+
+  // Check that Tx TCP CSUM features are either all set all together
+  // or not set all together. NOTE(review): with only the TCP bit tested the
+  // condition below has the form (A || !A), so this assert can never fire;
+  // confirm whether a second capability bit was meant to be checked as well.
+  ceph_assert((_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) ||
+          !(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM));
+
+  if (_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) {
+    ldout(cct, 1) << __func__ << " TX TCP checksum offload supported" << dendl;
+    _hw_features.tx_csum_l4_offload = 1;
+  }
+
+  int retval;
+
+  ldout(cct, 1) << __func__ << " Port " << int(_port_idx) << " init ... " << dendl;
+
+  /*
+   * Standard DPDK port initialisation - config port, then set up
+   * rx and tx rings.
+   */
+  if ((retval = rte_eth_dev_configure(_port_idx, _num_queues, _num_queues,
+                                      &port_conf)) != 0) {
+    lderr(cct) << __func__ << " failed to configure port " << (int)_port_idx
+               << " rx/tx queues " << _num_queues << " error " << cpp_strerror(retval) << dendl;
+    return retval;
+  }
+
+  //rte_eth_promiscuous_enable(port_num);
+  ldout(cct, 1) << __func__ << " done." << dendl;
+
+  return 0;
+}
+
+void DPDKDevice::set_hw_flow_control()  // applies _enable_fc as RTE_FC_FULL / RTE_FC_NONE; unsupported is logged, other errors abort
+{
+  // Read the port's current/default flow control settings
+  struct rte_eth_fc_conf fc_conf;
+  auto ret = rte_eth_dev_flow_ctrl_get(_port_idx, &fc_conf);
+
+  if (ret == -ENOTSUP) {
+    ldout(cct, 1) << __func__ << " port " << int(_port_idx)
+                  << ": not support to get hardware flow control settings: " << ret << dendl;
+    goto not_supported;
+  }
+
+  if (ret < 0) {
+    lderr(cct) << __func__ << " port " << int(_port_idx)
+               << ": failed to get hardware flow control settings: " << ret << dendl;
+    ceph_abort();  // any failure other than -ENOTSUP is treated as fatal
+  }
+
+  if (_enable_fc) {
+    fc_conf.mode = RTE_FC_FULL;
+  } else {
+    fc_conf.mode = RTE_FC_NONE;
+  }
+
+  ret = rte_eth_dev_flow_ctrl_set(_port_idx, &fc_conf);  // only the mode field was changed from what was read back
+  if (ret == -ENOTSUP) {
+    ldout(cct, 1) << __func__ << " port " << int(_port_idx)
+                  << ": not support to set hardware flow control settings: " << ret << dendl;
+    goto not_supported;
+  }
+
+  if (ret < 0) {
+    lderr(cct) << __func__ << " port " << int(_port_idx)
+               << ": failed to set hardware flow control settings: " << ret << dendl;
+    ceph_abort();
+  }
+
+  ldout(cct, 1) << __func__ << " port " << int(_port_idx) << ":  HW FC " << _enable_fc << dendl;
+  return;
+
+not_supported:  // shared exit for both -ENOTSUP cases: log and carry on
+  ldout(cct, 1) << __func__ << " port " << int(_port_idx) << ": changing HW FC settings is not supported" << dendl;
+}
+
+int DPDKDevice::init_port_fini()  // starts the port, programs RSS hashing when multi-queue, waits for link; 0 on success, -1 on failure
+{
+  // Changing FC requires HW reset, so set it before the port is initialized.
+  set_hw_flow_control();
+
+  if (rte_eth_dev_start(_port_idx) != 0) {
+    lderr(cct) << __func__ << " can't start port " << int(_port_idx) << dendl;  // int(): log the index as a number, as the other port logs in this file do
+    return -1;
+  }
+
+  if (_num_queues > 1) {
+    if (!rte_eth_dev_filter_supported(_port_idx, RTE_ETH_FILTER_HASH)) {  // a zero return is treated as "supported"
+      ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": HASH FILTER configuration is supported" << dendl;
+
+      // Set up the HW to use the TOEPLITZ hash function as an RSS hash function
+      struct rte_eth_hash_filter_info info = {};
+
+      info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
+      info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;
+
+      if (rte_eth_dev_filter_ctrl(_port_idx, RTE_ETH_FILTER_HASH,
+                                  RTE_ETH_FILTER_SET, &info) < 0) {
+        lderr(cct) << __func__ << " cannot set hash function on a port " << int(_port_idx) << dendl;
+        return -1;
+      }
+    }
+
+    set_rss_table();
+  }
+
+  // Wait for a link
+  if (check_port_link_status() < 0) {
+    lderr(cct) << __func__ << " port link up failed " << int(_port_idx) << dendl;
+    return -1;
+  }
+
+  ldout(cct, 5) << __func__ << " created DPDK device" << dendl;
+  return 0;
+}
+
+void DPDKQueuePair::configure_proxies(const std::map<unsigned, float>& cpu_weights) {
+  ceph_assert(!cpu_weights.empty());
+  // A queue that sends only to itself is a special case: no hash value is required, so nothing is set up.
+  const bool self_only = cpu_weights.size() == 1 && cpu_weights.begin()->first == _qid;
+  if (!self_only) {
+    register_packet_provider([this] {
+      Tub<Packet> next;  // left unassigned when the proxy queue is empty
+      if (!_proxy_packetq.empty()) {
+        next = std::move(_proxy_packetq.front());
+        _proxy_packetq.pop_front();
+      }
+      return next;
+    });
+    build_sw_reta(cpu_weights);
+  }
+}
+
+void DPDKQueuePair::build_sw_reta(const std::map<unsigned, float>& cpu_weights) {
+  float total_weight = 0;
+  for (auto&& x : cpu_weights) {
+    total_weight += x.second;
+  }
+  float accum = 0;
+  unsigned idx = 0;
+  std::array<uint8_t, 128> reta{};  // zero-filled, so no slot is left indeterminate if the loop below assigns nothing
+  for (auto&& entry : cpu_weights) {
+    accum += entry.second;
+    // Give this CPU the slots up to its cumulative share of the table. The
+    // explicit size bound keeps idx inside reta even if the weights are zero or negative.
+    while (idx < reta.size() && idx < (accum / total_weight * reta.size() - 0.5)) {
+      reta[idx++] = entry.first;
+    }
+  }
+  _sw_reta = reta;
+}
+
+
+bool DPDKQueuePair::init_rx_mbuf_pool()  // looks up or creates this queue's Rx mempool, extra data buffers and Rx queue; false on failure
+{
+  std::string name = std::string(pktmbuf_pool_name) + std::to_string(_qid) + "_rx";
+
+  // reserve the memory for Rx buffers containers
+  _rx_free_pkts.reserve(mbufs_per_queue_rx);
+  _rx_free_bufs.reserve(mbufs_per_queue_rx);
+
+  _pktmbuf_pool_rx = rte_mempool_lookup(name.c_str());
+  if (!_pktmbuf_pool_rx) {
+    ldout(cct, 1) << __func__ << " Creating Rx mbuf pool '" << name.c_str()
+                  << "' [" << mbufs_per_queue_rx << " mbufs] ..."<< dendl;
+
+    //
+    // Don't pass single-producer/single-consumer flags to mbuf create as it
+    // seems faster to use a cache instead.
+    //
+    struct rte_pktmbuf_pool_private roomsz = {};
+    roomsz.mbuf_data_room_size = mbuf_data_size + RTE_PKTMBUF_HEADROOM;
+    _pktmbuf_pool_rx = rte_mempool_create(
+        name.c_str(),
+        mbufs_per_queue_rx, mbuf_overhead + mbuf_data_size,
+        mbuf_cache_size,
+        sizeof(struct rte_pktmbuf_pool_private),
+        rte_pktmbuf_pool_init, as_cookie(roomsz),
+        rte_pktmbuf_init, nullptr,
+        rte_socket_id(), 0);
+    if (!_pktmbuf_pool_rx) {
+      lderr(cct) << __func__ << " Failed to create mempool for rx" << dendl;
+      return false;
+    }
+
+    // Reserve additional Rx data buffers on top of the mempool's own mbufs.
+    // NOTE(review): bufs_count is negative if the option is below mbufs_per_queue_rx — confirm it is validated elsewhere.
+    int bufs_count =  cct->_conf->ms_dpdk_rx_buffer_count_per_core - mbufs_per_queue_rx;
+    int mz_flags = RTE_MEMZONE_1GB|RTE_MEMZONE_SIZE_HINT_ONLY;
+    std::string mz_name = "rx_buffer_data" + std::to_string(_qid);
+    const struct rte_memzone *mz = rte_memzone_reserve_aligned(mz_name.c_str(),
+          mbuf_data_size*bufs_count, _pktmbuf_pool_rx->socket_id, mz_flags, mbuf_data_size);
+    ceph_assert(mz);
+    char* m = static_cast<char*>(mz->addr);  // byte pointer: ISO C++ does not allow arithmetic on void*
+    for (int i = 0; i < bufs_count; i++) {
+      ceph_assert(m);
+      _alloc_bufs.push_back(static_cast<void*>(m));  // one mbuf_data_size-sized slice per entry
+      m += mbuf_data_size;
+    }
+
+    if (rte_eth_rx_queue_setup(_dev_port_idx, _qid, default_ring_size,
+                               rte_eth_dev_socket_id(_dev_port_idx),
+                               _dev->def_rx_conf(), _pktmbuf_pool_rx) < 0) {
+      lderr(cct) << __func__ << " cannot initialize rx queue" << dendl;
+      return false;
+    }
+  }
+
+  return _pktmbuf_pool_rx != nullptr;
+}
+
+int DPDKDevice::check_port_link_status()  // 0 once the link reports up, -1 if it is still down after the retry budget
+{
+  int count = 0;
+
+  ldout(cct, 20) << __func__ << dendl;
+  const int sleep_time = 100 * 1000;
+  const int max_check_time = 90;  /* 9s (90 * 100ms) in total */
+  // Poll with the non-waiting link query, sleeping sleep_time between polls,
+  // and give up once max_check_time sleeps have gone by without a link.
+  while (true) {
+    struct rte_eth_link link;
+    memset(&link, 0, sizeof(link));
+    rte_eth_link_get_nowait(_port_idx, &link);
+
+    if (link.link_status) {
+      ldout(cct, 5) << __func__ << " done port "
+                    << static_cast<unsigned>(_port_idx)
+                    << " link Up - speed " << link.link_speed
+                    << " Mbps - "
+                    << ((link.link_duplex == ETH_LINK_FULL_DUPLEX) ? ("full-duplex") : ("half-duplex"))
+                    << dendl;
+      break;
+    } else if (count++ < max_check_time) {
+      ldout(cct, 20) << __func__ << " not ready, continue to wait." << dendl;
+      usleep(sleep_time);
+    } else {
+      lderr(cct) << __func__ << " done port " << static_cast<unsigned>(_port_idx) << " link down" << dendl;
+      return -1;
+    }
+  }
+  return 0;
+}
+
+class C_handle_dev_stats : public EventCallback {  // time-event callback that forwards to DPDKQueuePair::handle_stats()
+  DPDKQueuePair *_qp;  // queue pair whose handle_stats() is invoked
+ public:
+  C_handle_dev_stats(DPDKQueuePair *qp): _qp(qp) { }
+  void do_request(uint64_t id) {  // id is not used
+    _qp->handle_stats();
+  }
+};
+
+DPDKQueuePair::DPDKQueuePair(CephContext *c, EventCenter *cen, DPDKDevice* dev, uint8_t qid)  // sets up the Rx pool, registers per-queue perf counters, and starts the stats timer on queue 0
+  : cct(c), _dev(dev), _dev_port_idx(dev->port_idx()), center(cen), _qid(qid),
+    _tx_poller(this), _rx_gc_poller(this), _tx_buf_factory(c, dev, qid),
+    _tx_gc_poller(this)
+{
+  if (!init_rx_mbuf_pool()) {
+    lderr(cct) << __func__ << " cannot initialize mbuf pools" << dendl;
+    ceph_abort();  // a queue pair without its Rx pool is treated as fatal
+  }
+
+  static_assert(offsetof(tx_buf, private_end) -
+                offsetof(tx_buf, private_start) <= RTE_PKTMBUF_HEADROOM,
+                "RTE_PKTMBUF_HEADROOM is less than DPDKQueuePair::tx_buf size! "
+                "Increase the headroom size in the DPDK configuration");
+  static_assert(offsetof(tx_buf, _mbuf) == 0,
+                "There is a pad at the beginning of the tx_buf before _mbuf "
+                "field!");
+  static_assert((inline_mbuf_data_size & (inline_mbuf_data_size - 1)) == 0,
+                "inline_mbuf_data_size has to be a power of two!");
+
+  std::string name(std::string("queue") + std::to_string(qid));  // perf-counter set name: "queue" followed by the queue id
+  PerfCountersBuilder plb(cct, name, l_dpdk_qp_first, l_dpdk_qp_last);
+
+  plb.add_u64_counter(l_dpdk_qp_rx_packets, "dpdk_receive_packets", "DPDK received packets");
+  plb.add_u64_counter(l_dpdk_qp_tx_packets, "dpdk_send_packets", "DPDK sendd packets");
+  plb.add_u64_counter(l_dpdk_qp_rx_bad_checksum_errors, "dpdk_receive_bad_checksum_errors", "DPDK received bad checksum packets");
+  plb.add_u64_counter(l_dpdk_qp_rx_no_memory_errors, "dpdk_receive_no_memory_errors", "DPDK received no memory packets");
+  plb.add_u64_counter(l_dpdk_qp_rx_bytes, "dpdk_receive_bytes", "DPDK received bytes", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_dpdk_qp_tx_bytes, "dpdk_send_bytes", "DPDK sendd bytes", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_dpdk_qp_rx_last_bunch, "dpdk_receive_last_bunch", "DPDK last received bunch");
+  plb.add_u64_counter(l_dpdk_qp_tx_last_bunch, "dpdk_send_last_bunch", "DPDK last send bunch");
+  plb.add_u64_counter(l_dpdk_qp_rx_fragments, "dpdk_receive_fragments", "DPDK received total fragments");
+  plb.add_u64_counter(l_dpdk_qp_tx_fragments, "dpdk_send_fragments", "DPDK sendd total fragments");
+  plb.add_u64_counter(l_dpdk_qp_rx_copy_ops, "dpdk_receive_copy_ops", "DPDK received copy operations");
+  plb.add_u64_counter(l_dpdk_qp_tx_copy_ops, "dpdk_send_copy_ops", "DPDK sendd copy operations");
+  plb.add_u64_counter(l_dpdk_qp_rx_copy_bytes, "dpdk_receive_copy_bytes", "DPDK received copy bytes", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_dpdk_qp_tx_copy_bytes, "dpdk_send_copy_bytes", "DPDK send copy bytes", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_dpdk_qp_rx_linearize_ops, "dpdk_receive_linearize_ops", "DPDK received linearize operations");
+  plb.add_u64_counter(l_dpdk_qp_tx_linearize_ops, "dpdk_send_linearize_ops", "DPDK send linearize operations");
+  plb.add_u64_counter(l_dpdk_qp_tx_queue_length, "dpdk_send_queue_length", "DPDK send queue length");
+
+  perf_logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perf_logger);
+
+  if (!_qid)  // only queue 0 schedules the device-statistics timer
+    device_stat_time_fd = center->create_time_event(1000*1000, new C_handle_dev_stats(this));
+}
+
+/**
+ * Timer callback: reads the port statistics from DPDK, publishes them
+ * through the device perf counters and schedules the next run.
+ *
+ * If the statistics cannot be read the error is logged and the function
+ * returns without scheduling another time event.
+ */
+void DPDKQueuePair::handle_stats()
+{
+  ldout(cct, 20) << __func__ << " started." << dendl;
+
+  rte_eth_stats port_stats = {};
+  const int err = rte_eth_stats_get(_dev_port_idx, &port_stats);
+  if (err != 0) {
+    ldout(cct, 0) << __func__ << " failed to get port statistics: " << cpp_strerror(err) << dendl;
+    return;
+  }
+
+  auto& dev_counters = _dev->perf_logger;
+
+#if RTE_VERSION < RTE_VERSION_NUM(16,7,0,0)
+  // These two fields are only read for DPDK versions older than 16.07.
+  dev_counters->set(l_dpdk_dev_rx_mcast, port_stats.imcasts);
+  dev_counters->set(l_dpdk_dev_rx_badcrc_errors, port_stats.ibadcrc);
+#endif
+  dev_counters->set(l_dpdk_dev_rx_dropped_errors, port_stats.imissed);
+  dev_counters->set(l_dpdk_dev_rx_nombuf_errors, port_stats.rx_nombuf);
+  dev_counters->set(l_dpdk_dev_rx_total_errors, port_stats.ierrors);
+  dev_counters->set(l_dpdk_dev_tx_total_errors, port_stats.oerrors);
+
+  // Re-arm the timer for the next statistics collection.
+  device_stat_time_fd = center->create_time_event(1000*1000, new C_handle_dev_stats(this));
+}
+
+/**
+ * Pulls packets from the registered packet providers into the Tx queue
+ * (only when fewer than 16 packets are queued) and sends whatever is queued.
+ *
+ * When ms_dpdk_debug_allow_loopback is set, packets whose destination MAC
+ * equals their source MAC are fed straight back into l2receive() instead of
+ * being queued for transmission.
+ *
+ * @return TRUE if the Tx queue was non-empty and send() has been called
+ */
+bool DPDKQueuePair::poll_tx() {
+  const bool nonloopback = !cct->_conf->ms_dpdk_debug_allow_loopback;
+#ifdef CEPH_PERF_DEV
+  uint64_t start = Cycles::rdtsc();
+#endif
+  uint32_t total_work = 0;
+
+  if (_tx_packetq.size() < 16) {
+    // Refill the send queue from the upper layers: keep asking every
+    // provider until a full round yields nothing, 256 packets have been
+    // pulled, or the queue holds 128 packets.
+    uint32_t round_work;
+    do {
+      round_work = 0;
+      for (auto&& provider : _pkt_providers) {
+        auto pkt = provider();
+        if (!pkt) {
+          continue;
+        }
+        ++round_work;
+
+        if (likely(nonloopback)) {
+          _tx_packetq.push_back(std::move(*pkt));
+        } else {
+          auto eh = pkt->get_header<eth_hdr>(0);
+          if (eh->dst_mac == eh->src_mac) {
+            // Debug loopback: deliver locally instead of transmitting.
+            _dev->l2receive(_qid, std::move(*pkt));
+          } else {
+            _tx_packetq.push_back(std::move(*pkt));
+          }
+        }
+
+        if (_tx_packetq.size() == 128) {
+          break;
+        }
+      }
+      total_work += round_work;
+    } while (round_work && total_work < 256 && _tx_packetq.size() < 128);
+  }
+
+  if (_tx_packetq.empty()) {
+    return false;
+  }
+
+  uint64_t sent = send(_tx_packetq);
+  perf_logger->inc(l_dpdk_qp_tx_packets, sent);
+  perf_logger->set(l_dpdk_qp_tx_last_bunch, sent);
+#ifdef CEPH_PERF_DEV
+  tx_count += total_work;
+  tx_cycles += Cycles::rdtsc() - start;
+#endif
+  return true;
+}
+
+/**
+ * Builds a multi-fragment Packet out of a chain of rte_mbufs.
+ *
+ * Every segment of the chain becomes one fragment. The data pointers of all
+ * segments are handed to the packet deleter, which appends them to
+ * _alloc_bufs when the packet is destroyed.
+ *
+ * @param m head of the rte_mbuf chain
+ *
+ * @return the resulting Packet
+ */
+inline Tub<Packet> DPDKQueuePair::from_mbuf_lro(rte_mbuf* m)
+{
+  _frags.clear();
+  _bufs.clear();
+
+  while (m != nullptr) {
+    char* seg_data = rte_pktmbuf_mtod(m, char*);
+
+    _frags.emplace_back(fragment{seg_data, rte_pktmbuf_data_len(m)});
+    _bufs.push_back(seg_data);
+    m = m->next;
+  }
+
+  // The deleter owns the list of data buffers (moved out of _bufs).
+  auto give_back = [this, bufs = std::move(_bufs)] {
+    for (char* b : bufs) {
+      _alloc_bufs.push_back(b);
+    }
+  };
+  return Packet(
+      _frags.begin(), _frags.end(), make_deleter(std::move(give_back)));
+}
+
+/**
+ * Wraps a received rte_mbuf into a Packet.
+ *
+ * The mbuf is queued on _rx_free_pkts and its segment count is added to
+ * _num_rx_free_segs. A chained mbuf on a device with rx_lro enabled is
+ * translated by from_mbuf_lro(); everything else becomes a single-fragment
+ * Packet whose deleter appends the data pointer to _alloc_bufs.
+ *
+ * @param m received rte_mbuf
+ *
+ * @return the resulting Packet
+ */
+inline Tub<Packet> DPDKQueuePair::from_mbuf(rte_mbuf* m)
+{
+  _rx_free_pkts.push_back(m);
+  _num_rx_free_segs += m->nb_segs;
+
+  const bool chained_lro =
+      _dev->hw_features_ref().rx_lro && !rte_pktmbuf_is_contiguous(m);
+  if (chained_lro) {
+    return from_mbuf_lro(m);
+  }
+
+  char* payload = rte_pktmbuf_mtod(m, char*);
+  return Packet(fragment{payload, rte_pktmbuf_data_len(m)},
+                make_deleter([this, payload] { _alloc_bufs.push_back(payload); }));
+}
+
+/**
+ * Refills every segment of an mbuf cluster with a new data buffer.
+ *
+ * Successfully refilled segments are appended to _rx_free_bufs. On the first
+ * failure the remaining part of the cluster (starting at the failing
+ * segment) is pushed back to _rx_free_pkts for a later retry.
+ *
+ * @param head head of the cluster to refill
+ *
+ * @return TRUE if all segments have been refilled
+ */
+inline bool DPDKQueuePair::refill_one_cluster(rte_mbuf* head)
+{
+  rte_mbuf* seg = head;
+
+  while (seg != nullptr) {
+    if (!refill_rx_mbuf(seg, mbuf_data_size, _alloc_bufs)) {
+      // Allocation failed: keep the rest of the cluster for a later retry.
+      _rx_free_pkts.push_back(seg);
+      return false;
+    }
+    _rx_free_bufs.push_back(seg);
+    seg = seg->next;
+  }
+
+  return true;
+}
+
+/**
+ * Rx garbage collection: refills the mbufs of already consumed packets and
+ * returns them to the Rx mempool.
+ *
+ * Runs only when the number of free segments reached rx_gc_thresh or when
+ * forced.
+ *
+ * @param force run the collection regardless of the threshold
+ *
+ * @return TRUE if the number of free segments is still at or above the
+ *         threshold
+ */
+bool DPDKQueuePair::rx_gc(bool force)
+{
+  if (!force && _num_rx_free_segs < rx_gc_thresh) {
+    // Nothing to do yet; below the threshold by definition.
+    return false;
+  }
+
+  ldout(cct, 10) << __func__ << " free segs " << _num_rx_free_segs
+                 << " thresh " << rx_gc_thresh
+                 << " free pkts " << _rx_free_pkts.size()
+                 << dendl;
+
+  // Consume from the back: back() + pop_back() leaves the container empty
+  // without a separate (linear) clear() call.
+  while (!_rx_free_pkts.empty()) {
+    auto cluster = _rx_free_pkts.back();
+    _rx_free_pkts.pop_back();
+
+    if (!refill_one_cluster(cluster)) {
+      ldout(cct, 1) << __func__ << " get new mbuf failed " << dendl;
+      break;
+    }
+  }
+
+  for (auto&& seg : _rx_free_bufs) {
+    rte_pktmbuf_prefree_seg(seg);
+  }
+
+  if (_rx_free_bufs.size()) {
+    rte_mempool_put_bulk(_pktmbuf_pool_rx,
+                         (void **)_rx_free_bufs.data(),
+                         _rx_free_bufs.size());
+
+    // TODO: ceph_assert() in a fast path! Remove me ASAP!
+    ceph_assert(_num_rx_free_segs >= _rx_free_bufs.size());
+
+    _num_rx_free_segs -= _rx_free_bufs.size();
+    _rx_free_bufs.clear();
+
+    // TODO: ceph_assert() in a fast path! Remove me ASAP!
+    ceph_assert((_rx_free_pkts.empty() && !_num_rx_free_segs) ||
+           (!_rx_free_pkts.empty() && _num_rx_free_segs));
+  }
+
+  return _num_rx_free_segs >= rx_gc_thresh;
+}
+
+
+/**
+ * Translates a burst of received rte_mbufs into Packets and passes them to
+ * the device's l2receive().
+ *
+ * Packets flagged with a bad IP/L4 checksum are dropped when Rx checksum
+ * offload is enabled. The Rx perf counters are updated at the end; the
+ * packets counter is incremented by the whole burst size.
+ *
+ * @param bufs  array of received mbufs
+ * @param count number of entries in bufs
+ */
+void DPDKQueuePair::process_packets(
+    struct rte_mbuf **bufs, uint16_t count)
+{
+  uint64_t frag_total = 0;
+  uint64_t byte_total = 0;
+
+  for (struct rte_mbuf **cur = bufs, **end = bufs + count; cur != end; ++cur) {
+    struct rte_mbuf *m = *cur;
+    offload_info oi;
+
+    Tub<Packet> pkt = from_mbuf(m);
+
+    // Drop the packet if the translation above has failed
+    if (!pkt) {
+      perf_logger->inc(l_dpdk_qp_rx_no_memory_errors);
+      continue;
+    }
+
+    frag_total += m->nb_segs;
+    byte_total += m->pkt_len;
+
+    // Set the stripped VLAN value if available
+    const bool vlan_stripped =
+        (_dev->_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) &&
+        (m->ol_flags & PKT_RX_VLAN_STRIPPED);
+    if (vlan_stripped) {
+      oi.vlan_tci = m->vlan_tci;
+    }
+
+    //
+    // With rx_csum_offload on, the receive code for ip, tcp and udp
+    // assumes the checksum has been verified here, so bad packets are
+    // simply dropped.
+    //
+    if (_dev->get_hw_features().rx_csum_offload &&
+        (m->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD))) {
+      perf_logger->inc(l_dpdk_qp_rx_bad_checksum_errors);
+      continue;
+    }
+
+    pkt->set_offload_info(oi);
+    if (m->ol_flags & PKT_RX_RSS_HASH) {
+      pkt->set_rss_hash(m->hash.rss);
+    }
+
+    _dev->l2receive(_qid, std::move(*pkt));
+  }
+
+  perf_logger->inc(l_dpdk_qp_rx_packets, count);
+  perf_logger->set(l_dpdk_qp_rx_last_bunch, count);
+  perf_logger->inc(l_dpdk_qp_rx_fragments, frag_total);
+  perf_logger->inc(l_dpdk_qp_rx_bytes, byte_total);
+}
+
+/**
+ * Polls the Rx queue once: reads up to packet_read_size mbufs from the port
+ * and hands them to process_packets().
+ *
+ * With CEPH_PERF_DEV defined, the cycles spent on receiving are accumulated
+ * in rx_cycles. On an idle poll, once more than 10000 packets have been
+ * counted and tx_count is non-zero, the per-packet averages are logged and
+ * all four profiling counters are reset.
+ *
+ * @return TRUE if at least one packet has been received
+ */
+bool DPDKQueuePair::poll_rx_once()
+{
+  struct rte_mbuf *buf[packet_read_size];
+
+  /* read a port */
+#ifdef CEPH_PERF_DEV
+  uint64_t start = Cycles::rdtsc();
+#endif
+  uint16_t count = rte_eth_rx_burst(_dev_port_idx, _qid,
+                                       buf, packet_read_size);
+
+  /* Now process the NIC packets read */
+  if (likely(count > 0)) {
+    process_packets(buf, count);
+#ifdef CEPH_PERF_DEV
+    //
+    // Accumulate the cycles: the average printed below divides rx_cycles
+    // by the accumulated rx_count, exactly like tx_cycles/tx_count.
+    //
+    rx_cycles += Cycles::rdtsc() - start;
+    rx_count += count;
+#endif
+  }
+#ifdef CEPH_PERF_DEV
+  else {
+    if (rx_count > 10000 && tx_count) {
+      ldout(cct, 0) << __func__ << " rx count=" << rx_count << " avg rx=" << Cycles::to_nanoseconds(rx_cycles)/rx_count << "ns "
+                    << " tx count=" << tx_count << " avg tx=" << Cycles::to_nanoseconds(tx_cycles)/tx_count << "ns"
+                    << dendl;
+      rx_count = rx_cycles = tx_count = tx_cycles = 0;
+    }
+  }
+#endif
+
+  return count;
+}
+
+/**
+ * Looks up the per-queue Tx mbuf pool and, if it does not exist yet, creates
+ * it and sets up the Tx queue of the port. Afterwards the factory is filled
+ * with the buffers of the pool.
+ *
+ * Aborts if the pool cannot be created or the Tx queue cannot be set up.
+ *
+ * @param c   Ceph context used for logging
+ * @param dev device the queue belongs to
+ * @param qid queue index
+ */
+DPDKQueuePair::tx_buf_factory::tx_buf_factory(CephContext *c,
+        DPDKDevice *dev, uint8_t qid): cct(c)
+{
+  std::string name(pktmbuf_pool_name);
+  name += std::to_string(qid);
+  name += "_tx";
+
+  _pool = rte_mempool_lookup(name.c_str());
+  const bool need_setup = (_pool == nullptr);
+
+  if (need_setup) {
+    ldout(cct, 0) << __func__ << " Creating Tx mbuf pool '" << name.c_str()
+                  << "' [" << mbufs_per_queue_tx << " mbufs] ..." << dendl;
+    //
+    // The buffers are pushed from the mempool into the circular_buffer
+    // and polled from there anyway, so a non-atomic mempool is preferred
+    // in this case.
+    //
+    _pool = rte_mempool_create(name.c_str(),
+                               mbufs_per_queue_tx, inline_mbuf_size,
+                               mbuf_cache_size,
+                               sizeof(struct rte_pktmbuf_pool_private),
+                               rte_pktmbuf_pool_init, nullptr,
+                               rte_pktmbuf_init, nullptr,
+                               rte_socket_id(), 0);
+    if (_pool == nullptr) {
+      lderr(cct) << __func__ << " Failed to create mempool for Tx" << dendl;
+      ceph_abort();
+    }
+
+    const int rc = rte_eth_tx_queue_setup(dev->port_idx(), qid, default_ring_size,
+                                          rte_eth_dev_socket_id(dev->port_idx()),
+                                          dev->def_tx_conf());
+    if (rc < 0) {
+      lderr(cct) << __func__ << " cannot initialize tx queue" << dendl;
+      ceph_abort();
+    }
+  }
+
+  // Fill the factory with the buffers from the mempool obtained above.
+  init_factory();
+}
+
+//
+// Decides whether the packet behind the given mbuf cluster has to be
+// linearized before it may be sent on an i40e device.
+//
+// Non-TSO: TRUE when the cluster has more than i40e_max_xmit_segment_frags
+// segments. TSO: TRUE when some MSS-sized window of the payload would span
+// more fragments than are left after accounting for the header fragments.
+//
+bool DPDKQueuePair::tx_buf::i40e_should_linearize(rte_mbuf *head)
+{
+  bool is_tso = head->ol_flags & PKT_TX_TCP_SEG;
+
+  // For a non-TSO case: number of fragments should not exceed 8
+  if (!is_tso){
+    return head->nb_segs > i40e_max_xmit_segment_frags;
+  }
+
+  //
+  // For a TSO case each MSS window should not include more than 8
+  // fragments including headers.
+  //
+
+  // Calculate the number of frags containing headers.
+  //
+  // Note: we support neither VLAN nor tunneling thus headers size
+  // accounting is super simple.
+  //
+  size_t headers_size = head->l2_len + head->l3_len + head->l4_len;
+  unsigned hdr_frags = 0;
+  size_t cur_payload_len = 0;
+  rte_mbuf *cur_seg = head;
+
+  // Walk the chain until the accumulated length covers all the headers.
+  while (cur_seg && cur_payload_len < headers_size) {
+    cur_payload_len += cur_seg->data_len;
+    cur_seg = cur_seg->next;
+    hdr_frags++;
+  }
+
+  //
+  // Header fragments will be used for each TSO segment, thus the
+  // maximum number of data segments will be 8 minus the number of
+  // header fragments.
+  //
+  // It's unclear from the spec how the first TSO segment is treated
+  // if the last fragment with headers contains some data bytes:
+  // whether this fragment will be accounted as a single fragment or
+  // as two separate fragments. We prefer to play it safe and assume
+  // that this fragment will be accounted as two separate fragments.
+  //
+  // NOTE(review): the subtraction is stored in a size_t; presumably
+  // hdr_frags never exceeds i40e_max_xmit_segment_frags - confirm.
+  //
+  size_t max_win_size = i40e_max_xmit_segment_frags - hdr_frags;
+
+  // The whole cluster fits into a single window - nothing to check.
+  if (head->nb_segs <= max_win_size) {
+    return false;
+  }
+
+  // Get the data (without headers) part of the first data fragment
+  size_t prev_frag_data = cur_payload_len - headers_size;
+  auto mss = head->tso_segsz;
+
+  // Slide over the payload one MSS window at a time.
+  while (cur_seg) {
+    unsigned frags_in_seg = 0;
+    size_t cur_seg_size = 0;
+
+    // Data left over from the previous window counts as one fragment.
+    if (prev_frag_data) {
+      cur_seg_size = prev_frag_data;
+      frags_in_seg++;
+      prev_frag_data = 0;
+    }
+
+    while (cur_seg_size < mss && cur_seg) {
+      cur_seg_size += cur_seg->data_len;
+      cur_seg = cur_seg->next;
+      frags_in_seg++;
+
+      if (frags_in_seg > max_win_size) {
+        return true;
+      }
+    }
+
+    // Carry the bytes beyond the MSS boundary over to the next window.
+    if (cur_seg_size > mss) {
+      prev_frag_data = cur_seg_size - mss;
+    }
+  }
+
+  return false;
+}
+
+//
+// Copies the offload requirements of the packet (IP checksum, TCP checksum,
+// TSO) into the head mbuf of the cluster. The TCP related flags are only set
+// when the port reports tx_csum_l4_offload and the packet protocol is TCP.
+//
+void DPDKQueuePair::tx_buf::set_cluster_offload_info(const Packet& p, const DPDKQueuePair& qp, rte_mbuf* head)
+{
+  auto oi = p.offload_info();
+
+  // IP checksum offload
+  if (oi.needs_ip_csum) {
+    head->ol_flags |= PKT_TX_IP_CKSUM;
+    // TODO: Take a VLAN header into an account here
+    head->l2_len = sizeof(struct ether_hdr);
+    head->l3_len = oi.ip_hdr_len;
+  }
+
+  const bool tcp_l4_offload =
+      qp.port().get_hw_features().tx_csum_l4_offload &&
+      oi.protocol == ip_protocol_num::tcp;
+  if (!tcp_l4_offload) {
+    return;
+  }
+
+  // TCP checksum offload
+  head->ol_flags |= PKT_TX_TCP_CKSUM;
+  // TODO: Take a VLAN header into an account here
+  head->l2_len = sizeof(struct ether_hdr);
+  head->l3_len = oi.ip_hdr_len;
+
+  // TCP segmentation offload
+  if (oi.tso_seg_size) {
+    ceph_assert(oi.needs_ip_csum);
+    head->ol_flags |= PKT_TX_TCP_SEG;
+    head->l4_len = oi.tcp_hdr_len;
+    head->tso_segsz = oi.tso_seg_size;
+  }
+}
+
+//
+// Builds an rte_mbuf cluster for the given packet, one sub-chain per packet
+// fragment, and returns its head as a tx_buf. The packet itself is moved into
+// the tx_buf of the last segment. Returns nullptr if a tx buffer cannot be
+// obtained; whatever has been built of the cluster so far is recycled.
+//
+DPDKQueuePair::tx_buf* DPDKQueuePair::tx_buf::from_packet_zc(
+        CephContext *cct, Packet&& p, DPDKQueuePair& qp)
+{
+  // Too fragmented - linearize
+  if (p.nr_frags() > max_frags) {
+    p.linearize();
+    qp.perf_logger->inc(l_dpdk_qp_tx_linearize_ops);
+  }
+
+  // Re-entry point: the cluster is rebuilt from scratch after a linearize.
+ build_mbuf_cluster:
+  rte_mbuf *head = nullptr, *last_seg = nullptr;
+  unsigned nsegs = 0;
+
+  //
+  // Create a HEAD of the fragmented packet: check if frag0 has to be
+  // copied and if yes - send it in a copy way
+  //
+  if (!check_frag0(p)) {
+    if (!copy_one_frag(qp, p.frag(0), head, last_seg, nsegs)) {
+      ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(0).size << dendl;
+      return nullptr;
+    }
+  } else if (!translate_one_frag(qp, p.frag(0), head, last_seg, nsegs)) {
+    ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(0).size << dendl;
+    return nullptr;
+  }
+
+  unsigned total_nsegs = nsegs;
+
+  // Translate the remaining fragments and append each sub-chain.
+  for (unsigned i = 1; i < p.nr_frags(); i++) {
+    rte_mbuf *h = nullptr, *new_last_seg = nullptr;
+    if (!translate_one_frag(qp, p.frag(i), h, new_last_seg, nsegs)) {
+      ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(i).size << dendl;
+      // Give back everything chained to the head so far.
+      me(head)->recycle();
+      return nullptr;
+    }
+
+    total_nsegs += nsegs;
+
+    // Attach a new buffers' chain to the packet chain
+    last_seg->next = h;
+    last_seg = new_last_seg;
+  }
+
+  // Update the HEAD buffer with the packet info
+  head->pkt_len = p.len();
+  head->nb_segs = total_nsegs;
+
+  set_cluster_offload_info(p, qp, head);
+
+  //
+  // If a packet hasn't been linearized already and the resulting
+  // cluster requires the linearisation due to HW limitation:
+  //
+  //    - Recycle the cluster.
+  //    - Linearize the packet.
+  //    - Build the cluster once again
+  //
+  // NOTE(review): the first condition (nb_segs > max_frags) is not guarded
+  // by a "not linearized yet" check; presumably a linearized packet never
+  // needs more than max_frags segments - confirm, otherwise this loops.
+  //
+  if (head->nb_segs > max_frags ||
+      (p.nr_frags() > 1 && qp.port().is_i40e_device() && i40e_should_linearize(head)) ||
+      (p.nr_frags() > vmxnet3_max_xmit_segment_frags && qp.port().is_vmxnet3_device())) {
+    me(head)->recycle();
+    p.linearize();
+    qp.perf_logger->inc(l_dpdk_qp_tx_linearize_ops);
+
+    goto build_mbuf_cluster;
+  }
+
+  // The last segment keeps the packet alive until the buffer is reset.
+  me(last_seg)->set_packet(std::move(p));
+
+  return me(head);
+}
+
+//
+// Copies all fragments of the packet into the inline data areas of the given
+// mbuf chain. Every segment except the last is filled with exactly
+// inline_mbuf_data_size bytes; data_len of each used segment is set here.
+// The chain must be long enough for the whole packet (asserted below).
+//
+void DPDKQueuePair::tx_buf::copy_packet_to_cluster(const Packet& p, rte_mbuf* head)
+{
+  // Destination cursor: current segment and write offset inside it.
+  rte_mbuf* cur_seg = head;
+  size_t cur_seg_offset = 0;
+  // Source cursor: current fragment and read offset inside it.
+  unsigned cur_frag_idx = 0;
+  size_t cur_frag_offset = 0;
+
+  while (true) {
+    // Copy as much as fits: the rest of the fragment or of the segment.
+    size_t to_copy = std::min(p.frag(cur_frag_idx).size - cur_frag_offset,
+                              inline_mbuf_data_size - cur_seg_offset);
+
+    memcpy(rte_pktmbuf_mtod_offset(cur_seg, void*, cur_seg_offset),
+           p.frag(cur_frag_idx).base + cur_frag_offset, to_copy);
+
+    cur_frag_offset += to_copy;
+    cur_seg_offset += to_copy;
+
+    // Fragment exhausted: advance to the next one or finish.
+    if (cur_frag_offset >= p.frag(cur_frag_idx).size) {
+      ++cur_frag_idx;
+      if (cur_frag_idx >= p.nr_frags()) {
+        //
+        // We are done - set the data size of the last segment
+        // of the cluster.
+        //
+        cur_seg->data_len = cur_seg_offset;
+        break;
+      }
+
+      cur_frag_offset = 0;
+    }
+
+    // Segment full: close it and move on to the next one in the chain.
+    if (cur_seg_offset >= inline_mbuf_data_size) {
+      cur_seg->data_len = inline_mbuf_data_size;
+      cur_seg = cur_seg->next;
+      cur_seg_offset = 0;
+
+      // FIXME: assert in a fast-path - remove!!!
+      ceph_assert(cur_seg);
+    }
+  }
+}
+
+//
+// Builds an mbuf cluster holding a copy of the packet data.
+//
+// The whole cluster is allocated first; the data is copied only once all the
+// needed buffers have been obtained. Returns nullptr for an empty packet or
+// when a tx buffer cannot be obtained (a partially built cluster is
+// recycled).
+//
+DPDKQueuePair::tx_buf* DPDKQueuePair::tx_buf::from_packet_copy(Packet&& p, DPDKQueuePair& qp)
+{
+  // sanity
+  if (!p.len()) {
+    return nullptr;
+  }
+
+  //
+  // The inline data size is a power of two: round the packet length up to
+  // a whole number of inline buffers to get the segment count.
+  //
+  auto aligned_len = align_up((size_t)p.len(), inline_mbuf_data_size);
+  unsigned nsegs = aligned_len / inline_mbuf_data_size;
+
+  tx_buf* first = qp.get_tx_buf();
+  if (!first) {
+    return nullptr;
+  }
+
+  rte_mbuf* head = first->rte_mbuf_p();
+  rte_mbuf* tail = head;
+
+  // Chain the remaining (nsegs - 1) buffers behind the head.
+  for (unsigned i = 1; i < nsegs; i++) {
+    tx_buf* more = qp.get_tx_buf();
+    if (!more) {
+      me(head)->recycle();
+      return nullptr;
+    }
+
+    tail->next = more->rte_mbuf_p();
+    tail = tail->next;
+  }
+
+  //
+  // Allocation succeeded: fill in the head buffer info, copy the data
+  // and set the offload flags.
+  //
+  head->pkt_len = p.len();
+  head->nb_segs = nsegs;
+
+  copy_packet_to_cluster(p, head);
+  set_cluster_offload_info(p, qp, head);
+
+  return me(head);
+}
+
+//
+// Obtains one tx buffer and copies up to inline_mbuf_data_size bytes of the
+// given data into it. The Tx copy perf counters are updated accordingly.
+//
+// Returns the number of bytes copied, or 0 if no tx buffer is available
+// (in which case m is left untouched).
+//
+size_t DPDKQueuePair::tx_buf::copy_one_data_buf(
+    DPDKQueuePair& qp, rte_mbuf*& m, char* data, size_t buf_len)
+{
+  tx_buf* const txb = qp.get_tx_buf();
+  if (txb == nullptr) {
+    return 0;
+  }
+
+  m = txb->rte_mbuf_p();
+
+  const size_t chunk = std::min(buf_len, inline_mbuf_data_size);
+  memcpy(rte_pktmbuf_mtod(m, void*), data, chunk);
+
+  // mbuf_put()
+  m->data_len = chunk;
+  m->pkt_len  = chunk;
+
+  qp.perf_logger->inc(l_dpdk_qp_tx_copy_ops);
+  qp.perf_logger->inc(l_dpdk_qp_tx_copy_bytes, chunk);
+
+  return chunk;
+}
+
+//
+// Fills the local redirection table with queue indices in round-robin order
+// and, if the device reports a non-zero reta_size, programs the same pattern
+// into the HW RSS indirection table. Exits the process if the HW update
+// fails.
+//
+void DPDKDevice::set_rss_table()
+{
+  // The local indirection table is always filled.
+  unsigned next = 0;
+  for (auto& slot : _redir_table) {
+    slot = next++ % _num_queues;
+  }
+
+  if (_dev_info.reta_size == 0) {
+    return;
+  }
+
+  int reta_conf_size = std::max(1, _dev_info.reta_size / RTE_RETA_GROUP_SIZE);
+  rte_eth_rss_reta_entry64 reta_conf[reta_conf_size];
+
+  // Configure the HW indirection table: all entries of each group enabled.
+  next = 0;
+  for (auto& group : reta_conf) {
+    group.mask = ~0ULL;
+    for (auto& entry : group.reta) {
+      entry = next++ % _num_queues;
+    }
+  }
+
+  const int rc = rte_eth_dev_rss_reta_update(_port_idx, reta_conf, _dev_info.reta_size);
+  if (rc) {
+    rte_exit(EXIT_FAILURE, "Port %d: Failed to update an RSS indirection table", _port_idx);
+  }
+}
+
+/******************************** Interface functions *************************/
+
+/**
+ * Creates the DPDK network device object for the given port.
+ *
+ * Exits the process if DPDK reports no Ethernet ports.
+ *
+ * @param cct       Ceph context
+ * @param cores     forwarded to the DPDKDevice constructor
+ * @param port_idx  index of the port to use
+ * @param use_lro   forwarded to the DPDKDevice constructor
+ * @param enable_fc forwarded to the DPDKDevice constructor
+ *
+ * @return the newly created device
+ */
+std::unique_ptr<DPDKDevice> create_dpdk_net_device(
+    CephContext *cct,
+    unsigned cores,
+    uint8_t port_idx,
+    bool use_lro,
+    bool enable_fc)
+{
+  // Check that we have at least one DPDK-able port
+  if (rte_eth_dev_count() == 0) {
+    rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n");
+  } else {
+    ldout(cct, 10) << __func__ << " ports number: " << int(rte_eth_dev_count()) << dendl;
+  }
+
+  std::unique_ptr<DPDKDevice> device(
+      new DPDKDevice(cct, port_idx, cores, use_lro, enable_fc));
+  return device;
+}
diff --git a/src/msg/async/dpdk/DPDK.h b/src/msg/async/dpdk/DPDK.h
new file mode 100644
index 00000000..fa12af6b
--- /dev/null
+++ b/src/msg/async/dpdk/DPDK.h
@@ -0,0 +1,918 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_DPDK_DEV_H
+#define CEPH_DPDK_DEV_H
+
+#include <memory>
+#include <functional>
+#include <rte_config.h>
+#include <rte_common.h>
+#include <rte_ethdev.h>
+#include <rte_malloc.h>
+#include <rte_version.h>
+
+#include "include/page.h"
+#include "common/Tub.h"
+#include "common/perf_counters.h"
+#include "msg/async/Event.h"
+#include "const.h"
+#include "circular_buffer.h"
+#include "ethernet.h"
+#include "Packet.h"
+#include "stream.h"
+#include "net.h"
+#include "toeplitz.h"
+
+
+// Deleter functor that releases a pointer with ::free().
+struct free_deleter {
+  void operator()(void* p) { ::free(p); }
+};
+
+
+// Indices of the per-device perf counters; the real counters lie strictly
+// between the l_dpdk_dev_first and l_dpdk_dev_last sentinels.
+enum {
+  l_dpdk_dev_first = 58800,
+  l_dpdk_dev_rx_mcast,
+  l_dpdk_dev_rx_total_errors,
+  l_dpdk_dev_tx_total_errors,
+  l_dpdk_dev_rx_badcrc_errors,
+  l_dpdk_dev_rx_dropped_errors,
+  l_dpdk_dev_rx_nombuf_errors,
+  l_dpdk_dev_last
+};
+
+// Indices of the per-queue-pair perf counters; the real counters lie
+// strictly between the l_dpdk_qp_first and l_dpdk_qp_last sentinels.
+enum {
+  l_dpdk_qp_first = 58900,
+  l_dpdk_qp_rx_packets,
+  l_dpdk_qp_tx_packets,
+  l_dpdk_qp_rx_bad_checksum_errors,
+  l_dpdk_qp_rx_no_memory_errors,
+  l_dpdk_qp_rx_bytes,
+  l_dpdk_qp_tx_bytes,
+  l_dpdk_qp_rx_last_bunch,
+  l_dpdk_qp_tx_last_bunch,
+  l_dpdk_qp_rx_fragments,
+  l_dpdk_qp_tx_fragments,
+  l_dpdk_qp_rx_copy_ops,
+  l_dpdk_qp_tx_copy_ops,
+  l_dpdk_qp_rx_copy_bytes,
+  l_dpdk_qp_tx_copy_bytes,
+  l_dpdk_qp_rx_linearize_ops,
+  l_dpdk_qp_tx_linearize_ops,
+  l_dpdk_qp_tx_queue_length,
+  l_dpdk_qp_last
+};
+
+class DPDKDevice;
+class DPDKWorker;
+
+class DPDKQueuePair {
+  using packet_provider_type = std::function<Tub<Packet> ()>;
+ public:
+  void configure_proxies(const std::map<unsigned, float>& cpu_weights);
+  // build REdirection TAble for cpu_weights map: target cpu -> weight
+  void build_sw_reta(const std::map<unsigned, float>& cpu_weights);
+  // Appends the packet to the proxy packet queue (_proxy_packetq).
+  void proxy_send(Packet p) {
+    _proxy_packetq.push_back(std::move(p));
+  }
+  // Registers a callback that poll_tx() queries for packets to send.
+  void register_packet_provider(packet_provider_type func) {
+    _pkt_providers.push_back(std::move(func));
+  }
+  bool poll_tx();
+  friend class DPDKDevice;
+
+  class tx_buf_factory;
+
+  //
+  // A Tx buffer: an rte_mbuf followed by the private bookkeeping needed to
+  // return the buffer to its factory and to keep the original Packet alive
+  // until the buffer is reset.
+  //
+  class tx_buf {
+    friend class DPDKQueuePair;
+   public:
+    //
+    // Reinterprets an rte_mbuf pointer as the tx_buf it is embedded in.
+    // NOTE(review): relies on _mbuf being the first member of tx_buf
+    // (it is declared first below) - confirm the layout assumption.
+    //
+    static tx_buf* me(rte_mbuf* mbuf) {
+      return reinterpret_cast<tx_buf*>(mbuf);
+    }
+
+   private:
+    /**
+     * Checks if the original packet of a given cluster should be linearized
+     * due to HW limitations.
+     *
+     * @param head head of a cluster to check
+     *
+     * @return TRUE if a packet should be linearized.
+     */
+    static bool i40e_should_linearize(rte_mbuf *head);
+
+    /**
+     * Sets the offload info in the head buffer of an rte_mbufs cluster.
+     *
+     * @param p an original packet the cluster is built for
+     * @param qp QP handle
+     * @param head a head of an rte_mbufs cluster
+     */
+    static void set_cluster_offload_info(const Packet& p, const DPDKQueuePair& qp, rte_mbuf* head);
+
+    /**
+     * Creates a tx_buf cluster representing a given packet in a "zero-copy"
+     * way.
+     *
+     * @param cct Ceph context used for logging
+     * @param p packet to translate
+     * @param qp DPDKQueuePair handle
+     *
+     * @return the HEAD tx_buf of the cluster or nullptr in case of a
+     *         failure
+     */
+    static tx_buf* from_packet_zc(
+            CephContext *cct, Packet&& p, DPDKQueuePair& qp);
+
+    /**
+     * Copy the contents of the "packet" into the given cluster of
+     * rte_mbuf's.
+     *
+     * @note Size of the cluster has to be big enough to accommodate all the
+     *       contents of the given packet.
+     *
+     * @param p packet to copy
+     * @param head head of the rte_mbuf's cluster
+     */
+    static void copy_packet_to_cluster(const Packet& p, rte_mbuf* head);
+
+    /**
+     * Creates a tx_buf cluster representing a given packet in a "copy" way.
+     *
+     * @param p packet to translate
+     * @param qp DPDKQueuePair handle
+     *
+     * @return the HEAD tx_buf of the cluster or nullptr in case of a
+     *         failure
+     */
+    static tx_buf* from_packet_copy(Packet&& p, DPDKQueuePair& qp);
+
+    /**
+     * Generic handling of a single fragment: the fragment is spread over
+     * one or more rte_mbufs, each of them produced by the given functor
+     * (zero-copy or copy), and the mbufs are chained into a cluster.
+     *
+     * If the functor fails after the head has been created, the cluster
+     * built so far is recycled.
+     *
+     * @param do_one_buf Functor responsible for a single rte_mbuf
+     *                   handling
+     * @param qp DPDKQueuePair handle (in)
+     * @param frag Fragment to handle (in)
+     * @param head Head of the cluster (out)
+     * @param last_seg Last segment of the cluster (out)
+     * @param nsegs Number of segments in the cluster (out)
+     *
+     * @return TRUE in case of success
+     */
+    template <class DoOneBufFunc>
+    static bool do_one_frag(DoOneBufFunc do_one_buf, DPDKQueuePair& qp,
+                            fragment& frag, rte_mbuf*& head,
+                            rte_mbuf*& last_seg, unsigned& nsegs) {
+      size_t len, left_to_set = frag.size;
+      char* base = frag.base;
+
+      rte_mbuf* m;
+
+      // TODO: ceph_assert() in a fast path! Remove me ASAP!
+      ceph_assert(frag.size);
+
+      // Create a HEAD of mbufs' cluster and set the first bytes into it
+      len = do_one_buf(qp, head, base, left_to_set);
+      if (!len) {
+        return false;
+      }
+
+      left_to_set -= len;
+      base += len;
+      nsegs = 1;
+
+      //
+      // Set the rest of the data into the new mbufs and chain them to
+      // the cluster.
+      //
+      rte_mbuf* prev_seg = head;
+      while (left_to_set) {
+        len = do_one_buf(qp, m, base, left_to_set);
+        if (!len) {
+          // Out of buffers: give back what has been chained so far.
+          me(head)->recycle();
+          return false;
+        }
+
+        left_to_set -= len;
+        base += len;
+        nsegs++;
+
+        prev_seg->next = m;
+        prev_seg = m;
+      }
+
+      // Return the last mbuf in the cluster
+      last_seg = prev_seg;
+
+      return true;
+    }
+
+    /**
+     * Zero-copy handling of a single fragment.
+     *
+     * @param qp DPDKQueuePair handle (in)
+     * @param frag Fragment to copy (in)
+     * @param head Head of the cluster (out)
+     * @param last_seg Last segment of the cluster (out)
+     * @param nsegs Number of segments in the cluster (out)
+     *
+     * @return TRUE in case of success
+     */
+    static bool translate_one_frag(DPDKQueuePair& qp, fragment& frag,
+                                   rte_mbuf*& head, rte_mbuf*& last_seg,
+                                   unsigned& nsegs) {
+      return do_one_frag(set_one_data_buf, qp, frag, head,
+                         last_seg, nsegs);
+    }
+
+    /**
+     * Copies one fragment into the cluster of rte_mbuf's.
+     *
+     * @param qp DPDKQueuePair handle (in)
+     * @param frag Fragment to copy (in)
+     * @param head Head of the cluster (out)
+     * @param last_seg Last segment of the cluster (out)
+     * @param nsegs Number of segments in the cluster (out)
+     *
+     * We return the "last_seg" to avoid traversing the cluster in order to get
+     * it.
+     *
+     * @return TRUE in case of success
+     */
+    static bool copy_one_frag(DPDKQueuePair& qp, fragment& frag,
+                              rte_mbuf*& head, rte_mbuf*& last_seg,
+                              unsigned& nsegs) {
+      return do_one_frag(copy_one_data_buf, qp, frag, head,
+                         last_seg, nsegs);
+    }
+
+    /**
+     * Allocates a single rte_mbuf and sets it to point to a given data
+     * buffer.
+     *
+     * @note In the current code the function unconditionally falls back to
+     *       copy_one_data_buf() (see the FIXME below); the zero-copy code
+     *       after that return statement is unreachable.
+     *
+     * @param qp DPDKQueuePair handle (in)
+     * @param m New allocated rte_mbuf (out)
+     * @param va virtual address of a data buffer (in)
+     * @param buf_len length of the data to copy (in)
+     *
+     * @return The actual number of bytes that has been set in the mbuf
+     */
+    static size_t set_one_data_buf(
+        DPDKQueuePair& qp, rte_mbuf*& m, char* va, size_t buf_len) {
+      static constexpr size_t max_frag_len = 15 * 1024; // 15K
+
+      // FIXME: current all tx buf is allocated without rte_malloc
+      return copy_one_data_buf(qp, m, va, buf_len);
+      //
+      // Everything below is currently unreachable because of the
+      // unconditional return above.
+      //
+      // Currently we break a buffer on a 15K boundary because 82599
+      // devices have a 15.5K limitation on a maximum single fragment
+      // size.
+      //
+      rte_iova_t pa = rte_malloc_virt2iova(va);
+      if (!pa)
+        return copy_one_data_buf(qp, m, va, buf_len);
+
+      ceph_assert(buf_len);
+      tx_buf* buf = qp.get_tx_buf();
+      if (!buf) {
+        return 0;
+      }
+
+      size_t len = std::min(buf_len, max_frag_len);
+
+      buf->set_zc_info(va, pa, len);
+      m = buf->rte_mbuf_p();
+
+      return len;
+    }
+
+    /**
+     *  Allocates a single rte_mbuf and copies a given data into it.
+     *
+     * @param qp DPDKQueuePair handle (in)
+     * @param m New allocated rte_mbuf (out)
+     * @param data Data to copy from (in)
+     * @param buf_len length of the data to copy (in)
+     *
+     * @return The actual number of bytes that has been copied
+     */
+    static size_t copy_one_data_buf(
+        DPDKQueuePair& qp, rte_mbuf*& m, char* data, size_t buf_len);
+
+    /**
+     * Checks if the first fragment of the given packet satisfies the
+     * zero-copy flow requirement: its first 128 bytes should not cross the
+     * 4K page boundary. This is required in order to avoid splitting packet
+     * headers.
+     *
+     * @note The current implementation only checks that the first fragment
+     *       holds at least 128 bytes; no page-boundary check is done here.
+     *
+     * @param p packet to check
+     *
+     * @return TRUE if packet is ok and FALSE otherwise.
+     */
+    static bool check_frag0(Packet& p)
+    {
+      //
+      // First frag is special - it has headers that should not be split.
+      // If the addressing is such that the first fragment has to be
+      // split, then send this packet in a (non-zero) copy flow. We'll
+      // check if the first 128 bytes of the first fragment reside in the
+      // physically contiguous area. If that's the case - we are good to
+      // go.
+      //
+      if (p.frag(0).size < 128)
+        return false;
+
+      return true;
+    }
+
+   public:
+    // Remembers the original buffer address/offset of the embedded mbuf so
+    // that reset_zc() can restore them after a zero-copy use.
+    tx_buf(tx_buf_factory& fc) : _fc(fc) {
+
+      _buf_physaddr = _mbuf.buf_physaddr;
+      _data_off     = _mbuf.data_off;
+    }
+
+    // Returns the embedded rte_mbuf.
+    rte_mbuf* rte_mbuf_p() { return &_mbuf; }
+
+    // Points the embedded mbuf at an external data buffer (zero-copy).
+    void set_zc_info(void* va, phys_addr_t pa, size_t len) {
+      // mbuf_put()
+      _mbuf.data_len           = len;
+      _mbuf.pkt_len            = len;
+
+      // Set the mbuf to point to our data
+      _mbuf.buf_addr           = va;
+      _mbuf.buf_physaddr       = pa;
+      _mbuf.data_off           = 0;
+      _is_zc                   = true;
+    }
+
+    // Releases the held packet (if any) and, if the buffer held a packet or
+    // was used in the zero-copy flow, restores the original mbuf fields.
+    void reset_zc() {
+
+      //
+      // If this mbuf was the last in a cluster and contains an
+      // original packet object then call the destructor of the
+      // original packet object.
+      //
+      if (_p) {
+        //
+        // Reset the Tub. This in particular is going to call the
+        // "packet"'s destructor and leave the Tub without a
+        // value.
+        //
+        _p.destroy();
+
+      } else if (!_is_zc) {
+        return;
+      }
+
+      // Restore the rte_mbuf fields we trashed in set_zc_info()
+      _mbuf.buf_physaddr = _buf_physaddr;
+      _mbuf.buf_addr     = rte_mbuf_to_baddr(&_mbuf);
+      _mbuf.data_off     = _data_off;
+
+      _is_zc             = false;
+    }
+
+    // Resets every mbuf of the chain starting at this buffer and returns
+    // each of them to the factory.
+    void recycle() {
+      struct rte_mbuf *m = &_mbuf, *m_next;
+
+      while (m != nullptr) {
+        // Read the link first: rte_pktmbuf_reset() is called on m next.
+        m_next = m->next;
+        rte_pktmbuf_reset(m);
+        _fc.put(me(m));
+        m = m_next;
+      }
+    }
+
+    // Stores the original packet in this buffer; it is destroyed by
+    // reset_zc().
+    void set_packet(Packet&& p) {
+      _p = std::move(p);
+    }
+
+   private:
+    struct rte_mbuf _mbuf;
+    MARKER private_start;
+    // original packet this cluster was built for (set on the last segment)
+    Tub<Packet> _p;
+    // saved values of the mbuf fields overwritten by set_zc_info()
+    phys_addr_t _buf_physaddr;
+    uint16_t _data_off;
+    // TRUE if underlying mbuf has been used in the zero-copy flow
+    bool _is_zc = false;
+    // buffers' factory the buffer came from
+    tx_buf_factory& _fc;
+    MARKER private_end;
+  };
+
+  class tx_buf_factory {
+    //
+    // Number of buffers to free in each GC iteration:
+    // We want as many buffers as possible to be allocated from the
+    // mempool.
+    //
+    // On the other hand if there is no Tx for some time we want the
+    // completions to be eventually handled. Thus we choose the smallest
+    // possible packet count here.
+    //
+    static constexpr int gc_count = 1;
+   public:
+    tx_buf_factory(CephContext *c, DPDKDevice *dev, uint8_t qid);
+    ~tx_buf_factory() {
+      // put all mbufs back into the mempool in order to make the next factory work
+      while (gc());  // first move every currently completed buffer into _ring
+      rte_mempool_put_bulk(_pool, (void**)_ring.data(),
+                           _ring.size());
+    }
+
+
+    /**
+     * @note Takes a HW-completed buffer first, then falls back to the _ring cache.
+     *
+     * @return a free tx_buf object, or nullptr if there are no free tx_buf's
+     */
+    tx_buf* get() {
+      // Take completed from the HW first
+      tx_buf *pkt = get_one_completed();
+      if (pkt) {
+        pkt->reset_zc();
+        return pkt;
+      }
+
+      //
+      // If there are no completed at the moment - take from the
+      // factory's cache.
+      //
+      if (_ring.empty()) {
+        return nullptr;
+      }
+
+      pkt = _ring.back();
+      _ring.pop_back();
+
+      return pkt;
+    }
+
+    void put(tx_buf* buf) {
+      buf->reset_zc();  // release any held packet / zero-copy state before caching the buffer
+      _ring.push_back(buf);
+    }
+
+    bool gc() {
+      for (int cnt = 0; cnt < gc_count; ++cnt) {
+        auto tx_buf_p = get_one_completed();
+        if (!tx_buf_p) {
+          return false;  // nothing (more) completed right now
+        }
+
+        put(tx_buf_p);
+      }
+
+      return true;  // a full batch of gc_count buffers was reclaimed
+    }
+   private:
+    /**
+     * Fill the mbufs circular buffer: after this the _pool will become
+     * empty. We will use it to catch the completed buffers:
+     *
+     * - Underlying PMD drivers will "free" the mbufs once they are
+     *   completed.
+     * - We will poll the _pool till it's empty and release
+     *   all the buffers from the freed mbufs.
+     */
+    void init_factory() {
+      while (rte_mbuf* mbuf = rte_pktmbuf_alloc(_pool)) {
+        _ring.push_back(new(tx_buf::me(mbuf)) tx_buf{*this});  // placement-new at the address me() yields
+      }
+    }
+
+    /**
+     * PMD puts the completed buffers back into the mempool they have
+     * originally come from.
+     *
+     * @note rte_pktmbuf_alloc() resets the mbuf so there is no need to call
+     *       rte_pktmbuf_reset() here again.
+     *
+     * @return a single tx_buf that has been completed by HW.
+     */
+    tx_buf* get_one_completed() {
+      return tx_buf::me(rte_pktmbuf_alloc(_pool));  // NOTE(review): relies on me(nullptr) yielding nullptr - confirm
+    }
+
+   private:
+    CephContext *cct;
+    std::vector<tx_buf*> _ring;  // cache of free tx_bufs; used LIFO (push_back / back / pop_back)
+    rte_mempool* _pool = nullptr;  // mempool the mbufs are allocated from and returned to
+  };
+
+ public:
+  explicit DPDKQueuePair(CephContext *c, EventCenter *cen, DPDKDevice* dev, uint8_t qid);
+  ~DPDKQueuePair() {
+    if (device_stat_time_fd) {  // member defaults to 0, i.e. no time event to delete
+      center->delete_time_event(device_stat_time_fd);
+    }
+    rx_gc(true);  // final pass with force=true
+  }
+
+  void rx_start() {
+    _rx_poller.construct(this);
+  }
+
+  uint32_t send(circular_buffer<Packet>& pb) {
+    // Zero-copy send: each Packet goes through tx_buf::from_packet_zc(); returns _send()'s count
+    return _send(pb, [&] (Packet&& p) {
+      return tx_buf::from_packet_zc(cct, std::move(p), *this);
+    });
+  }
+
+  DPDKDevice& port() const { return *_dev; }
+  tx_buf* get_tx_buf() { return _tx_buf_factory.get(); }
+
+  void handle_stats();
+
+ private:
+  template <class Func>  // Func: callable taking Packet&& and returning tx_buf* (nullptr = no buffer)
+  uint32_t _send(circular_buffer<Packet>& pb, Func &&packet_to_tx_buf_p) {
+    if (_tx_burst.size() == 0) {  // build a new burst only when no earlier burst is pending
+      for (auto&& p : pb) {
+        // TODO: ceph_assert() in a fast path! Remove me ASAP!
+        ceph_assert(p.len());
+
+        tx_buf* buf = packet_to_tx_buf_p(std::move(p));
+        if (!buf) {
+          break;
+        }
+
+        _tx_burst.push_back(buf->rte_mbuf_p());
+      }
+    }
+    // Offer the not-yet-sent tail of the burst to the device.
+    uint16_t sent = rte_eth_tx_burst(_dev_port_idx, _qid,
+                                     _tx_burst.data() + _tx_burst_idx,
+                                     _tx_burst.size() - _tx_burst_idx);
+
+    uint64_t nr_frags = 0, bytes = 0;
+    // Account for what was sent and drop the corresponding entries from the caller's queue.
+    for (int i = 0; i < sent; i++) {
+      rte_mbuf* m = _tx_burst[_tx_burst_idx + i];
+      bytes    += m->pkt_len;
+      nr_frags += m->nb_segs;
+      pb.pop_front();  // the Packet was moved from above; it leaves the queue only now
+    }
+
+    perf_logger->inc(l_dpdk_qp_tx_fragments, nr_frags);
+    perf_logger->inc(l_dpdk_qp_tx_bytes, bytes);
+
+    _tx_burst_idx += sent;
+    // Whole burst sent: reset so that the next call builds a fresh one.
+    if (_tx_burst_idx == _tx_burst.size()) {
+      _tx_burst_idx = 0;
+      _tx_burst.clear();
+    }
+
+    return sent;
+  }
+
+  /**
+   * Take a data buffer from the back of @p datas and set the mbuf to point to it.
+   *
+   * Do some DPDK hacks to work on PMD: it assumes that the buf_addr
+   * points to the private data of RTE_PKTMBUF_HEADROOM before the actual
+   * data buffer.
+   *
+   * @param m mbuf to update (@p size is not used here); @return false if @p datas is empty
+   */
+  static bool refill_rx_mbuf(rte_mbuf* m, size_t size,
+                             std::vector<void*> &datas) {
+    if (datas.empty())
+      return false;
+    void *data = datas.back();
+    datas.pop_back();
+
+    //
+    // Set the mbuf to point to our data.
+    //
+    // Do some DPDK hacks to work on PMD: it assumes that the buf_addr
+    // points to the private data of RTE_PKTMBUF_HEADROOM before the
+    // actual data buffer.
+    //
+    m->buf_addr      = (char*)data - RTE_PKTMBUF_HEADROOM;
+    m->buf_physaddr  = rte_mem_virt2phy(data) - RTE_PKTMBUF_HEADROOM;
+    return true;
+  }
+
+  bool init_rx_mbuf_pool();
+  bool rx_gc(bool force=false);
+  bool refill_one_cluster(rte_mbuf* head);
+
+  /**
+   * Polls for a burst of incoming packets. This function will not block and
+   * will immediately return after processing all available packets.
+   *
+   */
+  bool poll_rx_once();
+
+  /**
+   * Translates rte_mbuf's into packets and feeds them to _rx_stream.
+   *
+   * @param bufs An array of received rte_mbuf's
+   * @param count Number of buffers in the bufs[]
+   */
+  void process_packets(struct rte_mbuf **bufs, uint16_t count);
+
+  /**
+   * Translate rte_mbuf into the "packet".
+   * @param m mbuf to translate
+   *
+   * @return a "optional" object representing the newly received data if in an
+   *         "engaged" state or an error if in a "disengaged" state.
+   */
+  Tub<Packet> from_mbuf(rte_mbuf* m);
+
+  /**
+   * Transform an LRO rte_mbuf cluster into the "packet" object.
+   * @param m HEAD of the mbufs' cluster to transform
+   *
+   * @return a "optional" object representing the newly received LRO packet if
+   *         in an "engaged" state or an error if in a "disengaged" state.
+   */
+  Tub<Packet> from_mbuf_lro(rte_mbuf* m);
+
+ private:
+  CephContext *cct;
+  std::vector<packet_provider_type> _pkt_providers;
+  Tub<std::array<uint8_t, 128>> _sw_reta;
+  circular_buffer<Packet> _proxy_packetq;
+  stream<Packet> _rx_stream;
+  circular_buffer<Packet> _tx_packetq;
+  std::vector<void*> _alloc_bufs;
+
+  PerfCounters *perf_logger;
+  DPDKDevice* _dev;
+  uint8_t _dev_port_idx;
+  EventCenter *center;
+  uint8_t _qid;
+  rte_mempool *_pktmbuf_pool_rx;
+  std::vector<rte_mbuf*> _rx_free_pkts;
+  std::vector<rte_mbuf*> _rx_free_bufs;
+  std::vector<fragment> _frags;
+  std::vector<char*> _bufs;
+  size_t _num_rx_free_segs = 0;
+  uint64_t device_stat_time_fd = 0;
+
+#ifdef CEPH_PERF_DEV
+  uint64_t rx_cycles = 0;
+  uint64_t rx_count = 0;
+  uint64_t tx_cycles = 0;
+  uint64_t tx_count = 0;
+#endif
+
+  class DPDKTXPoller : public EventCenter::Poller {
+    DPDKQueuePair *qp;
+
+   public:
+    explicit DPDKTXPoller(DPDKQueuePair *qp)
+        : EventCenter::Poller(qp->center, "DPDK::DPDKTXPoller"), qp(qp) {}
+
+    virtual int poll() {
+      return qp->poll_tx();
+    }
+  } _tx_poller;
+
+  class DPDKRXGCPoller : public EventCenter::Poller {
+    DPDKQueuePair *qp;
+
+   public:
+    explicit DPDKRXGCPoller(DPDKQueuePair *qp)
+        : EventCenter::Poller(qp->center, "DPDK::DPDKRXGCPoller"), qp(qp) {}
+
+    virtual int poll() {
+      return qp->rx_gc();
+    }
+  } _rx_gc_poller;
+  tx_buf_factory _tx_buf_factory;
+  class DPDKRXPoller : public EventCenter::Poller {
+    DPDKQueuePair *qp;
+
+   public:
+    explicit DPDKRXPoller(DPDKQueuePair *qp)
+        : EventCenter::Poller(qp->center, "DPDK::DPDKRXPoller"), qp(qp) {}
+
+    virtual int poll() {
+      return qp->poll_rx_once();
+    }
+  };
+  Tub<DPDKRXPoller> _rx_poller;
+  class DPDKTXGCPoller : public EventCenter::Poller {
+    DPDKQueuePair *qp;
+
+   public:
+    explicit DPDKTXGCPoller(DPDKQueuePair *qp)
+        : EventCenter::Poller(qp->center, "DPDK::DPDKTXGCPoller"), qp(qp) {}
+
+    virtual int poll() {
+      return qp->_tx_buf_factory.gc();
+    }
+  } _tx_gc_poller;
+  std::vector<rte_mbuf*> _tx_burst;
+  uint16_t _tx_burst_idx = 0;
+};
+
+class DPDKDevice {
+ public:
+  CephContext *cct;
+  PerfCounters *perf_logger;
+  std::vector<std::unique_ptr<DPDKQueuePair>> _queues;
+  std::vector<DPDKWorker*> workers;
+  size_t _rss_table_bits = 0;
+  uint8_t _port_idx;
+  uint16_t _num_queues;
+  unsigned cores;
+  hw_features _hw_features;
+  uint8_t _queues_ready = 0;
+  unsigned _home_cpu;
+  bool _use_lro;
+  bool _enable_fc;
+  std::vector<uint8_t> _redir_table;
+  rss_key_type _rss_key;
+  bool _is_i40e_device = false;
+  bool _is_vmxnet3_device = false;
+
+ public:
+  rte_eth_dev_info _dev_info = {};
+
+  /**
+   * The final stage of a port initialization.
+   * @note Must be called *after* all queues from stage (2) have been
+   *       initialized.
+   */
+  int init_port_fini();
+
+ private:
+  /**
+   * Port initialization consists of 3 main stages:
+   * 1) General port initialization which ends with a call to
+   *    rte_eth_dev_configure() where we request the needed number of Rx and
+   *    Tx queues.
+   * 2) Individual queues initialization. This is done in the constructor of
+   *    DPDKQueuePair class. In particular the memory pools for queues are allocated
+   *    in this stage.
+   * 3) The final stage of the initialization which starts with the call of
+   *    rte_eth_dev_start() after which the port becomes fully functional. We
+   *    will also wait for a link to get up in this stage.
+   */
+
+
+  /**
+   * First stage of the port initialization.
+   *
+   * @return 0 in case of success and an appropriate error code in case of an
+   *         error.
+   */
+  int init_port_start();
+
+  /**
+   * Check the link status of our port for up to 9s, and finally print it.
+   */
+  int check_port_link_status();
+
+  /**
+   * Configures the HW Flow Control
+   */
+  void set_hw_flow_control();
+
+ public:
+  DPDKDevice(CephContext *c, uint8_t port_idx, uint16_t num_queues, bool use_lro, bool enable_fc):
+      cct(c), _port_idx(port_idx), _num_queues(num_queues),
+      _home_cpu(0), _use_lro(use_lro),
+      _enable_fc(enable_fc) {
+    _queues = std::vector<std::unique_ptr<DPDKQueuePair>>(_num_queues);
+    /* now initialise the port we will use */
+    int ret = init_port_start();
+    if (ret != 0) {
+      rte_exit(EXIT_FAILURE, "Cannot initialise port %u\n", _port_idx);
+    }
+    string name(std::string("port") + std::to_string(port_idx));
+    PerfCountersBuilder plb(cct, name, l_dpdk_dev_first, l_dpdk_dev_last);
+
+    plb.add_u64_counter(l_dpdk_dev_rx_mcast, "dpdk_device_receive_multicast_packets", "DPDK received multicast packets");
+    plb.add_u64_counter(l_dpdk_dev_rx_badcrc_errors, "dpdk_device_receive_badcrc_errors", "DPDK received bad crc errors");
+
+    plb.add_u64_counter(l_dpdk_dev_rx_total_errors, "dpdk_device_receive_total_errors", "DPDK received total_errors");
+    plb.add_u64_counter(l_dpdk_dev_tx_total_errors, "dpdk_device_send_total_errors", "DPDK sendd total_errors");
+    plb.add_u64_counter(l_dpdk_dev_rx_dropped_errors, "dpdk_device_receive_dropped_errors", "DPDK received dropped errors");
+    plb.add_u64_counter(l_dpdk_dev_rx_nombuf_errors, "dpdk_device_receive_nombuf_errors", "DPDK received RX mbuf allocation errors");
+
+    perf_logger = plb.create_perf_counters();
+    cct->get_perfcounters_collection()->add(perf_logger);
+  }
+
+  ~DPDKDevice() {
+    rte_eth_dev_stop(_port_idx);
+  }
+
+  DPDKQueuePair& queue_for_cpu(unsigned cpu) { return *_queues[cpu]; }
+  void l2receive(int qid, Packet p) {
+    _queues[qid]->_rx_stream.produce(std::move(p));
+  }
+  subscription<Packet> receive(unsigned cpuid, std::function<int (Packet)> next_packet) {
+    auto sub = _queues[cpuid]->_rx_stream.listen(std::move(next_packet));
+    _queues[cpuid]->rx_start();  // the RX poller is constructed only after the listener is attached
+    return sub;  // a named local is moved (or elided) on return; std::move() only prevented elision
+  }
+  ethernet_address hw_address() {
+    struct ether_addr mac;
+    rte_eth_macaddr_get(_port_idx, &mac);
+
+    return mac.addr_bytes;
+  }
+  hw_features get_hw_features() {
+    return _hw_features;
+  }
+  const rss_key_type& rss_key() const { return _rss_key; }
+  uint16_t hw_queues_count() { return _num_queues; }
+  std::unique_ptr<DPDKQueuePair> init_local_queue(CephContext *c, EventCenter *center, string hugepages, uint16_t qid) {
+    // Creates the queue pair for queue `qid` on this device and returns ownership of it.
+    // `hugepages` is accepted for interface compatibility but is not used here.
+    return std::unique_ptr<DPDKQueuePair>(new DPDKQueuePair(c, center, this, qid));
+  }
+  unsigned hash2qid(uint32_t hash) {
+    // return hash % hw_queues_count();
+    return _redir_table[hash & (_redir_table.size() - 1)];
+  }
+  void set_local_queue(unsigned i, std::unique_ptr<DPDKQueuePair> qp) {
+    ceph_assert(!_queues[i]);
+    _queues[i] = std::move(qp);
+  }
+  void unset_local_queue(unsigned i) {
+    ceph_assert(_queues[i]);
+    _queues[i].reset();
+  }
+  template <typename Func>
+  unsigned forward_dst(unsigned src_cpuid, Func&& hashfn) {
+    auto& qp = queue_for_cpu(src_cpuid);
+    if (!qp._sw_reta)
+      return src_cpuid;  // no software redirection table on this queue: stay on the source cpu
+    // From here on _sw_reta is set. (An assert of the opposite used to sit here and
+    // fired on every redirected lookup.) hashfn() is only evaluated on this path.
+    auto hash = hashfn() >> _rss_table_bits;
+    auto& reta = *qp._sw_reta;
+    return reta[hash % reta.size()];
+  }
+  unsigned hash2cpu(uint32_t hash) {
+    // there is an assumption here that qid == get_id() which will
+    // not necessary be true in the future
+    return forward_dst(hash2qid(hash), [hash] { return hash; });
+  }
+
+  hw_features& hw_features_ref() { return _hw_features; }
+
+  const rte_eth_rxconf* def_rx_conf() const {
+    return &_dev_info.default_rxconf;
+  }
+
+  const rte_eth_txconf* def_tx_conf() const {
+    return &_dev_info.default_txconf;
+  }
+
+  /**
+   *  Set the RSS table in the device and store it in the internal vector.
+   */
+  void set_rss_table();
+
+  uint8_t port_idx() { return _port_idx; }
+  bool is_i40e_device() const {
+    return _is_i40e_device;
+  }
+  bool is_vmxnet3_device() const {
+    return _is_vmxnet3_device;
+  }
+};
+
+
+std::unique_ptr<DPDKDevice> create_dpdk_net_device(
+    CephContext *c, unsigned cores, uint8_t port_idx = 0,
+    bool use_lro = true, bool enable_fc = true);
+
+
+/**
+ * @return Number of bytes needed for mempool objects of each QP.
+ */
+uint32_t qp_mempool_obj_size();
+
+#endif // CEPH_DPDK_DEV_H
diff --git a/src/msg/async/dpdk/DPDKStack.cc b/src/msg/async/dpdk/DPDKStack.cc
new file mode 100644
index 00000000..3101ae57
--- /dev/null
+++ b/src/msg/async/dpdk/DPDKStack.cc
@@ -0,0 +1,281 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <memory>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <tuple>
+
+#include "common/ceph_argparse.h"
+#include "dpdk_rte.h"
+#include "DPDKStack.h"
+#include "DPDK.h"
+#include "IP.h"
+#include "TCP-Stack.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+#include "common/Cond.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdkstack "
+
+static int dpdk_thread_adaptor(void* f)
+{
+  (*static_cast<std::function<void ()>*>(f))();  // f points at a std::function<void()>; invoke it
+  return 0;
+}
+
+void DPDKWorker::initialize()
+{
+  static enum {
+    WAIT_DEVICE_STAGE,
+    WAIT_PORT_FIN_STAGE,
+    DONE
+  } create_stage = WAIT_DEVICE_STAGE;
+  static Mutex lock("DPDKStack::lock");
+  static Cond cond;
+  static unsigned queue_init_done = 0;
+  static unsigned cores = 0;
+  static std::shared_ptr<DPDKDevice> sdev;
+  // The statics above are shared by all workers; worker 0 drives the stage transitions.
+  unsigned i = center.get_id();
+  if (i == 0) {
+    // Stage 1: worker 0 creates the shared device on port ms_dpdk_port_id,
+    // then lets the other workers proceed.
+    cores = cct->_conf->ms_async_op_threads;
+    std::unique_ptr<DPDKDevice> dev = create_dpdk_net_device(
+        cct, cores, cct->_conf->ms_dpdk_port_id,
+        cct->_conf->ms_dpdk_lro,
+        cct->_conf->ms_dpdk_hw_flow_control);
+    sdev = std::shared_ptr<DPDKDevice>(dev.release());
+    sdev->workers.resize(cores);
+    ldout(cct, 1) << __func__ << " using " << cores << " cores " << dendl;
+
+    Mutex::Locker l(lock);
+    create_stage = WAIT_PORT_FIN_STAGE;
+    cond.Signal();
+  } else {
+    Mutex::Locker l(lock);
+    while (create_stage <= WAIT_DEVICE_STAGE)
+      cond.Wait(lock);
+  }
+  ceph_assert(sdev);
+  if (i < sdev->hw_queues_count()) {  // stage 2: every worker sets up its own hardware queue
+    auto qp = sdev->init_local_queue(cct, &center, cct->_conf->ms_dpdk_hugepages, i);
+    std::map<unsigned, float> cpu_weights;
+    for (unsigned j = sdev->hw_queues_count() + i % sdev->hw_queues_count();
+         j < cores; j+= sdev->hw_queues_count())
+      cpu_weights[j] = 1;  // weight belongs to cpu j (was written to index i, discarding it)
+    cpu_weights[i] = cct->_conf->ms_dpdk_hw_queue_weight;
+    qp->configure_proxies(cpu_weights);
+    sdev->set_local_queue(i, std::move(qp));
+    Mutex::Locker l(lock);
+    ++queue_init_done;
+    cond.Signal();
+  } else {
+    // auto master = qid % sdev->hw_queues_count();
+    // sdev->set_local_queue(create_proxy_net_device(master, sdev.get()));
+    ceph_abort();
+  }
+  if (i == 0) {  // stage 3: worker 0 finishes the port once every queue is ready
+    {
+      Mutex::Locker l(lock);
+      while (queue_init_done < cores)
+        cond.Wait(lock);
+    }
+
+    if (sdev->init_port_fini() < 0) {
+      lderr(cct) << __func__ << " init_port_fini failed " << dendl;
+      ceph_abort();
+    }
+    Mutex::Locker l(lock);
+    create_stage = DONE;
+    cond.Signal();
+  } else {
+    Mutex::Locker l(lock);
+    while (create_stage <= WAIT_PORT_FIN_STAGE)
+      cond.Wait(lock);
+  }
+
+  sdev->workers[i] = this;
+  _impl = std::unique_ptr<DPDKWorker::Impl>(
+          new DPDKWorker::Impl(cct, i, &center, sdev));
+  {
+    Mutex::Locker l(lock);
+    if (!--queue_init_done) {  // last worker through: rewind the shared state
+      create_stage = WAIT_DEVICE_STAGE;
+      sdev.reset();  // drops only the static reference; each Impl holds its own
+    }
+  }
+}
+
+using AvailableIPAddress = std::tuple<string, string, string>;
+static bool parse_available_address(
+        const string &ips, const string &gates, const string &masks, vector<AvailableIPAddress> &res)
+{
+  // Split the three lists; entry k of each one describes the same address.
+  vector<string> hosts, gateways, netmasks;
+  string_to_vec(hosts, ips);
+  string_to_vec(gateways, gates);
+  string_to_vec(netmasks, masks);
+  const size_t count = hosts.size();
+  if (count == 0 || count != gateways.size() || count != netmasks.size())
+    return false;  // nothing configured, or the lists differ in length
+  for (size_t k = 0; k != count; ++k)
+    res.push_back(AvailableIPAddress{hosts[k], gateways[k], netmasks[k]});
+  return true;
+}
+
+static bool match_available_address(const vector<AvailableIPAddress> &avails,
+                                    const entity_addr_t &ip, int &res)
+{
+  // Store in `res` the index of the first entry that parses and is the same host as `ip`.
+  size_t idx = 0;
+  for (const AvailableIPAddress &entry : avails) {
+    entity_addr_t candidate;
+    if (candidate.parse(std::get<0>(entry).c_str()) && candidate.is_same_host(ip)) {
+      res = idx;
+      return true;
+    }
+    ++idx;
+  }
+  return false;
+}
+
+DPDKWorker::Impl::Impl(CephContext *cct, unsigned i, EventCenter *c, std::shared_ptr<DPDKDevice> dev)
+    : id(i), _netif(cct, dev, c), _dev(dev), _inet(cct, c, &_netif)
+{
+  vector<AvailableIPAddress> tuples;  // (host, gateway, netmask) triples from the ms_dpdk_* options
+  bool parsed = parse_available_address(cct->_conf.get_val<std::string>("ms_dpdk_host_ipv4_addr"),
+                                        cct->_conf.get_val<std::string>("ms_dpdk_gateway_ipv4_addr"),
+                                        cct->_conf.get_val<std::string>("ms_dpdk_netmask_ipv4_addr"), tuples);
+  if (!parsed) {  // no address, or the three lists differ in length: unrecoverable here
+    lderr(cct) << __func__ << " no available address "
+               << cct->_conf.get_val<std::string>("ms_dpdk_host_ipv4_addr") << ", "
+               << cct->_conf.get_val<std::string>("ms_dpdk_gateway_ipv4_addr") << ", "
+               << cct->_conf.get_val<std::string>("ms_dpdk_netmask_ipv4_addr") << ", "
+               << dendl;
+    ceph_abort();
+  }
+  _inet.set_host_address(ipv4_address(std::get<0>(tuples[0])));  // only the first triple is applied
+  _inet.set_gw_address(ipv4_address(std::get<1>(tuples[0])));
+  _inet.set_netmask_address(ipv4_address(std::get<2>(tuples[0])));
+}
+
+DPDKWorker::Impl::~Impl()
+{
+  _dev->unset_local_queue(id);
+}
+
+int DPDKWorker::listen(entity_addr_t &sa, const SocketOptions &opt,
+                       ServerSocket *sock)
+{
+  ceph_assert(sa.get_family() == AF_INET);  // only AF_INET addresses are accepted
+  ceph_assert(sock);
+
+  ldout(cct, 10) << __func__ << " addr " << sa << dendl;
+  // vector<AvailableIPAddress> tuples;
+  // bool parsed = parse_available_address(cct->_conf->ms_dpdk_host_ipv4_addr,
+  //                                       cct->_conf->ms_dpdk_gateway_ipv4_addr,
+  //                                       cct->_conf->ms_dpdk_netmask_ipv4_addr, tuples);
+  // if (!parsed) {
+  //   lderr(cct) << __func__ << " no available address "
+  //              << cct->_conf->ms_dpdk_host_ipv4_addr << ", "
+  //              << cct->_conf->ms_dpdk_gateway_ipv4_addr << ", "
+  //              << cct->_conf->ms_dpdk_netmask_ipv4_addr << ", "
+  //              << dendl;
+  //   return -EINVAL;
+  // }
+  // int idx;
+  // parsed = match_available_address(tuples, sa, idx);
+  // if (!parsed) {
+  //   lderr(cct) << __func__ << " no matched address for " << sa << dendl;
+  //   return -EINVAL;
+  // }
+  // _inet.set_host_address(ipv4_address(std::get<0>(tuples[idx])));
+  // _inet.set_gw_address(ipv4_address(std::get<1>(tuples[idx])));
+  // _inet.set_netmask_address(ipv4_address(std::get<2>(tuples[idx])));
+  return tcpv4_listen(_impl->_inet.get_tcp(), sa.get_port(), opt, sa.get_type(),  // only port and type of sa are used
+		      sock);
+}
+
+int DPDKWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket)
+{
+  // ceph_assert(addr.get_family() == AF_INET);
+  int r =  tcpv4_connect(_impl->_inet.get_tcp(), addr, socket);  // `opts` is not consulted
+  ldout(cct, 10) << __func__ << " addr " << addr << dendl;
+  return r;  // tcpv4_connect()'s result, passed through unchanged
+}
+
+void DPDKStack::spawn_worker(unsigned i, std::function<void ()> &&func)
+{
+  // Store the worker body in funcs[i], initialise the EAL, then launch the
+  // body on the i-th slave lcore; the launch call is issued via execute_on_master().
+  funcs[i] = std::move(func);
+  int r = 0;
+  r = dpdk::eal::init(cct);
+  if (r < 0) {
+    lderr(cct) << __func__ << " init dpdk rte failed, r=" << r << dendl;
+    ceph_abort();
+  }
+  // if dpdk::eal::init already called by NVMEDevice, we will select 1..n
+  // cores
+  ceph_assert(rte_lcore_count() >= i + 1);
+  unsigned core_id;
+  int j = i;  // keep the worker index: `i` is consumed by the countdown below
+  RTE_LCORE_FOREACH_SLAVE(core_id) {
+    if (i-- == 0) {
+      break;
+    }
+  }
+  dpdk::eal::execute_on_master([&]() {  // NOTE(review): captures locals by reference, so presumably runs synchronously - confirm
+    r = rte_eal_remote_launch(dpdk_thread_adaptor, static_cast<void*>(&funcs[j]), core_id);
+    if (r < 0) {
+      lderr(cct) << __func__ << " remote launch failed, r=" << r << dendl;
+      ceph_abort();
+    }
+  });
+}
+
+void DPDKStack::join_worker(unsigned i)
+{
+  dpdk::eal::execute_on_master([&]() {
+    rte_eal_wait_lcore(i+1);  // NOTE(review): assumes worker i is on lcore id i+1; spawn_worker() picks the i-th slave lcore - confirm they coincide
+  });
+}
diff --git a/src/msg/async/dpdk/DPDKStack.h b/src/msg/async/dpdk/DPDKStack.h
new file mode 100644
index 00000000..a44ae383
--- /dev/null
+++ b/src/msg/async/dpdk/DPDKStack.h
@@ -0,0 +1,257 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#ifndef CEPH_MSG_DPDKSTACK_H
+#define CEPH_MSG_DPDKSTACK_H
+
+#include <functional>
+
+#include "common/ceph_context.h"
+#include "common/Tub.h"
+
+#include "msg/async/Stack.h"
+#include "net.h"
+#include "const.h"
+#include "IP.h"
+#include "Packet.h"
+
+class interface;
+
+template <typename Protocol>
+class NativeConnectedSocketImpl;
+
+// DPDKServerSocketImpl: ServerSocketImpl that forwards to a Protocol::listener
+template <typename Protocol>
+class DPDKServerSocketImpl : public ServerSocketImpl {
+  typename Protocol::listener _listener;  // obtained from proto.listen(port) in the constructor
+ public:
+  DPDKServerSocketImpl(Protocol& proto, uint16_t port, const SocketOptions &opt,
+		       int type);
+  int listen() {
+    return _listener.listen();
+  }
+  virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+  virtual void abort_accept() override;
+  virtual int fd() const override {
+    return _listener.fd();
+  }
+};
+
+// NativeConnectedSocketImpl
+template <typename Protocol>
+class NativeConnectedSocketImpl : public ConnectedSocketImpl {
+  typename Protocol::connection _conn;
+  uint32_t _cur_frag = 0;
+  uint32_t _cur_off = 0;
+  Tub<Packet> _buf;
+  Tub<bufferptr> _cache_ptr;
+
+ public:
+  explicit NativeConnectedSocketImpl(typename Protocol::connection conn)
+          : _conn(std::move(conn)) {}
+  NativeConnectedSocketImpl(NativeConnectedSocketImpl &&rhs)  // takes over rhs's connection and pending packet
+      : _conn(std::move(rhs._conn)), _buf(std::move(rhs._buf))  {}
+  virtual int is_connected() override {
+    return _conn.is_connected();
+  }
+
+  virtual ssize_t read(char *buf, size_t len) override {  // copying read layered on zero_copy_read()
+    size_t left = len;
+    ssize_t r = 0;
+    size_t off = 0;
+    while (left > 0) {
+      if (!_cache_ptr) {  // nothing left over from an earlier call: fetch the next chunk
+        _cache_ptr.construct();
+        r = zero_copy_read(*_cache_ptr);
+        if (r <= 0) {
+          _cache_ptr.destroy();
+          if (r == -EAGAIN)
+            break;
+          return r;  // NOTE(review): bytes already copied during this call are not reported on this path
+        }
+      }
+      if (_cache_ptr->length() <= left) {  // the whole cached chunk fits: copy it and drop the cache
+        _cache_ptr->copy_out(0, _cache_ptr->length(), buf+off);
+        left -= _cache_ptr->length();
+        off += _cache_ptr->length();
+        _cache_ptr.destroy();
+      } else {  // caller's buffer is full: keep the unread tail cached for the next call
+        _cache_ptr->copy_out(0, left, buf+off);
+        _cache_ptr->set_offset(_cache_ptr->offset() + left);
+        _cache_ptr->set_length(_cache_ptr->length() - left);
+        left = 0;
+        break;
+      }
+    }
+    return len - left ? len - left : -EAGAIN;  // bytes copied, or -EAGAIN when none were
+  }
+
+  virtual ssize_t zero_copy_read(bufferptr &data) override {  // hands out one fragment per call
+    auto err = _conn.get_errno();
+    if (err <= 0)  // NOTE(review): 0 is returned as well, so get_errno() is presumably positive while usable - confirm
+      return err;
+
+    if (!_buf) {
+      _buf = std::move(_conn.read());
+      if (!_buf)
+        return -EAGAIN;
+    }
+    // Wrap the current fragment without copying; the bound Packet travels with the deleter.
+    fragment &f = _buf->frag(_cur_frag);
+    Packet p = _buf->share(_cur_off, f.size);
+    auto del = std::bind(
+            [](Packet &p) {}, std::move(p));  // no-op body: the callable exists only to own `p`
+    data = buffer::claim_buffer(
+            f.size, f.base, make_deleter(std::move(del)));
+    if (++_cur_frag == _buf->nr_frags()) {  // last fragment handed out: drop the packet
+      _cur_frag = 0;
+      _cur_off = 0;
+      _buf.destroy();
+    } else {
+      _cur_off += f.size;
+    }
+    ceph_assert(data.length());
+    return data.length();
+  }
+  virtual ssize_t send(bufferlist &bl, bool more) override {  // `more` is not consulted
+    auto err = _conn.get_errno();
+    if (err < 0)
+      return (ssize_t)err;
+
+    size_t available = _conn.peek_sent_available();
+    if (available == 0) {
+      return 0;
+    }
+    // Gather fragments from the front of bl until `available` bytes are covered.
+    std::vector<fragment> frags;
+    std::list<bufferptr>::const_iterator pb = bl.buffers().begin();
+    uint64_t left_pbrs = bl.buffers().size();
+    uint64_t len = 0;
+    uint64_t seglen = 0;
+    while (len < available && left_pbrs--) {
+      seglen = pb->length();
+      if (len + seglen > available) {
+        // stop here if we already have at least one fragment, since there is
+        // no space available for the next ptr.
+        if (len > 0)
+          break;
+        seglen = std::min(seglen, available);  // first ptr alone is too big: send only a prefix
+      }
+      len += seglen;
+      frags.push_back(fragment{(char*)pb->c_str(), seglen});
+      ++pb;
+    }
+    // The bufferlist bound into the no-op deleter travels with the Packet.
+    if (len != bl.length()) {
+      bufferlist swapped;
+      bl.splice(0, len, &swapped);  // move the part being sent out of bl
+      auto del = std::bind(
+              [](bufferlist &bl) {}, std::move(swapped));
+      return _conn.send(Packet(std::move(frags), make_deleter(std::move(del))));
+    } else {
+      auto del = std::bind(
+              [](bufferlist &bl) {}, std::move(bl));
+
+      return _conn.send(Packet(std::move(frags), make_deleter(std::move(del))));
+    }
+  }
+  virtual void shutdown() override {
+    _conn.close_write();
+  }
+  // FIXME need to impl close
+  virtual void close() override {
+    _conn.close_write();
+  }
+  virtual int fd() const override {
+    return _conn.fd();
+  }
+  virtual int socket_fd() const override {
+    return _conn.fd();
+  }
+
+};
+
+template <typename Protocol>
+DPDKServerSocketImpl<Protocol>::DPDKServerSocketImpl(
+  Protocol& proto, uint16_t port, const SocketOptions &opt, int type)
+  : ServerSocketImpl(type), _listener(proto.listen(port)) {}
+
+template <typename Protocol>
+int DPDKServerSocketImpl<Protocol>::accept(ConnectedSocket *s, const SocketOptions &options, entity_addr_t *out, Worker *w) {
+  if (_listener.get_errno() < 0)
+    return _listener.get_errno();
+  auto c = _listener.accept();
+  if (!c)
+    return -EAGAIN;  // the listener had no connection to hand out
+
+  if (out) {  // the peer address is reported only when the caller asked for it
+    *out = c->remote_addr();
+    out->set_type(addr_type);
+  }
+  std::unique_ptr<NativeConnectedSocketImpl<Protocol>> csi(
+          new NativeConnectedSocketImpl<Protocol>(std::move(*c)));
+  *s = ConnectedSocket(std::move(csi));
+  return 0;
+}
+
+template <typename Protocol>
+void DPDKServerSocketImpl<Protocol>::abort_accept() {
+  _listener.abort_accept();
+}
+
+class DPDKWorker : public Worker {
+  struct Impl {  // per-worker network state, created in initialize() and released by destroy()
+    unsigned id;
+    interface _netif;
+    std::shared_ptr<DPDKDevice> _dev;
+    ipv4 _inet;
+    Impl(CephContext *cct, unsigned i, EventCenter *c, std::shared_ptr<DPDKDevice> dev);
+    ~Impl();
+  };
+  std::unique_ptr<Impl> _impl;
+
+  virtual void initialize() override;
+  void set_ipv4_packet_filter(ip_packet_filter* filter) {
+    _impl->_inet.set_packet_filter(filter);
+  }
+  using tcp4 = tcp<ipv4_traits>;
+
+ public:
+  explicit DPDKWorker(CephContext *c, unsigned i): Worker(c, i) {}
+  virtual int listen(entity_addr_t &addr, const SocketOptions &opts, ServerSocket *) override;
+  virtual int connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) override;
+  void arp_learn(ethernet_address l2, ipv4_address l3) {
+    _impl->_inet.learn(l2, l3);
+  }
+  virtual void destroy() override {
+    _impl.reset();  // destroys the Impl
+  }
+
+  friend class DPDKServerSocketImpl<tcp4>;
+};
+
+class DPDKStack : public NetworkStack {
+  vector<std::function<void()> > funcs;  // worker bodies, indexed by worker id
+ public:
+  explicit DPDKStack(CephContext *cct, const string &t): NetworkStack(cct, t) {
+    funcs.resize(cct->_conf->ms_async_max_op_threads);  // one slot per possible worker
+  }
+  virtual bool support_zero_copy_read() const override { return true; }
+  virtual bool support_local_listen_table() const override { return true; }
+
+  virtual void spawn_worker(unsigned i, std::function<void ()> &&func) override;
+  virtual void join_worker(unsigned i) override;
+};
+
+#endif
diff --git a/src/msg/async/dpdk/EventDPDK.cc b/src/msg/async/dpdk/EventDPDK.cc
new file mode 100644
index 00000000..5d291716
--- /dev/null
+++ b/src/msg/async/dpdk/EventDPDK.cc
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include "DPDKStack.h"
+#include "EventDPDK.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "DPDKDriver."
+
+int DPDKDriver::init(EventCenter *c, int nevent)
+{
+	return 0;
+}
+
+int DPDKDriver::add_event(int fd, int cur_mask, int add_mask)
+{
+	ldout(cct, 20) << __func__ << " add event fd=" << fd << " cur_mask=" << cur_mask
+								 << " add_mask=" << add_mask << dendl;
+	// Register add_mask for fd with the userspace event manager (cur_mask is only logged).
+	int r = manager.listen(fd, add_mask);
+	if (r < 0) {
+		lderr(cct) << __func__ << " add fd=" << fd << " failed. "
+		           << cpp_strerror(-r) << dendl;
+		return r;  // hand back the code that was just logged, not an unrelated errno
+	}
+	// Returns 0 on success, the negative value from listen() on failure.
+	return 0;
+}
+
+int DPDKDriver::del_event(int fd, int cur_mask, int delmask)
+{
+	ldout(cct, 20) << __func__ << " del event fd=" << fd << " cur_mask=" << cur_mask
+								 << " delmask=" << delmask << dendl;
+	if (delmask == EVENT_NONE) {  // no event bits requested for removal
+		return 0;
+	}
+	const int rc = manager.unlisten(fd, delmask);  // negative result is logged and returned unchanged
+	if (rc < 0) {
+		lderr(cct) << __func__ << " delete fd=" << fd << " delmask=" << delmask
+							 << " failed." << cpp_strerror(-rc) << dendl;
+		return rc;
+	}
+	return 0;
+}
+
+int DPDKDriver::resize_events(int newsize)
+{
+	return 0;
+}
+
+int DPDKDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp)
+{
+	constexpr int num_events = 512;  // compile-time batch size, so the arrays below are not VLAs
+	int events[num_events];
+	int masks[num_events];
+	// Ask the userspace manager for up to num_events ready fds and their masks.
+	int retval = manager.poll(events, masks, num_events, tvp);
+	if (retval > 0) {
+		fired_events.resize(retval);  // exactly one entry per reported event
+		for (int i = 0; i < retval; i++) {
+			fired_events[i].fd = events[i];
+			fired_events[i].mask = masks[i];
+		}
+	}
+	return retval;  // poll()'s result is passed through; fired_events is untouched when it is <= 0
+}
diff --git a/src/msg/async/dpdk/EventDPDK.h b/src/msg/async/dpdk/EventDPDK.h
new file mode 100644
index 00000000..541c2210
--- /dev/null
+++ b/src/msg/async/dpdk/EventDPDK.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_EVENTDPDK_H
+#define CEPH_EVENTDPDK_H
+
+#include "msg/async/Event.h"
+#include "msg/async/Stack.h"
+#include "UserspaceEvent.h"
+
+class DPDKDriver : public EventDriver {  // EventDriver that delegates fd registration and polling to a UserspaceEventManager
+  CephContext *cct;  // used for logging in the .cc file
+
+ public:
+  UserspaceEventManager manager;  // public member; constructed from the same CephContext
+
+  explicit DPDKDriver(CephContext *c): cct(c), manager(c) {}
+  virtual ~DPDKDriver() { }
+
+  int init(EventCenter *c, int nevent) override;  // no-op, returns 0
+  int add_event(int fd, int cur_mask, int add_mask) override;  // forwards to manager.listen()
+  int del_event(int fd, int cur_mask, int del_mask) override;  // forwards to manager.unlisten()
+  int resize_events(int newsize) override;  // no-op, returns 0
+  int event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tp) override;  // fills fired_events from manager.poll()
+  bool need_wakeup() override { return false; }  // this driver always reports that no wakeup is needed
+};
+
+#endif //CEPH_EVENTDPDK_H
diff --git a/src/msg/async/dpdk/IP.cc b/src/msg/async/dpdk/IP.cc
new file mode 100644
index 00000000..f730cded
--- /dev/null
+++ b/src/msg/async/dpdk/IP.cc
@@ -0,0 +1,470 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "common/perf_counters.h"
+
+#include "capture.h"
+#include "IP.h"
+#include "toeplitz.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdk "
+
+std::ostream& operator<<(std::ostream& os, const ipv4_address& a) {
+  // Dotted-quad rendering, most significant octet of a.ip first.
+  const auto octet = [&a](unsigned shift) { return (a.ip >> shift) & 0xff; };
+  return os << octet(24) << "." << octet(16) << "." << octet(8) << "." << octet(0);
+}
+
+utime_t ipv4::_frag_timeout = utime_t(30, 0);
+constexpr uint32_t ipv4::_frag_low_thresh;
+constexpr uint32_t ipv4::_frag_high_thresh;
+
+class C_handle_frag_timeout : public EventCallback {
+  ipv4 *_ipv4;  // non-owning; the ipv4 object allocates this callback and deletes it in its destructor
+  // Event callback that runs ipv4::frag_timeout() on the stored instance.
+ public:
+  explicit C_handle_frag_timeout(ipv4 *i): _ipv4(i) {}
+  void do_request(uint64_t fd_or_id) override {  // fd_or_id is ignored
+    _ipv4->frag_timeout();
+  }
+};
+
+enum {
+  l_dpdk_qp_first = 99000,
+  l_dpdk_total_linearize_operations,
+  l_dpdk_qp_last
+};
+
+ipv4::ipv4(CephContext *c, EventCenter *cen, interface* netif)
+  : cct(c), center(cen), _netif(netif), _global_arp(netif),
+    _arp(c, _global_arp, cen),
+    _host_address(0), _gw_address(0), _netmask(0),
+    _l3(netif, eth_protocol_num::ipv4, [this] { return get_packet(); }),
+    _rx_packets(
+      _l3.receive(
+        [this] (Packet p, ethernet_address ea) {
+          return handle_received_packet(std::move(p), ea);
+        },
+        [this] (forward_hash& out_hash_data, Packet& p, size_t off) {
+          return forward(out_hash_data, p, off);
+        }
+      )
+    ),
+    _tcp(*this, cen), _icmp(c, *this),
+    _l4({{ uint8_t(ip_protocol_num::tcp), &_tcp },
+         { uint8_t(ip_protocol_num::icmp), &_icmp }}),
+    _packet_filter(nullptr)
+{
+  PerfCountersBuilder plb(cct, "ipv4", l_dpdk_qp_first, l_dpdk_qp_last);
+  plb.add_u64_counter(l_dpdk_total_linearize_operations, "dpdk_ip_linearize_operations", "DPDK IP Packet linearization operations");
+  perf_logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perf_logger);
+  frag_handler = new C_handle_frag_timeout(this);
+}
+
+bool ipv4::forward(forward_hash& out_hash_data, Packet& p, size_t off)
+{
+  // Seed the hash input with both addresses exactly as stored in the header.
+  auto iph = p.get_header<ip_hdr>(off);
+  out_hash_data.push_back(iph->src_ip.ip);
+  out_hash_data.push_back(iph->dst_ip.ip);
+
+  auto h = iph->ntoh();
+  auto l4 = _l4[h.ip_proto];
+  // Only an unfragmented ("atomic") datagram is passed on to the registered
+  // L4 protocol so it can extend the hash; fragments and protocols without a
+  // handler are distributed by the IP fields alone.
+  const bool atomic_datagram = !h.mf() && h.offset() == 0;
+  if (l4 && atomic_datagram) {
+    l4->forward(out_hash_data, p, off + sizeof(ip_hdr));
+  }
+  return true;
+}
+
+int ipv4::handle_received_packet(Packet p, ethernet_address from)
+{
+  auto iph = p.get_header<ip_hdr>(0);
+  if (!iph) {
+    return 0;
+  }
+  // Receive path for one IPv4 packet. Every exit returns 0; rejected packets are simply dropped.
+  // Skip checking csum of reassembled IP datagram
+  if (!get_hw_features().rx_csum_offload && !p.offload_info_ref().reassembled) {
+    checksummer csum;
+    csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph));
+    if (csum.get() != 0) {
+      return 0;
+    }
+  }
+
+  auto h = iph->ntoh();
+  unsigned ip_len = h.len;
+  unsigned ip_hdr_len = h.ihl * 4;
+  unsigned pkt_len = p.len();
+  auto offset = h.offset();
+
+  ldout(cct, 10) << __func__ << " get " << std::hex << int(h.ip_proto)
+                 << std::dec << " packet from "
+                 << h.src_ip << " -> " << h.dst_ip << " id=" << h.id
+                 << " ip_len=" << ip_len << " ip_hdr_len=" << ip_hdr_len
+                 << " pkt_len=" << pkt_len << " offset=" << offset << dendl;
+
+  if (pkt_len > ip_len) {
+    // Trim extra data in the packet beyond IP total length
+    p.trim_back(pkt_len - ip_len);
+  } else if (pkt_len < ip_len) {
+    // Drop if it contains less than IP total length
+    return 0;
+  }
+  // Drop if the reassembled datagram will be larger than maximum IP size
+  if (offset + p.len() > ip_packet_len_max) {
+    return 0;
+  }
+
+  // FIXME: process options
+  if (in_my_netmask(h.src_ip) && h.src_ip != _host_address) {
+    ldout(cct, 20) << __func__ << " learn mac " << from << " with " << h.src_ip << dendl;
+    _arp.learn(from, h.src_ip);
+  }
+
+  if (_packet_filter) {
+    bool handled = false;
+    _packet_filter->handle(p, &h, from, handled);
+    if (handled) {
+      return 0;
+    }
+  }
+
+  if (h.dst_ip != _host_address) {
+    // FIXME: forward
+    return 0;
+  }
+
+  // Does this IP datagram need reassembly
+  auto mf = h.mf();
+  if (mf == true || offset != 0) {
+    frag_limit_mem();
+    auto frag_id = ipv4_frag_id{h.src_ip, h.dst_ip, h.id, h.ip_proto};
+    auto& frag = _frags[frag_id];
+    if (mf == false) {
+      frag.last_frag_received = true;
+    }
+    // This is a newly created frag_id
+    if (frag.mem_size == 0) {
+      _frags_age.push_back(frag_id);
+      frag.rx_time = ceph_clock_now();
+    }
+    auto added_size = frag.merge(h, offset, std::move(p));
+    _frag_mem += added_size;
+    if (frag.is_complete()) {
+      // All the fragments are received
+      auto dropped_size = frag.mem_size;
+      auto& ip_data = frag.data.map.begin()->second;
+      // Choose a cpu to forward this packet
+      auto cpu_id = center->get_id();
+      auto l4 = _l4[h.ip_proto];
+      if (l4) {
+        size_t l4_offset = 0;
+        forward_hash hash_data;
+        hash_data.push_back(hton(h.src_ip.ip));
+        hash_data.push_back(hton(h.dst_ip.ip));
+        l4->forward(hash_data, ip_data, l4_offset);
+        cpu_id = _netif->hash2cpu(toeplitz_hash(_netif->rss_key(), hash_data));
+      }
+
+      // No need to forward if the dst cpu is the current cpu; without an L4 handler the datagram is dropped
+      if (cpu_id == center->get_id()) {
+        if (l4) { l4->received(std::move(ip_data), h.src_ip, h.dst_ip); }
+      } else {
+        auto to = _netif->hw_address();
+        auto pkt = frag.get_assembled_packet(from, to);
+        _netif->forward(center, cpu_id, std::move(pkt));
+      }
+
+      // Delete this frag from _frags and _frags_age
+      frag_drop(frag_id, dropped_size);
+      _frags_age.remove(frag_id);
+      perf_logger->set(l_dpdk_total_linearize_operations,
+                       ipv4_packet_merger::linearizations());
+    } else {
+      // Some fragments are still missing. NOTE(review): the timer is armed only when frag_timefd is already set - confirm intent
+      if (frag_timefd) {
+        frag_arm();
+      }
+    }
+    return 0;
+  }
+
+  auto l4 = _l4[h.ip_proto];
+  if (l4) {
+    // Trim IP header and pass to upper layer
+    p.trim_front(ip_hdr_len);
+    l4->received(std::move(p), h.src_ip, h.dst_ip);
+  }
+  return 0;
+}
+
+void ipv4::wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb) {
+  // Pick the next hop: a host inside our own subnet is addressed directly,
+  // anything else goes through the configured default gateway.
+  // The chosen hop, the packet and the callback are then handed to _arp.wait().
+  ipv4_address next_hop = _gw_address;
+  const bool on_link = in_my_netmask(to);
+  if (on_link) {
+    next_hop = to;
+  }
+
+  _arp.wait(std::move(next_hop), std::move(p), std::move(cb));
+}
+
+const hw_features& ipv4::get_hw_features() const
+{
+  return _netif->get_hw_features();
+}
+
+void ipv4::send(ipv4_address to, ip_protocol_num proto_num,
+        Packet p, ethernet_address e_dst) {
+  auto needs_frag = this->needs_frag(p, proto_num, get_hw_features());
+  // send_pkt prepends and fills the IPv4 header of one datagram/fragment and queues it on _packetq.
+  auto send_pkt = [this, to, proto_num, needs_frag, e_dst] (Packet& pkt, uint16_t remaining, uint16_t offset) mutable  {
+    static uint16_t id = 0;  // NOTE(review): a single counter shared by every call - confirm callers never race on it
+    auto iph = pkt.prepend_header<ip_hdr>();
+    iph->ihl = sizeof(*iph) / 4;
+    iph->ver = 4;
+    iph->dscp = 0;
+    iph->ecn = 0;
+    iph->len = pkt.len();
+    // FIXME: a proper id
+    iph->id = id++;
+    if (needs_frag) {
+      uint16_t mf = remaining > 0;
+      // The fragment offset is measured in units of 8 octets (64 bits)
+      auto off = offset / 8;
+      iph->frag = (mf << uint8_t(ip_hdr::frag_bits::mf)) | off;
+    } else {
+      iph->frag = 0;
+    }
+    iph->ttl = 64;
+    iph->ip_proto = (uint8_t)proto_num;
+    iph->csum = 0;
+    iph->src_ip = _host_address;
+    iph->dst_ip = to;
+    ldout(cct, 20) << " ipv4::send " << " id=" << iph->id << " " << _host_address << " -> " << to
+                   << " len " << pkt.len() << dendl;
+    *iph = iph->hton();
+
+    if (get_hw_features().tx_csum_ip_offload) {
+      iph->csum = 0;
+      pkt.offload_info_ref().needs_ip_csum = true;
+    } else {
+      checksummer csum;
+      csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph));
+      iph->csum = csum.get();
+    }
+
+    _packetq.push_back(
+            l3_protocol::l3packet{eth_protocol_num::ipv4, e_dst, std::move(pkt)});
+  };
+
+  if (needs_frag) {
+    uint16_t offset = 0;
+    uint16_t remaining = p.len();
+    auto mtu = get_hw_features().mtu;
+    // Offsets are encoded in 8-byte units, so every fragment but the last must carry a multiple of 8 bytes.
+    while (remaining) {
+      auto can_send = std::min(uint16_t((mtu - ipv4_hdr_len_min) & ~7u), remaining);
+      remaining -= can_send;
+      auto pkt = p.share(offset, can_send);
+      send_pkt(pkt, remaining, offset);
+      offset += can_send;
+    }
+  } else {
+    // The whole packet can be sent in one shot
+    send_pkt(p, 0, 0);
+  }
+}
+
+Tub<l3_protocol::l3packet> ipv4::get_packet() {
+  // _packetq will be mostly empty here unless it holds remnants of a previously
+  // fragmented packet
+  if (_packetq.empty()) {
+    for (size_t i = 0; i < _pkt_providers.size(); i++) {  // at most one full pass over the providers
+      auto l4p = _pkt_providers[_pkt_provider_idx++]();  // resume after the provider polled last time
+      if (_pkt_provider_idx == _pkt_providers.size()) {
+        _pkt_provider_idx = 0;  // wrap around
+      }
+      if (l4p) {
+        ldout(cct, 20) << " ipv4::get_packet len " << l4p->p.len() << dendl;
+        send(l4p->to, l4p->proto_num, std::move(l4p->p), l4p->e_dst);  // send() pushes one or more entries onto _packetq
+        break;  // stop at the first provider that yields a packet
+      }
+    }
+  }
+  // Hand out at most one queued L3 packet; p stays default-constructed when the queue is empty.
+  Tub<l3_protocol::l3packet> p;
+  if (!_packetq.empty()) {
+    p = std::move(_packetq.front());
+    _packetq.pop_front();
+  }
+  return p;
+}
+
+void ipv4::frag_limit_mem() {
+  if (_frag_mem <= _frag_high_thresh) {  // only act once usage exceeds the high threshold
+    return;
+  }
+  auto drop = _frag_mem - _frag_low_thresh;  // bytes to release to get back down to the low threshold
+  while (drop) {
+    if (_frags_age.empty()) {
+      return;
+    }
+    // Drop the oldest frag (first element) from _frags_age
+    auto frag_id = _frags_age.front();
+    _frags_age.pop_front();
+    // frag_drop() erases the entry and subtracts its size from _frag_mem.
+    // Drop from _frags as well
+    auto& frag = _frags[frag_id];
+    auto dropped_size = frag.mem_size;
+    frag_drop(frag_id, dropped_size);
+    // Clamp so the unsigned counter stops at zero instead of wrapping.
+    drop -= std::min(drop, dropped_size);
+  }
+}
+
+void ipv4::frag_timeout() {
+  if (_frags.empty()) {  // nothing pending: return without re-arming
+    return;
+  }
+  auto now = ceph_clock_now();
+  for (auto it = _frags_age.begin(); it != _frags_age.end();) {  // walk ids starting from the front of the age list
+    auto frag_id = *it;
+    auto& frag = _frags[frag_id];
+    if (now > frag.rx_time + _frag_timeout) {  // older than the reassembly timeout
+      auto dropped_size = frag.mem_size;
+      // Drop from _frags
+      frag_drop(frag_id, dropped_size);
+      // Drop from _frags_age
+      it = _frags_age.erase(it);
+    } else {
+      // The further items can only be younger
+      break;
+    }
+  }
+  if (_frags.size() != 0) {
+    frag_arm(now);  // entries remain: schedule the next timeout check
+  } else {
+    _frag_mem = 0;  // no entries left: reset the memory accounting
+  }
+}
+
+int32_t ipv4::frag::merge(ip_hdr &h, uint16_t offset, Packet p) {
+  uint32_t old = mem_size;
+  unsigned ip_hdr_len = h.ihl * 4;  // ihl counts 32-bit words
+  // Keep the IP header of the fragment that starts at offset 0
+  if (offset == 0) {
+    header = p.share(0, ip_hdr_len);
+  }
+  // Store the IP payload: strip the header and merge the rest at its offset
+  p.trim_front(ip_hdr_len);
+  data.merge(offset, std::move(p));
+  // Recompute mem_size from scratch: stored header plus every piece held in data.map
+  mem_size = header.memory();
+  for (const auto& x : data.map) {
+    mem_size += x.second.memory();
+  }
+  auto added_size = mem_size - old;  // change in footprint caused by this fragment
+  return added_size;  // the unsigned difference is converted to int32_t on return
+}
+
+bool ipv4::frag::is_complete() {
+  // Reassembly is finished once the final fragment (MF == 0) has arrived and
+  // ipv4::frag::merge() has coalesced everything into one piece at offset 0.
+  const auto& pieces = data.map;
+  const bool single_at_zero = pieces.size() == 1 && pieces.begin()->first == 0;
+  return last_frag_received && single_at_zero;
+}
+
+Packet ipv4::frag::get_assembled_packet(ethernet_address from, ethernet_address to) {
+  auto& ip_header = header;
+  auto& ip_data = data.map.begin()->second;  // first piece in data.map; callers invoke this once is_complete() holds
+  // Prepend an Ethernet header, needed for forwarding
+  auto eh = ip_header.prepend_header<eth_hdr>();
+  eh->src_mac = from;
+  eh->dst_mac = to;
+  eh->eth_proto = uint16_t(eth_protocol_num::ipv4);
+  *eh = eh->hton();
+  // Build one packet holding the Ethernet header, the IP header and the IP data
+  ip_header.append(std::move(ip_data));
+  auto pkt = std::move(ip_header);  // header and ip_data are moved from after this point
+  auto iph = pkt.get_header<ip_hdr>(sizeof(eth_hdr));
+  // len is the sum of each fragment
+  iph->len = hton(uint16_t(pkt.len() - sizeof(eth_hdr)));
+  // No fragmentation for the assembled datagram
+  iph->frag = 0;
+  // Since each fragment's csum is checked, no need to csum
+  // again for the assembled datagram
+  offload_info oi;
+  oi.reassembled = true;  // handle_received_packet() skips the checksum test when this is set
+  pkt.set_offload_info(oi);
+  return pkt;
+}
+
+void icmp::received(Packet p, ipaddr from, ipaddr to) {
+  auto hdr = p.get_header<icmp_hdr>(0);
+  if (!hdr || hdr->type != icmp_hdr::msg_type::echo_request) {  // only echo requests are answered
+    return;
+  }
+  hdr->type = icmp_hdr::msg_type::echo_reply;  // the request is rewritten in place into the reply
+  hdr->code = 0;
+  hdr->csum = 0;
+  checksummer csum;
+  csum.sum(reinterpret_cast<char*>(hdr), p.len());  // NOTE(review): assumes p.len() bytes are contiguous from hdr - confirm
+  hdr->csum = csum.get();
+  // Reserve queue space, then queue the reply for `from` once its L2 address is known.
+  if (_queue_space.get_or_fail(p.len())) { // drop packets that do not fit the queue
+    auto cb = [this, from] (const ethernet_address e_dst, Packet p, int r) mutable {
+        if (r == 0) {
+          _packetq.emplace_back(ipv4_traits::l4packet{from, std::move(p), e_dst, ip_protocol_num::icmp});
+        } else { _queue_space.put(p.len()); }  // resolution failed: release the reservation, nothing will dequeue it
+    };
+    _inet.wait_l2_dst_address(from, std::move(p), cb);
+  }
+}
diff --git a/src/msg/async/dpdk/IP.h b/src/msg/async/dpdk/IP.h
new file mode 100644
index 00000000..480b4b95
--- /dev/null
+++ b/src/msg/async/dpdk/IP.h
@@ -0,0 +1,414 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+
+#ifndef CEPH_MSG_IP_H_
+#define CEPH_MSG_IP_H_
+
+#include <arpa/inet.h>
+#include <unordered_map>
+#include <cstdint>
+#include <array>
+#include <map>
+#include <list>
+#include <chrono>
+
+#include "msg/async/Event.h"
+#include "common/Throttle.h"
+
+#include "array_map.h"
+#include "ARP.h"
+#include "IPChecksum.h"
+#include "ip_types.h"
+#include "const.h"
+#include "net.h"
+#include "PacketUtil.h"
+#include "toeplitz.h"
+
+class ipv4;
+template <ip_protocol_num ProtoNum>
+class ipv4_l4;
+
+template <typename InetTraits>
+class tcp;
+
+struct ipv4_traits {
+  using address_type = ipv4_address;
+  using inet_type = ipv4_l4<ip_protocol_num::tcp>;
+  struct l4packet {
+    ipv4_address to;
+    Packet p;
+    ethernet_address e_dst;
+    ip_protocol_num proto_num;
+  };
+  using packet_provider_type = std::function<Tub<l4packet> ()>;
+  static void tcp_pseudo_header_checksum(checksummer& csum, ipv4_address src, ipv4_address dst, uint16_t len) {
+    csum.sum_many(src.ip, dst.ip, uint8_t(0), uint8_t(ip_protocol_num::tcp), len);
+  }
+  static constexpr uint8_t ip_hdr_len_min = ipv4_hdr_len_min;
+};
+
+template <ip_protocol_num ProtoNum>
+class ipv4_l4 {
+ public:
+  ipv4& _inet;
+ public:
+  ipv4_l4(ipv4& inet) : _inet(inet) {}
+  void register_packet_provider(ipv4_traits::packet_provider_type func);
+  void wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb);
+};
+
+class ip_protocol {
+ public:
+  virtual ~ip_protocol() {}
+  virtual void received(Packet p, ipv4_address from, ipv4_address to) = 0;
+  virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) { return true; }
+};
+
+template <typename InetTraits>
+struct l4connid {
+  using ipaddr = typename InetTraits::address_type;
+  using inet_type = typename InetTraits::inet_type;
+  struct connid_hash;
+
+  ipaddr local_ip;
+  ipaddr foreign_ip;
+  uint16_t local_port;
+  uint16_t foreign_port;
+
+  bool operator==(const l4connid& x) const {
+    return local_ip == x.local_ip
+           && foreign_ip == x.foreign_ip
+           && local_port == x.local_port
+           && foreign_port == x.foreign_port;
+  }
+
+  uint32_t hash(const rss_key_type& rss_key) {
+    forward_hash hash_data;
+    hash_data.push_back(hton(foreign_ip.ip));
+    hash_data.push_back(hton(local_ip.ip));
+    hash_data.push_back(hton(foreign_port));
+    hash_data.push_back(hton(local_port));
+    return toeplitz_hash(rss_key, hash_data);
+  }
+};
+
+class ipv4_tcp final : public ip_protocol {
+  ipv4_l4<ip_protocol_num::tcp> _inet_l4;
+  std::unique_ptr<tcp<ipv4_traits>> _tcp;
+ public:
+  ipv4_tcp(ipv4& inet, EventCenter *c);
+  ~ipv4_tcp();
+  virtual void received(Packet p, ipv4_address from, ipv4_address to) override;
+  virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) override;
+  friend class ipv4;
+};
+
+struct icmp_hdr {
+  enum class msg_type : uint8_t {
+    echo_reply = 0,
+    echo_request = 8,
+  };
+  msg_type type;
+  uint8_t code;
+  uint16_t csum;
+  uint32_t rest;
+} __attribute__((packed));
+
+
+class icmp {
+ public:
+  using ipaddr = ipv4_address;
+  using inet_type = ipv4_l4<ip_protocol_num::icmp>;
+  explicit icmp(CephContext *c, inet_type& inet)
+      : cct(c), _inet(inet), _queue_space(c, "DPDK::icmp::_queue_space", 212992) {
+    _inet.register_packet_provider([this] {
+      Tub<ipv4_traits::l4packet> l4p;
+      if (!_packetq.empty()) {
+        l4p = std::move(_packetq.front());
+        _packetq.pop_front();
+        _queue_space.put(l4p->p.len());
+      }
+      return l4p;
+    });
+  }
+  void received(Packet p, ipaddr from, ipaddr to);
+
+ private:
+  CephContext *cct;
+  // ipv4_l4<ip_protocol_num::icmp>
+  inet_type& _inet;
+  circular_buffer<ipv4_traits::l4packet> _packetq;
+  Throttle _queue_space;
+};
+
+class ipv4_icmp final : public ip_protocol {
+  CephContext *cct;
+  ipv4_l4<ip_protocol_num::icmp> _inet_l4;
+  icmp _icmp;
+ public:
+  ipv4_icmp(CephContext *c, ipv4& inet) : cct(c), _inet_l4(inet), _icmp(c, _inet_l4) {}
+  virtual void received(Packet p, ipv4_address from, ipv4_address to) override {
+    _icmp.received(std::move(p), from, to);
+  }
+  friend class ipv4;
+};
+
+struct ip_hdr;
+
+struct ip_packet_filter {
+  virtual ~ip_packet_filter() {};
+  virtual void handle(Packet& p, ip_hdr* iph, ethernet_address from, bool & handled) = 0;
+};
+
+struct ipv4_frag_id {
+  struct hash;
+  ipv4_address src_ip;
+  ipv4_address dst_ip;
+  uint16_t identification;
+  uint8_t protocol;
+  bool operator==(const ipv4_frag_id& x) const {
+    return src_ip == x.src_ip &&
+           dst_ip == x.dst_ip &&
+           identification == x.identification &&
+           protocol == x.protocol;
+  }
+};
+
+struct ipv4_frag_id::hash : private std::hash<ipv4_address>,
+                            private std::hash<uint16_t>, private std::hash<uint8_t> {
+  size_t operator()(const ipv4_frag_id& id) const noexcept {
+    using h1 = std::hash<ipv4_address>;
+    using h2 = std::hash<uint16_t>;
+    using h3 = std::hash<uint8_t>;
+    return h1::operator()(id.src_ip) ^
+           h1::operator()(id.dst_ip) ^
+           h2::operator()(id.identification) ^
+           h3::operator()(id.protocol);
+  }
+};
+
+struct ipv4_tag {};
+using ipv4_packet_merger = packet_merger<uint32_t, ipv4_tag>;
+
+class interface;
+
+class ipv4 {
+ public:
+  using address_type = ipv4_address;
+  using proto_type = uint16_t;
+  static address_type broadcast_address() { return ipv4_address(0xffffffff); }
+  static proto_type arp_protocol_type() { return proto_type(eth_protocol_num::ipv4); }
+  CephContext *cct;
+  EventCenter *center;
+
+ private:
+  interface* _netif;
+  std::vector<ipv4_traits::packet_provider_type> _pkt_providers;
+  Tub<uint64_t> frag_timefd;
+  EventCallbackRef frag_handler;
+  arp _global_arp;
+  arp_for<ipv4> _arp;
+  ipv4_address _host_address;
+  ipv4_address _gw_address;
+  ipv4_address _netmask;
+  l3_protocol _l3;
+  subscription<Packet, ethernet_address> _rx_packets;
+  ipv4_tcp _tcp;
+  ipv4_icmp _icmp;
+  array_map<ip_protocol*, 256> _l4;
+  ip_packet_filter *_packet_filter;
+  struct frag {
+    Packet header;
+    ipv4_packet_merger data;
+    utime_t rx_time;
+    uint32_t mem_size = 0;
+    // fragment with MF == 0 indicates it is the last fragment
+    bool last_frag_received = false;
+
+    Packet get_assembled_packet(ethernet_address from, ethernet_address to);
+    int32_t merge(ip_hdr &h, uint16_t offset, Packet p);
+    bool is_complete();
+  };
+  std::unordered_map<ipv4_frag_id, frag, ipv4_frag_id::hash> _frags;
+  std::list<ipv4_frag_id> _frags_age;
+  static utime_t _frag_timeout;
+  static constexpr uint32_t _frag_low_thresh{3 * 1024 * 1024};
+  static constexpr uint32_t _frag_high_thresh{4 * 1024 * 1024};
+  uint32_t _frag_mem = 0;
+  circular_buffer<l3_protocol::l3packet> _packetq;
+  unsigned _pkt_provider_idx = 0;
+  PerfCounters *perf_logger;
+
+ private:
+  int handle_received_packet(Packet p, ethernet_address from);
+  bool forward(forward_hash& out_hash_data, Packet& p, size_t off);
+  Tub<l3_protocol::l3packet> get_packet();
+  bool in_my_netmask(ipv4_address a) const {
+    return !((a.ip ^ _host_address.ip) & _netmask.ip);
+  }
+  void frag_limit_mem();
+  void frag_drop(ipv4_frag_id frag_id, uint32_t dropped_size) {
+    _frags.erase(frag_id);
+    _frag_mem -= dropped_size;
+  }
+  void frag_arm(utime_t now) {
+    auto tp = now + _frag_timeout;
+    frag_timefd.construct(center->create_time_event(tp.to_nsec() / 1000, frag_handler));
+  }
+  void frag_arm() {
+    auto now = ceph_clock_now();
+    frag_timefd.construct(center->create_time_event(now.to_nsec() / 1000, frag_handler));
+  }
+
+ public:
+  void frag_timeout();
+
+ public:
+  explicit ipv4(CephContext *c, EventCenter *cen, interface* netif);
+  ~ipv4() {
+    delete frag_handler;
+  }
+  void set_host_address(ipv4_address ip) {
+    _host_address = ip;
+    _arp.set_self_addr(ip);
+  }
+  ipv4_address host_address() {
+    return _host_address;
+  }
+  void set_gw_address(ipv4_address ip) {
+    _gw_address = ip;
+  }
+  ipv4_address gw_address() const {
+    return _gw_address;
+  }
+  void set_netmask_address(ipv4_address ip) {
+    _netmask = ip;
+  }
+  ipv4_address netmask_address() const {
+    return _netmask;
+  }
+  interface *netif() const {
+    return _netif;
+  }
+  // TODO or something. Should perhaps truly be a list
+  // of filters. With ordering. And blackjack. Etc.
+  // But for now, a simple single raw pointer suffices
+  void set_packet_filter(ip_packet_filter *f) {
+    _packet_filter = f;
+  }
+  ip_packet_filter * packet_filter() const {
+    return _packet_filter;
+  }
+  void send(ipv4_address to, ip_protocol_num proto_num, Packet p, ethernet_address e_dst);
+  tcp<ipv4_traits>& get_tcp() { return *_tcp._tcp; }
+  void register_l4(proto_type id, ip_protocol* handler);
+  const hw_features& get_hw_features() const;
+  static bool needs_frag(Packet& p, ip_protocol_num proto_num, hw_features hw_features) {
+    if (p.len() + ipv4_hdr_len_min <= hw_features.mtu)
+      return false;
+
+    if ((proto_num == ip_protocol_num::tcp && hw_features.tx_tso))
+      return false;
+
+    return true;
+  }
+  void learn(ethernet_address l2, ipv4_address l3) {
+    _arp.learn(l2, l3);
+  }
+  void register_packet_provider(ipv4_traits::packet_provider_type&& func) {
+    _pkt_providers.push_back(std::move(func));
+  }
+  void wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb);
+};
+
+template <ip_protocol_num ProtoNum>
+inline void ipv4_l4<ProtoNum>::register_packet_provider(
+    ipv4_traits::packet_provider_type func) {
+  _inet.register_packet_provider([func] {
+    auto l4p = func();
+    if (l4p) {
+      (*l4p).proto_num = ProtoNum;
+    }
+    return l4p;
+  });
+}
+
+template <ip_protocol_num ProtoNum>
+inline void ipv4_l4<ProtoNum>::wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb) {
+  _inet.wait_l2_dst_address(to, std::move(p), std::move(cb));
+}
+
+struct ip_hdr {  // packed IPv4 header layout; get_header<ip_hdr>() overlays it on packet bytes
+  uint8_t ihl : 4;  // header length in 32-bit words (users multiply by 4)
+  uint8_t ver : 4;  // NOTE(review): ihl-before-ver bitfield order presumably relies on the target ABI - confirm
+  uint8_t dscp : 6;
+  uint8_t ecn : 2;
+  uint16_t len;  // total length: header plus payload
+  uint16_t id;
+  uint16_t frag;  // flag bits at positions 13..15, fragment offset in 8-byte units in the low 13 bits
+  enum class frag_bits : uint8_t { mf = 13, df = 14, reserved = 15, offset_shift = 3 };
+  uint8_t ttl;
+  uint8_t ip_proto;
+  uint16_t csum;
+  ipv4_address src_ip;
+  ipv4_address dst_ip;
+  uint8_t options[0];  // zero-length array marking where any options begin
+  ip_hdr hton() {  // returns a copy with the multi-byte fields converted host -> network order
+    ip_hdr hdr = *this;
+    hdr.len = ::hton(len);
+    hdr.id = ::hton(id);
+    hdr.frag = ::hton(frag);
+    hdr.csum = ::hton(csum);
+    hdr.src_ip.ip = ::hton(src_ip.ip);
+    hdr.dst_ip.ip = ::hton(dst_ip.ip);
+    return hdr;
+  }
+  ip_hdr ntoh() {  // returns a copy with the multi-byte fields converted network -> host order
+    ip_hdr hdr = *this;
+    hdr.len = ::ntoh(len);
+    hdr.id = ::ntoh(id);
+    hdr.frag = ::ntoh(frag);
+    hdr.csum = ::ntoh(csum);
+    hdr.src_ip = src_ip.ntoh();
+    hdr.dst_ip = dst_ip.ntoh();
+    return hdr;
+  }
+  // The accessors below read frag as a host-order value; in this code they are called on the result of ntoh().
+  bool mf() { return frag & (1 << uint8_t(frag_bits::mf)); }
+  bool df() { return frag & (1 << uint8_t(frag_bits::df)); }
+  uint16_t offset() { return frag << uint8_t(frag_bits::offset_shift); }  // the 16-bit result drops the 3 flag bits, leaving the offset in bytes
+} __attribute__((packed));
+
+template <typename InetTraits>
+struct l4connid<InetTraits>::connid_hash : private std::hash<ipaddr>, private std::hash<uint16_t> {
+  size_t operator()(const l4connid<InetTraits>& id) const noexcept {
+    using h1 = std::hash<ipaddr>;
+    using h2 = std::hash<uint16_t>;
+    return h1::operator()(id.local_ip)
+           ^ h1::operator()(id.foreign_ip)
+           ^ h2::operator()(id.local_port)
+           ^ h2::operator()(id.foreign_port);
+  }
+};
+
+#endif /* CEPH_MSG_IP_H_ */
diff --git a/src/msg/async/dpdk/IPChecksum.cc b/src/msg/async/dpdk/IPChecksum.cc
new file mode 100644
index 00000000..7a3253c1
--- /dev/null
+++ b/src/msg/async/dpdk/IPChecksum.cc
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include <arpa/inet.h>
+#include "net.h"
+#include "IPChecksum.h"
+
+// Add `len` bytes starting at `data` to the running sum.  Bytes are paired
+// into 16-bit words read in network byte order; `odd` records whether the
+// bytes summed so far end in the middle of a word, in which case this call
+// starts by supplying the low byte of that word.
+void checksummer::sum(const char* data, size_t len) {
+  auto orig_len = len;
+  // Complete a half-finished word first -- but only when this call really
+  // supplies a byte.  Without the length check, an empty buffer arriving
+  // while `odd` is set would read *data and wrap `len` around to SIZE_MAX.
+  if (odd && len) {
+    csum += uint8_t(*data++);
+    --len;
+  }
+  // Bulk of the data, eight bytes per step.
+  // NOTE(review): the wide loads below go through reinterpret_cast'ed
+  // pointers with no alignment check -- confirm every target tolerates that.
+  auto p64 = reinterpret_cast<const uint64_t*>(data);
+  while (len >= 8) {
+    csum += ntohq(*p64++);
+    len -= 8;
+  }
+  // Remaining whole 16-bit words.
+  auto p16 = reinterpret_cast<const uint16_t*>(p64);
+  while (len >= 2) {
+    csum += ntohs(*p16++);
+    len -= 2;
+  }
+  // A single trailing byte is the high half of a word that is still open.
+  auto p8 = reinterpret_cast<const uint8_t*>(p16);
+  if (len) {
+    csum += *p8++ << 8;
+    len -= 1;
+  }
+  // An odd number of bytes flips the parity; an even number keeps it.
+  odd ^= orig_len & 1;
+}
+
+// Fold the 128-bit accumulator down to 16 bits, complement it and return it
+// through htons().
+uint16_t checksummer::get() const {
+  // 128 -> 64 bits; done twice because the first fold can carry past bit 63.
+  const __int128 folded_once = (csum & 0xffffffffffffffff) + (csum >> 64);
+  uint64_t acc = (folded_once & 0xffffffffffffffff) + (folded_once >> 64);
+  // 64 -> 16 bits: add up the four 16-bit lanes ...
+  acc = (acc & 0xffff) + ((acc >> 16) & 0xffff) + ((acc >> 32) & 0xffff) + (acc >> 48);
+  // ... then fold the carries back in, twice.
+  for (int pass = 0; pass < 2; ++pass) {
+    acc = (acc & 0xffff) + (acc >> 16);
+  }
+  return htons(~acc);
+}
+
+// Feed every fragment of `p`, in order, into the byte-buffer overload.
+void checksummer::sum(const Packet& p) {
+  auto frags = p.fragments();
+  for (auto it = frags.begin(); it != frags.end(); ++it) {
+    sum(it->base, it->size);
+  }
+}
+
+// One-shot helper: run a fresh checksummer over `len` bytes at `data` and
+// return its result.
+uint16_t ip_checksum(const void* data, size_t len) {
+  const char* bytes = static_cast<const char*>(data);
+  checksummer acc;
+  acc.sum(bytes, len);
+  return acc.get();
+}
diff --git a/src/msg/async/dpdk/IPChecksum.h b/src/msg/async/dpdk/IPChecksum.h
new file mode 100644
index 00000000..9af4a86b
--- /dev/null
+++ b/src/msg/async/dpdk/IPChecksum.h
@@ -0,0 +1,72 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_CHECKSUM_H_
+#define CEPH_MSG_CHECKSUM_H_
+
+#include <cstdint>
+#include <cstddef>
+#include <arpa/inet.h>
+
+#include "Packet.h"
+
+uint16_t ip_checksum(const void* data, size_t len);
+
+// Incremental checksum accumulator.  `csum` holds the running sum and `odd`
+// remembers whether the number of bytes consumed so far is odd, i.e. whether
+// the next byte is the low half of a 16-bit word.
+struct checksummer {
+  __int128 csum = 0;
+  bool odd = false;
+  void sum(const char* data, size_t len);
+  void sum(const Packet& p);
+  // One byte: high half of a word at an even position, low half otherwise.
+  void sum(uint8_t data) {
+    csum += odd ? data : (data << 8);
+    odd = !odd;
+  }
+  // One 16-bit value: added whole when word-aligned, otherwise split into
+  // its high and low bytes so each lands in the right half.
+  void sum(uint16_t data) {
+    if (!odd) {
+      csum += data;
+      return;
+    }
+    sum(uint8_t(data >> 8));
+    sum(uint8_t(data));
+  }
+  // One 32-bit value: added whole when word-aligned, otherwise handed on as
+  // two 16-bit halves (low half first).
+  void sum(uint32_t data) {
+    if (!odd) {
+      csum += data;
+      return;
+    }
+    sum(uint16_t(data));
+    sum(uint16_t(data >> 16));
+  }
+  // Recursion terminator for the variadic overload below.
+  void sum_many() {}
+  // Sum each argument in turn, left to right.
+  template <typename T0, typename... T>
+  void sum_many(T0 data, T... rest) {
+    sum(data);
+    sum_many(rest...);
+  }
+  uint16_t get() const;
+};
+
+#endif /* CEPH_MSG_CHECKSUM_H_ */
diff --git a/src/msg/async/dpdk/Packet.cc b/src/msg/async/dpdk/Packet.cc
new file mode 100644
index 00000000..6c2320a0
--- /dev/null
+++ b/src/msg/async/dpdk/Packet.cc
@@ -0,0 +1,146 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <iostream>
+#include <algorithm>
+#include <cctype>
+
+#include "capture.h"
+#include "Packet.h"
+
+// Namespace-scope definitions for the static constexpr data members that
+// class Packet declares (and initializes) in Packet.h.
+constexpr size_t Packet::internal_data_size;
+constexpr size_t Packet::default_nr_frags;
+
+// Replace a run of fragments, starting at index `at_frag`, with one freshly
+// allocated fragment.  Whole fragments are taken until at least
+// `desired_size` bytes are covered, so the new fragment may be larger than
+// requested.  The loop does not check `at_frag + nr_frags` against
+// `_nr_frags`; callers have to make sure enough bytes exist from `at_frag` on.
+void Packet::linearize(size_t at_frag, size_t desired_size) {
+  // frags[0] may live in the internal buffer; move it out first
+  _impl->unuse_internal_data();
+  size_t nr_frags = 0;
+  size_t accum_size = 0;
+  // count the fragments (and their total size) needed to reach desired_size
+  while (accum_size < desired_size) {
+    accum_size += _impl->frags[at_frag + nr_frags].size;
+    ++nr_frags;
+  }
+  // copy those fragments back to back into one new buffer
+  char *new_frag = new char[accum_size];
+  auto p = new_frag;
+  for (size_t i = 0; i < nr_frags; ++i) {
+    auto& f = _impl->frags[at_frag + i];
+    p = std::copy(f.base, f.base + f.size, p);
+  }
+  // collapse nr_frags into one fragment
+  std::copy(_impl->frags + at_frag + nr_frags, _impl->frags + _impl->_nr_frags,
+            _impl->frags + at_frag + 1);
+  _impl->_nr_frags -= nr_frags - 1;
+  _impl->frags[at_frag] = fragment{new_frag, accum_size};
+  if (at_frag == 0 && desired_size == len()) {
+    // We can drop the old buffer safely
+    // (the whole packet now lives in new_frag; `x` takes the previous
+    // deleter and goes out of scope at the end of this block)
+    auto x = std::move(_impl->_deleter);
+    _impl->_deleter = make_deleter([new_frag] { delete []new_frag; });
+  } else {
+    // Other fragments still reference the old storage: keep the previous
+    // deleter alive as a bound argument of the new one, which itself only
+    // frees new_frag.
+    auto del = std::bind(
+            [new_frag](deleter &d) { delete []new_frag; }, std::move(_impl->_deleter));
+    _impl->_deleter = make_deleter(std::move(del));
+  }
+}
+
+// One-shot event callback: when dispatched it takes over the stored deleter,
+// invokes the stored callback, and then deletes itself.
+class C_free_on_cpu : public EventCallback {
+  deleter _held;
+  std::function<void()> _notify;
+ public:
+  C_free_on_cpu(deleter &&d, std::function<void()> &&c):
+      _held(std::move(d)), _notify(std::move(c)) {}
+  void do_request(uint64_t fd) {
+    // Move the deleter into a local so that it is destroyed when this
+    // function returns, i.e. in the context that runs the event, rather than
+    // wherever this callback object would otherwise be torn down.
+    deleter released(std::move(_held));
+    _notify();
+    delete this;
+  }
+};
+
+// Return a packet built from a copy of this packet's impl whose deleter,
+// instead of releasing the data directly, posts a C_free_on_cpu event
+// (carrying the original deleter and `cb`) to `center`.
+// impl::copy() moves the deleter out of this packet's impl.
+Packet Packet::free_on_cpu(EventCenter *center, std::function<void()> cb)
+{
+  // The current deleter is moved into the bind object as its bound argument;
+  // when the bind object is invoked it is passed on to the new event.
+  auto del = std::bind(
+      [center, cb] (deleter &del) mutable {
+        center->dispatch_event_external(new C_free_on_cpu(std::move(del), std::move(cb)));
+      }, std::move(_impl->_deleter));
+  // make new deleter that runs old deleter on an origin cpu
+  _impl->_deleter = make_deleter(deleter(), std::move(del));
+
+  return Packet(impl::copy(_impl.get()));
+}
+
+// Write `b` as exactly two lowercase hexadecimal digits.
+static void print_hex_byte(std::ostream& os, uint8_t b) {
+  static const char digits[] = "0123456789abcdef";
+  os << digits[b >> 4] << digits[b & 0xf];
+}
+
+// Debug dump of a packet: "Packet{frag, frag, ...}".
+// A fragment whose bytes all lie in [9, 0x7f] is shown as a quoted string
+// with \r, \n, \t and \xNN escapes; any other fragment is shown as
+// {hh hh ...}, two hex digits per byte.
+std::ostream& operator<<(std::ostream& os, const Packet& p) {
+  os << "Packet{";
+  bool first = true;
+  for (auto&& frag : p.fragments()) {
+    if (!first) {
+      os << ", ";
+    }
+    first = false;
+    const char* const frag_end = frag.base + frag.size;
+    if (std::all_of(frag.base, frag_end, [] (int c) { return c >= 9 && c <= 0x7f; })) {
+      os << '"';
+      for (const char* it = frag.base; it != frag_end; ++it) {
+        const char c = *it;
+        if (isprint(static_cast<unsigned char>(c))) {
+          os << c;
+        } else if (c == '\r') {
+          os << "\\r";
+        } else if (c == '\n') {
+          os << "\\n";
+        } else if (c == '\t') {
+          os << "\\t";
+        } else {
+          // Streaming (b / 16) and (b % 16) as integers printed decimal
+          // numbers (0x1f came out as "\x115"); emit real hex digits.
+          os << "\\x";
+          print_hex_byte(os, uint8_t(c));
+        }
+      }
+      os << '"';
+    } else {
+      os << "{";
+      bool nfirst = true;
+      for (const char* it = frag.base; it != frag_end; ++it) {
+        if (!nfirst) {
+          os << " ";
+        }
+        nfirst = false;
+        // A uint8_t streams as a character, which wrote the raw byte into
+        // the output; print its hex value instead.
+        print_hex_byte(os, uint8_t(*it));
+      }
+      os << "}";
+    }
+  }
+  os << "}";
+  return os;
+}
diff --git a/src/msg/async/dpdk/Packet.h b/src/msg/async/dpdk/Packet.h
new file mode 100644
index 00000000..db9cd2a7
--- /dev/null
+++ b/src/msg/async/dpdk/Packet.h
@@ -0,0 +1,550 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_PACKET_H_
+#define CEPH_MSG_PACKET_H_
+
+#include <vector>
+#include <algorithm>
+#include <iosfwd>
+
+#include "include/types.h"
+#include "common/Tub.h"
+#include "common/deleter.h"
+#include "msg/async/Event.h"
+
+#include "const.h"
+
+// A contiguous run of packet bytes: start pointer plus length in bytes.
+// The struct itself carries no ownership information.
+struct fragment {
+    char* base;
+    size_t size;
+};
+
+// Per-packet offload metadata carried alongside the packet data.
+// NOTE(review): field meanings below are read off the names and defaults
+// only -- confirm against the code that fills them in.
+struct offload_info {
+  ip_protocol_num protocol = ip_protocol_num::unused;
+  bool needs_csum = false;
+  // default header lengths, in bytes
+  uint8_t ip_hdr_len = 20;
+  uint8_t tcp_hdr_len = 20;
+  uint8_t udp_hdr_len = 8;
+  bool needs_ip_csum = false;
+  bool reassembled = false;
+  uint16_t tso_seg_size = 0;
+  // HW stripped VLAN header (CPU order)
+  Tub<uint16_t> vlan_tci;
+};
+
+// Zero-copy friendly packet class
+//
+// For implementing zero-copy, we need a flexible destructor that can
+// destroy packet data in different ways: decrementing a reference count,
+// or calling a free()-like function.
+//
+// Moreover, we need different destructors for each set of fragments within
+// a single packet. For example, a header and trailer might need delete[]
+// to be called, while the internal data needs a reference count to be
+// released.  Matters are complicated in that fragments can be split
+// (due to virtual/physical translation).
+//
+// To implement this, we associate each packet with a single destructor,
+// but allow composing a packet from another packet plus a fragment to
+// be added, with its own destructor, causing the destructors to be chained.
+//
+// The downside is that the data needed for the destructor is duplicated,
+// if it is already available in the fragment itself.
+//
+// As an optimization, when we allocate small fragments, we allocate some
+// extra space, so prepending to the packet does not require extra
+// allocations.  This is useful when adding headers.
+//
+class Packet {
+  // enough for lots of headers, not quite two cache lines:
+  static constexpr size_t internal_data_size = 128 - 16;
+  static constexpr size_t default_nr_frags = 4;
+
+  // Minimal begin()/end()/operator[] view over a contiguous run of
+  // fragments; it does not own the storage it points at.
+  struct pseudo_vector {
+    fragment* _start;
+    fragment* _finish;
+    pseudo_vector(fragment* start, size_t nr)
+        : _start(start), _finish(_start + nr) {}
+    fragment* begin() { return _start; }
+    fragment* end() { return _finish; }
+    fragment& operator[](size_t idx) { return _start[idx]; }
+  };
+
+  // Packet state.  The class-specific operator new over-allocates each
+  // object so that the trailing `frags` array has room for the requested
+  // number of entries.
+  struct impl {
+    // when destroyed, virtual destructor will reclaim resources
+    deleter _deleter;
+    unsigned _len = 0;              // total bytes across all fragments
+    uint16_t _nr_frags = 0;         // entries of frags[] in use
+    uint16_t _allocated_frags;      // entries of frags[] available
+    offload_info _offload_info;
+    Tub<uint32_t> rss_hash;
+    char data[internal_data_size]; // only frags[0] may use
+    unsigned headroom = internal_data_size; // in data
+    // FIXME: share data/frags space
+
+    fragment frags[];
+
+    explicit impl(size_t nr_frags = default_nr_frags);
+    impl(const impl&) = delete;
+    impl(fragment frag, size_t nr_frags = default_nr_frags);
+
+    pseudo_vector fragments() { return { frags, _nr_frags }; }
+
+    // Allocate an empty impl with room for max(nr_frags, default_nr_frags)
+    // fragments.
+    static std::unique_ptr<impl> allocate(size_t nr_frags) {
+      nr_frags = std::max(nr_frags, default_nr_frags);
+      return std::unique_ptr<impl>(new (nr_frags) impl(nr_frags));
+    }
+
+    // Build a new impl with room for `nr` fragments from `old`.  The deleter
+    // is MOVED out of `old`; lengths, offload info, rss hash and the fragment
+    // table are copied, and data held in old's internal buffer is duplicated
+    // into the new internal buffer.
+    static std::unique_ptr<impl> copy(impl* old, size_t nr) {
+      auto n = allocate(nr);
+      n->_deleter = std::move(old->_deleter);
+      n->_len = old->_len;
+      n->_nr_frags = old->_nr_frags;
+      n->headroom = old->headroom;
+      n->_offload_info = old->_offload_info;
+      // Assign the Tub so that both "is a hash present" and its value carry
+      // over.  The previous construct(old->rss_hash) handed the Tub object
+      // itself to the uint32_t initializer instead of the stored hash.
+      n->rss_hash = old->rss_hash;
+      std::copy(old->frags, old->frags + old->_nr_frags, n->frags);
+      old->copy_internal_fragment_to(n.get());
+      // plain return: std::move on a local only inhibits copy elision
+      return n;
+    }
+
+    static std::unique_ptr<impl> copy(impl* old) {
+      return copy(old, old->_nr_frags);
+    }
+
+    // Return `old` unchanged if it already has room for `extra_frags` more
+    // fragments; otherwise return a copy with at least that much room (and at
+    // least twice the fragments currently in use).
+    static std::unique_ptr<impl> allocate_if_needed(std::unique_ptr<impl> old, size_t extra_frags) {
+      if (old->_allocated_frags >= old->_nr_frags + extra_frags) {
+        return std::move(old);
+      }
+      return copy(old.get(), std::max<size_t>(old->_nr_frags + extra_frags, 2 * old->_nr_frags));
+    }
+    // Allocate the object plus storage for nr_frags trailing fragments.
+    // nr_frags must fit in 16 bits because it is stored in _allocated_frags.
+    void* operator new(size_t size, size_t nr_frags = default_nr_frags) {
+      ceph_assert(nr_frags == uint16_t(nr_frags));
+      return ::operator new(size + nr_frags * sizeof(fragment));
+    }
+    // Matching the operator new above
+    void operator delete(void* ptr, size_t nr_frags) {
+      return ::operator delete(ptr);
+    }
+    // Since the above "placement delete" hides the global one, expose it
+    void operator delete(void* ptr) {
+      return ::operator delete(ptr);
+    }
+
+    // True when frags[0] points into the internal `data` buffer.
+    bool using_internal_data() const {
+      return _nr_frags
+              && frags[0].base >= data
+              && frags[0].base < data + internal_data_size;
+    }
+
+    // If frags[0] lives in the internal buffer, move its bytes to a malloc'ed
+    // buffer (freed through the deleter chain) and mark the internal buffer
+    // as completely free again.  Throws std::bad_alloc if malloc fails.
+    void unuse_internal_data() {
+      if (!using_internal_data()) {
+        return;
+      }
+      auto buf = static_cast<char*>(::malloc(frags[0].size));
+      if (!buf) {
+        throw std::bad_alloc();
+      }
+      deleter d = make_free_deleter(buf);
+      std::copy(frags[0].base, frags[0].base + frags[0].size, buf);
+      frags[0].base = buf;
+      _deleter.append(std::move(d));
+      headroom = internal_data_size;
+    }
+    // If frags[0] lives in the internal buffer, copy its bytes to the same
+    // position of `to`'s internal buffer and point to->frags[0] at them.
+    void copy_internal_fragment_to(impl* to) {
+      if (!using_internal_data()) {
+        return;
+      }
+      to->frags[0].base = to->data + headroom;
+      std::copy(frags[0].base, frags[0].base + frags[0].size,
+              to->frags[0].base);
+    }
+  };
+  explicit Packet(std::unique_ptr<impl>&& impl) : _impl(std::move(impl)) {}
+  std::unique_ptr<impl> _impl;
+public:
+  // Wrap caller-owned bytes without copying; the packet gets an empty
+  // deleter, so the data must outlive it.
+  static Packet from_static_data(const char* data, size_t len) {
+    return {fragment{const_cast<char*>(data), len}, deleter()};
+  }
+
+  // build empty Packet
+  Packet();
+  // build empty Packet with nr_frags allocated
+  explicit Packet(size_t nr_frags);
+  // move existing Packet
+  Packet(Packet&& x) noexcept;
+  // copy data into Packet
+  Packet(const char* data, size_t len);
+  // copy data into Packet
+  explicit Packet(fragment frag);
+  // zero-copy single fragment
+  Packet(fragment frag, deleter del);
+  // zero-copy multiple fragments
+  Packet(std::vector<fragment> frag, deleter del);
+  // build Packet with iterator
+  template <typename Iterator>
+  Packet(Iterator begin, Iterator end, deleter del);
+  // append fragment (copying new fragment)
+  Packet(Packet&& x, fragment frag);
+  // prepend fragment (copying new fragment, with header optimization)
+  Packet(fragment frag, Packet&& x);
+  // prepend fragment (zero-copy)
+  Packet(fragment frag, deleter del, Packet&& x);
+  // append fragment (zero-copy)
+  Packet(Packet&& x, fragment frag, deleter d);
+  // append deleter
+  Packet(Packet&& x, deleter d);
+
+  // Move assignment, implemented as destroy + move-construct in place.
+  Packet& operator=(Packet&& x) {
+    if (this != &x) {
+      this->~Packet();
+      new (this) Packet(std::move(x));
+    }
+    return *this;
+  }
+
+  // total payload bytes
+  unsigned len() const { return _impl->_len; }
+  // payload bytes plus sizeof(impl)
+  unsigned memory() const { return len() + sizeof(Packet::impl); }
+
+  fragment frag(unsigned idx) const { return _impl->frags[idx]; }
+  fragment& frag(unsigned idx) { return _impl->frags[idx]; }
+
+  unsigned nr_frags() const { return _impl->_nr_frags; }
+  pseudo_vector fragments() const { return { _impl->frags, _impl->_nr_frags }; }
+  fragment* fragment_array() const { return _impl->frags; }
+
+  // share Packet data (reference counted, non COW)
+  Packet share();
+  Packet share(size_t offset, size_t len);
+
+  void append(Packet&& p);
+
+  void trim_front(size_t how_much);
+  void trim_back(size_t how_much);
+
+  // get a header pointer, linearizing if necessary
+  template <typename Header>
+  Header* get_header(size_t offset = 0);
+
+  // get a header pointer, linearizing if necessary
+  char* get_header(size_t offset, size_t size);
+
+  // prepend a header (default-initializing it)
+  template <typename Header>
+  Header* prepend_header(size_t extra_size = 0);
+
+  // prepend a header (uninitialized!)
+  char* prepend_uninitialized_header(size_t size);
+
+  Packet free_on_cpu(EventCenter *c, std::function<void()> cb = []{});
+
+  // merge all fragments into a single one
+  void linearize() { return linearize(0, len()); }
+
+  // drop the impl; the Packet must be re-assigned before further use
+  void reset() { _impl.reset(); }
+
+  // make sure the fragment table can hold at least n_frags entries
+  void reserve(int n_frags) {
+    if (n_frags > _impl->_nr_frags) {
+      auto extra = n_frags - _impl->_nr_frags;
+      _impl = impl::allocate_if_needed(std::move(_impl), extra);
+    }
+  }
+  Tub<uint32_t> rss_hash() {
+    return _impl->rss_hash;
+  }
+  void set_rss_hash(uint32_t hash) {
+    _impl->rss_hash.construct(hash);
+  }
+private:
+  void linearize(size_t at_frag, size_t desired_size);
+  bool allocate_headroom(size_t size);
+public:
+  class offload_info offload_info() const { return _impl->_offload_info; }
+  class offload_info& offload_info_ref() { return _impl->_offload_info; }
+  void set_offload_info(class offload_info oi) { _impl->_offload_info = oi; }
+};
+
+std::ostream& operator<<(std::ostream& os, const Packet& p);
+
+// Move constructor: takes over x's impl pointer, leaving x._impl null.
+inline Packet::Packet(Packet&& x) noexcept
+    : _impl(std::move(x._impl)) {
+}
+
+// Empty impl; only records how many fragment slots were allocated for it.
+inline Packet::impl::impl(size_t nr_frags)
+    : _len(0), _allocated_frags(nr_frags) {
+}
+
+// Build an impl holding a private copy of `frag`.  Data that fits is placed
+// at the end of the internal buffer (shrinking the headroom); larger data
+// goes into a malloc'ed buffer released through the deleter chain.
+inline Packet::impl::impl(fragment frag, size_t nr_frags)
+    : _len(frag.size), _allocated_frags(nr_frags) {
+  ceph_assert(_allocated_frags > _nr_frags);
+  const bool fits_internal = frag.size <= internal_data_size;
+  if (!fits_internal) {
+    auto heap = static_cast<char*>(::malloc(frag.size));
+    if (!heap) {
+      throw std::bad_alloc();
+    }
+    deleter heap_deleter = make_free_deleter(heap);
+    frags[0] = { heap, frag.size };
+    _deleter.append(std::move(heap_deleter));
+  } else {
+    headroom -= frag.size;
+    frags[0] = { data + headroom, frag.size };
+  }
+  std::copy(frag.base, frag.base + frag.size, frags[0].base);
+  ++_nr_frags;
+}
+
+// Empty packet; impl::allocate() rounds the request up to default_nr_frags.
+inline Packet::Packet(): _impl(impl::allocate(1)) {
+}
+
+// Empty packet with room for at least nr_frags fragments.
+inline Packet::Packet(size_t nr_frags): _impl(impl::allocate(nr_frags)) {
+}
+
+// Packet owning a copy of frag's bytes (the copy is made by impl's fragment
+// constructor); the impl is allocated with the default fragment capacity.
+inline Packet::Packet(fragment frag): _impl(new impl(frag)) {
+}
+
+// Packet owning a copy of `size` bytes at `data`; delegates to the copying
+// fragment constructor (the const_cast is only needed to build a fragment).
+inline Packet::Packet(const char* data, size_t size):
+    Packet(fragment{const_cast<char*>(data), size}) {
+}
+
+// Zero-copy packet over a single caller-provided fragment; `d` becomes the
+// packet's deleter.
+inline Packet::Packet(fragment frag, deleter d)
+    : _impl(impl::allocate(1)) {
+  impl& self = *_impl;
+  self._deleter = std::move(d);
+  self.frags[self._nr_frags] = frag;
+  ++self._nr_frags;
+  self._len = frag.size;
+}
+
+// Zero-copy packet over several caller-provided fragments; `d` becomes the
+// packet's deleter and the length is the sum of the fragment sizes.
+inline Packet::Packet(std::vector<fragment> frag, deleter d)
+    : _impl(impl::allocate(frag.size())) {
+  _impl->_deleter = std::move(d);
+  _impl->_nr_frags = frag.size();
+  _impl->_len = 0;
+  fragment* slot = _impl->frags;
+  for (const fragment& f : frag) {
+    *slot++ = f;
+    _impl->_len += f.size;
+  }
+}
+
+// Zero-copy packet over the fragments in [begin, end).  The range is walked
+// more than once (count, size total, copy), so single-pass iterators will not
+// work here.
+template <typename Iterator>
+inline Packet::Packet(Iterator begin, Iterator end, deleter del) {
+  const unsigned count = std::distance(begin, end);
+  unsigned total = 0;
+  for (Iterator it = begin; it != end; ++it) {
+    fragment& f = *it;
+    total += f.size;
+  }
+  _impl = impl::allocate(count);
+  _impl->_deleter = std::move(del);
+  _impl->_len = total;
+  _impl->_nr_frags = count;
+  std::copy(begin, end, _impl->frags);
+}
+
+// Take over x and append a private copy of frag's bytes as a new last
+// fragment; the copy is freed by a deleter chained onto x's deleter.
+inline Packet::Packet(Packet&& x, fragment frag)
+    : _impl(impl::allocate_if_needed(std::move(x._impl), 1)) {
+  impl& self = *_impl;
+  self._len += frag.size;
+  char* owned = new char[frag.size];
+  std::copy(frag.base, frag.base + frag.size, owned);
+  self.frags[self._nr_frags] = {owned, frag.size};
+  ++self._nr_frags;
+  self._deleter = make_deleter(std::move(self._deleter), [owned] {
+    delete[] owned;
+  });
+}
+
+// Try to grow frags[0] towards the front by `size` bytes using the free
+// headroom of the internal buffer.  Returns false, changing nothing, when
+// the headroom is too small.  If frags[0] is not in the internal buffer yet,
+// an empty internal fragment is inserted in front first.
+inline bool Packet::allocate_headroom(size_t size) {
+  if (_impl->headroom < size) {
+    return false;
+  }
+  _impl->_len += size;
+  if (!_impl->using_internal_data()) {
+    _impl = impl::allocate_if_needed(std::move(_impl), 1);
+    fragment* const table = _impl->frags;
+    // shift every fragment up one slot to free index 0
+    std::copy_backward(table, table + _impl->_nr_frags,
+            table + _impl->_nr_frags + 1);
+    table[0] = { _impl->data + internal_data_size, 0 };
+    ++_impl->_nr_frags;
+  }
+  _impl->headroom -= size;
+  fragment& head = _impl->frags[0];
+  head.base -= size;
+  head.size += size;
+  return true;
+}
+
+
+
+// Take over x and prepend a private copy of frag's bytes.
+inline Packet::Packet(fragment frag, Packet&& x)
+    : _impl(std::move(x._impl)) {
+  // Fast path: the bytes fit into the headroom in front of frags[0].
+  if (allocate_headroom(frag.size)) {
+    std::copy(frag.base, frag.base + frag.size, _impl->frags[0].base);
+    return;
+  }
+  // Slow path: copy the bytes to a heap buffer and insert it as a new first
+  // fragment, freed by a deleter chained onto the existing one.
+  _impl->unuse_internal_data();
+  _impl = impl::allocate_if_needed(std::move(_impl), 1);
+  _impl->_len += frag.size;
+  char* owned = new char[frag.size];
+  std::copy(frag.base, frag.base + frag.size, owned);
+  fragment* const table = _impl->frags;
+  std::copy_backward(table, table + _impl->_nr_frags,
+          table + _impl->_nr_frags + 1);
+  ++_impl->_nr_frags;
+  table[0] = {owned, frag.size};
+  _impl->_deleter = make_deleter(
+          std::move(_impl->_deleter), [owned] { delete []owned; });
+}
+
+// Take over x and append `frag` without copying it.  x's previous deleter is
+// appended to `d`, and the combined deleter becomes the packet's deleter.
+inline Packet::Packet(Packet&& x, fragment frag, deleter d)
+    : _impl(impl::allocate_if_needed(std::move(x._impl), 1)) {
+  impl& self = *_impl;
+  self._len += frag.size;
+  self.frags[self._nr_frags] = frag;
+  ++self._nr_frags;
+  d.append(std::move(self._deleter));
+  self._deleter = std::move(d);
+}
+
+// Take over x and append `d` to its deleter.
+inline Packet::Packet(Packet&& x, deleter d): _impl(std::move(x._impl)) {
+  _impl->_deleter.append(std::move(d));
+}
+
+// Append all fragments of `p` to this packet without copying their data
+// (apart from moving p's internal-buffer data to the heap).  An empty
+// packet simply becomes `p`.
+inline void Packet::append(Packet&& p) {
+  if (_impl->_len == 0) {
+    *this = std::move(p);
+    return;
+  }
+  _impl = impl::allocate_if_needed(std::move(_impl), p._impl->_nr_frags);
+  impl& dst = *_impl;
+  impl& src = *p._impl;
+  dst._len += src._len;
+  // src.frags[0] must not point into src's internal buffer once shared here
+  src.unuse_internal_data();
+  std::copy(src.frags, src.frags + src._nr_frags,
+            dst.frags + dst._nr_frags);
+  dst._nr_frags += src._nr_frags;
+  // chain both deleters and keep the result in this packet
+  src._deleter.append(std::move(dst._deleter));
+  dst._deleter = std::move(src._deleter);
+}
+
+// Return a pointer to `size` contiguous bytes located `offset` bytes into the
+// packet, or nullptr if the packet is too short.  If the range straddles
+// fragments, the fragments involved are linearized first.
+inline char* Packet::get_header(size_t offset, size_t size) {
+  if (offset + size > _impl->_len) {
+    return nullptr;
+  }
+  // find the fragment containing `offset`, making `offset` relative to it
+  size_t idx = 0;
+  for (; idx != _impl->_nr_frags && offset >= _impl->frags[idx].size; ++idx) {
+    offset -= _impl->frags[idx].size;
+  }
+  if (idx == _impl->_nr_frags) {
+    return nullptr;
+  }
+  const bool straddles = offset + size > _impl->frags[idx].size;
+  if (straddles) {
+    linearize(idx, offset + size);
+  }
+  return _impl->frags[idx].base + offset;
+}
+
+// Typed convenience wrapper: asks get_header(offset, sizeof(Header)) for the
+// bytes and reinterprets the returned pointer (possibly null) as a Header*.
+template <typename Header>
+inline Header* Packet::get_header(size_t offset) {
+  return reinterpret_cast<Header*>(get_header(offset, sizeof(Header)));
+}
+
+// Drop `how_much` bytes from the front of the packet.  Fragments that are
+// consumed completely are removed from the table; a partially consumed first
+// fragment is shortened in place.
+inline void Packet::trim_front(size_t how_much) {
+  ceph_assert(how_much <= _impl->_len);
+  _impl->_len -= how_much;
+  // count leading fragments that disappear entirely
+  size_t dropped = 0;
+  while (how_much && how_much >= _impl->frags[dropped].size) {
+    how_much -= _impl->frags[dropped].size;
+    ++dropped;
+  }
+  std::copy(_impl->frags + dropped, _impl->frags + _impl->_nr_frags, _impl->frags);
+  _impl->_nr_frags -= dropped;
+  if (!_impl->using_internal_data()) {
+    // nothing left in the internal buffer: all of it is headroom again
+    _impl->headroom = internal_data_size;
+  }
+  if (!how_much) {
+    return;
+  }
+  if (_impl->using_internal_data()) {
+    _impl->headroom += how_much;
+  }
+  fragment& head = _impl->frags[0];
+  head.base += how_much;
+  head.size -= how_much;
+}
+
+// Drop `how_much` bytes from the end of the packet.  Fragments that are
+// consumed completely are removed; a partially consumed last fragment is
+// shortened in place.
+inline void Packet::trim_back(size_t how_much) {
+  ceph_assert(how_much <= _impl->_len);
+  _impl->_len -= how_much;
+  // `kept` is the number of fragments that survive
+  size_t kept = _impl->_nr_frags;
+  while (how_much && how_much >= _impl->frags[kept - 1].size) {
+    how_much -= _impl->frags[kept - 1].size;
+    --kept;
+  }
+  _impl->_nr_frags = kept;
+  if (!how_much) {
+    return;
+  }
+  _impl->frags[kept - 1].size -= how_much;
+  if (kept == 1 && _impl->using_internal_data()) {
+    _impl->headroom += how_much;
+  }
+}
+
+// Reserve sizeof(Header) + extra_size bytes at the front of the packet and
+// value-initialize a Header at the start of that space.  The extra bytes
+// are left uninitialized.
+template <typename Header>
+Header* Packet::prepend_header(size_t extra_size) {
+  auto h = prepend_uninitialized_header(sizeof(Header) + extra_size);
+  return new (h) Header{};
+}
+
+// Reserve `size` uninitialized bytes at the front of the packet and return a
+// pointer to them.  The internal buffer's headroom is used when possible;
+// otherwise a heap buffer becomes the new first fragment.
+inline char* Packet::prepend_uninitialized_header(size_t size) {
+  if (allocate_headroom(size)) {
+    return _impl->frags[0].base;
+  }
+  // Evict whatever occupies the internal buffer and retry: that resets the
+  // headroom to the full buffer size, which may now be enough.
+  _impl->unuse_internal_data();
+  if (allocate_headroom(size)) {
+    return _impl->frags[0].base;
+  }
+  // Still too large for the internal buffer: use a dedicated heap fragment,
+  // freed by a deleter chained onto the existing one.
+  _impl->_len += size;
+  _impl = impl::allocate_if_needed(std::move(_impl), 1);
+  char* owned = new char[size];
+  fragment* const table = _impl->frags;
+  std::copy_backward(table, table + _impl->_nr_frags,
+          table + _impl->_nr_frags + 1);
+  ++_impl->_nr_frags;
+  table[0] = {owned, size};
+  _impl->_deleter = make_deleter(std::move(_impl->_deleter),
+          [owned] { delete []owned; });
+  return _impl->frags[0].base;
+}
+
+// Share the whole packet: equivalent to share(0, len()).
+inline Packet Packet::share() {
+    return share(0, _impl->_len);
+}
+
+// Return a new packet whose fragments point at bytes [offset, offset + len)
+// of this packet's data.  No payload is copied: the new packet gets
+// _deleter.share() and a copy of the offload info.  The caller must make
+// sure the requested range lies inside the packet; it is not checked here.
+inline Packet Packet::share(size_t offset, size_t len) {
+  _impl->unuse_internal_data(); // FIXME: eliminate?
+  Packet view;
+  view._impl = impl::allocate_if_needed(std::move(view._impl), _impl->_nr_frags);
+  impl& out = *view._impl;
+  // skip the fragments that lie entirely before `offset`
+  size_t src = 0;
+  while (offset > 0 && offset >= _impl->frags[src].size) {
+    offset -= _impl->frags[src].size;
+    ++src;
+  }
+  // reference (parts of) the following fragments until `len` bytes are covered
+  while (out._len < len) {
+    auto& f = _impl->frags[src++];
+    auto take = std::min(len - out._len, f.size - offset);
+    out.frags[out._nr_frags++] = { f.base + offset, take };
+    out._len += take;
+    offset = 0;
+  }
+  out._offload_info = _impl->_offload_info;
+  ceph_assert(!out._deleter);
+  out._deleter = _impl->_deleter.share();
+  return view;
+}
+
+#endif /* CEPH_MSG_PACKET_H_ */
diff --git a/src/msg/async/dpdk/PacketUtil.h b/src/msg/async/dpdk/PacketUtil.h
new file mode 100644
index 00000000..118218e6
--- /dev/null
+++ b/src/msg/async/dpdk/PacketUtil.h
@@ -0,0 +1,154 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_PACKET_UTIL_H_
+#define CEPH_MSG_PACKET_UTIL_H_
+
+#include <map>
+#include <iostream>
+
+#include "Packet.h"
+
+// Keeps packets in an ordered map keyed by their starting offset and
+// coalesces entries whose byte ranges overlap or touch.  `Tag` is not
+// referenced inside the class body.
+template <typename Offset, typename Tag>
+class packet_merger {
+ private:
+  // Per-thread counter behind linearizations().
+  static uint64_t& linearizations_ref() {
+    static thread_local uint64_t linearization_count;
+    return linearization_count;
+  }
+ public:
+  std::map<Offset, Packet> map;
+
+  // Number of linearize() calls merge() has issued on the calling thread.
+  static uint64_t linearizations() {
+    return linearizations_ref();
+  }
+
+  // Add packet `p`, covering [offset, offset + p.len()), to the map.
+  void merge(Offset offset, Packet p) {
+    const auto beg = offset;
+    const auto end = beg + p.len();
+    // Stays true unless `p` gets absorbed into an existing segment.
+    bool need_insert = true;
+
+    // Pass 1: find the first existing segment that interacts with `p`.
+    for (auto cur = map.begin(); cur != map.end(); ++cur) {
+      Packet& seg = cur->second;
+      const auto seg_beg = cur->first;
+      const auto seg_end = seg_beg + seg.len();
+
+      if (seg_beg <= beg && end <= seg_end) {
+        // seg_beg beg end seg_end: the segment already holds all of `p`.
+        return;
+      }
+      if (beg <= seg_beg && seg_end <= end) {
+        // beg seg_beg seg_end end: `p` covers the whole segment, so the
+        // segment is dropped and `p` is inserted in its place.
+        map.erase(cur);
+        break;
+      }
+      if (beg < seg_beg && seg_beg <= end && end <= seg_end) {
+        // beg seg_beg end seg_end: `p` overlaps the segment's front.  Cut
+        // the overlap off the segment, glue the rest behind `p`, and insert
+        // the combined packet instead of the segment.
+        seg.trim_front(end - seg_beg);
+        p.append(std::move(seg));
+        map.erase(cur);
+        break;
+      }
+      if (seg_beg <= beg && beg <= seg_end && seg_end < end) {
+        // seg_beg beg seg_end end: `p` overlaps the segment's tail.  Cut the
+        // overlap off `p` and extend the existing segment with the rest.
+        p.trim_front(seg_end - beg);
+        seg.append(std::move(p));
+        seg.linearize();
+        ++linearizations_ref();
+        need_insert = false;
+        break;
+      }
+      // Disjoint from this segment (entirely before or after it): look at
+      // the next one.
+    }
+
+    if (need_insert) {
+      p.linearize();
+      ++linearizations_ref();
+      map.emplace(beg, std::move(p));
+    }
+
+    // Pass 2: the new data may have closed a hole between neighbours, so
+    // walk the map and fold each segment's successor into it while possible.
+    auto cur = map.begin();
+    while (cur != map.end()) {
+      Packet& first = cur->second;
+      const auto first_beg = cur->first;
+      const auto first_end = first_beg + first.len();
+
+      auto nxt = cur;
+      ++nxt;
+      if (nxt == map.end()) {
+        break;
+      }
+      Packet& second = nxt->second;
+      const auto second_beg = nxt->first;
+      const auto second_end = second_beg + second.len();
+
+      if (first_beg <= second_beg && second_beg <= first_end && first_end < second_end) {
+        // Overlapping or touching: append the non-overlapping tail of the
+        // successor to `first` and drop the successor.  `cur` stays put so
+        // the grown segment is compared with its new successor next.
+        second.trim_front(first_end - second_beg);
+        first.append(std::move(second));
+        map.erase(nxt);
+      } else if (second_end <= first_end) {
+        // The successor lies completely inside `first`: just drop it.
+        map.erase(nxt);
+      } else if (first_end < second_beg) {
+        // A gap remains between the two: move on.
+        cur = nxt;
+      } else {
+        // If we reach here, we have a bug with merge.
+        std::cout << "packet_merger: merge error\n";
+        abort();
+      }
+    }
+  }
+};
+
+#endif
diff --git a/src/msg/async/dpdk/TCP-Stack.h b/src/msg/async/dpdk/TCP-Stack.h
new file mode 100644
index 00000000..996ae93c
--- /dev/null
+++ b/src/msg/async/dpdk/TCP-Stack.h
@@ -0,0 +1,40 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+// tcp/network-stack integration
+// Free functions that expose a tcp<ipv4_traits> instance through the ServerSocket / ConnectedSocket handles.
+#ifndef CEPH_MSG_DPDK_TCP_STACK_H
+#define CEPH_MSG_DPDK_TCP_STACK_H
+
+class ServerSocket;
+class ConnectedSocket;
+// NOTE(review): uint16_t, SocketOptions and entity_addr_t are used below without an include or declaration here -- presumably every includer provides them first; confirm.
+class ipv4_traits;
+template <typename InetTraits>
+class tcp;
+// Listen on `port`: on success *sa is assigned the new server socket and 0 is returned; otherwise the negative result of listen() is returned (definition in TCP.cc).
+int tcpv4_listen(tcp<ipv4_traits>& tcpv4, uint16_t port, const SocketOptions &opts,
+                 int type, ServerSocket *sa);
+// Connect to `addr` and assign the resulting socket to *sa; the definition in TCP.cc always returns 0.
+int tcpv4_connect(tcp<ipv4_traits>& tcpv4, const entity_addr_t &addr,
+                  ConnectedSocket *sa);
+
+#endif
diff --git a/src/msg/async/dpdk/TCP.cc b/src/msg/async/dpdk/TCP.cc
new file mode 100644
index 00000000..c6397709
--- /dev/null
+++ b/src/msg/async/dpdk/TCP.cc
@@ -0,0 +1,840 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include "align.h"
+#include "TCP.h"
+#include "IP.h"
+#include "DPDKStack.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "tcp "
+
+void tcp_option::parse(uint8_t* beg, uint8_t* end)
+{
+  // Scan the option bytes in [beg, end); stop early on EOL or on any malformed option.
+  while (beg < end) {
+    auto kind = option_kind(*beg);
+    uint8_t len = 0;
+    if (kind != option_kind::nop && kind != option_kind::eol) {
+      // Every other option is kind + length + data: the length byte must be
+      // in bounds, be at least 2, and the whole option must end before `end`.
+      len = (end - beg >= 2) ? *(beg + 1) : 0;
+      if (len < 2 || len > end - beg) {
+        return;
+      }
+    }
+    switch (kind) {
+      case option_kind::mss:
+        if (len < option_len::mss) { return; }  // advertised length cannot hold the MSS value
+        _mss_received = true;
+        _remote_mss = ntoh(reinterpret_cast<mss*>(beg)->mss);
+        beg += option_len::mss;
+        break;
+      case option_kind::win_scale:
+        if (len < option_len::win_scale) { return; }  // advertised length cannot hold the shift
+        _win_scale_received = true;
+        _remote_win_scale = reinterpret_cast<win_scale*>(beg)->shift;
+        // We can turn on win_scale option, 7 is Linux's default win scale size
+        _local_win_scale = 7;
+        beg += option_len::win_scale;
+        break;
+      case option_kind::sack:
+        _sack_received = true;
+        beg += option_len::sack;
+        break;
+      case option_kind::nop:
+        beg += option_len::nop;
+        break;
+      case option_kind::eol:
+        return;
+      default:
+        // Ignore options we do not understand; len >= 2 guarantees progress.
+        beg += len;
+        break;
+    }
+  }
+}
+
+uint8_t tcp_option::fill(tcp_hdr* th, uint8_t options_size)  // writes option bytes directly after *th; returns the number of bytes written
+{
+  auto hdr = reinterpret_cast<uint8_t*>(th);
+  auto off = hdr + sizeof(tcp_hdr);  // write cursor: first byte past the fixed header
+  uint8_t size = 0;  // option bytes emitted so far
+  bool syn_on = th->f_syn;
+  bool ack_on = th->f_ack;
+  // Same selection rule as get_size(): options only with SYN; when ACK is also set, only those flagged as received.
+  if (syn_on) {
+    if (_mss_received || !ack_on) {
+      auto mss = new (off) tcp_option::mss;
+      mss->mss = _local_mss;
+      off += mss->len;
+      size += mss->len;
+      *mss = mss->hton();  // overwrite in place with the struct's hton() result
+    }
+    if (_win_scale_received || !ack_on) {
+      auto win_scale = new (off) tcp_option::win_scale;
+      win_scale->shift = _local_win_scale;
+      off += win_scale->len;
+      size += win_scale->len;
+    }
+  }
+  if (size > 0) {
+    // Pad with NOP options until only the EOL's length is left before size_max, then terminate with EOL
+    auto size_max = align_up(uint8_t(size + 1), tcp_option::align);
+    while (size < size_max - uint8_t(option_len::eol)) {
+      new (off) tcp_option::nop;
+      off += option_len::nop;
+      size += option_len::nop;
+    }
+    new (off) tcp_option::eol;
+    size += option_len::eol;
+  }
+  ceph_assert(size == options_size);  // the caller-supplied size must equal what was actually written
+
+  return size;
+}
+
+uint8_t tcp_option::get_size(bool syn_on, bool ack_on)
+{
+  uint8_t total = 0;
+  // Without SYN nothing is counted. With SYN, both options are counted when
+  // ACK is clear; with ACK set only the ones flagged as received are.
+  if (syn_on && (!ack_on || _mss_received)) {
+    total += option_len::mss;
+  }
+  if (syn_on && (!ack_on || _win_scale_received)) {
+    total += option_len::win_scale;
+  }
+  if (total == 0) {
+    return total;
+  }
+  // Add the terminating EOL, then round up to tcp_option::align.
+  total += option_len::eol;
+  return align_up(total, tcp_option::align);
+}
+
+ipv4_tcp::ipv4_tcp(ipv4& inet, EventCenter *c)  // initialises _inet_l4 from `inet` and allocates the tcp<ipv4_traits> instance stored in _tcp
+    : _inet_l4(inet), _tcp(std::unique_ptr<tcp<ipv4_traits>>(new tcp<ipv4_traits>(inet.cct, _inet_l4, c)))
+{ }
+
+ipv4_tcp::~ipv4_tcp() { }  // empty body: nothing is released explicitly here
+
+void ipv4_tcp::received(Packet p, ipv4_address from, ipv4_address to)  // hands the packet and both addresses to _tcp unchanged
+{
+  _tcp->received(std::move(p), from, to);
+}
+
+bool ipv4_tcp::forward(forward_hash& out_hash_data, Packet& p, size_t off)  // pure delegation: returns whatever _tcp->forward() returns
+{
+  return _tcp->forward(out_hash_data, p, off);
+}
+
+int tcpv4_listen(tcp<ipv4_traits>& tcpv4, uint16_t port, const SocketOptions &opts,
+                 int type, ServerSocket *sock)
+{
+  using impl_t = DPDKServerSocketImpl<tcp<ipv4_traits>>;
+  std::unique_ptr<impl_t> impl(new impl_t(tcpv4, port, opts, type));  // freed on return if listen() fails
+  const int rc = impl->listen();
+  if (rc >= 0) {
+    *sock = ServerSocket(std::unique_ptr<ServerSocketImpl>(impl.release()));
+    return 0;
+  }
+  return rc;
+}
+
+int tcpv4_connect(tcp<ipv4_traits>& tcpv4, const entity_addr_t &addr,
+                  ConnectedSocket *sock)
+{
+  std::unique_ptr<ConnectedSocketImpl> impl(
+      new NativeConnectedSocketImpl<tcp<ipv4_traits>>(tcpv4.connect(addr)));
+  *sock = ConnectedSocket(std::move(impl));  // the caller's socket takes over the impl
+  return 0;
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip)  // builds a header-only RST answering segment *rth and sends it without a tcb
+{
+  ldout(cct, 20) << __func__ << " tcp header rst=" << bool(rth->f_rst) << " fin=" << bool(rth->f_fin)
+                 << " syn=" << bool(rth->f_syn) << dendl;
+  if (rth->f_rst) {  // an incoming RST is never answered
+    return;
+  }
+  Packet p;
+  auto th = p.prepend_header<tcp_hdr>();
+  th->src_port = rth->dst_port;  // ports are swapped relative to the received header
+  th->dst_port = rth->src_port;
+  if (rth->f_ack) {
+    th->seq = rth->ack;
+  }
+  // If this RST is in response to a SYN segment, acknowledge the peer's initial sequence number (SEG.SEQ + 1).
+  if (rth->f_syn) {
+    th->ack = rth->seq + 1;
+    th->f_ack = true;
+  }
+  th->f_rst = true;
+  th->data_offset = sizeof(*th) / 4;  // header length in 32-bit words; no options are attached
+  th->checksum = 0;
+  *th = th->hton();
+  // Checksum: the pseudo-header sum is always computed; the header itself is summed here only when L4 TX checksum offload is unavailable.
+  checksummer csum;
+  offload_info oi;
+  InetTraits::tcp_pseudo_header_checksum(csum, local_ip, foreign_ip, sizeof(*th));
+  if (get_hw_features().tx_csum_l4_offload) {
+    th->checksum = ~csum.get();
+    oi.needs_csum = true;
+  } else {
+    csum.sum(p);
+    th->checksum = csum.get();
+    oi.needs_csum = false;
+  }
+
+  oi.protocol = ip_protocol_num::tcp;
+  oi.tcp_hdr_len = sizeof(tcp_hdr);
+  p.set_offload_info(oi);
+
+  send_packet_without_tcb(local_ip, foreign_ip, std::move(p));
+}
+
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+template<typename InetTraits>
+ostream& tcp<InetTraits>::tcb::_prefix(std::ostream *_dout) {  // log prefix used via dout_prefix: both endpoints plus this tcb's address, fd and state
+  return *_dout << "tcp " << _local_ip << ":" << _local_port << " -> " << _foreign_ip << ":" << _foreign_port
+                << " tcb(" << this << " fd=" << fd << " s=" << _state << ").";
+}
+
+template<typename InetTraits>
+void tcp<InetTraits>::tcb::input_handle_listen_state(tcp_hdr* th, Packet p)  // records the peer's sequence number and options from *th, then calls do_syn_received()
+{
+  auto opt_len = th->data_offset * 4 - sizeof(tcp_hdr);  // NOTE(review): unsigned subtraction; wraps if data_offset * 4 < sizeof(tcp_hdr) -- presumably validated by the caller, confirm
+  auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, th->data_offset * 4)) + sizeof(tcp_hdr);
+  auto opt_end = opt_start + opt_len;
+  p.trim_front(th->data_offset * 4);  // drop the TCP header and options from the packet
+  tcp_sequence seg_seq = th->seq;
+
+  // Set RCV.NXT to SEG.SEQ+1, IRS is set to SEG.SEQ
+  _rcv.next = seg_seq + 1;
+  _rcv.initial = seg_seq;
+
+  // ISS should be selected and a SYN segment sent of the form:
+  // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK>
+  // SND.NXT is set to ISS+1 and SND.UNA to ISS
+  // NOTE: In previous code, _snd.next is set to ISS + 1 only when SYN is
+  // ACKed. Now, we set _snd.next to ISS + 1 here, so in output_one(): we
+  // have
+  //     th->seq = syn_on ? _snd.initial : _snd.next
+  // to make sure retransmitted SYN has correct SEQ number.
+  do_setup_isn();
+
+  _rcv.urgent = _rcv.next;
+
+  ldout(_tcp.cct, 10) << __func__ << " listen: LISTEN -> SYN_RECEIVED" << dendl;
+  init_from_options(th, opt_start, opt_end);
+  do_syn_received();
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::input_handle_syn_sent_state(tcp_hdr* th, Packet p)  // handles a segment received after our SYN was sent; follows the numbered steps below
+{
+  auto opt_len = th->data_offset * 4 - sizeof(tcp_hdr);  // NOTE(review): unsigned subtraction; wraps if data_offset * 4 < sizeof(tcp_hdr) -- presumably validated by the caller, confirm
+  auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, th->data_offset * 4)) + sizeof(tcp_hdr);
+  auto opt_end = opt_start + opt_len;
+  p.trim_front(th->data_offset * 4);
+  tcp_sequence seg_seq = th->seq;
+  auto seg_ack = th->ack;
+
+  ldout(_tcp.cct, 20) << __func__ << " tcp header seq " << seg_seq.raw << " ack " << seg_ack.raw
+                      << " fin=" << bool(th->f_fin) << " syn=" << bool(th->f_syn) << dendl;
+
+  bool acceptable = false;  // set only when the segment carries an ACK inside [SND.UNA, SND.NXT]
+  // 3.1 first check the ACK bit
+  if (th->f_ack) {
+    // If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless the
+    // RST bit is set, if so drop the segment and return)
+    if (seg_ack <= _snd.initial || seg_ack > _snd.next) {
+      return respond_with_reset(th);
+    }
+
+    // If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable.
+    acceptable = _snd.unacknowledged <= seg_ack && seg_ack <= _snd.next;
+  }
+
+  // 3.2 second check the RST bit
+  if (th->f_rst) {
+    // If the ACK was acceptable then signal the user "error: connection
+    // reset", drop the segment, enter CLOSED state, delete TCB, and
+    // return.  Otherwise (no ACK) drop the segment and return.
+    if (acceptable) {
+      return do_reset();
+    } else {
+      return;
+    }
+  }
+
+  // 3.3 third check the security and precedence
+  // NOTE: Ignored for now
+
+  // 3.4 fourth check the SYN bit
+  if (th->f_syn) {
+    // RCV.NXT is set to SEG.SEQ+1, IRS is set to SEG.SEQ.  SND.UNA should
+    // be advanced to equal SEG.ACK (if there is an ACK), and any segments
+    // on the retransmission queue which are thereby acknowledged should be
+    // removed.
+    _rcv.next = seg_seq + 1;
+    _rcv.initial = seg_seq;
+    if (th->f_ack) {
+      // TODO: clean retransmission queue
+      _snd.unacknowledged = seg_ack;
+    }
+    if (_snd.unacknowledged > _snd.initial) {
+      // If SND.UNA > ISS (our SYN has been ACKed), change the connection
+      // state to ESTABLISHED, form an ACK segment
+      // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+      ldout(_tcp.cct, 20) << __func__ << " syn: SYN_SENT -> ESTABLISHED" << dendl;
+      init_from_options(th, opt_start, opt_end);
+      do_established();
+      output();
+    } else {
+      // Otherwise enter SYN_RECEIVED, form a SYN,ACK segment
+      // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK>
+      ldout(_tcp.cct, 20) << __func__ << " syn: SYN_SENT -> SYN_RECEIVED" << dendl;
+      do_syn_received();  // NOTE(review): unlike the ESTABLISHED branch, init_from_options() is not called here -- confirm that is intended
+    }
+  }
+
+  // 3.5 fifth, if neither of the SYN or RST bits is set then drop the
+  // segment and return.
+  return;
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::input_handle_other_state(tcp_hdr* th, Packet p)  // segment processing for a tcb past LISTEN / SYN_SENT; follows the numbered steps in the comments below
+{
+  p.trim_front(th->data_offset * 4);  // drop header and options; p now holds only the segment text
+  bool do_output = false;       // an ACK must be sent before returning
+  bool do_output_data = false;  // more data may be sent if can_send() allows
+  tcp_sequence seg_seq = th->seq;
+  auto seg_ack = th->ack;
+  auto seg_len = p.len();
+  ldout(_tcp.cct, 20) << __func__ << " tcp header seq " << seg_seq.raw << " ack " << seg_ack.raw
+                      << " snd next " << _snd.next.raw << " unack " << _snd.unacknowledged.raw
+                      << " rcv next " << _rcv.next.raw << " len " << seg_len
+                      << " fin=" << bool(th->f_fin) << " syn=" << bool(th->f_syn) << dendl;
+
+  // 4.1 first check sequence number
+  if (!segment_acceptable(seg_seq, seg_len)) {
+    //<SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+    return output();
+  }
+
+  // In the following it is assumed that the segment is the idealized
+  // segment that begins at RCV.NXT and does not exceed the window.
+  if (seg_seq < _rcv.next) {
+    // ignore already acknowledged data
+    auto dup = std::min(uint32_t(_rcv.next - seg_seq), seg_len);
+    ldout(_tcp.cct, 10) << __func__ << " dup segment len " << dup << dendl;
+    p.trim_front(dup);
+    seg_len -= dup;
+    seg_seq += dup;
+  }
+  // FIXME: We should trim data outside the right edge of the receive window as well
+
+  if (seg_seq != _rcv.next) {
+    ldout(_tcp.cct, 10) << __func__ << " out of order, expect " << _rcv.next.raw
+                        << " actual " << seg_seq.raw
+                        << " out of order size " << _rcv.out_of_order.map.size()
+                        << dendl;
+    insert_out_of_order(seg_seq, std::move(p));
+    // A TCP receiver SHOULD send an immediate duplicate ACK
+    // when an out-of-order segment arrives.
+    return output();
+  }
+
+  // 4.2 second check the RST bit
+  if (th->f_rst) {
+    if (in_state(SYN_RECEIVED)) {
+      // If this connection was initiated with a passive OPEN (i.e.,
+      // came from the LISTEN state), then return this connection to
+      // LISTEN state and return.  The user need not be informed.  If
+      // this connection was initiated with an active OPEN (i.e., came
+      // from SYN_SENT state) then the connection was refused, signal
+      // the user "connection refused".  In either case, all segments
+      // on the retransmission queue should be removed.  And in the
+      // active OPEN case, enter the CLOSED state and delete the TCB,
+      // and return.
+      _errno = -ECONNREFUSED;  // record on the tcb (like the other error paths), not in the C library's errno
+      return do_reset();
+    }
+    if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2 | CLOSE_WAIT)) {
+      // If the RST bit is set then, any outstanding RECEIVEs and SEND
+      // should receive "reset" responses.  All segment queues should be
+      // flushed.  Users should also receive an unsolicited general
+      // "connection reset" signal.  Enter the CLOSED state, delete the
+      // TCB, and return.
+      return do_reset();
+    }
+    if (in_state(CLOSING | LAST_ACK | TIME_WAIT)) {
+      // If the RST bit is set then, enter the CLOSED state, delete the
+      // TCB, and return.
+      return do_closed();
+    }
+  }
+
+  // 4.3 third check security and precedence
+  // NOTE: Ignored for now
+
+  // 4.4 fourth, check the SYN bit
+  if (th->f_syn) {
+    // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2
+    // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT
+
+    // If the SYN is in the window it is an error, send a reset, any
+    // outstanding RECEIVEs and SEND should receive "reset" responses,
+    // all segment queues should be flushed, the user should also
+    // receive an unsolicited general "connection reset" signal, enter
+    // the CLOSED state, delete the TCB, and return.
+    respond_with_reset(th);
+    return do_reset();
+
+    // If the SYN is not in the window this step would not be reached
+    // and an ack would have been sent in the first step (sequence
+    // number check).
+  }
+
+  // 4.5 fifth check the ACK field
+  if (!th->f_ack) {
+    // if the ACK bit is off drop the segment and return
+    return;
+  } else {
+    // SYN_RECEIVED STATE
+    if (in_state(SYN_RECEIVED)) {
+      // If SND.UNA =< SEG.ACK =< SND.NXT then enter ESTABLISHED state
+      // and continue processing.
+      if (_snd.unacknowledged <= seg_ack && seg_ack <= _snd.next) {
+        ldout(_tcp.cct, 20) << __func__ << " SYN_RECEIVED -> ESTABLISHED" << dendl;
+        do_established();
+        if (_tcp.push_listen_queue(_local_port, this)) {
+          ldout(_tcp.cct, 20) << __func__ << " successfully accepting socket" << dendl;
+        } else {
+          ldout(_tcp.cct, 5) << __func__ << " not exist listener or full queue, reset" << dendl;
+          return respond_with_reset(th);
+        }
+      } else {
+        // <SEQ=SEG.ACK><CTL=RST>
+        return respond_with_reset(th);
+      }
+    }
+    auto update_window = [this, th, seg_seq, seg_ack] {
+      ldout(_tcp.cct, 20) << __func__ << " window update seg_seq=" << seg_seq
+                          << " seg_ack=" << seg_ack << " old window=" << _snd.window
+                          << " new window=" << uint32_t(th->window << _snd.window_scale) << dendl;
+      _snd.window = th->window << _snd.window_scale;
+      _snd.wl1 = seg_seq;
+      _snd.wl2 = seg_ack;
+      if (_snd.window == 0) {
+        _persist_time_out = _rto;
+        start_persist_timer();
+      } else {
+        stop_persist_timer();
+      }
+    };
+    // ESTABLISHED STATE or
+    // CLOSE_WAIT STATE: Do the same processing as for the ESTABLISHED state.
+    if (in_state(ESTABLISHED | CLOSE_WAIT)) {
+      // If SND.UNA < SEG.ACK =< SND.NXT then, set SND.UNA <- SEG.ACK.
+      if (_snd.unacknowledged < seg_ack && seg_ack <= _snd.next) {
+        // Remote ACKed data we sent
+        auto acked_bytes = data_segment_acked(seg_ack);
+
+        // If SND.UNA < SEG.ACK =< SND.NXT, the send window should be updated.
+        if (_snd.wl1 < seg_seq || (_snd.wl1 == seg_seq && _snd.wl2 <= seg_ack)) {
+          update_window();
+        }
+
+        // some data is acked, try send more data
+        do_output_data = true;
+
+        auto set_retransmit_timer = [this] {
+          if (_snd.data.empty()) {
+            // All outstanding segments are acked, turn off the timer.
+            stop_retransmit_timer();
+            // Signal the waiter of this event
+            signal_all_data_acked();
+          } else {
+            // Restart the timer because new data is acked.
+            start_retransmit_timer();
+          }
+        };
+
+        if (_snd.dupacks >= 3) {
+          // We are in fast retransmit / fast recovery phase
+          uint32_t smss = _snd.mss;
+          if (seg_ack > _snd.recover) {
+            ldout(_tcp.cct, 20) << __func__ << " ack: full_ack" << dendl;
+            // Set cwnd to min (ssthresh, max(FlightSize, SMSS) + SMSS)
+            _snd.cwnd = std::min(_snd.ssthresh, std::max(flight_size(), smss) + smss);
+            // Exit the fast recovery procedure
+            exit_fast_recovery();
+            set_retransmit_timer();
+          } else {
+            ldout(_tcp.cct, 20) << __func__ << " ack: partial_ack" << dendl;
+            // Retransmit the first unacknowledged segment
+            fast_retransmit();
+            // Deflate the congestion window by the amount of new data
+            // acknowledged by the Cumulative Acknowledgment field
+            _snd.cwnd -= acked_bytes;
+            // If the partial ACK acknowledges at least one SMSS of new
+            // data, then add back SMSS bytes to the congestion window
+            if (acked_bytes >= smss) {
+              _snd.cwnd += smss;
+            }
+            // Send a new segment if permitted by the new value of
+            // cwnd.  Do not exit the fast recovery procedure For
+            // the first partial ACK that arrives during fast
+            // recovery, also reset the retransmit timer.
+            if (++_snd.partial_ack == 1) {
+              start_retransmit_timer();
+            }
+          }
+        } else {
+          // RFC5681: The fast retransmit algorithm uses the arrival
+          // of 3 duplicate ACKs (as defined in section 2, without
+          // any intervening ACKs which move SND.UNA) as an
+          // indication that a segment has been lost.
+          //
+          // So, here we reset dupacks to zero because this ACK moves
+          // SND.UNA.
+          exit_fast_recovery();
+          set_retransmit_timer();
+        }
+      } else if (!_snd.data.empty() && seg_len == 0 &&
+                 th->f_fin == 0 && th->f_syn == 0 &&
+                 th->ack == _snd.unacknowledged &&
+                 uint32_t(th->window << _snd.window_scale) == _snd.window) {
+        // Note:
+        // RFC793 states:
+        // If the ACK is a duplicate (SEG.ACK < SND.UNA), it can be ignored
+        // RFC5681 states:
+        // The TCP sender SHOULD use the "fast retransmit" algorithm to detect
+        // and repair loss, based on incoming duplicate ACKs.
+        // Here, We follow RFC5681.
+        _snd.dupacks++;
+        uint32_t smss = _snd.mss;
+        // 3 duplicated ACKs trigger a fast retransmit
+        if (_snd.dupacks == 1 || _snd.dupacks == 2) {
+          // RFC5681 Step 3.1
+          // Send cwnd + 2 * smss per RFC3042
+          do_output_data = true;
+        } else if (_snd.dupacks == 3) {
+          // RFC6582 Step 3.2
+          if (seg_ack - 1 > _snd.recover) {
+            _snd.recover = _snd.next - 1;
+            // RFC5681 Step 3.2
+            _snd.ssthresh = std::max((flight_size() - _snd.limited_transfer) / 2, 2 * smss);
+            fast_retransmit();
+          } else {
+            // Do not enter fast retransmit and do not reset ssthresh
+          }
+          // RFC5681 Step 3.3
+          _snd.cwnd = _snd.ssthresh + 3 * smss;
+        } else if (_snd.dupacks > 3) {
+          // RFC5681 Step 3.4
+          _snd.cwnd += smss;
+          // RFC5681 Step 3.5
+          do_output_data = true;
+        }
+      } else if (seg_ack > _snd.next) {
+        // If the ACK acks something not yet sent (SEG.ACK > SND.NXT)
+        // then send an ACK, drop the segment, and return
+        return output();
+      } else if (_snd.window == 0 && th->window > 0) {
+        update_window();
+        do_output_data = true;
+      }
+    }
+    // FIN_WAIT_1 STATE
+    if (in_state(FIN_WAIT_1)) {
+      // In addition to the processing for the ESTABLISHED state, if
+      // our FIN is now acknowledged then enter FIN-WAIT-2 and continue
+      // processing in that state.
+      if (seg_ack == _snd.next + 1) {
+        ldout(_tcp.cct, 20) << __func__ << " ack: FIN_WAIT_1 -> FIN_WAIT_2" << dendl;
+        _state = FIN_WAIT_2;
+        do_local_fin_acked();
+      }
+    }
+    // FIN_WAIT_2 STATE
+    if (in_state(FIN_WAIT_2)) {
+      // In addition to the processing for the ESTABLISHED state, if
+      // the retransmission queue is empty, the user’s CLOSE can be
+      // acknowledged ("ok") but do not delete the TCB.
+      // TODO
+    }
+    // CLOSING STATE
+    if (in_state(CLOSING)) {
+      if (seg_ack == _snd.next + 1) {
+        ldout(_tcp.cct, 20) << __func__ << " ack: CLOSING -> TIME_WAIT" << dendl;
+        do_local_fin_acked();
+        return do_time_wait();
+      } else {
+        return;
+      }
+    }
+    // LAST_ACK STATE
+    if (in_state(LAST_ACK)) {
+      if (seg_ack == _snd.next + 1) {
+        ldout(_tcp.cct, 20) << __func__ << " ack: LAST_ACK -> CLOSED" << dendl;
+        do_local_fin_acked();
+        return do_closed();
+      }
+    }
+    // TIME_WAIT STATE
+    if (in_state(TIME_WAIT)) {
+      // The only thing that can arrive in this state is a
+      // retransmission of the remote FIN. Acknowledge it, and restart
+      // the 2 MSL timeout.
+      // TODO
+    }
+  }
+
+  // 4.6 sixth, check the URG bit
+  if (th->f_urg) {
+    // TODO
+  }
+
+  // 4.7 seventh, process the segment text
+  if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2)) {
+    if (p.len()) {
+      // Once the TCP takes responsibility for the data it advances
+      // RCV.NXT over the data accepted, and adjusts RCV.WND as
+      // appropriate to the current buffer availability.  The total of
+      // RCV.NXT and RCV.WND should not be reduced.
+      _rcv.data.push_back(std::move(p));
+      _rcv.next += seg_len;
+      auto merged = merge_out_of_order();
+      signal_data_received();
+      // Send an acknowledgment of the form:
+      // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+      // This acknowledgment should be piggybacked on a segment being
+      // transmitted if possible without incurring undue delay.
+      if (merged) {
+        // TCP receiver SHOULD send an immediate ACK when the
+        // incoming segment fills in all or part of a gap in the
+        // sequence space.
+        do_output = true;
+      } else {
+        do_output = should_send_ack(seg_len);
+      }
+      ldout(_tcp.cct, 20) << __func__ << " merged=" << merged << " do_output=" << do_output << dendl;
+    }
+  } else if (in_state(CLOSE_WAIT | CLOSING | LAST_ACK | TIME_WAIT)) {
+    // This should not occur, since a FIN has been received from the
+    // remote side. Ignore the segment text.
+    return;
+  }
+
+  // 4.8 eighth, check the FIN bit
+  if (th->f_fin) {
+    if (in_state(CLOSED | LISTEN | SYN_SENT)) {
+      // Do not process the FIN if the state is CLOSED, LISTEN or SYN-SENT
+      // since the SEG.SEQ cannot be validated; drop the segment and return.
+      return;
+    }
+    auto fin_seq = seg_seq + seg_len;
+    if (fin_seq == _rcv.next) {
+      _rcv.next = fin_seq + 1;
+
+      // If this <FIN> packet contains data as well, we can ACK both data
+      // and <FIN> in a single packet, so cancel the previous ACK.
+      clear_delayed_ack();
+      do_output = false;
+      // Send ACK for the FIN!
+      output();
+      signal_data_received();
+      _errno = 0;
+
+      if (in_state(SYN_RECEIVED | ESTABLISHED)) {
+        ldout(_tcp.cct, 20) << __func__ << " fin: SYN_RECEIVED or ESTABLISHED -> CLOSE_WAIT" << dendl;
+        _state = CLOSE_WAIT;
+        // EOF
+      }
+      if (in_state(FIN_WAIT_1)) {
+        // If our FIN has been ACKed (perhaps in this segment), then
+        // enter TIME-WAIT, start the time-wait timer, turn off the other
+        // timers; otherwise enter the CLOSING state.
+        // Note: If our FIN has been ACKed, we should be in FIN_WAIT_2
+        // not FIN_WAIT_1 if we reach here.
+        ldout(_tcp.cct, 20) << __func__ << " fin: FIN_WAIT_1 -> CLOSING" << dendl;
+        _state = CLOSING;
+      }
+      if (in_state(FIN_WAIT_2)) {
+        ldout(_tcp.cct, 20) << __func__ << " fin: FIN_WAIT_2 -> TIME_WAIT" << dendl;
+        return do_time_wait();
+      }
+    }
+  }
+  if (do_output || (do_output_data && can_send())) {
+    // Since we will do output, we can cancel scheduled delayed ACK.
+    clear_delayed_ack();
+    output();
+  }
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::connect()  // active open: set up sequence numbers and local receive parameters, then call do_syn_sent()
+{
+  ldout(_tcp.cct, 20) << __func__ << dendl;
+  // An initial send sequence number (ISS) is selected.  A SYN segment of the
+  // form <SEQ=ISS><CTL=SYN> is sent.  Set SND.UNA to ISS, SND.NXT to ISS+1,
+  // enter SYN-SENT state, and return.
+  do_setup_isn();
+
+  // Local receive window scale factor (also stored in the option block, _option)
+  _rcv.window_scale = _option._local_win_scale = 7;
+  // Maximum segment size local can receive
+  _rcv.mss = _option._local_mss = local_mss();
+  // Linux's default window size, shifted left by the local window scale
+  _rcv.window = 29200 << _rcv.window_scale;
+
+  do_syn_sent();
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::close_final_cleanup()  // local close: release the all-data-acked fd, advance the state machine, and push out the FIN
+{
+  if (_snd._all_data_acked_fd >= 0) {  // unregister and close the notification fd only if one is open
+    center->delete_file_event(_snd._all_data_acked_fd, EVENT_READABLE);
+    _tcp.manager.close(_snd._all_data_acked_fd);
+    _snd._all_data_acked_fd = -1;
+  }
+
+  _snd.closed = true;
+  signal_data_received();
+  ldout(_tcp.cct, 20) << __func__ << " unsent_len=" << _snd.unsent_len << dendl;
+  if (in_state(CLOSE_WAIT)) {
+    ldout(_tcp.cct, 20) << __func__ << " CLOSE_WAIT -> LAST_ACK" << dendl;
+    _state = LAST_ACK;
+  } else if (in_state(ESTABLISHED)) {
+    ldout(_tcp.cct, 20) << __func__ << " ESTABLISHED -> FIN_WAIT_1" << dendl;
+    _state = FIN_WAIT_1;
+  }
+  // Send <FIN> to remote
+  // Note: we call output_one to make sure a packet with FIN is actually
+  // sent out. If we only call output() and _packetq is not empty,
+  // tcp::tcb::get_packet(), packet with FIN will not be generated.
+  output_one();
+  output();
+  center->delete_file_event(fd, EVENT_READABLE|EVENT_WRITABLE);  // finally drop this tcb's own fd from the event center
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::retransmit()  // retransmits a pending SYN, FIN and/or the oldest unacked data segment, backing off the RTO each time
+{
+  auto output_update_rto = [this] {
+    output();
+    // According to RFC6298, Update RTO <- RTO * 2 to perform binary exponential back-off
+    this->_rto = std::min(this->_rto * 2, this->_rto_max);
+    start_retransmit_timer();
+  };
+
+  // Retransmit SYN
+  if (syn_needs_on()) {
+    if (_snd.syn_retransmit++ < _max_nr_retransmit) {
+      output_update_rto();
+    } else {
+      // Retry budget exhausted: give up and report a timeout, like the FIN and data paths below.
+      ldout(_tcp.cct, 5) << __func__ << " syn retransmit exceed max "
+                         << _max_nr_retransmit << dendl;
+      _errno = -ETIMEDOUT;
+      cleanup();
+      return;
+    }
+  }
+
+  // Retransmit FIN
+  if (fin_needs_on()) {
+    if (_snd.fin_retransmit++ < _max_nr_retransmit) {
+      output_update_rto();
+    } else {
+      ldout(_tcp.cct, 5) << __func__ << " fin retransmit exceed max "
+                         << _max_nr_retransmit << dendl;
+      _errno = -ETIMEDOUT;
+      cleanup();
+      return;
+    }
+  }
+
+  // Retransmit Data
+  if (_snd.data.empty()) {
+    return;
+  }
+
+  // If there are unacked data, retransmit the earliest segment
+  auto& unacked_seg = _snd.data.front();
+
+  // According to RFC5681
+  // Update ssthresh only for the first retransmit
+  uint32_t smss = _snd.mss;
+  if (unacked_seg.nr_transmits == 0) {
+    _snd.ssthresh = std::max(flight_size() / 2, 2 * smss);
+  }
+  // RFC6582 Step 4
+  _snd.recover = _snd.next - 1;
+  // Start the slow start process
+  _snd.cwnd = smss;
+  // End fast recovery
+  exit_fast_recovery();
+
+  ldout(_tcp.cct, 20) << __func__ << " unack data size " << _snd.data.size()
+                      << " nr=" << unacked_seg.nr_transmits << dendl;
+  if (unacked_seg.nr_transmits < _max_nr_retransmit) {
+    unacked_seg.nr_transmits++;
+  } else {
+    // Delete connection when max num of retransmission is reached
+    ldout(_tcp.cct, 5) << __func__ << " seg retransmit exceed max "
+                       << _max_nr_retransmit << dendl;
+    _errno = -ETIMEDOUT;
+    cleanup();
+    return;
+  }
+  retransmit_one();
+
+  output_update_rto();
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::persist() {  // persist-timer callback: send a window probe, then re-arm the timer with a doubled timeout
+  ldout(_tcp.cct, 20) << __func__ << " persist timer fired" << dendl;
+  // Send 1 byte packet to probe peer's window size
+  _snd.window_probe = true;  // flag is raised only for the duration of this output_one() call
+  output_one();
+  _snd.window_probe = false;
+
+  output();
+  // Perform binary exponential back-off per RFC1122, capped at _rto_max
+  _persist_time_out = std::min(_persist_time_out * 2, _rto_max);
+  start_persist_timer();
+}
diff --git a/src/msg/async/dpdk/TCP.h b/src/msg/async/dpdk/TCP.h
new file mode 100644
index 00000000..b7bd7132
--- /dev/null
+++ b/src/msg/async/dpdk/TCP.h
@@ -0,0 +1,1503 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_DPDK_TCP_H_
+#define CEPH_DPDK_TCP_H_
+
+#include <unordered_map>
+#include <map>
+#include <queue>
+#include <functional>
+#include <deque>
+#include <chrono>
+#include <stdexcept>
+#include <system_error>
+
+#include "msg/async/dpdk/EventDPDK.h"
+
+#include "include/utime.h"
+#include "common/Throttle.h"
+#include "common/ceph_time.h"
+#include "common/ceph_crypto.h"
+#include "msg/async/Event.h"
+#include "IPChecksum.h"
+#include "IP.h"
+#include "const.h"
+#include "byteorder.h"
+#include "shared_ptr.h"
+#include "PacketUtil.h"
+
+#include "include/random.h"
+
+struct tcp_hdr;
+
+// TCP connection states. Every state owns a distinct bit so that several
+// states can be OR-ed together into a mask and tested with one bitwise AND.
+enum class tcp_state : uint16_t {
+  CLOSED          = (1 << 0),
+  LISTEN          = (1 << 1),
+  SYN_SENT        = (1 << 2),
+  SYN_RECEIVED    = (1 << 3),
+  ESTABLISHED     = (1 << 4),
+  FIN_WAIT_1      = (1 << 5),
+  FIN_WAIT_2      = (1 << 6),
+  CLOSE_WAIT      = (1 << 7),
+  CLOSING         = (1 << 8),
+  LAST_ACK        = (1 << 9),
+  TIME_WAIT       = (1 << 10)
+};
+
+// Combine two states into a bit mask (the result is generally not a single
+// named enumerator; it is meant for mask tests).
+inline tcp_state operator|(tcp_state s1, tcp_state s2) {
+  const auto lhs_bits = static_cast<uint16_t>(s1);
+  const auto rhs_bits = static_cast<uint16_t>(s2);
+  return static_cast<tcp_state>(lhs_bits | rhs_bits);
+}
+
+// Print the symbolic name of a single state; anything that is not exactly
+// one named enumerator (e.g. a combined mask) prints as "UNKNOWN".
+inline std::ostream & operator<<(std::ostream & str, const tcp_state& s) {
+  const char* label = "UNKNOWN";
+  switch (s) {
+    case tcp_state::CLOSED:       label = "CLOSED";       break;
+    case tcp_state::LISTEN:       label = "LISTEN";       break;
+    case tcp_state::SYN_SENT:     label = "SYN_SENT";     break;
+    case tcp_state::SYN_RECEIVED: label = "SYN_RECEIVED"; break;
+    case tcp_state::ESTABLISHED:  label = "ESTABLISHED";  break;
+    case tcp_state::FIN_WAIT_1:   label = "FIN_WAIT_1";   break;
+    case tcp_state::FIN_WAIT_2:   label = "FIN_WAIT_2";   break;
+    case tcp_state::CLOSE_WAIT:   label = "CLOSE_WAIT";   break;
+    case tcp_state::CLOSING:      label = "CLOSING";      break;
+    case tcp_state::LAST_ACK:     label = "LAST_ACK";     break;
+    case tcp_state::TIME_WAIT:    label = "TIME_WAIT";    break;
+    default:                                              break;
+  }
+  return str << label;
+}
+
+// TCP option handling: wire-format layouts of the supported options plus the
+// per-connection negotiation state filled in by parse() and used by fill().
+struct tcp_option {
+  // The kind and len field are fixed and defined in TCP protocol
+  enum class option_kind: uint8_t { mss = 2, win_scale = 3, sack = 4, timestamps = 8,  nop = 1, eol = 0 };
+  enum class option_len:  uint8_t { mss = 4, win_scale = 3, sack = 2, timestamps = 10, nop = 1, eol = 1 };
+  // Maximum segment size option (kind, len, 16-bit value); packed so it can
+  // be copied to/from the wire as-is.
+  struct mss {
+    option_kind kind = option_kind::mss;
+    option_len len = option_len::mss;
+    uint16_t mss;
+    // Returns a copy whose mss value has been passed through ::hton().
+    struct mss hton() {
+      struct mss m = *this;
+      m.mss = ::hton(m.mss);
+      return m;
+    }
+  } __attribute__((packed));
+  // Window scale option (kind, len, shift count).
+  struct win_scale {
+    option_kind kind = option_kind::win_scale;
+    option_len len = option_len::win_scale;
+    uint8_t shift;
+  } __attribute__((packed));
+  // SACK option header only (kind, len); it carries no payload here.
+  struct sack {
+    option_kind kind = option_kind::sack;
+    option_len len = option_len::sack;
+  } __attribute__((packed));
+  // Timestamps option (kind, len, two 32-bit values).
+  struct timestamps {
+    option_kind kind = option_kind::timestamps;
+    option_len len = option_len::timestamps;
+    uint32_t t1;
+    uint32_t t2;
+  } __attribute__((packed));
+  // Single-byte padding option.
+  struct nop {
+    option_kind kind = option_kind::nop;
+  } __attribute__((packed));
+  // Single-byte end-of-option-list marker.
+  struct eol {
+    option_kind kind = option_kind::eol;
+  } __attribute__((packed));
+  // NOTE(review): presumably the byte alignment the option area is padded
+  // to -- confirm against fill()/get_size(), which are defined elsewhere.
+  static const uint8_t align = 4;
+
+  // Defined out of line: parse options in [beg, end), write options into a
+  // header, and compute the option size for a given SYN/ACK combination.
+  void parse(uint8_t* beg, uint8_t* end);
+  uint8_t fill(tcp_hdr* th, uint8_t option_size);
+  uint8_t get_size(bool syn_on, bool ack_on);
+
+  // For option negotiation
+  bool _mss_received = false;
+  bool _win_scale_received = false;
+  bool _timestamps_received = false;
+  bool _sack_received = false;
+
+  // Option data
+  // NOTE(review): 536 is presumably the protocol's default MSS used until
+  // the peer announces one -- confirm.
+  uint16_t _remote_mss = 536;
+  // No default initializer: must be assigned before it is read.
+  uint16_t _local_mss;
+  uint8_t _remote_win_scale = 0;
+  uint8_t _local_win_scale = 0;
+};
+// Advance a byte cursor by the length of one option.
+inline uint8_t*& operator+=(uint8_t*& x, tcp_option::option_len len) {
+  x += static_cast<uint8_t>(len);
+  return x;
+}
+// Add the length of one option to a running byte count.
+inline uint8_t& operator+=(uint8_t& x, tcp_option::option_len len) {
+  x += static_cast<uint8_t>(len);
+  return x;
+}
+
+// Strongly typed wrapper around a 32-bit TCP sequence number, so that
+// sequence arithmetic and comparisons go through dedicated operators
+// instead of plain integer operations.
+struct tcp_sequence {
+  uint32_t raw;
+};
+
+// Byte-order conversion for sequence numbers, delegating to the integer
+// overloads of ::ntoh / ::hton.
+// Declared inline because they are defined in a header: without it every
+// translation unit including this file would emit its own external
+// definition, violating the one-definition rule and failing at link time.
+inline tcp_sequence ntoh(tcp_sequence ts) {
+  return tcp_sequence { ::ntoh(ts.raw) };
+}
+
+inline tcp_sequence hton(tcp_sequence ts) {
+  return tcp_sequence { ::hton(ts.raw) };
+}
+
+// Stream the raw 32-bit value of a sequence number.
+inline std::ostream& operator<<(std::ostream& os, const tcp_sequence& s) {
+  os << s.raw;
+  return os;
+}
+
+// Wrap a raw 32-bit value.
+inline tcp_sequence make_seq(uint32_t raw) {
+  return tcp_sequence{raw};
+}
+
+// Offset arithmetic; the underlying unsigned addition wraps modulo 2^32.
+inline tcp_sequence& operator+=(tcp_sequence& s, int32_t n) {
+  s.raw += n;
+  return s;
+}
+inline tcp_sequence& operator-=(tcp_sequence& s, int32_t n) {
+  s.raw -= n;
+  return s;
+}
+inline tcp_sequence operator+(tcp_sequence s, int32_t n) {
+  s += n;
+  return s;
+}
+inline tcp_sequence operator-(tcp_sequence s, int32_t n) {
+  s -= n;
+  return s;
+}
+
+// Signed distance from q to s: the unsigned difference reinterpreted as a
+// 32-bit signed value, which is what makes the ordering below wrap-aware.
+inline int32_t operator-(tcp_sequence s, tcp_sequence q) {
+  return s.raw - q.raw;
+}
+
+inline bool operator==(tcp_sequence s, tcp_sequence q) {
+  return s.raw == q.raw;
+}
+inline bool operator!=(tcp_sequence s, tcp_sequence q) {
+  return !(s == q);
+}
+
+// Ordering is defined through the signed distance rather than the raw values.
+inline bool operator<(tcp_sequence s, tcp_sequence q) {
+  return (s - q) < 0;
+}
+inline bool operator>(tcp_sequence s, tcp_sequence q) {
+  return q < s;
+}
+inline bool operator<=(tcp_sequence s, tcp_sequence q) {
+  return !(s > q);
+}
+inline bool operator>=(tcp_sequence s, tcp_sequence q) {
+  return !(s < q);
+}
+
+// Fixed part of the TCP header (20 bytes, options excluded). The struct is
+// packed so it can be overlaid on packet memory.
+// NOTE(review): the bitfield order (rsvd1 before data_offset, f_fin first)
+// presumably relies on the compiler allocating bitfields from the least
+// significant bit -- confirm before building for another ABI.
+struct tcp_hdr {
+  uint16_t src_port;
+  uint16_t dst_port;
+  tcp_sequence seq;
+  tcp_sequence ack;
+  uint8_t rsvd1 : 4;
+  // Header length in 32-bit words (callers multiply by 4 to get bytes)
+  uint8_t data_offset : 4;
+  uint8_t f_fin : 1;
+  uint8_t f_syn : 1;
+  uint8_t f_rst : 1;
+  uint8_t f_psh : 1;
+  uint8_t f_ack : 1;
+  uint8_t f_urg : 1;
+  uint8_t rsvd2 : 2;
+  uint16_t window;
+  uint16_t checksum;
+  uint16_t urgent;
+
+  // Returns a copy with every multi-byte field converted host -> network.
+  // The single-byte bitfields are copied unchanged.
+  tcp_hdr hton() {
+    tcp_hdr hdr = *this;
+    hdr.src_port = ::hton(src_port);
+    hdr.dst_port = ::hton(dst_port);
+    hdr.seq = ::hton(seq);
+    hdr.ack = ::hton(ack);
+    hdr.window = ::hton(window);
+    hdr.checksum = ::hton(checksum);
+    hdr.urgent = ::hton(urgent);
+    return hdr;
+  }
+
+  // Returns a copy with every multi-byte field converted network -> host.
+  // The single-byte bitfields are copied unchanged.
+  tcp_hdr ntoh() {
+    tcp_hdr hdr = *this;
+    hdr.src_port = ::ntoh(src_port);
+    hdr.dst_port = ::ntoh(dst_port);
+    hdr.seq = ::ntoh(seq);
+    hdr.ack = ::ntoh(ack);
+    hdr.window = ::ntoh(window);
+    hdr.checksum = ::ntoh(checksum);
+    hdr.urgent = ::ntoh(urgent);
+    return hdr;
+  }
+} __attribute__((packed));
+
+// Empty tag type that makes the TCP instantiation of packet_merger distinct.
+struct tcp_tag {};
+// Packet merger keyed by TCP sequence number.
+using tcp_packet_merger = packet_merger<tcp_sequence, tcp_tag>;
+
+template <typename InetTraits>
+class tcp {
+ public:
+  using ipaddr = typename InetTraits::address_type;
+  using inet_type = typename InetTraits::inet_type;
+  using connid = l4connid<InetTraits>;
+  using connid_hash = typename connid::connid_hash;
+  class connection;
+  class listener;
+ private:
+  class tcb;
+
+  // Event callback installed as tcb::delayed_ack_event. When it runs it
+  // resets the tcb's full-segment counter and triggers output().
+  // Holds a raw, non-owning pointer to the tcb.
+  class C_handle_delayed_ack : public EventCallback {
+    tcb *tc;
+
+   public:
+    C_handle_delayed_ack(tcb *t): tc(t) { }
+    // r is unused.
+    void do_request(uint64_t r) {
+      tc->_nr_full_seg_received = 0;
+      tc->output();
+    }
+  };
+
+  // Event callback installed as tcb::retransmit_event; forwards to
+  // tcb::retransmit(). Holds a raw, non-owning pointer to the tcb.
+  class C_handle_retransmit : public EventCallback {
+    tcb *tc;
+
+   public:
+    C_handle_retransmit(tcb *t): tc(t) { }
+    // r is unused.
+    void do_request(uint64_t r) {
+      tc->retransmit();
+    }
+  };
+
+  // Event callback installed as tcb::persist_event; forwards to
+  // tcb::persist(). Holds a raw, non-owning pointer to the tcb.
+  class C_handle_persist : public EventCallback {
+    tcb *tc;
+
+   public:
+    C_handle_persist(tcb *t): tc(t) { }
+    // r is unused.
+    void do_request(uint64_t r) {
+      tc->persist();
+    }
+  };
+
+  // Event callback installed as tcb::all_data_ack_event; forwards to
+  // tcb::close_final_cleanup(). Holds a raw, non-owning pointer to the tcb.
+  class C_all_data_acked : public EventCallback {
+    tcb *tc;
+
+   public:
+    C_all_data_acked(tcb *t): tc(t) {}
+    // fd_or_id is unused.
+    void do_request(uint64_t fd_or_id) {
+      tc->close_final_cleanup();
+    }
+  };
+
+  // One-shot callback that keeps a tcb alive until the event is dispatched:
+  // it holds a shared reference taken at construction and, when run, deletes
+  // itself, which releases that reference.
+  class C_actual_remove_tcb : public EventCallback {
+    lw_shared_ptr<tcb> tc;
+   public:
+    C_actual_remove_tcb(tcb *t): tc(t->shared_from_this()) {}
+    // r is unused. The object must have been allocated with new.
+    void do_request(uint64_t r) {
+      delete this;
+    }
+  };
+
+  class tcb : public enable_lw_shared_from_this<tcb> {
+    using clock_type = ceph::coarse_real_clock;
+    static constexpr tcp_state CLOSED         = tcp_state::CLOSED;
+    static constexpr tcp_state LISTEN         = tcp_state::LISTEN;
+    static constexpr tcp_state SYN_SENT       = tcp_state::SYN_SENT;
+    static constexpr tcp_state SYN_RECEIVED   = tcp_state::SYN_RECEIVED;
+    static constexpr tcp_state ESTABLISHED    = tcp_state::ESTABLISHED;
+    static constexpr tcp_state FIN_WAIT_1     = tcp_state::FIN_WAIT_1;
+    static constexpr tcp_state FIN_WAIT_2     = tcp_state::FIN_WAIT_2;
+    static constexpr tcp_state CLOSE_WAIT     = tcp_state::CLOSE_WAIT;
+    static constexpr tcp_state CLOSING        = tcp_state::CLOSING;
+    static constexpr tcp_state LAST_ACK       = tcp_state::LAST_ACK;
+    static constexpr tcp_state TIME_WAIT      = tcp_state::TIME_WAIT;
+    tcp_state _state = CLOSED;
+    tcp& _tcp;
+    UserspaceEventManager &manager;
+    connection* _conn = nullptr;
+    bool _connect_done = false;
+    ipaddr _local_ip;
+    ipaddr _foreign_ip;
+    uint16_t _local_port;
+    uint16_t _foreign_port;
+    // A segment that has been sent but not yet fully acknowledged; kept so
+    // it can be retransmitted.
+    struct unacked_segment {
+      // The segment's packet; its len() is what counts as in flight
+      Packet p;
+      // Amount returned to user_queue_space once the segment is fully acked
+      uint16_t data_len;
+      // Number of retransmissions so far (0 = sent only once)
+      unsigned nr_transmits;
+      // Transmission time, used as an RTT sample when the segment is acked
+      clock_type::time_point tx_time;
+    };
+    // Send-side state of the connection: sequence variables, the queues of
+    // unsent and unacknowledged data, RTT estimation and congestion control.
+    // Members without an initializer are expected to be set during
+    // connection setup before they are read.
+    struct send {
+      // Oldest sequence number not yet acknowledged by the peer
+      tcp_sequence unacknowledged;
+      // Next sequence number to be sent
+      tcp_sequence next;
+      // Peer's advertised receive window
+      uint32_t window;
+      uint8_t window_scale;
+      // Maximum segment size the peer can receive
+      uint16_t mss;
+      tcp_sequence urgent;
+      // Segment sequence / acknowledgment number used for last window update
+      tcp_sequence wl1;
+      tcp_sequence wl2;
+      // Initial send sequence number
+      tcp_sequence initial;
+      // Sent but unacknowledged segments, oldest first
+      std::deque<unacked_segment> data;
+      // Packets queued by the user but not yet sent
+      std::deque<Packet> unsent;
+      uint32_t unsent_len = 0;
+      uint32_t queued_len = 0;
+      bool closed = false;
+      // Id notified once all data has been acked; -1 while unset
+      int _all_data_acked_fd = -1;
+      // Limit number of data queued into send queue
+      Throttle user_queue_space;
+      // Round-trip time variation
+      std::chrono::microseconds rttvar;
+      // Smoothed round-trip time
+      std::chrono::microseconds srtt;
+      bool first_rto_sample = true;
+      clock_type::time_point syn_tx_time;
+      // Congestion window
+      uint32_t cwnd;
+      // Slow start threshold
+      uint32_t ssthresh;
+      // Duplicated ACKs
+      uint16_t dupacks = 0;
+      unsigned syn_retransmit = 0;
+      unsigned fin_retransmit = 0;
+      uint32_t limited_transfer = 0;
+      uint32_t partial_ack = 0;
+      tcp_sequence recover;
+      // While true, can_send() allows exactly one byte (zero-window probe)
+      bool window_probe = false;
+      // The user send queue is throttled to 81920 units.
+      send(CephContext *c): user_queue_space(c, "DPDK::tcp::tcb::user_queue_space", 81920) {}
+    } _snd;
+    // Receive-side state of the connection.
+    struct receive {
+      // Next sequence number expected from the peer
+      tcp_sequence next;
+      // Local receive window
+      uint32_t window;
+      uint8_t window_scale;
+      // Maximum segment size the local side can receive
+      uint16_t mss;
+      tcp_sequence urgent;
+      // Peer's initial sequence number
+      tcp_sequence initial;
+      // In-order data waiting to be read by the user
+      std::deque<Packet> data;
+      // Segments that arrived ahead of `next`, kept until the gap is filled
+      tcp_packet_merger out_of_order;
+    } _rcv;
+    // Event loop on which this tcb's timers are created
+    EventCenter *center;
+    // Id obtained from the event manager; notified on readable/writable
+    int fd;
+    // positive means no errno, 0 means eof, negative means error
+    int16_t _errno = 1;
+    tcp_option _option;
+    EventCallbackRef delayed_ack_event;
+    Tub<uint64_t> _delayed_ack_fd;
+    // Retransmission timeout
+    std::chrono::microseconds _rto{1000*1000};
+    std::chrono::microseconds _persist_time_out{1000*1000};
+    static constexpr std::chrono::microseconds _rto_min{1000*1000};
+    static constexpr std::chrono::microseconds _rto_max{60000*1000};
+    // Clock granularity
+    static constexpr std::chrono::microseconds _rto_clk_granularity{1000};
+    // A segment (or FIN) is retransmitted at most this many times before
+    // the connection is torn down with -ETIMEDOUT
+    static constexpr uint16_t _max_nr_retransmit{5};
+    // The *_event callbacks are owned by the tcb and deleted in ~tcb();
+    // each Tub holds the id of the pending timer, if any
+    EventCallbackRef retransmit_event;
+    Tub<uint64_t> retransmit_fd;
+    EventCallbackRef persist_event;
+    EventCallbackRef all_data_ack_event;
+    Tub<uint64_t> persist_fd;
+    // Reset to 0 by the delayed-ACK callback
+    uint16_t _nr_full_seg_received = 0;
+    // Secret key material for initial-sequence-number generation, filled
+    // with random 32-bit words on construction.
+    struct isn_secret {
+      // 512 bits secretkey for ISN generating
+      uint32_t key[16];
+      isn_secret () {
+        // Each word is drawn over the full uint32_t range
+        for (auto& k : key) {
+          k = ceph::util::generate_random_number<uint32_t>(0, std::numeric_limits<uint32_t>::max());
+        }
+      }
+    };
+    static isn_secret _isn_secret;
+    tcp_sequence get_isn();
+    circular_buffer<typename InetTraits::l4packet> _packetq;
+    bool _poll_active = false;
+   public:
+    // callback
+    void close_final_cleanup();
+    ostream& _prefix(std::ostream *_dout);
+
+   public:
+    tcb(tcp& t, connid id);
+    ~tcb();
+    void input_handle_listen_state(tcp_hdr* th, Packet p);
+    void input_handle_syn_sent_state(tcp_hdr* th, Packet p);
+    void input_handle_other_state(tcp_hdr* th, Packet p);
+    void output_one(bool data_retransmit = false);
+    bool is_all_data_acked();
+    int send(Packet p);
+    void connect();
+    Tub<Packet> read();
+    void close();
+    // Drop this connection's entry from the owning tcp's connection table,
+    // looked up by the (local ip, foreign ip, local port, foreign port) key.
+    void remove_from_tcbs() {
+      _tcp._tcbs.erase(connid{_local_ip, _foreign_ip, _local_port, _foreign_port});
+    }
+    Tub<typename InetTraits::l4packet> get_packet();
+    // Request that this tcb be polled for outgoing packets.
+    // At most one request is outstanding at a time (_poll_active). The
+    // peer's L2 address is resolved first; the outcome is handled in the
+    // callback:
+    //   r == 0          -> enqueue this tcb on the tcp's poll list
+    //   r == -ETIMEDOUT -> fail the connection only if still in SYN_SENT
+    //   r == -EBUSY     -> clear _poll_active and retry via the retransmit timer
+    void output() {
+      if (!_poll_active) {
+        _poll_active = true;
+
+        // The callback may outlive this call, so it holds a shared reference
+        auto tcb = this->shared_from_this();
+        _tcp._inet.wait_l2_dst_address(_foreign_ip, Packet(), [tcb] (const ethernet_address &dst, Packet p, int r) {
+          if (r == 0) {
+            // The capture is const inside this non-mutable lambda, so the
+            // std::move below results in a copy of the shared pointer
+            tcb->_tcp.poll_tcb(dst, std::move(tcb));
+          } else if (r == -ETIMEDOUT) {
+            // in other states connection should time out
+            // NOTE(review): _poll_active is not cleared on this path;
+            // confirm that later output() calls are not suppressed when the
+            // tcb is not in SYN_SENT.
+            if (tcb->in_state(SYN_SENT)) {
+              tcb->_errno = -ETIMEDOUT;
+              tcb->cleanup();
+            }
+          } else if (r == -EBUSY) {
+            // retry later
+            tcb->_poll_active = false;
+            tcb->start_retransmit_timer();
+          }
+        });
+      }
+    }
+
+    // Connection status: positive = no error, 0 = eof, negative = -errno.
+    int16_t get_errno() const {
+      return _errno;
+    }
+
+    // Mutable access to the current connection state.
+    tcp_state& state() {
+      return _state;
+    }
+
+    // Remaining room in the user send queue; 0 unless the connection is
+    // ESTABLISHED.
+    uint64_t peek_sent_available() {
+      if (in_state(ESTABLISHED)) {
+        return _snd.user_queue_space.get_max() - _snd.user_queue_space.get_current();
+      }
+      return 0;
+    }
+
+    int is_connected() const {
+      if (_errno <= 0)
+        return _errno;
+      return _connect_done;
+    }
+
+   private:
+    void respond_with_reset(tcp_hdr* th);
+    bool merge_out_of_order();
+    void insert_out_of_order(tcp_sequence seq, Packet p);
+    void trim_receive_data_after_window();
+    bool should_send_ack(uint16_t seg_len);
+    void clear_delayed_ack();
+    Packet get_transmit_packet();
+    // Emit one packet with output_one()'s data_retransmit flag set.
+    void retransmit_one() {
+      output_one(/* data_retransmit = */ true);
+    }
+    // (Re)arm the retransmission timer with the current RTO; a timer that is
+    // still pending is cancelled first.
+    void start_retransmit_timer() {
+      if (retransmit_fd) {
+        center->delete_time_event(*retransmit_fd);
+      }
+      retransmit_fd.construct(
+          center->create_time_event(_rto.count(), retransmit_event));
+    }
+    // Cancel the retransmission timer if one is pending.
+    void stop_retransmit_timer() {
+      if (!retransmit_fd) {
+        return;
+      }
+      center->delete_time_event(*retransmit_fd);
+      retransmit_fd.destroy();
+    }
+    // (Re)arm the persist timer with the current persist interval; a timer
+    // that is still pending is cancelled first.
+    void start_persist_timer() {
+      if (persist_fd) {
+        center->delete_time_event(*persist_fd);
+      }
+      persist_fd.construct(
+          center->create_time_event(_persist_time_out.count(), persist_event));
+    }
+    // Cancel the persist timer if one is pending.
+    void stop_persist_timer() {
+      if (!persist_fd) {
+        return;
+      }
+      center->delete_time_event(*persist_fd);
+      persist_fd.destroy();
+    }
+    void persist();
+    void retransmit();
+    void fast_retransmit();
+    void update_rto(clock_type::time_point tx_time);
+    void update_cwnd(uint32_t acked_bytes);
+    void cleanup();
+    // Number of bytes that may be sent right now, bounded by the peer's
+    // advertised window, the unsent backlog and the congestion window.
+    // Side effect: with 1 or 2 duplicate ACKs outstanding, the returned
+    // amount is added to _snd.limited_transfer.
+    uint32_t can_send() {
+      // A window probe always carries exactly one byte
+      if (_snd.window_probe) {
+        return 1;
+      }
+      // Room left in the advertised window, capped by what is queued
+      const uint32_t window_room = uint32_t(_snd.unacknowledged + _snd.window - _snd.next);
+      uint32_t allowed = std::min(window_room, _snd.unsent_len);
+      // ... and by the congestion window
+      allowed = std::min(_snd.cwnd, allowed);
+
+      if (_snd.dupacks >= 3) {
+        // RFC5681 Step 3.5
+        // At most one full-sized segment
+        allowed = std::min(uint32_t(_snd.mss), allowed);
+      } else if (_snd.dupacks == 1 || _snd.dupacks == 2) {
+        // RFC5681 Step 3.1
+        // Limited transmit: up to cwnd + 2 * smss in flight per RFC3042
+        const auto in_flight = flight_size();
+        const auto ceiling = _snd.cwnd + 2 * _snd.mss;
+        allowed = in_flight <= ceiling ? std::min(allowed, ceiling - in_flight) : 0;
+        _snd.limited_transfer += allowed;
+      }
+      return allowed;
+    }
+    // Total length of all sent-but-unacknowledged segments.
+    uint32_t flight_size() {
+      uint32_t in_flight = 0;
+      for (auto& seg : _snd.data) {
+        in_flight += seg.p.len();
+      }
+      return in_flight;
+    }
+    // Largest segment the local side can receive: the interface MTU minus
+    // the minimum TCP and IP header lengths.
+    uint16_t local_mss() {
+      return _tcp.get_hw_features().mtu - tcp_hdr_len_min - InetTraits::ip_hdr_len_min;
+    }
+    // Append a finished packet, addressed to the peer, to this tcb's
+    // outgoing packet queue.
+    void queue_packet(Packet p) {
+      _packetq.emplace_back(
+          typename InetTraits::l4packet{_foreign_ip, std::move(p)});
+    }
+    // Notify the event manager that this connection is readable.
+    void signal_data_received() {
+      manager.notify(fd, EVENT_READABLE);
+    }
+    // Notify the all-data-acked id, but only if one is registered and
+    // nothing is left unsent or queued.
+    void signal_all_data_acked() {
+      if (_snd._all_data_acked_fd >= 0 && _snd.unsent_len == 0 && _snd.queued_len == 0)
+        manager.notify(_snd._all_data_acked_fd, EVENT_READABLE);
+    }
+    // State-transition helpers. Each one sets _state and performs the
+    // actions tied to entering that state.
+
+    // Enter SYN_SENT, record when the SYN was sent, and trigger output.
+    void do_syn_sent() {
+      _state = SYN_SENT;
+      _snd.syn_tx_time = clock_type::now();
+      // Send <SYN> to remote
+      output();
+    }
+    // Enter SYN_RECEIVED, record when the SYN was sent, and trigger output.
+    void do_syn_received() {
+      _state = SYN_RECEIVED;
+      _snd.syn_tx_time = clock_type::now();
+      // Send <SYN,ACK> to remote
+      output();
+    }
+    // Enter ESTABLISHED: feed the SYN transmit time to update_rto(), mark
+    // the connect as done and notify the connection as readable and writable.
+    void do_established() {
+      _state = ESTABLISHED;
+      update_rto(_snd.syn_tx_time);
+      _connect_done = true;
+      manager.notify(fd, EVENT_READABLE|EVENT_WRITABLE);
+    }
+    // Connection reset: close, release the send throttle, clean up, then
+    // report -ECONNRESET to readers and to any all-data-acked waiter.
+    // _errno is assigned after cleanup() runs.
+    void do_reset() {
+      _state = CLOSED;
+      // Free packets to be sent which are waiting for user_queue_space
+      _snd.user_queue_space.reset();
+      cleanup();
+      _errno = -ECONNRESET;
+      manager.notify(fd, EVENT_READABLE);
+
+      if (_snd._all_data_acked_fd >= 0)
+        manager.notify(_snd._all_data_acked_fd, EVENT_READABLE);
+    }
+    // Enter TIME_WAIT; no wait timer exists yet, so cleanup runs immediately.
+    void do_time_wait() {
+      // FIXME: Implement TIME_WAIT state timer
+      _state = TIME_WAIT;
+      cleanup();
+    }
+    // Enter CLOSED and clean up.
+    void do_closed() {
+      _state = CLOSED;
+      cleanup();
+    }
+    // Pick the initial send sequence number and derive the send variables
+    // from it; `next` is one past the ISN.
+    void do_setup_isn() {
+      _snd.initial = get_isn();
+      _snd.unacknowledged = _snd.initial;
+      _snd.next = _snd.initial + 1;
+      _snd.recover = _snd.initial;
+    }
+    // The peer acknowledged our FIN: advance both send variables by one.
+    void do_local_fin_acked() {
+      _snd.unacknowledged += 1;
+      _snd.next += 1;
+    }
+    // True while in SYN_SENT or SYN_RECEIVED.
+    bool syn_needs_on() {
+      return in_state(SYN_SENT | SYN_RECEIVED);
+    }
+    // True when the local side has closed, everything queued has been sent,
+    // and the state is one of FIN_WAIT_1, CLOSING or LAST_ACK.
+    bool fin_needs_on() {
+      return in_state(FIN_WAIT_1 | CLOSING | LAST_ACK) && _snd.closed &&
+             _snd.unsent_len == 0 && _snd.queued_len == 0;
+    }
+    // True in every state except CLOSED, LISTEN and SYN_SENT.
+    bool ack_needs_on() {
+      return !in_state(CLOSED | LISTEN | SYN_SENT);
+    }
+    // True in CLOSING, TIME_WAIT, CLOSE_WAIT, LAST_ACK and CLOSED.
+    bool foreign_will_not_send() {
+      return in_state(CLOSING | TIME_WAIT | CLOSE_WAIT | LAST_ACK | CLOSED);
+    }
+    // True if the current state is one of the states in the given mask.
+    bool in_state(tcp_state state) {
+      return uint16_t(_state) & uint16_t(state);
+    }
+    // Reset the duplicate-ACK counter and the related transfer counters.
+    void exit_fast_recovery() {
+      _snd.dupacks = 0;
+      _snd.limited_transfer = 0;
+      _snd.partial_ack = 0;
+    }
+    uint32_t data_segment_acked(tcp_sequence seg_ack);
+    bool segment_acceptable(tcp_sequence seg_seq, unsigned seg_len);
+    void init_from_options(tcp_hdr* th, uint8_t* opt_start, uint8_t* opt_end);
+    friend class connection;
+
+    friend class C_handle_delayed_ack;
+    friend class C_handle_retransmit;
+    friend class C_handle_persist;
+    friend class C_all_data_acked;
+  };
+
+  CephContext *cct;
+  // ipv4_l4<ip_protocol_num::tcp>
+  inet_type& _inet;
+  EventCenter *center;
+  UserspaceEventManager &manager;
+  // All known connections, keyed by their 4-tuple
+  std::unordered_map<connid, lw_shared_ptr<tcb>, connid_hash> _tcbs;
+  // Registered listeners by local port (non-owning pointers)
+  std::unordered_map<uint16_t, listener*> _listening;
+  std::random_device _rd;
+  std::default_random_engine _e;
+  // Range from which connect() draws local source ports
+  std::uniform_int_distribution<uint16_t> _port_dist{41952, 65535};
+  // tcbs waiting to be polled for output, with the resolved L2 destination
+  circular_buffer<std::pair<lw_shared_ptr<tcb>, ethernet_address>> _poll_tcbs;
+  // queue for packets that do not belong to any tcb
+  // NOTE(review): typed with ipv4_traits rather than InetTraits -- confirm
+  // this is intended if another traits type is ever used.
+  circular_buffer<ipv4_traits::l4packet> _packetq;
+  // Limits the amount of data held in _packetq
+  Throttle _queue_space;
+ public:
+  // Move-only user-facing handle for one TCP connection. It shares ownership
+  // of the tcb and registers itself as the tcb's _conn back-pointer; the
+  // destructor clears that pointer and closes both directions.
+  // A moved-from connection holds no tcb and may only be destroyed or
+  // assigned to.
+  class connection {
+    lw_shared_ptr<tcb> _tcb;
+   public:
+    // tcbp must not be empty.
+    explicit connection(lw_shared_ptr<tcb> tcbp) : _tcb(std::move(tcbp)) { _tcb->_conn = this; }
+    connection(const connection&) = delete;
+    // Takes over x's tcb and re-points the tcb's back-pointer at this
+    // object. x may itself be moved-from (empty); in that case there is no
+    // tcb to update, matching the emptiness check in the destructor.
+    connection(connection&& x) noexcept : _tcb(std::move(x._tcb)) {
+      if (_tcb) {
+        _tcb->_conn = this;
+      }
+    }
+    ~connection();
+    void operator=(const connection&) = delete;
+    // Destroys the current state (closing the held connection, if any) and
+    // move-constructs from x in place.
+    connection& operator=(connection&& x) {
+      if (this != &x) {
+        this->~connection();
+        new (this) connection(std::move(x));
+      }
+      return *this;
+    }
+    // Event id of the underlying tcb.
+    int fd() const {
+      return _tcb->fd;
+    }
+    // Forwards the packet to the tcb's send().
+    int send(Packet p) {
+      return _tcb->send(std::move(p));
+    }
+    // Forwards to the tcb's read().
+    Tub<Packet> read() {
+      return _tcb->read();
+    }
+    // positive = no error, 0 = eof, negative = -errno
+    int16_t get_errno() const {
+      return _tcb->get_errno();
+    }
+    void close_read();
+    void close_write();
+    // Peer address as an AF_INET entity_addr_t. Only the IPv4 address and
+    // the family are filled in here; the port is not set.
+    entity_addr_t remote_addr() const {
+      entity_addr_t addr;
+      auto net_ip = _tcb->_foreign_ip.hton();
+      memcpy((void*)&addr.in4_addr().sin_addr.s_addr,
+             &net_ip, sizeof(addr.in4_addr().sin_addr.s_addr));
+      addr.set_family(AF_INET);
+      return addr;
+    }
+    // Remaining room in the send queue (0 unless established).
+    uint64_t peek_sent_available() {
+      return _tcb->peek_sent_available();
+    }
+    // <= 0: error/eof status; 1: connected; 0: still connecting.
+    int is_connected() const { return _tcb->is_connected(); }
+  };
+  // Passive-open endpoint for one local port. While listening it is
+  // registered in tcp::_listening under its port and owns an event id that
+  // is notified whenever a new connection is queued for accept().
+  class listener {
+    tcp& _tcp;
+    uint16_t _port;
+    // Event id; -1 while not listening
+    int _fd = -1;
+    int16_t _errno;
+    // Connections waiting to be accepted
+    queue<connection> _q;
+    // Maximum number of connections held in _q
+    size_t _q_max_length;
+
+   private:
+    // Only tcp::listen() creates listeners.
+    listener(tcp& t, uint16_t port, size_t queue_length)
+        : _tcp(t), _port(port), _errno(0), _q(), _q_max_length(queue_length) {
+    }
+   public:
+    listener(const listener&) = delete;
+    void operator=(const listener&) = delete;
+    // Transfers the registration, event id, pending connections and queue
+    // limit from x. x is left detached (_fd == -1) so that its destructor
+    // neither unregisters the port nor closes the event id that now belongs
+    // to this object.
+    listener(listener&& x)
+        : _tcp(x._tcp), _port(x._port), _fd(x._fd), _errno(x._errno),
+          _q(std::move(x._q)), _q_max_length(x._q_max_length) {
+      x._fd = -1;
+      if (_fd >= 0)
+        _tcp._listening[_port] = this;
+    }
+    ~listener() {
+      abort_accept();
+    }
+    // Start listening. Returns 0 on success or -EADDRINUSE if the port
+    // already has a listener.
+    int listen() {
+      if (_tcp._listening.find(_port) != _tcp._listening.end())
+        return -EADDRINUSE;
+      _tcp._listening.emplace(_port, this);
+      _fd = _tcp.manager.get_eventfd();
+      return 0;
+    }
+    // Pop the oldest pending connection; the result is empty if none waits.
+    Tub<connection> accept() {
+      Tub<connection> c;
+      if (!_q.empty()) {
+        c = std::move(_q.front());
+        _q.pop();
+      }
+      return c;
+    }
+    // Drop all pending connections and, if listening, unregister the port
+    // and close the event id.
+    void abort_accept() {
+      while (!_q.empty())
+        _q.pop();
+      if (_fd >= 0) {
+        _tcp._listening.erase(_port);
+        _tcp.manager.close(_fd);
+        _fd = -1;
+      }
+    }
+    int16_t get_errno() const {
+      return _errno;
+    }
+    // True when the accept queue has reached its limit.
+    bool full() const {
+      return _q.size() == _q_max_length;
+    }
+    int fd() const {
+      return _fd;
+    }
+    friend class tcp;
+  };
+ public:
+  explicit tcp(CephContext *c, inet_type& inet, EventCenter *cen);
+  void received(Packet p, ipaddr from, ipaddr to);
+  bool forward(forward_hash& out_hash_data, Packet& p, size_t off);
+  listener listen(uint16_t port, size_t queue_length = 100);
+  connection connect(const entity_addr_t &addr);
+  // Hardware features of the underlying interface, as reported by the
+  // lower layer.
+  const hw_features& get_hw_features() const { return _inet._inet.get_hw_features(); }
+  // Queue a tcb, together with the resolved L2 destination, to be polled
+  // for an outgoing packet by the packet provider.
+  void poll_tcb(const ethernet_address &dst, lw_shared_ptr<tcb> tcb) {
+    _poll_tcbs.emplace_back(std::move(tcb), dst);
+  }
+  // Hand a tcb to the listener on `port` as a new pending connection and
+  // notify the listener's event id. Returns false if nobody listens on the
+  // port or its accept queue is full.
+  bool push_listen_queue(uint16_t port, tcb *t) {
+    auto entry = _listening.find(port);
+    if (entry == _listening.end()) {
+      return false;
+    }
+    auto* target = entry->second;
+    if (target->full()) {
+      return false;
+    }
+    target->_q.push(connection(t->shared_from_this()));
+    manager.notify(target->_fd, EVENT_READABLE);
+    return true;
+  }
+
+ private:
+  void send_packet_without_tcb(ipaddr from, ipaddr to, Packet p);
+  void respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip);
+  friend class listener;
+};
+
+// Constructs the TCP layer on top of `inet` and registers the packet
+// provider that the lower layer calls to fetch the next outgoing packet.
+//
+// The provider draws from two sources: _packetq (packets without a tcb,
+// such as resets) and _poll_tcbs (connections that asked to be polled).
+// _packetq is served when no tcb is waiting, or whenever the poll counter
+// is a multiple of 128; otherwise waiting tcbs are polled until one yields
+// a packet.
+template <typename InetTraits>
+tcp<InetTraits>::tcp(CephContext *c, inet_type& inet, EventCenter *cen)
+    : cct(c), _inet(inet), center(cen),
+      manager(static_cast<DPDKDriver*>(cen->get_driver())->manager),
+      _e(_rd()), _queue_space(cct, "DPDK::tcp::queue_space", 81920) {
+  // Free-running poll counter. It is unsigned on purpose: it is incremented
+  // for every polled tcb for the lifetime of the stack, and unsigned
+  // wrap-around is well defined (and keeps the % 128 cadence intact),
+  // whereas overflowing a signed counter is undefined behaviour.
+  unsigned tcb_polled = 0u;
+  _inet.register_packet_provider([this, tcb_polled] () mutable {
+    Tub<typename InetTraits::l4packet> l4p;
+    auto c = _poll_tcbs.size();
+    if (!_packetq.empty() && (!(tcb_polled % 128) || c == 0)) {
+      l4p = std::move(_packetq.front());
+      _packetq.pop_front();
+      // Give back the space reserved when the packet was queued
+      _queue_space.put(l4p->p.len());
+    } else {
+      // Poll each tcb that was waiting when we started, at most once
+      while (c--) {
+        tcb_polled++;
+        lw_shared_ptr<tcb> tcb;
+        ethernet_address dst;
+        std::tie(tcb, dst) = std::move(_poll_tcbs.front());
+        _poll_tcbs.pop_front();
+        l4p = std::move(tcb->get_packet());
+        if (l4p) {
+          l4p->e_dst = dst;
+          break;
+        }
+      }
+    }
+    return l4p;
+  });
+}
+
+// Creates a listener object for `port` with the given accept-queue limit.
+// Nothing is registered here; the caller must invoke listener::listen().
+template <typename InetTraits>
+auto tcp<InetTraits>::listen(uint16_t port, size_t queue_length) -> listener {
+  return listener(*this, port, queue_length);
+}
+
+// Active open towards `addr`.
+// Draws random local ports until the resulting 4-tuple is unused and, when
+// the NIC has more than one hardware queue, hashes to this event center's
+// id. Then a tcb is created, registered and started, and a connection
+// handle for it is returned.
+template <typename InetTraits>
+typename tcp<InetTraits>::connection tcp<InetTraits>::connect(const entity_addr_t &addr) {
+  uint16_t src_port;
+  connid id;
+  auto src_ip = _inet._inet.host_address();
+  auto dst_ip = ipv4_address(addr);
+  auto dst_port = addr.get_port();
+
+  for (;;) {
+    src_port = _port_dist(_e);
+    id = connid{src_ip, dst_ip, src_port, (uint16_t)dst_port};
+    if (_tcbs.find(id) != _tcbs.end()) {
+      // 4-tuple already in use, draw another port
+      continue;
+    }
+    // With a single hardware queue any port will do
+    if (_inet._inet.netif()->hw_queues_count() == 1) {
+      break;
+    }
+    // Otherwise replies must be steered to this event center
+    if (_inet._inet.netif()->hash2cpu(
+            id.hash(_inet._inet.netif()->rss_key())) == center->get_id()) {
+      break;
+    }
+  }
+
+  auto tcbp = make_lw_shared<tcb>(*this, id);
+  _tcbs.insert({id, tcbp});
+  tcbp->connect();
+  return connection(tcbp);
+}
+
+// Contributes the TCP source and destination ports (as found in the header
+// at offset `off`) to the forwarding hash input. Always returns true, even
+// when no header could be obtained.
+template <typename InetTraits>
+bool tcp<InetTraits>::forward(forward_hash& out_hash_data, Packet& p, size_t off) {
+  if (auto th = p.get_header<tcp_hdr>(off)) {
+    out_hash_data.push_back(th->src_port);
+    out_hash_data.push_back(th->dst_port);
+  }
+  return true;
+}
+
+// Entry point for an incoming TCP segment addressed from `from` to `to`.
+// Validates the header and (unless offloaded) the checksum, then dispatches
+// the segment to the matching tcb, or handles it on behalf of a listener /
+// a closed port when no tcb exists.
+template <typename InetTraits>
+void tcp<InetTraits>::received(Packet p, ipaddr from, ipaddr to) {
+  auto th = p.get_header<tcp_hdr>(0);
+  if (!th) {
+    return;
+  }
+  // data_offset is a sub-byte field, so it is valid even before ntoh()
+  const bool header_too_short = unsigned(th->data_offset * 4) < sizeof(*th);
+  if (header_too_short) {
+    return;
+  }
+
+  // Verify the checksum in software unless the hardware already did
+  if (!get_hw_features().rx_csum_offload) {
+    checksummer csum;
+    InetTraits::tcp_pseudo_header_checksum(csum, from, to, p.len());
+    csum.sum(p);
+    if (csum.get() != 0) {
+      return;
+    }
+  }
+
+  auto h = th->ntoh();
+  auto id = connid{to, from, h.dst_port, h.src_port};
+  auto tcbi = _tcbs.find(id);
+
+  if (tcbi != _tcbs.end()) {
+    // An existing connection owns this 4-tuple
+    lw_shared_ptr<tcb> tcbp = tcbi->second;
+    if (tcbp->state() == tcp_state::SYN_SENT) {
+      // 3) In SYN_SENT State
+      tcbp->input_handle_syn_sent_state(&h, std::move(p));
+    } else {
+      // 4) In other state, can be one of the following:
+      // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2
+      // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT
+      tcbp->input_handle_other_state(&h, std::move(p));
+    }
+    return;
+  }
+
+  auto lsn = _listening.find(id.local_port);
+  if (lsn == _listening.end() || lsn->second->full()) {
+    // 1) In CLOSE state
+    // 1.1 all data in the incoming segment is discarded.  An incoming
+    // segment containing a RST is discarded. An incoming segment not
+    // containing a RST causes a RST to be sent in response.
+    // FIXME:
+    //      if ACK off: <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK>
+    //      if ACK on:  <SEQ=SEG.ACK><CTL=RST>
+    respond_with_reset(&h, id.local_ip, id.foreign_ip);
+    return;
+  }
+
+  // 2) In LISTEN state
+  // 2.1 first check for an RST: an incoming RST is ignored
+  if (h.f_rst) {
+    return;
+  }
+  // 2.2 second check for an ACK: any acknowledgment is bad if it arrives
+  // on a connection still in the LISTEN state.
+  // <SEQ=SEG.ACK><CTL=RST>
+  if (h.f_ack) {
+    respond_with_reset(&h, id.local_ip, id.foreign_ip);
+    return;
+  }
+  // 2.3 third check for a SYN
+  if (h.f_syn) {
+    // check the security
+    // NOTE: Ignored for now
+    lw_shared_ptr<tcb> tcbp = make_lw_shared<tcb>(*this, id);
+    _tcbs.insert({id, tcbp});
+    tcbp->input_handle_listen_state(&h, std::move(p));
+    return;
+  }
+  // 2.4 fourth other text or control
+  // Unlikely to get here; if so, the segment is dropped.
+}
+
+// Send a packet that does not belong to any tcb (e.g. a reset for an
+// unknown connection).
+//
+// Space for the packet is reserved in _queue_space; if the throttle is
+// exhausted the packet is dropped. Once the destination's L2 address is
+// resolved the packet is queued for the packet provider, which releases the
+// reservation when it dequeues the packet. If resolution fails (r != 0) the
+// packet is dropped and the reservation is released right here; otherwise
+// each failure would permanently shrink the throttle until every tcb-less
+// packet gets dropped.
+template <typename InetTraits>
+void tcp<InetTraits>::send_packet_without_tcb(ipaddr from, ipaddr to, Packet p) {
+  // Remember the reserved amount: p is moved away below
+  const auto reserved = p.len();
+  if (!_queue_space.get_or_fail(reserved)) {
+    // drop packets that do not fit the queue
+    return;
+  }
+  _inet.wait_l2_dst_address(to, std::move(p), [this, to, reserved] (const ethernet_address &e_dst, Packet p, int r) mutable {
+    if (r == 0) {
+      _packetq.emplace_back(ipv4_traits::l4packet{to, std::move(p), e_dst, ip_protocol_num::tcp});
+    } else {
+      // Never queued, so the provider will not release the space for us
+      _queue_space.put(reserved);
+    }
+  });
+}
+
+// Detaches from the tcb and closes both directions. A connection that no
+// longer holds a tcb (e.g. after being moved from) has nothing to do.
+template <typename InetTraits>
+tcp<InetTraits>::connection::~connection() {
+  if (!_tcb) {
+    return;
+  }
+  _tcb->_conn = nullptr;
+  close_read();
+  close_write();
+}
+
+// Builds the control block for connection `id` owned by `t`.
+// Copies the 4-tuple, obtains an event id from the event manager, and
+// allocates the four event callbacks (delayed ack, retransmit, persist,
+// all-data-acked). The callbacks keep a raw pointer to this tcb and are
+// deleted in the destructor.
+template <typename InetTraits>
+tcp<InetTraits>::tcb::tcb(tcp& t, connid id)
+    : _tcp(t), manager(t.manager), _local_ip(id.local_ip) , _foreign_ip(id.foreign_ip),
+      _local_port(id.local_port), _foreign_port(id.foreign_port),
+      _snd(_tcp.cct),
+      center(t.center),
+      fd(t.manager.get_eventfd()),
+      delayed_ack_event(new tcp<InetTraits>::C_handle_delayed_ack(this)),
+      retransmit_event(new tcp<InetTraits>::C_handle_retransmit(this)),
+      persist_event(new tcp<InetTraits>::C_handle_persist(this)),
+      all_data_ack_event(new tcp<InetTraits>::C_all_data_acked(this)) {}
+
+// Tears the control block down: pending timers are cancelled first so that
+// none of them can still reference the callbacks, then the callbacks are
+// deleted and the event id is closed.
+template <typename InetTraits>
+tcp<InetTraits>::tcb::~tcb()
+{
+  if (_delayed_ack_fd)
+    center->delete_time_event(*_delayed_ack_fd);
+  if (retransmit_fd)
+    center->delete_time_event(*retransmit_fd);
+  if (persist_fd)
+    center->delete_time_event(*persist_fd);
+  delete delayed_ack_event;
+  delete retransmit_event;
+  delete persist_event;
+  delete all_data_ack_event;
+  manager.close(fd);
+  fd = -1;
+}
+
+// Answers the received header `rth` with a reset, using this connection's
+// local and foreign addresses; the work is delegated to the owning tcp.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::respond_with_reset(tcp_hdr* rth)
+{
+  _tcp.respond_with_reset(rth, _local_ip, _foreign_ip);
+}
+
+// Processes an acknowledgment up to `seg_ack` against the queue of
+// unacknowledged segments and returns the number of bytes newly acked.
+//
+// Fully covered segments are removed from the queue: SND.UNA advances, the
+// congestion window is updated, the segment's data_len is returned to the
+// user send throttle and the connection is signalled writable. If the ACK
+// ends inside a segment, that segment is trimmed at the front and SND.UNA
+// is moved to seg_ack.
+template <typename InetTraits>
+uint32_t tcp<InetTraits>::tcb::data_segment_acked(tcp_sequence seg_ack) {
+  uint32_t total_acked_bytes = 0;
+  // Full ACK of segment
+  while (!_snd.data.empty()
+         && (_snd.unacknowledged + _snd.data.front().p.len() <= seg_ack)) {
+    auto acked_bytes = _snd.data.front().p.len();
+    _snd.unacknowledged += acked_bytes;
+    // Ignore retransmitted segments when setting the RTO
+    if (_snd.data.front().nr_transmits == 0) {
+      update_rto(_snd.data.front().tx_time);
+    }
+    update_cwnd(acked_bytes);
+    total_acked_bytes += acked_bytes;
+    _snd.user_queue_space.put(_snd.data.front().data_len);
+    manager.notify(fd, EVENT_WRITABLE);
+    _snd.data.pop_front();
+  }
+  // Partial ACK of segment
+  if (_snd.unacknowledged < seg_ack) {
+    // Signed distance between two sequence numbers; positive here because
+    // of the comparison above
+    auto acked_bytes = seg_ack - _snd.unacknowledged;
+    // The queue may be empty at this point; then only SND.UNA is advanced
+    if (!_snd.data.empty()) {
+      auto& unacked_seg = _snd.data.front();
+      unacked_seg.p.trim_front(acked_bytes);
+    }
+    _snd.unacknowledged = seg_ack;
+    update_cwnd(acked_bytes);
+    total_acked_bytes += acked_bytes;
+  }
+  return total_acked_bytes;
+}
+
+// Acceptability test for an incoming segment, driven by the segment length
+// and the current receive window:
+//
+//   SEG.LEN  RCV.WND  acceptable when
+//   -------  -------  ----------------------------------------------
+//      0        0     SEG.SEQ = RCV.NXT
+//      0       >0     RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
+//     >0        0     never
+//     >0       >0     first or last byte of the segment lies in the
+//                     window [RCV.NXT, RCV.NXT+RCV.WND)
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::segment_acceptable(tcp_sequence seg_seq, unsigned seg_len) {
+  if (seg_len == 0) {
+    if (_rcv.window == 0) {
+      // SEG.SEQ = RCV.NXT
+      return seg_seq == _rcv.next;
+    }
+    if (_rcv.window > 0) {
+      // RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
+      return (_rcv.next <= seg_seq) && (seg_seq < _rcv.next + _rcv.window);
+    }
+    return false;
+  }
+
+  if (_rcv.window > 0) {
+    // Segment start inside the window ...
+    const bool starts_inside =
+      (_rcv.next <= seg_seq) && seg_seq < (_rcv.next + _rcv.window);
+    // ... or segment end (SEG.SEQ+SEG.LEN-1) inside the window.
+    const bool ends_inside =
+      (_rcv.next <= seg_seq + seg_len - 1) && (seg_seq + seg_len - 1 < _rcv.next + _rcv.window);
+    return starts_inside || ends_inside;
+  }
+
+  // SEG.LEN > 0 RCV.WND = 0, not acceptable
+  return false;
+}
+
+// Initialises send/receive parameters from the TCP options found between
+// `opt_start` and `opt_end` and from the header `th` they came with:
+// window scale factors, MSS values, both windows, the window-update
+// bookkeeping (wl1/wl2), the initial congestion window and ssthresh.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::init_from_options(tcp_hdr* th, uint8_t* opt_start, uint8_t* opt_end) {
+  _option.parse(opt_start, opt_end);
+
+  // Window scale factors: remote receive scale governs what we may send,
+  // local receive scale governs what we advertise.
+  _snd.window_scale = _option._remote_win_scale;
+  _rcv.window_scale = _option._local_win_scale;
+
+  // MSS: what the remote can receive / what we can receive.
+  _snd.mss = _option._remote_mss;
+  _rcv.mss = _option._local_mss = local_mss();
+
+  // Receive window starts at Linux's default size; the send window is the
+  // peer's advertised window, scaled.
+  _rcv.window = 29200 << _rcv.window_scale;
+  _snd.window = th->window << _snd.window_scale;
+
+  // Sequence / acknowledgment numbers used for the last window update.
+  _snd.wl1 = th->seq;
+  _snd.wl2 = th->ack;
+
+  // Initial congestion window: 4, 3 or 2 segments depending on the MSS
+  // (MSS <= 1095, MSS <= 2190, larger).
+  int initial_segments;
+  if (_snd.mss <= 1095) {
+    initial_segments = 4;
+  } else if (_snd.mss <= 2190) {
+    initial_segments = 3;
+  } else {
+    initial_segments = 2;
+  }
+  _snd.cwnd = initial_segments * _snd.mss;
+
+  // Initial slow start threshold: the peer's scaled window.
+  _snd.ssthresh = th->window << _snd.window_scale;
+}
+
+// Takes the next chunk of unsent data off the send queue.
+//
+// Returns an empty Packet when nothing is queued.  Otherwise the amount that
+// may be sent is the minimum of can_send() and the largest payload the NIC
+// accepts (max_packet_len with TSO, otherwise MTU and the peer's MSS, each
+// less the minimal TCP and IP headers).  The head of the queue is then either
+// handed out whole, or split so that only the sendable prefix is returned and
+// the remainder stays queued.  _snd.unsent_len is reduced by the returned
+// length.
+//
+// The former "merge several queued packets" tail was unreachable: the two
+// branches below (`<=` and `>`) cover every case.  Merging stays disabled
+// because of the following issue noted by the original authors:
+//   FIXME: this will result in calling "deleter" of packet which free
+//   managed objects will used later
+template <typename InetTraits>
+Packet tcp<InetTraits>::tcb::get_transmit_packet() {
+  // easy case: empty queue
+  if (_snd.unsent.empty()) {
+    return Packet();
+  }
+  auto can_send = this->can_send();
+  // Max number of TCP payloads we can pass to NIC
+  uint32_t len;
+  if (_tcp.get_hw_features().tx_tso) {
+    // FIXME: Inform tap device the size of the split packet
+    len = _tcp.get_hw_features().max_packet_len - tcp_hdr_len_min - InetTraits::ip_hdr_len_min;
+  } else {
+    len = std::min(uint16_t(_tcp.get_hw_features().mtu - tcp_hdr_len_min - InetTraits::ip_hdr_len_min), _snd.mss);
+  }
+  can_send = std::min(can_send, len);
+
+  auto& head = _snd.unsent.front();
+  // easy case: the head packet fits entirely
+  if (head.len() <= can_send) {
+    auto p = std::move(head);
+    _snd.unsent.pop_front();
+    _snd.unsent_len -= p.len();
+    return p;
+  }
+
+  // otherwise: split the head packet, keep the tail queued
+  auto p = head.share(0, can_send);
+  head.trim_front(can_send);
+  _snd.unsent_len -= p.len();
+  return p;
+}
+
+// Builds one outgoing segment and queues it for transmission.
+//
+// With `data_retransmit` set, the payload is a shared view of the oldest
+// unacknowledged segment and the sequence number is _snd.unacknowledged.
+// Otherwise the payload comes from get_transmit_packet() and _snd.next is
+// advanced by the payload length.  The function fills in the TCP header and
+// options, prepares the checksum (partial pseudo-header sum when L4 checksum
+// offload is available, full sum otherwise), records new data segments in the
+// retransmission queue and arms the retransmit timer when needed.
+// Does nothing when the connection is in state CLOSED.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::output_one(bool data_retransmit) {
+  if (in_state(CLOSED)) {
+    return;
+  }
+
+  Packet p = data_retransmit ? _snd.data.front().p.share() : get_transmit_packet();
+  Packet clone = p.share();  // early clone to prevent share() from calling packet::unuse_internal_data() on header.
+  uint16_t len = p.len();
+  bool syn_on = syn_needs_on();
+  bool ack_on = ack_needs_on();
+
+  auto options_size = _option.get_size(syn_on, ack_on);
+  auto th = p.prepend_header<tcp_hdr>(options_size);
+
+  th->src_port = _local_port;
+  th->dst_port = _foreign_port;
+
+  th->f_syn = syn_on;
+  th->f_ack = ack_on;
+  if (ack_on) {
+    // This segment carries the ACK, so a pending delayed ACK is cancelled.
+    clear_delayed_ack();
+  }
+  th->f_urg = false;
+  th->f_psh = false;
+
+  tcp_sequence seq;
+  if (data_retransmit) {
+    seq = _snd.unacknowledged;
+  } else {
+    seq = syn_on ? _snd.initial : _snd.next;
+    _snd.next += len;
+  }
+  th->seq = seq;
+  th->ack = _rcv.next;
+  // Header length (fixed header plus options) expressed in 4-byte units.
+  th->data_offset = (sizeof(*th) + options_size) / 4;
+  th->window = _rcv.window >> _rcv.window_scale;
+  th->checksum = 0;
+
+  // FIXME: does the FIN have to fit in the window?
+  bool fin_on = fin_needs_on();
+  th->f_fin = fin_on;
+
+  // Add tcp options
+  _option.fill(th, options_size);
+  *th = th->hton();
+
+  offload_info oi;
+  checksummer csum;
+  uint16_t pseudo_hdr_seg_len = 0;
+
+  oi.tcp_hdr_len = sizeof(tcp_hdr) + options_size;
+
+  if (_tcp.get_hw_features().tx_csum_l4_offload) {
+    oi.needs_csum = true;
+
+    //
+    // tx checksum offloading: both virtio-net's VIRTIO_NET_F_CSUM and dpdk's
+    // PKT_TX_TCP_CKSUM require th->checksum to be initialized to the ones'
+    // complement sum of the pseudo header.
+    //
+    // For TSO the csum should be calculated for a pseudo header with
+    // segment length set to 0. All the rest is the same as for a TCP Tx
+    // CSUM offload case.
+    //
+    if (_tcp.get_hw_features().tx_tso && len > _snd.mss) {
+      oi.tso_seg_size = _snd.mss;
+    } else {
+      pseudo_hdr_seg_len = sizeof(*th) + options_size + len;
+    }
+  } else {
+    pseudo_hdr_seg_len = sizeof(*th) + options_size + len;
+    oi.needs_csum = false;
+  }
+
+  InetTraits::tcp_pseudo_header_checksum(csum, _local_ip, _foreign_ip,
+                                         pseudo_hdr_seg_len);
+
+  if (_tcp.get_hw_features().tx_csum_l4_offload) {
+    // Offload: store the complement of the pseudo-header sum only.
+    th->checksum = ~csum.get();
+  } else {
+    // No offload: sum the whole packet in software.
+    csum.sum(p);
+    th->checksum = csum.get();
+  }
+
+  oi.protocol = ip_protocol_num::tcp;
+
+  p.set_offload_info(oi);
+
+  if (!data_retransmit && (len || syn_on || fin_on)) {
+    auto now = clock_type::now();
+    if (len) {
+      // Keep the header-less clone so the data can be retransmitted later.
+      unsigned nr_transmits = 0;
+      _snd.data.emplace_back(unacked_segment{std::move(clone),
+                                             len, nr_transmits, now});
+    }
+    if (!retransmit_fd) {
+      start_retransmit_timer();
+    }
+  }
+
+  queue_packet(std::move(p));
+}
+
+// True when the retransmission queue is empty and both the unsent and the
+// queued byte counters are zero.
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::is_all_data_acked() {
+  return _snd.data.empty() && _snd.unsent_len == 0 && _snd.queued_len == 0;
+}
+
+// Drains the receive queue.  Returns an unset Tub when no data is pending;
+// otherwise a single Packet made of every queued packet appended in order,
+// leaving the receive queue empty.
+template <typename InetTraits>
+Tub<Packet> tcp<InetTraits>::tcb::read() {
+  Tub<Packet> merged;
+  if (!_rcv.data.empty()) {
+    merged.construct();
+    for (auto it = _rcv.data.begin(); it != _rcv.data.end(); ++it) {
+      merged->append(std::move(*it));
+    }
+    _rcv.data.clear();
+  }
+  return merged;
+}
+
+// Queues packet `p` for transmission and starts output if the connection is
+// currently allowed to send.
+//
+// Returns the packet length on success, or -ECONNRESET when the connection
+// is in state CLOSED.  Asserts that the send side has not been closed, and
+// aborts when the user queue does not have room for the packet (the caller
+// must ensure enough queue space).
+template <typename InetTraits>
+int tcp<InetTraits>::tcb::send(Packet p) {
+  // We can not send after the connection is closed
+  ceph_assert(!_snd.closed);
+
+  if (in_state(CLOSED)) {
+    return -ECONNRESET;
+  }
+
+  auto nbytes = p.len();
+  const bool reserved = _snd.user_queue_space.get_or_fail(nbytes);
+  if (!reserved) {
+    // note: caller must ensure enough queue space to send
+    ceph_abort();
+  }
+  // TODO: Handle p.len() > max user_queue_space case
+  _snd.queued_len += nbytes;
+  _snd.unsent_len += nbytes;
+  _snd.queued_len -= nbytes;
+  _snd.unsent.push_back(std::move(p));
+
+  const bool may_transmit = can_send() > 0;
+  if (may_transmit) {
+    output();
+  }
+  return nbytes;
+}
+
+// Closes the sending side of the connection.  No-op if the connection is
+// already CLOSED or the send side was closed before.
+//
+// Sets the connection error to -EPIPE and removes the fd's file events.
+// If all data is already acknowledged the final cleanup runs immediately;
+// otherwise a fresh event fd is registered so that all_data_ack_event fires
+// when it becomes readable.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::close() {
+  if (in_state(CLOSED) || _snd.closed) {
+    return ;
+  }
+  // TODO: We should make this asynchronous
+
+  _errno = -EPIPE;
+  center->delete_file_event(fd, EVENT_READABLE|EVENT_WRITABLE);
+
+  if (is_all_data_acked()) {
+    close_final_cleanup();
+    return ;
+  }
+
+  // Data still in flight: defer the cleanup to the all-data-acked event.
+  _snd._all_data_acked_fd = manager.get_eventfd();
+  center->create_file_event(_snd._all_data_acked_fd, EVENT_READABLE, all_data_ack_event);
+}
+
+// Decides whether a segment of `seg_len` payload bytes must be acknowledged
+// right away (returns true) or may be covered by a delayed ACK (returns
+// false, arming the delayed-ACK timer if it is not already running).
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::should_send_ack(uint16_t seg_len) {
+  // We've received a TSO packet, do ack immediately
+  if (seg_len > _rcv.mss) {
+    _nr_full_seg_received = 0;
+    clear_delayed_ack();
+    return true;
+  }
+
+  // We've received a full sized segment, ack for every second full sized segment
+  if (seg_len == _rcv.mss && _nr_full_seg_received++ >= 1) {
+    _nr_full_seg_received = 0;
+    clear_delayed_ack();
+    return true;
+  }
+
+  // Arm the timer unless it is already armed and its callback hasn't run.
+  // The maximum delayed ack timer allowed by RFC1122 is 500ms, most
+  // implementations use 200ms.
+  if (!_delayed_ack_fd) {
+    _delayed_ack_fd.construct(center->create_time_event(200*1000, delayed_ack_event));
+  }
+  return false;
+}
+
+// Cancels the delayed-ACK timer if one is armed and forgets its id.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::clear_delayed_ack() {
+  if (!_delayed_ack_fd) {
+    return;
+  }
+  center->delete_time_event(*_delayed_ack_fd);
+  _delayed_ack_fd.destroy();
+}
+
+// Moves segments from the out-of-order map into the in-order receive queue
+// once the gap in front of them has been filled.
+//
+// Walks the map from its first entry: a segment that overlaps or starts at
+// _rcv.next is trimmed to its new part, appended to _rcv.data and advances
+// _rcv.next; a segment lying entirely before _rcv.next is dropped; the walk
+// stops at the first segment that starts after _rcv.next.
+//
+// Returns true if at least one segment was merged.
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::merge_out_of_order() {
+  bool merged = false;
+  if (_rcv.out_of_order.map.empty()) {
+    return merged;
+  }
+  for (auto it = _rcv.out_of_order.map.begin(); it != _rcv.out_of_order.map.end();) {
+    auto& p = it->second;
+    auto seg_beg = it->first;
+    auto seg_len = p.len();
+    auto seg_end = seg_beg + seg_len;
+    if (seg_beg <= _rcv.next && seg_end > _rcv.next) {
+      // This segment has been received out of order and its previous
+      // segment has been received now
+      auto trim = _rcv.next - seg_beg;
+      if (trim) {
+        // Discard the bytes that were already delivered.
+        p.trim_front(trim);
+        seg_len -= trim;
+      }
+      _rcv.next += seg_len;
+      _rcv.data.push_back(std::move(p));
+      // Since c++11, erase() always returns the value of the following element
+      it = _rcv.out_of_order.map.erase(it);
+      merged = true;
+    } else if (_rcv.next >= seg_end) {
+      // This segment has been received already, drop it
+      it = _rcv.out_of_order.map.erase(it);
+    } else {
+      // seg_beg > _rcv.next, can not merge. Note, seg_beg can grow only,
+      // so we can stop looking here.
+      it++;
+      break;
+    }
+  }
+  return merged;
+}
+
+// Stores packet `p`, whose first byte has sequence number `seg`, in the
+// out-of-order container by delegating to its merge() method.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::insert_out_of_order(tcp_sequence seg, Packet p) {
+  _rcv.out_of_order.merge(seg, std::move(p));
+}
+
+// Not implemented: calling this terminates the process via abort().
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::trim_receive_data_after_window() {
+  abort();
+}
+
+// Retransmits the oldest unacknowledged segment, if there is one, bumping
+// its transmit counter first, and then kicks output.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::fast_retransmit() {
+  if (_snd.data.empty()) {
+    return;
+  }
+  ++_snd.data.front().nr_transmits;
+  retransmit_one();
+  output();
+}
+
+// Recomputes the retransmission timeout from a new round-trip sample
+// (now - tx_time), following RFC6298:
+//
+//   first sample:   SRTT <- R,  RTTVAR <- R/2
+//   later samples:  RTTVAR <- 3/4 * RTTVAR + 1/4 * |SRTT - R'|
+//                   SRTT   <- 7/8 * SRTT   + 1/8 * R'
+//   RTO <- SRTT + max(G, 4 * RTTVAR), clamped to [_rto_min, _rto_max]
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::update_rto(clock_type::time_point tx_time) {
+  const auto sample = std::chrono::duration_cast<std::chrono::microseconds>(clock_type::now() - tx_time);
+
+  if (!_snd.first_rto_sample) {
+    // alpha = 1/8, beta = 1/4; RTTVAR is updated with the old SRTT.
+    const auto deviation = _snd.srtt > sample ? (_snd.srtt - sample) : (sample - _snd.srtt);
+    _snd.rttvar = _snd.rttvar * 3 / 4 + deviation / 4;
+    _snd.srtt = _snd.srtt * 7 / 8 +  sample / 8;
+  } else {
+    _snd.first_rto_sample = false;
+    _snd.rttvar = sample / 2;
+    _snd.srtt = sample;
+  }
+
+  // RTO <- SRTT + max(G, K * RTTVAR)
+  _rto =  _snd.srtt + std::max(_rto_clk_granularity, 4 * _snd.rttvar);
+
+  // Keep _rto within [_rto_min, _rto_max]
+  _rto = std::min(std::max(_rto, _rto_min), _rto_max);
+}
+
+// Grows the congestion window after `acked_bytes` were acknowledged.
+// Below ssthresh (slow start) the window grows by min(acked_bytes, SMSS);
+// otherwise (congestion avoidance) by SMSS*SMSS/cwnd, but at least 1 byte.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::update_cwnd(uint32_t acked_bytes) {
+  const uint32_t smss = _snd.mss;
+  const bool slow_start = _snd.cwnd < _snd.ssthresh;
+  if (!slow_start) {
+    // In congestion avoidance phase
+    const uint32_t at_least = 1;
+    const uint32_t growth = std::max(at_least, smss * smss / _snd.cwnd);
+    _snd.cwnd += growth;
+    return;
+  }
+  // In slow start phase
+  _snd.cwnd += std::min(acked_bytes, smss);
+}
+
+
+// Drops all connection state: signals readability on the connection fd,
+// marks the send side closed, empties the send, retransmission, out-of-order
+// and receive queues, stops the retransmit and delayed-ACK timers, schedules
+// a C_actual_remove_tcb callback on the event center and unregisters this
+// tcb via remove_from_tcbs().
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::cleanup() {
+  manager.notify(fd, EVENT_READABLE);
+  _snd.closed = true;
+  _snd.unsent.clear();
+  _snd.data.clear();
+  _rcv.out_of_order.map.clear();
+  _rcv.data.clear();
+  stop_retransmit_timer();
+  clear_delayed_ack();
+  // The callback object is handed to the event center for external dispatch.
+  center->dispatch_event_external(new tcp<InetTraits>::C_actual_remove_tcb(this));
+  remove_from_tcbs();
+}
+
+// Generates the initial sequence number for this connection.
+//
+// Per RFC6528, TCP SHOULD generate its Initial Sequence Numbers
+// with the expression:
+//   ISN = M + F(localip, localport, remoteip, remoteport, secretkey)
+//   M is the 4 microsecond timer
+//
+// F is an MD5 digest over the connection 4-tuple followed by the secret
+// key; the first 32 bits of the digest are used.  Previously only the secret
+// key was hashed (the 4-tuple written into the buffer was overwritten by the
+// digest without ever being fed to MD5), so F did not depend on the
+// connection at all.
+template <typename InetTraits>
+tcp_sequence tcp<InetTraits>::tcb::get_isn() {
+  using namespace std::chrono;
+  uint32_t hash[4];
+  hash[0] = _local_ip.ip;
+  hash[1] = _foreign_ip.ip;
+  hash[2] = (_local_port << 16) + _foreign_port;
+  hash[3] = _isn_secret.key[15];
+  ceph::crypto::MD5 md5;
+  // Hash the connection identity first, then the secret key.
+  md5.Update(reinterpret_cast<const unsigned char*>(hash), sizeof(hash));
+  md5.Update(reinterpret_cast<const unsigned char*>(_isn_secret.key), sizeof(_isn_secret.key));
+  // The 16-byte digest replaces the input words in `hash`.
+  md5.Final(reinterpret_cast<unsigned char*>(hash));
+  auto seq = hash[0];
+  // M: a counter that advances once every 4 microseconds.
+  auto m = duration_cast<microseconds>(clock_type::now().time_since_epoch());
+  seq += m.count() / 4;
+  return make_seq(seq);
+}
+
+// Hands the next queued L4 packet to the caller.
+//
+// Clears the poll-active flag, builds one segment via output_one() if the
+// packet queue is empty, and returns an unset Tub when the connection is
+// CLOSED.  Otherwise the front packet is popped and returned; output() is
+// called again when more packets are queued or more data may be sent.
+template <typename InetTraits>
+Tub<typename InetTraits::l4packet> tcp<InetTraits>::tcb::get_packet() {
+  _poll_active = false;
+  if (_packetq.empty()) {
+    output_one();
+  }
+
+  Tub<typename InetTraits::l4packet> p;
+  if (in_state(CLOSED)) {
+    return p;
+  }
+
+  // Past this point a packet is expected to be queued.
+  ceph_assert(!_packetq.empty());
+
+  p = std::move(_packetq.front());
+  _packetq.pop_front();
+  if (!_packetq.empty() || (_snd.dupacks < 3 && can_send() > 0)) {
+    // If there are packets to send in the queue or tcb is allowed to send
+    // more add tcp back to polling set to keep sending. In addition, dupacks >= 3
+    // is an indication that a segment is lost, stop sending more in this case.
+    output();
+  }
+  return p;
+}
+
+// Closing the read side is intentionally a no-op; the notification that
+// used to be sent here is left commented out.
+template <typename InetTraits>
+void tcp<InetTraits>::connection::close_read() {
+  // do nothing
+  // _tcb->manager.notify(_tcb->fd, EVENT_READABLE);
+}
+
+// Closes the write side by delegating to the control block's close().
+template <typename InetTraits>
+void tcp<InetTraits>::connection::close_write() {
+  _tcb->close();
+}
+
+// Out-of-class definitions of tcb's static data members.  The constexpr
+// members receive their values at their in-class declarations; these lines
+// only provide the namespace-scope definitions.
+template <typename InetTraits>
+constexpr uint16_t tcp<InetTraits>::tcb::_max_nr_retransmit;
+
+template <typename InetTraits>
+constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_min;
+
+template <typename InetTraits>
+constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_max;
+
+template <typename InetTraits>
+constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_clk_granularity;
+
+// Secret key used by get_isn(); one instance per tcp<InetTraits> instantiation.
+template <typename InetTraits>
+typename tcp<InetTraits>::tcb::isn_secret tcp<InetTraits>::tcb::_isn_secret;
+
+
+#endif /* TCP_HH_ */
diff --git a/src/msg/async/dpdk/UserspaceEvent.cc b/src/msg/async/dpdk/UserspaceEvent.cc
new file mode 100644
index 00000000..282dcef1
--- /dev/null
+++ b/src/msg/async/dpdk/UserspaceEvent.cc
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "UserspaceEvent.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdk "
+
+// Allocates a userspace event fd.  A previously released fd is reused when
+// one is available; otherwise the next fd number is taken and the fd table
+// is resized to hold it.  The slot must be empty and receives a fresh
+// UserspaceFDImpl.  Returns the fd.
+int UserspaceEventManager::get_eventfd()
+{
+  int fd;
+  if (unused_fds.empty()) {
+    fd = ++max_fd;
+    fds.resize(fd + 1);
+  } else {
+    fd = unused_fds.front();
+    unused_fds.pop_front();
+  }
+
+  Tub<UserspaceFDImpl> &slot = fds[fd];
+  ceph_assert(!slot);
+  slot.construct();
+  ldout(cct, 20) << __func__ << " fd=" << fd << dendl;
+  return fd;
+}
+
+// Marks the events in `mask` as active on `fd`.
+//
+// Returns -ENOENT when `fd` is out of range or not allocated, 0 otherwise.
+// If the fd is not yet on the waiting list and is listening for any of the
+// notified events, it is appended to the waiting list so the next poll()
+// reports it.
+int UserspaceEventManager::notify(int fd, int mask)
+{
+  ldout(cct, 20) << __func__ << " fd=" << fd << " mask=" << mask << dendl;
+  if ((size_t)fd >= fds.size())
+    return -ENOENT;
+
+  Tub<UserspaceFDImpl> &impl = fds[fd];
+  if (!impl)
+    return -ENOENT;
+
+  ldout(cct, 20) << __func__ << " activing=" << int(impl->activating_mask)
+                 << " listening=" << int(impl->listening_mask)
+                 << " waiting_idx=" << int(impl->waiting_idx) << dendl;
+
+  impl->activating_mask |= mask;
+  // Already queued for poll(): nothing more to do.
+  if (impl->waiting_idx)
+    return 0;
+
+  if (impl->listening_mask & mask) {
+    // The slot written below is max_wait_idx + 1 (slot 0 is reserved), so
+    // the table must grow as soon as that index would reach its size.
+    if (waiting_fds.size() <= max_wait_idx + 1)
+      waiting_fds.resize(waiting_fds.size()*2);
+    impl->waiting_idx = ++max_wait_idx;
+    waiting_fds[max_wait_idx] = fd;
+  }
+
+  ldout(cct, 20) << __func__ << " activing=" << int(impl->activating_mask)
+                 << " listening=" << int(impl->listening_mask)
+                 << " waiting_idx=" << int(impl->waiting_idx) << " done " << dendl;
+  return 0;
+}
+
+// Releases `fd`.  Silently returns when the fd is out of range or not
+// allocated.  The highest fd is given back by lowering max_fd; any other fd
+// goes onto the reuse list.  If the fd is on the waiting list, its slot is
+// invalidated (set to -1) so poll() skips it.
+void UserspaceEventManager::close(int fd)
+{
+  ldout(cct, 20) << __func__ << " fd=" << fd << dendl;
+  if ((size_t)fd >= fds.size())
+    return ;
+
+  Tub<UserspaceFDImpl> &impl = fds[fd];
+  if (!impl)
+    return ;
+
+  if (fd == max_fd)
+    --max_fd;
+  else
+    unused_fds.push_back(fd);
+
+  // Membership of the waiting list is recorded in waiting_idx (0 means "not
+  // waiting"), so test that field directly, exactly as unlisten() does,
+  // instead of inferring it from activating_mask.
+  if (impl->waiting_idx) {
+    if (waiting_fds[max_wait_idx] == fd) {
+      ceph_assert(impl->waiting_idx == max_wait_idx);
+      --max_wait_idx;
+    }
+    waiting_fds[impl->waiting_idx] = -1;
+  }
+  impl.destroy();
+}
+
+// Collects up to `num_events` ready fds from the waiting list.
+//
+// For every reported fd, events[k] receives the fd and masks[k] the events
+// that are both listened for and active; those events are then cleared from
+// the fd's active mask and the fd leaves the waiting list.  Slots holding -1
+// (invalidated by unlisten()/close()) are skipped.  `tp` is not used.
+// Returns the number of entries written.
+//
+// When the scan stops early, the unprocessed tail of the waiting list is
+// moved to the front.  The moved fds must have their waiting_idx rewritten
+// to the new position; otherwise a later unlisten()/close() would invalidate
+// the wrong slot and leave a stale entry behind.
+int UserspaceEventManager::poll(int *events, int *masks, int num_events, struct timeval *tp)
+{
+  int fd;
+  uint32_t i = 0;
+  int count = 0;
+  ceph_assert(num_events);
+  // leave zero slot for waiting_fds
+  while (i < max_wait_idx) {
+    fd = waiting_fds[++i];
+    if (fd == -1)
+      continue;
+
+    events[count] = fd;
+    Tub<UserspaceFDImpl> &impl = fds[fd];
+    ceph_assert(impl);
+    masks[count] = impl->listening_mask & impl->activating_mask;
+    ceph_assert(masks[count]);
+    ldout(cct, 20) << __func__ << " fd=" << fd << " mask=" << masks[count] << dendl;
+    impl->activating_mask &= (~masks[count]);
+    impl->waiting_idx = 0;
+    if (++count >= num_events)
+      break;
+  }
+  if (i < max_wait_idx) {
+    const uint32_t remaining = max_wait_idx - i;
+    memmove(&waiting_fds[1], &waiting_fds[i+1], sizeof(int)*remaining);
+    // Re-point every moved fd at its new slot.
+    for (uint32_t idx = 1; idx <= remaining; ++idx) {
+      const int moved_fd = waiting_fds[idx];
+      if (moved_fd == -1)
+        continue;
+      Tub<UserspaceFDImpl> &moved = fds[moved_fd];
+      ceph_assert(moved);
+      moved->waiting_idx = idx;
+    }
+  }
+  max_wait_idx -= i;
+  return count;
+}
diff --git a/src/msg/async/dpdk/UserspaceEvent.h b/src/msg/async/dpdk/UserspaceEvent.h
new file mode 100644
index 00000000..7e89517d
--- /dev/null
+++ b/src/msg/async/dpdk/UserspaceEvent.h
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_USERSPACEEVENT_H
+#define CEPH_USERSPACEEVENT_H
+
+#include <cstddef>
+#include <errno.h>
+#include <string.h>
+
+#include <vector>
+#include <list>
+
+#include "include/ceph_assert.h"
+#include "include/int_types.h"
+#include "common/Tub.h"
+
+class CephContext;
+
+// In-process replacement for kernel event fds.
+//
+// Each allocated fd has a UserspaceFDImpl holding the events a consumer
+// listens for (listening_mask) and the events that have been signalled
+// (activating_mask).  An fd with at least one event in both masks is placed
+// on the waiting list (waiting_fds), from which poll() reports it.
+// Slot 0 of waiting_fds is never used, so waiting_idx == 0 means "not on the
+// waiting list"; valid entries occupy slots 1..max_wait_idx.
+class UserspaceEventManager {
+  struct UserspaceFDImpl {
+    uint32_t waiting_idx = 0;     // slot in waiting_fds, 0 when not waiting
+    int16_t read_errno = 0;
+    int16_t write_errno = 0;
+    int8_t listening_mask = 0;    // events the consumer asked for
+    int8_t activating_mask = 0;   // events signalled via notify()
+    uint32_t magic = 4921;        // sanity marker verified by check()
+  };
+  CephContext *cct;
+  int max_fd = 0;                 // highest fd number handed out
+  uint32_t max_wait_idx = 0;      // last used slot of waiting_fds
+  std::vector<Tub<UserspaceFDImpl> > fds;
+  std::vector<int> waiting_fds;
+  std::list<uint32_t> unused_fds; // released fds available for reuse
+
+ public:
+  explicit UserspaceEventManager(CephContext *c): cct(c) {
+    waiting_fds.resize(1024);
+  }
+
+  int get_eventfd();
+
+  // Adds `mask` to the events listened for on `fd`.  If any listened event
+  // is already active and the fd is not yet waiting, it is put on the
+  // waiting list.  Returns -ENOENT for an unknown fd, 0 otherwise.
+  int listen(int fd, int mask) {
+    if ((size_t)fd >= fds.size())
+      return -ENOENT;
+
+    Tub<UserspaceFDImpl> &impl = fds[fd];
+    if (!impl)
+      return -ENOENT;
+
+    impl->listening_mask |= mask;
+    if (impl->activating_mask & impl->listening_mask && !impl->waiting_idx) {
+      // The slot written below is max_wait_idx + 1, so grow as soon as that
+      // index would reach the current size.
+      if (waiting_fds.size() <= max_wait_idx + 1)
+        waiting_fds.resize(waiting_fds.size()*2);
+      impl->waiting_idx = ++max_wait_idx;
+      waiting_fds[max_wait_idx] = fd;
+    }
+    return 0;
+  }
+
+  // Removes `mask` from the events listened for on `fd`.  If no listened
+  // event remains active and the fd is waiting, its waiting slot is
+  // invalidated.  Returns -ENOENT for an unknown fd, 0 otherwise.
+  int unlisten(int fd, int mask) {
+    if ((size_t)fd >= fds.size())
+      return -ENOENT;
+
+    Tub<UserspaceFDImpl> &impl = fds[fd];
+    if (!impl)
+      return -ENOENT;
+
+    impl->listening_mask &= (~mask);
+    if (!(impl->activating_mask & impl->listening_mask) && impl->waiting_idx) {
+      // Shrink the list when the fd occupies its last slot.
+      if (waiting_fds[max_wait_idx] == fd) {
+        ceph_assert(impl->waiting_idx == max_wait_idx);
+        --max_wait_idx;
+      }
+      waiting_fds[impl->waiting_idx] = -1;
+      impl->waiting_idx = 0;
+    }
+    return 0;
+  }
+
+  int notify(int fd, int mask);
+  void close(int fd);
+  int poll(int *events, int *masks, int num_events, struct timeval *tp);
+
+  // Returns false if any allocated fd record has a corrupted magic value.
+  bool check() {
+    for (auto &&m : fds) {
+      if (m && m->magic != 4921)
+        return false;
+    }
+    return true;
+  }
+};
+
+#endif //CEPH_USERSPACEEVENT_H
diff --git a/src/msg/async/dpdk/align.h b/src/msg/async/dpdk/align.h
new file mode 100644
index 00000000..3b48f789
--- /dev/null
+++ b/src/msg/async/dpdk/align.h
@@ -0,0 +1,50 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_DPDK_ALIGN_HH_
+#define CEPH_MSG_DPDK_ALIGN_HH_
+
+#include <cstdint>
+#include <cstdlib>
+
+// Rounds `v` up to the next multiple of `align`.  The bit trick used here
+// gives a multiple of `align` only when `align` is a power of two.
+template <typename T>
+inline constexpr T align_up(T v, T align) {
+  const auto low_bits = align - 1;
+  return (v + low_bits) & ~low_bits;
+}
+
+// Pointer overload: rounds a byte pointer up to an `align`-byte boundary.
+// Restricted to element types of size 1.
+template <typename T>
+inline constexpr T* align_up(T* v, size_t align) {
+  static_assert(sizeof(T) == 1, "align byte pointers only");
+  const auto address = reinterpret_cast<uintptr_t>(v);
+  return reinterpret_cast<T*>(align_up(address, align));
+}
+
+// Rounds `v` down to a multiple of `align` by clearing the low bits; this
+// gives a multiple of `align` only when `align` is a power of two.
+template <typename T>
+inline constexpr T align_down(T v, T align) {
+  const auto low_bits = align - 1;
+  return v & ~low_bits;
+}
+
+// Pointer overload: rounds a byte pointer down to an `align`-byte boundary.
+// Restricted to element types of size 1.
+template <typename T>
+inline constexpr T* align_down(T* v, size_t align) {
+  static_assert(sizeof(T) == 1, "align byte pointers only");
+  const auto address = reinterpret_cast<uintptr_t>(v);
+  return reinterpret_cast<T*>(align_down(address, align));
+}
+
+#endif /* CEPH_MSG_DPDK_ALIGN_HH_ */
diff --git a/src/msg/async/dpdk/array_map.h b/src/msg/async/dpdk/array_map.h
new file mode 100644
index 00000000..40f7728d
--- /dev/null
+++ b/src/msg/async/dpdk/array_map.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_ARRAY_MAP_HH_
+#define CEPH_ARRAY_MAP_HH_
+
+#include <array>
+#include <cstddef>
+#include <initializer_list>
+#include <stdexcept>
+#include <string>
+#include <utility>
+
+// unordered_map implemented as a simple array
+
+// Fixed-capacity map from small integer keys (0 .. Max-1) to Value, backed
+// by a std::array.  Every slot exists from construction on and is
+// value-initialised until assigned.
+template <typename Value, size_t Max>
+class array_map {
+  std::array<Value, Max> _a {};
+ public:
+  // Builds the map from {key, value} pairs.  Later pairs overwrite earlier
+  // ones with the same key.  Throws std::out_of_range if a key is >= Max
+  // (previously such a key wrote outside the array).
+  array_map(std::initializer_list<std::pair<size_t, Value>> i) {
+    for (const auto& kv : i) {
+      at(kv.first) = kv.second;
+    }
+  }
+  // Unchecked access; `key` must be < Max.
+  Value& operator[](size_t key) { return _a[key]; }
+  const Value& operator[](size_t key) const { return _a[key]; }
+
+  // Checked access; throws std::out_of_range when `key` >= Max.
+  Value& at(size_t key) {
+    if (key >= Max) {
+      throw std::out_of_range(std::to_string(key) + " >= " + std::to_string(Max));
+    }
+    return _a[key];
+  }
+  const Value& at(size_t key) const {
+    if (key >= Max) {
+      throw std::out_of_range(std::to_string(key) + " >= " + std::to_string(Max));
+    }
+    return _a[key];
+  }
+};
+
+#endif /* ARRAY_MAP_HH_ */
diff --git a/src/msg/async/dpdk/byteorder.h b/src/msg/async/dpdk/byteorder.h
new file mode 100644
index 00000000..a996ec07
--- /dev/null
+++ b/src/msg/async/dpdk/byteorder.h
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_BYTEORDER_H_
+#define CEPH_MSG_BYTEORDER_H_
+
+#include <arpa/inet.h>  // for ntohs() and friends
+#include <iosfwd>
+#include <utility>
+
+inline uint64_t ntohq(uint64_t v) {
+  return __builtin_bswap64(v);
+}
+inline uint64_t htonq(uint64_t v) {
+  return __builtin_bswap64(v);
+}
+
+inline void ntoh() {}
+inline void hton() {}
+
+inline uint8_t ntoh(uint8_t x) { return x; }
+inline uint8_t hton(uint8_t x) { return x; }
+inline uint16_t ntoh(uint16_t x) { return ntohs(x); }
+inline uint16_t hton(uint16_t x) { return htons(x); }
+inline uint32_t ntoh(uint32_t x) { return ntohl(x); }
+inline uint32_t hton(uint32_t x) { return htonl(x); }
+inline uint64_t ntoh(uint64_t x) { return ntohq(x); }
+inline uint64_t hton(uint64_t x) { return htonq(x); }
+
+inline int8_t ntoh(int8_t x) { return x; }
+inline int8_t hton(int8_t x) { return x; }
+inline int16_t ntoh(int16_t x) { return ntohs(x); }
+inline int16_t hton(int16_t x) { return htons(x); }
+inline int32_t ntoh(int32_t x) { return ntohl(x); }
+inline int32_t hton(int32_t x) { return htonl(x); }
+inline int64_t ntoh(int64_t x) { return ntohq(x); }
+inline int64_t hton(int64_t x) { return htonq(x); }
+
+#endif /* CEPH_MSG_BYTEORDER_H_ */
diff --git a/src/msg/async/dpdk/capture.h b/src/msg/async/dpdk/capture.h
new file mode 100644
index 00000000..1ace8eeb
--- /dev/null
+++ b/src/msg/async/dpdk/capture.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_DPDK_CAPTURE_H
+#define CEPH_MSG_DPDK_CAPTURE_H
+
+#include <utility>
+
+// Callable wrapper that stores a value `x` next to a callable `f` and, when
+// invoked, calls f(x, args...).
+//
+// T and F are stored exactly as given, so they are reference members when
+// the template arguments are reference types.  Construction from a
+// non-const lvalue capture_impl is deleted.
+template <typename T, typename F>
+class capture_impl {
+  T x;
+  F f;
+ public:
+  capture_impl(capture_impl &) = delete;
+  // The parameters shadow the members; each member is initialised from the
+  // forwarded parameter of the same name.
+  capture_impl( T && x, F && f )
+      : x{std::forward<T>(x)}, f{std::forward<F>(f)}
+  {}
+
+  // Invokes the stored callable with the stored value followed by `args`.
+  // The return type is whatever f(x, args...) yields.
+  template <typename ...Ts> auto operator()( Ts&&...args )
+  -> decltype(f( x, std::forward<Ts>(args)... ))
+  {
+    return f( x, std::forward<Ts>(args)... );
+  }
+
+  // Const-qualified counterpart of the call operator above.
+  template <typename ...Ts> auto operator()( Ts&&...args ) const
+  -> decltype(f( x, std::forward<Ts>(args)... ))
+  {
+    return f( x, std::forward<Ts>(args)... );
+  }
+};
+
+// Factory for capture_impl: deduces T and F from the arguments and forwards
+// both into the wrapper, which later invokes f(x, args...).
+template <typename T, typename F>
+capture_impl<T,F> capture( T && x, F && f ) {
+  using wrapper_type = capture_impl<T,F>;
+  return wrapper_type(std::forward<T>(x), std::forward<F>(f));
+}
+
+#endif //CEPH_MSG_DPDK_CAPTURE_H
diff --git a/src/msg/async/dpdk/circular_buffer.h b/src/msg/async/dpdk/circular_buffer.h
new file mode 100644
index 00000000..2c92c120
--- /dev/null
+++ b/src/msg/async/dpdk/circular_buffer.h
@@ -0,0 +1,347 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_CIRCULAR_BUFFER_HH_
+#define CEPH_CIRCULAR_BUFFER_HH_
+
+// A growable double-ended queue container that can be efficiently
+// extended (and shrunk) from both ends.  Implementation is a single
+// storage vector.
+//
+// Similar to libstdc++'s std::deque, except that it uses a single level
+// store, and so is more efficient for simple stored items.
+// Similar to boost::circular_buffer_space_optimized, except it uses
+// uninitialized storage for unoccupied elements (and thus move/copy
+// constructors instead of move/copy assignments, which are less efficient).
+
+#include <memory>
+#include <algorithm>
+
+#include "transfer.h"
+
+template <typename T, typename Alloc = std::allocator<T>>
+class circular_buffer {
+  struct impl : Alloc {
+    T* storage = nullptr;
+    // begin, end interpreted (mod capacity)
+    size_t begin = 0;
+    size_t end = 0;
+    size_t capacity = 0;
+  };
+  impl _impl;
+ public:
+  using value_type = T;
+  using size_type = size_t;
+  using reference = T&;
+  using pointer = T*;
+  using const_reference = const T&;
+  using const_pointer = const T*;
+ public:
+  circular_buffer() = default;
+  circular_buffer(circular_buffer&& X);
+  circular_buffer(const circular_buffer& X) = delete;
+  ~circular_buffer();
+  circular_buffer& operator=(const circular_buffer&) = delete;
+  circular_buffer& operator=(circular_buffer&&) = delete;
+  void push_front(const T& data);
+  void push_front(T&& data);
+  template <typename... A>
+  void emplace_front(A&&... args);
+  void push_back(const T& data);
+  void push_back(T&& data);
+  template <typename... A>
+  void emplace_back(A&&... args);
+  T& front();
+  T& back();
+  void pop_front();
+  void pop_back();
+  bool empty() const;
+  size_t size() const;
+  size_t capacity() const;
+  T& operator[](size_t idx);
+  template <typename Func>
+  void for_each(Func func);
+  // access an element, may return wrong or destroyed element
+  // only useful if you do not rely on data accuracy (e.g. prefetch)
+  T& access_element_unsafe(size_t idx);
+ private:
+  void expand();
+  void maybe_expand(size_t nr = 1);
+  size_t mask(size_t idx) const;
+  // Random-access iterator: idx is a free-running index, masked on dereference.
+  template<typename CB, typename ValueType>
+  struct cbiterator : std::iterator<std::random_access_iterator_tag, ValueType> {
+    typedef std::iterator<std::random_access_iterator_tag, ValueType> super_t;
+    // idx is unsigned and may wrap (push_front decrements begin), see operator<.
+    ValueType& operator*() const { return cb->_impl.storage[cb->mask(idx)]; }
+    ValueType* operator->() const { return &cb->_impl.storage[cb->mask(idx)]; }
+    // prefix
+    cbiterator<CB, ValueType>& operator++() {
+      idx++;
+      return *this;
+    }
+    // postfix
+    cbiterator<CB, ValueType> operator++(int unused) {
+      auto v = *this;
+      idx++;
+      return v;
+    }
+    // prefix
+    cbiterator<CB, ValueType>& operator--() {
+      idx--;
+      return *this;
+    }
+    // postfix
+    cbiterator<CB, ValueType> operator--(int unused) {
+      auto v = *this;
+      idx--;
+      return v;
+    }
+    cbiterator<CB, ValueType> operator+(typename super_t::difference_type n) const {
+      return cbiterator<CB, ValueType>(cb, idx + n);
+    }
+    cbiterator<CB, ValueType> operator-(typename super_t::difference_type n) const {
+      return cbiterator<CB, ValueType>(cb, idx - n);
+    }
+    cbiterator<CB, ValueType>& operator+=(typename super_t::difference_type n) {
+      idx += n;
+      return *this;
+    }
+    cbiterator<CB, ValueType>& operator-=(typename super_t::difference_type n) {
+      idx -= n;
+      return *this;
+    }
+    bool operator==(const cbiterator<CB, ValueType>& rhs) const {
+      return idx == rhs.idx;
+    }
+    bool operator!=(const cbiterator<CB, ValueType>& rhs) const {
+      return idx != rhs.idx;
+    }
+    bool operator<(const cbiterator<CB, ValueType>& rhs) const {
+      return *this - rhs < 0;  // signed distance: raw idx compare breaks once begin wrapped
+    }
+    bool operator>(const cbiterator<CB, ValueType>& rhs) const {
+      return *this - rhs > 0;
+    }
+    bool operator>=(const cbiterator<CB, ValueType>& rhs) const {
+      return *this - rhs >= 0;
+    }
+    bool operator<=(const cbiterator<CB, ValueType>& rhs) const {
+      return *this - rhs <= 0;
+    }
+    typename super_t::difference_type operator-(const cbiterator<CB, ValueType>& rhs) const {
+      return idx - rhs.idx;  // modular difference, converted to the signed distance
+    }
+   private:
+    CB* cb;
+    size_t idx;
+    cbiterator<CB, ValueType>(CB* b, size_t i) : cb(b), idx(i) {}
+    friend class circular_buffer;
+  };
+  friend class iterator;
+
+ public:
+  typedef cbiterator<circular_buffer, T> iterator;
+  typedef cbiterator<const circular_buffer, const T> const_iterator;
+
+  iterator begin() {
+    return iterator(this, _impl.begin);
+  }
+  const_iterator begin() const {
+    return const_iterator(this, _impl.begin);
+  }
+  iterator end() {
+    return iterator(this, _impl.end);
+  }
+  const_iterator end() const {
+    return const_iterator(this, _impl.end);
+  }
+  const_iterator cbegin() const {
+    return const_iterator(this, _impl.begin);
+  }
+  const_iterator cend() const {
+    return const_iterator(this, _impl.end);
+  }
+};
+
+template <typename T, typename Alloc>
+inline size_t circular_buffer<T, Alloc>::mask(size_t idx) const {
+  return idx & (_impl.capacity - 1);  // idx mod capacity, valid while capacity is a power of two
+}
+
+template <typename T, typename Alloc>
+inline bool circular_buffer<T, Alloc>::empty() const {
+  return size() == 0;  // begin and end coincide exactly when no element is stored
+}
+
+template <typename T, typename Alloc>
+inline size_t circular_buffer<T, Alloc>::size() const {
+  return _impl.end - _impl.begin;  // unsigned difference of the two free-running indices
+}
+
+template <typename T, typename Alloc>
+inline size_t circular_buffer<T, Alloc>::capacity() const {
+  return _impl.capacity;  // number of slots in the current storage allocation
+}
+
+template <typename T, typename Alloc>
+inline circular_buffer<T, Alloc>::circular_buffer(circular_buffer&& x)
+    : _impl(std::move(x._impl)) {  // take over x's storage pointer, indices and capacity
+  x._impl = {};  // put x back into the default state (storage == nullptr, all counters 0)
+}
+
+template <typename T, typename Alloc>
+template <typename Func>
+inline void circular_buffer<T, Alloc>::for_each(Func func) {
+  T* const slots = _impl.storage;
+  const size_t wrap = _impl.capacity - 1;  // same mask that mask() applies
+  for (size_t pos = _impl.begin; pos != _impl.end; ++pos) {
+    func(slots[pos & wrap]);  // visit stored elements from front to back
+  }
+}
+
+template <typename T, typename Alloc>
+inline circular_buffer<T, Alloc>::~circular_buffer() {
+  for_each([this] (T& obj) {
+    _impl.destroy(&obj);  // destroy every element still stored
+  });
+  _impl.deallocate(_impl.storage, _impl.capacity);  // then hand the storage back to the allocator
+}
+
+template <typename T, typename Alloc>
+void circular_buffer<T, Alloc>::expand() {
+  auto new_cap = std::max<size_t>(_impl.capacity * 2, 1);  // double; an empty buffer grows to 1
+  auto new_storage = _impl.allocate(new_cap);
+  auto p = new_storage;
+  try {
+    for_each([this, &p] (T& obj) {
+      transfer_pass1(_impl, &obj, p);  // pass 1 (transfer.h) runs under try: a throw is rolled back below
+      p++;
+    });
+  } catch (...) {
+    while (p != new_storage) {
+      _impl.destroy(--p);  // undo what pass 1 already placed into new_storage
+    }
+    _impl.deallocate(new_storage, new_cap);
+    throw;
+  }
+  p = new_storage;
+  for_each([this, &p] (T& obj) {
+    transfer_pass2(_impl, &obj, p++);  // pass 2 (transfer.h) over the same source/destination pairs
+  });
+  std::swap(_impl.storage, new_storage);  // from here new_storage/new_cap name the OLD buffer
+  std::swap(_impl.capacity, new_cap);
+  _impl.begin = 0;  // elements were packed from slot 0 in logical order
+  _impl.end = p - _impl.storage;
+  _impl.deallocate(new_storage, new_cap);  // release the old buffer
+}
+
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::maybe_expand(size_t nr) {
+  // Grow until nr more elements fit; one doubling may not be enough for nr > 1.
+  while (_impl.end - _impl.begin + nr > _impl.capacity)
+    expand();
+}
+
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::push_front(const T& data) {
+  maybe_expand();  // make sure one more slot is available
+  auto p = &_impl.storage[mask(_impl.begin - 1)];  // slot just before the current front
+  _impl.construct(p, data);  // copy-construct in place
+  --_impl.begin;  // move the front index only after construction returned
+}
+
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::push_front(T&& data) {
+  maybe_expand();  // make sure one more slot is available
+  auto p = &_impl.storage[mask(_impl.begin - 1)];  // slot just before the current front
+  _impl.construct(p, std::move(data));  // move-construct in place
+  --_impl.begin;  // move the front index only after construction returned
+}
+
+template <typename T, typename Alloc>
+template <typename... Args>
+inline void circular_buffer<T, Alloc>::emplace_front(Args&&... args) {
+  maybe_expand();  // make sure one more slot is available
+  auto p = &_impl.storage[mask(_impl.begin - 1)];  // slot just before the current front
+  _impl.construct(p, std::forward<Args>(args)...);  // build the element from args in place
+  --_impl.begin;  // move the front index only after construction returned
+}
+
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::push_back(const T& data) {
+  maybe_expand();  // make sure one more slot is available
+  auto p = &_impl.storage[mask(_impl.end)];  // first slot past the current back
+  _impl.construct(p, data);  // copy-construct in place
+  ++_impl.end;  // move the end index only after construction returned
+}
+
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::push_back(T&& data) {
+  maybe_expand();  // make sure one more slot is available
+  auto p = &_impl.storage[mask(_impl.end)];  // first slot past the current back
+  _impl.construct(p, std::move(data));  // move-construct in place
+  ++_impl.end;  // move the end index only after construction returned
+}
+
+template <typename T, typename Alloc>
+template <typename... Args>
+inline void circular_buffer<T, Alloc>::emplace_back(Args&&... args) {
+  maybe_expand();  // make sure one more slot is available
+  auto p = &_impl.storage[mask(_impl.end)];  // first slot past the current back
+  _impl.construct(p, std::forward<Args>(args)...);  // build the element from args in place
+  ++_impl.end;  // move the end index only after construction returned
+}
+
+template <typename T, typename Alloc>
+inline T& circular_buffer<T, Alloc>::front() {
+  return _impl.storage[mask(_impl.begin)];  // no emptiness check is performed here
+}
+
+template <typename T, typename Alloc>
+inline T& circular_buffer<T, Alloc>::back() {
+  return _impl.storage[mask(_impl.end - 1)];  // no emptiness check is performed here
+}
+
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::pop_front() {
+  _impl.destroy(&front());  // destroy the first element in place
+  ++_impl.begin;  // its slot becomes unoccupied
+}
+
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::pop_back() {
+  _impl.destroy(&back());  // destroy the last element in place
+  --_impl.end;  // its slot becomes unoccupied
+}
+
+template <typename T, typename Alloc>
+inline T& circular_buffer<T, Alloc>::operator[](size_t idx) {
+  return _impl.storage[mask(_impl.begin + idx)];  // idx counts from the front; not range-checked
+}
+
+template <typename T, typename Alloc>
+inline T& circular_buffer<T, Alloc>::access_element_unsafe(size_t idx) {
+  return _impl.storage[mask(_impl.begin + idx)];  // same lookup as operator[]; idx may be past size()
+}
+
+#endif /* CEPH_CIRCULAR_BUFFER_HH_ */
diff --git a/src/msg/async/dpdk/const.h b/src/msg/async/dpdk/const.h
new file mode 100644
index 00000000..ea5dc49e
--- /dev/null
+++ b/src/msg/async/dpdk/const.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_CONST_H_
+#define CEPH_MSG_CONST_H_
+
+#include <stdint.h>
+
+enum class ip_protocol_num : uint8_t {
+  icmp = 1, tcp = 6, unused = 255  // presumably the IP header protocol-field values - confirm
+};
+
+enum class eth_protocol_num : uint16_t {
+  ipv4 = 0x0800, arp = 0x0806, ipv6 = 0x86dd  // host-order values; net.cc applies hton() before sending
+};
+
+const uint8_t eth_hdr_len = 14;  // bytes: two 6-byte MACs plus the 2-byte protocol field of eth_hdr
+const uint8_t tcp_hdr_len_min = 20;  // bytes; presumably the option-less TCP header - confirm
+const uint8_t ipv4_hdr_len_min = 20;  // bytes; presumably the option-less IPv4 header - confirm
+const uint8_t ipv6_hdr_len_min = 40;  // bytes
+const uint16_t ip_packet_len_max = 65535;  // largest value of a 16-bit length field
+
+#endif
diff --git a/src/msg/async/dpdk/dpdk_rte.cc b/src/msg/async/dpdk/dpdk_rte.cc
new file mode 100644
index 00000000..9f9d343b
--- /dev/null
+++ b/src/msg/async/dpdk/dpdk_rte.cc
@@ -0,0 +1,154 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <bitset>
+
+#include <rte_config.h>
+#include <rte_common.h>
+#include <rte_ethdev.h>
+#include <rte_version.h>
+
+#include "DPDK.h"
+#include "dpdk_rte.h"
+
+namespace dpdk {
+
+  static inline std::vector<char> string2vector(std::string str) {
+    // Copy the characters plus the terminating NUL that c_str() guarantees,
+    // giving a mutable, NUL-terminated buffer.
+    const char *first = str.c_str();
+    return std::vector<char>(first, first + str.size() + 1);
+  }
+
+  bool eal::initialized = false;  // set by the EAL thread in init() after rte_eal_init() succeeded
+  std::thread eal::t;  // thread started (and detached) by init()
+  std::mutex eal::lock;  // held while funcs is modified and while waiting on cond
+  std::condition_variable eal::cond;  // signalled on start-up and after every queued function ran
+  std::list<std::function<void()>> eal::funcs;  // functions queued for the EAL thread
+
+  static int bitcount(unsigned long long n) {
+    using word_bits = std::bitset<CHAR_BIT * sizeof(n)>;  // one position per bit of n
+    return static_cast<int>(word_bits{n}.count());        // how many of them are set
+  }
+
+  int eal::init(CephContext *c)
+  {
+    if (initialized) {
+      return 1;
+    }
+    // Returns 1 if the EAL is already up, 0 after a successful start, -1 if rte_eal_init() failed.
+    bool done = false;
+    auto num = std::stoull(c->_conf.get_val<std::string>("ms_dpdk_coremask"),
+                           nullptr, 16);
+    unsigned int coremaskbit = bitcount(num);
+
+    ceph_assert(coremaskbit > c->_conf->ms_async_op_threads);
+    // The EAL is started on its own detached thread, which then services funcs forever.
+    t = std::thread([&]() {
+      // TODO: Inherit these from the app parameters - "opts"
+      std::vector<std::vector<char>> args {
+          string2vector(string("ceph")),
+          string2vector("-c"), string2vector(c->_conf.get_val<std::string>("ms_dpdk_coremask")),
+          string2vector("-n"), string2vector(c->_conf->ms_dpdk_memory_channel),
+      };
+
+      Tub<std::string> hugepages_path;
+      if (!c->_conf->ms_dpdk_hugepages.empty()) {
+        hugepages_path.construct(c->_conf->ms_dpdk_hugepages);
+      }
+
+      // If "hugepages" is not provided and DPDK PMD drivers mode is requested -
+      // use the default DPDK huge tables configuration.
+      if (hugepages_path) {
+        args.push_back(string2vector("--huge-dir"));
+        args.push_back(string2vector(*hugepages_path));
+
+        //
+        // We don't know what is going to be our networking configuration so we
+        // assume there is going to be a queue per-CPU. Plus we'll give a DPDK
+        // 64MB for "other stuff".
+        //
+        unsigned int x;
+        std::stringstream ss;
+        ss << std::hex << "fffefffe";
+        ss >> x;
+        size_t size_MB = mem_size(bitcount(x)) >> 20;
+        std::stringstream size_MB_str;
+        size_MB_str << size_MB;
+
+        args.push_back(string2vector("-m"));
+        args.push_back(string2vector(size_MB_str.str()));
+      } else if (!c->_conf->ms_dpdk_pmd.empty()) {
+        args.push_back(string2vector("--no-huge"));
+      }
+
+      std::string rte_file_prefix;
+      rte_file_prefix = "rte_";
+      rte_file_prefix += c->_conf->name.to_str();
+      args.push_back(string2vector("--file-prefix"));
+      args.push_back(string2vector(rte_file_prefix));
+
+      std::vector<char*> cargs;
+
+      for (auto&& a: args) {
+        cargs.push_back(a.data());
+      }
+      /* initialise the EAL for all */
+      int ret = rte_eal_init(cargs.size(), cargs.data());
+      // Always publish the outcome: init() blocks until done is set, so
+      // returning early on failure would leave it waiting forever.
+      std::unique_lock<std::mutex> l(lock);
+      initialized = (ret >= 0);
+      done = true;
+      cond.notify_all();
+      if (!initialized) return ret;
+      while (true) {
+        if (!funcs.empty()) {
+          auto f = std::move(funcs.front());
+          funcs.pop_front();
+          f();
+          cond.notify_all();
+        } else {
+          cond.wait(l);
+        }
+      }
+    });
+    t.detach();
+    std::unique_lock<std::mutex> l(lock);
+    while (!done)
+      cond.wait(l);
+    return initialized ? 0 : -1;  // negative when rte_eal_init() failed on the EAL thread
+  }
+
+  size_t eal::mem_size(int num_cpus)
+  {
+    //
+    // PMD mempool memory:
+    //
+    // We don't know what is going to be our networking configuration so we
+    // assume there is going to be a queue per-CPU.
+    //
+    const size_t pmd_pool_bytes = num_cpus * qp_mempool_obj_size();
+
+    // Plus we'll give a DPDK 64MB for "other stuff".
+    const size_t misc_bytes = (64UL << 20);
+
+    // Total: one queue mempool per CPU plus the fixed reserve.
+    return pmd_pool_bytes + misc_bytes;
+  }
+
+} // namespace dpdk
diff --git a/src/msg/async/dpdk/dpdk_rte.h b/src/msg/async/dpdk/dpdk_rte.h
new file mode 100644
index 00000000..4aa83899
--- /dev/null
+++ b/src/msg/async/dpdk/dpdk_rte.h
@@ -0,0 +1,74 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef CEPH_DPDK_RTE_H_
+#define CEPH_DPDK_RTE_H_
+
+
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+
+#include <bitset>
+#include <rte_config.h>
+#include <rte_version.h>
+#include <boost/program_options.hpp>
+
+/*********************** Compat section ***************************************/
+// We currently support only versions 2.0 and above.
+#if (RTE_VERSION < RTE_VERSION_NUM(2,0,0,0))
+#error "DPDK version above 2.0.0 is required"
+#endif
+
+#if defined(RTE_MBUF_REFCNT_ATOMIC)
+#warning "CONFIG_RTE_MBUF_REFCNT_ATOMIC should be disabled in DPDK's " \
+         "config/common_linuxapp"
+#endif
+/******************************************************************************/
+
+namespace dpdk {
+
+// DPDK Environment Abstraction Layer
+class eal {
+ public:
+  using cpuset = std::bitset<RTE_MAX_LCORE>;
+
+  static std::mutex lock;
+  static std::condition_variable cond;
+  static std::list<std::function<void()>> funcs;
+  static int init(CephContext *c);
+  static void execute_on_master(std::function<void()> &&f) {
+    // Queue f on funcs and block until whoever drains the queue has run it.
+    bool finished = false;
+    std::unique_lock<std::mutex> guard(lock);
+    funcs.emplace_back([&f, &finished]() { f(); finished = true; });
+    cond.notify_all();
+    cond.wait(guard, [&finished] { return finished; });
+  }
+  /**
+   * Estimate how much memory DPDK should be given.
+   * @param num_cpus Number of CPUs the application is going to use
+   *
+   * @return the estimate, in bytes
+   */
+  static size_t mem_size(int num_cpus);
+  static bool initialized;
+  static std::thread t;
+};
+
+} // namespace dpdk
+#endif // CEPH_DPDK_RTE_H_
diff --git a/src/msg/async/dpdk/ethernet.cc b/src/msg/async/dpdk/ethernet.cc
new file mode 100644
index 00000000..9aca5078
--- /dev/null
+++ b/src/msg/async/dpdk/ethernet.cc
@@ -0,0 +1,16 @@
+#include <iomanip>
+
+#include "ethernet.h"
+
+std::ostream& operator<<(std::ostream& os, const ethernet_address& ea) {
+  // Six zero-padded hex octets ("xx:..:xx"); setw() must be repeated per octet.
+  const auto saved_flags = os.setf(std::ios_base::hex | std::ios_base::right,
+      std::ios_base::basefield | std::ios_base::adjustfield);
+  const auto saved_fill = os.fill('0');
+  os << std::setw(2) << uint32_t(ea.mac[0]);
+  for (size_t i = 1; i < ea.mac.size(); ++i)
+    os << ':' << std::setw(2) << uint32_t(ea.mac[i]);
+  os.flags(saved_flags);  // do not leak hex/fill into the caller's stream
+  os.fill(saved_fill);
+  return os;
+}
diff --git a/src/msg/async/dpdk/ethernet.h b/src/msg/async/dpdk/ethernet.h
new file mode 100644
index 00000000..b007425f
--- /dev/null
+++ b/src/msg/async/dpdk/ethernet.h
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_ETHERNET_H_
+#define CEPH_MSG_ETHERNET_H_
+
+#include <array>
+#include <sstream>
+
+#include "include/ceph_assert.h"
+#include "byteorder.h"
+
+struct ethernet_address {
+  ethernet_address() : mac{} {}  // all-zero address instead of indeterminate bytes
+
+  ethernet_address(const uint8_t *eaddr) {
+    std::copy(eaddr, eaddr + 6, mac.begin());  // reads exactly 6 bytes starting at eaddr
+  }
+
+  ethernet_address(std::initializer_list<uint8_t> eaddr) {
+    ceph_assert(eaddr.size() == mac.size());
+    std::copy(eaddr.begin(), eaddr.end(), mac.begin());
+  }
+  // Both conversions return an unchanged copy, so they can be const.
+  ethernet_address ntoh() const {
+    return *this;
+  }
+  ethernet_address hton() const {
+    return *this;
+  }
+  std::array<uint8_t, 6> mac;
+} __attribute__((packed));
+
+inline bool operator==(const ethernet_address& a, const ethernet_address& b) {
+  return a.mac == b.mac;  // element-wise comparison of the six bytes
+}
+std::ostream& operator<<(std::ostream& os, const ethernet_address& ea);
+
+struct ethernet {
+  using address = ethernet_address;  // address type of this link layer
+  static address broadcast_address() {
+      return {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};  // all-ones MAC via the initializer_list constructor
+  }
+  static constexpr uint16_t arp_hardware_type() { return 1; }
+};
+
+struct eth_hdr {
+  ethernet_address dst_mac;
+  ethernet_address src_mac;
+  uint16_t eth_proto;
+  eth_hdr hton() {
+    eth_hdr net(*this);  // copy everything, then swap only the 16-bit protocol field
+    net.eth_proto = ::hton(eth_proto);
+    return net;
+  }
+  eth_hdr ntoh() {
+    eth_hdr host(*this);  // copy everything, then swap only the 16-bit protocol field
+    host.eth_proto = ::ntoh(eth_proto);
+    return host;
+  }
+} __attribute__((packed));
+
+ethernet_address parse_ethernet_address(std::string addr);
+
+#endif /* CEPH_MSG_ETHERNET_H_ */
diff --git a/src/msg/async/dpdk/ip_types.h b/src/msg/async/dpdk/ip_types.h
new file mode 100644
index 00000000..356d8fd6
--- /dev/null
+++ b/src/msg/async/dpdk/ip_types.h
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_IP_TYPES_H_H
+#define CEPH_IP_TYPES_H_H
+
+#include <boost/asio/ip/address_v4.hpp>
+#include <string>
+
+class Packet;
+class ethernet_address;
+using resolution_cb = std::function<void (const ethernet_address&, Packet, int)>;
+
+struct ipv4_addr {
+  uint32_t ip;  // stored after ntoh() when built from an entity_addr_t
+  uint16_t port;
+
+  ipv4_addr() : ip(0), port(0) {}
+  ipv4_addr(uint32_t ip, uint16_t port) : ip(ip), port(port) {}
+  ipv4_addr(uint16_t port) : ip(0), port(port) {}  // port only; ip stays 0
+  ipv4_addr(const std::string &addr);
+  ipv4_addr(const std::string &addr, uint16_t port);
+
+  ipv4_addr(const entity_addr_t &ad) {
+    ip = ntoh(ad.in4_addr().sin_addr.s_addr);  // convert the socket address with ntoh()
+    port = ad.get_port();
+  }
+
+  ipv4_addr(entity_addr_t &&addr) : ipv4_addr(addr) {}  // addr is an lvalue here: delegates to the const& overload
+};
+
+struct ipv4_address {
+  ipv4_address() : ip(0) {}
+  explicit ipv4_address(uint32_t ip) : ip(ip) {}
+  explicit ipv4_address(const std::string& addr) {
+    ip = static_cast<uint32_t>(boost::asio::ip::address_v4::from_string(addr).to_ulong());
+  }
+  ipv4_address(ipv4_addr addr) {
+    ip = addr.ip;
+  }
+
+  uint32_t ip;
+
+  ipv4_address hton() {
+    // Byte-swapped copy; *this is left untouched.
+    const uint32_t swapped = ::hton(ip);
+    return ipv4_address(swapped);
+  }
+  ipv4_address ntoh() {
+    // Byte-swapped copy; *this is left untouched.
+    const uint32_t swapped = ::ntoh(ip);
+    return ipv4_address(swapped);
+  }
+
+  friend bool operator==(ipv4_address x, ipv4_address y) {
+    return !(x.ip != y.ip);
+  }
+  friend bool operator!=(ipv4_address x, ipv4_address y) {
+    return !(x.ip == y.ip);
+  }
+} __attribute__((packed));
+
+static inline bool is_unspecified(ipv4_address addr) { return addr.ip == 0; }  // true for the all-zero address
+
+std::ostream& operator<<(std::ostream& os, const ipv4_address& a);
+
+namespace std {
+  // std::hash specialisation for ipv4_address: the hash is the raw 32-bit ip value.
+  template <>
+  struct hash<ipv4_address> {
+    size_t operator()(ipv4_address a) const { return a.ip; }
+  };
+
+}
+
+#endif //CEPH_IP_TYPES_H_H
diff --git a/src/msg/async/dpdk/net.cc b/src/msg/async/dpdk/net.cc
new file mode 100644
index 00000000..6e361f18
--- /dev/null
+++ b/src/msg/async/dpdk/net.cc
@@ -0,0 +1,205 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ */
+
+#include "net.h"
+#include "DPDK.h"
+#include "DPDKStack.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "net "
+
+interface::interface(CephContext *cct, std::shared_ptr<DPDKDevice> dev, EventCenter *center)
+    : cct(cct), _dev(dev),
+      _rx(_dev->receive(  // register the receive callback for this center's id
+          center->get_id(),
+          [center, this] (Packet p) {
+            return dispatch_packet(center, std::move(p));
+          }
+      )),
+      _hw_address(_dev->hw_address()),
+      _hw_features(_dev->get_hw_features()) {
+  auto idx = 0u;  // round-robin cursor, kept inside the mutable lambda below
+  unsigned qid = center->get_id();  // captured below but not used by the lambda body
+  dev->queue_for_cpu(center->get_id()).register_packet_provider([this, idx, qid] () mutable {
+    Tub<Packet> p;  // stays empty when no provider has a packet
+    for (size_t i = 0; i < _pkt_providers.size(); i++) {  // ask each provider at most once per call
+      auto l3p = _pkt_providers[idx++]();
+      if (idx == _pkt_providers.size())
+        idx = 0;
+      if (l3p) {
+        auto l3pv = std::move(*l3p);
+        auto eh = l3pv.p.prepend_header<eth_hdr>();  // fill in the ethernet header in front of the payload
+        eh->dst_mac = l3pv.to;
+        eh->src_mac = _hw_address;
+        eh->eth_proto = uint16_t(l3pv.proto_num);
+        *eh = eh->hton();  // header fields go out in network byte order
+        ldout(this->cct, 10) << "=== tx === proto " << std::hex << uint16_t(l3pv.proto_num)
+                       << " " << _hw_address << " -> " << l3pv.to
+                       << " length " << std::dec << l3pv.p.len() << dendl;
+        p = std::move(l3pv.p);
+        return p;  // the first provider with data wins; the cursor already points past it
+      }
+    }
+    return p;
+  });
+}
+
+subscription<Packet, ethernet_address> interface::register_l3(
+    eth_protocol_num proto_num,
+    std::function<int (Packet p, ethernet_address from)> next,
+    std::function<bool (forward_hash&, Packet& p, size_t)> forward)
+{
+  auto i = _proto_map.emplace(std::piecewise_construct, std::make_tuple(uint16_t(proto_num)), std::forward_as_tuple(std::move(forward)));
+  ceph_assert(i.second);  // each protocol number may be registered only once
+  l3_rx_stream& l3_rx = i.first->second;
+  return l3_rx.packet_stream.listen(std::move(next));  // next receives the packets produced for proto_num
+}
+
+unsigned interface::hash2cpu(uint32_t hash) {
+  return _dev->hash2cpu(hash);  // forwarded unchanged to the underlying device
+}
+
+const rss_key_type& interface::rss_key() const {
+  return _dev->rss_key();  // forwarded unchanged to the underlying device
+}
+
+uint16_t interface::hw_queues_count() const {
+  return _dev->hw_queues_count();  // forwarded unchanged to the underlying device
+}
+
+class C_handle_l2forward : public EventCallback {  // one-shot event delivering a forwarded packet
+  std::shared_ptr<DPDKDevice> sdev;
+  unsigned &queue_depth;  // NOTE(review): refers to the sender's counter; do_request may run on another thread - confirm this is safe
+  Packet p;
+  unsigned dst;
+
+ public:
+  C_handle_l2forward(std::shared_ptr<DPDKDevice> &p, unsigned &qd, Packet pkt, unsigned target)
+      : sdev(p), queue_depth(qd), p(std::move(pkt)), dst(target) {}
+  void do_request(uint64_t fd) {
+    sdev->l2receive(dst, std::move(p));  // hand the packet to queue dst of the device
+    queue_depth--;  // one fewer forward outstanding
+    delete this;  // the event owns itself and is used exactly once
+  }
+};
+
+void interface::forward(EventCenter *source, unsigned target, Packet p) {
+  static __thread unsigned queue_depth;  // forwards queued by this thread and not yet handled
+
+  if (queue_depth < 1000) {  // beyond 1000 outstanding forwards the packet is dropped silently
+    queue_depth++;
+    // FIXME: need ensure this event not be called after EventCenter destruct
+    _dev->workers[target]->center.dispatch_event_external(
+        new C_handle_l2forward(_dev, queue_depth, std::move(p.free_on_cpu(source)), target));
+  }
+}
+
+int interface::dispatch_packet(EventCenter *center, Packet p) {
+  auto eh = p.get_header<eth_hdr>();
+  if (eh) {  // packets without a complete ethernet header are ignored (returns 0)
+    auto i = _proto_map.find(ntoh(eh->eth_proto));  // look up the registered L3 handler
+    auto hwrss = p.rss_hash();
+    if (hwrss) {
+      ldout(cct, 10) << __func__ << " === rx === proto " << std::hex << ::ntoh(eh->eth_proto)
+                     << " "<< eh->src_mac.ntoh() << " -> " << eh->dst_mac.ntoh()
+                     << " length " << std::dec << p.len() << " rss_hash " << *p.rss_hash() << dendl;
+    } else {
+      ldout(cct, 10) << __func__ << " === rx === proto " << std::hex << ::ntoh(eh->eth_proto)
+                     << " "<< eh->src_mac.ntoh() << " -> " << eh->dst_mac.ntoh()
+                     << " length " << std::dec << p.len() << dendl;
+    }
+    if (i != _proto_map.end()) {
+      l3_rx_stream& l3 = i->second;
+      auto fw = _dev->forward_dst(center->get_id(), [&p, &l3, this] () {
+        auto hwrss = p.rss_hash();
+        if (hwrss) {
+          return *hwrss;  // use the hash already attached to the packet when there is one
+        } else {
+          forward_hash data;
+          if (l3.forward(data, p, sizeof(eth_hdr))) {
+            return toeplitz_hash(rss_key(), data);  // otherwise hash the fields the L3 handler selected
+          }
+          return 0u;
+        }
+      });
+      if (fw != center->get_id()) {  // another queue is responsible: hand the packet over
+        ldout(cct, 1) << __func__ << " forward to " << fw << dendl;
+        forward(center, fw, std::move(p));
+      } else {
+        auto h = eh->ntoh();
+        auto from = h.src_mac;  // copy the sender before the header is trimmed away
+        p.trim_front(sizeof(*eh));
+        // avoid chaining, since queue length is unlimited
+        // drop instead.
+        if (l3.ready()) {
+          return l3.packet_stream.produce(std::move(p), from);
+        }
+      }
+    }
+  }
+  return 0;  // unknown protocol, forwarded, or dropped
+}
+
+class C_arp_learn : public EventCallback {  // one-shot event passing an l2/l3 address pair to a worker
+  DPDKWorker *worker;
+  ethernet_address l2_addr;
+  ipv4_address l3_addr;
+
+ public:
+  C_arp_learn(DPDKWorker *w, ethernet_address l2, ipv4_address l3)
+      : worker(w), l2_addr(l2), l3_addr(l3) {}
+  void do_request(uint64_t id) {
+    worker->arp_learn(l2_addr, l3_addr);
+    delete this;  // the event owns itself and is used exactly once
+  }
+};
+
+void interface::arp_learn(ethernet_address l2, ipv4_address l3)
+{
+  for (auto &&w : _dev->workers) {  // queue one C_arp_learn event per worker of the device
+    w->center.dispatch_event_external(
+        new C_arp_learn(w, l2, l3));  // freed by the event itself in do_request
+  }
+}
+
+l3_protocol::l3_protocol(interface* netif, eth_protocol_num proto_num, packet_provider_type func)
+    : _netif(netif), _proto_num(proto_num)  {
+  _netif->register_packet_provider(std::move(func));  // func becomes one of netif's packet providers
+}
+
+subscription<Packet, ethernet_address> l3_protocol::receive(
+    std::function<int (Packet, ethernet_address)> rx_fn,
+    std::function<bool (forward_hash &h, Packet &p, size_t s)> forward) {
+  return _netif->register_l3(_proto_num, std::move(rx_fn), std::move(forward));  // register both callbacks for _proto_num
+}
diff --git a/src/msg/async/dpdk/net.h b/src/msg/async/dpdk/net.h
new file mode 100644
index 00000000..63f0422b
--- /dev/null
+++ b/src/msg/async/dpdk/net.h
@@ -0,0 +1,138 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_DPDK_NET_H
+#define CEPH_MSG_DPDK_NET_H
+
+#include "const.h"
+#include "ethernet.h"
+#include "Packet.h"
+#include "stream.h"
+#include "toeplitz.h"
+
+struct hw_features {
+  // Enable tx IP header checksum offload
+  bool tx_csum_ip_offload = false;
+  // Enable tx l4 (TCP or UDP) checksum offload
+  bool tx_csum_l4_offload = false;
+  // Enable rx checksum offload
+  bool rx_csum_offload = false;
+  // LRO is enabled
+  bool rx_lro = false;
+  // Enable tx TCP segment offload
+  bool tx_tso = false;
+  // Enable tx UDP fragmentation offload
+  bool tx_ufo = false;
+  // Maximum Transmission Unit
+  uint16_t mtu = 1500;
+  // Maximum packet length when TCP/UDP offload is enabled
+  uint16_t max_packet_len = ip_packet_len_max - eth_hdr_len;
+};
+
+// Fixed-capacity (64-byte) byte buffer that is filled with push_back() and
+// read back through size() and operator[].
+class forward_hash {
+  uint8_t data[64];
+  size_t end_idx = 0;
+ public:
+  // Number of bytes appended so far.
+  size_t size() const {
+    return end_idx;
+  }
+  // Append one byte; asserts that the buffer is not already full.
+  void push_back(uint8_t b) {
+    ceph_assert(end_idx < sizeof(data));
+    data[end_idx] = b;
+    ++end_idx;
+  }
+  // Append a 16-bit value, low byte first.
+  void push_back(uint16_t b) {
+    const uint8_t low_byte = uint8_t(b);
+    const uint8_t high_byte = uint8_t(b >> 8);
+    push_back(low_byte);
+    push_back(high_byte);
+  }
+  // Append a 32-bit value as its low 16-bit half followed by its high half.
+  void push_back(uint32_t b) {
+    const uint16_t low_half = uint16_t(b);
+    const uint16_t high_half = uint16_t(b >> 16);
+    push_back(low_half);
+    push_back(high_half);
+  }
+  // Unchecked read access to the byte at idx.
+  const uint8_t& operator[](size_t idx) const {
+    return data[idx];
+  }
+};
+
+class interface;
+
+// Ties one ethernet protocol number to an interface.  The constructor
+// registers a packet provider with the interface; receive() registers the
+// receive and forward callbacks for the protocol number.
+class l3_protocol {
+ public:
+  // A packet bundled with a protocol number and an ethernet address.
+  struct l3packet {
+    eth_protocol_num proto_num;
+    ethernet_address to;
+    Packet p;
+  };
+  // Callable that yields a Tub<l3packet>.
+  using packet_provider_type = std::function<Tub<l3packet> ()>;
+
+  explicit l3_protocol(interface* netif, eth_protocol_num proto_num, packet_provider_type func);
+
+  subscription<Packet, ethernet_address> receive(
+      std::function<int (Packet, ethernet_address)> rx_fn,
+      std::function<bool (forward_hash &h, Packet &p, size_t s)> forward);
+
+ private:
+  friend class interface;
+
+  interface* _netif;
+  eth_protocol_num _proto_num;
+};
+
+class DPDKDevice;
+struct ipv4_address;
+
+// Network interface layered on a shared DPDKDevice.  It keeps one receive
+// stream per registered protocol number (_proto_map) and the list of packet
+// providers registered by l3_protocol instances.
+class interface {
+  CephContext *cct;
+  // Receive state of one registered protocol: the stream packets are
+  // produced into, and the callback stored as forward.
+  struct l3_rx_stream {
+    stream<Packet, ethernet_address> packet_stream;
+    std::function<bool (forward_hash&, Packet&, size_t)> forward;
+    // True once the stream's listener has been started.
+    bool ready() { return packet_stream.started(); }
+    // fw is declared as an rvalue reference but, being a named parameter, is
+    // an lvalue inside the constructor; std::move() is required to transfer
+    // the callable instead of copying it.
+    explicit l3_rx_stream(std::function<bool (forward_hash&, Packet&, size_t)>&& fw) : forward(std::move(fw)) {}
+  };
+  std::unordered_map<uint16_t, l3_rx_stream> _proto_map;
+  std::shared_ptr<DPDKDevice> _dev;
+  subscription<Packet> _rx;
+  ethernet_address _hw_address;
+  struct hw_features _hw_features;
+  std::vector<l3_protocol::packet_provider_type> _pkt_providers;
+
+ private:
+  int dispatch_packet(EventCenter *c, Packet p);
+ public:
+  explicit interface(CephContext *cct, std::shared_ptr<DPDKDevice> dev, EventCenter *center);
+  // Returns a copy of the stored hardware (ethernet) address.
+  ethernet_address hw_address() { return _hw_address; }
+  // Returns the stored hardware feature flags.
+  const struct hw_features& get_hw_features() const { return _hw_features; }
+  subscription<Packet, ethernet_address> register_l3(
+      eth_protocol_num proto_num,
+      std::function<int (Packet, ethernet_address)> next,
+      std::function<bool (forward_hash&, Packet&, size_t)> forward);
+  void forward(EventCenter *source, unsigned target, Packet p);
+  unsigned hash2cpu(uint32_t hash);
+  // Appends func to the list of packet providers.
+  void register_packet_provider(l3_protocol::packet_provider_type func) {
+    _pkt_providers.push_back(std::move(func));
+  }
+  const rss_key_type& rss_key() const;
+  uint16_t hw_queues_count() const;
+  void arp_learn(ethernet_address l2, ipv4_address l3);
+  friend class l3_protocol;
+};
+
+#endif //CEPH_MSG_DPDK_NET_H
diff --git a/src/msg/async/dpdk/queue.h b/src/msg/async/dpdk/queue.h
new file mode 100644
index 00000000..984ddca1
--- /dev/null
+++ b/src/msg/async/dpdk/queue.h
@@ -0,0 +1,96 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_DPDK_QUEUE_H_
+#define CEPH_MSG_DPDK_QUEUE_H_
+
+#include <queue>
+
+#include "circular_buffer.h"
+
+// Bounded FIFO: a std::queue over circular_buffer<T> plus a maximum size
+// fixed at construction.
+template <typename T>
+class queue {
+ public:
+  // size is the largest number of items push() will accept.
+  explicit queue(size_t size): _max(size) {}
+
+  // Push an item.
+  //
+  // Returns false if the queue was full and the item was not pushed.
+  bool push(T&& a);
+
+  // Pops an item.
+  T pop();
+
+  // Consumes items from the queue, passing them to @func, until @func
+  // returns false or the queue is empty.
+  //
+  // Returns false if func returned false.
+  // (Declared only; this header contains no definition for it.)
+  template <typename Func>
+  bool consume(Func&& func);
+
+  // Returns true when the queue is empty.
+  bool empty() const;
+
+  // Returns true when the queue is full.
+  bool full() const;
+
+  // Number of items currently held.
+  size_t size() const { return _q.size(); }
+
+  // Destroy any items in the queue
+  void clear() {
+    for (; !_q.empty(); _q.pop()) {
+    }
+  }
+
+ private:
+  std::queue<T, circular_buffer<T>> _q;
+  size_t _max;
+};
+
+// Append an item if there is room.
+//
+// Returns true when the item was moved into the queue, and false (leaving
+// the argument untouched) when size() has already reached the maximum given
+// at construction.  No notification hook is invoked: the class declares
+// none.
+template <typename T>
+inline bool queue<T>::push(T&& data) {
+  if (_q.size() >= _max) {
+    return false;
+  }
+  _q.push(std::move(data));
+  return true;
+}
+
+// Remove the oldest item and return it by value.  front() is read without
+// any emptiness check, so the queue must hold at least one item.
+template <typename T>
+inline T queue<T>::pop() {
+  T item(std::move(_q.front()));
+  _q.pop();
+  return item;
+}
+
+// True when the underlying std::queue holds no items.
+template <typename T>
+inline bool queue<T>::empty() const {
+  const bool nothing_queued = _q.empty();
+  return nothing_queued;
+}
+
+// True when the number of queued items equals the configured maximum.
+template <typename T>
+inline bool queue<T>::full() const {
+  const auto queued = _q.size();
+  return queued == _max;
+}
+
+#endif /* CEPH_MSG_DPDK_QUEUE_H_ */
diff --git a/src/msg/async/dpdk/shared_ptr.h b/src/msg/async/dpdk/shared_ptr.h
new file mode 100644
index 00000000..d078063b
--- /dev/null
+++ b/src/msg/async/dpdk/shared_ptr.h
@@ -0,0 +1,391 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:4; indent-tabs-mode:nil -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_LW_SHARED_PTR_H_
+#define CEPH_LW_SHARED_PTR_H_
+
+#include <utility>
+#include <type_traits>
+#include <functional>
+#include <iostream>
+
+// This header defines a shared pointer facility, lw_shared_ptr<>,
+// modeled after std::shared_ptr<>.
+//
+// Unlike std::shared_ptr<>, this implementation is not thread
+// safe, and two pointers sharing the same object must not be used in
+// different threads.
+//
+// lw_shared_ptr<> is the more lightweight variant, with a lw_shared_ptr<>
+// occupying just one machine word, and adding just one word to the shared
+// object.  However, it does not support polymorphism.
+//
+// It supports shared_from_this() via enable_shared_from_this<>
+// and enable_lw_shared_from_this<>.
+//
+
+template <typename T>
+class lw_shared_ptr;
+
+template <typename T>
+class enable_lw_shared_from_this;
+
+template <typename T>
+class enable_shared_from_this;
+
+template <typename T, typename... A>
+lw_shared_ptr<T> make_lw_shared(A&&... a);
+
+template <typename T>
+lw_shared_ptr<T> make_lw_shared(T&& a);
+
+template <typename T>
+lw_shared_ptr<T> make_lw_shared(T& a);
+
+// Holds the reference count that lw_shared_ptr<> increments and decrements;
+// a freshly constructed counter starts at zero.
+struct lw_shared_ptr_counter_base {
+    long _count{0};
+};
+
+
+namespace internal {
+
+template <class T, class U>
+struct lw_shared_ptr_accessors;
+
+template <class T>
+struct lw_shared_ptr_accessors_esft;
+
+template <class T>
+struct lw_shared_ptr_accessors_no_esft;
+
+}
+
+
+// We want to support two use cases for shared_ptr<T>:
+//
+//   1. T is any type (primitive or class type)
+//
+//   2. T is a class type that inherits from enable_shared_from_this<T>.
+//
+// In the first case, we must wrap T in an object containing the counter,
+// since T may be a primitive type and cannot be a base class.
+//
+// In the second case, we want T to reach the counter through its
+// enable_shared_from_this<> base class, so that we can implement
+// shared_from_this().
+//
+// To implement those two conflicting requirements (T alongside its counter;
+// T inherits from an object containing the counter) we use std::conditional<>
+// and some accessor functions to select between two implementations.
+
+
+// CRTP base: derive T from enable_lw_shared_from_this<T> to get
+// shared_from_this() and to embed the reference count in T itself, through
+// the private lw_shared_ptr_counter_base base.
+template <typename T>
+class enable_lw_shared_from_this : private lw_shared_ptr_counter_base {
+    using ctor = T;
+protected:
+    // All five members below have empty bodies on purpose: the counter base
+    // is default-initialized by the constructors and left untouched by the
+    // assignments, so a reference count is never copied between objects.
+    enable_lw_shared_from_this() noexcept {}
+    enable_lw_shared_from_this(const enable_lw_shared_from_this&) noexcept {}
+    enable_lw_shared_from_this(enable_lw_shared_from_this&&) noexcept {}
+    enable_lw_shared_from_this& operator=(enable_lw_shared_from_this&&) noexcept { return *this; }
+    enable_lw_shared_from_this& operator=(const enable_lw_shared_from_this&) noexcept { return *this; }
+public:
+    // Both return a new lw_shared_ptr that shares ownership of *this.
+    lw_shared_ptr<const T> shared_from_this() const;
+    lw_shared_ptr<T> shared_from_this();
+
+    // These templates need to reach the private counter base.
+    template <typename X, class Y>
+    friend class ::internal::lw_shared_ptr_accessors;
+    template <typename X>
+    friend class ::internal::lw_shared_ptr_accessors_esft;
+    template <typename X>
+    friend class lw_shared_ptr;
+};
+
+// Wrapper for the case where T does not derive from
+// enable_lw_shared_from_this<T>: keeps the T in _value alongside the
+// reference count inherited from lw_shared_ptr_counter_base.
+template <typename T>
+struct shared_ptr_no_esft : private lw_shared_ptr_counter_base {
+    T _value;
+
+    shared_ptr_no_esft() = default;
+    shared_ptr_no_esft(T&& x) : _value(std::move(x)) {}
+    shared_ptr_no_esft(const T& x) : _value(x) {}
+    // Forwards any other argument list to T's constructor.
+    template <typename... A>
+    shared_ptr_no_esft(A&&... a) : _value(std::forward<A>(a)...) {}
+
+    // These templates need to reach the private counter base.
+    template <typename X, class Y>
+    friend class ::internal::lw_shared_ptr_accessors;
+    template <typename X>
+    friend class ::internal::lw_shared_ptr_accessors_no_esft;
+    template <typename X>
+    friend class lw_shared_ptr;
+};
+
+
+/// Extension point: the user may override this to change how \ref lw_shared_ptr objects are destroyed,
+/// primarily so that incomplete classes can be used.
+///
+/// Customizing the deleter requires that \c T be derived from \c enable_lw_shared_from_this<T>.
+/// The specialization must be visible for all uses of \c lw_shared_ptr<T>.
+///
+/// To customize, the specialization must provide a `static void dispose(T*)`
+/// member function that disposes of the object.
+template <typename T>
+struct lw_shared_ptr_deleter;  // No generic implementation
+
+namespace internal {
+
+// Accessors for types that embed the counter as a base class: the counter
+// pointer is converted to the object pointer with a static_cast.
+template <typename T>
+struct lw_shared_ptr_accessors_esft {
+    using concrete_type = std::remove_const_t<T>;
+    static T* to_value(lw_shared_ptr_counter_base* base) {
+        return static_cast<T*>(base);
+    }
+    static void dispose(lw_shared_ptr_counter_base* base) {
+        T* const object = static_cast<T*>(base);
+        delete object;
+    }
+    static void instantiate_to_value(lw_shared_ptr_counter_base*) {
+        // nothing to do: to_value() is defined right above, so it needs no
+        // forced instantiation
+    }
+};
+
+// Accessors for types wrapped in shared_ptr_no_esft<T>: the counter pointer
+// is converted to the wrapper, whose _value member is the managed object.
+template <typename T>
+struct lw_shared_ptr_accessors_no_esft {
+    using concrete_type = shared_ptr_no_esft<T>;
+    static T* to_value(lw_shared_ptr_counter_base* base) {
+        concrete_type* const wrapper = static_cast<concrete_type*>(base);
+        return &wrapper->_value;
+    }
+    static void dispose(lw_shared_ptr_counter_base* base) {
+        concrete_type* const wrapper = static_cast<concrete_type*>(base);
+        delete wrapper;
+    }
+    static void instantiate_to_value(lw_shared_ptr_counter_base*) {
+        // nothing to do: to_value() is defined right above, so it needs no
+        // forced instantiation
+    }
+};
+
+// Primary template, used when lw_shared_ptr_deleter<T> is not specialized:
+// inherits the esft accessors if T derives from
+// enable_lw_shared_from_this<T>, and the no_esft (wrapper) accessors
+// otherwise.
+template <typename T, typename U = void>
+struct lw_shared_ptr_accessors
+    : std::conditional_t<std::is_base_of_v<enable_lw_shared_from_this<T>, T>,
+                         lw_shared_ptr_accessors_esft<T>,
+                         lw_shared_ptr_accessors_no_esft<T>> {
+};
+
+// Partial specialization selected when lw_shared_ptr_deleter<T> is
+// specialized: disposal goes through lw_shared_ptr_deleter<T>::dispose().
+template <typename T>
+struct lw_shared_ptr_accessors<T, std::void_t<decltype(lw_shared_ptr_deleter<T>{})>> {
+    using concrete_type = T;
+    // Only declared here; the definition has to be supplied elsewhere.
+    static T* to_value(lw_shared_ptr_counter_base* counter);
+    static void dispose(lw_shared_ptr_counter_base* counter) {
+        T* const object = to_value(counter);
+        lw_shared_ptr_deleter<T>::dispose(object);
+    }
+    static void instantiate_to_value(lw_shared_ptr_counter_base* p) {
+        // instantiate to_value(); must be defined by shared_ptr_incomplete.hh
+        to_value(p);
+    }
+};
+
+}
+
+// Reference-counted pointer to a T whose counter is reached through a
+// lw_shared_ptr_counter_base pointer (see the accessors templates).
+//
+// The count is a plain long, so instances sharing an object must stay on one
+// thread.
+template <typename T>
+class lw_shared_ptr {
+    using accessors = ::internal::lw_shared_ptr_accessors<std::remove_const_t<T>>;
+    using concrete_type = typename accessors::concrete_type;
+    mutable lw_shared_ptr_counter_base* _p = nullptr;
+private:
+    // Adopt p and take one reference on it (p may be null).
+    lw_shared_ptr(lw_shared_ptr_counter_base* p) noexcept : _p(p) {
+        if (_p) {
+            ++_p->_count;
+        }
+    }
+    // Drop one reference held through p and dispose of the object when it
+    // was the last one.  Accepts nullptr.
+    static void release(lw_shared_ptr_counter_base* p) {
+        if (p && !--p->_count) {
+            accessors::dispose(p);
+        }
+    }
+    // Allocate the concrete object and return the first pointer to it.
+    template <typename... A>
+    static lw_shared_ptr make(A&&... a) {
+        auto p = new concrete_type(std::forward<A>(a)...);
+        accessors::instantiate_to_value(p);
+        return lw_shared_ptr(p);
+    }
+public:
+    using element_type = T;
+
+    lw_shared_ptr() noexcept = default;
+    lw_shared_ptr(std::nullptr_t) noexcept : lw_shared_ptr() {}
+    lw_shared_ptr(const lw_shared_ptr& x) noexcept : _p(x._p) {
+        if (_p) {
+            ++_p->_count;
+        }
+    }
+    lw_shared_ptr(lw_shared_ptr&& x) noexcept  : _p(x._p) {
+        x._p = nullptr;
+    }
+    [[gnu::always_inline]]
+    ~lw_shared_ptr() {
+        release(_p);
+    }
+    // Copy assignment.  The new reference is taken before the old one is
+    // released: releasing first could destroy an object that contains x
+    // (as in "p = p->next"), and x would then be read after being freed.
+    lw_shared_ptr& operator=(const lw_shared_ptr& x) noexcept {
+        if (_p != x._p) {
+            lw_shared_ptr_counter_base* old = _p;
+            _p = x._p;
+            if (_p) {
+                ++_p->_count;
+            }
+            release(old);
+        }
+        return *this;
+    }
+    // Move assignment.  x is emptied before the old reference is released,
+    // for the same reason as in copy assignment.  When both already point at
+    // the same object nothing changes and x keeps its reference.
+    lw_shared_ptr& operator=(lw_shared_ptr&& x) noexcept {
+        if (_p != x._p) {
+            lw_shared_ptr_counter_base* old = _p;
+            _p = x._p;
+            x._p = nullptr;
+            release(old);
+        }
+        return *this;
+    }
+    lw_shared_ptr& operator=(std::nullptr_t) noexcept {
+        return *this = lw_shared_ptr();
+    }
+    // Replace the pointee with a newly allocated object moved from x.  The
+    // new object is built before the old reference is dropped, so x may live
+    // inside the object currently pointed to.
+    lw_shared_ptr& operator=(T&& x) noexcept {
+        return *this = make_lw_shared<T>(std::move(x));
+    }
+
+    // Dereference; the pointer must not be null.
+    T& operator*() const noexcept { return *accessors::to_value(_p); }
+    T* operator->() const noexcept { return accessors::to_value(_p); }
+    // Raw pointer to the object, or nullptr when empty.
+    T* get() const noexcept {
+        if (_p) {
+            return accessors::to_value(_p);
+        } else {
+            return nullptr;
+        }
+    }
+
+    // Current reference count, or 0 when empty.
+    long int use_count() const noexcept {
+        if (_p) {
+            return _p->_count;
+        } else {
+            return 0;
+        }
+    }
+
+    operator lw_shared_ptr<const T>() const noexcept {
+        return lw_shared_ptr<const T>(_p);
+    }
+
+    explicit operator bool() const noexcept {
+        return _p;
+    }
+
+    // True when this is the only pointer to the object; an empty pointer
+    // owns nothing and reports false.
+    bool owned() const noexcept {
+        return _p && _p->_count == 1;
+    }
+
+    bool operator==(const lw_shared_ptr<const T>& x) const {
+        return _p == x._p;
+    }
+
+    bool operator!=(const lw_shared_ptr<const T>& x) const {
+        return !operator==(x);
+    }
+
+    bool operator==(const lw_shared_ptr<std::remove_const_t<T>>& x) const {
+        return _p == x._p;
+    }
+
+    bool operator!=(const lw_shared_ptr<std::remove_const_t<T>>& x) const {
+        return !operator==(x);
+    }
+
+    bool operator<(const lw_shared_ptr<const T>& x) const {
+        return _p < x._p;
+    }
+
+    bool operator<(const lw_shared_ptr<std::remove_const_t<T>>& x) const {
+        return _p < x._p;
+    }
+
+    template <typename U>
+    friend class lw_shared_ptr;
+
+    template <typename X, typename... A>
+    friend lw_shared_ptr<X> make_lw_shared(A&&...);
+
+    template <typename U>
+    friend lw_shared_ptr<U> make_lw_shared(U&&);
+
+    template <typename U>
+    friend lw_shared_ptr<U> make_lw_shared(U&);
+
+    template <typename U>
+    friend class enable_lw_shared_from_this;
+};
+
+// Construct a T from the forwarded arguments and return the first
+// lw_shared_ptr<T> referring to it.
+template <typename T, typename... A>
+inline
+lw_shared_ptr<T> make_lw_shared(A&&... args) {
+    return lw_shared_ptr<T>::make(std::forward<A>(args)...);
+}
+
+// Same, move-constructing the new object from an existing T.
+template <typename T>
+inline
+lw_shared_ptr<T> make_lw_shared(T&& source) {
+    return lw_shared_ptr<T>::make(std::move(source));
+}
+
+// Same, copy-constructing the new object from an existing T.
+template <typename T>
+inline
+lw_shared_ptr<T> make_lw_shared(T& source) {
+    return lw_shared_ptr<T>::make(source);
+}
+
+// Returns a new lw_shared_ptr<T> that shares ownership of *this by taking
+// one more reference on the embedded counter.
+template <typename T>
+inline
+lw_shared_ptr<T>
+enable_lw_shared_from_this<T>::shared_from_this() {
+    lw_shared_ptr_counter_base* const counter = this;
+    return lw_shared_ptr<T>(counter);
+}
+
+// Const overload: constness is cast away only to reach the counter; the
+// returned pointer gives access to a const T.
+template <typename T>
+inline
+lw_shared_ptr<const T>
+enable_lw_shared_from_this<T>::shared_from_this() const {
+    auto* const self = const_cast<enable_lw_shared_from_this*>(this);
+    return lw_shared_ptr<const T>(self);
+}
+
+// Streams the pointed-to object, or the text "null" for an empty pointer.
+template <typename T>
+static inline
+std::ostream& operator<<(std::ostream& out, const lw_shared_ptr<T>& p) {
+    if (p) {
+        return out << *p;
+    }
+    return out << "null";
+}
+
+namespace std {
+
+  // Hashes an lw_shared_ptr<T> exactly like the raw T* returned by get().
+  template <typename T>
+  struct hash<lw_shared_ptr<T>> : private hash<T*> {
+    size_t operator()(const lw_shared_ptr<T>& p) const {
+        T* const raw = p.get();
+        return hash<T*>::operator()(raw);
+    }
+  };
+
+}
+
+#endif /* CEPH_LW_SHARED_PTR_H_ */
diff --git a/src/msg/async/dpdk/stream.h b/src/msg/async/dpdk/stream.h
new file mode 100644
index 00000000..1898e8f8
--- /dev/null
+++ b/src/msg/async/dpdk/stream.h
@@ -0,0 +1,155 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_STREAM_H_
+#define CEPH_MSG_STREAM_H_
+
+#include <exception>
+#include <cassert>
+
+// A stream<> is the producer side.  It may call produce() as long
+// as the returned from the previous invocation is ready.
+// To signify no more data is available, call close().
+//
+// A subscription<> is the consumer side.  It is created by a call
+// to stream::listen().  Calling subscription::start(),
+// which registers the data processing callback, starts processing
+// events.  End-of-stream and error status is read with
+// subscription::done(), which returns the value stored by
+// stream::close() or stream::set_exception().
+//
+// The consumer can pause generation of new data by returning
+// positive integer; when it becomes ready, the producer
+// will resume processing.
+
+template <typename... T>
+class subscription;
+
+// Producer side of a single-consumer event channel; the consumer side is
+// subscription<T...>.
+template <typename... T>
+class stream {
+  subscription<T...>* _sub = nullptr;
+  // Both flags carry explicit initial values: with the defaulted constructor
+  // they would otherwise be indeterminate when started() or
+  // subscription::done() first reads them.
+  int done = 0;        // 0 until close() stores 1 or set_exception() an error
+  bool ready = false;  // set by subscription::start()
+ public:
+  using next_fn = std::function<int (T...)>;
+  stream() = default;
+  stream(const stream&) = delete;
+  stream(stream&&) = delete;
+  // Detach the subscription, if any, so that it does not touch this stream
+  // from its own destructor.
+  ~stream() {
+    if (_sub) {
+      _sub->_stream = nullptr;
+    }
+  }
+
+  void operator=(const stream&) = delete;
+  void operator=(stream&&) = delete;
+
+  // Returns a subscription that reads value from this
+  // stream.
+  subscription<T...> listen() {
+    return subscription<T...>(this);
+  }
+
+  // Returns a subscription that reads value from this
+  // stream, and also sets up the listen function.
+  subscription<T...> listen(next_fn next) {
+    auto sub = subscription<T...>(this);
+    sub.start(std::move(next));
+    return sub;
+  }
+
+  // True once a subscription is attached and has registered its callback
+  // through subscription::start().  The _sub check matters after the
+  // subscription has been destroyed: ready alone would still be set, and
+  // produce() dereferences _sub unconditionally.
+  bool started() {
+    return _sub != nullptr && ready;
+  }
+
+  // Produce a value: passes the arguments to the subscription's callback and
+  // returns the callback's result.  Call only after started() returned true.
+  int produce(T... data) {
+      return _sub->_next(std::move(data)...);
+  }
+
+  // End the stream by recording 1 as the done value.  Call only after
+  // started().  No functions may be called after this.
+  void close() {
+    done = 1;
+  }
+
+  // Signal an error by recording it as the done value.  Call only after
+  // started().  No functions may be called after this.
+  void set_exception(int error) {
+    done = error;
+  }
+ private:
+  void start();
+  friend class subscription<T...>;
+};
+
+// Consumer side of a stream<T...>.  A subscription is created by
+// stream::listen(), registers itself as the stream's _sub, and holds the
+// callback that stream::produce() invokes.
+template <typename... T>
+class subscription {
+ public:
+  using next_fn = typename stream<T...>::next_fn;
+ private:
+  stream<T...>* _stream;
+  next_fn _next;
+ private:
+  // Attach to s; asserts that s has no subscription yet.
+  explicit subscription(stream<T...>* s): _stream(s) {
+    ceph_assert(!_stream->_sub);
+    _stream->_sub = this;
+  }
+
+ public:
+  // Take over x's stream and callback, and repoint the stream at the new
+  // object; x is left detached.
+  subscription(subscription&& x)
+    : _stream(x._stream), _next(std::move(x._next)) {
+    x._stream = nullptr;
+    if (_stream) {
+      _stream->_sub = this;
+    }
+  }
+  // Detach from the stream.  ready is cleared together with _sub so that
+  // stream::started() stops reporting true: stream::produce() dereferences
+  // _sub without checking it.
+  ~subscription() {
+    if (_stream) {
+      _stream->_sub = nullptr;
+      _stream->ready = false;
+    }
+  }
+
+  /// \brief Start receiving events from the stream.
+  ///
+  /// \param next Callback to call for each event
+  void start(std::function<int (T...)> next) {
+    _next = std::move(next);
+    _stream->ready = true;
+  }
+
+  // Returns the stream's done value, as stored by stream::close() or
+  // stream::set_exception().
+  int done() {
+    return _stream->done;
+  }
+
+  friend class stream<T...>;
+};
+
+#endif /* CEPH_MSG_STREAM_H_ */
diff --git a/src/msg/async/dpdk/toeplitz.h b/src/msg/async/dpdk/toeplitz.h
new file mode 100644
index 00000000..3ca38808
--- /dev/null
+++ b/src/msg/async/dpdk/toeplitz.h
@@ -0,0 +1,92 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*-
+ * Copyright (c) 2010 David Malone <dwmalone@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef CEPH_MSG_TOEPLITZ_H_
+#define CEPH_MSG_TOEPLITZ_H_
+
+#include <vector>
+
+using rss_key_type = std::vector<uint8_t>;
+
+// Mellanox Linux's driver key (40 bytes)
+static const rss_key_type default_rsskey_40bytes = {
+    0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
+    0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
+    0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
+    0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
+    0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
+};
+
+// Intel's i40e PMD default RSS key (52 bytes)
+static const rss_key_type default_rsskey_52bytes = {
+    0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
+    0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
+    0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
+    0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
+    0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
+    0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
+    0x81, 0x15, 0x03, 0x66
+};
+
+// Compute the 32-bit Toeplitz hash of data under key.
+//
+// data must provide size() and operator[] yielding bytes.  For every set
+// bit of the input, taken most significant bit first, the current 32-bit
+// window of the key is XORed into the result; the window then slides one
+// bit further along the key.  Key bytes beyond the end of the key count as
+// zero.
+template<typename T>
+static inline uint32_t toeplitz_hash(const rss_key_type& key, const T& data)
+{
+	using index_type = rss_key_type::size_type;
+	uint32_t hash = 0;
+
+	// Load the initial key window in unsigned arithmetic.  Shifting the
+	// int-promoted key[0] left by 24 would move a bit into the sign bit
+	// of int whenever key[0] >= 0x80.  Missing bytes of a key shorter
+	// than four bytes are taken as zero, like in the loop below.
+	uint32_t v = 0;
+	for (index_type k = 0; k < 4; k++) {
+		v <<= 8;
+		if (k < key.size())
+			v |= key[k];
+	}
+	for (index_type i = 0; i < data.size(); i++) {
+		for (unsigned b = 0; b < 8; b++) {
+			if (data[i] & (1<<(7-b)))
+				hash ^= v;
+			v <<= 1;
+			// shift in the next key bit, if the key has one
+			if ((i + 4) < key.size() &&
+			    (key[i+4] & (1<<(7-b))))
+				v |= 1;
+		}
+	}
+	return (hash);
+}
+#endif
diff --git a/src/msg/async/dpdk/transfer.h b/src/msg/async/dpdk/transfer.h
new file mode 100644
index 00000000..599db5bd
--- /dev/null
+++ b/src/msg/async/dpdk/transfer.h
@@ -0,0 +1,64 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License").  See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.  You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_TRANSFER_H_
+#define CEPH_TRANSFER_H_
+
+// Helper functions for copying or moving multiple objects in an exception
+// safe manner, then destroying the sources.
+//
+// To transfer, call transfer_pass1(allocator, &from, &to) on all object pairs,
+// (this copies the object from @from to @to).  If no exceptions are encountered,
+// call transfer_pass2(allocator, &from, &to).  This destroys the object at the
+// origin.  If exceptions were encountered, simply destroy all copied objects.
+//
+// As an optimization, if the objects are moveable without throwing (noexcept)
+// transfer_pass1() simply moves the objects and destroys the source, and
+// transfer_pass2() does nothing.
+
+#include <type_traits>
+#include <utility>
+
+// Pass 1 when T is nothrow-move-constructible: move-construct *to from
+// *from through the allocator and destroy *from right away.
+template <typename T, typename Alloc>
+inline void transfer_pass1(Alloc& a, T* from, T* to,
+                           std::enable_if_t<std::is_nothrow_move_constructible<T>::value>* = nullptr) {
+    T& source = *from;
+    a.construct(to, std::move(source));
+    a.destroy(from);
+}
+
+// Pass 2 when T is nothrow-move-constructible: nothing is left to do, the
+// source was already destroyed in pass 1.
+template <typename T, typename Alloc>
+inline void transfer_pass2(Alloc& a, T* from, T* to,
+                           std::enable_if_t<std::is_nothrow_move_constructible<T>::value>* = nullptr) {
+    // intentionally empty
+}
+
+// Pass 1 when T's move constructor may throw: copy-construct *to from *from
+// and leave the source alive.
+template <typename T, typename Alloc>
+inline void transfer_pass1(Alloc& a, T* from, T* to,
+               std::enable_if_t<!std::is_nothrow_move_constructible<T>::value>* = nullptr) {
+    const T& source = *from;
+    a.construct(to, source);
+}
+
+// Pass 2 when T's move constructor may throw: destroy the source that
+// pass 1 copied from.
+template <typename T, typename Alloc>
+inline void transfer_pass2(Alloc& a, T* from, T* to,
+               std::enable_if_t<!std::is_nothrow_move_constructible<T>::value>* = nullptr) {
+    T* const source = from;
+    a.destroy(source);
+}
+
+#endif /* CEPH_TRANSFER_H_ */
diff --git a/src/msg/async/frames_v2.cc b/src/msg/async/frames_v2.cc
new file mode 100644
index 00000000..f047eb18
--- /dev/null
+++ b/src/msg/async/frames_v2.cc
@@ -0,0 +1,480 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "frames_v2.h"
+
+#include <ostream>
+
+#undef FMT_HEADER_ONLY
+#define FMT_HEADER_ONLY 1
+#include "seastar/fmt/include/fmt/format.h"
+
+namespace ceph::msgr::v2 {
+
+// Unpads bufferlist to unpadded_len (trims the tail of bl); bl must not be shorter than unpadded_len.
+static void unpad_zero(bufferlist& bl, uint32_t unpadded_len) {
+  ceph_assert(bl.length() >= unpadded_len);
+  if (bl.length() > unpadded_len) {
+    bl.splice(unpadded_len, bl.length() - unpadded_len);  // cut everything that follows the first unpadded_len bytes
+  }
+}
+
+// Discards trailing empty segments, unless there is just one segment, and returns the remaining segment count.
+// A frame always has at least one (possibly empty) segment.
+static size_t calc_num_segments(const bufferlist segment_bls[],
+                                size_t segment_count) {
+  ceph_assert(segment_count > 0 && segment_count <= MAX_NUM_SEGMENTS);
+  for (size_t i = segment_count; i-- > 0; ) {  // walk backwards from the last segment
+    if (segment_bls[i].length() > 0) {
+      return i + 1;  // i is the last non-empty segment, so i + 1 segments are kept
+    }
+  }
+  return 1;  // every segment is empty: keep a single (empty) one
+}
+
+static void check_segment_crc(const bufferlist& segment_bl,  // throws FrameError if the crc of segment_bl differs from expected_crc
+                              uint32_t expected_crc) {
+  uint32_t crc = segment_bl.crc32c(-1);  // -1 is the same initial value the asm_crc_rev*() functions pass to crc32c()
+  if (crc != expected_crc) {
+    throw FrameError(fmt::format(
+        "bad segment crc calculated={} expected={}", crc, expected_crc));
+  }
+}
+
+// Returns true if the frame is ready for dispatching, or false if
+// it was aborted by the sender and must be dropped.  Throws FrameError for any other masked value.
+static bool check_epilogue_late_status(__u8 late_status) {
+  __u8 aborted = late_status & FRAME_LATE_STATUS_ABORTED_MASK;  // only the bits selected by the mask are examined
+  if (aborted != FRAME_LATE_STATUS_ABORTED &&
+      aborted != FRAME_LATE_STATUS_COMPLETE) {
+    throw FrameError(fmt::format("bad late_status"));
+  }
+  return aborted == FRAME_LATE_STATUS_COMPLETE;
+}
+
+void FrameAssembler::fill_preamble(Tag tag,  // fills @preamble from @tag and the current segment descriptors (m_descs)
+                                   preamble_block_t& preamble) const {
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  ::memset(&preamble, 0, sizeof(preamble));  // anything not assigned below (e.g. unused segment slots) stays zero
+
+  preamble.tag = static_cast<__u8>(tag);
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    preamble.segments[i].length = m_descs[i].logical_len;
+    preamble.segments[i].alignment = m_descs[i].align;
+  }
+  preamble.num_segments = m_descs.size();
+  preamble.crc = ceph_crc32c(  // computed last, over the first sizeof(preamble) - sizeof(preamble.crc) bytes of the block
+      0, reinterpret_cast<const unsigned char*>(&preamble),
+      sizeof(preamble) - sizeof(preamble.crc));
+}
+
+uint64_t FrameAssembler::get_frame_logical_len() const {  // sum of logical_len over all segment descriptors
+  ceph_assert(!m_descs.empty());  // descriptors are populated by assemble_frame() or disassemble_preamble()
+  uint64_t logical_len = 0;
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    logical_len += m_descs[i].logical_len;
+  }
+  return logical_len;
+}
+
+uint64_t FrameAssembler::get_frame_onwire_len() const {  // on-wire length of preamble + every segment + epilogue
+  ceph_assert(!m_descs.empty());  // descriptors are populated by assemble_frame() or disassemble_preamble()
+  uint64_t onwire_len = get_preamble_onwire_len();
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    onwire_len += get_segment_onwire_len(i);
+  }
+  onwire_len += get_epilogue_onwire_len();
+  return onwire_len;
+}
+
+bufferlist FrameAssembler::asm_crc_rev0(const preamble_block_t& preamble,  // rev0 crc mode: preamble, segments, then epilogue with all crcs
+                                        bufferlist segment_bls[]) const {
+  epilogue_crc_rev0_block_t epilogue;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  ::memset(&epilogue, 0, sizeof(epilogue));  // late_flags stays 0, i.e. FRAME_LATE_FLAG_ABORTED is not set
+
+  bufferlist frame_bl(sizeof(preamble) + sizeof(epilogue));
+  frame_bl.append(reinterpret_cast<const char*>(&preamble), sizeof(preamble));
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    ceph_assert(segment_bls[i].length() == m_descs[i].logical_len);
+    epilogue.crc_values[i] = segment_bls[i].crc32c(-1);  // a crc is recorded even for an empty segment
+    if (segment_bls[i].length() > 0) {
+      frame_bl.claim_append(segment_bls[i]);
+    }
+  }
+  frame_bl.append(reinterpret_cast<const char*>(&epilogue), sizeof(epilogue));
+  return frame_bl;
+}
+
+bufferlist FrameAssembler::asm_secure_rev0(const preamble_block_t& preamble,  // rev0 secure mode: one encryption pass over the whole frame
+                                           bufferlist segment_bls[]) const {
+  bufferlist preamble_bl(sizeof(preamble));
+  preamble_bl.append(reinterpret_cast<const char*>(&preamble),
+                     sizeof(preamble));
+
+  epilogue_secure_rev0_block_t epilogue;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  ::memset(&epilogue, 0, sizeof(epilogue));
+  bufferlist epilogue_bl(sizeof(epilogue));
+  epilogue_bl.append(reinterpret_cast<const char*>(&epilogue),
+                     sizeof(epilogue));
+
+  // worst case: preamble + MAX_NUM_SEGMENTS + epilogue; only the first m_descs.size() + 2 slots are filled and passed on
+  uint32_t onwire_lens[MAX_NUM_SEGMENTS + 2];
+  onwire_lens[0] = preamble_bl.length();
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    onwire_lens[i + 1] = segment_bls[i].length();  // already padded
+  }
+  onwire_lens[m_descs.size() + 1] = epilogue_bl.length();
+  m_crypto->tx->reset_tx_handler(onwire_lens,
+                                 onwire_lens + m_descs.size() + 2);
+  m_crypto->tx->authenticated_encrypt_update(preamble_bl);
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    if (segment_bls[i].length() > 0) {  // empty segments contribute nothing to the ciphertext
+      m_crypto->tx->authenticated_encrypt_update(segment_bls[i]);
+    }
+  }
+  m_crypto->tx->authenticated_encrypt_update(epilogue_bl);
+  return m_crypto->tx->authenticated_encrypt_final();  // single finalization for preamble, segments and epilogue together
+}
+
+bufferlist FrameAssembler::asm_crc_rev1(const preamble_block_t& preamble,  // rev1 crc mode: first segment carries its own crc, the rest use the epilogue
+                                        bufferlist segment_bls[]) const {
+  epilogue_crc_rev1_block_t epilogue;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  ::memset(&epilogue, 0, sizeof(epilogue));
+  epilogue.late_status |= FRAME_LATE_STATUS_COMPLETE;
+
+  bufferlist frame_bl(sizeof(preamble) + FRAME_CRC_SIZE + sizeof(epilogue));
+  frame_bl.append(reinterpret_cast<const char*>(&preamble), sizeof(preamble));
+
+  ceph_assert(segment_bls[0].length() == m_descs[0].logical_len);
+  if (segment_bls[0].length() > 0) {  // an empty first segment produces neither payload nor crc
+    uint32_t crc = segment_bls[0].crc32c(-1);  // computed before claim_append() takes the payload
+    frame_bl.claim_append(segment_bls[0]);
+    encode(crc, frame_bl);
+  }
+  if (m_descs.size() == 1) {
+    return frame_bl;  // no epilogue if only one segment
+  }
+
+  for (size_t i = 1; i < m_descs.size(); i++) {
+    ceph_assert(segment_bls[i].length() == m_descs[i].logical_len);
+    epilogue.crc_values[i - 1] = segment_bls[i].crc32c(-1);  // slot i - 1: the epilogue has no slot for the first segment
+    if (segment_bls[i].length() > 0) {
+      frame_bl.claim_append(segment_bls[i]);
+    }
+  }
+  frame_bl.append(reinterpret_cast<const char*>(&epilogue), sizeof(epilogue));
+  return frame_bl;
+}
+
+bufferlist FrameAssembler::asm_secure_rev1(const preamble_block_t& preamble,  // rev1 secure mode: up to three separately finalized encryption passes
+                                           bufferlist segment_bls[]) const {
+  bufferlist preamble_bl;
+  if (segment_bls[0].length() > FRAME_PREAMBLE_INLINE_SIZE) {
+    // first segment is partially inlined, inline buffer is full
+    preamble_bl.reserve(sizeof(preamble));
+    preamble_bl.append(reinterpret_cast<const char*>(&preamble),
+                       sizeof(preamble));
+    segment_bls[0].splice(0, FRAME_PREAMBLE_INLINE_SIZE, &preamble_bl);  // head of the first segment moves into preamble_bl
+  } else {
+    // first segment is fully inlined, inline buffer may need padding
+    uint32_t pad_len = FRAME_PREAMBLE_INLINE_SIZE - segment_bls[0].length();
+    preamble_bl.reserve(sizeof(preamble) + pad_len);
+    preamble_bl.append(reinterpret_cast<const char*>(&preamble),
+                       sizeof(preamble));
+    preamble_bl.claim_append(segment_bls[0]);
+    if (pad_len > 0) {
+      preamble_bl.append_zero(pad_len);
+    }
+  }
+
+  m_crypto->tx->reset_tx_handler({preamble_bl.length()});  // pass 1: preamble block plus inline buffer
+  m_crypto->tx->authenticated_encrypt_update(preamble_bl);
+  auto frame_bl = m_crypto->tx->authenticated_encrypt_final();
+
+  if (segment_bls[0].length() > 0) {  // pass 2: only if part of the first segment did not fit inline
+    m_crypto->tx->reset_tx_handler({segment_bls[0].length()});
+    m_crypto->tx->authenticated_encrypt_update(segment_bls[0]);
+    auto tmp = m_crypto->tx->authenticated_encrypt_final();
+    frame_bl.claim_append(tmp);
+  }
+  if (m_descs.size() == 1) {
+    return frame_bl;  // no epilogue if only one segment
+  }
+
+  epilogue_secure_rev1_block_t epilogue;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  ::memset(&epilogue, 0, sizeof(epilogue));
+  epilogue.late_status |= FRAME_LATE_STATUS_COMPLETE;
+  bufferlist epilogue_bl(sizeof(epilogue));
+  epilogue_bl.append(reinterpret_cast<const char*>(&epilogue),
+                     sizeof(epilogue));
+
+  // worst case: MAX_NUM_SEGMENTS - 1 remaining segments + epilogue; only the first m_descs.size() slots are filled and passed on
+  uint32_t onwire_lens[MAX_NUM_SEGMENTS];
+  for (size_t i = 1; i < m_descs.size(); i++) {
+    onwire_lens[i - 1] = segment_bls[i].length();  // already padded
+  }
+  onwire_lens[m_descs.size() - 1] = epilogue_bl.length();
+  m_crypto->tx->reset_tx_handler(onwire_lens, onwire_lens + m_descs.size());  // pass 3: remaining segments plus epilogue
+  for (size_t i = 1; i < m_descs.size(); i++) {
+    if (segment_bls[i].length() > 0) {
+      m_crypto->tx->authenticated_encrypt_update(segment_bls[i]);
+    }
+  }
+  m_crypto->tx->authenticated_encrypt_update(epilogue_bl);
+  auto tmp = m_crypto->tx->authenticated_encrypt_final();
+  frame_bl.claim_append(tmp);
+  return frame_bl;
+}
+
+bufferlist FrameAssembler::assemble_frame(Tag tag, bufferlist segment_bls[],  // builds the complete on-wire frame; overwrites m_descs
+                                          const uint16_t segment_aligns[],
+                                          size_t segment_count) {
+  m_descs.resize(calc_num_segments(segment_bls, segment_count));  // trailing empty segments are not described
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    m_descs[i].logical_len = segment_bls[i].length();
+    m_descs[i].align = segment_aligns[i];
+  }
+
+  preamble_block_t preamble;
+  fill_preamble(tag, preamble);  // uses the logical (unpadded) lengths recorded above
+
+  if (m_crypto->rx) {  // NOTE(review): rx is used as the secure-mode switch on the tx path too; presumably rx and tx are always set together -- confirm
+    for (size_t i = 0; i < m_descs.size(); i++) {
+      ceph_assert(segment_bls[i].length() == m_descs[i].logical_len);
+      // We're padding segments to biggest cipher's block size. Although
+      // AES-GCM can live without that as it's a stream cipher, we don't
+      // want to be fixed to stream ciphers only.
+      uint32_t padded_len = get_segment_padded_len(i);
+      if (padded_len > segment_bls[i].length()) {
+        uint32_t pad_len = padded_len - segment_bls[i].length();
+        segment_bls[i].reserve(pad_len);
+        segment_bls[i].append_zero(pad_len);
+      }
+    }
+    if (m_is_rev1) {
+      return asm_secure_rev1(preamble, segment_bls);
+    }
+    return asm_secure_rev0(preamble, segment_bls);
+  }
+  if (m_is_rev1) {  // crc mode from here on
+    return asm_crc_rev1(preamble, segment_bls);
+  }
+  return asm_crc_rev0(preamble, segment_bls);
+}
+
+Tag FrameAssembler::disassemble_preamble(bufferlist& preamble_bl) {  // validates the preamble, fills m_descs, returns the tag; throws FrameError on bad input
+  if (m_crypto->rx) {
+    m_crypto->rx->reset_rx_handler();
+    if (m_is_rev1) {
+      ceph_assert(preamble_bl.length() == FRAME_PREAMBLE_WITH_INLINE_SIZE +
+                                          get_auth_tag_len());
+      m_crypto->rx->authenticated_decrypt_update_final(preamble_bl);  // rev1: the preamble is finalized on its own
+    } else {
+      ceph_assert(preamble_bl.length() == sizeof(preamble_block_t));
+      m_crypto->rx->authenticated_decrypt_update(preamble_bl);  // rev0: no final here; it happens in disasm_all_secure_rev0()
+    }
+  } else {
+    ceph_assert(preamble_bl.length() == sizeof(preamble_block_t));
+  }
+
+  // I expect ceph_le32 will make the endian conversion for me. Passing
+  // everything through ::Decode is unnecessary.
+  auto preamble = reinterpret_cast<const preamble_block_t*>(
+      preamble_bl.c_str());
+  // check preamble crc before any further processing
+  uint32_t crc = ceph_crc32c(
+      0, reinterpret_cast<const unsigned char*>(preamble),
+      sizeof(*preamble) - sizeof(preamble->crc));
+  if (crc != preamble->crc) {
+    throw FrameError(fmt::format(
+        "bad preamble crc calculated={} expected={}", crc, preamble->crc));
+  }
+
+  // see calc_num_segments()
+  if (preamble->num_segments < 1 ||
+      preamble->num_segments > MAX_NUM_SEGMENTS) {
+    throw FrameError(fmt::format(
+        "bad number of segments num_segments={}", preamble->num_segments));
+  }
+  if (preamble->num_segments > 1 &&
+      preamble->segments[preamble->num_segments - 1].length == 0) {
+    throw FrameError("last segment empty");  // a sender using calc_num_segments() never emits a trailing empty segment
+  }
+
+  m_descs.resize(preamble->num_segments);
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    m_descs[i].logical_len = preamble->segments[i].length;
+    m_descs[i].align = preamble->segments[i].alignment;
+  }
+  return static_cast<Tag>(preamble->tag);  // the tag value itself is not range-checked here
+}
+
+bool FrameAssembler::disasm_all_crc_rev0(bufferlist segment_bls[],  // rev0 crc mode: verifies every segment crc; throws FrameError on mismatch
+                                         bufferlist& epilogue_bl) const {
+  ceph_assert(epilogue_bl.length() == sizeof(epilogue_crc_rev0_block_t));
+  auto epilogue = reinterpret_cast<const epilogue_crc_rev0_block_t*>(
+      epilogue_bl.c_str());
+
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    ceph_assert(segment_bls[i].length() == m_descs[i].logical_len);
+    check_segment_crc(segment_bls[i], epilogue->crc_values[i]);
+  }
+  return !(epilogue->late_flags & FRAME_LATE_FLAG_ABORTED);  // true unless the aborted flag is set
+}
+
+bool FrameAssembler::disasm_all_secure_rev0(bufferlist segment_bls[],  // rev0 secure mode: decrypts segments in place, then finalizes with the epilogue
+                                            bufferlist& epilogue_bl) const {
+  for (size_t i = 0; i < m_descs.size(); i++) {
+    ceph_assert(segment_bls[i].length() == get_segment_padded_len(i));
+    if (segment_bls[i].length() > 0) {
+      m_crypto->rx->authenticated_decrypt_update(segment_bls[i]);
+      unpad_zero(segment_bls[i], m_descs[i].logical_len);  // drop the block-size padding after decryption
+    }
+  }
+
+  ceph_assert(epilogue_bl.length() == sizeof(epilogue_secure_rev0_block_t) +
+                                      get_auth_tag_len());
+  m_crypto->rx->authenticated_decrypt_update_final(epilogue_bl);
+  auto epilogue = reinterpret_cast<const epilogue_secure_rev0_block_t*>(
+      epilogue_bl.c_str());
+  return !(epilogue->late_flags & FRAME_LATE_FLAG_ABORTED);  // true unless the aborted flag is set
+}
+
+void FrameAssembler::disasm_first_crc_rev1(bufferlist& preamble_bl,  // rev1 crc mode: strips and verifies the crc trailing the first segment
+                                           bufferlist& segment_bl) const {
+  ceph_assert(preamble_bl.length() == sizeof(preamble_block_t));
+  if (m_descs[0].logical_len > 0) {
+    ceph_assert(segment_bl.length() == m_descs[0].logical_len +
+                                       FRAME_CRC_SIZE);
+    bufferlist::const_iterator it(&segment_bl, m_descs[0].logical_len);  // positioned right after the payload, at the crc
+    uint32_t expected_crc;
+    decode(expected_crc, it);
+    segment_bl.splice(m_descs[0].logical_len, FRAME_CRC_SIZE);  // remove the crc bytes so only the payload is checked
+    check_segment_crc(segment_bl, expected_crc);
+  } else {
+    ceph_assert(segment_bl.length() == 0);  // an empty first segment carries no crc either
+  }
+}
+
+bool FrameAssembler::disasm_remaining_crc_rev1(bufferlist segment_bls[],  // rev1 crc mode: verifies the crcs of segments 1..n-1
+                                               bufferlist& epilogue_bl) const {
+  ceph_assert(epilogue_bl.length() == sizeof(epilogue_crc_rev1_block_t));
+  auto epilogue = reinterpret_cast<const epilogue_crc_rev1_block_t*>(
+      epilogue_bl.c_str());
+
+  for (size_t i = 1; i < m_descs.size(); i++) {  // segment 0 is handled by disasm_first_crc_rev1()
+    ceph_assert(segment_bls[i].length() == m_descs[i].logical_len);
+    check_segment_crc(segment_bls[i], epilogue->crc_values[i - 1]);
+  }
+  return check_epilogue_late_status(epilogue->late_status);
+}
+
+void FrameAssembler::disasm_first_secure_rev1(bufferlist& preamble_bl,  // rev1 secure mode: rebuilds the first segment from inline buffer + remainder
+                                              bufferlist& segment_bl) const {
+  ceph_assert(preamble_bl.length() == FRAME_PREAMBLE_WITH_INLINE_SIZE);
+  uint32_t padded_len = get_segment_padded_len(0);
+  if (padded_len > FRAME_PREAMBLE_INLINE_SIZE) {
+    ceph_assert(segment_bl.length() == padded_len + get_auth_tag_len() -
+                                       FRAME_PREAMBLE_INLINE_SIZE);
+    m_crypto->rx->reset_rx_handler();
+    m_crypto->rx->authenticated_decrypt_update_final(segment_bl);
+    // prepend the inline buffer (already decrypted) to segment_bl
+    bufferlist tmp;
+    segment_bl.swap(tmp);
+    preamble_bl.splice(sizeof(preamble_block_t), FRAME_PREAMBLE_INLINE_SIZE,
+                       &segment_bl);
+    segment_bl.claim_append(tmp);
+  } else {
+    ceph_assert(segment_bl.length() == 0);  // fully inlined: nothing follows the preamble for this segment
+    preamble_bl.splice(sizeof(preamble_block_t), FRAME_PREAMBLE_INLINE_SIZE,
+                       &segment_bl);
+  }
+  unpad_zero(segment_bl, m_descs[0].logical_len);  // trim the padding down to the logical length
+  ceph_assert(segment_bl.length() == m_descs[0].logical_len);
+}
+
+bool FrameAssembler::disasm_remaining_secure_rev1(  // rev1 secure mode: decrypts segments 1..n-1 and the epilogue in one pass
+    bufferlist segment_bls[], bufferlist& epilogue_bl) const {
+  m_crypto->rx->reset_rx_handler();
+  for (size_t i = 1; i < m_descs.size(); i++) {  // segment 0 is handled by disasm_first_secure_rev1()
+    ceph_assert(segment_bls[i].length() == get_segment_padded_len(i));
+    if (segment_bls[i].length() > 0) {
+      m_crypto->rx->authenticated_decrypt_update(segment_bls[i]);
+      unpad_zero(segment_bls[i], m_descs[i].logical_len);  // drop the block-size padding after decryption
+    }
+  }
+
+  ceph_assert(epilogue_bl.length() == sizeof(epilogue_secure_rev1_block_t) +
+                                      get_auth_tag_len());
+  m_crypto->rx->authenticated_decrypt_update_final(epilogue_bl);
+  auto epilogue = reinterpret_cast<const epilogue_secure_rev1_block_t*>(
+      epilogue_bl.c_str());
+  return check_epilogue_late_status(epilogue->late_status);
+}
+
+void FrameAssembler::disassemble_first_segment(bufferlist& preamble_bl,  // dispatches on revision and on whether m_crypto->rx is set
+                                               bufferlist& segment_bl) const {
+  ceph_assert(!m_descs.empty());  // disassemble_preamble() must have filled the descriptors first
+  if (m_is_rev1) {
+    if (m_crypto->rx) {
+      disasm_first_secure_rev1(preamble_bl, segment_bl);
+    } else {
+      disasm_first_crc_rev1(preamble_bl, segment_bl);
+    }
+  } else {
+    // noop, everything is handled in disassemble_remaining_segments()
+  }
+}
+
+bool FrameAssembler::disassemble_remaining_segments(  // returns false if the sender aborted the frame, true otherwise
+    bufferlist segment_bls[], bufferlist& epilogue_bl) const {
+  ceph_assert(!m_descs.empty());  // disassemble_preamble() must have filled the descriptors first
+  if (m_is_rev1) {
+    if (m_descs.size() == 1) {
+      // no epilogue if only one segment
+      ceph_assert(epilogue_bl.length() == 0);
+      return true;
+    }
+    if (m_crypto->rx) {
+      return disasm_remaining_secure_rev1(segment_bls, epilogue_bl);
+    }
+    return disasm_remaining_crc_rev1(segment_bls, epilogue_bl);
+  }
+  if (m_crypto->rx) {  // rev0: all segments, including the first, are processed here
+    return disasm_all_secure_rev0(segment_bls, epilogue_bl);
+  }
+  return disasm_all_crc_rev0(segment_bls, epilogue_bl);
+}
+
+std::ostream& operator<<(std::ostream& os, const FrameAssembler& frame_asm) {  // debug dump: on-wire lengths (if known), revision, rx/tx pointers
+  if (!frame_asm.m_descs.empty()) {  // the length getters assert on an empty descriptor list, so skip them in that case
+    os << frame_asm.get_preamble_onwire_len();
+    for (size_t i = 0; i < frame_asm.m_descs.size(); i++) {
+      os << " + " << frame_asm.get_segment_onwire_len(i)
+         << " (logical " << frame_asm.m_descs[i].logical_len
+         << "/" << frame_asm.m_descs[i].align << ")";
+    }
+    os << " + " << frame_asm.get_epilogue_onwire_len() << " ";
+  }
+  os << "rev1=" << frame_asm.m_is_rev1
+     << " rx=" << frame_asm.m_crypto->rx.get()
+     << " tx=" << frame_asm.m_crypto->tx.get();
+  return os;
+}
+
+}  // namespace ceph::msgr::v2
diff --git a/src/msg/async/frames_v2.h b/src/msg/async/frames_v2.h
new file mode 100644
index 00000000..88fa4e1b
--- /dev/null
+++ b/src/msg/async/frames_v2.h
@@ -0,0 +1,842 @@
+#ifndef _MSG_ASYNC_FRAMES_V2_
+#define _MSG_ASYNC_FRAMES_V2_
+
+#include "include/types.h"
+#include "common/Clock.h"
+#include "crypto_onwire.h"
+#include <array>
+#include <iosfwd>
+#include <utility>
+
+#include <boost/container/static_vector.hpp>
+
+/**
+ * Protocol V2 Frame Structures
+ * 
+ * Documentation in: doc/dev/msgr2.rst
+ **/
+
+namespace ceph::msgr::v2 {
+
+// We require these features from any peer, period, in order to encode
+// an entity_addrvec_t.
+const uint64_t msgr2_required = CEPH_FEATUREMASK_MSG_ADDR2;
+
+// We additionally assume the peer has the below features *purely for
+// the purpose of encoding the frames themselves*.  The only complex
+// types in the frames are entity_addr_t and entity_addrvec_t, and we
+// specifically want the peer to understand the (new in nautilus)
+// TYPE_ANY.  We narrow this assumption to frames because we
+// expect there may be future clients (the kernel) that understand
+// msgr v2 and understand this encoding but don't necessarily have
+// everything else that SERVER_NAUTILUS implies.  Yes, a fresh feature
+// bit would be a cleaner approach, but those are scarce these days.
+const uint64_t msgr2_frame_assumed =
+		   msgr2_required |
+		   CEPH_FEATUREMASK_SERVER_NAUTILUS;
+
+enum class Tag : __u8 {  // frame type; stored on the wire in the one-byte preamble_block_t::tag field
+  HELLO = 1,  // numbering starts at 1; each following enumerator takes the next value
+  AUTH_REQUEST,
+  AUTH_BAD_METHOD,
+  AUTH_REPLY_MORE,
+  AUTH_REQUEST_MORE,
+  AUTH_DONE,
+  AUTH_SIGNATURE,
+  CLIENT_IDENT,
+  SERVER_IDENT,
+  IDENT_MISSING_FEATURES,
+  SESSION_RECONNECT,
+  SESSION_RESET,
+  SESSION_RETRY,
+  SESSION_RETRY_GLOBAL,
+  SESSION_RECONNECT_OK,
+  WAIT,
+  MESSAGE,
+  KEEPALIVE2,
+  KEEPALIVE2_ACK,
+  ACK
+};
+
+struct segment_t {  // on-wire descriptor of one segment, embedded in preamble_block_t::segments
+  // TODO: this will be dropped with support for `allocation policies`.
+  // We need them because of the rx_buffers zero-copy optimization.
+  static constexpr __le16 PAGE_SIZE_ALIGNMENT{4096};
+
+  static constexpr __le16 DEFAULT_ALIGNMENT = sizeof(void *);
+
+  ceph_le32 length;  // segment length as announced in the preamble
+  ceph_le16 alignment;  // alignment announced for the segment
+} __attribute__((packed));
+
+struct SegmentIndex {  // named positions into a frame's segment array
+  struct Msg {  // presumably the layout used by MESSAGE frames -- confirm against the frame definitions
+    static constexpr std::size_t HEADER = 0;
+    static constexpr std::size_t FRONT = 1;
+    static constexpr std::size_t MIDDLE = 2;
+    static constexpr std::size_t DATA = 3;
+  };
+
+  struct Control {  // presumably the layout used by single-segment control frames -- confirm
+    static constexpr std::size_t PAYLOAD = 0;
+  };
+};
+
+static constexpr uint8_t CRYPTO_BLOCK_SIZE { 16 };  // on-wire blocks are static_assert'ed to be a multiple of this
+
+static constexpr std::size_t MAX_NUM_SEGMENTS = 4;  // number of segment_t slots in one preamble_block_t
+
+// V2 preamble consists of one or more preamble blocks depending on
+// the number of segments a particular frame needs. Each block holds
+// up to MAX_NUM_SEGMENTS segments and has its own CRC.
+//
+// XXX: currently the multi-segment facility is NOT implemented.
+struct preamble_block_t {  
+  // Tag. For multi-segmented frames the value is the same
+  // between subsequent preamble blocks.
+  __u8 tag;
+
+  // Number of segments to go in entire frame. First preamble block has
+  // set this to just #segments, second to #segments - MAX_NUM_SEGMENTS,
+  // third to #segments - MAX_NUM_SEGMENTS and so on (NOTE(review): for the third, presumably 2 * MAX_NUM_SEGMENTS is meant).
+  __u8 num_segments;
+
+  std::array<segment_t, MAX_NUM_SEGMENTS> segments;
+  __u8 _reserved[2];
+
+  // CRC32 for this single preamble block.
+  ceph_le32 crc;
+} __attribute__((packed));
+static_assert(sizeof(preamble_block_t) % CRYPTO_BLOCK_SIZE == 0);  // block size must be a whole number of cipher blocks
+static_assert(std::is_standard_layout<preamble_block_t>::value);
+
+struct epilogue_crc_rev0_block_t {  // crc-mode epilogue, rev0 layout: one crc slot per possible segment
+  __u8 late_flags;  // FRAME_LATE_FLAG_ABORTED
+  std::array<ceph_le32, MAX_NUM_SEGMENTS> crc_values;
+} __attribute__((packed));
+static_assert(std::is_standard_layout_v<epilogue_crc_rev0_block_t>);
+
+struct epilogue_crc_rev1_block_t {  // crc-mode epilogue, rev1 layout: one crc slot fewer than rev0
+  __u8 late_status;  // FRAME_LATE_STATUS_*
+  ceph_le32 crc_values[MAX_NUM_SEGMENTS - 1];  // MAX_NUM_SEGMENTS - 1 slots
+} __attribute__((packed));
+static_assert(std::is_standard_layout_v<epilogue_crc_rev1_block_t>);
+
+struct epilogue_secure_rev0_block_t {  // secure-mode epilogue, rev0 layout: exactly one cipher block
+  __u8 late_flags;  // FRAME_LATE_FLAG_ABORTED
+  __u8 padding[CRYPTO_BLOCK_SIZE - sizeof(late_flags)];  // fills the struct up to CRYPTO_BLOCK_SIZE bytes
+} __attribute__((packed));
+static_assert(sizeof(epilogue_secure_rev0_block_t) % CRYPTO_BLOCK_SIZE == 0);
+static_assert(std::is_standard_layout_v<epilogue_secure_rev0_block_t>);
+
+// epilogue_secure_rev0_block_t with late_flags changed to late_status
+struct epilogue_secure_rev1_block_t {
+  __u8 late_status;  // FRAME_LATE_STATUS_*
+  __u8 padding[CRYPTO_BLOCK_SIZE - sizeof(late_status)];  // fills the struct up to CRYPTO_BLOCK_SIZE bytes
+} __attribute__((packed));
+static_assert(sizeof(epilogue_secure_rev1_block_t) % CRYPTO_BLOCK_SIZE == 0);
+static_assert(std::is_standard_layout_v<epilogue_secure_rev1_block_t>);
+
+static constexpr uint32_t FRAME_CRC_SIZE = 4;  // bytes of crc appended to the first segment on the wire
+static constexpr uint32_t FRAME_PREAMBLE_INLINE_SIZE = 48;  // size of the preamble inline buffer, in bytes
+static_assert(FRAME_PREAMBLE_INLINE_SIZE % CRYPTO_BLOCK_SIZE == 0);
+// just for performance, nothing should break otherwise
+static_assert(sizeof(ceph_msg_header2) <= FRAME_PREAMBLE_INLINE_SIZE);
+static constexpr uint32_t FRAME_PREAMBLE_WITH_INLINE_SIZE =  // preamble block immediately followed by the inline buffer
+    sizeof(preamble_block_t) + FRAME_PREAMBLE_INLINE_SIZE;
+
+// A frame can be aborted by the sender after transmitting the
+// preamble and the first segment.  The remainder of the frame
+// is filled with zeros, up until the epilogue.
+//
+// This flag is for msgr2.0.  Note that in crc mode, late_flags
+// is not covered by any crc -- a single bit flip can result in
+// a completed frame being dropped or in an aborted frame with
+// garbage segment payloads being dispatched.
+#define FRAME_LATE_FLAG_ABORTED           (1<<0)
+
+// For msgr2.1, FRAME_LATE_STATUS_ABORTED has the same meaning
+// as FRAME_LATE_FLAG_ABORTED and late_status replaces late_flags.
+// Bit error detection in crc mode is achieved by using a 4-bit
+// nibble per flag with two code words that are far apart in terms
+// of Hamming Distance (HD=4, same as provided by CRC32-C for
+// input lengths over ~5K).
+#define FRAME_LATE_STATUS_ABORTED         0x1  // 0b0001
+#define FRAME_LATE_STATUS_COMPLETE        0xe  // 0b1110, differs from ABORTED in all four bits
+#define FRAME_LATE_STATUS_ABORTED_MASK    0xf  // low nibble
+
+#define FRAME_LATE_STATUS_RESERVED_TRUE   0x10  // same two code words, shifted into the high nibble
+#define FRAME_LATE_STATUS_RESERVED_FALSE  0xe0
+#define FRAME_LATE_STATUS_RESERVED_MASK   0xf0  // high nibble
+
+struct FrameError : std::runtime_error {  // exception type for frame-level errors; adds nothing to std::runtime_error
+  using runtime_error::runtime_error;  // inherit the base-class constructors
+};
+
+class FrameAssembler {
+public:
+  // crypto must be non-null
+  FrameAssembler(const ceph::crypto::onwire::rxtx_t* crypto, bool is_rev1)
+      : m_crypto(crypto), m_is_rev1(is_rev1) {}
+
+  void set_is_rev1(bool is_rev1) {
+    m_descs.clear();
+    m_is_rev1 = is_rev1;
+  }
+
+  bool get_is_rev1() {
+    return m_is_rev1;
+  }
+
+  size_t get_num_segments() const {
+    ceph_assert(!m_descs.empty());
+    return m_descs.size();
+  }
+
+  uint32_t get_segment_logical_len(size_t seg_idx) const {
+    ceph_assert(seg_idx < m_descs.size());
+    return m_descs[seg_idx].logical_len;
+  }
+
+  uint16_t get_segment_align(size_t seg_idx) const {
+    ceph_assert(seg_idx < m_descs.size());
+    return m_descs[seg_idx].align;
+  }
+
+  // Preamble:
+  //
+  //   preamble_block_t
+  //   [preamble inline buffer + auth tag -- only in msgr2.1 secure mode]
+  //
+  // The preamble is generated unconditionally.
+  //
+  // In msgr2.1 secure mode, the first segment is inlined into the
+  // preamble inline buffer, either fully or partially.
+  uint32_t get_preamble_onwire_len() const {
+    if (m_is_rev1 && m_crypto->rx) {
+      return FRAME_PREAMBLE_WITH_INLINE_SIZE + get_auth_tag_len();
+    }
+    return sizeof(preamble_block_t);
+  }
+
+  // Segment:
+  //
+  //   segment payload
+  //   [zero padding -- only in secure mode]
+  //   [crc or auth tag -- only in msgr2.1, only for the first segment]
+  //
+  // For an empty segment, nothing is generated.  In msgr2.1 secure
+  // mode, if the first segment gets fully inlined into the preamble
+  // inline buffer, it is considered empty.
+  uint32_t get_segment_onwire_len(size_t seg_idx) const {
+    ceph_assert(seg_idx < m_descs.size());
+    if (m_crypto->rx) {
+      uint32_t padded_len = get_segment_padded_len(seg_idx);
+      if (m_is_rev1 && seg_idx == 0) {
+        if (padded_len > FRAME_PREAMBLE_INLINE_SIZE) {
+          return padded_len + get_auth_tag_len() - FRAME_PREAMBLE_INLINE_SIZE;
+        }
+        return 0;
+      }
+      return padded_len;
+    }
+    if (m_is_rev1 && seg_idx == 0 && m_descs[0].logical_len > 0) {
+      return m_descs[0].logical_len + FRAME_CRC_SIZE;
+    }
+    return m_descs[seg_idx].logical_len;
+  }
+
+  // Epilogue:
+  //
+  //   epilogue_*_block_t
+  //   [auth tag -- only in secure mode]
+  //
+  // For msgr2.0, the epilogue is generated unconditionally.  In
+  // crc mode, it stores crcs for all segments; the preamble is
+  // covered by its own crc.  In secure mode, the epilogue auth tag
+  // covers the whole frame.
+  //
+  // For msgr2.1, the epilogue is generated only if the frame has
+  // more than one segment (i.e. at least one of second to fourth
+  // segments is not empty).  In crc mode, it stores crcs for
+  // second to fourh segments; the preamble and the first segment
+  // are covered by their own crcs.  In secure mode, the epilogue
+  // auth tag covers second to fourth segments; the preamble and the
+  // first segment (if not fully inlined into the preamble inline
+  // buffer) are covered by their own auth tags.
+  //
+  // Note that the auth tag format is an implementation detail of a
+  // particular cipher.  FrameAssembler is concerned only with where
+  // the auth tag is placed (at the end of the ciphertext) and how
+  // long it is (RxHandler::get_extra_size_at_final()).  This is to
+  // provide room for other encryption algorithms: currently we use
+  // AES-128-GCM with 16-byte tags, but it is possible to switch to
+  // e.g. AES-128-CBC + HMAC-SHA512 without affecting the protocol
+  // (except for the cipher negotiation, of course).
+  //
+  // Additionally, each variant of the epilogue contains either
+  // late_flags or late_status field that directs handling of frames
+  // with more than one segment.
+  uint32_t get_epilogue_onwire_len() const {
+    ceph_assert(!m_descs.empty());
+    if (m_is_rev1 && m_descs.size() == 1) {
+      return 0;
+    }
+    if (m_crypto->rx) {
+      return (m_is_rev1 ? sizeof(epilogue_secure_rev1_block_t) :
+                  sizeof(epilogue_secure_rev0_block_t)) + get_auth_tag_len();
+    }
+    return m_is_rev1 ? sizeof(epilogue_crc_rev1_block_t) :
+                       sizeof(epilogue_crc_rev0_block_t);
+  }
+
+  uint64_t get_frame_logical_len() const;
+  uint64_t get_frame_onwire_len() const;
+
+  bufferlist assemble_frame(Tag tag, bufferlist segment_bls[],
+                            const uint16_t segment_aligns[],
+                            size_t segment_count);
+
+  Tag disassemble_preamble(bufferlist& preamble_bl);
+
+  // Like msgr1, and unlike msgr2.0, msgr2.1 allows interpreting the
+  // first segment before reading in the rest of the frame.
+  //
+  // For msgr2.1 (set_is_rev1(true)), you may:
+  //
+  // - read in the first segment
+  // - call disassemble_first_segment()
+  // - use the contents of the first segment, for example to
+  //   look up user-provided buffers based on ceph_msg_header2::tid
+  // - read in the remaining segments, possibly directly into
+  //   user-provided buffers
+  // - read in epilogue
+  // - call disassemble_remaining_segments()
+  //
+  // For msgr2.0 (set_is_rev1(false)), disassemble_first_segment() is
+  // a noop.  To accomodate, disassemble_remaining_segments() always
+  // takes all segments and skips over the first segment in msgr2.1
+  // case.  You must:
+  //
+  // - read in all segments
+  // - read in epilogue
+  // - call disassemble_remaining_segments()
+  //
+  // disassemble_remaining_segments() returns true if the frame is
+  // ready for dispatching, or false if it was aborted by the sender
+  // and must be dropped.
+  void disassemble_first_segment(bufferlist& preamble_bl,
+                                 bufferlist& segment_bl) const;
+  bool disassemble_remaining_segments(bufferlist segment_bls[],
+                                      bufferlist& epilogue_bl) const;
+
+private:
+  struct segment_desc_t {
+    uint32_t logical_len;
+    uint16_t align;
+  };
+
+  uint32_t get_segment_padded_len(size_t seg_idx) const {
+    return p2roundup<uint32_t>(m_descs[seg_idx].logical_len,
+                               CRYPTO_BLOCK_SIZE);
+  }
+
+  uint32_t get_auth_tag_len() const {
+    return m_crypto->rx->get_extra_size_at_final();
+  }
+
+  // Assembly helpers, one per combination encoded in the name:
+  // crc/secure mode and rev0/rev1 (rev1 being msgr2.1, see m_is_rev1).
+  // Each takes the preamble and the segment bufferlists and returns a
+  // bufferlist -- presumably the on-wire frame; confirm in frames_v2.cc.
+  bufferlist asm_crc_rev0(const preamble_block_t& preamble,
+                          bufferlist segment_bls[]) const;
+  bufferlist asm_secure_rev0(const preamble_block_t& preamble,
+                             bufferlist segment_bls[]) const;
+  bufferlist asm_crc_rev1(const preamble_block_t& preamble,
+                          bufferlist segment_bls[]) const;
+  bufferlist asm_secure_rev1(const preamble_block_t& preamble,
+                             bufferlist segment_bls[]) const;
+
+  // Disassembly helpers, again named after crc/secure mode and rev0/rev1.
+  // The rev0 variants take all segments plus the epilogue in one call;
+  // the rev1 variants are split into a "first" step (preamble + first
+  // segment) and a "remaining" step (segments + epilogue).  Presumably
+  // these back the public disassemble_*() entry points -- confirm in
+  // frames_v2.cc.
+  bool disasm_all_crc_rev0(bufferlist segment_bls[],
+                           bufferlist& epilogue_bl) const;
+  bool disasm_all_secure_rev0(bufferlist segment_bls[],
+                              bufferlist& epilogue_bl) const;
+  void disasm_first_crc_rev1(bufferlist& preamble_bl,
+                             bufferlist& segment_bl) const;
+  bool disasm_remaining_crc_rev1(bufferlist segment_bls[],
+                                 bufferlist& epilogue_bl) const;
+  void disasm_first_secure_rev1(bufferlist& preamble_bl,
+                                bufferlist& segment_bl) const;
+  bool disasm_remaining_secure_rev1(bufferlist segment_bls[],
+                                    bufferlist& epilogue_bl) const;
+
+  // Fills `preamble` for a frame with the given tag (defined out of line).
+  void fill_preamble(Tag tag, preamble_block_t& preamble) const;
+  friend std::ostream& operator<<(std::ostream& os,
+                                  const FrameAssembler& frame_asm);
+
+  // segment descriptors, at most MAX_NUM_SEGMENTS of them
+  boost::container::static_vector<segment_desc_t, MAX_NUM_SEGMENTS> m_descs;
+  // crypto handlers; the rx side is consulted by get_auth_tag_len()
+  const ceph::crypto::onwire::rxtx_t* m_crypto;
+  bool m_is_rev1;  // msgr2.1?
+};
+
+// Common base of the frame types below.  T is the concrete frame type and
+// must provide a static `tag`; SegmentAlignmentVs supplies one alignment
+// value per segment, which also fixes the number of segments.
+template <class T, uint16_t... SegmentAlignmentVs>
+struct Frame {
+  // number of segments == number of alignment values given
+  static constexpr size_t SegmentsNumV = sizeof...(SegmentAlignmentVs);
+  static_assert(SegmentsNumV > 0 && SegmentsNumV <= MAX_NUM_SEGMENTS);
+
+protected:
+  std::array<ceph::bufferlist, SegmentsNumV> segments;
+
+private:
+  static constexpr std::array<uint16_t, SegmentsNumV> alignments {
+    SegmentAlignmentVs...
+  };
+
+public:
+  // Hands the segments and their alignments to tx_frame_asm and returns the
+  // assembled bufferlist.  Asserts that its length matches what the
+  // assembler reports via get_frame_onwire_len().
+  ceph::bufferlist get_buffer(FrameAssembler& tx_frame_asm) {
+    auto onwire_bl = tx_frame_asm.assemble_frame(
+        T::tag, segments.data(), alignments.data(), SegmentsNumV);
+    ceph_assert(onwire_bl.length() == tx_frame_asm.get_frame_onwire_len());
+    return onwire_bl;
+  }
+};
+
+// ControlFrames are used to manage transceiver state (like connections) and
+// orchestrate transfers of MessageFrames. They use only a single segment with
+// marshalling facilities -- derived classes specify the frame structure
+// through the Args pack while ControlFrame provides the common encode/decode
+// machinery.
+//
+// C is the concrete frame type (it derives from ControlFrame<C, Args...>);
+// Args lists the payload field types in encode/decode order.
+template <class C, typename... Args>
+class ControlFrame : public Frame<C, segment_t::DEFAULT_ALIGNMENT /* single segment */> {
+protected:
+  ceph::bufferlist &get_payload_segment() {
+    return this->segments[SegmentIndex::Control::PAYLOAD];
+  }
+
+  // this tuple is only used when decoding values from a payload segment
+  std::tuple<Args...> _values;
+
+  // FIXME: for now, we assume specific features for the purposes of encoding
+  // the frames themselves (*not* messages in message frames!).
+  uint64_t features = msgr2_frame_assumed;
+
+  // Appends one field to the payload segment.  A std::vector<uint32_t> is
+  // written as a uint32_t element count followed by the elements; every
+  // other type goes through its regular encode().
+  template <typename T>
+  inline void _encode_payload_each(T &t) {
+    if constexpr (std::is_same<T, std::vector<uint32_t> const>()) {
+      encode((uint32_t)t.size(), this->get_payload_segment(), features);
+      for (const auto &elem : t) {
+        encode(elem, this->get_payload_segment(), features);
+      }
+    } else {
+      encode(t, this->get_payload_segment(), features);
+    }
+  }
+
+  // Reads one field from ti; the mirror image of _encode_payload_each().
+  template <typename T>
+  inline void _decode_payload_each(T &t, bufferlist::const_iterator &ti) const {
+    if constexpr (std::is_same<T, std::vector<uint32_t>>()) {
+      uint32_t size;
+      decode(size, ti);
+      // The count comes straight off the wire.  Each element takes
+      // sizeof(uint32_t) bytes, so a count the remaining payload cannot
+      // back would fail with end_of_buffer while decoding the elements
+      // anyway -- raise it before resize() allocates for a bogus count.
+      if (size > ti.get_remaining() / sizeof(uint32_t)) {
+        throw ceph::buffer::end_of_buffer();
+      }
+      t.resize(size);
+      for (uint32_t i = 0; i < size; ++i) {
+        decode(t[i], ti);
+      }
+    } else {
+      decode(t, ti);
+    }
+  }
+
+  // Decodes every field of _values, in Args order.  Not const: it fills
+  // _values.
+  template <std::size_t... Is>
+  inline void _decode_payload(bufferlist::const_iterator &ti,
+                              std::index_sequence<Is...>) {
+    (_decode_payload_each(std::get<Is>(_values), ti), ...);
+  }
+
+  // Reference to the N-th decoded value.
+  template <std::size_t N>
+  inline decltype(auto) get_val() {
+    return std::get<N>(_values);
+  }
+
+  ControlFrame()
+    : Frame<C, segment_t::DEFAULT_ALIGNMENT /* single segment */>() {
+  }
+
+  // Encodes args, in order, into the payload segment.
+  void _encode(const Args &... args) {
+    (_encode_payload_each(args), ...);
+  }
+
+  // Decodes all fields from the beginning of bl into _values.
+  void _decode(const ceph::bufferlist &bl) {
+    auto ti = bl.cbegin();
+    _decode_payload(ti, std::index_sequence_for<Args...>());
+  }
+
+public:
+  // Builds a frame of type C whose payload segment holds the encoded args.
+  static C Encode(const Args &... args) {
+    C c;
+    c._encode(args...);
+    return c;
+  }
+
+  // Builds a frame of type C with its values decoded from payload; decode
+  // errors propagate as exceptions from the underlying decode() calls.
+  static C Decode(const ceph::bufferlist &payload) {
+    C c;
+    c._decode(payload);
+    return c;
+  }
+};
+
+// HELLO control frame.  Payload fields, in encode/decode order:
+//   0: uint8_t        entity type
+//   1: entity_addr_t  peer address
+struct HelloFrame
+    : public ControlFrame<HelloFrame, uint8_t, entity_addr_t> {
+  static const Tag tag = Tag::HELLO;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  // accessors hand out references into the decoded value tuple
+  uint8_t &entity_type() { return get_val<0>(); }
+  entity_addr_t &peer_addr() { return get_val<1>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// AUTH_REQUEST control frame.  Payload fields, in encode/decode order:
+//   0: uint32_t               auth method
+//   1: std::vector<uint32_t>  preferred modes
+//   2: bufferlist             auth payload
+// std::vector is spelled out so this header does not rely on a
+// using-directive being in effect wherever it is included.
+struct AuthRequestFrame : public ControlFrame<AuthRequestFrame,
+                                              uint32_t, // auth method
+                                              std::vector<uint32_t>, // preferred modes
+                                              bufferlist> { // auth payload
+  static const Tag tag = Tag::AUTH_REQUEST;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  inline uint32_t &method() { return get_val<0>(); }
+  inline std::vector<uint32_t> &preferred_modes() { return get_val<1>(); }
+  inline bufferlist &auth_payload() { return get_val<2>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// AUTH_BAD_METHOD control frame.  Payload fields, in encode/decode order:
+//   0: uint32_t               method
+//   1: int32_t                result
+//   2: std::vector<uint32_t>  allowed methods
+//   3: std::vector<uint32_t>  allowed modes
+struct AuthBadMethodFrame
+    : public ControlFrame<AuthBadMethodFrame,
+                          uint32_t,
+                          int32_t,
+                          std::vector<uint32_t>,
+                          std::vector<uint32_t>> {
+  static const Tag tag = Tag::AUTH_BAD_METHOD;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  uint32_t &method() { return get_val<0>(); }
+  int32_t &result() { return get_val<1>(); }
+  std::vector<uint32_t> &allowed_methods() { return get_val<2>(); }
+  std::vector<uint32_t> &allowed_modes() { return get_val<3>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// AUTH_REPLY_MORE control frame.  Single payload field:
+//   0: bufferlist  auth payload
+struct AuthReplyMoreFrame
+    : public ControlFrame<AuthReplyMoreFrame, bufferlist> {
+  static const Tag tag = Tag::AUTH_REPLY_MORE;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  bufferlist &auth_payload() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// AUTH_REQUEST_MORE control frame.  Single payload field:
+//   0: bufferlist  auth payload
+struct AuthRequestMoreFrame
+    : public ControlFrame<AuthRequestMoreFrame, bufferlist> {
+  static const Tag tag = Tag::AUTH_REQUEST_MORE;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  bufferlist &auth_payload() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// AUTH_DONE control frame.  Payload fields, in encode/decode order:
+//   0: uint64_t    global id
+//   1: uint32_t    connection mode
+//   2: bufferlist  auth method payload
+struct AuthDoneFrame
+    : public ControlFrame<AuthDoneFrame, uint64_t, uint32_t, bufferlist> {
+  static const Tag tag = Tag::AUTH_DONE;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  uint64_t &global_id() { return get_val<0>(); }
+  uint32_t &con_mode() { return get_val<1>(); }
+  bufferlist &auth_payload() { return get_val<2>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// AUTH_SIGNATURE control frame.  Single payload field:
+//   0: sha256_digest_t  signature
+struct AuthSignatureFrame
+    : public ControlFrame<AuthSignatureFrame, sha256_digest_t> {
+  static const Tag tag = Tag::AUTH_SIGNATURE;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  sha256_digest_t &signature() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// CLIENT_IDENT control frame.  Payload fields, in encode/decode order:
+//   0: entity_addrvec_t  my addresses
+//   1: entity_addr_t     target address
+//   2: int64_t           global_id
+//   3: uint64_t          global seq
+//   4: uint64_t          supported features
+//   5: uint64_t          required features
+//   6: uint64_t          flags
+//   7: uint64_t          client cookie
+struct ClientIdentFrame
+    : public ControlFrame<ClientIdentFrame,
+                          entity_addrvec_t,
+                          entity_addr_t,
+                          int64_t,
+                          uint64_t,
+                          uint64_t,
+                          uint64_t,
+                          uint64_t,
+                          uint64_t> {
+  static const Tag tag = Tag::CLIENT_IDENT;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  entity_addrvec_t &addrs() { return get_val<0>(); }
+  entity_addr_t &target_addr() { return get_val<1>(); }
+  int64_t &gid() { return get_val<2>(); }
+  uint64_t &global_seq() { return get_val<3>(); }
+  uint64_t &supported_features() { return get_val<4>(); }
+  uint64_t &required_features() { return get_val<5>(); }
+  uint64_t &flags() { return get_val<6>(); }
+  uint64_t &cookie() { return get_val<7>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// SERVER_IDENT control frame.  Payload fields, in encode/decode order:
+//   0: entity_addrvec_t  my addresses
+//   1: int64_t           global_id
+//   2: uint64_t          global seq
+//   3: uint64_t          supported features
+//   4: uint64_t          required features
+//   5: uint64_t          flags
+//   6: uint64_t          server cookie
+struct ServerIdentFrame
+    : public ControlFrame<ServerIdentFrame,
+                          entity_addrvec_t,
+                          int64_t,
+                          uint64_t,
+                          uint64_t,
+                          uint64_t,
+                          uint64_t,
+                          uint64_t> {
+  static const Tag tag = Tag::SERVER_IDENT;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  entity_addrvec_t &addrs() { return get_val<0>(); }
+  int64_t &gid() { return get_val<1>(); }
+  uint64_t &global_seq() { return get_val<2>(); }
+  uint64_t &supported_features() { return get_val<3>(); }
+  uint64_t &required_features() { return get_val<4>(); }
+  uint64_t &flags() { return get_val<5>(); }
+  uint64_t &cookie() { return get_val<6>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// SESSION_RECONNECT control frame.  Payload fields, in encode/decode order:
+//   0: entity_addrvec_t  my addresses
+//   1: uint64_t          client cookie
+//   2: uint64_t          server cookie
+//   3: uint64_t          global sequence
+//   4: uint64_t          connect sequence
+//   5: uint64_t          message sequence
+struct ReconnectFrame
+    : public ControlFrame<ReconnectFrame,
+                          entity_addrvec_t,
+                          uint64_t,
+                          uint64_t,
+                          uint64_t,
+                          uint64_t,
+                          uint64_t> {
+  static const Tag tag = Tag::SESSION_RECONNECT;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  entity_addrvec_t &addrs() { return get_val<0>(); }
+  uint64_t &client_cookie() { return get_val<1>(); }
+  uint64_t &server_cookie() { return get_val<2>(); }
+  uint64_t &global_seq() { return get_val<3>(); }
+  uint64_t &connect_seq() { return get_val<4>(); }
+  uint64_t &msg_seq() { return get_val<5>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// SESSION_RESET control frame.  Single payload field:
+//   0: bool  full reset
+struct ResetFrame
+    : public ControlFrame<ResetFrame, bool> {
+  static const Tag tag = Tag::SESSION_RESET;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  bool &full() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// SESSION_RETRY control frame.  Single payload field:
+//   0: uint64_t  connection seq
+struct RetryFrame
+    : public ControlFrame<RetryFrame, uint64_t> {
+  static const Tag tag = Tag::SESSION_RETRY;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  uint64_t &connect_seq() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// SESSION_RETRY_GLOBAL control frame.  Single payload field:
+//   0: uint64_t  global seq
+struct RetryGlobalFrame
+    : public ControlFrame<RetryGlobalFrame, uint64_t> {
+  static const Tag tag = Tag::SESSION_RETRY_GLOBAL;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  uint64_t &global_seq() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// WAIT control frame.  It has no payload fields, hence no accessors.
+struct WaitFrame
+    : public ControlFrame<WaitFrame> {
+  static const Tag tag = Tag::WAIT;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// SESSION_RECONNECT_OK control frame.  Single payload field:
+//   0: uint64_t  message seq
+struct ReconnectOkFrame
+    : public ControlFrame<ReconnectOkFrame, uint64_t> {
+  static const Tag tag = Tag::SESSION_RECONNECT_OK;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  uint64_t &msg_seq() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// IDENT_MISSING_FEATURES control frame.  Single payload field:
+//   0: uint64_t  missing features mask
+struct IdentMissingFeaturesFrame
+    : public ControlFrame<IdentMissingFeaturesFrame, uint64_t> {
+  static const Tag tag = Tag::IDENT_MISSING_FEATURES;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  // Note: within this struct the name hides the `features` data member
+  // declared in ControlFrame.
+  uint64_t &features() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// KEEPALIVE2 control frame.  Single payload field:
+//   0: utime_t  timestamp
+struct KeepAliveFrame
+    : public ControlFrame<KeepAliveFrame, utime_t> {
+  static const Tag tag = Tag::KEEPALIVE2;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  // Convenience overload: stamps the frame with ceph_clock_now().
+  static KeepAliveFrame Encode() {
+    const auto now = ceph_clock_now();
+    return KeepAliveFrame::Encode(now);
+  }
+
+  utime_t &timestamp() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// KEEPALIVE2_ACK control frame.  Single payload field:
+//   0: utime_t  ack timestamp
+struct KeepAliveFrameAck
+    : public ControlFrame<KeepAliveFrameAck, utime_t> {
+  static const Tag tag = Tag::KEEPALIVE2_ACK;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  utime_t &timestamp() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// ACK control frame.  Single payload field:
+//   0: uint64_t  message sequence
+struct AckFrame
+    : public ControlFrame<AckFrame, uint64_t> {
+  static const Tag tag = Tag::ACK;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  uint64_t &seq() { return get_val<0>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// Fixed-capacity (at most MAX_NUM_SEGMENTS entries) list of segment
+// bufferlists.
+using segment_bls_t =
+    boost::container::static_vector<bufferlist, MAX_NUM_SEGMENTS>;
+
+// Frame carrying a message.  It has four segments, addressed through
+// SegmentIndex::Msg: HEADER (a raw ceph_msg_header2), FRONT, MIDDLE and
+// DATA.  Only the header is interpreted here; the other three segments are
+// handed through as opaque bufferlists.
+struct MessageFrame
+    : public Frame<MessageFrame,
+                   /* four segments */
+                   segment_t::DEFAULT_ALIGNMENT,
+                   segment_t::DEFAULT_ALIGNMENT,
+                   segment_t::DEFAULT_ALIGNMENT,
+                   segment_t::PAGE_SIZE_ALIGNMENT> {
+  static const Tag tag = Tag::MESSAGE;
+
+  // Builds a frame from a header and the three body bufferlists.  The header
+  // is appended as its in-memory bytes; the bufferlists are copy-assigned.
+  static MessageFrame Encode(const ceph_msg_header2 &msg_header,
+                             const ceph::bufferlist &front,
+                             const ceph::bufferlist &middle,
+                             const ceph::bufferlist &data) {
+    MessageFrame frame;
+    auto &segs = frame.segments;
+
+    const char *raw_header = reinterpret_cast<const char*>(&msg_header);
+    segs[SegmentIndex::Msg::HEADER].append(raw_header, sizeof(msg_header));
+
+    segs[SegmentIndex::Msg::FRONT] = front;
+    segs[SegmentIndex::Msg::MIDDLE] = middle;
+    segs[SegmentIndex::Msg::DATA] = data;
+
+    return frame;
+  }
+
+  // Builds a frame by moving the received segments into it.  If fewer than
+  // SegmentsNumV segments were received, the remaining ones stay empty.
+  static MessageFrame Decode(segment_bls_t& recv_segments) {
+    MessageFrame frame;
+    const std::size_t received = recv_segments.size();
+    for (std::size_t i = 0; i < received; ++i) {
+      frame.segments[i] = std::move(recv_segments[i]);
+    }
+    return frame;
+  }
+
+  // Reinterprets the start of the header segment as a ceph_msg_header2.
+  // No length check is performed on the segment.
+  const ceph_msg_header2 &header() {
+    const char *raw = segments[SegmentIndex::Msg::HEADER].c_str();
+    return *reinterpret_cast<const ceph_msg_header2*>(raw);
+  }
+
+  ceph::bufferlist &front() {
+    return segments[SegmentIndex::Msg::FRONT];
+  }
+
+  ceph::bufferlist &middle() {
+    return segments[SegmentIndex::Msg::MIDDLE];
+  }
+
+  ceph::bufferlist &data() {
+    return segments[SegmentIndex::Msg::DATA];
+  }
+
+  uint32_t front_len() const {
+    const auto &bl = segments[SegmentIndex::Msg::FRONT];
+    return bl.length();
+  }
+
+  uint32_t middle_len() const {
+    const auto &bl = segments[SegmentIndex::Msg::MIDDLE];
+    return bl.length();
+  }
+
+  uint32_t data_len() const {
+    const auto &bl = segments[SegmentIndex::Msg::DATA];
+    return bl.length();
+  }
+
+protected:
+  using Frame::Frame;
+};
+
+} // namespace ceph::msgr::v2
+
+#endif // _MSG_ASYNC_FRAMES_V2_
diff --git a/src/msg/async/net_handler.cc b/src/msg/async/net_handler.cc
new file mode 100644
index 00000000..2b4e646d
--- /dev/null
+++ b/src/msg/async/net_handler.cc
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+
+#include "net_handler.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "NetHandler "
+
+namespace ceph{
+
+/**
+ * Create a SOCK_STREAM socket through socket_cloexec().
+ *
+ * @param domain      address family handed to socket_cloexec()
+ * @param reuse_addr  when true, set SO_REUSEADDR on the new socket
+ *                    (compiled out when __FreeBSD__ is defined)
+ * @return the socket descriptor on success, a negated errno on failure;
+ *         the socket is closed again if setting SO_REUSEADDR fails
+ */
+int NetHandler::create_socket(int domain, bool reuse_addr)
+{
+  int s;
+  int r = 0;
+
+  if ((s = socket_cloexec(domain, SOCK_STREAM, 0)) == -1) {
+    r = errno;
+    lderr(cct) << __func__ << " couldn't create socket " << cpp_strerror(r) << dendl;
+    return -r;
+  }
+
+#if !defined(__FreeBSD__)
+  /* Make sure connection-intensive things like the benchmark
+   * will be able to close/open sockets a zillion of times */
+  if (reuse_addr) {
+    int on = 1;
+    if (::setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) {
+      // errno is saved before logging and close() can overwrite it
+      r = errno;
+      // cpp_strerror() like every other error path in this file; plain
+      // strerror() is not required to be thread-safe
+      lderr(cct) << __func__ << " setsockopt SO_REUSEADDR failed: "
+                 << cpp_strerror(r) << dendl;
+      close(s);
+      return -r;
+    }
+  }
+#endif
+
+  return s;
+}
+
+/**
+ * Add O_NONBLOCK to the file status flags of sd.
+ *
+ * @return 0 on success, a negated errno if either fcntl() call fails
+ */
+int NetHandler::set_nonblock(int sd)
+{
+  /* Note that fcntl(2) for F_GETFL and F_SETFL can't be
+   * interrupted by a signal, so neither call is retried. */
+  const int cur_flags = fcntl(sd, F_GETFL);
+  if (cur_flags < 0) {
+    const int err = errno;
+    lderr(cct) << __func__ << " fcntl(F_GETFL) failed: " << cpp_strerror(err) << dendl;
+    return -err;
+  }
+
+  const int new_flags = cur_flags | O_NONBLOCK;
+  if (fcntl(sd, F_SETFL, new_flags) < 0) {
+    const int err = errno;
+    lderr(cct) << __func__ << " fcntl(F_SETFL,O_NONBLOCK): " << cpp_strerror(err) << dendl;
+    return -err;
+  }
+
+  return 0;
+}
+
+/**
+ * Apply the optional TCP_NODELAY and SO_RCVBUF settings to sd and, when
+ * CEPH_USE_SO_NOSIGPIPE is defined, SO_NOSIGPIPE as well.  Failures are
+ * logged and the remaining options are still attempted.
+ *
+ * @param sd       socket descriptor
+ * @param nodelay  when true, set TCP_NODELAY
+ * @param size     when non-zero, value for SO_RCVBUF
+ * @return 0 or a negated errno describing only the LAST option that was
+ *         attempted; an earlier failure is overwritten by a later attempt
+ */
+int NetHandler::set_socket_options(int sd, bool nodelay, int size)
+{
+  int rc = 0;
+
+  // disable Nagle algorithm?
+  if (nodelay) {
+    int enable = 1;
+    rc = ::setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char*)&enable, sizeof(enable));
+    if (rc < 0) {
+      rc = errno;
+      ldout(cct, 0) << "couldn't set TCP_NODELAY: " << cpp_strerror(rc) << dendl;
+    }
+  }
+
+  // explicit receive buffer size requested?
+  if (size) {
+    rc = ::setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (void*)&size, sizeof(size));
+    if (rc < 0) {
+      rc = errno;
+      ldout(cct, 0) << "couldn't set SO_RCVBUF to " << size << ": " << cpp_strerror(rc) << dendl;
+    }
+  }
+
+  // block ESIGPIPE
+#ifdef CEPH_USE_SO_NOSIGPIPE
+  int nosigpipe = 1;
+  rc = ::setsockopt(sd, SOL_SOCKET, SO_NOSIGPIPE, (void*)&nosigpipe, sizeof(nosigpipe));
+  if (rc) {
+    rc = errno;
+    ldout(cct,0) << "couldn't set SO_NOSIGPIPE: " << cpp_strerror(rc) << dendl;
+  }
+#endif
+
+  return -rc;
+}
+
+// Best-effort: set the IP ToS / IPv6 traffic class of sd to
+// IPTOS_CLASS_CS6 (when that macro is available) and then SO_PRIORITY to
+// prio.  Failures are only logged; nothing is returned.  The whole body
+// compiles to a no-op when SO_PRIORITY is not defined.
+//
+//   sd      socket descriptor
+//   prio    value for SO_PRIORITY; a negative value disables the call
+//   domain  address family of sd, selects IP_TOS vs. IPV6_TCLASS
+void NetHandler::set_priority(int sd, int prio, int domain)
+{
+#ifdef SO_PRIORITY
+  if (prio < 0) {
+    return;
+  }
+  int r = -1;
+#ifdef IPTOS_CLASS_CS6
+  int iptos = IPTOS_CLASS_CS6;
+  switch (domain) {
+  case AF_INET:
+    r = ::setsockopt(sd, IPPROTO_IP, IP_TOS, &iptos, sizeof(iptos));
+    break;
+  case AF_INET6:
+    r = ::setsockopt(sd, IPPROTO_IPV6, IPV6_TCLASS, &iptos, sizeof(iptos));
+    break;
+  default:
+    // unknown family: bail out, which also skips SO_PRIORITY below
+    lderr(cct) << "couldn't set ToS of unknown family (" << domain << ")"
+	       << " to " << iptos << dendl;
+    return;
+  }
+  if (r < 0) {
+    r = errno;
+    ldout(cct,0) << "couldn't set TOS to " << iptos
+		 << ": " << cpp_strerror(r) << dendl;
+  }
+
+#endif	// IPTOS_CLASS_CS6
+  // setsockopt(IPTOS_CLASS_CS6) sets the priority of the socket as 0.
+  // See http://goo.gl/QWhvsD and http://goo.gl/laTbjT
+  // We need to call setsockopt(SO_PRIORITY) after it.
+  r = ::setsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio));
+  if (r < 0) {
+    r = errno;
+    ldout(cct, 0) << __func__ << " couldn't set SO_PRIORITY to " << prio
+		  << ": " << cpp_strerror(r) << dendl;
+  }
+#else
+  return;
+#endif	// SO_PRIORITY
+}
+
+/**
+ * Create a socket for addr's family, optionally make it non-blocking,
+ * apply the configured socket options, optionally bind it to bind_addr
+ * and connect it to addr.
+ *
+ * @param addr       peer address to connect to
+ * @param bind_addr  local address; used (with the port forced to 0) only
+ *                   if ms_bind_before_connect is set and the address is
+ *                   not a blank IP
+ * @param nonblock   switch the socket to non-blocking mode before connecting
+ * @return the socket descriptor on success -- or, for a non-blocking
+ *         socket, when connect() reports EINPROGRESS; otherwise a negated
+ *         errno, with the socket closed again
+ */
+int NetHandler::generic_connect(const entity_addr_t& addr, const entity_addr_t &bind_addr, bool nonblock)
+{
+  int ret;
+  const int s = create_socket(addr.get_family());
+  if (s < 0)
+    return s;
+
+  if (nonblock) {
+    ret = set_nonblock(s);
+    if (ret < 0) {
+      close(s);
+      return ret;
+    }
+  }
+
+  // the result is intentionally not checked here
+  set_socket_options(s, cct->_conf->ms_tcp_nodelay, cct->_conf->ms_tcp_rcvbuf);
+
+  {
+    // work on a copy so the port can be cleared before binding
+    entity_addr_t local_addr = bind_addr;
+    if (cct->_conf->ms_bind_before_connect && (!local_addr.is_blank_ip())) {
+      local_addr.set_port(0);
+      ret = ::bind(s, local_addr.get_sockaddr(), local_addr.get_sockaddr_len());
+      if (ret < 0) {
+        ret = errno;
+        ldout(cct, 2) << __func__ << " client bind error " << ", " << cpp_strerror(ret) << dendl;
+        close(s);
+        return -ret;
+      }
+    }
+  }
+
+  ret = ::connect(s, addr.get_sockaddr(), addr.get_sockaddr_len());
+  if (ret >= 0)
+    return s;
+
+  ret = errno;
+  // a non-blocking connect still in flight counts as success
+  if (ret == EINPROGRESS && nonblock)
+    return s;
+
+  ldout(cct, 10) << __func__ << " connect: " << cpp_strerror(ret) << dendl;
+  close(s);
+  return -ret;
+}
+
+/**
+ * Call connect() again on an existing socket.
+ *
+ * @param addr  peer address
+ * @param sd    socket descriptor
+ * @return 0 if connect() succeeded or reported EISCONN,
+ *         1 if it reported EINPROGRESS or EALREADY,
+ *         a negated errno for any other failure
+ */
+int NetHandler::reconnect(const entity_addr_t &addr, int sd)
+{
+  int r = 0;
+  int ret = ::connect(sd, addr.get_sockaddr(), addr.get_sockaddr_len());
+
+  if (ret < 0 && errno != EISCONN) {
+    // errno is saved before logging can overwrite it
+    r = errno;
+    // cpp_strerror() like the other error paths in this file; plain
+    // strerror() is not required to be thread-safe
+    ldout(cct, 10) << __func__ << " reconnect: " << cpp_strerror(r) << dendl;
+    if (r == EINPROGRESS || r == EALREADY)
+      return 1;
+    return -r;
+  }
+
+  return 0;
+}
+
+// Connect without switching the socket to non-blocking mode; forwards to
+// generic_connect() and returns its result unchanged.
+int NetHandler::connect(const entity_addr_t &addr, const entity_addr_t& bind_addr)
+{
+  const bool nonblock = false;
+  return generic_connect(addr, bind_addr, nonblock);
+}
+
+// Connect with the socket switched to non-blocking mode first; forwards to
+// generic_connect() and returns its result unchanged.
+int NetHandler::nonblock_connect(const entity_addr_t &addr, const entity_addr_t& bind_addr)
+{
+  const bool nonblock = true;
+  return generic_connect(addr, bind_addr, nonblock);
+}
+
+
+}
diff --git a/src/msg/async/net_handler.h b/src/msg/async/net_handler.h
new file mode 100644
index 00000000..19042377
--- /dev/null
+++ b/src/msg/async/net_handler.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_NET_UTILS_H
+#define CEPH_COMMON_NET_UTILS_H
+#include "common/config.h"
+
+namespace ceph {
+  /**
+   * Helper bundling the socket setup and connect calls; the CephContext
+   * given at construction supplies configuration and logging.
+   */
+  class NetHandler {
+   public:
+    explicit NetHandler(CephContext *c): cct(c) {}
+
+    /// @return the new socket descriptor, or a negated errno
+    int create_socket(int domain, bool reuse_addr=false);
+    /// @return 0 on success, or a negated errno
+    int set_nonblock(int sd);
+    /// @return 0 or a negated errno (see the definition for details)
+    int set_socket_options(int sd, bool nodelay, int size);
+    void set_priority(int sd, int priority, int domain);
+
+    /// @return the connected socket descriptor, or a negated errno
+    int connect(const entity_addr_t &addr, const entity_addr_t& bind_addr);
+    /// like connect(), but the socket is made non-blocking first
+    int nonblock_connect(const entity_addr_t &addr, const entity_addr_t& bind_addr);
+
+    /**
+     * Try to reconnect the socket.
+     *
+     * @return    0         success
+     *            > 0       just break, and wait for event
+     *            < 0       need to goto fail
+     */
+    int reconnect(const entity_addr_t &addr, int sd);
+
+   private:
+    int generic_connect(const entity_addr_t& addr, const entity_addr_t& bind_addr, bool nonblock);
+
+    CephContext *cct;
+  };
+}
+
+#endif
diff --git a/src/msg/async/rdma/Infiniband.cc b/src/msg/async/rdma/Infiniband.cc
new file mode 100644
index 00000000..34299975
--- /dev/null
+++ b/src/msg/async/rdma/Infiniband.cc
@@ -0,0 +1,1234 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "Infiniband.h"
+#include "common/errno.h"
+#include "common/debug.h"
+#include "RDMAStack.h"
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "Infiniband "
+
+// max_sge passed to create_shared_receive_queue() from Infiniband::init().
+static const uint32_t MAX_SHARED_RX_SGE_COUNT = 1;
+// Value written to cap.max_inline_data when a queue pair is created.
+static const uint32_t MAX_INLINE_DATA = 0;
+// Size, terminating NUL included, of the fixed-width text message
+// "lid:qpn:psn:peer_qpn:gid" exchanged by recv_msg()/send_msg().
+static const uint32_t TCP_MSG_LEN = sizeof("0000:00000000:00000000:00000000:00000000000000000000000000000000");
+// Depth requested for every completion queue built by create_comp_queue().
+static const uint32_t CQ_DEPTH = 30000;
+
+/**
+ * Query the attributes, LID and GID of one port of an opened device.
+ *
+ * With HAVE_IBV_EXP the GID table is searched for the entry matching
+ * ms_async_rdma_local_gid and ms_async_rdma_roce_ver; when the configured
+ * GID cannot be parsed, GID index 0 is used. Without HAVE_IBV_EXP, GID
+ * index 0 is always used. Any verbs failure aborts the process.
+ *
+ * \param[in] cct
+ *      Context used for configuration and logging.
+ * \param[in] ictxt
+ *      Verbs context of the device that owns the port.
+ * \param[in] ipn
+ *      Number of the port to query.
+ */
+Port::Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn): ctxt(ictxt), port_num(ipn), port_attr(new ibv_port_attr), gid_idx(0)
+{
+#ifdef HAVE_IBV_EXP
+  union ibv_gid cgid;
+  struct ibv_exp_gid_attr gid_attr;
+  bool malformed = false;
+
+  ldout(cct,1) << __func__ << " using experimental verbs for gid" << dendl;
+  // ibv_query_port() reports failure by returning the errno value itself,
+  // not -1, so any non-zero result is an error.
+  int r = ibv_query_port(ctxt, port_num, port_attr);
+  if (r) {
+    lderr(cct) << __func__  << " query port failed  " << cpp_strerror(r) << dendl;
+    ceph_abort();
+  }
+
+  lid = port_attr->lid;
+
+  // search for requested GID in GIDs table
+  ldout(cct, 1) << __func__ << " looking for local GID " << (cct->_conf->ms_async_rdma_local_gid)
+    << " of type " << (cct->_conf->ms_async_rdma_roce_ver) << dendl;
+  r = sscanf(cct->_conf->ms_async_rdma_local_gid.c_str(),
+             "%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx"
+             ":%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx",
+             &cgid.raw[ 0], &cgid.raw[ 1],
+             &cgid.raw[ 2], &cgid.raw[ 3],
+             &cgid.raw[ 4], &cgid.raw[ 5],
+             &cgid.raw[ 6], &cgid.raw[ 7],
+             &cgid.raw[ 8], &cgid.raw[ 9],
+             &cgid.raw[10], &cgid.raw[11],
+             &cgid.raw[12], &cgid.raw[13],
+             &cgid.raw[14], &cgid.raw[15]);
+
+  if (r != 16) {
+    ldout(cct, 1) << __func__ << " malformed or no GID supplied, using GID index 0" << dendl;
+    malformed = true;
+  }
+
+  gid_attr.comp_mask = IBV_EXP_QUERY_GID_ATTR_TYPE;
+
+  for (gid_idx = 0; gid_idx < port_attr->gid_tbl_len; gid_idx++) {
+    r = ibv_query_gid(ctxt, port_num, gid_idx, &gid);
+    if (r) {
+      lderr(cct) << __func__  << " query gid of port " << port_num << " index " << gid_idx << " failed  " << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+    r = ibv_exp_query_gid_attr(ctxt, port_num, gid_idx, &gid_attr);
+    if (r) {
+      lderr(cct) << __func__  << " query gid attributes of port " << port_num << " index " << gid_idx << " failed  " << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+
+    if (malformed) break; // stay with gid_idx=0
+    if ( (gid_attr.type == cct->_conf->ms_async_rdma_roce_ver) &&
+         (memcmp(&gid, &cgid, 16) == 0) ) {
+      ldout(cct, 1) << __func__ << " found at index " << gid_idx << dendl;
+      break;
+    }
+  }
+
+  if (gid_idx == port_attr->gid_tbl_len) {
+    lderr(cct) << __func__ << " Requested local GID was not found in GID table" << dendl;
+    ceph_abort();
+  }
+#else
+  // ibv_query_port() reports failure by returning the errno value itself,
+  // not -1, so any non-zero result is an error.
+  int r = ibv_query_port(ctxt, port_num, port_attr);
+  if (r) {
+    lderr(cct) << __func__  << " query port failed  " << cpp_strerror(r) << dendl;
+    ceph_abort();
+  }
+
+  lid = port_attr->lid;
+  r = ibv_query_gid(ctxt, port_num, 0, &gid);
+  if (r) {
+    lderr(cct) << __func__  << " query gid failed  " << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+#endif
+}
+
+
+/**
+ * Wrap an RDMA device and read its attributes.
+ *
+ * When ms_async_rdma_cm is set the caller-supplied context dc is used,
+ * otherwise the device is opened with ibv_open_device(). Every failure
+ * aborts the process.
+ *
+ * \param[in] cct
+ *      Context used for configuration and logging.
+ * \param[in] d
+ *      Device to wrap; must not be NULL.
+ * \param[in] dc
+ *      Verbs context to adopt in rdma_cm mode.
+ */
+Device::Device(CephContext *cct, ibv_device* d, struct ibv_context *dc)
+  : device(d), device_attr(new ibv_device_attr), active_port(nullptr)
+{
+  if (device == NULL) {
+    lderr(cct) << __func__ << " device == NULL" << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+  name = ibv_get_device_name(device);
+  if (cct->_conf->ms_async_rdma_cm) {
+    ctxt = dc;
+  } else {
+    ctxt = ibv_open_device(device);
+  }
+  if (ctxt == NULL) {
+    lderr(cct) << __func__ << " open rdma device failed. " << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+  // ibv_query_device() reports failure by returning the errno value itself,
+  // not -1, so any non-zero result is an error.
+  int r = ibv_query_device(ctxt, device_attr);
+  if (r) {
+    lderr(cct) << __func__ << " failed to query rdma device. " << cpp_strerror(r) << dendl;
+    ceph_abort();
+  }
+}
+
+/**
+ * Select the port numbered port_num as the active port, provided its state
+ * is IBV_PORT_ACTIVE. Ports are probed in ascending order starting at 1;
+ * every probed Port that is not selected is deleted again. Asserts if no
+ * port was selected.
+ */
+void Device::binding_port(CephContext *cct, int port_num) {
+  port_cnt = device_attr->phys_port_cnt;
+  for (uint8_t i = 0; i < port_cnt; ++i) {
+    // verbs port numbers are 1-based
+    const int candidate = i + 1;
+    Port *port = new Port(cct, ctxt, candidate);
+    const bool wanted = (candidate == port_num) &&
+                        (port->get_port_attr()->state == IBV_PORT_ACTIVE);
+    if (wanted) {
+      active_port = port;
+      ldout(cct, 1) << __func__ << " found active port " << candidate << dendl;
+      break;
+    }
+    ldout(cct, 10) << __func__ << " port " << candidate << " is not what we want. state: " << port->get_port_attr()->state << ")"<< dendl;
+    delete port;
+  }
+  if (active_port == nullptr) {
+    lderr(cct) << __func__ << "  port not found" << dendl;
+    ceph_assert(active_port);
+  }
+}
+
+
+/**
+ * Record the parameters of a queue pair. The verbs object itself is only
+ * created by init(). A random 24-bit initial packet sequence number is
+ * picked here. Aborts if type is not RC, UD or RAW_PACKET.
+ */
+Infiniband::QueuePair::QueuePair(
+    CephContext *c, Infiniband& infiniband, ibv_qp_type type,
+    int port, ibv_srq *srq,
+    Infiniband::CompletionQueue* txcq, Infiniband::CompletionQueue* rxcq,
+    uint32_t tx_queue_len, uint32_t rx_queue_len, struct rdma_cm_id *cid, uint32_t q_key)
+: cct(c), infiniband(infiniband),
+  type(type),
+  ctxt(infiniband.device->ctxt),
+  ib_physical_port(port),
+  pd(infiniband.pd->pd),
+  srq(srq),
+  qp(NULL),
+  cm_id(cid),
+  txcq(txcq),
+  rxcq(rxcq),
+  initial_psn(lrand48() & 0xffffff),
+  max_send_wr(tx_queue_len),
+  max_recv_wr(rx_queue_len),
+  q_key(q_key),
+  dead(false)
+{
+  const bool supported_type =
+    type == IBV_QPT_RC || type == IBV_QPT_UD || type == IBV_QPT_RAW_PACKET;
+  if (!supported_type) {
+    lderr(cct) << __func__ << " invalid queue pair type" << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+}
+
+/**
+ * Create the verbs queue pair and, unless rdma_cm is in use, move it from
+ * RESET to INIT.
+ *
+ * With ms_async_rdma_cm set the QP is created through rdma_create_qp() on
+ * cm_id and no state transition is done here.
+ *
+ * \return
+ *      0 on success, -1 on failure. After a failure no live QP is left
+ *      behind and qp is NULL.
+ */
+int Infiniband::QueuePair::init()
+{
+  ldout(cct, 20) << __func__ << " started." << dendl;
+  ibv_qp_init_attr qpia;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&qpia, 0, sizeof(qpia));
+  qpia.send_cq = txcq->get_cq();
+  qpia.recv_cq = rxcq->get_cq();
+  if (srq) {
+    qpia.srq = srq;                      // use the same shared receive queue
+  } else {
+    qpia.cap.max_recv_wr = max_recv_wr;
+    qpia.cap.max_recv_sge = 1;
+  }
+  qpia.cap.max_send_wr  = max_send_wr; // max outstanding send requests
+  qpia.cap.max_send_sge = 1;           // max send scatter-gather elements
+  qpia.cap.max_inline_data = MAX_INLINE_DATA;          // max bytes of immediate data on send q
+  qpia.qp_type = type;                 // RC, UC, UD, or XRC
+  qpia.sq_sig_all = 0;                 // only generate CQEs on requested WQEs
+
+  if (!cct->_conf->ms_async_rdma_cm) {
+    qp = ibv_create_qp(pd, &qpia);
+    if (qp == NULL) {
+      // Snapshot errno first: the logging calls below may overwrite it.
+      const int err = errno;
+      lderr(cct) << __func__ << " failed to create queue pair" << cpp_strerror(err) << dendl;
+      if (err == ENOMEM) {
+        lderr(cct) << __func__ << " try reducing ms_async_rdma_receive_queue_length, "
+                                  " ms_async_rdma_send_buffers or"
+                                  " ms_async_rdma_buffer_size" << dendl;
+      }
+      return -1;
+    }
+  } else {
+    ceph_assert(cm_id->verbs == pd->context);
+    if (rdma_create_qp(cm_id, pd, &qpia)) {
+      lderr(cct) << __func__ << " failed to create queue pair with rdmacm library"
+                 << cpp_strerror(errno) << dendl;
+      return -1;
+    }
+    qp = cm_id->qp;
+  }
+  ldout(cct, 20) << __func__ << " successfully create queue pair: "
+                 << "qp=" << qp << dendl;
+
+  if (cct->_conf->ms_async_rdma_cm)
+    return 0;
+
+  // move from RESET to INIT state
+  ibv_qp_attr qpa;
+  memset(&qpa, 0, sizeof(qpa));
+  qpa.qp_state   = IBV_QPS_INIT;
+  qpa.pkey_index = 0;
+  qpa.port_num   = (uint8_t)(ib_physical_port);
+  qpa.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE;
+  qpa.qkey       = q_key;
+
+  int mask = IBV_QP_STATE | IBV_QP_PORT;
+  switch (type) {
+    case IBV_QPT_RC:
+      mask |= IBV_QP_ACCESS_FLAGS;
+      mask |= IBV_QP_PKEY_INDEX;
+      break;
+    case IBV_QPT_UD:
+      mask |= IBV_QP_QKEY;
+      mask |= IBV_QP_PKEY_INDEX;
+      break;
+    case IBV_QPT_RAW_PACKET:
+      break;
+    default:
+      ceph_abort();
+  }
+
+  // ibv_modify_qp() returns the errno value on failure.
+  int ret = ibv_modify_qp(qp, &qpa, mask);
+  if (ret) {
+    lderr(cct) << __func__ << " failed to transition to INIT state: "
+               << cpp_strerror(ret) << dendl;
+    ibv_destroy_qp(qp);
+    // Do not keep a dangling handle: later users of qp must not see a
+    // destroyed queue pair.
+    qp = NULL;
+    return -1;
+  }
+  ldout(cct, 20) << __func__ << " successfully change queue pair to INIT:"
+                 << " qp=" << qp << dendl;
+  return 0;
+}
+
+/**
+ * Change the QueuePair into the ERROR state. Before destroying a Queue
+ * Pair it is necessary to move it into the Error state and poll all of
+ * the relevant Work Completions, because destroying a Queue Pair does not
+ * guarantee that its Work Completions are removed from the CQ. Even if
+ * the Work Completions are already in the CQ, it might not be possible to
+ * retrieve them. If the Queue Pair is associated with an SRQ, it is
+ * recommended to wait for the affiliated event
+ * IBV_EVENT_QP_LAST_WQE_REACHED.
+ *
+ * Calling this again after a successful transition is a no-op.
+ *
+ * \return
+ *      -errno if the QueuePair can't switch to ERROR
+ *      0 for success.
+ */
+int Infiniband::QueuePair::to_dead()
+{
+  if (dead)
+    return 0;
+  ibv_qp_attr qpa;
+  memset(&qpa, 0, sizeof(qpa));
+  qpa.qp_state = IBV_QPS_ERR;
+
+  int mask = IBV_QP_STATE;
+  // ibv_modify_qp() returns the errno value on failure; use that value
+  // rather than the global errno, which logging may have changed.
+  int ret = ibv_modify_qp(qp, &qpa, mask);
+  if (ret) {
+    lderr(cct) << __func__ << " failed to transition to ERROR state: "
+               << cpp_strerror(ret) << dendl;
+    return -ret;
+  }
+  dead = true;
+  return 0;
+}
+
+/**
+ * Query the destination QP number of this QueuePair.
+ *
+ * \param[out] rqp
+ *      Receives the destination QP number; may be NULL to only probe.
+ * \return
+ *      0 on success, -1 if the query failed.
+ */
+int Infiniband::QueuePair::get_remote_qp_number(uint32_t *rqp) const
+{
+  ibv_qp_attr qpa;
+  ibv_qp_init_attr qpia;
+
+  // ibv_query_qp() returns the errno value on failure.
+  int r = ibv_query_qp(qp, &qpa, IBV_QP_DEST_QPN, &qpia);
+  if (r) {
+    lderr(cct) << __func__ << " failed to query qp: "
+      << cpp_strerror(r) << dendl;
+    return -1;
+  }
+
+  if (rqp)
+    *rqp = qpa.dest_qp_num;
+  return 0;
+}
+
+/**
+ * Get the remote infiniband address for this QueuePair.
+ * LIDs are "local IDs" in infiniband terminology. They are short, locally
+ * routable addresses.
+ *
+ * \param[out] lid
+ *      Receives the destination LID; may be NULL to only probe.
+ * \return
+ *      0 on success, -1 if the query failed.
+ */
+int Infiniband::QueuePair::get_remote_lid(uint16_t *lid) const
+{
+  ibv_qp_attr qpa;
+  ibv_qp_init_attr qpia;
+
+  // ibv_query_qp() returns the errno value on failure.
+  int r = ibv_query_qp(qp, &qpa, IBV_QP_AV, &qpia);
+  if (r) {
+    lderr(cct) << __func__ << " failed to query qp: "
+      << cpp_strerror(r) << dendl;
+    return -1;
+  }
+
+  if (lid)
+    *lid = qpa.ah_attr.dlid;
+  return 0;
+}
+
+/**
+ * Get the state of a QueuePair.
+ *
+ * \return
+ *      The ibv_qp_state value reported by the device, or -1 if the
+ *      query failed.
+ */
+int Infiniband::QueuePair::get_state() const
+{
+  ibv_qp_attr qpa;
+  ibv_qp_init_attr qpia;
+
+  // ibv_query_qp() returns the errno value on failure.
+  int r = ibv_query_qp(qp, &qpa, IBV_QP_STATE, &qpia);
+  if (r) {
+    lderr(cct) << __func__ << " failed to get state: "
+      << cpp_strerror(r) << dendl;
+    return -1;
+  }
+  return qpa.qp_state;
+}
+
+/**
+ * Return true if the queue pair is in an error state, false otherwise.
+ * A failed query is reported as an error state too.
+ */
+bool Infiniband::QueuePair::is_error() const
+{
+  ibv_qp_attr qpa;
+  ibv_qp_init_attr qpia;
+
+  // A mask of -1 asks for every attribute. ibv_query_qp() returns the
+  // errno value on failure.
+  int r = ibv_query_qp(qp, &qpa, -1, &qpia);
+  if (r) {
+    lderr(cct) << __func__ << " failed to get state: "
+      << cpp_strerror(r) << dendl;
+    return true;
+  }
+  return qpa.cur_qp_state == IBV_QPS_ERR;
+}
+
+
+/**
+ * Build an empty completion channel wrapper; the verbs channel itself is
+ * created by init().
+ */
+Infiniband::CompletionChannel::CompletionChannel(CephContext *c, Infiniband &ib)
+  : cct(c),
+    infiniband(ib),
+    channel(NULL),
+    cq(NULL),
+    cq_events_that_need_ack(0)
+{}
+
+/**
+ * Destroy the verbs completion channel, if one was created. A failure to
+ * destroy it is logged and then asserted on.
+ */
+Infiniband::CompletionChannel::~CompletionChannel()
+{
+  if (channel) {
+    // ibv_destroy_comp_channel() returns the errno value on failure, so
+    // test for non-zero rather than negative.
+    int r = ibv_destroy_comp_channel(channel);
+    if (r)
+      lderr(cct) << __func__ << " failed to destroy cc: " << cpp_strerror(r) << dendl;
+    ceph_assert(r == 0);
+  }
+}
+
+/**
+ * Create the verbs completion channel and switch its fd to non-blocking.
+ *
+ * \return
+ *      0 on success, -1 on failure. After a failure no channel is held.
+ */
+int Infiniband::CompletionChannel::init()
+{
+  ldout(cct, 20) << __func__ << " started." << dendl;
+  channel = ibv_create_comp_channel(infiniband.device->ctxt);
+  if (!channel) {
+    lderr(cct) << __func__ << " failed to create receive completion channel: "
+                          << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+  int rc = NetHandler(cct).set_nonblock(channel->fd);
+  if (rc < 0) {
+    ibv_destroy_comp_channel(channel);
+    // Forget the handle: the destructor destroys any non-NULL channel and
+    // would otherwise destroy this one a second time.
+    channel = nullptr;
+    return -1;
+  }
+  return 0;
+}
+
+/**
+ * Acknowledge every CQ event counted so far on the bound CQ and reset
+ * the counter.
+ */
+void Infiniband::CompletionChannel::ack_events()
+{
+  const auto pending = cq_events_that_need_ack;
+  ibv_ack_cq_events(cq, pending);
+  cq_events_that_need_ack = 0;
+}
+
+/**
+ * Fetch one completion event from the channel.
+ *
+ * Events are acknowledged in batches: once MAX_ACK_EVENT of them have
+ * been counted they are acked on the CQ that delivered the latest event.
+ *
+ * \return
+ *      true if an event was retrieved, false otherwise. EAGAIN and EINTR
+ *      are not logged.
+ */
+bool Infiniband::CompletionChannel::get_cq_event()
+{
+  ibv_cq *ev_cq = NULL;
+  void *ev_ctx;
+  const int rc = ibv_get_cq_event(channel, &ev_cq, &ev_ctx);
+  if (rc) {
+    const bool transient = (errno == EAGAIN || errno == EINTR);
+    if (!transient)
+      lderr(cct) << __func__ << " failed to retrieve CQ event: "
+                 << cpp_strerror(errno) << dendl;
+    return false;
+  }
+
+  // accumulate the number of cq events that need to be acked,
+  // and periodically ack them
+  ++cq_events_that_need_ack;
+  if (cq_events_that_need_ack == MAX_ACK_EVENT) {
+    ldout(cct, 20) << __func__ << " ack aq events." << dendl;
+    ibv_ack_cq_events(ev_cq, MAX_ACK_EVENT);
+    cq_events_that_need_ack = 0;
+  }
+
+  return true;
+}
+
+
+/**
+ * Destroy the verbs completion queue, if one was created. A failure to
+ * destroy it is logged and then asserted on.
+ */
+Infiniband::CompletionQueue::~CompletionQueue()
+{
+  if (cq) {
+    // ibv_destroy_cq() returns the errno value on failure, so test for
+    // non-zero rather than negative.
+    int r = ibv_destroy_cq(cq);
+    if (r)
+      lderr(cct) << __func__ << " failed to destroy cq: " << cpp_strerror(r) << dendl;
+    ceph_assert(r == 0);
+  }
+}
+
+/**
+ * Create the verbs CQ on the device, request notification for the next
+ * completion and bind the CQ to its completion channel.
+ *
+ * \return
+ *      0 on success, -1 on failure (no CQ is held afterwards).
+ */
+int Infiniband::CompletionQueue::init()
+{
+  ibv_context *dev_ctxt = infiniband.device->ctxt;
+  cq = ibv_create_cq(dev_ctxt, queue_depth, this, channel->get_channel(), 0);
+  if (cq == nullptr) {
+    lderr(cct) << __func__ << " failed to create receive completion queue: "
+      << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+
+  const int armed = ibv_req_notify_cq(cq, 0);
+  if (armed != 0) {
+    lderr(cct) << __func__ << " ibv_req_notify_cq failed: " << cpp_strerror(errno) << dendl;
+    ibv_destroy_cq(cq);
+    cq = nullptr;
+    return -1;
+  }
+
+  channel->bind_cq(cq);
+  ldout(cct, 20) << __func__ << " successfully create cq=" << cq << dendl;
+  return 0;
+}
+
+/**
+ * Request a completion notification for the next work completion.
+ *
+ * \param[in] solicite_only
+ *      Currently ignored: notification is always requested for all
+ *      completions (solicited_only = 0).
+ * \return
+ *      0 on success, otherwise the errno value returned by
+ *      ibv_req_notify_cq().
+ */
+int Infiniband::CompletionQueue::rearm_notify(bool solicite_only)
+{
+  ldout(cct, 20) << __func__ << " started." << dendl;
+  // ibv_req_notify_cq() returns the errno value on failure, so test for
+  // non-zero rather than negative.
+  int r = ibv_req_notify_cq(cq, 0);
+  if (r)
+    lderr(cct) << __func__ << " failed to notify cq: " << cpp_strerror(r) << dendl;
+  return r;
+}
+
+/**
+ * Poll up to num_entries work completions into ret_wc_array.
+ *
+ * \return
+ *      The number of completions retrieved, or -1 on failure.
+ */
+int Infiniband::CompletionQueue::poll_cq(int num_entries, ibv_wc *ret_wc_array) {
+  const int polled = ibv_poll_cq(cq, num_entries, ret_wc_array);
+  if (polled >= 0)
+    return polled;
+
+  lderr(cct) << __func__ << " poll_completion_queue occur met error: "
+    << cpp_strerror(errno) << dendl;
+  return -1;
+}
+
+
+/**
+ * Allocate a protection domain on the given device; aborts if the
+ * allocation fails.
+ */
+Infiniband::ProtectionDomain::ProtectionDomain(CephContext *cct, Device *device)
+  : pd(ibv_alloc_pd(device->ctxt))
+{
+  if (pd != NULL)
+    return;
+
+  lderr(cct) << __func__ << " failed to allocate infiniband protection domain: " << cpp_strerror(errno) << dendl;
+  ceph_abort();
+}
+
+/**
+ * Release the protection domain. The result of ibv_dealloc_pd() is not
+ * inspected.
+ */
+Infiniband::ProtectionDomain::~ProtectionDomain()
+{
+  ibv_dealloc_pd(pd);
+}
+
+
+/**
+ * Describe a buffer of len bytes at b that is covered by memory region m.
+ * The write/read cursor starts at 0.
+ */
+Infiniband::MemoryManager::Chunk::Chunk(ibv_mr* m, uint32_t len, char* b)
+  : mr(m),
+    bytes(len),
+    offset(0),
+    buffer(b)
+{}
+
+/** Nothing to release: a Chunk does not own its buffer or memory region. */
+Infiniband::MemoryManager::Chunk::~Chunk() {}
+
+/** Move the cursor to byte o of the buffer. */
+void Infiniband::MemoryManager::Chunk::set_offset(uint32_t o)
+{
+  this->offset = o;
+}
+
+/** Current cursor position inside the buffer. */
+uint32_t Infiniband::MemoryManager::Chunk::get_offset()
+{
+  return this->offset;
+}
+
+/** Set the end of the readable data to b bytes. */
+void Infiniband::MemoryManager::Chunk::set_bound(uint32_t b)
+{
+  this->bound = b;
+}
+
+/** Make the first b bytes readable and rewind the cursor to the start. */
+void Infiniband::MemoryManager::Chunk::prepare_read(uint32_t b)
+{
+  bound = b;
+  offset = 0;
+}
+
+/** End of the readable data, in bytes from the start of the buffer. */
+uint32_t Infiniband::MemoryManager::Chunk::get_bound()
+{
+  return this->bound;
+}
+
+/**
+ * Copy up to len bytes from the cursor into buf.
+ *
+ * If fewer than len bytes remain before bound, everything that is left is
+ * copied and the chunk is reset (offset and bound both become 0).
+ *
+ * \return
+ *      The number of bytes copied.
+ */
+uint32_t Infiniband::MemoryManager::Chunk::read(char* buf, uint32_t len)
+{
+  const uint32_t available = bound - offset;
+  const char *src = buffer + offset;
+  if (len > available) {
+    // short read: hand out the remainder and mark the chunk as drained
+    memcpy(buf, src, available);
+    offset = 0;
+    bound = 0;
+    return available;
+  }
+  memcpy(buf, src, len);
+  offset += len;
+  return len;
+}
+
+/**
+ * Copy up to len bytes from buf into the buffer at the cursor.
+ *
+ * If less than len bytes of room remain, only the free room is filled and
+ * the cursor is left at the end of the buffer.
+ *
+ * \return
+ *      The number of bytes copied.
+ */
+uint32_t Infiniband::MemoryManager::Chunk::write(char* buf, uint32_t len)
+{
+  const uint32_t room = bytes - offset;
+  char *dst = buffer + offset;
+  if (len > room) {
+    memcpy(dst, buf, room);
+    offset = bytes;
+    return room;
+  }
+  memcpy(dst, buf, len);
+  offset += len;
+  return len;
+}
+
+/** True once the cursor has reached the end of the buffer. */
+bool Infiniband::MemoryManager::Chunk::full()
+{
+  return bytes == offset;
+}
+
+/** True once the cursor has reached the end of the readable data. */
+bool Infiniband::MemoryManager::Chunk::over()
+{
+  return offset == bound;
+}
+
+/** Reset the chunk: no readable data, cursor at the start. */
+void Infiniband::MemoryManager::Chunk::clear()
+{
+  bound = 0;
+  offset = 0;
+}
+
+/**
+ * Create an empty cluster of buffers of s bytes each; the memory is only
+ * allocated by fill().
+ */
+Infiniband::MemoryManager::Cluster::Cluster(MemoryManager& m, uint32_t s)
+  : manager(m),
+    buffer_size(s),
+    lock("cluster_lock")
+{}
+
+/**
+ * Deregister the memory region shared by all chunks, destroy the chunk
+ * descriptors in place and release both the descriptor array and the
+ * data buffer.
+ */
+Infiniband::MemoryManager::Cluster::~Cluster()
+{
+  // every chunk refers to the same MR, so the first one is enough
+  int r = ibv_dereg_mr(chunk_base->mr);
+  ceph_assert(r == 0);
+
+  // descriptors were built with placement new: run the destructors by hand
+  for (Chunk *c = chunk_base, *stop = chunk_base + num_chunk; c != stop; ++c) {
+    c->~Chunk();
+  }
+
+  ::free(chunk_base);
+  manager.free(base);
+}
+
+/**
+ * Allocate and register the memory backing num buffers of buffer_size
+ * bytes and build one Chunk descriptor per buffer. All chunks start out
+ * on the free list. May only be called once per cluster.
+ *
+ * Allocation or registration failures trip an assertion.
+ *
+ * \param[in] num
+ *      Number of buffers to create.
+ * \return
+ *      Always 0.
+ */
+int Infiniband::MemoryManager::Cluster::fill(uint32_t num)
+{
+  ceph_assert(!base);
+  num_chunk = num;
+  uint32_t bytes = buffer_size * num;
+
+  base = (char*)manager.malloc(bytes);
+  // validate the allocation before deriving any pointer from it
+  ceph_assert(base);
+  end = base + bytes;
+  chunk_base = static_cast<Chunk*>(::malloc(sizeof(Chunk) * num));
+  // memset() below must not be handed a NULL pointer
+  ceph_assert(chunk_base);
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(static_cast<void*>(chunk_base), 0, sizeof(Chunk) * num);
+  free_chunks.reserve(num);
+  ibv_mr* m = ibv_reg_mr(manager.pd->pd, base, bytes, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
+  ceph_assert(m);
+  Chunk* chunk = chunk_base;
+  for (uint32_t offset = 0; offset < bytes; offset += buffer_size){
+    // construct the descriptor in the pre-zeroed array
+    new(chunk) Chunk(m, buffer_size, base+offset);
+    free_chunks.push_back(chunk);
+    chunk++;
+  }
+  return 0;
+}
+
+/**
+ * Return chunks to the free list, resetting each one first. The caller's
+ * vector is left untouched.
+ */
+void Infiniband::MemoryManager::Cluster::take_back(std::vector<Chunk*> &ck)
+{
+  Mutex::Locker l(lock);
+  for (Chunk *returned : ck) {
+    returned->clear();
+    free_chunks.push_back(returned);
+  }
+}
+
+/**
+ * Hand out enough free chunks to hold bytes bytes.
+ *
+ * bytes == 0 means "give me every free chunk". If fewer chunks are free
+ * than needed, all of the free ones are handed out.
+ *
+ * \param[out] chunks
+ *      Receives the chunks (appended).
+ * \param[in] bytes
+ *      Amount of data the caller wants to store.
+ * \return
+ *      The number of chunks appended; 0 if none are free.
+ */
+int Infiniband::MemoryManager::Cluster::get_buffers(std::vector<Chunk*> &chunks, size_t bytes)
+{
+  // number of buffers needed, rounded up
+  uint32_t wanted = bytes / buffer_size + 1;
+  if (bytes % buffer_size == 0)
+    --wanted;
+
+  Mutex::Locker l(lock);
+  if (free_chunks.empty())
+    return 0;
+
+  if (bytes == 0) {
+    const int everything = free_chunks.size();
+    chunks.insert(chunks.end(), free_chunks.begin(), free_chunks.end());
+    free_chunks.clear();
+    return everything;
+  }
+
+  if (wanted > free_chunks.size())
+    wanted = free_chunks.size();
+
+  for (uint32_t remaining = wanted; remaining > 0; --remaining) {
+    chunks.push_back(free_chunks.back());
+    free_chunks.pop_back();
+  }
+  return wanted;
+}
+
+/**
+ * Check whether nbufs more receive buffers may be allocated.
+ *
+ * A non-positive ms_async_rdma_receive_buffers means there is no limit.
+ * Exceeding the limit is logged as an error.
+ */
+bool Infiniband::MemoryManager::MemPoolContext::can_alloc(unsigned nbufs)
+{
+  const bool limited = manager->cct->_conf->ms_async_rdma_receive_buffers > 0;
+  if (limited &&
+      n_bufs_allocated + nbufs > (unsigned)manager->cct->_conf->ms_async_rdma_receive_buffers) {
+    lderr(manager->cct) << __func__ << " WARNING: OUT OF RX BUFFERS: allocated: " <<
+        n_bufs_allocated << " requested: " << nbufs <<
+        " limit: " << manager->cct->_conf->ms_async_rdma_receive_buffers << dendl;
+    return false;
+  }
+
+  return true;
+}
+
+/**
+ * Attach (or detach, with nullptr) the perf counters and publish the
+ * number of buffers allocated so far.
+ */
+void Infiniband::MemoryManager::MemPoolContext::set_stat_logger(PerfCounters *logger) {
+  perf_logger = logger;
+  if (perf_logger == nullptr)
+    return;
+  perf_logger->set(l_msgr_rdma_rx_bufs_total, n_bufs_allocated);
+}
+
+/**
+ * Account for nbufs buffers being allocated (positive) or released
+ * (negative) and mirror the change into the perf counters when attached.
+ */
+void Infiniband::MemoryManager::MemPoolContext::update_stats(int nbufs)
+{
+  n_bufs_allocated += nbufs;
+
+  if (perf_logger == nullptr)
+    return;
+
+  if (nbufs <= 0) {
+    perf_logger->dec(l_msgr_rdma_rx_bufs_total, -nbufs);
+  } else {
+    perf_logger->inc(l_msgr_rdma_rx_bufs_total, nbufs);
+  }
+}
+
+/**
+ * Allocate from the boost pool while holding PoolAllocator::lock.
+ *
+ * PoolAllocator::malloc() has no way to receive a context argument, so
+ * the pool context is published through PoolAllocator::g_ctx for the
+ * duration of the call.
+ */
+void *Infiniband::MemoryManager::mem_pool::slow_malloc()
+{
+  Mutex::Locker l(PoolAllocator::lock);
+  PoolAllocator::g_ctx = ctx;
+  // this will trigger pool expansion via PoolAllocator::malloc()
+  void *block = boost::pool<PoolAllocator>::malloc();
+  PoolAllocator::g_ctx = nullptr;
+  return block;
+}
+
+// Context of the pool currently being expanded; set by mem_pool::slow_malloc()
+// around the boost::pool call and read by PoolAllocator::malloc().
+Infiniband::MemoryManager::MemPoolContext *Infiniband::MemoryManager::PoolAllocator::g_ctx = nullptr;
+// Held by slow_malloc() while g_ctx is set, and by PoolAllocator::free().
+Mutex Infiniband::MemoryManager::PoolAllocator::lock("pool-alloc-lock");
+
+/**
+ * Allocate and register one block of receive buffers.
+ *
+ * The block is laid out as a mem_info header followed by nbufs slots of
+ * sizeof(Chunk) + ms_async_rdma_buffer_size bytes each. The returned
+ * pointer is the first slot (m->chunks).
+ *
+ * The lock is taken by mem_pool::slow_malloc(), which also sets g_ctx.
+ *
+ * \param[in] bytes
+ *      Size of the block requested by the pool.
+ * \return
+ *      Pointer to the first chunk, or NULL if the buffer limit would be
+ *      exceeded, the allocation failed or the registration failed.
+ */
+char *Infiniband::MemoryManager::PoolAllocator::malloc(const size_type bytes)
+{
+  mem_info *m;
+  Chunk *ch;
+  size_t rx_buf_size;
+  unsigned nbufs;
+  MemoryManager *manager;
+  CephContext *cct;
+
+  ceph_assert(g_ctx);
+  manager     = g_ctx->manager;
+  cct         = manager->cct;
+  // one slot = Chunk descriptor immediately followed by its data area
+  rx_buf_size = sizeof(Chunk) + cct->_conf->ms_async_rdma_buffer_size;
+  nbufs       = bytes/rx_buf_size;
+
+  if (!g_ctx->can_alloc(nbufs))
+    return NULL;
+
+  // extra room in front of the slots for the mem_info bookkeeping header
+  m = static_cast<mem_info *>(manager->malloc(bytes + sizeof(*m)));
+  if (!m) {
+    lderr(cct) << __func__ << " failed to allocate " <<
+        bytes << " + " << sizeof(*m) << " bytes of memory for " << nbufs << dendl;
+    return NULL;
+  }
+
+  // only the slots are registered, not the header
+  m->mr = ibv_reg_mr(manager->pd->pd, m->chunks, bytes, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
+  if (m->mr == NULL) {
+    lderr(cct) << __func__ << " failed to register " <<
+        bytes << " + " << sizeof(*m) << " bytes of memory for " << nbufs << dendl;
+    manager->free(m);
+    return NULL;
+  }
+
+  m->nbufs = nbufs;
+  // save this chunk context
+  m->ctx   = g_ctx;
+
+  // note that the memory can be allocated before perf logger is set
+  g_ctx->update_stats(nbufs);
+
+  /* initialize chunks */
+  // NOTE(review): the descriptors are filled in field by field, no
+  // constructor runs, and `bound` is not written here - confirm that
+  // consumers set it (e.g. via prepare_read()) before reading it.
+  ch = m->chunks;
+  for (unsigned i = 0; i < nbufs; i++) {
+    ch->lkey   = m->mr->lkey;
+    ch->bytes  = cct->_conf->ms_async_rdma_buffer_size;
+    ch->offset = 0;
+    ch->buffer = ch->data; // TODO: refactor tx and remove buffer
+    // step by the slot size, not by sizeof(Chunk)
+    ch = reinterpret_cast<Chunk *>(reinterpret_cast<char *>(ch) + rx_buf_size);
+  }
+
+  return reinterpret_cast<char *>(m->chunks);
+}
+
+
+/**
+ * Release a block obtained from PoolAllocator::malloc(): update the
+ * buffer statistics, deregister the memory region and free the memory.
+ */
+void Infiniband::MemoryManager::PoolAllocator::free(char * const block)
+{
+  Mutex::Locker l(lock);
+
+  // the mem_info header sits directly in front of the block handed out
+  mem_info *info = reinterpret_cast<mem_info *>(block) - 1;
+  info->ctx->update_stats(-info->nbufs);
+  ibv_dereg_mr(info->mr);
+  info->ctx->manager->free(info);
+}
+
+/**
+ * Set up the receive buffer pool; the send pool is created separately by
+ * create_tx_pool().
+ *
+ * Each pool element is a Chunk descriptor plus ms_async_rdma_buffer_size
+ * bytes. The initial pool size is 2 * ms_async_rdma_receive_queue_len,
+ * capped at ms_async_rdma_receive_buffers when that limit is positive.
+ */
+Infiniband::MemoryManager::MemoryManager(CephContext *c, Device *d, ProtectionDomain *p)
+  : cct(c), device(d), pd(p),
+    rxbuf_pool_ctx(this),
+    rxbuf_pool(&rxbuf_pool_ctx, sizeof(Chunk) + c->_conf->ms_async_rdma_buffer_size,
+               c->_conf->ms_async_rdma_receive_buffers > 0 ?
+                  // if possible make initial pool size 2 * receive_queue_len
+                  // that way there will be no pool expansion upon receive of the
+                  // first packet.
+                  (c->_conf->ms_async_rdma_receive_buffers < 2 * c->_conf->ms_async_rdma_receive_queue_len ?
+                   c->_conf->ms_async_rdma_receive_buffers :  2 * c->_conf->ms_async_rdma_receive_queue_len) :
+                  // rx pool is infinite, we can set any initial size that we want
+                   2 * c->_conf->ms_async_rdma_receive_queue_len)
+{
+}
+
+/** Destroy the send cluster, if create_tx_pool() created one. */
+Infiniband::MemoryManager::~MemoryManager()
+{
+  // deleting a null pointer is a no-op, so no explicit check is needed
+  delete send;
+}
+
+/**
+ * Allocate size bytes, preferably from huge pages.
+ *
+ * One extra HUGE_PAGE_SIZE is reserved in front of the returned pointer.
+ * Its first size_t stores the mapped length, or 0 when the memory came
+ * from std::malloc() because mmap() failed; huge_pages_free() uses it to
+ * pick the matching release call.
+ *
+ * \return
+ *      Pointer to the usable area, or NULL if both mmap and malloc fail.
+ */
+void* Infiniband::MemoryManager::huge_pages_malloc(size_t size)
+{
+  const size_t mapped_len = ALIGN_TO_PAGE_SIZE(size + HUGE_PAGE_SIZE);
+  size_t recorded_len = mapped_len;
+  void *region = mmap(NULL, mapped_len, PROT_READ | PROT_WRITE,MAP_PRIVATE | MAP_ANONYMOUS |MAP_POPULATE | MAP_HUGETLB,-1, 0);
+  if (region == MAP_FAILED) {
+    // no huge pages available: fall back to the heap and mark it with 0
+    region = std::malloc(mapped_len);
+    if (region == NULL)
+      return NULL;
+    recorded_len = 0;
+  }
+  *static_cast<size_t *>(region) = recorded_len;
+  return static_cast<char *>(region) + HUGE_PAGE_SIZE;
+}
+
+/**
+ * Release memory obtained from huge_pages_malloc(). The length stored in
+ * front of ptr decides between munmap() (non-zero) and std::free() (0).
+ * NULL is ignored.
+ */
+void Infiniband::MemoryManager::huge_pages_free(void *ptr)
+{
+  if (ptr == NULL)
+    return;
+
+  char *region = static_cast<char *>(ptr) - HUGE_PAGE_SIZE;
+  const size_t recorded_len = *reinterpret_cast<size_t *>(region);
+  ceph_assert(recorded_len % HUGE_PAGE_SIZE == 0);
+  if (recorded_len == 0)
+    std::free(region);
+  else
+    munmap(region, recorded_len);
+}
+
+
+/**
+ * Allocate size bytes from huge pages when ms_async_rdma_enable_hugepage
+ * is set, from the heap otherwise.
+ */
+void* Infiniband::MemoryManager::malloc(size_t size)
+{
+  const bool use_hugepages = cct->_conf->ms_async_rdma_enable_hugepage;
+  return use_hugepages ? huge_pages_malloc(size) : std::malloc(size);
+}
+
+/**
+ * Release memory obtained from MemoryManager::malloc(), using the same
+ * ms_async_rdma_enable_hugepage setting to pick the release routine.
+ */
+void Infiniband::MemoryManager::free(void *ptr)
+{
+  if (!cct->_conf->ms_async_rdma_enable_hugepage) {
+    std::free(ptr);
+    return;
+  }
+  huge_pages_free(ptr);
+}
+
+/**
+ * Create the send cluster: tx_num registered buffers of size bytes each.
+ * Requires device and pd to be set.
+ */
+void Infiniband::MemoryManager::create_tx_pool(uint32_t size, uint32_t tx_num)
+{
+  ceph_assert(device);
+  ceph_assert(pd);
+
+  Cluster *tx_cluster = new Cluster(*this, size);
+  send = tx_cluster;
+  send->fill(tx_num);
+}
+
+/** Give send chunks back to the send cluster. */
+void Infiniband::MemoryManager::return_tx(std::vector<Chunk*> &chunks)
+{
+  Cluster &tx_cluster = *send;
+  tx_cluster.take_back(chunks);
+}
+
+/**
+ * Fetch send chunks for bytes bytes of data from the send cluster.
+ *
+ * \return
+ *      The number of chunks appended to c.
+ */
+int Infiniband::MemoryManager::get_send_buffers(std::vector<Chunk*> &c, size_t bytes)
+{
+  const int handed_out = send->get_buffers(c, bytes);
+  return handed_out;
+}
+
+// Set once Infiniband::verify_prereq() has completed; checked by the
+// Infiniband constructor so the checks are not repeated.
+static std::atomic<bool> init_prereq = {false};
+
+/**
+ * Perform the process-wide preparation RDMA needs:
+ *  - call ibv_fork_init() (must happen before any fork),
+ *  - export RDMAV_HUGEPAGES_SAFE when huge pages are enabled,
+ *  - warn if the memlock limit is not unlimited.
+ *
+ * Failure of either of the first two steps aborts the process. On
+ * completion init_prereq is set.
+ */
+void Infiniband::verify_prereq(CephContext *cct) {
+
+  // On RDMA ibv_fork_init() MUST be called before fork
+  int rc = ibv_fork_init();
+  if (rc) {
+    lderr(cct) << __func__ << " failed to call ibv_fork_init(). On RDMA must be called before fork. Application aborts." << dendl;
+    ceph_abort();
+  }
+
+  ldout(cct, 20) << __func__ << " ms_async_rdma_enable_hugepage value is: " << cct->_conf->ms_async_rdma_enable_hugepage <<  dendl;
+  if (cct->_conf->ms_async_rdma_enable_hugepage){
+    rc = setenv("RDMAV_HUGEPAGES_SAFE","1",1);
+    // Check the result before reading the variable back: after a failed
+    // setenv() getenv() may return NULL, which must not be streamed.
+    if (rc) {
+      lderr(cct) << __func__ << " failed to export RDMAV_HUGEPAGES_SAFE. On RDMA must be exported before using huge pages. Application aborts." << dendl;
+      ceph_abort();
+    }
+    ldout(cct, 0) << __func__ << " RDMAV_HUGEPAGES_SAFE is set as: " << getenv("RDMAV_HUGEPAGES_SAFE") <<  dendl;
+  }
+
+  // Check ulimit
+  struct rlimit limit;
+  if (getrlimit(RLIMIT_MEMLOCK, &limit) != 0) {
+    // limit is indeterminate here, so do not inspect it
+    lderr(cct) << __func__ << " failed to read RLIMIT_MEMLOCK: " << cpp_strerror(errno) << dendl;
+  } else if (limit.rlim_cur != RLIM_INFINITY || limit.rlim_max != RLIM_INFINITY) {
+    lderr(cct) << __func__ << "!!! WARNING !!! For RDMA to work properly user memlock (ulimit -l) must be big enough to allow large amount of registered memory."
+                              " We recommend setting this parameter to infinity" << dendl;
+  }
+  init_prereq = true;
+}
+
+/**
+ * Remember the configured device name and port number and make sure the
+ * process-wide prerequisites have been verified. Device setup itself is
+ * deferred to init().
+ */
+Infiniband::Infiniband(CephContext *cct)
+  : cct(cct),
+    lock("IB lock"),
+    device_name(cct->_conf->ms_async_rdma_device_name),
+    port_num( cct->_conf->ms_async_rdma_port_num)
+{
+  const bool prereq_done = init_prereq;
+  if (!prereq_done) {
+    verify_prereq(cct);
+  }
+  ldout(cct, 20) << __func__ << " constructing Infiniband..." << dendl;
+}
+
+/**
+ * One-time device setup, guarded by the object's lock; later calls
+ * return immediately.
+ *
+ * Looks up the configured device, binds the configured port, allocates
+ * the protection domain, derives the receive and send queue lengths from
+ * the device limits and the configuration, creates the memory manager
+ * with its send pool and, when SRQ support is enabled, creates the
+ * shared receive queue and posts receive buffers to it.
+ *
+ * Misconfiguration or a failed SRQ creation aborts the process.
+ */
+void Infiniband::init()
+{
+  Mutex::Locker l(lock);
+
+  if (initialized)
+    return;
+
+  device_list = new DeviceList(cct);
+  initialized = true;
+
+  device = device_list->get_device(device_name.c_str());
+  ceph_assert(device);
+  device->binding_port(cct, port_num);
+  ib_physical_port = device->active_port->get_port_num();
+  pd = new ProtectionDomain(cct, device);
+  ceph_assert(NetHandler(cct).set_nonblock(device->ctxt->async_fd) == 0);
+
+  support_srq = cct->_conf->ms_async_rdma_support_srq;
+  if (support_srq)
+    rx_queue_len = device->device_attr->max_srq_wr;
+  else
+    rx_queue_len = device->device_attr->max_qp_wr;
+  if (rx_queue_len > cct->_conf->ms_async_rdma_receive_queue_len) {
+    rx_queue_len = cct->_conf->ms_async_rdma_receive_queue_len;
+    ldout(cct, 1) << __func__ << " receive queue length is " << rx_queue_len << " receive buffers" << dendl;
+  } else {
+    ldout(cct, 0) << __func__ << " requested receive queue length " <<
+                  cct->_conf->ms_async_rdma_receive_queue_len <<
+                  " is too big. Setting " << rx_queue_len << dendl;
+  }
+
+  // check for the misconfiguration
+  if (cct->_conf->ms_async_rdma_receive_buffers > 0 &&
+      rx_queue_len > (unsigned)cct->_conf->ms_async_rdma_receive_buffers) {
+    lderr(cct) << __func__ << " rdma_receive_queue_len (" <<
+                  rx_queue_len << ") > ms_async_rdma_receive_buffers(" <<
+                  cct->_conf->ms_async_rdma_receive_buffers << ")." << dendl;
+    ceph_abort();
+  }
+
+  tx_queue_len = device->device_attr->max_qp_wr;
+  if (tx_queue_len > cct->_conf->ms_async_rdma_send_buffers) {
+    tx_queue_len = cct->_conf->ms_async_rdma_send_buffers;
+    ldout(cct, 1) << __func__ << " assigning: " << tx_queue_len << " send buffers"  << dendl;
+  } else {
+    ldout(cct, 0) << __func__ << " using the max allowed send buffers: " << tx_queue_len << dendl;
+  }
+
+  ldout(cct, 1) << __func__ << " device allow " << device->device_attr->max_cqe
+                << " completion entries" << dendl;
+
+  memory_manager = new MemoryManager(cct, device, pd);
+  memory_manager->create_tx_pool(cct->_conf->ms_async_rdma_buffer_size, tx_queue_len);
+
+  if (support_srq) {
+    srq = create_shared_receive_queue(rx_queue_len, MAX_SHARED_RX_SGE_COUNT);
+    // create_shared_receive_queue() returns NULL on error; posting to a
+    // NULL SRQ would dereference it, so fail loudly here instead.
+    if (srq == NULL) {
+      lderr(cct) << __func__ << " failed to create shared receive queue: "
+                 << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+    post_chunks_to_rq(rx_queue_len, NULL); //add to srq
+  }
+}
+
+/**
+ * Tear down what init() created: the shared receive queue (if used), the
+ * memory manager and the protection domain. Does nothing if init() never
+ * ran.
+ */
+Infiniband::~Infiniband()
+{
+  if (initialized) {
+    if (support_srq) {
+      ibv_destroy_srq(srq);
+    }
+    delete memory_manager;
+    delete pd;
+  }
+}
+
+/**
+ * Create a shared receive queue on this object's protection domain.
+ * This is a thin wrapper around ibv_create_srq().
+ *
+ * \param[in] max_wr
+ *      The max number of outstanding work requests in the SRQ.
+ * \param[in] max_sge
+ *      The max number of scatter elements per WR.
+ * \return
+ *      A valid ibv_srq pointer, or NULL on error.
+ */
+ibv_srq* Infiniband::create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge)
+{
+  ibv_srq_init_attr init_attr;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&init_attr, 0, sizeof(init_attr));
+  init_attr.attr.max_sge = max_sge;
+  init_attr.attr.max_wr = max_wr;
+  init_attr.srq_context = device->ctxt;
+
+  ibv_srq *created = ibv_create_srq(pd->pd, &init_attr);
+  return created;
+}
+
+/**
+ * Fetch send chunks for bytes bytes of data from the memory manager.
+ *
+ * \return
+ *      The number of chunks appended to c.
+ */
+int Infiniband::get_tx_buffers(std::vector<Chunk*> &c, size_t bytes)
+{
+  const int handed_out = memory_manager->get_send_buffers(c, bytes);
+  return handed_out;
+}
+
+/**
+ * Create and initialise a new QueuePair. Prefer this factory over calling
+ * the QueuePair constructor directly, so that derivatives of Infiniband
+ * (e.g. a mock, if one existed) can return their own QueuePair
+ * derivatives.
+ *
+ * \return
+ *      QueuePair on success or NULL if init fails
+ * See QueuePair::QueuePair for parameter documentation.
+ */
+Infiniband::QueuePair* Infiniband::create_queue_pair(CephContext *cct, CompletionQueue *tx,
+    CompletionQueue* rx, ibv_qp_type type, struct rdma_cm_id *cm_id)
+{
+  Infiniband::QueuePair *pair = new QueuePair(
+      cct, *this, type, ib_physical_port, srq, tx, rx, tx_queue_len, rx_queue_len, cm_id);
+  if (pair->init() == 0)
+    return pair;
+
+  delete pair;
+  return NULL;
+}
+
+/**
+ * Take up to num receive buffers from the memory manager and post them
+ * as one chained list of receive work requests, either to the shared
+ * receive queue (when SRQ support is enabled) or to qp.
+ *
+ * If the pool runs dry, whatever was obtained is still posted. A failed
+ * post trips an assertion.
+ *
+ * \param[in] num
+ *      Number of buffers to post; values <= 0 post nothing.
+ * \param[in] qp
+ *      Queue pair to post to; only used (and required) without SRQ.
+ * \return
+ *      The number of buffers actually posted.
+ */
+int Infiniband::post_chunks_to_rq(int num, ibv_qp *qp)
+{
+  // Nothing to do; also keeps an empty/uninitialised request list from
+  // ever being handed to the verbs layer.
+  if (num <= 0)
+    return 0;
+
+  // Heap-backed and sized once up front: element addresses stay stable,
+  // which the `next`/`sg_list` pointers below rely on. Value
+  // initialisation zeroes every element.
+  std::vector<ibv_sge> isge(num);
+  std::vector<ibv_recv_wr> rx_work_request(num);
+
+  int i = 0;
+  while (i < num) {
+    Chunk *chunk = get_memory_manager()->get_rx_buffer();
+    if (chunk == NULL) {
+      lderr(cct) << __func__ << " WARNING: out of memory. Requested " << num <<
+        " rx buffers. Got " << i << dendl;
+      if (i == 0)
+        return 0;
+      // if we got some buffers post them and hope for the best
+      rx_work_request[i-1].next = 0;
+      break;
+    }
+
+    isge[i].addr = reinterpret_cast<uint64_t>(chunk->data);
+    isge[i].length = chunk->bytes;
+    isge[i].lkey = chunk->lkey;
+
+    rx_work_request[i].wr_id = reinterpret_cast<uint64_t>(chunk);// stash descriptor ptr
+    if (i == num - 1) {
+      rx_work_request[i].next = 0;
+    } else {
+      rx_work_request[i].next = &rx_work_request[i+1];
+    }
+    rx_work_request[i].sg_list = &isge[i];
+    rx_work_request[i].num_sge = 1;
+    i++;
+  }
+
+  int ret;
+  ibv_recv_wr *badworkrequest = nullptr;
+  if (support_srq) {
+    ret = ibv_post_srq_recv(srq, &rx_work_request[0], &badworkrequest);
+    ceph_assert(ret == 0);
+  } else {
+    ceph_assert(qp);
+    ret = ibv_post_recv(qp, &rx_work_request[0], &badworkrequest);
+    ceph_assert(ret == 0);
+  }
+  return i;
+}
+
+// Allocate a CompletionChannel bound to this Infiniband instance and
+// initialize it. Returns the channel, or NULL (after deleting it) when
+// init() reports failure with a non-zero value.
+Infiniband::CompletionChannel* Infiniband::create_comp_channel(CephContext *c)
+{
+  CompletionChannel *channel = new Infiniband::CompletionChannel(c, *this);
+  const bool init_failed = channel->init() != 0;
+  if (!init_failed)
+    return channel;
+
+  delete channel;
+  return NULL;
+}
+
+// Allocate a CompletionQueue of depth CQ_DEPTH attached to completion channel
+// `cc` and initialize it. Returns the queue, or NULL (after deleting it) when
+// init() reports failure with a non-zero value.
+Infiniband::CompletionQueue* Infiniband::create_comp_queue(
+    CephContext *cct, CompletionChannel *cc)
+{
+  CompletionQueue *queue =
+      new Infiniband::CompletionQueue(cct, *this, CQ_DEPTH, cc);
+  const bool init_failed = queue->init() != 0;
+  if (!init_failed)
+    return queue;
+
+  delete queue;
+  return NULL;
+}
+
+// Read one fixed-size (TCP_MSG_LEN) handshake message from socket `sd` and
+// decode it into `im`.
+//
+// Returns:
+//   > 0  number of bytes read (always TCP_MSG_LEN); `im` has been filled in
+//   0    the peer sent a zero-length (disconnect) message
+//   < 0  negative errno-style code: read error, injected failure, or a
+//        short / malformed message (-EINVAL)
+int Infiniband::recv_msg(CephContext *cct, int sd, IBSYNMsg& im)
+{
+  char msg[TCP_MSG_LEN];
+  char gid[33];
+  ssize_t r = ::read(sd, &msg, sizeof(msg));
+  // Capture errno right away: the calls below may overwrite it.
+  const int read_errno = errno;
+  // Drop incoming qpt
+  if (cct->_conf->ms_inject_socket_failures && sd >= 0) {
+    if (rand() % cct->_conf->ms_inject_socket_failures == 0) {
+      ldout(cct, 0) << __func__ << " injecting socket failure" << dendl;
+      return -EINVAL;
+    }
+  }
+  if (r < 0) {
+    r = -read_errno;
+    lderr(cct) << __func__ << " got error " << r << ": "
+               << cpp_strerror(r) << dendl;
+  } else if (r == 0) { // valid disconnect message of length 0
+    ldout(cct, 10) << __func__ << " got disconnect message " << dendl;
+  } else if ((size_t)r != sizeof(msg)) { // invalid message
+    ldout(cct, 1) << __func__ << " got bad length (" << r << ") " << dendl;
+    r = -EINVAL;
+  } else { // full-size message
+    // The bytes come from the peer, so do not trust them: terminate the
+    // text, bound the gid field to its buffer and verify every field parsed.
+    // A message written by send_msg() already ends in NUL within the buffer,
+    // so forcing the last byte does not alter a well-formed message.
+    msg[sizeof(msg) - 1] = '\0';
+
+    // Scan into aligned locals; IBSYNMsg is packed, so pointers to its
+    // members may be misaligned.
+    unsigned short lid = 0;
+    unsigned int qpn = 0, psn = 0, peer_qpn = 0;
+    const int fields = sscanf(msg, "%hx:%x:%x:%x:%32s",
+                              &lid, &qpn, &psn, &peer_qpn, gid);
+    if (fields != 5 || strlen(gid) != 32) {
+      ldout(cct, 1) << __func__ << " got malformed message" << dendl;
+      return -EINVAL;
+    }
+
+    union ibv_gid decoded_gid;
+    wire_gid_to_gid(gid, &decoded_gid);
+
+    im.lid = lid;
+    im.qpn = qpn;
+    im.psn = psn;
+    im.peer_qpn = peer_qpn;
+    im.gid = decoded_gid;
+    ldout(cct, 5) << __func__ << " recevd: " << lid << ", " << qpn << ", " << psn << ", " << peer_qpn << ", " << gid  << dendl;
+  }
+  return r;
+}
+
+// Format `im` as the fixed-size (TCP_MSG_LEN) textual handshake message and
+// write it to socket `sd`.
+//
+// A write that fails with EINTR or EAGAIN is retried up to three times.
+// Returns 0 when the whole message was written, otherwise a negative
+// errno-style code: -EINVAL for an injected failure, -errno for a failed
+// write, -EIO for a short write.
+int Infiniband::send_msg(CephContext *cct, int sd, IBSYNMsg& im)
+{
+  // Zero the buffer so bytes after the formatted text are not stack garbage.
+  char msg[TCP_MSG_LEN] = {0};
+  char gid[33];
+
+  // IBSYNMsg is packed: copy the gid so the helper gets an aligned object.
+  const union ibv_gid local_gid = im.gid;
+  gid_to_wire_gid(&local_gid, gid);
+  snprintf(msg, sizeof(msg), "%04x:%08x:%08x:%08x:%s", im.lid, im.qpn, im.psn, im.peer_qpn, gid);
+
+  for (int retry = 0; ; ++retry) {
+    ldout(cct, 10) << __func__ << " sending: " << im.lid << ", " << im.qpn << ", " << im.psn
+                   << ", " << im.peer_qpn << ", "  << gid  << dendl;
+    const ssize_t r = ::write(sd, msg, sizeof(msg));
+    // Capture errno right away: the calls below may overwrite it.
+    const int write_errno = errno;
+    // Drop incoming qpt
+    if (cct->_conf->ms_inject_socket_failures && sd >= 0) {
+      if (rand() % cct->_conf->ms_inject_socket_failures == 0) {
+        ldout(cct, 0) << __func__ << " injecting socket failure" << dendl;
+        return -EINVAL;
+      }
+    }
+
+    if ((size_t)r == sizeof(msg))
+      return 0;
+
+    // FIXME need to handle EAGAIN instead of retry
+    if (r < 0 && (write_errno == EINTR || write_errno == EAGAIN) && retry < 3)
+      continue;
+
+    if (r < 0) {
+      lderr(cct) << __func__ << " send returned error " << write_errno << ": "
+                 << cpp_strerror(write_errno) << dendl;
+      return -write_errno;
+    }
+    // Short write: errno is not meaningful here, so report a fixed error
+    // instead of whatever value errno happens to hold (possibly 0).
+    lderr(cct) << __func__ << " send got bad length (" << r << ") " << dendl;
+    return -EIO;
+  }
+}
+
+// Decode the 32-hex-digit wire form of a GID (four groups of 8 digits, as
+// produced by gid_to_wire_gid()) into gid->raw.
+//
+// `wgid` must provide at least 32 readable characters. A group that does not
+// parse as hex decodes as 0.
+void Infiniband::wire_gid_to_gid(const char *wgid, union ibv_gid *gid)
+{
+  char tmp[9];
+  tmp[8] = 0;
+
+  for (int i = 0; i < 4; ++i) {
+    // Initialized so a failed sscanf cannot leave an indeterminate value.
+    uint32_t v32 = 0;
+    memcpy(tmp, wgid + i * 8, 8);
+    sscanf(tmp, "%x", &v32);
+    // Store bytewise: callers may pass a gid embedded in a packed struct,
+    // which is not guaranteed to be 4-byte aligned.
+    const uint32_t word = ntohl(v32);
+    memcpy(&gid->raw[i * 4], &word, sizeof(word));
+  }
+}
+
+// Encode gid->raw as 32 lowercase hex digits (four groups of 8) followed by a
+// terminating NUL. `wgid` must have room for 33 characters.
+void Infiniband::gid_to_wire_gid(const union ibv_gid *gid, char wgid[])
+{
+  for (int i = 0; i < 4; ++i) {
+    // Load bytewise: callers may pass a gid embedded in a packed struct,
+    // which is not guaranteed to be 4-byte aligned.
+    uint32_t word;
+    memcpy(&word, gid->raw + i * 4, sizeof(word));
+    sprintf(&wgid[i * 8], "%08x", htonl(word));
+  }
+}
+
+// Destroys the verbs queue pair if one was created; a failing
+// ibv_destroy_qp() trips ceph_assert.
+Infiniband::QueuePair::~QueuePair()
+{
+  if (!qp)
+    return;
+
+  ldout(cct, 20) << __func__ << " destroy qp=" << qp << dendl;
+  ceph_assert(!ibv_destroy_qp(qp));
+}
+
+/**
+ * Translate the `status' field of a Verbs `ibv_wc' into a printable name.
+ *
+ * \param[in] status
+ *      The integer status obtained in ibv_wc.status.
+ * \return
+ *      The name for the given status, or "<status out of range!>" when the
+ *      value lies outside IBV_WC_SUCCESS..IBV_WC_GENERAL_ERR.
+ */
+const char* Infiniband::wc_status_to_string(int status)
+{
+  // Indexed directly by the status value.
+  static const char *lookup[] = {
+      "SUCCESS",
+      "LOC_LEN_ERR",
+      "LOC_QP_OP_ERR",
+      "LOC_EEC_OP_ERR",
+      "LOC_PROT_ERR",
+      "WR_FLUSH_ERR",
+      "MW_BIND_ERR",
+      "BAD_RESP_ERR",
+      "LOC_ACCESS_ERR",
+      "REM_INV_REQ_ERR",
+      "REM_ACCESS_ERR",
+      "REM_OP_ERR",
+      "RETRY_EXC_ERR",
+      "RNR_RETRY_EXC_ERR",
+      "LOC_RDD_VIOL_ERR",
+      "REM_INV_RD_REQ_ERR",
+      "REM_ABORT_ERR",
+      "INV_EECN_ERR",
+      "INV_EEC_STATE_ERR",
+      "FATAL_ERR",
+      "RESP_TIMEOUT_ERR",
+      "GENERAL_ERR"
+  };
+
+  const bool in_range =
+      status >= IBV_WC_SUCCESS && status <= IBV_WC_GENERAL_ERR;
+  return in_range ? lookup[status] : "<status out of range!>";
+}
+
+// Map an ibv_qp_state value to its enumerator name; any value not listed
+// below yields " out of range.".
+const char* Infiniband::qp_state_string(int status) {
+  static const struct {
+    int state;
+    const char *name;
+  } known_states[] = {
+    { IBV_QPS_RESET, "IBV_QPS_RESET" },
+    { IBV_QPS_INIT,  "IBV_QPS_INIT" },
+    { IBV_QPS_RTR,   "IBV_QPS_RTR" },
+    { IBV_QPS_RTS,   "IBV_QPS_RTS" },
+    { IBV_QPS_SQD,   "IBV_QPS_SQD" },
+    { IBV_QPS_SQE,   "IBV_QPS_SQE" },
+    { IBV_QPS_ERR,   "IBV_QPS_ERR" },
+  };
+
+  for (const auto &entry : known_states) {
+    if (entry.state == status)
+      return entry.name;
+  }
+  return " out of range.";
+}
diff --git a/src/msg/async/rdma/Infiniband.h b/src/msg/async/rdma/Infiniband.h
new file mode 100644
index 00000000..2889cdfc
--- /dev/null
+++ b/src/msg/async/rdma/Infiniband.h
@@ -0,0 +1,529 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_INFINIBAND_H
+#define CEPH_INFINIBAND_H
+
+#include <boost/pool/pool.hpp>
+// need this because boost messes with ceph log/assert definitions
+#include "include/ceph_assert.h"
+
+#include <infiniband/verbs.h>
+#include <rdma/rdma_cma.h>
+
+#include <atomic>
+#include <string>
+#include <vector>
+
+#include "include/int_types.h"
+#include "include/page.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Mutex.h"
+#include "common/perf_counters.h"
+#include "msg/msg_types.h"
+#include "msg/async/net_handler.h"
+
+#define HUGE_PAGE_SIZE (2 * 1024 * 1024)
+#define ALIGN_TO_PAGE_SIZE(x) \
+  (((x) + HUGE_PAGE_SIZE -1) / HUGE_PAGE_SIZE * HUGE_PAGE_SIZE)
+
+struct IBSYNMsg {  /* Queue pair parameters exchanged during the handshake:
+                    * Infiniband::send_msg() writes these fields as text and
+                    * Infiniband::recv_msg() parses them back. The struct is
+                    * packed, so members after `lid` may be unaligned. */
+  uint16_t lid;       // sender's LID
+  uint32_t qpn;       // sender's queue pair number
+  uint32_t psn;       // sender's initial packet sequence number
+  uint32_t peer_qpn;  // qpn the sender has learned from its peer; 0 in the client's first message
+  union ibv_gid gid;  // sender's GID
+} __attribute__((packed));
+
+class RDMAStack;
+class CephContext;
+
+class Port {  /* Per-port data of a verbs device context: port number, port
+               * attributes, LID, and the GID together with its index. The
+               * constructor that fills these in is defined elsewhere. */
+  struct ibv_context* ctxt;
+  int port_num;
+  struct ibv_port_attr* port_attr;  // NOTE(review): ownership/lifetime not visible in this header
+  uint16_t lid;
+  int gid_idx = 0;
+  union ibv_gid gid;
+
+ public:
+  explicit Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn);
+  uint16_t get_lid() { return lid; }
+  ibv_gid  get_gid() { return gid; }  // returned by value
+  int get_port_num() { return port_num; }
+  ibv_port_attr* get_port_attr() { return port_attr; }
+  int get_gid_idx() { return gid_idx; }
+};
+
+
+class Device {  /* One RDMA device together with its opened verbs context and
+                 * the port selected through binding_port(). The LID/GID
+                 * getters forward to active_port and therefore require it to
+                 * be set. */
+  ibv_device *device;
+  const char* name;
+  uint8_t  port_cnt = 0;
+ public:
+  explicit Device(CephContext *c, ibv_device* d, struct ibv_context *dc);
+  ~Device() {  // NOTE(review): device_attr is not released here; confirm where it is allocated/freed
+    if (active_port) {  // the context is closed only when a port had been bound
+      delete active_port;
+      ceph_assert(ibv_close_device(ctxt) == 0);
+    }
+  }
+  const char* get_name() { return name;}
+  uint16_t get_lid() { return active_port->get_lid(); }
+  ibv_gid get_gid() { return active_port->get_gid(); }
+  int get_gid_idx() { return active_port->get_gid_idx(); }
+  void binding_port(CephContext *c, int port_num);
+  struct ibv_context *ctxt;      // verbs context; closed by the destructor when a port is bound
+  ibv_device_attr *device_attr;
+  Port* active_port;             // has no in-class initializer; presumably set by the constructor (not visible here)
+};
+
+
+// Enumerates the RDMA devices of the host and owns one Device wrapper per
+// entry. Construction aborts when no device can be listed.
+class DeviceList {
+  struct ibv_device ** device_list;           // from ibv_get_device_list()
+  struct ibv_context ** device_context_list;  // from rdma_get_devices()
+  int num;                                    // entry count; written by both calls above
+  Device** devices;                           // num heap-allocated wrappers
+ public:
+  // Both library calls store their count in `num`; entry i of each list is
+  // paired to build devices[i].
+  explicit DeviceList(CephContext *cct): device_list(ibv_get_device_list(&num)),
+                                device_context_list(rdma_get_devices(&num)) {
+    // device_context_list is indexed below, so it must be checked as well.
+    if (device_list == NULL || device_context_list == NULL || num == 0) {
+      lderr(cct) << __func__ << " failed to get rdma device list.  " << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+    devices = new Device*[num];
+
+    for (int i = 0;i < num; ++i) {
+      devices[i] = new Device(cct, device_list[i], device_context_list[i]);
+    }
+  }
+  // The lists and wrappers are owned through raw pointers; copying would
+  // free them twice.
+  DeviceList(const DeviceList&) = delete;
+  DeviceList& operator=(const DeviceList&) = delete;
+  ~DeviceList() {
+    for (int i=0; i < num; ++i) {
+      delete devices[i];
+    }
+    delete []devices;
+    ibv_free_device_list(device_list);
+    // the array returned by rdma_get_devices() must be released as well
+    rdma_free_devices(device_context_list);
+  }
+
+  // Return the device whose name equals device_name, or the first device
+  // when device_name is empty; NULL when nothing matches.
+  Device* get_device(const char* device_name) {
+    ceph_assert(devices);
+    for (int i = 0; i < num; ++i) {
+      if (!strlen(device_name) || !strcmp(device_name, devices[i]->get_name())) {
+        return devices[i];
+      }
+    }
+    return NULL;
+  }
+};
+
+// stat counters
+enum {  // counter ids used with dispatcher->perf_logger; NOTE(review): presumably _first/_last delimit the registered id range — confirm
+  l_msgr_rdma_dispatcher_first = 94000,  // base value; the ids below follow consecutively
+
+  l_msgr_rdma_polling,
+  l_msgr_rdma_inflight_tx_chunks,
+  l_msgr_rdma_rx_bufs_in_use,
+  l_msgr_rdma_rx_bufs_total,
+
+  l_msgr_rdma_tx_total_wc,
+  l_msgr_rdma_tx_total_wc_errors,
+  l_msgr_rdma_tx_wc_retry_errors,
+  l_msgr_rdma_tx_wc_wr_flush_errors,
+
+  l_msgr_rdma_rx_total_wc,
+  l_msgr_rdma_rx_total_wc_errors,
+  l_msgr_rdma_rx_fin,  // incremented when a zero-length receive completion is seen
+
+  l_msgr_rdma_handshake_errors,  // incremented when the handshake message exchange fails
+
+  l_msgr_rdma_total_async_events,
+  l_msgr_rdma_async_last_wqe_events,
+
+  l_msgr_rdma_created_queue_pair,  // incremented when a connected socket creates its queue pair
+  l_msgr_rdma_active_queue_pair,
+
+  l_msgr_rdma_dispatcher_last,  // sentinel: one past the last id above
+};
+
+enum {  // counter ids used with worker->perf_logger; NOTE(review): presumably _first/_last delimit the registered id range — confirm
+  l_msgr_rdma_first = 95000,  // base value; the ids below follow consecutively
+
+  l_msgr_rdma_tx_no_mem,
+  l_msgr_rdma_tx_parital_mem,  // "parital" is the identifier's actual spelling; renaming would need every user updated
+  l_msgr_rdma_tx_failed,
+
+  l_msgr_rdma_tx_chunks,
+  l_msgr_rdma_tx_bytes,
+  l_msgr_rdma_rx_chunks,
+  l_msgr_rdma_rx_bytes,  // increased by the byte length of each receive completion
+  l_msgr_rdma_pending_sent_conns,
+
+  l_msgr_rdma_last,  // sentinel: one past the last id above
+};
+
+class RDMADispatcher;
+
+class Infiniband {  /* Holds the verbs-level state declared below (device
+                     * list, selected device, protection domain, optional
+                     * shared receive queue, memory manager) and acts as the
+                     * factory for completion channels, completion queues and
+                     * queue pairs that use it. */
+ public:
+  class ProtectionDomain {  // holder for a verbs protection domain handle
+   public:
+    explicit ProtectionDomain(CephContext *cct, Device *device);
+    ~ProtectionDomain();
+
+    ibv_pd* const pd;  // the handle; the pointer cannot be reseated after construction
+  };
+
+
+  class MemoryManager {  // supplies Chunk buffers: a Cluster ("send") for TX and a boost-based pool for RX
+   public:
+    class Chunk {  // descriptor of one buffer; post_chunks_to_rq() posts data/bytes/lkey as the scatter entry
+     public:
+      Chunk(ibv_mr* m, uint32_t len, char* b);
+      ~Chunk();
+
+      void set_offset(uint32_t o);
+      uint32_t get_offset();
+      void set_bound(uint32_t b);
+      void prepare_read(uint32_t b);  // called with the completion's byte_len before the chunk is read
+      uint32_t get_bound();
+      uint32_t read(char* buf, uint32_t len);
+      uint32_t write(char* buf, uint32_t len);
+      bool full();
+      bool over();
+      void clear();
+
+     public:
+      ibv_mr* mr;
+      uint32_t lkey = 0;   // used as the lkey of the posted scatter entry
+      uint32_t bytes;      // used as the length of the posted scatter entry
+      uint32_t bound = 0;
+      uint32_t offset;
+      char* buffer; // TODO: remove buffer/refactor TX
+      char  data[0];       // zero-length trailing array; its address is what gets posted for receives
+    };
+
+    class Cluster {  // fixed-size buffer region with one Chunk descriptor per buffer_size slot
+     public:
+      Cluster(MemoryManager& m, uint32_t s);
+      ~Cluster();
+
+      int fill(uint32_t num);
+      void take_back(std::vector<Chunk*> &ck);
+      int get_buffers(std::vector<Chunk*> &chunks, size_t bytes);
+      Chunk *get_chunk_by_buffer(const char *c) {  // caller must ensure c lies inside [base, end); not checked here
+        uint32_t idx = (c - base) / buffer_size;  // index of the slot that contains c
+        Chunk *chunk = chunk_base + idx;
+        return chunk;
+      }
+      bool is_my_buffer(const char *c) const {  // true when c points into the half-open range [base, end)
+        return c >= base && c < end;
+      }
+
+      MemoryManager& manager;
+      uint32_t buffer_size;
+      uint32_t num_chunk = 0;
+      Mutex lock;
+      std::vector<Chunk*> free_chunks;
+      char *base = nullptr;
+      char *end = nullptr;
+      Chunk* chunk_base = nullptr;
+    };
+
+    class MemPoolContext {  // per-pool bookkeeping handed to mem_pool
+      PerfCounters *perf_logger;
+
+     public:
+      MemoryManager *manager;
+      unsigned n_bufs_allocated;
+      // (describes can_alloc(), declared below) true if it is possible
+      // to alloc more memory for the pool
+      explicit MemPoolContext(MemoryManager *m) :
+        perf_logger(nullptr),
+        manager(m),
+        n_bufs_allocated(0) {}
+      bool can_alloc(unsigned nbufs);
+      void update_stats(int val);
+      void set_stat_logger(PerfCounters *logger);
+    };
+
+    class PoolAllocator {  // allocator policy passed to boost::pool by mem_pool: static malloc/free plus shared static state
+      struct mem_info {
+        ibv_mr   *mr;
+        MemPoolContext *ctx;
+        unsigned nbufs;
+        Chunk    chunks[0];  // zero-length trailing array
+      };
+     public:
+      typedef std::size_t size_type;
+      typedef std::ptrdiff_t difference_type;
+
+      static char * malloc(const size_type bytes);
+      static void free(char * const block);
+
+      static MemPoolContext  *g_ctx;
+      static Mutex lock;
+    };
+
+    /**
+     * modify boost pool so that it is possible to
+     * have a thread safe 'context' when allocating/freeing
+     * the memory. It is needed to allow a different pool
+     * configurations and bookkeeping per CephContext and
+     * also to be able to use same allocator to deal with
+     * RX and TX pool.
+     * TODO: use boost pool to allocate TX chunks too
+     */
+    class mem_pool : public boost::pool<PoolAllocator> {
+     private:
+      MemPoolContext *ctx;
+      void *slow_malloc();
+
+     public:
+      explicit mem_pool(MemPoolContext *ctx, const size_type nrequested_size,
+          const size_type nnext_size = 32,
+          const size_type nmax_size = 0) :
+        pool(nrequested_size, nnext_size, nmax_size),
+        ctx(ctx) { }
+
+      void *malloc() {  // hides boost::pool::malloc
+        if (!store().empty())  // fast path: the pool's store still holds a free block
+          return (store().malloc)();
+        // need to alloc more memory...
+        // slow path code
+        return slow_malloc();
+      }
+    };
+
+    MemoryManager(CephContext *c, Device *d, ProtectionDomain *p);
+    ~MemoryManager();
+
+    void* malloc(size_t size);
+    void  free(void *ptr);
+
+    void create_tx_pool(uint32_t size, uint32_t tx_num);
+    void return_tx(std::vector<Chunk*> &chunks);
+    int get_send_buffers(std::vector<Chunk*> &c, size_t bytes);
+    bool is_tx_buffer(const char* c) { return send->is_my_buffer(c); }  // dereferences `send`, which is nullptr until assigned
+    Chunk *get_tx_chunk_by_buffer(const char *c) {
+      return send->get_chunk_by_buffer(c);
+    }
+    uint32_t get_tx_buffer_size() const {
+      return send->buffer_size;
+    }
+
+    Chunk *get_rx_buffer() {  // result of rxbuf_pool.malloc(); callers treat NULL as "out of memory"
+       return reinterpret_cast<Chunk *>(rxbuf_pool.malloc());
+    }
+
+    void release_rx_buffer(Chunk *chunk) {  // hand the chunk back to rxbuf_pool
+      rxbuf_pool.free(chunk);
+    }
+
+    void set_rx_stat_logger(PerfCounters *logger) {
+      rxbuf_pool_ctx.set_stat_logger(logger);
+    }
+
+    CephContext  *cct;
+   private:
+    // TODO: Cluster -> TxPool txbuf_pool
+    // chunk layout fix
+    //  
+    Cluster* send = nullptr;// SEND
+    Device *device;
+    ProtectionDomain *pd;
+    MemPoolContext rxbuf_pool_ctx;
+    mem_pool     rxbuf_pool;
+
+
+    void* huge_pages_malloc(size_t size);
+    void  huge_pages_free(void *ptr);
+  };
+
+ private:
+  uint32_t tx_queue_len = 0;       // passed to every QueuePair made by create_queue_pair()
+  uint32_t rx_queue_len = 0;       // passed to every QueuePair made by create_queue_pair()
+  uint32_t max_sge = 0;
+  uint8_t  ib_physical_port = 0;
+  MemoryManager* memory_manager = nullptr;
+  ibv_srq* srq = nullptr;             // shared receive work queue
+  Device *device = NULL;
+  ProtectionDomain *pd = NULL;
+  DeviceList *device_list = nullptr;
+  void wire_gid_to_gid(const char *wgid, union ibv_gid *gid);  // 32 hex digits -> gid->raw
+  void gid_to_wire_gid(const union ibv_gid *gid, char wgid[]);  // gid->raw -> 32 hex digits + NUL
+  CephContext *cct;
+  Mutex lock;
+  bool initialized = false;
+  const std::string &device_name;  // reference member: the referenced string must outlive this object
+  uint8_t port_num;
+  bool support_srq = false;        // selects the srq (true) or per-qp (false) path in post_chunks_to_rq()
+
+ public:
+  explicit Infiniband(CephContext *c);
+  ~Infiniband();
+  void init();
+  static void verify_prereq(CephContext *cct);
+
+  class CompletionChannel {  // wrapper around an ibv_comp_channel and the cq bound to it via bind_cq()
+    static const uint32_t MAX_ACK_EVENT = 5000;
+    CephContext *cct;
+    Infiniband& infiniband;
+    ibv_comp_channel *channel;
+    ibv_cq *cq;
+    uint32_t cq_events_that_need_ack;
+
+   public:
+    CompletionChannel(CephContext *c, Infiniband &ib);
+    ~CompletionChannel();
+    int init();  // non-zero means failure (see create_comp_channel())
+    bool get_cq_event();
+    int get_fd() { return channel->fd; }
+    ibv_comp_channel* get_channel() { return channel; }
+    void bind_cq(ibv_cq *c) { cq = c; }
+    void ack_events();
+  };
+
+  // this class encapsulates the creation, use, and destruction of an RC
+  // completion queue.
+  //
+  // You need to call init and it will create a cq and associate to comp channel
+  class CompletionQueue {
+   public:
+    CompletionQueue(CephContext *c, Infiniband &ib,
+                    const uint32_t qd, CompletionChannel *cc)
+      : cct(c), infiniband(ib), channel(cc), cq(NULL), queue_depth(qd) {}
+    ~CompletionQueue();
+    int init();  // non-zero means failure (see create_comp_queue())
+    int poll_cq(int num_entries, ibv_wc *ret_wc_array);
+
+    ibv_cq* get_cq() const { return cq; }  // NULL until assigned; the constructor only sets it to NULL
+    int rearm_notify(bool solicited_only=true);
+    CompletionChannel* get_cc() const { return channel; }
+   private:
+    CephContext *cct;
+    Infiniband&  infiniband;     // Infiniband to which this QP belongs
+    CompletionChannel *channel;
+    ibv_cq *cq;
+    uint32_t queue_depth;
+  };
+
+  // this class encapsulates the creation, use, and destruction of an RC
+  // queue pair.
+  //
+  // you need call init and it will create a qp and bring it to the INIT state.
+  // after obtaining the lid, qpn, and psn of a remote queue pair, one
+  // must call plumb() to bring the queue pair to the RTS state.
+  class QueuePair {
+   public:
+    QueuePair(CephContext *c, Infiniband& infiniband, ibv_qp_type type,
+              int ib_physical_port,  ibv_srq *srq,
+              Infiniband::CompletionQueue* txcq,
+              Infiniband::CompletionQueue* rxcq,
+              uint32_t tx_queue_len, uint32_t max_recv_wr, struct rdma_cm_id *cid, uint32_t q_key = 0);
+    ~QueuePair();
+
+    int init();  // non-zero means failure (see create_queue_pair())
+
+    /**
+     * Get the initial packet sequence number for this QueuePair.
+     * This is randomly generated on creation. It should not be confused
+     * with the remote side's PSN, which is set in #plumb(). 
+     */
+    uint32_t get_initial_psn() const { return initial_psn; };
+    /**
+     * Get the local queue pair number for this QueuePair.
+     * QPNs are analogous to UDP/TCP port numbers.
+     */
+    uint32_t get_local_qp_number() const { return qp->qp_num; };
+    /**
+     * Get the remote queue pair number for this QueuePair, as set in #plumb().
+     * QPNs are analogous to UDP/TCP port numbers.
+     */
+    int get_remote_qp_number(uint32_t *rqp) const;
+    /**
+     * Get the remote infiniband address for this QueuePair, as set in #plumb().
+     * LIDs are "local IDs" in infiniband terminology. They are short, locally
+     * routable addresses.
+     */
+    int get_remote_lid(uint16_t *lid) const;
+    /**
+     * Get the state of a QueuePair.
+     */
+    int get_state() const;
+    /**
+     * Return true if the queue pair is in an error state, false otherwise.
+     */
+    bool is_error() const;
+    void add_tx_wr(uint32_t amt) { tx_wr_inflight += amt; }
+    void dec_tx_wr(uint32_t amt) { tx_wr_inflight -= amt; }
+    uint32_t get_tx_wr() const { return tx_wr_inflight; }
+    ibv_qp* get_qp() const { return qp; }
+    Infiniband::CompletionQueue* get_tx_cq() const { return txcq; }
+    Infiniband::CompletionQueue* get_rx_cq() const { return rxcq; }
+    int to_dead();
+    bool is_dead() const { return dead; }
+
+   private:
+    CephContext  *cct;
+    Infiniband&  infiniband;     // Infiniband to which this QP belongs
+    ibv_qp_type  type;           // QP type (IBV_QPT_RC, etc.)
+    ibv_context* ctxt;           // device context of the HCA to use
+    int ib_physical_port;
+    ibv_pd*      pd;             // protection domain
+    ibv_srq*     srq;            // shared receive queue
+    ibv_qp*      qp;             // infiniband verbs QP handle
+    struct rdma_cm_id *cm_id;
+    Infiniband::CompletionQueue* txcq;
+    Infiniband::CompletionQueue* rxcq;
+    uint32_t     initial_psn;    // initial packet sequence number
+    uint32_t     max_send_wr;
+    uint32_t     max_recv_wr;
+    uint32_t     q_key;
+    bool dead;
+    std::atomic<uint32_t> tx_wr_inflight = {0}; // counter for inflight Tx WQEs
+  };
+
+ public:
+  typedef MemoryManager::Cluster Cluster;
+  typedef MemoryManager::Chunk Chunk;
+  QueuePair* create_queue_pair(CephContext *c, CompletionQueue*, CompletionQueue*,
+      ibv_qp_type type, struct rdma_cm_id *cm_id);  // returns NULL when QueuePair::init() fails
+  ibv_srq* create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge);
+  // post rx buffers to srq, return number of buffers actually posted
+  int post_chunks_to_rq(int num, ibv_qp *qp=NULL);
+  void post_chunk_to_pool(Chunk* chunk) {  // give an rx chunk back to the memory manager's rx pool
+    get_memory_manager()->release_rx_buffer(chunk);
+  }
+  int get_tx_buffers(std::vector<Chunk*> &c, size_t bytes);
+  CompletionChannel *create_comp_channel(CephContext *c);  // returns NULL when init() fails
+  CompletionQueue *create_comp_queue(CephContext *c, CompletionChannel *cc=NULL);  // returns NULL when init() fails
+  uint8_t get_ib_physical_port() { return ib_physical_port; }
+  int send_msg(CephContext *cct, int sd, IBSYNMsg& msg);  // 0 on success, negative value on failure
+  int recv_msg(CephContext *cct, int sd, IBSYNMsg& msg);  // > 0 on success, 0 on disconnect, negative value on failure
+  uint16_t get_lid() { return device->get_lid(); }
+  ibv_gid get_gid() { return device->get_gid(); }
+  MemoryManager* get_memory_manager() { return memory_manager; }
+  Device* get_device() { return device; }
+  int get_async_fd() { return device->ctxt->async_fd; }
+  bool is_tx_buffer(const char* c) { return memory_manager->is_tx_buffer(c);}
+  Chunk *get_tx_chunk_by_buffer(const char *c) { return memory_manager->get_tx_chunk_by_buffer(c); }
+  static const char* wc_status_to_string(int status);
+  static const char* qp_state_string(int status);
+  uint32_t get_rx_queue_len() const { return rx_queue_len; }
+};
+
+#endif
diff --git a/src/msg/async/rdma/RDMAConnectedSocketImpl.cc b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc
new file mode 100644
index 00000000..89be7428
--- /dev/null
+++ b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc
@@ -0,0 +1,743 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include "RDMAStack.h"
+
+// Event callback that forwards to
+// RDMAConnectedSocketImpl::handle_connection_established() until close() is
+// called; after that, firing the callback does nothing.
+class C_handle_connection_established : public EventCallback {
+  RDMAConnectedSocketImpl *csi;
+  bool active = true;
+ public:
+  // explicit, like C_handle_connection_read: a socket pointer should never
+  // convert to a callback implicitly.
+  explicit C_handle_connection_established(RDMAConnectedSocketImpl *w) : csi(w) {}
+  void do_request(uint64_t fd) final {
+    if (active)
+      csi->handle_connection_established();
+  }
+  // Disable the callback.
+  void close() {
+    active = false;
+  }
+};
+
+// Event callback that forwards to RDMAConnectedSocketImpl::handle_connection()
+// until close() is called; after that, firing the callback does nothing.
+class C_handle_connection_read : public EventCallback {
+  RDMAConnectedSocketImpl *csi;
+  bool active = true;
+ public:
+  explicit C_handle_connection_read(RDMAConnectedSocketImpl *w): csi(w) {}
+  void do_request(uint64_t fd) final {
+    if (!active)
+      return;
+    csi->handle_connection();
+  }
+  // Disable the callback.
+  void close() { active = false; }
+};
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAConnectedSocketImpl "
+
+// Set up a connected-socket object. Unless ms_async_rdma_cm is enabled, this
+// also creates the RC queue pair, fills in the local handshake message
+// (my_msg), creates the notification eventfd and registers the queue pair
+// with the dispatcher.
+RDMAConnectedSocketImpl::RDMAConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s,
+						 RDMAWorker *w)
+  : cct(cct), connected(0), error(0), infiniband(ib),
+    dispatcher(s), worker(w), lock("RDMAConnectedSocketImpl::lock"),
+    is_server(false), read_handler(new C_handle_connection_read(this)),
+    established_handler(new C_handle_connection_established(this)),
+    active(false), pending(false)
+{
+  if (!cct->_conf->ms_async_rdma_cm) {
+    qp = infiniband->create_queue_pair(cct, s->get_tx_cq(), s->get_rx_cq(), IBV_QPT_RC, NULL);
+    // create_queue_pair() returns NULL when the queue pair cannot be
+    // initialized; stop with a clear message rather than dereferencing it.
+    if (!qp) {
+      lderr(cct) << __func__ << " failed to create queue pair" << dendl;
+      ceph_abort();
+    }
+    my_msg.qpn = qp->get_local_qp_number();
+    my_msg.psn = qp->get_initial_psn();
+    my_msg.lid = infiniband->get_lid();
+    my_msg.peer_qpn = 0;
+    my_msg.gid = infiniband->get_gid();
+    notify_fd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
+    dispatcher->register_qp(qp, this);
+    dispatcher->perf_logger->inc(l_msgr_rdma_created_queue_pair);
+    dispatcher->perf_logger->inc(l_msgr_rdma_active_queue_pair);
+  }
+}
+
+// Tear the socket down: run cleanup(), unregister from the worker and the
+// dispatcher, return every rx chunk still referenced by pending completions
+// or partially read buffers, then close the descriptors.
+RDMAConnectedSocketImpl::~RDMAConnectedSocketImpl()
+{
+  ldout(cct, 20) << __func__ << " destruct." << dendl;
+  cleanup();
+  worker->remove_pending_conn(this);
+  dispatcher->erase_qpn(my_msg.qpn);
+
+  // completions not consumed yet carry their chunk in wr_id
+  for (const auto &pending_wc : wc) {
+    dispatcher->post_chunk_to_pool(reinterpret_cast<Chunk*>(pending_wc.wr_id));
+  }
+  // chunks that were only partially read
+  for (auto *leftover : buffers) {
+    dispatcher->post_chunk_to_pool(leftover);
+  }
+
+  Mutex::Locker l(lock);
+  if (notify_fd >= 0) {
+    ::close(notify_fd);
+  }
+  if (tcp_fd >= 0) {
+    ::close(tcp_fd);
+  }
+  error = ECONNRESET;
+}
+
+// Queue the work completions in `v` for this socket under the lock (taking
+// over the vector when nothing is pending, appending otherwise), then call
+// notify().
+void RDMAConnectedSocketImpl::pass_wc(std::vector<ibv_wc> &&v)
+{
+  Mutex::Locker l(lock);
+  if (!wc.empty()) {
+    wc.insert(wc.end(), v.begin(), v.end());
+  } else {
+    wc = std::move(v);
+  }
+  notify();
+}
+
+// Swap all pending work completions into `w` under the lock. `w` is left
+// untouched when nothing is pending.
+void RDMAConnectedSocketImpl::get_wc(std::vector<ibv_wc> &w)
+{
+  Mutex::Locker l(lock);
+  if (!wc.empty()) {
+    w.swap(wc);
+  }
+}
+
+// Connect the local queue pair to the peer described by peer_msg: move it to
+// RTR, then to RTS. On the client side this also marks the socket connected
+// and calls submit(false). Sets `active` on success.
+//
+// Returns 0 on success, -1 when either state transition fails.
+int RDMAConnectedSocketImpl::activate()
+{
+  ibv_qp_attr qpa;
+  int r;
+
+  // now connect up the qps and switch to RTR
+  memset(&qpa, 0, sizeof(qpa));
+  qpa.qp_state = IBV_QPS_RTR;
+  qpa.path_mtu = IBV_MTU_1024;
+  qpa.dest_qp_num = peer_msg.qpn;
+  qpa.rq_psn = peer_msg.psn;
+  qpa.max_dest_rd_atomic = 1;
+  qpa.min_rnr_timer = 12;
+  //qpa.ah_attr.is_global = 0;
+  qpa.ah_attr.is_global = 1;
+  qpa.ah_attr.grh.hop_limit = 6;
+  qpa.ah_attr.grh.dgid = peer_msg.gid;
+
+  qpa.ah_attr.grh.sgid_index = infiniband->get_device()->get_gid_idx();
+
+  qpa.ah_attr.dlid = peer_msg.lid;
+  qpa.ah_attr.sl = cct->_conf->ms_async_rdma_sl;
+  qpa.ah_attr.grh.traffic_class = cct->_conf->ms_async_rdma_dscp;
+  qpa.ah_attr.src_path_bits = 0;
+  qpa.ah_attr.port_num = (uint8_t)(infiniband->get_ib_physical_port());
+
+  ldout(cct, 20) << __func__ << " Choosing gid_index " << (int)qpa.ah_attr.grh.sgid_index << ", sl " << (int)qpa.ah_attr.sl << dendl;
+
+  r = ibv_modify_qp(qp->get_qp(), &qpa, IBV_QP_STATE |
+      IBV_QP_AV |
+      IBV_QP_PATH_MTU |
+      IBV_QP_DEST_QPN |
+      IBV_QP_RQ_PSN |
+      IBV_QP_MIN_RNR_TIMER |
+      IBV_QP_MAX_DEST_RD_ATOMIC);
+  if (r) {
+    // ibv_modify_qp() reports the failure reason through its return value,
+    // so log that rather than whatever errno currently holds.
+    lderr(cct) << __func__ << " failed to transition to RTR state: "
+               << cpp_strerror(r) << dendl;
+    return -1;
+  }
+
+  ldout(cct, 20) << __func__ << " transition to RTR state successfully." << dendl;
+
+  // now move to RTS
+  qpa.qp_state = IBV_QPS_RTS;
+
+  // How long to wait before retrying if packet lost or server dead.
+  // Supposedly the timeout is 4.096us*2^timeout.  However, the actual
+  // timeout appears to be 4.096us*2^(timeout+1), so the setting
+  // below creates a 135ms timeout.
+  qpa.timeout = 14;
+
+  // How many times to retry after timeouts before giving up.
+  qpa.retry_cnt = 7;
+
+  // How many times to retry after RNR (receiver not ready) condition
+  // before giving up. Occurs when the remote side has not yet posted
+  // a receive request.
+  qpa.rnr_retry = 7; // 7 is infinite retry.
+  qpa.sq_psn = my_msg.psn;
+  qpa.max_rd_atomic = 1;
+
+  r = ibv_modify_qp(qp->get_qp(), &qpa, IBV_QP_STATE |
+      IBV_QP_TIMEOUT |
+      IBV_QP_RETRY_CNT |
+      IBV_QP_RNR_RETRY |
+      IBV_QP_SQ_PSN |
+      IBV_QP_MAX_QP_RD_ATOMIC);
+  if (r) {
+    // see above: the return value carries the error code
+    lderr(cct) << __func__ << " failed to transition to RTS state: "
+               << cpp_strerror(r) << dendl;
+    return -1;
+  }
+
+  // the queue pair should be ready to use once the client has finished
+  // setting up their end.
+  ldout(cct, 20) << __func__ << " transition to RTS state successfully." << dendl;
+  ldout(cct, 20) << __func__ << " QueuePair: " << qp << " with qp:" << qp->get_qp() << dendl;
+
+  if (!is_server) {
+    connected = 1; //indicate successfully
+    ldout(cct, 20) << __func__ << " handle fake send, wake it up. QP: " << my_msg.qpn << dendl;
+    submit(false);
+  }
+  active = true;
+
+  return 0;
+}
+
+// Open the TCP side channel to `peer_addr` that carries the handshake
+// messages.
+//
+// With opts.nonblock set, the connect is started asynchronously and
+// established_handler is registered for readable/writable events on the
+// socket; otherwise the first handshake message is sent right away through
+// handle_connection_established(false).
+//
+// Returns 0 on success and a negative value on failure.
+int RDMAConnectedSocketImpl::try_connect(const entity_addr_t& peer_addr, const SocketOptions &opts) {
+  ldout(cct, 20) << __func__ << " nonblock:" << opts.nonblock << ", nodelay:"
+                 << opts.nodelay << ", rbuf_size: " << opts.rcbuf_size << dendl;
+  NetHandler net(cct);
+
+  // we construct a socket to transport ib sync message
+  // but we shouldn't block in tcp connecting
+  if (opts.nonblock) {
+    tcp_fd = net.nonblock_connect(peer_addr, opts.connect_bind_addr);
+  } else {
+    tcp_fd = net.connect(peer_addr, opts.connect_bind_addr);
+  }
+
+  if (tcp_fd < 0) {
+    return -errno;
+  }
+
+  int r = net.set_socket_options(tcp_fd, opts.nodelay, opts.rcbuf_size);
+  if (r < 0) {
+    // Read errno before ::close(), which may overwrite it. Should errno be
+    // 0, fall back to the (negative) result of set_socket_options() so a
+    // failure is never reported as success.
+    const int saved_errno = errno;
+    ::close(tcp_fd);
+    tcp_fd = -1;
+    return saved_errno ? -saved_errno : r;
+  }
+
+  ldout(cct, 20) << __func__ << " tcp_fd: " << tcp_fd << dendl;
+  net.set_priority(tcp_fd, opts.priority, peer_addr.get_family());
+  r = 0;
+  if (opts.nonblock) {
+    worker->center.create_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE , established_handler);
+  } else {
+    r = handle_connection_established(false);
+  }
+  return r;
+}
+
+// Runs once the TCP side channel is usable: removes the connect-time file
+// event, sends the first handshake message (peer_qpn = 0) and registers
+// read_handler for the reply.
+//
+// Returns 0 on success, -1 when the socket is already connected, or the
+// negative result of send_msg(). On failure fault() is called when
+// need_set_fault is true.
+int RDMAConnectedSocketImpl::handle_connection_established(bool need_set_fault) {
+  ldout(cct, 20) << __func__ << " start " << dendl;
+  // delete read event
+  worker->center.delete_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE);
+
+  int rc = 0;
+  if (connected == 1) {
+    ldout(cct, 1) << __func__ << " warnning: logic failed " << dendl;
+    rc = -1;
+  } else {
+    // send handshake msg to server
+    my_msg.peer_qpn = 0;
+    rc = infiniband->send_msg(cct, tcp_fd, my_msg);
+    if (rc < 0) {
+      ldout(cct, 1) << __func__ << " send handshake msg failed." << rc << dendl;
+    }
+  }
+
+  if (rc < 0) {
+    if (need_set_fault) {
+      fault();
+    }
+    return rc;
+  }
+
+  worker->center.create_file_event(tcp_fd, EVENT_READABLE, read_handler);
+  ldout(cct, 20) << __func__ << " finish " << dendl;
+  return 0;
+}
+
+// Handle one incoming handshake message on the TCP side channel.
+//
+// Client: the message is the server's reply; record the peer qpn, activate
+// the queue pair if the socket is not connected yet, then send the final ack.
+// Server: a message with peer_qpn == 0 is the client's first message
+// (activate and reply); any other message is the client's ack, which
+// completes the handshake.
+void RDMAConnectedSocketImpl::handle_connection() {
+  ldout(cct, 20) << __func__ << " QP: " << my_msg.qpn << " tcp_fd: " << tcp_fd << " notify_fd: " << notify_fd << dendl;
+  int r = infiniband->recv_msg(cct, tcp_fd, peer_msg);
+  if (r <= 0) {
+    if (r == -EAGAIN)
+      return;
+    dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors);
+    ldout(cct, 1) << __func__ << " recv handshake msg failed." << dendl;
+    fault();
+    return;
+  }
+
+  if (1 == connected) {
+    ldout(cct, 1) << __func__ << " warnning: logic failed: read len: " << r << dendl;
+    fault();
+    return;
+  }
+
+  if (!is_server) {
+    // syn + ack from server
+    my_msg.peer_qpn = peer_msg.qpn;
+    ldout(cct, 20) << __func__ << " peer msg :  < " << peer_msg.qpn << ", " << peer_msg.psn
+                   <<  ", " << peer_msg.lid << ", " << peer_msg.peer_qpn << "> " << dendl;
+    if (!connected) {
+      r = activate();
+      ceph_assert(!r);
+    }
+    notify();
+    r = infiniband->send_msg(cct, tcp_fd, my_msg);
+    if (r < 0) {
+      ldout(cct, 1) << __func__ << " send client ack failed." << dendl;
+      dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors);
+      fault();
+    }
+    return;
+  }
+
+  if (peer_msg.peer_qpn != 0) {
+    // ack from client
+    connected = 1;
+    ldout(cct, 10) << __func__ << " handshake of rdma is done. server connected: " << connected << dendl;
+    //cleanup();
+    submit(false);
+    notify();
+    return;
+  }
+
+  // syn from client
+  if (active) {
+    ldout(cct, 10) << __func__ << " server is already active." << dendl;
+    return;
+  }
+  r = activate();
+  ceph_assert(!r);
+  r = infiniband->send_msg(cct, tcp_fd, my_msg);
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " server ack failed." << dendl;
+    dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors);
+    fault();
+  }
+}
+
+ssize_t RDMAConnectedSocketImpl::read(char* buf, size_t len)
+{  /* Copy up to len received bytes into buf: leftovers held in `buffers`
+    * first, then the payload of freshly polled completions. Returns the
+    * byte count, -EAGAIN when nothing could be read, or -error once error
+    * is set and no bytes were read. */
+  uint64_t i = 0;
+  int r = ::read(notify_fd, &i, sizeof(i));  // result is only logged below
+  ldout(cct, 20) << __func__ << " notify_fd : " << i << " in " << my_msg.qpn << " r = " << r << dendl;
+  
+  if (!active) {
+    ldout(cct, 1) << __func__ << " when ib not active. len: " << len << dendl;
+    return -EAGAIN;
+  }
+  
+  if (0 == connected) {
+    ldout(cct, 1) << __func__ << " when ib not connected. len: " << len <<dendl;
+    return -EAGAIN;
+  }
+  ssize_t read = 0;
+  if (!buffers.empty())
+    read = read_buffers(buf,len);  // serve previously buffered chunks first
+
+  std::vector<ibv_wc> cqe;
+  get_wc(cqe);
+  if (cqe.empty()) {  // no new completions were handed to this socket
+    if (!buffers.empty()) {
+      notify();  // data is still buffered: signal notify_fd again
+    }
+    if (read > 0) {
+      return read;
+    }
+    if (error) {
+      return -error;
+    } else {
+      return -EAGAIN;
+    }
+  }
+
+  ldout(cct, 20) << __func__ << " poll queue got " << cqe.size() << " responses. QP: " << my_msg.qpn << dendl;
+  for (size_t i = 0; i < cqe.size(); ++i) {
+    ibv_wc* response = &cqe[i];
+    ceph_assert(response->status == IBV_WC_SUCCESS);
+    Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id);  // wr_id carries the Chunk pointer
+    ldout(cct, 25) << __func__ << " chunk length: " << response->byte_len << " bytes." << chunk << dendl;
+    chunk->prepare_read(response->byte_len);
+    worker->perf_logger->inc(l_msgr_rdma_rx_bytes, response->byte_len);
+    if (response->byte_len == 0) {  // zero-length completion: logged below as a remote close
+      dispatcher->perf_logger->inc(l_msgr_rdma_rx_fin);
+      if (connected) {
+        error = ECONNRESET;
+        ldout(cct, 20) << __func__ << " got remote close msg..." << dendl;
+      }
+      dispatcher->post_chunk_to_pool(chunk);
+    } else {
+      if (read == (ssize_t)len) {  // buf is already full: keep the whole chunk for later
+        buffers.push_back(chunk);
+        ldout(cct, 25) << __func__ << " buffers add a chunk: " << response->byte_len << dendl;
+      } else if (read + response->byte_len > (ssize_t)len) {  // chunk only partly fits
+        read += chunk->read(buf+read, (ssize_t)len-read);
+        buffers.push_back(chunk);
+        ldout(cct, 25) << __func__ << " buffers add a chunk: " << chunk->get_offset() << ":" << chunk->get_bound() << dendl;
+      } else {  // chunk fits completely: consume it and hand it back to the pool
+        read += chunk->read(buf+read, response->byte_len);
+        dispatcher->post_chunk_to_pool(chunk);
+        update_post_backlog();
+      }
+    }
+  }
+
+  worker->perf_logger->inc(l_msgr_rdma_rx_chunks, cqe.size());
+  if (is_server && connected == 0) {
+    ldout(cct, 20) << __func__ << " we do not need last handshake, QP: " << my_msg.qpn << " peer QP: " << peer_msg.qpn << dendl;
+    connected = 1; //if so, we don't need the last handshake
+    cleanup();
+    submit(false);  // flush anything queued while not connected
+  }
+
+  if (!buffers.empty()) {
+    notify();  // leftovers remain: signal notify_fd so the caller reads again
+  }
+
+  if (read == 0 && error)
+    return -error;
+  return read == 0 ? -EAGAIN : read;
+}
+
+// Drain previously received chunks held in `buffers` into buf, copying at
+// most len bytes. Chunks that are fully consumed are returned to the pool
+// and dropped from `buffers`; a partially consumed chunk stays at the front.
+// Returns the number of bytes copied.
+ssize_t RDMAConnectedSocketImpl::read_buffers(char* buf, size_t len)
+{
+  size_t copied = 0;
+  auto cur = buffers.begin();
+  while (cur != buffers.end()) {
+    const size_t got = (*cur)->read(buf+copied, len-copied);
+    copied += got;
+    ldout(cct, 25) << __func__ << " this iter read: " << got << " bytes." << " offset: " << (*cur)->get_offset() << " ,bound: " << (*cur)->get_bound()  << ". Chunk:" << *cur  << dendl;
+    if ((*cur)->over()) {
+      dispatcher->post_chunk_to_pool(*cur);
+      update_post_backlog();
+      ldout(cct, 25) << __func__ << " one chunk over." << dendl;
+    }
+    if (copied == len) {
+      // destination is full; `cur` stays on the chunk we stopped at
+      break;
+    }
+    ++cur;
+  }
+
+  // When we stopped on a chunk that is itself exhausted, it is erased too.
+  auto erase_end = cur;
+  if (erase_end != buffers.end() && (*erase_end)->over())
+    ++erase_end;
+  buffers.erase(buffers.begin(), erase_end);
+  ldout(cct, 25) << __func__ << " got " << copied  << " bytes, buffers size: " << buffers.size() << dendl;
+  return copied;
+}
+
+// Take the next received chunk without copying its payload and report its
+// size. A chunk already waiting in `buffers` is preferred; otherwise the
+// first newly polled completion is used. All other new completions are
+// appended to `buffers`.
+//
+// Returns the chunk's bound, -EAGAIN when nothing is available, or -error
+// when the socket is in an error state.
+//
+// NOTE(review): `data` is never filled in and the selected chunk is not
+// released anywhere in this function (see the FIXMEs below).
+ssize_t RDMAConnectedSocketImpl::zero_copy_read(bufferptr &data)
+{
+  if (error)
+    return -error;
+  ssize_t size = 0;
+
+  Chunk* chunk;
+  bool loaded = false;
+  auto iter = buffers.begin();
+  if (iter != buffers.end()) {
+    chunk = *iter;
+    // FIXME need to handle release
+    // auto del = std::bind(&Chunk::post_srq, std::move(chunk), infiniband);
+    buffers.erase(iter);
+    loaded = true;
+    size = chunk->bound;
+  }
+
+  std::vector<ibv_wc> cqe;
+  get_wc(cqe);
+  if (cqe.empty())
+    return size == 0 ? -EAGAIN : size;
+
+  ldout(cct, 20) << __func__ << " pool completion queue got " << cqe.size() << " responses."<< dendl;
+
+  for (size_t i = 0; i < cqe.size(); ++i) {
+    // The completions live in `cqe`; the previous code indexed a separate,
+    // never-initialized local array here and so read garbage wr_ids.
+    ibv_wc* response = &cqe[i];
+    chunk = reinterpret_cast<Chunk*>(response->wr_id);
+    chunk->prepare_read(response->byte_len);
+    if (!loaded && i == 0) {
+      // FIXME need to handle release
+      // auto del = std::bind(&Chunk::post_srq, std::move(chunk), infiniband);
+      size = chunk->bound;
+      continue;
+    }
+    // `iter` was invalidated by the erase above, so it is no longer touched.
+    buffers.push_back(chunk);
+  }
+
+  if (size == 0)
+    return -EAGAIN;
+  return size;
+}
+
+// Queue the contents of bl for transmission and try to push them out.
+// The whole length of bl is reported as sent even when the data is merely
+// queued (handshake not finished, or submit() returned -EAGAIN). Returns
+// -EPIPE / -error when the socket is in an error state, or submit()'s
+// negative result for any other failure.
+ssize_t RDMAConnectedSocketImpl::send(bufferlist &bl, bool more)
+{
+  if (error)
+    return active ? -error : -EPIPE;
+
+  const size_t queued = bl.length();
+  if (queued == 0)
+    return 0;
+
+  {
+    Mutex::Locker l(lock);
+    pending_bl.claim_append(bl);
+    if (!connected) {
+      // not connected yet: the data only sits in pending_bl for now
+      ldout(cct, 20) << __func__ << " fake send to upper, QP: " << my_msg.qpn << dendl;
+      return queued;
+    }
+  }
+
+  ldout(cct, 20) << __func__ << " QP: " << my_msg.qpn << dendl;
+  const ssize_t rc = submit(more);
+  const bool hard_failure = (rc < 0) && (rc != -EAGAIN);
+  if (hard_failure)
+    return rc;
+  return queued;
+}
+
+ssize_t RDMAConnectedSocketImpl::submit(bool more)
+{  /* Move as much of pending_bl as possible into tx chunks and post them.
+    * Returns 0 when nothing is left pending, -EAGAIN when data remains
+    * queued or no tx chunk could be filled, -error when error is set, or
+    * the negative result of post_work_request().
+    * NOTE(review): `more` is not referenced in this body. */
+  if (error)
+    return -error;
+  Mutex::Locker l(lock);
+  size_t bytes = pending_bl.length();
+  ldout(cct, 20) << __func__ << " we need " << bytes << " bytes. iov size: "
+                 << pending_bl.buffers().size() << dendl;
+  if (!bytes)
+    return 0;
+
+  auto fill_tx_via_copy = [this](std::vector<Chunk*> &tx_buffers,
+                                 unsigned bytes,
+                                 auto& start,
+                                 const auto& end) -> unsigned {
+    ceph_assert(start != end);  // copies buffers [start, end) into newly acquired chunks, advancing start
+    auto chunk_idx = tx_buffers.size();  // index of the first chunk appended by get_reged_mem
+    int ret = worker->get_reged_mem(this, tx_buffers, bytes);
+    if (ret == 0) {
+      ldout(cct, 1) << __func__ << " no enough buffers in worker " << worker << dendl;
+      worker->perf_logger->inc(l_msgr_rdma_tx_no_mem);
+      return 0;
+    }
+
+    unsigned total_copied = 0;
+    Chunk *current_chunk = tx_buffers[chunk_idx];
+    while (start != end) {
+      const uintptr_t addr = reinterpret_cast<uintptr_t>(start->c_str());
+      unsigned copied = 0;
+      while (copied < start->length()) {
+        uint32_t r = current_chunk->write((char*)addr+copied, start->length() - copied);
+        copied += r;
+        total_copied += r;
+        bytes -= r;
+        if (current_chunk->full()){
+          if (++chunk_idx == tx_buffers.size())
+            return total_copied;  // out of chunks: report the partial copy
+          current_chunk = tx_buffers[chunk_idx];
+        }
+      }
+      ++start;
+    }
+    ceph_assert(bytes == 0);
+    return total_copied;
+  };
+
+  std::vector<Chunk*> tx_buffers;
+  auto it = std::cbegin(pending_bl.buffers());
+  auto copy_it = it;  // first buffer not yet placed into tx_buffers
+  unsigned total = 0;
+  unsigned need_reserve_bytes = 0;  // bytes in [copy_it, it) that still have to be copied
+  while (it != pending_bl.buffers().end()) {
+    if (infiniband->is_tx_buffer(it->raw_c_str())) {  // presumably already backed by a tx chunk: sent in place
+      if (need_reserve_bytes) {
+        unsigned copied = fill_tx_via_copy(tx_buffers, need_reserve_bytes, copy_it, it);
+        total += copied;
+        if (copied < need_reserve_bytes)
+          goto sending;  // ran short of chunks: send what was gathered so far
+        need_reserve_bytes = 0;
+      }
+      ceph_assert(copy_it == it);
+      tx_buffers.push_back(infiniband->get_tx_chunk_by_buffer(it->raw_c_str()));
+      total += it->length();
+      ++copy_it;
+    } else {
+      need_reserve_bytes += it->length();
+    }
+    ++it;
+  }
+  if (need_reserve_bytes)
+    total += fill_tx_via_copy(tx_buffers, need_reserve_bytes, copy_it, it);
+
+ sending:
+  if (total == 0)
+    return -EAGAIN;
+  ceph_assert(total <= pending_bl.length());
+  bufferlist swapped;
+  if (total < pending_bl.length()) {
+    worker->perf_logger->inc(l_msgr_rdma_tx_parital_mem);
+    pending_bl.splice(total, pending_bl.length()-total, &swapped);  // move the unsent tail aside...
+    pending_bl.swap(swapped);  // ...and make it the new pending_bl
+  } else {
+    pending_bl.clear();
+  }
+
+  ldout(cct, 20) << __func__ << " left bytes: " << pending_bl.length() << " in buffers "
+                 << pending_bl.buffers().size() << " tx chunks " << tx_buffers.size() << dendl;
+
+  int r = post_work_request(tx_buffers);
+  if (r < 0)
+    return r;
+  // NOTE(review): `bytes` below is the pending length sampled at entry, not the amount posted
+  ldout(cct, 20) << __func__ << " finished sending " << bytes << " bytes." << dendl;
+  return pending_bl.length() ? -EAGAIN : 0;
+}
+
+// Build one signaled IBV_WR_SEND work request per chunk in tx_buffers, chain
+// them in order and post the chain to this connection's queue pair with a
+// single ibv_post_send() call.
+//
+// tx_buffers must not be empty. Returns 0 on success or a negative errno
+// value when posting fails.
+int RDMAConnectedSocketImpl::post_work_request(std::vector<Chunk*> &tx_buffers)
+{
+  // tx_buffers[0] is logged and the arrays below are sized from the vector,
+  // so an empty vector would be undefined behaviour.
+  ceph_assert(!tx_buffers.empty());
+  ldout(cct, 20) << __func__ << " QP: " << my_msg.qpn << " " << tx_buffers[0] << dendl;
+  vector<Chunk*>::iterator current_buffer = tx_buffers.begin();
+  ibv_sge isge[tx_buffers.size()];
+  uint32_t current_sge = 0;
+  ibv_send_wr iswr[tx_buffers.size()];
+  uint32_t current_swr = 0;
+  ibv_send_wr* pre_wr = NULL;
+  uint32_t num = 0;
+
+  // FIPS zeroization audit 20191115: these memsets are not security related.
+  memset(iswr, 0, sizeof(iswr));
+  memset(isge, 0, sizeof(isge));
+
+  while (current_buffer != tx_buffers.end()) {
+    // one scatter/gather entry covering the bytes written into the chunk
+    isge[current_sge].addr = reinterpret_cast<uint64_t>((*current_buffer)->buffer);
+    isge[current_sge].length = (*current_buffer)->get_offset();
+    isge[current_sge].lkey = (*current_buffer)->mr->lkey;
+    ldout(cct, 25) << __func__ << " sending buffer: " << *current_buffer << " length: " << isge[current_sge].length  << dendl;
+
+    // wr_id carries the Chunk pointer so the completion can be mapped back
+    iswr[current_swr].wr_id = reinterpret_cast<uint64_t>(*current_buffer);
+    iswr[current_swr].next = NULL;
+    iswr[current_swr].sg_list = &isge[current_sge];
+    iswr[current_swr].num_sge = 1;
+    iswr[current_swr].opcode = IBV_WR_SEND;
+    iswr[current_swr].send_flags = IBV_SEND_SIGNALED;
+    /*if (isge[current_sge].length < infiniband->max_inline_data) {
+      iswr[current_swr].send_flags = IBV_SEND_INLINE;
+      ldout(cct, 20) << __func__ << " send_inline." << dendl;
+      }*/
+
+    num++;
+    worker->perf_logger->inc(l_msgr_rdma_tx_bytes, isge[current_sge].length);
+    // link the previous request to this one
+    if (pre_wr)
+      pre_wr->next = &iswr[current_swr];
+    pre_wr = &iswr[current_swr];
+    ++current_sge;
+    ++current_swr;
+    ++current_buffer;
+  }
+
+  ibv_send_wr *bad_tx_work_request;
+  // ibv_post_send() reports failure through its return value (an errno
+  // code); it is not documented to set errno, so errno must not be used.
+  const int rc = ibv_post_send(qp->get_qp(), iswr, &bad_tx_work_request);
+  if (rc) {
+    const int err = rc < 0 ? -rc : rc;
+    ldout(cct, 1) << __func__ << " failed to send data"
+                  << " (most probably should be peer not ready): "
+                  << cpp_strerror(err) << dendl;
+    worker->perf_logger->inc(l_msgr_rdma_tx_failed);
+    return -err;
+  }
+  qp->add_tx_wr(num);
+  worker->perf_logger->inc(l_msgr_rdma_tx_chunks, tx_buffers.size());
+  ldout(cct, 20) << __func__ << " qp state is " << Infiniband::qp_state_string(qp->get_state()) << dendl;
+  return 0;
+}
+
+// Post a signaled send work request that carries no scatter/gather entries
+// (num_sge == 0). Failures are logged and counted but not reported to the
+// caller.
+void RDMAConnectedSocketImpl::fin() {
+  ibv_send_wr wr;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&wr, 0, sizeof(wr));
+
+  // unlike data sends, wr_id identifies the queue pair rather than a Chunk
+  wr.wr_id = reinterpret_cast<uint64_t>(qp);
+  wr.num_sge = 0;
+  wr.opcode = IBV_WR_SEND;
+  wr.send_flags = IBV_SEND_SIGNALED;
+  ibv_send_wr* bad_tx_work_request;
+  // ibv_post_send() returns the errno code on failure; errno itself is not
+  // documented to be set, so the return value is what gets logged.
+  const int rc = ibv_post_send(qp->get_qp(), &wr, &bad_tx_work_request);
+  if (rc) {
+    ldout(cct, 1) << __func__ << " failed to send message="
+                  << " ibv_post_send failed(most probably should be peer not ready): "
+                  << cpp_strerror(rc < 0 ? -rc : rc) << dendl;
+    worker->perf_logger->inc(l_msgr_rdma_tx_failed);
+    return ;
+  }
+  qp->add_tx_wr(1);
+}
+
+// Tear down the handshake-time event handlers: close and free the TCP read
+// handler (after unregistering tcp_fd from the worker's event center) and
+// the established handler, leaving both pointers null.
+void RDMAConnectedSocketImpl::cleanup() {
+  const bool drop_read_handler = read_handler && tcp_fd >= 0;
+  if (drop_read_handler) {
+    auto *rh = static_cast<C_handle_connection_read*>(read_handler);
+    rh->close();
+    worker->center.submit_to(worker->center.get_id(), [this]() {
+      worker->center.delete_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE);
+    }, false);
+    delete read_handler;
+    read_handler = nullptr;
+  }
+
+  if (!established_handler)
+    return;
+  auto *eh = static_cast<C_handle_connection_established*>(established_handler);
+  eh->close();
+  delete established_handler;
+  established_handler = nullptr;
+}
+
+// Signal notify_fd by adding 1 to its counter. A short or failed write is
+// treated as a fatal error.
+void RDMAConnectedSocketImpl::notify()
+{
+  // note: notify_fd is an event fd (man eventfd)
+  // write argument must be a 64bit integer
+  uint64_t i = 1;
+
+  // Keep the syscall out of the assertion expression so the write never
+  // depends on how the assert macro evaluates its argument.
+  const ssize_t written = write(notify_fd, &i, sizeof(i));
+  ceph_assert(written == static_cast<ssize_t>(sizeof(i)));
+}
+
+void RDMAConnectedSocketImpl::shutdown()
+{
+  if (!error)
+    fin();
+  error = ECONNRESET;
+  active = false;
+}
+
+void RDMAConnectedSocketImpl::close()
+{
+  if (!error)
+    fin();
+  error = ECONNRESET;
+  active = false;
+}
+
+void RDMAConnectedSocketImpl::fault()
+{
+  ldout(cct, 1) << __func__ << " tcp fd " << tcp_fd << dendl;
+  /*if (qp) {
+    qp->to_dead();
+    qp = NULL;
+    }*/
+  error = ECONNRESET;
+  connected = 1;
+  notify();
+}
+
+// Adopt an accepted TCP socket for the server side of the handshake and
+// register read_handler for readability events on it with the worker's
+// event center.
+void RDMAConnectedSocketImpl::set_accept_fd(int sd)
+{
+  tcp_fd = sd;
+  is_server = true;
+
+  auto &center = worker->center;
+  center.submit_to(center.get_id(), [this]() {
+    worker->center.create_file_event(tcp_fd, EVENT_READABLE, read_handler);
+  }, true);
+}
+
+// Ask infiniband to post `num` receive chunks on this connection's queue
+// pair; whatever part of `num` is not covered by the returned count is
+// added to post_backlog.
+void RDMAConnectedSocketImpl::post_chunks_to_rq(int num)
+{
+  const auto posted = infiniband->post_chunks_to_rq(num, qp->get_qp());
+  post_backlog += num - posted;
+}
+
+// Retry posting the receive chunks that could not be posted earlier and
+// shrink post_backlog by the number that went through.
+//
+// The previous arithmetic (`backlog -= backlog - posted`) left the backlog
+// equal to the number just posted, so a fully successful retry never
+// cleared it.
+// NOTE(review): assumes dispatcher->post_chunks_to_rq() returns the number
+// of chunks actually posted, like the infiniband call used by
+// post_chunks_to_rq() — confirm.
+void RDMAConnectedSocketImpl::update_post_backlog()
+{
+  if (post_backlog)
+    post_backlog -= dispatcher->post_chunks_to_rq(post_backlog, qp->get_qp());
+}
diff --git a/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc b/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc
new file mode 100644
index 00000000..432c2d2b
--- /dev/null
+++ b/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc
@@ -0,0 +1,183 @@
+#include "RDMAStack.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAIWARPConnectedSocketImpl "
+
+#define TIMEOUT_MS 3000
+#define RETRY_COUNT 7
+
+RDMAIWARPConnectedSocketImpl::RDMAIWARPConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s,
+						 RDMAWorker *w, RDMACMInfo *info)
+  : RDMAConnectedSocketImpl(cct, ib, s, w), cm_con_handler(new C_handle_cm_connection(this))
+{  /* A non-null info adopts the cm_id / cm_channel of an incoming connection
+    * (server side) and allocates the queue pair right away; a null info
+    * creates a fresh event channel and cm id (client side). */
+  status = IDLE;
+  notify_fd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK);
+  if (info) {
+    is_server = true;
+    cm_id = info->cm_id;
+    cm_channel = info->cm_channel;
+    status = RDMA_ID_CREATED;
+    remote_qpn = info->qp_num;
+    if (alloc_resource()) {  // non-zero: no queue pair could be created
+      close_notify();  // sets `closed`, so the destructor's wait returns immediately
+      return;
+    }
+    worker->center.submit_to(worker->center.get_id(), [this]() {
+      worker->center.create_file_event(cm_channel->fd, EVENT_READABLE, cm_con_handler);
+      status = CHANNEL_FD_CREATED;
+    }, false);
+    status = RESOURCE_ALLOCATED;  // NOTE(review): also written by the lambda above; final value depends on when it runs — confirm
+    local_qpn = qp->get_local_qp_number();
+    my_msg.qpn = local_qpn;
+  } else {
+    is_server = false;
+    cm_channel = rdma_create_event_channel();  // NOTE(review): result is not checked
+    rdma_create_id(cm_channel, &cm_id, NULL, RDMA_PS_TCP);  // NOTE(review): return value ignored
+    status = RDMA_ID_CREATED;
+    ldout(cct, 20) << __func__ << " successfully created cm id: " << cm_id << dendl;
+  }
+}
+
+// Blocks until the socket has been marked closed, then releases the RDMA CM
+// id and its event channel if they were ever set up.
+RDMAIWARPConnectedSocketImpl::~RDMAIWARPConnectedSocketImpl() {
+  ldout(cct, 20) << __func__ << " destruct." << dendl;
+  std::unique_lock l(close_mtx);
+  while (!closed) {
+    close_condition.wait(l);
+  }
+  if (status < RDMA_ID_CREATED) {
+    // no cm id / channel was ever set up
+    return;
+  }
+  rdma_destroy_id(cm_id);
+  rdma_destroy_event_channel(cm_channel);
+}
+
+// Start an outgoing connection: register the CM channel fd with the worker's
+// event center and kick off address resolution for peer_addr. Returns 0 when
+// resolution was started and -1 when rdma_resolve_addr() fails.
+int RDMAIWARPConnectedSocketImpl::try_connect(const entity_addr_t& peer_addr, const SocketOptions &opts) {
+  worker->center.create_file_event(cm_channel->fd, EVENT_READABLE, cm_con_handler);
+  status = CHANNEL_FD_CREATED;
+
+  struct sockaddr *dst = const_cast<struct sockaddr*>(peer_addr.get_sockaddr());
+  const int rc = rdma_resolve_addr(cm_id, NULL, dst, TIMEOUT_MS);
+  if (rc == 0)
+    return 0;
+
+  lderr(cct) << __func__ << " failed to resolve addr" << dendl;
+  return -1;
+}
+
+// Flag the socket as reset, disconnect the CM id once status has reached
+// CONNECTED, and finally mark the socket closed via close_notify().
+void RDMAIWARPConnectedSocketImpl::close() {
+  error = ECONNRESET;
+  active = false;
+
+  const bool reached_connected = status >= CONNECTED;
+  if (reached_connected)
+    rdma_disconnect(cm_id);
+
+  close_notify();
+}
+
+void RDMAIWARPConnectedSocketImpl::shutdown() {  // only flags the socket as reset and inactive; no disconnect or close notification happens here
+  error = ECONNRESET;
+  active = false;
+}
+
+// Fetch one event from this connection's CM event channel and advance the
+// connection accordingly: address resolved -> resolve route; route resolved
+// -> allocate the queue pair and connect; established -> activate (client
+// side only); error events -> record -ECONNREFUSED; disconnected -> mark the
+// socket closed. Failures are published through `connected` / `error`
+// followed by notify(). The event is acknowledged before returning.
+void RDMAIWARPConnectedSocketImpl::handle_cm_connection() {
+  struct rdma_cm_event *event = nullptr;
+  // Without a successfully fetched event there is nothing to inspect or
+  // acknowledge; dereferencing `event` here would be undefined behaviour.
+  if (rdma_get_cm_event(cm_channel, &event)) {
+    lderr(cct) << __func__ << " failed to get cm event, errno: " << errno << dendl;
+    return;
+  }
+  ldout(cct, 20) << __func__ << " event name: " << rdma_event_str(event->event)
+                             << " (cm id: " << cm_id << ")" << dendl;
+  struct rdma_conn_param cm_params;
+  switch (event->event) {
+    case RDMA_CM_EVENT_ADDR_RESOLVED:
+      status = ADDR_RESOLVED;
+      if (rdma_resolve_route(cm_id, TIMEOUT_MS)) {
+        lderr(cct) << __func__ << " failed to resolve rdma addr" << dendl;
+        // report the failure like every other failed step below, instead of
+        // waking the waiter with `connected` unchanged
+        connected = -ECONNREFUSED;
+        notify();
+      }
+      break;
+
+    case RDMA_CM_EVENT_ROUTE_RESOLVED:
+      status = ROUTE_RESOLVED;
+      if (alloc_resource()) {
+        lderr(cct) << __func__ << " failed to alloc resource while resolving the route" << dendl;
+        connected = -ECONNREFUSED;
+        notify();
+        break;
+      }
+      local_qpn = qp->get_local_qp_number();
+      my_msg.qpn = local_qpn;
+
+      // FIPS zeroization audit 20191115: this memset is not security related.
+      memset(&cm_params, 0, sizeof(cm_params));
+      cm_params.retry_count = RETRY_COUNT;
+      cm_params.qp_num = local_qpn;
+      if (rdma_connect(cm_id, &cm_params)) {
+        lderr(cct) << __func__ << " failed to connect remote rdma port" << dendl;
+        connected = -ECONNREFUSED;
+        notify();
+      }
+      break;
+
+    case RDMA_CM_EVENT_ESTABLISHED:
+      ldout(cct, 20) << __func__ << " qp_num=" << cm_id->qp->qp_num << dendl;
+      status = CONNECTED;
+      // only the client side activates and notifies here
+      if (!is_server) {
+        remote_qpn = event->param.conn.qp_num;
+        activate();
+        notify();
+      }
+      break;
+
+    case RDMA_CM_EVENT_ADDR_ERROR:
+    case RDMA_CM_EVENT_ROUTE_ERROR:
+    case RDMA_CM_EVENT_CONNECT_ERROR:
+    case RDMA_CM_EVENT_UNREACHABLE:
+    case RDMA_CM_EVENT_REJECTED:
+      lderr(cct) << __func__ << " rdma connection rejected" << dendl;
+      connected = -ECONNREFUSED;
+      notify();
+      break;
+
+    case RDMA_CM_EVENT_DISCONNECTED:
+      status = DISCONNECTED;
+      close_notify();
+      if (!error) {
+        error = ECONNRESET;
+        notify();
+      }
+      break;
+
+    case RDMA_CM_EVENT_DEVICE_REMOVAL:
+      break;
+
+    default:
+      ceph_abort_msg("unhandled event");
+      break;
+  }
+  rdma_ack_cm_event(event);
+}
+
+void RDMAIWARPConnectedSocketImpl::activate() {  // marks the socket usable: sets active and connected
+  ldout(cct, 30) << __func__ << dendl;
+  active = true;
+  connected = 1;
+}
+
+// Create the RC queue pair for this connection on cm_id, post receive chunks
+// to it when SRQ support is disabled, and register it with the dispatcher.
+// Returns 0 on success and -1 when no queue pair could be created.
+int RDMAIWARPConnectedSocketImpl::alloc_resource() {
+  ldout(cct, 30) << __func__ << dendl;
+
+  qp = infiniband->create_queue_pair(cct,
+                                     dispatcher->get_tx_cq(),
+                                     dispatcher->get_rx_cq(),
+                                     IBV_QPT_RC,
+                                     cm_id);
+  if (!qp)
+    return -1;
+
+  if (!cct->_conf->ms_async_rdma_support_srq) {
+    dispatcher->post_chunks_to_rq(infiniband->get_rx_queue_len(), qp->get_qp());
+  }
+
+  dispatcher->register_qp(qp, this);
+  auto *counters = dispatcher->perf_logger;
+  counters->inc(l_msgr_rdma_created_queue_pair);
+  counters->inc(l_msgr_rdma_active_queue_pair);
+  return 0;
+}
+
+// Unregister the CM channel fd from the event center (once status has
+// reached CHANNEL_FD_CREATED) and mark the socket closed, waking anyone
+// waiting on close_condition. Marking closed happens at most once.
+void RDMAIWARPConnectedSocketImpl::close_notify() {
+  ldout(cct, 30) << __func__ << dendl;
+
+  const bool fd_registered = status >= CHANNEL_FD_CREATED;
+  if (fd_registered)
+    worker->center.delete_file_event(cm_channel->fd, EVENT_READABLE);
+
+  std::unique_lock l(close_mtx);
+  if (closed)
+    return;
+  closed = true;
+  close_condition.notify_all();
+}
diff --git a/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc b/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc
new file mode 100644
index 00000000..210eaf00
--- /dev/null
+++ b/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc
@@ -0,0 +1,107 @@
+#include <poll.h>
+
+#include "msg/async/net_handler.h"
+#include "RDMAStack.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAIWARPServerSocketImpl "
+
+// Thin constructor: every argument is forwarded unchanged to
+// RDMAServerSocketImpl.
+RDMAIWARPServerSocketImpl::RDMAIWARPServerSocketImpl(
+  CephContext *cct,
+  Infiniband* i,
+  RDMADispatcher *s,
+  RDMAWorker *w,
+  entity_addr_t& a,
+  unsigned addr_slot)
+  : RDMAServerSocketImpl(cct, i, s, w, a, addr_slot)
+{}
+
+// Create an RDMA CM event channel and id, bind the id to `sa` and start
+// listening. On success server_setup_socket is set to the channel's fd and 0
+// is returned; on failure everything created here is destroyed,
+// server_setup_socket is -1 and a negative errno is returned.
+int RDMAIWARPServerSocketImpl::listen(entity_addr_t &sa,
+				      const SocketOptions &opt)
+{
+  ldout(cct, 20) << __func__ << " bind to rdma point" << dendl;
+  int rc = 0;
+  cm_channel = rdma_create_event_channel();
+  if (!cm_channel) {
+    // nothing to clean up yet; continuing would dereference a null channel
+    rc = -errno;
+    ldout(cct, 1) << __func__ << " unable to create rdma event channel: "
+                  << cpp_strerror(-rc) << dendl;
+    server_setup_socket = -1;
+    return rc;
+  }
+  if (rdma_create_id(cm_channel, &cm_id, NULL, RDMA_PS_TCP)) {
+    // no id exists, so only the channel is released (the shared err path
+    // below would also destroy cm_id)
+    rc = -errno;
+    ldout(cct, 1) << __func__ << " unable to create rdma cm id: "
+                  << cpp_strerror(-rc) << dendl;
+    rdma_destroy_event_channel(cm_channel);
+    server_setup_socket = -1;
+    return rc;
+  }
+  ldout(cct, 20) << __func__ << " successfully created cm id: " << cm_id << dendl;
+  rc = rdma_bind_addr(cm_id, const_cast<struct sockaddr*>(sa.get_sockaddr()));
+  if (rc < 0) {
+    rc = -errno;
+    ldout(cct, 10) << __func__ << " unable to bind to " << sa.get_sockaddr()
+                   << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl;
+    goto err;
+  }
+  rc = rdma_listen(cm_id, 128);
+  if (rc < 0) {
+    rc = -errno;
+    ldout(cct, 10) << __func__ << " unable to listen to " << sa.get_sockaddr()
+                   << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl;
+    goto err;
+  }
+  server_setup_socket = cm_channel->fd;
+  ldout(cct, 20) << __func__ << " fd of cm_channel is " << server_setup_socket << dendl;
+  return 0;
+
+err:
+  server_setup_socket = -1;
+  rdma_destroy_id(cm_id);
+  rdma_destroy_event_channel(cm_channel);
+  return rc;
+}
+
+// Accept one pending connection from the listening CM channel without
+// blocking. The event's cm id is migrated to a channel of its own, wrapped
+// in a new RDMAIWARPConnectedSocketImpl and accepted; the new socket is
+// handed back through `sock` and the peer address through `out`.
+//
+// Returns 0 on success, -EAGAIN when no event is pending or rdma_accept()
+// fails, or a negative errno when the pending event cannot be fetched.
+int RDMAIWARPServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt,
+    entity_addr_t *out, Worker *w)
+{
+  ldout(cct, 15) << __func__ << dendl;
+
+  ceph_assert(sock);
+  // zero timeout: only check whether an event is already waiting
+  struct pollfd pfd = {
+    .fd = cm_channel->fd,
+    .events = POLLIN,
+  };
+  int ret = poll(&pfd, 1, 0);
+  ceph_assert(ret >= 0);
+  if (!ret)
+    return -EAGAIN;
+
+  struct rdma_cm_event *cm_event = nullptr;
+  // without a fetched event there is nothing valid to dereference below
+  if (rdma_get_cm_event(cm_channel, &cm_event)) {
+    int err = -errno;
+    ldout(cct, 1) << __func__ << " failed to get cm event: " << cpp_strerror(-err) << dendl;
+    return err;
+  }
+  ldout(cct, 20) << __func__ << " event name: " << rdma_event_str(cm_event->event) << dendl;
+
+  struct rdma_cm_id *event_cm_id = cm_event->id;
+  struct rdma_event_channel *event_channel = rdma_create_event_channel();
+
+  rdma_migrate_id(event_cm_id, event_channel);
+
+  struct rdma_cm_id *new_cm_id = event_cm_id;
+  struct rdma_conn_param *remote_conn_param = &cm_event->param.conn;
+  struct rdma_conn_param local_conn_param;
+
+  RDMACMInfo info(new_cm_id, event_channel, remote_conn_param->qp_num);
+  RDMAIWARPConnectedSocketImpl* server =
+    new RDMAIWARPConnectedSocketImpl(cct, infiniband, dispatcher, dynamic_cast<RDMAWorker*>(w), &info);
+
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&local_conn_param, 0, sizeof(local_conn_param));
+  local_conn_param.qp_num = server->get_local_qpn();
+
+  if (rdma_accept(new_cm_id, &local_conn_param)) {
+    ldout(cct, 1) << __func__ << " rdma_accept failed" << dendl;
+    // every fetched event has to be acknowledged, also on this path
+    rdma_ack_cm_event(cm_event);
+    // NOTE(review): `server` is still leaked here. Its destructor waits
+    // until the socket is marked closed, and whether close() may be called
+    // from this thread is not visible from this file — confirm before
+    // adding a delete.
+    return -EAGAIN;
+  }
+  server->activate();
+  ldout(cct, 20) << __func__ << " accepted a new QP" << dendl;
+
+  rdma_ack_cm_event(cm_event);
+
+  std::unique_ptr<RDMAConnectedSocketImpl> csi(server);
+  *sock = ConnectedSocket(std::move(csi));
+  struct sockaddr *addr = &new_cm_id->route.addr.dst_addr;
+  out->set_sockaddr(addr);
+
+  return 0;
+}
+
+// Stop accepting: destroy the listening CM id and its event channel, but
+// only when server_setup_socket holds a valid (non-negative) fd.
+void RDMAIWARPServerSocketImpl::abort_accept()
+{
+  if (server_setup_socket < 0)
+    return;
+
+  rdma_destroy_id(cm_id);
+  rdma_destroy_event_channel(cm_channel);
+}
diff --git a/src/msg/async/rdma/RDMAServerSocketImpl.cc b/src/msg/async/rdma/RDMAServerSocketImpl.cc
new file mode 100644
index 00000000..98402cfd
--- /dev/null
+++ b/src/msg/async/rdma/RDMAServerSocketImpl.cc
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "msg/async/net_handler.h"
+#include "RDMAStack.h"
+
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAServerSocketImpl "
+
+// Stores the collaborators and the listen address; no socket is created
+// here, so server_setup_socket starts out as -1.
+RDMAServerSocketImpl::RDMAServerSocketImpl(
+  CephContext *cct, Infiniband* i, RDMADispatcher *s, RDMAWorker *w,
+  entity_addr_t& a, unsigned slot)
+  : ServerSocketImpl(a.get_type(), slot),
+    cct(cct),
+    net(cct),
+    server_setup_socket(-1),
+    infiniband(i),
+    dispatcher(s),
+    worker(w),
+    sa(a)
+{}
+
+// Create the TCP socket used for the RDMA handshake, make it non-blocking,
+// apply the socket options, bind it to `sa` and start listening. Returns 0
+// on success; on failure the socket is closed, server_setup_socket is reset
+// to -1 and a negative error code is returned.
+int RDMAServerSocketImpl::listen(entity_addr_t &sa, const SocketOptions &opt)
+{
+  server_setup_socket = net.create_socket(sa.get_family(), true);
+  if (server_setup_socket < 0) {
+    const int err = -errno;
+    lderr(cct) << __func__ << " failed to create server socket: "
+               << cpp_strerror(errno) << dendl;
+    return err;
+  }
+
+  // common failure path once the socket exists: close it and forget it
+  auto abandon = [this](int err) {
+    ::close(server_setup_socket);
+    server_setup_socket = -1;
+    return err;
+  };
+
+  int rc = net.set_nonblock(server_setup_socket);
+  if (rc < 0)
+    return abandon(rc);
+
+  rc = net.set_socket_options(server_setup_socket, opt.nodelay, opt.rcbuf_size);
+  if (rc < 0)
+    return abandon(rc);
+
+  if (::bind(server_setup_socket, sa.get_sockaddr(), sa.get_sockaddr_len()) < 0) {
+    rc = -errno;
+    ldout(cct, 10) << __func__ << " unable to bind to " << sa.get_sockaddr()
+                   << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl;
+    return abandon(rc);
+  }
+
+  if (::listen(server_setup_socket, cct->_conf->ms_tcp_listen_backlog) < 0) {
+    rc = -errno;
+    lderr(cct) << __func__ << " unable to listen on " << sa << ": " << cpp_strerror(errno) << dendl;
+    return abandon(rc);
+  }
+
+  ldout(cct, 20) << __func__ << " bind to " << sa.get_sockaddr() << " on port " << sa.get_port()  << dendl;
+  return 0;
+}
+
+// Accept one TCP connection on the listening socket, configure it, and wrap
+// it in a new RDMAConnectedSocketImpl that is handed back through `sock`.
+// The peer address is stored in `out`, which must not be NULL.
+//
+// Returns 0 on success or a negative error code when accepting or
+// configuring the socket fails.
+int RDMAServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w)
+{
+  ldout(cct, 15) << __func__ << dendl;
+
+  ceph_assert(sock);
+
+  sockaddr_storage ss;
+  socklen_t slen = sizeof(ss);
+  int sd = accept_cloexec(server_setup_socket, (sockaddr*)&ss, &slen);
+  if (sd < 0) {
+    return -errno;
+  }
+
+  // The helpers' negative result is returned as-is, matching listen();
+  // errno is not re-read because ::close() or logging may have changed it.
+  int r = net.set_nonblock(sd);
+  if (r < 0) {
+    ::close(sd);
+    return r;
+  }
+
+  r = net.set_socket_options(sd, opt.nodelay, opt.rcbuf_size);
+  if (r < 0) {
+    ::close(sd);
+    return r;
+  }
+
+  ceph_assert(NULL != out); //out should not be NULL in accept connection
+
+  out->set_type(addr_type);
+  out->set_sockaddr((sockaddr*)&ss);
+  net.set_priority(sd, opt.priority, out->get_family());
+
+  RDMAConnectedSocketImpl* server;
+  //Worker* w = dispatcher->get_stack()->get_worker();
+  server = new RDMAConnectedSocketImpl(cct, infiniband, dispatcher, dynamic_cast<RDMAWorker*>(w));
+  // hand the TCP fd to the new socket, which registers it for read events
+  server->set_accept_fd(sd);
+  ldout(cct, 20) << __func__ << " accepted a new QP, tcp_fd: " << sd << dendl;
+  std::unique_ptr<RDMAConnectedSocketImpl> csi(server);
+  *sock = ConnectedSocket(std::move(csi));
+
+  return 0;
+}
+
+// Stop accepting by closing the listening socket, if one is open.
+void RDMAServerSocketImpl::abort_accept()
+{
+  if (server_setup_socket < 0) {
+    // no listening socket to close
+    return;
+  }
+  ::close(server_setup_socket);
+}
diff --git a/src/msg/async/rdma/RDMAStack.cc b/src/msg/async/rdma/RDMAStack.cc
new file mode 100644
index 00000000..f63a8e7d
--- /dev/null
+++ b/src/msg/async/rdma/RDMAStack.cc
@@ -0,0 +1,610 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <poll.h>
+#include <errno.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "include/str_list.h"
+#include "include/compat.h"
+#include "common/Cycles.h"
+#include "common/deleter.h"
+#include "common/Tub.h"
+#include "RDMAStack.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "RDMAStack "
+
+// Tears the dispatcher down: stops the polling thread, checks that no
+// connection or dead queue pair is still tracked, then frees the async-event
+// callback allocated by the constructor.
+RDMADispatcher::~RDMADispatcher()
+{
+  ldout(cct, 20) << __func__ << " destructing rdma dispatcher" << dendl;
+  polling_stop();
+
+  // Both containers and their counters must already be empty at this point.
+  ceph_assert(qp_conns.empty());
+  ceph_assert(num_qp_conn == 0);
+  ceph_assert(dead_queue_pairs.empty());
+  ceph_assert(num_dead_queue_pair == 0);
+
+  delete async_handler;
+}
+
+/**
+ * Build the dispatcher for stack \a s and register its perf counters with the
+ * context's perf-counter collection.  No thread or completion queue is created
+ * here; that happens in polling_start().
+ */
+RDMADispatcher::RDMADispatcher(CephContext* c, RDMAStack* s)
+  : cct(c), async_handler(new C_handle_cq_async(this)), lock("RDMADispatcher::lock"),
+  w_lock("RDMADispatcher::for worker pending list"), stack(s)
+{
+  PerfCountersBuilder plb(cct, "AsyncMessenger::RDMADispatcher", l_msgr_rdma_dispatcher_first, l_msgr_rdma_dispatcher_last);
+
+  // Polling state and buffer accounting.
+  plb.add_u64_counter(l_msgr_rdma_polling, "polling", "Whether dispatcher thread is polling");
+  plb.add_u64_counter(l_msgr_rdma_inflight_tx_chunks, "inflight_tx_chunks", "The number of inflight tx chunks");
+  plb.add_u64_counter(l_msgr_rdma_rx_bufs_in_use, "rx_bufs_in_use", "The number of rx buffers that are holding data and being processed");
+  plb.add_u64_counter(l_msgr_rdma_rx_bufs_total, "rx_bufs_total", "The total number of rx buffers");
+
+  // Tx work completions.
+  plb.add_u64_counter(l_msgr_rdma_tx_total_wc, "tx_total_wc", "The number of tx work completions");
+  plb.add_u64_counter(l_msgr_rdma_tx_total_wc_errors, "tx_total_wc_errors", "The number of tx errors");
+  plb.add_u64_counter(l_msgr_rdma_tx_wc_retry_errors, "tx_retry_errors", "The number of tx retry errors");
+  plb.add_u64_counter(l_msgr_rdma_tx_wc_wr_flush_errors, "tx_wr_flush_errors", "The number of tx work request flush errors");
+
+  // Rx work completions.
+  plb.add_u64_counter(l_msgr_rdma_rx_total_wc, "rx_total_wc", "The number of total rx work completion");
+  plb.add_u64_counter(l_msgr_rdma_rx_total_wc_errors, "rx_total_wc_errors", "The number of total rx error work completion");
+  plb.add_u64_counter(l_msgr_rdma_rx_fin, "rx_fin", "The number of rx finish work request");
+
+  // Asynchronous device events.
+  plb.add_u64_counter(l_msgr_rdma_total_async_events, "total_async_events", "The number of async events");
+  plb.add_u64_counter(l_msgr_rdma_async_last_wqe_events, "async_last_wqe_events", "The number of last wqe events");
+
+  plb.add_u64_counter(l_msgr_rdma_handshake_errors, "handshake_errors", "The number of handshake errors");
+
+  // Queue pair accounting; each description now matches its counter name
+  // (the two texts used to be swapped).
+  plb.add_u64_counter(l_msgr_rdma_created_queue_pair, "created_queue_pair", "Created queue pair number");
+  plb.add_u64_counter(l_msgr_rdma_active_queue_pair, "active_queue_pair", "Active queue pair number");
+
+  perf_logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perf_logger);
+  Cycles::init();
+}
+
+/**
+ * Start the polling thread if it is not running yet.
+ *
+ * On the first call this creates the tx/rx completion channels and completion
+ * queues and launches polling() in a thread named "rdma-polling".  Later calls
+ * return immediately.
+ */
+void RDMADispatcher::polling_start()
+{
+  // take lock because listen/connect can happen from different worker threads
+  Mutex::Locker guard(lock);
+
+  // dispatcher thread already running
+  if (t.joinable())
+    return;
+
+  Infiniband &ib = get_stack()->get_infiniband();
+  ib.get_memory_manager()->set_rx_stat_logger(perf_logger);
+
+  // Channels first: each completion queue is created on top of its channel.
+  tx_cc = ib.create_comp_channel(cct);
+  ceph_assert(tx_cc);
+  rx_cc = ib.create_comp_channel(cct);
+  ceph_assert(rx_cc);
+  tx_cq = ib.create_comp_queue(cct, tx_cc);
+  ceph_assert(tx_cq);
+  rx_cq = ib.create_comp_queue(cct, rx_cc);
+  ceph_assert(rx_cq);
+
+  t = std::thread(&RDMADispatcher::polling, this);
+  ceph_pthread_setname(t.native_handle(), "rdma-polling");
+}
+
+// Asks the polling thread to finish, waits for it, then releases the
+// completion queues and channels created by polling_start().
+// If the thread was never started there is nothing to release.
+void RDMADispatcher::polling_stop()
+{
+  {
+    // `done` is set under the dispatcher lock; polling() checks it when idle.
+    Mutex::Locker l(lock);
+    done = true;
+  }
+
+  if (!t.joinable())
+    return;
+
+  t.join();
+
+  // The thread has exited, so nothing polls these objects any more.
+  tx_cc->ack_events();
+  rx_cc->ack_events();
+  delete tx_cq;
+  delete rx_cq;
+  delete tx_cc;
+  delete rx_cc;
+}
+
+// Drains the device's asynchronous event queue.
+//
+// Loops until ibv_get_async_event() reports failure; EAGAIN is treated as
+// "no more events" and is not logged.  IBV_EVENT_QP_LAST_WQE_REACHED events
+// are matched to a connection by QP number; every other event type is only
+// logged.  Each retrieved event is acknowledged before the next one is read.
+void RDMADispatcher::handle_async_event()
+{
+  ldout(cct, 30) << __func__ << dendl;
+  while (1) {
+    ibv_async_event async_event;
+    if (ibv_get_async_event(get_stack()->get_infiniband().get_device()->ctxt, &async_event)) {
+      // No event was fetched, so there is nothing to acknowledge on this path.
+      if (errno != EAGAIN)
+       lderr(cct) << __func__ << " ibv_get_async_event failed. (errno=" << errno
+                  << " " << cpp_strerror(errno) << ")" << dendl;
+      return;
+    }
+    perf_logger->inc(l_msgr_rdma_total_async_events);
+    // FIXME: Currently we must ensure no other factor make QP in ERROR state,
+    // otherwise this qp can't be deleted in current cleanup flow.
+    if (async_event.event_type == IBV_EVENT_QP_LAST_WQE_REACHED) {
+      perf_logger->inc(l_msgr_rdma_async_last_wqe_events);
+      uint64_t qpn = async_event.element.qp->qp_num;
+      ldout(cct, 10) << __func__ << " event associated qp=" << async_event.element.qp
+                     << " evt: " << ibv_event_type_str(async_event.event_type) << dendl;
+      // Hold the dispatcher lock for the lookup and for the fault()/erase below.
+      Mutex::Locker l(lock);
+      RDMAConnectedSocketImpl *conn = get_conn_lockless(qpn);
+      if (!conn) {
+        ldout(cct, 1) << __func__ << " missing qp_num=" << qpn << " discard event" << dendl;
+      } else {
+        ldout(cct, 1) << __func__ << " it's not forwardly stopped by us, reenable=" << conn << dendl;
+        conn->fault();
+        // Without RDMA-CM the QP is moved to the dead list right here.
+        if (!cct->_conf->ms_async_rdma_cm)
+          erase_qpn_lockless(qpn);
+      }
+    } else {
+      ldout(cct, 1) << __func__ << " ibv_get_async_event: dev=" << get_stack()->get_infiniband().get_device()->ctxt
+                    << " evt: " << ibv_event_type_str(async_event.event_type)
+                    << dendl;
+    }
+    ibv_ack_async_event(&async_event);
+  }
+}
+
+// Hands an rx chunk back to the Infiniband layer under the dispatcher lock
+// and decrements the "rx_bufs_in_use" counter.
+void RDMADispatcher::post_chunk_to_pool(Chunk* chunk)
+{
+  Mutex::Locker guard(lock);
+  Infiniband &ib = get_stack()->get_infiniband();
+  ib.post_chunk_to_pool(chunk);
+  perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+}
+
+// Forwards to Infiniband::post_chunks_to_rq() while holding the dispatcher
+// lock and returns that call's result unchanged.
+int RDMADispatcher::post_chunks_to_rq(int num, ibv_qp *qp)
+{
+  Mutex::Locker guard(lock);
+  Infiniband &ib = get_stack()->get_infiniband();
+  return ib.post_chunks_to_rq(num, qp);
+}
+
+/**
+ * Body of the "rdma-polling" thread.
+ *
+ * Each iteration polls the tx and the rx completion queue.  Tx completions go
+ * to handle_tx_event(); rx completions are grouped per connection and handed
+ * over with pass_wc().  When both queues are empty the loop reaps dead queue
+ * pairs that have no outstanding tx work requests, and exits once `done` is
+ * set and no connection or dead queue pair remains.  After
+ * ms_async_rdma_polling_us of inactivity it handles async events, re-arms CQ
+ * notification and blocks in poll() on the two completion channels.
+ */
+void RDMADispatcher::polling()
+{
+  // A compile-time constant keeps `wc` a regular array; with a non-const
+  // bound it would be a variable-length array, which is not standard C++.
+  static constexpr int MAX_COMPLETIONS = 32;
+  ibv_wc wc[MAX_COMPLETIONS];
+
+  std::map<RDMAConnectedSocketImpl*, std::vector<ibv_wc> > polled;
+  ldout(cct, 20) << __func__ << " going to poll tx cq: " << tx_cq << " rx cq: " << rx_cq << dendl;
+  RDMAConnectedSocketImpl *conn = nullptr;
+  uint64_t last_inactive = Cycles::rdtsc();
+  bool rearmed = false;
+  int r = 0;
+
+  while (true) {
+    int tx_ret = tx_cq->poll_cq(MAX_COMPLETIONS, wc);
+    if (tx_ret > 0) {
+      ldout(cct, 20) << __func__ << " tx completion queue got " << tx_ret
+                     << " responses."<< dendl;
+      handle_tx_event(wc, tx_ret);
+    }
+
+    // `wc` is reused for the rx queue; the tx entries were consumed above.
+    int rx_ret = rx_cq->poll_cq(MAX_COMPLETIONS, wc);
+    if (rx_ret > 0) {
+      ldout(cct, 20) << __func__ << " rx completion queue got " << rx_ret
+                     << " responses."<< dendl;
+      perf_logger->inc(l_msgr_rdma_rx_total_wc, rx_ret);
+      perf_logger->inc(l_msgr_rdma_rx_bufs_in_use, rx_ret);
+
+      Mutex::Locker l(lock);//make sure connected socket alive when pass wc
+
+      for (int i = 0; i < rx_ret; ++i) {
+        ibv_wc* response = &wc[i];
+        Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id);
+
+        if (response->status == IBV_WC_SUCCESS) {
+          ceph_assert(wc[i].opcode == IBV_WC_RECV);
+          conn = get_conn_lockless(response->qp_num);
+          if (!conn) {
+            // No live connection for this QP: give the buffer straight back.
+            ldout(cct, 1) << __func__ << " csi with qpn " << response->qp_num << " may be dead. chunk " << chunk << " will be back ? " << r << dendl;
+            get_stack()->get_infiniband().post_chunk_to_pool(chunk);
+            perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+          } else {
+            conn->post_chunks_to_rq(1);
+            polled[conn].push_back(*response);
+          }
+        } else {
+          perf_logger->inc(l_msgr_rdma_rx_total_wc_errors);
+          ldout(cct, 1) << __func__ << " work request returned error for buffer(" << chunk
+              << ") status(" << response->status << ":"
+              << get_stack()->get_infiniband().wc_status_to_string(response->status) << ")" << dendl;
+          if (response->status != IBV_WC_WR_FLUSH_ERR) {
+            conn = get_conn_lockless(response->qp_num);
+            if (conn && conn->is_connected())
+              conn->fault();
+          }
+          get_stack()->get_infiniband().post_chunk_to_pool(chunk);
+          perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+        }
+      }
+      // Deliver the batched completions, one call per connection.
+      for (auto &&i : polled)
+        i.first->pass_wc(std::move(i.second));
+      polled.clear();
+    }
+
+    if (!tx_ret && !rx_ret) {
+      // NOTE: Has TX just transitioned to idle? We should do it when idle!
+      // It's now safe to delete queue pairs (see comment by declaration
+      // for dead_queue_pairs).
+      // Additionally, don't delete qp while outstanding_buffers isn't empty,
+      // because we need to check qp's state before sending
+      perf_logger->set(l_msgr_rdma_inflight_tx_chunks, inflight);
+      if (num_dead_queue_pair) {
+        Mutex::Locker l(lock); // FIXME reuse dead qp because creating one qp costs 1 ms
+        auto it = dead_queue_pairs.begin();
+        while (it != dead_queue_pairs.end()) {
+          auto i = *it;
+          // Bypass QPs that do not collect all Tx completions yet.
+          if (i->get_tx_wr()) {
+            ldout(cct, 20) << __func__ << " bypass qp=" << i << " tx_wr=" << i->get_tx_wr() << dendl;
+            ++it;
+          } else {
+            ldout(cct, 10) << __func__ << " finally delete qp=" << i << dendl;
+            delete i;
+            it = dead_queue_pairs.erase(it);
+            perf_logger->dec(l_msgr_rdma_active_queue_pair);
+            --num_dead_queue_pair;
+          }
+        }
+      }
+      // Only exit once shutdown was requested and nothing is left to reap.
+      if (!num_qp_conn && done && dead_queue_pairs.empty())
+        break;
+
+      uint64_t now = Cycles::rdtsc();
+      if (Cycles::to_microseconds(now - last_inactive) > cct->_conf->ms_async_rdma_polling_us) {
+        handle_async_event();
+        if (!rearmed) {
+          // Clean up cq events after rearm notify ensure no new incoming event
+          // arrived between polling and rearm
+          tx_cq->rearm_notify();
+          rx_cq->rearm_notify();
+          rearmed = true;
+          continue;
+        }
+
+        struct pollfd channel_poll[2];
+        channel_poll[0].fd = tx_cc->get_fd();
+        channel_poll[0].events = POLLIN | POLLERR | POLLNVAL | POLLHUP;
+        channel_poll[0].revents = 0;
+        channel_poll[1].fd = rx_cc->get_fd();
+        channel_poll[1].events = POLLIN | POLLERR | POLLNVAL | POLLHUP;
+        channel_poll[1].revents = 0;
+        r = 0;
+        perf_logger->set(l_msgr_rdma_polling, 0);
+        // Wake up every 100 ms so that a shutdown request is noticed.
+        while (!done && r == 0) {
+          r = TEMP_FAILURE_RETRY(poll(channel_poll, 2, 100));
+          if (r < 0) {
+            r = -errno;
+            lderr(cct) << __func__ << " poll failed " << r << dendl;
+            ceph_abort();
+          }
+        }
+        if (r > 0 && tx_cc->get_cq_event())
+          ldout(cct, 20) << __func__ << " got tx cq event." << dendl;
+        if (r > 0 && rx_cc->get_cq_event())
+          ldout(cct, 20) << __func__ << " got rx cq event." << dendl;
+        last_inactive = Cycles::rdtsc();
+        perf_logger->set(l_msgr_rdma_polling, 1);
+        rearmed = false;
+      }
+    }
+  }
+}
+
+// Wakes at most one worker that is waiting for tx buffers.  The worker is
+// removed from the pending list under w_lock and notified after the lock has
+// been released.
+void RDMADispatcher::notify_pending_workers() {
+  // Unlocked fast path: the counter says nobody is waiting.
+  if (!num_pending_workers)
+    return;
+
+  RDMAWorker *next = nullptr;
+  {
+    Mutex::Locker guard(w_lock);
+    if (!pending_workers.empty()) {
+      next = pending_workers.front();
+      pending_workers.pop_front();
+      --num_pending_workers;
+    }
+  }
+
+  if (next)
+    next->notify_worker();
+}
+
+// Records the (queue pair, connection) pair under the queue pair's local QP
+// number.  Registering the same QP number twice trips an assertion.
+void RDMADispatcher::register_qp(QueuePair *qp, RDMAConnectedSocketImpl* csi)
+{
+  Mutex::Locker guard(lock);
+  const auto local_qpn = qp->get_local_qp_number();
+  ceph_assert(!qp_conns.count(local_qpn));
+  qp_conns[local_qpn] = std::make_pair(qp, csi);
+  ++num_qp_conn;
+}
+
+// Looks up the connection registered for QP number `qp` without taking the
+// dispatcher lock itself.  Returns nullptr when the QP number is unknown or
+// its queue pair reports is_dead().
+RDMAConnectedSocketImpl* RDMADispatcher::get_conn_lockless(uint32_t qp)
+{
+  auto entry = qp_conns.find(qp);
+  const bool usable = entry != qp_conns.end() && !entry->second.first->is_dead();
+  return usable ? entry->second.second : nullptr;
+}
+
+// Finds the queue pair with local QP number `qp`, searching the registered
+// connections first and the dead queue pairs second.  Takes the dispatcher
+// lock.  Returns nullptr when neither container knows the number.
+Infiniband::QueuePair* RDMADispatcher::get_qp(uint32_t qp)
+{
+  Mutex::Locker guard(lock);
+
+  // Registered queue pairs are checked first.
+  auto live = qp_conns.find(qp);
+  if (live != qp_conns.end())
+    return live->second.first;
+
+  // Otherwise the queue pair may already be waiting for destruction.
+  for (auto dead = dead_queue_pairs.begin(); dead != dead_queue_pairs.end(); ++dead) {
+    if ((*dead)->get_local_qp_number() == qp)
+      return *dead;
+  }
+
+  return nullptr;
+}
+
+// Moves the queue pair registered under `qpn` from qp_conns to
+// dead_queue_pairs and adjusts both counters.  Unknown QP numbers are ignored.
+// Does not take the dispatcher lock itself.
+void RDMADispatcher::erase_qpn_lockless(uint32_t qpn)
+{
+  auto entry = qp_conns.find(qpn);
+  if (entry != qp_conns.end()) {
+    ++num_dead_queue_pair;
+    dead_queue_pairs.push_back(entry->second.first);
+    qp_conns.erase(entry);
+    --num_qp_conn;
+  }
+}
+
+// Locking wrapper around erase_qpn_lockless(): takes the dispatcher lock and
+// then retires the queue pair registered under `qpn`.
+void RDMADispatcher::erase_qpn(uint32_t qpn)
+{
+  Mutex::Locker l(lock);
+  erase_qpn_lockless(qpn);
+}
+
+// Processes `n` tx work completions from `cqe`.
+//
+// For every completion this decrements the owning queue pair's outstanding tx
+// work-request count (when the QP can still be found), updates the error
+// counters, and for errors other than retry-exceeded and flush faults the
+// connection if it is still connected.  Completions whose wr_id is a tx
+// buffer chunk are collected and returned in one batch via post_tx_buffer().
+void RDMADispatcher::handle_tx_event(ibv_wc *cqe, int n)
+{
+  std::vector<Chunk*> tx_chunks;
+
+  for (int i = 0; i < n; ++i) {
+    ibv_wc* response = &cqe[i];
+    Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id);
+    ldout(cct, 25) << __func__ << " QP: " << response->qp_num
+                   << " len: " << response->byte_len << " , addr:" << chunk
+                   << " " << get_stack()->get_infiniband().wc_status_to_string(response->status) << dendl;
+
+    // get_qp() takes the dispatcher lock and also searches dead_queue_pairs.
+    QueuePair *qp = get_qp(response->qp_num);
+    if (qp)
+      qp->dec_tx_wr(1);
+
+    if (response->status != IBV_WC_SUCCESS) {
+      perf_logger->inc(l_msgr_rdma_tx_total_wc_errors);
+      if (response->status == IBV_WC_RETRY_EXC_ERR) {
+        ldout(cct, 1) << __func__ << " connection between server and client not working. Disconnect this now" << dendl;
+        perf_logger->inc(l_msgr_rdma_tx_wc_retry_errors);
+      } else if (response->status == IBV_WC_WR_FLUSH_ERR) {
+        ldout(cct, 1) << __func__ << " Work Request Flushed Error: this connection's qp="
+                      << response->qp_num << " should be down while this WR=" << response->wr_id
+                      << " still in flight." << dendl;
+        perf_logger->inc(l_msgr_rdma_tx_wc_wr_flush_errors);
+      } else {
+        ldout(cct, 1) << __func__ << " send work request returned error for buffer("
+                      << response->wr_id << ") status(" << response->status << "): "
+                      << get_stack()->get_infiniband().wc_status_to_string(response->status) << dendl;
+        Mutex::Locker l(lock);//make sure connected socket alive when pass wc
+        RDMAConnectedSocketImpl *conn = get_conn_lockless(response->qp_num);
+
+        if (conn && conn->is_connected()) {
+          ldout(cct, 25) << __func__ << " qp state is : " << conn->get_qp_state() << dendl;
+          conn->fault();
+        } else {
+          ldout(cct, 1) << __func__ << " missing qp_num=" << response->qp_num << " discard event" << dendl;
+        }
+      }
+    }
+
+    //TX completion may come either from regular send message or from 'fin' message.
+    //In the case of 'fin' wr_id points to the QueuePair.
+    // NOTE(review): chunk->buffer is read before it is known whether wr_id is
+    // a Chunk or a QueuePair; in the 'fin' case this reads QueuePair memory
+    // through a Chunk pointer -- confirm this is intended.
+    if (get_stack()->get_infiniband().get_memory_manager()->is_tx_buffer(chunk->buffer)) {
+      tx_chunks.push_back(chunk);
+    } else if (reinterpret_cast<QueuePair*>(response->wr_id)->get_local_qp_number() == response->qp_num ) {
+      ldout(cct, 1) << __func__ << " sending of the disconnect msg completed" << dendl;
+    } else {
+      ldout(cct, 1) << __func__ << " not tx buffer, chunk " << chunk << dendl;
+      ceph_abort();
+    }
+  }
+
+  perf_logger->inc(l_msgr_rdma_tx_total_wc, n);
+  post_tx_buffer(tx_chunks);
+}
+
+/**
+ * Return completed tx chunks to the memory manager.
+ *
+ * Subtracts the number of chunks from the `inflight` counter, hands them back
+ * through MemoryManager::return_tx() and then notifies a pending worker, if
+ * any.  An empty vector is a no-op.
+ *
+ * \param[in] chunks
+ *      The Chunks to give back.
+ */
+void RDMADispatcher::post_tx_buffer(std::vector<Chunk*> &chunks)
+{
+  if (chunks.empty())
+    return ;
+
+  inflight -= chunks.size();
+  get_stack()->get_infiniband().get_memory_manager()->return_tx(chunks);
+  ldout(cct, 30) << __func__ << " release " << chunks.size()
+                 << " chunks, inflight " << inflight << dendl;
+  // Buffers are available again, so a worker waiting for them can retry.
+  notify_pending_workers();
+}
+
+
+/**
+ * Build worker number \a i and register its per-worker perf counters under
+ * the name "AsyncMessenger::RDMAWorker-<id>".  The stack pointer is left null
+ * until set_stack() is called.
+ */
+RDMAWorker::RDMAWorker(CephContext *c, unsigned i)
+  : Worker(c, i), stack(nullptr),
+    tx_handler(new C_handle_cq_tx(this)), lock("RDMAWorker::lock")
+{
+  // initialize perf_logger
+  char name[128];
+  // Bounded formatting: the write can never run past the end of `name`.
+  snprintf(name, sizeof(name), "AsyncMessenger::RDMAWorker-%u", id);
+  PerfCountersBuilder plb(cct, name, l_msgr_rdma_first, l_msgr_rdma_last);
+
+  // Counter names are part of the exported interface and are kept verbatim
+  // (including the historical "parital" spelling); only descriptions change.
+  plb.add_u64_counter(l_msgr_rdma_tx_no_mem, "tx_no_mem", "The count of no tx buffer");
+  plb.add_u64_counter(l_msgr_rdma_tx_parital_mem, "tx_parital_mem", "The count of partial tx buffer");
+  plb.add_u64_counter(l_msgr_rdma_tx_failed, "tx_failed_post", "The number of tx failed posted");
+
+  plb.add_u64_counter(l_msgr_rdma_tx_chunks, "tx_chunks", "The number of tx chunks transmitted");
+  plb.add_u64_counter(l_msgr_rdma_tx_bytes, "tx_bytes", "The bytes of tx chunks transmitted", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_msgr_rdma_rx_chunks, "rx_chunks", "The number of rx chunks received");
+  plb.add_u64_counter(l_msgr_rdma_rx_bytes, "rx_bytes", "The bytes of rx chunks received", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_msgr_rdma_pending_sent_conns, "pending_sent_conns", "The count of pending sent conns");
+
+  perf_logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perf_logger);
+}
+
+// Frees the tx event callback allocated by the constructor.
+// NOTE(review): perf_logger is registered in the constructor but is not
+// removed or freed here -- confirm it is released elsewhere.
+RDMAWorker::~RDMAWorker()
+{
+  delete tx_handler;
+}
+
+// Caches a pointer to the stack's dispatcher on first use; later calls do
+// nothing.  Requires set_stack() to have been called.
+void RDMAWorker::initialize()
+{
+  if (dispatcher)
+    return;
+  dispatcher = &stack->get_dispatcher();
+}
+
+/**
+ * Create a listening RDMA server socket bound to \a sa.
+ *
+ * Initialises the Infiniband layer, makes sure the dispatcher thread runs and
+ * then builds either the iWARP or the plain server socket, depending on
+ * ms_async_rdma_type.
+ *
+ * \return 0 on success (\a sock is filled in), otherwise the negative result
+ *         of the implementation's listen() call
+ */
+int RDMAWorker::listen(entity_addr_t &sa, unsigned addr_slot,
+		       const SocketOptions &opt,ServerSocket *sock)
+{
+  get_stack()->get_infiniband().init();
+  dispatcher->polling_start();
+
+  Infiniband *ib = &get_stack()->get_infiniband();
+  RDMADispatcher *disp = &get_stack()->get_dispatcher();
+
+  std::unique_ptr<RDMAServerSocketImpl> impl;
+  if (cct->_conf->ms_async_rdma_type == "iwarp")
+    impl.reset(new RDMAIWARPServerSocketImpl(cct, ib, disp, this, sa, addr_slot));
+  else
+    impl.reset(new RDMAServerSocketImpl(cct, ib, disp, this, sa, addr_slot));
+
+  const int r = impl->listen(sa, opt);
+  if (r < 0)
+    return r;  // impl is destroyed on the way out
+
+  *sock = ServerSocket(std::unique_ptr<ServerSocketImpl>(impl.release()));
+  return 0;
+}
+
+/**
+ * Start an outgoing RDMA connection to \a addr.
+ *
+ * Initialises the Infiniband layer, makes sure the dispatcher thread runs,
+ * builds the iWARP or plain connected socket according to ms_async_rdma_type
+ * and calls try_connect() on it.
+ *
+ * \return 0 on success (\a socket is filled in), otherwise the negative
+ *         result of try_connect()
+ */
+int RDMAWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket)
+{
+  get_stack()->get_infiniband().init();
+  dispatcher->polling_start();
+
+  Infiniband *ib = &get_stack()->get_infiniband();
+  RDMADispatcher *disp = &get_stack()->get_dispatcher();
+
+  std::unique_ptr<RDMAConnectedSocketImpl> csi;
+  if (cct->_conf->ms_async_rdma_type == "iwarp")
+    csi.reset(new RDMAIWARPConnectedSocketImpl(cct, ib, disp, this));
+  else
+    csi.reset(new RDMAConnectedSocketImpl(cct, ib, disp, this));
+
+  const int r = csi->try_connect(addr, opts);
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " try connecting failed." << dendl;
+    return r;  // csi is destroyed on the way out
+  }
+
+  *socket = ConnectedSocket(std::move(csi));
+  return 0;
+}
+
+/**
+ * Reserve registered tx buffers for \a bytes bytes of payload.
+ *
+ * Must run on this worker's event-center thread.  The chunks obtained are
+ * appended to \a c and added to the dispatcher's `inflight` count.  If fewer
+ * bytes than requested could be reserved and \a o is non-null, the connection
+ * is queued as pending (once) and this worker is registered with the
+ * dispatcher so it is notified when buffers are returned.
+ *
+ * \return the number of chunks reserved
+ */
+int RDMAWorker::get_reged_mem(RDMAConnectedSocketImpl *o, std::vector<Chunk*> &c, size_t bytes)
+{
+  ceph_assert(center.in_thread());
+
+  Infiniband &ib = get_stack()->get_infiniband();
+  int r = ib.get_tx_buffers(c, bytes);
+  ceph_assert(r >= 0);
+  size_t got = ib.get_memory_manager()->get_tx_buffer_size() * r;
+  ldout(cct, 30) << __func__ << " need " << bytes << " bytes, reserve " << got << " registered  bytes, inflight " << dispatcher->inflight << dendl;
+  stack->get_dispatcher().inflight += r;
+
+  const bool short_of_buffers = got < bytes;
+  if (short_of_buffers && o) {
+    if (!o->is_pending()) {
+      pending_sent_conns.push_back(o);
+      perf_logger->inc(l_msgr_rdma_pending_sent_conns, 1);
+      o->set_pending(1);
+    }
+    dispatcher->make_pending_worker(this);
+  }
+  return r;
+}
+
+
+/**
+ * Retry sending for every connection queued in pending_sent_conns.
+ *
+ * Connections are taken from the front of the list and resubmitted.  When a
+ * submit returns -EAGAIN the connection is put back at the end of the list,
+ * the worker re-registers itself with the dispatcher and processing stops.
+ * Any other negative result faults the connection.  After the list has been
+ * drained the dispatcher is asked to notify the next pending worker.
+ */
+void RDMAWorker::handle_pending_message()
+{
+  ldout(cct, 20) << __func__ << " pending conns " << pending_sent_conns.size() << dendl;
+  while (!pending_sent_conns.empty()) {
+    RDMAConnectedSocketImpl *conn = pending_sent_conns.front();
+    pending_sent_conns.pop_front();
+
+    ssize_t r = conn->submit(false);
+    ldout(cct, 20) << __func__ << " sent pending bl socket=" << conn << " r=" << r << dendl;
+
+    if (r == -EAGAIN) {
+      // Still not sendable: keep the connection pending and wait to be notified.
+      pending_sent_conns.push_back(conn);
+      dispatcher->make_pending_worker(this);
+      return ;
+    }
+    if (r < 0)
+      conn->fault();
+
+    conn->set_pending(0);
+    perf_logger->dec(l_msgr_rdma_pending_sent_conns, 1);
+  }
+  dispatcher->notify_pending_workers();
+}
+
+// Builds the stack's Infiniband object and dispatcher, then points every
+// worker of the stack back at this stack via set_stack().
+RDMAStack::RDMAStack(CephContext *cct, const string &t)
+  : NetworkStack(cct, t), ib(cct), dispatcher(cct, this)
+{
+  ldout(cct, 20) << __func__ << " constructing RDMAStack..." << dendl;
+
+  const unsigned worker_count = get_num_worker();
+  for (unsigned idx = 0; idx != worker_count; ++idx) {
+    RDMAWorker* worker = dynamic_cast<RDMAWorker*>(get_worker(idx));
+    worker->set_stack(this);
+  }
+  ldout(cct, 20) << " creating RDMAStack:" << this << " with dispatcher:" << &dispatcher << dendl;
+}
+
+// Removes the RDMAV_HUGEPAGES_SAFE environment variable again when hugepage
+// support is enabled in the configuration.
+RDMAStack::~RDMAStack()
+{
+  const bool hugepage_enabled = cct->_conf->ms_async_rdma_enable_hugepage;
+  if (!hugepage_enabled)
+    return;
+
+  unsetenv("RDMAV_HUGEPAGES_SAFE");	//remove env variable on destruction
+}
+
+/**
+ * Start the thread for worker slot \a i running \a func.
+ *
+ * The thread table only ever grows: shrinking it would destroy std::thread
+ * objects of higher slots, and destroying a joinable std::thread terminates
+ * the process.  \a func is moved into the new thread instead of being copied.
+ */
+void RDMAStack::spawn_worker(unsigned i, std::function<void ()> &&func)
+{
+  if (threads.size() <= i)
+    threads.resize(i+1);
+  threads[i] = std::thread(std::move(func));
+}
+
+// Waits for the thread of worker slot `i` to finish.  The slot must exist and
+// hold a joinable thread.
+void RDMAStack::join_worker(unsigned i)
+{
+  ceph_assert(threads.size() > i && threads[i].joinable());
+  std::thread &worker_thread = threads[i];
+  worker_thread.join();
+}
diff --git a/src/msg/async/rdma/RDMAStack.h b/src/msg/async/rdma/RDMAStack.h
new file mode 100644
index 00000000..e4d34ee0
--- /dev/null
+++ b/src/msg/async/rdma/RDMAStack.h
@@ -0,0 +1,348 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_RDMASTACK_H
+#define CEPH_MSG_RDMASTACK_H
+
+#include <sys/eventfd.h>
+
+#include <list>
+#include <vector>
+#include <thread>
+
+#include "common/ceph_context.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "msg/async/Stack.h"
+#include "Infiniband.h"
+
+class RDMAConnectedSocketImpl;
+class RDMAServerSocketImpl;
+class RDMAStack;
+class RDMAWorker;
+
+/**
+ * Owns the tx/rx completion queues and channels of an RDMAStack and runs the
+ * thread that polls them.  Also keeps the QP-number -> connection table, the
+ * list of queue pairs awaiting destruction and the list of workers waiting for
+ * tx buffers.
+ */
+class RDMADispatcher {
+  typedef Infiniband::MemoryManager::Chunk Chunk;
+  typedef Infiniband::QueuePair QueuePair;
+
+  std::thread t;  // polling thread, started lazily by polling_start()
+  CephContext *cct;
+  Infiniband::CompletionQueue* tx_cq = nullptr;
+  Infiniband::CompletionQueue* rx_cq = nullptr;
+  Infiniband::CompletionChannel *tx_cc = nullptr, *rx_cc = nullptr;
+  EventCallbackRef async_handler;
+  // Shutdown request.  Written by polling_stop() and read by the polling
+  // thread without holding `lock`, hence atomic.
+  std::atomic<bool> done = {false};
+  std::atomic<uint64_t> num_dead_queue_pair = {0};
+  std::atomic<uint64_t> num_qp_conn = {0};
+  Mutex lock; // protect `qp_conns`, `dead_queue_pairs`
+  // qp_num -> InfRcConnection
+  // The main usage of `qp_conns` is looking up connection by qp_num,
+  // so the lifecycle of element in `qp_conns` is the lifecycle of qp.
+  /**
+   * Steps to retire a queue pair:
+   * 1. Connection call mark_down
+   * 2. Move the Queue Pair into the Error state(QueuePair::to_dead)
+   * 3. Wait for the affiliated event IBV_EVENT_QP_LAST_WQE_REACHED(handle_async_event)
+   * 4. Wait for CQ to be empty(handle_tx_event)
+   * 5. Destroy the QP by calling ibv_destroy_qp()(handle_tx_event)
+   */
+  ceph::unordered_map<uint32_t, std::pair<QueuePair*, RDMAConnectedSocketImpl*> > qp_conns;
+
+  /// if a queue pair is closed when transmit buffers are active
+  /// on it, the transmit buffers never get returned via tx_cq.  To
+  /// work around this problem, don't delete queue pairs immediately. Instead,
+  /// save them in this vector and delete them at a safe time, when there are
+  /// no outstanding transmit buffers to be lost.
+  std::vector<QueuePair*> dead_queue_pairs;
+
+  std::atomic<uint64_t> num_pending_workers = {0};
+  Mutex w_lock; // protect pending workers
+  // fixme: lockfree
+  std::list<RDMAWorker*> pending_workers;
+  RDMAStack* stack;
+
+  /// Event callback that forwards to handle_async_event().
+  class C_handle_cq_async : public EventCallback {
+    RDMADispatcher *dispatcher;
+   public:
+    explicit C_handle_cq_async(RDMADispatcher *w): dispatcher(w) {}
+    void do_request(uint64_t fd) {
+      // worker->handle_tx_event();
+      dispatcher->handle_async_event();
+    }
+  };
+
+ public:
+  PerfCounters *perf_logger;
+
+  explicit RDMADispatcher(CephContext* c, RDMAStack* s);
+  virtual ~RDMADispatcher();
+  void handle_async_event();
+
+  void polling_start();
+  void polling_stop();
+  void polling();
+  void register_qp(QueuePair *qp, RDMAConnectedSocketImpl* csi);
+  /// Queue worker `w` for a notification once tx buffers are returned; a
+  /// worker that is already queued is not added twice.
+  void make_pending_worker(RDMAWorker* w) {
+    Mutex::Locker l(w_lock);
+    auto it = std::find(pending_workers.begin(), pending_workers.end(), w);
+    if (it != pending_workers.end())
+      return;
+    pending_workers.push_back(w);
+    ++num_pending_workers;
+  }
+  RDMAStack* get_stack() { return stack; }
+  /// The *_lockless variants expect the caller to hold `lock`.
+  RDMAConnectedSocketImpl* get_conn_lockless(uint32_t qp);
+  QueuePair* get_qp(uint32_t qp);
+  void erase_qpn_lockless(uint32_t qpn);
+  void erase_qpn(uint32_t qpn);
+  Infiniband::CompletionQueue* get_tx_cq() const { return tx_cq; }
+  Infiniband::CompletionQueue* get_rx_cq() const { return rx_cq; }
+  void notify_pending_workers();
+  void handle_tx_event(ibv_wc *cqe, int n);
+  void post_tx_buffer(std::vector<Chunk*> &chunks);
+
+  /// Number of tx chunks handed out and not yet returned.
+  std::atomic<uint64_t> inflight = {0};
+
+  void post_chunk_to_pool(Chunk* chunk);
+  int post_chunks_to_rq(int num, ibv_qp *qp=NULL);
+};
+
+/**
+ * Worker of the RDMA network stack.  Creates listening and connecting RDMA
+ * sockets and keeps the list of connections whose sends are waiting for
+ * registered tx buffers.
+ */
+class RDMAWorker : public Worker {
+  using CompletionQueue = Infiniband::CompletionQueue;
+  using CompletionChannel = Infiniband::CompletionChannel;
+  using Chunk = Infiniband::MemoryManager::Chunk;
+  using MemoryManager = Infiniband::MemoryManager;
+  using ChunkIter = std::vector<Chunk*>::iterator;
+
+  RDMAStack *stack;
+  EventCallbackRef tx_handler;
+  // connections waiting to retry a send, see handle_pending_message()
+  std::list<RDMAConnectedSocketImpl*> pending_sent_conns;
+  RDMADispatcher* dispatcher = nullptr;
+  Mutex lock;
+
+  /// Event callback that forwards to handle_pending_message().
+  class C_handle_cq_tx : public EventCallback {
+    RDMAWorker *worker;
+
+   public:
+    explicit C_handle_cq_tx(RDMAWorker *w): worker(w) {}
+    void do_request(uint64_t fd) {
+      worker->handle_pending_message();
+    }
+  };
+
+ public:
+  PerfCounters *perf_logger;
+
+  explicit RDMAWorker(CephContext *c, unsigned i);
+  virtual ~RDMAWorker();
+
+  int listen(entity_addr_t &addr,
+	     unsigned addr_slot,
+	     const SocketOptions &opts, ServerSocket *) override;
+  int connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) override;
+  void initialize() override;
+
+  RDMAStack *get_stack() { return stack; }
+  void set_stack(RDMAStack *s) { stack = s; }
+
+  int get_reged_mem(RDMAConnectedSocketImpl *o, std::vector<Chunk*> &c, size_t bytes);
+  /// Drop `o` from the pending list; must run on this worker's thread.
+  void remove_pending_conn(RDMAConnectedSocketImpl *o) {
+    ceph_assert(center.in_thread());
+    pending_sent_conns.remove(o);
+  }
+  void handle_pending_message();
+  /// Schedule handle_pending_message() on this worker's event center.
+  void notify_worker() {
+    center.dispatch_event_external(tx_handler);
+  }
+};
+
+// Plain bundle of the RDMA-CM handles and QP number passed to
+// RDMAIWARPConnectedSocketImpl's constructor.  Stores the pointers as given.
+struct RDMACMInfo {
+  RDMACMInfo(rdma_cm_id *cid, rdma_event_channel *cm_channel_, uint32_t qp_num_)
+    : cm_id(cid), cm_channel(cm_channel_), qp_num(qp_num_) {}
+  rdma_cm_id *cm_id;
+  rdma_event_channel *cm_channel;
+  uint32_t qp_num;
+};
+
+/**
+ * ConnectedSocketImpl for the RDMA stack.  Holds the queue pair, the handshake
+ * messages exchanged with the peer, the received chunks and the data still
+ * waiting to be sent.  fd() exposes notify_fd, socket_fd() the TCP descriptor.
+ */
+class RDMAConnectedSocketImpl : public ConnectedSocketImpl {
+ public:
+  using Chunk = Infiniband::MemoryManager::Chunk;
+  using CompletionChannel = Infiniband::CompletionChannel;
+  using CompletionQueue = Infiniband::CompletionQueue;
+
+ protected:
+  CephContext *cct;
+  Infiniband::QueuePair *qp;
+  IBSYNMsg peer_msg;
+  IBSYNMsg my_msg;
+  int connected;
+  int error;
+  Infiniband* infiniband;
+  RDMADispatcher* dispatcher;
+  RDMAWorker* worker;
+  std::vector<Chunk*> buffers;
+  int notify_fd = -1;
+  bufferlist pending_bl;
+
+  Mutex lock;
+  std::vector<ibv_wc> wc;
+  bool is_server;
+  EventCallbackRef read_handler;
+  EventCallbackRef established_handler;
+  int tcp_fd = -1;
+  bool active;// qp is active ?
+  bool pending;
+  int post_backlog = 0;
+
+  void notify();
+  ssize_t read_buffers(char* buf, size_t len);
+  int post_work_request(std::vector<Chunk*>&);
+
+ public:
+  RDMAConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s,
+                          RDMAWorker *w);
+  virtual ~RDMAConnectedSocketImpl();
+
+  // Work-completion hand-over between the dispatcher and this socket.
+  void pass_wc(std::vector<ibv_wc> &&v);
+  void get_wc(std::vector<ibv_wc> &w);
+
+  // ConnectedSocketImpl interface.
+  int is_connected() override { return connected; }
+  ssize_t read(char* buf, size_t len) override;
+  ssize_t zero_copy_read(bufferptr &data) override;
+  ssize_t send(bufferlist &bl, bool more) override;
+  void shutdown() override;
+  void close() override;
+  int fd() const override { return notify_fd; }
+  int socket_fd() const override { return tcp_fd; }
+
+  void fault();
+  const char* get_qp_state() { return Infiniband::qp_state_string(qp->get_state()); }
+  ssize_t submit(bool more);
+  int activate();
+  void fin();
+  void handle_connection();
+  int handle_connection_established(bool need_set_fault = true);
+  void cleanup();
+  void set_accept_fd(int sd);
+  virtual int try_connect(const entity_addr_t&, const SocketOptions &opt);
+
+  // "pending" marks a connection queued in its worker's pending_sent_conns.
+  bool is_pending() {return pending;}
+  void set_pending(bool val) {pending = val;}
+
+  void post_chunks_to_rq(int num);
+  void update_post_backlog();
+};
+
+// Status values stored in RDMAIWARPConnectedSocketImpl::status.
+// Numbering starts at 1 (IDLE); the remaining enumerators follow in order.
+enum RDMA_CM_STATUS {
+  IDLE = 1,
+  RDMA_ID_CREATED,
+  CHANNEL_FD_CREATED,
+  RESOURCE_ALLOCATED,
+  ADDR_RESOLVED,
+  ROUTE_RESOLVED,
+  CONNECTED,
+  DISCONNECTED,
+  ERROR
+};
+
+// iWARP flavour of RDMAConnectedSocketImpl; it additionally keeps an RDMA-CM
+// id and event channel, the local/remote QP numbers and a RDMA_CM_STATUS.
+// Overrides try_connect(), close() and shutdown() of the base class.
+class RDMAIWARPConnectedSocketImpl : public RDMAConnectedSocketImpl {
+  public:
+    RDMAIWARPConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s,
+                          RDMAWorker *w, RDMACMInfo *info = nullptr);
+    ~RDMAIWARPConnectedSocketImpl();
+    virtual int try_connect(const entity_addr_t&, const SocketOptions &opt) override;
+    virtual void close() override;
+    virtual void shutdown() override;
+    virtual void handle_cm_connection();
+    uint32_t get_local_qpn() const { return local_qpn; }
+    // NOTE(review): the base class declares a non-virtual `int activate()`;
+    // this declaration hides it rather than overriding it.
+    void activate();
+    int alloc_resource();
+    void close_notify();
+
+  private:
+    rdma_cm_id *cm_id;
+    rdma_event_channel *cm_channel;
+    uint32_t local_qpn;
+    uint32_t remote_qpn;
+    EventCallbackRef cm_con_handler;
+    // NOTE(review): shadows the protected base member of the same name.
+    bool is_server;
+    std::mutex close_mtx;
+    std::condition_variable close_condition;
+    bool closed;
+    RDMA_CM_STATUS status;
+
+
+  // Event callback that forwards to handle_cm_connection().
+  class C_handle_cm_connection : public EventCallback {
+    RDMAIWARPConnectedSocketImpl *csi;
+    public:
+      C_handle_cm_connection(RDMAIWARPConnectedSocketImpl *w): csi(w) {}
+      void do_request(uint64_t fd) {
+        csi->handle_cm_connection();
+      }
+  };
+};
+
+// ServerSocketImpl of the RDMA stack.  Connections are accepted on a regular
+// TCP socket (server_setup_socket), which is also what fd() and get_fd()
+// return; each accepted connection is wrapped in an RDMAConnectedSocketImpl.
+class RDMAServerSocketImpl : public ServerSocketImpl {
+  protected:
+    CephContext *cct;
+    NetHandler net;
+    int server_setup_socket;  // listening TCP descriptor
+    Infiniband* infiniband;
+    RDMADispatcher *dispatcher;
+    RDMAWorker *worker;
+    entity_addr_t sa;
+
+ public:
+  RDMAServerSocketImpl(CephContext *cct, Infiniband* i, RDMADispatcher *s,
+		       RDMAWorker *w, entity_addr_t& a, unsigned slot);
+
+  // Declared virtual here (not an override); the iWARP subclass overrides it.
+  virtual int listen(entity_addr_t &sa, const SocketOptions &opt);
+  virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+  virtual void abort_accept() override;
+  virtual int fd() const override { return server_setup_socket; }
+  int get_fd() { return server_setup_socket; }
+};
+
+// iWARP flavour of RDMAServerSocketImpl: overrides listen(), accept() and
+// abort_accept() and keeps its own RDMA-CM id and event channel.
+class RDMAIWARPServerSocketImpl : public RDMAServerSocketImpl {
+  public:
+    RDMAIWARPServerSocketImpl(
+      CephContext *cct, Infiniband *i, RDMADispatcher *s, RDMAWorker *w,
+      entity_addr_t& addr, unsigned addr_slot);
+    virtual int listen(entity_addr_t &sa, const SocketOptions &opt) override;
+    virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+    virtual void abort_accept() override;
+  private:
+    rdma_cm_id *cm_id;
+    rdma_event_channel *cm_channel;
+};
+
+/**
+ * NetworkStack implementation for RDMA.  Owns the Infiniband object, the
+ * dispatcher and the worker threads.  Zero-copy reads are not supported and a
+ * non-blocking connect does not need a writable event.
+ */
+class RDMAStack : public NetworkStack {
+  std::vector<std::thread> threads;  // one slot per worker, see spawn_worker()
+  PerfCounters *perf_counter;
+  Infiniband ib;
+  RDMADispatcher dispatcher;
+
+  std::atomic<bool> fork_finished = {false};
+
+ public:
+  explicit RDMAStack(CephContext *cct, const string &t);
+  virtual ~RDMAStack();
+
+  bool support_zero_copy_read() const override { return false; }
+  bool nonblock_connect_need_writable_event() const override { return false; }
+
+  void spawn_worker(unsigned i, std::function<void ()> &&func) override;
+  void join_worker(unsigned i) override;
+
+  RDMADispatcher &get_dispatcher() { return dispatcher; }
+  Infiniband &get_infiniband() { return ib; }
+
+  // ready() flips the flag that is_ready() reports.
+  bool is_ready() override { return fork_finished.load(); }
+  void ready() override { fork_finished = true; }
+};
+
+
+#endif
diff --git a/src/msg/msg_types.cc b/src/msg/msg_types.cc
new file mode 100644
index 00000000..76b9585b
--- /dev/null
+++ b/src/msg/msg_types.cc
@@ -0,0 +1,383 @@
+
+#include "msg_types.h"
+
+#include <arpa/inet.h>
+#include <stdlib.h>
+#include <string.h>
+#include <netdb.h>
+
+#include "common/Formatter.h"
+
+// Write this name to the formatter as a "type" string and a "num" value.
+void entity_name_t::dump(Formatter *f) const
+{
+  const char *const kind = type_str();
+  f->dump_string("type", kind);
+
+  // num() is signed; it is emitted through dump_unsigned unchanged.
+  const auto id = num();
+  f->dump_unsigned("num", id);
+}
+
+// Write this address to the formatter: its type name, the socket address
+// rendered through the stream operator, and the nonce.
+void entity_addr_t::dump(Formatter *f) const
+{
+  const auto kind = get_type_name(type);
+  f->dump_string("type", kind);
+
+  const sockaddr *const raw = get_sockaddr();
+  f->dump_stream("addr") << raw;
+
+  f->dump_unsigned("nonce", nonce);
+}
+
+// Write the name and the address as nested objects "name" and "addr".
+void entity_inst_t::dump(Formatter *f) const
+{
+  Formatter &fmt = *f;
+  fmt.dump_object("name", name);
+  fmt.dump_object("addr", addr);
+}
+
+// Append heap-allocated sample names to o: mon with the default number,
+// mon.1, osd.1 and client.1.  The pointers are handed over to the list.
+void entity_name_t::generate_test_instances(list<entity_name_t*>& o)
+{
+  const entity_name_t samples[] = {
+    entity_name_t::MON(),
+    entity_name_t::MON(1),
+    entity_name_t::OSD(1),
+    entity_name_t::CLIENT(1),
+  };
+  for (const entity_name_t &sample : samples) {
+    o.push_back(new entity_name_t(sample));
+  }
+}
+
+// Append three heap-allocated sample addresses to o: a default one, one
+// with nonce 1, and a legacy IPv4 address 127.0.1.2:2 with nonce 5.
+void entity_addr_t::generate_test_instances(list<entity_addr_t*>& o)
+{
+  o.push_back(new entity_addr_t());
+
+  // Type 0 (TYPE_NONE) with nonce 1 and a zeroed socket address.
+  o.push_back(new entity_addr_t(TYPE_NONE, 1));
+
+  entity_addr_t *legacy = new entity_addr_t(TYPE_LEGACY, 5);
+  legacy->set_family(AF_INET);
+  const int quads[] = { 127, 0, 1, 2 };
+  for (int pos = 0; pos < 4; ++pos) {
+    legacy->set_in4_quad(pos, quads[pos]);
+  }
+  legacy->set_port(2);
+  o.push_back(legacy);
+}
+
+// Append two heap-allocated samples to o: a default-constructed instance
+// and one built from a default name and a default address.
+void entity_inst_t::generate_test_instances(list<entity_inst_t*>& o)
+{
+  o.push_back(new entity_inst_t());
+
+  const entity_name_t blank_name;
+  const entity_addr_t blank_addr;
+  o.push_back(new entity_inst_t(blank_name, blank_addr));
+}
+
+/**
+ * Parse a textual entity address into this object.
+ *
+ * Accepted forms, as implemented below:
+ *   "-"                                        (a TYPE_NONE address)
+ *   [v1:|v2:|any:] ADDR [":" PORT] ["/" NONCE]
+ * where ADDR is a dotted IPv4 address or an IPv6 address, optionally
+ * wrapped in '[' and ']'.
+ *
+ * The object is reset to a default-constructed address before parsing, so
+ * its previous value is never retained; after a failed parse it may hold a
+ * partially parsed address.
+ *
+ * @param s             NUL-terminated input
+ * @param end           if non-null, receives a pointer just past the
+ *                      consumed text on success, or s on failure
+ * @param default_type  type used when the text carries no explicit prefix;
+ *                      0 selects TYPE_DEFAULT
+ * @return true if an address was parsed, false otherwise
+ */
+bool entity_addr_t::parse(const char *s, const char **end, int default_type)
+{
+  *this = entity_addr_t();
+
+  const char *start = s;
+  if (end) {
+    *end = s;
+  }
+
+  int newtype;
+  if (strncmp("v1:", s, 3) == 0) {
+    start += 3;
+    newtype = TYPE_LEGACY;
+  } else if (strncmp("v2:", s, 3) == 0) {
+    start += 3;
+    newtype = TYPE_MSGR2;
+  } else if (strncmp("any:", s, 4) == 0) {
+    start += 4;
+    newtype = TYPE_ANY;
+  } else if (*s == '-') {
+    // a bare '-' denotes "no address"; *this was reset above, so its type
+    // is already TYPE_NONE.
+    newtype = TYPE_NONE;
+    if (end) {
+      *end = s + 1;
+    }
+    return true;
+  } else {
+    newtype = default_type ? default_type : TYPE_DEFAULT;
+  }
+
+  bool brackets = false;
+  if (*start == '[') {
+    start++;
+    brackets = true;
+  }
+
+  // inet_pton() requires a null terminated input, so let's fill two
+  // buffers, one with ipv4 allowed characters, and one with ipv6, and
+  // then see which parses.
+  //
+  // Both copy loops stop one byte short of the buffer size so that the
+  // terminating NUL always lands inside the array, even for over-long
+  // input.
+  char buf4[39];
+  char *o = buf4;
+  const char *p = start;
+  while (o < buf4 + sizeof(buf4) - 1 &&
+         *p && ((*p == '.') ||
+                (*p >= '0' && *p <= '9'))) {
+    *o++ = *p++;
+  }
+  *o = 0;
+
+  char buf6[64];  // actually 39 + null is sufficient.
+  o = buf6;
+  p = start;
+  while (o < buf6 + sizeof(buf6) - 1 &&
+         *p && ((*p == ':') ||
+                (*p >= '0' && *p <= '9') ||
+                (*p >= 'a' && *p <= 'f') ||
+                (*p >= 'A' && *p <= 'F'))) {
+    *o++ = *p++;
+  }
+  *o = 0;
+
+  // ipv4 first, then ipv6.  inet_pton() returns 1 only on success; 0 means
+  // the text is not a valid address and -1 reports an error, so compare
+  // against 1 explicitly.
+  struct in_addr a4;
+  struct in6_addr a6;
+  if (inet_pton(AF_INET, buf4, &a4) == 1) {
+    u.sin.sin_addr.s_addr = a4.s_addr;
+    u.sa.sa_family = AF_INET;
+    p = start + strlen(buf4);
+  } else if (inet_pton(AF_INET6, buf6, &a6) == 1) {
+    u.sa.sa_family = AF_INET6;
+    memcpy(&u.sin6.sin6_addr, &a6, sizeof(a6));
+    p = start + strlen(buf6);
+  } else {
+    return false;
+  }
+
+  if (brackets) {
+    if (*p != ']')
+      return false;
+    p++;
+  }
+
+  if (*p == ':') {
+    // parse a port, too!
+    p++;
+    int port = atoi(p);
+    // reject values outside 0..MAX_PORT_NUMBER; a negative value would
+    // otherwise wrap when set_port() passes it to htons().
+    if (port < 0 || port > MAX_PORT_NUMBER) {
+      return false;
+    }
+    set_port(port);
+    while (*p && *p >= '0' && *p <= '9')
+      p++;
+  }
+
+  if (*p == '/') {
+    // parse nonce, too
+    // NOTE(review): the nonce is 32-bit unsigned but is read through
+    // atoi(); values above INT_MAX rely on platform behaviour.
+    p++;
+    int non = atoi(p);
+    set_nonce(non);
+    while (*p && *p >= '0' && *p <= '9')
+      p++;
+  }
+
+  if (end)
+    *end = p;
+
+  type = newtype;
+
+  return true;
+}
+
+// Print an address: "-" for TYPE_NONE; otherwise an optional "<type>:"
+// prefix (omitted for TYPE_ANY), the socket address, '/', and the nonce.
+ostream& operator<<(ostream& out, const entity_addr_t &addr)
+{
+  switch (addr.type) {
+  case entity_addr_t::TYPE_NONE:
+    out << "-";
+    return out;
+  case entity_addr_t::TYPE_ANY:
+    // no prefix for "any" addresses
+    break;
+  default:
+    out << entity_addr_t::get_type_name(addr.type) << ":";
+    break;
+  }
+  out << addr.get_sockaddr();
+  out << '/' << addr.nonce;
+  return out;
+}
+
+// Print a socket address numerically as "host:port", or "[host]:port" for
+// IPv6.  The getnameinfo() result is not checked; both text buffers start
+// out zero-filled.
+ostream& operator<<(ostream& out, const sockaddr_storage &ss)
+{
+  char host[NI_MAXHOST] = { 0 };
+  char service[NI_MAXSERV] = { 0 };
+
+  // Length handed to getnameinfo() depends on the address family.
+  size_t addr_len;
+  switch (ss.ss_family) {
+  case AF_INET:
+    addr_len = sizeof(struct sockaddr_in);
+    break;
+  case AF_INET6:
+    addr_len = sizeof(struct sockaddr_in6);
+    break;
+  default:
+    addr_len = sizeof(struct sockaddr_storage);
+    break;
+  }
+
+  getnameinfo(reinterpret_cast<const struct sockaddr *>(&ss), addr_len,
+              host, sizeof(host),
+              service, sizeof(service),
+              NI_NUMERICHOST | NI_NUMERICSERV);
+
+  if (ss.ss_family == AF_INET6) {
+    out << '[' << host << "]:";
+  } else {
+    out << host << ':';
+  }
+  return out << service;
+}
+
+// Print the socket address sa points to numerically as "host:port", or
+// "[host]:port" for IPv6.  sa must not be null.  The getnameinfo() result
+// is not checked; both text buffers start out zero-filled.
+ostream& operator<<(ostream& out, const sockaddr *sa)
+{
+  char host[NI_MAXHOST] = { 0 };
+  char service[NI_MAXSERV] = { 0 };
+
+  // Length handed to getnameinfo() depends on the address family.
+  size_t addr_len;
+  switch (sa->sa_family) {
+  case AF_INET:
+    addr_len = sizeof(struct sockaddr_in);
+    break;
+  case AF_INET6:
+    addr_len = sizeof(struct sockaddr_in6);
+    break;
+  default:
+    addr_len = sizeof(struct sockaddr_storage);
+    break;
+  }
+
+  getnameinfo(sa, addr_len,
+              host, sizeof(host),
+              service, sizeof(service),
+              NI_NUMERICHOST | NI_NUMERICSERV);
+
+  if (sa->sa_family == AF_INET6) {
+    out << '[' << host << "]:";
+  } else {
+    out << host << ':';
+  }
+  return out << service;
+}
+
+// entity_addrvec_t
+
+/**
+ * Parse either a single address or a bracketed, comma-separated list
+ * "[addr,addr,...]" into v.
+ *
+ * A leading '[' opens a list unless the text parses as a bracketed IPv6
+ * address.  When a list fails to parse, v is cleared and *end is reset to
+ * the start of the input.
+ *
+ * @param s    NUL-terminated input
+ * @param end  if non-null, receives the position after the consumed text
+ * @return true if at least one address was parsed
+ */
+bool entity_addrvec_t::parse(const char *s, const char **end)
+{
+  const char *const begin = s;
+  const char *scratch_end;
+  if (end) {
+    *end = s;
+  } else {
+    // callers that do not care still need somewhere for the cursor to go
+    end = &scratch_end;
+  }
+  v.clear();
+
+  bool in_list = false;
+  if (*s == '[') {
+    // weirdness: a '[' may also start a bracketed IPv6 address
+    entity_addr_t probe;
+    const char *ignored;
+    const bool bracketed_ipv6 = probe.parse(s, &ignored) && probe.is_ipv6();
+    if (!bracketed_ipv6) {
+      in_list = true;
+      ++s;
+    }
+  }
+
+  while (*s) {
+    entity_addr_t one;
+    if (!one.parse(s, end)) {
+      if (!in_list) {
+        break;
+      }
+      v.clear();
+      *end = begin;
+      return false;
+    }
+    v.push_back(one);
+    s = *end;
+    // only a list continues, and only across a ',' separator
+    if (!in_list || *s != ',') {
+      break;
+    }
+    ++s;
+  }
+
+  if (in_list) {
+    if (*s != ']') {
+      *end = begin;
+      v.clear();
+      return false;
+    }
+    ++s;
+    *end = s;
+  }
+  return !v.empty();
+}
+
+// Encode the vector.  Peers without CEPH_FEATURE_MSG_ADDR2 get only the
+// legacy address, encoded with no features; everyone else gets a marker
+// byte of 2 followed by the full vector.
+void entity_addrvec_t::encode(bufferlist& bl, uint64_t features) const
+{
+  using ceph::encode;
+  const bool peer_has_addr2 = (features & CEPH_FEATURE_MSG_ADDR2) != 0;
+  if (peer_has_addr2) {
+    encode((__u8)2, bl);
+    encode(v, bl, features);
+  } else {
+    // encode a single legacy entity_addr_t for unfeatured peers
+    encode(legacy_addr(), bl, 0);
+  }
+}
+
+/**
+ * Decode an address vector, dispatching on the leading marker byte:
+ *   0  - a legacy-encoded single address
+ *   1  - a single address in the versioned (type, nonce, sockaddr) form
+ *   2  - a full vector of addresses
+ * Any larger marker throws buffer::malformed_input.  For markers 0 and 1
+ * the vector is replaced by the one decoded address.
+ */
+void entity_addrvec_t::decode(bufferlist::const_iterator& bl)
+{
+  using ceph::decode;
+  __u8 marker;
+  decode(marker, bl);
+  if (marker == 0) {
+    // legacy!
+    entity_addr_t addr;
+    addr.decode_legacy_addr_after_marker(bl);
+    v.clear();
+    v.push_back(addr);
+    return;
+  }
+  if (marker == 1) {
+    // single versioned address: type, nonce, then a length-prefixed
+    // sockaddr made of a 16-bit family followed by the remaining bytes
+    entity_addr_t addr;
+    DECODE_START(1, bl);
+    decode(addr.type, bl);
+    decode(addr.nonce, bl);
+    __u32 elen;
+    decode(elen, bl);
+    if (elen) {
+      // get_sockaddr() returns a const pointer; the cast lets us fill it in
+      struct sockaddr *sa = (struct sockaddr *)addr.get_sockaddr();
+#if defined(__FreeBSD__) || defined(__APPLE__)
+      sa->sa_len = 0;
+#endif
+      uint16_t ss_family;
+      if (elen < sizeof(ss_family)) {
+        throw ceph::buffer::malformed_input("elen smaller than family len");
+      }
+      decode(ss_family, bl);
+      sa->sa_family = ss_family;
+      elen -= sizeof(ss_family);
+      // the family is set first so get_sockaddr_len() reflects it; refuse
+      // payloads larger than the space left after the family field
+      if (elen > addr.get_sockaddr_len() - sizeof(sa->sa_family)) {
+        throw ceph::buffer::malformed_input("elen exceeds sockaddr len");
+      }
+      bl.copy(elen, sa->sa_data);
+    }
+    DECODE_FINISH(bl);
+    v.clear();
+    v.push_back(addr);
+    return;
+  }
+  if (marker > 2)
+    throw buffer::malformed_input("entity_addrvec_marker > 2");
+  // marker == 2: the whole vector follows
+  decode(v, bl);
+}
+
+// Write every address in v as an "addr" object inside an "addrvec" array.
+void entity_addrvec_t::dump(Formatter *f) const
+{
+  f->open_array_section("addrvec");
+  for (const entity_addr_t &addr : v) {
+    f->dump_object("addr", addr);
+  }
+  f->close_section();
+}
+
+// Append three heap-allocated sample vectors to ls, holding zero, one and
+// two default-constructed addresses respectively.
+void entity_addrvec_t::generate_test_instances(list<entity_addrvec_t*>& ls)
+{
+  for (int count = 0; count < 3; ++count) {
+    entity_addrvec_t *sample = new entity_addrvec_t();
+    for (int n = 0; n < count; ++n) {
+      sample->v.push_back(entity_addr_t());
+    }
+    ls.push_back(sample);
+  }
+}
+
+// Render only the IP part of the address (no port, no nonce) as text.
+// Returns an empty string for families other than AF_INET/AF_INET6 or
+// when inet_ntop() fails.
+std::string entity_addr_t::ip_only_to_str() const
+{
+  char text[INET6_ADDRSTRLEN];
+  const char *rendered = NULL;
+
+  const int family = get_family();
+  if (family == AF_INET) {
+    rendered = inet_ntop(AF_INET, &in4_addr().sin_addr,
+                         text, INET_ADDRSTRLEN);
+  } else if (family == AF_INET6) {
+    rendered = inet_ntop(AF_INET6, &in6_addr().sin6_addr,
+                         text, INET6_ADDRSTRLEN);
+  }
+
+  if (!rendered) {
+    return "";
+  }
+  return rendered;
+}
diff --git a/src/msg/msg_types.h b/src/msg/msg_types.h
new file mode 100644
index 00000000..74d5ee30
--- /dev/null
+++ b/src/msg/msg_types.h
@@ -0,0 +1,803 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_TYPES_H
+#define CEPH_MSG_TYPES_H
+
+#include <sstream>
+
+#include <netinet/in.h>
+
+#include "include/ceph_features.h"
+#include "include/types.h"
+#include "include/blobhash.h"
+#include "include/encoding.h"
+
+#define MAX_PORT_NUMBER 65535
+
+namespace ceph {
+  class Formatter;
+}
+
+extern ostream& operator<<(ostream& out, const sockaddr_storage &ss);
+extern ostream& operator<<(ostream& out, const sockaddr *sa);
+
+typedef uint8_t entity_type_t;
+
+/**
+ * Name of a cluster entity: a type (mon, mds, osd, client or mgr) paired
+ * with a signed 64-bit number.  A negative number marks the name as "new"
+ * (see is_new()); NEW (-1) is the default number used by the static
+ * constructors.
+ */
+class entity_name_t {
+public:
+  entity_type_t _type;
+  int64_t _num;
+
+public:
+  static const int TYPE_MON = CEPH_ENTITY_TYPE_MON;
+  static const int TYPE_MDS = CEPH_ENTITY_TYPE_MDS;
+  static const int TYPE_OSD = CEPH_ENTITY_TYPE_OSD;
+  static const int TYPE_CLIENT = CEPH_ENTITY_TYPE_CLIENT;
+  static const int TYPE_MGR = CEPH_ENTITY_TYPE_MGR;
+
+  static const int64_t NEW = -1;
+
+  // cons
+  entity_name_t() : _type(0), _num(0) { }
+  entity_name_t(int t, int64_t n) : _type(t), _num(n) { }
+  explicit entity_name_t(const ceph_entity_name &n) :
+    _type(n.type), _num(n.num) { }
+
+  // static cons
+  static entity_name_t MON(int64_t i=NEW) { return entity_name_t(TYPE_MON, i); }
+  static entity_name_t MDS(int64_t i=NEW) { return entity_name_t(TYPE_MDS, i); }
+  static entity_name_t OSD(int64_t i=NEW) { return entity_name_t(TYPE_OSD, i); }
+  static entity_name_t CLIENT(int64_t i=NEW) { return entity_name_t(TYPE_CLIENT, i); }
+  static entity_name_t MGR(int64_t i=NEW) { return entity_name_t(TYPE_MGR, i); }
+
+  int64_t num() const { return _num; }
+  int type() const { return _type; }
+  const char *type_str() const {
+    return ceph_entity_type_name(type());
+  }
+
+  /// true when the number is negative
+  bool is_new() const { return num() < 0; }
+
+  bool is_client() const { return type() == TYPE_CLIENT; }
+  bool is_mds() const { return type() == TYPE_MDS; }
+  bool is_osd() const { return type() == TYPE_OSD; }
+  bool is_mon() const { return type() == TYPE_MON; }
+  bool is_mgr() const { return type() == TYPE_MGR; }
+
+  /// convert to the C wire struct (number stored via init_le64)
+  operator ceph_entity_name() const {
+    ceph_entity_name n = { _type, init_le64(_num) };
+    return n;
+  }
+
+  /**
+   * Parse "<type>.<num>" and require that the whole string is consumed.
+   * @return true only when s is entirely a valid name
+   */
+  bool parse(const string& s) {
+    const char *start = s.c_str();
+    char *end;
+    bool got = parse(start, &end);
+    return got && end == start + s.length();
+  }
+
+  /**
+   * Parse a name of the form "mon.N", "osd.N", "mds.N", "client.N" or
+   * "mgr.N" from the beginning of start.
+   *
+   * @param start  NUL-terminated input
+   * @param end    receives the position just past the parsed number; only
+   *               written once a known type prefix has been recognised
+   * @return false if the prefix is unknown, if whitespace follows the
+   *         prefix, or if no digits could be converted
+   *
+   * _type (and possibly _num) may already have been modified when false
+   * is returned.
+   */
+  bool parse(const char *start, char **end) {
+    // strncmp() checks just the prefix; it does not scan the rest of the
+    // string the way strstr() would when the prefix is absent.
+    if (strncmp(start, "mon.", 4) == 0) {
+      _type = TYPE_MON;
+      start += 4;
+    } else if (strncmp(start, "osd.", 4) == 0) {
+      _type = TYPE_OSD;
+      start += 4;
+    } else if (strncmp(start, "mds.", 4) == 0) {
+      _type = TYPE_MDS;
+      start += 4;
+    } else if (strncmp(start, "client.", 7) == 0) {
+      _type = TYPE_CLIENT;
+      start += 7;
+    } else if (strncmp(start, "mgr.", 4) == 0) {
+      _type = TYPE_MGR;
+      start += 4;
+    } else {
+      return false;
+    }
+    // strtoll() would silently skip leading whitespace, so reject it here.
+    // The cast keeps isspace() defined for chars with the high bit set.
+    if (isspace(static_cast<unsigned char>(*start)))
+      return false;
+    _num = strtoll(start, end, 10);
+    if (*end == NULL || *end == start)
+      return false;
+    return true;
+  }
+
+  DENC(entity_name_t, v, p) {
+    denc(v._type, p);
+    denc(v._num, p);
+  }
+  void dump(Formatter *f) const;
+
+  static void generate_test_instances(list<entity_name_t*>& o);
+};
+WRITE_CLASS_DENC(entity_name_t)
+
+// Names compare by type first, then by number.
+inline bool operator== (const entity_name_t& l, const entity_name_t& r) {
+  if (l.type() != r.type())
+    return false;
+  return l.num() == r.num();
+}
+inline bool operator!= (const entity_name_t& l, const entity_name_t& r) {
+  if (l.type() != r.type())
+    return true;
+  return l.num() != r.num();
+}
+inline bool operator< (const entity_name_t& l, const entity_name_t& r) {
+  if (l.type() != r.type())
+    return l.type() < r.type();
+  return l.num() < r.num();
+}
+
+// Print "<type>.<num>", or "<type>.?" when the number is negative.
+inline std::ostream& operator<<(std::ostream& out, const entity_name_t& addr) {
+  // is_new() is exactly "num() < 0", so one test covers both cases
+  if (addr.is_new()) {
+    out << addr.type_str() << ".?";
+  } else {
+    out << addr.type_str() << '.' << addr.num();
+  }
+  return out;
+}
+// Print a raw ceph_entity_name by converting it to entity_name_t first.
+inline std::ostream& operator<<(std::ostream& out, const ceph_entity_name& addr) {
+  const entity_name_t converted{addr.type, static_cast<int64_t>(addr.num)};
+  return out << converted;
+}
+
+namespace std {
+  // Hash of an entity name: rjhash32 applied to (type XOR number).
+  template<> struct hash< entity_name_t >
+  {
+    size_t operator()( const entity_name_t &m ) const
+    {
+      const auto mixed = m.type() ^ m.num();
+      return rjhash32(mixed);
+    }
+  };
+} // namespace std
+
+// define a wire format for sockaddr that matches Linux's.
+// Packed 128-byte sockaddr image: a 16-bit family followed by padding.
+// The family goes through htons()/ntohs() on encode/decode.
+struct ceph_sockaddr_storage {
+  ceph_le16 ss_family;
+  __u8 __ss_padding[128 - sizeof(ceph_le16)];
+
+  // Encode a copy with the family converted by htons(); *this is untouched.
+  void encode(bufferlist& bl) const {
+    struct ceph_sockaddr_storage wire(*this);
+    wire.ss_family = htons(wire.ss_family);
+    ::encode_raw(wire, bl);
+  }
+
+  // Decode into a temporary, convert the family by ntohs(), then assign,
+  // so *this only changes after the raw decode has completed.
+  void decode(bufferlist::const_iterator& bl) {
+    struct ceph_sockaddr_storage wire;
+    ::decode_raw(wire, bl);
+    wire.ss_family = ntohs(wire.ss_family);
+    *this = wire;
+  }
+} __attribute__ ((__packed__));
+WRITE_CLASS_ENCODER(ceph_sockaddr_storage)
+
+/*
+ * Encode a sockaddr_storage using the Linux layout as the wire format.
+ *
+ *  - Linux: the structure is encoded raw, with ss_family passed through
+ *    htons() first.
+ *  - FreeBSD / macOS: the native structure starts with ss_len, so the
+ *    family and the remaining bytes are repacked into a
+ *    ceph_sockaddr_storage, which is then encoded.
+ *  - anything else: the leading bytes of the native structure are copied
+ *    into a zeroed ceph_sockaddr_storage, which is then encoded.
+ */
+static inline void encode(const sockaddr_storage& a, bufferlist& bl) {
+#if defined(__linux__)
+  struct sockaddr_storage ss = a;
+  ss.ss_family = htons(ss.ss_family);
+  ::encode_raw(ss, bl);
+#elif defined(__FreeBSD__) || defined(__APPLE__)
+  ceph_sockaddr_storage ss{};
+  auto src = (unsigned char const *)&a;
+  auto dst = (unsigned char *)&ss;
+  // skip the BSD-only length byte; it is not part of the wire format
+  src += sizeof(a.ss_len);
+  ss.ss_family = a.ss_family;
+  src += sizeof(a.ss_family);
+  dst += sizeof(ss.ss_family);
+  // copy whatever fits in both the remaining source and destination
+  const auto copy_size = std::min((unsigned char*)(&a + 1) - src,
+                                  (unsigned char*)(&ss + 1) - dst);
+  ::memcpy(dst, src, copy_size);
+  encode(ss, bl);
+#else
+  ceph_sockaddr_storage ss{};
+  ::memset(&ss, '\0', sizeof(ss));
+  // copy from the caller's address into the wire structure (the previous
+  // code named an undeclared variable and copied in the wrong direction)
+  ::memcpy(&ss, &a, std::min(sizeof(ss), sizeof(a)));
+  encode(ss, bl);
+#endif
+}
+/*
+ * Decode a sockaddr_storage; the per-platform branches mirror those of
+ * the matching encode():
+ *  - Linux: raw decode, then ss_family is passed through ntohs().
+ *  - FreeBSD / macOS: decode a ceph_sockaddr_storage and repack it into
+ *    the native layout, setting ss_len to 0.
+ *  - anything else: decode a ceph_sockaddr_storage and copy its leading
+ *    bytes into the native structure.
+ */
+static inline void decode(sockaddr_storage& a, bufferlist::const_iterator& bl) {
+#if defined(__linux__)
+  ::decode_raw(a, bl);
+  a.ss_family = ntohs(a.ss_family);
+#elif defined(__FreeBSD__) || defined(__APPLE__)
+  ceph_sockaddr_storage ss{};
+  decode(ss, bl);
+  auto src = (unsigned char const *)&ss;
+  auto dst = (unsigned char *)&a;
+  // the wire format carries no length byte; store 0 and step over it
+  a.ss_len = 0;
+  dst += sizeof(a.ss_len);
+  a.ss_family = ss.ss_family;
+  src += sizeof(ss.ss_family);
+  dst += sizeof(a.ss_family);
+  // copy whatever fits in both the remaining source and destination
+  auto const copy_size = std::min((unsigned char*)(&ss + 1) - src,
+				  (unsigned char*)(&a + 1) - dst);
+  ::memcpy(dst, src, copy_size);
+#else
+  ceph_sockaddr_storage ss{};
+  decode(ss, bl);
+  ::memcpy(&a, &ss, std::min(sizeof(ss), sizeof(a)));
+#endif
+}
+
+/*
+ * an entity's network address.
+ * includes a random value that prevents it from being reused.
+ * thus identifies a particular process instance.
+ * holds either an IPv4 or an IPv6 socket address.
+ */
+struct entity_addr_t {
+  typedef enum {
+    TYPE_NONE = 0,
+    TYPE_LEGACY = 1,  ///< legacy msgr1 protocol (ceph jewel and older)
+    TYPE_MSGR2 = 2,   ///< msgr2 protocol (new in ceph kraken)
+    TYPE_ANY = 3,  ///< ambiguous
+  } type_t;
+  static const type_t TYPE_DEFAULT = TYPE_MSGR2;
+
+  /// textual name of an address type; "???" for unknown values
+  static std::string_view get_type_name(int t) {
+    switch (t) {
+    case TYPE_NONE: return "none";
+    case TYPE_LEGACY: return "v1";
+    case TYPE_MSGR2: return "v2";
+    case TYPE_ANY: return "any";
+    default: return "???";
+    }
+  }
+
+  __u32 type;
+  __u32 nonce;
+  union {
+    sockaddr sa;
+    sockaddr_in sin;
+    sockaddr_in6 sin6;
+  } u;
+
+  /// type 0 (TYPE_NONE), nonce 0 and a zero-filled socket address
+  entity_addr_t() : type(0), nonce(0) {
+    memset(&u, 0, sizeof(u));
+  }
+  entity_addr_t(__u32 _type, __u32 _nonce) : type(_type), nonce(_nonce) {
+    memset(&u, 0, sizeof(u));
+  }
+  /// build from the C wire struct; except on FreeBSD the stored family is
+  /// passed through ntohs()
+  explicit entity_addr_t(const ceph_entity_addr &o) {
+    type = o.type;
+    nonce = o.nonce;
+    memcpy(&u, &o.in_addr, sizeof(u));
+#if !defined(__FreeBSD__)
+    u.sa.sa_family = ntohs(u.sa.sa_family);
+#endif
+  }
+
+  uint32_t get_type() const { return type; }
+  void set_type(uint32_t t) { type = t; }
+  bool is_legacy() const { return type == TYPE_LEGACY; }
+  bool is_msgr2() const { return type == TYPE_MSGR2; }
+  bool is_any() const { return type == TYPE_ANY; }
+
+  __u32 get_nonce() const { return nonce; }
+  void set_nonce(__u32 n) { nonce = n; }
+
+  int get_family() const {
+    return u.sa.sa_family;
+  }
+  void set_family(int f) {
+    u.sa.sa_family = f;
+  }
+
+  bool is_ipv4() const {
+    return u.sa.sa_family == AF_INET;
+  }
+  bool is_ipv6() const {
+    return u.sa.sa_family == AF_INET6;
+  }
+
+  sockaddr_in &in4_addr() {
+    return u.sin;
+  }
+  const sockaddr_in &in4_addr() const{
+    return u.sin;
+  }
+  sockaddr_in6 &in6_addr(){
+    return u.sin6;
+  }
+  const sockaddr_in6 &in6_addr() const{
+    return u.sin6;
+  }
+  const sockaddr *get_sockaddr() const {
+    return &u.sa;
+  }
+
+  /// size of the stored address for its family; the size of the whole
+  /// union when the family is neither AF_INET nor AF_INET6
+  size_t get_sockaddr_len() const {
+    switch (u.sa.sa_family) {
+    case AF_INET:
+      return sizeof(u.sin);
+    case AF_INET6:
+      return sizeof(u.sin6);
+    }
+    return sizeof(u);
+  }
+
+  /**
+   * Replace the stored socket address with *sa.
+   * @return false (leaving the stored address untouched) when the family
+   *         is not AF_INET, AF_INET6 or AF_UNSPEC
+   */
+  bool set_sockaddr(const struct sockaddr *sa)
+  {
+    switch (sa->sa_family) {
+    case AF_INET:
+      // pre-zero, since we're only copying a portion of the source
+      memset(&u, 0, sizeof(u));
+      memcpy(&u.sin, sa, sizeof(u.sin));
+      break;
+    case AF_INET6:
+      // pre-zero, since we're only copying a portion of the source
+      memset(&u, 0, sizeof(u));
+      memcpy(&u.sin6, sa, sizeof(u.sin6));
+      break;
+    case AF_UNSPEC:
+      memset(&u, 0, sizeof(u));
+      break;
+    default:
+      return false;
+    }
+    return true;
+  }
+
+  /// copy of the stored address widened to sockaddr_storage, zero padded
+  sockaddr_storage get_sockaddr_storage() const {
+    sockaddr_storage ss;
+    memcpy(&ss, &u, sizeof(u));
+    memset((char*)&ss + sizeof(u), 0, sizeof(ss) - sizeof(u));
+    return ss;
+  }
+
+  /// set byte pos of the IPv4 address to val and force the family to
+  /// AF_INET; pos is not range-checked here
+  void set_in4_quad(int pos, int val) {
+    u.sin.sin_family = AF_INET;
+    unsigned char *ipq = (unsigned char*)&u.sin.sin_addr.s_addr;
+    ipq[pos] = val;
+  }
+
+  /// store the port in network byte order; aborts when the family is
+  /// neither AF_INET nor AF_INET6
+  void set_port(int port) {
+    switch (u.sa.sa_family) {
+    case AF_INET:
+      u.sin.sin_port = htons(port);
+      break;
+    case AF_INET6:
+      u.sin6.sin6_port = htons(port);
+      break;
+    default:
+      ceph_abort();
+    }
+  }
+
+  /// port in host byte order, or 0 for non-IP families
+  int get_port() const {
+    switch (u.sa.sa_family) {
+    case AF_INET:
+      return ntohs(u.sin.sin_port);
+    case AF_INET6:
+      return ntohs(u.sin6.sin6_port);
+    }
+    return 0;
+  }
+
+  /// convert to the C wire struct; the type field is always written as 0
+  operator ceph_entity_addr() const {
+    ceph_entity_addr a;
+    a.type = 0;
+    a.nonce = nonce;
+    a.in_addr = get_sockaddr_storage();
+#if !defined(__FreeBSD__)
+    a.in_addr.ss_family = htons(a.in_addr.ss_family);
+#endif
+    return a;
+  }
+
+  /// loose comparison: port and nonce must match; a blank IP on either
+  /// side matches anything, otherwise the socket addresses must be
+  /// byte-identical
+  bool probably_equals(const entity_addr_t &o) const {
+    if (get_port() != o.get_port())
+      return false;
+    if (get_nonce() != o.get_nonce())
+      return false;
+    if (is_blank_ip() || o.is_blank_ip())
+      return true;
+    if (memcmp(&u, &o.u, sizeof(u)) == 0)
+      return true;
+    return false;
+  }
+
+  /// true when both addresses have the same IP family and IP address
+  bool is_same_host(const entity_addr_t &o) const {
+    if (u.sa.sa_family != o.u.sa.sa_family)
+      return false;
+    if (u.sa.sa_family == AF_INET)
+      return u.sin.sin_addr.s_addr == o.u.sin.sin_addr.s_addr;
+    if (u.sa.sa_family == AF_INET6)
+      return memcmp(u.sin6.sin6_addr.s6_addr,
+                    o.u.sin6.sin6_addr.s6_addr,
+                    sizeof(u.sin6.sin6_addr.s6_addr)) == 0;
+    return false;
+  }
+
+  /// true for the IPv4/IPv6 wildcard address and for any non-IP family
+  bool is_blank_ip() const {
+    switch (u.sa.sa_family) {
+    case AF_INET:
+      return u.sin.sin_addr.s_addr == INADDR_ANY;
+    case AF_INET6:
+      return memcmp(&u.sin6.sin6_addr, &in6addr_any, sizeof(in6addr_any)) == 0;
+    default:
+      return true;
+    }
+  }
+
+  bool is_ip() const {
+    switch (u.sa.sa_family) {
+    case AF_INET:
+    case AF_INET6:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  std::string ip_only_to_str() const;
+
+  /// "<sockaddr>/<nonce>" without any type prefix
+  std::string get_legacy_str() const {
+    ostringstream ss;
+    ss << get_sockaddr() << "/" << get_nonce();
+    return ss.str();
+  }
+
+  bool parse(const char *s, const char **end = 0, int type=0);
+
+  /// decode the remainder of a legacy-encoded address once the first
+  /// marker byte has been consumed by the caller
+  void decode_legacy_addr_after_marker(bufferlist::const_iterator& bl)
+  {
+    using ceph::decode;
+    __u8 marker;
+    __u16 rest;
+    decode(marker, bl);
+    decode(rest, bl);
+    decode(nonce, bl);
+    sockaddr_storage ss;
+    decode(ss, bl);
+    // NOTE(review): the result of set_sockaddr() is ignored, so an
+    // unsupported family leaves the previous socket address in place.
+    set_sockaddr((sockaddr*)&ss);
+    if (get_family() == AF_UNSPEC) {
+      type = TYPE_NONE;
+    } else {
+      type = TYPE_LEGACY;
+    }
+  }
+
+  // Right now, these only deal with sockaddr_storage that have only family and content.
+  // Apparently on BSD there is also an ss_len that we need to handle; this requires
+  // broader study
+
+  /**
+   * Encode this address.  Without CEPH_FEATURE_MSG_ADDR2 the legacy form
+   * is written (a zero 32-bit marker, the nonce, a sockaddr_storage).
+   * Otherwise a marker byte of 1 precedes a versioned section holding the
+   * type, the nonce and a length-prefixed sockaddr.
+   */
+  void encode(bufferlist& bl, uint64_t features) const {
+    using ceph::encode;
+    if ((features & CEPH_FEATURE_MSG_ADDR2) == 0) {
+      encode((__u32)0, bl);
+      encode(nonce, bl);
+      sockaddr_storage ss = get_sockaddr_storage();
+      encode(ss, bl);
+      return;
+    }
+    encode((__u8)1, bl);
+    ENCODE_START(1, 1, bl);
+    if (HAVE_FEATURE(features, SERVER_NAUTILUS)) {
+      encode(type, bl);
+    } else {
+      // map any -> legacy for old clients.  this is primary for the benefit
+      // of OSDMap's blacklist, but is reasonable in general since any: is
+      // meaningless for pre-nautilus clients or daemons.
+      auto t = type;
+      if (t == TYPE_ANY) {
+        t = TYPE_LEGACY;
+      }
+      encode(t, bl);
+    }
+    encode(nonce, bl);
+    __u32 elen = get_sockaddr_len();
+    // use defined() for both macros, matching every other platform check
+    // in this file
+#if defined(__FreeBSD__) || defined(__APPLE__)
+    elen -= sizeof(u.sa.sa_len);
+#endif
+    encode(elen, bl);
+    if (elen) {
+      uint16_t ss_family = u.sa.sa_family;
+
+      encode(ss_family, bl);
+      elen -= sizeof(u.sa.sa_family);
+      bl.append(u.sa.sa_data, elen);
+    }
+    ENCODE_FINISH(bl);
+  }
+
+  /**
+   * Decode an address written by encode().  A marker of 0 selects the
+   * legacy form, 1 the versioned form; any other marker throws
+   * buffer::malformed_input, as do inconsistent sockaddr lengths.
+   */
+  void decode(bufferlist::const_iterator& bl) {
+    using ceph::decode;
+    __u8 marker;
+    decode(marker, bl);
+    if (marker == 0) {
+      decode_legacy_addr_after_marker(bl);
+      return;
+    }
+    if (marker != 1)
+      throw buffer::malformed_input("entity_addr_t marker != 1");
+    DECODE_START(1, bl);
+    decode(type, bl);
+    decode(nonce, bl);
+    __u32 elen;
+    decode(elen, bl);
+    if (elen) {
+#if defined(__FreeBSD__) || defined(__APPLE__)
+      u.sa.sa_len = 0;
+#endif
+      uint16_t ss_family;
+      if (elen < sizeof(ss_family)) {
+        throw buffer::malformed_input("elen smaller than family len");
+      }
+      decode(ss_family, bl);
+      u.sa.sa_family = ss_family;
+      elen -= sizeof(ss_family);
+      // the family is set first so get_sockaddr_len() reflects it
+      if (elen > get_sockaddr_len() - sizeof(u.sa.sa_family)) {
+        throw buffer::malformed_input("elen exceeds sockaddr len");
+      }
+      bl.copy(elen, u.sa.sa_data);
+    }
+    DECODE_FINISH(bl);
+  }
+
+  void dump(Formatter *f) const;
+
+  static void generate_test_instances(list<entity_addr_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(entity_addr_t)
+
+ostream& operator<<(ostream& out, const entity_addr_t &addr);
+
+// Addresses are ordered and compared by the raw bytes of the whole struct.
+inline bool operator==(const entity_addr_t& a, const entity_addr_t& b) {
+  const int order = memcmp(&a, &b, sizeof(entity_addr_t));
+  return order == 0;
+}
+inline bool operator!=(const entity_addr_t& a, const entity_addr_t& b) {
+  const int order = memcmp(&a, &b, sizeof(entity_addr_t));
+  return order != 0;
+}
+inline bool operator<(const entity_addr_t& a, const entity_addr_t& b) {
+  const int order = memcmp(&a, &b, sizeof(entity_addr_t));
+  return order < 0;
+}
+inline bool operator<=(const entity_addr_t& a, const entity_addr_t& b) {
+  const int order = memcmp(&a, &b, sizeof(entity_addr_t));
+  return order <= 0;
+}
+inline bool operator>(const entity_addr_t& a, const entity_addr_t& b) {
+  const int order = memcmp(&a, &b, sizeof(entity_addr_t));
+  return order > 0;
+}
+inline bool operator>=(const entity_addr_t& a, const entity_addr_t& b) {
+  const int order = memcmp(&a, &b, sizeof(entity_addr_t));
+  return order >= 0;
+}
+
+namespace std {
+  // Hash of an address: blobhash over the raw bytes of the whole struct.
+  template<> struct hash< entity_addr_t >
+  {
+    size_t operator()( const entity_addr_t& x ) const
+    {
+      static blobhash hasher;
+      const char *const bytes = (const char*)&x;
+      return hasher(bytes, sizeof(x));
+    }
+  };
+} // namespace std
+
+/*
+ * an ordered list of addresses for one entity (for example a legacy and a
+ * msgr2 address).
+ */
+struct entity_addrvec_t {
+  vector<entity_addr_t> v;
+
+  entity_addrvec_t() {}
+  explicit entity_addrvec_t(const entity_addr_t& a) : v({ a }) {}
+
+  unsigned size() const { return v.size(); }
+  bool empty() const { return v.empty(); }
+
+  /// first TYPE_LEGACY address, or a default address when there is none
+  entity_addr_t legacy_addr() const {
+    for (auto& a : v) {
+      if (a.type == entity_addr_t::TYPE_LEGACY) {
+        return a;
+      }
+    }
+    return entity_addr_t();
+  }
+
+  /// first address that is legacy or "any" (the latter retyped as
+  /// legacy); failing that, the front address retyped as legacy
+  entity_addr_t as_legacy_addr() const {
+    for (auto& a : v) {
+      if (a.is_legacy()) {
+        return a;
+      }
+      if (a.is_any()) {
+        auto b = a;
+        b.set_type(entity_addr_t::TYPE_LEGACY);
+        return b;
+      }
+    }
+    // hrm... lie!
+    auto a = front();
+    a.set_type(entity_addr_t::TYPE_LEGACY);
+    return a;
+  }
+
+  /// first address, or a default address when the vector is empty
+  entity_addr_t front() const {
+    if (!v.empty()) {
+      return v.front();
+    }
+    return entity_addr_t();
+  }
+
+  /// first TYPE_LEGACY address, else the front address, else a default
+  /// address
+  entity_addr_t legacy_or_front_addr() const {
+    for (auto& a : v) {
+      if (a.type == entity_addr_t::TYPE_LEGACY) {
+        return a;
+      }
+    }
+    if (!v.empty()) {
+      return v.front();
+    }
+    return entity_addr_t();
+  }
+  string get_legacy_str() const {
+    return legacy_or_front_addr().get_legacy_str();
+  }
+
+  /// first TYPE_MSGR2 address, or a default address when there is none
+  entity_addr_t msgr2_addr() const {
+    for (auto &a : v) {
+      if (a.type == entity_addr_t::TYPE_MSGR2) {
+        return a;
+      }
+    }
+    return entity_addr_t();
+  }
+  bool has_msgr2() const {
+    for (auto& a : v) {
+      if (a.is_msgr2()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  bool parse(const char *s, const char **end = 0);
+
+  /// insert the port of every address into *ports
+  void get_ports(set<int> *ports) const {
+    for (auto& a : v) {
+      ports->insert(a.get_port());
+    }
+  }
+  set<int> get_ports() const {
+    set<int> r;
+    get_ports(&r);
+    return r;
+  }
+
+  void encode(bufferlist& bl, uint64_t features) const;
+  void decode(bufferlist::const_iterator& bl);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<entity_addrvec_t*>& ls);
+
+  /// equal vectors, or one side is a lone legacy address that matches the
+  /// other side's legacy address
+  bool legacy_equals(const entity_addrvec_t& o) const {
+    if (v == o.v) {
+      return true;
+    }
+    if (v.size() == 1 &&
+        front().is_legacy() &&
+        front() == o.legacy_addr()) {
+      return true;
+    }
+    if (o.v.size() == 1 &&
+        o.front().is_legacy() &&
+        o.front() == legacy_addr()) {
+      return true;
+    }
+    return false;
+  }
+
+  /// every address in this vector must probably_equals() the address at
+  /// the same position in o
+  bool probably_equals(const entity_addrvec_t& o) const {
+    // o must supply a counterpart for each of our addresses; without this
+    // check the loop below would index past the end of o.v
+    if (o.v.size() < v.size()) {
+      return false;
+    }
+    for (unsigned i = 0; i < v.size(); ++i) {
+      if (!v[i].probably_equals(o.v[i])) {
+        return false;
+      }
+    }
+    return true;
+  }
+  bool contains(const entity_addr_t& a) const {
+    for (auto& i : v) {
+      if (a == i) {
+        return true;
+      }
+    }
+    return false;
+  }
+  bool is_same_host(const entity_addr_t& a) const {
+    for (auto& i : v) {
+      if (i.is_same_host(a)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /// prints nothing for an empty vector, the bare address for a single
+  /// entry, and the vector itself otherwise
+  friend ostream& operator<<(ostream& out, const entity_addrvec_t& av) {
+    if (av.v.empty()) {
+      return out;
+    } else if (av.v.size() == 1) {
+      return out << av.v[0];
+    } else {
+      return out << av.v;
+    }
+  }
+
+  friend bool operator==(const entity_addrvec_t& l, const entity_addrvec_t& r) {
+    return l.v == r.v;
+  }
+  friend bool operator!=(const entity_addrvec_t& l, const entity_addrvec_t& r) {
+    return l.v != r.v;
+  }
+  friend bool operator<(const entity_addrvec_t& l, const entity_addrvec_t& r) {
+    return l.v < r.v;  // see lexicographical_compare()
+  }
+};
+WRITE_CLASS_ENCODER_FEATURES(entity_addrvec_t);
+
+namespace std {
+  // Hash of an address vector: the sum of blobhash over the raw bytes of
+  // each address, so the result does not depend on element order.
+  template<> struct hash< entity_addrvec_t >
+  {
+    size_t operator()( const entity_addrvec_t& x ) const
+    {
+      static blobhash hasher;
+      size_t total = 0;
+      for (auto it = x.v.begin(); it != x.v.end(); ++it) {
+        const auto& addr = *it;
+        total += hasher((const char*)&addr, sizeof(addr));
+      }
+      return total;
+    }
+  };
+} // namespace std
+
+/*
+ * a particular entity instance: an entity name together with the address
+ * it is reachable at.
+ */
+struct entity_inst_t {
+  entity_name_t name;
+  entity_addr_t addr;
+  entity_inst_t() {}
+  entity_inst_t(entity_name_t n, const entity_addr_t& a) : name(n), addr(a) {}
+  // cppcheck-suppress noExplicitConstructor
+  entity_inst_t(const ceph_entity_inst& i) : name(i.name), addr(i.addr) { }
+  entity_inst_t(const ceph_entity_name& n, const ceph_entity_addr &a) : name(n), addr(a) {}
+
+  /// convert to the C wire struct.  const-qualified, like the conversion
+  /// operators of entity_name_t and entity_addr_t that it relies on, so
+  /// it is usable on const instances as well.
+  operator ceph_entity_inst() const {
+    ceph_entity_inst i = {name, addr};
+    return i;
+  }
+
+  /// encode the name, then the address with the given feature bits
+  void encode(bufferlist& bl, uint64_t features) const {
+    using ceph::encode;
+    encode(name, bl);
+    encode(addr, bl, features);
+  }
+  /// decode the name, then the address
+  void decode(bufferlist::const_iterator& bl) {
+    using ceph::decode;
+    decode(name, bl);
+    decode(addr, bl);
+  }
+
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<entity_inst_t*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(entity_inst_t)
+
+
+// Instances compare by name first, then by address.
+inline bool operator==(const entity_inst_t& a, const entity_inst_t& b) {
+  if (!(a.name == b.name))
+    return false;
+  return a.addr == b.addr;
+}
+inline bool operator!=(const entity_inst_t& a, const entity_inst_t& b) {
+  if (a.name != b.name)
+    return true;
+  return a.addr != b.addr;
+}
+inline bool operator<(const entity_inst_t& a, const entity_inst_t& b) {
+  if (a.name < b.name)
+    return true;
+  if (!(a.name == b.name))
+    return false;
+  return a.addr < b.addr;
+}
+inline bool operator<=(const entity_inst_t& a, const entity_inst_t& b) {
+  if (a.name < b.name)
+    return true;
+  if (!(a.name == b.name))
+    return false;
+  return a.addr <= b.addr;
+}
+// The remaining two are the mirrored forms of the operators above.
+inline bool operator>(const entity_inst_t& a, const entity_inst_t& b) {
+  return b < a;
+}
+inline bool operator>=(const entity_inst_t& a, const entity_inst_t& b) {
+  return b <= a;
+}
+
+namespace std {
+  // Hash of an instance: the name hash XORed with the address hash.
+  template<> struct hash< entity_inst_t >
+  {
+    size_t operator()( const entity_inst_t& x ) const
+    {
+      static hash< entity_name_t > name_hash;
+      static hash< entity_addr_t > addr_hash;
+      const size_t from_name = name_hash(x.name);
+      const size_t from_addr = addr_hash(x.addr);
+      return from_name ^ from_addr;
+    }
+  };
+} // namespace std
+
+
+// Print "<name> <addr>".
+inline ostream& operator<<(ostream& out, const entity_inst_t &i)
+{
+  out << i.name;
+  out << " ";
+  out << i.addr;
+  return out;
+}
+// Print a raw ceph_entity_inst by converting it to entity_inst_t first.
+inline ostream& operator<<(ostream& out, const ceph_entity_inst &i)
+{
+  const entity_inst_t converted(i);
+  out << converted;
+  return out;
+}
+
+#endif
diff --git a/src/msg/simple/Accepter.cc b/src/msg/simple/Accepter.cc
new file mode 100644
index 00000000..52f3df2b
--- /dev/null
+++ b/src/msg/simple/Accepter.cc
@@ -0,0 +1,402 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include "include/compat.h"
+#include "include/sock_compat.h"
+#include <iterator>
+#include <sys/socket.h>
+#include <netinet/tcp.h>
+#include <sys/uio.h>
+#include <limits.h>
+#include <poll.h>
+
+#include "msg/msg_types.h"
+#include "msg/Message.h"
+
+#include "Accepter.h"
+#include "Pipe.h"
+#include "SimpleMessenger.h"
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/safe_io.h"
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "accepter."
+
+
+/********************************************
+ * Accepter
+ */
+
+/**
+ * Create a close-on-exec pipe and switch both ends to non-blocking mode.
+ *
+ * @param pipe_rd  receives the read end of the pipe
+ * @param pipe_wr  receives the write end of the pipe
+ * @return 0 on success, -errno if the pipe could not be created
+ */
+int Accepter::create_selfpipe(int *pipe_rd, int *pipe_wr) {
+  int fds[2];
+  if (pipe_cloexec(fds) < 0) {
+    const int err = errno;
+    lderr(msgr->cct) << __func__ << " unable to create the selfpipe: "
+                    << cpp_strerror(err) << dendl;
+    return -err;
+  }
+
+  // Add O_NONBLOCK to the existing status flags of each end.
+  for (const int fd : fds) {
+    int rc = fcntl(fd, F_GETFL);
+    ceph_assert(rc != -1);
+    rc = fcntl(fd, F_SETFL, rc | O_NONBLOCK);
+    ceph_assert(rc != -1);
+  }
+
+  *pipe_rd = fds[0];
+  *pipe_wr = fds[1];
+  return 0;
+}
+
+/**
+ * Create the listening socket, bind it and start listening.
+ *
+ * If bind_addr carries a port, only that port is tried (with SO_REUSEADDR
+ * set).  Otherwise every port in [ms_bind_port_min, ms_bind_port_max] that
+ * is not listed in avoid_ports is tried in turn.  The attempt is repeated
+ * up to ms_bind_retry_count times, sleeping ms_bind_retry_delay seconds
+ * between attempts.  After a successful listen() the messenger's address is
+ * updated (including this Accepter's nonce) and the shutdown self-pipe is
+ * created.
+ *
+ * @param bind_addr    address to bind to; its family falls back to
+ *                     ms_bind_ipv6 when it is neither AF_INET nor AF_INET6
+ * @param avoid_ports  ports to skip when picking one from the range
+ * @return 0 on success, a negative error code otherwise.  On every failure
+ *         path the listening socket is closed and listen_sd reset to -1.
+ */
+int Accepter::bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports)
+{
+  const auto& conf = msgr->cct->_conf;
+  // bind to a socket
+  ldout(msgr->cct,10) <<  __func__ << dendl;
+
+  int family;
+  switch (bind_addr.get_family()) {
+  case AF_INET:
+  case AF_INET6:
+    family = bind_addr.get_family();
+    break;
+
+  default:
+    // bind_addr is empty
+    family = conf->ms_bind_ipv6 ? AF_INET6 : AF_INET;
+  }
+
+  /* socket creation */
+  listen_sd = socket_cloexec(family, SOCK_STREAM, 0);
+  if (listen_sd < 0) {
+    int e = errno;
+    lderr(msgr->cct) << __func__ << " unable to create socket: "
+		     << cpp_strerror(e) << dendl;
+    return -e;
+  }
+  ldout(msgr->cct,10) <<  __func__ << " socket sd: " << listen_sd << dendl;
+
+  // use whatever user specified (if anything)
+  entity_addr_t listen_addr = bind_addr;
+  if (listen_addr.get_type() == entity_addr_t::TYPE_NONE) {
+    listen_addr.set_type(entity_addr_t::TYPE_LEGACY);
+  }
+  listen_addr.set_family(family);
+
+  /* bind to port */
+  int rc = -1;
+  int r = -1;
+
+  for (int i = 0; i < conf->ms_bind_retry_count; i++) {
+
+    if (i > 0) {
+      lderr(msgr->cct) << __func__ << " was unable to bind. Trying again in "
+		       << conf->ms_bind_retry_delay << " seconds " << dendl;
+      sleep(conf->ms_bind_retry_delay);
+    }
+
+    if (listen_addr.get_port()) {
+      // specific port
+
+      // reuse addr+port when possible
+      int on = 1;
+      rc = ::setsockopt(listen_sd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
+      if (rc < 0) {
+	// Save errno before logging: the logging call may overwrite it.
+	const int e = errno;
+	lderr(msgr->cct) << __func__ << " unable to setsockopt: "
+			 << cpp_strerror(e) << dendl;
+	r = -e;
+	continue;
+      }
+
+      rc = ::bind(listen_sd, listen_addr.get_sockaddr(),
+		  listen_addr.get_sockaddr_len());
+      if (rc < 0) {
+	const int e = errno;
+	lderr(msgr->cct) << __func__ << " unable to bind to " << listen_addr
+			 << ": " << cpp_strerror(e) << dendl;
+	r = -e;
+	continue;
+      }
+    } else {
+      // try a range of ports
+      for (int port = conf->ms_bind_port_min;
+	   port <= conf->ms_bind_port_max; port++) {
+	if (avoid_ports.count(port))
+	  continue;
+
+	listen_addr.set_port(port);
+	rc = ::bind(listen_sd, listen_addr.get_sockaddr(),
+		    listen_addr.get_sockaddr_len());
+	if (rc == 0)
+	  break;
+      }
+      if (rc < 0) {
+	const int e = errno;
+	lderr(msgr->cct) <<  __func__ << "  unable to bind to " << listen_addr
+			 << " on any port in range " << conf->ms_bind_port_min
+			 << "-" << conf->ms_bind_port_max
+			 << ": " << cpp_strerror(e)
+			 << dendl;
+	r = -e;
+	// Clear port before retry, otherwise we shall fail again.
+	listen_addr.set_port(0);
+	continue;
+      }
+      ldout(msgr->cct,10) << __func__ << " bound on random port "
+			  << listen_addr << dendl;
+    }
+
+    if (rc == 0)
+      break;
+  }
+
+  // It seems that binding completely failed, return with that exit status
+  if (rc < 0) {
+    // Report the error saved from the last attempt; errno may have been
+    // changed by the logging and sleep calls made since then.
+    lderr(msgr->cct) << __func__ << " was unable to bind after "
+		     << conf->ms_bind_retry_count << " attempts: "
+		     << cpp_strerror(r) << dendl;
+    ::close(listen_sd);
+    listen_sd = -1;
+    return r;
+  }
+
+  // what port did we get?
+  sockaddr_storage ss;
+  socklen_t llen = sizeof(ss);
+  rc = getsockname(listen_sd, (sockaddr*)&ss, &llen);
+  if (rc < 0) {
+    rc = -errno;
+    lderr(msgr->cct) << __func__ << " failed getsockname: "
+		     << cpp_strerror(rc) << dendl;
+    ::close(listen_sd);
+    listen_sd = -1;
+    return rc;
+  }
+  listen_addr.set_sockaddr((sockaddr*)&ss);
+
+  if (conf->ms_tcp_rcvbuf) {
+    int size = conf->ms_tcp_rcvbuf;
+    rc = ::setsockopt(listen_sd, SOL_SOCKET, SO_RCVBUF,
+		      (void*)&size, sizeof(size));
+    if (rc < 0)  {
+      rc = -errno;
+      lderr(msgr->cct) <<  __func__ << "  failed to set SO_RCVBUF to "
+		       << size << ": " << cpp_strerror(rc) << dendl;
+      ::close(listen_sd);
+      listen_sd = -1;
+      return rc;
+    }
+  }
+
+  ldout(msgr->cct,10) <<  __func__ << " bound to " << listen_addr << dendl;
+
+  // listen!
+  rc = ::listen(listen_sd, conf->ms_tcp_listen_backlog);
+  if (rc < 0) {
+    rc = -errno;
+    lderr(msgr->cct) <<  __func__ << " unable to listen on " << listen_addr
+		     << ": " << cpp_strerror(rc) << dendl;
+    ::close(listen_sd);
+    listen_sd = -1;
+    return rc;
+  }
+
+  msgr->set_myaddrs(entity_addrvec_t(bind_addr));
+  if (bind_addr != entity_addr_t() &&
+      !bind_addr.is_blank_ip())
+    msgr->learned_addr(bind_addr);
+  else
+    ceph_assert(msgr->get_need_addr());  // should still be true.
+
+  // If the caller did not give a port, advertise the one we actually got.
+  if (msgr->get_myaddr_legacy().get_port() == 0) {
+    msgr->set_myaddrs(entity_addrvec_t(listen_addr));
+  }
+  entity_addr_t addr = msgr->get_myaddr_legacy();
+  addr.nonce = nonce;
+  msgr->set_myaddrs(entity_addrvec_t(addr));
+
+  msgr->init_local_connection();
+
+  rc = create_selfpipe(&shutdown_rd_fd, &shutdown_wr_fd);
+  if (rc < 0) {
+    lderr(msgr->cct) <<  __func__ << " unable to create signalling pipe " << listen_addr
+		     << ": " << cpp_strerror(rc) << dendl;
+    // Do not leak the listening socket: release it as every other
+    // failure path in this function does.
+    ::close(listen_sd);
+    listen_sd = -1;
+    return rc;
+  }
+
+  ldout(msgr->cct,1) <<  __func__ << " my_addrs " << *msgr->my_addrs
+		     << " my_addr " << msgr->my_addr
+		     << " need_addr=" << msgr->get_need_addr() << dendl;
+  return 0;
+}
+
+/**
+ * Bind again on a different port and, on success, restart the accepter
+ * thread.
+ *
+ * The port currently in use is added to the set of ports to avoid, and the
+ * nonce is bumped so the new address differs from the old one.
+ *
+ * @param avoid_ports  ports that must not be chosen
+ * @return the result of bind(): 0 on success, negative on failure
+ */
+int Accepter::rebind(const set<int>& avoid_ports)
+{
+  ldout(msgr->cct,1) << __func__ << " avoid " << avoid_ports << dendl;
+
+  entity_addr_t rebind_addr = msgr->get_myaddr_legacy();
+  set<int> ports_to_avoid(avoid_ports);
+  ports_to_avoid.insert(rebind_addr.get_port());
+  rebind_addr.set_port(0);
+
+  // adjust the nonce; we want our entity_addr_t to be truly unique.
+  nonce += 1000000;
+  entity_addrvec_t updated_addrs = *msgr->my_addrs;
+  updated_addrs.v[0].nonce = nonce;
+  msgr->set_myaddrs(updated_addrs);
+  ldout(msgr->cct,10) << __func__ << " new nonce " << nonce << " and addr "
+			<< msgr->my_addr << dendl;
+
+  ldout(msgr->cct,10) << " will try " << rebind_addr << " and avoid ports " << ports_to_avoid << dendl;
+  const int r = bind(rebind_addr, ports_to_avoid);
+  if (r != 0)
+    return r;
+
+  start();
+  return 0;
+}
+
+/**
+ * Start the accepter thread under the name "ms_accepter".
+ *
+ * @return always 0
+ */
+int Accepter::start()
+{
+  ldout(msgr->cct,1) << __func__ << dendl;
+  create("ms_accepter");  // start thread
+  return 0;
+}
+
+/**
+ * Thread body: accept incoming connections until asked to stop.
+ *
+ * Polls the listening socket together with the read end of the shutdown
+ * self-pipe.  Every accepted socket is handed to the messenger through
+ * add_accept_pipe().  The loop ends when `done` is set or when the
+ * shutdown pipe reports an event; the read end of the pipe is closed on
+ * the way out, while the listening socket is left for stop() to close.
+ *
+ * Aborts the process on a poll() failure other than EINTR, on an error
+ * event on the listening socket, and after more than
+ * ms_max_accept_failures consecutive accept() failures.
+ *
+ * @return always 0
+ */
+void *Accepter::entry()
+{
+  ldout(msgr->cct,1) << __func__ << " start" << dendl;
+  
+  // consecutive accept() failures; reset on every successful accept
+  int errors = 0;
+
+  // pfd[0]: listening socket, pfd[1]: read end of the shutdown self-pipe
+  struct pollfd pfd[2];
+  memset(pfd, 0, sizeof(pfd));
+
+  pfd[0].fd = listen_sd;
+  pfd[0].events = POLLIN | POLLERR | POLLNVAL | POLLHUP;
+  pfd[1].fd = shutdown_rd_fd;
+  pfd[1].events = POLLIN | POLLERR | POLLNVAL | POLLHUP;
+  while (!done) {
+    ldout(msgr->cct,20) << __func__ << " calling poll for sd:" << listen_sd << dendl;
+    // block without timeout until one of the two descriptors has an event
+    int r = poll(pfd, 2, -1);
+    if (r < 0) {
+      if (errno == EINTR) {
+        continue;
+      }
+      ldout(msgr->cct,1) << __func__ << " poll got error"  
+ 			  << " errno " << errno << " " << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+    ldout(msgr->cct,10) << __func__ << " poll returned oke: " << r << dendl;
+    ldout(msgr->cct,20) << __func__ <<  " pfd.revents[0]=" << pfd[0].revents << dendl;
+    ldout(msgr->cct,20) << __func__ <<  " pfd.revents[1]=" << pfd[1].revents << dendl;
+
+    // an error condition on the listening socket is treated as fatal
+    if (pfd[0].revents & (POLLERR | POLLNVAL | POLLHUP)) {
+      ldout(msgr->cct,1) << __func__ << " poll got errors in revents "  
+ 			 <<  pfd[0].revents << dendl;
+      ceph_abort();
+    }
+    if (pfd[1].revents & (POLLIN | POLLERR | POLLNVAL | POLLHUP)) {
+      // We got "signaled" to exit the poll
+      // clean the selfpipe
+      char ch;
+      if (::read(shutdown_rd_fd, &ch, sizeof(ch)) == -1) {
+        // EAGAIN is not reported: the pipe is non-blocking and may be empty
+        if (errno != EAGAIN)
+          ldout(msgr->cct,1) << __func__ << " Cannot read selfpipe: "
+ 			      << " errno " << errno << " " << cpp_strerror(errno) << dendl;
+        }
+      break;
+    }
+    if (done) break;
+
+    // accept
+    sockaddr_storage ss;
+    socklen_t slen = sizeof(ss);
+    int sd = accept_cloexec(listen_sd, (sockaddr*)&ss, &slen);
+    if (sd >= 0) {
+      errors = 0;
+      ldout(msgr->cct,10) << __func__ << " incoming on sd " << sd << dendl;
+      
+      msgr->add_accept_pipe(sd);
+    } else {
+      int e = errno;
+      ldout(msgr->cct,0) << __func__ << " no incoming connection?  sd = " << sd
+	      << " errno " << e << " " << cpp_strerror(e) << dendl;
+      // give up once too many accepts have failed in a row
+      if (++errors > msgr->cct->_conf->ms_max_accept_failures) {
+        lderr(msgr->cct) << "accetper has encoutered enough errors, just do ceph_abort()." << dendl;
+        ceph_abort();
+      }
+    }
+  }
+
+  ldout(msgr->cct,20) << __func__ << " closing" << dendl;
+  // socket is closed right after the thread has joined.
+  // closing it here might race
+  if (shutdown_rd_fd >= 0) {
+    ::close(shutdown_rd_fd);
+    shutdown_rd_fd = -1;
+  }
+
+  ldout(msgr->cct,10) << __func__ << " stopping" << dendl;
+  return 0;
+}
+
+/**
+ * Stop the accepter thread and release its file descriptors.
+ *
+ * Sets `done`, writes one byte to the shutdown self-pipe so the thread
+ * leaves poll(), closes the write end, joins the thread if it was started,
+ * and finally closes the listening socket and (if still open) the read end
+ * of the pipe.  `done` is cleared again at the end.
+ *
+ * If the self-pipe has no write end (shutdown_wr_fd < 0) the function
+ * returns right after setting `done`, without joining or closing anything.
+ */
+void Accepter::stop()
+{
+  done = true;
+  ldout(msgr->cct,10) << __func__ << " accept listening on: " << listen_sd << dendl;
+
+  if (shutdown_wr_fd < 0)
+    return;
+
+  // Send a byte to the shutdown pipe that the thread is listening to
+  char ch = 0x0;
+  int ret = safe_write(shutdown_wr_fd, &ch, sizeof(ch));
+  if (ret < 0) {
+    // This is the write that failed, not a close; say so in the log.
+    ldout(msgr->cct,1) << __func__ << " write to shutdown pipe failed: "
+             << " errno " << errno << " " << cpp_strerror(errno) << dendl;
+  } else {
+    ldout(msgr->cct,15) << __func__ << " signaled poll" << dendl;
+  }
+  VOID_TEMP_FAILURE_RETRY(close(shutdown_wr_fd));
+  shutdown_wr_fd = -1;
+
+  // wait for thread to stop before closing the socket, to avoid
+  // racing against fd re-use.
+  if (is_started()) {
+    ldout(msgr->cct,5) << __func__ << " wait for thread to join." << dendl;
+    join();
+  }
+
+  if (listen_sd >= 0) {
+    if (::close(listen_sd) < 0) {
+      ldout(msgr->cct,1) << __func__ << " close listen_sd failed: "
+	      << " errno " << errno << " " << cpp_strerror(errno) << dendl;
+    }
+    listen_sd = -1;
+  }
+  if (shutdown_rd_fd >= 0) {
+    if (::close(shutdown_rd_fd) < 0) {
+      ldout(msgr->cct,1) << __func__ << " close shutdown_rd_fd failed: "
+	      << " errno " << errno << " " << cpp_strerror(errno) << dendl;
+    }
+    shutdown_rd_fd = -1;
+  }
+  done = false;
+}
+
+
+
+
diff --git a/src/msg/simple/Accepter.h b/src/msg/simple/Accepter.h
new file mode 100644
index 00000000..7824c3a1
--- /dev/null
+++ b/src/msg/simple/Accepter.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MSG_ACCEPTER_H
+#define CEPH_MSG_ACCEPTER_H
+
+#include "common/Thread.h"
+
+class SimpleMessenger;
+struct entity_addr_t;
+
+/**
+ * If the SimpleMessenger binds to a specific address, the Accepter runs
+ * and listens for incoming connections.
+ *
+ * The Accepter is a Thread: start() launches it and entry() is its body.
+ */
+class Accepter : public Thread {
+  SimpleMessenger *msgr;   ///< messenger this accepter belongs to
+  bool done;               ///< set by stop() to make entry() leave its loop
+  int listen_sd;           ///< listening socket, -1 when not open
+  uint64_t nonce;          ///< nonce stored into the messenger's address
+  int shutdown_rd_fd;      ///< read end of the shutdown self-pipe, -1 if none
+  int shutdown_wr_fd;      ///< write end of the shutdown self-pipe, -1 if none
+
+  /// Create the non-blocking self-pipe used to wake the thread.
+  int create_selfpipe(int *pipe_rd, int *pipe_wr);
+
+public:
+  /// @param messenger      messenger this accepter belongs to
+  /// @param initial_nonce  initial nonce value
+  Accepter(SimpleMessenger *messenger, uint64_t initial_nonce)
+    : msgr(messenger),
+      done(false),
+      listen_sd(-1),
+      nonce(initial_nonce),
+      shutdown_rd_fd(-1),
+      shutdown_wr_fd(-1)
+  {
+  }
+
+  /// Thread body: poll for and accept incoming connections.
+  void *entry() override;
+  /// Stop the thread and close the sockets.
+  void stop();
+  /// Bind and listen on bind_addr, skipping the ports in avoid_ports.
+  int bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports);
+  /// Bind again on another port and restart the thread.
+  int rebind(const set<int>& avoid_ports);
+  /// Start the accepter thread.
+  int start();
+};
+
+
+#endif
diff --git a/src/msg/simple/Pipe.cc b/src/msg/simple/Pipe.cc
new file mode 100644
index 00000000..fd44dc4e
--- /dev/null
+++ b/src/msg/simple/Pipe.cc
@@ -0,0 +1,2712 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+#include <sys/uio.h>
+#include <limits.h>
+#include <poll.h>
+
+#include "msg/Message.h"
+#include "Pipe.h"
+#include "SimpleMessenger.h"
+
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/valgrind.h"
+
+// The headers below are included to get encode_encrypt(); that probably belongs in Crypto.h instead.
+
+#include "auth/cephx/CephxProtocol.h"
+#include "auth/AuthSessionHandler.h"
+
+#include "include/compat.h"
+#include "include/sock_compat.h"
+#include "include/random.h"
+
+// Mask that limits the starting sequence number to 31 bits (values below 2^31).  Nothing special about it, just a big number.  PLR
+#define SEQ_MASK  0x7fffffff 
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << *this
+/**
+ * Write the log prefix that identifies this Pipe: local and peer address,
+ * followed by the pipe's pointer, socket, port, state, sequence numbers,
+ * lossy flag and connection.
+ *
+ * @param out  stream to write to
+ * @return the same stream, to allow chaining
+ */
+ostream& Pipe::_pipe_prefix(std::ostream &out) const {
+  out << "-- " << msgr->get_myaddr_legacy() << " >> " << peer_addr;
+  out << " pipe(" << this;
+  out << " sd=" << sd << " :" << port;
+  out << " s=" << state;
+  out << " pgs=" << peer_global_seq;
+  out << " cs=" << connect_seq;
+  out << " l=" << policy.lossy;
+  out << " c=" << connection_state;
+  out << ").";
+  return out;
+}
+
+/// Stream a Pipe by writing its log prefix.
+ostream& operator<<(ostream &out, const Pipe &pipe)
+{
+  pipe._pipe_prefix(out);
+  return out;
+}
+
+/**
+ * DelayedDelivery injects delays into Message delivery off the socket.
+ * It is only enabled if delays are requested; when it is, it pulls
+ * Messages off its delay queue and puts them into the
+ * in_q (SimpleMessenger::dispatch_queue).
+ * Please note that this probably has issues with Pipe shutdown and
+ * replacement semantics. I've tried, but no guarantees.
+ */
+class Pipe::DelayedDelivery: public Thread {
+  Pipe *pipe;                    // pipe whose messages are being delayed
+  std::deque< pair<utime_t,Message*> > delay_queue;  // (release time, message)
+  Mutex delay_lock;              // taken by every member function below
+  Cond delay_cond;
+  int flush_count;               // queued messages still to be flushed
+  bool active_flush;             // a flushed message is being delivered
+  bool stop_delayed_delivery;    // asks entry() to leave its loop
+  bool delay_dispatching; // we are in fast dispatch now
+  bool stop_fast_dispatching_flag; // we need to stop fast dispatching
+
+public:
+  explicit DelayedDelivery(Pipe *p)
+    : pipe(p),
+      delay_lock("Pipe::DelayedDelivery::delay_lock"),
+      flush_count(0),
+      active_flush(false),
+      stop_delayed_delivery(false),
+      delay_dispatching(false),
+      stop_fast_dispatching_flag(false)
+  {
+  }
+
+  /// Drops whatever is still queued.
+  ~DelayedDelivery() override {
+    discard();
+  }
+
+  void *entry() override;
+
+  /// Append a message with its release time and wake the delivery thread.
+  void queue(utime_t release, Message *m) {
+    Mutex::Locker guard(delay_lock);
+    delay_queue.emplace_back(release, m);
+    delay_cond.Signal();
+  }
+
+  void discard();
+  void flush();
+
+  /// True while a flush is pending or a flushed message is being delivered.
+  bool is_flushing() {
+    Mutex::Locker guard(delay_lock);
+    if (flush_count > 0)
+      return true;
+    return active_flush;
+  }
+
+  /// Block on delay_cond until no flush is pending or in progress.
+  void wait_for_flush() {
+    Mutex::Locker guard(delay_lock);
+    while (flush_count > 0 || active_flush)
+      delay_cond.Wait(delay_lock);
+  }
+
+  /// Ask the delivery thread to leave its loop.
+  void stop() {
+    Mutex::Locker guard(delay_lock);
+    stop_delayed_delivery = true;
+    delay_cond.Signal();
+  }
+
+  /// Re-target this delivery object at another Pipe.
+  void steal_for_pipe(Pipe *new_owner) {
+    Mutex::Locker guard(delay_lock);
+    pipe = new_owner;
+  }
+
+  /**
+   * We need to stop fast dispatching before we need to stop putting
+   * normal messages into the DispatchQueue.
+   */
+  void stop_fast_dispatching();
+};
+
+/**************************************
+ * Pipe
+ */
+
+/**
+ * Construct a Pipe in the given state.
+ *
+ * @param r    owning messenger; also supplies the dispatch queue (in_q) and
+ *             the connection id
+ * @param st   initial pipe state
+ * @param con  existing connection to attach to; when null a new
+ *             PipeConnection is created for this pipe
+ *
+ * Besides initializing the members, the constructor randomizes the outgoing
+ * sequence number, refreshes msgr->timeout from
+ * ms_connection_idle_timeout, and allocates the receive prefetch buffer
+ * of ms_tcp_prefetch_max_size bytes (freed in the destructor).
+ */
+Pipe::Pipe(SimpleMessenger *r, int st, PipeConnection *con)
+  : RefCountedObject(r->cct),
+    reader_thread(this),
+    writer_thread(this),
+    delay_thread(NULL),
+    msgr(r),
+    conn_id(r->dispatch_queue.get_id()),
+    recv_ofs(0),
+    recv_len(0),
+    sd(-1), port(0),
+    peer_type(-1),
+    pipe_lock("SimpleMessenger::Pipe::pipe_lock"),
+    state(st),
+    connection_state(NULL),
+    reader_running(false), reader_needs_join(false),
+    reader_dispatching(false), notify_on_dispatch_done(false),
+    writer_running(false),
+    in_q(&(r->dispatch_queue)),
+    send_keepalive(false),
+    send_keepalive_ack(false),
+    connect_seq(0), peer_global_seq(0),
+    out_seq(0), in_seq(0), in_seq_acked(0) {
+  // Tell the race detector that unsynchronized access to these fields is
+  // intentional.
+  ANNOTATE_BENIGN_RACE_SIZED(&sd, sizeof(sd), "Pipe socket");
+  ANNOTATE_BENIGN_RACE_SIZED(&state, sizeof(state), "Pipe state");
+  ANNOTATE_BENIGN_RACE_SIZED(&recv_len, sizeof(recv_len), "Pipe recv_len");
+  ANNOTATE_BENIGN_RACE_SIZED(&recv_ofs, sizeof(recv_ofs), "Pipe recv_ofs");
+  if (con) {
+    // attach to the caller's connection and point it at this pipe
+    connection_state = con;
+    connection_state->reset_pipe(this);
+  } else {
+    // no connection supplied: create one that references this pipe
+    connection_state = new PipeConnection(msgr->cct, msgr);
+    connection_state->pipe = get();
+  }
+
+  randomize_out_seq();
+
+  msgr->timeout = msgr->cct->_conf->ms_connection_idle_timeout * 1000; //convert to ms
+  // a configured timeout of 0 is stored as -1
+  if (msgr->timeout == 0)
+    msgr->timeout = -1;
+
+  recv_max_prefetch = msgr->cct->_conf->ms_tcp_prefetch_max_size;
+  recv_buf = new char[recv_max_prefetch];
+}
+
+/**
+ * Destroy the pipe.  Both the outgoing queue and the sent list must already
+ * be empty.  Frees the receive buffer and the delay thread object.
+ */
+Pipe::~Pipe()
+{
+  ceph_assert(out_q.empty());
+  ceph_assert(sent.empty());
+
+  // The two releases are independent of each other.
+  delete[] recv_buf;
+  delete delay_thread;
+}
+
+/**
+ * Handle an ack from the peer: drop every message at the front of the
+ * sent list whose sequence number is not greater than seq.
+ *
+ * @param seq  highest sequence number acknowledged by the peer
+ */
+void Pipe::handle_ack(uint64_t seq)
+{
+  lsubdout(msgr->cct, ms, 15) << "reader got ack seq " << seq << dendl;
+  // trim sent list
+  for (;;) {
+    if (sent.empty())
+      break;
+    if (sent.front()->get_seq() > seq)
+      break;
+
+    Message *acked = sent.front();
+    sent.pop_front();
+    lsubdout(msgr->cct, ms, 10) << "reader got ack seq "
+				<< seq << " >= " << acked->get_seq() << " on " << acked << " " << *acked << dendl;
+    acked->put();
+  }
+}
+
+/**
+ * Start the reader thread.  Must be called with pipe_lock held and while
+ * no reader is running; a previous reader that still needs joining is
+ * joined first.
+ */
+void Pipe::start_reader()
+{
+  ceph_assert(pipe_lock.is_locked());
+  ceph_assert(!reader_running);
+
+  const bool join_previous = reader_needs_join;
+  if (join_previous) {
+    reader_thread.join();
+    reader_needs_join = false;
+  }
+
+  reader_running = true;
+  reader_thread.create("ms_pipe_read",
+		       msgr->cct->_conf->ms_rwthread_stack_bytes);
+}
+
+/**
+ * Create and start the DelayedDelivery thread when none exists yet and the
+ * name of the peer's entity type occurs in the ms_inject_delay_type
+ * setting.
+ */
+void Pipe::maybe_start_delay_thread()
+{
+  if (delay_thread)
+    return;
+
+  const auto delay_types =
+    msgr->cct->_conf.get_val<std::string>("ms_inject_delay_type");
+  const auto peer_type_name =
+    ceph_entity_type_name(connection_state->peer_type);
+  if (delay_types.find(peer_type_name) == string::npos)
+    return;
+
+  lsubdout(msgr->cct, ms, 1) << "setting up a delay queue on Pipe " << this << dendl;
+  delay_thread = new DelayedDelivery(this);
+  delay_thread->create("ms_pipe_delay");
+}
+
+/**
+ * Start the writer thread.  Must be called with pipe_lock held and while
+ * no writer is running.
+ */
+void Pipe::start_writer()
+{
+  ceph_assert(pipe_lock.is_locked());
+  ceph_assert(!writer_running);
+
+  writer_running = true;
+  writer_thread.create("ms_pipe_write",
+		       msgr->cct->_conf->ms_rwthread_stack_bytes);
+}
+
+/**
+ * Wait for a running reader thread to finish.  Does nothing when no reader
+ * is running.  pipe_lock is released for the duration of the join and
+ * re-taken afterwards.
+ */
+void Pipe::join_reader()
+{
+  if (reader_running) {
+    cond.Signal();
+
+    // Release pipe_lock for the duration of the join.
+    pipe_lock.Unlock();
+    reader_thread.join();
+    pipe_lock.Lock();
+
+    reader_needs_join = false;
+  }
+}
+
+/**
+ * Drop every queued message: release its share of the dispatch throttle,
+ * drop the reference and remove it from the queue.
+ */
+void Pipe::DelayedDelivery::discard()
+{
+  lgeneric_subdout(pipe->msgr->cct, ms, 20) << *pipe << "DelayedDelivery::discard" << dendl;
+  Mutex::Locker guard(delay_lock);
+  for (; !delay_queue.empty(); delay_queue.pop_front()) {
+    Message *dropped = delay_queue.front().second;
+    pipe->in_q->dispatch_throttle_release(dropped->get_dispatch_throttle_size());
+    dropped->put();
+  }
+}
+
+/**
+ * Request that everything currently queued be delivered without waiting
+ * for its release time: record the queue length as the number of messages
+ * to flush and wake the delivery thread.
+ */
+void Pipe::DelayedDelivery::flush()
+{
+  lgeneric_subdout(pipe->msgr->cct, ms, 20) << *pipe << "DelayedDelivery::flush" << dendl;
+
+  Mutex::Locker guard(delay_lock);
+  flush_count = delay_queue.size();
+  delay_cond.Signal();
+}
+
+/**
+ * Thread body of the delayed-delivery queue.
+ *
+ * Runs with delay_lock held except while fast-dispatching.  Takes messages
+ * from the front of delay_queue and delivers each one once its release
+ * time has passed, immediately when a flush is pending, or immediately
+ * when ms_inject_delay_msg_type is set and does not match the message's
+ * type name.  Delivery is through fast_dispatch() when the dispatch queue
+ * allows it, otherwise through enqueue().
+ *
+ * @return always NULL
+ */
+void *Pipe::DelayedDelivery::entry()
+{
+  Mutex::Locker locker(delay_lock);
+  lgeneric_subdout(pipe->msgr->cct, ms, 20) << *pipe << "DelayedDelivery::entry start" << dendl;
+
+  while (!stop_delayed_delivery) {
+    if (delay_queue.empty()) {
+      lgeneric_subdout(pipe->msgr->cct, ms, 30) << *pipe << "DelayedDelivery::entry sleeping on delay_cond because delay queue is empty" << dendl;
+      delay_cond.Wait(delay_lock);
+      continue;
+    }
+    utime_t release = delay_queue.front().first;
+    Message *m = delay_queue.front().second;
+    string delay_msg_type = pipe->msgr->cct->_conf->ms_inject_delay_msg_type;
+    // Keep waiting only when no flush is pending, the release time is still
+    // in the future, and the message is of a type that should be delayed.
+    if (!flush_count &&
+        (release > ceph_clock_now() &&
+         (delay_msg_type.empty() || m->get_type_name() == delay_msg_type))) {
+      lgeneric_subdout(pipe->msgr->cct, ms, 10) << *pipe << "DelayedDelivery::entry sleeping on delay_cond until " << release << dendl;
+      delay_cond.WaitUntil(delay_lock, release);
+      continue;
+    }
+    lgeneric_subdout(pipe->msgr->cct, ms, 10) << *pipe << "DelayedDelivery::entry dequeuing message " << m << " for delivery, past " << release << dendl;
+    delay_queue.pop_front();
+    if (flush_count > 0) {
+      --flush_count;
+      active_flush = true;
+    }
+    if (pipe->in_q->can_fast_dispatch(m)) {
+      if (!stop_fast_dispatching_flag) {
+        // Drop the lock around the dispatch; stop_fast_dispatching() waits
+        // on delay_dispatching to know when we are done.
+        delay_dispatching = true;
+        delay_lock.Unlock();
+        pipe->in_q->fast_dispatch(m);
+        delay_lock.Lock();
+        delay_dispatching = false;
+        if (stop_fast_dispatching_flag) {
+          // we need to let the stopping thread proceed
+          delay_cond.Signal();
+          delay_lock.Unlock();
+          delay_lock.Lock();
+        }
+      } else {
+        // Fast dispatch has been stopped and the message is already off the
+        // queue, so nobody else will see it.  Drop it the same way
+        // discard() drops queued messages instead of leaking it.
+        pipe->in_q->dispatch_throttle_release(m->get_dispatch_throttle_size());
+        m->put();
+      }
+    } else {
+      pipe->in_q->enqueue(m, m->get_priority(), pipe->conn_id);
+    }
+    // NOTE(review): active_flush is cleared without signalling delay_cond;
+    // a caller blocked in wait_for_flush() appears to depend on a later
+    // Signal from queue()/flush()/stop() to wake up -- confirm.
+    active_flush = false;
+  }
+  lgeneric_subdout(pipe->msgr->cct, ms, 20) << *pipe << "DelayedDelivery::entry stop" << dendl;
+  return NULL;
+}
+
+/**
+ * Forbid further fast dispatch from the delay thread, then wait on
+ * delay_cond until a fast dispatch that is already in progress has
+ * finished.
+ */
+void Pipe::DelayedDelivery::stop_fast_dispatching()
+{
+  Mutex::Locker guard(delay_lock);
+  stop_fast_dispatching_flag = true;
+  for (; delay_dispatching; )
+    delay_cond.Wait(delay_lock);
+}
+
+
+int Pipe::accept()
+{
+  ldout(msgr->cct,10) << "accept" << dendl;
+  ceph_assert(pipe_lock.is_locked());
+  ceph_assert(state == STATE_ACCEPTING);
+
+  pipe_lock.Unlock();
+
+  // vars
+  bufferlist addrs;
+  entity_addr_t socket_addr;
+  socklen_t len;
+  int r;
+  char banner[strlen(CEPH_BANNER)+1];
+  bufferlist addrbl;
+  ceph_msg_connect connect;
+  ceph_msg_connect_reply reply;
+  Pipe *existing = 0;
+  bufferptr bp;
+  bufferlist authorizer, authorizer_reply;
+  bool authorizer_valid;
+  uint64_t feat_missing;
+  bool replaced = false;
+  // this variable denotes if the connection attempt from peer is a hard 
+  // reset or not, it is true if there is an existing connection and the
+  // connection sequence from peer is equal to zero
+  bool is_reset_from_peer = false;
+  CryptoKey session_key;
+  int removed; // single-use down below
+
+  // this should roughly mirror pseudocode at
+  //  http://ceph.com/wiki/Messaging_protocol
+  int reply_tag = 0;
+  uint64_t existing_seq = -1;
+
+  // used for reading in the remote acked seq on connect
+  uint64_t newly_acked_seq = 0;
+
+  bool need_challenge = false;
+  bool had_challenge = false;
+  std::unique_ptr<AuthAuthorizerChallenge> authorizer_challenge;
+
+  recv_reset();
+
+  set_socket_options();
+
+  // announce myself.
+  r = tcp_write(CEPH_BANNER, strlen(CEPH_BANNER));
+  if (r < 0) {
+    ldout(msgr->cct,10) << "accept couldn't write banner" << dendl;
+    goto fail_unlocked;
+  }
+
+  // and my addr
+  encode(msgr->my_addr, addrs, 0);  // legacy
+
+  port = msgr->my_addr.get_port();
+
+  // and peer's socket addr (they might not know their ip)
+  sockaddr_storage ss;
+  len = sizeof(ss);
+  r = ::getpeername(sd, (sockaddr*)&ss, &len);
+  if (r < 0) {
+    ldout(msgr->cct,0) << "accept failed to getpeername " << cpp_strerror(errno) << dendl;
+    goto fail_unlocked;
+  }
+  socket_addr.set_sockaddr((sockaddr*)&ss);
+  encode(socket_addr, addrs, 0);  // legacy
+
+  r = tcp_write(addrs.c_str(), addrs.length());
+  if (r < 0) {
+    ldout(msgr->cct,10) << "accept couldn't write my+peer addr" << dendl;
+    goto fail_unlocked;
+  }
+
+  ldout(msgr->cct,1) << "accept sd=" << sd << " " << socket_addr << dendl;
+  
+  // identify peer
+  if (tcp_read(banner, strlen(CEPH_BANNER)) < 0) {
+    ldout(msgr->cct,10) << "accept couldn't read banner" << dendl;
+    goto fail_unlocked;
+  }
+  if (memcmp(banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
+    banner[strlen(CEPH_BANNER)] = 0;
+    ldout(msgr->cct,1) << "accept peer sent bad banner '" << banner << "' (should be '" << CEPH_BANNER << "')" << dendl;
+    goto fail_unlocked;
+  }
+  {
+    bufferptr tp(sizeof(ceph_entity_addr));
+    addrbl.push_back(std::move(tp));
+  }
+  if (tcp_read(addrbl.c_str(), addrbl.length()) < 0) {
+    ldout(msgr->cct,10) << "accept couldn't read peer_addr" << dendl;
+    goto fail_unlocked;
+  }
+  try {
+    auto ti = addrbl.cbegin();
+    decode(peer_addr, ti);
+  } catch (const buffer::error& e) {
+    ldout(msgr->cct,2) << __func__ <<  " decode peer_addr failed: " << e.what()
+			<< dendl;
+    goto fail_unlocked;
+  }
+
+  ldout(msgr->cct,10) << "accept peer addr is " << peer_addr << dendl;
+  if (peer_addr.is_blank_ip()) {
+    // peer apparently doesn't know what ip they have; figure it out for them.
+    int port = peer_addr.get_port();
+    peer_addr.u = socket_addr.u;
+    peer_addr.set_port(port);
+    ldout(msgr->cct,0) << "accept peer addr is really " << peer_addr
+	    << " (socket is " << socket_addr << ")" << dendl;
+  }
+  set_peer_addr(peer_addr);  // so that connection_state gets set up
+  
+  while (1) {
+    if (tcp_read((char*)&connect, sizeof(connect)) < 0) {
+      ldout(msgr->cct,10) << "accept couldn't read connect" << dendl;
+      goto fail_unlocked;
+    }
+
+    authorizer.clear();
+    if (connect.authorizer_len) {
+      bp = buffer::create(connect.authorizer_len);
+      if (tcp_read(bp.c_str(), connect.authorizer_len) < 0) {
+        ldout(msgr->cct,10) << "accept couldn't read connect authorizer" << dendl;
+        goto fail_unlocked;
+      }
+      authorizer.push_back(std::move(bp));
+      authorizer_reply.clear();
+    }
+
+    ldout(msgr->cct,20) << "accept got peer connect_seq " << connect.connect_seq
+	     << " global_seq " << connect.global_seq
+	     << dendl;
+    
+    msgr->lock.Lock();   // FIXME
+    pipe_lock.Lock();
+    if (msgr->dispatch_queue.stop)
+      goto shutting_down;
+    if (state != STATE_ACCEPTING) {
+      goto shutting_down;
+    }
+
+    // note peer's type, flags
+    set_peer_type(connect.host_type);
+    policy = msgr->get_policy(connect.host_type);
+    ldout(msgr->cct,10) << "accept of host_type " << connect.host_type
+			<< ", policy.lossy=" << policy.lossy
+			<< " policy.server=" << policy.server
+			<< " policy.standby=" << policy.standby
+			<< " policy.resetcheck=" << policy.resetcheck
+			<< dendl;
+
+    memset(&reply, 0, sizeof(reply));
+    reply.protocol_version = msgr->get_proto_version(peer_type, false);
+    msgr->lock.Unlock();
+
+    // mismatch?
+    ldout(msgr->cct,10) << "accept my proto " << reply.protocol_version
+	     << ", their proto " << connect.protocol_version << dendl;
+    if (connect.protocol_version != reply.protocol_version) {
+      reply.tag = CEPH_MSGR_TAG_BADPROTOVER;
+      goto reply;
+    }
+
+    // require signatures for cephx?
+    if (connect.authorizer_protocol == CEPH_AUTH_CEPHX) {
+      if (peer_type == CEPH_ENTITY_TYPE_OSD ||
+	  peer_type == CEPH_ENTITY_TYPE_MDS ||
+	  peer_type == CEPH_ENTITY_TYPE_MGR) {
+	if (msgr->cct->_conf->cephx_require_signatures ||
+	    msgr->cct->_conf->cephx_cluster_require_signatures) {
+	  ldout(msgr->cct,10) << "using cephx, requiring MSG_AUTH feature bit for cluster" << dendl;
+	  policy.features_required |= CEPH_FEATURE_MSG_AUTH;
+	}
+	if (msgr->cct->_conf->cephx_require_version >= 2 ||
+	    msgr->cct->_conf->cephx_cluster_require_version >= 2) {
+	  ldout(msgr->cct,10) << "using cephx, requiring cephx v2 feature bit for cluster" << dendl;
+	  policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2;
+	}
+      } else {
+	if (msgr->cct->_conf->cephx_require_signatures ||
+	    msgr->cct->_conf->cephx_service_require_signatures) {
+	  ldout(msgr->cct,10) << "using cephx, requiring MSG_AUTH feature bit for service" << dendl;
+	  policy.features_required |= CEPH_FEATURE_MSG_AUTH;
+	}
+	if (msgr->cct->_conf->cephx_require_version >= 2 ||
+	    msgr->cct->_conf->cephx_service_require_version >= 2) {
+	  ldout(msgr->cct,10) << "using cephx, requiring cephx v2 feature bit for cluster" << dendl;
+	  policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2;
+	}
+      }
+    }
+
+    feat_missing = policy.features_required & ~(uint64_t)connect.features;
+    if (feat_missing) {
+      ldout(msgr->cct,1) << "peer missing required features " << std::hex << feat_missing << std::dec << dendl;
+      reply.tag = CEPH_MSGR_TAG_FEATURES;
+      goto reply;
+    }
+    
+    // Check the authorizer.  If not good, bail out.
+
+    pipe_lock.Unlock();
+
+    need_challenge = HAVE_FEATURE(connect.features, CEPHX_V2);
+    had_challenge = (bool)authorizer_challenge;
+    authorizer_reply.clear();
+    if (!msgr->ms_deliver_verify_authorizer(
+	  connection_state.get(), peer_type, connect.authorizer_protocol,
+	  authorizer,
+	  authorizer_reply, authorizer_valid, session_key,
+	  nullptr /* connection_secret */,
+	  need_challenge ? &authorizer_challenge : nullptr) ||
+	!authorizer_valid) {
+      pipe_lock.Lock();
+      if (state != STATE_ACCEPTING)
+	goto shutting_down_msgr_unlocked;
+      if (!had_challenge && need_challenge && authorizer_challenge) {
+	ldout(msgr->cct,10) << "accept: challenging authorizer "
+			    << authorizer_reply.length()
+			    << " bytes" << dendl;
+	ceph_assert(authorizer_reply.length());
+	reply.tag = CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER;
+      } else {
+	ldout(msgr->cct,0) << "accept: got bad authorizer" << dendl;
+	reply.tag = CEPH_MSGR_TAG_BADAUTHORIZER;
+      }
+      session_security.reset();
+      goto reply;
+    } 
+
+    // We've verified the authorizer for this pipe, so set up the session security structure.  PLR
+
+    ldout(msgr->cct,10) << "accept:  setting up session_security." << dendl;
+
+  retry_existing_lookup:
+    msgr->lock.Lock();
+    pipe_lock.Lock();
+    if (msgr->dispatch_queue.stop)
+      goto shutting_down;
+    if (state != STATE_ACCEPTING)
+      goto shutting_down;
+    
+    // existing?
+    existing = msgr->_lookup_pipe(peer_addr);
+    if (existing) {
+      existing->pipe_lock.Lock(true);  // skip lockdep check (we are locking a second Pipe here)
+      if (existing->reader_dispatching) {
+	/** we need to wait, or we can deadlock if downstream
+	 *  fast_dispatchers are (naughtily!) waiting on resources
+	 *  held by somebody trying to make use of the SimpleMessenger lock.
+	 *  So drop locks, wait, and retry. It just looks like a slow network
+	 *  to everybody else.
+	 *
+	 *  We take a ref to existing here since it might get reaped before we
+	 *  wake up (see bug #15870).  We can be confident that it lived until
+	 *  locked it since we held the msgr lock from _lookup_pipe through to
+	 *  locking existing->lock and checking reader_dispatching.
+	 */
+	existing->get();
+	pipe_lock.Unlock();
+	msgr->lock.Unlock();
+	existing->notify_on_dispatch_done = true;
+	while (existing->reader_dispatching)
+	  existing->cond.Wait(existing->pipe_lock);
+	existing->pipe_lock.Unlock();
+	existing->put();
+	existing = nullptr;
+	goto retry_existing_lookup;
+      }
+
+      if (connect.global_seq < existing->peer_global_seq) {
+	ldout(msgr->cct,10) << "accept existing " << existing << ".gseq " << existing->peer_global_seq
+		 << " > " << connect.global_seq << ", RETRY_GLOBAL" << dendl;
+	reply.tag = CEPH_MSGR_TAG_RETRY_GLOBAL;
+	reply.global_seq = existing->peer_global_seq;  // so we can send it below..
+	existing->pipe_lock.Unlock();
+	msgr->lock.Unlock();
+	goto reply;
+      } else {
+	ldout(msgr->cct,10) << "accept existing " << existing << ".gseq " << existing->peer_global_seq
+		 << " <= " << connect.global_seq << ", looks ok" << dendl;
+      }
+      
+      if (existing->policy.lossy) {
+	ldout(msgr->cct,0) << "accept replacing existing (lossy) channel (new one lossy="
+	        << policy.lossy << ")" << dendl;
+	existing->was_session_reset();
+	goto replace;
+      }
+
+      ldout(msgr->cct,0) << "accept connect_seq " << connect.connect_seq
+			 << " vs existing " << existing->connect_seq
+			 << " state " << existing->get_state_name() << dendl;
+
+      if (connect.connect_seq == 0 && existing->connect_seq > 0) {
+	ldout(msgr->cct,0) << "accept peer reset, then tried to connect to us, replacing" << dendl;
+        // this is a hard reset from peer
+        is_reset_from_peer = true;
+	if (policy.resetcheck)
+	  existing->was_session_reset(); // this resets out_queue, msg_ and connect_seq #'s
+	goto replace;
+      }
+
+      if (connect.connect_seq < existing->connect_seq) {
+	// old attempt, or we sent READY but they didn't get it.
+	ldout(msgr->cct,10) << "accept existing " << existing << ".cseq " << existing->connect_seq
+			    << " > " << connect.connect_seq << ", RETRY_SESSION" << dendl;
+	goto retry_session;
+      }
+
+      if (connect.connect_seq == existing->connect_seq) {
+	// if the existing connection successfully opened, and/or
+	// subsequently went to standby, then the peer should bump
+	// their connect_seq and retry: this is not a connection race
+	// we need to resolve here.
+	if (existing->state == STATE_OPEN ||
+	    existing->state == STATE_STANDBY) {
+	  ldout(msgr->cct,10) << "accept connection race, existing " << existing
+			      << ".cseq " << existing->connect_seq
+			      << " == " << connect.connect_seq
+			      << ", OPEN|STANDBY, RETRY_SESSION" << dendl;
+	  goto retry_session;
+	}
+
+	// connection race?
+	if (peer_addr < msgr->my_addr ||
+	    existing->policy.server) {
+	  // incoming wins
+	  ldout(msgr->cct,10) << "accept connection race, existing " << existing << ".cseq " << existing->connect_seq
+		   << " == " << connect.connect_seq << ", or we are server, replacing my attempt" << dendl;
+	  if (!(existing->state == STATE_CONNECTING ||
+		existing->state == STATE_WAIT))
+	    lderr(msgr->cct) << "accept race bad state, would replace, existing="
+			     << existing->get_state_name()
+			     << " " << existing << ".cseq=" << existing->connect_seq
+			     << " == " << connect.connect_seq
+			     << dendl;
+	  ceph_assert(existing->state == STATE_CONNECTING ||
+		 existing->state == STATE_WAIT);
+	  goto replace;
+	} else {
+	  // our existing outgoing wins
+	  ldout(msgr->cct,10) << "accept connection race, existing " << existing << ".cseq " << existing->connect_seq
+		   << " == " << connect.connect_seq << ", sending WAIT" << dendl;
+	  ceph_assert(peer_addr > msgr->my_addr);
+	  if (!(existing->state == STATE_CONNECTING))
+	    lderr(msgr->cct) << "accept race bad state, would send wait, existing="
+			     << existing->get_state_name()
+			     << " " << existing << ".cseq=" << existing->connect_seq
+			     << " == " << connect.connect_seq
+			     << dendl;
+	  ceph_assert(existing->state == STATE_CONNECTING);
+	  // make sure our outgoing connection will follow through
+	  existing->_send_keepalive();
+	  reply.tag = CEPH_MSGR_TAG_WAIT;
+	  existing->pipe_lock.Unlock();
+	  msgr->lock.Unlock();
+	  goto reply;
+	}
+      }
+
+      ceph_assert(connect.connect_seq > existing->connect_seq);
+      ceph_assert(connect.global_seq >= existing->peer_global_seq);
+      if (policy.resetcheck &&   // RESETSESSION only used by servers; peers do not reset each other
+	  existing->connect_seq == 0) {
+	ldout(msgr->cct,0) << "accept we reset (peer sent cseq " << connect.connect_seq 
+		 << ", " << existing << ".cseq = " << existing->connect_seq
+		 << "), sending RESETSESSION" << dendl;
+	reply.tag = CEPH_MSGR_TAG_RESETSESSION;
+	msgr->lock.Unlock();
+	existing->pipe_lock.Unlock();
+	goto reply;
+      }
+
+      // reconnect
+      ldout(msgr->cct,10) << "accept peer sent cseq " << connect.connect_seq
+	       << " > " << existing->connect_seq << dendl;
+      goto replace;
+    } // existing
+    else if (connect.connect_seq > 0) {
+      // we reset, and they are opening a new session
+      ldout(msgr->cct,0) << "accept we reset (peer sent cseq " << connect.connect_seq << "), sending RESETSESSION" << dendl;
+      msgr->lock.Unlock();
+      reply.tag = CEPH_MSGR_TAG_RESETSESSION;
+      goto reply;
+    } else {
+      // new session
+      ldout(msgr->cct,10) << "accept new session" << dendl;
+      existing = NULL;
+      goto open;
+    }
+    ceph_abort();
+
+  retry_session:
+    ceph_assert(existing->pipe_lock.is_locked());
+    ceph_assert(pipe_lock.is_locked());
+    reply.tag = CEPH_MSGR_TAG_RETRY_SESSION;
+    reply.connect_seq = existing->connect_seq + 1;
+    existing->pipe_lock.Unlock();
+    msgr->lock.Unlock();
+    goto reply;    
+
+  reply:
+    ceph_assert(pipe_lock.is_locked());
+    reply.features = ((uint64_t)connect.features & policy.features_supported) | policy.features_required;
+    reply.authorizer_len = authorizer_reply.length();
+    pipe_lock.Unlock();
+    r = tcp_write((char*)&reply, sizeof(reply));
+    if (r < 0)
+      goto fail_unlocked;
+    if (reply.authorizer_len) {
+      r = tcp_write(authorizer_reply.c_str(), authorizer_reply.length());
+      if (r < 0)
+	goto fail_unlocked;
+    }
+  }
+  
+ replace:
+  ceph_assert(existing->pipe_lock.is_locked());
+  ceph_assert(pipe_lock.is_locked());
+  // if it is a hard reset from peer, we don't need a round-trip to negotiate in/out sequence
+  if ((connect.features & CEPH_FEATURE_RECONNECT_SEQ) && !is_reset_from_peer) {
+    reply_tag = CEPH_MSGR_TAG_SEQ;
+    existing_seq = existing->in_seq;
+  }
+  ldout(msgr->cct,10) << "accept replacing " << existing << dendl;
+  existing->stop();
+  existing->unregister_pipe();
+  replaced = true;
+
+  if (existing->policy.lossy) {
+    // disconnect from the Connection
+    ceph_assert(existing->connection_state);
+    if (existing->connection_state->clear_pipe(existing))
+      msgr->dispatch_queue.queue_reset(existing->connection_state.get());
+  } else {
+    // queue a reset on the new connection, which we're dumping for the old
+    msgr->dispatch_queue.queue_reset(connection_state.get());
+
+    // drop my Connection, and take a ref to the existing one. do not
+    // clear existing->connection_state, since read_message and
+    // write_message both dereference it without pipe_lock.
+    connection_state = existing->connection_state;
+
+    // make existing Connection reference us
+    connection_state->reset_pipe(this);
+
+    if (existing->delay_thread) {
+      existing->delay_thread->steal_for_pipe(this);
+      delay_thread = existing->delay_thread;
+      existing->delay_thread = NULL;
+      delay_thread->flush();
+    }
+
+    // steal incoming queue
+    uint64_t replaced_conn_id = conn_id;
+    conn_id = existing->conn_id;
+    existing->conn_id = replaced_conn_id;
+
+    // reset the in_seq if this is a hard reset from peer,
+    // otherwise we respect our original connection's value
+    in_seq = is_reset_from_peer ? 0 : existing->in_seq;
+    in_seq_acked = in_seq;
+
+    // steal outgoing queue and out_seq
+    existing->requeue_sent();
+    out_seq = existing->out_seq;
+    ldout(msgr->cct,10) << "accept re-queuing on out_seq " << out_seq << " in_seq " << in_seq << dendl;
+    for (map<int, list<Message*> >::iterator p = existing->out_q.begin();
+         p != existing->out_q.end();
+         ++p)
+      out_q[p->first].splice(out_q[p->first].begin(), p->second);
+  }
+  existing->stop_and_wait();
+  existing->pipe_lock.Unlock();
+
+ open:
+  // open
+  ceph_assert(pipe_lock.is_locked());
+  connect_seq = connect.connect_seq + 1;
+  peer_global_seq = connect.global_seq;
+  ceph_assert(state == STATE_ACCEPTING);
+  state = STATE_OPEN;
+  ldout(msgr->cct,10) << "accept success, connect_seq = " << connect_seq << ", sending READY" << dendl;
+
+  // send READY reply
+  reply.tag = (reply_tag ? reply_tag : CEPH_MSGR_TAG_READY);
+  reply.features = policy.features_supported;
+  reply.global_seq = msgr->get_global_seq();
+  reply.connect_seq = connect_seq;
+  reply.flags = 0;
+  reply.authorizer_len = authorizer_reply.length();
+  if (policy.lossy)
+    reply.flags = reply.flags | CEPH_MSG_CONNECT_LOSSY;
+
+  connection_state->set_features((uint64_t)reply.features & (uint64_t)connect.features);
+  ldout(msgr->cct,10) << "accept features " << connection_state->get_features() << dendl;
+
+  session_security.reset(
+      get_auth_session_handler(msgr->cct,
+			       connect.authorizer_protocol,
+			       session_key,
+			       connection_state->get_features()));
+
+  // notify
+  msgr->dispatch_queue.queue_accept(connection_state.get());
+  msgr->ms_deliver_handle_fast_accept(connection_state.get());
+
+  // ok!
+  if (msgr->dispatch_queue.stop)
+    goto shutting_down;
+  removed = msgr->accepting_pipes.erase(this);
+  ceph_assert(removed == 1);
+  register_pipe();
+  msgr->lock.Unlock();
+  pipe_lock.Unlock();
+
+  r = tcp_write((char*)&reply, sizeof(reply));
+  if (r < 0) {
+    goto fail_registered;
+  }
+
+  if (reply.authorizer_len) {
+    r = tcp_write(authorizer_reply.c_str(), authorizer_reply.length());
+    if (r < 0) {
+      goto fail_registered;
+    }
+  }
+
+  if (reply_tag == CEPH_MSGR_TAG_SEQ) {
+    if (tcp_write((char*)&existing_seq, sizeof(existing_seq)) < 0) {
+      ldout(msgr->cct,2) << "accept write error on in_seq" << dendl;
+      goto fail_registered;
+    }
+    if (tcp_read((char*)&newly_acked_seq, sizeof(newly_acked_seq)) < 0) {
+      ldout(msgr->cct,2) << "accept read error on newly_acked_seq" << dendl;
+      goto fail_registered;
+    }
+  }
+
+  pipe_lock.Lock();
+  discard_requeued_up_to(newly_acked_seq);
+  if (state != STATE_CLOSED) {
+    ldout(msgr->cct,10) << "accept starting writer, state " << get_state_name() << dendl;
+    start_writer();
+  }
+  ldout(msgr->cct,20) << "accept done" << dendl;
+
+  maybe_start_delay_thread();
+
+  return 0;   // success.
+
+ fail_registered:
+  ldout(msgr->cct, 10) << "accept fault after register" << dendl;
+
+  if (msgr->cct->_conf->ms_inject_internal_delays) {
+    ldout(msgr->cct, 10) << " sleep for " << msgr->cct->_conf->ms_inject_internal_delays << dendl;
+    utime_t t;
+    t.set_from_double(msgr->cct->_conf->ms_inject_internal_delays);
+    t.sleep();
+  }
+
+ fail_unlocked:
+  pipe_lock.Lock();
+  if (state != STATE_CLOSED) {
+    bool queued = is_queued();
+    ldout(msgr->cct, 10) << "  queued = " << (int)queued << dendl;
+    if (queued) {
+      state = policy.server ? STATE_STANDBY : STATE_CONNECTING;
+    } else if (replaced) {
+      state = STATE_STANDBY;
+    } else {
+      state = STATE_CLOSED;
+      state_closed = true;
+    }
+    fault();
+    if (queued || replaced)
+      start_writer();
+  }
+  return -1;
+
+ shutting_down:
+  msgr->lock.Unlock();
+ shutting_down_msgr_unlocked:
+  ceph_assert(pipe_lock.is_locked());
+
+  if (msgr->cct->_conf->ms_inject_internal_delays) {
+    ldout(msgr->cct, 10) << " sleep for " << msgr->cct->_conf->ms_inject_internal_delays << dendl;
+    utime_t t;
+    t.set_from_double(msgr->cct->_conf->ms_inject_internal_delays);
+    t.sleep();
+  }
+
+  state = STATE_CLOSED;
+  state_closed = true;
+  fault();
+  return -1;
+}
+
+/**
+ * Apply the configured socket options to this pipe's descriptor (sd).
+ *
+ * Options handled, in order: TCP_NODELAY (ms_tcp_nodelay), SO_RCVBUF
+ * (ms_tcp_rcvbuf), SO_NOSIGPIPE (when built with CEPH_USE_SO_NOSIGPIPE),
+ * and, when SO_PRIORITY exists and the messenger reports a non-negative
+ * socket priority, the ToS/traffic class followed by SO_PRIORITY.
+ *
+ * A failing setsockopt() is only logged; nothing is reported to the caller.
+ */
+void Pipe::set_socket_options()
+{
+  const auto& conf = msgr->cct->_conf;
+
+  // disable Nagle algorithm?
+  if (conf->ms_tcp_nodelay) {
+    int on = 1;
+    if (::setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char*)&on, sizeof(on)) < 0) {
+      const int err = -errno;
+      ldout(msgr->cct,0) << "couldn't set TCP_NODELAY: "
+                         << cpp_strerror(err) << dendl;
+    }
+  }
+
+  // explicit receive buffer size?
+  if (conf->ms_tcp_rcvbuf) {
+    int rcvbuf = conf->ms_tcp_rcvbuf;
+    if (::setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (void*)&rcvbuf, sizeof(rcvbuf)) < 0) {
+      const int err = -errno;
+      ldout(msgr->cct,0) << "couldn't set SO_RCVBUF to " << rcvbuf
+                         << ": " << cpp_strerror(err) << dendl;
+    }
+  }
+
+  // suppress SIGPIPE on this socket
+#ifdef CEPH_USE_SO_NOSIGPIPE
+  int nosigpipe = 1;
+  if (::setsockopt(sd, SOL_SOCKET, SO_NOSIGPIPE, (void*)&nosigpipe, sizeof(nosigpipe))) {
+    const int err = -errno;
+    ldout(msgr->cct,0) << "couldn't set SO_NOSIGPIPE: "
+                       << cpp_strerror(err) << dendl;
+  }
+#endif
+
+#ifdef SO_PRIORITY
+  int prio = msgr->get_socket_priority();
+  if (prio < 0)
+    return;   // no priority requested; nothing further to set
+
+#ifdef IPTOS_CLASS_CS6
+  int iptos = IPTOS_CLASS_CS6;
+  // Use the peer's address family when we know the peer address, otherwise
+  // fall back to the family of our own legacy address.
+  const int addr_family = peer_addr.is_blank_ip()
+    ? msgr->get_myaddr_legacy().get_family()
+    : peer_addr.get_family();
+
+  int tos_rc = -1;
+  if (addr_family == AF_INET) {
+    tos_rc = ::setsockopt(sd, IPPROTO_IP, IP_TOS, &iptos, sizeof(iptos));
+  } else if (addr_family == AF_INET6) {
+    tos_rc = ::setsockopt(sd, IPPROTO_IPV6, IPV6_TCLASS, &iptos, sizeof(iptos));
+  } else {
+    // Unknown family: log and give up (SO_PRIORITY is not attempted either).
+    lderr(msgr->cct) << "couldn't set ToS of unknown family ("
+                     << addr_family << ")"
+                     << " to " << iptos << dendl;
+    return;
+  }
+  if (tos_rc < 0) {
+    const int err = -errno;
+    ldout(msgr->cct,0) << "couldn't set TOS to " << iptos
+                       << ": " << cpp_strerror(err) << dendl;
+  }
+#endif
+  // setsockopt(IPTOS_CLASS_CS6) sets the priority of the socket as 0.
+  // See http://goo.gl/QWhvsD and http://goo.gl/laTbjT
+  // We need to call setsockopt(SO_PRIORITY) after it.
+  if (::setsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio)) < 0) {
+    const int err = -errno;
+    ldout(msgr->cct,0) << "couldn't set SO_PRIORITY to " << prio
+                       << ": " << cpp_strerror(err) << dendl;
+  }
+#endif
+}
+
+/**
+ * Open a fresh socket to peer_addr and negotiate a session with the peer.
+ *
+ * Locking: must be entered with pipe_lock held (asserted).  The lock is
+ * dropped while the socket is set up and the handshake is exchanged, and is
+ * re-taken before pipe state is inspected or changed.  Every return path
+ * leaves with pipe_lock held.
+ *
+ * Sequence: join the reader thread, replace the socket, optionally bind,
+ * ::connect(), exchange banners, read the peer's address and our address as
+ * the peer sees it, send our address, then loop sending ceph_msg_connect and
+ * acting on the reply tag until the session opens, must be retried with new
+ * sequence numbers, or fails.
+ *
+ * @return 0 once the session reached STATE_OPEN.  A negative value when the
+ *         attempt faulted (a negative errno where one was captured, otherwise
+ *         -1).  When the attempt is abandoned without a fault (peer replied
+ *         WAIT, or the pipe left STATE_CONNECTING while unlocked) the return
+ *         code of the last socket call is passed through unchanged.
+ */
+int Pipe::connect()
+{
+  ldout(msgr->cct,10) << "connect " << connect_seq << dendl;
+  ceph_assert(pipe_lock.is_locked());
+
+  __u32 cseq = connect_seq;
+  __u32 gseq = msgr->get_global_seq();
+
+  // stop reader thread
+  join_reader();
+
+  pipe_lock.Unlock();
+
+  int rc = -1;
+  struct msghdr msg;
+  struct iovec msgvec[2];
+  int msglen;
+  char banner[strlen(CEPH_BANNER) + 1];  // extra byte makes coverity happy
+  entity_addr_t paddr;
+  entity_addr_t peer_addr_for_me;
+  AuthAuthorizer *authorizer = NULL;
+  bufferlist addrbl, myaddrbl;
+  const auto& conf = msgr->cct->_conf;
+
+  // close old socket.  this is safe because we stopped the reader thread above.
+  if (sd >= 0)
+    ::close(sd);
+
+  // create socket?
+  sd = socket_cloexec(peer_addr.get_family(), SOCK_STREAM, 0);
+  if (sd < 0) {
+    int e = errno;
+    lderr(msgr->cct) << "connect couldn't create socket " << cpp_strerror(e) << dendl;
+    rc = -e;
+    goto fail;
+  }
+
+  recv_reset();
+
+  set_socket_options();
+
+  {
+    // Optionally bind to our own address (any port) before connecting.
+    entity_addr_t addr2bind = msgr->get_myaddr_legacy();
+    if (msgr->cct->_conf->ms_bind_before_connect && (!addr2bind.is_blank_ip())) {
+      addr2bind.set_port(0);
+      int r = ::bind(sd , addr2bind.get_sockaddr(), addr2bind.get_sockaddr_len());
+      if (r < 0) {
+        ldout(msgr->cct,2) << "client bind error " << ", " << cpp_strerror(errno) << dendl;
+        goto fail;
+      }
+    }
+  }
+
+  // connect!
+  ldout(msgr->cct,10) << "connecting to " << peer_addr << dendl;
+  rc = ::connect(sd, peer_addr.get_sockaddr(), peer_addr.get_sockaddr_len());
+  if (rc < 0) {
+    int stored_errno = errno;
+    ldout(msgr->cct,2) << "connect error " << peer_addr
+                       << ", " << cpp_strerror(stored_errno) << dendl;
+    if (stored_errno == ECONNREFUSED) {
+      ldout(msgr->cct, 2) << "connection refused!" << dendl;
+      msgr->dispatch_queue.queue_refused(connection_state.get());
+    }
+    goto fail;
+  }
+
+  // verify banner
+  // FIXME: this should be non-blocking, or in some other way verify the banner as we get it.
+  rc = tcp_read((char*)&banner, strlen(CEPH_BANNER));
+  if (rc < 0) {
+    ldout(msgr->cct,2) << "connect couldn't read banner, " << cpp_strerror(rc) << dendl;
+    goto fail;
+  }
+  if (memcmp(banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
+    ldout(msgr->cct,0) << "connect protocol error (bad banner) on peer " << peer_addr << dendl;
+    goto fail;
+  }
+
+  // echo the banner back to the peer
+  memset(&msg, 0, sizeof(msg));
+  msgvec[0].iov_base = banner;
+  msgvec[0].iov_len = strlen(CEPH_BANNER);
+  msg.msg_iov = msgvec;
+  msg.msg_iovlen = 1;
+  msglen = msgvec[0].iov_len;
+  rc = do_sendmsg(&msg, msglen);
+  if (rc < 0) {
+    ldout(msgr->cct,2) << "connect couldn't write my banner, " << cpp_strerror(rc) << dendl;
+    goto fail;
+  }
+
+  // identify peer
+  {
+    // room for two encoded addresses: the peer's own, and ours as it sees us
+#if defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__)
+    bufferptr p(sizeof(ceph_entity_addr) * 2);
+#else
+    int wirelen = sizeof(__u32) * 2 + sizeof(ceph_sockaddr_storage);
+    bufferptr p(wirelen * 2);
+#endif
+    addrbl.push_back(std::move(p));
+  }
+  rc = tcp_read(addrbl.c_str(), addrbl.length());
+  if (rc < 0) {
+    ldout(msgr->cct,2) << "connect couldn't read peer addrs, " << cpp_strerror(rc) << dendl;
+    goto fail;
+  }
+  try {
+    auto p = addrbl.cbegin();
+    decode(paddr, p);
+    decode(peer_addr_for_me, p);
+  }
+  catch (buffer::error& e) {
+    ldout(msgr->cct,2) << "connect couldn't decode peer addrs: " << e.what()
+                       << dendl;
+    goto fail;
+  }
+  port = peer_addr_for_me.get_port();
+
+  ldout(msgr->cct,20) << "connect read peer addr " << paddr << " on socket " << sd << dendl;
+  if (peer_addr != paddr) {
+    // A blank IP with matching port and nonce is tolerated; anything else is
+    // a different endpoint than the one we meant to reach.
+    if (paddr.is_blank_ip() &&
+        peer_addr.get_port() == paddr.get_port() &&
+        peer_addr.get_nonce() == paddr.get_nonce()) {
+      ldout(msgr->cct,0) << "connect claims to be "
+                         << paddr << " not " << peer_addr << " - presumably this is the same node!" << dendl;
+    } else {
+      ldout(msgr->cct,10) << "connect claims to be "
+                          << paddr << " not " << peer_addr << dendl;
+      goto fail;
+    }
+  }
+
+  ldout(msgr->cct,20) << "connect peer addr for me is " << peer_addr_for_me << dendl;
+
+  msgr->learned_addr(peer_addr_for_me);
+
+  encode(msgr->my_addr, myaddrbl, 0);  // legacy
+
+  memset(&msg, 0, sizeof(msg));
+  msgvec[0].iov_base = myaddrbl.c_str();
+  msgvec[0].iov_len = myaddrbl.length();
+  msg.msg_iov = msgvec;
+  msg.msg_iovlen = 1;
+  msglen = msgvec[0].iov_len;
+  rc = do_sendmsg(&msg, msglen);
+  if (rc < 0) {
+    ldout(msgr->cct,2) << "connect couldn't write my addr, " << cpp_strerror(rc) << dendl;
+    goto fail;
+  }
+  ldout(msgr->cct,10) << "connect sent my addr " << msgr->my_addr << dendl;
+
+
+  // Negotiation loop: each iteration sends one ceph_msg_connect and handles
+  // one reply.  `continue` means "send again with updated state"; the lock is
+  // not held at the top of an iteration.
+  while (1) {
+    if (!authorizer) {
+      authorizer = msgr->ms_deliver_get_authorizer(peer_type);
+    }
+    bufferlist authorizer_reply;
+
+    ceph_msg_connect connect;
+    connect.features = policy.features_supported;
+    connect.host_type = msgr->get_myname().type();
+    connect.global_seq = gseq;
+    connect.connect_seq = cseq;
+    connect.protocol_version = msgr->get_proto_version(peer_type, true);
+    connect.authorizer_protocol = authorizer ? authorizer->protocol : 0;
+    connect.authorizer_len = authorizer ? authorizer->bl.length() : 0;
+    if (authorizer)
+      ldout(msgr->cct,10) << "connect.authorizer_len=" << connect.authorizer_len
+                          << " protocol=" << connect.authorizer_protocol << dendl;
+    connect.flags = 0;
+    if (policy.lossy)
+      connect.flags |= CEPH_MSG_CONNECT_LOSSY;  // this is fyi, actually, server decides!
+    memset(&msg, 0, sizeof(msg));
+    msgvec[0].iov_base = (char*)&connect;
+    msgvec[0].iov_len = sizeof(connect);
+    msg.msg_iov = msgvec;
+    msg.msg_iovlen = 1;
+    msglen = msgvec[0].iov_len;
+    if (authorizer) {
+      // the authorizer payload rides along as a second iovec
+      msgvec[1].iov_base = authorizer->bl.c_str();
+      msgvec[1].iov_len = authorizer->bl.length();
+      msg.msg_iovlen++;
+      msglen += msgvec[1].iov_len;
+    }
+
+    ldout(msgr->cct,10) << "connect sending gseq=" << gseq << " cseq=" << cseq
+                        << " proto=" << connect.protocol_version << dendl;
+    rc = do_sendmsg(&msg, msglen);
+    if (rc < 0) {
+      ldout(msgr->cct,2) << "connect couldn't write gseq, cseq, " << cpp_strerror(rc) << dendl;
+      goto fail;
+    }
+
+    ldout(msgr->cct,20) << "connect wrote (self +) cseq, waiting for reply" << dendl;
+    ceph_msg_connect_reply reply;
+    rc = tcp_read((char*)&reply, sizeof(reply));
+    if (rc < 0) {
+      ldout(msgr->cct,2) << "connect read reply " << cpp_strerror(rc) << dendl;
+      goto fail;
+    }
+
+    ldout(msgr->cct,20) << "connect got reply tag " << (int)reply.tag
+                        << " connect_seq " << reply.connect_seq
+                        << " global_seq " << reply.global_seq
+                        << " proto " << reply.protocol_version
+                        << " flags " << (int)reply.flags
+                        << " features " << reply.features
+                        << dendl;
+
+    authorizer_reply.clear();
+
+    if (reply.authorizer_len) {
+      ldout(msgr->cct,10) << "reply.authorizer_len=" << reply.authorizer_len << dendl;
+      bufferptr bp = buffer::create(reply.authorizer_len);
+      rc = tcp_read(bp.c_str(), reply.authorizer_len);
+      if (rc < 0) {
+        ldout(msgr->cct,10) << "connect couldn't read connect authorizer_reply" << cpp_strerror(rc) << dendl;
+        goto fail;
+      }
+      authorizer_reply.push_back(bp);
+    }
+
+    if (reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
+      if (!authorizer) {
+        // We never offered an authorizer, so there is nothing to challenge.
+        // Treat this as a protocol error rather than dereferencing NULL.
+        ldout(msgr->cct,0) << "connect got authorizer challenge but have no authorizer" << dendl;
+        goto fail;
+      }
+      authorizer->add_challenge(msgr->cct, authorizer_reply);
+      ldout(msgr->cct,10) << " got authorizer challenge, " << authorizer_reply.length()
+                          << " bytes" << dendl;
+      continue;
+    }
+
+    if (authorizer) {
+      auto iter = authorizer_reply.cbegin();
+      if (!authorizer->verify_reply(iter, nullptr /* connection_secret */)) {
+        ldout(msgr->cct,0) << "failed verifying authorize reply" << dendl;
+        goto fail;
+      }
+    }
+
+    if (conf->ms_inject_internal_delays) {
+      ldout(msgr->cct, 10) << " sleep for " << msgr->cct->_conf->ms_inject_internal_delays << dendl;
+      utime_t t;
+      t.set_from_double(msgr->cct->_conf->ms_inject_internal_delays);
+      t.sleep();
+    }
+
+    // From here on the reply is acted upon with pipe_lock held.
+    pipe_lock.Lock();
+    if (state != STATE_CONNECTING) {
+      // NOTE: this branch is taken for any reply tag, not only RESETSESSION;
+      // the log text is kept as-is.
+      ldout(msgr->cct,0) << "connect got RESETSESSION but no longer connecting" << dendl;
+      goto stop_locked;
+    }
+
+    if (reply.tag == CEPH_MSGR_TAG_FEATURES) {
+      ldout(msgr->cct,0) << "connect protocol feature mismatch, my " << std::hex
+                         << connect.features << " < peer " << reply.features
+                         << " missing " << (reply.features & ~policy.features_supported)
+                         << std::dec << dendl;
+      goto fail_locked;
+    }
+
+    if (reply.tag == CEPH_MSGR_TAG_BADPROTOVER) {
+      ldout(msgr->cct,0) << "connect protocol version mismatch, my " << connect.protocol_version
+                         << " != " << reply.protocol_version << dendl;
+      goto fail_locked;
+    }
+
+    if (reply.tag == CEPH_MSGR_TAG_BADAUTHORIZER) {
+      ldout(msgr->cct,0) << "connect got BADAUTHORIZER" << dendl;
+      goto fail_locked;
+    }
+    if (reply.tag == CEPH_MSGR_TAG_RESETSESSION) {
+      ldout(msgr->cct,0) << "connect got RESETSESSION" << dendl;
+      was_session_reset();
+      cseq = 0;
+      pipe_lock.Unlock();
+      continue;
+    }
+    if (reply.tag == CEPH_MSGR_TAG_RETRY_GLOBAL) {
+      gseq = msgr->get_global_seq(reply.global_seq);
+      ldout(msgr->cct,10) << "connect got RETRY_GLOBAL " << reply.global_seq
+                          << " chose new " << gseq << dendl;
+      pipe_lock.Unlock();
+      continue;
+    }
+    if (reply.tag == CEPH_MSGR_TAG_RETRY_SESSION) {
+      ceph_assert(reply.connect_seq > connect_seq);
+      ldout(msgr->cct,10) << "connect got RETRY_SESSION " << connect_seq
+                          << " -> " << reply.connect_seq << dendl;
+      cseq = connect_seq = reply.connect_seq;
+      pipe_lock.Unlock();
+      continue;
+    }
+
+    if (reply.tag == CEPH_MSGR_TAG_WAIT) {
+      ldout(msgr->cct,3) << "connect got WAIT (connection race)" << dendl;
+      state = STATE_WAIT;
+      goto stop_locked;
+    }
+
+    if (reply.tag == CEPH_MSGR_TAG_READY ||
+        reply.tag == CEPH_MSGR_TAG_SEQ) {
+      uint64_t feat_missing = policy.features_required & ~(uint64_t)reply.features;
+      if (feat_missing) {
+        ldout(msgr->cct,1) << "missing required features " << std::hex << feat_missing << std::dec << dendl;
+        goto fail_locked;
+      }
+
+      if (reply.tag == CEPH_MSGR_TAG_SEQ) {
+        // Sequence exchange: the peer tells us how far it received, we drop
+        // what it already has, then tell it our in_seq.  Note that this
+        // socket I/O happens while pipe_lock is held.
+        ldout(msgr->cct,10) << "got CEPH_MSGR_TAG_SEQ, reading acked_seq and writing in_seq" << dendl;
+        uint64_t newly_acked_seq = 0;
+        rc = tcp_read((char*)&newly_acked_seq, sizeof(newly_acked_seq));
+        if (rc < 0) {
+          ldout(msgr->cct,2) << "connect read error on newly_acked_seq" << cpp_strerror(rc) << dendl;
+          goto fail_locked;
+        }
+        ldout(msgr->cct,2) << " got newly_acked_seq " << newly_acked_seq
+                           << " vs out_seq " << out_seq << dendl;
+        while (newly_acked_seq > out_seq) {
+          Message *m = _get_next_outgoing();
+          ceph_assert(m);
+          ldout(msgr->cct,2) << " discarding previously sent " << m->get_seq()
+                             << " " << *m << dendl;
+          ceph_assert(m->get_seq() <= newly_acked_seq);
+          m->put();
+          ++out_seq;
+        }
+        if (tcp_write((char*)&in_seq, sizeof(in_seq)) < 0) {
+          ldout(msgr->cct,2) << "connect write error on in_seq" << dendl;
+          goto fail_locked;
+        }
+      }
+
+      // hooray!
+      peer_global_seq = reply.global_seq;
+      policy.lossy = reply.flags & CEPH_MSG_CONNECT_LOSSY;
+      state = STATE_OPEN;
+      connect_seq = cseq + 1;
+      ceph_assert(connect_seq == reply.connect_seq);
+      backoff = utime_t();
+      connection_state->set_features((uint64_t)reply.features & (uint64_t)connect.features);
+      ldout(msgr->cct,10) << "connect success " << connect_seq << ", lossy = " << policy.lossy
+                          << ", features " << connection_state->get_features() << dendl;
+
+
+      // If we have an authorizer, get a new AuthSessionHandler to deal with ongoing security of the
+      // connection.  PLR
+
+      if (authorizer != NULL) {
+        session_security.reset(
+            get_auth_session_handler(
+              msgr->cct,
+              authorizer->protocol,
+              authorizer->session_key,
+              connection_state->get_features()));
+      }  else {
+        // We have no authorizer, so we shouldn't be applying security to messages in this pipe.  PLR
+        session_security.reset();
+      }
+
+      msgr->dispatch_queue.queue_connect(connection_state.get());
+      msgr->ms_deliver_handle_fast_connect(connection_state.get());
+
+      if (!reader_running) {
+        ldout(msgr->cct,20) << "connect starting reader" << dendl;
+        start_reader();
+      }
+      maybe_start_delay_thread();
+      delete authorizer;
+      return 0;
+    }
+
+    // protocol error: report the tag the peer actually sent
+    ldout(msgr->cct,0) << "connect got bad tag " << (int)reply.tag << dendl;
+    goto fail_locked;
+  }
+
+ fail:
+  if (conf->ms_inject_internal_delays) {
+    ldout(msgr->cct, 10) << " sleep for " << msgr->cct->_conf->ms_inject_internal_delays << dendl;
+    utime_t t;
+    t.set_from_double(msgr->cct->_conf->ms_inject_internal_delays);
+    t.sleep();
+  }
+
+  pipe_lock.Lock();
+ fail_locked:
+  // Several fault paths arrive here right after a socket call that succeeded
+  // (bad banner, undecodable or mismatched address, rejected authorizer
+  // reply, FEATURES/BADPROTOVER/BADAUTHORIZER, bad tag), which leaves
+  // rc >= 0.  Never report a fault as success.
+  if (rc >= 0)
+    rc = -1;
+  if (state == STATE_CONNECTING)
+    fault();
+  else
+    ldout(msgr->cct,3) << "connect fault, but state = " << get_state_name()
+                       << " != connecting, stopping" << dendl;
+
+ stop_locked:
+  delete authorizer;
+  return rc;
+}
+
+// Enter this Pipe into the messenger's rank_pipe map, keyed by peer_addr.
+// The messenger lock must be held, and _lookup_pipe() must not already find
+// a pipe for that address; both conditions are asserted.
+void Pipe::register_pipe()
+{
+  ldout(msgr->cct,10) << "register_pipe" << dendl;
+  ceph_assert(msgr->lock.is_locked());
+
+  Pipe* const prior = msgr->_lookup_pipe(peer_addr);
+  ceph_assert(prior == nullptr);
+
+  msgr->rank_pipe[peer_addr] = this;
+}
+
+// Remove this Pipe from the messenger's bookkeeping.  The messenger lock
+// must be held (asserted).  If rank_pipe maps peer_addr to this very Pipe,
+// that entry is erased; otherwise the Pipe is erased from accepting_pipes.
+void Pipe::unregister_pipe()
+{
+  ceph_assert(msgr->lock.is_locked());
+
+  auto slot = msgr->rank_pipe.find(peer_addr);
+  const bool registered_here =
+    (slot != msgr->rank_pipe.end()) && (slot->second == this);
+
+  if (!registered_here) {
+    ldout(msgr->cct,10) << "unregister_pipe - not registered" << dendl;
+    msgr->accepting_pipes.erase(this);  // somewhat overkill, but safe.
+    return;
+  }
+
+  ldout(msgr->cct,10) << "unregister_pipe" << dendl;
+  msgr->rank_pipe.erase(slot);
+}
+
+// Wait for this Pipe's helper threads to finish: the writer, then the
+// reader (each only if it was started), and finally the delay thread, which
+// is asked to stop before it is joined.
+void Pipe::join()
+{
+  ldout(msgr->cct, 20) << "join" << dendl;
+
+  if (writer_thread.is_started()) {
+    writer_thread.join();
+  }
+  if (reader_thread.is_started()) {
+    reader_thread.join();
+  }
+
+  if (!delay_thread) {
+    return;
+  }
+  ldout(msgr->cct, 20) << "joining delay_thread" << dendl;
+  delay_thread->stop();
+  delay_thread->join();
+}
+
+// Move every message on the `sent` list back to the front of the
+// highest-priority out queue, newest first so the original order is kept,
+// and roll out_seq back by one per message.
+void Pipe::requeue_sent()
+{
+  if (sent.empty())
+    return;   // nothing to do; do not create the queue entry
+
+  list<Message*>& resend_q = out_q[CEPH_MSG_PRIO_HIGHEST];
+  for (; !sent.empty(); sent.pop_back(), --out_seq) {
+    Message *msg = sent.back();
+    ldout(msgr->cct,10) << "requeue_sent " << *msg << " for resend seq " << out_seq
+                        << " (" << msg->get_seq() << ")" << dendl;
+    resend_q.push_front(msg);
+  }
+}
+
+// Drop messages at the head of the highest-priority out queue whose sequence
+// number is non-zero and <= seq, advancing out_seq once per dropped message.
+// If there is no highest-priority queue at all, out_seq is simply set to seq.
+// The queue entry is removed from out_q when it ends up empty.
+void Pipe::discard_requeued_up_to(uint64_t seq)
+{
+  ldout(msgr->cct, 10) << "discard_requeued_up_to " << seq << dendl;
+
+  auto top = out_q.find(CEPH_MSG_PRIO_HIGHEST);
+  if (top == out_q.end()) {
+    out_seq = seq;
+    return;
+  }
+
+  list<Message*>& requeued = top->second;
+  while (!requeued.empty()) {
+    Message *head = requeued.front();
+    const auto head_seq = head->get_seq();
+    // stop at the first message that has no sequence number yet or that is
+    // newer than seq
+    if (head_seq == 0 || head_seq > seq)
+      break;
+    ldout(msgr->cct,10) << "discard_requeued_up_to " << *head << " for resend seq " << out_seq
+                        << " <= " << seq << ", discarding" << dendl;
+    head->put();
+    requeued.pop_front();
+    out_seq++;
+  }
+
+  if (requeued.empty())
+    out_q.erase(top);
+}
+
+/*
+ * Drop every outgoing message this Pipe still holds: put() each message on
+ * the `sent` list and on every priority list in out_q, then clear both
+ * containers.
+ * Must hold pipe_lock prior to calling.
+ */
+void Pipe::discard_out_queue()
+{
+  ldout(msgr->cct,10) << "discard_queue" << dendl;
+
+  // messages already written to the socket
+  for (Message *m : sent) {
+    ldout(msgr->cct,20) << "  discard " << m << dendl;
+    m->put();
+  }
+  sent.clear();
+
+  // messages still waiting, across all priorities
+  for (auto& prio_and_msgs : out_q) {
+    for (Message *m : prio_and_msgs.second) {
+      ldout(msgr->cct,20) << "  discard " << m << dendl;
+      m->put();
+    }
+  }
+  out_q.clear();
+}
+
+// Handle a failure on this Pipe.  The caller must hold pipe_lock (asserted).
+// onread is true when the reader thread is reporting the fault.
+//
+// Outcomes, in the order they are checked:
+//  - reader fault while STATE_CONNECTING: nothing to do, return.
+//  - already CLOSED/CLOSING: detach from the Connection and queue a reset.
+//  - lossy policy (and not connecting): stop the Pipe, unregister it, discard
+//    all queued messages and queue a reset.
+//  - otherwise: requeue sent messages and either go to STATE_STANDBY, switch
+//    to STATE_CONNECTING, or (if already connecting) wait with a backoff that
+//    doubles on each call up to ms_max_backoff.
+void Pipe::fault(bool onread)
+{
+  const auto& conf = msgr->cct->_conf;
+  ceph_assert(pipe_lock.is_locked());
+  cond.Signal();
+
+  if (onread && state == STATE_CONNECTING) {
+    ldout(msgr->cct,10) << "fault already connecting, reader shutting down" << dendl;
+    return;
+  }
+  
+  ldout(msgr->cct,2) << "fault " << cpp_strerror(errno) << dendl;
+
+  if (state == STATE_CLOSED ||
+      state == STATE_CLOSING) {
+    ldout(msgr->cct,10) << "fault already closed|closing" << dendl;
+    // only queue the reset if this Pipe was still the Connection's pipe
+    if (connection_state->clear_pipe(this))
+      msgr->dispatch_queue.queue_reset(connection_state.get());
+    return;
+  }
+
+  shutdown_socket();
+
+  // lossy channel?
+  if (policy.lossy && state != STATE_CONNECTING) {
+    ldout(msgr->cct,10) << "fault on lossy channel, failing" << dendl;
+
+    // disconnect from Connection, and mark it failed.  future messages
+    // will be dropped.
+    ceph_assert(connection_state);
+    stop();
+    bool cleared = connection_state->clear_pipe(this);
+
+    // crib locks, blech.  note that Pipe is now STATE_CLOSED and the
+    // rank_pipe entry is ignored by others.
+    pipe_lock.Unlock();
+
+    // optional debugging delay, taken while no lock is held
+    if (conf->ms_inject_internal_delays) {
+      ldout(msgr->cct, 10) << " sleep for " << msgr->cct->_conf->ms_inject_internal_delays << dendl;
+      utime_t t;
+      t.set_from_double(msgr->cct->_conf->ms_inject_internal_delays);
+      t.sleep();
+    }
+
+    // re-acquire in the order msgr->lock, then pipe_lock
+    msgr->lock.Lock();
+    pipe_lock.Lock();
+    unregister_pipe();
+    msgr->lock.Unlock();
+
+    if (delay_thread)
+      delay_thread->discard();
+    in_q->discard_queue(conn_id);
+    discard_out_queue();
+    if (cleared)
+      msgr->dispatch_queue.queue_reset(connection_state.get());
+    return;
+  }
+
+  // queue delayed items immediately
+  if (delay_thread)
+    delay_thread->flush();
+
+  // requeue sent items
+  requeue_sent();
+
+  if (policy.standby && !is_queued()) {
+    ldout(msgr->cct,0) << "fault with nothing to send, going to standby" << dendl;
+    state = STATE_STANDBY;
+    return;
+  }
+
+  if (state != STATE_CONNECTING) {
+    // first failure of an established session: pick the next state and
+    // reset the backoff
+    if (policy.server) {
+      ldout(msgr->cct,0) << "fault, server, going to standby" << dendl;
+      state = STATE_STANDBY;
+    } else {
+      ldout(msgr->cct,0) << "fault, initiating reconnect" << dendl;
+      connect_seq++;
+      state = STATE_CONNECTING;
+    }
+    backoff = utime_t();
+  } else if (backoff == utime_t()) {
+    // first failure while connecting: arm the backoff but do not wait yet
+    ldout(msgr->cct,0) << "fault" << dendl;
+    backoff.set_from_double(conf->ms_initial_backoff);
+  } else {
+    // repeated failure while connecting: wait (releasing pipe_lock inside
+    // the condition wait), then double the backoff up to ms_max_backoff
+    ldout(msgr->cct,10) << "fault waiting " << backoff << dendl;
+    cond.WaitInterval(pipe_lock, backoff);
+    backoff += backoff;
+    if (backoff > conf->ms_max_backoff)
+      backoff.set_from_double(conf->ms_max_backoff);
+    ldout(msgr->cct,10) << "fault done waiting or woke up" << dendl;
+  }
+}
+
+// Choose the starting out_seq for a session: a random value in
+// [0, SEQ_MASK] when the connection has CEPH_FEATURE_MSG_AUTH, else 0.
+void Pipe::randomize_out_seq()
+{
+  const bool has_msg_auth =
+    (connection_state->get_features() & CEPH_FEATURE_MSG_AUTH) != 0;
+  if (!has_msg_auth) {
+    // previously, seq #'s always started at 0.
+    out_seq = 0;
+    return;
+  }
+
+  // Set out_seq to a random value, so CRC won't be predictable.
+  out_seq = ceph::util::generate_random_number<uint64_t>(0, SEQ_MASK);
+  lsubdout(msgr->cct, ms, 10) << "randomize_out_seq " << out_seq << dendl;
+}
+
+// React to the peer having reset the session.  The caller must hold
+// pipe_lock (asserted).  Asks in_q to discard the queue for conn_id,
+// discards delayed and outgoing messages, queues a remote-reset event for
+// the Connection, picks a fresh out_seq and zeroes in_seq, in_seq_acked and
+// connect_seq.
+void Pipe::was_session_reset()
+{
+  ceph_assert(pipe_lock.is_locked());
+
+  ldout(msgr->cct,10) << "was_session_reset" << dendl;
+  in_q->discard_queue(conn_id);
+  if (delay_thread)
+    delay_thread->discard();
+  discard_out_queue();
+
+  msgr->dispatch_queue.queue_remote_reset(connection_state.get());
+
+  randomize_out_seq();
+
+  // restart the receive-side and connect sequence counters from zero
+  in_seq = 0;
+  in_seq_acked = 0;
+  connect_seq = 0;
+}
+
+// Mark the Pipe closed (state = STATE_CLOSED, state_closed = true), signal
+// cond so waiters re-check the state, and shut the socket down.
+// The caller must hold pipe_lock (asserted).
+void Pipe::stop()
+{
+  ldout(msgr->cct,10) << "stop" << dendl;
+  ceph_assert(pipe_lock.is_locked());
+  state = STATE_CLOSED;
+  state_closed = true;
+  cond.Signal();
+  shutdown_socket();
+}
+
+// Stop the Pipe (if it is not already STATE_CLOSED) and then block until the
+// reader thread is no longer in the middle of a fast dispatch.
+// The caller must hold pipe_lock (asserted); it is released temporarily while
+// the delay thread, if any, is told to stop fast dispatching, and inside the
+// condition wait.
+void Pipe::stop_and_wait()
+{
+  ceph_assert(pipe_lock.is_locked_by_me());
+  if (state != STATE_CLOSED)
+    stop();
+
+  // optional debugging delay (ms_inject_internal_delays)
+  if (msgr->cct->_conf->ms_inject_internal_delays) {
+    ldout(msgr->cct, 10) << __func__ << " sleep for "
+			 << msgr->cct->_conf->ms_inject_internal_delays
+			 << dendl;
+    utime_t t;
+    t.set_from_double(msgr->cct->_conf->ms_inject_internal_delays);
+    t.sleep();
+  }
+  
+  if (delay_thread) {
+    // pipe_lock is not held across stop_fast_dispatching()
+    pipe_lock.Unlock();
+    delay_thread->stop_fast_dispatching();
+    pipe_lock.Lock();
+  }
+  // reader() clears reader_dispatching and signals cond after fast_dispatch
+  while (reader_running &&
+	 reader_dispatching)
+    cond.Wait(pipe_lock);
+}
+
+/* read msgs from socket.
+ * also, server.
+ *
+ * Body of the reader thread.  If the Pipe starts in STATE_ACCEPTING it first
+ * runs accept().  It then loops until the state becomes STATE_CLOSED or
+ * STATE_CONNECTING: read a one-byte tag from the socket and handle it
+ * (keepalive, keepalive2, keepalive2 ack, ack, message, close).  pipe_lock is
+ * released while reading from the socket and re-acquired before any Pipe
+ * state is touched.  On exit the thread marks itself as not running and
+ * calls unlock_maybe_reap().
+ */
+void Pipe::reader()
+{
+  pipe_lock.Lock();
+
+  if (state == STATE_ACCEPTING) {
+    accept();
+    ceph_assert(pipe_lock.is_locked());
+  }
+
+  // loop.
+  while (state != STATE_CLOSED &&
+	 state != STATE_CONNECTING) {
+    ceph_assert(pipe_lock.is_locked());
+
+    // sleep if (re)connecting
+    if (state == STATE_STANDBY) {
+      ldout(msgr->cct,20) << "reader sleeping during reconnect|standby" << dendl;
+      cond.Wait(pipe_lock);
+      continue;
+    }
+
+    // get a reference to the AuthSessionHandler while we have the pipe_lock
+    std::shared_ptr<AuthSessionHandler> auth_handler = session_security;
+
+    pipe_lock.Unlock();
+
+    // every branch below re-takes pipe_lock before it continues or breaks
+    char tag = -1;
+    ldout(msgr->cct,20) << "reader reading tag..." << dendl;
+    if (tcp_read((char*)&tag, 1) < 0) {
+      pipe_lock.Lock();
+      ldout(msgr->cct,2) << "reader couldn't read tag, " << cpp_strerror(errno) << dendl;
+      fault(true);
+      continue;
+    }
+
+    if (tag == CEPH_MSGR_TAG_KEEPALIVE) {
+      ldout(msgr->cct,2) << "reader got KEEPALIVE" << dendl;
+      pipe_lock.Lock();
+      connection_state->set_last_keepalive(ceph_clock_now());
+      continue;
+    }
+    if (tag == CEPH_MSGR_TAG_KEEPALIVE2) {
+      ldout(msgr->cct,30) << "reader got KEEPALIVE2 tag ..." << dendl;
+      ceph_timespec t;
+      int rc = tcp_read((char*)&t, sizeof(t));
+      pipe_lock.Lock();
+      if (rc < 0) {
+	ldout(msgr->cct,2) << "reader couldn't read KEEPALIVE2 stamp "
+			   << cpp_strerror(errno) << dendl;
+	fault(true);
+      } else {
+	// remember the peer's stamp and wake the writer so it sends the ack
+	send_keepalive_ack = true;
+	keepalive_ack_stamp = utime_t(t);
+	ldout(msgr->cct,2) << "reader got KEEPALIVE2 " << keepalive_ack_stamp
+			   << dendl;
+	connection_state->set_last_keepalive(ceph_clock_now());
+	cond.Signal();
+      }
+      continue;
+    }
+    if (tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
+      ldout(msgr->cct,2) << "reader got KEEPALIVE_ACK" << dendl;
+      struct ceph_timespec t;
+      int rc = tcp_read((char*)&t, sizeof(t));
+      pipe_lock.Lock();
+      if (rc < 0) {
+	ldout(msgr->cct,2) << "reader couldn't read KEEPALIVE2 stamp " << cpp_strerror(errno) << dendl;
+	fault(true);
+      } else {
+	connection_state->set_last_keepalive_ack(utime_t(t));
+      }
+      continue;
+    }
+
+    // open ...
+    if (tag == CEPH_MSGR_TAG_ACK) {
+      ldout(msgr->cct,20) << "reader got ACK" << dendl;
+      ceph_le64 seq;
+      int rc = tcp_read((char*)&seq, sizeof(seq));
+      pipe_lock.Lock();
+      if (rc < 0) {
+	ldout(msgr->cct,2) << "reader couldn't read ack seq, " << cpp_strerror(errno) << dendl;
+	fault(true);
+      } else if (state != STATE_CLOSED) {
+        handle_ack(seq);
+      }
+      continue;
+    }
+
+    else if (tag == CEPH_MSGR_TAG_MSG) {
+      ldout(msgr->cct,20) << "reader got MSG" << dendl;
+      Message *m = 0;
+      int r = read_message(&m, auth_handler.get());
+
+      pipe_lock.Lock();
+      
+      // no message: either a read/decode failure (r < 0) or an aborted
+      // message (r == 0), which is silently skipped
+      if (!m) {
+	if (r < 0)
+	  fault(true);
+	continue;
+      }
+
+      m->trace.event("pipe read message");
+
+      // the state may have changed while pipe_lock was dropped
+      if (state == STATE_CLOSED ||
+	  state == STATE_CONNECTING) {
+	in_q->dispatch_throttle_release(m->get_dispatch_throttle_size());
+	m->put();
+	continue;
+      }
+
+      // check received seq#.  if it is old, drop the message.  
+      // note that incoming messages may skip ahead.  this is convenient for the client
+      // side queueing because messages can't be renumbered, but the (kernel) client will
+      // occasionally pull a message out of the sent queue to send elsewhere.  in that case
+      // it doesn't matter if we "got" it or not.
+      if (m->get_seq() <= in_seq) {
+	ldout(msgr->cct,0) << "reader got old message "
+		<< m->get_seq() << " <= " << in_seq << " " << m << " " << *m
+		<< ", discarding" << dendl;
+	in_q->dispatch_throttle_release(m->get_dispatch_throttle_size());
+	m->put();
+	if (connection_state->has_feature(CEPH_FEATURE_RECONNECT_SEQ) &&
+	    msgr->cct->_conf->ms_die_on_old_message)
+	  ceph_abort_msg("old msgs despite reconnect_seq feature");
+	continue;
+      }
+      if (m->get_seq() > in_seq + 1) {
+	ldout(msgr->cct,0) << "reader missed message?  skipped from seq "
+			   << in_seq << " to " << m->get_seq() << dendl;
+	if (msgr->cct->_conf->ms_die_on_skipped_message)
+	  ceph_abort_msg("skipped incoming seq");
+      }
+
+      m->set_connection(connection_state.get());
+
+      // note last received message.
+      in_seq = m->get_seq();
+
+      cond.Signal();  // wake up writer, to ack this
+      
+      ldout(msgr->cct,10) << "reader got message "
+	       << m->get_seq() << " " << m << " " << *m
+	       << dendl;
+      in_q->fast_preprocess(m);
+
+      if (delay_thread) {
+        // delay injection: with probability ms_inject_delay_probability the
+        // message is held until recv_stamp + a random fraction of
+        // ms_inject_delay_max; otherwise release stays default-constructed
+        utime_t release;
+        if (rand() % 10000 < msgr->cct->_conf->ms_inject_delay_probability * 10000.0) {
+          release = m->get_recv_stamp();
+          release += msgr->cct->_conf->ms_inject_delay_max * (double)(rand() % 10000) / 10000.0;
+          lsubdout(msgr->cct, ms, 1) << "queue_received will delay until " << release << " on " << m << " " << *m << dendl;
+        }
+        delay_thread->queue(release, m);
+      } else {
+        if (in_q->can_fast_dispatch(m)) {
+	  // fast dispatch runs without pipe_lock; reader_dispatching lets
+	  // stop_and_wait() wait for it to finish
+	  reader_dispatching = true;
+          pipe_lock.Unlock();
+          in_q->fast_dispatch(m);
+          pipe_lock.Lock();
+	  reader_dispatching = false;
+	  if (state == STATE_CLOSED ||
+	      notify_on_dispatch_done) { // there might be somebody waiting
+	    notify_on_dispatch_done = false;
+	    cond.Signal();
+	  }
+        } else {
+          in_q->enqueue(m, m->get_priority(), conn_id);
+        }
+      }
+    }
+    
+    else if (tag == CEPH_MSGR_TAG_CLOSE) {
+      ldout(msgr->cct,20) << "reader got CLOSE" << dendl;
+      pipe_lock.Lock();
+      // CLOSE while already CLOSING completes the close; otherwise move to
+      // CLOSING (the writer sends the CLOSE tag in that state)
+      if (state == STATE_CLOSING) {
+	state = STATE_CLOSED;
+	state_closed = true;
+      } else {
+	state = STATE_CLOSING;
+      }
+      cond.Signal();
+      break;
+    }
+    else {
+      ldout(msgr->cct,0) << "reader bad tag " << (int)tag << dendl;
+      pipe_lock.Lock();
+      fault(true);
+    }
+  }
+
+ 
+  // reap?
+  reader_running = false;
+  reader_needs_join = true;
+  unlock_maybe_reap();
+  ldout(msgr->cct,10) << "reader done" << dendl;
+}
+
+/* write msgs to socket.
+ * also, client.
+ *
+ * Body of the writer thread.  Loops until the state becomes STATE_CLOSED.
+ * Each pass handles, in order: leaving standby when there is something
+ * queued (non-server only), connecting, writing the CLOSE tag, and -- when
+ * the session is usable and there is work -- keepalives, keepalive acks, a
+ * message ack and one outgoing message.  If there was nothing to do it
+ * sleeps on cond.  pipe_lock is released around every socket write and
+ * re-acquired afterwards.  On exit the thread marks itself as not running
+ * and calls unlock_maybe_reap().
+ */
+void Pipe::writer()
+{
+  pipe_lock.Lock();
+  while (state != STATE_CLOSED) {// && state != STATE_WAIT) {
+    ldout(msgr->cct,10) << "writer: state = " << get_state_name()
+			<< " policy.server=" << policy.server << dendl;
+
+    // standby?
+    if (is_queued() && state == STATE_STANDBY && !policy.server)
+      state = STATE_CONNECTING;
+
+    // connect?
+    if (state == STATE_CONNECTING) {
+      ceph_assert(!policy.server);
+      connect();
+      continue;
+    }
+    
+    if (state == STATE_CLOSING) {
+      // write close tag
+      ldout(msgr->cct,20) << "writer writing CLOSE tag" << dendl;
+      char tag = CEPH_MSGR_TAG_CLOSE;
+      // the state is switched to CLOSED before the lock is dropped for the write
+      state = STATE_CLOSED;
+      state_closed = true;
+      pipe_lock.Unlock();
+      if (sd >= 0) {
+	// we can ignore return value, actually; we don't care if this succeeds.
+	int r = ::write(sd, &tag, 1);
+	(void)r;
+      }
+      pipe_lock.Lock();
+      continue;
+    }
+
+    // only write when the session is established and there is either a
+    // queued item or a received message that has not been acked yet
+    if (state != STATE_CONNECTING && state != STATE_WAIT && state != STATE_STANDBY &&
+	(is_queued() || in_seq > in_seq_acked)) {
+
+      // keepalive?
+      if (send_keepalive) {
+	int rc;
+	if (connection_state->has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) {
+	  pipe_lock.Unlock();
+	  rc = write_keepalive2(CEPH_MSGR_TAG_KEEPALIVE2,
+				ceph_clock_now());
+	} else {
+	  pipe_lock.Unlock();
+	  rc = write_keepalive();
+	}
+	pipe_lock.Lock();
+	if (rc < 0) {
+	  ldout(msgr->cct,2) << "writer couldn't write keepalive[2], "
+			     << cpp_strerror(errno) << dendl;
+	  fault();
+ 	  continue;
+	}
+	send_keepalive = false;
+      }
+      if (send_keepalive_ack) {
+	// copy the stamp while the lock is held; the reader may update it
+	utime_t t = keepalive_ack_stamp;
+	pipe_lock.Unlock();
+	int rc = write_keepalive2(CEPH_MSGR_TAG_KEEPALIVE2_ACK, t);
+	pipe_lock.Lock();
+	if (rc < 0) {
+	  ldout(msgr->cct,2) << "writer couldn't write keepalive_ack, " << cpp_strerror(errno) << dendl;
+	  fault();
+	  continue;
+	}
+	send_keepalive_ack = false;
+      }
+
+      // send ack?
+      if (in_seq > in_seq_acked) {
+	// snapshot in_seq: the reader may advance it while the lock is dropped
+	uint64_t send_seq = in_seq;
+	pipe_lock.Unlock();
+	int rc = write_ack(send_seq);
+	pipe_lock.Lock();
+	if (rc < 0) {
+	  ldout(msgr->cct,2) << "writer couldn't write ack, " << cpp_strerror(errno) << dendl;
+	  fault();
+ 	  continue;
+	}
+	in_seq_acked = send_seq;
+      }
+
+      // grab outgoing message
+      Message *m = _get_next_outgoing();
+      if (m) {
+	m->set_seq(++out_seq);
+	if (!policy.lossy) {
+	  // put on sent list
+	  // (with its own reference, so it can be requeued after a fault)
+	  sent.push_back(m); 
+	  m->get();
+	}
+
+	// associate message with Connection (for benefit of encode_payload)
+	m->set_connection(connection_state.get());
+
+	uint64_t features = connection_state->get_features();
+
+	if (m->empty_payload())
+	  ldout(msgr->cct,20) << "writer encoding " << m->get_seq() << " features " << features
+			      << " " << m << " " << *m << dendl;
+	else
+	  ldout(msgr->cct,20) << "writer half-reencoding " << m->get_seq() << " features " << features
+			      << " " << m << " " << *m << dendl;
+
+	// encode and copy out of *m
+	m->encode(features, msgr->crcflags);
+
+	// prepare everything
+	const ceph_msg_header& header = m->get_header();
+	const ceph_msg_footer& footer = m->get_footer();
+
+	// Now that we have all the crcs calculated, handle the
+	// digital signature for the message, if the pipe has session
+	// security set up.  Some session security options do not
+	// actually calculate and check the signature, but they should
+	// handle the calls to sign_message and check_signature.  PLR
+	if (session_security.get() == NULL) {
+	  ldout(msgr->cct, 20) << "writer no session security" << dendl;
+	} else {
+	  // a non-zero return from sign_message is logged but the message
+	  // is still sent
+	  if (session_security->sign_message(m)) {
+	    ldout(msgr->cct, 20) << "writer failed to sign seq # " << header.seq
+				 << "): sig = " << footer.sig << dendl;
+	  } else {
+	    ldout(msgr->cct, 20) << "writer signed seq # " << header.seq
+				 << "): sig = " << footer.sig << dendl;
+	  }
+	}
+
+	// gather payload, middle and data into one bufferlist for write_message
+	bufferlist blist = m->get_payload();
+	blist.append(m->get_middle());
+	blist.append(m->get_data());
+
+        pipe_lock.Unlock();
+
+        m->trace.event("pipe writing message");
+
+        ldout(msgr->cct,20) << "writer sending " << m->get_seq() << " " << m << dendl;
+	int rc = write_message(header, footer, blist);
+
+	pipe_lock.Lock();
+	if (rc < 0) {
+          ldout(msgr->cct,1) << "writer error sending " << m << ", "
+		  << cpp_strerror(errno) << dendl;
+	  fault();
+        }
+	// drop the reference obtained from _get_next_outgoing()
+	m->put();
+      }
+      continue;
+    }
+    
+    // wait
+    ldout(msgr->cct,20) << "writer sleeping" << dendl;
+    cond.Wait(pipe_lock);
+  }
+  
+  ldout(msgr->cct,20) << "writer finishing" << dendl;
+
+  // reap?
+  writer_running = false;
+  unlock_maybe_reap();
+  ldout(msgr->cct,10) << "writer done" << dendl;
+}
+
+// Release pipe_lock.  If neither the reader nor the writer thread is still
+// running, also shut the socket down (before unlocking), wait for a
+// flushing delay thread to finish, and queue this Pipe for reaping.
+void Pipe::unlock_maybe_reap()
+{
+  const bool thread_still_running = reader_running || writer_running;
+  if (thread_still_running) {
+    pipe_lock.Unlock();
+    return;
+  }
+
+  shutdown_socket();
+  pipe_lock.Unlock();
+  if (delay_thread && delay_thread->is_flushing())
+    delay_thread->wait_for_flush();
+  msgr->queue_reap(this);
+}
+
+// Append to `data` buffers totalling `len` bytes, laid out to match the
+// alignment of a payload that starts at offset `off`:
+//   1. if `off` is not on a page boundary, an unaligned head that reaches
+//      the next page boundary (or `len`, whichever is smaller);
+//   2. a page-aligned buffer covering the whole pages that remain;
+//   3. an unaligned tail with whatever is left.
+static void alloc_aligned_buffer(bufferlist& data, unsigned len, unsigned off)
+{
+  unsigned remaining = len;
+
+  // head
+  const unsigned page_off = off & ~CEPH_PAGE_MASK;
+  if (page_off) {
+    const unsigned head_len =
+      std::min<uint64_t>(CEPH_PAGE_SIZE - page_off, remaining);
+    data.push_back(buffer::create(head_len));
+    remaining -= head_len;
+  }
+
+  // whole pages
+  const unsigned aligned_len = remaining & CEPH_PAGE_MASK;
+  if (aligned_len > 0) {
+    data.push_back(buffer::create_small_page_aligned(aligned_len));
+    remaining -= aligned_len;
+  }
+
+  // tail
+  if (remaining)
+    data.push_back(buffer::create(remaining));
+}
+
+// Read one message (header, front, middle, data, footer) from the socket and
+// decode it.  The tag byte that introduces the message has already been
+// consumed by the caller.
+//
+//   pm            out: receives the decoded Message on success; not written
+//                 on any other path.
+//   auth_handler  if non-NULL, used to check the message signature.
+//
+// Returns 0 with *pm set on success.  Returns 0 without setting *pm when the
+// footer lacks CEPH_MSG_FOOTER_COMPLETE (sender aborted the message).
+// Returns -1 on a socket read failure or bad header crc, and -EINVAL when
+// decoding or the signature check fails.  Every failure after the throttles
+// were taken releases them again via out_dethrottle.
+int Pipe::read_message(Message **pm, AuthSessionHandler* auth_handler)
+{
+  int ret = -1;
+  // envelope
+  //ldout(msgr->cct,10) << "receiver.read_message from sd " << sd  << dendl;
+  
+  ceph_msg_header header; 
+  ceph_msg_footer footer;
+  __u32 header_crc = 0;
+
+  if (tcp_read((char*)&header, sizeof(header)) < 0)
+    return -1;
+  if (msgr->crcflags & MSG_CRC_HEADER) {
+    header_crc = ceph_crc32c(0, (unsigned char *)&header, sizeof(header) - sizeof(header.crc));
+  }
+
+  ldout(msgr->cct,20) << "reader got envelope type=" << header.type
+           << " src " << entity_name_t(header.src)
+           << " front=" << header.front_len
+	   << " data=" << header.data_len
+	   << " off " << header.data_off
+           << dendl;
+
+  // verify header crc
+  if ((msgr->crcflags & MSG_CRC_HEADER) && header_crc != header.crc) {
+    ldout(msgr->cct,0) << "reader got bad header crc " << header_crc << " != " << header.crc << dendl;
+    return -1;
+  }
+
+  bufferlist front, middle, data;
+  int front_len, middle_len;
+  unsigned data_len, data_off;
+  int aborted;
+  Message *message;
+  utime_t recv_stamp = ceph_clock_now();
+
+  if (policy.throttler_messages) {
+    ldout(msgr->cct,10) << "reader wants " << 1 << " message from policy throttler "
+			<< policy.throttler_messages->get_current() << "/"
+			<< policy.throttler_messages->get_max() << dendl;
+    policy.throttler_messages->get();
+  }
+
+  // Accumulate in 64 bits.  The length fields are 32-bit on the wire
+  // (data_len is read with le32_to_cpu below), so adding them directly would
+  // wrap modulo 2^32 before the result is widened, and a peer could make the
+  // throttles reserve far less than the message really occupies.
+  uint64_t message_size = header.front_len;
+  message_size += header.middle_len;
+  message_size += header.data_len;
+  if (message_size) {
+    if (policy.throttler_bytes) {
+      ldout(msgr->cct,10) << "reader wants " << message_size << " bytes from policy throttler "
+	       << policy.throttler_bytes->get_current() << "/"
+	       << policy.throttler_bytes->get_max() << dendl;
+      policy.throttler_bytes->get(message_size);
+    }
+
+    // throttle total bytes waiting for dispatch.  do this _after_ the
+    // policy throttle, as this one does not deadlock (unless dispatch
+    // blocks indefinitely, which it shouldn't).  in contrast, the
+    // policy throttle carries for the lifetime of the message.
+    ldout(msgr->cct,10) << "reader wants " << message_size << " from dispatch throttler "
+	     << in_q->dispatch_throttler.get_current() << "/"
+	     << in_q->dispatch_throttler.get_max() << dendl;
+    in_q->dispatch_throttler.get(message_size);
+  }
+
+  utime_t throttle_stamp = ceph_clock_now();
+
+  // read front
+  front_len = header.front_len;
+  if (front_len) {
+    bufferptr bp = buffer::create(front_len);
+    if (tcp_read(bp.c_str(), front_len) < 0)
+      goto out_dethrottle;
+    front.push_back(std::move(bp));
+    ldout(msgr->cct,20) << "reader got front " << front.length() << dendl;
+  }
+
+  // read middle
+  middle_len = header.middle_len;
+  if (middle_len) {
+    bufferptr bp = buffer::create(middle_len);
+    if (tcp_read(bp.c_str(), middle_len) < 0)
+      goto out_dethrottle;
+    middle.push_back(std::move(bp));
+    ldout(msgr->cct,20) << "reader got middle " << middle.length() << dendl;
+  }
+
+
+  // read data
+  data_len = le32_to_cpu(header.data_len);
+  data_off = le32_to_cpu(header.data_off);
+  if (data_len) {
+    unsigned offset = 0;
+    unsigned left = data_len;
+
+    bufferlist newbuf, rxbuf;
+    bufferlist::iterator blp;
+//    int rxbuf_version = 0;
+	
+    // data is read piecewise into the buffers allocated below; each pass
+    // reads at most up to the end of the current buffer
+    while (left > 0) {
+      // wait for data
+      if (tcp_read_wait() < 0)
+	goto out_dethrottle;
+
+      // get a buffer
+#if 0
+      // The rx_buffers implementation is buggy:
+      // - see http://tracker.ceph.com/issues/22480
+      //
+      // - From inspection, I think that we have problems if we read *part*
+      // of the message into an rx_buffer, then drop the lock, someone revokes,
+      // and then later try to read the rest.  In that case our final bufferlist
+      // will have part of the original static_buffer from the first chunk and
+      // partly a piece that we allocated.  I think that to make this correct,
+      // we need to keep the bufferlist we are reading into in Connection under
+      // the lock, and on revoke, if the data is partly read, rebuild() to copy
+      // into fresh buffers so that all references to our static buffer are
+      // cleared up.
+      //
+      // - Also... what happens if we fully read into the static
+      // buffer, then revoke?  We still have some bufferlist out there
+      // in the process of getting dispatched back to objecter or
+      // librados that references the static buffer.
+      connection_state->lock.Lock();
+      map<ceph_tid_t,pair<bufferlist,int> >::iterator p = connection_state->rx_buffers.find(header.tid);
+      if (p != connection_state->rx_buffers.end()) {
+	if (rxbuf.length() == 0 || p->second.second != rxbuf_version) {
+	  ldout(msgr->cct,10) << "reader seleting rx buffer v " << p->second.second
+		   << " at offset " << offset
+		   << " len " << p->second.first.length() << dendl;
+	  rxbuf = p->second.first;
+	  rxbuf_version = p->second.second;
+	  // make sure it's big enough
+	  if (rxbuf.length() < data_len)
+	    rxbuf.push_back(buffer::create(data_len - rxbuf.length()));
+	  blp = p->second.first.begin();
+	  blp.advance(offset);
+	}
+      } else {
+	if (!newbuf.length()) {
+	  ldout(msgr->cct,20) << "reader allocating new rx buffer at offset " << offset << dendl;
+	  alloc_aligned_buffer(newbuf, data_len, data_off);
+	  blp = newbuf.begin();
+	  blp.advance(offset);
+	}
+      }
+      bufferptr bp = blp.get_current_ptr();
+      int read = std::min(bp.length(), left);
+      ldout(msgr->cct,20) << "reader reading nonblocking into " << (void*)bp.c_str() << " len " << bp.length() << dendl;
+      ssize_t got = tcp_read_nonblocking(bp.c_str(), read);
+      ldout(msgr->cct,30) << "reader read " << got << " of " << read << dendl;
+      connection_state->lock.Unlock();
+#else
+      // rx_buffer-less implementation
+      if (!newbuf.length()) {
+	ldout(msgr->cct,20) << "reader allocating new rx buffer at offset "
+			    << offset << dendl;
+	alloc_aligned_buffer(newbuf, data_len, data_off);
+	blp = newbuf.begin();
+	blp.advance(offset);
+      }
+      bufferptr bp = blp.get_current_ptr();
+      int read = std::min(bp.length(), left);
+      ldout(msgr->cct,20) << "reader reading nonblocking into "
+			  << (void*)bp.c_str() << " len " << bp.length()
+			  << dendl;
+      ssize_t got = tcp_read_nonblocking(bp.c_str(), read);
+      ldout(msgr->cct,30) << "reader read " << got << " of " << read << dendl;
+#endif
+      if (got < 0)
+	goto out_dethrottle;
+      if (got > 0) {
+	blp.advance(static_cast<size_t>(got));
+	data.append(bp, 0, got);
+	offset += got;
+	left -= got;
+      } // else we got a signal or something; just loop.
+    }
+  }
+
+  // footer
+  if (connection_state->has_feature(CEPH_FEATURE_MSG_AUTH)) {
+    if (tcp_read((char*)&footer, sizeof(footer)) < 0)
+      goto out_dethrottle;
+  } else {
+    // peer sends the old footer layout (no signature); convert it
+    ceph_msg_footer_old old_footer;
+    if (tcp_read((char*)&old_footer, sizeof(old_footer)) < 0)
+      goto out_dethrottle;
+    footer.front_crc = old_footer.front_crc;
+    footer.middle_crc = old_footer.middle_crc;
+    footer.data_crc = old_footer.data_crc;
+    footer.sig = 0;
+    footer.flags = old_footer.flags;
+  }
+  
+  aborted = (footer.flags & CEPH_MSG_FOOTER_COMPLETE) == 0;
+  ldout(msgr->cct,10) << "aborted = " << aborted << dendl;
+  if (aborted) {
+    ldout(msgr->cct,0) << "reader got " << front.length() << " + " << middle.length() << " + " << data.length()
+	    << " byte message.. ABORTED" << dendl;
+    // not an error: report success but hand no message back
+    ret = 0;
+    goto out_dethrottle;
+  }
+
+  ldout(msgr->cct,20) << "reader got " << front.length() << " + " << middle.length() << " + " << data.length()
+	   << " byte message" << dendl;
+  message = decode_message(msgr->cct, msgr->crcflags, header, footer,
+                           front, middle, data, connection_state.get());
+  if (!message) {
+    ret = -EINVAL;
+    goto out_dethrottle;
+  }
+
+  //
+  //  Check the signature if one should be present.  A zero return indicates success. PLR
+  //
+
+  if (auth_handler == NULL) {
+    ldout(msgr->cct, 10) << "No session security set" << dendl;
+  } else {
+    if (auth_handler->check_message_signature(message)) {
+      ldout(msgr->cct, 0) << "Signature check failed" << dendl;
+      // the throttlers have not been attached to the message yet, so they
+      // are released explicitly at out_dethrottle
+      message->put();
+      ret = -EINVAL;
+      goto out_dethrottle;
+    } 
+  }
+
+  message->set_byte_throttler(policy.throttler_bytes);
+  message->set_message_throttler(policy.throttler_messages);
+
+  // store reservation size in message, so we don't get confused
+  // by messages entering the dispatch queue through other paths.
+  message->set_dispatch_throttle_size(message_size);
+
+  message->set_recv_stamp(recv_stamp);
+  message->set_throttle_stamp(throttle_stamp);
+  message->set_recv_complete_stamp(ceph_clock_now());
+
+  *pm = message;
+  return 0;
+
+ out_dethrottle:
+  // release bytes reserved from the throttlers on failure
+  if (policy.throttler_messages) {
+    ldout(msgr->cct,10) << "reader releasing " << 1 << " message to policy throttler "
+			<< policy.throttler_messages->get_current() << "/"
+			<< policy.throttler_messages->get_max() << dendl;
+    policy.throttler_messages->put();
+  }
+  if (message_size) {
+    if (policy.throttler_bytes) {
+      ldout(msgr->cct,10) << "reader releasing " << message_size << " bytes to policy throttler "
+			  << policy.throttler_bytes->get_current() << "/"
+			  << policy.throttler_bytes->get_max() << dendl;
+      policy.throttler_bytes->put(message_size);
+    }
+
+    in_q->dispatch_throttle_release(message_size);
+  }
+  return ret;
+}
+
+// Send `len` bytes described by `msg` on the socket, retrying after short
+// writes until everything is out.  `more` adds MSG_MORE to the send flags.
+// msg->msg_iov / msg_iovlen are advanced in place as data is consumed.
+// Returns 0 on success, -errno if sendmsg fails, or -EINTR if the Pipe is
+// found in STATE_CLOSED after a send.
+int Pipe::do_sendmsg(struct msghdr *msg, unsigned len, bool more)
+{
+  MSGR_SIGPIPE_STOPPER;
+  const int send_flags = MSG_NOSIGNAL | (more ? MSG_MORE : 0);
+
+  while (len > 0) {
+    int nsent = ::sendmsg(sd, msg, send_flags);
+    if (nsent == 0)
+      ldout(msgr->cct,10) << "do_sendmsg hmm do_sendmsg got r==0!" << dendl;
+    if (nsent < 0) {
+      nsent = -errno;
+      ldout(msgr->cct,1) << "do_sendmsg error " << cpp_strerror(nsent) << dendl;
+      return nsent;
+    }
+    if (state == STATE_CLOSED) {
+      ldout(msgr->cct,10) << "do_sendmsg oh look, state == CLOSED, giving up" << dendl;
+      return -EINTR; // close enough
+    }
+
+    len -= nsent;
+    if (len == 0)
+      break;
+
+    // hrmph.  trim the bytes already written off the front of our message.
+    ldout(msgr->cct,20) << "do_sendmsg short write did " << nsent << ", still have " << len << dendl;
+    while (nsent > 0) {
+      struct iovec& head = msg->msg_iov[0];
+      if (head.iov_len > (size_t)nsent) {
+        // partial!  advance within this item and stop
+        head.iov_base = (char *)head.iov_base + nsent;
+        head.iov_len -= nsent;
+        break;
+      }
+      // lose this whole item
+      nsent -= head.iov_len;
+      msg->msg_iov++;
+      msg->msg_iovlen--;
+    }
+  }
+  return 0;
+}
+
+
+// Write an ack to the socket: the CEPH_MSGR_TAG_ACK byte followed by `seq`
+// as a ceph_le64.  Sent with the "more" flag set.
+// Returns 0 on success, -1 if the send fails.
+int Pipe::write_ack(uint64_t seq)
+{
+  ldout(msgr->cct,10) << "write_ack " << seq << dendl;
+
+  char tag = CEPH_MSGR_TAG_ACK;
+  ceph_le64 wire_seq;
+  wire_seq = seq;
+
+  struct iovec vec[2];
+  vec[0].iov_base = &tag;
+  vec[0].iov_len = 1;
+  vec[1].iov_base = &wire_seq;
+  vec[1].iov_len = sizeof(wire_seq);
+
+  struct msghdr msg;
+  memset(&msg, 0, sizeof(msg));
+  msg.msg_iov = vec;
+  msg.msg_iovlen = 2;
+
+  return do_sendmsg(&msg, 1 + sizeof(wire_seq), true) < 0 ? -1 : 0;
+}
+
+// Write a single CEPH_MSGR_TAG_KEEPALIVE byte to the socket.
+// Returns 0 on success, -1 if the send fails.
+int Pipe::write_keepalive()
+{
+  ldout(msgr->cct,10) << "write_keepalive" << dendl;
+
+  char tag = CEPH_MSGR_TAG_KEEPALIVE;
+
+  struct iovec vec[1];
+  vec[0].iov_base = &tag;
+  vec[0].iov_len = 1;
+
+  struct msghdr msg;
+  memset(&msg, 0, sizeof(msg));
+  msg.msg_iov = vec;
+  msg.msg_iovlen = 1;
+
+  return do_sendmsg(&msg, 1) < 0 ? -1 : 0;
+}
+
+// Write a keepalive2-style frame: the given tag byte followed by `t`
+// encoded as a ceph_timespec.  Used for both KEEPALIVE2 and KEEPALIVE2_ACK.
+// Returns 0 on success, -1 if the send fails.
+int Pipe::write_keepalive2(char tag, const utime_t& t)
+{
+  ldout(msgr->cct,10) << "write_keepalive2 " << (int)tag << " " << t << dendl;
+
+  struct ceph_timespec stamp;
+  t.encode_timeval(&stamp);
+
+  struct iovec vec[2];
+  vec[0].iov_base = &tag;
+  vec[0].iov_len = 1;
+  vec[1].iov_base = &stamp;
+  vec[1].iov_len = sizeof(stamp);
+
+  struct msghdr msg;
+  memset(&msg, 0, sizeof(msg));
+  msg.msg_iov = vec;
+  msg.msg_iovlen = 2;
+
+  return do_sendmsg(&msg, 1 + sizeof(stamp)) < 0 ? -1 : 0;
+}
+
+
+// Write one message to the socket: the CEPH_MSGR_TAG_MSG byte, the header,
+// every buffer of `blist`, and finally the footer (old footer layout when
+// the connection lacks CEPH_FEATURE_MSG_AUTH).  The pieces are gathered into
+// the Pipe's msgvec array and flushed with do_sendmsg whenever it is nearly
+// full.  Returns 0 on success, -1 if any send fails.
+int Pipe::write_message(const ceph_msg_header& header, const ceph_msg_footer& footer, bufferlist& blist)
+{
+  // set up msghdr and iovecs
+  struct msghdr msg;
+  memset(&msg, 0, sizeof(msg));
+  msg.msg_iov = msgvec;
+  int msglen = 0;
+
+  // append one entry to the pending iovec and account for its bytes
+  auto add_iov = [&](void *base, size_t nbytes) {
+    msgvec[msg.msg_iovlen].iov_base = base;
+    msgvec[msg.msg_iovlen].iov_len = nbytes;
+    msglen += nbytes;
+    msg.msg_iovlen++;
+  };
+
+  // send tag
+  char tag = CEPH_MSGR_TAG_MSG;
+  add_iov(&tag, 1);
+
+  // send envelope
+  add_iov((char*)&header, sizeof(header));
+
+  // payload (front+data)
+  auto cur = std::cbegin(blist.buffers());
+  unsigned chunk_off = 0;  // carry-over buffer offset, if any
+  unsigned bl_pos = 0;     // blist pos
+  unsigned remaining = blist.length();
+
+  while (remaining > 0) {
+    const unsigned donow = std::min(remaining, cur->length()-chunk_off);
+    if (donow == 0) {
+      ldout(msgr->cct,0) << "donow = " << donow << " left " << remaining << " pb->length " << cur->length()
+                         << " b_off " << chunk_off << dendl;
+    }
+    ceph_assert(donow > 0);
+    ldout(msgr->cct,30) << " bl_pos " << bl_pos << " b_off " << chunk_off
+             << " leftinchunk " << remaining
+             << " buffer len " << cur->length()
+             << " writing " << donow
+             << dendl;
+
+    // iovec nearly full: flush what we have (more data follows)
+    if (msg.msg_iovlen >= SM_IOV_MAX-2) {
+      if (do_sendmsg(&msg, msglen, true))
+        return -1;
+
+      // and restart the iov
+      msg.msg_iov = msgvec;
+      msg.msg_iovlen = 0;
+      msglen = 0;
+    }
+
+    add_iov((void*)(cur->c_str()+chunk_off), donow);
+
+    ceph_assert(remaining >= donow);
+    remaining -= donow;
+    chunk_off += donow;
+    bl_pos += donow;
+    if (remaining == 0)
+      break;
+    // step past the finished buffer (and any empty ones after it)
+    while (chunk_off == cur->length()) {
+      ++cur;
+      chunk_off = 0;
+    }
+  }
+  ceph_assert(remaining == 0);
+
+  // send footer; if receiver doesn't support signatures, use the old footer format
+
+  // declared here so it stays alive until the final do_sendmsg below
+  ceph_msg_footer_old old_footer;
+  if (connection_state->has_feature(CEPH_FEATURE_MSG_AUTH)) {
+    add_iov((void*)&footer, sizeof(footer));
+  } else {
+    if (msgr->crcflags & MSG_CRC_HEADER) {
+      old_footer.front_crc = footer.front_crc;
+      old_footer.middle_crc = footer.middle_crc;
+    } else {
+      old_footer.front_crc = old_footer.middle_crc = 0;
+    }
+    old_footer.data_crc = msgr->crcflags & MSG_CRC_DATA ? footer.data_crc : 0;
+    old_footer.flags = footer.flags;
+    add_iov((char*)&old_footer, sizeof(old_footer));
+  }
+
+  // send
+  return do_sendmsg(&msg, msglen) ? -1 : 0;
+}
+
+
+// Read exactly `len` bytes from the socket into `buf`, waiting for data as
+// needed.  Returns 0 once everything has been read, -EINVAL if there is no
+// socket (sd < 0), or -1 if waiting or reading fails.
+// When ms_inject_socket_failures is non-zero, each pass has a 1-in-N chance
+// of shutting the socket down first, to exercise the failure paths.
+int Pipe::tcp_read(char *buf, unsigned len)
+{
+  if (sd < 0)
+    return -EINVAL;
+
+  while (len > 0) {
+    if (msgr->cct->_conf->ms_inject_socket_failures && sd >= 0 &&
+        rand() % msgr->cct->_conf->ms_inject_socket_failures == 0) {
+      ldout(msgr->cct, 0) << "injecting socket failure" << dendl;
+      ::shutdown(sd, SHUT_RDWR);
+    }
+
+    if (tcp_read_wait() < 0)
+      return -1;
+
+    const ssize_t nread = tcp_read_nonblocking(buf, len);
+    if (nread < 0)
+      return -1;
+
+    // advance past what was read; loop again for the remainder
+    buf += nread;
+    len -= nread;
+    //lgeneric_dout(cct, DBL) << "tcp_read got " << got << ", " << len << " left" << dendl;
+  }
+  return 0;
+}
+
+/**
+ * Wait until the socket is readable, or return immediately when the
+ * prefetch buffer still holds unread bytes.
+ *
+ * @return 0 when data can be read; -EINVAL if there is no socket; -errno if
+ *         poll() fails; -EAGAIN if poll() returns 0; -1 if the socket
+ *         reports an error/hangup condition or is not readable.
+ */
+int Pipe::tcp_read_wait()
+{
+  if (sd < 0)
+    return -EINVAL;
+
+  // Buffered bytes can be served without touching the socket.
+  if (has_pending_data())
+    return 0;
+
+  struct pollfd pfd;
+  pfd.fd = sd;
+  pfd.events = POLLIN;
+#if defined(__linux__)
+  pfd.events |= POLLRDHUP;
+#endif
+
+  const int ready = poll(&pfd, 1, msgr->timeout);
+  if (ready < 0)
+    return -errno;
+  if (ready == 0)
+    return -EAGAIN;
+
+  // Any of these conditions means the socket is unusable.
+  short failure_mask = POLLERR | POLLHUP | POLLNVAL;
+#if defined(__linux__)
+  failure_mask |= POLLRDHUP;
+#endif
+
+  const bool failed = (pfd.revents & failure_mask) != 0;
+  const bool readable = (pfd.revents & POLLIN) != 0;
+  if (failed || !readable)
+    return -1;
+
+  return 0;
+}
+
+/**
+ * Thin wrapper around ::recv() on the pipe's socket that retries when the
+ * call is interrupted (EINTR).
+ *
+ * @return number of bytes received (> 0), or -1 on error or when recv()
+ *         returns 0.
+ */
+ssize_t Pipe::do_recv(char *buf, size_t len, int flags)
+{
+  ssize_t got;
+  do {
+    got = ::recv( sd, buf, len, flags );
+  } while (got < 0 && errno == EINTR);
+
+  if (got < 0) {
+    ldout(msgr->cct, 10) << __func__ << " socket " << sd << " returned "
+                         << got << " " << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+
+  // a zero-length result is reported to callers as a failure as well
+  return got == 0 ? -1 : got;
+}
+
+/**
+ * Receive up to len bytes into buf, going through the pipe's prefetch buffer.
+ *
+ * Bytes still pending in recv_buf are handed out first.  If more is wanted:
+ *  - requests larger than recv_max_prefetch are read straight into buf;
+ *  - smaller requests refill recv_buf with up to recv_max_prefetch bytes and
+ *    copy out only what was asked for, keeping the rest for later calls.
+ *
+ * Byte counts are kept in size_t throughout; the previous code narrowed the
+ * size_t result of std::min() into an int.
+ *
+ * @param buf   destination buffer
+ * @param len   maximum number of bytes to return
+ * @param flags flags passed through to do_recv()
+ * @return number of bytes placed in buf; a negative value if nothing could
+ *         be delivered and do_recv() failed.  A failure after some bytes were
+ *         already copied is reported as that partial count.
+ */
+ssize_t Pipe::buffered_recv(char *buf, size_t len, int flags)
+{
+  size_t left = len;
+  ssize_t total_recv = 0;
+
+  // 1) serve whatever is still sitting in the prefetch buffer
+  if (recv_len > recv_ofs) {
+    const size_t to_read = std::min(recv_len - recv_ofs, left);
+    memcpy(buf, &recv_buf[recv_ofs], to_read);
+    recv_ofs += to_read;
+    left -= to_read;
+    if (left == 0) {
+      return static_cast<ssize_t>(to_read);
+    }
+    buf += to_read;
+    total_recv += static_cast<ssize_t>(to_read);
+  }
+
+  /* nothing left in the prefetch buffer */
+
+  if (left > recv_max_prefetch) {
+    /* this was a large read, we don't prefetch for these */
+    const ssize_t ret = do_recv(buf, left, flags );
+    if (ret < 0) {
+      // report the bytes already delivered, if any, instead of the error
+      return total_recv > 0 ? total_recv : ret;
+    }
+    return total_recv + ret;
+  }
+
+  // 2) small read: refill the prefetch buffer and copy out what was asked for
+  const ssize_t got = do_recv(recv_buf, recv_max_prefetch, flags);
+  if (got < 0) {
+    return total_recv > 0 ? total_recv : got;
+  }
+
+  const size_t received = static_cast<size_t>(got);
+  const size_t copied = std::min(left, received);
+  recv_len = received;
+  memcpy(buf, recv_buf, copied);
+  recv_ofs = copied;   // anything beyond this stays buffered for the next call
+  return total_recv + static_cast<ssize_t>(copied);
+}
+
+/**
+ * Read whatever is currently available (at most len bytes) without blocking.
+ *
+ * @return number of bytes read (> 0), or -1 on error or when no bytes were
+ *         read.
+ */
+ssize_t Pipe::tcp_read_nonblocking(char *buf, unsigned len)
+{
+  const ssize_t nread = buffered_recv(buf, len, MSG_DONTWAIT );
+  if (nread > 0)
+    return nread;
+
+  if (nread < 0) {
+    ldout(msgr->cct, 10) << __func__ << " socket " << sd << " returned "
+                         << nread << " " << cpp_strerror(errno) << dendl;
+  }
+  /* nread == 0: poll() said there was data, but we didn't read any - peer
+   * sent a FIN.  Maybe POLLRDHUP signals this, but this is
+   * standard socket behavior as documented by Stevens.
+   */
+  return -1;
+}
+
+/**
+ * Blocking write of len bytes from buf to the pipe's socket.
+ *
+ * Waits (without timeout) until poll() reports the socket writable, then
+ * loops on ::send() until everything has been handed to the kernel.
+ * The result of ::send() is kept in ssize_t, its declared return type,
+ * rather than being narrowed to int.
+ *
+ * @param buf data to send
+ * @param len number of bytes to send; must be > 0
+ * @return 0 on success, -1 on any failure (no socket, poll failure, socket
+ *         not writable, send error)
+ */
+int Pipe::tcp_write(const char *buf, unsigned len)
+{
+  if (sd < 0)
+    return -1;
+  struct pollfd pfd;
+  pfd.fd = sd;
+  pfd.events = POLLOUT | POLLHUP | POLLNVAL | POLLERR;
+#if defined(__linux__)
+  pfd.events |= POLLRDHUP;
+#endif
+
+  // Optional fault injection: randomly shut the socket down before writing.
+  if (msgr->cct->_conf->ms_inject_socket_failures && sd >= 0) {
+    if (rand() % msgr->cct->_conf->ms_inject_socket_failures == 0) {
+      ldout(msgr->cct, 0) << "injecting socket failure" << dendl;
+      ::shutdown(sd, SHUT_RDWR);
+    }
+  }
+
+  if (poll(&pfd, 1, -1) < 0)
+    return -1;
+
+  if (!(pfd.revents & POLLOUT))
+    return -1;
+
+  ceph_assert(len > 0);
+  while (len > 0) {
+    MSGR_SIGPIPE_STOPPER;
+    const ssize_t did = ::send( sd, buf, len, MSG_NOSIGNAL );
+    if (did < 0)
+      return -1;
+    len -= did;
+    buf += did;
+  }
+  return 0;
+}
diff --git a/src/msg/simple/Pipe.h b/src/msg/simple/Pipe.h
new file mode 100644
index 00000000..81245198
--- /dev/null
+++ b/src/msg/simple/Pipe.h
@@ -0,0 +1,315 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_MSGR_PIPE_H
+#define CEPH_MSGR_PIPE_H
+
+#include "auth/AuthSessionHandler.h"
+
+#include "msg/msg_types.h"
+#include "msg/Messenger.h"
+#include "PipeConnection.h"
+
+
+class SimpleMessenger;
+class DispatchQueue;
+
+static const int SM_IOV_MAX = (IOV_MAX >= 1024 ? IOV_MAX / 4 : IOV_MAX);  // size of Pipe::msgvec: a quarter of IOV_MAX when that is >= 1024, else IOV_MAX
+
+  /**
+   * The Pipe is the most complex SimpleMessenger component. It gets
+   * two threads, one each for reading and writing on a socket it's handed
+   * at creation time, and is responsible for everything that happens on
+   * that socket. Besides message transmission, it's responsible for
+   * propagating socket errors to the SimpleMessenger and then sticking
+   * around in a state where it can provide enough data for the SimpleMessenger
+   * to provide reliable Message delivery when it manages to reconnect.
+   *
+   * Locking: the inline queue helpers (_send, _send_keepalive,
+   * _get_next_outgoing) assert that pipe_lock is held by the caller.
+   *
+   * Receiving: recv_buf / recv_ofs / recv_len / recv_max_prefetch form a
+   * small prefetch buffer that buffered_recv() fills and drains.
+   */
+  class Pipe : public RefCountedObject {
+    /**
+     * The Reader thread handles all reads off the socket -- not just
+     * Messages, but also acks and other protocol bits (excepting startup,
+     * when the Writer does a couple of reads).
+     * All the work is implemented in Pipe itself, of course.
+     */
+    class Reader : public Thread {
+      Pipe *pipe;
+    public:
+      explicit Reader(Pipe *p) : pipe(p) {}
+      void *entry() override { pipe->reader(); return 0; }  // thread body: runs Pipe::reader()
+    } reader_thread;
+
+    /**
+     * The Writer thread handles all writes to the socket (after startup).
+     * All the work is implemented in Pipe itself, of course.
+     */
+    class Writer : public Thread {
+      Pipe *pipe;
+    public:
+      explicit Writer(Pipe *p) : pipe(p) {}
+      void *entry() override { pipe->writer(); return 0; }  // thread body: runs Pipe::writer()
+    } writer_thread;
+
+    class DelayedDelivery;
+    DelayedDelivery *delay_thread;
+  public:
+    Pipe(SimpleMessenger *r, int st, PipeConnection *con);
+    ~Pipe() override;
+
+    SimpleMessenger *msgr;
+    uint64_t conn_id;
+    ostream& _pipe_prefix(std::ostream &out) const;
+
+    Pipe* get() {  // RefCountedObject::get() with the result typed as Pipe*
+      return static_cast<Pipe*>(RefCountedObject::get());
+    }
+
+    bool is_connected() {  // true only while state == STATE_OPEN; takes pipe_lock
+      Mutex::Locker l(pipe_lock);
+      return state == STATE_OPEN;
+    }
+
+    char *recv_buf;            // prefetch buffer filled by buffered_recv()
+    size_t recv_max_prefetch;  // reads larger than this bypass the prefetch buffer
+    size_t recv_ofs;           // offset of the next unread byte in recv_buf
+    size_t recv_len;           // number of valid bytes in recv_buf
+
+    enum {
+      STATE_ACCEPTING,
+      STATE_CONNECTING,
+      STATE_OPEN,
+      STATE_STANDBY,
+      STATE_CLOSED,
+      STATE_CLOSING,
+      STATE_WAIT       // just wait for racing connection
+    };
+
+    static const char *get_state_name(int s) {  // printable name for a STATE_* value; "UNKNOWN" otherwise
+      switch (s) {
+      case STATE_ACCEPTING: return "accepting";
+      case STATE_CONNECTING: return "connecting";
+      case STATE_OPEN: return "open";
+      case STATE_STANDBY: return "standby";
+      case STATE_CLOSED: return "closed";
+      case STATE_CLOSING: return "closing";
+      case STATE_WAIT: return "wait";
+      default: return "UNKNOWN";
+      }
+    }
+    const char *get_state_name() {  // name of this pipe's current state (reads state without locking)
+      return get_state_name(state);
+    }
+
+  private:
+    int sd;                           // socket descriptor; negative when there is no usable socket
+    struct iovec msgvec[SM_IOV_MAX];  // scratch iovec array used when assembling outgoing messages
+
+  public:
+    int port;
+    int peer_type;
+    entity_addr_t peer_addr;
+    Messenger::Policy policy;
+    
+    Mutex pipe_lock;
+    int state;
+    std::atomic<bool> state_closed = { false }; // true iff state = STATE_CLOSED
+
+    // session_security handles any signatures or encryptions required for this pipe's msgs. PLR
+
+    std::shared_ptr<AuthSessionHandler> session_security;
+
+  protected:
+    friend class SimpleMessenger;
+    PipeConnectionRef connection_state;
+
+    utime_t backoff;         // backoff time
+
+    bool reader_running, reader_needs_join;
+    bool reader_dispatching; /// reader thread is dispatching without pipe_lock
+    bool notify_on_dispatch_done; /// something wants a signal when dispatch done
+    bool writer_running;
+
+    map<int, list<Message*> > out_q;  // priority queue for outbound msgs
+    DispatchQueue *in_q;
+    list<Message*> sent;
+    Cond cond;
+    bool send_keepalive;
+    bool send_keepalive_ack;
+    utime_t keepalive_ack_stamp;
+    bool halt_delivery; //if a pipe's queue is destroyed, stop adding to it
+    
+    __u32 connect_seq, peer_global_seq;
+    uint64_t out_seq;
+    uint64_t in_seq, in_seq_acked;
+    
+    void set_socket_options();
+
+    int accept();   // server handshake
+    int connect();  // client handshake
+    void reader();
+    void writer();
+    void unlock_maybe_reap();
+
+    void randomize_out_seq();
+
+    int read_message(Message **pm,
+		     AuthSessionHandler *session_security_copy);
+    int write_message(const ceph_msg_header& h, const ceph_msg_footer& f, bufferlist& body);
+    /**
+     * Write the given data (of length len) to the Pipe's socket. This function
+     * will loop until all passed data has been written out.
+     * If more is set, the function will optimize socket writes
+     * for additional data (by passing the MSG_MORE flag, aka TCP_CORK).
+     *
+     * @param msg The msghdr to write out
+     * @param len The length of the data in msg
+     * @param more Should be set true if this is one part of a larger message
+     * @return 0, or -1 on failure (unrecoverable -- close the socket).
+     */
+    int do_sendmsg(struct msghdr *msg, unsigned len, bool more=false);
+    int write_ack(uint64_t s);
+    int write_keepalive();
+    int write_keepalive2(char tag, const utime_t &t);
+
+    void fault(bool reader=false);
+
+    void was_session_reset();
+
+    /* Clean up sent list */
+    void handle_ack(uint64_t seq);
+
+    public:
+    Pipe(const Pipe& other);
+    const Pipe& operator=(const Pipe& other);
+
+    void start_reader();
+    void start_writer();
+    void maybe_start_delay_thread();
+    void join_reader();
+
+    // public constructors
+    static const Pipe& Server(int s);
+    static const Pipe& Client(const entity_addr_t& pi);
+
+    uint64_t get_out_seq() { return out_seq; }
+
+    bool is_queued() { return !out_q.empty() || send_keepalive || send_keepalive_ack; }  // anything waiting to be sent?
+
+    entity_addr_t& get_peer_addr() { return peer_addr; }
+
+    void set_peer_addr(const entity_addr_t& a) {  // updates both the pipe and its connection_state
+      if (&peer_addr != &a)  // shut up valgrind
+        peer_addr = a;
+      connection_state->set_peer_addr(a);
+    }
+    void set_peer_type(int t) {  // updates both the pipe and its connection_state
+      peer_type = t;
+      connection_state->set_peer_type(t);
+    }
+
+    void register_pipe();
+    void unregister_pipe();
+    void join();
+    /// stop a Pipe by closing its socket and setting it to STATE_CLOSED
+    void stop();
+    /// stop() a Pipe if not already done, and wait for it to finish any
+    /// fast_dispatch in progress.
+    void stop_and_wait();
+
+    void _send(Message *m) {  // queue m by priority and signal cond; caller must hold pipe_lock
+      ceph_assert(pipe_lock.is_locked());
+      out_q[m->get_priority()].push_back(m);
+      cond.Signal();
+    }
+    void _send_keepalive() {  // request a keepalive and signal cond; caller must hold pipe_lock
+      ceph_assert(pipe_lock.is_locked());
+      send_keepalive = true;
+      cond.Signal();
+    }
+    Message *_get_next_outgoing() {  // pop the next message from the highest-keyed queue, or 0 if none; caller must hold pipe_lock
+      ceph_assert(pipe_lock.is_locked());
+      Message *m = 0;
+      while (!m && !out_q.empty()) {
+        map<int, list<Message*> >::reverse_iterator p = out_q.rbegin();  // largest priority key
+        if (!p->second.empty()) {
+          m = p->second.front();
+          p->second.pop_front();
+        }
+        if (p->second.empty())  // drop priority levels that have been drained
+          out_q.erase(p->first);
+      }
+      return m;
+    }
+
+    /// move all messages in the sent list back into the queue at the highest priority.
+    void requeue_sent();
+    /// discard messages requeued by requeue_sent() up to a given seq
+    void discard_requeued_up_to(uint64_t seq);
+    void discard_out_queue();
+
+    void shutdown_socket() {  // reset the prefetch buffer, then shut down both directions of the socket
+      recv_reset();
+      if (sd >= 0)
+        ::shutdown(sd, SHUT_RDWR);
+    }
+
+    void recv_reset() {  // forget any bytes still held in the prefetch buffer
+      recv_len = 0;
+      recv_ofs = 0;
+    }
+    ssize_t do_recv(char *buf, size_t len, int flags);
+    ssize_t buffered_recv(char *buf, size_t len, int flags);
+    bool has_pending_data() { return recv_len > recv_ofs; }  // unread bytes remain in the prefetch buffer
+
+    /**
+     * do a blocking read of len bytes from socket
+     *
+     * @param buf buffer to read into
+     * @param len exact number of bytes to read
+     * @return 0 for success, or a negative value on error (-EINVAL when
+     *         there is no socket, -1 otherwise)
+     */
+    int tcp_read(char *buf, unsigned len);
+
+    /**
+     * wait for bytes to become available on the socket
+     *
+     * @return 0 for success, or a negative value on error (-EINVAL when
+     *         there is no socket, -errno when poll() fails, -EAGAIN when
+     *         poll() returns 0, -1 otherwise)
+     */
+    int tcp_read_wait();
+
+    /**
+     * non-blocking read of available bytes on socket
+     *
+     * This is expected to be used after tcp_read_wait(), and will return
+     * an error if there is no data on the socket to consume.
+     *
+     * @param buf buffer to read into
+     * @param len maximum number of bytes to read
+     * @return bytes read, or -1 on error or when there is no data
+     */
+    ssize_t tcp_read_nonblocking(char *buf, unsigned len);
+
+    /**
+     * blocking write of bytes to socket
+     *
+     * @param buf buffer
+     * @param len number of bytes to write
+     * @return 0 for success, or -1 on error
+     */
+    int tcp_write(const char *buf, unsigned len);
+
+  };
+
+
+#endif
diff --git a/src/msg/simple/PipeConnection.cc b/src/msg/simple/PipeConnection.cc
new file mode 100644
index 00000000..faa1ea9e
--- /dev/null
+++ b/src/msg/simple/PipeConnection.cc
@@ -0,0 +1,96 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "msg/Message.h"
+#include "Pipe.h"
+#include "SimpleMessenger.h"
+#include "PipeConnection.h"
+
+// Release the reference this connection still holds on its Pipe, if any.
+PipeConnection::~PipeConnection()
+{
+  if (!pipe)
+    return;
+  pipe->put();
+  pipe = nullptr;
+}
+
+// Return the attached Pipe with an extra reference taken on it (the caller
+// must put() it), or a null pointer when no Pipe is attached.
+Pipe* PipeConnection::get_pipe()
+{
+  Mutex::Locker guard(lock);
+  return pipe ? pipe->get() : nullptr;
+}
+
+// Like get_pipe(), but also reports whether the connection has failed.
+// *p receives a referenced Pipe, or a null pointer if the connection has
+// failed or has no Pipe.  Returns false iff the connection is marked failed.
+bool PipeConnection::try_get_pipe(Pipe **p)
+{
+  Mutex::Locker guard(lock);
+  const bool usable = !failed;
+  *p = (usable && pipe) ? pipe->get() : nullptr;
+  return usable;
+}
+
+// Detach old_p from this connection if it is the currently attached Pipe:
+// drop our reference, forget the Pipe and mark the connection failed.
+// Returns true if the Pipe was detached, false if a different Pipe (or
+// none) is attached.
+bool PipeConnection::clear_pipe(Pipe *old_p)
+{
+  Mutex::Locker guard(lock);
+  if (old_p != pipe)
+    return false;
+
+  pipe->put();
+  pipe = nullptr;
+  failed = true;
+  return true;
+}
+
+// Attach p as this connection's Pipe, replacing any previous one.
+//
+// The reference on p is taken *before* the old reference is released, so
+// that resetting to the Pipe that is already attached can never drop the
+// object's last reference in between.
+void PipeConnection::reset_pipe(Pipe *p)
+{
+  Mutex::Locker l(lock);
+  Pipe *incoming = p->get();
+  if (pipe)
+    pipe->put();
+  pipe = incoming;
+}
+
+// Ask the owning SimpleMessenger whether this connection is connected.
+bool PipeConnection::is_connected()
+{
+  SimpleMessenger *owner = static_cast<SimpleMessenger*>(msgr);
+  return owner->is_connected(this);
+}
+
+// Send m over this connection via the owning SimpleMessenger, which must
+// be set.
+int PipeConnection::send_message(Message *m)
+{
+  ceph_assert(msgr);
+  SimpleMessenger *owner = static_cast<SimpleMessenger*>(msgr);
+  return owner->send_message(m, this);
+}
+
+// Request a keepalive on this connection via the owning SimpleMessenger;
+// the messenger's return value is not propagated.
+void PipeConnection::send_keepalive()
+{
+  SimpleMessenger *owner = static_cast<SimpleMessenger*>(msgr);
+  owner->send_keepalive(this);
+}
+
+// Forward mark_down to the owning SimpleMessenger; no-op without one.
+void PipeConnection::mark_down()
+{
+  if (!msgr)
+    return;
+  static_cast<SimpleMessenger*>(msgr)->mark_down(this);
+}
+
+// Forward mark_disposable to the owning SimpleMessenger; no-op without one.
+void PipeConnection::mark_disposable()
+{
+  if (!msgr)
+    return;
+  static_cast<SimpleMessenger*>(msgr)->mark_disposable(this);
+}
diff --git a/src/msg/simple/PipeConnection.h b/src/msg/simple/PipeConnection.h
new file mode 100644
index 00000000..e5460440
--- /dev/null
+++ b/src/msg/simple/PipeConnection.h
@@ -0,0 +1,59 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_PIPECONNECTION_H
+#define CEPH_MSG_PIPECONNECTION_H
+
+#include "msg/Connection.h"
+
+class Pipe;
+
+/**
+ * Connection implementation used by SimpleMessenger: a Connection that
+ * holds a counted reference to the Pipe currently carrying its traffic.
+ */
+class PipeConnection : public Connection {
+  friend class boost::intrusive_ptr<PipeConnection>;
+  friend class Pipe;
+
+  /// Pipe currently attached to this connection; null when there is none.
+  Pipe* pipe = nullptr;
+
+public:
+
+  PipeConnection(CephContext *cct, Messenger *m)
+    : Connection(cct, m) { }
+
+  ~PipeConnection() override;
+
+  /// @return the attached Pipe with a reference taken, or null if none
+  Pipe* get_pipe();
+
+  /// Like get_pipe(), storing the Pipe in *p.
+  /// @return false iff the connection has been marked failed
+  bool try_get_pipe(Pipe** p);
+
+  /// Detach old_p if it is the attached Pipe and mark the connection failed.
+  /// @return true if old_p was detached
+  bool clear_pipe(Pipe* old_p);
+
+  /// Replace the attached Pipe with p.
+  void reset_pipe(Pipe* p);
+
+  bool is_connected() override;
+
+  int send_message(Message *m) override;
+  void send_keepalive() override;
+  void mark_down() override;
+  void mark_disposable() override;
+
+  /// @return the first entry of peer_addrs
+  entity_addr_t get_peer_socket_addr() const override {
+    return peer_addrs->front();
+  }
+
+}; /* PipeConnection */
+
+using PipeConnectionRef = boost::intrusive_ptr<PipeConnection>;
+
+#endif
diff --git a/src/msg/simple/SimpleMessenger.cc b/src/msg/simple/SimpleMessenger.cc
new file mode 100644
index 00000000..09d1ab7b
--- /dev/null
+++ b/src/msg/simple/SimpleMessenger.cc
@@ -0,0 +1,769 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#include <errno.h>
+#include <iostream>
+#include <fstream>
+
+
+#include "SimpleMessenger.h"
+
+#include "common/config.h"
+#include "common/Timer.h"
+#include "common/errno.h"
+#include "common/valgrind.h"
+#include "auth/Crypto.h"
+#include "include/spinlock.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+// Log-line prefix for this file: "-- <legacy address of msgr> ".
+static ostream& _prefix(std::ostream *_dout, SimpleMessenger *msgr) {
+  std::ostream& out = *_dout;
+  out << "-- " << msgr->get_myaddr_legacy() << " ";
+  return out;
+}
+
+
+/*******************
+ * SimpleMessenger
+ */
+
+/**
+ * Construct an unbound, not-yet-started messenger.
+ *
+ * @param cct    context used for configuration and logging
+ * @param name   entity name, passed to SimplePolicyMessenger
+ * @param mname  messenger name, also handed to the dispatch queue
+ * @param _nonce nonce stored in this messenger and given to the accepter
+ */
+SimpleMessenger::SimpleMessenger(CephContext *cct, entity_name_t name,
+                                 string mname, uint64_t _nonce)
+  : SimplePolicyMessenger(cct, name,mname, _nonce),
+    accepter(this, _nonce),
+    dispatch_queue(cct, this, mname),
+    reaper_thread(this),
+    nonce(_nonce),
+    lock("SimpleMessenger::lock"),
+    need_addr(true),
+    did_bind(false),
+    global_seq(0),
+    cluster_protocol(0),
+    reaper_started(false),
+    reaper_stop(false),
+    timeout(0),
+    local_connection(new PipeConnection(cct, this))
+{
+  // the read timeout is annotated as a benign race for valgrind-style tools
+  ANNOTATE_BENIGN_RACE_SIZED(&timeout, sizeof(timeout),
+                             "SimpleMessenger read timeout");
+  init_local_connection();
+}
+
+/**
+ * Destroy the SimpleMessenger. Pretty simple since all the work is done
+ * elsewhere.
+ *
+ * Only checks that teardown already happened: the messenger must not be
+ * bound, must have no registered pipes and must have no reaper thread
+ * running by the time it is destroyed.
+ */
+SimpleMessenger::~SimpleMessenger()
+{
+  ceph_assert(!did_bind); // either we didn't bind or we shut down the Accepter
+  ceph_assert(rank_pipe.empty()); // we don't have any running Pipes.
+  ceph_assert(!reaper_started); // the reaper thread is stopped
+}
+
+// Start the dispatch queue and, if this messenger was bound, the accepter.
+void SimpleMessenger::ready()
+{
+  ldout(cct,10) << "ready " << get_myaddr_legacy() << dendl;
+  dispatch_queue.start();
+
+  // did_bind is checked, and the accepter started, under the messenger lock
+  Mutex::Locker guard(lock);
+  if (did_bind)
+    accepter.start();
+}
+
+
+// Begin shutdown: mark every connection down, then flag the messenger as
+// stopped and signal stop_cond.  Always returns 0.
+int SimpleMessenger::shutdown()
+{
+  ldout(cct,10) << "shutdown " << get_myaddr_legacy() << dendl;
+  mark_down_all();
+
+  // break ref cycles on the loopback connection
+  local_connection->set_priv(NULL);
+
+  {
+    Mutex::Locker guard(lock);
+    stop_cond.Signal();
+    stopped = true;
+  }
+
+  return 0;
+}
+
+/**
+ * Send m to the entity described by dest.
+ *
+ * Fills in the source name and, when unset, the default priority, then
+ * submits the message under the messenger lock using whatever pipe is
+ * already registered for dest.addr (if any).
+ *
+ * @return 0 once submitted; -EINVAL (after dropping m) when dest.addr is
+ *         the empty address.
+ */
+int SimpleMessenger::_send_message(Message *m, const entity_inst_t& dest)
+{
+  // set envelope
+  m->get_header().src = get_myname();
+  m->set_cct(cct);
+
+  if (!m->get_priority())
+    m->set_priority(get_default_send_priority());
+
+  ldout(cct,1) <<"--> " << dest.name << " "
+               << dest.addr << " -- " << *m
+               << " -- ?+" << m->get_data().length()
+               << " " << m
+               << dendl;
+
+  const bool no_destination = (dest.addr == entity_addr_t());
+  if (no_destination) {
+    ldout(cct,0) << "send_message message " << *m
+                 << " with empty dest " << dest.addr << dendl;
+    m->put();
+    return -EINVAL;
+  }
+
+  Mutex::Locker guard(lock);
+  Pipe *existing = _lookup_pipe(dest.addr);
+  PipeConnection *con = existing ? existing->connection_state.get() : NULL;
+  submit_message(m, con, dest.addr, dest.name.type(), true);
+  return 0;
+}
+
+/**
+ * Send m over an existing connection.
+ *
+ * Fills in the source name and, when unset, the default priority, then
+ * submits the message without holding the messenger lock.
+ *
+ * @return always 0
+ */
+int SimpleMessenger::_send_message(Message *m, Connection *con)
+{
+  //set envelope
+  m->get_header().src = get_myname();
+
+  if (!m->get_priority())
+    m->set_priority(get_default_send_priority());
+
+  ldout(cct,1) << "--> " << con->get_peer_addr()
+               << " -- " << *m
+               << " -- ?+" << m->get_data().length()
+               << " " << m << " con " << con
+               << dendl;
+
+  PipeConnection *pipe_con = static_cast<PipeConnection*>(con);
+  submit_message(m, pipe_con, con->get_peer_addr(), con->get_peer_type(),
+                 false);
+  return 0;
+}
+
+/**
+ * If my_addr has a blank IP, take the address portion from the first entry
+ * of addrs while keeping the current port, publish the result via
+ * set_addrs() and refresh the local connection.  Otherwise only log a no-op.
+ *
+ * @return true if the address was filled in, false if nothing changed
+ */
+bool SimpleMessenger::set_addr_unknowns(const entity_addrvec_t &addrs)
+{
+  auto addr = addrs.front();
+  ceph_assert(my_addr == my_addrs->front());
+
+  const bool filled = my_addr.is_blank_ip();
+  if (!filled) {
+    ldout(cct,1) << __func__ << " " << addr << " no-op" << dendl;
+  } else {
+    ldout(cct,1) << __func__ << " " << addr << dendl;
+    entity_addr_t merged = my_addr;
+    const int kept_port = merged.get_port();
+    merged.u = addr.u;             // adopt the learned address ...
+    merged.set_port(kept_port);    // ... but keep our own port
+    set_addrs(entity_addrvec_t(merged));
+    init_local_connection();
+  }
+
+  ceph_assert(my_addr == my_addrs->front());
+  return filled;
+}
+
+// Keep my_addr in sync with the first entry of av, then let the base
+// Messenger store the full address vector.
+void SimpleMessenger::set_myaddrs(const entity_addrvec_t &av)
+{
+  const auto primary = av.front();
+  my_addr = primary;
+  Messenger::set_myaddrs(av);
+}
+
+// Adopt av as this messenger's addresses: every entry is stamped with our
+// nonce before being stored, and the local connection is re-initialized.
+void SimpleMessenger::set_addrs(const entity_addrvec_t &av)
+{
+  entity_addrvec_t stamped = av;
+  for (size_t i = 0; i < stamped.v.size(); ++i) {
+    stamped.v[i].set_nonce(nonce);
+  }
+  set_myaddrs(stamped);
+  init_local_connection();
+}
+
+/**
+ * Pick the protocol version to use with a peer of type peer_type.
+ *
+ * Peers of our own type use cluster_protocol.  Otherwise the protocol is
+ * chosen from an entity type: the peer's type when we are connecting, our
+ * own type when we are accepting.
+ *
+ * @return the protocol constant, or 0 when the deciding type is not one of
+ *         OSD, MDS or MON
+ */
+int SimpleMessenger::get_proto_version(int peer_type, bool connect)
+{
+  const int my_type = my_name.type();
+
+  // internal
+  if (peer_type == my_type)
+    return cluster_protocol;
+
+  // public: the entity type that decides the protocol
+  const int deciding_type = connect ? peer_type : my_type;
+  switch (deciding_type) {
+  case CEPH_ENTITY_TYPE_OSD: return CEPH_OSDC_PROTOCOL;
+  case CEPH_ENTITY_TYPE_MDS: return CEPH_MDSC_PROTOCOL;
+  case CEPH_ENTITY_TYPE_MON: return CEPH_MONC_PROTOCOL;
+  default: return 0;
+  }
+}
+
+
+
+
+
+
+
+/********************************************
+ * SimpleMessenger
+ */
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+
+// Body of the reaper thread: reap queued pipes, then sleep on reaper_cond,
+// until reaper_stop is set.
+void SimpleMessenger::reaper_entry()
+{
+  ldout(cct,10) << "reaper_entry start" << dendl;
+  {
+    Mutex::Locker guard(lock);
+    while (!reaper_stop) {
+      reaper();  // may drop and retake the lock
+      if (!reaper_stop)
+        reaper_cond.Wait(lock);
+    }
+  }
+  ldout(cct,10) << "reaper_entry done" << dendl;
+}
+
+/*
+ * Tear down every pipe queued in pipe_reap_queue: discard its outgoing
+ * queue, unregister it, remove it from pipes, join its threads, close its
+ * socket and drop a reference.
+ *
+ * note: assumes lock is held; the lock is dropped and retaken around
+ * each join().
+ */
+void SimpleMessenger::reaper()
+{
+  ldout(cct,10) << "reaper" << dendl;
+  ceph_assert(lock.is_locked());
+
+  while (!pipe_reap_queue.empty()) {
+    Pipe *p = pipe_reap_queue.front();
+    pipe_reap_queue.pop_front();
+    ldout(cct,10) << "reaper reaping pipe " << p << " " <<
+      p->get_peer_addr() << dendl;
+    p->pipe_lock.Lock();
+    p->discard_out_queue();
+    if (p->connection_state) {
+      // mark_down, mark_down_all, or fault() should have done this,
+      // or accept() may have switched the Connection to a different
+      // Pipe... but make sure!
+      bool cleared = p->connection_state->clear_pipe(p);
+      ceph_assert(!cleared);
+    }
+    p->pipe_lock.Unlock();
+    p->unregister_pipe();
+    ceph_assert(pipes.count(p));
+    pipes.erase(p);
+
+    // drop msgr lock while joining thread; the delay thread could be
+    // trying to fast dispatch, preventing it from joining without
+    // blocking and deadlocking.
+    lock.Unlock();
+    p->join();
+    lock.Lock();
+
+    if (p->sd >= 0)
+      ::close(p->sd);
+    ldout(cct,10) << "reaper reaped pipe " << p << " " << p->get_peer_addr() << dendl;
+    p->put();
+    ldout(cct,10) << "reaper deleted pipe " << p << dendl;
+  }
+  ldout(cct,10) << "reaper done" << dendl;
+}
+
+// Queue pipe for teardown by the reaper and wake the reaper up.
+void SimpleMessenger::queue_reap(Pipe *pipe)
+{
+  ldout(cct,10) << "queue_reap " << pipe << dendl;
+  Mutex::Locker guard(lock);
+  pipe_reap_queue.push_back(pipe);
+  reaper_cond.Signal();
+}
+
+// True iff con is non-null, has a Pipe attached, and that Pipe reports
+// itself connected.
+bool SimpleMessenger::is_connected(Connection *con)
+{
+  if (!con)
+    return false;
+
+  Pipe *p = static_cast<PipeConnection*>(con)->get_pipe();
+  if (!p)
+    return false;
+
+  ceph_assert(p->msgr == this);
+  const bool connected = p->is_connected();
+  p->put();   // release the reference taken by get_pipe()
+  return connected;
+}
+
+/**
+ * Bind the accepter to bind_addr.  Must happen before start().
+ *
+ * @return the accepter's bind result (did_bind is set when it is >= 0),
+ *         or -1 if the messenger was already started
+ */
+int SimpleMessenger::bind(const entity_addr_t &bind_addr)
+{
+  {
+    Mutex::Locker guard(lock);
+    if (started) {
+      ldout(cct,10) << "rank.bind already started" << dendl;
+      return -1;
+    }
+    ldout(cct,10) << "rank.bind " << bind_addr << dendl;
+  }
+
+  // bind to a socket
+  set<int> avoid_ports;
+  const int r = accepter.bind(bind_addr, avoid_ports);
+  if (r >= 0)
+    did_bind = true;
+  return r;
+}
+
+// Stop the accepter, mark every connection down and rebind the accepter
+// while avoiding avoid_ports.  Requires a previous successful bind.
+// Returns the accepter's rebind result.
+int SimpleMessenger::rebind(const set<int>& avoid_ports)
+{
+  ldout(cct,1) << "rebind avoid " << avoid_ports << dendl;
+  ceph_assert(did_bind);
+
+  accepter.stop();
+  mark_down_all();
+
+  const int r = accepter.rebind(avoid_ports);
+  return r;
+}
+
+
+/**
+ * Record bind_addr as this messenger's address for client use.
+ *
+ * Does nothing unless ms_bind_before_connect is enabled.  If the messenger
+ * is already bound, the bound addresses must equal bind_addr.
+ *
+ * @return 0 on success or no-op, -1 if the messenger was already started
+ */
+int SimpleMessenger::client_bind(const entity_addr_t &bind_addr)
+{
+  if (!cct->_conf->ms_bind_before_connect)
+    return 0;
+
+  Mutex::Locker guard(lock);
+  const entity_addrvec_t wanted(bind_addr);
+  if (did_bind) {
+    ceph_assert(*my_addrs == wanted);
+    return 0;
+  }
+  if (started) {
+    ldout(cct,10) << "rank.bind already started" << dendl;
+    return -1;
+  }
+  ldout(cct,10) << "rank.bind " << bind_addr << dendl;
+
+  set_myaddrs(wanted);
+  return 0;
+}
+
+
+/**
+ * Mark the messenger started and launch the reaper thread.
+ *
+ * When the messenger was never bound, my_addr gets our nonce and the local
+ * connection is re-initialized.  Must not be called on a started messenger.
+ *
+ * @return always 0
+ */
+int SimpleMessenger::start()
+{
+  {
+    Mutex::Locker guard(lock);
+    ldout(cct,1) << "messenger.start" << dendl;
+
+    // register at least one entity, first!
+    ceph_assert(my_name.type() >= 0);
+
+    ceph_assert(!started);
+    started = true;
+    stopped = false;
+
+    if (!did_bind) {
+      my_addr.nonce = nonce;
+      init_local_connection();
+    }
+  }
+
+  reaper_started = true;
+  reaper_thread.create("ms_reaper");
+  return 0;
+}
+
+// Create a Pipe in STATE_ACCEPTING for the socket sd, start its reader
+// thread and track it in pipes and accepting_pipes.
+Pipe *SimpleMessenger::add_accept_pipe(int sd)
+{
+  Mutex::Locker guard(lock);
+
+  Pipe *accepted = new Pipe(this, Pipe::STATE_ACCEPTING, NULL);
+  accepted->sd = sd;
+  {
+    Mutex::Locker pipe_guard(accepted->pipe_lock);
+    accepted->start_reader();
+  }
+
+  pipes.insert(accepted);
+  accepting_pipes.insert(accepted);
+  return accepted;
+}
+
+/* connect_rank
+ * Create, start and register a new outgoing Pipe to addr.
+ *
+ * The pipe is created in STATE_CONNECTING for con, gets the peer type,
+ * peer address and policy for that type, has its writer thread started
+ * and, when first is non-null, has that message queued on it.
+ *
+ * NOTE: assumes messenger.lock held.
+ */
+Pipe *SimpleMessenger::connect_rank(const entity_addr_t& addr,
+                                    int type,
+                                    PipeConnection *con,
+                                    Message *first)
+{
+  ceph_assert(lock.is_locked());
+  ceph_assert(addr != my_addr);
+
+  ldout(cct,10) << "connect_rank to " << addr << ", creating pipe and registering" << dendl;
+
+  // create pipe
+  Pipe *created = new Pipe(this, Pipe::STATE_CONNECTING, con);
+  {
+    Mutex::Locker pipe_guard(created->pipe_lock);
+    created->set_peer_type(type);
+    created->set_peer_addr(addr);
+    created->policy = get_policy(type);
+    created->start_writer();
+    if (first)
+      created->_send(first);
+  }
+  created->register_pipe();
+  pipes.insert(created);
+
+  return created;
+}
+
+
+
+
+
+
+/**
+ * Get a connection to the peer at addrs, creating a pipe if none exists.
+ *
+ * Our own address yields the loopback connection.  Otherwise an existing
+ * pipe for the legacy address is reused or a new one is created; if the
+ * pipe has no connection_state, the lookup is retried.
+ */
+ConnectionRef SimpleMessenger::connect_to(int type,
+                                          const entity_addrvec_t& addrs)
+{
+  Mutex::Locker msgr_guard(lock);
+
+  // local
+  if (my_addr == addrs.front())
+    return local_connection;
+
+  // remote
+  for (;;) {
+    Pipe *pipe = _lookup_pipe(addrs.legacy_addr());
+    if (!pipe) {
+      pipe = connect_rank(addrs.legacy_addr(), type, NULL, NULL);
+      ldout(cct, 10) << "get_connection " << addrs << " new " << pipe << dendl;
+    } else {
+      ldout(cct, 10) << "get_connection " << addrs << " existing " << pipe << dendl;
+    }
+
+    Mutex::Locker pipe_guard(pipe->pipe_lock);
+    if (pipe->connection_state)
+      return pipe->connection_state;
+    // we failed too quickly!  retry.  FIXME.
+  }
+}
+
+// Return the connection this messenger uses to deliver messages to itself.
+ConnectionRef SimpleMessenger::get_loopback_connection()
+{
+  ConnectionRef loopback = local_connection;
+  return loopback;
+}
+
+/**
+ * Route m to its destination.  The message reference is always consumed:
+ * it is queued on a pipe, delivered locally, or dropped with put().
+ *
+ * Order of attempts:
+ *  1. con's current Pipe, if it has one that is not closed;
+ *  2. local delivery when dest_addr is our own address;
+ *  3. a brand-new pipe via connect_rank(), unless the policy for dest_type
+ *     marks us as a server, in which case the message is dropped.
+ *
+ * @param m              message to send
+ * @param con            connection to use, may be null
+ * @param dest_addr      destination address
+ * @param dest_type      destination entity type (selects the policy)
+ * @param already_locked true when the caller already holds the messenger
+ *                       lock; otherwise the lock is taken before creating
+ *                       a new pipe
+ */
+void SimpleMessenger::submit_message(Message *m, PipeConnection *con,
+                                     const entity_addr_t& dest_addr, int dest_type,
+                                     bool already_locked)
+{
+  m->trace.event("simple submitting message");
+  if (cct->_conf->ms_dump_on_send) {
+    m->encode(-1, true);
+    ldout(cct, 0) << "submit_message " << *m << "\n";
+    m->get_payload().hexdump(*_dout);
+    if (m->get_data().length() > 0) {
+      *_dout << " data:\n";
+      m->get_data().hexdump(*_dout);
+    }
+    *_dout << dendl;
+    m->clear_payload();
+  }
+
+  // existing connection?
+  if (con) {
+    Pipe *pipe = NULL;
+    bool ok = con->try_get_pipe(&pipe);
+    if (!ok) {
+      ldout(cct,0) << "submit_message " << *m << " remote, " << dest_addr
+                   << ", failed lossy con, dropping message " << m << dendl;
+      m->put();
+      return;
+    }
+    while (pipe && ok) {
+      // we loop in case of a racing reconnect, either from us or them
+      pipe->pipe_lock.Lock(); // can't use a Locker because of the Pipe ref
+      if (pipe->state != Pipe::STATE_CLOSED) {
+        ldout(cct,20) << "submit_message " << *m << " remote, " << dest_addr << ", have pipe." << dendl;
+        pipe->_send(m);
+        pipe->pipe_lock.Unlock();
+        pipe->put();
+        return;
+      }
+
+      // this pipe is closed; see whether the connection moved to another one
+      Pipe *current_pipe;
+      ok = con->try_get_pipe(&current_pipe);
+      pipe->pipe_lock.Unlock();
+      if (current_pipe == pipe) {
+        ldout(cct,20) << "submit_message " << *m << " remote, " << dest_addr
+                      << ", had pipe " << pipe << ", but it closed." << dendl;
+        pipe->put();
+        current_pipe->put();
+        m->put();
+        return;
+      }
+      pipe->put();
+      pipe = current_pipe;
+    }
+  }
+
+  // local?
+  if (my_addr == dest_addr) {
+    ldout(cct,20) << "submit_message " << *m << " local" << dendl;
+    m->set_connection(local_connection.get());
+    dispatch_queue.local_delivery(m, m->get_priority());
+    return;
+  }
+
+  // remote, no existing pipe.
+  const Policy& policy = get_policy(dest_type);
+  if (policy.server) {
+    ldout(cct,20) << "submit_message " << *m << " remote, " << dest_addr << ", lossy server for target type "
+                  << ceph_entity_type_name(dest_type) << ", no session, dropping." << dendl;
+    m->put();
+    return;
+  }
+
+  ldout(cct,20) << "submit_message " << *m << " remote, " << dest_addr << ", new pipe." << dendl;
+  if (already_locked) {
+    connect_rank(dest_addr, dest_type, con, m);
+    return;
+  }
+
+  /** We couldn't handle the Message without reference to global data, so
+   *  grab the lock and do it again. If we got here, we know it's a non-lossy
+   *  Connection, so we can use our existing pointer without doing another lookup. */
+  Mutex::Locker l(lock);
+  submit_message(m, con, dest_addr, dest_type, true);
+}
+
+/**
+ * Ask the Pipe behind con to send a keepalive.
+ *
+ * @return 0 if the request was queued on the pipe, -EPIPE if con has no pipe
+ */
+int SimpleMessenger::send_keepalive(Connection *con)
+{
+  Pipe *pipe = static_cast<PipeConnection*>(con)->get_pipe();
+  if (!pipe) {
+    ldout(cct,0) << "send_keepalive con " << con << ", no pipe." << dendl;
+    return -EPIPE;
+  }
+
+  ldout(cct,20) << "send_keepalive con " << con << ", have pipe." << dendl;
+  ceph_assert(pipe->msgr == this);
+  {
+    Mutex::Locker pipe_guard(pipe->pipe_lock);
+    pipe->_send_keepalive();
+  }
+  pipe->put();   // release the reference taken by get_pipe()
+  return 0;
+}
+
+// Block until stop_cond is signalled (unless already stopped; returns at once
+// if never started), then stop the accepter, dispatch queue, reaper and pipes.
+void SimpleMessenger::wait()
+{
+  lock.Lock();
+  if (!started) {
+    lock.Unlock();
+    return;
+  }
+  if (!stopped)
+    stop_cond.Wait(lock);
+  // NOTE(review): a single Wait(), not a predicate loop -- confirm spurious wakeups cannot occur here
+  lock.Unlock();
+
+  // done!  clean up.
+  if (did_bind) {
+    ldout(cct,20) << "wait: stopping accepter thread" << dendl;
+    accepter.stop();
+    did_bind = false;
+    ldout(cct,20) << "wait: stopped accepter thread" << dendl;
+  }
+  // shut the dispatch queue down; wait for it and discard local deliveries only if it was started
+  dispatch_queue.shutdown();
+  if (dispatch_queue.is_started()) {
+    ldout(cct,10) << "wait: waiting for dispatch queue" << dendl;
+    dispatch_queue.wait();
+    dispatch_queue.discard_local();
+    ldout(cct,10) << "wait: dispatch queue is stopped" << dendl;
+  }
+  // set reaper_stop and signal reaper_cond under lock, then join the reaper thread
+  if (reaper_started) {
+    ldout(cct,20) << "wait: stopping reaper thread" << dendl;
+    lock.Lock();
+    reaper_cond.Signal();
+    reaper_stop = true;
+    lock.Unlock();
+    reaper_thread.join();
+    reaper_started = false;
+    ldout(cct,20) << "wait: stopped reaper thread" << dendl;
+  }
+
+  // close+reap all pipes
+  lock.Lock();
+  {
+    ldout(cct,10) << "wait: closing pipes" << dendl;
+    // presumably unregister_pipe() removes p from rank_pipe, which is what ends this loop
+    while (!rank_pipe.empty()) {
+      Pipe *p = rank_pipe.begin()->second;
+      p->unregister_pipe();
+      p->pipe_lock.Lock();
+      p->stop_and_wait();
+      // don't generate an event here; we're shutting down anyway.
+      PipeConnectionRef con = p->connection_state;
+      if (con)
+	con->clear_pipe(p);
+      p->pipe_lock.Unlock();
+    }
+    // reap inline here (any reaper thread was joined above) until no pipes remain
+    reaper();
+    ldout(cct,10) << "wait: waiting for pipes " << pipes << " to close" << dendl;
+    while (!pipes.empty()) {
+      reaper_cond.Wait(lock);
+      reaper();
+    }
+  }
+  lock.Unlock();
+
+  ldout(cct,10) << "wait: done." << dendl;
+  ldout(cct,1) << "shutdown complete." << dendl;
+  started = false;
+}
+
+
+// Stop every pipe we track (accepting and registered) and queue a reset
+// event for each Connection whose pipe was cleared here.
+void SimpleMessenger::mark_down_all()
+{
+  ldout(cct,1) << "mark_down_all" << dendl;
+  Mutex::Locker l(lock);
+  for (Pipe *ap : accepting_pipes) {
+    ldout(cct,5) << "mark_down_all accepting_pipe " << ap << dendl;
+    ap->pipe_lock.Lock();
+    ap->stop();
+    PipeConnectionRef acon = ap->connection_state;
+    if (acon && acon->clear_pipe(ap))
+      dispatch_queue.queue_reset(acon.get());
+    ap->pipe_lock.Unlock();
+  }
+  accepting_pipes.clear();
+  // drain rank_pipe from the front; each entry is erased before its pipe is stopped
+  while (!rank_pipe.empty()) {
+    auto first = rank_pipe.begin();
+    Pipe *rp = first->second;
+    ldout(cct,5) << "mark_down_all " << first->first << " " << rp << dendl;
+    rank_pipe.erase(first);
+    rp->unregister_pipe();
+    rp->pipe_lock.Lock();
+    rp->stop();
+    PipeConnectionRef rcon = rp->connection_state;
+    if (rcon && rcon->clear_pipe(rp))
+      dispatch_queue.queue_reset(rcon.get());
+    rp->pipe_lock.Unlock();
+  }
+}
+
+// Address-based mark_down: stop the pipe that _lookup_pipe() finds for addr.
+void SimpleMessenger::mark_down(const entity_addr_t& addr)
+{
+  Mutex::Locker l(lock);
+  Pipe *target = _lookup_pipe(addr);
+  if (!target) {
+    ldout(cct,1) << "mark_down " << addr << " -- pipe dne" << dendl;
+    return;
+  }
+  ldout(cct,1) << "mark_down " << addr << " -- " << target << dendl;
+  target->unregister_pipe();
+  target->pipe_lock.Lock();
+  target->stop();
+  if (target->connection_state) {
+    // A reset event is queued for the caller even though the caller
+    // requested the mark_down, because this is the addr-based (not
+    // the Connection*-based) interface.
+    PipeConnectionRef con = target->connection_state;
+    if (con && con->clear_pipe(target))
+      dispatch_queue.queue_reset(con.get());
+  }
+  target->pipe_lock.Unlock();
+}
+
+// Connection-based mark_down: stop the pipe currently attached to con, if
+// any.  A NULL con is ignored.
+void SimpleMessenger::mark_down(Connection *con)
+{
+  if (!con)
+    return;
+  Mutex::Locker l(lock);
+  Pipe *p = static_cast<Pipe *>(static_cast<PipeConnection*>(con)->get_pipe());
+  if (!p) {
+    ldout(cct,1) << "mark_down " << con << " -- pipe dne" << dendl;
+    return;
+  }
+  ldout(cct,1) << "mark_down " << con << " -- " << p << dendl;
+  ceph_assert(p->msgr == this);
+  p->unregister_pipe();
+  p->pipe_lock.Lock();
+  p->stop();
+  // the caller asked for this, so detach the Connection without
+  // queueing a reset event.
+  if (p->connection_state)
+    p->connection_state->clear_pipe(p);
+  p->pipe_lock.Unlock();
+  p->put();
+}
+
+// Mark the pipe currently attached to con (if any) lossy: policy.lossy = true.
+void SimpleMessenger::mark_disposable(Connection *con)
+{
+  Mutex::Locker l(lock);
+  Pipe *p = static_cast<Pipe *>(static_cast<PipeConnection*>(con)->get_pipe());
+  if (!p) {
+    ldout(cct,1) << "mark_disposable " << con << " -- pipe dne" << dendl;
+    return;
+  }
+  ldout(cct,1) << "mark_disposable " << con << " -- " << p << dendl;
+  ceph_assert(p->msgr == this);
+  p->pipe_lock.Lock();
+  p->policy.lossy = true;
+  p->pipe_lock.Unlock();
+  p->put();
+}
+
+// Adopt the address a peer reports for us, if we still need one.
+void SimpleMessenger::learned_addr(const entity_addr_t &peer_addr_for_me)
+{
+  // Careful: several threads may arrive here at once, and my_addr is read
+  // elsewhere without any lock.
+  //
+  // need_addr only ever flips true -> false, and only with the mutex held,
+  // so once it reads false there is no reason to take the mutex at all.
+  if (!need_addr)
+    return;
+
+  Mutex::Locker l(lock);
+  // re-check now that the mutex is held
+  if (!need_addr || !my_addr.is_blank_ip())
+    return;
+  entity_addr_t learned = peer_addr_for_me;
+  if (did_bind) {
+    // bound: legacy type, keep the port my_addr already carries
+    learned.set_type(entity_addr_t::TYPE_LEGACY);
+    learned.set_port(my_addr.get_port());
+  } else {
+    learned.set_type(entity_addr_t::TYPE_ANY);
+    learned.set_port(0);
+  }
+  learned.set_nonce(my_addr.get_nonce());
+  ANNOTATE_BENIGN_RACE_SIZED(&my_addr, sizeof(my_addr), "SimpleMessenger learned addr");
+  set_myaddrs(entity_addrvec_t(learned));
+  ldout(cct,1) << "learned my addr " << my_addr << dendl;
+  need_addr = false;
+  init_local_connection();
+}
+
+void SimpleMessenger::init_local_connection()
+{
+  local_connection->peer_addrs = *my_addrs;  // the loopback connection's peer is ourselves
+  local_connection->peer_type = my_name.type();
+  local_connection->set_features(CEPH_FEATURES_ALL);
+  ms_deliver_handle_fast_connect(local_connection.get());  // hand the refreshed loopback connection to the fast-connect hook
+}
diff --git a/src/msg/simple/SimpleMessenger.h b/src/msg/simple/SimpleMessenger.h
new file mode 100644
index 00000000..b1aad539
--- /dev/null
+++ b/src/msg/simple/SimpleMessenger.h
@@ -0,0 +1,414 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+#ifndef CEPH_SIMPLEMESSENGER_H
+#define CEPH_SIMPLEMESSENGER_H
+
+#include <list>
+#include <map>
+
+#include "include/types.h"
+#include "include/xlist.h"
+
+#include "include/unordered_map.h"
+#include "include/unordered_set.h"
+
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+#include "common/Throttle.h"
+
+#include "include/spinlock.h"
+
+#include "msg/SimplePolicyMessenger.h"
+#include "msg/Message.h"
+#include "include/ceph_assert.h"
+
+#include "msg/DispatchQueue.h"
+#include "Pipe.h"
+#include "Accepter.h"
+
+/*
+ * This class handles transmission and reception of messages. Generally
+ * speaking, there are several major components:
+ *
+ * - Connection
+ *    Each logical session is associated with a Connection.
+ * - Pipe
+ *    Each network connection is handled through a pipe, which handles
+ *    the input and output of each message.  There is normally a 1:1
+ *    relationship between Pipe and Connection, but logical sessions may
+ *    get handed off between Pipes when sockets reconnect or during
+ *    connection races.
+ * - IncomingQueue
+ *    Incoming messages are associated with an IncomingQueue, and there
+ *    is one such queue associated with each Pipe.
+ * - DispatchQueue
+ *    IncomingQueues get queued in the DispatchQueue, which is responsible
+ *    for doing a round-robin sweep and processing them via a worker thread.
+ * - SimpleMessenger
+ *    It's the exterior class passed to the external message handler and
+ *    most of the API details.
+ *
+ * Lock ordering:
+ *
+ *   SimpleMessenger::lock
+ *       Pipe::pipe_lock
+ *           DispatchQueue::lock
+ *               IncomingQueue::lock
+ */
+
+class SimpleMessenger : public SimplePolicyMessenger {
+  // First we have the public Messenger interface implementation...
+public:
+  /**
+   * Initialize the SimpleMessenger!
+   *
+   * @param cct The CephContext to use
+   * @param name The name to assign ourselves
+   * @param _nonce A unique ID to use for this SimpleMessenger. It should not
+   * be a value that will be repeated if the daemon restarts.
+   * @param mname NOTE(review): undocumented; the old text described a 'features' argument that is not in the signature
+   */
+  SimpleMessenger(CephContext *cct, entity_name_t name,
+		  string mname, uint64_t _nonce);
+
+  /**
+   * Destroy the SimpleMessenger. Pretty simple since all the work is done
+   * elsewhere.
+   */
+  ~SimpleMessenger() override;
+
+  /** @defgroup Accessors
+   * @{
+   */
+  bool set_addr_unknowns(const entity_addrvec_t& addr) override;
+  void set_addrs(const entity_addrvec_t &addr) override;
+  void set_myaddrs(const entity_addrvec_t& a) override;
+
+  int get_dispatch_queue_len() override {
+    return dispatch_queue.get_queue_len();
+  }
+
+  double get_dispatch_queue_max_age(utime_t now) override {
+    return dispatch_queue.get_max_age(now);
+  }
+  /** @} Accessors */
+
+  /**
+   * @defgroup Configuration functions
+   * @{
+   */
+  void set_cluster_protocol(int p) override {
+    ceph_assert(!started && !did_bind);
+    cluster_protocol = p;
+  }
+
+  int bind(const entity_addr_t& bind_addr) override;
+  int rebind(const set<int>& avoid_ports) override;
+  int client_bind(const entity_addr_t& bind_addr) override;
+
+  /** @} Configuration functions */
+
+  /**
+   * @defgroup Startup/Shutdown
+   * @{
+   */
+  int start() override;
+  void wait() override;
+  int shutdown() override;
+
+  /** @} // Startup/Shutdown */
+
+  /**
+   * @defgroup Messaging
+   * @{
+   */
+  int send_to(
+    Message *m,
+    int type,
+    const entity_addrvec_t& addr) override {
+    // temporary: wraps the legacy address of addr in an entity_inst_t
+    return _send_message(m, entity_inst_t(entity_name_t(type, -1),
+					  addr.legacy_addr()));
+  }
+
+  int send_message(Message *m, Connection *con) {
+    return _send_message(m, con);
+  }
+
+  /** @} // Messaging */
+
+  /**
+   * @defgroup Connection Management
+   * @{
+   */
+  ConnectionRef connect_to(int type, const entity_addrvec_t& addrs) override;
+  ConnectionRef get_loopback_connection() override;
+  int send_keepalive(Connection *con);
+  void mark_down(const entity_addr_t& addr) override;
+  void mark_down(Connection *con);
+  void mark_disposable(Connection *con);
+  void mark_down_all() override;
+  /** @} // Connection Management */
+protected:
+  /**
+   * @defgroup Messenger Interfaces
+   * @{
+   */
+  /**
+   * Start up the DispatchQueue thread once we have somebody to dispatch to.
+   */
+  void ready() override;
+  /** @} // Messenger Interfaces */
+private:
+  /**
+   * @defgroup Inner classes
+   * @{
+   */
+
+public:
+  Accepter accepter;
+  DispatchQueue dispatch_queue;
+
+  friend class Accepter;
+
+  /**
+   * Register a new pipe for accept
+   *
+   * @param sd socket
+   */
+  Pipe *add_accept_pipe(int sd);
+
+private:
+
+  /**
+   * A thread used to tear down Pipes when they're complete.
+   */
+  class ReaperThread : public Thread {
+    SimpleMessenger *msgr;
+  public:
+    explicit ReaperThread(SimpleMessenger *m) : msgr(m) {}
+    void *entry() override {
+      msgr->reaper_entry();
+      return 0;
+    }
+  } reaper_thread;
+
+  /**
+   * @} // Inner classes
+   */
+
+  /**
+   * @defgroup Utility functions
+   * @{
+   */
+
+  /**
+   * Create a Pipe associated with the given entity (of the given type).
+   * Initiate the connection. (This function returning does not guarantee
+   * connection success.)
+   *
+   * @param addr The address of the entity to connect to.
+   * @param type The peer type of the entity at the address.
+   * @param con An existing Connection to associate with the new Pipe. If
+   * NULL, it creates a new Connection.
+   * @param first an initial message to queue on the new pipe
+   *
+   * @return a pointer to the newly-created Pipe. Caller does not own a
+   * reference; take one if you need it.
+   */
+  Pipe *connect_rank(const entity_addr_t& addr, int type, PipeConnection *con,
+		     Message *first);
+  /**
+   * Send a message, lazily or not.
+   * This just glues send_message together and passes
+   * the input on to submit_message.
+   */
+  int _send_message(Message *m, const entity_inst_t& dest);
+  /**
+   * Same as above, but for the Connection-based variants.
+   */
+  int _send_message(Message *m, Connection *con);
+  /**
+   * Queue up a Message for delivery to the entity specified
+   * by addr and dest_type.
+   * submit_message() is responsible for creating
+   * new Pipes (and closing old ones) as necessary.
+   *
+   * @param m The Message to queue up. This function eats a reference.
+   * @param con The existing Connection to use, or NULL if you don't know of one.
+   * @param addr The address to send the Message to.
+   * @param dest_type The peer type of the address we're sending to
+   * (if no pipe exists and the policy for that type is server, m is dropped).
+   * @param already_locked If false, submit_message() will acquire the
+   * SimpleMessenger lock before accessing shared data structures; otherwise
+   * it will assume the lock is held. NOTE: if you are making a request
+   * without locking, you MUST have filled in the con with a valid pointer.
+   */
+  void submit_message(Message *m, PipeConnection *con,
+		      const entity_addr_t& addr, int dest_type,
+		      bool already_locked);
+  /**
+   * Look through the pipes in the pipe_reap_queue and tear them down.
+   */
+  void reaper();
+  /**
+   * @} // Utility functions
+   */
+
+  // SimpleMessenger stuff
+  /// approximately unique ID set by the Constructor for use in entity_addr_t
+  uint64_t nonce;
+  /// overall lock used for SimpleMessenger data structures
+  Mutex lock;
+  /// true, specifying we haven't learned our addr; set false when we find it.
+  // maybe this should be protected by the lock?
+  bool need_addr;
+
+public:
+  bool get_need_addr() const { return need_addr; }
+
+private:
+  /**
+   *  False until the SimpleMessenger binds to a specific address; set false
+   *  again once the accepter has been stopped (wait() clears it right after
+   *  accepter.stop()). Not lock-protected: the only writers should not race.
+   */
+  bool did_bind;
+  /// counter for the global seq our connection protocol uses
+  __u32 global_seq;
+  /// lock to protect the global_seq
+  ceph::spinlock global_seq_lock;
+
+  entity_addr_t my_addr;
+
+  /**
+   * hash map of addresses to Pipes
+   *
+   * NOTE: a Pipe* with state CLOSED may still be in the map but is considered
+   * invalid and can be replaced by anyone holding the msgr lock
+   */
+  ceph::unordered_map<entity_addr_t, Pipe*> rank_pipe;
+  /**
+   * set of pipes that are in the process of accepting
+   *
+   * These are not yet in the rank_pipe map.
+   */
+  set<Pipe*> accepting_pipes;
+  /// a set of all the Pipes we have which are somehow active
+  set<Pipe*>      pipes;
+  /// a list of Pipes we want to tear down
+  list<Pipe*>     pipe_reap_queue;
+
+  /// internal cluster protocol version, if any, for talking to entities of the same type.
+  int cluster_protocol;
+
+  Cond  stop_cond;      ///< wait() sleeps on this while !stopped
+  bool stopped = true;
+
+  bool reaper_started, reaper_stop;  ///< reaper thread state; wait() sets reaper_stop under lock
+  Cond reaper_cond;
+
+  /// Documented as slept on by wait() / signaled by dispatch_entry(). NOTE(review): wait() waits on stop_cond -- confirm this is still used
+  Cond  wait_cond;
+
+  friend class Pipe;
+
+  Pipe *_lookup_pipe(const entity_addr_t& k) {
+    ceph::unordered_map<entity_addr_t, Pipe*>::iterator p = rank_pipe.find(k);
+    if (p == rank_pipe.end())
+      return NULL;
+    // see lock cribbing in Pipe::fault()
+    if (p->second->state_closed)
+      return NULL;
+    return p->second;
+  }
+
+public:
+
+  int timeout;
+
+  /// con used for sending messages to ourselves
+  ConnectionRef local_connection;
+
+  /**
+   * @defgroup SimpleMessenger internals
+   * @{
+   */
+
+  /**
+   * Increment the global sequence for this SimpleMessenger and return it.
+   * This is for the connect protocol, although it doesn't hurt if somebody
+   * else calls it.
+   *
+   * @return a global sequence ID that nobody else has seen.
+   */
+  __u32 get_global_seq(__u32 old=0) {
+    std::lock_guard<decltype(global_seq_lock)> lg(global_seq_lock);
+
+    if (old > global_seq)
+      global_seq = old;
+    __u32 ret = ++global_seq;
+
+    return ret;
+  }
+  /**
+   * Get the protocol version we support for the given peer type: either
+   * a peer protocol (if it matches our own), the protocol version for the
+   * peer (if we're connecting), or our protocol version (if we're accepting).
+   */
+  int get_proto_version(int peer_type, bool connect);
+
+  /**
+   * Fill in the features, address and peer type for the local connection, which
+   * is used for delivering messages back to ourself.
+   */
+  void init_local_connection();
+  /**
+   * Tell the SimpleMessenger its full IP address.
+   *
+   * This is used by Pipes when connecting to other endpoints, and
+   * probably shouldn't be called by anybody else.
+   */
+  void learned_addr(const entity_addr_t& peer_addr_for_me);
+
+  /**
+   * This function is used by the reaper thread. As long as nobody
+   * has set reaper_stop, it calls the reaper function, then
+   * waits to be signaled when it needs to reap again (or when it needs
+   * to stop).
+   */
+  void reaper_entry();
+  /**
+   * Add a pipe to the pipe_reap_queue, to be torn down on
+   * the next call to reaper().
+   * It should really only be the Pipe calling this, in our current
+   * implementation.
+   *
+   * @param pipe A Pipe which has stopped its threads and is
+   * ready to be torn down.
+   */
+  void queue_reap(Pipe *pipe);
+
+  /**
+   * Report whether this connection is ready to send
+   */
+  bool is_connected(Connection *con);
+  /**
+   * @} // SimpleMessenger Internals
+   */
+} ;
+
+#endif /* CEPH_SIMPLEMESSENGER_H */
diff --git a/src/msg/xio/XioConnection.cc b/src/msg/xio/XioConnection.cc
new file mode 100644
index 00000000..4bfab39b
--- /dev/null
+++ b/src/msg/xio/XioConnection.cc
@@ -0,0 +1,858 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Portions Copyright (C) 2013 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <mutex>
+#include "XioMsg.h"
+#include "XioConnection.h"
+#include "XioMessenger.h"
+#include "messages/MDataPing.h"
+#include "msg/msg_types.h"
+#include "auth/none/AuthNoneProtocol.h" // XXX
+#include "include/ceph_assert.h"
+#include "common/dout.h"
+
+extern struct xio_mempool *xio_msgr_mpool;
+extern struct xio_mempool *xio_msgr_noreg_mpool;
+
+#define dout_subsys ceph_subsys_xio
+
+/* Trace helper, all output at debug level 4: logs the xio sn/timestamp when
+ * msg is non-NULL, then the Ceph header and footer fields reachable through
+ * hdr. */
+void print_xio_msg_hdr(CephContext *cct, const char *tag,
+		       const XioMsgHdr &hdr, const struct xio_msg *msg)
+{
+  if (msg) {
+    ldout(cct,4) << tag << " xio msg:"
+                 << " sn: " << msg->sn
+                 << " timestamp: " << msg->timestamp
+                 << dendl;
+  }
+
+  ldout(cct,4) << tag << " ceph header: "
+               << " front_len: " << hdr.hdr->front_len
+               << " seq: " << hdr.hdr->seq
+               << " tid: " << hdr.hdr->tid
+               << " type: " << hdr.hdr->type
+               << " prio: " << hdr.hdr->priority
+               << " name type: " << (int) hdr.hdr->src.type
+               << " name num: " << (int) hdr.hdr->src.num
+               << " version: " << hdr.hdr->version
+               << " compat_version: " << hdr.hdr->compat_version
+               << " front_len: " << hdr.hdr->front_len   /* front_len is emitted a second time here */
+               << " middle_len: " << hdr.hdr->middle_len
+               << " data_len: " << hdr.hdr->data_len
+               << " xio header: "
+               << " msg_cnt: " << hdr.msg_cnt
+               << dendl;
+
+  ldout(cct,4) << tag << " ceph footer: "
+               << " front_crc: " << hdr.ftr->front_crc
+               << " middle_crc: " << hdr.ftr->middle_crc
+               << " data_crc: " << hdr.ftr->data_crc
+               << " sig: " << hdr.ftr->sig
+               << " flags: " << (uint32_t) hdr.ftr->flags
+               << dendl;
+}
+
+/* Log m's header version and compat_version at level 4, gated on m's magic. */
+void print_ceph_msg(CephContext *cct, const char *tag, Message *m)
+{
+  /* NOTE(review): the mask ANDs two flag macros; if they are distinct bits it is 0 and nothing is ever logged */
+  if (!(m->get_magic() & (MSG_MAGIC_XIO & MSG_MAGIC_TRACE_DTOR)))
+    return;
+  ceph_msg_header& h = m->get_header();
+  ldout(cct,4) << tag << " header version " << h.version << " compat version " << h.compat_version << dendl;
+}
+
+#undef dout_prefix
+#define dout_prefix conn_prefix(_dout)
+/* Log-line prefix: "-- <my addr> >> <peer addr> peer=<type> conn=<conn> sess=<session> " */
+ostream& XioConnection::conn_prefix(std::ostream *_dout) {
+  *_dout << "-- " << get_messenger()->get_myinst().addr << " >> " << peer_addr << " peer=" << peer.name.type_str();
+  return *_dout << " conn=" << conn << " sess=" << session << " ";
+}
+
+XioConnection::XioConnection(XioMessenger *m, XioConnection::type _type,
+			     const entity_inst_t& _peer) :
+  Connection(m->cct, m),
+  xio_conn_type(_type),
+  portal(m->get_portal()),
+  connected(false),
+  peer(_peer),
+  session(NULL),
+  conn(NULL),
+  magic(m->get_magic()),
+  scount(0),
+  send_ctr(0),
+  in_seq(),
+  cstate(this)
+{
+  set_peer_type(peer.name.type());
+  set_peer_addr(peer.addr);
+  /* the rest of the ctor sizes the send/receive queues from the peer type's policy throttles */
+  Messenger::Policy policy;
+  int64_t max_msgs = 0, max_bytes = 0, bytes_opt = 0;
+  int xopt;
+
+  policy = m->get_policy(peer_type);
+
+  if (policy.throttler_messages) {
+    max_msgs = policy.throttler_messages->get_max();
+    ldout(m->cct,4) << "XioMessenger throttle_msgs: " << max_msgs << dendl;
+  }
+  /* queue depth in messages: the larger of xio_queue_depth and the message throttle */
+  xopt = m->cct->_conf->xio_queue_depth;
+  if (max_msgs > xopt)
+    xopt = max_msgs; /* int64_t narrowed to int */
+
+  /* set high mark for send, reserved 20% for credits */
+  q_high_mark = xopt * 4 / 5;
+  q_low_mark = q_high_mark/2;
+  /* NOTE(review): xio_set_opt() gets a NULL first argument -- presumably a library-wide setting; confirm */
+  /* set send & receive msgs queue depth */
+  xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_SND_QUEUE_DEPTH_MSGS,
+             &xopt, sizeof(xopt));
+  xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_RCV_QUEUE_DEPTH_MSGS,
+             &xopt, sizeof(xopt));
+
+  if (policy.throttler_bytes) {
+    max_bytes = policy.throttler_bytes->get_max();
+    ldout(m->cct,4) << "XioMessenger throttle_bytes: " << max_bytes << dendl;
+  }
+  /* queue depth in bytes: the larger of the default below and the byte throttle */
+  bytes_opt = (2 << 28); /* default: 512 MB */
+  if (max_bytes > bytes_opt)
+    bytes_opt = max_bytes;
+
+  /* set send & receive total bytes throttle */
+  xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_SND_QUEUE_DEPTH_BYTES,
+             &bytes_opt, sizeof(bytes_opt));
+  xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_RCV_QUEUE_DEPTH_BYTES,
+             &bytes_opt, sizeof(bytes_opt));
+
+  ldout(m->cct,4) << "throttle_msgs: " << xopt << " throttle_bytes: " << bytes_opt << dendl;
+
+  /* XXXX fake features, aieee! */
+  set_features(XIO_ALL_FEATURES);
+}
+
+/* Forward m to the owning XioMessenger, naming this connection as the target. */
+int XioConnection::send_message(Message *m)
+{
+  return static_cast<XioMessenger*>(get_messenger())->_send_message(m, this);
+}
+
+/* Send a keepalive (ack == false) or a keepalive ack stamped with *tp (ack == true). */
+void XioConnection::send_keepalive_or_ack(bool ack, const utime_t *tp)
+{
+  /* If the session is not UP, record the request in outgoing instead of sending it */
+  if (cstate.session_state.read() != XioConnection::UP) {
+    std::lock_guard<ceph::util::spinlock> lg(sp);
+    if (cstate.session_state.read() != XioConnection::UP) { /* re-check under sp */
+      if (ack) {
+	outgoing.ack = true;
+	outgoing.ack_time = *tp;
+      } else {
+	outgoing.keepalive = true;
+      }
+      return;
+    }
+  }
+  /* session is UP: build and enqueue the command now */
+  send_keepalive_or_ack_internal(ack, tp);
+}
+
+/* Build a keepalive or keepalive-ack command (a tag byte, plus an encoded
+ * timestamp for the KEEPALIVE2 variants) and enqueue it on the portal. */
+void XioConnection::send_keepalive_or_ack_internal(bool ack, const utime_t *tp)
+{
+  XioCommand *xcmd = pool_alloc_xio_command(this);
+  if (! xcmd)
+    return; /* could happen if Accelio has been shutdown */
+
+  auto& bl = xcmd->get_bl_ref();
+  struct ceph_timespec ts;
+  if (ack) {
+    ceph_assert(tp);
+    tp->encode_timeval(&ts);
+    bl.append(CEPH_MSGR_TAG_KEEPALIVE2_ACK);
+    bl.append((char*)&ts, sizeof(ts));
+  } else if (has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) {
+    ceph_clock_now().encode_timeval(&ts);
+    bl.append(CEPH_MSGR_TAG_KEEPALIVE2);
+    bl.append((char*)&ts, sizeof(ts));
+  } else {
+    bl.append(CEPH_MSGR_TAG_KEEPALIVE);
+  }
+
+  const std::list<buffer::ptr>& header = bl.buffers();
+  ceph_assert(header.size() == 1);  /* accelio header must be without scatter gather */
+  auto pb = header.begin();
+  ceph_assert(pb->length() < XioMsgHdr::get_max_encoded_length());
+  struct xio_msg *xmsg = xcmd->get_xio_msg();
+  xmsg->out.header.iov_base = (char*) pb->c_str();
+  xmsg->out.header.iov_len = pb->length();
+
+  ldout(msgr->cct,8) << __func__ << " sending command with tag " << (int)(*(char*)xmsg->out.header.iov_base)
+       << " len " << xmsg->out.header.iov_len << dendl;
+
+  portal->enqueue(this, xcmd);
+}
+
+
+int XioConnection::passive_setup()
+{
+  /* XXX passive setup is a placeholder for (potentially active-side
+     initiated) feature and auth* negotiation */
+  static bufferlist authorizer_reply; /* static because fake */
+  static CryptoKey session_key; /* ditto */
+  bool authorizer_valid; /* passed to the verify call below; never inspected afterwards */
+
+  XioMessenger *msgr = static_cast<XioMessenger*>(get_messenger());
+
+  // fake an auth buffer
+  EntityName name;
+  name.set_type(peer.name.type());
+  /* build an AuthNone authorizer for the peer's name type and number */
+  AuthNoneAuthorizer auth;
+  auth.build_authorizer(name, peer.name.num());
+
+  /* XXX fake authorizer! */
+  msgr->ms_deliver_verify_authorizer(
+    this, peer_type, CEPH_AUTH_NONE,
+    auth.bl,
+    0,
+    authorizer_reply,
+    authorizer_valid,
+    session_key);
+
+  /* notify hook */
+  msgr->ms_deliver_handle_accept(this);
+  msgr->ms_deliver_handle_fast_accept(this);
+
+  /* try to insert in conns_entity_map */
+  msgr->try_insert(this);
+  return (0); /* always 0; the calls above are not checked */
+}
+
+/* Carve an XioDispatchHook out of xio_msgr_noreg_mpool and construct it in
+ * place; returns NULL when xpool_alloc() reports failure (non-zero). */
+static inline XioDispatchHook* pool_alloc_xio_dispatch_hook(
+  XioConnection *xcon, Message *m, XioInSeq& msg_seq)
+{
+  struct xio_reg_mem mp_mem;
+  if (xpool_alloc(xio_msgr_noreg_mpool, sizeof(XioDispatchHook), &mp_mem) != 0)
+    return NULL;
+
+  /* placement-new into the pool chunk; mp_mem is handed to the hook as well */
+  return new (static_cast<XioDispatchHook*>(mp_mem.addr)) XioDispatchHook(xcon, m, msg_seq, mp_mem);
+}
+
+int XioConnection::handle_data_msg(struct xio_session *session,
+			      struct xio_msg *msg,
+			      int more_in_batch,
+			      void *cb_user_context)
+{
+  struct xio_msg *tmsg = msg;
+  /* Collect msg into in_seq; once in_seq.count() is no longer > 0, decode the sequence and dispatch it. */
+  /* XXX Accelio guarantees message ordering at
+   * xio_session */
+
+  if (! in_seq.p()) {
+    if (!tmsg->in.header.iov_len) {
+	ldout(msgr->cct,0) << __func__ << " empty header: packet out of sequence?" << dendl;
+	xio_release_msg(msg);
+	return 0;
+    }
+    const size_t sizeof_tag = 1;
+    XioMsgCnt msg_cnt(
+      buffer::create_static(tmsg->in.header.iov_len-sizeof_tag,
+			    ((char*) tmsg->in.header.iov_base)+sizeof_tag));
+    ldout(msgr->cct,10) << __func__ << " receive msg " << "tmsg " << tmsg
+      << " msg_cnt " << msg_cnt.msg_cnt
+      << " iov_base " << tmsg->in.header.iov_base
+      << " iov_len " << (int) tmsg->in.header.iov_len
+      << " nents " << tmsg->in.pdata_iov.nents
+      << " sn " << tmsg->sn << dendl;
+    ceph_assert(session == this->session);
+    in_seq.set_count(msg_cnt.msg_cnt);
+  } else {
+    /* XXX major sequence error */
+    ceph_assert(! tmsg->in.header.iov_len);
+  }
+  /* presumably count() is the number of messages still expected in this sequence */
+  in_seq.append(msg);
+  if (in_seq.count() > 0) {
+    return 0;
+  }
+  /* NOTE(review): pool_alloc_xio_dispatch_hook() can return NULL; m_hook is dereferenced unchecked below */
+  XioMessenger *msgr = static_cast<XioMessenger*>(get_messenger());
+  XioDispatchHook *m_hook =
+    pool_alloc_xio_dispatch_hook(this, NULL /* msg */, in_seq);
+  XioInSeq& msg_seq = m_hook->msg_seq;
+  in_seq.clear();
+
+  ceph_msg_header header;
+  ceph_msg_footer footer;
+  buffer::list payload, middle, data;
+
+  const utime_t recv_stamp = ceph_clock_now();
+
+  ldout(msgr->cct,4) << __func__ << " " << "msg_seq.size()="  << msg_seq.size() <<
+    dendl;
+
+  struct xio_msg* msg_iter = msg_seq.begin();
+  tmsg = msg_iter;
+  XioMsgHdr hdr(header, footer,
+		buffer::create_static(tmsg->in.header.iov_len,
+				      (char*) tmsg->in.header.iov_base));
+
+  if (magic & (MSG_MAGIC_TRACE_XCON)) {
+    if (hdr.hdr->type == 43) {
+      print_xio_msg_hdr(msgr->cct, "on_msg", hdr, NULL);
+    }
+  }
+
+  unsigned int ix, blen, iov_len;
+  struct xio_iovec_ex *msg_iov, *iovs;
+  uint32_t take_len, left_len = 0;
+  char *left_base = NULL;
+  /* front segment: take header.front_len bytes from the iovecs, starting at the first message */
+  ix = 0;
+  blen = header.front_len;
+
+  while (blen && (msg_iter != msg_seq.end())) {
+    tmsg = msg_iter;
+    iov_len = vmsg_sglist_nents(&tmsg->in);
+    iovs = vmsg_sglist(&tmsg->in);
+    for (; blen && (ix < iov_len); ++ix) {
+      msg_iov = &iovs[ix];
+
+      /* XXX need to detect any buffer which needs to be
+       * split due to coalescing of a segment (front, middle,
+       * data) boundary */
+
+      take_len = std::min(blen, msg_iov->iov_len);
+      payload.append(
+	buffer::create_msg(
+	  take_len, (char*) msg_iov->iov_base, m_hook));
+      blen -= take_len;
+      if (! blen) {
+	left_len = msg_iov->iov_len - take_len;
+	if (left_len) {
+	  left_base = ((char*) msg_iov->iov_base) + take_len;
+	}
+      }
+    }
+    /* XXX as above, if a buffer is split, then we needed to track
+     * the new start (carry) and not advance */
+    if (ix == iov_len) {
+      msg_seq.next(&msg_iter);
+      ix = 0;
+    }
+  }
+
+  if (magic & (MSG_MAGIC_TRACE_XCON)) {
+    if (hdr.hdr->type == 43) {
+      ldout(msgr->cct,4) << "front (payload) dump:";
+      payload.hexdump( *_dout );
+      *_dout << dendl;
+    }
+  }
+  /* middle segment: first any tail left over from the last front iovec, then further iovecs */
+  blen = header.middle_len;
+
+  if (blen && left_len) {
+    middle.append(
+      buffer::create_msg(left_len, left_base, m_hook));
+    left_len = 0;
+  }
+
+  while (blen && (msg_iter != msg_seq.end())) {
+    tmsg = msg_iter;
+    iov_len = vmsg_sglist_nents(&tmsg->in);
+    iovs = vmsg_sglist(&tmsg->in);
+    for (; blen && (ix < iov_len); ++ix) {
+      msg_iov = &iovs[ix];
+      take_len = std::min(blen, msg_iov->iov_len);
+      middle.append(
+	buffer::create_msg(
+	  take_len, (char*) msg_iov->iov_base, m_hook));
+      blen -= take_len;
+      if (! blen) {
+	left_len = msg_iov->iov_len - take_len;
+	if (left_len) {
+	  left_base = ((char*) msg_iov->iov_base) + take_len;
+	}
+      }
+    }
+    if (ix == iov_len) {
+      msg_seq.next(&msg_iter);
+      ix = 0;
+    }
+  }
+  /* data segment: same carry-over of a leftover tail, then whole iovecs */
+  blen = header.data_len;
+
+  if (blen && left_len) {
+    data.append(
+      buffer::create_msg(left_len, left_base, m_hook));
+    left_len = 0;
+  }
+  /* NOTE(review): whole iov_len is subtracted from unsigned blen (no min() as above); a longer iovec would wrap it -- confirm */
+  while (blen && (msg_iter != msg_seq.end())) {
+    tmsg = msg_iter;
+    iov_len = vmsg_sglist_nents(&tmsg->in);
+    iovs = vmsg_sglist(&tmsg->in);
+    for (; blen && (ix < iov_len); ++ix) {
+      msg_iov = &iovs[ix];
+      data.append(
+	buffer::create_msg(
+	  msg_iov->iov_len, (char*) msg_iov->iov_base, m_hook));
+      blen -= msg_iov->iov_len;
+    }
+    if (ix == iov_len) {
+      msg_seq.next(&msg_iter);
+      ix = 0;
+    }
+  }
+
+  /* update connection timestamp */
+  recv = tmsg->timestamp;
+
+  Message *m = decode_message(msgr->cct, msgr->crcflags, header, footer,
+                              payload, middle, data, this);
+
+  if (m) {
+    /* completion */
+    m->set_connection(this);
+
+    /* reply hook */
+    m_hook->set_message(m);
+    m->set_completion_hook(m_hook);
+
+    /* trace flag */
+    m->set_magic(magic);
+
+    /* update timestamps */
+    m->set_recv_stamp(recv_stamp);
+    m->set_recv_complete_stamp(ceph_clock_now());
+    m->set_seq(header.seq);
+
+    /* MP-SAFE */
+    state.set_in_seq(header.seq);
+
+    /* XXXX validate peer type */
+    if (peer_type != (int) hdr.peer_type) { /* XXX isn't peer_type -1? */
+      peer_type = hdr.peer_type;
+      peer_addr = hdr.addr;
+      peer.addr = peer_addr;
+      peer.name = entity_name_t(hdr.hdr->src);
+      if (xio_conn_type == XioConnection::PASSIVE) {
+	/* XXX kick off feature/authn/authz negotiation
+	 * nb:  very possibly the active side should initiate this, but
+	 * for now, call a passive hook so OSD and friends can create
+	 * sessions without actually negotiating
+	 */
+	passive_setup();
+      }
+    }
+
+    if (magic & (MSG_MAGIC_TRACE_XCON)) {
+      ldout(msgr->cct,4) << "decode m is " << m->get_type() << dendl;
+    }
+
+    /* dispatch it */
+    msgr->ds_dispatch(m);
+  } else {
+    /* responds for undecoded messages and frees hook */
+    ldout(msgr->cct,4) << "decode m failed" << dendl;
+    m_hook->on_err_finalize(this);
+  }
+
+  return 0;
+}
+
+/* Accelio inbound-message callout for this connection.
+ *
+ * The first byte of the Accelio header, when a header is present, is a
+ * CEPH_MSGR_TAG_* value that selects the handling; an empty header is
+ * treated as CEPH_MSGR_TAG_MSG.  Data messages are forwarded to
+ * handle_data_msg() and its result is returned.  Keepalive traffic is
+ * consumed here; the xio_msg is then released and 0 is returned.  An
+ * unrecognized tag is logged and asserted on.
+ */
+int XioConnection::on_msg(struct xio_session *session,
+			      struct xio_msg *msg,
+			      int more_in_batch,
+			      void *cb_user_context)
+{
+  const size_t hdr_len = msg->in.header.iov_len;
+
+  char tag = CEPH_MSGR_TAG_MSG;
+  if (hdr_len)
+    tag = *(char*)msg->in.header.iov_base;
+
+  ldout(msgr->cct,8) << __func__ << " receive msg with iov_len "
+    << (int) hdr_len << " tag " << (int)tag << dendl;
+
+  if (tag == CEPH_MSGR_TAG_MSG) {
+    ldout(msgr->cct, 20) << __func__ << " got data message" << dendl;
+    return handle_data_msg(session, msg, more_in_batch, cb_user_context);
+  }
+
+  /* number of header bytes following the tag; a tag other than
+   * CEPH_MSGR_TAG_MSG can only have come from a non-empty header */
+  const size_t body_len = hdr_len - sizeof(tag);
+  const bool short_body = body_len < sizeof(ceph_timespec);
+  ceph_timespec *stamp =
+    (ceph_timespec *) ((char*)msg->in.header.iov_base + sizeof(tag));
+
+  switch(tag) {
+  case CEPH_MSGR_TAG_KEEPALIVE:
+    ldout(msgr->cct, 20) << __func__ << " got KEEPALIVE" << dendl;
+    set_last_keepalive(ceph_clock_now());
+    break;
+
+  case CEPH_MSGR_TAG_KEEPALIVE2:
+    if (! short_body) {
+      /* echo the peer's timestamp back in an ack */
+      utime_t kp_t = utime_t(*stamp);
+      ldout(msgr->cct, 20) << __func__ << " got KEEPALIVE2 with timestamp" << kp_t << dendl;
+      send_keepalive_or_ack(true, &kp_t);
+      set_last_keepalive(ceph_clock_now());
+    } else {
+      lderr(msgr->cct) << __func__ << " too few data for KEEPALIVE2: got " << body_len <<
+         " bytes instead of " << sizeof(ceph_timespec) << " bytes" << dendl;
+    }
+    break;
+
+  case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
+    if (! short_body) {
+      utime_t kp_t(*stamp);
+      ldout(msgr->cct, 20) << __func__ << " got KEEPALIVE2_ACK with timestamp" << kp_t << dendl;
+      set_last_keepalive_ack(kp_t);
+    } else {
+      lderr(msgr->cct) << __func__ << " too few data for KEEPALIVE2_ACK: got " << body_len <<
+         " bytes instead of " << sizeof(ceph_timespec) << " bytes" << dendl;
+    }
+    break;
+
+  default:
+    lderr(msgr->cct) << __func__ << " unsupported message tag " << (int) tag << dendl;
+    ceph_assert(! "unsupported message tag");
+  }
+
+  /* non-data messages are fully consumed here */
+  xio_release_msg(msg);
+  return 0;
+}
+
+
+/* Accelio callout: a one-way message submitted on this connection has
+ * completed sending.
+ *
+ * - increments the completion counter (scount); with MSG_MAGIC_TRACE_CTR
+ *   set, prints a progress line every 1,000,000 completions
+ * - decrements the in-flight counter (send_ctr); if the connection is
+ *   FLOW_CONTROLLED and the in-flight count is at or below the low
+ *   mark, moves it back to up/ready via cstate.state_up_ready()
+ * - drops one reference on the XioSend stored in req->user_context
+ *
+ * Always returns 0.
+ */
+int XioConnection::on_ow_msg_send_complete(struct xio_session *session,
+					   struct xio_msg *req,
+					   void *conn_user_context)
+{
+  /* requester send complete (one-way) */
+  uint64_t rc = ++scount;
+
+  XioSend* xsend = static_cast<XioSend*>(req->user_context);
+  if (unlikely(magic & MSG_MAGIC_TRACE_CTR)) {
+    if (unlikely((rc % 1000000) == 0)) {
+      std::cout << "xio finished " << rc << " " << time(0) << std::endl;
+    }
+  } /* trace ctr */
+
+  ldout(msgr->cct,11) << "on_msg_delivered xcon: " << xsend->xcon <<
+    " msg: " << req << " sn: " << req->sn << dendl;
+
+  XioMsg *xmsg = dynamic_cast<XioMsg*>(xsend);
+  if (xmsg) {
+    /* the connection pointer was missing after "xcon: " */
+    ldout(msgr->cct,11) << "on_msg_delivered xcon: " << xsend->xcon <<
+      " type: " << xmsg->m->get_type() << " tid: " << xmsg->m->get_tid() <<
+      " seq: " << xmsg->m->get_seq() << dendl;
+  }
+
+  --send_ctr; /* atomic, because portal thread */
+
+  /* unblock flow-controlled connections, avoid oscillation.
+   * session_state is a std::atomic over the scoped enum session_states,
+   * so it is read with load() and compared against the scoped value. */
+  if (unlikely(cstate.session_state.load() ==
+	       session_states::FLOW_CONTROLLED)) {
+    if ((send_ctr <= uint32_t(xio_qdepth_low_mark())) &&
+	(1 /* XXX memory <= memory low-water mark */))  {
+      cstate.state_up_ready(XioConnection::CState::OP_FLAG_NONE);
+      ldout(msgr->cct,2) << "on_msg_delivered xcon: " << xsend->xcon
+        << " up_ready from flow_controlled" << dendl;
+    }
+  }
+
+  xsend->put();
+
+  return 0;
+}  /* on_msg_delivered */
+
+void XioConnection::msg_send_fail(XioSend *xsend, int code)
+{
+  ldout(msgr->cct,2) << "xio_send_msg FAILED xcon: " << this <<
+    " msg: " << xsend->get_xio_msg() << " code=" << code <<
+    " (" << xio_strerror(code) << ")" << dendl;
+  /* return refs taken for each xio_msg */
+  xsend->put_msg_refs();
+} /* msg_send_fail */
+
+void XioConnection::msg_release_fail(struct xio_msg *msg, int code)
+{
+  ldout(msgr->cct,2) << "xio_release_msg FAILED xcon: " << this <<
+    " msg: " << msg <<  "code=" << code <<
+    " (" << xio_strerror(code) << ")" << dendl;
+} /* msg_release_fail */
+
+/* Push out everything that was deferred on this connection: a pending
+ * keepalive, a pending keepalive ack, the requeue list, and then the
+ * messages that were queued while sending was deferred.
+ *
+ * sp is taken for the duration unless the caller passes
+ * CState::OP_FLAG_LOCKED.  Always returns 0.
+ */
+int XioConnection::flush_out_queues(uint32_t flags) {
+  XioMessenger* xmsgr = static_cast<XioMessenger*>(get_messenger());
+  const bool take_lock = !(flags & CState::OP_FLAG_LOCKED);
+
+  if (take_lock)
+    sp.lock();
+
+  if (outgoing.keepalive) {
+    outgoing.keepalive = false;
+    send_keepalive_or_ack_internal();
+  }
+
+  if (outgoing.ack) {
+    outgoing.ack = false;
+    send_keepalive_or_ack_internal(true, &outgoing.ack_time);
+  }
+
+  // deferred, part 1: submissions held back by direct backpressure
+  if (! outgoing.requeue.empty())
+    portal->requeue(this, outgoing.requeue);
+
+  // deferred, part 2: messages sent while deferred.  Only the entries
+  // present at this point are drained.
+  for (int remaining = outgoing.mqueue.size(); remaining > 0; --remaining) {
+    Message& head = outgoing.mqueue.front();
+    outgoing.mqueue.pop_front();
+    xmsgr->_send_message_impl(&head, this);
+  }
+
+  if (take_lock)
+    sp.unlock();
+  return 0;
+}
+
+/* Drop everything queued for sending on this connection.
+ *
+ * Both outgoing queues are spliced onto local lists and the pending
+ * keepalive/ack flags are cleared while sp is held (sp is taken here
+ * unless the caller passes CState::OP_FLAG_LOCKED); the detached
+ * entries are then released.  Always returns 0.
+ */
+int XioConnection::discard_out_queues(uint32_t flags)
+{
+  Message::Queue disc_q;
+  XioSubmit::Queue deferred_q;
+
+  if (! (flags & CState::OP_FLAG_LOCKED))
+    sp.lock();
+
+  /* the two send queues contain different objects:
+   * - anything on the mqueue is a Message
+   * - anything on the requeue is an XioSend
+   */
+  Message::Queue::const_iterator i1 = disc_q.end();
+  disc_q.splice(i1, outgoing.mqueue);
+
+  XioSubmit::Queue::const_iterator i2 = deferred_q.end();
+  deferred_q.splice(i2, outgoing.requeue);
+
+  outgoing.keepalive = outgoing.ack = false;
+
+  if (! (flags & CState::OP_FLAG_LOCKED))
+    sp.unlock();
+
+  // mqueue
+  while (!disc_q.empty()) {
+    Message::Queue::iterator q_iter = disc_q.begin();
+    Message* m = &(*q_iter);
+    disc_q.erase(q_iter);
+    m->put();
+  }
+
+  // requeue
+  while (!deferred_q.empty()) {
+    XioSubmit::Queue::iterator q_iter = deferred_q.begin();
+    XioSubmit* xs = &(*q_iter);
+
+    /* unlink the entry before looking at its type: an entry of an
+     * unrecognized type must still leave the list, otherwise this loop
+     * would keep revisiting the same head element forever */
+    deferred_q.erase(q_iter);
+
+    switch (xs->type) {
+      case XioSubmit::OUTGOING_MSG:
+      {
+        XioSend* xsend = static_cast<XioSend*>(xs);
+        // release once for each chained xio_msg
+        xsend->put(xsend->get_msg_count());
+        break;
+      }
+      case XioSubmit::INCOMING_MSG_RELEASE:
+        portal->release_xio_msg(static_cast<XioCompletion*>(xs));
+        break;
+      default:
+        ldout(msgr->cct,0) << __func__ << ": Unknown Msg type " << xs->type << dendl;
+        break;
+    }
+  }
+
+  return 0;
+}
+
+/* Move this connection to the front of the messenger's conns_list when
+ * it is marked CState::FLAG_MAPPED.
+ *
+ * Locking: if the caller holds sp (CState::OP_FLAG_LOCKED) it is
+ * released first, so that conns_sp is always acquired before sp.  On
+ * return sp is held again if and only if the caller held it on entry.
+ * Always returns 0.
+ */
+int XioConnection::adjust_clru(uint32_t flags)
+{
+  const bool caller_holds_sp = (flags & CState::OP_FLAG_LOCKED) != 0;
+
+  if (caller_holds_sp)
+    sp.unlock();
+
+  XioMessenger* xmsgr = static_cast<XioMessenger*>(get_messenger());
+  xmsgr->conns_sp.lock();
+  sp.lock();
+
+  if (cstate.flags & CState::FLAG_MAPPED) {
+    // LRU: unlink from the current position, relink at the head
+    xmsgr->conns_list.erase(XioConnection::ConnList::s_iterator_to(*this));
+    xmsgr->conns_list.push_front(*this);
+  }
+
+  xmsgr->conns_sp.unlock();
+
+  if (! caller_holds_sp)
+    sp.unlock();
+
+  return 0;
+}
+
+/* Accelio callout for a message that failed: drop the reference on the
+ * XioSend attached to it (if any) and decrement the in-flight send
+ * counter.  Always returns 0. */
+int XioConnection::on_msg_error(struct xio_session *session,
+				enum xio_status error,
+				struct xio_msg  *msg,
+				void *conn_user_context)
+{
+  if (XioSend *failed = static_cast<XioSend*>(msg->user_context))
+    failed->put();
+
+  --send_ctr; /* atomic, because portal thread */
+  return 0;
+} /* on_msg_error */
+
+/* Connection interface entry point: runs _mark_down() with no op flags,
+ * so _mark_down() takes sp itself. */
+void XioConnection::mark_down()
+{
+  _mark_down(CState::OP_FLAG_NONE);
+}
+
+/* Take the connection down: stage a reset if the policy asks for one,
+ * disconnect, and discard whatever is still queued for sending.
+ *
+ * sp is taken for the duration unless the caller passes
+ * CState::OP_FLAG_LOCKED.  Always returns 0.
+ */
+int XioConnection::_mark_down(uint32_t flags)
+{
+  const bool take_lock = !(flags & CState::OP_FLAG_LOCKED);
+
+  if (take_lock)
+    sp.lock();
+
+  // per interface comment, a remote reset is only staged when the
+  // current policy requires it
+  if (cstate.policy.resetcheck) {
+    cstate.flags |= CState::FLAG_RESET;
+  }
+
+  disconnect();
+
+  /* XXX on_disconnect_event() will almost certainly discard the
+   * queues again */
+  discard_out_queues(flags|CState::OP_FLAG_LOCKED);
+
+  if (take_lock)
+    sp.unlock();
+
+  return 0;
+}
+
+/* Connection interface entry point: runs _mark_disposable() with no op
+ * flags, so _mark_disposable() takes sp itself. */
+void XioConnection::mark_disposable()
+{
+  _mark_disposable(CState::OP_FLAG_NONE);
+}
+
+/* Switch this connection's policy to lossy.  sp is taken around the
+ * update unless the caller passes CState::OP_FLAG_LOCKED.  Always
+ * returns 0. */
+int XioConnection::_mark_disposable(uint32_t flags)
+{
+  const bool take_lock = !(flags & CState::OP_FLAG_LOCKED);
+
+  if (take_lock)
+    sp.lock();
+
+  cstate.policy.lossy = true;
+
+  if (take_lock)
+    sp.unlock();
+
+  return 0;
+}
+
+/* Flush the connection's deferred output, then record the session as
+ * UP and startup as READY.  The connection's sp is taken for the
+ * duration unless the caller passes OP_FLAG_LOCKED.  Always returns 0. */
+int XioConnection::CState::state_up_ready(uint32_t flags)
+{
+  const bool take_lock = !(flags & OP_FLAG_LOCKED);
+
+  if (take_lock)
+    xcon->sp.lock();
+
+  xcon->flush_out_queues(flags | OP_FLAG_LOCKED);
+
+  session_state.store(session_states::UP);
+  startup_state.store(session_startup_states::READY);
+
+  if (take_lock)
+    xcon->sp.unlock();
+
+  return 0;
+}
+
+/* Record the session as DISCONNECTED and startup as IDLE.  No lock is
+ * taken here.  Always returns 0. */
+int XioConnection::CState::state_discon()
+{
+  session_state.store(session_states::DISCONNECTED);
+  startup_state.store(session_startup_states::IDLE);
+
+  return 0;
+}
+
+/* Record the session as FLOW_CONTROLLED.  The connection's sp is taken
+ * around the update unless the caller passes OP_FLAG_LOCKED.  Always
+ * returns 0. */
+int XioConnection::CState::state_flow_controlled(uint32_t flags)
+{
+  const bool take_lock = !(flags & OP_FLAG_LOCKED);
+
+  if (take_lock)
+    xcon->sp.lock();
+
+  session_state.store(session_states::FLOW_CONTROLLED);
+
+  if (take_lock)
+    xcon->sp.unlock();
+
+  return 0;
+}
+
+/* Fail the connection: record DISCONNECTED/FAIL, drop queued output,
+ * adjust the connection LRU and disconnect.  Afterwards the messenger
+ * is told about the reset and the reference on m is dropped.
+ *
+ * The connection's sp is taken for the state change unless the caller
+ * passes OP_FLAG_LOCKED; the reset notification and m->put() happen
+ * after that section.  Always returns 0.
+ */
+int XioConnection::CState::state_fail(Message* m, uint32_t flags)
+{
+  const bool take_lock = !(flags & OP_FLAG_LOCKED);
+
+  if (take_lock)
+    xcon->sp.lock();
+
+  // advance to state FAIL, drop queued msgs, adjust LRU
+  session_state.store(session_states::DISCONNECTED);
+  startup_state.store(session_startup_states::FAIL);
+
+  xcon->discard_out_queues(flags | OP_FLAG_LOCKED);
+  xcon->adjust_clru(flags | OP_FLAG_LOCKED | OP_FLAG_LRU);
+
+  xcon->disconnect();
+
+  if (take_lock)
+    xcon->sp.unlock();
+
+  // notify ULP
+  XioMessenger* xmsgr = static_cast<XioMessenger*>(xcon->get_messenger());
+  xmsgr->ms_deliver_handle_reset(xcon);
+  m->put();
+
+  return 0;
+}
+
+
+/* Loopback send: stamp the message with this connection, the next
+ * local sequence number and the messenger's own entity name, then hand
+ * it straight to the messenger's dispatch path.  Always returns 0. */
+int XioLoopbackConnection::send_message(Message *m)
+{
+  XioMessenger *xmsgr = static_cast<XioMessenger*>(get_messenger());
+
+  m->set_connection(this);
+  m->set_seq(next_seq());
+  m->set_src(xmsgr->get_myinst().name);
+
+  xmsgr->ds_dispatch(m);
+  return 0;
+}
+
+/* Loopback keepalive: nothing is sent; the last-keepalive and
+ * last-keepalive-ack stamps are both set to the current time. */
+void XioLoopbackConnection::send_keepalive()
+{
+  const utime_t now = ceph_clock_now();
+
+  set_last_keepalive(now);
+  set_last_keepalive_ack(now);
+}
diff --git a/src/msg/xio/XioConnection.h b/src/msg/xio/XioConnection.h
new file mode 100644
index 00000000..00024ef3
--- /dev/null
+++ b/src/msg/xio/XioConnection.h
@@ -0,0 +1,380 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Portions Copyright (C) 2013 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef XIO_CONNECTION_H
+#define XIO_CONNECTION_H
+
+#include <atomic>
+
+#include <boost/intrusive/avl_set.hpp>
+#include <boost/intrusive/list.hpp>
+
+extern "C" {
+#include "libxio.h"
+}
+
+#include "XioInSeq.h"
+#include "XioSubmit.h"
+#include "msg/Connection.h"
+#include "msg/Messenger.h"
+#include "auth/AuthSessionHandler.h"
+
+#define XIO_ALL_FEATURES (CEPH_FEATURES_ALL)
+
+
+#define XIO_NOP_TAG_MARKDOWN 0x0001
+
+namespace bi = boost::intrusive;
+
+class XioPortal;
+class XioMessenger;
+class XioSend;
+
+class XioConnection : public Connection
+{
+public:
+  enum type { ACTIVE, PASSIVE };
+
+  enum class session_states : unsigned {
+    INIT = 0,
+    START,
+    UP,
+    FLOW_CONTROLLED,
+    DISCONNECTED,
+    DELETED,
+    BARRIER
+  };
+
+  enum class session_startup_states : unsigned {
+    IDLE = 0,
+    CONNECTING,
+    ACCEPTING,
+    READY,
+    FAIL
+  };
+
+private:
+  XioConnection::type xio_conn_type;
+  XioPortal *portal;
+  std::atomic<bool> connected = { false };
+  entity_inst_t peer;
+  struct xio_session *session;
+  struct xio_connection	*conn;
+  ceph::util::spinlock sp;
+  std::atomic<int64_t> send = { 0 };
+  std::atomic<int64_t> recv = { 0 };
+  uint32_t n_reqs; // Accelio-initiated reqs in progress (!counting partials)
+  uint32_t magic;
+  uint32_t special_handling;
+  uint64_t scount;
+  uint32_t send_ctr;
+  int q_high_mark;
+  int q_low_mark;
+
+  /* Per-connection lifecycle bookkeeping: a coarse state plus reconnect
+   * and sequence counters.  The state starts at INIT and every counter
+   * starts at zero. */
+  struct lifecycle {
+    // different from Pipe states?
+    enum lf_state {
+      INIT,
+      LOCAL_DISCON,
+      REMOTE_DISCON,
+      RECONNECTING,
+      UP,
+      DEAD
+    } state;
+
+    /* XXX */
+    uint32_t reconnects;
+    uint32_t connect_seq, peer_global_seq;
+    uint64_t in_seq, out_seq_acked; // atomic<uint64_t>, got receipt
+    std::atomic<int64_t> out_seq = { 0 };
+
+    lifecycle()
+      : state(lifecycle::INIT),
+	reconnects(0),
+	connect_seq(0),
+	peer_global_seq(0),
+	in_seq(0),
+	out_seq_acked(0)
+      {}
+
+    /* record the sequence number of the latest inbound message */
+    void set_in_seq(uint64_t seq) { in_seq = seq; }
+
+    /* advance the outbound sequence and return the new value */
+    uint64_t next_out_seq() { return ++out_seq; }
+
+  } state;
+
+  /* batching */
+  XioInSeq in_seq;
+
+  /* Connection/session state for an XioConnection: negotiated features
+   * and policy, auth material, the session and startup state machine
+   * values, sequence counters and FLAG_* bits.
+   *
+   * OP_FLAG_* values are passed to the state_*() operations and to the
+   * XioConnection queue helpers; OP_FLAG_LOCKED tells the callee that
+   * the caller already holds the connection's sp lock.
+   */
+  class CState
+  {
+  public:
+    static const int FLAG_NONE = 0x0000;
+    static const int FLAG_BAD_AUTH = 0x0001;
+    static const int FLAG_MAPPED = 0x0002;
+    static const int FLAG_RESET = 0x0004;
+
+    static const int OP_FLAG_NONE = 0x0000;
+    static const int OP_FLAG_LOCKED = 0x0001;
+    static const int OP_FLAG_LRU = 0x0002;
+
+    uint64_t features;
+    Messenger::Policy policy;
+
+    CryptoKey session_key;
+    std::shared_ptr<AuthSessionHandler> session_security;
+    AuthAuthorizer *authorizer;
+    XioConnection *xcon;
+    uint32_t protocol_version;
+
+    /* Both state values are scoped enums, so they are initialized from
+     * (and compared against) the scoped enumerators, not integers. */
+    std::atomic<session_states> session_state = { session_states::INIT };
+    std::atomic<session_startup_states> startup_state =
+      { session_startup_states::IDLE };
+
+    uint32_t reconnects;
+    uint32_t connect_seq, global_seq, peer_global_seq;
+    uint64_t in_seq, out_seq_acked; // atomic<uint64_t>, got receipt
+    std::atomic<uint64_t> out_seq = { 0 };
+
+    uint32_t flags;
+
+    explicit CState(XioConnection* _xcon)
+      : features(0),
+	authorizer(NULL),
+	xcon(_xcon),
+	protocol_version(0),
+	session_state(session_states::INIT),
+	startup_state(session_startup_states::IDLE),
+	reconnects(0),
+	connect_seq(0),
+	global_seq(0),
+	peer_global_seq(0),
+	in_seq(0),
+	out_seq_acked(0),
+	flags(FLAG_NONE) {}
+
+    /* current session state as its underlying integer value */
+    uint64_t get_session_state() {
+      return static_cast<uint64_t>(session_state.load());
+    }
+
+    /* current startup state as its underlying integer value */
+    uint64_t get_startup_state() {
+      return static_cast<uint64_t>(startup_state.load());
+    }
+
+    /* record the sequence number of the latest inbound message */
+    void set_in_seq(uint64_t seq) {
+      in_seq = seq;
+    }
+
+    /* advance the outbound sequence and return the new value */
+    uint64_t next_out_seq() {
+      return ++out_seq;
+    }
+
+    // state machine
+    int init_state();
+    int next_state(Message* m);
+#if 0 // future (session startup)
+    int msg_connect(MConnect *m);
+    int msg_connect_reply(MConnectReply *m);
+    int msg_connect_reply(MConnectAuthReply *m);
+    int msg_connect_auth(MConnectAuth *m);
+    int msg_connect_auth_reply(MConnectAuthReply *m);
+#endif
+    int state_up_ready(uint32_t flags);
+    int state_flow_controlled(uint32_t flags);
+    int state_discon();
+    int state_fail(Message* m, uint32_t flags);
+
+  } cstate; /* CState */
+
+  // message submission queue
+  struct SendQ {
+    bool keepalive;
+    bool ack;
+    utime_t ack_time;
+    Message::Queue mqueue; // deferred
+    XioSubmit::Queue requeue;
+
+    SendQ():keepalive(false), ack(false){}
+  } outgoing;
+
+  // conns_entity_map comparison functor
+  struct EntityComp
+  {
+    // for internal ordering
+    bool operator()(const XioConnection &lhs,  const XioConnection &rhs) const
+      {  return lhs.get_peer() < rhs.get_peer(); }
+
+    // for external search by entity_inst_t(peer)
+    bool operator()(const entity_inst_t &peer, const XioConnection &c) const
+      {  return peer < c.get_peer(); }
+
+    bool operator()(const XioConnection &c, const entity_inst_t &peer) const
+      {  return c.get_peer() < peer;  }
+  };
+
+  bi::list_member_hook<> conns_hook;
+  bi::avl_set_member_hook<> conns_entity_map_hook;
+
+  typedef bi::list< XioConnection,
+		    bi::member_hook<XioConnection, bi::list_member_hook<>,
+				    &XioConnection::conns_hook > > ConnList;
+
+  typedef bi::member_hook<XioConnection, bi::avl_set_member_hook<>,
+			  &XioConnection::conns_entity_map_hook> EntityHook;
+
+  typedef bi::avl_set< XioConnection, EntityHook,
+		       bi::compare<EntityComp> > EntitySet;
+
+  friend class XioPortal;
+  friend class XioMessenger;
+  friend class XioDispatchHook;
+  friend class XioMarkDownHook;
+  friend class XioSend;
+
+  int on_disconnect_event() {
+    std::lock_guard<ceph::spinlock> lg(sp);
+
+    connected = false;
+    discard_out_queues(CState::OP_FLAG_LOCKED);
+
+    return 0;
+  }
+
+  int on_teardown_event() {
+
+    {
+    std::lock_guard<ceph::spinlock> lg(sp);
+
+    if (conn)
+      xio_connection_destroy(conn);
+    conn = NULL;
+    }
+
+    this->put();
+    return 0;
+  }
+
+  int xio_qdepth_high_mark() {
+    return q_high_mark;
+  }
+
+  int xio_qdepth_low_mark() {
+    return q_low_mark;
+  }
+
+public:
+  XioConnection(XioMessenger *m, XioConnection::type _type,
+		const entity_inst_t& peer);
+
+  ~XioConnection() {
+    if (conn)
+      xio_connection_destroy(conn);
+  }
+  ostream& conn_prefix(std::ostream *_dout);
+
+  bool is_connected() override { return connected; }
+
+  int send_message(Message *m) override;
+  void send_keepalive() override {send_keepalive_or_ack();}
+  void send_keepalive_or_ack(bool ack = false, const utime_t *tp = nullptr);
+  void mark_down() override;
+  int _mark_down(uint32_t flags);
+  void mark_disposable() override;
+  int _mark_disposable(uint32_t flags);
+
+  const entity_inst_t& get_peer() const { return peer; }
+
+  XioConnection* get() {
+#if 0
+    cout << "XioConnection::get " << this << " " << nref.load() << std::endl;
+#endif
+    RefCountedObject::get();
+    return this;
+  }
+
+  void put() {
+    RefCountedObject::put();
+#if 0
+    cout << "XioConnection::put " << this << " " << nref.load() << std::endl;
+#endif
+  }
+
+  void disconnect() {
+    if (is_connected()) {
+      connected = false;
+      xio_disconnect(conn); // normal teardown will clean up conn
+    }
+  }
+
+  uint32_t get_magic() { return magic; }
+  void set_magic(int _magic) { magic = _magic; }
+  uint32_t get_special_handling() { return special_handling; }
+  void set_special_handling(int n) { special_handling = n; }
+  uint64_t get_scount() { return scount; }
+
+  int passive_setup(); /* XXX */
+
+  int handle_data_msg(struct xio_session *session, struct xio_msg *msg,
+		 int more_in_batch, void *cb_user_context);
+  int on_msg(struct xio_session *session, struct xio_msg *msg,
+		 int more_in_batch, void *cb_user_context);
+  int on_ow_msg_send_complete(struct xio_session *session, struct xio_msg *msg,
+			      void *conn_user_context);
+  int on_msg_error(struct xio_session *session, enum xio_status error,
+		   struct xio_msg  *msg, void *conn_user_context);
+  void msg_send_fail(XioSend *xsend, int code);
+  void msg_release_fail(struct xio_msg *msg, int code);
+private:
+  void send_keepalive_or_ack_internal(bool ack = false, const utime_t *tp = nullptr);
+  int flush_out_queues(uint32_t flags);
+  int discard_out_queues(uint32_t flags);
+  int adjust_clru(uint32_t flags);
+};
+
+typedef boost::intrusive_ptr<XioConnection> XioConnectionRef;
+
+/* Connection used for messages a messenger sends to itself.  It is
+ * always reported as connected, keeps its own send sequence counter,
+ * and ignores mark_down()/mark_disposable(). */
+class XioLoopbackConnection : public Connection
+{
+private:
+  std::atomic<uint64_t> seq = { 0 };
+
+public:
+  /* the peer of a loopback connection is the messenger itself */
+  explicit XioLoopbackConnection(Messenger *m) : Connection(m->cct, m)
+    {
+      const entity_inst_t& self = m->get_myinst();
+      peer_addr = self.addr;
+      peer_type = self.name.type();
+      set_features(XIO_ALL_FEATURES); /* XXXX set to ours */
+    }
+
+  /* take a reference; returns this with the derived type */
+  XioLoopbackConnection* get() {
+    return static_cast<XioLoopbackConnection*>(RefCountedObject::get());
+  }
+
+  bool is_connected() override { return true; }
+
+  int send_message(Message *m) override;
+  void send_keepalive() override;
+  void mark_down() override {}
+  void mark_disposable() override {}
+
+  /* last sequence number handed out */
+  uint64_t get_seq() {
+    return seq.load();
+  }
+
+  /* advance the sequence and return the new value */
+  uint64_t next_seq() {
+    return seq.fetch_add(1) + 1;
+  }
+};
+
+typedef boost::intrusive_ptr<XioLoopbackConnection> XioLoopbackConnectionRef;
+
+#endif /* XIO_CONNECTION_H */
diff --git a/src/msg/xio/XioInSeq.h b/src/msg/xio/XioInSeq.h
new file mode 100644
index 00000000..7863a8f6
--- /dev/null
+++ b/src/msg/xio/XioInSeq.h
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Portions Copyright (C) 2013 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef XIO_IN_SEQ_H
+#define XIO_IN_SEQ_H
+
+#include <boost/intrusive/list.hpp>
+#include "msg/SimplePolicyMessenger.h"
+extern "C" {
+#include "libxio.h"
+}
+
+/* For inbound messages (Accelio-owned) ONLY, use the message's
+ * user_context as an SLIST */
+/* Singly linked sequence of inbound xio_msg objects, chained through
+ * each message's user_context pointer.
+ *
+ * sz is the number of messages appended so far; cnt is a counter the
+ * owner sets with set_count() and that append() decrements.  Copying
+ * an XioInSeq copies the counters and the head/tail pointers only, so
+ * both copies refer to the same chain.
+ */
+class XioInSeq {
+private:
+  int cnt;
+  int sz;
+  struct xio_msg* head;
+  struct xio_msg* tail;
+
+public:
+  XioInSeq() : cnt(0), sz(0), head(NULL), tail(NULL) {}
+
+  XioInSeq(const XioInSeq& seq)
+    : cnt(seq.cnt), sz(seq.sz), head(seq.head), tail(seq.tail) {}
+
+  int count() { return cnt; }
+
+  int size() { return sz; }
+
+  /* true when at least one message is chained */
+  bool p() { return head != NULL; }
+
+  void set_count(int _cnt) { cnt = _cnt; }
+
+  /* link msg at the tail; its user_context becomes the (null) next
+   * pointer */
+  void append(struct xio_msg* msg) {
+    msg->user_context = NULL;
+    if (head)
+      tail->user_context = msg;
+    else
+      head = msg;
+    tail = msg;
+    ++sz;
+    --cnt;
+  }
+
+  struct xio_msg* begin() { return head; }
+
+  struct xio_msg* end() { return NULL; }
+
+  /* step *msg to the following message in the chain */
+  void next(struct xio_msg** msg) {
+    struct xio_msg* cur = *msg;
+    *msg = static_cast<struct xio_msg *>(cur->user_context);
+  }
+
+  /* detach and return the whole chain, leaving this sequence empty */
+  struct xio_msg* dequeue() {
+    struct xio_msg* const first = head;
+    clear();
+    return first;
+  }
+
+  void clear() {
+    head = NULL;
+    tail = NULL;
+    cnt = 0;
+    sz = 0;
+  }
+};
+
+#endif /* XIO_IN_SEQ_H */
diff --git a/src/msg/xio/XioMessenger.cc b/src/msg/xio/XioMessenger.cc
new file mode 100644
index 00000000..dec7d0c7
--- /dev/null
+++ b/src/msg/xio/XioMessenger.cc
@@ -0,0 +1,1136 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Portions Copyright (C) 2013 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <arpa/inet.h>
+#include <boost/lexical_cast.hpp>
+#include <set>
+#include <stdlib.h>
+#include <memory>
+
+#include "XioMsg.h"
+#include "XioMessenger.h"
+#include "common/address_helper.h"
+#include "common/code_environment.h"
+#include "messages/MNop.h"
+
+#define dout_subsys ceph_subsys_xio
+#undef dout_prefix
+#define dout_prefix *_dout << "xio."
+
+Mutex mtx("XioMessenger Package Lock");
+std::atomic<bool> initialized = { false };
+
+std::atomic<unsigned> XioMessenger::nInstances = { 0 };
+
+struct xio_mempool *xio_msgr_noreg_mpool;
+
+static struct xio_session_ops xio_msgr_ops;
+
+/* Accelio API callouts */
+
+/* Glue between Accelio's logging hooks and Ceph's dout logging.
+ *
+ * LEVELS is indexed by the Accelio log level and gives, for each, a
+ * label and the Ceph debug level it is logged at.
+ */
+namespace xio_log
+{
+typedef pair<const char*, int> level_pair;
+static const level_pair LEVELS[] = {
+  make_pair("fatal", 0),
+  make_pair("error", 0),
+  make_pair("warn", 1),
+  make_pair("info", 1),
+  make_pair("debug", 2),
+  make_pair("trace", 20)
+};
+
+/* number of entries in LEVELS (sizeof(LEVELS) alone is a byte count) */
+static const size_t NUM_LEVELS = sizeof(LEVELS) / sizeof(LEVELS[0]);
+
+static CephContext *context;
+
+/* Count how many leading entries of LEVELS are enabled for this
+ * subsystem in `context`, stopping at the first one that is not. */
+int get_level()
+{
+  int level = 0;
+  for (size_t i = 0; i < NUM_LEVELS; i++) {
+    if (!ldlog_p1(context, dout_subsys, LEVELS[i].second))
+      break;
+    level++;
+  }
+  return level;
+}
+
+/* Log callback handed to Accelio: formats the printf-style message and
+ * emits it through ldout at the Ceph level mapped from `level`,
+ * prefixed with the level label, source file basename, line and
+ * function.  Nothing is logged when formatting produces no output. */
+void log_dout(const char *file, unsigned line,
+	      const char *function, unsigned level,
+	      const char *fmt, ...)
+{
+  char buffer[2048];
+  va_list args;
+  va_start(args, fmt);
+  int n = vsnprintf(buffer, sizeof(buffer), fmt, args);
+  va_end(args);
+
+  if (n > 0) {
+    const char *short_file = strrchr(file, '/');
+    short_file = (short_file == NULL) ? file : short_file + 1;
+
+    /* never index past the table; treat anything beyond the last known
+     * level as that last level */
+    if (level >= NUM_LEVELS)
+      level = NUM_LEVELS - 1;
+
+    const level_pair &lvl = LEVELS[level];
+    ldout(context, lvl.second) << '[' << lvl.first << "] "
+      << short_file << ':' << line << ' '
+      << function << " - " << buffer << dendl;
+  }
+}
+}
+
+/* Accelio session-event callout: logs the event and its reason, then
+ * forwards to the XioMessenger passed as cb_user_context. */
+static int on_session_event(struct xio_session *session,
+			    struct xio_session_event_data *event_data,
+			    void *cb_user_context)
+{
+  XioMessenger *xmsgr = static_cast<XioMessenger*>(cb_user_context);
+
+  ldout(xmsgr->cct,4) << "session event: "
+    << xio_session_event_str(event_data->event)
+    << ". reason: " << xio_strerror(event_data->reason) << dendl;
+
+  return xmsgr->session_event(session, event_data, cb_user_context);
+}
+
+/* Accelio new-session callout: logs the session and forwards the
+ * request to the XioMessenger passed as cb_user_context. */
+static int on_new_session(struct xio_session *session,
+			  struct xio_new_session_req *req,
+			  void *cb_user_context)
+{
+  XioMessenger *xmsgr = static_cast<XioMessenger*>(cb_user_context);
+
+  ldout(xmsgr->cct,4) << "new session " << session
+    << " user_context " << cb_user_context << dendl;
+
+  return xmsgr->new_session(session, req, cb_user_context);
+}
+
+/* Accelio inbound-message callout: forwards to the XioConnection
+ * passed as cb_user_context.  When XioPool::trace_mempool is set, the
+ * pool statistics are dumped once every 65536 calls. */
+static int on_msg(struct xio_session *session,
+		  struct xio_msg *req,
+		  int more_in_batch,
+		  void *cb_user_context)
+{
+  XioConnection* xcon __attribute__((unused)) =
+    static_cast<XioConnection*>(cb_user_context);
+
+  ldout(xcon->get_messenger()->cct,25) << "on_msg session " << session
+    << " xcon " << xcon << dendl;
+
+  if (unlikely(XioPool::trace_mempool)) {
+    static uint32_t nreqs;
+    ++nreqs;
+    if (unlikely((nreqs % 65536) == 0)) {
+      xp_stats.dump(__func__, nreqs);
+    }
+  }
+
+  return xcon->on_msg(session, req, more_in_batch, cb_user_context);
+}
+
+/* Accelio one-way send-complete callout: logs and forwards to the
+ * XioConnection passed as conn_user_context. */
+static int on_ow_msg_send_complete(struct xio_session *session,
+				   struct xio_msg *msg,
+				   void *conn_user_context)
+{
+  XioConnection *xcon = static_cast<XioConnection*>(conn_user_context);
+
+  ldout(xcon->get_messenger()->cct,25)
+    << "msg delivered session: " << session
+    << " msg: " << msg << " conn_user_context "
+    << conn_user_context << dendl;
+
+  return xcon->on_ow_msg_send_complete(session, msg, conn_user_context);
+}
+
+static int on_msg_error(struct xio_session *session,
+			enum xio_status error,
+			enum xio_msg_direction dir,
+			struct xio_msg  *msg,
+			void *conn_user_context)
+{
+  /* XIO promises to flush back undelivered messages */
+  XioConnection *xcon =
+    static_cast<XioConnection*>(conn_user_context);
+  CephContext *cct = xcon->get_messenger()->cct;
+
+  ldout(cct,4) << "msg error session: " << session
+    << " error: " << xio_strerror(error) << " msg: " << msg
+    << " conn_user_context " << conn_user_context << dendl;
+
+  return xcon->on_msg_error(session, error, msg, conn_user_context);
+}
+
+/* Accelio cancel callout: only logs the event.  Always returns 0. */
+static int on_cancel(struct xio_session *session,
+		     struct xio_msg  *msg,
+		     enum xio_status result,
+		     void *conn_user_context)
+{
+  XioConnection* xcon __attribute__((unused)) =
+    static_cast<XioConnection*>(conn_user_context);
+
+  ldout(xcon->get_messenger()->cct,25)
+    << "on cancel: session: " << session << " msg: " << msg
+    << " conn_user_context " << conn_user_context << dendl;
+
+  return 0;
+}
+
+/* Accelio cancel-request callout: only logs the event.  Always
+ * returns 0. */
+static int on_cancel_request(struct xio_session *session,
+			     struct xio_msg  *msg,
+			     void *conn_user_context)
+{
+  XioConnection* xcon __attribute__((unused)) =
+    static_cast<XioConnection*>(conn_user_context);
+
+  ldout(xcon->get_messenger()->cct,25)
+    << "on cancel request: session: " << session << " msg: " << msg
+    << " conn_user_context " << conn_user_context << dendl;
+
+  return 0;
+}
+
+/* free functions */
+/* Build an Accelio URI ("<scheme>://<host>[:<port>]") for addr.
+ *
+ * type selects the scheme when it is "rdma" or "tcp"; any other value
+ * falls back to "rdma".  The port is appended only when want_port is
+ * true.  Aborts if the address family is neither AF_INET nor AF_INET6,
+ * or if the address cannot be rendered as text.
+ */
+static string xio_uri_from_entity(const string &type,
+				  const entity_addr_t& addr, bool want_port)
+{
+  const char *host = NULL;
+  char addr_buf[129];
+  string xio_uri;
+
+  switch(addr.get_family()) {
+  case AF_INET:
+    host = inet_ntop(AF_INET, &addr.in4_addr().sin_addr, addr_buf,
+		     INET_ADDRSTRLEN);
+    break;
+  case AF_INET6:
+    host = inet_ntop(AF_INET6, &addr.in6_addr().sin6_addr, addr_buf,
+		     INET6_ADDRSTRLEN);
+    break;
+  default:
+    abort();
+    break;
+  };
+
+  /* inet_ntop() returns NULL on failure; appending a null char* to a
+   * std::string is undefined, so fail the same way as for an
+   * unsupported address family */
+  if (host == NULL)
+    abort();
+
+  if (type == "rdma" || type == "tcp")
+      xio_uri = type + "://";
+  else
+      xio_uri = "rdma://";
+
+  /* The following can only succeed if the host is rdma-capable */
+  xio_uri += host;
+  if (want_port) {
+    xio_uri += ":";
+    xio_uri += boost::lexical_cast<std::string>(addr.get_port());
+  }
+
+  return xio_uri;
+} /* xio_uri_from_entity */
+
+/* One-time, process-wide Accelio setup.
+ *
+ * The first caller (checked again under mtx) initializes the library,
+ * installs the logging hooks, sets the library options, configures the
+ * memory pools and fills in the session-ops table; every later call
+ * returns immediately.  The CephContext of that first caller is
+ * retained (via get()) for use by the logging hooks.
+ */
+void XioInit::package_init(CephContext *cct) {
+  /* fast path: setup already done */
+  if (initialized)
+    return;
+
+  mtx.Lock();
+  if (initialized) {
+    /* another caller finished the setup while we waited */
+    mtx.Unlock();
+    return;
+  }
+
+  xio_init();
+
+  // claim a reference to the first context we see
+  xio_log::context = cct->get();
+
+  int xopt;
+
+  /* logging: level and callback */
+  xopt = xio_log::get_level();
+  xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_LOG_LEVEL,
+              &xopt, sizeof(xopt));
+  xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_LOG_FN,
+              (const void*)xio_log::log_dout, sizeof(xio_log_fn));
+
+  xopt = 1;
+  xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_DISABLE_HUGETBL,
+              &xopt, sizeof(xopt));
+
+  if (g_code_env == CODE_ENVIRONMENT_DAEMON) {
+    xopt = 1;
+    xio_set_opt(NULL, XIO_OPTLEVEL_RDMA, XIO_OPTNAME_ENABLE_FORK_INIT,
+                &xopt, sizeof(xopt));
+  }
+
+  /* same iovec limit for both directions */
+  xopt = XIO_MSGR_IOVLEN;
+  xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_MAX_IN_IOVLEN,
+              &xopt, sizeof(xopt));
+  xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_MAX_OUT_IOVLEN,
+              &xopt, sizeof(xopt));
+
+  /* enable flow-control */
+  xopt = 1;
+  xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_ENABLE_FLOW_CONTROL,
+              &xopt, sizeof(xopt));
+
+  /* and set threshold for buffer callouts */
+  xopt = max(cct->_conf->xio_max_send_inline, 512);
+  xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_MAX_INLINE_XIO_DATA,
+              &xopt, sizeof(xopt));
+
+  xopt = XioMsgHdr::get_max_encoded_length();
+  ldout(cct,2) << "setting accelio max header size " << xopt << dendl;
+  xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_MAX_INLINE_XIO_HEADER,
+              &xopt, sizeof(xopt));
+
+  /* library memory pool: six slabs, the three smallest sized from the
+   * configured queue depth */
+  size_t queue_depth = cct->_conf->xio_queue_depth;
+  struct xio_mempool_config mempool_config = {
+    6,
+    {
+      {1024,  0,  queue_depth,  262144},
+      {4096,  0,  queue_depth,  262144},
+      {16384, 0,  queue_depth,  262144},
+      {65536, 0,  128,  65536},
+      {262144, 0,  32,  16384},
+      {1048576, 0, 8,  8192}
+    }
+  };
+  xio_set_opt(NULL,
+              XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_CONFIG_MEMPOOL,
+              &mempool_config, sizeof(mempool_config));
+
+  /* and an unregistered one */
+#define XMSG_MEMPOOL_QUANTUM 4096
+
+  xio_msgr_noreg_mpool =
+    xio_mempool_create(-1 /* nodeid */,
+                       XIO_MEMPOOL_FLAG_REGULAR_PAGES_ALLOC);
+
+  (void) xio_mempool_add_slab(xio_msgr_noreg_mpool, 64,
+                              cct->_conf->xio_mp_min,
+                              cct->_conf->xio_mp_max_64,
+                              XMSG_MEMPOOL_QUANTUM, 0);
+  (void) xio_mempool_add_slab(xio_msgr_noreg_mpool, 256,
+                              cct->_conf->xio_mp_min,
+                              cct->_conf->xio_mp_max_256,
+                              XMSG_MEMPOOL_QUANTUM, 0);
+  (void) xio_mempool_add_slab(xio_msgr_noreg_mpool, 1024,
+                              cct->_conf->xio_mp_min,
+                              cct->_conf->xio_mp_max_1k,
+                              XMSG_MEMPOOL_QUANTUM, 0);
+  (void) xio_mempool_add_slab(xio_msgr_noreg_mpool, getpagesize(),
+                              cct->_conf->xio_mp_min,
+                              cct->_conf->xio_mp_max_page,
+                              XMSG_MEMPOOL_QUANTUM, 0);
+
+  /* initialize ops singleton */
+  xio_msgr_ops.on_session_event = on_session_event;
+  xio_msgr_ops.on_new_session = on_new_session;
+  xio_msgr_ops.on_session_established = NULL;
+  xio_msgr_ops.on_msg = on_msg;
+  xio_msgr_ops.on_ow_msg_send_complete = on_ow_msg_send_complete;
+  xio_msgr_ops.on_msg_error = on_msg_error;
+  xio_msgr_ops.on_cancel = on_cancel;
+  xio_msgr_ops.on_cancel_request = on_cancel_request;
+
+  /* mark initialized */
+  initialized = true;
+  mtx.Unlock();
+}
+
+/* XioMessenger */
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+/* Log prefix used by dout in XioMessenger methods: "-- <legacy addr> ". */
+static ostream& _prefix(std::ostream *_dout, XioMessenger *msgr) {
+  std::ostream& os = *_dout;
+  os << "-- " << msgr->get_myaddr_legacy() << " ";
+  return os;
+}
+
+/*
+ * Build a messenger on top of Accelio.
+ *
+ * cflags selects the portal count and connections-per-portal (see
+ * get_nportals / get_nconns_per_portal); ds is the dispatch strategy this
+ * messenger will use and later delete in its destructor.
+ */
+XioMessenger::XioMessenger(CephContext *cct, entity_name_t name,
+			   string mname, uint64_t _nonce,
+			   uint64_t cflags, DispatchStrategy *ds)
+  : SimplePolicyMessenger(cct, name, mname, _nonce),
+    XioInit(cct),
+    portals(this, get_nportals(cflags), get_nconns_per_portal(cflags)),
+    dispatch_strategy(ds),
+    loop_con(new XioLoopbackConnection(this)),
+    special_handling(0),
+    sh_mtx("XioMessenger session mutex"),
+    sh_cond(),
+    need_addr(true),
+    did_bind(false),
+    nonce(_nonce)
+{
+  /* optional tracing switches, all driven by configuration */
+  if (cct->_conf->xio_trace_xcon) {
+    magic |= MSG_MAGIC_TRACE_XCON;
+  }
+  XioPool::trace_mempool = cct->_conf->xio_trace_mempool;
+  XioPool::trace_msgcnt = cct->_conf->xio_trace_msgcnt;
+
+  dispatch_strategy->set_messenger(this);
+
+  /* count this instance before it is reported in the log line below */
+  ++nInstances;
+
+  loop_con->set_features(CEPH_FEATURES_ALL);
+
+  ldout(cct,2) << "Create msgr: " << this << " instance: "
+    << nInstances << " type: " << name.type_str()
+    << " subtype: " << mname << " nportals: " << get_nportals(cflags)
+    << " nconns_per_portal: " << get_nconns_per_portal(cflags)
+    << dendl;
+} /* ctor */
+
+/*
+ * Ask the unregistered mempool for a slab of dsize bytes.
+ * Sizes above 1 MiB are ignored and reported as success (0); otherwise the
+ * result of xio_mempool_add_slab() is returned unchanged.
+ */
+int XioMessenger::pool_hint(uint32_t dsize) {
+  if (dsize <= 1024*1024) {
+    /* if dsize is already present, returns -EEXIST */
+    return xio_mempool_add_slab(xio_msgr_noreg_mpool, dsize, 0,
+				   cct->_conf->xio_mp_max_hint,
+				   XMSG_MEMPOOL_QUANTUM, 0);
+  }
+  return 0;
+}
+
+/*
+ * Connections per portal for the given creation flags.  Never below 8;
+ * HAS_MANY_CONNECTIONS takes precedence over HEARTBEAT.
+ */
+int XioMessenger::get_nconns_per_portal(uint64_t cflags)
+{
+  const int default_nconns = 8;
+
+  if (cflags & Messenger::HAS_MANY_CONNECTIONS) {
+    return max(cct->_conf->xio_max_conns_per_portal, default_nconns);
+  }
+  if (cflags & Messenger::HEARTBEAT) {
+    return max(cct->_conf->osd_heartbeat_min_peers * 4, default_nconns);
+  }
+  return default_nconns;
+}
+
+/*
+ * Number of portals for the given creation flags: one, unless
+ * HAS_HEAVY_TRAFFIC is set, in which case xio_portal_threads (at least 1).
+ */
+int XioMessenger::get_nportals(uint64_t cflags)
+{
+  if (!(cflags & Messenger::HAS_HEAVY_TRAFFIC)) {
+    return 1;
+  }
+  return max(cct->_conf->xio_portal_threads, 1);
+}
+
+/*
+ * Adopt the address a peer saw us on, keeping our current port.
+ * Only the first call while need_addr is still true has any effect.
+ */
+void XioMessenger::learned_addr(const entity_addr_t &peer_addr_for_me)
+{
+  // careful: several threads may get here at once, and readers of
+  // my_inst.addr hold no lock.
+  //
+  // Within this function need_addr only moves true -> false, and only
+  // while sh_mtx is held, so when it already reads false the mutex is
+  // not taken at all.
+  if (need_addr) {
+    Mutex::Locker guard(sh_mtx);
+    if (need_addr) {
+      entity_addr_t learned = peer_addr_for_me;
+      learned.set_port(my_inst.addr.get_port());
+      my_inst.addr.set_sockaddr(learned.get_sockaddr());
+      ldout(cct,2) << "learned my addr " << my_inst.addr << dendl;
+      need_addr = false;
+      // init_local_connection();
+    }
+  }
+}
+
+/*
+ * Handle an incoming session request.  After shutdown() has been called
+ * the session is rejected; otherwise it is handed to the portals and,
+ * when accept returns 0, counted in nsessions.
+ */
+int XioMessenger::new_session(struct xio_session *session,
+			      struct xio_new_session_req *req,
+			      void *cb_user_context)
+{
+  if (shutdown_called) {
+    return xio_reject(
+      session, XIO_E_SESSION_REFUSED, NULL /* udata */, 0 /* udata len */);
+  }
+
+  const int code = portals.accept(session, req, cb_user_context);
+  if (code == 0) {
+    ++nsessions;
+  }
+  return code;
+} /* new_session */
+
+/*
+ * Accelio session-event handler for this messenger.
+ *
+ *  - CONNECTION_ESTABLISHED: learn our own address from the connection's
+ *    local address and deliver the connect notifications.
+ *  - NEW_CONNECTION: create a PASSIVE XioConnection, attach it to the
+ *    portal found in the xio context's user_context, and put it on
+ *    conns_list.
+ *  - CONNECTION_ERROR / CLOSED / DISCONNECTED / REFUSED: unregister the
+ *    connection and run its disconnect handler.
+ *  - CONNECTION_TEARDOWN: unregister (again, if needed) and run the
+ *    teardown handler.
+ *  - SESSION_TEARDOWN: destroy the session and wake shutdown() when the
+ *    last session is gone.
+ *
+ * Always returns 0.
+ */
+int XioMessenger::session_event(struct xio_session *session,
+				struct xio_session_event_data *event_data,
+				void *cb_user_context)
+{
+  XioConnection *xcon;
+
+  switch (event_data->event) {
+  case XIO_SESSION_CONNECTION_ESTABLISHED_EVENT:
+  {
+    struct xio_connection *conn = event_data->conn;
+    struct xio_connection_attr xcona;
+    entity_addr_t peer_addr_for_me, paddr;
+
+    xcon = static_cast<XioConnection*>(event_data->conn_user_context);
+
+    ldout(cct,2) << "connection established " << event_data->conn
+      << " session " << session << " xcon " << xcon << dendl;
+
+    (void) xio_query_connection(conn, &xcona,
+				XIO_CONNECTION_ATTR_LOCAL_ADDR|
+				XIO_CONNECTION_ATTR_PEER_ADDR);
+    peer_addr_for_me.set_sockaddr((struct sockaddr *)&xcona.local_addr);
+    paddr.set_sockaddr((struct sockaddr *)&xcona.peer_addr);
+    //set_myaddr(peer_addr_for_me);
+    learned_addr(peer_addr_for_me);
+    ldout(cct,2) << "client: connected from " << peer_addr_for_me << " to " << paddr << dendl;
+
+    /* notify hook */
+    this->ms_deliver_handle_connect(xcon);
+    this->ms_deliver_handle_fast_connect(xcon);
+  }
+  break;
+
+  case XIO_SESSION_NEW_CONNECTION_EVENT:
+  {
+    struct xio_connection *conn = event_data->conn;
+    struct xio_connection_attr xcona;
+    entity_inst_t s_inst;
+    entity_addr_t peer_addr_for_me;
+
+    (void) xio_query_connection(conn, &xcona,
+				XIO_CONNECTION_ATTR_CTX|
+				XIO_CONNECTION_ATTR_PEER_ADDR|
+				XIO_CONNECTION_ATTR_LOCAL_ADDR);
+    /* XXX assumes RDMA */
+    s_inst.addr.set_sockaddr((struct sockaddr *)&xcona.peer_addr);
+    peer_addr_for_me.set_sockaddr((struct sockaddr *)&xcona.local_addr);
+
+    xcon = new XioConnection(this, XioConnection::PASSIVE, s_inst);
+    xcon->session = session;
+
+    /* the portal that owns this connection is the context's user_context */
+    struct xio_context_attr xctxa;
+    (void) xio_query_context(xcona.ctx, &xctxa, XIO_CONTEXT_ATTR_USER_CTX);
+
+    xcon->conn = conn;
+    xcon->portal = static_cast<XioPortal*>(xctxa.user_context);
+    ceph_assert(xcon->portal);
+
+    /* later events for this connection carry xcon as conn_user_context */
+    xcona.user_context = xcon;
+    (void) xio_modify_connection(conn, &xcona, XIO_CONNECTION_ATTR_USER_CTX);
+
+    xcon->connected = true;
+
+    /* sentinel ref */
+    xcon->get(); /* xcon->nref == 1 */
+    conns_sp.lock();
+    conns_list.push_back(*xcon);
+    /* XXX we can't put xcon in conns_entity_map because we don't yet know
+     * its peer address */
+    conns_sp.unlock();
+
+    /* XXXX pre-merge of session startup negotiation ONLY! */
+    xcon->cstate.state_up_ready(XioConnection::CState::OP_FLAG_NONE);
+
+    ldout(cct,2) << "New connection session " << session
+      << " xcon " << xcon << " on msgr: " << this << " portal: " << xcon->portal << dendl;
+    ldout(cct,2) << "Server: connected from " << s_inst.addr << " to " << peer_addr_for_me << dendl;
+  }
+  break;
+  case XIO_SESSION_CONNECTION_ERROR_EVENT:
+  case XIO_SESSION_CONNECTION_CLOSED_EVENT: /* orderly discon */
+  case XIO_SESSION_CONNECTION_DISCONNECTED_EVENT: /* unexpected discon */
+  case XIO_SESSION_CONNECTION_REFUSED_EVENT:
+    xcon = static_cast<XioConnection*>(event_data->conn_user_context);
+    ldout(cct,2) << xio_session_event_str(event_data->event)
+      << " xcon " << xcon << " session " << session  << dendl;
+    if (likely(!!xcon)) {
+      unregister_xcon(xcon);
+      xcon->on_disconnect_event();
+    }
+    break;
+  case XIO_SESSION_CONNECTION_TEARDOWN_EVENT:
+    xcon = static_cast<XioConnection*>(event_data->conn_user_context);
+    ldout(cct,2) << xio_session_event_str(event_data->event)
+      << " xcon " << xcon << " session " << session << dendl;
+    /*
+     * There are flows where Accelio sends teardown event without going
+     * through disconnect event. so we make sure we cleaned the connection.
+     *
+     * Guard against a missing user context exactly as the disconnect
+     * cases above do, instead of dereferencing a null xcon.
+     */
+    if (likely(!!xcon)) {
+      unregister_xcon(xcon);
+      xcon->on_teardown_event();
+    }
+    break;
+  case XIO_SESSION_TEARDOWN_EVENT:
+    ldout(cct,2) << xio_session_event_str(event_data->event)
+      << " session " << session << dendl;
+    if (unlikely(XioPool::trace_mempool)) {
+      xp_stats.dump("xio session dtor", reinterpret_cast<uint64_t>(session));
+    }
+    xio_session_destroy(session);
+    /* last session gone: wake a shutdown() waiting on sh_cond */
+    if (--nsessions == 0) {
+      Mutex::Locker lck(sh_mtx);
+      if (nsessions == 0)
+	sh_cond.Signal();
+    }
+    break;
+  default:
+    break;
+  }
+
+  return 0;
+}
+
+/*
+ * Tag passed to xio_place_buffers() saying which part of a Message the
+ * buffer list came from (payload, middle or data).
+ */
+enum bl_type
+{
+  BUFFER_PAYLOAD,
+  BUFFER_MIDDLE,
+  BUFFER_DATA
+};
+
+/*
+ * Upper bound on the bytes placed into a single xio request by
+ * xio_count_buffers() / xio_place_buffers(); 1044480 == 1020 * 1024.
+ */
+#define MAX_XIO_BUF_SIZE 1044480
+
+/*
+ * Dry run of xio_place_buffers(): count how many iovec entries the ptrs of
+ * bl need, splitting wherever a request would exceed MAX_XIO_BUF_SIZE bytes
+ * or XIO_MSGR_IOVLEN entries.
+ *
+ * req_size, msg_off and req_off are running cursors (bytes in the current
+ * request, entries in the current request, index of the current request)
+ * and are carried across successive calls by the caller.
+ *
+ * Returns the number of iovec entries counted for this list.
+ */
+static inline int
+xio_count_buffers(const buffer::list& bl, int& req_size, int& msg_off, int& req_off)
+{
+  const std::list<buffer::ptr>& ptrs = bl.buffers();
+  int niov = 0;
+
+  for (std::list<buffer::ptr>::const_iterator bp = ptrs.begin();
+       bp != ptrs.end(); ++bp) {
+    const size_t len = bp->length();
+    size_t pos = 0;
+
+    /* empty ptrs contribute nothing */
+    while (pos < len) {
+      size_t chunk = len - pos;
+      if (req_size + chunk > MAX_XIO_BUF_SIZE) {
+	chunk = MAX_XIO_BUF_SIZE - req_size;
+      }
+
+      ++niov;
+
+      /* advance iov and perhaps request */
+      pos += chunk;
+      req_size += chunk;
+      ++msg_off;
+      if (unlikely(msg_off >= XIO_MSGR_IOVLEN || req_size >= MAX_XIO_BUF_SIZE)) {
+	++req_off;
+	msg_off = 0;
+	req_size = 0;
+      }
+    }
+  }
+
+  return niov;
+}
+
+/*
+ * Fill the scatter/gather entries of one or more xio requests with the
+ * ptrs of bl.
+ *
+ * req / msg_iov name the request currently being filled and its sglist;
+ * req_size, msg_off and req_off are the same running cursors used by
+ * xio_count_buffers() and are carried across successive calls.  When the
+ * current request reaches XIO_MSGR_IOVLEN entries or MAX_XIO_BUF_SIZE bytes
+ * its nents is recorded and filling moves on to xmsg->req_arr[++req_off];
+ * once req_off reaches ex_cnt, req and msg_iov are set to null instead.
+ *
+ * The caller is responsible for recording nents on the last, partially
+ * filled request.
+ */
+static inline void
+xio_place_buffers(const buffer::list& bl, XioMsg *xmsg, struct xio_msg*& req,
+		  struct xio_iovec_ex*& msg_iov, int& req_size,
+		  int ex_cnt, int& msg_off, int& req_off, bl_type type)
+{
+
+  const std::list<buffer::ptr>& buffers = bl.buffers();
+  list<bufferptr>::const_iterator pb;
+  struct xio_iovec_ex* iov;
+  size_t size, off;
+  const char *data = NULL;
+  int first = 1;
+
+  off = size = 0;
+  for (;;) {
+    /* current ptr exhausted (or none yet): step to the next one */
+    if (off >= size) {
+      if (first) pb = buffers.begin(); else ++pb;
+      if (pb == buffers.end()) {
+	break;
+      }
+      off = 0;
+      size = pb->length();
+      data = pb->c_str();	 // is c_str() efficient?
+      first = 0;
+    }
+    size_t count = size - off;
+    /* zero-length ptr: loop again so the block above advances past it */
+    if (!count) continue;
+    /* clamp so the current request never exceeds MAX_XIO_BUF_SIZE bytes */
+    if (req_size + count > MAX_XIO_BUF_SIZE) {
+	count = MAX_XIO_BUF_SIZE - req_size;
+    }
+
+    /* assign buffer */
+    iov = &msg_iov[msg_off];
+    iov->iov_base = (void *) (&data[off]);
+    iov->iov_len = count;
+
+    /*
+     * BUFFER_DATA falls through to default (its break is commented out),
+     * so every type currently gets the same memory-region lookup.
+     */
+    switch (type) {
+    case BUFFER_DATA:
+      //break;
+    default:
+    {
+      struct xio_reg_mem *mp = get_xio_mp(*pb);
+      iov->mr = (mp) ? mp->mr : NULL;
+    }
+      break;
+    }
+
+    /* advance iov(s) */
+
+    off += count;
+    req_size += count;
+    ++msg_off;
+
+    /* next request if necessary */
+
+    if (unlikely(msg_off >= XIO_MSGR_IOVLEN || req_size >= MAX_XIO_BUF_SIZE)) {
+      /* finish this request */
+      req->out.pdata_iov.nents = msg_off;
+      /* advance to next, and write in it if it's not the last one. */
+      if (++req_off >= ex_cnt) {
+	req = 0;	/* poison.  trap if we try to use it. */
+	msg_iov = NULL;
+      } else {
+	req = &xmsg->req_arr[req_off].msg;
+	msg_iov = req->out.pdata_iov.sglist;
+      }
+      msg_off = 0;
+      req_size = 0;
+    }
+  }
+}
+
+/*
+ * Bind the portals to addr.
+ *
+ * A blank IP is refused with -1.  Otherwise the result of portals.bind()
+ * is returned; on success (0) the messenger's address is updated with the
+ * port actually bound and this messenger's nonce.
+ */
+int XioMessenger::bind(const entity_addr_t& addr)
+{
+  if (addr.is_blank_ip()) {
+      lderr(cct) << "ERROR: need rdma ip for remote use! " << dendl;
+      cout << "Error: xio bind failed. public/cluster ip not specified" << std::endl;
+      return -1;
+  }
+
+  entity_addr_t bound_addr = addr;
+  string uri_no_port = xio_uri_from_entity(cct->_conf->xio_transport_type,
+					   bound_addr, false /* want_port */);
+  ldout(cct,4) << "XioMessenger " << this << " bind: xio_uri "
+    << uri_no_port << ':' << bound_addr.get_port() << dendl;
+
+  uint16_t assigned_port;
+  const int rc = portals.bind(&xio_msgr_ops, uri_no_port,
+			      bound_addr.get_port(), &assigned_port);
+  if (rc != 0) {
+    return rc;
+  }
+
+  /* publish the address we ended up on */
+  bound_addr.set_port(assigned_port);
+  bound_addr.nonce = nonce;
+  set_myaddr(bound_addr);
+  need_addr = false;
+  did_bind = true;
+  return rc;
+} /* bind */
+
+/*
+ * Rebind is not implemented here: the attempt is logged, avoid_ports is
+ * ignored, and 0 is returned.
+ */
+int XioMessenger::rebind(const set<int>& avoid_ports)
+{
+  ldout(cct,4) << "XioMessenger " << this << " rebind attempt" << dendl;
+  return 0;
+} /* rebind */
+
+/*
+ * Start the portals and the dispatch strategy.  If bind() never succeeded,
+ * the nonce is stamped onto my_inst.addr here instead.  Always returns 0.
+ */
+int XioMessenger::start()
+{
+  portals.start();
+  dispatch_strategy->start();
+  if (!did_bind) {
+	  my_inst.addr.nonce = nonce;
+  }
+  started = true;
+  return 0;
+}
+
+/* Join the portals, then wait on the dispatch strategy. */
+void XioMessenger::wait()
+{
+  portals.join();
+  dispatch_strategy->wait();
+} /* wait */
+
+/*
+ * Send m to dest, looking up or creating the connection first.
+ * Returns EINVAL (positive) when no connection could be obtained.
+ */
+int XioMessenger::_send_message(Message *m, const entity_inst_t& dest)
+{
+  ConnectionRef conn = get_connection(dest);
+  if (!conn) {
+    return EINVAL;
+  }
+  return _send_message(m, &(*conn));
+} /* send_message(Message *, const entity_inst_t&) */
+
+/*
+ * Allocate storage for an XioMsg from the unregistered mempool and
+ * construct it in place.  Returns NULL when xpool_alloc() reports failure.
+ */
+static inline XioMsg* pool_alloc_xio_msg(Message *m, XioConnection *xcon,
+  int ex_cnt)
+{
+  struct xio_reg_mem mp_mem;
+  if (xpool_alloc(xio_msgr_noreg_mpool, sizeof(XioMsg), &mp_mem) != 0) {
+    return NULL;
+  }
+
+  XioMsg *slot = reinterpret_cast<XioMsg*>(mp_mem.addr);
+  ceph_assert(slot != NULL);
+  new (slot) XioMsg(m, xcon, mp_mem, ex_cnt, CEPH_FEATURES_ALL);
+  return slot;
+}
+
+/*
+ * Allocate storage for an XioCommand from the unregistered mempool and
+ * construct it in place.  Returns NULL when xpool_alloc() reports failure.
+ */
+XioCommand* pool_alloc_xio_command(XioConnection *xcon)
+{
+  struct xio_reg_mem mp_mem;
+  if (xpool_alloc(xio_msgr_noreg_mpool, sizeof(XioCommand), &mp_mem) != 0) {
+    return NULL;
+  }
+
+  XioCommand *slot = reinterpret_cast<XioCommand*>(mp_mem.addr);
+  ceph_assert(slot != NULL);
+  new (slot) XioCommand(xcon, mp_mem);
+  return slot;
+}
+
+/*
+ * Send m on con.
+ *
+ * Messages for the loopback connection are stamped and dispatched locally.
+ * For a real connection whose session state is not UP, the message is
+ * parked on the connection's outgoing queue (under xcon->sp) and 0 is
+ * returned; otherwise it goes to _send_message_impl().
+ */
+int XioMessenger::_send_message(Message *m, Connection *con)
+{
+  if (con == loop_con.get() /* intrusive_ptr get() */) {
+    m->set_connection(con);
+    m->set_src(get_myinst().name);
+    m->set_seq(loop_con->next_seq());
+    ds_dispatch(m);
+    return 0;
+  }
+
+  XioConnection *xcon = static_cast<XioConnection*>(con);
+
+  /* If con is not in READY state, we have to enforce policy */
+  if (xcon->cstate.session_state.read() != XioConnection::UP) {
+    /* the original spelling was missing the closing '>' of the template
+     * argument list and could not compile */
+    std::lock_guard<decltype(xcon->sp)> lg(xcon->sp);
+
+    /* re-test under the lock: the state may have become UP meanwhile */
+    if (xcon->cstate.session_state.read() != XioConnection::UP) {
+      xcon->outgoing.mqueue.push_back(*m);
+      return 0;
+    }
+  }
+
+  return _send_message_impl(m, xcon);
+} /* send_message(Message* m, Connection *con) */
+
+/*
+ * Encode m and hand it to xcon's portal for transmission.
+ *
+ * Steps: stamp sequence/magic and encode; count the iovec entries needed
+ * for payload+middle+data to learn how many extra xio requests (ex_cnt) are
+ * required; allocate an XioMsg; place the buffers into the request(s);
+ * attach the encoded XioMsgHdr to the first request; chain the requests and
+ * enqueue them on the portal.
+ *
+ * Runs with xcon->lock held for its whole duration.
+ * Returns 0, or ENOMEM (positive) when no XioMsg could be allocated.
+ */
+int XioMessenger::_send_message_impl(Message* m, XioConnection* xcon)
+{
+  int code = 0;
+
+  Mutex::Locker l(xcon->lock);
+  /* with mempool tracing on, dump pool stats every 65536 calls */
+  if (unlikely(XioPool::trace_mempool)) {
+    static uint32_t nreqs;
+    if (unlikely((++nreqs % 65536) == 0)) {
+      xp_stats.dump(__func__, nreqs);
+    }
+  }
+
+  m->set_seq(xcon->state.next_out_seq());
+  m->set_magic(magic); // trace flags and special handling
+
+  m->encode(xcon->get_features(), this->crcflags);
+
+  buffer::list &payload = m->get_payload();
+  buffer::list &middle = m->get_middle();
+  buffer::list &data = m->get_data();
+
+  /* first pass: count only; the cursors are shared across the three lists */
+  int msg_off = 0;
+  int req_off = 0;
+  int req_size = 0;
+  int nbuffers =
+    xio_count_buffers(payload, req_size, msg_off, req_off) +
+    xio_count_buffers(middle, req_size, msg_off, req_off) +
+    xio_count_buffers(data, req_size, msg_off, req_off);
+
+  /* req_off now says how many times the counter rolled to a new request */
+  int ex_cnt = req_off;
+  if (msg_off == 0 && ex_cnt > 0) {
+    // no buffers for last msg
+    ldout(cct,10) << "msg_off 0, ex_cnt " << ex_cnt << " -> " << ex_cnt-1 << dendl;
+    ex_cnt--;
+  }
+
+  /* get an XioMsg frame */
+  XioMsg *xmsg = pool_alloc_xio_msg(m, xcon, ex_cnt);
+  if (! xmsg) {
+    /* could happen if Accelio has been shutdown */
+    return ENOMEM;
+  }
+
+  ldout(cct,4) << __func__ << " " << m << " new XioMsg " << xmsg
+       << " tag " << (int)xmsg->hdr.tag
+       << " req_0 " << xmsg->get_xio_msg() << " msg type " << m->get_type()
+       << " features: " << xcon->get_features()
+       << " conn " << xcon->conn << " sess " << xcon->session << dendl;
+
+  /* debugging aid: hexdump the payload of type-43 messages to stdout */
+  if (magic & (MSG_MAGIC_XIO)) {
+
+    /* XXXX verify */
+    switch (m->get_type()) {
+    case 43:
+    // case 15:
+      ldout(cct,4) << __func__ << " stop 43 " << m->get_type() << " " << *m << dendl;
+      /* NOTE(review): shadows the outer `payload`; same underlying list */
+      buffer::list &payload = m->get_payload();
+      ldout(cct,4) << __func__ << " payload dump:" << dendl;
+      payload.hexdump(cout);
+    }
+  }
+
+  struct xio_msg *req = xmsg->get_xio_msg();
+  struct xio_iovec_ex *msg_iov = req->out.pdata_iov.sglist;
+
+  if (magic & (MSG_MAGIC_XIO)) {
+    ldout(cct,4) << "payload: " << payload.buffers().size() <<
+      " middle: " << middle.buffers().size() <<
+      " data: " << data.buffers().size() <<
+      dendl;
+  }
+
+  if (unlikely(ex_cnt > 0)) {
+    ldout(cct,4) << __func__ << " buffer cnt > XIO_MSGR_IOVLEN (" <<
+      ((XIO_MSGR_IOVLEN-1) + nbuffers) << ")" << dendl;
+  }
+
+  /* do the invariant part */
+  /* second pass: reset the cursors; req_off starts at -1 so the first
+   * roll-over inside xio_place_buffers selects req_arr[0] */
+  msg_off = 0;
+  req_off = -1; /* most often, not used */
+  req_size = 0;
+
+  xio_place_buffers(payload, xmsg, req, msg_iov, req_size, ex_cnt, msg_off,
+		    req_off, BUFFER_PAYLOAD);
+
+  xio_place_buffers(middle, xmsg, req, msg_iov, req_size, ex_cnt, msg_off,
+		    req_off, BUFFER_MIDDLE);
+
+  xio_place_buffers(data, xmsg, req, msg_iov, req_size, ex_cnt, msg_off,
+		    req_off, BUFFER_DATA);
+  ldout(cct,10) << "ex_cnt " << ex_cnt << ", req_off " << req_off
+    << ", msg_cnt " << xmsg->get_msg_count() << dendl;
+
+  /* finalize request */
+  /* a partially filled last request still needs its entry count recorded */
+  if (msg_off)
+    req->out.pdata_iov.nents = msg_off;
+
+  /* fixup first msg */
+  req = xmsg->get_xio_msg();
+
+  /* the encoded XioMsgHdr rides in the first request's header iovec */
+  const std::list<buffer::ptr>& header = xmsg->hdr.get_bl().buffers();
+  ceph_assert(header.size() == 1); /* XXX */
+  list<bufferptr>::const_iterator pb = header.begin();
+  req->out.header.iov_base = (char*) pb->c_str();
+  req->out.header.iov_len = pb->length();
+
+  /* deliver via xio, preserve ordering */
+  /* link the first request and every req_arr entry into one chain */
+  if (xmsg->get_msg_count() > 1) {
+    struct xio_msg *head = xmsg->get_xio_msg();
+    struct xio_msg *tail = head;
+    for (req_off = 0; ((unsigned) req_off) < xmsg->get_msg_count()-1; ++req_off) {
+      req = &xmsg->req_arr[req_off].msg;
+assert(!req->in.pdata_iov.nents);
+assert(req->out.pdata_iov.nents || !nbuffers);
+      tail->next = req;
+      tail = req;
+     }
+    tail->next = NULL;
+  }
+  xmsg->trace = m->trace;
+  m->trace.event("xio portal enqueue for send");
+  m->trace.keyval("xio message segments", xmsg->hdr.msg_cnt);
+  xcon->portal->enqueue_for_send(xcon, xmsg);
+
+  return code;
+} /* send_message(Message *, Connection *) */
+
+/*
+ * Stop the messenger: flag shutdown, disconnect every known connection,
+ * wait until the session count drops to zero, then shut down the portals
+ * and the dispatch strategy.  Always returns 0.
+ */
+int XioMessenger::shutdown()
+{
+  shutdown_called = true;
+
+  {
+    std::lock_guard<decltype(conns_sp)> guard(conns_sp);
+    for (auto& xc : conns_list) {
+      (void) xc.disconnect(); // XXX mark down?
+    }
+  }
+
+  /* session_event() signals sh_cond when the last session is torn down */
+  while (nsessions > 0) {
+    Mutex::Locker lck(sh_mtx);
+    if (nsessions > 0) {
+      sh_cond.Wait(sh_mtx);
+    }
+  }
+
+  portals.shutdown();
+  dispatch_strategy->shutdown();
+  did_bind = false;
+  started = false;
+  return 0;
+} /* shutdown */
+
+/*
+ * Return a connection to dest, creating one if none is registered.
+ *
+ * Returns NULL after shutdown() has been called or when session/connection
+ * creation fails; returns the loopback connection when dest is this
+ * messenger itself.
+ *
+ * NOTE(review): conns_sp is released between the lookup and the insert of a
+ * freshly created connection, so two callers racing on the same dest can
+ * each get past the lookup -- confirm whether callers serialize this.
+ */
+ConnectionRef XioMessenger::get_connection(const entity_inst_t& dest)
+{
+  if (shutdown_called)
+    return NULL;
+
+  const entity_inst_t& self_inst = get_myinst();
+  if ((&dest == &self_inst) ||
+      (dest == self_inst)) {
+    return get_loopback_connection();
+  }
+
+  /* fast path: an existing connection keyed by peer */
+  conns_sp.lock();
+  XioConnection::EntitySet::iterator conn_iter =
+    conns_entity_map.find(dest, XioConnection::EntityComp());
+  if (conn_iter != conns_entity_map.end()) {
+    ConnectionRef cref = &(*conn_iter);
+    conns_sp.unlock();
+    return cref;
+  }
+  else {
+    conns_sp.unlock();
+    string xio_uri = xio_uri_from_entity(cct->_conf->xio_transport_type,
+					 dest.addr, true /* want_port */);
+
+    ldout(cct,4) << "XioMessenger " << this << " get_connection: xio_uri "
+      << xio_uri << dendl;
+
+    /* XXX client session creation parameters */
+    struct xio_session_params params = {};
+    params.type         = XIO_SESSION_CLIENT;
+    params.ses_ops      = &xio_msgr_ops;
+    params.user_context = this;
+    params.uri          = xio_uri.c_str();
+
+    XioConnection *xcon = new XioConnection(this, XioConnection::ACTIVE,
+					    dest);
+
+    xcon->session = xio_session_create(&params);
+    if (! xcon->session) {
+      delete xcon;
+      return NULL;
+    }
+
+    /* this should cause callbacks with user context of conn, but
+     * we can always set it explicitly */
+    struct xio_connection_params xcp = {};
+    xcp.session           = xcon->session;
+    xcp.ctx               = xcon->portal->ctx;
+    xcp.conn_user_context = xcon;
+
+    xcon->conn = xio_connect(&xcp);
+    if (!xcon->conn) {
+      xio_session_destroy(xcon->session);
+      delete xcon;
+      return NULL;
+    }
+
+    /* counted here for client sessions; new_session() counts accepted ones */
+    nsessions++;
+    xcon->connected = true;
+
+    /* sentinel ref */
+    xcon->get(); /* xcon->nref == 1 */
+    conns_sp.lock();
+    conns_list.push_back(*xcon);
+    conns_entity_map.insert(*xcon);
+    conns_sp.unlock();
+
+    /* XXXX pre-merge of session startup negotiation ONLY! */
+    xcon->cstate.state_up_ready(XioConnection::CState::OP_FLAG_NONE);
+
+    ldout(cct,2) << "New connection xcon: " << xcon <<
+      " up_ready on session " << xcon->session <<
+      " on msgr: " << this << " portal: " << xcon->portal << dendl;
+
+    return xcon->get(); /* nref +1 */
+  }
+} /* get_connection */
+
+/* Return the messenger's loopback connection (created in the constructor). */
+ConnectionRef XioMessenger::get_loopback_connection()
+{
+  return (loop_con.get());
+} /* get_loopback_connection */
+
+/*
+ * Remove xcon from the messenger's bookkeeping, under conns_sp.
+ * The entity-map entry for xcon's peer is erased only when it is xcon
+ * itself; the conns_list entry is erased only when xcon is still linked.
+ */
+void XioMessenger::unregister_xcon(XioConnection *xcon)
+{
+  std::lock_guard<decltype(conns_sp)> lckr(conns_sp);
+
+  XioConnection::EntitySet::iterator by_peer =
+    conns_entity_map.find(xcon->peer, XioConnection::EntityComp());
+  if (by_peer != conns_entity_map.end() && &(*by_peer) == xcon) {
+    conns_entity_map.erase(by_peer);
+  }
+
+  /* only touch conns_list when xcon is actually on it */
+  if (xcon->conns_hook.is_linked()) {
+    conns_list.erase(XioConnection::ConnList::s_iterator_to(*xcon));
+  }
+}
+
+/*
+ * Mark down the connection registered for addr, if there is one.
+ * The lookup key is an entity_inst_t with a default entity name.
+ */
+void XioMessenger::mark_down(const entity_addr_t& addr)
+{
+  entity_inst_t key(entity_name_t(), addr);
+  std::lock_guard<decltype(conns_sp)> lckr(conns_sp);
+
+  XioConnection::EntitySet::iterator hit =
+    conns_entity_map.find(key, XioConnection::EntityComp());
+  if (hit == conns_entity_map.end()) {
+    return;
+  }
+  hit->_mark_down(XioConnection::CState::OP_FLAG_NONE);
+} /* mark_down(const entity_addr_t& */
+
+/*
+ * Mark down con directly.  con is cast to XioConnection without a check,
+ * so it must be one of this messenger's xio connections.
+ */
+void XioMessenger::mark_down(Connection* con)
+{
+  XioConnection *xcon = static_cast<XioConnection*>(con);
+  xcon->_mark_down(XioConnection::CState::OP_FLAG_NONE);
+} /* mark_down(Connection*) */
+
+/*
+ * Mark down every connection registered in conns_entity_map, under
+ * conns_sp.
+ *
+ * The loop previously compared the iterator against begin() instead of
+ * end(), so its body never ran and no connection was marked down.
+ */
+void XioMessenger::mark_down_all()
+{
+  std::lock_guard<decltype(conns_sp)> lckr(conns_sp);
+  XioConnection::EntitySet::iterator conn_iter;
+  for (conn_iter = conns_entity_map.begin(); conn_iter !=
+	 conns_entity_map.end(); ++conn_iter) {
+    (*conn_iter)._mark_down(XioConnection::CState::OP_FLAG_NONE);
+  }
+} /* mark_down_all */
+
+/*
+ * Allocate storage for an XioMarkDownHook from the unregistered mempool
+ * and construct it in place.  Returns NULL when xio_mempool_alloc() fails.
+ */
+static inline XioMarkDownHook* pool_alloc_markdown_hook(
+  XioConnection *xcon, Message *m)
+{
+  struct xio_reg_mem mp_mem;
+  if (xio_mempool_alloc(xio_msgr_noreg_mpool,
+			sizeof(XioMarkDownHook), &mp_mem) != 0) {
+    return NULL;
+  }
+
+  XioMarkDownHook *slot = static_cast<XioMarkDownHook*>(mp_mem.addr);
+  new (slot) XioMarkDownHook(xcon, m, mp_mem);
+  return slot;
+}
+
+/*
+ * Send an MNop tagged XIO_NOP_TAG_MARKDOWN on con, with an XioMarkDownHook
+ * as its completion hook, after switching the connection's session state
+ * to BARRIER.
+ *
+ * NOTE(review): pool_alloc_markdown_hook() can return NULL; that result is
+ * passed to set_completion_hook() unchecked, and the return value of
+ * _send_message_impl() is discarded.
+ */
+void XioMessenger::mark_down_on_empty(Connection* con)
+{
+  XioConnection *xcon = static_cast<XioConnection*>(con);
+  MNop* m = new MNop();
+  m->tag = XIO_NOP_TAG_MARKDOWN;
+  m->set_completion_hook(pool_alloc_markdown_hook(xcon, m));
+  // stall new messages
+  xcon->cstate.session_state = XioConnection::session_states::BARRIER;
+  (void) _send_message_impl(m, xcon);
+}
+
+/* Forward to the connection's _mark_disposable(); con is cast unchecked. */
+void XioMessenger::mark_disposable(Connection *con)
+{
+  XioConnection *xcon = static_cast<XioConnection*>(con);
+  xcon->_mark_disposable(XioConnection::CState::OP_FLAG_NONE);
+}
+
+/*
+ * Insert xcon into conns_entity_map under conns_sp.  The result of the
+ * insert is not inspected.
+ */
+void XioMessenger::try_insert(XioConnection *xcon)
+{
+  std::lock_guard<decltype(conns_sp)> lckr(conns_sp);
+  /* already resident in conns_list */
+  conns_entity_map.insert(*xcon);
+}
+
+/*
+ * The messenger owns the dispatch strategy passed to its constructor and
+ * deletes it here; the class-wide instance counter is decremented.
+ */
+XioMessenger::~XioMessenger()
+{
+  delete dispatch_strategy;
+  nInstances--;
+} /* dtor */
diff --git a/src/msg/xio/XioMessenger.h b/src/msg/xio/XioMessenger.h
new file mode 100644
index 00000000..6f8a67ba
--- /dev/null
+++ b/src/msg/xio/XioMessenger.h
@@ -0,0 +1,176 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Portions Copyright (C) 2013 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef XIO_MESSENGER_H
+#define XIO_MESSENGER_H
+
+#include "msg/SimplePolicyMessenger.h"
+
+#include <atomic>
+
+extern "C" {
+#include "libxio.h"
+}
+
+#include "XioConnection.h"
+#include "XioPortal.h"
+#include "QueueStrategy.h"
+#include "common/Thread.h"
+#include "common/Mutex.h"
+#include "include/spinlock.h"
+
+/*
+ * Base-class helper whose constructor runs package_init().  XioMessenger
+ * lists it as a base, so the Accelio setup runs while the messenger's
+ * bases are being constructed, before its members.
+ */
+class XioInit {
+  /* safe to be called multiple times */
+  void package_init(CephContext *cct);
+
+protected:
+  explicit XioInit(CephContext *cct) {
+    this->package_init(cct);
+  }
+};
+
+/*
+ * Messenger implementation on top of Accelio (libxio).
+ *
+ * Connections are tracked twice under conns_sp: conns_list holds every
+ * connection, conns_entity_map indexes them by peer.  Work is spread over
+ * `portals`; received messages are handed to `dispatch_strategy`, which this
+ * class owns and deletes.
+ */
+class XioMessenger : public SimplePolicyMessenger, XioInit
+{
+private:
+  /* NOTE(review): an in-class initializer on a non-inline, non-const static
+   * member is ill-formed; whether to drop it or make the member `inline`
+   * depends on whether XioMessenger.cc also defines it -- confirm. */
+  static std::atomic<uint64_t> nInstances = { 0 };
+  /* live xio sessions; shutdown() waits for this to reach zero */
+  std::atomic<uint64_t> nsessions = { 0 };
+  std::atomic<bool> shutdown_called = { false };
+  /* guards conns_list and conns_entity_map */
+  ceph::spinlock conns_sp;
+  XioConnection::ConnList conns_list;
+  XioConnection::EntitySet conns_entity_map;
+  XioPortals portals;
+  DispatchStrategy* dispatch_strategy;
+  XioLoopbackConnectionRef loop_con;
+  uint32_t special_handling;
+  /* sh_mtx / sh_cond: used by learned_addr() and the shutdown handshake */
+  Mutex sh_mtx;
+  Cond sh_cond;
+  /* true until our address is known, via bind() or learned_addr() */
+  bool need_addr;
+  bool did_bind;
+
+  /// approximately unique ID set by the Constructor for use in entity_addr_t
+  uint64_t nonce;
+
+  friend class XioConnection;
+
+public:
+  XioMessenger(CephContext *cct, entity_name_t name,
+	       string mname, uint64_t nonce,
+	       uint64_t cflags = 0,
+	       DispatchStrategy* ds = new QueueStrategy(1));
+
+  virtual ~XioMessenger();
+
+  XioPortal* get_portal() { return portals.get_next_portal(); }
+
+  /* keep the loopback connection's peer address in step with ours */
+  virtual void set_myaddr(const entity_addr_t& a) {
+    Messenger::set_myaddr(a);
+    loop_con->set_peer_addr(a);
+  }
+
+  int _send_message(Message *m, const entity_inst_t &dest);
+  int _send_message(Message *m, Connection *con);
+  int _send_message_impl(Message *m, XioConnection *xcon);
+
+  uint32_t get_special_handling() { return special_handling; }
+  void set_special_handling(int n) { special_handling = n; }
+  int pool_hint(uint32_t size);
+  void try_insert(XioConnection *xcon);
+
+  /* xio hooks */
+  int new_session(struct xio_session *session,
+		  struct xio_new_session_req *req,
+		  void *cb_user_context);
+
+  int session_event(struct xio_session *session,
+		    struct xio_session_event_data *event_data,
+		    void *cb_user_context);
+
+  /* Messenger interface */
+  /* Nothing is changed here, so report false.  The body used to be empty,
+   * which is undefined behaviour for a function returning bool. */
+  virtual bool set_addr_unknowns(const entity_addrvec_t &addr) override
+    { return false; } /* XXX applicable? */
+  virtual void set_addr(const entity_addr_t &addr) override
+    { } /* XXX applicable? */
+
+  virtual int get_dispatch_queue_len()
+    { return 0; } /* XXX bogus? */
+
+  virtual double get_dispatch_queue_max_age(utime_t now)
+    { return 0; } /* XXX bogus? */
+
+  virtual void set_cluster_protocol(int p)
+    { }
+
+  virtual int bind(const entity_addr_t& addr);
+
+  virtual int rebind(const set<int>& avoid_ports);
+
+  virtual int start();
+
+  virtual void wait();
+
+  virtual int shutdown();
+
+  virtual int send_message(Message *m, const entity_inst_t &dest) {
+    return _send_message(m, dest);
+  }
+
+  /* lazy sends are not supported: both overloads return EINVAL */
+  virtual int lazy_send_message(Message *m, const entity_inst_t& dest)
+    { return EINVAL; }
+
+  virtual int lazy_send_message(Message *m, Connection *con)
+    { return EINVAL; }
+
+  virtual ConnectionRef get_connection(const entity_inst_t& dest);
+
+  // compat hack
+  ConnectionRef connect_to(
+    int type, const entity_addrvec_t& dest) override {
+    return get_connection(entity_inst_t(entity_name_t(type, -1),
+					dest.legacy_addr()));
+  }
+
+  virtual ConnectionRef get_loopback_connection();
+
+  void unregister_xcon(XioConnection *xcon);
+  virtual void mark_down(const entity_addr_t& a);
+  virtual void mark_down(Connection *con);
+  virtual void mark_down_all();
+  virtual void mark_down_on_empty(Connection *con);
+  virtual void mark_disposable(Connection *con);
+
+  void ds_dispatch(Message *m)
+    { dispatch_strategy->ds_dispatch(m); }
+
+  /**
+   * Tell the XioMessenger its full IP address.
+   *
+   * This is used by clients when connecting to other endpoints, and
+   * probably shouldn't be called by anybody else.
+   */
+  void learned_addr(const entity_addr_t& peer_addr_for_me);
+
+private:
+  int get_nconns_per_portal(uint64_t cflags);
+  int get_nportals(uint64_t cflags);
+
+protected:
+  virtual void ready()
+    { }
+};
+
+XioCommand* pool_alloc_xio_command(XioConnection *xcon);
+
+
+#endif /* XIO_MESSENGER_H */
diff --git a/src/msg/xio/XioMsg.cc b/src/msg/xio/XioMsg.cc
new file mode 100644
index 00000000..4b6a5d68
--- /dev/null
+++ b/src/msg/xio/XioMsg.cc
@@ -0,0 +1,51 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Portions Copyright (C) 2013 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "XioMessenger.h"
+#include "XioConnection.h"
+#include "XioMsg.h"
+
+
+/*
+ * Queue this hook's received xio messages for release: build an
+ * XioCompletion in memory from rsp_pool and enqueue it on the connection's
+ * portal.
+ *
+ * Returns msg_seq.size() as sampled on entry; asserts it is non-zero, but
+ * only after the completion has already been enqueued.
+ * NOTE(review): the result of rsp_pool.alloc() is used without a null
+ * check.
+ */
+int XioDispatchHook::release_msgs()
+{
+  XioCompletion *xcmp;
+  int r = msg_seq.size();
+  cl_flag = true;
+
+  /* queue for release */
+  xcmp = static_cast<XioCompletion *>(rsp_pool.alloc(sizeof(XioCompletion)));
+  new (xcmp) XioCompletion(xcon, this);
+  xcmp->trace = m->trace;
+
+  /* merge with portal traffic */
+  xcon->portal->enqueue(xcon, xcmp);
+
+  ceph_assert(r);
+  return r;
+}
+
+/*
+ * Length in bytes of an encoded XioMsgHdr, measured by encoding a
+ * throwaway header (features 0) and reading the size of the single buffer
+ * it produces.  The header/footer contents are left unset; only the
+ * resulting length is used.
+ */
+/*static*/ size_t XioMsgHdr::get_max_encoded_length() {
+  ceph_msg_header scratch_header;
+  ceph_msg_footer scratch_footer;
+  XioMsgHdr probe (scratch_header, scratch_footer, 0 /* features */);
+
+  const std::list<buffer::ptr>& encoded = probe.get_bl().buffers();
+  /* accelio header is small without scatter gather */
+  ceph_assert(encoded.size() == 1);
+  return encoded.front().length();
+}
+
+/* Debug helper: print this message's xio header, then the Ceph message. */
+void XioMsg::print_debug(CephContext *cct, const char *tag) const {
+  print_xio_msg_hdr(cct, tag, hdr, get_xio_msg());
+  print_ceph_msg(cct, tag, m);
+}
diff --git a/src/msg/xio/XioMsg.h b/src/msg/xio/XioMsg.h
new file mode 100644
index 00000000..2f0c8490
--- /dev/null
+++ b/src/msg/xio/XioMsg.h
@@ -0,0 +1,446 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Portions Copyright (C) 2013 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef XIO_MSG_H
+#define XIO_MSG_H
+
+#include <boost/intrusive/list.hpp>
+#include "msg/SimplePolicyMessenger.h"
+extern "C" {
+#include "libxio.h"
+}
+#include "XioConnection.h"
+#include "XioSubmit.h"
+#include "msg/msg_types.h"
+#include "XioPool.h"
+
+namespace bi = boost::intrusive;
+
+class XioMessenger;
+
+/*
+ * Pulls only the message count out of a header buffer: the buffer is
+ * appended to bl and msg_cnt is decoded from the start of bl.
+ */
+class XioMsgCnt
+{
+public:
+  ceph_le32 msg_cnt;
+  buffer::list bl;
+
+  explicit XioMsgCnt(buffer::ptr p)
+    {
+      bl.append(p);
+      buffer::list::iterator cursor = bl.begin();
+      decode(msg_cnt, cursor);
+    }
+};
+
+/*
+ * Header carried with each xio message: a tag, message count, peer type
+ * and address, followed by selected fields of a ceph_msg_header and a
+ * ceph_msg_footer.
+ *
+ * hdr and ftr are non-owning pointers to structs supplied by the caller
+ * (both constructors store the address of the reference they are given),
+ * so those structs must outlive this object.
+ *
+ * encode_hdr()/decode_hdr() and encode_ftr()/decode_ftr() must list the
+ * same fields in the same order.
+ */
+class XioMsgHdr
+{
+public:
+  char tag;
+  ceph_le32 msg_cnt;
+  ceph_le32 peer_type;
+  entity_addr_t addr; /* XXX hack! */
+  ceph_msg_header* hdr;
+  ceph_msg_footer* ftr;
+  uint64_t features;
+  buffer::list bl;
+public:
+  /* Sending side: tag is CEPH_MSGR_TAG_MSG and msg_cnt starts at 0;
+   * peer_type and addr are left for the caller to fill in. */
+  XioMsgHdr(ceph_msg_header& _hdr, ceph_msg_footer& _ftr, uint64_t _features)
+    : tag(CEPH_MSGR_TAG_MSG), msg_cnt(init_le32(0)), hdr(&_hdr), ftr(&_ftr),
+      features(_features)
+    { }
+
+  /* Receiving side: appends p to bl and decodes every field from it into
+   * this object and into *hdr / *ftr.  Note that features is not set by
+   * this constructor. */
+  XioMsgHdr(ceph_msg_header& _hdr, ceph_msg_footer &_ftr, buffer::ptr p)
+    : hdr(&_hdr), ftr(&_ftr)
+    {
+      bl.append(p);
+      buffer::list::iterator bl_iter = bl.begin();
+      decode(bl_iter);
+    }
+
+  static size_t get_max_encoded_length();
+
+  /* Encodes into the member bl and returns it.
+   * NOTE(review): nothing here clears bl first, so a second call
+   * presumably appends a second copy of the encoding -- confirm. */
+  const buffer::list& get_bl() { encode(bl); return bl; };
+
+  /* Encode tag, counts, address and the ceph_msg_header fields.  Only
+   * addr is encoded with the feature bits. */
+  inline void encode_hdr(ceph::buffer::list& bl) const {
+    using ceph::encode;
+    encode(tag, bl);
+    encode(msg_cnt, bl);
+    encode(peer_type, bl);
+    encode(addr, bl, features);
+    encode(hdr->seq, bl);
+    encode(hdr->tid, bl);
+    encode(hdr->type, bl);
+    encode(hdr->priority, bl);
+    encode(hdr->version, bl);
+    encode(hdr->front_len, bl);
+    encode(hdr->middle_len, bl);
+    encode(hdr->data_len, bl);
+    encode(hdr->data_off, bl);
+    encode(hdr->src.type, bl);
+    encode(hdr->src.num, bl);
+    encode(hdr->compat_version, bl);
+    encode(hdr->crc, bl);
+  }
+
+  /* Encode the ceph_msg_footer fields. */
+  inline void encode_ftr(buffer::list& bl) const {
+    using ceph::encode;
+    encode(ftr->front_crc, bl);
+    encode(ftr->middle_crc, bl);
+    encode(ftr->data_crc, bl);
+    encode(ftr->sig, bl);
+    encode(ftr->flags, bl);
+  }
+
+  /* Full encoding: header fields followed by footer fields. */
+  inline void encode(buffer::list& bl) const {
+    encode_hdr(bl);
+    encode_ftr(bl);
+  }
+
+  /* Mirror of encode_hdr(); writes through the hdr pointer. */
+  inline void decode_hdr(buffer::list::iterator& bl) {
+    using ceph::decode;
+    decode(tag, bl);
+    decode(msg_cnt, bl);
+    decode(peer_type, bl);
+    decode(addr, bl);
+    decode(hdr->seq, bl);
+    decode(hdr->tid, bl);
+    decode(hdr->type, bl);
+    decode(hdr->priority, bl);
+    decode(hdr->version, bl);
+    decode(hdr->front_len, bl);
+    decode(hdr->middle_len, bl);
+    decode(hdr->data_len, bl);
+    decode(hdr->data_off, bl);
+    decode(hdr->src.type, bl);
+    decode(hdr->src.num, bl);
+    decode(hdr->compat_version, bl);
+    decode(hdr->crc, bl);
+  }
+
+  /* Mirror of encode_ftr(); writes through the ftr pointer. */
+  inline void decode_ftr(buffer::list::iterator& bl) {
+    using ceph::decode;
+    decode(ftr->front_crc, bl);
+    decode(ftr->middle_crc, bl);
+    decode(ftr->data_crc, bl);
+    decode(ftr->sig, bl);
+    decode(ftr->flags, bl);
+  }
+
+  /* Full decoding: header fields followed by footer fields. */
+  inline void decode(buffer::list::iterator& bl) {
+    decode_hdr(bl);
+    decode_ftr(bl);
+  }
+
+  virtual ~XioMsgHdr()
+    {}
+};
+
+WRITE_CLASS_ENCODER(XioMsgHdr);
+
+extern struct xio_mempool *xio_msgr_noreg_mpool;
+
+#define XIO_MSGR_IOVLEN 16
+
+/*
+ * An xio_msg bundled with the scatter/gather array it refers to.
+ *
+ * The constructor sets both the "in" and the "out" side to an empty
+ * IOV_PTR scatter list of capacity XIO_MSGR_IOVLEN; both sides point at
+ * the same iovs array.  The message is set up as a one-way message with
+ * no flags, no request and no successor, and user_context is stored
+ * verbatim.
+ */
+struct xio_msg_ex
+{
+  struct xio_msg msg;
+  struct xio_iovec_ex iovs[XIO_MSGR_IOVLEN];
+
+  explicit xio_msg_ex(void* user_context) {
+    // "in" side: empty header, empty scatter list backed by iovs
+    msg.in.header.iov_len = 0;
+    msg.in.header.iov_base = NULL;  /* XXX Accelio requires this currently */
+
+    msg.in.sgl_type = XIO_SGL_TYPE_IOV_PTR;
+    msg.in.pdata_iov.max_nents = XIO_MSGR_IOVLEN;
+    msg.in.pdata_iov.nents = 0;
+    msg.in.pdata_iov.sglist = iovs;
+
+    // "out" side: empty header
+    msg.out.header.iov_len = 0;
+    msg.out.header.iov_base = NULL;  /* XXX Accelio requires this currently,
+				      * against spec */
+    // "out" scatter list (some members adjusted later)
+    msg.out.sgl_type = XIO_SGL_TYPE_IOV_PTR;
+    msg.out.pdata_iov.max_nents = XIO_MSGR_IOVLEN;
+    msg.out.pdata_iov.nents = 0;
+    msg.out.pdata_iov.sglist = iovs;
+
+    // message-level fields: a one-way message with no chained successor
+    msg.request = NULL;
+    msg.type = XIO_MSG_TYPE_ONE_WAY;
+    // for now, we DO NEED receipts for every msg
+    msg.flags = 0;
+    msg.user_context = user_context;
+    msg.next = NULL;
+  }
+};
+
+/*
+ * Base class for outgoing submissions (XioSubmit::OUTGOING_MSG).
+ *
+ * Holds the first xio_msg (req_0), a copy of the xio_reg_mem descriptor of
+ * the memory the object was constructed in (mp_this), and a reference
+ * count that starts at _ex_cnt+1.  A reference on the connection is taken
+ * at construction and dropped at destruction.
+ *
+ * Objects are destroyed by put(): when the count reaches zero the
+ * destructor is run explicitly and the memory is returned via
+ * xpool_free().
+ */
+class XioSend : public XioSubmit
+{
+public:
+  virtual void print_debug(CephContext *cct, const char *tag) const {};
+  const struct xio_msg * get_xio_msg() const {return &req_0.msg;}
+  struct xio_msg * get_xio_msg() {return &req_0.msg;}
+  /* number of xio_msg this submission accounts for; 1 unless overridden */
+  virtual size_t get_msg_count() const {return 1;}
+
+  XioSend(XioConnection *_xcon, struct xio_reg_mem& _mp, int _ex_cnt=0) :
+    XioSubmit(XioSubmit::OUTGOING_MSG, _xcon),
+    req_0(this), mp_this(_mp), nrefs(_ex_cnt+1)
+  {
+    xpool_inc_msgcnt();
+    xcon->get();
+  }
+
+  XioSend* get() { nrefs++; return this; };
+
+  /* Drop n references; destroys and frees the object on reaching zero. */
+  void put(int n) {
+    int refs = nrefs -= n;
+    if (refs == 0) {
+      /* Copy the descriptor out first: mp_this lives inside the object
+       * that is about to be destroyed and whose storage is then released,
+       * so it must not be read through a pointer into that storage. */
+      struct xio_reg_mem mp = this->mp_this;
+      this->~XioSend();
+      xpool_free(sizeof(XioSend), &mp);
+    }
+  }
+
+  void put() {
+    put(1);
+  }
+
+  /* Drop one reference per xio_msg accounted by this submission. */
+  void put_msg_refs() {
+    put(get_msg_count());
+  }
+
+  virtual ~XioSend() {
+    xpool_dec_msgcnt();
+    xcon->put();
+  }
+
+private:
+  xio_msg_ex req_0;
+  struct xio_reg_mem mp_this;
+  std::atomic<unsigned> nrefs = { 0 };
+};
+
+/*
+ * An XioSend that additionally owns a buffer::list, exposed to callers by
+ * reference through get_bl_ref().
+ */
+class XioCommand : public XioSend
+{
+public:
+  XioCommand(XioConnection *_xcon, struct xio_reg_mem& _mp)
+    : XioSend(_xcon, _mp)
+    { }
+
+  buffer::list& get_bl_ref()
+    {
+      return bl;
+    }
+
+private:
+  buffer::list bl;
+};
+
+/*
+ * An outgoing ceph Message together with its XioMsgHdr.
+ *
+ * The first xio_msg lives in the XioSend base; when _ex_cnt > 0 an array
+ * of _ex_cnt further xio_msg_ex is allocated in req_arr.  hdr.msg_cnt is
+ * set to _ex_cnt+1 and is what get_msg_count() reports.
+ *
+ * The destructor releases req_arr and drops the Message reference, except
+ * in the MSG_MAGIC_REDUPE test path where a still-connected connection
+ * gets the message re-sent instead.
+ */
+struct XioMsg : public XioSend
+{
+public:
+  Message* m;
+  XioMsgHdr hdr;
+  xio_msg_ex* req_arr;
+
+public:
+  XioMsg(Message *_m, XioConnection *_xcon, struct xio_reg_mem& _mp,
+	 int _ex_cnt, uint64_t _features) :
+    XioSend(_xcon, _mp, _ex_cnt),
+    m(_m), hdr(m->get_header(), m->get_footer(), _features),
+    req_arr(NULL)
+    {
+      /* stamp the header with this messenger's identity and address */
+      const entity_inst_t &inst = xcon->get_messenger()->get_myinst();
+      hdr.peer_type = inst.name.type();
+      hdr.addr = xcon->get_messenger()->get_myaddr_legacy();
+      hdr.hdr->src.type = inst.name.type();
+      hdr.hdr->src.num = inst.name.num();
+      hdr.msg_cnt = _ex_cnt+1;
+
+      if (unlikely(_ex_cnt > 0)) {
+	alloc_trailers(_ex_cnt);
+      }
+    }
+
+  void print_debug(CephContext *cct, const char *tag) const override;
+  size_t get_msg_count() const override {
+    return hdr.msg_cnt;
+  }
+
+  /* Allocate and construct cnt additional xio_msg_ex, each with this
+   * object as its user_context. */
+  void alloc_trailers(int cnt) {
+    req_arr = static_cast<xio_msg_ex*>(malloc(cnt * sizeof(xio_msg_ex)));
+    /* fail loudly instead of placement-constructing into a null pointer;
+     * same policy as the malloc fallback in xpool_alloc() */
+    ceph_assert(req_arr);
+    for (int ix = 0; ix < cnt; ++ix) {
+      xio_msg_ex* xreq = &(req_arr[ix]);
+      new (xreq) xio_msg_ex(this);
+    }
+  }
+
+  Message *get_message() { return m; }
+
+  ~XioMsg()
+    {
+      if (unlikely(!!req_arr)) {
+	/* req_arr holds msg_cnt-1 entries (see the constructor) */
+	for (unsigned int ix = 0; ix < get_msg_count()-1; ++ix) {
+	  xio_msg_ex* xreq = &(req_arr[ix]);
+	  xreq->~xio_msg_ex();
+	}
+	free(req_arr);
+      }
+
+      /* testing only! server's ready, resubmit request (not reached on
+       * PASSIVE/server side) */
+      if (unlikely(m->get_magic() & MSG_MAGIC_REDUPE)) {
+	if (likely(xcon->is_connected())) {
+	  xcon->send_message(m);
+	} else {
+	  /* dispose it */
+	  m->put();
+	}
+      } else {
+	  /* the normal case: done with message */
+	  m->put();
+      }
+    }
+};
+
+/*
+ * Completion hook attached to a received Message.
+ *
+ * Keeps the connection (one reference, taken in the constructor), the
+ * sequence of xio_msg the Message was built from, and a per-message
+ * XioPool.  The reference count starts at 1.  When it reaches zero for the
+ * first time put() calls release_msgs(), which queues the xio_msg for
+ * release; the object is only destroyed when the count reaches zero again
+ * afterwards (cl_flag records that the first stage has run).
+ */
+class XioDispatchHook : public Message::CompletionHook
+{
+private:
+  XioConnection *xcon;
+  XioInSeq msg_seq;
+  XioPool rsp_pool;
+  std::atomic<unsigned> nrefs { 1 };
+  bool cl_flag;
+  friend class XioConnection;
+  friend class XioMessenger;
+public:
+  struct xio_reg_mem mp_this;
+
+  XioDispatchHook(XioConnection *_xcon, Message *_m, XioInSeq& _msg_seq,
+		    struct xio_reg_mem& _mp) :
+    CompletionHook(_m),
+    xcon(_xcon->get()),
+    msg_seq(_msg_seq),
+    rsp_pool(xio_msgr_noreg_mpool),
+    cl_flag(false),
+    mp_this(_mp)
+    {
+      ++xcon->n_reqs; // atomicity by portal thread
+      xpool_inc_hookcnt();
+    }
+
+  virtual void finish(int r) {
+    this->put();
+  }
+
+  virtual void complete(int r) {
+    finish(r);
+  }
+
+  int release_msgs();
+
+  XioDispatchHook* get() {
+    nrefs++; return this;
+  }
+
+  /* Drop n references; see the class comment for the two-stage teardown. */
+  void put(int n = 1) {
+    int refs = nrefs -= n;
+    if (refs == 0) {
+      /* in Marcus' new system, refs reaches 0 twice:  once in
+       * Message lifecycle, and again after xio_release_msg.
+       */
+      if (!cl_flag && release_msgs())
+	return;
+      /* Copy the descriptor out first: mp_this lives inside the object
+       * that is about to be destroyed and whose storage is then released,
+       * so it must not be read through a pointer into that storage. */
+      struct xio_reg_mem mp = this->mp_this;
+      this->~XioDispatchHook();
+      xpool_free(sizeof(XioDispatchHook), &mp);
+    }
+  }
+
+  XioInSeq& get_seq() { return msg_seq; }
+
+  XioPool& get_pool() { return rsp_pool; }
+
+  void on_err_finalize(XioConnection *xcon) {
+    /* can't decode message; even with one-way must free
+     * xio_msg structures, and then xiopool
+     */
+    this->finish(-1);
+  }
+
+  ~XioDispatchHook() {
+    --xcon->n_reqs; // atomicity by portal thread
+    xpool_dec_hookcnt();
+    xcon->put();
+  }
+};
+
+/* A sender-side CompletionHook that relies on the on_msg_delivered
+ * to complete a pending mark down. */
+class XioMarkDownHook : public Message::CompletionHook
+{
+private:
+  XioConnection* xcon;
+
+public:
+  /* descriptor of the mempool memory this hook was constructed in */
+  struct xio_reg_mem mp_this;
+
+  /* Takes a reference on the connection for the lifetime of the hook. */
+  XioMarkDownHook(
+    XioConnection* _xcon, Message *_m, struct xio_reg_mem& _mp) :
+    CompletionHook(_m), xcon(_xcon->get()), mp_this(_mp)
+    { }
+
+  virtual void claim(int r) {}
+
+  /* Drop the connection reference, destroy the hook and return its memory
+   * to the xio mempool. */
+  virtual void finish(int r) {
+    xcon->put();
+    /* Copy the descriptor out first: mp_this lives inside the object that
+     * is about to be destroyed and whose storage is then released, so it
+     * must not be read through a pointer into that storage. */
+    struct xio_reg_mem mp = this->mp_this;
+    this->~XioMarkDownHook();
+    xio_mempool_free(&mp);
+  }
+
+  /* Mark the connection down, then tear the hook down via finish(). */
+  virtual void complete(int r) {
+    xcon->_mark_down(XioConnection::CState::OP_FLAG_NONE);
+    finish(r);
+  }
+};
+
+/*
+ * Submission of type INCOMING_MSG_RELEASE: carries a dispatch hook whose
+ * xio_msg are to be released.  Construction takes one reference on the
+ * hook and one on the connection; finalize() drops both.
+ */
+struct XioCompletion : public XioSubmit
+{
+  XioDispatchHook *xhook;
+public:
+  XioCompletion(XioConnection *_xcon, XioDispatchHook *_xhook)
+    : XioSubmit(XioSubmit::INCOMING_MSG_RELEASE, _xcon /* not xcon! */),
+      xhook(_xhook->get())
+    {
+      // submit queue ref
+      xcon->get();
+    }
+
+  /* next xio_msg from the hook's sequence */
+  struct xio_msg* dequeue()
+    {
+      return xhook->get_seq().dequeue();
+    }
+
+  XioDispatchHook* get_xhook()
+    {
+      return xhook;
+    }
+
+  /* release the connection reference first, then the hook reference */
+  void finalize()
+    {
+      xcon->put();
+      xhook->put();
+    }
+};
+
+void print_xio_msg_hdr(CephContext *cct, const char *tag,
+		       const XioMsgHdr &hdr, const struct xio_msg *msg);
+void print_ceph_msg(CephContext *cct, const char *tag, Message *m);
+
+#endif /* XIO_MSG_H */
diff --git a/src/msg/xio/XioPool.cc b/src/msg/xio/XioPool.cc
new file mode 100644
index 00000000..5f0d77a2
--- /dev/null
+++ b/src/msg/xio/XioPool.cc
@@ -0,0 +1,41 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include <iostream>
+#include "XioPool.h"
+
+XioPoolStats xp_stats;
+
+bool XioPool::trace_mempool = 0;
+bool XioPool::trace_msgcnt = 0;
+
+/*
+ * Print the current counters to std::cout as two lines, each prefixed
+ * with "<tag> #<serial>: ": first the per-slab pool object counts, then
+ * the in (hook) and out (msg) object counts.
+ *
+ * The counters are std::atomic<unsigned>, which has no read() member;
+ * they are sampled with load().
+ */
+void XioPoolStats::dump(const char* tag, uint64_t serial)
+{
+  std::cout
+    << tag << " #" << serial << ": "
+    << "pool objs: "
+    << "64: " << ctr_set[SLAB_64].load() << " "
+    << "256: " << ctr_set[SLAB_256].load() << " "
+    << "1024: " << ctr_set[SLAB_1024].load() << " "
+    << "page: " << ctr_set[SLAB_PAGE].load() << " "
+    << "max: " << ctr_set[SLAB_MAX].load() << " "
+    << "overflow: " << ctr_set[SLAB_OVERFLOW].load() << " "
+    << std::endl;
+  std::cout
+    << tag << " #" << serial << ": "
+    << " msg objs: "
+    << "in: " << hook_cnt.load() << " "
+    << "out: " << msg_cnt.load() << " "
+    << std::endl;
+}
diff --git a/src/msg/xio/XioPool.h b/src/msg/xio/XioPool.h
new file mode 100644
index 00000000..07fa7311
--- /dev/null
+++ b/src/msg/xio/XioPool.h
@@ -0,0 +1,218 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#ifndef XIO_POOL_H
+#define XIO_POOL_H
+
+#include <atomic>
+#include <vector>
+#include <cstdlib>
+#include <cstring>
+#include <cstdint>
+
+extern "C" {
+#include "libxio.h"
+}
+
+#include "common/likely.h"
+
+static inline int xpool_alloc(struct xio_mempool *pool, uint64_t size,
+			      struct xio_reg_mem* mp);
+static inline void xpool_free(uint64_t size, struct xio_reg_mem* mp);
+
+/*
+ * A small allocator on top of an xio_mempool handle.  Every allocation is
+ * prefixed with an xio_piece header and linked into a singly linked list
+ * headed by first; the destructor walks that list and frees every piece.
+ * There is no per-allocation free.
+ */
+class XioPool
+{
+private:
+  struct xio_mempool *handle;
+
+public:
+  static bool trace_mempool;
+  static bool trace_msgcnt;
+  static const int MB = 8;
+
+  /* header placed in front of each allocation; payload is declared with
+   * MB bytes but the piece is allocated with room for s bytes */
+  struct xio_piece {
+    struct xio_reg_mem mp[1];
+    struct xio_piece *next;
+    int s;
+    char payload[MB];
+  } *first;
+
+  explicit XioPool(struct xio_mempool *_handle) :
+    handle(_handle), first(0)
+    {
+    }
+
+  /* Free every piece handed out by alloc(). */
+  ~XioPool()
+    {
+      for (struct xio_piece *piece = first; piece; piece = first) {
+        first = piece->next;
+        if (unlikely(trace_mempool)) {
+          memset(piece->payload, 0xcf, piece->s); // guard bytes
+        }
+        xpool_free(sizeof(struct xio_piece)+(piece->s)-MB, piece->mp);
+      }
+    }
+
+  /* Allocate _s payload bytes; returns the payload address, or a null
+   * pointer when xpool_alloc() reports an error. */
+  void *alloc(size_t _s)
+    {
+      struct xio_reg_mem mem[1];
+      if (xpool_alloc(handle, (sizeof(struct xio_piece)-MB) + _s, mem))
+        return 0;
+
+      /* the piece header lives at the start of the memory just obtained */
+      struct xio_piece *piece =
+        reinterpret_cast<struct xio_piece *>(mem->addr);
+      *piece->mp = *mem;
+      piece->next = first;
+      piece->s = _s;
+      first = piece;
+      return piece->payload;
+    }
+};
+
+/*
+ * Counters for pool objects, bucketed by allocation size, plus counts of
+ * outgoing (msg) and incoming (hook) message objects.  The message and
+ * hook counters only move while XioPool::trace_msgcnt is set.
+ */
+class XioPoolStats {
+private:
+  enum pool_sizes {
+    SLAB_64 = 0,
+    SLAB_256,
+    SLAB_1024,
+    SLAB_PAGE,
+    SLAB_MAX,
+    SLAB_OVERFLOW,
+    NUM_SLABS,
+  };
+
+  std::atomic<unsigned> ctr_set[NUM_SLABS] = {};
+  std::atomic<unsigned> msg_cnt = { 0 };  // send msgs
+  std::atomic<unsigned> hook_cnt = { 0 }; // recv msgs
+
+  /* bucket an allocation size: <=64, <=256, <=1024, <=8192, else MAX */
+  static pool_sizes slab_for(uint64_t size) {
+    if (size <= 64)
+      return SLAB_64;
+    if (size <= 256)
+      return SLAB_256;
+    if (size <= 1024)
+      return SLAB_1024;
+    if (size <= 8192)
+      return SLAB_PAGE;
+    return SLAB_MAX;
+  }
+
+public:
+  void dump(const char* tag, uint64_t serial);
+
+  /* count one more object in the bucket matching size */
+  void inc(uint64_t size) {
+    (ctr_set[slab_for(size)])++;
+  }
+
+  /* count one object fewer in the bucket matching size */
+  void dec(uint64_t size) {
+    (ctr_set[slab_for(size)])--;
+  }
+
+  void inc_overflow() { ctr_set[SLAB_OVERFLOW]++; }
+  void dec_overflow() { ctr_set[SLAB_OVERFLOW]--; }
+
+  void inc_msgcnt() {
+    if (unlikely(XioPool::trace_msgcnt))
+      msg_cnt++;
+  }
+
+  void dec_msgcnt() {
+    if (unlikely(XioPool::trace_msgcnt))
+      msg_cnt--;
+  }
+
+  void inc_hookcnt() {
+    if (unlikely(XioPool::trace_msgcnt))
+      hook_cnt++;
+  }
+
+  void dec_hookcnt() {
+    if (unlikely(XioPool::trace_msgcnt))
+      hook_cnt--;
+  }
+};
+
+extern XioPoolStats xp_stats;
+
+/*
+ * Allocate size bytes, preferring the xio mempool and falling back to
+ * malloc() when the mempool reports an error.
+ *
+ * On return *mp describes the memory.  A malloc fallback is marked by
+ * mp->length == 0, which is how xpool_free() tells the two apart.
+ * Always returns 0; a failed malloc trips ceph_assert.
+ *
+ * With XioPool::trace_mempool set, the allocation is recorded in xp_stats
+ * through XioPoolStats::inc() / inc_overflow() (XioPoolStats defines no
+ * operator+=).
+ */
+static inline int xpool_alloc(struct xio_mempool *pool, uint64_t size,
+			      struct xio_reg_mem* mp)
+{
+  // try to allocate from the xio pool
+  int r = xio_mempool_alloc(pool, size, mp);
+  if (r == 0) {
+    if (unlikely(XioPool::trace_mempool))
+      xp_stats.inc(size);
+    return 0;
+  }
+  // fall back to malloc on errors
+  mp->addr = malloc(size);
+  ceph_assert(mp->addr);
+  mp->length = 0;
+  if (unlikely(XioPool::trace_mempool))
+    xp_stats.inc_overflow();
+  return 0;
+}
+
+/*
+ * Release memory obtained from xpool_alloc().
+ *
+ * A non-zero mp->length means the memory came from the xio mempool and is
+ * returned there; a zero length marks the malloc fallback and the memory
+ * is released with free().
+ *
+ * With XioPool::trace_mempool set, the release is recorded in xp_stats
+ * through XioPoolStats::dec() / dec_overflow() (XioPoolStats defines no
+ * operator-=).  size must be the size that was passed to xpool_alloc() for
+ * the slab accounting to balance.
+ */
+static inline void xpool_free(uint64_t size, struct xio_reg_mem* mp)
+{
+  if (mp->length) {
+    if (unlikely(XioPool::trace_mempool))
+      xp_stats.dec(size);
+    xio_mempool_free(mp);
+  } else { // from malloc
+    if (unlikely(XioPool::trace_mempool))
+      xp_stats.dec_overflow();
+    free(mp->addr);
+  }
+}
+
+#define xpool_inc_msgcnt() \
+  do { xp_stats.inc_msgcnt(); } while (0)
+
+#define xpool_dec_msgcnt() \
+  do { xp_stats.dec_msgcnt(); } while (0)
+
+#define xpool_inc_hookcnt() \
+  do { xp_stats.inc_hookcnt(); } while (0)
+
+#define xpool_dec_hookcnt() \
+  do { xp_stats.dec_hookcnt(); } while (0)
+
+#endif /* XIO_POOL_H */
diff --git a/src/msg/xio/XioPortal.cc b/src/msg/xio/XioPortal.cc
new file mode 100644
index 00000000..e2379fb3
--- /dev/null
+++ b/src/msg/xio/XioPortal.cc
@@ -0,0 +1,98 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#include "XioPortal.h"
+#include <stdio.h>
+
+#define dout_subsys ceph_subsys_xio
+
+/*
+ * Bind this portal to "<base_uri>:<port>".
+ *
+ * On success stores the server handle, rewrites xio_uri with the port
+ * reported by xio_bind() when that differs from the requested one, points
+ * portal_id at the final URI string, writes the port to *assigned_port
+ * (when non-null) and returns 0.  On failure returns xio_errno().
+ */
+int XioPortal::bind(struct xio_session_ops *ops, const string &base_uri,
+		    uint16_t port, uint16_t *assigned_port)
+{
+  // format uri
+  xio_uri = base_uri + ":" + std::to_string(port);
+
+  uint16_t assigned;
+  server = xio_bind(ctx, ops, xio_uri.c_str(), &assigned, 0, msgr);
+  if (server == NULL)
+    return xio_errno();
+
+  // update uri if port changed
+  if (port != assigned)
+    xio_uri = base_uri + ":" + std::to_string(assigned);
+
+  // taken only after the last assignment to xio_uri, so that it refers
+  // to the final string storage
+  portal_id = const_cast<char*>(xio_uri.c_str());
+  if (assigned_port)
+    *assigned_port = assigned;
+  ldout(msgr->cct,20) << "xio_bind: portal " << xio_uri
+    << " returned server " << server << dendl;
+  return 0;
+}
+
+/*
+ * Bind every portal.
+ *
+ * With a non-zero port the first portal binds to exactly that port; all
+ * later portals (and all portals when port is 0) take the next free port
+ * from the configured ms_bind_port_min..ms_bind_port_max range, each
+ * portal continuing where the previous one stopped.  The port obtained by
+ * portal 0 is written to *port0 when port0 is non-null.
+ *
+ * Returns 0 on success, EINVAL when there are no portals, otherwise the
+ * negated error reported by the failing XioPortal::bind().
+ */
+int XioPortals::bind(struct xio_session_ops *ops, const string& base_uri,
+		     uint16_t port, uint16_t *port0)
+{
+  /* a server needs at least 1 portal */
+  if (n < 1)
+    return EINVAL;
+  Messenger *msgr = portals[0]->msgr;
+  portals.resize(n);
+
+  /* The scan cursor is wider than uint16_t on purpose: with a 16-bit
+   * cursor and port_max == 65535 the increment wraps to 0, the loop
+   * condition never turns false and the scan never terminates. */
+  unsigned next_port =
+    static_cast<uint16_t>(msgr->cct->_conf->ms_bind_port_min);
+  const uint16_t port_max = msgr->cct->_conf->ms_bind_port_max;
+
+  /* bind the portals */
+  for (size_t i = 0; i < portals.size(); i++) {
+    uint16_t result_port;
+    if (port != 0) {
+      // bind directly to the given port
+      int r = portals[i]->bind(ops, base_uri, port, &result_port);
+      if (r != 0)
+        return -r;
+    } else {
+      int r = EADDRINUSE;
+      // try ports within the configured range
+      for (; next_port <= port_max; next_port++) {
+        r = portals[i]->bind(ops, base_uri,
+                             static_cast<uint16_t>(next_port), &result_port);
+        if (r == 0) {
+          // the next portal starts after the port just taken
+          next_port++;
+          break;
+        }
+      }
+      if (r != 0) {
+        lderr(msgr->cct) << "portal.bind unable to bind to " << base_uri
+            << " on any port in range " << msgr->cct->_conf->ms_bind_port_min
+            << "-" << port_max << ": " << xio_strerror(r) << dendl;
+        return -r;
+      }
+    }
+
+    ldout(msgr->cct,5) << "xp::bind: portal " << i << " bind OK: "
+      << portals[i]->xio_uri << dendl;
+
+    if (i == 0 && port0 != NULL)
+      *port0 = result_port;
+    port = 0; // use port 0 for all subsequent portals
+  }
+
+  return 0;
+}
diff --git a/src/msg/xio/XioPortal.h b/src/msg/xio/XioPortal.h
new file mode 100644
index 00000000..7a0afee4
--- /dev/null
+++ b/src/msg/xio/XioPortal.h
@@ -0,0 +1,458 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Portions Copyright (C) 2013 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef XIO_PORTAL_H
+#define XIO_PORTAL_H
+
+#include <string>
+
+extern "C" {
+#include "libxio.h"
+}
+#include "XioInSeq.h"
+#include <boost/lexical_cast.hpp>
+#include "msg/SimplePolicyMessenger.h"
+#include "XioConnection.h"
+#include "XioMsg.h"
+
+#include "include/spinlock.h"
+
+#include "include/ceph_assert.h"
+#include "common/dout.h"
+
+#ifndef CACHE_LINE_SIZE
+#define CACHE_LINE_SIZE 64 /* XXX arch-specific define */
+#endif
+#define CACHE_PAD(_n) char __pad ## _n [CACHE_LINE_SIZE]
+
+class XioPortal : public Thread
+{
+private:
+
+  /*
+   * Submission queue split into nlanes independently locked lanes.
+   * Producers pick a lane from the connection pointer; the consumer
+   * (deq) visits the lanes round-robin starting at ix.
+   */
+  struct SubmitQueue
+  {
+    const static int nlanes = 7;
+
+    struct Lane
+    {
+      uint32_t size;
+      XioSubmit::Queue q;
+      ceph::spinlock sp;
+      CACHE_PAD(0);
+    };
+
+    Lane qlane[nlanes];
+
+    int ix; /* atomicity by portal thread */
+
+    SubmitQueue() : ix(0)
+      {
+	for (int lane_ix = 0; lane_ix < nlanes; ++lane_ix) {
+	  qlane[lane_ix].size = 0;
+	}
+      }
+
+    /* lane selection: connection address / 16, modulo nlanes */
+    inline Lane* get_lane(XioConnection *xcon)
+      {
+	return &qlane[(((uint64_t) xcon) / 16) % nlanes];
+      }
+
+    /* append one submission to the connection's lane */
+    void enq(XioConnection *xcon, XioSubmit* xs)
+      {
+	Lane* lane = get_lane(xcon);
+	std::lock_guard<decltype(lane->sp)> lg(lane->sp);
+	lane->q.push_back(*xs);
+	++(lane->size);
+      }
+
+    /* append a whole queue to the connection's lane; requeue_q is
+     * spliced, so it is left empty */
+    void enq(XioConnection *xcon, XioSubmit::Queue& requeue_q)
+      {
+	int size = requeue_q.size();
+	Lane* lane = get_lane(xcon);
+	std::lock_guard<decltype(lane->sp)> lg(lane->sp);
+	XioSubmit::Queue::const_iterator i1 = lane->q.end();
+	lane->q.splice(i1, requeue_q);
+	lane->size += size;
+      }
+
+    /* Move the contents of the first non-empty lane, searching from ix,
+     * to the end of send_q.  At most one lane is drained per call and ix
+     * is left pointing at the lane after it. */
+    void deq(XioSubmit::Queue& send_q)
+      {
+	Lane* lane;
+	int cnt;
+
+	for (cnt = 0; cnt < nlanes; ++cnt, ++ix, ix = ix % nlanes) {
+	  /* pick the lane BEFORE locking: the guard must be built from
+	   * the lane being inspected, not from an unset pointer or the
+	   * lane of the previous iteration */
+	  lane = &qlane[ix];
+	  std::lock_guard<decltype(lane->sp)> lg(lane->sp);
+	  if (lane->size > 0) {
+	    XioSubmit::Queue::const_iterator i1 = send_q.end();
+	    send_q.splice(i1, lane->q);
+	    lane->size = 0;
+	    /* break skips the loop increment, so advance ix here */
+	    ++ix, ix = ix % nlanes;
+	    break;
+	  }
+	}
+      }
+
+  }; /* SubmitQueue */
+
+  Messenger *msgr;
+  struct xio_context *ctx;
+  struct xio_server *server;
+  SubmitQueue submit_q;
+  ceph::spinlock sp;
+  void *ev_loop;
+  string xio_uri;
+  char *portal_id;
+  bool _shutdown;
+  bool drained;
+  uint32_t magic;
+  uint32_t special_handling;
+
+  friend class XioPortals;
+  friend class XioMessenger;
+
+public:
+  /*
+   * Create the portal's xio context.  The context parameters are zeroed,
+   * then given this portal as user context and max_conns as the number of
+   * connections expected to share the context.  Asserts that the context
+   * was created.
+   */
+  explicit XioPortal(Messenger *_msgr, int max_conns) :
+    msgr(_msgr), ctx(NULL), server(NULL), submit_q(), xio_uri(""),
+    portal_id(NULL), _shutdown(false), drained(false),
+    magic(0),
+    special_handling(0)
+  {
+    struct xio_context_params params;
+    memset(&params, 0, sizeof(params));
+    params.user_context = this;
+    /*
+     * hint to Accelio the total number of connections that will share
+     * this context's resources: internal primary task pool...
+     */
+    params.max_conns_per_ctx = max_conns;
+
+    /* a portal is an xio_context and event loop */
+    ctx = xio_context_create(&params, 0 /* poll timeout */, -1 /* cpu hint */);
+    ceph_assert(ctx && "Whoops, failed to create portal/ctx");
+  }
+
+  int bind(struct xio_session_ops *ops, const string &base_uri,
+	   uint16_t port, uint16_t *assigned_port);
+
+  /*
+   * Release the xio_msg chain carried by a completion and finalize it.
+   *
+   * The chain is linked through user_context: the successor is read from
+   * msg->user_context before the message is handed to xio_release_msg().
+   * Release failures are reported via msg_release_fail().  If the
+   * connection has no conn, nothing is released and a single ENOTCONN
+   * failure is reported for the head of the chain instead.
+   * finalize() runs in every case.
+   */
+  inline void release_xio_msg(XioCompletion* xcmp) {
+    struct xio_msg *msg = xcmp->dequeue();
+    struct xio_msg *next_msg = NULL;
+    int code;
+    if (unlikely(!xcmp->xcon->conn)) {
+      // NOTE: msg is not safe to dereference if the connection was torn down
+      xcmp->xcon->msg_release_fail(msg, ENOTCONN);
+    }
+    else while (msg) {
+      // fetch the successor first; msg is released right after
+      next_msg = static_cast<struct xio_msg *>(msg->user_context);
+      code = xio_release_msg(msg);
+      if (unlikely(code)) /* very unlikely, so log it */
+	xcmp->xcon->msg_release_fail(msg, code);
+      msg = next_msg;
+    }
+    xcmp->trace.event("xio_release_msg");
+    xcmp->finalize(); /* unconditional finalize */
+  }
+
+  /*
+   * Queue a submission for the portal thread and stop the event loop so
+   * the thread picks it up.  Once shutdown has been requested nothing is
+   * queued any more: an outgoing message is failed with -EINVAL and any
+   * other submission is treated as an incoming-message release.
+   */
+  void enqueue(XioConnection *xcon, XioSubmit *xs)
+    {
+      if (_shutdown) {
+        /* dispose xs */
+        if (xs->type == XioSubmit::OUTGOING_MSG) {
+          /* it was an outgoing 1-way */
+          XioSend* failed = static_cast<XioSend*>(xs);
+          xs->xcon->msg_send_fail(failed, -EINVAL);
+        } else {
+          /* INCOMING_MSG_RELEASE */
+          release_xio_msg(static_cast<XioCompletion*>(xs));
+        }
+        return;
+      }
+
+      submit_q.enq(xcon, xs);
+      xio_context_stop_loop(ctx);
+    }
+
+  /* Put a whole queue of submissions back onto the connection's lane of
+   * the submit queue.  Unlike enqueue(), this neither checks _shutdown
+   * nor stops the event loop. */
+  void requeue(XioConnection* xcon, XioSubmit::Queue& send_q) {
+    submit_q.enq(xcon, send_q);
+  }
+
+  /*
+   * Pull every submission for xcon out of send_q, starting at q_iter, and
+   * park it on the connection's own requeue list.
+   *
+   * Entries for other connections are left in send_q.  The collected
+   * entries keep their relative order and are spliced in at the FRONT of
+   * xcon->outgoing.requeue under xcon->sp; the connection is then marked
+   * flow-controlled.  On return q_iter equals send_q.end().  Entries that
+   * precede q_iter are not examined.
+   */
+  void requeue_all_xcon(XioConnection* xcon,
+			XioSubmit::Queue::iterator& q_iter,
+			XioSubmit::Queue& send_q) {
+    // XXX gather all already-dequeued outgoing messages for xcon
+    // and push them in FIFO order to front of the input queue,
+    // and mark the connection as flow-controlled
+    XioSubmit::Queue requeue_q;
+
+    while (q_iter != send_q.end()) {
+      XioSubmit *xs = &(*q_iter);
+      // skip anything that belongs to another connection
+      if (xs->xcon != xcon) {
+	q_iter++;
+	continue;
+      }
+      q_iter = send_q.erase(q_iter);
+      requeue_q.push_back(*xs);
+    }
+    std::lock_guard<decltype(xcon->sp)> lg(xcon->sp);
+    XioSubmit::Queue::const_iterator i1 = xcon->outgoing.requeue.begin();
+    xcon->outgoing.requeue.splice(i1, requeue_q);
+    xcon->cstate.state_flow_controlled(XioConnection::CState::OP_FLAG_LOCKED);
+  }
+
+  /*
+   * Portal thread main loop.
+   *
+   * Each round: pull one lane's worth of submissions into send_q, take
+   * the portal lock (sp), process send_q, then run the xio event loop
+   * (with argument 300).  The lock guard lives until the end of the
+   * round, so sp is held while the event loop runs; shutdown() therefore
+   * waits for the current round to finish.
+   *
+   * Processing send_q:
+   *  - OUTGOING_MSG: sent with xio_send_msg() unless the connection is
+   *    not connected (ENOTCONN) or the send would push send_ctr past the
+   *    connection's queue-depth high mark.  In the latter case, and on
+   *    XIO_E_TX_QUEUE_OVERFLOW, all remaining entries for that connection
+   *    are moved to its requeue list and processing restarts from the
+   *    head of send_q.  Other errors are reported via msg_send_fail().
+   *  - anything else is an INCOMING_MSG_RELEASE and goes to
+   *    release_xio_msg().
+   *  Every processed entry is erased from send_q.
+   *
+   * The loop ends once _shutdown is set and a round has observed it
+   * (drained).  The server is then unbound, if any, and the context is
+   * destroyed.
+   */
+  void *entry()
+    {
+      int size, code = 0;
+      uint32_t xio_qdepth_high;
+      XioSubmit::Queue send_q;
+      XioSubmit::Queue::iterator q_iter;
+      struct xio_msg *msg = NULL;
+      XioConnection *xcon;
+      XioSubmit *xs;
+      XioSend *xsend;
+
+      do {
+	submit_q.deq(send_q);
+
+	/* shutdown() barrier */
+    std::lock_guard<decltype(sp)> lg(sp);
+
+      restart:
+	size = send_q.size();
+
+	if (_shutdown) {
+	  // XXX XioSend queues for flow-controlled connections may require
+	  // cleanup
+	  drained = true;
+	}
+
+	if (size > 0) {
+	  q_iter = send_q.begin();
+	  while (q_iter != send_q.end()) {
+	    xs = &(*q_iter);
+	    xcon = xs->xcon;
+
+	    switch (xs->type) {
+	    case XioSubmit::OUTGOING_MSG: /* it was an outgoing 1-way */
+	      xsend = static_cast<XioSend*>(xs);
+	      if (unlikely(!xcon->conn || !xcon->is_connected()))
+		code = ENOTCONN;
+	      else {
+		/* XXX guard Accelio send queue (should be safe to rely
+		 * on Accelio's check on below, but this assures that
+		 * all chained xio_msg are accounted) */
+		xio_qdepth_high = xcon->xio_qdepth_high_mark();
+		if (unlikely((xcon->send_ctr + xsend->get_msg_count()) >
+			     xio_qdepth_high)) {
+		  requeue_all_xcon(xcon, q_iter, send_q);
+		  goto restart;
+		}
+
+		xs->trace.event("xio_send_msg");
+		msg = xsend->get_xio_msg();
+		code = xio_send_msg(xcon->conn, msg);
+		/* header trace moved here to capture xio serial# */
+		if (ldlog_p1(msgr->cct, ceph_subsys_xio, 11)) {
+		  xsend->print_debug(msgr->cct, "xio_send_msg");
+		}
+		/* get the right Accelio's errno code */
+		if (unlikely(code)) {
+		  if ((code == -1) && (xio_errno() == -1)) {
+		    /* In case XIO does not have any credits to send,
+		     * it would still queue up the message(s) for transmission,
+		     * but would return -1 and errno would also be set to -1.
+		     * This needs to be treated as a success.
+		     */
+		    code = 0;
+		  }
+		  else {
+		    code = xio_errno();
+		  }
+		}
+	      } /* !ENOTCONN */
+	      if (unlikely(code)) {
+		switch (code) {
+		case XIO_E_TX_QUEUE_OVERFLOW:
+		{
+		  requeue_all_xcon(xcon, q_iter, send_q);
+		  goto restart;
+		}
+		  break;
+		default:
+		  /* unlink the entry before handing it to the failure path */
+		  q_iter = send_q.erase(q_iter);
+		  xcon->msg_send_fail(xsend, code);
+		  continue;
+		  break;
+		};
+	      } else {
+		xcon->send.set(msg->timestamp); // need atomic?
+		xcon->send_ctr += xsend->get_msg_count(); // only inc if cb promised
+	      }
+	      break;
+	    default:
+	      /* INCOMING_MSG_RELEASE */
+	      q_iter = send_q.erase(q_iter);
+	      release_xio_msg(static_cast<XioCompletion*>(xs));
+	      continue;
+	    } /* switch (xs->type) */
+	    /* successfully sent: drop the entry from send_q */
+	    q_iter = send_q.erase(q_iter);
+	  } /* while */
+	} /* size > 0 */
+
+	xio_context_run_loop(ctx, 300);
+
+      } while ((!_shutdown) || (!drained));
+
+      /* shutting down */
+      if (server) {
+	xio_unbind(server);
+      }
+      xio_context_destroy(ctx);
+      return NULL;
+    }
+
+  /* Request shutdown: sets _shutdown under the portal lock (sp), the same
+   * lock entry() holds during each round of its loop. */
+  void shutdown()
+    {
+    std::lock_guard<decltype(sp)> lg(sp);
+	_shutdown = true;
+    }
+};
+
+/*
+ * The set of XioPortal objects of a messenger.  Creates at least one
+ * portal, owns them all (they are deleted in the destructor), and hands
+ * them out round-robin.  p_vec is an array of the portals' portal_id
+ * strings, built by start().
+ */
+class XioPortals
+{
+private:
+  vector<XioPortal*> portals;
+  char **p_vec;
+  int n;
+  int last_unused;
+
+public:
+  XioPortals(Messenger *msgr, int _n, int nconns) : p_vec(NULL), last_unused(0)
+  {
+    n = max(_n, 1);
+
+    portals.resize(n);
+    for (auto& slot : portals) {
+      if (slot)
+        continue;
+      slot = new XioPortal(msgr, nconns);
+      ceph_assert(slot != nullptr);
+    }
+  }
+
+  vector<XioPortal*>& get() { return portals; }
+
+  const char **get_vec()
+  {
+    return (const char **) p_vec;
+  }
+
+  int get_portals_len()
+  {
+    return n;
+  }
+
+  /* Return the current round-robin index and advance it, wrapping to 0
+   * after the last portal. */
+  int get_last_unused()
+  {
+    const int pix = last_unused;
+    ++last_unused;
+    if (last_unused >= get_portals_len())
+      last_unused = 0;
+    return pix;
+  }
+
+  XioPortal* get_next_portal()
+  {
+    return portals[get_last_unused()];
+  }
+
+  int bind(struct xio_session_ops *ops, const string& base_uri,
+	   uint16_t port, uint16_t *port0);
+
+  /* Accept a session on the next portal in round-robin order.  Index 0
+   * accepts without naming a portal; any other index passes that
+   * portal's id string as the only entry. */
+  int accept(struct xio_session *session,
+	     struct xio_new_session_req *req,
+	     void *cb_user_context)
+  {
+    const char **portals_vec = get_vec();
+    const int pix = get_last_unused();
+
+    if (pix != 0) {
+      return xio_accept(session,
+			(const char **)&(portals_vec[pix]),
+			1, NULL, 0);
+    }
+    return xio_accept(session, NULL, 0, NULL, 0);
+  }
+
+  /* Build p_vec from the portals' ids, then start one thread per portal,
+   * named "ms_xio_<index>". */
+  void start()
+  {
+    const int nportals = portals.size();
+
+    p_vec = new char*[nportals];
+    for (int i = 0; i < nportals; ++i) {
+      p_vec[i] = (char*) /* portal->xio_uri.c_str() */
+			portals[i]->portal_id;
+    }
+
+    for (int i = 0; i < nportals; ++i) {
+      const string thread_name = "ms_xio_" + std::to_string(i);
+      portals[i]->create(thread_name.c_str());
+    }
+  }
+
+  /* Ask every portal to shut down. */
+  void shutdown()
+  {
+    for (XioPortal *portal : portals)
+      portal->shutdown();
+  }
+
+  /* Join every portal thread. */
+  void join()
+  {
+    for (XioPortal *portal : portals)
+      portal->join();
+  }
+
+  ~XioPortals()
+  {
+    for (XioPortal *portal : portals)
+      delete portal;
+    portals.clear();
+    /* delete[] on a null pointer is a no-op */
+    delete[] p_vec;
+  }
+};
+
+#endif /* XIO_PORTAL_H */
diff --git a/src/msg/xio/XioSubmit.h b/src/msg/xio/XioSubmit.h
new file mode 100644
index 00000000..9840ad4a
--- /dev/null
+++ b/src/msg/xio/XioSubmit.h
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Portions Copyright (C) 2013 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef XIO_SUBMIT_H
+#define XIO_SUBMIT_H
+
+#include <boost/intrusive/list.hpp>
+#include "msg/SimplePolicyMessenger.h"
+extern "C" {
+#include "libxio.h"
+}
+#include "XioConnection.h"
+#include "msg/msg_types.h"
+#include "XioPool.h"
+
+namespace bi = boost::intrusive;
+
+class XioConnection;
+
+/*
+ * Base of everything that travels through a portal's submit queue.
+ *
+ * type says whether the item is an outgoing message or a request to
+ * release incoming xio messages.  submit_list is the intrusive list hook
+ * that Queue links through, so an object is linked into a Queue in place
+ * rather than copied.  xcon is stored as a plain pointer; this struct
+ * itself does not take a reference on it.
+ */
+struct XioSubmit
+{
+public:
+  enum submit_type
+  {
+    OUTGOING_MSG,
+    INCOMING_MSG_RELEASE
+  };
+  enum submit_type type;
+  bi::list_member_hook<> submit_list;
+  XioConnection *xcon;
+  ZTracer::Trace trace;
+
+  XioSubmit(enum submit_type _type, XioConnection *_xcon) :
+    type(_type), xcon(_xcon)
+    {}
+
+  /* intrusive list of XioSubmit, linked through submit_list */
+  typedef bi::list< XioSubmit,
+		    bi::member_hook< XioSubmit,
+				     bi::list_member_hook<>,
+				     &XioSubmit::submit_list >
+		    > Queue;
+  virtual ~XioSubmit(){
+  }
+};
+
+#endif /* XIO_SUBMIT_H */
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-27 18:24:20 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-04-27 18:24:20 +0000
commit	483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree	e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/msg
parent	Initial commit. (diff)
download	ceph-upstream.tar.xz ceph-upstream.zip