summaryrefslogtreecommitdiffstats
path: root/src/msg/async
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 18:24:20 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-27 18:24:20 +0000
commit483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
treee5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/msg/async
parentInitial commit. (diff)
downloadceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz
ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip
Adding upstream version 14.2.21.upstream/14.2.21upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/msg/async')
-rw-r--r--src/msg/async/AsyncConnection.cc771
-rw-r--r--src/msg/async/AsyncConnection.h238
-rw-r--r--src/msg/async/AsyncMessenger.cc949
-rw-r--r--src/msg/async/AsyncMessenger.h426
-rw-r--r--src/msg/async/Event.cc471
-rw-r--r--src/msg/async/Event.h266
-rw-r--r--src/msg/async/EventEpoll.cc142
-rw-r--r--src/msg/async/EventEpoll.h49
-rw-r--r--src/msg/async/EventKqueue.cc267
-rw-r--r--src/msg/async/EventKqueue.h67
-rw-r--r--src/msg/async/EventSelect.cc95
-rw-r--r--src/msg/async/EventSelect.h42
-rw-r--r--src/msg/async/PosixStack.cc293
-rw-r--r--src/msg/async/PosixStack.h56
-rw-r--r--src/msg/async/Protocol.cc14
-rw-r--r--src/msg/async/Protocol.h140
-rw-r--r--src/msg/async/ProtocolV1.cc2547
-rw-r--r--src/msg/async/ProtocolV1.h305
-rw-r--r--src/msg/async/ProtocolV2.cc2870
-rw-r--r--src/msg/async/ProtocolV2.h259
-rw-r--r--src/msg/async/Stack.cc217
-rw-r--r--src/msg/async/Stack.h356
-rw-r--r--src/msg/async/crypto_onwire.cc311
-rw-r--r--src/msg/async/crypto_onwire.h130
-rw-r--r--src/msg/async/dpdk/ARP.cc89
-rw-r--r--src/msg/async/dpdk/ARP.h301
-rw-r--r--src/msg/async/dpdk/DPDK.cc1267
-rw-r--r--src/msg/async/dpdk/DPDK.h918
-rw-r--r--src/msg/async/dpdk/DPDKStack.cc281
-rw-r--r--src/msg/async/dpdk/DPDKStack.h257
-rw-r--r--src/msg/async/dpdk/EventDPDK.cc85
-rw-r--r--src/msg/async/dpdk/EventDPDK.h40
-rw-r--r--src/msg/async/dpdk/IP.cc470
-rw-r--r--src/msg/async/dpdk/IP.h414
-rw-r--r--src/msg/async/dpdk/IPChecksum.cc70
-rw-r--r--src/msg/async/dpdk/IPChecksum.h72
-rw-r--r--src/msg/async/dpdk/Packet.cc146
-rw-r--r--src/msg/async/dpdk/Packet.h550
-rw-r--r--src/msg/async/dpdk/PacketUtil.h154
-rw-r--r--src/msg/async/dpdk/TCP-Stack.h40
-rw-r--r--src/msg/async/dpdk/TCP.cc840
-rw-r--r--src/msg/async/dpdk/TCP.h1503
-rw-r--r--src/msg/async/dpdk/UserspaceEvent.cc127
-rw-r--r--src/msg/async/dpdk/UserspaceEvent.h106
-rw-r--r--src/msg/async/dpdk/align.h50
-rw-r--r--src/msg/async/dpdk/array_map.h50
-rw-r--r--src/msg/async/dpdk/byteorder.h58
-rw-r--r--src/msg/async/dpdk/capture.h50
-rw-r--r--src/msg/async/dpdk/circular_buffer.h347
-rw-r--r--src/msg/async/dpdk/const.h42
-rw-r--r--src/msg/async/dpdk/dpdk_rte.cc154
-rw-r--r--src/msg/async/dpdk/dpdk_rte.h74
-rw-r--r--src/msg/async/dpdk/ethernet.cc16
-rw-r--r--src/msg/async/dpdk/ethernet.h84
-rw-r--r--src/msg/async/dpdk/ip_types.h109
-rw-r--r--src/msg/async/dpdk/net.cc205
-rw-r--r--src/msg/async/dpdk/net.h138
-rw-r--r--src/msg/async/dpdk/queue.h96
-rw-r--r--src/msg/async/dpdk/shared_ptr.h391
-rw-r--r--src/msg/async/dpdk/stream.h155
-rw-r--r--src/msg/async/dpdk/toeplitz.h92
-rw-r--r--src/msg/async/dpdk/transfer.h64
-rw-r--r--src/msg/async/frames_v2.cc480
-rw-r--r--src/msg/async/frames_v2.h842
-rw-r--r--src/msg/async/net_handler.cc233
-rw-r--r--src/msg/async/net_handler.h46
-rw-r--r--src/msg/async/rdma/Infiniband.cc1234
-rw-r--r--src/msg/async/rdma/Infiniband.h529
-rw-r--r--src/msg/async/rdma/RDMAConnectedSocketImpl.cc743
-rw-r--r--src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc183
-rw-r--r--src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc107
-rw-r--r--src/msg/async/rdma/RDMAServerSocketImpl.cc127
-rw-r--r--src/msg/async/rdma/RDMAStack.cc610
-rw-r--r--src/msg/async/rdma/RDMAStack.h348
74 files changed, 26668 insertions, 0 deletions
diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc
new file mode 100644
index 00000000..b78d84a3
--- /dev/null
+++ b/src/msg/async/AsyncConnection.cc
@@ -0,0 +1,771 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <unistd.h>
+
+#include "include/Context.h"
+#include "include/random.h"
+#include "common/errno.h"
+#include "AsyncMessenger.h"
+#include "AsyncConnection.h"
+
+#include "ProtocolV1.h"
+#include "ProtocolV2.h"
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "common/EventTrace.h"
+
+// Constant to limit starting sequence number to 2^31. Nothing special about it, just a big number. PLR
+#define SEQ_MASK 0x7fffffff
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix _conn_prefix(_dout)
+ostream& AsyncConnection::_conn_prefix(std::ostream *_dout) {
+ return *_dout << "-- " << async_msgr->get_myaddrs() << " >> "
+ << *peer_addrs << " conn(" << this
+ << (msgr2 ? " msgr2=" : " legacy=")
+ << protocol.get()
+ << " " << ceph_con_mode_name(protocol->auth_meta->con_mode)
+ << " :" << port
+ << " s=" << get_state_name(state)
+ << " l=" << policy.lossy
+ << ").";
+}
+
+// Notes:
+// 1. Don't dispatch any event when closed! It may keep the AsyncConnection alive even after the AsyncMessenger is dead
+
+const uint32_t AsyncConnection::TCP_PREFETCH_MIN_SIZE = 512;
+
+class C_time_wakeup : public EventCallback {
+ AsyncConnectionRef conn;
+
+ public:
+ explicit C_time_wakeup(AsyncConnectionRef c): conn(c) {}
+ void do_request(uint64_t fd_or_id) override {
+ conn->wakeup_from(fd_or_id);
+ }
+};
+
+class C_handle_read : public EventCallback {
+ AsyncConnectionRef conn;
+
+ public:
+ explicit C_handle_read(AsyncConnectionRef c): conn(c) {}
+ void do_request(uint64_t fd_or_id) override {
+ conn->process();
+ }
+};
+
+class C_handle_write : public EventCallback {
+ AsyncConnectionRef conn;
+
+ public:
+ explicit C_handle_write(AsyncConnectionRef c): conn(c) {}
+ void do_request(uint64_t fd) override {
+ conn->handle_write();
+ }
+};
+
+class C_handle_write_callback : public EventCallback {
+ AsyncConnectionRef conn;
+
+public:
+ explicit C_handle_write_callback(AsyncConnectionRef c) : conn(c) {}
+ void do_request(uint64_t fd) override { conn->handle_write_callback(); }
+};
+
+class C_clean_handler : public EventCallback {
+ AsyncConnectionRef conn;
+ public:
+ explicit C_clean_handler(AsyncConnectionRef c): conn(c) {}
+ void do_request(uint64_t id) override {
+ conn->cleanup();
+ delete this;
+ }
+};
+
+class C_tick_wakeup : public EventCallback {
+ AsyncConnectionRef conn;
+
+ public:
+ explicit C_tick_wakeup(AsyncConnectionRef c): conn(c) {}
+ void do_request(uint64_t fd_or_id) override {
+ conn->tick(fd_or_id);
+ }
+};
+
+
+AsyncConnection::AsyncConnection(CephContext *cct, AsyncMessenger *m, DispatchQueue *q,
+ Worker *w, bool m2, bool local)
+ : Connection(cct, m), delay_state(NULL), async_msgr(m), conn_id(q->get_id()),
+ logger(w->get_perf_counter()),
+ state(STATE_NONE), port(-1),
+ dispatch_queue(q), recv_buf(NULL),
+ recv_max_prefetch(std::max<int64_t>(msgr->cct->_conf->ms_tcp_prefetch_max_size, TCP_PREFETCH_MIN_SIZE)),
+ recv_start(0), recv_end(0),
+ last_active(ceph::coarse_mono_clock::now()),
+ connect_timeout_us(cct->_conf->ms_connection_ready_timeout*1000*1000),
+ inactive_timeout_us(cct->_conf->ms_connection_idle_timeout*1000*1000),
+ msgr2(m2), state_offset(0),
+ worker(w), center(&w->center),read_buffer(nullptr)
+{
+#ifdef UNIT_TESTS_BUILT
+ this->interceptor = m->interceptor;
+#endif
+ read_handler = new C_handle_read(this);
+ write_handler = new C_handle_write(this);
+ write_callback_handler = new C_handle_write_callback(this);
+ wakeup_handler = new C_time_wakeup(this);
+ tick_handler = new C_tick_wakeup(this);
+ // double recv_max_prefetch see "read_until"
+ recv_buf = new char[2*recv_max_prefetch];
+ if (local) {
+ protocol = std::unique_ptr<Protocol>(new LoopbackProtocolV1(this));
+ } else if (m2) {
+ protocol = std::unique_ptr<Protocol>(new ProtocolV2(this));
+ } else {
+ protocol = std::unique_ptr<Protocol>(new ProtocolV1(this));
+ }
+ logger->inc(l_msgr_created_connections);
+}
+
+AsyncConnection::~AsyncConnection()
+{
+ if (recv_buf)
+ delete[] recv_buf;
+ ceph_assert(!delay_state);
+}
+
+int AsyncConnection::get_con_mode() const {
+ return protocol->get_con_mode();
+}
+
+void AsyncConnection::maybe_start_delay_thread()
+{
+ if (!delay_state) {
+ async_msgr->cct->_conf.with_val<std::string>(
+ "ms_inject_delay_type",
+ [this](const string& s) {
+ if (s.find(ceph_entity_type_name(peer_type)) != string::npos) {
+ ldout(msgr->cct, 1) << __func__ << " setting up a delay queue"
+ << dendl;
+ delay_state = new DelayedDelivery(async_msgr, center, dispatch_queue,
+ conn_id);
+ }
+ });
+ }
+}
+
+
+ssize_t AsyncConnection::read(unsigned len, char *buffer,
+ std::function<void(char *, ssize_t)> callback) {
+ ldout(async_msgr->cct, 20) << __func__
+ << (pendingReadLen ? " continue" : " start")
+ << " len=" << len << dendl;
+ ssize_t r = read_until(len, buffer);
+ if (r > 0) {
+ readCallback = callback;
+ pendingReadLen = len;
+ read_buffer = buffer;
+ }
+ return r;
+}
+
+// Because this function is called multiple times to populate
+// the needed buffer, the passed-in buffer pointer must stay the same.
+// Normally, only "read_message" will pass an existing buffer pointer in.
+//
+// It uses readahead to reduce the overhead of small reads;
+// "recv_buf" is used to store the prefetched data.
+//
+// Returns the remaining bytes; 0 means this buffer is finished,
+// while a return value < 0 means error.
+ssize_t AsyncConnection::read_until(unsigned len, char *p)
+{
+ ldout(async_msgr->cct, 25) << __func__ << " len is " << len << " state_offset is "
+ << state_offset << dendl;
+
+ if (async_msgr->cct->_conf->ms_inject_socket_failures && cs) {
+ if (rand() % async_msgr->cct->_conf->ms_inject_socket_failures == 0) {
+ ldout(async_msgr->cct, 0) << __func__ << " injecting socket failure" << dendl;
+ cs.shutdown();
+ }
+ }
+
+ ssize_t r = 0;
+ uint64_t left = len - state_offset;
+ if (recv_end > recv_start) {
+ uint64_t to_read = std::min<uint64_t>(recv_end - recv_start, left);
+ memcpy(p, recv_buf+recv_start, to_read);
+ recv_start += to_read;
+ left -= to_read;
+ ldout(async_msgr->cct, 25) << __func__ << " got " << to_read << " in buffer "
+ << " left is " << left << " buffer still has "
+ << recv_end - recv_start << dendl;
+ if (left == 0) {
+ return 0;
+ }
+ state_offset += to_read;
+ }
+
+ recv_end = recv_start = 0;
+ /* nothing left in the prefetch buffer */
+ if (left > (uint64_t)recv_max_prefetch) {
+ /* this was a large read, we don't prefetch for these */
+ do {
+ r = read_bulk(p+state_offset, left);
+ ldout(async_msgr->cct, 25) << __func__ << " read_bulk left is " << left << " got " << r << dendl;
+ if (r < 0) {
+ ldout(async_msgr->cct, 1) << __func__ << " read failed" << dendl;
+ return -1;
+ } else if (r == static_cast<int>(left)) {
+ state_offset = 0;
+ return 0;
+ }
+ state_offset += r;
+ left -= r;
+ } while (r > 0);
+ } else {
+ do {
+ r = read_bulk(recv_buf+recv_end, recv_max_prefetch);
+ ldout(async_msgr->cct, 25) << __func__ << " read_bulk recv_end is " << recv_end
+ << " left is " << left << " got " << r << dendl;
+ if (r < 0) {
+ ldout(async_msgr->cct, 1) << __func__ << " read failed" << dendl;
+ return -1;
+ }
+ recv_end += r;
+ if (r >= static_cast<int>(left)) {
+ recv_start = len - state_offset;
+ memcpy(p+state_offset, recv_buf, recv_start);
+ state_offset = 0;
+ return 0;
+ }
+ left -= r;
+ } while (r > 0);
+ memcpy(p+state_offset, recv_buf, recv_end-recv_start);
+ state_offset += (recv_end - recv_start);
+ recv_end = recv_start = 0;
+ }
+ ldout(async_msgr->cct, 25) << __func__ << " need len " << len << " remaining "
+ << len - state_offset << " bytes" << dendl;
+ return len - state_offset;
+}
+
+/* Read up to `len` bytes from socket `cs` into `buf`. Returns the byte count read, 0 on
+ * EAGAIN (EINTR is retried), or -1 on error or peer close (the socket should be closed). */
+ssize_t AsyncConnection::read_bulk(char *buf, unsigned len)
+{
+ ssize_t nread;
+ again:
+ nread = cs.read(buf, len);
+ if (nread < 0) {
+ if (nread == -EAGAIN) {
+ nread = 0;
+ } else if (nread == -EINTR) {
+ goto again;
+ } else {
+ ldout(async_msgr->cct, 1) << __func__ << " reading from fd=" << cs.fd()
+ << " : "<< cpp_strerror(nread) << dendl; // nread is a negated errno
+ return -1;
+ }
+ } else if (nread == 0) {
+ ldout(async_msgr->cct, 1) << __func__ << " peer close file descriptor "
+ << cs.fd() << dendl;
+ return -1;
+ }
+ return nread;
+}
+
+ssize_t AsyncConnection::write(bufferlist &bl,
+ std::function<void(ssize_t)> callback,
+ bool more) {
+
+ std::unique_lock<std::mutex> l(write_lock);
+ outgoing_bl.claim_append(bl);
+ ssize_t r = _try_send(more);
+ if (r > 0) {
+ writeCallback = callback;
+ }
+ return r;
+}
+
+// Returns the remaining bytes, which may be larger than the length of ptr;
+// a return value < 0 means error.
+ssize_t AsyncConnection::_try_send(bool more)
+{
+ if (async_msgr->cct->_conf->ms_inject_socket_failures && cs) {
+ if (rand() % async_msgr->cct->_conf->ms_inject_socket_failures == 0) {
+ ldout(async_msgr->cct, 0) << __func__ << " injecting socket failure" << dendl;
+ cs.shutdown();
+ }
+ }
+
+ ceph_assert(center->in_thread());
+ ldout(async_msgr->cct, 25) << __func__ << " cs.send " << outgoing_bl.length()
+ << " bytes" << dendl;
+ ssize_t r = cs.send(outgoing_bl, more);
+ if (r < 0) {
+ ldout(async_msgr->cct, 1) << __func__ << " send error: " << cpp_strerror(r) << dendl;
+ return r;
+ }
+
+ ldout(async_msgr->cct, 10) << __func__ << " sent bytes " << r
+ << " remaining bytes " << outgoing_bl.length() << dendl;
+
+ if (!open_write && is_queued()) {
+ center->create_file_event(cs.fd(), EVENT_WRITABLE, write_handler);
+ open_write = true;
+ }
+
+ if (open_write && !is_queued()) {
+ center->delete_file_event(cs.fd(), EVENT_WRITABLE);
+ open_write = false;
+ if (writeCallback) {
+ center->dispatch_event_external(write_callback_handler);
+ }
+ }
+
+ return outgoing_bl.length();
+}
+
+void AsyncConnection::inject_delay() {
+ if (async_msgr->cct->_conf->ms_inject_internal_delays) {
+ ldout(async_msgr->cct, 10) << __func__ << " sleep for " <<
+ async_msgr->cct->_conf->ms_inject_internal_delays << dendl;
+ utime_t t;
+ t.set_from_double(async_msgr->cct->_conf->ms_inject_internal_delays);
+ t.sleep();
+ }
+}
+
+void AsyncConnection::process() {
+ std::lock_guard<std::mutex> l(lock);
+ last_active = ceph::coarse_mono_clock::now();
+ recv_start_time = ceph::mono_clock::now();
+
+ ldout(async_msgr->cct, 20) << __func__ << dendl;
+
+ switch (state) {
+ case STATE_NONE: {
+ ldout(async_msgr->cct, 20) << __func__ << " enter none state" << dendl;
+ return;
+ }
+ case STATE_CLOSED: {
+ ldout(async_msgr->cct, 20) << __func__ << " socket closed" << dendl;
+ return;
+ }
+ case STATE_CONNECTING: {
+ ceph_assert(!policy.server);
+
+ // clear timer (if any) since we are connecting/re-connecting
+ if (last_tick_id) {
+ center->delete_time_event(last_tick_id);
+ last_tick_id = 0;
+ }
+
+ if (cs) {
+ center->delete_file_event(cs.fd(), EVENT_READABLE | EVENT_WRITABLE);
+ cs.close();
+ }
+
+ SocketOptions opts;
+ opts.priority = async_msgr->get_socket_priority();
+ opts.connect_bind_addr = msgr->get_myaddrs().front();
+ ssize_t r = worker->connect(target_addr, opts, &cs);
+ if (r < 0) {
+ protocol->fault();
+ return;
+ }
+
+ center->create_file_event(cs.fd(), EVENT_READABLE, read_handler);
+ state = STATE_CONNECTING_RE;
+ }
+ case STATE_CONNECTING_RE: {
+ ssize_t r = cs.is_connected();
+ if (r < 0) {
+ ldout(async_msgr->cct, 1) << __func__ << " reconnect failed to "
+ << target_addr << dendl;
+ if (r == -ECONNREFUSED) {
+ ldout(async_msgr->cct, 2)
+ << __func__ << " connection refused!" << dendl;
+ dispatch_queue->queue_refused(this);
+ }
+ protocol->fault();
+ return;
+ } else if (r == 0) {
+ ldout(async_msgr->cct, 10)
+ << __func__ << " nonblock connect inprogress" << dendl;
+ if (async_msgr->get_stack()->nonblock_connect_need_writable_event()) {
+ center->create_file_event(cs.fd(), EVENT_WRITABLE,
+ read_handler);
+ }
+ logger->tinc(l_msgr_running_recv_time,
+ ceph::mono_clock::now() - recv_start_time);
+ return;
+ }
+
+ center->delete_file_event(cs.fd(), EVENT_WRITABLE);
+ ldout(async_msgr->cct, 10)
+ << __func__ << " connect successfully, ready to send banner" << dendl;
+ state = STATE_CONNECTION_ESTABLISHED;
+ ceph_assert(last_tick_id == 0);
+ // exclude TCP nonblock connect time
+ last_connect_started = ceph::coarse_mono_clock::now();
+ last_tick_id = center->create_time_event(
+ connect_timeout_us, tick_handler);
+ break;
+ }
+
+ case STATE_ACCEPTING: {
+ center->create_file_event(cs.fd(), EVENT_READABLE, read_handler);
+ state = STATE_CONNECTION_ESTABLISHED;
+
+ break;
+ }
+
+ case STATE_CONNECTION_ESTABLISHED: {
+ if (pendingReadLen) {
+ ssize_t r = read(*pendingReadLen, read_buffer, readCallback);
+ if (r <= 0) { // read all bytes, or an error occured
+ pendingReadLen.reset();
+ char *buf_tmp = read_buffer;
+ read_buffer = nullptr;
+ readCallback(buf_tmp, r);
+ }
+ return;
+ }
+ break;
+ }
+ }
+
+ protocol->read_event();
+
+ logger->tinc(l_msgr_running_recv_time,
+ ceph::mono_clock::now() - recv_start_time);
+}
+
+bool AsyncConnection::is_connected() {
+ return protocol->is_connected();
+}
+
+void AsyncConnection::connect(const entity_addrvec_t &addrs, int type,
+ entity_addr_t &target) {
+
+ std::lock_guard<std::mutex> l(lock);
+ set_peer_type(type);
+ set_peer_addrs(addrs);
+ policy = msgr->get_policy(type);
+ target_addr = target;
+ _connect();
+}
+
+void AsyncConnection::_connect()
+{
+ ldout(async_msgr->cct, 10) << __func__ << dendl;
+
+ state = STATE_CONNECTING;
+ protocol->connect();
+ // rescheduler connection in order to avoid lock dep
+ // may called by external thread(send_message)
+ center->dispatch_event_external(read_handler);
+}
+
+void AsyncConnection::accept(ConnectedSocket socket,
+ const entity_addr_t &listen_addr,
+ const entity_addr_t &peer_addr)
+{
+ ldout(async_msgr->cct, 10) << __func__ << " sd=" << socket.fd()
+ << " listen_addr " << listen_addr
+ << " peer_addr " << peer_addr << dendl;
+ ceph_assert(socket.fd() >= 0);
+
+ std::lock_guard<std::mutex> l(lock);
+ cs = std::move(socket);
+ socket_addr = listen_addr;
+ target_addr = peer_addr; // until we know better
+ state = STATE_ACCEPTING;
+ protocol->accept();
+ // rescheduler connection in order to avoid lock dep
+ center->dispatch_event_external(read_handler);
+}
+
+int AsyncConnection::send_message(Message *m)
+{
+ FUNCTRACE(async_msgr->cct);
+ lgeneric_subdout(async_msgr->cct, ms,
+ 1) << "-- " << async_msgr->get_myaddrs() << " --> "
+ << get_peer_addrs() << " -- "
+ << *m << " -- " << m << " con "
+ << this
+ << dendl;
+
+ // optimistic think it's ok to encode(actually may broken now)
+ if (!m->get_priority())
+ m->set_priority(async_msgr->get_default_send_priority());
+
+ m->get_header().src = async_msgr->get_myname();
+ m->set_connection(this);
+
+ if (m->get_type() == CEPH_MSG_OSD_OP)
+ OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OP_BEGIN", true);
+ else if (m->get_type() == CEPH_MSG_OSD_OPREPLY)
+ OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OPREPLY_BEGIN", true);
+
+ if (async_msgr->get_myaddrs() == get_peer_addrs()) { //loopback connection
+ ldout(async_msgr->cct, 20) << __func__ << " " << *m << " local" << dendl;
+ std::lock_guard<std::mutex> l(write_lock);
+ if (protocol->is_connected()) {
+ dispatch_queue->local_delivery(m, m->get_priority());
+ } else {
+ ldout(async_msgr->cct, 10) << __func__ << " loopback connection closed."
+ << " Drop message " << m << dendl;
+ m->put();
+ }
+ return 0;
+ }
+
+ // we don't want to consider local message here, it's too lightweight which
+ // may disturb users
+ logger->inc(l_msgr_send_messages);
+
+ protocol->send_message(m);
+ return 0;
+}
+
+entity_addr_t AsyncConnection::_infer_target_addr(const entity_addrvec_t& av)
+{
+ // pick the first addr of the same address family as socket_addr. it could be
+ // an any: or v2: addr, we don't care. it should not be a v1 addr.
+ for (auto& i : av.v) {
+ if (i.is_legacy()) {
+ continue;
+ }
+ if (i.get_family() == socket_addr.get_family()) {
+ ldout(async_msgr->cct,10) << __func__ << " " << av << " -> " << i << dendl;
+ return i;
+ }
+ }
+ ldout(async_msgr->cct,10) << __func__ << " " << av << " -> nothing to match "
+ << socket_addr << dendl;
+ return {};
+}
+
+void AsyncConnection::fault()
+{
+ shutdown_socket();
+ open_write = false;
+
+ // queue delayed items immediately
+ if (delay_state)
+ delay_state->flush();
+
+ recv_start = recv_end = 0;
+ state_offset = 0;
+ outgoing_bl.clear();
+}
+
+void AsyncConnection::_stop() {
+ writeCallback.reset();
+ dispatch_queue->discard_queue(conn_id);
+ async_msgr->unregister_conn(this);
+ worker->release_worker();
+
+ state = STATE_CLOSED;
+ open_write = false;
+
+ state_offset = 0;
+ // Make sure in-queue events will been processed
+ center->dispatch_event_external(EventCallbackRef(new C_clean_handler(this)));
+}
+
+bool AsyncConnection::is_queued() const {
+ return outgoing_bl.length();
+}
+
+void AsyncConnection::shutdown_socket() {
+ for (auto &&t : register_time_events) center->delete_time_event(t);
+ register_time_events.clear();
+ if (last_tick_id) {
+ center->delete_time_event(last_tick_id);
+ last_tick_id = 0;
+ }
+ if (cs) {
+ center->delete_file_event(cs.fd(), EVENT_READABLE | EVENT_WRITABLE);
+ cs.shutdown();
+ cs.close();
+ }
+}
+
+void AsyncConnection::DelayedDelivery::do_request(uint64_t id)
+{
+ Message *m = nullptr;
+ {
+ std::lock_guard<std::mutex> l(delay_lock);
+ register_time_events.erase(id);
+ if (stop_dispatch)
+ return ;
+ if (delay_queue.empty())
+ return ;
+ m = delay_queue.front();
+ delay_queue.pop_front();
+ }
+ if (msgr->ms_can_fast_dispatch(m)) {
+ dispatch_queue->fast_dispatch(m);
+ } else {
+ dispatch_queue->enqueue(m, m->get_priority(), conn_id);
+ }
+}
+
+void AsyncConnection::DelayedDelivery::discard() {
+ stop_dispatch = true;
+ center->submit_to(center->get_id(),
+ [this]() mutable {
+ std::lock_guard<std::mutex> l(delay_lock);
+ while (!delay_queue.empty()) {
+ Message *m = delay_queue.front();
+ dispatch_queue->dispatch_throttle_release(
+ m->get_dispatch_throttle_size());
+ m->put();
+ delay_queue.pop_front();
+ }
+ for (auto i : register_time_events)
+ center->delete_time_event(i);
+ register_time_events.clear();
+ stop_dispatch = false;
+ },
+ true);
+}
+
+void AsyncConnection::DelayedDelivery::flush() {
+ stop_dispatch = true;
+ center->submit_to(
+ center->get_id(), [this] () mutable {
+ std::lock_guard<std::mutex> l(delay_lock);
+ while (!delay_queue.empty()) {
+ Message *m = delay_queue.front();
+ if (msgr->ms_can_fast_dispatch(m)) {
+ dispatch_queue->fast_dispatch(m);
+ } else {
+ dispatch_queue->enqueue(m, m->get_priority(), conn_id);
+ }
+ delay_queue.pop_front();
+ }
+ for (auto i : register_time_events)
+ center->delete_time_event(i);
+ register_time_events.clear();
+ stop_dispatch = false;
+ }, true);
+}
+
+void AsyncConnection::send_keepalive()
+{
+ protocol->send_keepalive();
+}
+
+void AsyncConnection::mark_down()
+{
+ ldout(async_msgr->cct, 1) << __func__ << dendl;
+ std::lock_guard<std::mutex> l(lock);
+ protocol->stop();
+}
+
+void AsyncConnection::handle_write()
+{
+ ldout(async_msgr->cct, 10) << __func__ << dendl;
+ protocol->write_event();
+}
+
+void AsyncConnection::handle_write_callback() {
+ std::lock_guard<std::mutex> l(lock);
+ last_active = ceph::coarse_mono_clock::now();
+ recv_start_time = ceph::mono_clock::now();
+ write_lock.lock();
+ if (writeCallback) {
+ auto callback = *writeCallback;
+ writeCallback.reset();
+ write_lock.unlock();
+ callback(0);
+ return;
+ }
+ write_lock.unlock();
+}
+
+void AsyncConnection::stop(bool queue_reset) {
+ lock.lock();
+ bool need_queue_reset = (state != STATE_CLOSED) && queue_reset;
+ protocol->stop();
+ lock.unlock();
+ if (need_queue_reset) dispatch_queue->queue_reset(this);
+}
+
+void AsyncConnection::cleanup() {
+ shutdown_socket();
+ delete read_handler;
+ delete write_handler;
+ delete write_callback_handler;
+ delete wakeup_handler;
+ delete tick_handler;
+ if (delay_state) {
+ delete delay_state;
+ delay_state = NULL;
+ }
+}
+
+void AsyncConnection::wakeup_from(uint64_t id)
+{
+ lock.lock();
+ register_time_events.erase(id);
+ lock.unlock();
+ process();
+}
+
+void AsyncConnection::tick(uint64_t id)
+{
+ auto now = ceph::coarse_mono_clock::now();
+ ldout(async_msgr->cct, 20) << __func__ << " last_id=" << last_tick_id
+ << " last_active=" << last_active << dendl;
+ std::lock_guard<std::mutex> l(lock);
+ last_tick_id = 0;
+ if (!is_connected()) {
+ if (connect_timeout_us <=
+ (uint64_t)std::chrono::duration_cast<std::chrono::microseconds>
+ (now - last_connect_started).count()) {
+ ldout(async_msgr->cct, 1) << __func__ << " see no progress in more than "
+ << connect_timeout_us
+ << " us during connecting, fault."
+ << dendl;
+ protocol->fault();
+ } else {
+ last_tick_id = center->create_time_event(connect_timeout_us, tick_handler);
+ }
+ } else {
+ auto idle_period = std::chrono::duration_cast<std::chrono::microseconds>
+ (now - last_active).count();
+ if (inactive_timeout_us < (uint64_t)idle_period) {
+ ldout(async_msgr->cct, 1) << __func__ << " idle (" << idle_period
+ << ") for more than " << inactive_timeout_us
+ << " us, fault."
+ << dendl;
+ protocol->fault();
+ } else {
+ last_tick_id = center->create_time_event(inactive_timeout_us, tick_handler);
+ }
+ }
+}
diff --git a/src/msg/async/AsyncConnection.h b/src/msg/async/AsyncConnection.h
new file mode 100644
index 00000000..0c2512c8
--- /dev/null
+++ b/src/msg/async/AsyncConnection.h
@@ -0,0 +1,238 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_ASYNCCONNECTION_H
+#define CEPH_MSG_ASYNCCONNECTION_H
+
+#include <atomic>
+#include <pthread.h>
+#include <climits>
+#include <list>
+#include <mutex>
+#include <map>
+#include <functional>
+#include <optional>
+
+#include "auth/AuthSessionHandler.h"
+#include "common/ceph_time.h"
+#include "common/perf_counters.h"
+#include "include/buffer.h"
+#include "msg/Connection.h"
+#include "msg/Messenger.h"
+
+#include "Event.h"
+#include "Stack.h"
+
+class AsyncMessenger;
+class DispatchQueue;
+class Worker;
+class Protocol;
+
+static const int ASYNC_IOV_MAX = (IOV_MAX >= 1024 ? IOV_MAX / 4 : IOV_MAX);
+
+/*
+ * AsyncConnection maintains a logical session between two endpoints. In other
+ * words, a pair of addresses maps to exactly one AsyncConnection. AsyncConnection
+ * handles network faults and read/write transactions. If a file descriptor
+ * breaks, AsyncConnection maintains the message queue and sequence and
+ * tries to reconnect to the peer endpoint.
+ */
+class AsyncConnection : public Connection {
+
+ ssize_t read(unsigned len, char *buffer,
+ std::function<void(char *, ssize_t)> callback);
+ ssize_t read_until(unsigned needed, char *p);
+ ssize_t read_bulk(char *buf, unsigned len);
+
+ ssize_t write(bufferlist &bl, std::function<void(ssize_t)> callback,
+ bool more=false);
+ ssize_t _try_send(bool more=false);
+
+ void _connect();
+ void _stop();
+ void fault();
+ void inject_delay();
+
+ bool is_queued() const;
+ void shutdown_socket();
+
+ /**
+ * The DelayedDelivery is for injecting delays into Message delivery off
+ * the socket. It is only enabled if delays are requested, and if they
+ * are then it pulls Messages off the DelayQueue and puts them into the
+ * AsyncMessenger event queue.
+ */
+ class DelayedDelivery : public EventCallback {
+ std::set<uint64_t> register_time_events; // need to delete it if stop
+ std::deque<Message*> delay_queue;
+ std::mutex delay_lock;
+ AsyncMessenger *msgr;
+ EventCenter *center;
+ DispatchQueue *dispatch_queue;
+ uint64_t conn_id;
+ std::atomic_bool stop_dispatch;
+
+ public:
+ explicit DelayedDelivery(AsyncMessenger *omsgr, EventCenter *c,
+ DispatchQueue *q, uint64_t cid)
+ : msgr(omsgr), center(c), dispatch_queue(q), conn_id(cid),
+ stop_dispatch(false) { }
+ ~DelayedDelivery() override {
+ ceph_assert(register_time_events.empty());
+ ceph_assert(delay_queue.empty());
+ }
+ void set_center(EventCenter *c) { center = c; }
+ void do_request(uint64_t id) override;
+ void queue(double delay_period, Message *m) {
+ std::lock_guard<std::mutex> l(delay_lock);
+ delay_queue.push_back(m);
+ register_time_events.insert(center->create_time_event(delay_period*1000000, this));
+ }
+ void discard();
+ bool ready() const { return !stop_dispatch && delay_queue.empty() && register_time_events.empty(); }
+ void flush();
+ } *delay_state;
+
+ public:
+ AsyncConnection(CephContext *cct, AsyncMessenger *m, DispatchQueue *q,
+ Worker *w, bool is_msgr2, bool local);
+ ~AsyncConnection() override;
+ void maybe_start_delay_thread();
+
+ ostream& _conn_prefix(std::ostream *_dout);
+
+ bool is_connected() override;
+
+ // Only call when AsyncConnection first construct
+ void connect(const entity_addrvec_t& addrs, int type, entity_addr_t& target);
+
+ // Only call when AsyncConnection first construct
+ void accept(ConnectedSocket socket,
+ const entity_addr_t &listen_addr,
+ const entity_addr_t &peer_addr);
+ int send_message(Message *m) override;
+
+ void send_keepalive() override;
+ void mark_down() override;
+ void mark_disposable() override {
+ std::lock_guard<std::mutex> l(lock);
+ policy.lossy = true;
+ }
+
+ entity_addr_t get_peer_socket_addr() const override {
+ return target_addr;
+ }
+
+ int get_con_mode() const override;
+
+ private:
+ enum {
+ STATE_NONE,
+ STATE_CONNECTING,
+ STATE_CONNECTING_RE,
+ STATE_ACCEPTING,
+ STATE_CONNECTION_ESTABLISHED,
+ STATE_CLOSED
+ };
+
+ static const uint32_t TCP_PREFETCH_MIN_SIZE;
+ static const char *get_state_name(int state) {
+ const char* const statenames[] = {"STATE_NONE",
+ "STATE_CONNECTING",
+ "STATE_CONNECTING_RE",
+ "STATE_ACCEPTING",
+ "STATE_CONNECTION_ESTABLISHED",
+ "STATE_CLOSED"};
+ return statenames[state];
+ }
+
+ AsyncMessenger *async_msgr;
+ uint64_t conn_id;
+ PerfCounters *logger;
+ int state;
+ ConnectedSocket cs;
+ int port;
+ Messenger::Policy policy;
+
+ DispatchQueue *dispatch_queue;
+
+ // lockfree, only used in own thread
+ bufferlist outgoing_bl;
+ bool open_write = false;
+
+ std::mutex write_lock;
+
+ std::mutex lock;
+ EventCallbackRef read_handler;
+ EventCallbackRef write_handler;
+ EventCallbackRef write_callback_handler;
+ EventCallbackRef wakeup_handler;
+ EventCallbackRef tick_handler;
+ char *recv_buf;
+ uint32_t recv_max_prefetch;
+ uint32_t recv_start;
+ uint32_t recv_end;
+ set<uint64_t> register_time_events; // need to delete it if stop
+ ceph::coarse_mono_clock::time_point last_connect_started;
+ ceph::coarse_mono_clock::time_point last_active;
+ ceph::mono_clock::time_point recv_start_time;
+ uint64_t last_tick_id = 0;
+ const uint64_t connect_timeout_us;
+ const uint64_t inactive_timeout_us;
+
+ // Tis section are temp variables used by state transition
+
+ // Accepting state
+ bool msgr2 = false;
+ entity_addr_t socket_addr; ///< local socket addr
+ entity_addr_t target_addr; ///< which of the peer_addrs we're connecting to (as clienet) or should reconnect to (as peer)
+
+ entity_addr_t _infer_target_addr(const entity_addrvec_t& av);
+
+ // used only by "read_until"
+ uint64_t state_offset;
+ Worker *worker;
+ EventCenter *center;
+
+ std::unique_ptr<Protocol> protocol;
+
+ std::optional<std::function<void(ssize_t)>> writeCallback;
+ std::function<void(char *, ssize_t)> readCallback;
+ std::optional<unsigned> pendingReadLen;
+ char *read_buffer;
+
+ public:
+ // used by eventcallback
+ void handle_write();
+ void handle_write_callback();
+ void process();
+ void wakeup_from(uint64_t id);
+ void tick(uint64_t id);
+ void local_deliver();
+ void stop(bool queue_reset);
+ void cleanup();
+ PerfCounters *get_perf_counter() {
+ return logger;
+ }
+
+ friend class Protocol;
+ friend class ProtocolV1;
+ friend class ProtocolV2;
+}; /* AsyncConnection */
+
+typedef boost::intrusive_ptr<AsyncConnection> AsyncConnectionRef;
+
+#endif
diff --git a/src/msg/async/AsyncMessenger.cc b/src/msg/async/AsyncMessenger.cc
new file mode 100644
index 00000000..2b1488c4
--- /dev/null
+++ b/src/msg/async/AsyncMessenger.cc
@@ -0,0 +1,949 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "acconfig.h"
+
+#include <iostream>
+#include <fstream>
+
+#include "AsyncMessenger.h"
+
+#include "common/config.h"
+#include "common/Timer.h"
+#include "common/errno.h"
+
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "common/EventTrace.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+// Log-line prefix for messenger output: "-- <our addresses> ".
+static ostream& _prefix(std::ostream *_dout, AsyncMessenger *m) {
+  std::ostream& out = *_dout;
+  out << "-- " << m->get_myaddrs() << " ";
+  return out;
+}
+
+// Log-line prefix for Processor output. The Processor argument is not read;
+// it only selects this overload.
+static ostream& _prefix(std::ostream *_dout, Processor *p) {
+  std::ostream& out = *_dout;
+  out << " Processor -- ";
+  return out;
+}
+
+
+/*******************
+ * Processor
+ */
+
+// Event callback that forwards every invocation to Processor::accept() on
+// the Processor it was constructed with.
+class Processor::C_processor_accept : public EventCallback {
+  Processor *pro;
+
+ public:
+  explicit C_processor_accept(Processor *p)
+    : pro(p)
+  {}
+
+  // The event id is not used.
+  void do_request(uint64_t id) override {
+    pro->accept();
+  }
+};
+
+// Create a Processor for messenger r that runs on worker w. The accept
+// callback is allocated here.
+Processor::Processor(AsyncMessenger *r, Worker *w, CephContext *c)
+  : msgr(r),
+    net(c),
+    worker(w),
+    listen_handler(new C_processor_accept(this))
+{
+}
+
+// Listen on every address in bind_addrs, using one listen socket per address
+// slot. An address with port 0 is given the first port in
+// [ms_bind_port_min, ms_bind_port_max] that is not in avoid_ports and on
+// which listen succeeds. Each address is attempted up to
+// ms_bind_retry_count times, sleeping ms_bind_retry_delay seconds between
+// attempts. The addresses actually used are written to *bound_addrs.
+//
+// Returns 0 on success, otherwise the negative result of the last attempt
+// (listen sockets of earlier slots are aborted first).
+int Processor::bind(const entity_addrvec_t &bind_addrs,
+                    const set<int>& avoid_ports,
+                    entity_addrvec_t* bound_addrs)
+{
+  const auto& conf = msgr->cct->_conf;
+  // bind to socket(s)
+  ldout(msgr->cct, 10) << __func__ << " " << bind_addrs << dendl;
+
+  SocketOptions opts;
+  opts.nodelay = conf->ms_tcp_nodelay;
+  opts.rcbuf_size = conf->ms_tcp_rcvbuf;
+
+  const auto num_addrs = bind_addrs.v.size();
+  listen_sockets.resize(num_addrs);
+  *bound_addrs = bind_addrs;
+
+  for (unsigned k = 0; k < num_addrs; ++k) {
+    auto& listen_addr = bound_addrs->v[k];
+
+    // result of the most recent listen attempt for this slot
+    int r = -1;
+
+    // One listen attempt on the current listen_addr, submitted to the
+    // worker's event center; the outcome is stored in r.
+    auto try_listen = [this, k, &listen_addr, &opts, &r]() {
+      worker->center.submit_to(
+        worker->center.get_id(),
+        [this, k, &listen_addr, &opts, &r]() {
+          r = worker->listen(listen_addr, k, opts, &listen_sockets[k]);
+        }, false);
+    };
+
+    for (int attempt = 0; attempt < conf->ms_bind_retry_count; attempt++) {
+      if (attempt > 0) {
+        lderr(msgr->cct) << __func__ << " was unable to bind. Trying again in "
+                         << conf->ms_bind_retry_delay << " seconds " << dendl;
+        sleep(conf->ms_bind_retry_delay);
+      }
+
+      if (listen_addr.get_port()) {
+        // explicit port requested
+        try_listen();
+        if (r < 0) {
+          lderr(msgr->cct) << __func__ << " unable to bind to " << listen_addr
+                           << ": " << cpp_strerror(r) << dendl;
+          continue;
+        }
+      } else {
+        // try a range of ports
+        for (int port = conf->ms_bind_port_min;
+             port <= conf->ms_bind_port_max;
+             port++) {
+          if (avoid_ports.count(port))
+            continue;
+
+          listen_addr.set_port(port);
+          try_listen();
+          if (r == 0)
+            break;
+        }
+        if (r < 0) {
+          lderr(msgr->cct) << __func__ << " unable to bind to " << listen_addr
+                           << " on any port in range "
+                           << conf->ms_bind_port_min
+                           << "-" << conf->ms_bind_port_max << ": "
+                           << cpp_strerror(r) << dendl;
+          // Clear port before retry, otherwise we shall fail again.
+          listen_addr.set_port(0);
+          continue;
+        }
+        ldout(msgr->cct, 10) << __func__ << " bound on random port "
+                             << listen_addr << dendl;
+      }
+      if (r == 0) {
+        break;
+      }
+    }
+
+    // Every attempt for this slot failed: undo the earlier slots and
+    // report the last error.
+    if (r < 0) {
+      lderr(msgr->cct) << __func__ << " was unable to bind after "
+                       << conf->ms_bind_retry_count
+                       << " attempts: " << cpp_strerror(r) << dendl;
+      for (unsigned j = 0; j < k; ++j) {
+        // clean up previous bind
+        listen_sockets[j].abort_accept();
+      }
+      return r;
+    }
+  }
+
+  ldout(msgr->cct, 10) << __func__ << " bound to " << *bound_addrs << dendl;
+  return 0;
+}
+
+// Register the accept callback for readability on every valid listen socket.
+// The registration is submitted to this Processor's worker event center.
+void Processor::start()
+{
+  ldout(msgr->cct, 1) << __func__ << dendl;
+
+  // start thread
+  worker->center.submit_to(worker->center.get_id(), [this]() {
+    for (auto& sock : listen_sockets) {
+      if (!sock)
+        continue;
+      worker->center.create_file_event(sock.fd(), EVENT_READABLE,
+                                       listen_handler);
+    }
+  }, false);
+}
+
+// Accept pending connections on every listen socket of this Processor.
+// For each socket, connections are accepted in a loop until accept reports
+// -EAGAIN; each accepted socket is handed to AsyncMessenger::add_accept().
+// -EMFILE/-ENFILE and otherwise-unclassified errors are counted per listen
+// socket; once the count exceeds ms_max_accept_failures the process aborts.
+// A successful accept resets the count.
+void Processor::accept()
+{
+ SocketOptions opts;
+ opts.nodelay = msgr->cct->_conf->ms_tcp_nodelay;
+ opts.rcbuf_size = msgr->cct->_conf->ms_tcp_rcvbuf;
+ opts.priority = msgr->get_socket_priority();
+
+ for (auto& listen_socket : listen_sockets) {
+ ldout(msgr->cct, 10) << __func__ << " listen_fd=" << listen_socket.fd()
+ << dendl;
+ // consecutive failures on this listen socket
+ unsigned accept_error_num = 0;
+
+ while (true) {
+ entity_addr_t addr;
+ ConnectedSocket cli_socket;
+ // Choose the worker for the new connection: our own worker when the
+ // stack supports a local listen table (its reference count is bumped
+ // here), otherwise whichever worker the stack returns.
+ // NOTE(review): presumably get_worker() takes a reference itself, which
+ // the decrement on the failure path below balances - confirm in Stack.
+ Worker *w = worker;
+ if (!msgr->get_stack()->support_local_listen_table())
+ w = msgr->get_stack()->get_worker();
+ else
+ ++w->references;
+ int r = listen_socket.accept(&cli_socket, opts, &addr, w);
+ if (r == 0) {
+ ldout(msgr->cct, 10) << __func__ << " accepted incoming on sd "
+ << cli_socket.fd() << dendl;
+
+ // the local address is the messenger address in the listen socket's slot
+ msgr->add_accept(
+ w, std::move(cli_socket),
+ msgr->get_myaddrs().v[listen_socket.get_addr_slot()],
+ addr);
+ accept_error_num = 0;
+ continue;
+ } else {
+ // no connection was created for w: drop the worker reference again
+ --w->references;
+ if (r == -EINTR) {
+ continue;
+ } else if (r == -EAGAIN) {
+ // nothing more pending on this listen socket
+ break;
+ } else if (r == -EMFILE || r == -ENFILE) {
+ lderr(msgr->cct) << __func__ << " open file descriptions limit reached sd = " << listen_socket.fd()
+ << " errno " << r << " " << cpp_strerror(r) << dendl;
+ if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) {
+ lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl;
+ ceph_abort();
+ }
+ continue;
+ } else if (r == -ECONNABORTED) {
+ // not counted as an accept failure
+ ldout(msgr->cct, 0) << __func__ << " it was closed because of rst arrived sd = " << listen_socket.fd()
+ << " errno " << r << " " << cpp_strerror(r) << dendl;
+ continue;
+ } else {
+ lderr(msgr->cct) << __func__ << " no incoming connection?"
+ << " errno " << r << " " << cpp_strerror(r) << dendl;
+ if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) {
+ lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl;
+ ceph_abort();
+ }
+ continue;
+ }
+ }
+ }
+ }
+}
+
+// Unregister the readable event of every valid listen socket and abort its
+// accept. The work is submitted to this Processor's worker event center.
+void Processor::stop()
+{
+  ldout(msgr->cct,10) << __func__ << dendl;
+
+  worker->center.submit_to(worker->center.get_id(), [this]() {
+    for (auto& sock : listen_sockets) {
+      if (!sock)
+        continue;
+      worker->center.delete_file_event(sock.fd(), EVENT_READABLE);
+      sock.abort_accept();
+    }
+  }, false);
+}
+
+
+// Holder for the NetworkStack shared by the messengers of one CephContext.
+// The stack is created lazily by ready() and stopped when the holder is
+// destroyed.
+struct StackSingleton {
+  CephContext *cct;
+  std::shared_ptr<NetworkStack> stack;
+
+  explicit StackSingleton(CephContext *c): cct(c) {}
+
+  // Create the stack for the given transport type on first call; later
+  // calls are no-ops.
+  void ready(std::string &type) {
+    if (!stack)
+      stack = NetworkStack::create(cct, type);
+  }
+
+  ~StackSingleton() {
+    // ready() may never have been called, in which case there is no stack
+    // to stop.
+    if (stack)
+      stack->stop();
+  }
+};
+
+
+// Event callback that runs AsyncMessenger::reap_dead() on the messenger it
+// was constructed with.
+class C_handle_reap : public EventCallback {
+  AsyncMessenger *msgr;
+
+ public:
+  explicit C_handle_reap(AsyncMessenger *m)
+    : msgr(m)
+  {}
+
+  // The event id is not inspected; the return value of reap_dead() is
+  // ignored.
+  void do_request(uint64_t id) override {
+    // judge whether is a time event
+    msgr->reap_dead();
+  }
+};
+
+/*******************
+ * AsyncMessenger
+ */
+
+// Construct the messenger: pick the transport, obtain (and start) the
+// NetworkStack for it, create the loopback connection and one Processor per
+// listening worker. The messenger starts out in the "stopped" state with
+// need_addr set and did_bind cleared.
+AsyncMessenger::AsyncMessenger(CephContext *cct, entity_name_t name,
+ const std::string &type, string mname, uint64_t _nonce)
+ : SimplePolicyMessenger(cct, name,mname, _nonce),
+ dispatch_queue(cct, this, mname),
+ lock("AsyncMessenger::lock"),
+ nonce(_nonce), need_addr(true), did_bind(false),
+ global_seq(0), deleted_lock("AsyncMessenger::deleted_lock"),
+ cluster_protocol(0), stopped(true)
+{
+ // transport is chosen by substring match on the messenger type;
+ // anything without "rdma" or "dpdk" in it uses "posix"
+ std::string transport_type = "posix";
+ if (type.find("rdma") != std::string::npos)
+ transport_type = "rdma";
+ else if (type.find("dpdk") != std::string::npos)
+ transport_type = "dpdk";
+
+ // the stack holder is looked up (or created) under a name that includes
+ // the transport type
+ auto single = &cct->lookup_or_create_singleton_object<StackSingleton>(
+ "AsyncMessenger::NetworkStack::" + transport_type, true, cct);
+ single->ready(transport_type);
+ stack = single->stack.get();
+ stack->start();
+ local_worker = stack->get_worker();
+ local_connection = new AsyncConnection(cct, this, &dispatch_queue,
+ local_worker, true, true);
+ init_local_connection();
+ reap_handler = new C_handle_reap(this);
+ // one Processor by default; one per worker when the stack reports that it
+ // supports a local listen table
+ unsigned processor_num = 1;
+ if (stack->support_local_listen_table())
+ processor_num = stack->get_num_worker();
+ for (unsigned i = 0; i < processor_num; ++i)
+ processors.push_back(new Processor(this, stack->get_worker(i), cct));
+}
+
+/**
+ * Destroy the AsyncMessenger: free the reap callback and the Processors.
+ * Asserts that the messenger is no longer bound.
+ */
+AsyncMessenger::~AsyncMessenger()
+{
+  delete reap_handler;
+  // either we didn't bind or we shut down the Processor
+  ceph_assert(!did_bind);
+  for (auto *proc : processors) {
+    delete proc;
+  }
+}
+
+// Mark the network stack ready, perform a bind that was postponed by
+// bindv() (aborting if it fails), then start the Processors and the
+// dispatch queue under the messenger lock.
+void AsyncMessenger::ready()
+{
+  ldout(cct,10) << __func__ << " " << get_myaddrs() << dendl;
+
+  stack->ready();
+  if (pending_bind && bindv(pending_bind_addrs) != 0) {
+    lderr(cct) << __func__ << " postponed bind failed" << dendl;
+    ceph_abort();
+  }
+
+  Mutex::Locker l(lock);
+  for (auto *proc : processors) {
+    proc->start();
+  }
+  dispatch_queue.start();
+}
+
+// Stop the Processors, mark every connection down, flag the messenger as
+// stopped (signalling stop_cond) and drain the network stack.
+// Always returns 0.
+int AsyncMessenger::shutdown()
+{
+  ldout(cct,10) << __func__ << " " << get_myaddrs() << dendl;
+
+  // done! clean up.
+  for (auto *proc : processors) {
+    proc->stop();
+  }
+  mark_down_all();
+  // break ref cycles on the loopback connection
+  local_connection->set_priv(NULL);
+  local_connection->mark_down();
+  did_bind = false;
+  {
+    Mutex::Locker l(lock);
+    stop_cond.Signal();
+    stopped = true;
+  }
+  stack->drain();
+  return 0;
+}
+
+// Single-address front end for bindv(). A default-constructed address is
+// turned into a legacy-type address of the family selected by ms_bind_ipv6
+// before it is forwarded.
+int AsyncMessenger::bind(const entity_addr_t &bind_addr)
+{
+  ldout(cct,10) << __func__ << " " << bind_addr << dendl;
+  // old bind() can take entity_addr_t().  new bindv() can take a
+  // 0.0.0.0-like address but needs type and family to be set.
+  entity_addr_t a = bind_addr;
+  if (a == entity_addr_t()) {
+    a.set_type(entity_addr_t::TYPE_LEGACY);
+    a.set_family(cct->_conf->ms_bind_ipv6 ? AF_INET6 : AF_INET);
+  }
+  return bindv(entity_addrvec_t(a));
+}
+
+// Bind every Processor to bind_addrs.
+// Returns -1 if the messenger is already started and no bind is pending,
+// 0 if the bind was postponed because the network stack is not ready yet
+// (ready() performs it later) or if it succeeded, and the Processor's error
+// code if the first Processor failed to bind.
+int AsyncMessenger::bindv(const entity_addrvec_t &bind_addrs)
+{
+  {
+    Mutex::Locker l(lock);
+
+    if (!pending_bind && started) {
+      ldout(cct,10) << __func__ << " already started" << dendl;
+      return -1;
+    }
+
+    ldout(cct,10) << __func__ << " " << bind_addrs << dendl;
+
+    if (!stack->is_ready()) {
+      ldout(cct, 10) << __func__ << " Network Stack is not ready for bind yet - postponed" << dendl;
+      pending_bind_addrs = bind_addrs;
+      pending_bind = true;
+      return 0;
+    }
+  }
+
+  // bind to a socket
+  set<int> avoid_ports;
+  entity_addrvec_t bound_addrs;
+  unsigned idx = 0;
+  for (auto *proc : processors) {
+    const int r = proc->bind(bind_addrs, avoid_ports, &bound_addrs);
+    if (r) {
+      // Note: this relates to the local TCP listen table. The posix backend
+      // (default kernel implementation) shares one listen table in the
+      // kernel, so every thread can use it and only one thread needs to
+      // bind. Other backends (such as dpdk) use a local listen table, so
+      // each worker has to bind/listen on the port itself. A failure of the
+      // first worker is treated as an ordinary error (e.g. port in use) and
+      // returned. A failure of a later worker after the first one succeeded
+      // is unexpected, hence the assert.
+      ceph_assert(idx == 0);
+      return r;
+    }
+    ++idx;
+  }
+  _finish_bind(bind_addrs, bound_addrs);
+  return 0;
+}
+
+// Drop all connections and listen sockets, then bind again on new ports.
+// The nonce is bumped so the new addresses are distinct from the old ones.
+// Ports in avoid_ports and the ports currently bound are both excluded from
+// the new bind. On success the Processors are started again.
+//
+// Returns 0 on success or the error from the first Processor's bind.
+int AsyncMessenger::rebind(const set<int>& avoid_ports)
+{
+  ldout(cct,1) << __func__ << " rebind avoid " << avoid_ports << dendl;
+  ceph_assert(did_bind);
+
+  for (auto &&p : processors)
+    p->stop();
+  mark_down_all();
+
+  // adjust the nonce; we want our entity_addr_t to be truly unique.
+  nonce += 1000000;
+  ldout(cct, 10) << __func__ << " new nonce " << nonce
+                 << " and addr " << get_myaddrs() << dendl;
+
+  entity_addrvec_t bound_addrs;
+  entity_addrvec_t bind_addrs = get_myaddrs();
+  // avoid the caller's ports plus every port we are bound to right now
+  set<int> new_avoid(avoid_ports);
+  for (auto& a : bind_addrs.v) {
+    new_avoid.insert(a.get_port());
+    a.set_port(0);
+  }
+  ldout(cct, 10) << __func__ << " will try " << bind_addrs
+                 << " and avoid ports " << new_avoid << dendl;
+  unsigned i = 0;
+  for (auto &&p : processors) {
+    // pass new_avoid (not avoid_ports) so the old ports are really skipped,
+    // matching what the log line above announces
+    int r = p->bind(bind_addrs, new_avoid, &bound_addrs);
+    if (r) {
+      // only the first Processor may fail; see bindv()
+      ceph_assert(i == 0);
+      return r;
+    }
+    ++i;
+  }
+  _finish_bind(bind_addrs, bound_addrs);
+  for (auto &&p : processors) {
+    p->start();
+  }
+  return 0;
+}
+
+// Record bind_addr as our address for client use. This is a no-op returning
+// 0 when ms_bind_before_connect is off or a bind already happened; it
+// returns -1 when the messenger has already been started.
+int AsyncMessenger::client_bind(const entity_addr_t &bind_addr)
+{
+  if (!cct->_conf->ms_bind_before_connect) {
+    return 0;
+  }
+
+  Mutex::Locker l(lock);
+  if (did_bind)
+    return 0;
+
+  if (started) {
+    ldout(cct, 10) << __func__ << " already started" << dendl;
+    return -1;
+  }
+
+  ldout(cct, 10) << __func__ << " " << bind_addr << dendl;
+  set_myaddrs(entity_addrvec_t(bind_addr));
+  return 0;
+}
+
+// Publish the result of a successful bind: install the addresses, stamp
+// them with the current nonce, refresh the loopback connection and set
+// did_bind.
+void AsyncMessenger::_finish_bind(const entity_addrvec_t& bind_addrs,
+                                  const entity_addrvec_t& listen_addrs)
+{
+  set_myaddrs(bind_addrs);
+  // every requested address with a concrete IP counts as a learned address
+  for (const auto& a : bind_addrs.v) {
+    if (a.is_blank_ip())
+      continue;
+    learned_addr(a);
+  }
+
+  // no port on our first address: take the addresses the Processors
+  // actually listen on
+  if (get_myaddrs().front().get_port() == 0) {
+    set_myaddrs(listen_addrs);
+  }
+
+  entity_addrvec_t stamped = *my_addrs;
+  for (auto& a : stamped.v) {
+    a.set_nonce(nonce);
+  }
+  set_myaddrs(stamped);
+
+  init_local_connection();
+
+  ldout(cct,1) << __func__ << " bind my_addrs is " << get_myaddrs() << dendl;
+  did_bind = true;
+}
+
+// Mark the messenger as started (asserting it was not started before).
+// Without a prior bind, our addresses are stamped with the nonce and the
+// loopback connection is refreshed. Always returns 0.
+int AsyncMessenger::start()
+{
+  Mutex::Locker l(lock);
+  ldout(cct,1) << __func__ << " start" << dendl;
+
+  // register at least one entity, first!
+  ceph_assert(my_name.type() >= 0);
+
+  ceph_assert(!started);
+  started = true;
+  stopped = false;
+
+  if (did_bind) {
+    return 0;
+  }
+
+  entity_addrvec_t stamped = *my_addrs;
+  for (auto& a : stamped.v) {
+    a.nonce = nonce;
+  }
+  set_myaddrs(stamped);
+  _init_local_connection();
+  return 0;
+}
+
+// Block until the messenger has been stopped (see shutdown()), then shut
+// down the dispatch queue, close all connections and drain the network
+// stack. Returns immediately if the messenger was never started.
+void AsyncMessenger::wait()
+{
+  lock.Lock();
+  if (!started) {
+    lock.Unlock();
+    return;
+  }
+  // Re-check the predicate after every wakeup: a condition wait may return
+  // without "stopped" having been set, and tearing down in that case would
+  // race with a messenger that is still running.
+  while (!stopped)
+    stop_cond.Wait(lock);
+
+  lock.Unlock();
+
+  dispatch_queue.shutdown();
+  if (dispatch_queue.is_started()) {
+    ldout(cct, 10) << __func__ << ": waiting for dispatch queue" << dendl;
+    dispatch_queue.wait();
+    dispatch_queue.discard_local();
+    ldout(cct, 10) << __func__ << ": dispatch queue is stopped" << dendl;
+  }
+
+  // close all connections
+  shutdown_connections(false);
+  stack->drain();
+
+  ldout(cct, 10) << __func__ << ": done." << dendl;
+  ldout(cct, 1) << __func__ << " complete." << dendl;
+  started = false;
+}
+
+// Wrap a freshly accepted socket in a new AsyncConnection bound to worker
+// w, start its accept handling and track it in accepting_conns. All of this
+// happens under the messenger lock.
+void AsyncMessenger::add_accept(Worker *w, ConnectedSocket cli_socket,
+                                const entity_addr_t &listen_addr,
+                                const entity_addr_t &peer_addr)
+{
+  AsyncConnectionRef conn;
+  {
+    Mutex::Locker l(lock);
+    conn = new AsyncConnection(cct, this, &dispatch_queue, w,
+                               listen_addr.is_msgr2(), false);
+    conn->accept(std::move(cli_socket), listen_addr, peer_addr);
+    accepting_conns.insert(conn);
+  }
+}
+
+// Create an outgoing connection to addrs, start connecting and register it
+// in conns. The caller must hold the messenger lock, and no connection for
+// addrs may be registered yet.
+AsyncConnectionRef AsyncMessenger::create_connect(
+  const entity_addrvec_t& addrs, int type)
+{
+  ceph_assert(lock.is_locked());
+
+  ldout(cct, 10) << __func__ << " " << addrs
+                 << ", creating connection and registering" << dendl;
+
+  // here is where we decide which of the addrs to connect to.  always prefer
+  // the first one, if we support it.
+  // FIXME: for ipv4 vs ipv6, check whether local host can handle ipv6 before
+  // trying it?  for now, just pick whichever is listed first.
+  entity_addr_t target;
+  for (const auto& candidate : addrs.v) {
+    if (candidate.is_msgr2() || candidate.is_legacy()) {
+      target = candidate;
+      break;
+    }
+  }
+
+  // create connection
+  Worker *w = stack->get_worker();
+  AsyncConnectionRef conn = new AsyncConnection(cct, this, &dispatch_queue, w,
+                                                target.is_msgr2(), false);
+  conn->connect(addrs, type, target);
+  ceph_assert(!conns.count(addrs));
+  ldout(cct, 10) << __func__ << " " << conn << " " << addrs << " "
+                 << *conn->peer_addrs << dendl;
+  conns[addrs] = conn;
+  w->get_perf_counter()->inc(l_msgr_active_connections);
+
+  return conn;
+}
+
+
+// Return the connection this messenger uses to talk to itself.
+ConnectionRef AsyncMessenger::get_loopback_connection()
+{
+  ConnectionRef loopback = local_connection;
+  return loopback;
+}
+
+// True if we have not bound, or if one of our bound addresses is a msgr2
+// address.
+bool AsyncMessenger::should_use_msgr2()
+{
+  // if we are bound to v1 only, and we are connecting to a v2 peer,
+  // we cannot use the peer's v2 address. otherwise the connection
+  // is asymmetrical, because they would have to use v1 to connect
+  // to us, and we would use v2, and connection race detection etc
+  // would totally break down (among other things).  or, the other
+  // end will be confused that we advertise ourselves with a v1
+  // address only (that we bound to) but connected with protocol v2.
+  if (!did_bind) {
+    return true;
+  }
+  return get_myaddrs().has_msgr2();
+}
+
+// Return the subset of addrs we may connect to: all of them when msgr2 may
+// be used, otherwise only the non-msgr2 ones. The type argument is only
+// used for logging.
+entity_addrvec_t AsyncMessenger::_filter_addrs(int type,
+                                               const entity_addrvec_t& addrs)
+{
+  if (should_use_msgr2()) {
+    return addrs;
+  }
+
+  ldout(cct, 10) << __func__ << " " << addrs << " type " << type
+                 << " limiting to v1 ()" << dendl;
+  entity_addrvec_t v1_only;
+  for (const auto& a : addrs.v) {
+    if (!a.is_msgr2()) {
+      v1_only.v.push_back(a);
+    }
+  }
+  return v1_only;
+}
+
+// Send message m to the entity of the given type at addrs, reusing a
+// registered connection when there is one. Returns 0 once the message has
+// been handed to submit_message(), or -EINVAL (after dropping the message
+// reference) when addrs is empty.
+int AsyncMessenger::send_to(Message *m, int type, const entity_addrvec_t& addrs)
+{
+  Mutex::Locker l(lock);
+
+  FUNCTRACE(cct);
+  ceph_assert(m);
+
+  const int msg_type = m->get_type();
+  if (msg_type == CEPH_MSG_OSD_OP) {
+    OID_EVENT_TRACE(static_cast<MOSDOp *>(m)->get_oid().name.c_str(), "SEND_MSG_OSD_OP");
+  } else if (msg_type == CEPH_MSG_OSD_OPREPLY) {
+    OID_EVENT_TRACE(static_cast<MOSDOpReply *>(m)->get_oid().name.c_str(), "SEND_MSG_OSD_OP_REPLY");
+  }
+
+  ldout(cct, 1) << __func__ << "--> " << ceph_entity_type_name(type) << " "
+                << addrs << " -- " << *m << " -- ?+"
+                << m->get_data().length() << " " << m << dendl;
+
+  if (addrs.empty()) {
+    ldout(cct,0) << __func__ <<  " message " << *m
+                 << " with empty dest " << addrs << dendl;
+    m->put();
+    return -EINVAL;
+  }
+
+  // restrict to the addresses we may use, then look for a live connection
+  auto av = _filter_addrs(type, addrs);
+  AsyncConnectionRef existing = _lookup_conn(av);
+  submit_message(m, existing, av, type);
+  return 0;
+}
+
+// Return a connection to the entity of the given type at addrs: the
+// loopback connection if addrs refers to ourselves, an already registered
+// connection if one exists, otherwise a newly created one.
+ConnectionRef AsyncMessenger::connect_to(int type, const entity_addrvec_t& addrs)
+{
+  Mutex::Locker l(lock);
+
+  const bool is_local =
+    *my_addrs == addrs ||
+    (addrs.v.size() == 1 && my_addrs->contains(addrs.front()));
+  if (is_local) {
+    // local
+    return local_connection;
+  }
+
+  auto av = _filter_addrs(type, addrs);
+
+  AsyncConnectionRef conn = _lookup_conn(av);
+  if (!conn) {
+    conn = create_connect(av, type);
+    ldout(cct, 10) << __func__ << " " << av << " new " << conn << dendl;
+  } else {
+    ldout(cct, 10) << __func__ << " " << av << " existing " << conn << dendl;
+  }
+
+  return conn;
+}
+
+// Deliver m over con if given; otherwise over the loopback connection when
+// dest_addrs refers to ourselves; otherwise over a newly created
+// connection, unless the policy for dest_type is "server", in which case
+// the message is dropped. With ms_dump_on_send set, the encoded message is
+// hex-dumped to the log first.
+void AsyncMessenger::submit_message(Message *m, AsyncConnectionRef con,
+                                    const entity_addrvec_t& dest_addrs,
+                                    int dest_type)
+{
+  if (cct->_conf->ms_dump_on_send) {
+    m->encode(-1, MSG_CRC_ALL);
+    ldout(cct, 0) << __func__ << " submit_message " << *m << "\n";
+    m->get_payload().hexdump(*_dout);
+    if (m->get_data().length() > 0) {
+      *_dout << " data:\n";
+      m->get_data().hexdump(*_dout);
+    }
+    *_dout << dendl;
+    m->clear_payload();
+  }
+
+  // existing connection?
+  if (con) {
+    con->send_message(m);
+    return;
+  }
+
+  // local?
+  const bool to_self =
+    *my_addrs == dest_addrs ||
+    (dest_addrs.v.size() == 1 && my_addrs->contains(dest_addrs.front()));
+  if (to_self) {
+    local_connection->send_message(m);
+    return;
+  }
+
+  // remote, no existing connection.
+  const Policy& policy = get_policy(dest_type);
+  if (!policy.server) {
+    ldout(cct,20) << __func__ << " " << *m << " remote, " << dest_addrs
+                  << ", new connection." << dendl;
+    con = create_connect(dest_addrs, dest_type);
+    con->send_message(m);
+    return;
+  }
+
+  // server policy: do not open a connection, drop the message
+  ldout(cct, 20) << __func__ << " " << *m << " remote, " << dest_addrs
+                 << ", lossy server for target type "
+                 << ceph_entity_type_name(dest_type) << ", no session, dropping." << dendl;
+  m->put();
+}
+
+/**
+ * If my_addr doesn't have an IP set, this function
+ * will fill it in from the passed addr. Otherwise it does nothing and returns.
+ */
+// For each of our addresses with a blank IP, copy in the first address
+// from addrs of the same family, keeping our own type, port and nonce.
+// Returns true if at least one address was filled in.
+bool AsyncMessenger::set_addr_unknowns(const entity_addrvec_t &addrs)
+{
+  ldout(cct,1) << __func__ << " " << addrs << dendl;
+  bool ret = false;
+  Mutex::Locker l(lock);
+
+  entity_addrvec_t updated = *my_addrs;
+  for (auto& mine : updated.v) {
+    if (!mine.is_blank_ip())
+      continue;
+
+    // these survive the overwrite below
+    const int keep_type = mine.get_type();
+    const int keep_port = mine.get_port();
+    const uint32_t keep_nonce = mine.get_nonce();
+
+    for (const auto& given : addrs.v) {
+      if (mine.get_family() != given.get_family())
+        continue;
+
+      ldout(cct,1) << __func__ << " assuming my addr " << mine
+                   << " matches provided addr " << given << dendl;
+      mine = given;
+      mine.set_nonce(keep_nonce);
+      mine.set_type(keep_type);
+      mine.set_port(keep_port);
+      ret = true;
+      break;
+    }
+  }
+
+  set_myaddrs(updated);
+  if (ret) {
+    _init_local_connection();
+  }
+  ldout(cct,1) << __func__ << " now " << *my_addrs << dendl;
+  return ret;
+}
+
+// Install addrs as our addresses, each stamped with our nonce, and refresh
+// the loopback connection. Runs under the messenger lock.
+void AsyncMessenger::set_addrs(const entity_addrvec_t &addrs)
+{
+  Mutex::Locker l(lock);
+
+  entity_addrvec_t stamped = addrs;
+  for (auto& a : stamped.v) {
+    a.set_nonce(nonce);
+  }
+
+  set_myaddrs(stamped);
+  _init_local_connection();
+}
+
+// Stop every accepting and every registered connection, passing
+// queue_reset on to each, and empty accepting_conns, conns and
+// deleted_conns. Runs under the messenger lock.
+void AsyncMessenger::shutdown_connections(bool queue_reset)
+{
+  ldout(cct,1) << __func__ << " " << dendl;
+  Mutex::Locker guard(lock);
+
+  // connections still in the accept phase
+  for (const auto& accepting : accepting_conns) {
+    AsyncConnectionRef p = accepting;
+    ldout(cct, 5) << __func__ << " accepting_conn " << p.get() << dendl;
+    p->stop(queue_reset);
+  }
+  accepting_conns.clear();
+
+  // registered connections: unregister first, then stop
+  while (!conns.empty()) {
+    auto first = conns.begin();
+    AsyncConnectionRef p = first->second;
+    ldout(cct, 5) << __func__ << " mark down " << first->first << " " << p << dendl;
+    conns.erase(first);
+    p->get_perf_counter()->dec(l_msgr_active_connections);
+    p->stop(queue_reset);
+  }
+
+  // connections queued for lazy deletion
+  {
+    Mutex::Locker dl(deleted_lock);
+    while (!deleted_conns.empty()) {
+      auto first = deleted_conns.begin();
+      AsyncConnectionRef p = *first;
+      ldout(cct, 5) << __func__ << " delete " << p << dendl;
+      deleted_conns.erase(first);
+    }
+  }
+}
+
+// Stop the connection registered for addrs, if there is one. The lookup
+// and the stop happen under the messenger lock.
+void AsyncMessenger::mark_down_addrs(const entity_addrvec_t& addrs)
+{
+  AsyncConnectionRef p;
+  {
+    Mutex::Locker l(lock);
+    p = _lookup_conn(addrs);
+    if (!p) {
+      ldout(cct, 1) << __func__ << " " << addrs << " -- connection dne" << dendl;
+    } else {
+      ldout(cct, 1) << __func__ << " " << addrs << " -- " << p << dendl;
+      p->stop(true);
+    }
+  }
+}
+
+// Protocol version to use with a peer of type peer_type. Peers of our own
+// type use cluster_protocol. For other peers the version is chosen by the
+// peer's type when we are connecting and by our own type otherwise; 0 is
+// returned for types other than OSD, MDS and MON.
+int AsyncMessenger::get_proto_version(int peer_type, bool connect) const
+{
+  const int my_type = my_name.type();
+
+  // set reply protocol version
+  if (peer_type == my_type) {
+    // internal
+    return cluster_protocol;
+  }
+
+  // public
+  const int selector = connect ? peer_type : my_type;
+  if (selector == CEPH_ENTITY_TYPE_OSD)
+    return CEPH_OSDC_PROTOCOL;
+  if (selector == CEPH_ENTITY_TYPE_MDS)
+    return CEPH_MDSC_PROTOCOL;
+  if (selector == CEPH_ENTITY_TYPE_MON)
+    return CEPH_MONC_PROTOCOL;
+  return 0;
+}
+
+// Register an accepted connection under its peer addresses and remove it
+// from accepting_conns. Returns -1 when a different connection is already
+// registered for that peer and is not queued for deletion; otherwise 0.
+int AsyncMessenger::accept_conn(AsyncConnectionRef conn)
+{
+  Mutex::Locker l(lock);
+  auto registered = conns.find(*conn->peer_addrs);
+  if (registered != conns.end()) {
+    AsyncConnectionRef existing = registered->second;
+
+    // lazy delete, see "deleted_conns"
+    // If conn already in, we will return 0
+    Mutex::Locker dl(deleted_lock);
+    const bool was_deleted = deleted_conns.erase(existing) != 0;
+    if (was_deleted) {
+      conns.erase(registered);
+    } else if (conn != existing) {
+      return -1;
+    }
+  }
+  ldout(cct, 10) << __func__ << " " << conn << " " << *conn->peer_addrs << dendl;
+  conns[*conn->peer_addrs] = conn;
+  conn->get_perf_counter()->inc(l_msgr_active_connections);
+  accepting_conns.erase(conn);
+  return 0;
+}
+
+
+// Fill in our own address from the address a peer reports for us. Only the
+// first successful call has an effect: it clears need_addr and returns
+// true; every other call returns false.
+bool AsyncMessenger::learned_addr(const entity_addr_t &peer_addr_for_me)
+{
+  // be careful here: multiple threads may block here, and readers of
+  // my_addr do NOT hold any lock.
+
+  // need_addr only ever goes from true to false, and only while the mutex
+  // is held, so once it reads false there is no need to take the mutex.
+  if (!need_addr)
+    return false;
+
+  std::lock_guard l(lock);
+  if (!need_addr) {
+    // cleared while we were waiting for the mutex
+    return false;
+  }
+
+  if (my_addrs->empty()) {
+    auto a = peer_addr_for_me;
+    a.set_type(entity_addr_t::TYPE_ANY);
+    a.set_nonce(nonce);
+    if (!did_bind) {
+      a.set_port(0);
+    }
+    set_myaddrs(entity_addrvec_t(a));
+    ldout(cct,10) << __func__ << " had no addrs" << dendl;
+  } else {
+    // fix all addrs of the same family, regardless of type (msgr2 vs legacy)
+    entity_addrvec_t updated = *my_addrs;
+    for (auto& mine : updated.v) {
+      if (!mine.is_blank_ip() ||
+          mine.get_family() != peer_addr_for_me.get_family()) {
+        continue;
+      }
+      entity_addr_t t = peer_addr_for_me;
+      if (did_bind) {
+        // keep the type and port we bound with
+        t.set_type(mine.get_type());
+        t.set_port(mine.get_port());
+      } else {
+        t.set_type(entity_addr_t::TYPE_ANY);
+        t.set_port(0);
+      }
+      t.set_nonce(mine.get_nonce());
+      ldout(cct,10) << __func__ << " " << mine << " -> " << t << dendl;
+      mine = t;
+    }
+    set_myaddrs(updated);
+  }
+
+  ldout(cct, 1) << __func__ << " learned my addr " << *my_addrs
+                << " (peer_addr_for_me " << peer_addr_for_me << ")" << dendl;
+  _init_local_connection();
+  need_addr = false;
+  return true;
+}
+
+// Remove every connection queued in deleted_conns from the messenger's
+// bookkeeping (conns, accepting_conns, deleted_conns). Returns the number
+// of connections reaped.
+int AsyncMessenger::reap_dead()
+{
+  ldout(cct, 1) << __func__ << " start" << dendl;
+  int reaped = 0;
+
+  Mutex::Locker l1(lock);
+  Mutex::Locker l2(deleted_lock);
+
+  for (; !deleted_conns.empty(); ++reaped) {
+    auto dead_it = deleted_conns.begin();
+    AsyncConnectionRef dead = *dead_it;
+    ldout(cct, 5) << __func__ << " delete " << dead << dendl;
+    // only unregister the peer entry if it still points at this connection
+    auto registered = conns.find(*dead->peer_addrs);
+    if (registered != conns.end() && registered->second == dead) {
+      conns.erase(registered);
+    }
+    accepting_conns.erase(dead);
+    deleted_conns.erase(dead_it);
+  }
+
+  return reaped;
+}
diff --git a/src/msg/async/AsyncMessenger.h b/src/msg/async/AsyncMessenger.h
new file mode 100644
index 00000000..98bf9d52
--- /dev/null
+++ b/src/msg/async/AsyncMessenger.h
@@ -0,0 +1,426 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_ASYNCMESSENGER_H
+#define CEPH_ASYNCMESSENGER_H
+
+#include <map>
+#include <mutex>
+
+#include "include/types.h"
+#include "include/xlist.h"
+#include "include/spinlock.h"
+#include "include/unordered_map.h"
+#include "include/unordered_set.h"
+
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "common/Thread.h"
+
+#include "msg/SimplePolicyMessenger.h"
+#include "msg/DispatchQueue.h"
+#include "AsyncConnection.h"
+#include "Event.h"
+
+#include "include/ceph_assert.h"
+
+class AsyncMessenger;
+
+/**
+ * If the Messenger binds to a specific address, the Processor runs
+ * and listens for incoming connections.
+ */
+class Processor {
+  AsyncMessenger *msgr;
+  NetHandler net;
+  Worker *worker;
+  /// one listen socket per bound address slot
+  vector<ServerSocket> listen_sockets;
+  /// accept callback; allocated by the constructor, freed by the destructor
+  EventCallbackRef listen_handler;
+
+  class C_processor_accept;
+
+ public:
+  Processor(AsyncMessenger *r, Worker *w, CephContext *c);
+  ~Processor() { delete listen_handler; }
+
+  // listen_handler is a raw owning pointer that is deleted exactly once, so
+  // a copy of a Processor would lead to a double delete.
+  Processor(const Processor&) = delete;
+  Processor& operator=(const Processor&) = delete;
+
+  /// unregister the listen sockets from the event center and abort accept
+  void stop();
+  /// bind/listen on bind_addrs; the addresses used go to *bound_addrs
+  int bind(const entity_addrvec_t &bind_addrs,
+           const set<int>& avoid_ports,
+           entity_addrvec_t* bound_addrs);
+  /// register the listen sockets with the worker's event center
+  void start();
+  /// accept pending connections on all listen sockets
+  void accept();
+};
+
+/*
+ * AsyncMessenger is represented for maintaining a set of asynchronous connections,
+ * it may own a bind address and the accepted connections will be managed by
+ * AsyncMessenger.
+ *
+ */
+
+class AsyncMessenger : public SimplePolicyMessenger {
+ // First we have the public Messenger interface implementation...
+public:
+ /**
+ * Initialize the AsyncMessenger!
+ *
+ * @param cct The CephContext to use
+ * @param name The name to assign ourselves
+ * _nonce A unique ID to use for this AsyncMessenger. It should not
+ * be a value that will be repeated if the daemon restarts.
+ */
+ AsyncMessenger(CephContext *cct, entity_name_t name, const std::string &type,
+ string mname, uint64_t _nonce);
+
+ /**
+ * Destroy the AsyncMessenger. Pretty simple since all the work is done
+ * elsewhere.
+ */
+ ~AsyncMessenger() override;
+
+ /** @defgroup Accessors
+ * @{
+ */
+ bool set_addr_unknowns(const entity_addrvec_t &addr) override;
+ void set_addrs(const entity_addrvec_t &addrs) override;
+
+ int get_dispatch_queue_len() override {
+ return dispatch_queue.get_queue_len();
+ }
+
+ double get_dispatch_queue_max_age(utime_t now) override {
+ return dispatch_queue.get_max_age(now);
+ }
+ /** @} Accessors */
+
+ /**
+ * @defgroup Configuration functions
+ * @{
+ */
+ void set_cluster_protocol(int p) override {
+ ceph_assert(!started && !did_bind);
+ cluster_protocol = p;
+ }
+
+ int bind(const entity_addr_t& bind_addr) override;
+ int rebind(const set<int>& avoid_ports) override;
+ int client_bind(const entity_addr_t& bind_addr) override;
+
+ int bindv(const entity_addrvec_t& bind_addrs) override;
+
+ bool should_use_msgr2() override;
+
+ /** @} Configuration functions */
+
+ /**
+ * @defgroup Startup/Shutdown
+ * @{
+ */
+ int start() override;
+ void wait() override;
+ int shutdown() override;
+
+ /** @} // Startup/Shutdown */
+
+ /**
+ * @defgroup Messaging
+ * @{
+ */
+ int send_to(Message *m, int type, const entity_addrvec_t& addrs) override;
+
+ /** @} // Messaging */
+
+ /**
+ * @defgroup Connection Management
+ * @{
+ */
+ ConnectionRef connect_to(int type,
+ const entity_addrvec_t& addrs) override;
+ ConnectionRef get_loopback_connection() override;
+ void mark_down(const entity_addr_t& addr) override {
+ mark_down_addrs(entity_addrvec_t(addr));
+ }
+ void mark_down_addrs(const entity_addrvec_t& addrs) override;
+ void mark_down_all() override {
+ shutdown_connections(true);
+ }
+ /** @} // Connection Management */
+
+ /**
+ * @defgroup Inner classes
+ * @{
+ */
+
+ /**
+ * @} // Inner classes
+ */
+
+protected:
+ /**
+ * @defgroup Messenger Interfaces
+ * @{
+ */
+ /**
+ * Start up the DispatchQueue thread once we have somebody to dispatch to.
+ */
+ void ready() override;
+ /** @} // Messenger Interfaces */
+
+private:
+
+ /**
+ * @defgroup Utility functions
+ * @{
+ */
+
+ /**
+ * Create a connection associated with the given entity (of the given type).
+ * Initiate the connection. (This function returning does not guarantee
+ * connection success.)
+ *
+ * @param addrs The address(es) of the entity to connect to.
+ * @param type The peer type of the entity at the address.
+ *
+ * @return a pointer to the newly-created connection. Caller does not own a
+ * reference; take one if you need it.
+ */
+ AsyncConnectionRef create_connect(const entity_addrvec_t& addrs, int type);
+
+ /**
+ * Queue up a Message for delivery to the entity specified
+ * by addr and dest_type.
+ * submit_message() is responsible for creating
+ * new AsyncConnection (and closing old ones) as necessary.
+ *
+ * @param m The Message to queue up. This function eats a reference.
+ * @param con The existing Connection to use, or NULL if you don't know of one.
+ * @param dest_addr The address to send the Message to.
+ * @param dest_type The peer type of the address we're sending to
+ * just drop silently under failure.
+ */
+ void submit_message(Message *m, AsyncConnectionRef con,
+ const entity_addrvec_t& dest_addrs, int dest_type);
+
+ void _finish_bind(const entity_addrvec_t& bind_addrs,
+ const entity_addrvec_t& listen_addrs);
+
+ entity_addrvec_t _filter_addrs(int type,
+ const entity_addrvec_t& addrs);
+
+ private:
+ static const uint64_t ReapDeadConnectionThreshold = 5;
+
+ NetworkStack *stack;
+ std::vector<Processor*> processors;
+ friend class Processor;
+ DispatchQueue dispatch_queue;
+
+ // the worker run messenger's cron jobs
+ Worker *local_worker;
+
+ std::string ms_type;
+
+ /// overall lock used for AsyncMessenger data structures
+ Mutex lock;
+ // AsyncMessenger stuff
+ /// approximately unique ID set by the Constructor for use in entity_addr_t
+ uint64_t nonce;
+
+ /// true, specifying we haven't learned our addr; set false when we find it.
+ // maybe this should be protected by the lock?
+ bool need_addr;
+
+ /**
+ * set to bind addresses if bind was called before NetworkStack was ready to
+ * bind
+ */
+ entity_addrvec_t pending_bind_addrs;
+
+ /**
+ * false; set to true if a pending bind exists
+ */
+ bool pending_bind = false;
+
+ /**
+ * The following aren't lock-protected since you shouldn't be able to race
+ * the only writers.
+ */
+
+ /**
+ * false; set to true if the AsyncMessenger bound to a specific address;
+ * and set false again by Accepter::stop().
+ */
+ bool did_bind;
+ /// counter for the global seq our connection protocol uses
+ __u32 global_seq;
+ /// lock to protect the global_seq
+ ceph::spinlock global_seq_lock;
+
+ /**
+ * hash map of addresses to AsyncConnection
+ *
+ * NOTE: an AsyncConnection* with state CLOSED may still be in the map but is
+ * considered invalid and can be replaced by anyone holding the msgr lock
+ */
+ ceph::unordered_map<entity_addrvec_t, AsyncConnectionRef> conns;
+
+ /**
+ * set of connections that are in the process of being accepted
+ *
+ * These are not yet in the conns map.
+ */
+ set<AsyncConnectionRef> accepting_conns;
+
+ /**
+ * set of closed connections that still need to be cleaned up
+ *
+ * AsyncMessenger and AsyncConnection follow a lock-ordering rule:
+ * AsyncMessenger::lock may be taken first and AsyncConnection::lock second,
+ * but never the other way around. This rule exists to avoid deadlock.
+ * So when an AsyncConnection wants to unregister itself from the
+ * AsyncMessenger, it just queues itself in this set and is deleted lazily.
+ * "_lookup_conn" must never return an AsyncConnection that is in this
+ * set.
+ */
+ Mutex deleted_lock;
+ set<AsyncConnectionRef> deleted_conns;
+
+ EventCallbackRef reap_handler;
+
+ /// internal cluster protocol version, if any, for talking to entities of the same type.
+ int cluster_protocol;
+
+ Cond stop_cond;
+ bool stopped;
+
+ /**
+  * Look up the connection registered in `conns` under address vector `k`.
+  *
+  * The caller must already hold `lock`. A connection that has been queued
+  * in `deleted_conns` is treated as gone: it is dropped from both
+  * containers and NULL is returned.
+  *
+  * @param k The address vector used as the key into `conns`.
+  * @return the registered connection, or NULL if there is none (or it was
+  * pending lazy deletion).
+  */
+ AsyncConnectionRef _lookup_conn(const entity_addrvec_t& k) {
+   ceph_assert(lock.is_locked());
+
+   auto found = conns.find(k);
+   if (found == conns.end()) {
+     return NULL;
+   }
+
+   // lazy delete, see "deleted_conns": erase() reports how many entries it
+   // removed, so a non-zero result means this connection was awaiting reaping.
+   Mutex::Locker deleted_guard(deleted_lock);
+   const bool pending_delete = deleted_conns.erase(found->second) != 0;
+   if (!pending_delete) {
+     return found->second;
+   }
+
+   conns.erase(found);
+   return NULL;
+ }
+
+ /**
+ * Fill in the loopback connection from this messenger's own identity and
+ * announce it to the dispatchers. The caller must hold `lock`.
+ */
+ void _init_local_connection() {
+ ceph_assert(lock.is_locked());
+ // the "peer" of the local connection is ourselves
+ local_connection->peer_addrs = *my_addrs;
+ local_connection->peer_type = my_name.type();
+ local_connection->set_features(CEPH_FEATURES_ALL);
+ ms_deliver_handle_fast_connect(local_connection.get());
+ }
+
+ void shutdown_connections(bool queue_reset);
+
+public:
+
+ /// con used for sending messages to ourselves
+ AsyncConnectionRef local_connection;
+
+ /**
+ * @defgroup AsyncMessenger internals
+ * @{
+ */
+ /**
+ * This wraps _lookup_conn: it takes `lock` for the duration of the lookup,
+ * so it must not be called by a thread that already holds `lock`.
+ *
+ * @param k The address vector to look up.
+ * @return whatever _lookup_conn returns for `k` (NULL when not found).
+ */
+ AsyncConnectionRef lookup_conn(const entity_addrvec_t& k) {
+ Mutex::Locker l(lock);
+ return _lookup_conn(k);
+ }
+
+ int accept_conn(AsyncConnectionRef conn);
+ bool learned_addr(const entity_addr_t &peer_addr_for_me);
+ void add_accept(Worker *w, ConnectedSocket cli_socket,
+ const entity_addr_t &listen_addr,
+ const entity_addr_t &peer_addr);
+ /// @return the NetworkStack pointer stored in this messenger (not owned by the caller).
+ NetworkStack *get_stack() {
+ return stack;
+ }
+
+ /// @return the nonce stored in this messenger.
+ uint64_t get_nonce() const {
+ return nonce;
+ }
+
+ /**
+ * Increment the global sequence for this AsyncMessenger and return it.
+ * This is for the connect protocol, although it doesn't hurt if somebody
+ * else calls it.
+ *
+ * @return a global sequence ID that nobody else has seen.
+ */
+ __u32 get_global_seq(__u32 old=0) {
+   std::lock_guard<ceph::spinlock> seq_guard(global_seq_lock);
+
+   // Catch up first if the caller has already seen a larger value, so the
+   // value handed out below is strictly greater than `old`.
+   if (global_seq < old) {
+     global_seq = old;
+   }
+
+   return ++global_seq;
+ }
+ /**
+ * Get the protocol version we support for the given peer type: either
+ * a peer protocol (if it matches our own), the protocol version for the
+ * peer (if we're connecting), or our protocol version (if we're accepting).
+ */
+ int get_proto_version(int peer_type, bool connect) const;
+
+ /**
+ * Fill in the address and peer type for the local connection, which
+ * is used for delivering messages back to ourself.
+ */
+ void init_local_connection() {
+ // locked wrapper: _init_local_connection() asserts that `lock` is held
+ Mutex::Locker l(lock);
+ _init_local_connection();
+ }
+
+ /**
+ * Unregister connection from `conns`
+ *
+ * See "deleted_conns"
+ */
+ void unregister_conn(AsyncConnectionRef conn) {
+ // Only deleted_lock is taken here, not `lock`; the entry in `conns` is
+ // removed later (see _lookup_conn and reap_dead).
+ Mutex::Locker l(deleted_lock);
+ // must run before the std::move below, which leaves `conn` moved-from
+ conn->get_perf_counter()->dec(l_msgr_active_connections);
+ deleted_conns.emplace(std::move(conn));
+
+ // once enough dead connections have piled up, ask the local worker's
+ // event center to run the reap handler
+ if (deleted_conns.size() >= ReapDeadConnectionThreshold) {
+ local_worker->center.dispatch_event_external(reap_handler);
+ }
+ }
+
+ /**
+ * Reap dead connection from `deleted_conns`
+ *
+ * @return the number of dead connections
+ *
+ * See "deleted_conns"
+ */
+ int reap_dead();
+
+ /**
+ * @} // AsyncMessenger Internals
+ */
+} ;
+
+#endif /* CEPH_ASYNCMESSENGER_H */
diff --git a/src/msg/async/Event.cc b/src/msg/async/Event.cc
new file mode 100644
index 00000000..6b5e4c7c
--- /dev/null
+++ b/src/msg/async/Event.cc
@@ -0,0 +1,471 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "include/compat.h"
+#include "common/errno.h"
+#include "Event.h"
+
+#ifdef HAVE_DPDK
+#include "dpdk/EventDPDK.h"
+#endif
+
+#ifdef HAVE_EPOLL
+#include "EventEpoll.h"
+#else
+#ifdef HAVE_KQUEUE
+#include "EventKqueue.h"
+#else
+#include "EventSelect.h"
+#endif
+#endif
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "EventCallback "
+/**
+ * Callback attached to the read end of an EventCenter's notify pipe.
+ *
+ * Its only job is to drain whatever bytes wakeup() wrote so the pipe does not
+ * fill up; the bytes themselves carry no information.
+ */
+class C_handle_notify : public EventCallback {
+  EventCenter *center;
+  CephContext *cct;
+
+ public:
+  C_handle_notify(EventCenter *c, CephContext *cc): center(c), cct(cc) {}
+
+  /**
+   * Read from the notify pipe until a read returns no data or fails.
+   *
+   * @param fd_or_id the file descriptor to drain (the notify receive fd).
+   */
+  void do_request(uint64_t fd_or_id) override {
+    char c[256];
+    int r = 0;
+    do {
+      r = read(fd_or_id, c, sizeof(c));
+      if (r < 0) {
+        // Capture errno right away: the logging below may itself make
+        // library calls that overwrite it before it is formatted.
+        const int e = errno;
+        if (e != EAGAIN)
+          ldout(cct, 1) << __func__ << " read notify pipe failed: " << cpp_strerror(e) << dendl;
+      }
+    } while (r > 0);
+  }
+};
+
+#undef dout_prefix
+#define dout_prefix _event_prefix(_dout)
+
+/**
+ * Construct a Poller and register it with its EventCenter.
+ *
+ * \param center
+ * EventCenter object through which the poller will be invoked. The new
+ * poller is appended to that center's list of pollers.
+ * \param name
+ * Human readable name that can be printed out in debugging messages
+ * about the poller. The name of the superclass is probably sufficient
+ * for most cases.
+ */
+EventCenter::Poller::Poller(EventCenter* center, const string& name)
+ : owner(center), poller_name(name), slot(owner->pollers.size())
+{
+ // slot was initialized to the current size, i.e. the index this push_back
+ // is about to occupy
+ owner->pollers.push_back(this);
+}
+
+/**
+ * Destroy a Poller, removing it from its owner's list of pollers.
+ */
+EventCenter::Poller::~Poller()
+{
+ // Erase this Poller from the vector by overwriting it with the
+ // poller that used to be the last one in the vector.
+ //
+ // Note: this approach is reentrant (it is safe to delete a
+ // poller from a poller callback, which means that the poll
+ // method is in the middle of scanning the list of all pollers;
+ // the worst that will happen is that the poller that got moved
+ // may not be invoked in the current scan).
+ owner->pollers[slot] = owner->pollers.back();
+ // the moved poller now lives at our old index; keep its slot in sync
+ owner->pollers[slot]->slot = slot;
+ owner->pollers.pop_back();
+ // -1 marks this poller as no longer registered
+ slot = -1;
+}
+
+/**
+ * Write this center's log prefix (its address, nevent and the next time
+ * event id) to *_dout. Used through the dout_prefix macro defined above.
+ *
+ * \return the stream that was written to, so further output can be chained.
+ */
+ostream& EventCenter::_event_prefix(std::ostream *_dout)
+{
+ return *_dout << "Event(" << this << " nevent=" << nevent
+ << " time_id=" << time_event_next_id << ").";
+}
+
+/**
+ * One-time setup of this center: pick and initialize the event driver, size
+ * the file event table and, if the driver needs it, create the notify pipe.
+ *
+ * \param n
+ *      Initial number of file event slots.
+ * \param i
+ *      Index of this center; stored in idx.
+ * \param t
+ *      Stack type. "dpdk" selects the DPDK driver (only when built with
+ *      HAVE_DPDK); anything else selects epoll, kqueue or select depending
+ *      on the build.
+ * \return
+ *      0 on success, a negative value on failure.
+ */
+int EventCenter::init(int n, unsigned i, const std::string &t)
+{
+  // a center may only be initialized once
+  ceph_assert(nevent == 0);
+
+  idx = i;
+  type = t;
+
+  const bool want_dpdk = (t == "dpdk");
+  if (want_dpdk) {
+#if defined(HAVE_DPDK)
+    driver = new DPDKDriver(cct);
+#endif
+  } else {
+#if defined(HAVE_EPOLL)
+    driver = new EpollDriver(cct);
+#elif defined(HAVE_KQUEUE)
+    driver = new KqueueDriver(cct);
+#else
+    driver = new SelectDriver(cct);
+#endif
+  }
+
+  // driver is still unset when "dpdk" was requested without DPDK support
+  if (driver == nullptr) {
+    lderr(cct) << __func__ << " failed to create event driver " << dendl;
+    return -1;
+  }
+
+  int rc = driver->init(this, n);
+  if (rc < 0) {
+    lderr(cct) << __func__ << " failed to init event driver." << dendl;
+    return rc;
+  }
+
+  file_events.resize(n);
+  nevent = n;
+
+  // the notify pipe only exists for drivers that need an explicit wakeup
+  if (!driver->need_wakeup())
+    return 0;
+
+  int pipe_fds[2];
+  if (pipe_cloexec(pipe_fds) < 0) {
+    int e = errno;
+    lderr(cct) << __func__ << " can't create notify pipe: " << cpp_strerror(e) << dendl;
+    return -e;
+  }
+  notify_receive_fd = pipe_fds[0];
+  notify_send_fd = pipe_fds[1];
+
+  // both ends must be non-blocking; stop at the first failure
+  rc = net.set_nonblock(notify_receive_fd);
+  if (rc >= 0)
+    rc = net.set_nonblock(notify_send_fd);
+
+  return rc;
+}
+
+/**
+ * Tear down the center: run any external events that are still queued,
+ * drop pending time events, close the notify pipe and free the driver and
+ * notify handler.
+ */
+EventCenter::~EventCenter()
+{
+  {
+    std::lock_guard<std::mutex> guard(external_lock);
+    // every queued external event still gets its do_request() call
+    for (; !external_events.empty(); external_events.pop_front()) {
+      EventCallbackRef pending = external_events.front();
+      if (pending)
+        pending->do_request(0);
+    }
+  }
+  time_events.clear();
+  //assert(time_events.empty());
+
+  // -1 means the pipe end was never created
+  if (notify_receive_fd >= 0)
+    ::close(notify_receive_fd);
+  if (notify_send_fd >= 0)
+    ::close(notify_send_fd);
+
+  delete driver;
+  // deleting a null pointer is a no-op, so no guard is needed
+  delete notify_handler;
+}
+
+
+/**
+ * Record the calling thread as this center's owner.
+ *
+ * On the first call it also publishes this center in the per-type
+ * AssociatedCenters singleton and, for drivers that need a wakeup, registers
+ * the notify pipe's read end as a readable file event.
+ */
+void EventCenter::set_owner()
+{
+  owner = pthread_self();
+  ldout(cct, 2) << __func__ << " idx=" << idx << " owner=" << owner << dendl;
+
+  // everything below is one-time registration
+  if (global_centers)
+    return;
+
+  global_centers = &cct->lookup_or_create_singleton_object<
+    EventCenter::AssociatedCenters>(
+      "AsyncMessenger::EventCenter::global_center::" + type, true);
+  ceph_assert(global_centers);
+  global_centers->centers[idx] = this;
+
+  if (!driver->need_wakeup())
+    return;
+
+  notify_handler = new C_handle_notify(this, cct);
+  int r = create_file_event(notify_receive_fd, EVENT_READABLE, notify_handler);
+  ceph_assert(r == 0);
+}
+
+/**
+ * Register interest in `mask` (EVENT_READABLE and/or EVENT_WRITABLE) on `fd`
+ * and remember `ctxt` as the callback for each requested direction.
+ *
+ * Must be called from the owner thread. The file event table is grown when
+ * `fd` does not fit in it.
+ *
+ * \return 0 on success (including when `mask` equals the mask already
+ * registered), -ERANGE if the driver refuses to grow its event table.
+ * A failing add_event() aborts the process.
+ */
+int EventCenter::create_file_event(int fd, int mask, EventCallbackRef ctxt)
+{
+ ceph_assert(in_thread());
+ int r = 0;
+ if (fd >= nevent) {
+ // grow by factors of four until fd fits
+ int new_size = nevent << 2;
+ while (fd >= new_size)
+ new_size <<= 2;
+ ldout(cct, 20) << __func__ << " event count exceed " << nevent << ", expand to " << new_size << dendl;
+ r = driver->resize_events(new_size);
+ if (r < 0) {
+ lderr(cct) << __func__ << " event count is exceed." << dendl;
+ return -ERANGE;
+ }
+ file_events.resize(new_size);
+ nevent = new_size;
+ }
+
+ EventCenter::FileEvent *event = _get_file_event(fd);
+ ldout(cct, 20) << __func__ << " create event started fd=" << fd << " mask=" << mask
+ << " original mask is " << event->mask << dendl;
+ // nothing to do when exactly this mask is already registered; note that
+ // ctxt is NOT stored in that case
+ if (event->mask == mask)
+ return 0;
+
+ r = driver->add_event(fd, event->mask, mask);
+ if (r < 0) {
+ // Actually we don't allow any failed error code, caller doesn't prepare to
+ // handle error status. So now we need to assert failure here. In practice,
+ // add_event shouldn't report error, otherwise it must be a innermost bug!
+ lderr(cct) << __func__ << " add event failed, ret=" << r << " fd=" << fd
+ << " mask=" << mask << " original mask is " << event->mask << dendl;
+ ceph_abort_msg("BUG!");
+ return r;
+ }
+
+ event->mask |= mask;
+ if (mask & EVENT_READABLE) {
+ event->read_cb = ctxt;
+ }
+ if (mask & EVENT_WRITABLE) {
+ event->write_cb = ctxt;
+ }
+ // event->mask has already been updated here, so the value logged after
+ // "original mask is" is the new combined mask
+ ldout(cct, 20) << __func__ << " create event end fd=" << fd << " mask=" << mask
+ << " original mask is " << event->mask << dendl;
+ return 0;
+}
+
+/**
+ * Stop watching `fd` for the directions in `mask` and clear the matching
+ * callbacks.
+ *
+ * Must be called from the owner thread with a non-negative fd. An fd outside
+ * the event table, or one with nothing registered, is ignored. A failing
+ * del_event() aborts the process (see create_file_event).
+ */
+void EventCenter::delete_file_event(int fd, int mask)
+{
+  ceph_assert(in_thread() && fd >= 0);
+  if (fd >= nevent) {
+    ldout(cct, 1) << __func__ << " delete event fd=" << fd << " is equal or greater than nevent=" << nevent
+                  << " mask=" << mask << dendl;
+    return ;
+  }
+  EventCenter::FileEvent *event = _get_file_event(fd);
+  ldout(cct, 30) << __func__ << " delete event started fd=" << fd << " mask=" << mask
+                 << " original mask is " << event->mask << dendl;
+  if (!event->mask)
+    return ;
+
+  int r = driver->del_event(fd, event->mask, mask);
+  if (r < 0) {
+    // see create_file_event
+    ceph_abort_msg("BUG!");
+  }
+
+  if ((mask & EVENT_READABLE) && event->read_cb) {
+    event->read_cb = nullptr;
+  }
+  if ((mask & EVENT_WRITABLE) && event->write_cb) {
+    event->write_cb = nullptr;
+  }
+
+  event->mask = event->mask & (~mask);
+  // event->mask has already been reduced here, so the value logged after
+  // "original mask is" is the remaining mask
+  ldout(cct, 30) << __func__ << " delete event end fd=" << fd << " mask=" << mask
+                 << " original mask is " << event->mask << dendl;
+}
+
+/**
+ * Schedule `ctxt` to run once, `microseconds` from now.
+ *
+ * Must be called from the owner thread.
+ *
+ * \return the id of the new time event; pass it to delete_time_event() to
+ * cancel.
+ */
+uint64_t EventCenter::create_time_event(uint64_t microseconds, EventCallbackRef ctxt)
+{
+  ceph_assert(in_thread());
+  const uint64_t id = time_event_next_id++;
+
+  ldout(cct, 30) << __func__ << " id=" << id << " trigger after " << microseconds << "us"<< dendl;
+
+  EventCenter::TimeEvent timer;
+  const clock_type::time_point deadline =
+    clock_type::now() + std::chrono::microseconds(microseconds);
+  timer.id = id;
+  timer.time_cb = ctxt;
+
+  // time_events is ordered by deadline; event_map lets us find the entry
+  // again by id
+  auto pos = time_events.emplace(deadline, timer);
+  event_map[id] = pos;
+
+  return id;
+}
+
+/**
+ * Cancel the time event with the given id, if it is still pending.
+ *
+ * Must be called from the owner thread. Ids that were never handed out
+ * (0, or not yet issued) and ids that are no longer pending are ignored.
+ */
+void EventCenter::delete_time_event(uint64_t id)
+{
+  ceph_assert(in_thread());
+  ldout(cct, 30) << __func__ << " id=" << id << dendl;
+
+  const bool was_issued = (id != 0) && (id < time_event_next_id);
+  if (!was_issued)
+    return ;
+
+  auto entry = event_map.find(id);
+  if (entry != event_map.end()) {
+    time_events.erase(entry->second);
+    event_map.erase(entry);
+    return ;
+  }
+
+  ldout(cct, 10) << __func__ << " id=" << id << " not found" << dendl;
+}
+
+/**
+ * Wake the event loop by writing one byte to the notify pipe.
+ *
+ * Does nothing when pollers are registered or the driver needs no wakeup.
+ * Any write failure other than EAGAIN aborts the process.
+ */
+void EventCenter::wakeup()
+{
+  // No need to wake up since we never sleep
+  const bool never_sleeps = !pollers.empty() || !driver->need_wakeup();
+  if (never_sleeps)
+    return ;
+
+  ldout(cct, 20) << __func__ << dendl;
+
+  // wake up "event_wait"
+  char token = 'c';
+  const int written = write(notify_send_fd, &token, sizeof(token));
+  if (written >= 0 || errno == EAGAIN)
+    return ;
+
+  ldout(cct, 1) << __func__ << " write notify pipe failed: " << cpp_strerror(errno) << dendl;
+  ceph_abort();
+}
+
+/**
+ * Run every time event whose deadline is at or before "now".
+ *
+ * "now" is sampled once on entry. Each due event is removed from
+ * time_events and event_map before its callback runs.
+ *
+ * \return the number of time events processed.
+ */
+int EventCenter::process_time_events()
+{
+  int fired = 0;
+  const clock_type::time_point now = clock_type::now();
+  ldout(cct, 30) << __func__ << " cur time is " << now << dendl;
+
+  // time_events is ordered by deadline, so only the head needs checking;
+  // re-read begin() each round because the entry is erased below
+  for (auto head = time_events.begin();
+       head != time_events.end() && now >= head->first;
+       head = time_events.begin()) {
+    // copy what we need out of the entry before erasing it
+    const uint64_t id = head->second.id;
+    EventCallbackRef cb = head->second.time_cb;
+    time_events.erase(head);
+    event_map.erase(id);
+    ldout(cct, 30) << __func__ << " process time event: id=" << id << dendl;
+    ++fired;
+    cb->do_request(id);
+  }
+
+  return fired;
+}
+
+/**
+ * Run one iteration of the event loop.
+ *
+ * Waits in the driver for file events (without blocking if pollers or
+ * external events exist), then runs, in order: fired file event callbacks,
+ * due time events, queued external events and, if nothing else happened on
+ * a non-blocking pass, the pollers.
+ *
+ * \param timeout_microseconds
+ *      Upper bound on how long to block in the driver; shortened when a
+ *      time event is due sooner.
+ * \param working_dur
+ *      If non-null, receives the time spent after the driver wait returned.
+ * \return
+ *      Number of fired file events plus processed time and external events
+ *      plus the sum of the pollers' return values.
+ */
+int EventCenter::process_events(unsigned timeout_microseconds, ceph::timespan *working_dur)
+{
+  struct timeval tv;
+  int numevents;
+  bool trigger_time = false;
+  auto now = clock_type::now();
+
+  auto it = time_events.begin();
+  bool blocking = pollers.empty() && !external_num_events.load();
+  // If exists external events or poller, don't block
+  if (!blocking) {
+    if (it != time_events.end() && now >= it->first)
+      trigger_time = true;
+    tv.tv_sec = 0;
+    tv.tv_usec = 0;
+  } else {
+    clock_type::time_point shortest;
+    shortest = now + std::chrono::microseconds(timeout_microseconds);
+
+    // wake up no later than the earliest pending time event
+    if (it != time_events.end() && shortest >= it->first) {
+      ldout(cct, 30) << __func__ << " shortest is " << shortest << " it->first is " << it->first << dendl;
+      shortest = it->first;
+      trigger_time = true;
+      if (shortest > now) {
+        timeout_microseconds = std::chrono::duration_cast<std::chrono::microseconds>(
+            shortest - now).count();
+      } else {
+        shortest = now;
+        timeout_microseconds = 0;
+      }
+    }
+    tv.tv_sec = timeout_microseconds / 1000000;
+    tv.tv_usec = timeout_microseconds % 1000000;
+  }
+
+  ldout(cct, 30) << __func__ << " wait second " << tv.tv_sec << " usec " << tv.tv_usec << dendl;
+  vector<FiredFileEvent> fired_events;
+  numevents = driver->event_wait(fired_events, &tv);
+  auto working_start = ceph::mono_clock::now();
+  for (int j = 0; j < numevents; j++) {
+    int rfired = 0;
+    FileEvent *event;
+    EventCallbackRef cb;
+    event = _get_file_event(fired_events[j].fd);
+
+    /* note the event->mask & mask & ... code: maybe an already processed
+     * event removed an element that fired and we still didn't
+     * processed, so we check if the event is still valid. */
+    if (event->mask & fired_events[j].mask & EVENT_READABLE) {
+      rfired = 1;
+      cb = event->read_cb;
+      cb->do_request(fired_events[j].fd);
+      // The callback may have called create_file_event() for a new fd and
+      // grown file_events, which can reallocate the vector and leave
+      // `event` dangling. Re-fetch it before touching it again.
+      event = _get_file_event(fired_events[j].fd);
+    }
+
+    if (event->mask & fired_events[j].mask & EVENT_WRITABLE) {
+      // don't run the same callback twice when it serves both directions
+      if (!rfired || event->read_cb != event->write_cb) {
+        cb = event->write_cb;
+        cb->do_request(fired_events[j].fd);
+      }
+    }
+
+    ldout(cct, 30) << __func__ << " event_wq process is " << fired_events[j].fd << " mask is " << fired_events[j].mask << dendl;
+  }
+
+  if (trigger_time)
+    numevents += process_time_events();
+
+  if (external_num_events.load()) {
+    // take the whole queue under the lock, then run it unlocked
+    external_lock.lock();
+    deque<EventCallbackRef> cur_process;
+    cur_process.swap(external_events);
+    external_num_events.store(0);
+    external_lock.unlock();
+    numevents += cur_process.size();
+    while (!cur_process.empty()) {
+      EventCallbackRef e = cur_process.front();
+      ldout(cct, 30) << __func__ << " do " << e << dendl;
+      e->do_request(0);
+      cur_process.pop_front();
+    }
+  }
+
+  if (!numevents && !blocking) {
+    for (uint32_t i = 0; i < pollers.size(); i++)
+      numevents += pollers[i]->poll();
+  }
+
+  if (working_dur)
+    *working_dur = ceph::mono_clock::now() - working_start;
+  return numevents;
+}
+
+/**
+ * Queue `e` to be run by the event loop; may be called from any thread.
+ *
+ * If `e` is already the most recently queued external event it is not queued
+ * again. When the queue goes from empty to one entry and the caller is not
+ * the owner thread, the loop is woken up.
+ */
+void EventCenter::dispatch_event_external(EventCallbackRef e)
+{
+  uint64_t pending = 0;
+  {
+    std::lock_guard<std::mutex> guard(external_lock);
+    // only the tail is checked, so this suppresses back-to-back duplicates
+    const bool is_duplicate =
+      external_num_events > 0 && external_events.back() == e;
+    if (is_duplicate) {
+      return;
+    }
+    external_events.push_back(e);
+    pending = ++external_num_events;
+  }
+
+  // a non-empty queue has already triggered a wakeup
+  if (pending == 1 && !in_thread())
+    wakeup();
+
+  ldout(cct, 30) << __func__ << " " << e << " pending " << pending << dendl;
+}
diff --git a/src/msg/async/Event.h b/src/msg/async/Event.h
new file mode 100644
index 00000000..6736060e
--- /dev/null
+++ b/src/msg/async/Event.h
@@ -0,0 +1,266 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_EVENT_H
+#define CEPH_MSG_EVENT_H
+
+#ifdef __APPLE__
+#include <AvailabilityMacros.h>
+#endif
+
+// We use epoll, kqueue, evport, select in descending order by performance.
+#if defined(__linux__)
+#define HAVE_EPOLL 1
+#endif
+
+#if (defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined (__NetBSD__)
+#define HAVE_KQUEUE 1
+#endif
+
+#ifdef __sun
+#include <sys/feature_tests.h>
+#ifdef _DTRACE_VERSION
+#define HAVE_EVPORT 1
+#endif
+#endif
+
+#include <atomic>
+#include <mutex>
+#include <condition_variable>
+
+#include "common/ceph_time.h"
+#include "common/dout.h"
+#include "net_handler.h"
+
+#define EVENT_NONE 0
+#define EVENT_READABLE 1
+#define EVENT_WRITABLE 2
+
+class EventCenter;
+
+/**
+ * Interface for everything an EventCenter can invoke: file event, time
+ * event and external event handlers all derive from it.
+ */
+class EventCallback {
+
+ public:
+ /// Invoked when the event fires. The argument is the fd for file events,
+ /// the time event id for time events, and 0 for external events.
+ virtual void do_request(uint64_t fd_or_id) = 0;
+ virtual ~EventCallback() {} // we want a virtual destructor!!!
+};
+
+typedef EventCallback* EventCallbackRef;
+
+/// One ready file descriptor as reported by EventDriver::event_wait().
+struct FiredFileEvent {
+ int fd; // descriptor that became ready
+ int mask; // combination of EVENT_READABLE / EVENT_WRITABLE that fired
+};
+
+/*
+ * EventDriver wraps the event notification mechanism of the underlying OS.
+ * For example, Linux uses epoll(2), the BSDs use kqueue(2), and select is
+ * the fallback when neither is available.
+ */
+class EventDriver {
+ public:
+ virtual ~EventDriver() {} // we want a virtual destructor!!!
+ // one-time setup; nevent is the initial number of event slots
+ virtual int init(EventCenter *center, int nevent) = 0;
+ // start watching fd for `mask`; cur_mask is what is already registered
+ virtual int add_event(int fd, int cur_mask, int mask) = 0;
+ // stop watching fd for del_mask; cur_mask is what is currently registered
+ virtual int del_event(int fd, int cur_mask, int del_mask) = 0;
+ // wait up to *tp for events, fill fired_events, return how many fired
+ virtual int event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tp) = 0;
+ // called when EventCenter grows its file event table to newsize
+ virtual int resize_events(int newsize) = 0;
+ // true if EventCenter must set up a notify pipe to interrupt event_wait()
+ virtual bool need_wakeup() { return true; }
+};
+
+/*
+ * EventCenter maintains a set of file descriptors and handles the events
+ * registered on them, together with time events, externally submitted events
+ * and pollers.
+ */
+class EventCenter {
+ public:
+  // should be enough;
+  static const int MAX_EVENTCENTER = 24;
+
+ private:
+  using clock_type = ceph::coarse_mono_clock;
+
+  // Table of all centers of one stack type, indexed by center idx.
+  struct AssociatedCenters {
+    EventCenter *centers[MAX_EVENTCENTER];
+    AssociatedCenters() {
+      // FIPS zeroization audit 20191115: this memset is not security related.
+      memset(centers, 0, MAX_EVENTCENTER * sizeof(EventCenter*));
+    }
+  };
+
+  // Per-fd registration: which directions are watched and who to call.
+  struct FileEvent {
+    int mask;
+    EventCallbackRef read_cb;
+    EventCallbackRef write_cb;
+    FileEvent(): mask(0), read_cb(NULL), write_cb(NULL) {}
+  };
+
+  // A pending one-shot timer.
+  struct TimeEvent {
+    uint64_t id;
+    EventCallbackRef time_cb;
+
+    TimeEvent(): id(0), time_cb(NULL) {}
+  };
+
+ public:
+  /**
+   * A Poller object is invoked once each time through the dispatcher's
+   * inner polling loop.
+   */
+  class Poller {
+   public:
+    explicit Poller(EventCenter* center, const string& pollerName);
+    virtual ~Poller();
+
+    /**
+     * This method is defined by a subclass and invoked once by the
+     * center during each pass through its inner polling loop.
+     *
+     * \return
+     *      1 means that this poller did useful work during this call.
+     *      0 means that the poller found no work to do.
+     */
+    virtual int poll() = 0;
+
+   private:
+    /// The EventCenter object that owns this Poller.  NULL means the
+    /// EventCenter has been deleted.
+    EventCenter* owner;
+
+    /// Human-readable string name given to the poller to make it
+    /// easy to identify for debugging. For most pollers just passing
+    /// in the subclass name probably makes sense.
+    string poller_name;
+
+    /// Index of this Poller in EventCenter::pollers.  Allows deletion
+    /// without having to scan all the entries in pollers. -1 means
+    /// this poller isn't currently in EventCenter::pollers (happens
+    /// after EventCenter::reset).
+    int slot;
+  };
+
+ private:
+  CephContext *cct;
+  std::string type;
+  int nevent;
+  // Used only to external event
+  pthread_t owner = 0;
+  std::mutex external_lock;
+  std::atomic_ulong external_num_events;
+  deque<EventCallbackRef> external_events;
+  vector<FileEvent> file_events;
+  EventDriver *driver;
+  std::multimap<clock_type::time_point, TimeEvent> time_events;
+  // Keeps track of all of the pollers currently defined.  We don't
+  // use an intrusive list here because it isn't reentrant: we need
+  // to add/remove elements while the center is traversing the list.
+  std::vector<Poller*> pollers;
+  std::map<uint64_t, std::multimap<clock_type::time_point, TimeEvent>::iterator> event_map;
+  uint64_t time_event_next_id;
+  int notify_receive_fd;
+  int notify_send_fd;
+  NetHandler net;
+  EventCallbackRef notify_handler;
+  unsigned idx;
+  AssociatedCenters *global_centers = nullptr;
+
+  int process_time_events();
+  // The returned pointer refers into file_events and is invalidated when
+  // that vector is resized.
+  FileEvent *_get_file_event(int fd) {
+    ceph_assert(fd < nevent);
+    return &file_events[fd];
+  }
+
+ public:
+  explicit EventCenter(CephContext *c):
+    cct(c), nevent(0),
+    external_num_events(0),
+    driver(NULL), time_event_next_id(1),
+    notify_receive_fd(-1), notify_send_fd(-1), net(c),
+    notify_handler(NULL), idx(0) { }
+  ~EventCenter();
+  ostream& _event_prefix(std::ostream *_dout);
+
+  int init(int nevent, unsigned idx, const std::string &t);
+  void set_owner();
+  pthread_t get_owner() const { return owner; }
+  unsigned get_id() const { return idx; }
+
+  EventDriver *get_driver() { return driver; }
+
+  // Used by internal thread
+  int create_file_event(int fd, int mask, EventCallbackRef ctxt);
+  // The delay is in microseconds, matching the definition in Event.cc.
+  uint64_t create_time_event(uint64_t microseconds, EventCallbackRef ctxt);
+  void delete_file_event(int fd, int mask);
+  void delete_time_event(uint64_t id);
+  int process_events(unsigned timeout_microseconds, ceph::timespan *working_dur = nullptr);
+  void wakeup();
+
+  // Used by external thread
+  void dispatch_event_external(EventCallbackRef e);
+  inline bool in_thread() const {
+    return pthread_equal(pthread_self(), owner);
+  }
+
+ private:
+  /**
+   * External event that runs a callable on the target center's thread.
+   *
+   * With nonwait == true the object is heap-allocated by submit_to() and
+   * deletes itself after running. Otherwise the submitter blocks in wait()
+   * until do_request() has finished.
+   */
+  template <typename func>
+  class C_submit_event : public EventCallback {
+    std::mutex lock;
+    std::condition_variable cond;
+    bool done = false;
+    func f;
+    bool nonwait;
+   public:
+    C_submit_event(func &&_f, bool nw)
+      : f(std::move(_f)), nonwait(nw) {}
+    void do_request(uint64_t id) override {
+      f();
+      lock.lock();
+      cond.notify_all();
+      done = true;
+      // copy the flag while still locked: in the waiting case the object
+      // may go away as soon as the lock is released
+      bool del = nonwait;
+      lock.unlock();
+      if (del)
+        delete this;
+    }
+    void wait() {
+      ceph_assert(!nonwait);
+      std::unique_lock<std::mutex> l(lock);
+      while (!done)
+        cond.wait(l);
+    }
+  };
+
+ public:
+  /**
+   * Run `f` on the thread of center number `i` of this center's stack type.
+   *
+   * \param i
+   *      Index of the target center; must be in [0, MAX_EVENTCENTER) and
+   *      refer to a center that has called set_owner().
+   * \param f
+   *      Callable to run.
+   * \param nowait
+   *      false: run inline if already on the target thread, otherwise block
+   *      until `f` has run. true: always queue `f` and return immediately.
+   */
+  template <typename func>
+  void submit_to(int i, func &&f, bool nowait = false) {
+    ceph_assert(i >= 0 && i < MAX_EVENTCENTER && global_centers);
+    EventCenter *c = global_centers->centers[i];
+    ceph_assert(c);
+    if (!nowait && c->in_thread()) {
+      f();
+      return ;
+    }
+    if (nowait) {
+      C_submit_event<func> *event = new C_submit_event<func>(std::move(f), true);
+      c->dispatch_event_external(event);
+    } else {
+      C_submit_event<func> event(std::move(f), false);
+      c->dispatch_event_external(&event);
+      event.wait();
+    }
+  }
+};
+
+#endif
diff --git a/src/msg/async/EventEpoll.cc b/src/msg/async/EventEpoll.cc
new file mode 100644
index 00000000..37b46973
--- /dev/null
+++ b/src/msg/async/EventEpoll.cc
@@ -0,0 +1,142 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include <fcntl.h>
+#include "EventEpoll.h"
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "EpollDriver."
+
+/**
+ * Allocate the epoll_wait() result buffer and create the epoll instance.
+ *
+ * \param c       Unused here.
+ * \param nevent  Number of epoll_event slots to allocate, i.e. the most
+ *                events one event_wait() call can report.
+ * \return 0 on success, -ENOMEM or a negated errno on failure.
+ */
+int EpollDriver::init(EventCenter *c, int nevent)
+{
+  events = (struct epoll_event*)malloc(sizeof(struct epoll_event)*nevent);
+  if (!events) {
+    lderr(cct) << __func__ << " unable to malloc memory. " << dendl;
+    return -ENOMEM;
+  }
+  memset(events, 0, sizeof(struct epoll_event)*nevent);
+
+  epfd = epoll_create(1024); /* 1024 is just an hint for the kernel */
+  if (epfd == -1) {
+    // save errno before logging, which may overwrite it
+    int e = errno;
+    lderr(cct) << __func__ << " unable to do epoll_create: "
+               << cpp_strerror(e) << dendl;
+    return -e;
+  }
+  if (::fcntl(epfd, F_SETFD, FD_CLOEXEC) == -1) {
+    int e = errno;
+    ::close(epfd);
+    // forget the descriptor so the destructor does not close it a second
+    // time (by then the number may belong to an unrelated file)
+    epfd = -1;
+    lderr(cct) << __func__ << " unable to set cloexec: "
+               << cpp_strerror(e) << dendl;
+
+    return -e;
+  }
+
+  size = nevent;
+
+  return 0;
+}
+
+/**
+ * Start watching `fd` for `add_mask` in addition to `cur_mask`.
+ *
+ * The registration is edge-triggered (EPOLLET).
+ *
+ * \return 0 on success, a negated errno if epoll_ctl() fails.
+ */
+int EpollDriver::add_event(int fd, int cur_mask, int add_mask)
+{
+  ldout(cct, 20) << __func__ << " add event fd=" << fd << " cur_mask=" << cur_mask
+                 << " add_mask=" << add_mask << " to " << epfd << dendl;
+  struct epoll_event ee;
+  /* If the fd was already monitored for some event, we need a MOD
+   * operation. Otherwise we need an ADD operation. */
+  int op;
+  op = cur_mask == EVENT_NONE ? EPOLL_CTL_ADD: EPOLL_CTL_MOD;
+
+  ee.events = EPOLLET;
+  add_mask |= cur_mask; /* Merge old events */
+  if (add_mask & EVENT_READABLE)
+    ee.events |= EPOLLIN;
+  if (add_mask & EVENT_WRITABLE)
+    ee.events |= EPOLLOUT;
+  ee.data.u64 = 0; /* avoid valgrind warning */
+  ee.data.fd = fd;
+  if (epoll_ctl(epfd, op, fd, &ee) == -1) {
+    // save errno before logging, which may overwrite it
+    int e = errno;
+    lderr(cct) << __func__ << " epoll_ctl: add fd=" << fd << " failed. "
+               << cpp_strerror(e) << dendl;
+    return -e;
+  }
+
+  return 0;
+}
+
+/**
+ * Stop watching `fd` for `delmask`.
+ *
+ * If some direction of `cur_mask` remains the registration is modified,
+ * otherwise the fd is removed from the epoll set.
+ *
+ * \return 0 on success, a negated errno if epoll_ctl() fails.
+ */
+int EpollDriver::del_event(int fd, int cur_mask, int delmask)
+{
+  ldout(cct, 20) << __func__ << " del event fd=" << fd << " cur_mask=" << cur_mask
+                 << " delmask=" << delmask << " to " << epfd << dendl;
+  struct epoll_event ee;
+  int mask = cur_mask & (~delmask);
+  int r = 0;
+
+  // NOTE(review): unlike add_event() this does not set EPOLLET, so an fd
+  // that goes through the MOD path below is no longer edge-triggered.
+  // Confirm whether that is intended before changing it.
+  ee.events = 0;
+  if (mask & EVENT_READABLE) ee.events |= EPOLLIN;
+  if (mask & EVENT_WRITABLE) ee.events |= EPOLLOUT;
+  ee.data.u64 = 0; /* avoid valgrind warning */
+  ee.data.fd = fd;
+  if (mask != EVENT_NONE) {
+    if ((r = epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ee)) < 0) {
+      // save errno before logging, which may overwrite it
+      int e = errno;
+      lderr(cct) << __func__ << " epoll_ctl: modify fd=" << fd << " mask=" << mask
+                 << " failed." << cpp_strerror(e) << dendl;
+      return -e;
+    }
+  } else {
+    /* Note, Kernel < 2.6.9 requires a non null event pointer even for
+     * EPOLL_CTL_DEL. */
+    if ((r = epoll_ctl(epfd, EPOLL_CTL_DEL, fd, &ee)) < 0) {
+      int e = errno;
+      lderr(cct) << __func__ << " epoll_ctl: delete fd=" << fd
+                 << " failed." << cpp_strerror(e) << dendl;
+      return -e;
+    }
+  }
+  return 0;
+}
+
+/**
+ * No-op for epoll: nothing is reallocated and `newsize` is ignored, so the
+ * `events` buffer keeps the size chosen in init().
+ *
+ * \return always 0.
+ */
+int EpollDriver::resize_events(int newsize)
+{
+ return 0;
+}
+
+/**
+ * Wait for events on the epoll set and translate them into FiredFileEvents.
+ *
+ * \param fired_events  Resized and filled with one entry per ready fd; left
+ *                      untouched when nothing fired.
+ * \param tvp           Maximum time to wait; a null pointer waits forever.
+ * \return the number of entries written to fired_events (0 on timeout or
+ * when epoll_wait() fails).
+ */
+int EpollDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp)
+{
+  // epoll_wait() takes whole milliseconds; -1 means no timeout
+  const int nready = epoll_wait(epfd, events, size,
+                                tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1);
+  if (nready <= 0)
+    return 0;
+
+  fired_events.resize(nready);
+  for (int idx = 0; idx < nready; ++idx) {
+    const struct epoll_event &ready = events[idx];
+
+    int mask = 0;
+    if (ready.events & EPOLLIN)
+      mask |= EVENT_READABLE;
+    if (ready.events & EPOLLOUT)
+      mask |= EVENT_WRITABLE;
+    // errors and hang-ups are reported in both directions
+    if (ready.events & (EPOLLERR | EPOLLHUP))
+      mask |= EVENT_READABLE|EVENT_WRITABLE;
+
+    fired_events[idx].fd = ready.data.fd;
+    fired_events[idx].mask = mask;
+  }
+  return nready;
+}
diff --git a/src/msg/async/EventEpoll.h b/src/msg/async/EventEpoll.h
new file mode 100644
index 00000000..abc4b8bb
--- /dev/null
+++ b/src/msg/async/EventEpoll.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_EVENTEPOLL_H
+#define CEPH_MSG_EVENTEPOLL_H
+
+#include <unistd.h>
+#include <sys/epoll.h>
+
+#include "Event.h"
+
+/**
+ * EventDriver implementation on top of Linux epoll(7).
+ */
+class EpollDriver : public EventDriver {
+ int epfd; // epoll instance; -1 until init() succeeds
+ struct epoll_event *events; // malloc'ed result buffer passed to epoll_wait()
+ CephContext *cct;
+ int size; // number of entries in `events`
+
+ public:
+ explicit EpollDriver(CephContext *c): epfd(-1), events(NULL), cct(c), size(0) {}
+ // closes the epoll fd and frees the result buffer, if they were created
+ ~EpollDriver() override {
+ if (epfd != -1)
+ close(epfd);
+
+ if (events)
+ free(events);
+ }
+
+ int init(EventCenter *c, int nevent) override;
+ int add_event(int fd, int cur_mask, int add_mask) override;
+ int del_event(int fd, int cur_mask, int del_mask) override;
+ int resize_events(int newsize) override;
+ int event_wait(vector<FiredFileEvent> &fired_events,
+ struct timeval *tp) override;
+};
+
+#endif
diff --git a/src/msg/async/EventKqueue.cc b/src/msg/async/EventKqueue.cc
new file mode 100644
index 00000000..d6ba4a3d
--- /dev/null
+++ b/src/msg/async/EventKqueue.cc
@@ -0,0 +1,267 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include "EventKqueue.h"
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "KqueueDriver."
+
+#define KEVENT_NOWAIT 0
+
+// Probe kqfd with an empty kevent() call (no changes submitted, no result
+// slots requested).
+// Returns kqfd when the call succeeds, otherwise a negative errno value.
+int KqueueDriver::test_kqfd() {
+  struct kevent ke[1];
+  if (kevent(kqfd, ke, 0, NULL, 0, KEVENT_NOWAIT) == -1) {
+    // Save errno before logging: the logging machinery may overwrite it.
+    const int err = errno;
+    ldout(cct,0) << __func__ << " invalid kqfd = " << kqfd
+                 << cpp_strerror(err) << dendl;
+    return -err;
+  }
+  return kqfd;
+}
+
+// Re-register on the current kqfd every fd whose saved mask is non-zero.
+// Returns 0 on success or a negative errno value from the first kevent()
+// call that fails.
+int KqueueDriver::restore_events() {
+  struct kevent ke[2];
+  int i;
+
+  ldout(cct,30) << __func__ << " on kqfd = " << kqfd << dendl;
+  // Walk the whole saved table: it holds sav_max entries, which exceeds
+  // `size` once resize_events() has grown it.
+  for(i=0;i<sav_max;i++) {
+    int num = 0;
+    if (sav_events[i].mask == 0 )
+      continue;
+    ldout(cct,30) << __func__ << " restore kqfd = " << kqfd
+                  << " fd = " << i << " mask " << sav_events[i].mask << dendl;
+    // NOTE(review): add_event() registers with EV_ADD|EV_CLEAR while this
+    // path uses plain EV_ADD -- confirm the difference is intended.
+    if (sav_events[i].mask & EVENT_READABLE)
+      EV_SET(&ke[num++], i, EVFILT_READ, EV_ADD, 0, 0, NULL);
+    if (sav_events[i].mask & EVENT_WRITABLE)
+      EV_SET(&ke[num++], i, EVFILT_WRITE, EV_ADD, 0, 0, NULL);
+    if (num) {
+      if (kevent(kqfd, ke, num, NULL, 0, KEVENT_NOWAIT) == -1) {
+        // Save errno before logging: the logging machinery may overwrite it.
+        const int err = errno;
+        ldout(cct,0) << __func__ << " unable to add event: "
+                     << cpp_strerror(err) << dendl;
+        return -err;
+      }
+    }
+  }
+  return 0;
+}
+
+// Make sure kqfd is usable from the calling thread, creating a new kqueue
+// and re-registering the saved events when necessary.
+// funcname is only used as a prefix for the log lines.
+// Returns 0 on success or a negative errno value.
+int KqueueDriver::test_thread_change(const char* funcname) {
+  // check to see if we changed thread, because that invalidates
+  // the kqfd and we need to restore that
+  int oldkqfd = kqfd;
+
+  if (!pthread_equal(mythread, pthread_self())) {
+    ldout(cct,20) << funcname << " We changed thread from " << mythread
+                  << " to " << pthread_self() << dendl;
+    mythread = pthread_self();
+    kqfd = -1;
+  } else if ((kqfd != -1) && (test_kqfd() < 0)) {
+    // should this ever happen?
+    // It would be strange to change kqfd with thread change.
+    // Might need to change this into a ceph_assert() in the future.
+    ldout(cct,0) << funcname << " Warning: Recreating old kqfd. "
+                 << "This should not happen!!!" << dendl;
+    kqfd = -1;
+  }
+  if (kqfd == -1) {
+    kqfd = kqueue();
+    // Save errno right away: the log statement below runs before the
+    // failure check and may overwrite it.
+    const int err = errno;
+    ldout(cct,30) << funcname << " kqueue: new kqfd = " << kqfd
+                  << " (was: " << oldkqfd << ")"
+                  << dendl;
+    if (kqfd < 0) {
+      lderr(cct) << funcname << " unable to do kqueue: "
+                 << cpp_strerror(err) << dendl;
+      return -err;
+    }
+    // restore_events() already returns a negative errno value; propagate
+    // it instead of re-reading errno, which may have changed since.
+    const int r = restore_events();
+    if (r < 0) {
+      lderr(cct) << funcname << " unable restore all events "
+                 << cpp_strerror(r) << dendl;
+      return r;
+    }
+  }
+  return 0;
+}
+
+// Allocate the zero-filled result and saved-event tables (nevent entries
+// each) and record the calling thread. No kqueue descriptor is created here.
+// Returns 0 on success or -ENOMEM when an allocation fails.
+int KqueueDriver::init(EventCenter *c, int nevent)
+{
+  // Remember which thread set us up: a change of thread kills the kqfd.
+  mythread = pthread_self();
+
+  // Buffer that receives the events returned by kevent().
+  res_events = (struct kevent*)calloc(nevent, sizeof(struct kevent));
+  if (res_events == NULL) {
+    lderr(cct) << __func__ << " unable to malloc memory: "
+               << cpp_strerror(errno) << dendl;
+    return -ENOMEM;
+  }
+  size = nevent;
+
+  // Table of every event we registered, so the registrations can be redone
+  // when the thread ID changes.
+  sav_events = (struct SaveEvent*)calloc(nevent, sizeof(struct SaveEvent));
+  if (sav_events == NULL) {
+    lderr(cct) << __func__ << " unable to malloc memory: "
+               << cpp_strerror(errno) << dendl;
+    return -ENOMEM;
+  }
+  sav_max = nevent;
+
+  // The descriptor is assigned lazily, the first time it is really needed.
+  kqfd = -1;
+  return 0;
+}
+
+// Register the filters named in add_mask for fd on the kqueue and record
+// the resulting mask (cur_mask | add_mask) in the saved-event table.
+// Returns 0 on success or a negative errno value.
+int KqueueDriver::add_event(int fd, int cur_mask, int add_mask)
+{
+  struct kevent ke[2];
+  int num = 0;
+
+  ldout(cct,30) << __func__ << " add event kqfd = " << kqfd << " fd = " << fd
+                << " cur_mask = " << cur_mask << " add_mask = " << add_mask
+                << dendl;
+
+  int r = test_thread_change(__func__);
+  if ( r < 0 )
+    return r;
+
+  if (add_mask & EVENT_READABLE)
+    EV_SET(&ke[num++], fd, EVFILT_READ, EV_ADD|EV_CLEAR, 0, 0, NULL);
+  if (add_mask & EVENT_WRITABLE)
+    EV_SET(&ke[num++], fd, EVFILT_WRITE, EV_ADD|EV_CLEAR, 0, 0, NULL);
+
+  if (num) {
+    if (kevent(kqfd, ke, num, NULL, 0, KEVENT_NOWAIT) == -1) {
+      // Save errno before logging: the logging machinery may overwrite it.
+      const int err = errno;
+      lderr(cct) << __func__ << " unable to add event: "
+                 << cpp_strerror(err) << dendl;
+      return -err;
+    }
+  }
+  // keep what we set
+  if (fd >= sav_max) {
+    // Grow in steps of 5000, but always far enough that fd becomes a valid
+    // index; a fixed step alone could still leave fd out of range.
+    int newsize = sav_max + 5000;
+    if (newsize <= fd)
+      newsize = fd + 1;
+    r = resize_events(newsize);
+    if (r < 0)
+      return r;
+  }
+  sav_events[fd].mask = cur_mask | add_mask;
+  return 0;
+}
+
+// Remove from the kqueue the filters that are both currently set (cur_mask)
+// and requested for deletion (del_mask), then record the remaining mask in
+// the saved-event table.
+// Returns 0 on success or a negative errno value.
+int KqueueDriver::del_event(int fd, int cur_mask, int del_mask)
+{
+  struct kevent ke[2];
+  int num = 0;
+  int mask = cur_mask & del_mask;
+
+  ldout(cct,30) << __func__ << " delete event kqfd = " << kqfd
+                << " fd = " << fd << " cur_mask = " << cur_mask
+                << " del_mask = " << del_mask << dendl;
+
+  int r = test_thread_change(__func__);
+  if ( r < 0 )
+    return r;
+
+  if (mask & EVENT_READABLE)
+    EV_SET(&ke[num++], fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
+  if (mask & EVENT_WRITABLE)
+    EV_SET(&ke[num++], fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL);
+
+  if (num) {
+    if (kevent(kqfd, ke, num, NULL, 0, KEVENT_NOWAIT) < 0) {
+      // Save errno before logging: the logging machinery may overwrite it.
+      const int err = errno;
+      lderr(cct) << __func__ << " kevent: delete fd=" << fd << " mask=" << mask
+                 << " failed." << cpp_strerror(err) << dendl;
+      return -err;
+    }
+  }
+  // keep the administration; an fd outside the table was never recorded,
+  // so there is nothing to update for it
+  if (fd >= 0 && fd < sav_max)
+    sav_events[fd].mask = cur_mask & ~del_mask;
+  return 0;
+}
+
+// Grow the saved-event table to newsize entries; the new tail is zeroed.
+// Requests that do not exceed the current capacity (sav_max) are ignored.
+// Returns 0 on success; an allocation failure trips ceph_assert (and
+// returns -ENOMEM should the assertion not abort).
+int KqueueDriver::resize_events(int newsize)
+{
+  ldout(cct,30) << __func__ << " kqfd = " << kqfd << "newsize = " << newsize
+                << dendl;
+  if (newsize > sav_max) {
+    // Keep the old pointer until realloc() is known to have succeeded.
+    struct SaveEvent *grown =
+      (struct SaveEvent*)realloc(sav_events, sizeof(struct SaveEvent)*newsize);
+    if (!grown) {
+      lderr(cct) << __func__ << " unable to realloc memory: "
+                 << cpp_strerror(errno) << dendl;
+      ceph_assert(grown);
+      return -ENOMEM;
+    }
+    sav_events = grown;
+    // Zero exactly the newly added entries, which start at the old
+    // capacity (sav_max) -- not at `size`, which differs after a resize.
+    memset(&sav_events[sav_max], 0, sizeof(struct SaveEvent)*(newsize-sav_max));
+    sav_max = newsize;
+  }
+  return 0;
+}
+
+// Wait for events on the kqueue and translate them into fired_events.
+// tvp: timeout converted to a timespec for kevent(); when it is NULL a null
+//      timeout pointer (KEVENT_NOWAIT is defined as 0) is passed instead.
+// Returns the number of entries stored in fired_events (0 when kevent()
+// reports none) or a negative errno value.
+int KqueueDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp)
+{
+  int retval, numevents = 0;
+  // Zero-initialised so the "Hit timeout" log below never reads
+  // indeterminate values when tvp is NULL.
+  struct timespec timeout = {0, 0};
+
+  ldout(cct,10) << __func__ << " kqfd = " << kqfd << dendl;
+
+  int r = test_thread_change(__func__);
+  if ( r < 0 )
+    return r;
+
+  if (tvp != NULL) {
+    timeout.tv_sec = tvp->tv_sec;
+    timeout.tv_nsec = tvp->tv_usec * 1000;
+    ldout(cct,20) << __func__ << " "
+                  << timeout.tv_sec << " sec "
+                  << timeout.tv_nsec << " nsec"
+                  << dendl;
+    retval = kevent(kqfd, NULL, 0, res_events, size, &timeout);
+  } else {
+    ldout(cct,30) << __func__ << " event_wait: " << " NULL" << dendl;
+    retval = kevent(kqfd, NULL, 0, res_events, size, KEVENT_NOWAIT);
+  }
+  // Save errno before any logging: the logging machinery may overwrite it.
+  const int err = errno;
+
+  ldout(cct,25) << __func__ << " kevent retval: " << retval << dendl;
+  if (retval < 0) {
+    lderr(cct) << __func__ << " kqueue error: "
+               << cpp_strerror(err) << dendl;
+    return -err;
+  } else if (retval == 0) {
+    ldout(cct,5) << __func__ << " Hit timeout("
+                 << timeout.tv_sec << " sec "
+                 << timeout.tv_nsec << " nsec"
+                 << ")." << dendl;
+  } else {
+    int j;
+
+    numevents = retval;
+    fired_events.resize(numevents);
+    for (j = 0; j < numevents; j++) {
+      int mask = 0;
+      struct kevent *e = res_events + j;
+
+      if (e->filter == EVFILT_READ) mask |= EVENT_READABLE;
+      if (e->filter == EVFILT_WRITE) mask |= EVENT_WRITABLE;
+      // An error on the fd is reported as both readable and writable.
+      if (e->flags & EV_ERROR) mask |= EVENT_READABLE|EVENT_WRITABLE;
+      fired_events[j].fd = (int)e->ident;
+      fired_events[j].mask = mask;
+
+    }
+  }
+  return numevents;
+}
diff --git a/src/msg/async/EventKqueue.h b/src/msg/async/EventKqueue.h
new file mode 100644
index 00000000..24863a93
--- /dev/null
+++ b/src/msg/async/EventKqueue.h
@@ -0,0 +1,67 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_EVENTKQUEUE_H
+#define CEPH_MSG_EVENTKQUEUE_H
+
+#include <sys/types.h>
+#include <sys/event.h>
+#include <unistd.h>
+
+#include "Event.h"
+
+// EventDriver implementation declared on top of <sys/event.h> (kqueue).
+// Besides the kevent result buffer it keeps a per-fd table of the masks it
+// registered, which restore_events() uses to re-register them.
+class KqueueDriver : public EventDriver {
+  int kqfd;                   // kqueue descriptor, -1 while none is held
+  pthread_t mythread;         // thread recorded by init()/test_thread_change()
+  struct kevent *res_events;  // buffer for events returned by kevent()
+  CephContext *cct;
+  int size;                   // number of entries in res_events
+
+  // Keep what we set on the kqfd
+  struct SaveEvent{
+    int fd;
+    int mask;
+  };
+  struct SaveEvent *sav_events;  // indexed by fd
+  int sav_max;                   // number of entries in sav_events
+  int restore_events();
+  int test_kqfd();
+  int test_thread_change(const char* funcname);
+
+ public:
+  // sav_events must start out NULL: the destructor frees it, and init() may
+  // never run or may fail before assigning it.
+  explicit KqueueDriver(CephContext *c): kqfd(-1), res_events(NULL), cct(c),
+                                         size(0), sav_events(NULL), sav_max(0) {}
+  ~KqueueDriver() override {
+    if (kqfd != -1)
+      close(kqfd);
+
+    if (res_events)
+      free(res_events);
+    size = 0;
+    if (sav_events)
+      free(sav_events);
+    sav_max = 0;
+  }
+
+  int init(EventCenter *c, int nevent) override;
+  int add_event(int fd, int cur_mask, int add_mask) override;
+  int del_event(int fd, int cur_mask, int del_mask) override;
+  int resize_events(int newsize) override;
+  int event_wait(vector<FiredFileEvent> &fired_events,
+                 struct timeval *tp) override;
+};
+
+#endif
diff --git a/src/msg/async/EventSelect.cc b/src/msg/async/EventSelect.cc
new file mode 100644
index 00000000..fdee6ebc
--- /dev/null
+++ b/src/msg/async/EventSelect.cc
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include "EventSelect.h"
+
+#include <unistd.h>
+#include <sys/select.h>
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "SelectDriver."
+
+// Reset the driver state: both registered fd sets are emptied and the
+// highest tracked descriptor goes back to 0. Always returns 0.
+int SelectDriver::init(EventCenter *c, int nevent)
+{
+  ldout(cct, 0) << "Select isn't suitable for production env, just avoid "
+                << "compiling error or special purpose" << dendl;
+  max_fd = 0;
+  FD_ZERO(&wfds);
+  FD_ZERO(&rfds);
+  return 0;
+}
+
+// Add fd to the read and/or write set according to cur_mask | add_mask and
+// raise max_fd when needed.
+// Returns 0 on success, or -EINVAL when fd cannot be stored in an fd_set.
+int SelectDriver::add_event(int fd, int cur_mask, int add_mask)
+{
+  ldout(cct, 10) << __func__ << " add event to fd=" << fd << " mask=" << add_mask
+                 << dendl;
+
+  // FD_SET is undefined for descriptors outside [0, FD_SETSIZE); refuse
+  // them instead of writing outside the fd_set.
+  if (fd < 0 || fd >= FD_SETSIZE)
+    return -EINVAL;
+
+  int mask = cur_mask | add_mask;
+  if (mask & EVENT_READABLE)
+    FD_SET(fd, &rfds);
+  if (mask & EVENT_WRITABLE)
+    FD_SET(fd, &wfds);
+  if (fd > max_fd)
+    max_fd = fd;
+
+  return 0;
+}
+
+// Remove fd from the read and/or write set as requested by delmask.
+// max_fd is left untouched. Always returns 0.
+int SelectDriver::del_event(int fd, int cur_mask, int delmask)
+{
+  ldout(cct, 10) << __func__ << " del event fd=" << fd << " cur mask=" << cur_mask
+                 << dendl;
+
+  const bool drop_read = (delmask & EVENT_READABLE) != 0;
+  const bool drop_write = (delmask & EVENT_WRITABLE) != 0;
+  if (drop_read) {
+    FD_CLR(fd, &rfds);
+  }
+  if (drop_write) {
+    FD_CLR(fd, &wfds);
+  }
+  return 0;
+}
+
+int SelectDriver::resize_events(int newsize)
+{
+ return 0; // newsize is ignored; this driver always reports success here
+}
+
+// Run select() on copies of the registered sets and append one
+// FiredFileEvent per ready descriptor to fired_events.
+// Returns the number of entries appended; 0 when select() returns a value
+// that is not positive.
+int SelectDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp)
+{
+  // select() modifies the sets it is handed, so operate on scratch copies.
+  memcpy(&_rfds, &rfds, sizeof(fd_set));
+  memcpy(&_wfds, &wfds, sizeof(fd_set));
+
+  const int ready = select(max_fd+1, &_rfds, &_wfds, NULL, tvp);
+  if (ready <= 0)
+    return 0;
+
+  int numevents = 0;
+  for (int fd = 0; fd <= max_fd; ++fd) {
+    int mask = 0;
+    if (FD_ISSET(fd, &_rfds))
+      mask |= EVENT_READABLE;
+    if (FD_ISSET(fd, &_wfds))
+      mask |= EVENT_WRITABLE;
+    if (!mask)
+      continue;
+
+    struct FiredFileEvent fe;
+    fe.fd = fd;
+    fe.mask = mask;
+    fired_events.push_back(fe);
+    ++numevents;
+  }
+  return numevents;
+}
diff --git a/src/msg/async/EventSelect.h b/src/msg/async/EventSelect.h
new file mode 100644
index 00000000..1b75da0b
--- /dev/null
+++ b/src/msg/async/EventSelect.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_EVENTSELECT_H
+#define CEPH_MSG_EVENTSELECT_H
+
+#include "Event.h"
+
+// EventDriver implementation that tracks its descriptors in fd_set members.
+class SelectDriver : public EventDriver {
+  // Registered descriptors: read set and write set.
+  fd_set rfds, wfds;
+  /* Scratch copies of the sets above: it is not safe to reuse
+   * FD sets after select(). */
+  fd_set _rfds, _wfds;
+  int max_fd;  // highest descriptor passed to add_event() so far
+  CephContext *cct;
+
+ public:
+  explicit SelectDriver(CephContext *c)
+    : max_fd(0), cct(c) {}
+  ~SelectDriver() override {}
+
+  int init(EventCenter *c, int nevent) override;
+  int add_event(int fd, int cur_mask, int add_mask) override;
+  int del_event(int fd, int cur_mask, int del_mask) override;
+  int resize_events(int newsize) override;
+  int event_wait(vector<FiredFileEvent> &fired_events,
+                 struct timeval *tp) override;
+};
+
+#endif
diff --git a/src/msg/async/PosixStack.cc b/src/msg/async/PosixStack.cc
new file mode 100644
index 00000000..e9c8d404
--- /dev/null
+++ b/src/msg/async/PosixStack.cc
@@ -0,0 +1,293 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <sys/socket.h>
+#include <netinet/tcp.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <errno.h>
+
+#include <algorithm>
+
+#include "PosixStack.h"
+
+#include "include/buffer.h"
+#include "include/str_list.h"
+#include "common/errno.h"
+#include "common/strtol.h"
+#include "common/dout.h"
+#include "msg/Messenger.h"
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "PosixStack "
+
+class PosixConnectedSocketImpl final : public ConnectedSocketImpl { // ConnectedSocketImpl that operates directly on the descriptor _fd
+ NetHandler &handler;
+ int _fd;
+ entity_addr_t sa; // address handed to handler.reconnect() by is_connected()
+ bool connected; // once true, is_connected() returns 1 without calling reconnect()
+
+ public:
+ explicit PosixConnectedSocketImpl(NetHandler &h, const entity_addr_t &sa, int f, bool connected)
+ : handler(h), _fd(f), sa(sa), connected(connected) {}
+
+ int is_connected() override { // 1 = connected, 0 = reconnect() returned a positive value, < 0 = reconnect() error
+ if (connected)
+ return 1;
+
+ int r = handler.reconnect(sa, _fd);
+ if (r == 0) {
+ connected = true;
+ return 1;
+ } else if (r < 0) {
+ return r;
+ } else {
+ return 0;
+ }
+ }
+
+ ssize_t zero_copy_read(bufferptr&) override { // not supported by this implementation
+ return -EOPNOTSUPP;
+ }
+
+ ssize_t read(char *buf, size_t len) override { // result of ::read(), or -errno when it fails
+ ssize_t r = ::read(_fd, buf, len);
+ if (r < 0)
+ r = -errno;
+ return r;
+ }
+
+ // Send the iovecs described by msg, retrying on EINTR and resuming after
+ // partial writes. Returns the number of bytes sent so far (may be short on EAGAIN);
+ static ssize_t do_sendmsg(int fd, struct msghdr &msg, unsigned len, bool more)
+ {
+ size_t sent = 0;
+ while (1) {
+ MSGR_SIGPIPE_STOPPER;
+ ssize_t r;
+ r = ::sendmsg(fd, &msg, MSG_NOSIGNAL | (more ? MSG_MORE : 0));
+ if (r < 0) {
+ if (errno == EINTR) {
+ continue;
+ } else if (errno == EAGAIN) {
+ break; // cannot send more right now: report what went out so far
+ }
+ return -errno;
+ }
+
+ sent += r;
+ if (len == sent) break;
+
+ while (r > 0) { // partial write: advance msg past the r bytes just sent
+ if (msg.msg_iov[0].iov_len <= (size_t)r) {
+ // this iovec was sent completely: drop it from the message
+ r -= msg.msg_iov[0].iov_len;
+ msg.msg_iov++;
+ msg.msg_iovlen--;
+ } else {
+ msg.msg_iov[0].iov_base = (char *)msg.msg_iov[0].iov_base + r;
+ msg.msg_iov[0].iov_len -= r;
+ break;
+ }
+ }
+ }
+ return (ssize_t)sent;
+ }
+
+ ssize_t send(bufferlist &bl, bool more) override { // sends bl in batches of at most IOV_MAX buffers; returns bytes sent or < 0
+ size_t sent_bytes = 0;
+ auto pb = std::cbegin(bl.buffers());
+ uint64_t left_pbrs = std::size(bl.buffers());
+ while (left_pbrs) {
+ struct msghdr msg;
+ struct iovec msgvec[IOV_MAX];
+ uint64_t size = std::min<uint64_t>(left_pbrs, IOV_MAX);
+ left_pbrs -= size;
+ // FIPS zeroization audit 20191115: this memset is not security related.
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_iovlen = size;
+ msg.msg_iov = msgvec;
+ unsigned msglen = 0;
+ for (auto iov = msgvec; iov != msgvec + size; iov++) {
+ iov->iov_base = (void*)(pb->c_str());
+ iov->iov_len = pb->length();
+ msglen += pb->length();
+ ++pb;
+ }
+ ssize_t r = do_sendmsg(_fd, msg, msglen, left_pbrs || more); // MSG_MORE while further batches remain
+ if (r < 0)
+ return r;
+
+ // "r" is the number of bytes do_sendmsg() actually sent
+ sent_bytes += r;
+ if (static_cast<unsigned>(r) < msglen)
+ break;
+ // the whole batch went out: continue with the next batch, if any
+ }
+
+ if (sent_bytes) { // keep only the unsent tail in bl (presumably what splice+swap achieves -- confirm against bufferlist)
+ bufferlist swapped;
+ if (sent_bytes < bl.length()) {
+ bl.splice(sent_bytes, bl.length()-sent_bytes, &swapped);
+ bl.swap(swapped);
+ } else {
+ bl.clear();
+ }
+ }
+
+ return static_cast<ssize_t>(sent_bytes);
+ }
+ void shutdown() override {
+ ::shutdown(_fd, SHUT_RDWR);
+ }
+ void close() override { // _fd keeps its old value afterwards
+ ::close(_fd);
+ }
+ int fd() const override {
+ return _fd;
+ }
+ int socket_fd() const override {
+ return _fd;
+ }
+ friend class PosixServerSocketImpl;
+ friend class PosixNetworkStack;
+};
+
+// ServerSocketImpl wrapping the listening descriptor _fd.
+class PosixServerSocketImpl : public ServerSocketImpl {
+  NetHandler &handler;
+  int _fd;
+
+ public:
+  explicit PosixServerSocketImpl(NetHandler &h,
+                                 int f,
+                                 const entity_addr_t& listen_addr,
+                                 unsigned slot)
+    : ServerSocketImpl(listen_addr.get_type(), slot),
+      handler(h),
+      _fd(f) {}
+
+  int accept(ConnectedSocket *sock, const SocketOptions &opts,
+             entity_addr_t *out, Worker *w) override;
+
+  // Closes the listening descriptor.
+  void abort_accept() override { ::close(_fd); }
+
+  int fd() const override { return _fd; }
+};
+
+// Accept one pending connection on the listening descriptor, make the new
+// socket non-blocking, apply the socket options and hand it back through
+// *sock; the peer address is written to *out.
+// Returns 0 on success or a negative error code.
+int PosixServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w) {
+  ceph_assert(sock);
+  sockaddr_storage ss;
+  socklen_t slen = sizeof(ss);
+  int sd = accept_cloexec(_fd, (sockaddr*)&ss, &slen);
+  if (sd < 0) {
+    return -errno;
+  }
+
+  // For the two handler calls below, return the handler's own negative
+  // result: ::close() may overwrite errno, so "-errno" after it could be
+  // wrong or even 0 (success).
+  // NOTE(review): assumes NetHandler returns negative errno values, as
+  // is_connected() already assumes for reconnect() -- confirm.
+  int r = handler.set_nonblock(sd);
+  if (r < 0) {
+    ::close(sd);
+    return r;
+  }
+
+  r = handler.set_socket_options(sd, opt.nodelay, opt.rcbuf_size);
+  if (r < 0) {
+    ::close(sd);
+    return r;
+  }
+
+  ceph_assert(NULL != out); //out should not be NULL in accept connection
+
+  out->set_type(addr_type);
+  out->set_sockaddr((sockaddr*)&ss);
+  handler.set_priority(sd, opt.priority, out->get_family());
+
+  std::unique_ptr<PosixConnectedSocketImpl> csi(new PosixConnectedSocketImpl(handler, *out, sd, true));
+  *sock = ConnectedSocket(std::move(csi));
+  return 0;
+}
+
+void PosixWorker::initialize() // empty body: no setup is performed here
+{
+}
+
+// Create a non-blocking listening socket bound to sa and store it in *sock.
+// addr_slot is passed through to the PosixServerSocketImpl constructor.
+// Returns 0 on success or a negative error code; the descriptor is closed
+// on every failure path after it was created.
+int PosixWorker::listen(entity_addr_t &sa,
+                        unsigned addr_slot,
+                        const SocketOptions &opt,
+                        ServerSocket *sock)
+{
+  // For the NetHandler calls below, return the helper's own negative
+  // result instead of "-errno": errno may have been overwritten by then
+  // (for instance by ::close()).
+  // NOTE(review): assumes NetHandler returns negative errno values -- confirm.
+  int listen_sd = net.create_socket(sa.get_family(), true);
+  if (listen_sd < 0) {
+    return listen_sd;
+  }
+
+  int r = net.set_nonblock(listen_sd);
+  if (r < 0) {
+    ::close(listen_sd);
+    return r;
+  }
+
+  r = net.set_socket_options(listen_sd, opt.nodelay, opt.rcbuf_size);
+  if (r < 0) {
+    ::close(listen_sd);
+    return r;
+  }
+
+  r = ::bind(listen_sd, sa.get_sockaddr(), sa.get_sockaddr_len());
+  if (r < 0) {
+    r = -errno;
+    ldout(cct, 10) << __func__ << " unable to bind to " << sa.get_sockaddr()
+                   << ": " << cpp_strerror(r) << dendl;
+    ::close(listen_sd);
+    return r;
+  }
+
+  r = ::listen(listen_sd, cct->_conf->ms_tcp_listen_backlog);
+  if (r < 0) {
+    r = -errno;
+    lderr(cct) << __func__ << " unable to listen on " << sa << ": " << cpp_strerror(r) << dendl;
+    ::close(listen_sd);
+    return r;
+  }
+
+  *sock = ServerSocket(
+          std::unique_ptr<PosixServerSocketImpl>(
+            new PosixServerSocketImpl(net, listen_sd, sa, addr_slot)));
+  return 0;
+}
+
+// Open a connection to addr (non-blocking when opts.nonblock is set) and
+// store the resulting socket in *socket.
+// Returns 0 on success or a negative error code.
+int PosixWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) {
+  int sd;
+
+  if (opts.nonblock) {
+    sd = net.nonblock_connect(addr, opts.connect_bind_addr);
+  } else {
+    sd = net.connect(addr, opts.connect_bind_addr);
+  }
+
+  if (sd < 0) {
+    // Return the helper's own negative result: errno may have been
+    // overwritten inside the helper after the failing call.
+    // NOTE(review): assumes NetHandler returns negative errno values -- confirm.
+    return sd;
+  }
+
+  net.set_priority(sd, opts.priority, addr.get_family());
+  // A blocking connect is already established; a non-blocking one is
+  // completed later through is_connected().
+  *socket = ConnectedSocket(
+      std::unique_ptr<PosixConnectedSocketImpl>(new PosixConnectedSocketImpl(net, addr, sd, !opts.nonblock)));
+  return 0;
+}
+
+PosixNetworkStack::PosixNetworkStack(CephContext *c, const string &t)
+ : NetworkStack(c, t) // both arguments go to the base class; nothing else is set up
+{
+}
diff --git a/src/msg/async/PosixStack.h b/src/msg/async/PosixStack.h
new file mode 100644
index 00000000..f1aaccd4
--- /dev/null
+++ b/src/msg/async/PosixStack.h
@@ -0,0 +1,56 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_ASYNC_POSIXSTACK_H
+#define CEPH_MSG_ASYNC_POSIXSTACK_H
+
+#include <thread>
+
+#include "msg/msg_types.h"
+#include "msg/async/net_handler.h"
+
+#include "Stack.h"
+
+// Worker whose listen()/connect() are implemented with a NetHandler.
+class PosixWorker : public Worker {
+  NetHandler net;
+
+  void initialize() override;
+
+ public:
+  PosixWorker(CephContext *c, unsigned i)
+    : Worker(c, i),
+      net(c) {}
+
+  int listen(entity_addr_t &sa, unsigned addr_slot,
+             const SocketOptions &opt, ServerSocket *socks) override;
+
+  int connect(const entity_addr_t &addr, const SocketOptions &opts,
+              ConnectedSocket *socket) override;
+};
+
+// NetworkStack that runs each worker function on its own std::thread,
+// stored at the worker's index.
+class PosixNetworkStack : public NetworkStack {
+  vector<std::thread> threads;
+
+ public:
+  explicit PosixNetworkStack(CephContext *c, const string &t);
+
+  // Start func on a new thread stored in slot i.
+  void spawn_worker(unsigned i, std::function<void ()> &&func) override {
+    // Only ever grow the vector: shrinking it would destroy threads in
+    // higher slots that are still joinable, which calls std::terminate().
+    if (threads.size() <= i)
+      threads.resize(i+1);
+    threads[i] = std::thread(func);
+  }
+  // Join the thread in slot i; the slot must hold a joinable thread.
+  void join_worker(unsigned i) override {
+    ceph_assert(threads.size() > i && threads[i].joinable());
+    threads[i].join();
+  }
+};
+
+#endif //CEPH_MSG_ASYNC_POSIXSTACK_H
diff --git a/src/msg/async/Protocol.cc b/src/msg/async/Protocol.cc
new file mode 100644
index 00000000..4bdc065e
--- /dev/null
+++ b/src/msg/async/Protocol.cc
@@ -0,0 +1,14 @@
+#include "Protocol.h"
+
+#include "AsyncConnection.h"
+#include "AsyncMessenger.h"
+
+// Stores the protocol type and the owning connection, caches the
+// connection's messenger and that messenger's CephContext, and creates a
+// fresh AuthConnectionMeta.
+Protocol::Protocol(int type, AsyncConnection *connection)
+  : proto_type(type),
+    connection(connection),
+    messenger(connection->async_msgr),
+    cct(connection->async_msgr->cct),
+    auth_meta(new AuthConnectionMeta()) {}
+
+Protocol::~Protocol() {}
diff --git a/src/msg/async/Protocol.h b/src/msg/async/Protocol.h
new file mode 100644
index 00000000..cccba183
--- /dev/null
+++ b/src/msg/async/Protocol.h
@@ -0,0 +1,140 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef _MSG_ASYNC_PROTOCOL_
+#define _MSG_ASYNC_PROTOCOL_
+
+#include <list>
+#include <map>
+
+#include "AsyncConnection.h"
+#include "include/buffer.h"
+#include "include/msgr.h"
+
+/*
+ * Continuation Helper Classes
+ */
+
+#include <memory>
+#include <tuple>
+
+// Abstract continuation for objects of type C: call() runs one step on foo
+// and returns the continuation to run next (CONTINUATION_RUN stops on null).
+template <class C>
+class Ct {
+public:
+  virtual ~Ct() = default;
+
+  virtual Ct<C> *call(C *foo) const = 0;
+};
+
+// Continuation that invokes a member function of C with stored arguments.
+// setParams() records the arguments; call() applies them to the target.
+template <class C, typename... Args>
+class CtFun : public Ct<C> {
+private:
+  using fn_t = Ct<C> *(C::*)(Args...);
+
+  fn_t _f;
+  std::tuple<Args...> _params;
+
+  // Expands the stored tuple into the argument list of the member call.
+  template <std::size_t... Is>
+  Ct<C> *_call(C *foo, std::index_sequence<Is...>) const {
+    return (foo->*_f)(std::get<Is>(_params)...);
+  }
+
+public:
+  CtFun(fn_t f) : _f(f) {}
+
+  void setParams(Args... args) {
+    _params = std::make_tuple(args...);
+  }
+
+  Ct<C> *call(C *foo) const override {
+    return _call(foo, std::index_sequence_for<Args...>());
+  }
+};
+
+using rx_buffer_t =
+ std::unique_ptr<buffer::ptr_node, buffer::ptr_node::disposer>; // owning handle to a buffer::ptr_node, released through its disposer
+
+// Continuation that hands a receive buffer node and a result code to a
+// member function of C. The node is moved out when call() runs, which is
+// why it is declared mutable.
+template <class C>
+class CtRxNode : public Ct<C> {
+  using fn_t = Ct<C> *(C::*)(rx_buffer_t&&, int r);
+
+  fn_t _f;
+
+public:
+  mutable rx_buffer_t node;
+  int r;
+
+  CtRxNode(fn_t f) : _f(f) {}
+
+  // Takes ownership of the node and remembers the result code.
+  void setParams(rx_buffer_t &&node, int r) {
+    this->r = r;
+    this->node = std::move(node);
+  }
+
+  Ct<C> *call(C *foo) const override {
+    return (foo->*_f)(std::move(node), r);
+  }
+};
+
+template <class C> using CONTINUATION_TYPE = CtFun<C>; // handler taking no arguments
+template <class C> using CONTINUATION_TX_TYPE = CtFun<C, int>; // handler taking an int
+template <class C> using CONTINUATION_RX_TYPE = CtFun<C, char*, int>; // handler taking a char* and an int
+template <class C> using CONTINUATION_RXBPTR_TYPE = CtRxNode<C>; // handler taking an rx_buffer_t&& and an int
+
+#define CONTINUATION_DECL(C, F, ...) \
+ CtFun<C, ##__VA_ARGS__> F##_cont { (&C::F) }; // declares F_cont, a CtFun bound to the member function C::F
+
+#define CONTINUATION(F) F##_cont // names the continuation object declared for F
+#define CONTINUE(F, ...) (F##_cont.setParams(__VA_ARGS__), &F##_cont) // stores the arguments, then yields a pointer to F_cont
+
+#define CONTINUATION_RUN(CT) \
+ { \
+ Ct<std::remove_reference<decltype(*this)>::type> *_cont = &CT;\
+ do { \
+ _cont = _cont->call(this); \
+ } while (_cont); \
+ } // runs CT, then whatever each call() returns, until a call returns null
+
+#define READ_HANDLER_CONTINUATION_DECL(C, F) \
+ CONTINUATION_DECL(C, F, char *, int)
+
+#define READ_BPTR_HANDLER_CONTINUATION_DECL(C, F) \
+ CtRxNode<C> F##_cont { (&C::F) }; // declares F_cont, a CtRxNode bound to the member function C::F
+
+#define WRITE_HANDLER_CONTINUATION_DECL(C, F) CONTINUATION_DECL(C, F, int)
+
+//////////////////////////////////////////////////////////////////////
+
+class AsyncMessenger;
+
+// Abstract interface of a wire protocol attached to one AsyncConnection.
+class Protocol {
+public:
+  const int proto_type;
+
+protected:
+  AsyncConnection *connection;
+  AsyncMessenger *messenger;
+  CephContext *cct;
+
+public:
+  std::shared_ptr<AuthConnectionMeta> auth_meta;
+
+  Protocol(int type, AsyncConnection *connection);
+  virtual ~Protocol();
+
+  // prepare protocol for connecting to peer
+  virtual void connect() = 0;
+  // prepare protocol for accepting peer connections
+  virtual void accept() = 0;
+  // true -> protocol is ready for sending messages
+  virtual bool is_connected() = 0;
+  // stop connection
+  virtual void stop() = 0;
+  // signal and handle connection failure
+  virtual void fault() = 0;
+  // send message
+  virtual void send_message(Message *m) = 0;
+  // send keepalive
+  virtual void send_keepalive() = 0;
+
+  virtual void read_event() = 0;
+  virtual void write_event() = 0;
+  virtual bool is_queued() = 0;
+
+  // Connection mode recorded in the authentication metadata.
+  int get_con_mode() const { return auth_meta->con_mode; }
+};
+
+#endif /* _MSG_ASYNC_PROTOCOL_ */
diff --git a/src/msg/async/ProtocolV1.cc b/src/msg/async/ProtocolV1.cc
new file mode 100644
index 00000000..9a7ab9d4
--- /dev/null
+++ b/src/msg/async/ProtocolV1.cc
@@ -0,0 +1,2547 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ProtocolV1.h"
+
+#include "common/errno.h"
+
+#include "AsyncConnection.h"
+#include "AsyncMessenger.h"
+#include "common/EventTrace.h"
+#include "include/random.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix _conn_prefix(_dout)
+// Writes the per-connection log prefix (local/peer addresses, connection and
+// protocol pointers, port, state name, sequence numbers, lossy flag) to *_dout
+// and returns the stream so the caller can keep appending.
+ostream &ProtocolV1::_conn_prefix(std::ostream *_dout) {
+  std::ostream &out = *_dout;
+  out << "--1- " << messenger->get_myaddrs() << " >> "
+      << *connection->peer_addrs;
+  out << " conn(" << connection << " " << this;
+  out << " :" << connection->port << " s=" << get_state_name(state);
+  out << " pgs=" << peer_global_seq << " cs=" << connect_seq;
+  out << " l=" << connection->policy.lossy << ").";
+  return out;
+}
+/* Shorthands: start a write of B, or a read of L bytes (optionally into
+ * buffer B), and continue with handler C's continuation when it finishes. */
+#define WRITE(B, C) write(CONTINUATION(C), B)
+// READ passes no buffer; read() then falls back to temp_buffer
+#define READ(L, C) read(CONTINUATION(C), L)
+
+#define READB(L, B, C) read(CONTINUATION(C), L, B)
+
+// Constant to limit starting sequence number to 2^31. Nothing special about
+// it, just a big number. PLR
+#define SEQ_MASK 0x7fffffff
+// write_message(): multi-segment payloads of at most this many bytes are copied into outgoing_bl instead of claimed
+const int ASYNC_COALESCE_THRESHOLD = 256;
+
+using namespace std;
+
+// Appends to `data` a freshly allocated page-aligned buffer for `len` bytes of
+// payload, laid out to match the data alignment implied by `off`: when `off`
+// is not page aligned, one extra page is reserved and the buffer offset is
+// moved so that the leading partial-page chunk ends on a page boundary.
+static void alloc_aligned_buffer(bufferlist &data, unsigned len, unsigned off) {
+  const auto misalign = off & ~CEPH_PAGE_MASK;
+  unsigned head = 0;       // bytes that land before the first page boundary
+  unsigned alloc_len = len;
+  if (misalign) {
+    head = std::min<uint64_t>(CEPH_PAGE_SIZE - misalign, len);
+    // one full page for the head chunk plus whatever follows it
+    alloc_len = CEPH_PAGE_SIZE + (len - head);
+  }
+
+  bufferptr ptr(buffer::create_small_page_aligned(alloc_len));
+  if (head) {
+    ptr.set_offset(CEPH_PAGE_SIZE - head);
+  }
+  data.push_back(std::move(ptr));
+}
+
+/**
+ * Protocol V1
+ **/
+
+// Builds a V1 protocol instance (proto type 1) in state NONE with writes
+// disabled, and allocates the 4096-byte scratch buffer that read() uses when
+// the caller supplies no destination.
+ProtocolV1::ProtocolV1(AsyncConnection *connection)
+    : Protocol(1, connection),
+      temp_buffer(new char[4096]),
+      can_write(WriteStatus::NOWRITE),
+      keepalive(false),
+      connect_seq(0),
+      peer_global_seq(0),
+      msg_left(0),
+      cur_msg_size(0),
+      replacing(false),
+      is_reset_from_peer(false),
+      once_ready(false),
+      state(NONE),
+      global_seq(0),
+      authorizer(nullptr),
+      wait_for_seq(false) {}
+
+// Releases the scratch buffer and the authorizer. Both the outgoing queue and
+// the sent (awaiting-ack) list must already be empty.
+ProtocolV1::~ProtocolV1() {
+  ceph_assert(out_q.empty());
+  ceph_assert(sent.empty());
+
+  delete[] temp_buffer;
+  delete authorizer;  // deleting a null pointer is a no-op
+}
+
+// Puts the protocol into START_CONNECT and clears everything left over from a
+// previous connect attempt (authorizer, its buffer, the connect/reply
+// structs), then picks up a fresh global sequence number from the messenger.
+void ProtocolV1::connect() {
+  state = START_CONNECT;
+
+  delete authorizer;  // no-op when there is none
+  authorizer = nullptr;
+  authorizer_buf.clear();
+
+  // FIPS zeroization audit 20191115: these memsets are not security related.
+  memset(&connect_reply, 0, sizeof(connect_reply));
+  memset(&connect_msg, 0, sizeof(connect_msg));
+
+  global_seq = messenger->get_global_seq();
+}
+
+// Prepare for an incoming connection: only the state changes here.
+void ProtocolV1::accept() {
+  state = START_ACCEPT;
+}
+
+// True exactly when the write status is CANWRITE.
+bool ProtocolV1::is_connected() {
+  const bool writable = (can_write.load() == WriteStatus::CANWRITE);
+  return writable;
+}
+
+// Shut the connection down. Does nothing when already CLOSED; otherwise
+// flushes the delay queue (if any) and, under write_lock, resets the receive
+// state, discards queued output, stops the underlying connection and marks
+// both the write status and the state as closed.
+void ProtocolV1::stop() {
+  ldout(cct, 20) << __func__ << dendl;
+  if (state != CLOSED) {
+    if (connection->delay_state) {
+      connection->delay_state->flush();
+    }
+
+    ldout(cct, 2) << __func__ << dendl;
+    std::lock_guard<std::mutex> guard(connection->write_lock);
+
+    reset_recv_state();
+    discard_out_queue();
+    connection->_stop();
+
+    // flip to closed last, still holding write_lock
+    can_write = WriteStatus::CLOSED;
+    state = CLOSED;
+  }
+}
+/**
+ * Handle a failure on this connection.
+ *
+ * Nothing happens when the state is already CLOSED or NONE. A lossy
+ * connection (unless it is still connecting), or a half-accepted connection
+ * with nothing to send, is stopped and a reset is queued on the dispatch
+ * queue. Otherwise sent-but-unacked messages are requeued, the receive state
+ * is reset, and the protocol either parks in STANDBY or arranges a reconnect:
+ * from a timer with backoff when the failure happened while connecting or in
+ * WAIT, immediately otherwise.
+ *
+ * write_event() calls this with connection->lock held.
+ */
+void ProtocolV1::fault() {
+ ldout(cct, 20) << __func__ << dendl;
+
+ if (state == CLOSED || state == NONE) {
+ ldout(cct, 10) << __func__ << " connection is already closed" << dendl;
+ return;
+ }
+
+ if (connection->policy.lossy && state != START_CONNECT &&
+ state != CONNECTING) {
+ ldout(cct, 1) << __func__ << " on lossy channel, failing" << dendl;
+ stop();
+ connection->dispatch_queue->queue_reset(connection);
+ return;
+ }
+ // write_lock is taken manually: every exit path below unlocks it itself
+ connection->write_lock.lock();
+ can_write = WriteStatus::NOWRITE;
+ is_reset_from_peer = false;
+
+ // requeue sent items
+ requeue_sent();
+
+ if (!once_ready && out_q.empty() && state >= START_ACCEPT &&
+ state <= ACCEPTING_WAIT_CONNECT_MSG_AUTH && !replacing) {
+ ldout(cct, 10) << __func__ << " with nothing to send and in the half "
+ << " accept state just closed" << dendl;
+ connection->write_lock.unlock();  // stop() acquires write_lock itself
+ stop();
+ connection->dispatch_queue->queue_reset(connection);
+ return;
+ }
+ replacing = false;
+
+ connection->fault();
+
+ reset_recv_state();
+
+ if (connection->policy.standby && out_q.empty() && !keepalive &&
+ state != WAIT) {
+ ldout(cct, 10) << __func__ << " with nothing to send, going to standby"
+ << dendl;
+ state = STANDBY;
+ connection->write_lock.unlock();
+ return;
+ }
+
+ connection->write_lock.unlock();
+ // failed while still connecting (or in WAIT): retry later from a timer
+ if ((state >= START_CONNECT && state <= CONNECTING_SEND_CONNECT_MSG) ||
+ state == WAIT) {
+ // backoff!
+ if (state == WAIT) {
+ backoff.set_from_double(cct->_conf->ms_max_backoff);
+ } else if (backoff == utime_t()) {
+ backoff.set_from_double(cct->_conf->ms_initial_backoff);
+ } else {
+ backoff += backoff;  // double the previous delay, capped just below
+ if (backoff > cct->_conf->ms_max_backoff)
+ backoff.set_from_double(cct->_conf->ms_max_backoff);
+ }
+
+ global_seq = messenger->get_global_seq();
+ state = START_CONNECT;
+ connection->state = AsyncConnection::STATE_CONNECTING;
+ ldout(cct, 10) << __func__ << " waiting " << backoff << dendl;
+ // woke up again;
+ connection->register_time_events.insert(
+ connection->center->create_time_event(backoff.to_nsec() / 1000,  // nanoseconds -> microseconds
+ connection->wakeup_handler));
+ } else {
+ // policy maybe empty when state is in accept
+ if (connection->policy.server) {
+ ldout(cct, 0) << __func__ << " server, going to standby" << dendl;
+ state = STANDBY;
+ } else {
+ ldout(cct, 0) << __func__ << " initiating reconnect" << dendl;
+ connect_seq++;
+ global_seq = messenger->get_global_seq();
+ state = START_CONNECT;
+ connection->state = AsyncConnection::STATE_CONNECTING;
+ }
+ backoff = utime_t();  // failure happened after connecting: start the backoff over
+ connection->center->dispatch_event_external(connection->read_handler);
+ }
+}
+
+// Queue message `m` for sending. Messages that can be fast dispatched are
+// encoded up front, outside write_lock; that encoding is thrown away again if
+// writes are currently not allowed or the connection features changed in the
+// meantime. When the connection is closed the message reference is dropped,
+// otherwise the message is appended to its priority queue and the write
+// handler is scheduled unless a write is already pending or a replace is in
+// progress.
+void ProtocolV1::send_message(Message *m) {
+  const uint64_t features_seen = connection->get_features();
+  bufferlist encoded;
+
+  // TODO: Currently not all messages supports reencode like MOSDMap, so here
+  // only let fast dispatch support messages prepare message
+  const bool pre_encode = messenger->ms_can_fast_dispatch(m);
+  if (pre_encode) {
+    prepare_send_message(features_seen, m, encoded);
+  }
+
+  std::lock_guard<std::mutex> guard(connection->write_lock);
+
+  // a change of "features" changes the payload encoding, so re-encode later
+  if (pre_encode && (can_write == WriteStatus::NOWRITE ||
+                     connection->get_features() != features_seen)) {
+    encoded.clear();
+    m->clear_payload();
+    ldout(cct, 5) << __func__ << " clear encoded buffer previous " << features_seen
+                  << " != " << connection->get_features() << dendl;
+  }
+
+  if (can_write == WriteStatus::CLOSED) {
+    ldout(cct, 10) << __func__ << " connection closed."
+                   << " Drop message " << m << dendl;
+    m->put();
+    return;
+  }
+
+  m->trace.event("async enqueueing message");
+  out_q[m->get_priority()].emplace_back(std::move(encoded), m);
+  ldout(cct, 15) << __func__ << " inline write is denied, reschedule m=" << m
+                 << dendl;
+  const bool need_kick =
+      can_write != WriteStatus::REPLACING && !write_in_progress;
+  if (need_kick) {
+    write_in_progress = true;
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+}
+
+// Encode `m` for the given feature set and append its payload, middle and
+// data sections, in that order, to `bl`.
+void ProtocolV1::prepare_send_message(uint64_t features, Message *m,
+                                      bufferlist &bl) {
+  ldout(cct, 20) << __func__ << " m " << *m << dendl;
+
+  // associate message with Connection (for benefit of encode_payload)
+  const bool nothing_encoded_yet = m->empty_payload();
+  const char *what = nothing_encoded_yet ? " encoding features "
+                                         : " half-reencoding features ";
+  ldout(cct, 20) << __func__ << what << features << " " << m << " " << *m
+                 << dendl;
+
+  // encode, then copy the three sections out of *m
+  m->encode(features, messenger->crcflags);
+  for (const bufferlist *section :
+       {&m->get_payload(), &m->get_middle(), &m->get_data()}) {
+    bl.append(*section);
+  }
+}
+
+// Request a keepalive: unless the connection is closed, raise the keepalive
+// flag and schedule the write handler, which emits it.
+void ProtocolV1::send_keepalive() {
+  ldout(cct, 10) << __func__ << dendl;
+  std::lock_guard<std::mutex> guard(connection->write_lock);
+  if (can_write == WriteStatus::CLOSED) {
+    return;
+  }
+  keepalive = true;
+  connection->center->dispatch_event_external(connection->write_handler);
+}
+
+// Read-side event entry point: resume the continuation that belongs to the
+// current state. States not listed here are ignored.
+void ProtocolV1::read_event() {
+  ldout(cct, 20) << __func__ << dendl;
+  if (state == START_CONNECT) {
+    CONTINUATION_RUN(CONTINUATION(send_client_banner));
+  } else if (state == START_ACCEPT) {
+    CONTINUATION_RUN(CONTINUATION(send_server_banner));
+  } else if (state == OPENED) {
+    CONTINUATION_RUN(CONTINUATION(wait_message));
+  } else if (state == THROTTLE_MESSAGE) {
+    CONTINUATION_RUN(CONTINUATION(throttle_message));
+  } else if (state == THROTTLE_BYTES) {
+    CONTINUATION_RUN(CONTINUATION(throttle_bytes));
+  } else if (state == THROTTLE_DISPATCH_QUEUE) {
+    CONTINUATION_RUN(CONTINUATION(throttle_dispatch_queue));
+  }
+}
+/**
+ * Write-side event handler.
+ *
+ * While writes are allowed (CANWRITE): emits a pending keepalive, then drains
+ * the outgoing queue. Each message is encoded if necessary and handed to
+ * write_message() with write_lock released. Afterwards pending acks or any
+ * remaining buffered bytes are pushed out with _try_send().
+ *
+ * While writes are not allowed: a non-server connection in STANDBY with
+ * queued data starts reconnecting; otherwise already buffered bytes are
+ * flushed if a socket exists. A negative send result ends in fault().
+ */
+void ProtocolV1::write_event() {
+ ldout(cct, 10) << __func__ << dendl;
+ ssize_t r = 0;
+
+ connection->write_lock.lock();
+ if (can_write == WriteStatus::CANWRITE) {
+ if (keepalive) {
+ append_keepalive_or_ack();
+ keepalive = false;
+ }
+
+ auto start = ceph::mono_clock::now();
+ bool more;
+ do {
+ bufferlist data;
+ Message *m = _get_next_outgoing(&data);
+ if (!m) {
+ break;
+ }
+
+ if (!connection->policy.lossy) {
+ // put on sent list
+ sent.push_back(m);
+ m->get();
+ }
+ more = !out_q.empty();
+ connection->write_lock.unlock();  // encode and write without holding write_lock
+
+ // send_message or requeue messages may not encode message
+ if (!data.length()) {
+ prepare_send_message(connection->get_features(), m, data);
+ }
+
+ r = write_message(m, data, more);
+
+ connection->write_lock.lock();
+ if (r == 0) {
+ ;  // nothing left over from this message: try the next one
+ } else if (r < 0) {
+ ldout(cct, 1) << __func__ << " send msg failed" << dendl;
+ break;
+ } else if (r > 0)
+ break;
+ } while (can_write == WriteStatus::CANWRITE);
+ write_in_progress = false;
+ connection->write_lock.unlock();
+
+ // r > 0 means data is still left to send, so there is no need to call _try_send.
+ if (r == 0) {
+ uint64_t left = ack_left;
+ if (left) {
+ ceph_le64 s;
+ s = in_seq;
+ connection->outgoing_bl.append(CEPH_MSGR_TAG_ACK);
+ connection->outgoing_bl.append((char *)&s, sizeof(s));
+ ldout(cct, 10) << __func__ << " try send msg ack, acked " << left
+ << " messages" << dendl;
+ ack_left -= left;  // subtract only the snapshot taken above -- NOTE(review): presumably because the read path may bump ack_left concurrently; confirm
+ left = ack_left;
+ r = connection->_try_send(left);
+ } else if (is_queued()) {
+ r = connection->_try_send();
+ }
+ }
+
+ connection->logger->tinc(l_msgr_running_send_time,
+ ceph::mono_clock::now() - start);
+ if (r < 0) {
+ ldout(cct, 1) << __func__ << " send msg failed" << dendl;
+ connection->lock.lock();
+ fault();
+ connection->lock.unlock();
+ return;
+ }
+ } else {
+ write_in_progress = false;
+ connection->write_lock.unlock();
+ connection->lock.lock();  // re-acquire in the order lock -> write_lock
+ connection->write_lock.lock();
+ if (state == STANDBY && !connection->policy.server && is_queued()) {
+ ldout(cct, 10) << __func__ << " policy.server is false" << dendl;
+ connection->_connect();
+ } else if (connection->cs && state != NONE && state != CLOSED &&
+ state != START_CONNECT) {
+ r = connection->_try_send();
+ if (r < 0) {
+ ldout(cct, 1) << __func__ << " send outcoming bl failed" << dendl;
+ connection->write_lock.unlock();  // fault() takes write_lock itself
+ fault();
+ connection->lock.unlock();
+ return;
+ }
+ }
+ connection->write_lock.unlock();
+ connection->lock.unlock();
+ }
+}
+
+// True when messages are waiting in out_q or the connection itself reports
+// queued data.
+bool ProtocolV1::is_queued() {
+  if (!out_q.empty()) {
+    return true;
+  }
+  return connection->is_queued();
+}
+
+// Run the continuation chain starting at `pcontinuation`; a null pointer is
+// ignored.
+void ProtocolV1::run_continuation(CtPtr pcontinuation) {
+  if (!pcontinuation) {
+    return;
+  }
+  CONTINUATION_RUN(*pcontinuation);
+}
+
+// Start reading `len` bytes into `buffer` (temp_buffer when none is given).
+// If connection->read() returns a value <= 0, `next` is primed with
+// (buffer, result) and returned so the caller runs it right away. Otherwise
+// nullptr is returned and the callback handed to connection->read() primes
+// and runs `next` whenever the connection invokes it.
+CtPtr ProtocolV1::read(CONTINUATION_RX_TYPE<ProtocolV1> &next,
+                       int len, char *buffer) {
+  if (buffer == nullptr) {
+    buffer = temp_buffer;
+  }
+
+  const ssize_t rc = connection->read(len, buffer,
+                                      [&next, this](char *buf, int res) {
+                                        next.setParams(buf, res);
+                                        CONTINUATION_RUN(next);
+                                      });
+  if (rc > 0) {
+    return nullptr;
+  }
+
+  next.setParams(buffer, rc);
+  return &next;
+}
+
+// Start writing `buffer`. If connection->write() returns a value <= 0, `next`
+// is primed with that result and returned for immediate execution. Otherwise
+// nullptr is returned and the callback handed to connection->write() primes
+// and runs `next` whenever the connection invokes it.
+CtPtr ProtocolV1::write(CONTINUATION_TX_TYPE<ProtocolV1> &next,
+                        bufferlist &buffer) {
+  const ssize_t rc = connection->write(buffer, [&next, this](int res) {
+    next.setParams(res);
+    CONTINUATION_RUN(next);
+  });
+  if (rc > 0) {
+    return nullptr;
+  }
+
+  next.setParams(rc);
+  return &next;
+}
+
+// Enter the OPENED state: re-arm the inactivity tick timer, allow writes (and
+// schedule the write handler if output is already queued), start the delay
+// thread if needed, then begin waiting for the first message tag.
+CtPtr ProtocolV1::ready() {
+  ldout(cct, 25) << __func__ << dendl;
+
+  // replace any tick timer that is still pending
+  if (connection->last_tick_id) {
+    connection->center->delete_time_event(connection->last_tick_id);
+  }
+  connection->last_tick_id = connection->center->create_time_event(
+      connection->inactive_timeout_us, connection->tick_handler);
+
+  {
+    std::lock_guard<std::mutex> guard(connection->write_lock);
+    can_write = WriteStatus::CANWRITE;
+    if (is_queued()) {
+      connection->center->dispatch_event_external(connection->write_handler);
+    }
+  }
+  connection->maybe_start_delay_thread();
+
+  state = OPENED;
+  return wait_message();
+}
+
+// Wait for the next one-byte tag; handle_message() receives it. Returns
+// nullptr without reading when the state is no longer OPENED.
+CtPtr ProtocolV1::wait_message() {
+  if (state == OPENED) {
+    ldout(cct, 20) << __func__ << dendl;
+    return READ(sizeof(char), handle_message);
+  }
+  // must have changed due to a replace
+  return nullptr;
+}
+
+// Completion handler for the tag read: dispatch on the tag byte. Tags that
+// carry a body start the read of that body; KEEPALIVE and CLOSE are handled
+// on the spot and end the chain; an unknown tag is treated as a fault.
+CtPtr ProtocolV1::handle_message(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read tag failed" << dendl;
+    return _fault();
+  }
+
+  const char tag = buffer[0];
+  ldout(cct, 20) << __func__ << " process tag " << (int)tag << dendl;
+
+  switch (tag) {
+    case CEPH_MSGR_TAG_KEEPALIVE:
+      ldout(cct, 20) << __func__ << " got KEEPALIVE" << dendl;
+      connection->set_last_keepalive(ceph_clock_now());
+      return nullptr;
+
+    case CEPH_MSGR_TAG_KEEPALIVE2:
+      return READ(sizeof(ceph_timespec), handle_keepalive2);
+
+    case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
+      return READ(sizeof(ceph_timespec), handle_keepalive2_ack);
+
+    case CEPH_MSGR_TAG_ACK:
+      return READ(sizeof(ceph_le64), handle_tag_ack);
+
+    case CEPH_MSGR_TAG_MSG:
+#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
+      ltt_recv_stamp = ceph_clock_now();
+#endif
+      recv_stamp = ceph_clock_now();
+      ldout(cct, 20) << __func__ << " begin MSG" << dendl;
+      return READ(sizeof(ceph_msg_header), handle_message_header);
+
+    case CEPH_MSGR_TAG_CLOSE:
+      ldout(cct, 20) << __func__ << " got CLOSE" << dendl;
+      stop();
+      return nullptr;
+
+    default:
+      ldout(cct, 0) << __func__ << " bad tag " << (int)tag << dendl;
+      return _fault();
+  }
+}
+
+// Completion handler for a KEEPALIVE2 body: queue a KEEPALIVE2_ACK echoing the
+// received timestamp, record the keepalive time, schedule the writer when the
+// connection is writable, and go back to waiting for the next tag.
+CtPtr ProtocolV1::handle_keepalive2(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read keeplive timespec failed" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 30) << __func__ << " got KEEPALIVE2 tag ..." << dendl;
+
+  utime_t kp_t = utime_t(*(ceph_timespec *)buffer);
+  {
+    // outgoing_bl is modified under write_lock
+    std::lock_guard<std::mutex> guard(connection->write_lock);
+    append_keepalive_or_ack(true, &kp_t);
+  }
+
+  ldout(cct, 20) << __func__ << " got KEEPALIVE2 " << kp_t << dendl;
+  connection->set_last_keepalive(ceph_clock_now());
+
+  if (is_connected()) {
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+
+  return CONTINUE(wait_message);
+}
+
+// Append one keepalive-related frame to the connection's outgoing buffer:
+//  - ack == true: KEEPALIVE2_ACK carrying *tp (tp must not be null);
+//  - peer supports KEEPALIVE2: KEEPALIVE2 carrying the current time;
+//  - otherwise: a bare KEEPALIVE tag.
+void ProtocolV1::append_keepalive_or_ack(bool ack, utime_t *tp) {
+  ldout(cct, 10) << __func__ << dendl;
+  auto &out = connection->outgoing_bl;
+
+  if (ack) {
+    ceph_assert(tp);
+    struct ceph_timespec stamp;
+    tp->encode_timeval(&stamp);
+    out.append(CEPH_MSGR_TAG_KEEPALIVE2_ACK);
+    out.append((char *)&stamp, sizeof(stamp));
+    return;
+  }
+
+  if (!connection->has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) {
+    out.append(CEPH_MSGR_TAG_KEEPALIVE);
+    return;
+  }
+
+  struct ceph_timespec stamp;
+  utime_t now = ceph_clock_now();
+  now.encode_timeval(&stamp);
+  out.append(CEPH_MSGR_TAG_KEEPALIVE2);
+  out.append((char *)&stamp, sizeof(stamp));
+}
+
+// Completion handler for a KEEPALIVE2_ACK body: record the echoed timestamp
+// as the last keepalive ack and resume waiting for the next tag.
+CtPtr ProtocolV1::handle_keepalive2_ack(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read keeplive timespec failed" << dendl;
+    return _fault();
+  }
+
+  const ceph_timespec *echoed = (ceph_timespec *)buffer;
+  connection->set_last_keepalive_ack(utime_t(*echoed));
+  ldout(cct, 20) << __func__ << " got KEEPALIVE_ACK" << dendl;
+
+  return CONTINUE(wait_message);
+}
+
+// Completion handler for an ACK body: drop from the front of the sent list
+// every message whose seq is <= the acked seq (at most 128 per call), then
+// release those message references after write_lock has been dropped.
+CtPtr ProtocolV1::handle_tag_ack(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read ack seq failed" << dendl;
+    return _fault();
+  }
+
+  ceph_le64 seq;
+  seq = *(ceph_le64 *)buffer;
+  ldout(cct, 20) << __func__ << " got ACK" << dendl;
+
+  ldout(cct, 15) << __func__ << " got ack seq " << seq << dendl;
+
+  // trim sent list
+  static const int max_pending = 128;
+  Message *acked[max_pending];
+  int n_acked = 0;
+  {
+    std::lock_guard<std::mutex> guard(connection->write_lock);
+    while (n_acked < max_pending && !sent.empty() &&
+           sent.front()->get_seq() <= seq) {
+      Message *m = sent.front();
+      sent.pop_front();
+      acked[n_acked] = m;
+      ++n_acked;
+      ldout(cct, 10) << __func__ << " got ack seq " << seq
+                     << " >= " << m->get_seq() << " on " << m << " " << *m
+                     << dendl;
+    }
+  }
+
+  // drop the references outside the lock
+  for (Message **p = acked; p != acked + n_acked; ++p) {
+    (*p)->put();
+  }
+
+  return CONTINUE(wait_message);
+}
+
+// Completion handler for the message header read: keep a copy of the header,
+// verify its crc when header crcs are enabled, clear the per-message buffers
+// and move on to the throttling stages.
+CtPtr ProtocolV1::handle_message_header(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read message header failed" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " got MSG header" << dendl;
+
+  current_header = *((ceph_msg_header *)buffer);
+
+  ldout(cct, 20) << __func__ << " got envelope type=" << current_header.type << " src "
+                 << entity_name_t(current_header.src) << " front=" << current_header.front_len
+                 << " data=" << current_header.data_len << " off " << current_header.data_off
+                 << dendl;
+
+  const bool verify_crc = (messenger->crcflags & MSG_CRC_HEADER) != 0;
+  if (verify_crc) {
+    // the crc covers everything in the header except the crc field itself
+    const __u32 header_crc =
+        ceph_crc32c(0, (unsigned char *)&current_header,
+                    sizeof(current_header) - sizeof(current_header.crc));
+    if (header_crc != current_header.crc) {
+      ldout(cct, 0) << __func__ << " got bad header crc " << header_crc
+                    << " != " << current_header.crc << dendl;
+      return _fault();
+    }
+  }
+
+  // start the new message with empty buffers
+  data_buf.clear();
+  front.clear();
+  middle.clear();
+  data.clear();
+
+  state = THROTTLE_MESSAGE;
+  return CONTINUE(throttle_message);
+}
+
+// Throttle stage 1: take one slot from the policy's message throttler, if the
+// policy has one. When no slot is available a 1 ms wakeup timer is armed
+// (unless a timer is already registered) and the chain stops; read_event()
+// re-enters this stage because the state stays THROTTLE_MESSAGE.
+CtPtr ProtocolV1::throttle_message() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  auto &msg_throttle = connection->policy.throttler_messages;
+  if (msg_throttle) {
+    ldout(cct, 10) << __func__ << " wants " << 1
+                   << " message from policy throttler "
+                   << msg_throttle->get_current()
+                   << "/" << msg_throttle->get_max()
+                   << dendl;
+    const bool granted = msg_throttle->get_or_fail();
+    if (!granted) {
+      ldout(cct, 10) << __func__ << " wants 1 message from policy throttle "
+                     << msg_throttle->get_current()
+                     << "/" << msg_throttle->get_max()
+                     << " failed, just wait." << dendl;
+      // the thread pool will not drain a full message queue quickly, so
+      // waiting a millisecond is fine.
+      auto &timers = connection->register_time_events;
+      if (timers.empty()) {
+        timers.insert(connection->center->create_time_event(
+            1000, connection->wakeup_handler));
+      }
+      return nullptr;
+    }
+  }
+
+  state = THROTTLE_BYTES;
+  return CONTINUE(throttle_bytes);
+}
+
+// Throttle stage 2: compute the message size (front + middle + data) and, if
+// it is non-zero and the policy has a byte throttler, reserve that many bytes.
+// When the reservation fails a 1 ms wakeup timer is armed (unless a timer is
+// already registered) and the chain stops in state THROTTLE_BYTES.
+CtPtr ProtocolV1::throttle_bytes() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  cur_msg_size = current_header.front_len + current_header.middle_len +
+                 current_header.data_len;
+
+  auto &byte_throttle = connection->policy.throttler_bytes;
+  if (cur_msg_size && byte_throttle) {
+    ldout(cct, 10) << __func__ << " wants " << cur_msg_size
+                   << " bytes from policy throttler "
+                   << byte_throttle->get_current() << "/"
+                   << byte_throttle->get_max() << dendl;
+    const bool granted = byte_throttle->get_or_fail(cur_msg_size);
+    if (!granted) {
+      ldout(cct, 10) << __func__ << " wants " << cur_msg_size
+                     << " bytes from policy throttler "
+                     << byte_throttle->get_current()
+                     << "/" << byte_throttle->get_max()
+                     << " failed, just wait." << dendl;
+      // the thread pool will not drain a full message queue quickly, so
+      // waiting a millisecond is fine.
+      auto &timers = connection->register_time_events;
+      if (timers.empty()) {
+        timers.insert(connection->center->create_time_event(
+            1000, connection->wakeup_handler));
+      }
+      return nullptr;
+    }
+  }
+
+  state = THROTTLE_DISPATCH_QUEUE;
+  return CONTINUE(throttle_dispatch_queue);
+}
+
+// Throttle stage 3: reserve cur_msg_size bytes from the dispatch queue's
+// throttler (skipped for empty messages). On failure a 1 ms wakeup timer is
+// armed (unless a timer is already registered) and the chain stops; on
+// success the throttle timestamp is taken and reading of the front section
+// begins.
+CtPtr ProtocolV1::throttle_dispatch_queue() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  if (cur_msg_size) {
+    auto &dq_throttle = connection->dispatch_queue->dispatch_throttler;
+    const bool granted = dq_throttle.get_or_fail(cur_msg_size);
+    if (!granted) {
+      ldout(cct, 10)
+          << __func__ << " wants " << cur_msg_size
+          << " bytes from dispatch throttle "
+          << dq_throttle.get_current() << "/"
+          << dq_throttle.get_max()
+          << " failed, just wait." << dendl;
+      // the thread pool will not drain a full message queue quickly, so
+      // waiting a millisecond is fine.
+      auto &timers = connection->register_time_events;
+      if (timers.empty()) {
+        timers.insert(connection->center->create_time_event(
+            1000, connection->wakeup_handler));
+      }
+      return nullptr;
+    }
+  }
+
+  throttle_stamp = ceph_clock_now();
+
+  state = READ_MESSAGE_FRONT;
+  return read_message_front();
+}
+
+// Read the front section into `front` (allocated here on first use), or skip
+// straight to the middle section when the header says there is no front.
+CtPtr ProtocolV1::read_message_front() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const unsigned want = current_header.front_len;
+  if (want == 0) {
+    return read_message_middle();
+  }
+
+  if (!front.length()) {
+    front.push_back(buffer::create(want));
+  }
+  return READB(want, front.c_str(), handle_message_front);
+}
+
+CtPtr ProtocolV1::handle_message_front(char *buffer, int r) {
+ ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+ if (r < 0) {
+ ldout(cct, 1) << __func__ << " read message front failed" << dendl;
+ return _fault();
+ }
+
+ ldout(cct, 20) << __func__ << " got front " << front.length() << dendl;
+
+ return read_message_middle();
+}
+
+// Read the middle section into `middle` (allocated here on first use), or go
+// on to prepare the data section when the header says there is no middle.
+CtPtr ProtocolV1::read_message_middle() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  if (!current_header.middle_len) {
+    return read_message_data_prepare();
+  }
+
+  if (!middle.length()) {
+    middle.push_back(buffer::create(current_header.middle_len));
+  }
+  return READB(current_header.middle_len, middle.c_str(),
+               handle_message_middle);
+}
+
+CtPtr ProtocolV1::handle_message_middle(char *buffer, int r) {
+ ldout(cct, 20) << __func__ << " r" << r << dendl;
+
+ if (r < 0) {
+ ldout(cct, 1) << __func__ << " read message middle failed" << dendl;
+ return _fault();
+ }
+
+ ldout(cct, 20) << __func__ << " got middle " << middle.length() << dendl;
+
+ return read_message_data_prepare();
+}
+/**
+ * Prepare reading the data section: when the header announces data, allocate
+ * a receive buffer via alloc_aligned_buffer() and point data_blp at its
+ * start; then record the number of bytes still to read and continue with
+ * read_message_data(). The `#if 0` branch (rx_buffers) is compiled out.
+ */
+CtPtr ProtocolV1::read_message_data_prepare() {
+ ldout(cct, 20) << __func__ << dendl;
+
+ unsigned data_len = le32_to_cpu(current_header.data_len);
+ unsigned data_off = le32_to_cpu(current_header.data_off);
+
+ if (data_len) {
+ // get a buffer
+#if 0
+ // rx_buffers is broken by design... see
+ // http://tracker.ceph.com/issues/22480
+ map<ceph_tid_t, pair<bufferlist, int> >::iterator p =
+ connection->rx_buffers.find(current_header.tid);
+ if (p != connection->rx_buffers.end()) {
+ ldout(cct, 10) << __func__ << " seleting rx buffer v " << p->second.second
+ << " at offset " << data_off << " len "
+ << p->second.first.length() << dendl;
+ data_buf = p->second.first;
+ // make sure it's big enough
+ if (data_buf.length() < data_len)
+ data_buf.push_back(buffer::create(data_len - data_buf.length()));
+ data_blp = data_buf.begin();
+ } else {
+ ldout(cct, 20) << __func__ << " allocating new rx buffer at offset "
+ << data_off << dendl;
+ alloc_aligned_buffer(data_buf, data_len, data_off);
+ data_blp = data_buf.begin();
+ }
+#else
+ ldout(cct, 20) << __func__ << " allocating new rx buffer at offset "
+ << data_off << dendl;
+ alloc_aligned_buffer(data_buf, data_len, data_off);
+ data_blp = data_buf.begin();
+#endif
+ }
+
+ msg_left = data_len;  // counted down by handle_message_data() as chunks arrive
+
+ return CONTINUE(read_message_data);
+}
+
+// Read the next chunk of the data section into the buffer segment data_blp
+// currently points at, or move on to the footer once no data bytes remain.
+CtPtr ProtocolV1::read_message_data() {
+  ldout(cct, 20) << __func__ << " msg_left=" << msg_left << dendl;
+
+  if (msg_left == 0) {
+    return read_message_footer();
+  }
+
+  bufferptr segment = data_blp.get_current_ptr();
+  // never read past the current segment or past the end of the message
+  const unsigned chunk = std::min(segment.length(), msg_left);
+  return READB(chunk, segment.c_str(), handle_message_data);
+}
+
+// Completion handler for one data chunk: append the chunk that was just
+// filled to `data`, advance data_blp past it, reduce the remaining byte count
+// and loop back to read_message_data().
+CtPtr ProtocolV1::handle_message_data(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read data error " << dendl;
+    return _fault();
+  }
+
+  bufferptr segment = data_blp.get_current_ptr();
+  // same length that read_message_data() requested for this segment
+  const unsigned chunk = std::min(segment.length(), msg_left);
+  ceph_assert(chunk < std::numeric_limits<int>::max());
+
+  data_blp.advance(chunk);
+  data.append(segment, 0, chunk);
+  msg_left -= chunk;
+
+  return CONTINUE(read_message_data);
+}
+
+// Read the message footer. Peers with the MSG_AUTH feature send
+// ceph_msg_footer, all others send the older ceph_msg_footer_old layout.
+CtPtr ProtocolV1::read_message_footer() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  state = READ_FOOTER_AND_DISPATCH;
+
+  const bool new_layout = connection->has_feature(CEPH_FEATURE_MSG_AUTH);
+  const unsigned len =
+      new_layout ? sizeof(ceph_msg_footer) : sizeof(ceph_msg_footer_old);
+
+  return READ(len, handle_message_footer);
+}
+/**
+ * Completion handler for the footer read; finishes one incoming message.
+ *
+ * Decodes the message from the collected header/front/middle/data, checks
+ * its signature when session security is set, attaches throttlers and
+ * timestamps, drops it if its seq is not newer than in_seq, and otherwise
+ * hands it to the delay queue, fast dispatch or the dispatch queue. For
+ * non-lossy connections an ack is counted and the writer is scheduled.
+ * Finally resumes waiting for the next tag.
+ *
+ * connection->lock is released and re-taken around fast dispatch, so it is
+ * expected to be held on entry.
+ */
+CtPtr ProtocolV1::handle_message_footer(char *buffer, int r) {
+ ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+ if (r < 0) {
+ ldout(cct, 1) << __func__ << " read footer data error " << dendl;
+ return _fault();
+ }
+
+ ceph_msg_footer footer;
+ ceph_msg_footer_old old_footer;
+ // without MSG_AUTH the peer sent the old layout, which has no sig: widen it
+ if (connection->has_feature(CEPH_FEATURE_MSG_AUTH)) {
+ footer = *((ceph_msg_footer *)buffer);
+ } else {
+ old_footer = *((ceph_msg_footer_old *)buffer);
+ footer.front_crc = old_footer.front_crc;
+ footer.middle_crc = old_footer.middle_crc;
+ footer.data_crc = old_footer.data_crc;
+ footer.sig = 0;
+ footer.flags = old_footer.flags;
+ }
+
+ int aborted = (footer.flags & CEPH_MSG_FOOTER_COMPLETE) == 0;
+ ldout(cct, 10) << __func__ << " aborted = " << aborted << dendl;
+ if (aborted) {
+ ldout(cct, 0) << __func__ << " got " << front.length() << " + "
+ << middle.length() << " + " << data.length()
+ << " byte message.. ABORTED" << dendl;
+ return _fault();
+ }
+
+ ldout(cct, 20) << __func__ << " got " << front.length() << " + "
+ << middle.length() << " + " << data.length() << " byte message"
+ << dendl;
+ Message *message = decode_message(cct, messenger->crcflags, current_header,
+ footer, front, middle, data, connection);
+ if (!message) {
+ ldout(cct, 1) << __func__ << " decode message failed " << dendl;
+ return _fault();
+ }
+
+ //
+ // Check the signature if one should be present. A zero return indicates
+ // success. PLR
+ //
+
+ if (session_security.get() == NULL) {
+ ldout(cct, 10) << __func__ << " no session security set" << dendl;
+ } else {
+ if (session_security->check_message_signature(message)) {
+ ldout(cct, 0) << __func__ << " Signature check failed" << dendl;
+ message->put();
+ return _fault();
+ }
+ }
+ message->set_byte_throttler(connection->policy.throttler_bytes);
+ message->set_message_throttler(connection->policy.throttler_messages);
+
+ // store reservation size in message, so we don't get confused
+ // by messages entering the dispatch queue through other paths.
+ message->set_dispatch_throttle_size(cur_msg_size);
+
+ message->set_recv_stamp(recv_stamp);
+ message->set_throttle_stamp(throttle_stamp);
+ message->set_recv_complete_stamp(ceph_clock_now());
+
+ // check received seq#. if it is old, drop the message.
+ // note that incoming messages may skip ahead. this is convenient for the
+ // client side queueing because messages can't be renumbered, but the (kernel)
+ // client will occasionally pull a message out of the sent queue to send
+ // elsewhere. in that case it doesn't matter if we "got" it or not.
+ uint64_t cur_seq = in_seq;
+ if (message->get_seq() <= cur_seq) {
+ ldout(cct, 0) << __func__ << " got old message " << message->get_seq()
+ << " <= " << cur_seq << " " << message << " " << *message
+ << ", discarding" << dendl;
+ message->put();
+ if (connection->has_feature(CEPH_FEATURE_RECONNECT_SEQ) &&
+ cct->_conf->ms_die_on_old_message) {
+ ceph_assert(0 == "old msgs despite reconnect_seq feature");
+ }
+ return nullptr;  // NOTE(review): state stays READ_FOOTER_AND_DISPATCH, which read_event() does not handle -- confirm that reading resumes after a dropped duplicate
+ }
+ if (message->get_seq() > cur_seq + 1) {
+ ldout(cct, 0) << __func__ << " missed message? skipped from seq "
+ << cur_seq << " to " << message->get_seq() << dendl;
+ if (cct->_conf->ms_die_on_skipped_message) {
+ ceph_assert(0 == "skipped incoming seq");
+ }
+ }
+
+#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
+ if (message->get_type() == CEPH_MSG_OSD_OP ||
+ message->get_type() == CEPH_MSG_OSD_OPREPLY) {
+ utime_t ltt_processed_stamp = ceph_clock_now();
+ double usecs_elapsed =
+ (ltt_processed_stamp.to_nsec() - ltt_recv_stamp.to_nsec()) / 1000;
+ ostringstream buf;
+ if (message->get_type() == CEPH_MSG_OSD_OP)
+ OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OP",
+ false);
+ else
+ OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OPREPLY",
+ false);
+ }
+#endif
+
+ // note last received message.
+ in_seq = message->get_seq();
+ ldout(cct, 5) << " rx " << message->get_source() << " seq "
+ << message->get_seq() << " " << message << " " << *message
+ << dendl;
+ // non-lossy connections ack what they receive: count it, wake the writer below
+ bool need_dispatch_writer = false;
+ if (!connection->policy.lossy) {
+ ack_left++;
+ need_dispatch_writer = true;
+ }
+
+ state = OPENED;
+
+ connection->logger->inc(l_msgr_recv_messages);
+ connection->logger->inc(
+ l_msgr_recv_bytes,
+ cur_msg_size + sizeof(ceph_msg_header) + sizeof(ceph_msg_footer));
+
+ messenger->ms_fast_preprocess(message);
+ auto fast_dispatch_time = ceph::mono_clock::now();
+ connection->logger->tinc(l_msgr_running_recv_time,
+ fast_dispatch_time - connection->recv_start_time);
+ if (connection->delay_state) {  // delay queue present: deliver through it, with an optional random delay
+ double delay_period = 0;
+ if (rand() % 10000 < cct->_conf->ms_inject_delay_probability * 10000.0) {
+ delay_period =
+ cct->_conf->ms_inject_delay_max * (double)(rand() % 10000) / 10000.0;
+ ldout(cct, 1) << "queue_received will delay after "
+ << (ceph_clock_now() + delay_period) << " on " << message
+ << " " << *message << dendl;
+ }
+ connection->delay_state->queue(delay_period, message);
+ } else if (messenger->ms_can_fast_dispatch(message)) {
+ connection->lock.unlock();  // fast dispatch runs without connection->lock
+ connection->dispatch_queue->fast_dispatch(message);
+ connection->recv_start_time = ceph::mono_clock::now();
+ connection->logger->tinc(l_msgr_running_fast_dispatch_time,
+ connection->recv_start_time - fast_dispatch_time);
+ connection->lock.lock();
+ } else {
+ connection->dispatch_queue->enqueue(message, message->get_priority(),
+ connection->conn_id);
+ }
+
+ // clean up local buffer references
+ data_buf.clear();
+ front.clear();
+ middle.clear();
+ data.clear();
+
+ if (need_dispatch_writer && connection->is_connected()) {
+ connection->center->dispatch_event_external(connection->write_handler);
+ }
+
+ return CONTINUE(wait_message);
+}
+
+// Reset the session: under write_lock, discard delayed, queued-for-dispatch
+// and outgoing messages, queue a remote-reset notification, pick a new
+// outgoing sequence start and clear the per-session counters and flags.
+void ProtocolV1::session_reset() {
+  ldout(cct, 10) << __func__ << " started" << dendl;
+
+  std::lock_guard<std::mutex> guard(connection->write_lock);
+
+  if (connection->delay_state) {
+    connection->delay_state->discard();
+  }
+  connection->dispatch_queue->discard_queue(connection->conn_id);
+  discard_out_queue();
+  // note: outgoing_bl should be cleared as well, but session_reset may be
+  // called from another thread, so that is left to the caller.
+
+  connection->dispatch_queue->queue_remote_reset(connection);
+
+  randomize_out_seq();
+
+  // back to a fresh session
+  in_seq = 0;
+  connect_seq = 0;
+  ack_left = 0;  // it's safe to directly set 0, double locked
+  once_ready = false;
+  can_write = WriteStatus::NOWRITE;
+}
+
+// Choose the starting outgoing sequence number: a random value in
+// [0, SEQ_MASK] when the peer has the MSG_AUTH feature (so the CRC is not
+// predictable), otherwise 0 as before.
+void ProtocolV1::randomize_out_seq() {
+  const bool peer_has_msg_auth =
+      (connection->get_features() & CEPH_FEATURE_MSG_AUTH) != 0;
+  if (!peer_has_msg_auth) {
+    // previously, seq #'s always started at 0.
+    out_seq = 0;
+    return;
+  }
+
+  auto start_seq = ceph::util::generate_random_number<uint64_t>(0, SEQ_MASK);
+  ldout(cct, 10) << __func__ << " randomize_out_seq " << start_seq << dendl;
+  out_seq = start_seq;
+}
+
+ // Frame message `m` (tag, header, payload, footer) onto
+ // connection->outgoing_bl and attempt to send it.
+ //
+ //   m    - message to send; it is stamped with the next out_seq and
+ //          m->put() is called on it before returning.
+ //   bl   - payload bytes; copied buffer by buffer when small and
+ //          fragmented, otherwise claimed into outgoing_bl.
+ //   more - forwarded unchanged to connection->_try_send().
+ //
+ // Returns whatever connection->_try_send() returned (negative on error).
+ // Must run on the connection's event-center thread (asserted).
+ ssize_t ProtocolV1::write_message(Message *m, bufferlist &bl, bool more) {
+   FUNCTRACE(cct);
+   ceph_assert(connection->center->in_thread());
+   m->set_seq(++out_seq);
+
+   if (messenger->crcflags & MSG_CRC_HEADER) {
+     m->calc_header_crc();
+   }
+
+   ceph_msg_header &hdr = m->get_header();
+   ceph_msg_footer &ftr = m->get_footer();
+
+   // TODO: let sign_message could be reentry?
+   // With the crcs in place, let the session security handler (if any) sign
+   // the message. Some handlers do not really compute a signature but still
+   // expect to be called.
+   if (session_security.get() != NULL) {
+     if (session_security->sign_message(m)) {
+       ldout(cct, 20) << __func__ << " failed to sign m=" << m
+                      << "): sig = " << ftr.sig << dendl;
+     } else {
+       ldout(cct, 20) << __func__ << " signed m=" << m
+                      << "): sig = " << ftr.sig << dendl;
+     }
+   } else {
+     ldout(cct, 20) << __func__ << " no session security" << dendl;
+   }
+
+   connection->outgoing_bl.append(CEPH_MSGR_TAG_MSG);
+   connection->outgoing_bl.append((char *)&hdr, sizeof(hdr));
+
+   ldout(cct, 20) << __func__ << " sending message type=" << hdr.type
+                  << " src " << entity_name_t(hdr.src)
+                  << " front=" << hdr.front_len << " data=" << hdr.data_len
+                  << " off " << hdr.data_off << dendl;
+
+   // small, fragmented payloads are copied so they coalesce; everything
+   // else is handed over without copying
+   const bool coalesce =
+       (bl.length() <= ASYNC_COALESCE_THRESHOLD) && (bl.buffers().size() > 1);
+   if (!coalesce) {
+     connection->outgoing_bl.claim_append(bl);
+   } else {
+     for (const auto &piece : bl.buffers()) {
+       connection->outgoing_bl.append((char *)piece.c_str(), piece.length());
+     }
+   }
+
+   // send footer; if receiver doesn't support signatures, use the old footer
+   // format
+   if (connection->has_feature(CEPH_FEATURE_MSG_AUTH)) {
+     connection->outgoing_bl.append((char *)&ftr, sizeof(ftr));
+   } else {
+     ceph_msg_footer_old legacy_ftr;
+     if (messenger->crcflags & MSG_CRC_HEADER) {
+       legacy_ftr.front_crc = ftr.front_crc;
+       legacy_ftr.middle_crc = ftr.middle_crc;
+     } else {
+       legacy_ftr.front_crc = legacy_ftr.middle_crc = 0;
+     }
+     // data_crc depends only on MSG_CRC_DATA
+     legacy_ftr.data_crc =
+         messenger->crcflags & MSG_CRC_DATA ? ftr.data_crc : 0;
+     legacy_ftr.flags = ftr.flags;
+     connection->outgoing_bl.append((char *)&legacy_ftr, sizeof(legacy_ftr));
+   }
+
+   m->trace.event("async writing message");
+   ldout(cct, 20) << __func__ << " sending " << m->get_seq() << " " << m
+                  << dendl;
+   const ssize_t queued_bytes = connection->outgoing_bl.length();
+   const ssize_t rc = connection->_try_send(more);
+   if (rc >= 0) {
+     // bytes that left outgoing_bl during this attempt
+     connection->logger->inc(
+         l_msgr_send_bytes, queued_bytes - connection->outgoing_bl.length());
+     ldout(cct, 10) << __func__ << " sending " << m
+                    << (rc ? " continuely." : " done.") << dendl;
+   } else {
+     ldout(cct, 1) << __func__ << " error sending " << m << ", "
+                   << cpp_strerror(rc) << dendl;
+   }
+   if (m->get_type() == CEPH_MSG_OSD_OP)
+     OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OP_END", false);
+   else if (m->get_type() == CEPH_MSG_OSD_OPREPLY)
+     OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OPREPLY_END", false);
+   m->put();
+
+   return rc;
+ }
+
+ // Move every message in `sent` back to the front of the highest-priority
+ // out queue, keeping their relative order, and rewind out_seq by the number
+ // of messages moved. Always clears write_in_progress.
+ void ProtocolV1::requeue_sent() {
+   write_in_progress = false;
+   if (sent.empty()) {
+     return;
+   }
+
+   auto &resend_q = out_q[CEPH_MSG_PRIO_HIGHEST];
+   out_seq -= sent.size();
+
+   // walk newest -> oldest and push each to the front, so the oldest message
+   // ends up first in the resend queue
+   for (auto it = sent.rbegin(); it != sent.rend(); ++it) {
+     Message *msg = *it;
+     ldout(cct, 10) << __func__ << " " << *msg << " for resend "
+                    << " (" << msg->get_seq() << ")" << dendl;
+     resend_q.push_front(make_pair(bufferlist(), msg));
+   }
+   sent.clear();
+ }
+
+ // Drop requeued messages the peer has already acknowledged.
+ //
+ // Walks the front of the highest-priority out queue and releases every
+ // message whose sequence number is non-zero and <= `seq`, stopping at the
+ // first message that does not qualify. Takes connection->write_lock.
+ //
+ //   out_seq - caller's current out_seq, used as the starting count
+ //   seq     - highest sequence number acknowledged by the peer
+ //
+ // Returns `seq` when there is no highest-priority queue at all, otherwise
+ // `out_seq` plus the number of messages discarded.
+ uint64_t ProtocolV1::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) {
+   ldout(cct, 10) << __func__ << " " << seq << dendl;
+   std::lock_guard<std::mutex> l(connection->write_lock);
+
+   // single lookup instead of count() + operator[] + erase(key)
+   auto highest = out_q.find(CEPH_MSG_PRIO_HIGHEST);
+   if (highest == out_q.end()) {
+     return seq;
+   }
+
+   auto &rq = highest->second;
+   uint64_t count = out_seq;
+   while (!rq.empty()) {
+     // only the Message pointer is needed; do not copy the queued bufferlist
+     Message *m = rq.front().second;
+     if (m->get_seq() == 0 || m->get_seq() > seq) break;
+     ldout(cct, 10) << __func__ << " " << *m << " for resend seq "
+                    << m->get_seq() << " <= " << seq << ", discarding"
+                    << dendl;
+     m->put();
+     rq.pop_front();
+     count++;
+   }
+   if (rq.empty()) out_q.erase(highest);
+   return count;
+ }
+
+/*
+ * Tears down the message queues, and removes them from the
+ * DispatchQueue Must hold write_lock prior to calling.
+ */
+void ProtocolV1::discard_out_queue() {
+ ldout(cct, 10) << __func__ << " started" << dendl;
+
+ for (list<Message *>::iterator p = sent.begin(); p != sent.end(); ++p) {
+ ldout(cct, 20) << __func__ << " discard " << *p << dendl;
+ (*p)->put();
+ }
+ sent.clear();
+ for (map<int, list<pair<bufferlist, Message *> > >::iterator p =
+ out_q.begin();
+ p != out_q.end(); ++p) {
+ for (list<pair<bufferlist, Message *> >::iterator r = p->second.begin();
+ r != p->second.end(); ++r) {
+ ldout(cct, 20) << __func__ << " discard " << r->second << dendl;
+ r->second->put();
+ }
+ }
+ out_q.clear();
+ write_in_progress = false;
+}
+
+ // Free the client-side authorizer, but only while the protocol is in the
+ // CONNECTING_SEND_CONNECT_MSG state; in any other state nothing is touched.
+ void ProtocolV1::reset_security()
+ {
+   ldout(cct, 5) << __func__ << dendl;
+
+   if (state != CONNECTING_SEND_CONNECT_MSG) {
+     return;
+   }
+
+   // deleting a null pointer is a no-op, so no separate null check is needed
+   delete authorizer;
+   authorizer = nullptr;
+ }
+
+ // Abort whatever receive is in flight: reset the security state (on the
+ // event-center thread), drop the pending read/write callbacks and give back
+ // any throttle budget already taken for the message being read.
+ void ProtocolV1::reset_recv_state() {
+   ldout(cct, 5) << __func__ << dendl;
+
+   // reset_security() has to run on the thread that uses `session_security`.
+   // Holding `write_lock` alone is not enough because `write_event()` drops
+   // it just before calling `write_message()`. `submit_to()` is NOT blocking
+   // here (nowait = true).
+   if (connection->center->in_thread()) {
+     reset_security();
+   } else {
+     connection->center->submit_to(connection->center->get_id(), [this] {
+       ldout(cct, 5) << "reset_recv_state (warped) reseting security handlers"
+                     << dendl;
+       // Possibly unnecessary. See the comment in `deactivate_existing`.
+       std::lock_guard<std::mutex> conn_guard(connection->lock);
+       std::lock_guard<std::mutex> write_guard(connection->write_lock);
+       reset_security();
+     }, /* nowait = */true);
+   }
+
+   // clean read and write callbacks
+   connection->pendingReadLen.reset();
+   connection->writeCallback.reset();
+
+   // Each throttle is released only if the state machine had already passed
+   // the stage that acquired it and has not yet dispatched the message.
+   const bool not_yet_dispatched = state <= READ_FOOTER_AND_DISPATCH;
+
+   if (not_yet_dispatched && state > THROTTLE_MESSAGE &&
+       connection->policy.throttler_messages) {
+     ldout(cct, 10) << __func__ << " releasing " << 1
+                    << " message to policy throttler "
+                    << connection->policy.throttler_messages->get_current()
+                    << "/" << connection->policy.throttler_messages->get_max()
+                    << dendl;
+     connection->policy.throttler_messages->put();
+   }
+
+   if (not_yet_dispatched && state > THROTTLE_BYTES &&
+       connection->policy.throttler_bytes) {
+     ldout(cct, 10) << __func__ << " releasing " << cur_msg_size
+                    << " bytes to policy throttler "
+                    << connection->policy.throttler_bytes->get_current() << "/"
+                    << connection->policy.throttler_bytes->get_max() << dendl;
+     connection->policy.throttler_bytes->put(cur_msg_size);
+   }
+
+   if (not_yet_dispatched && state > THROTTLE_DISPATCH_QUEUE) {
+     ldout(cct, 10)
+         << __func__ << " releasing " << cur_msg_size
+         << " bytes to dispatch_queue throttler "
+         << connection->dispatch_queue->dispatch_throttler.get_current() << "/"
+         << connection->dispatch_queue->dispatch_throttler.get_max() << dendl;
+     connection->dispatch_queue->dispatch_throttle_release(cur_msg_size);
+   }
+ }
+
+ // Pop the first message of the highest-priority non-empty out queue.
+ //
+ //   bl - optional; when non-null it is swapped with the bufferlist that was
+ //        queued alongside the message.
+ //
+ // Returns the message, or a null pointer when nothing is queued. A priority
+ // level that becomes empty is removed from out_q.
+ Message *ProtocolV1::_get_next_outgoing(bufferlist *bl) {
+   if (out_q.empty()) {
+     return 0;
+   }
+
+   // the map is ordered by priority, so the last entry is the highest one
+   auto top = out_q.rbegin();
+   auto &pending = top->second;
+   ceph_assert(!pending.empty());
+
+   auto head = pending.begin();
+   Message *next = head->second;
+   if (bl) bl->swap(head->first);
+   pending.erase(head);
+
+   if (pending.empty()) out_q.erase(top->first);
+   return next;
+ }
+
+/**
+ * Client Protocol V1
+ **/
+
+ // Client side, step 1: enter CONNECTING and write CEPH_BANNER to the peer.
+ // Continues in handle_client_banner_write().
+ CtPtr ProtocolV1::send_client_banner() {
+   ldout(cct, 20) << __func__ << dendl;
+   state = CONNECTING;
+
+   const auto banner_len = strlen(CEPH_BANNER);
+   bufferlist banner_bl;
+   banner_bl.append(CEPH_BANNER, banner_len);
+   return WRITE(banner_bl, handle_client_banner_write);
+ }
+
+ // Completion of the client banner write. `r` is the write result: negative
+ // means failure and faults the connection, otherwise we go on to wait for
+ // the server's banner.
+ CtPtr ProtocolV1::handle_client_banner_write(int r) {
+   ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+   if (r >= 0) {
+     ldout(cct, 10) << __func__ << " connect write banner done: "
+                    << connection->get_peer_addr() << dendl;
+     return wait_server_banner();
+   }
+
+   ldout(cct, 1) << __func__ << " write client banner failed" << dendl;
+   return _fault();
+ }
+
+ // Client side, step 2: enter CONNECTING_WAIT_BANNER_AND_IDENTIFY and read
+ // the server's reply, which is the banner followed by two encoded
+ // ceph_entity_addr structures. Continues in
+ // handle_server_banner_and_identify().
+ CtPtr ProtocolV1::wait_server_banner() {
+   state = CONNECTING_WAIT_BANNER_AND_IDENTIFY;
+
+   ldout(cct, 20) << __func__ << dendl;
+
+   // (an unused local bufferlist was removed here)
+   unsigned banner_len = strlen(CEPH_BANNER);
+   unsigned need_len = banner_len + sizeof(ceph_entity_addr) * 2;
+   return READ(need_len, handle_server_banner_and_identify);
+ }
+
+ // Client side, step 3: validate the server's banner, decode the two
+ // addresses that follow it (the server's own address and the address the
+ // server sees for us), optionally learn our own address, then send our
+ // legacy address to the server.
+ //
+ //   buffer - bytes read by wait_server_banner(): banner + 2 addresses
+ //   r      - read result; negative means the read failed
+ //
+ // Returns the next continuation, the result of _fault() on any error, or
+ // nullptr when the state changed while connection->lock was released.
+ CtPtr ProtocolV1::handle_server_banner_and_identify(char *buffer, int r) {
+   ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+   if (r < 0) {
+     ldout(cct, 1) << __func__ << " read banner and identify addresses failed"
+                   << dendl;
+     return _fault();
+   }
+
+   unsigned banner_len = strlen(CEPH_BANNER);
+   if (memcmp(buffer, CEPH_BANNER, banner_len)) {
+     ldout(cct, 0) << __func__ << " connect protocol error (bad banner) on peer "
+                   << connection->get_peer_addr() << dendl;
+     return _fault();
+   }
+
+   bufferlist bl;
+   entity_addr_t paddr, peer_addr_for_me;
+
+   bl.append(buffer + banner_len, sizeof(ceph_entity_addr) * 2);
+   auto p = bl.cbegin();
+   try {
+     decode(paddr, p);
+     decode(peer_addr_for_me, p);
+   } catch (const buffer::error &e) {
+     lderr(cct) << __func__ << " decode peer addr failed " << dendl;
+     return _fault();
+   }
+   ldout(cct, 20) << __func__ << " connect read peer addr " << paddr
+                  << " on socket " << connection->cs.fd() << dendl;
+
+   // the peer must claim the address we dialed, except that a blank IP with
+   // matching port and nonce is tolerated
+   entity_addr_t peer_addr = connection->peer_addrs->legacy_addr();
+   if (peer_addr != paddr) {
+     if (paddr.is_blank_ip() && peer_addr.get_port() == paddr.get_port() &&
+         peer_addr.get_nonce() == paddr.get_nonce()) {
+       ldout(cct, 0) << __func__ << " connect claims to be " << paddr << " not "
+                     << peer_addr << " - presumably this is the same node!"
+                     << dendl;
+     } else {
+       ldout(cct, 10) << __func__ << " connect claims to be " << paddr << " not "
+                      << peer_addr << dendl;
+       return _fault();
+     }
+   }
+
+   ldout(cct, 20) << __func__ << " connect peer addr for me is "
+                  << peer_addr_for_me << dendl;
+   if (messenger->get_myaddrs().empty() ||
+       messenger->get_myaddrs().front().is_blank_ip()) {
+     // Zero-initialize so that a failed getsockname() leaves a well-defined
+     // (unspecified-family) address instead of uninitialized stack bytes.
+     sockaddr_storage ss = {};
+     socklen_t len = sizeof(ss);
+     if (getsockname(connection->cs.fd(), (sockaddr *)&ss, &len) < 0) {
+       ldout(cct, 1) << __func__ << " getsockname failed on socket "
+                     << connection->cs.fd()
+                     << ", local socket address is unknown" << dendl;
+     }
+     entity_addr_t a;
+     if (cct->_conf->ms_learn_addr_from_peer) {
+       ldout(cct, 1) << __func__ << " peer " << connection->target_addr
+                     << " says I am " << peer_addr_for_me << " (socket says "
+                     << (sockaddr*)&ss << ")" << dendl;
+       a = peer_addr_for_me;
+     } else {
+       ldout(cct, 1) << __func__ << " socket to " << connection->target_addr
+                     << " says I am " << (sockaddr*)&ss
+                     << " (peer says " << peer_addr_for_me << ")" << dendl;
+       a.set_sockaddr((sockaddr *)&ss);
+     }
+     a.set_type(entity_addr_t::TYPE_LEGACY); // anything but NONE; learned_addr ignores this
+     a.set_port(0);
+     // learned_addr() is called without connection->lock held
+     connection->lock.unlock();
+     messenger->learned_addr(a);
+     if (cct->_conf->ms_inject_internal_delays &&
+         cct->_conf->ms_inject_socket_failures) {
+       if (rand() % cct->_conf->ms_inject_socket_failures == 0) {
+         ldout(cct, 10) << __func__ << " sleep for "
+                        << cct->_conf->ms_inject_internal_delays << dendl;
+         utime_t t;
+         t.set_from_double(cct->_conf->ms_inject_internal_delays);
+         t.sleep();
+       }
+     }
+     connection->lock.lock();
+     // the lock was dropped above, so re-check that nothing moved us on
+     if (state != CONNECTING_WAIT_BANNER_AND_IDENTIFY) {
+       ldout(cct, 1) << __func__
+                     << " state changed while learned_addr, mark_down or "
+                     << " replacing must be happened just now" << dendl;
+       return nullptr;
+     }
+   }
+
+   bufferlist myaddrbl;
+   encode(messenger->get_myaddr_legacy(), myaddrbl, 0);  // legacy
+   return WRITE(myaddrbl, handle_my_addr_write);
+ }
+
+ // Completion of writing our own address to the server. A negative `r`
+ // faults the connection; otherwise the connect message is sent next.
+ CtPtr ProtocolV1::handle_my_addr_write(int r) {
+   ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+   const bool write_failed = r < 0;
+   if (write_failed) {
+     ldout(cct, 2) << __func__ << " connect couldn't write my addr, "
+                   << cpp_strerror(r) << dendl;
+     return _fault();
+   }
+
+   ldout(cct, 10) << __func__ << " connect sent my addr "
+                  << messenger->get_myaddr_legacy() << dendl;
+   return CONTINUE(send_connect_message);
+ }
+
+ // Client side: build a ceph_msg_connect (followed by the authorizer blob,
+ // if we have one) and write it to the peer. Enters
+ // CONNECTING_SEND_CONNECT_MSG; continues in handle_connect_message_write().
+ CtPtr ProtocolV1::send_connect_message() {
+   state = CONNECTING_SEND_CONNECT_MSG;
+
+   ldout(cct, 20) << __func__ << dendl;
+
+   // fetch an authorizer only if a previous round has not left one behind
+   if (!authorizer) {
+     authorizer = messenger->ms_deliver_get_authorizer(connection->peer_type);
+   }
+
+   ceph_msg_connect msg;
+   msg.features = connection->policy.features_supported;
+   msg.host_type = messenger->get_myname().type();
+   msg.global_seq = global_seq;
+   msg.connect_seq = connect_seq;
+   msg.protocol_version =
+       messenger->get_proto_version(connection->peer_type, true);
+
+   if (authorizer) {
+     msg.authorizer_protocol = authorizer->protocol;
+     msg.authorizer_len = authorizer->bl.length();
+     ldout(cct, 10) << __func__
+                    << " connect_msg.authorizer_len=" << msg.authorizer_len
+                    << " protocol=" << msg.authorizer_protocol << dendl;
+   } else {
+     msg.authorizer_protocol = 0;
+     msg.authorizer_len = 0;
+   }
+
+   // the lossy flag is only advisory; the server makes the final decision
+   msg.flags = 0;
+   if (connection->policy.lossy) {
+     msg.flags |= CEPH_MSG_CONNECT_LOSSY;
+   }
+
+   bufferlist out;
+   out.append((char *)&msg, sizeof(msg));
+   if (authorizer) {
+     out.append(authorizer->bl.c_str(), authorizer->bl.length());
+   }
+
+   ldout(cct, 10) << __func__ << " connect sending gseq=" << global_seq
+                  << " cseq=" << connect_seq
+                  << " proto=" << msg.protocol_version << dendl;
+
+   return WRITE(out, handle_connect_message_write);
+ }
+
+ // Completion of the connect-message write. A negative `r` faults the
+ // connection; otherwise we wait for the server's connect reply.
+ CtPtr ProtocolV1::handle_connect_message_write(int r) {
+   ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+   if (r >= 0) {
+     ldout(cct, 20) << __func__
+                    << " connect wrote (self +) cseq, waiting for reply" << dendl;
+     return wait_connect_reply();
+   }
+
+   ldout(cct, 2) << __func__ << " connect couldn't send reply "
+                 << cpp_strerror(r) << dendl;
+   return _fault();
+ }
+
+ // Zero the stored connect_reply and read a fresh one from the peer.
+ // Continues in handle_connect_reply_1().
+ CtPtr ProtocolV1::wait_connect_reply() {
+   ldout(cct, 20) << __func__ << dendl;
+
+   const auto reply_len = sizeof(connect_reply);
+   // FIPS zeroization audit 20191115: this memset is not security related.
+   memset(&connect_reply, 0, reply_len);
+   return READ(reply_len, handle_connect_reply_1);
+ }
+
+ // Fixed-size part of the connect reply has arrived in `buffer`. Copies it
+ // into connect_reply, then either reads the authorizer payload that follows
+ // (when authorizer_len is non-zero) or goes straight to evaluating the tag.
+ // A negative `r` faults the connection.
+ CtPtr ProtocolV1::handle_connect_reply_1(char *buffer, int r) {
+   ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+   if (r < 0) {
+     ldout(cct, 1) << __func__ << " read connect reply failed" << dendl;
+     return _fault();
+   }
+
+   const ceph_msg_connect_reply *wire = (ceph_msg_connect_reply *)buffer;
+   connect_reply = *wire;
+
+   ldout(cct, 20) << __func__ << " connect got reply tag "
+                  << (int)connect_reply.tag << " connect_seq "
+                  << connect_reply.connect_seq << " global_seq "
+                  << connect_reply.global_seq << " proto "
+                  << connect_reply.protocol_version << " flags "
+                  << (int)connect_reply.flags << " features "
+                  << connect_reply.features << dendl;
+
+   if (!connect_reply.authorizer_len) {
+     return handle_connect_reply_2();
+   }
+   return wait_connect_reply_auth();
+ }
+
+ // Read the authorizer payload announced by connect_reply.authorizer_len.
+ // Continues in handle_connect_reply_auth().
+ //
+ // The length comes straight from the peer, so an out-of-range value is
+ // treated as a protocol error that faults this connection rather than as
+ // an assertion failure that would abort the whole process.
+ CtPtr ProtocolV1::wait_connect_reply_auth() {
+   ldout(cct, 20) << __func__ << dendl;
+
+   ldout(cct, 10) << __func__
+                  << " reply.authorizer_len=" << connect_reply.authorizer_len
+                  << dendl;
+
+   if (connect_reply.authorizer_len >= 4096) {
+     ldout(cct, 0) << __func__ << " peer sent oversized authorizer reply, len="
+                   << connect_reply.authorizer_len << dendl;
+     return _fault();
+   }
+
+   return READ(connect_reply.authorizer_len, handle_connect_reply_auth);
+ }
+
+ // The authorizer payload of the connect reply has arrived in `buffer`.
+ //
+ // For a CHALLENGE_AUTHORIZER tag the payload is fed to the authorizer as a
+ // challenge and the connect message is re-sent. Otherwise the payload is
+ // verified by the authorizer (when there is one) and processing continues
+ // with the reply tag. A negative `r`, a failed verification, or a challenge
+ // received while we hold no authorizer faults the connection.
+ CtPtr ProtocolV1::handle_connect_reply_auth(char *buffer, int r) {
+   ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+   if (r < 0) {
+     ldout(cct, 1) << __func__ << " read connect reply authorizer failed"
+                   << dendl;
+     return _fault();
+   }
+
+   bufferlist authorizer_reply;
+   authorizer_reply.append(buffer, connect_reply.authorizer_len);
+
+   if (connect_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
+     ldout(cct, 10) << __func__ << " connect got auth challenge" << dendl;
+     // The tag is peer-controlled; never dereference a missing authorizer.
+     if (!authorizer) {
+       ldout(cct, 0) << __func__
+                     << " got auth challenge but have no authorizer" << dendl;
+       return _fault();
+     }
+     authorizer->add_challenge(cct, authorizer_reply);
+     return CONTINUE(send_connect_message);
+   }
+
+   auto iter = authorizer_reply.cbegin();
+   if (authorizer && !authorizer->verify_reply(iter,
+                                               nullptr /* connection_secret */)) {
+     ldout(cct, 0) << __func__ << " failed verifying authorize reply" << dendl;
+     return _fault();
+   }
+
+   return handle_connect_reply_2();
+ }
+
+ // Act on the tag of the connect reply.
+ //
+ // Tags that reject or redirect the attempt are handled first: they either
+ // fault the connection or adjust global_seq / connect_seq and re-send the
+ // connect message. Any other tag is checked against our required features
+ // and then proceeds to the seq exchange (TAG_SEQ) or directly to
+ // client_ready().
+ CtPtr ProtocolV1::handle_connect_reply_2() {
+   ldout(cct, 20) << __func__ << dendl;
+
+   switch (connect_reply.tag) {
+   case CEPH_MSGR_TAG_FEATURES: {
+     ldout(cct, 0) << __func__ << " connect protocol feature mismatch, my "
+                   << std::hex << connection->policy.features_supported
+                   << " < peer " << connect_reply.features << " missing "
+                   << (connect_reply.features &
+                       ~connection->policy.features_supported)
+                   << std::dec << dendl;
+     return _fault();
+   }
+
+   case CEPH_MSGR_TAG_BADPROTOVER: {
+     ldout(cct, 0) << __func__ << " connect protocol version mismatch, my "
+                   << messenger->get_proto_version(connection->peer_type, true)
+                   << " != " << connect_reply.protocol_version << dendl;
+     return _fault();
+   }
+
+   case CEPH_MSGR_TAG_BADAUTHORIZER: {
+     ldout(cct, 0) << __func__ << " connect got BADAUTHORIZER" << dendl;
+     return _fault();
+   }
+
+   case CEPH_MSGR_TAG_RESETSESSION: {
+     ldout(cct, 0) << __func__ << " connect got RESETSESSION" << dendl;
+     session_reset();
+     connect_seq = 0;
+
+     // session_reset() leaves outgoing_bl for the caller to clear
+     connection->outgoing_bl.clear();
+
+     return CONTINUE(send_connect_message);
+   }
+
+   case CEPH_MSGR_TAG_RETRY_GLOBAL: {
+     global_seq = messenger->get_global_seq(connect_reply.global_seq);
+     ldout(cct, 5) << __func__ << " connect got RETRY_GLOBAL "
+                   << connect_reply.global_seq << " chose new " << global_seq
+                   << dendl;
+     return CONTINUE(send_connect_message);
+   }
+
+   case CEPH_MSGR_TAG_RETRY_SESSION: {
+     ceph_assert(connect_reply.connect_seq > connect_seq);
+     ldout(cct, 5) << __func__ << " connect got RETRY_SESSION " << connect_seq
+                   << " -> " << connect_reply.connect_seq << dendl;
+     connect_seq = connect_reply.connect_seq;
+     return CONTINUE(send_connect_message);
+   }
+
+   case CEPH_MSGR_TAG_WAIT: {
+     ldout(cct, 1) << __func__ << " connect got WAIT (connection race)" << dendl;
+     state = WAIT;
+     return _fault();
+   }
+
+   default:
+     break;
+   }
+
+   // The peer did not turn us away: make sure it offers everything our
+   // policy requires.
+   const uint64_t feat_missing =
+       connection->policy.features_required & ~(uint64_t)connect_reply.features;
+   if (feat_missing) {
+     ldout(cct, 1) << __func__ << " missing required features " << std::hex
+                   << feat_missing << std::dec << dendl;
+     return _fault();
+   }
+
+   if (connect_reply.tag == CEPH_MSGR_TAG_SEQ) {
+     ldout(cct, 10)
+         << __func__
+         << " got CEPH_MSGR_TAG_SEQ, reading acked_seq and writing in_seq"
+         << dendl;
+
+     return wait_ack_seq();
+   }
+
+   if (connect_reply.tag == CEPH_MSGR_TAG_READY) {
+     ldout(cct, 10) << __func__ << " got CEPH_MSGR_TAG_READY " << dendl;
+   }
+
+   return client_ready();
+ }
+
+ // Read the 64-bit acknowledged sequence number the peer sends after
+ // TAG_SEQ. Continues in handle_ack_seq().
+ CtPtr ProtocolV1::wait_ack_seq() {
+   ldout(cct, 20) << __func__ << dendl;
+
+   const auto ack_seq_len = sizeof(uint64_t);
+   return READ(ack_seq_len, handle_ack_seq);
+ }
+
+ // The peer's acknowledged sequence number has arrived in `buffer`.
+ // Discards requeued messages the peer already has, updates out_seq from
+ // the result, and writes our in_seq back to the peer. A negative `r`
+ // faults the connection. Continues in handle_in_seq_write().
+ CtPtr ProtocolV1::handle_ack_seq(char *buffer, int r) {
+   ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+   if (r < 0) {
+     ldout(cct, 1) << __func__ << " read connect ack seq failed" << dendl;
+     return _fault();
+   }
+
+   // `buffer` points into a receive buffer with no alignment guarantee, so
+   // copy the bytes out instead of dereferencing it as a uint64_t.
+   uint64_t newly_acked_seq = 0;
+   memcpy(&newly_acked_seq, buffer, sizeof(newly_acked_seq));
+
+   ldout(cct, 2) << __func__ << " got newly_acked_seq " << newly_acked_seq
+                 << " vs out_seq " << out_seq << dendl;
+   out_seq = discard_requeued_up_to(out_seq, newly_acked_seq);
+
+   bufferlist bl;
+   uint64_t s = in_seq;
+   bl.append((char *)&s, sizeof(s));
+
+   return WRITE(bl, handle_in_seq_write);
+ }
+
+ // Completion of writing our in_seq to the peer. A negative `r` faults the
+ // connection; otherwise the client side of the handshake is finished.
+ CtPtr ProtocolV1::handle_in_seq_write(int r) {
+   ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+   if (r >= 0) {
+     ldout(cct, 10) << __func__ << " send in_seq done " << dendl;
+     return client_ready();
+   }
+
+   ldout(cct, 10) << __func__ << " failed to send in_seq " << dendl;
+   return _fault();
+ }
+
+ // Final client-side step of the handshake: record what the server told us
+ // (global seq, lossy flag, features), bump connect_seq, set up or clear the
+ // session security handler depending on whether we hold an authorizer,
+ // notify the dispatch queue and messenger, then enter ready().
+ CtPtr ProtocolV1::client_ready() {
+   ldout(cct, 20) << __func__ << dendl;
+
+   // hooray!
+   peer_global_seq = connect_reply.global_seq;
+   connection->policy.lossy = connect_reply.flags & CEPH_MSG_CONNECT_LOSSY;
+
+   once_ready = true;
+   ++connect_seq;
+   ceph_assert(connect_seq == connect_reply.connect_seq);
+   backoff = utime_t();
+
+   // only features both sides support are used on this connection
+   connection->set_features((uint64_t)connect_reply.features &
+                            (uint64_t)connection->policy.features_supported);
+   ldout(cct, 10) << __func__ << " connect success " << connect_seq
+                  << ", lossy = " << connection->policy.lossy << ", features "
+                  << connection->get_features() << dendl;
+
+   if (authorizer == NULL) {
+     // Without an authorizer no security is applied to messages on this
+     // AsyncConnection.
+     ldout(cct, 10) << __func__ << " no authorizer, clearing session_security"
+                    << dendl;
+     session_security.reset();
+   } else {
+     // With an authorizer, create a new AuthSessionHandler for the ongoing
+     // security of the connection.
+     ldout(cct, 10) << __func__ << " setting up session_security with auth "
+                    << authorizer << dendl;
+     session_security.reset(get_auth_session_handler(
+         cct, authorizer->protocol,
+         authorizer->session_key,
+         connection->get_features()));
+   }
+
+   if (connection->delay_state) {
+     ceph_assert(connection->delay_state->ready());
+   }
+   connection->dispatch_queue->queue_connect(connection);
+   messenger->ms_deliver_handle_fast_connect(connection);
+
+   return ready();
+ }
+
+/**
+ * Server Protocol V1
+ **/
+
+ // Server side, step 1: enter ACCEPTING and write the banner followed by our
+ // legacy address and the address we see for the peer. Also records our
+ // legacy port on the connection. Continues in handle_server_banner_write().
+ CtPtr ProtocolV1::send_server_banner() {
+   ldout(cct, 20) << __func__ << dendl;
+   state = ACCEPTING;
+
+   bufferlist greeting;
+   greeting.append(CEPH_BANNER, strlen(CEPH_BANNER));
+
+   // as a server, we should have a legacy addr if we accepted this connection.
+   auto my_legacy_addr = messenger->get_myaddrs().legacy_addr();
+   encode(my_legacy_addr, greeting, 0);  // legacy
+   connection->port = my_legacy_addr.get_port();
+   encode(connection->target_addr, greeting, 0);  // legacy
+
+   ldout(cct, 1) << __func__ << " sd=" << connection->cs.fd()
+                 << " legacy " << my_legacy_addr
+                 << " socket_addr " << connection->socket_addr
+                 << " target_addr " << connection->target_addr
+                 << dendl;
+
+   return WRITE(greeting, handle_server_banner_write);
+ }
+
+ // Completion of the server banner write. A negative `r` faults the
+ // connection; otherwise we wait for the client's banner.
+ CtPtr ProtocolV1::handle_server_banner_write(int r) {
+   ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+   if (r >= 0) {
+     ldout(cct, 10) << __func__ << " write banner and addr done: "
+                    << connection->get_peer_addr() << dendl;
+     return wait_client_banner();
+   }
+
+   ldout(cct, 1) << " write server banner failed" << dendl;
+   return _fault();
+ }
+
+ // Server side, step 2: read the client's banner plus one encoded
+ // ceph_entity_addr. Continues in handle_client_banner().
+ CtPtr ProtocolV1::wait_client_banner() {
+   ldout(cct, 20) << __func__ << dendl;
+
+   const auto want_len = strlen(CEPH_BANNER) + sizeof(ceph_entity_addr);
+   return READ(want_len, handle_client_banner);
+ }
+
+ // Server side, step 3: validate the client's banner and decode the address
+ // that follows it. When the client reports a blank IP, the IP is taken
+ // from connection->target_addr while the client's port is kept. The result
+ // becomes the connection's peer address and target address.
+ //
+ //   buffer - bytes read by wait_client_banner(): banner + 1 address
+ //   r      - read result; negative means the read failed
+ //
+ // Faults the connection on a read error, bad banner or undecodable address;
+ // otherwise continues in wait_connect_message().
+ CtPtr ProtocolV1::handle_client_banner(char *buffer, int r) {
+   ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+   if (r < 0) {
+     ldout(cct, 1) << __func__ << " read peer banner and addr failed" << dendl;
+     return _fault();
+   }
+
+   const size_t banner_len = strlen(CEPH_BANNER);
+   if (memcmp(buffer, CEPH_BANNER, banner_len)) {
+     // `buffer` is raw network data without a NUL terminator: log only the
+     // bytes that make up the banner instead of streaming it as a C string.
+     ldout(cct, 1) << __func__ << " accept peer sent bad banner '"
+                   << std::string(buffer, banner_len)
+                   << "' (should be '" << CEPH_BANNER << "')" << dendl;
+     return _fault();
+   }
+
+   bufferlist addr_bl;
+   entity_addr_t peer_addr;
+
+   addr_bl.append(buffer + banner_len, sizeof(ceph_entity_addr));
+   try {
+     auto ti = addr_bl.cbegin();
+     decode(peer_addr, ti);
+   } catch (const buffer::error &e) {
+     lderr(cct) << __func__ << " decode peer_addr failed " << dendl;
+     return _fault();
+   }
+
+   ldout(cct, 10) << __func__ << " accept peer addr is " << peer_addr << dendl;
+   if (peer_addr.is_blank_ip()) {
+     // peer apparently doesn't know what ip they have; figure it out for them.
+     int port = peer_addr.get_port();
+     peer_addr.set_sockaddr(connection->target_addr.get_sockaddr());
+     peer_addr.set_port(port);
+
+     ldout(cct, 0) << __func__ << " accept peer addr is really " << peer_addr
+                   << " (socket is " << connection->target_addr << ")" << dendl;
+   }
+   connection->set_peer_addr(peer_addr);  // so that connection_state gets set up
+   connection->target_addr = peer_addr;
+
+   return CONTINUE(wait_connect_message);
+ }
+
+ // Zero the stored connect_msg and read the client's connect message.
+ // Continues in handle_connect_message_1().
+ CtPtr ProtocolV1::wait_connect_message() {
+   ldout(cct, 20) << __func__ << dendl;
+
+   const auto msg_len = sizeof(connect_msg);
+   // FIPS zeroization audit 20191115: this memset is not security related.
+   memset(&connect_msg, 0, msg_len);
+   return READ(msg_len, handle_connect_message_1);
+ }
+
+ // Fixed-size part of the client's connect message has arrived in `buffer`.
+ // Copies it into connect_msg, enters ACCEPTING_WAIT_CONNECT_MSG_AUTH, then
+ // either reads the authorizer payload that follows (when authorizer_len is
+ // non-zero) or goes straight to processing. A negative `r` faults the
+ // connection.
+ CtPtr ProtocolV1::handle_connect_message_1(char *buffer, int r) {
+   ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+   if (r < 0) {
+     ldout(cct, 1) << __func__ << " read connect msg failed" << dendl;
+     return _fault();
+   }
+
+   const ceph_msg_connect *wire = (ceph_msg_connect *)buffer;
+   connect_msg = *wire;
+
+   state = ACCEPTING_WAIT_CONNECT_MSG_AUTH;
+
+   if (!connect_msg.authorizer_len) {
+     return handle_connect_message_2();
+   }
+   return wait_connect_message_auth();
+ }
+
+ // Allocate authorizer_buf to hold connect_msg.authorizer_len bytes and read
+ // the client's authorizer directly into it. Continues in
+ // handle_connect_message_auth().
+ // NOTE(review): the length is supplied by the peer and is not bounded
+ // here - confirm whether an upper limit should be enforced.
+ CtPtr ProtocolV1::wait_connect_message_auth() {
+   ldout(cct, 20) << __func__ << dendl;
+
+   // start from an empty list holding exactly one freshly created buffer
+   authorizer_buf.clear();
+   authorizer_buf.push_back(buffer::create(connect_msg.authorizer_len));
+
+   char *dest = authorizer_buf.c_str();
+   return READB(connect_msg.authorizer_len, dest,
+                handle_connect_message_auth);
+ }
+
+ // Completion of reading the client's authorizer. `buffer` is not used
+ // here; the data was read into authorizer_buf. A negative `r` faults the
+ // connection, otherwise the connect message is processed.
+ CtPtr ProtocolV1::handle_connect_message_auth(char *buffer, int r) {
+   ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+   if (r >= 0) {
+     return handle_connect_message_2();
+   }
+
+   ldout(cct, 1) << __func__ << " read connect authorizer failed" << dendl;
+   return _fault();
+ }
+
+CtPtr ProtocolV1::handle_connect_message_2() {
+ ldout(cct, 20) << __func__ << dendl;
+
+ ldout(cct, 20) << __func__ << " accept got peer connect_seq "
+ << connect_msg.connect_seq << " global_seq "
+ << connect_msg.global_seq << dendl;
+
+ connection->set_peer_type(connect_msg.host_type);
+ connection->policy = messenger->get_policy(connect_msg.host_type);
+
+ ldout(cct, 10) << __func__ << " accept of host_type " << connect_msg.host_type
+ << ", policy.lossy=" << connection->policy.lossy
+ << " policy.server=" << connection->policy.server
+ << " policy.standby=" << connection->policy.standby
+ << " policy.resetcheck=" << connection->policy.resetcheck
+ << " features 0x" << std::hex << (uint64_t)connect_msg.features
+ << std::dec
+ << dendl;
+
+ ceph_msg_connect_reply reply;
+ bufferlist authorizer_reply;
+
+ // FIPS zeroization audit 20191115: this memset is not security related.
+ memset(&reply, 0, sizeof(reply));
+ reply.protocol_version =
+ messenger->get_proto_version(connection->peer_type, false);
+
+ // mismatch?
+ ldout(cct, 10) << __func__ << " accept my proto " << reply.protocol_version
+ << ", their proto " << connect_msg.protocol_version << dendl;
+
+ if (connect_msg.protocol_version != reply.protocol_version) {
+ return send_connect_message_reply(CEPH_MSGR_TAG_BADPROTOVER, reply,
+ authorizer_reply);
+ }
+
+ // require signatures for cephx?
+ if (connect_msg.authorizer_protocol == CEPH_AUTH_CEPHX) {
+ if (connection->peer_type == CEPH_ENTITY_TYPE_OSD ||
+ connection->peer_type == CEPH_ENTITY_TYPE_MDS ||
+ connection->peer_type == CEPH_ENTITY_TYPE_MGR) {
+ if (cct->_conf->cephx_require_signatures ||
+ cct->_conf->cephx_cluster_require_signatures) {
+ ldout(cct, 10)
+ << __func__
+ << " using cephx, requiring MSG_AUTH feature bit for cluster"
+ << dendl;
+ connection->policy.features_required |= CEPH_FEATURE_MSG_AUTH;
+ }
+ if (cct->_conf->cephx_require_version >= 2 ||
+ cct->_conf->cephx_cluster_require_version >= 2) {
+ ldout(cct, 10)
+ << __func__
+ << " using cephx, requiring cephx v2 feature bit for cluster"
+ << dendl;
+ connection->policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2;
+ }
+ } else {
+ if (cct->_conf->cephx_require_signatures ||
+ cct->_conf->cephx_service_require_signatures) {
+ ldout(cct, 10)
+ << __func__
+ << " using cephx, requiring MSG_AUTH feature bit for service"
+ << dendl;
+ connection->policy.features_required |= CEPH_FEATURE_MSG_AUTH;
+ }
+ if (cct->_conf->cephx_require_version >= 2 ||
+ cct->_conf->cephx_service_require_version >= 2) {
+ ldout(cct, 10)
+ << __func__
+ << " using cephx, requiring cephx v2 feature bit for service"
+ << dendl;
+ connection->policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2;
+ }
+ }
+ }
+
+ uint64_t feat_missing =
+ connection->policy.features_required & ~(uint64_t)connect_msg.features;
+ if (feat_missing) {
+ ldout(cct, 1) << __func__ << " peer missing required features " << std::hex
+ << feat_missing << std::dec << dendl;
+ return send_connect_message_reply(CEPH_MSGR_TAG_FEATURES, reply,
+ authorizer_reply);
+ }
+
+ bufferlist auth_bl_copy = authorizer_buf;
+ connection->lock.unlock();
+ ldout(cct,10) << __func__ << " authorizor_protocol "
+ << connect_msg.authorizer_protocol
+ << " len " << auth_bl_copy.length()
+ << dendl;
+ bool authorizer_valid;
+ bool need_challenge = HAVE_FEATURE(connect_msg.features, CEPHX_V2);
+ bool had_challenge = (bool)authorizer_challenge;
+ if (!messenger->ms_deliver_verify_authorizer(
+ connection, connection->peer_type, connect_msg.authorizer_protocol,
+ auth_bl_copy, authorizer_reply, authorizer_valid, session_key,
+ nullptr /* connection_secret */,
+ need_challenge ? &authorizer_challenge : nullptr) ||
+ !authorizer_valid) {
+ connection->lock.lock();
+ if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) {
+ ldout(cct, 1) << __func__
+ << " state changed while accept, it must be mark_down"
+ << dendl;
+ ceph_assert(state == CLOSED);
+ return _fault();
+ }
+
+ if (need_challenge && !had_challenge && authorizer_challenge) {
+ ldout(cct, 10) << __func__ << ": challenging authorizer" << dendl;
+ ceph_assert(authorizer_reply.length());
+ return send_connect_message_reply(CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER,
+ reply, authorizer_reply);
+ } else {
+ ldout(cct, 0) << __func__ << ": got bad authorizer, auth_reply_len="
+ << authorizer_reply.length() << dendl;
+ session_security.reset();
+ return send_connect_message_reply(CEPH_MSGR_TAG_BADAUTHORIZER, reply,
+ authorizer_reply);
+ }
+ }
+
+ // We've verified the authorizer for this AsyncConnection, so set up the
+ // session security structure. PLR
+ ldout(cct, 10) << __func__ << " accept setting up session_security." << dendl;
+
+ // existing?
+ AsyncConnectionRef existing = messenger->lookup_conn(*connection->peer_addrs);
+
+ connection->inject_delay();
+
+ connection->lock.lock();
+ if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) {
+ ldout(cct, 1) << __func__
+ << " state changed while accept, it must be mark_down"
+ << dendl;
+ ceph_assert(state == CLOSED);
+ return _fault();
+ }
+
+ if (existing == connection) {
+ existing = nullptr;
+ }
+ if (existing && existing->protocol->proto_type != 1) {
+ ldout(cct,1) << __func__ << " existing " << existing << " proto "
+ << existing->protocol.get() << " version is "
+ << existing->protocol->proto_type << ", marking down" << dendl;
+ existing->mark_down();
+ existing = nullptr;
+ }
+
+ if (existing) {
+ // There is no possible that existing connection will acquire this
+ // connection's lock
+ existing->lock.lock(); // skip lockdep check (we are locking a second
+ // AsyncConnection here)
+
+ ldout(cct,10) << __func__ << " existing=" << existing << " exproto="
+ << existing->protocol.get() << dendl;
+ ProtocolV1 *exproto = dynamic_cast<ProtocolV1 *>(existing->protocol.get());
+ ceph_assert(exproto);
+ ceph_assert(exproto->proto_type == 1);
+
+ if (exproto->state == CLOSED) {
+ ldout(cct, 1) << __func__ << " existing " << existing
+ << " already closed." << dendl;
+ existing->lock.unlock();
+ existing = nullptr;
+
+ return open(reply, authorizer_reply);
+ }
+
+ if (exproto->replacing) {
+ ldout(cct, 1) << __func__
+ << " existing racing replace happened while replacing."
+ << " existing_state="
+ << connection->get_state_name(existing->state) << dendl;
+ reply.global_seq = exproto->peer_global_seq;
+ existing->lock.unlock();
+ return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_GLOBAL, reply,
+ authorizer_reply);
+ }
+
+ if (connect_msg.global_seq < exproto->peer_global_seq) {
+ ldout(cct, 10) << __func__ << " accept existing " << existing << ".gseq "
+ << exproto->peer_global_seq << " > "
+ << connect_msg.global_seq << ", RETRY_GLOBAL" << dendl;
+ reply.global_seq = exproto->peer_global_seq; // so we can send it below..
+ existing->lock.unlock();
+ return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_GLOBAL, reply,
+ authorizer_reply);
+ } else {
+ ldout(cct, 10) << __func__ << " accept existing " << existing << ".gseq "
+ << exproto->peer_global_seq
+ << " <= " << connect_msg.global_seq << ", looks ok"
+ << dendl;
+ }
+
+ if (existing->policy.lossy) {
+ ldout(cct, 0)
+ << __func__
+ << " accept replacing existing (lossy) channel (new one lossy="
+ << connection->policy.lossy << ")" << dendl;
+ exproto->session_reset();
+ return replace(existing, reply, authorizer_reply);
+ }
+
+ ldout(cct, 1) << __func__ << " accept connect_seq "
+ << connect_msg.connect_seq
+ << " vs existing csq=" << exproto->connect_seq
+ << " existing_state="
+ << connection->get_state_name(existing->state) << dendl;
+
+ if (connect_msg.connect_seq == 0 && exproto->connect_seq > 0) {
+ ldout(cct, 0)
+ << __func__
+ << " accept peer reset, then tried to connect to us, replacing"
+ << dendl;
+ // this is a hard reset from peer
+ is_reset_from_peer = true;
+ if (connection->policy.resetcheck) {
+ exproto->session_reset(); // this resets out_queue, msg_ and
+ // connect_seq #'s
+ }
+ return replace(existing, reply, authorizer_reply);
+ }
+
+ if (connect_msg.connect_seq < exproto->connect_seq) {
+ // old attempt, or we sent READY but they didn't get it.
+ ldout(cct, 10) << __func__ << " accept existing " << existing << ".cseq "
+ << exproto->connect_seq << " > " << connect_msg.connect_seq
+ << ", RETRY_SESSION" << dendl;
+ reply.connect_seq = exproto->connect_seq + 1;
+ existing->lock.unlock();
+ return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_SESSION, reply,
+ authorizer_reply);
+ }
+
+ if (connect_msg.connect_seq == exproto->connect_seq) {
+ // if the existing connection successfully opened, and/or
+ // subsequently went to standby, then the peer should bump
+ // their connect_seq and retry: this is not a connection race
+ // we need to resolve here.
+ if (exproto->state == OPENED || exproto->state == STANDBY) {
+ ldout(cct, 10) << __func__ << " accept connection race, existing "
+ << existing << ".cseq " << exproto->connect_seq
+ << " == " << connect_msg.connect_seq
+ << ", OPEN|STANDBY, RETRY_SESSION " << dendl;
+ // if connect_seq both zero, dont stuck into dead lock. it's ok to
+ // replace
+ if (connection->policy.resetcheck && exproto->connect_seq == 0) {
+ return replace(existing, reply, authorizer_reply);
+ }
+
+ reply.connect_seq = exproto->connect_seq + 1;
+ existing->lock.unlock();
+ return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_SESSION, reply,
+ authorizer_reply);
+ }
+
+ // connection race?
+ if (connection->peer_addrs->legacy_addr() < messenger->get_myaddr_legacy() ||
+ existing->policy.server) {
+ // incoming wins
+ ldout(cct, 10) << __func__ << " accept connection race, existing "
+ << existing << ".cseq " << exproto->connect_seq
+ << " == " << connect_msg.connect_seq
+ << ", or we are server, replacing my attempt" << dendl;
+ return replace(existing, reply, authorizer_reply);
+ } else {
+ // our existing outgoing wins
+ ldout(messenger->cct, 10)
+ << __func__ << " accept connection race, existing " << existing
+ << ".cseq " << exproto->connect_seq
+ << " == " << connect_msg.connect_seq << ", sending WAIT" << dendl;
+ ceph_assert(connection->peer_addrs->legacy_addr() >
+ messenger->get_myaddr_legacy());
+ existing->lock.unlock();
+ // make sure we follow through with opening the existing
+ // connection (if it isn't yet open) since we know the peer
+ // has something to send to us.
+ existing->send_keepalive();
+ return send_connect_message_reply(CEPH_MSGR_TAG_WAIT, reply,
+ authorizer_reply);
+ }
+ }
+
+ ceph_assert(connect_msg.connect_seq > exproto->connect_seq);
+ ceph_assert(connect_msg.global_seq >= exproto->peer_global_seq);
+ if (connection->policy.resetcheck && // RESETSESSION only used by servers;
+ // peers do not reset each other
+ exproto->connect_seq == 0) {
+ ldout(cct, 0) << __func__ << " accept we reset (peer sent cseq "
+ << connect_msg.connect_seq << ", " << existing
+ << ".cseq = " << exproto->connect_seq
+ << "), sending RESETSESSION " << dendl;
+ existing->lock.unlock();
+ return send_connect_message_reply(CEPH_MSGR_TAG_RESETSESSION, reply,
+ authorizer_reply);
+ }
+
+ // reconnect
+ ldout(cct, 10) << __func__ << " accept peer sent cseq "
+ << connect_msg.connect_seq << " > " << exproto->connect_seq
+ << dendl;
+ return replace(existing, reply, authorizer_reply);
+ } // existing
+ else if (!replacing && connect_msg.connect_seq > 0) {
+ // we reset, and they are opening a new session
+ ldout(cct, 0) << __func__ << " accept we reset (peer sent cseq "
+ << connect_msg.connect_seq << "), sending RESETSESSION"
+ << dendl;
+ return send_connect_message_reply(CEPH_MSGR_TAG_RESETSESSION, reply,
+ authorizer_reply);
+ } else {
+ // new session
+ ldout(cct, 10) << __func__ << " accept new session" << dendl;
+ existing = nullptr;
+ return open(reply, authorizer_reply);
+ }
+}
+
+CtPtr ProtocolV1::send_connect_message_reply(char tag,  // Fill `reply` with `tag` and the feature mask, then write it (plus any authorizer payload) to the socket.
+ ceph_msg_connect_reply &reply,  // in/out: tag, features and authorizer_len are overwritten here
+ bufferlist &authorizer_reply) {  // in/out: appended after the reply struct and then cleared (only when non-empty)
+ ldout(cct, 20) << __func__ << dendl;
+ bufferlist reply_bl;
+ reply.tag = tag;
+ reply.features =  // (features from connect_msg & features our policy supports) | features our policy requires
+ ((uint64_t)connect_msg.features & connection->policy.features_supported) |
+ connection->policy.features_required;
+ reply.authorizer_len = authorizer_reply.length();
+ reply_bl.append((char *)&reply, sizeof(reply));  // the reply struct is appended as raw bytes
+ // The debug line below spells out how reply.features was composed.
+ ldout(cct, 10) << __func__ << " reply features 0x" << std::hex
+ << reply.features << " = (policy sup 0x"
+ << connection->policy.features_supported
+ << " & connect 0x" << (uint64_t)connect_msg.features
+ << ") | policy req 0x"
+ << connection->policy.features_required
+ << dendl;
+
+ if (reply.authorizer_len) {  // only a non-empty authorizer reply is sent
+ reply_bl.append(authorizer_reply.c_str(), authorizer_reply.length());
+ authorizer_reply.clear();  // the caller's buffer is emptied once its bytes have been copied
+ }
+
+ return WRITE(reply_bl, handle_connect_message_reply_write);  // the write result is handled in handle_connect_message_reply_write(int r)
+}
+
+CtPtr ProtocolV1::handle_connect_message_reply_write(int r) {  // Write handler for the reply queued by send_connect_message_reply(); r < 0 signals a failed write.
+ ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+ if (r < 0) {
+ ldout(cct, 1) << " write connect message reply failed" << dendl;
+ connection->inject_delay();
+ return _fault();  // _fault() calls fault() and returns nullptr, which ends the continuation chain
+ }
+ // The reply went out: wait for the peer's next connect message.
+ return CONTINUATION(wait_connect_message);
+}
+
+CtPtr ProtocolV1::replace(AsyncConnectionRef existing,  // Resolve a conflict between this newly accepted connection and `existing`.
+ ceph_msg_connect_reply &reply,  // reply under construction; passed on to open() or sent later with RETRY_GLOBAL
+ bufferlist &authorizer_reply) {  // authorizer payload that accompanies `reply`
+ ldout(cct, 10) << __func__ << " accept replacing " << existing << dendl;
+ // existing->lock is unlocked on every path below and is never locked on the calling thread here, so the caller is expected to hold it on entry.
+ connection->inject_delay();
+ if (existing->policy.lossy) {  // lossy: stop `existing`, then open() this connection (after the else branch)
+ // disconnect from the Connection
+ ldout(cct, 1) << __func__ << " replacing on lossy channel, failing existing"
+ << dendl;
+ existing->protocol->stop();
+ existing->dispatch_queue->queue_reset(existing.get());
+ } else {  // not lossy: keep `existing`, hand it our socket, and stop this connection instead
+ ceph_assert(can_write == WriteStatus::NOWRITE);
+ existing->write_lock.lock();  // unlocked again at the end of this branch
+
+ ProtocolV1 *exproto = dynamic_cast<ProtocolV1 *>(existing->protocol.get());  // not null-checked here
+
+ // reset the in_seq if this is a hard reset from peer,
+ // otherwise we respect our original connection's value
+ if (is_reset_from_peer) {
+ exproto->is_reset_from_peer = true;
+ }
+ // Stop watching our socket on our own event center before it is handed over.
+ connection->center->delete_file_event(connection->cs.fd(),
+ EVENT_READABLE | EVENT_WRITABLE);
+
+ if (existing->delay_state) {
+ existing->delay_state->flush();
+ ceph_assert(!connection->delay_state);
+ }
+ exproto->reset_recv_state();
+
+ exproto->connect_msg.features = connect_msg.features;  // existing's later reply is computed from this connect attempt's features
+
+ auto temp_cs = std::move(connection->cs);  // take the socket out of this connection before stop()
+ EventCenter *new_center = connection->center;
+ Worker *new_worker = connection->worker;
+ // avoid _stop shutdown replacing socket
+ // queue a reset on the new connection, which we're dumping for the old
+ stop();
+
+ connection->dispatch_queue->queue_reset(connection);
+ ldout(messenger->cct, 1)
+ << __func__ << " stop myself to swap existing" << dendl;
+ exproto->can_write = WriteStatus::REPLACING;
+ exproto->replacing = true;
+ exproto->write_in_progress = false;
+ existing->state_offset = 0;
+ // avoid previous thread modify event
+ exproto->state = NONE;
+ existing->state = AsyncConnection::STATE_NONE;
+ // Discard existing prefetch buffer in `recv_buf`
+ existing->recv_start = existing->recv_end = 0;
+ // there shouldn't exist any buffer
+ ceph_assert(connection->recv_start == connection->recv_end);
+
+ exproto->authorizer_challenge.reset();
+ // Step 1, submitted to existing's current center: requeue sent messages and adopt the new socket, worker and center.
+ auto deactivate_existing = std::bind(
+ [existing, new_worker, new_center, exproto, reply,
+ authorizer_reply](ConnectedSocket &cs) mutable {
+ // we need to delete time event in original thread
+ {
+ std::lock_guard<std::mutex> l(existing->lock);
+ existing->write_lock.lock();
+ exproto->requeue_sent();
+ existing->outgoing_bl.clear();
+ existing->open_write = false;
+ existing->write_lock.unlock();
+ if (exproto->state == NONE) {  // still in the state set above: perform the swap
+ existing->shutdown_socket();
+ existing->cs = std::move(cs);
+ existing->worker->references--;
+ new_worker->references++;
+ existing->logger = new_worker->get_perf_counter();
+ existing->worker = new_worker;
+ existing->center = new_center;
+ if (existing->delay_state)
+ existing->delay_state->set_center(new_center);
+ } else if (exproto->state == CLOSED) {  // existing was closed in the meantime: just close the new socket on new_center
+ auto back_to_close =
+ std::bind([](ConnectedSocket &cs) mutable { cs.close(); },
+ std::move(cs));
+ new_center->submit_to(new_center->get_id(),
+ std::move(back_to_close), true);
+ return;
+ } else {
+ ceph_abort();  // any other state is treated as a fatal error
+ }
+ }
+
+ // Events may already be queued on existing's previous center when
+ // existing->center is changed. If `existing` were marked down now, the
+ // cleanup would run in another thread and a previously queued event
+ // could then cause a segmentation fault.
+ auto transfer_existing = [existing, exproto, reply,
+ authorizer_reply]() mutable {  // Step 2: restart `existing` as an accepting connection on the (new) center
+ std::lock_guard<std::mutex> l(existing->lock);
+ if (exproto->state == CLOSED) return;
+ ceph_assert(exproto->state == NONE);
+
+ // we have called shutdown_socket above
+ ceph_assert(existing->last_tick_id == 0);
+ // restart timer since we are going to re-build connection
+ existing->last_connect_started = ceph::coarse_mono_clock::now();
+ existing->last_tick_id = existing->center->create_time_event(
+ existing->connect_timeout_us, existing->tick_handler);
+ existing->state = AsyncConnection::STATE_CONNECTION_ESTABLISHED;
+ exproto->state = ACCEPTING;
+
+ existing->center->create_file_event(
+ existing->cs.fd(), EVENT_READABLE, existing->read_handler);
+ reply.global_seq = exproto->peer_global_seq;
+ exproto->run_continuation(exproto->send_connect_message_reply(
+ CEPH_MSGR_TAG_RETRY_GLOBAL, reply, authorizer_reply));  // `existing` answers with RETRY_GLOBAL on the transferred socket
+ };
+ if (existing->center->in_thread())  // run inline when already on the target center's thread, otherwise submit it there
+ transfer_existing();
+ else
+ existing->center->submit_to(existing->center->get_id(),
+ std::move(transfer_existing), true);
+ },
+ std::move(temp_cs));  // the socket taken from this connection is bound as the lambda's `cs` argument
+
+ existing->center->submit_to(existing->center->get_id(),
+ std::move(deactivate_existing), true);
+ existing->write_lock.unlock();
+ existing->lock.unlock();
+ return nullptr;  // no further continuation for this (stopped) connection
+ }
+ existing->lock.unlock();  // lossy path: release existing before opening this connection
+
+ return open(reply, authorizer_reply);
+}
+
+CtPtr ProtocolV1::open(ceph_msg_connect_reply &reply,  // Accept this connection: record sequence numbers, build the SEQ/READY reply, register with the messenger, write the reply.
+ bufferlist &authorizer_reply) {  // authorizer payload appended after the reply struct when non-empty
+ ldout(cct, 20) << __func__ << dendl;
+
+ connect_seq = connect_msg.connect_seq + 1;
+ peer_global_seq = connect_msg.global_seq;
+ ldout(cct, 10) << __func__ << " accept success, connect_seq = " << connect_seq
+ << " in_seq=" << in_seq << ", sending READY" << dendl;
+
+ // if it is a hard reset from peer, we don't need a round-trip to negotiate
+ // in/out sequence
+ if ((connect_msg.features & CEPH_FEATURE_RECONNECT_SEQ) &&
+ !is_reset_from_peer) {
+ reply.tag = CEPH_MSGR_TAG_SEQ;  // in_seq is appended below and the peer's seq is read back in wait_seq()
+ wait_for_seq = true;
+ } else {
+ reply.tag = CEPH_MSGR_TAG_READY;
+ wait_for_seq = false;
+ out_seq = discard_requeued_up_to(out_seq, 0);
+ is_reset_from_peer = false;
+ in_seq = 0;
+ }
+
+ // send READY reply
+ reply.features = connection->policy.features_supported;
+ reply.global_seq = messenger->get_global_seq();
+ reply.connect_seq = connect_seq;
+ reply.flags = 0;
+ reply.authorizer_len = authorizer_reply.length();
+ if (connection->policy.lossy) {
+ reply.flags = reply.flags | CEPH_MSG_CONNECT_LOSSY;
+ }
+ // The connection's feature set is the intersection of our reply and the peer's connect message.
+ connection->set_features((uint64_t)reply.features &
+ (uint64_t)connect_msg.features);
+ ldout(cct, 10) << __func__ << " accept features "
+ << connection->get_features()
+ << " authorizer_protocol "
+ << connect_msg.authorizer_protocol << dendl;
+
+ session_security.reset(
+ get_auth_session_handler(cct, connect_msg.authorizer_protocol,
+ session_key,
+ connection->get_features()));
+
+ bufferlist reply_bl;
+ reply_bl.append((char *)&reply, sizeof(reply));  // the reply struct is appended as raw bytes
+
+ if (reply.authorizer_len) {
+ reply_bl.append(authorizer_reply.c_str(), authorizer_reply.length());
+ }
+
+ if (reply.tag == CEPH_MSGR_TAG_SEQ) {
+ uint64_t s = in_seq;
+ reply_bl.append((char *)&s, sizeof(s));  // in_seq follows as 8 raw bytes in host byte order
+ }
+
+ connection->lock.unlock();  // connection->lock is dropped around accept_conn() and retaken below
+ // Because "replacing" will prevent other connections preempt this addr,
+ // it's safe that here we don't acquire Connection's lock
+ ssize_t r = messenger->accept_conn(connection);
+
+ connection->inject_delay();
+
+ connection->lock.lock();
+ replacing = false;
+ if (r < 0) {  // accept_conn() reported failure
+ ldout(cct, 1) << __func__ << " existing race replacing process for addr = "
+ << connection->peer_addrs->legacy_addr()
+ << " just fail later one(this)" << dendl;
+ ldout(cct, 10) << "accept fault after register" << dendl;
+ connection->inject_delay();
+ return _fault();
+ }
+ if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) {  // state changed while the lock was released
+ ldout(cct, 1) << __func__
+ << " state changed while accept_conn, it must be mark_down"
+ << dendl;
+ ceph_assert(state == CLOSED || state == NONE);
+ ldout(cct, 10) << "accept fault after register" << dendl;
+ messenger->unregister_conn(connection);  // undo the registration done by accept_conn()
+ connection->inject_delay();
+ return _fault();
+ }
+
+ return WRITE(reply_bl, handle_ready_connect_message_reply_write);  // the write result is handled in handle_ready_connect_message_reply_write(int r)
+}
+
+CtPtr ProtocolV1::handle_ready_connect_message_reply_write(int r) {  // Write handler for the SEQ/READY reply queued by open(); r < 0 signals a failed write.
+ ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+ if (r < 0) {
+ ldout(cct, 1) << __func__ << " write ready connect message reply failed"
+ << dendl;
+ return _fault();
+ }
+
+ // notify
+ connection->dispatch_queue->queue_accept(connection);
+ messenger->ms_deliver_handle_fast_accept(connection);
+ once_ready = true;
+
+ state = ACCEPTING_HANDLED_CONNECT_MSG;
+
+ if (wait_for_seq) {  // set by open() when it replied with CEPH_MSGR_TAG_SEQ
+ return wait_seq();
+ }
+
+ return server_ready();
+}
+
+CtPtr ProtocolV1::wait_seq() {  // Request a read of sizeof(uint64_t) bytes; the result is handled in handle_seq().
+ ldout(cct, 20) << __func__ << dendl;
+
+ return READ(sizeof(uint64_t), handle_seq);
+}
+
+CtPtr ProtocolV1::handle_seq(char *buffer, int r) {  // Read handler for wait_seq(): `buffer` holds the sequence number read from the socket, r < 0 signals a failed read.
+ ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+ if (r < 0) {
+ ldout(cct, 1) << __func__ << " read ack seq failed" << dendl;
+ return _fault();
+ }
+ // NOTE(review): the value is read in host byte order through a cast pointer; assumes `buffer` is suitably aligned -- confirm.
+ uint64_t newly_acked_seq = *(uint64_t *)buffer;
+ ldout(cct, 2) << __func__ << " accept get newly_acked_seq " << newly_acked_seq
+ << dendl;
+ out_seq = discard_requeued_up_to(out_seq, newly_acked_seq);
+
+ return server_ready();
+}
+
+CtPtr ProtocolV1::server_ready() {  // Last accept-side step: log, zero connect_msg, then continue with ready().
+ ldout(cct, 20) << __func__ << " session_security is "
+ << session_security
+ << dendl;
+
+ ldout(cct, 20) << __func__ << " accept done" << dendl;
+ // FIPS zeroization audit 20191115: this memset is not security related.
+ memset(&connect_msg, 0, sizeof(connect_msg));
+
+ if (connection->delay_state) {
+ ceph_assert(connection->delay_state->ready());
+ }
+
+ return ready();
+}
diff --git a/src/msg/async/ProtocolV1.h b/src/msg/async/ProtocolV1.h
new file mode 100644
index 00000000..070ce73f
--- /dev/null
+++ b/src/msg/async/ProtocolV1.h
@@ -0,0 +1,305 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef _MSG_ASYNC_PROTOCOL_V1_
+#define _MSG_ASYNC_PROTOCOL_V1_
+
+#include "Protocol.h"
+
+class ProtocolV1;
+using CtPtr = Ct<ProtocolV1>*;
+
+class ProtocolV1 : public Protocol {  // Protocol implementation with separate connect-side ("Client") and accept-side ("Server") handler sets.
+/*
+ * ProtocolV1 State Machine
+ *
+
+ send_server_banner send_client_banner
+ | |
+ v v
+ wait_client_banner wait_server_banner
+ | |
+ | v
+ v handle_server_banner_and_identify
+ wait_connect_message <---------\ |
+ | | | v
+ | wait_connect_message_auth | send_connect_message <----------\
+ | | | | |
+ v v | | |
+handle_connect_message_2 | v |
+ | | | wait_connect_reply |
+ v v | | | |
+ replace -> send_connect_message_reply | V |
+ | | wait_connect_reply_auth |
+ | | | |
+ v v v |
+ open ---\ handle_connect_reply_2 --------/
+ | | |
+ | v v
+ | wait_seq wait_ack_seq
+ | | |
+ v v v
+ server_ready client_ready
+ | |
+ \------------------> wait_message <------------/
+ | ^ | ^
+ /------------------------/ | | |
+ | | | \----------------- ------------\
+ v /----------/ v |
+handle_keepalive2 | handle_message_header read_message_footer
+handle_keepalive2_ack | | ^
+handle_tag_ack | v |
+ | | throttle_message read_message_data
+ \----------------/ | ^
+ v |
+ read_message_front --> read_message_middle --/
+*/
+
+protected:
+ // Protocol states; get_state_name() below indexes its name table by these values, so both lists must stay in the same order.
+ enum State {
+ NONE = 0,
+ START_CONNECT,
+ CONNECTING,
+ CONNECTING_WAIT_BANNER_AND_IDENTIFY,
+ CONNECTING_SEND_CONNECT_MSG,
+ START_ACCEPT,
+ ACCEPTING,
+ ACCEPTING_WAIT_CONNECT_MSG_AUTH,
+ ACCEPTING_HANDLED_CONNECT_MSG,
+ OPENED,
+ THROTTLE_MESSAGE,
+ THROTTLE_BYTES,
+ THROTTLE_DISPATCH_QUEUE,
+ READ_MESSAGE_FRONT,
+ READ_FOOTER_AND_DISPATCH,
+ CLOSED,
+ WAIT,
+ STANDBY
+ };
+ // Returns the printable name for a State value. `state` is used as an array index without a bounds check, so it must be a valid State.
+ static const char *get_state_name(int state) {
+ const char *const statenames[] = {"NONE",
+ "START_CONNECT",
+ "CONNECTING",
+ "CONNECTING_WAIT_BANNER_AND_IDENTIFY",
+ "CONNECTING_SEND_CONNECT_MSG",
+ "START_ACCEPT",
+ "ACCEPTING",
+ "ACCEPTING_WAIT_CONNECT_MSG_AUTH",
+ "ACCEPTING_HANDLED_CONNECT_MSG",
+ "OPENED",
+ "THROTTLE_MESSAGE",
+ "THROTTLE_BYTES",
+ "THROTTLE_DISPATCH_QUEUE",
+ "READ_MESSAGE_FRONT",
+ "READ_FOOTER_AND_DISPATCH",
+ "CLOSED",
+ "WAIT",
+ "STANDBY"};
+ return statenames[state];
+ }
+
+ char *temp_buffer;
+ // Write permission for this connection; replace() asserts NOWRITE on the new connection and sets REPLACING on the one taking over its socket.
+ enum class WriteStatus { NOWRITE, REPLACING, CANWRITE, CLOSED };
+ std::atomic<WriteStatus> can_write;
+ std::list<Message *> sent; // the first bufferlist need to inject seq
+ // priority queue for outbound msgs
+ std::map<int, std::list<std::pair<bufferlist, Message *>>> out_q;
+ bool keepalive;
+ bool write_in_progress = false;
+
+ __u32 connect_seq, peer_global_seq;  // compared against the peer's connect_msg values during accept
+ std::atomic<uint64_t> in_seq{0};
+ std::atomic<uint64_t> out_seq{0};
+ std::atomic<uint64_t> ack_left{0};
+
+ CryptoKey session_key;
+ std::shared_ptr<AuthSessionHandler> session_security;
+ std::unique_ptr<AuthAuthorizerChallenge> authorizer_challenge; // accept side
+
+ // Open state
+ ceph_msg_connect connect_msg;  // zeroed by server_ready() once the accept handshake is done
+ ceph_msg_connect_reply connect_reply;
+ bufferlist authorizer_buf;
+
+ utime_t backoff; // backoff time
+ utime_t recv_stamp;
+ utime_t throttle_stamp;
+ unsigned msg_left;
+ uint64_t cur_msg_size;
+ ceph_msg_header current_header;
+ bufferlist data_buf;
+ bufferlist::iterator data_blp;
+ bufferlist front, middle, data;
+
+ bool replacing; // set while a replace is in progress: the connecting side is
+ // answered with a RETRY tag and the accept side clears the
+ // replaced connection. When the connecting side reissues its
+ // connect_msg there is no conflicting connection left, so
+ // "replacing" is used to skip RESETSESSION and avoid
+ // misdetecting a session reset
+ bool is_reset_from_peer;  // set when the peer connected with connect_seq 0 while we still had a session (hard reset)
+ bool once_ready;  // set to true once the accept-side READY/SEQ reply has been written
+
+ State state;
+
+ void run_continuation(CtPtr pcontinuation);
+ CtPtr read(CONTINUATION_RX_TYPE<ProtocolV1> &next, int len,
+ char *buffer = nullptr);
+ CtPtr write(CONTINUATION_TX_TYPE<ProtocolV1> &next,bufferlist &bl);
+ inline CtPtr _fault() { // helper fault method that stops continuation
+ fault();
+ return nullptr;
+ }
+ // Continuation objects for the message-exchange handlers declared below.
+ CONTINUATION_DECL(ProtocolV1, wait_message);
+ READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message);
+ READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_keepalive2);
+ READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_keepalive2_ack);
+ READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_tag_ack);
+ READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_header);
+ CONTINUATION_DECL(ProtocolV1, throttle_message);
+ CONTINUATION_DECL(ProtocolV1, throttle_bytes);
+ CONTINUATION_DECL(ProtocolV1, throttle_dispatch_queue);
+ READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_front);
+ READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_middle);
+ CONTINUATION_DECL(ProtocolV1, read_message_data);
+ READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_data);
+ READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_footer);
+ // Message-exchange handlers; read handlers take the read buffer and a result code (r < 0 means the read failed).
+ CtPtr ready();
+ CtPtr wait_message();
+ CtPtr handle_message(char *buffer, int r);
+
+ CtPtr handle_keepalive2(char *buffer, int r);
+ void append_keepalive_or_ack(bool ack = false, utime_t *t = nullptr);
+ CtPtr handle_keepalive2_ack(char *buffer, int r);
+ CtPtr handle_tag_ack(char *buffer, int r);
+
+ CtPtr handle_message_header(char *buffer, int r);
+ CtPtr throttle_message();
+ CtPtr throttle_bytes();
+ CtPtr throttle_dispatch_queue();
+ CtPtr read_message_front();
+ CtPtr handle_message_front(char *buffer, int r);
+ CtPtr read_message_middle();
+ CtPtr handle_message_middle(char *buffer, int r);
+ CtPtr read_message_data_prepare();
+ CtPtr read_message_data();
+ CtPtr handle_message_data(char *buffer, int r);
+ CtPtr read_message_footer();
+ CtPtr handle_message_footer(char *buffer, int r);
+
+ void session_reset();
+ void randomize_out_seq();
+
+ Message *_get_next_outgoing(bufferlist *bl);
+
+ void prepare_send_message(uint64_t features, Message *m, bufferlist &bl);
+ ssize_t write_message(Message *m, bufferlist &bl, bool more);
+
+ void requeue_sent();
+ uint64_t discard_requeued_up_to(uint64_t out_seq, uint64_t seq);  // callers assign the return value back to out_seq
+ void discard_out_queue();
+
+ void reset_recv_state();
+ void reset_security();
+
+ ostream &_conn_prefix(std::ostream *_dout);
+
+public:
+ ProtocolV1(AsyncConnection *connection);
+ virtual ~ProtocolV1();
+ // Overrides of the Protocol base-class interface.
+ virtual void connect() override;
+ virtual void accept() override;
+ virtual bool is_connected() override;
+ virtual void stop() override;
+ virtual void fault() override;
+ virtual void send_message(Message *m) override;
+ virtual void send_keepalive() override;
+
+ virtual void read_event() override;
+ virtual void write_event() override;
+ virtual bool is_queued() override;
+
+ // Client Protocol
+private:
+ int global_seq;
+ AuthAuthorizer *authorizer;
+ // Continuation objects for the connect-side handlers declared below.
+ CONTINUATION_DECL(ProtocolV1, send_client_banner);
+ WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_client_banner_write);
+ READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_server_banner_and_identify);
+ WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_my_addr_write);
+ CONTINUATION_DECL(ProtocolV1, send_connect_message);
+ WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_message_write);
+ READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_reply_1);
+ READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_reply_auth);
+ READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_ack_seq);
+ WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_in_seq_write);
+ // Connect-side handshake steps (right-hand column of the state machine diagram above).
+ CtPtr send_client_banner();
+ CtPtr handle_client_banner_write(int r);
+ CtPtr wait_server_banner();
+ CtPtr handle_server_banner_and_identify(char *buffer, int r);
+ CtPtr handle_my_addr_write(int r);
+ CtPtr send_connect_message();
+ CtPtr handle_connect_message_write(int r);
+ CtPtr wait_connect_reply();
+ CtPtr handle_connect_reply_1(char *buffer, int r);
+ CtPtr wait_connect_reply_auth();
+ CtPtr handle_connect_reply_auth(char *buffer, int r);
+ CtPtr handle_connect_reply_2();
+ CtPtr wait_ack_seq();
+ CtPtr handle_ack_seq(char *buffer, int r);
+ CtPtr handle_in_seq_write(int r);
+ CtPtr client_ready();
+
+ // Server Protocol
+protected:
+ bool wait_for_seq;  // set by open(): true when the reply tag is CEPH_MSGR_TAG_SEQ and the peer's seq must be read next
+ // Continuation objects for the accept-side handlers declared below.
+ CONTINUATION_DECL(ProtocolV1, send_server_banner);
+ WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_server_banner_write);
+ READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_client_banner);
+ CONTINUATION_DECL(ProtocolV1, wait_connect_message);
+ READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_message_1);
+ READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_message_auth);
+ WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1,
+ handle_connect_message_reply_write);
+ WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1,
+ handle_ready_connect_message_reply_write);
+ READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_seq);
+ // Accept-side handshake steps (left-hand column of the state machine diagram above).
+ CtPtr send_server_banner();
+ CtPtr handle_server_banner_write(int r);
+ CtPtr wait_client_banner();
+ CtPtr handle_client_banner(char *buffer, int r);
+ CtPtr wait_connect_message();
+ CtPtr handle_connect_message_1(char *buffer, int r);
+ CtPtr wait_connect_message_auth();
+ CtPtr handle_connect_message_auth(char *buffer, int r);
+ CtPtr handle_connect_message_2();
+ CtPtr send_connect_message_reply(char tag, ceph_msg_connect_reply &reply,
+ bufferlist &authorizer_reply);
+ CtPtr handle_connect_message_reply_write(int r);
+ CtPtr replace(AsyncConnectionRef existing, ceph_msg_connect_reply &reply,
+ bufferlist &authorizer_reply);
+ CtPtr open(ceph_msg_connect_reply &reply, bufferlist &authorizer_reply);
+ CtPtr handle_ready_connect_message_reply_write(int r);
+ CtPtr wait_seq();
+ CtPtr handle_seq(char *buffer, int r);
+ CtPtr server_ready();
+};
+
+class LoopbackProtocolV1 : public ProtocolV1 {  // ProtocolV1 variant whose can_write is CANWRITE from construction
+public:
+ LoopbackProtocolV1(AsyncConnection *connection) : ProtocolV1(connection) {
+ this->can_write = WriteStatus::CANWRITE;  // set directly in the constructor, without any handshake
+ }
+};
+
+#endif /* _MSG_ASYNC_PROTOCOL_V1_ */
diff --git a/src/msg/async/ProtocolV2.cc b/src/msg/async/ProtocolV2.cc
new file mode 100644
index 00000000..381d42c3
--- /dev/null
+++ b/src/msg/async/ProtocolV2.cc
@@ -0,0 +1,2870 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <type_traits>
+
+#include "ProtocolV2.h"
+#include "AsyncMessenger.h"
+
+#include "common/EventTrace.h"
+#include "common/ceph_crypto.h"
+#include "common/errno.h"
+#include "include/random.h"
+#include "auth/AuthClient.h"
+#include "auth/AuthServer.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix _conn_prefix(_dout)
+ostream &ProtocolV2::_conn_prefix(std::ostream *_dout) {  // Writes this connection's log prefix to *_dout and returns the stream (used via the dout_prefix define above).
+ return *_dout << "--2- " << messenger->get_myaddrs() << " >> "
+ << *connection->peer_addrs << " conn(" << connection << " "
+ << this
+ << " " << ceph_con_mode_name(auth_meta->con_mode)
+ << " :" << connection->port
+ << " s=" << get_state_name(state) << " pgs=" << peer_global_seq  // s = protocol state, pgs = peer_global_seq
+ << " cs=" << connect_seq << " l=" << connection->policy.lossy  // cs = connect_seq, l = lossy policy flag
+ << " rev1=" << HAVE_MSGR2_FEATURE(peer_supported_features,
+ REVISION_1)
+ << " rx=" << session_stream_handlers.rx.get()  // rx/tx print the stream handler pointers
+ << " tx=" << session_stream_handlers.tx.get()
+ << ").";
+}
+
+using namespace ceph::msgr::v2;
+
+using CtPtr = Ct<ProtocolV2> *;
+using CtRef = Ct<ProtocolV2> &;
+
+void ProtocolV2::run_continuation(CtPtr pcontinuation) {  // Pointer overload: does nothing for nullptr, otherwise forwards to the reference overload.
+ if (pcontinuation) {
+ run_continuation(*pcontinuation);
+ }
+}
+
+void ProtocolV2::run_continuation(CtRef continuation) {  // Runs `continuation` via CONTINUATION_RUN; each exception caught below is logged and the connection is faulted.
+ try {
+ CONTINUATION_RUN(continuation)
+ } catch (const buffer::error &e) {
+ lderr(cct) << __func__ << " failed decoding of frame header: " << e << dendl;
+ _fault();
+ } catch (const ceph::crypto::onwire::MsgAuthError &e) {
+ lderr(cct) << __func__ << " " << e.what() << dendl;
+ _fault();
+ } catch (const DecryptionError &) {
+ lderr(cct) << __func__ << " failed to decrypt frame payload" << dendl;
+ _fault();  // fault like the two handlers above; logging alone left the connection with no continuation running
+ }
+}
+
+#define WRITE(B, D, C) write(D, CONTINUATION(C), B)  // calls write() with D, the continuation named C, and B, in that order
+
+#define READ(L, C) read(CONTINUATION(C), buffer::ptr_node::create(buffer::create(L)))  // read into a newly created buffer of length L, then continue at C
+
+#define READ_RXBUF(B, C) read(CONTINUATION(C), B)  // read into the caller-supplied buffer B, then continue at C
+
+#ifdef UNIT_TESTS_BUILT
+ // INTERCEPT(S): if the connection has an interceptor, ask it about step S; FAIL faults the connection, STOP stops it and queues a reset.
+#define INTERCEPT(S) { \
+if(connection->interceptor) { \
+ auto a = connection->interceptor->intercept(connection, (S)); \
+ if (a == Interceptor::ACTION::FAIL) { \
+ return _fault(); \
+ } else if (a == Interceptor::ACTION::STOP) { \
+ stop(); \
+ connection->dispatch_queue->queue_reset(connection); \
+ return nullptr; \
+ }}}
+
+#else
+#define INTERCEPT(S)  // expands to nothing when UNIT_TESTS_BUILT is not defined
+#endif
+
+ProtocolV2::ProtocolV2(AsyncConnection *connection)  // Starts in state NONE with all cookies and sequence counters zero and writing disabled.
+ : Protocol(2, connection),  // 2 is passed to the Protocol base (presumably the proto_type -- confirm in Protocol.h)
+ state(NONE),
+ peer_supported_features(0),
+ client_cookie(0),
+ server_cookie(0),
+ global_seq(0),
+ connect_seq(0),
+ peer_global_seq(0),
+ message_seq(0),
+ reconnecting(false),
+ replacing(false),
+ can_write(false),
+ bannerExchangeCallback(nullptr),
+ tx_frame_asm(&session_stream_handlers, false),  // both frame assemblers point at this object's session_stream_handlers
+ rx_frame_asm(&session_stream_handlers, false),
+ next_tag(static_cast<Tag>(0)),
+ keepalive(false) {
+}
+
+ProtocolV2::~ProtocolV2() {  // empty body: nothing is released explicitly here
+}
+
+void ProtocolV2::connect() {  // Put the protocol into START_CONNECT; no I/O happens in this function.
+ ldout(cct, 1) << __func__ << dendl;
+ state = START_CONNECT;
+ pre_auth.enabled = true;  // only the connect side sets this flag here; accept() does not
+}
+
+void ProtocolV2::accept() {  // Put the protocol into START_ACCEPT; no I/O happens in this function.
+ ldout(cct, 1) << __func__ << dendl;
+ state = START_ACCEPT;
+}
+
+bool ProtocolV2::is_connected() { return can_write; }  // "connected" is reported as the current value of can_write
+
+/*
+ * Drops every message in `sent` and `out_queue`, releasing one reference
+ * on each. The caller must hold write_lock before calling.
+ */
+void ProtocolV2::discard_out_queue() {
+ ldout(cct, 10) << __func__ << " started" << dendl;
+ // First the messages recorded in `sent`.
+ for (list<Message *>::iterator p = sent.begin(); p != sent.end(); ++p) {
+ ldout(cct, 20) << __func__ << " discard " << *p << dendl;
+ (*p)->put();
+ }
+ sent.clear();
+ for (auto& [ prio, entries ] : out_queue) {  // then every queued entry, across all priorities
+ static_cast<void>(prio);  // prio is intentionally unused; the cast marks it as used
+ for (auto& entry : entries) {
+ ldout(cct, 20) << __func__ << " discard " << *entry.m << dendl;
+ entry.m->put();
+ }
+ }
+ out_queue.clear();
+ write_in_progress = false;
+}
+
+void ProtocolV2::reset_session() {  // Drop all queued and delayed messages, queue a remote-reset notification, and zero the session counters and cookies.
+ ldout(cct, 1) << __func__ << dendl;
+
+ std::lock_guard<std::mutex> l(connection->write_lock);  // held for the rest of the function
+ if (connection->delay_state) {
+ connection->delay_state->discard();
+ }
+
+ connection->dispatch_queue->discard_queue(connection->conn_id);
+ discard_out_queue();
+ connection->outgoing_bl.clear();
+
+ connection->dispatch_queue->queue_remote_reset(connection);
+ // Reset all per-session state.
+ out_seq = 0;
+ in_seq = 0;
+ client_cookie = 0;
+ server_cookie = 0;
+ connect_seq = 0;
+ peer_global_seq = 0;
+ message_seq = 0;
+ ack_left = 0;
+ can_write = false;
+}
+
+void ProtocolV2::stop() {  // Move the protocol to CLOSED: reset receive state, drop outgoing messages, stop the connection. A no-op when already CLOSED.
+ ldout(cct, 1) << __func__ << dendl;
+ if (state == CLOSED) {
+ return;
+ }
+
+ if (connection->delay_state) connection->delay_state->flush();  // done before write_lock is taken
+
+ std::lock_guard<std::mutex> l(connection->write_lock);
+
+ reset_recv_state();
+ discard_out_queue();
+
+ connection->_stop();
+
+ can_write = false;
+ state = CLOSED;
+}
+
+void ProtocolV2::fault() { _fault(); }  // void wrapper: forwards to _fault() and discards its return value
+
+void ProtocolV2::requeue_sent() {  // Move every message in `sent` back to the front of the CEPH_MSG_PRIO_HIGHEST queue and rewind out_seq accordingly.
+ write_in_progress = false;
+ if (sent.empty()) {
+ return;
+ }
+
+ auto& rq = out_queue[CEPH_MSG_PRIO_HIGHEST];
+ out_seq -= sent.size();  // out_seq is rewound by the number of messages being requeued
+ while (!sent.empty()) {  // popping from the back and inserting at the front keeps the original order
+ Message *m = sent.back();
+ sent.pop_back();
+ ldout(cct, 5) << __func__ << " requeueing message m=" << m
+ << " seq=" << m->get_seq() << " type=" << m->get_type() << " "
+ << *m << dendl;
+ rq.emplace_front(out_queue_entry_t{false, m});  // NOTE(review): meaning of the `false` field is not visible here -- see out_queue_entry_t
+ }
+}
+
+uint64_t ProtocolV2::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) {  // Drop leading CEPH_MSG_PRIO_HIGHEST messages with seq in 1..`seq`; returns `out_seq` plus the number dropped.
+ ldout(cct, 10) << __func__ << " " << seq << dendl;
+ std::lock_guard<std::mutex> l(connection->write_lock);
+ if (out_queue.count(CEPH_MSG_PRIO_HIGHEST) == 0) {
+ return seq;  // NOTE(review): with no highest-priority queue the result is `seq`, not `out_seq` -- confirm this is intended
+ }
+ auto& rq = out_queue[CEPH_MSG_PRIO_HIGHEST];
+ uint64_t count = out_seq;  // `out_seq` here is the parameter, used as the starting value of the result
+ while (!rq.empty()) {
+ Message* const m = rq.front().m;
+ if (m->get_seq() == 0 || m->get_seq() > seq) break;  // stop at the first message with seq 0 or a seq beyond `seq`
+ ldout(cct, 5) << __func__ << " discarding message m=" << m
+ << " seq=" << m->get_seq() << " ack_seq=" << seq << " "
+ << *m << dendl;
+ m->put();
+ rq.pop_front();
+ count++;
+ }
+ if (rq.empty()) out_queue.erase(CEPH_MSG_PRIO_HIGHEST);  // remove the map entry once its queue is empty
+ return count;
+}
+
+void ProtocolV2::reset_security() {  // Replace auth_meta with a new AuthConnectionMeta, drop both stream handlers, clear the pre-auth buffers.
+ ldout(cct, 5) << __func__ << dendl;
+
+ auth_meta.reset(new AuthConnectionMeta);
+ session_stream_handlers.rx.reset(nullptr);
+ session_stream_handlers.tx.reset(nullptr);
+ pre_auth.rxbuf.clear();
+ pre_auth.txbuf.clear();
+}
+
+// Resets the security state, pending read/write callbacks, next_tag and throttles. `write_lock` is expected to be held by the caller.
+void ProtocolV2::reset_recv_state() {
+ ldout(cct, 5) << __func__ << dendl;
+
+ if (!connection->center->in_thread()) {
+ // Execute in the same thread that uses the rx/tx handlers. The reset
+ // has to be moved to that thread because holding `write_lock` is not
+ // enough: `write_event()` unlocks it just before `write_message()`.
+ // `submit_to()` here is NOT blocking.
+ connection->center->submit_to(connection->center->get_id(), [this] {  // NOTE(review): captures `this` and runs later -- confirm the object outlives the queued call
+ ldout(cct, 5) << "reset_recv_state (warped) reseting crypto handlers"
+ << dendl;
+ // Possibly unnecessary. See the comment in `deactivate_existing`.
+ std::lock_guard<std::mutex> l(connection->lock);
+ std::lock_guard<std::mutex> wl(connection->write_lock);
+ reset_security();
+ }, /* nowait = */true);
+ } else {
+ reset_security();  // already on the connection's event-center thread: reset immediately
+ }
+
+ // clean read and write callbacks
+ connection->pendingReadLen.reset();
+ connection->writeCallback.reset();
+
+ next_tag = static_cast<Tag>(0);
+
+ reset_throttle();
+}
+
+size_t ProtocolV2::get_current_msg_size() const {
+  // Sum of the logical lengths of every segment of the frame being received,
+  // except segment 0.
+  ceph_assert(rx_frame_asm.get_num_segments() > 0);
+  const auto num_segments = rx_frame_asm.get_num_segments();
+  size_t total = 0;
+  // Start at 1: SegmentIndex::Msg::HEADER is not counted.
+  for (size_t seg = 1; seg < num_segments; ++seg) {
+    total += rx_frame_asm.get_segment_logical_len(seg);
+  }
+  return total;
+}
+
+void ProtocolV2::reset_throttle() {
+  // A throttle stage counts as taken once `state` has moved past it without
+  // moving past THROTTLE_DONE; only stages that were taken are released.
+  const auto passed = [this](const auto stage) {
+    return state > stage && state <= THROTTLE_DONE;
+  };
+  auto& policy = connection->policy;
+
+  if (passed(THROTTLE_MESSAGE) && policy.throttler_messages) {
+    ldout(cct, 10) << __func__ << " releasing " << 1
+                   << " message to policy throttler "
+                   << policy.throttler_messages->get_current()
+                   << "/" << policy.throttler_messages->get_max()
+                   << dendl;
+    policy.throttler_messages->put();
+  }
+
+  if (passed(THROTTLE_BYTES) && policy.throttler_bytes) {
+    const size_t cur_msg_size = get_current_msg_size();
+    ldout(cct, 10) << __func__ << " releasing " << cur_msg_size
+                   << " bytes to policy throttler "
+                   << policy.throttler_bytes->get_current() << "/"
+                   << policy.throttler_bytes->get_max() << dendl;
+    policy.throttler_bytes->put(cur_msg_size);
+  }
+
+  if (passed(THROTTLE_DISPATCH_QUEUE)) {
+    const size_t cur_msg_size = get_current_msg_size();
+    auto* const dispatch_queue = connection->dispatch_queue;
+    ldout(cct, 10)
+        << __func__ << " releasing " << cur_msg_size
+        << " bytes to dispatch_queue throttler "
+        << dispatch_queue->dispatch_throttler.get_current() << "/"
+        << dispatch_queue->dispatch_throttler.get_max() << dendl;
+    dispatch_queue->dispatch_throttle_release(cur_msg_size);
+  }
+}
+
+CtPtr ProtocolV2::_fault() { /*
+ * Fault handler for this connection. Based on the current `state` and the
+ * connection policy it either stops the connection and queues a reset,
+ * parks it in STANDBY, or prepares a reconnect (right away or after a
+ * backoff timer). Every return statement in this function returns nullptr.
+ */
+ ldout(cct, 10) << __func__ << dendl;
+
+ if (state == CLOSED || state == NONE) { // already torn down: nothing to do
+ ldout(cct, 10) << __func__ << " connection is already closed" << dendl;
+ return nullptr;
+ }
+
+ if (connection->policy.lossy &&
+ !(state >= START_CONNECT && state <= SESSION_RECONNECTING)) { // lossy policy outside the START_CONNECT..SESSION_RECONNECTING range: give up
+ ldout(cct, 2) << __func__ << " on lossy channel, failing" << dendl;
+ stop();
+ connection->dispatch_queue->queue_reset(connection);
+ return nullptr;
+ }
+
+ connection->write_lock.lock(); // held until one of the explicit unlock() calls below
+
+ can_write = false;
+ // requeue sent items
+ requeue_sent();
+
+ if (out_queue.empty() && state >= START_ACCEPT &&
+ state <= SESSION_ACCEPTING && !replacing) { // half-accepted session with nothing queued: just close it
+ ldout(cct, 2) << __func__ << " with nothing to send and in the half "
+ << " accept state just closed" << dendl;
+ connection->write_lock.unlock();
+ stop();
+ connection->dispatch_queue->queue_reset(connection);
+ return nullptr;
+ }
+
+ replacing = false;
+ connection->fault();
+ reset_recv_state(); // called with write_lock held, as that method expects
+
+ reconnecting = false;
+
+ if (connection->policy.standby && out_queue.empty() && !keepalive &&
+ state != WAIT) { // idle and allowed to stand by: park the connection
+ ldout(cct, 1) << __func__ << " with nothing to send, going to standby"
+ << dendl;
+ state = STANDBY;
+ connection->write_lock.unlock();
+ return nullptr;
+ }
+ if (connection->policy.server) { // servers never reconnect on their own
+ ldout(cct, 1) << __func__ << " server, going to standby, even though i have stuff queued" << dendl;
+ state = STANDBY;
+ connection->write_lock.unlock();
+ return nullptr;
+ }
+
+ connection->write_lock.unlock(); // the state updates below run without write_lock
+
+ if (!(state >= START_CONNECT && state <= SESSION_RECONNECTING) &&
+ state != WAIT &&
+ state != SESSION_ACCEPTING /* due to connection race */) { // not already connecting: retry without backoff
+ // policy maybe empty when state is in accept
+ if (connection->policy.server) { // NOTE(review): looks unreachable, the function already returned above when policy.server is set - confirm
+ ldout(cct, 1) << __func__ << " server, going to standby" << dendl;
+ state = STANDBY;
+ } else {
+ ldout(cct, 1) << __func__ << " initiating reconnect" << dendl;
+ connect_seq++;
+ global_seq = messenger->get_global_seq();
+ state = START_CONNECT;
+ pre_auth.enabled = true;
+ connection->state = AsyncConnection::STATE_CONNECTING;
+ }
+ backoff = utime_t(); // clear any previous backoff
+ connection->center->dispatch_event_external(connection->read_handler);
+ } else { /*
+ * The fault happened while connecting (or in WAIT / SESSION_ACCEPTING):
+ * pick a backoff. WAIT uses ms_max_backoff, a zero backoff starts at
+ * ms_initial_backoff, anything else doubles and is capped at
+ * ms_max_backoff.
+ */
+ if (state == WAIT) {
+ backoff.set_from_double(cct->_conf->ms_max_backoff);
+ } else if (backoff == utime_t()) {
+ backoff.set_from_double(cct->_conf->ms_initial_backoff);
+ } else {
+ backoff += backoff;
+ if (backoff > cct->_conf->ms_max_backoff)
+ backoff.set_from_double(cct->_conf->ms_max_backoff);
+ }
+
+ if (server_cookie) { // a non-zero server_cookie presumably means a session exists (see write_event)
+ connect_seq++;
+ }
+
+ global_seq = messenger->get_global_seq();
+ state = START_CONNECT;
+ pre_auth.enabled = true;
+ connection->state = AsyncConnection::STATE_CONNECTING;
+ ldout(cct, 1) << __func__ << " waiting " << backoff << dendl;
+ // arm a timer that fires wakeup_handler once the backoff has elapsed
+ connection->register_time_events.insert(
+ connection->center->create_time_event(backoff.to_nsec() / 1000, /* nanoseconds / 1000 */
+ connection->wakeup_handler));
+ }
+ return nullptr;
+}
+
+void ProtocolV2::prepare_send_message(uint64_t features,
+                                      Message *m) {
+  ldout(cct, 20) << __func__ << " m=" << *m << dendl;
+
+  // Only the log wording depends on whether a payload is already present;
+  // the encode call below is the same either way.
+  const char *action = m->empty_payload() ? " encoding features "
+                                          : " half-reencoding features ";
+  ldout(cct, 20) << __func__ << action << features << " " << m
+                 << " " << *m << dendl;
+
+  // encode and copy out of *m
+  m->encode(features, 0);
+}
+
+void ProtocolV2::send_message(Message *m) {
+  // Features are sampled before taking the lock so that the message can be
+  // encoded outside of it.
+  const uint64_t f = connection->get_features();
+
+  // TODO: Currently not all messages supports reencode like MOSDMap, so here
+  // only let fast dispatch support messages prepare message
+  const bool can_fast_prepare = messenger->ms_can_fast_dispatch(m);
+  if (can_fast_prepare) {
+    prepare_send_message(f, m);
+  }
+
+  std::lock_guard<std::mutex> l(connection->write_lock);
+
+  // "features" changes will change the payload encoding, and so may a
+  // session that is not writable yet: throw the early encoding away then.
+  bool is_prepared = can_fast_prepare;
+  const bool stale_encoding =
+      can_fast_prepare && (!can_write || connection->get_features() != f);
+  if (stale_encoding) {
+    // ensure the correctness of message encoding
+    m->clear_payload();
+    is_prepared = false;
+    ldout(cct, 10) << __func__ << " clear encoded buffer previous " << f
+                   << " != " << connection->get_features() << dendl;
+  }
+
+  if (state == CLOSED) {
+    ldout(cct, 10) << __func__ << " connection closed."
+                   << " Drop message " << m << dendl;
+    m->put();
+    return;
+  }
+
+  ldout(cct, 5) << __func__ << " enqueueing message m=" << m
+                << " type=" << m->get_type() << " " << *m << dendl;
+  m->trace.event("async enqueueing message");
+  out_queue[m->get_priority()].emplace_back(
+      out_queue_entry_t{is_prepared, m});
+  ldout(cct, 15) << __func__ << " inline write is denied, reschedule m=" << m
+                 << dendl;
+
+  // Wake the writer unless one is already scheduled. STANDBY also qualifies:
+  // write_event() is what starts the reconnect from that state.
+  const bool may_schedule = (!replacing && can_write) || state == STANDBY;
+  if (may_schedule && !write_in_progress) {
+    write_in_progress = true;
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+}
+
+void ProtocolV2::send_keepalive() {
+  ldout(cct, 10) << __func__ << dendl;
+  std::lock_guard<std::mutex> l(connection->write_lock);
+  if (state == CLOSED) {
+    return;
+  }
+  // Only flag the request here; write_event() appends the keepalive frame.
+  keepalive = true;
+  connection->center->dispatch_event_external(connection->write_handler);
+}
+
+void ProtocolV2::read_event() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  // Choose the continuation that resumes the state machine for the current
+  // state. States not listed here have nothing to resume.
+  CtPtr resume = nullptr;
+  switch (state) {
+    case START_CONNECT:
+      resume = &(CONTINUATION(start_client_banner_exchange));
+      break;
+    case START_ACCEPT:
+      resume = &(CONTINUATION(start_server_banner_exchange));
+      break;
+    case READY:
+      resume = &(CONTINUATION(read_frame));
+      break;
+    case THROTTLE_MESSAGE:
+      resume = &(CONTINUATION(throttle_message));
+      break;
+    case THROTTLE_BYTES:
+      resume = &(CONTINUATION(throttle_bytes));
+      break;
+    case THROTTLE_DISPATCH_QUEUE:
+      resume = &(CONTINUATION(throttle_dispatch_queue));
+      break;
+    default:
+      break;
+  }
+
+  if (resume) {
+    run_continuation(*resume);
+  }
+}
+
+ProtocolV2::out_queue_entry_t ProtocolV2::_get_next_outgoing() {
+  // A default-constructed entry is returned when nothing is queued; callers
+  // test its `m` member.
+  out_queue_entry_t next_entry;
+  if (out_queue.empty()) {
+    return next_entry;
+  }
+
+  // The last element of `out_queue` is served first (presumably the highest
+  // priority, as the container is keyed by message priority).
+  const auto last = out_queue.rbegin();
+  auto& entries = last->second;
+  ceph_assert(!entries.empty());
+  next_entry = entries.front();
+  entries.pop_front();
+
+  // Never leave an empty per-priority list behind.
+  if (entries.empty()) {
+    out_queue.erase(last->first);
+  }
+  return next_entry;
+}
+
+ssize_t ProtocolV2::write_message(Message *m, bool more) {
+  // Assigns the next sequence number to `m`, frames it together with the
+  // current ack sequence and tries to send it. One reference to `m` is
+  // dropped on every path. Returns -EILSEQ when the frame cannot be
+  // assembled, otherwise the result of _try_send().
+  FUNCTRACE(cct);
+  ceph_assert(connection->center->in_thread());
+  m->set_seq(++out_seq);
+
+  // The message carries the ack, so there are no acks left to send.
+  uint64_t ack_seq;
+  {
+    std::lock_guard<std::mutex> l(connection->lock);
+    ack_seq = in_seq;
+    ack_left = 0;
+  }
+
+  ceph_msg_header &header = m->get_header();
+  ceph_msg_footer &footer = m->get_footer();
+
+  ceph_msg_header2 header2{header.seq, header.tid,
+                           header.type, header.priority,
+                           header.version,
+                           init_le32(0), header.data_off,
+                           init_le64(ack_seq),
+                           footer.flags, header.compat_version,
+                           header.reserved};
+
+  auto frame = MessageFrame::Encode(
+      header2,
+      m->get_payload(),
+      m->get_middle(),
+      m->get_data());
+  if (!append_frame(frame)) {
+    m->put();
+    return -EILSEQ;
+  }
+
+  ldout(cct, 5) << __func__ << " sending message m=" << m
+                << " seq=" << m->get_seq() << " " << *m << dendl;
+
+  m->trace.event("async writing message");
+  ldout(cct, 20) << __func__ << " sending m=" << m << " seq=" << m->get_seq()
+                 << " src=" << entity_name_t(messenger->get_myname())
+                 << " off=" << header2.data_off
+                 << dendl;
+
+  // Remember how much was buffered so the bytes sent can be accounted for.
+  ssize_t buffered_before = connection->outgoing_bl.length();
+  ssize_t rc = connection->_try_send(more);
+  if (rc >= 0) {
+    connection->logger->inc(
+        l_msgr_send_bytes, buffered_before - connection->outgoing_bl.length());
+    ldout(cct, 10) << __func__ << " sending " << m
+                   << (rc ? " continuely." : " done.") << dendl;
+  } else {
+    ldout(cct, 1) << __func__ << " error sending " << m << ", "
+                  << cpp_strerror(rc) << dendl;
+  }
+
+  if (m->get_type() == CEPH_MSG_OSD_OP)
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OP_END", false);
+  else if (m->get_type() == CEPH_MSG_OSD_OPREPLY)
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OPREPLY_END", false);
+  m->put();
+
+  return rc;
+}
+
+template <class F>
+bool ProtocolV2::append_frame(F& frame) {
+  // Serializes `frame` with the tx frame assembler and appends the bytes to
+  // the connection's outgoing buffer. Returns false, appending nothing, when
+  // the assembler reports a TxHandlerError.
+  ceph::bufferlist assembled;
+  try {
+    assembled = frame.get_buffer(tx_frame_asm);
+  } catch (const ceph::crypto::onwire::TxHandlerError &e) {
+    ldout(cct, 1) << __func__ << " " << e.what() << dendl;
+    return false;
+  }
+
+  ldout(cct, 25) << __func__ << " assembled frame " << assembled.length()
+                 << " bytes " << tx_frame_asm << dendl;
+  connection->outgoing_bl.append(assembled);
+  return true;
+}
+
+void ProtocolV2::handle_message_ack(uint64_t seq) {
+  // lossy connections don't keep sent messages
+  if (connection->policy.lossy) {
+    return;
+  }
+
+  ldout(cct, 15) << __func__ << " seq=" << seq << dendl;
+
+  // Trim the sent list: acknowledged messages are unlinked under the write
+  // lock and their references dropped after it is released. At most
+  // `max_pending` messages are trimmed per call.
+  static const int max_pending = 128;
+  Message *acked[max_pending];
+  int num_acked = 0;
+  {
+    std::lock_guard<std::mutex> l(connection->write_lock);
+    while (num_acked < max_pending && !sent.empty() &&
+           sent.front()->get_seq() <= seq) {
+      Message *m = sent.front();
+      sent.pop_front();
+      acked[num_acked++] = m;
+      ldout(cct, 10) << __func__ << " got ack seq " << seq
+                     << " >= " << m->get_seq() << " on " << m << " " << *m
+                     << dendl;
+    }
+  }
+  for (Message **it = acked; it != acked + num_acked; ++it) {
+    (*it)->put();
+  }
+}
+
+void ProtocolV2::write_event() { /*
+ * Write-side event handler. When writing is allowed it appends a requested
+ * keepalive, drains `out_queue` through write_message(), and then sends an
+ * outstanding ack or flushes what is still buffered. When writing is not
+ * allowed it either starts a connect from STANDBY or only tries to flush
+ * the connection buffer. Every send failure ends in fault().
+ */
+ ldout(cct, 10) << __func__ << dendl;
+ ssize_t r = 0;
+
+ connection->write_lock.lock();
+ if (can_write) {
+ if (keepalive) {
+ ldout(cct, 10) << __func__ << " appending keepalive" << dendl;
+ auto keepalive_frame = KeepAliveFrame::Encode();
+ if (!append_frame(keepalive_frame)) { // frame could not be assembled: fault with connection->lock instead of write_lock
+ connection->write_lock.unlock();
+ connection->lock.lock();
+ fault();
+ connection->lock.unlock();
+ return;
+ }
+ keepalive = false;
+ }
+
+ auto start = ceph::mono_clock::now(); // start of the interval reported to l_msgr_running_send_time
+ bool more;
+ do {
+ const auto out_entry = _get_next_outgoing();
+ if (!out_entry.m) { // queue drained
+ break;
+ }
+
+ if (!connection->policy.lossy) {
+ // put on sent list
+ sent.push_back(out_entry.m);
+ out_entry.m->get(); // extra reference owned by the sent list
+ }
+ more = !out_queue.empty();
+ connection->write_lock.unlock(); // encoding and sending run without write_lock
+
+ // send_message or requeue messages may not encode message
+ if (!out_entry.is_prepared) {
+ prepare_send_message(connection->get_features(), out_entry.m);
+ }
+
+ r = write_message(out_entry.m, more);
+
+ connection->write_lock.lock();
+ if (r == 0) { // fully sent: keep draining
+ ;
+ } else if (r < 0) {
+ ldout(cct, 1) << __func__ << " send msg failed" << dendl;
+ break;
+ } else if (r > 0) // data is still buffered: stop for now
+ break;
+ } while (can_write);
+ write_in_progress = false;
+
+ // if r > 0, data is still left to send, so there is no need to call _try_send.
+ if (r == 0) {
+ uint64_t left = ack_left;
+ if (left) {
+ ceph_le64 s; // NOTE(review): `s` is assigned but never read in this function
+ s = in_seq;
+ ldout(cct, 10) << __func__ << " try send msg ack, acked " << left
+ << " messages" << dendl;
+ auto ack_frame = AckFrame::Encode(in_seq);
+ if (append_frame(ack_frame)) {
+ ack_left -= left;
+ left = ack_left;
+ r = connection->_try_send(left); // NOTE(review): passes the remaining ack count where write_message passes its bool `more` - confirm intent
+ } else {
+ r = -EILSEQ;
+ }
+ } else if (is_queued()) {
+ r = connection->_try_send();
+ }
+ }
+ connection->write_lock.unlock();
+
+ connection->logger->tinc(l_msgr_running_send_time,
+ ceph::mono_clock::now() - start);
+ if (r < 0) {
+ ldout(cct, 1) << __func__ << " send msg failed" << dendl;
+ connection->lock.lock();
+ fault();
+ connection->lock.unlock();
+ return;
+ }
+ } else {
+ write_in_progress = false;
+ connection->write_lock.unlock(); // released so that connection->lock is taken before write_lock below
+ connection->lock.lock();
+ connection->write_lock.lock();
+ if (state == STANDBY && !connection->policy.server && is_queued()) { // a client in standby with pending data starts connecting
+ ldout(cct, 10) << __func__ << " policy.server is false" << dendl;
+ if (server_cookie) { // only increment connect_seq if there is a session
+ connect_seq++;
+ }
+ connection->_connect();
+ } else if (connection->cs && state != NONE && state != CLOSED &&
+ state != START_CONNECT) { // there is a socket and the state permits it: flush the buffered bytes
+ r = connection->_try_send();
+ if (r < 0) {
+ ldout(cct, 1) << __func__ << " send outcoming bl failed" << dendl;
+ connection->write_lock.unlock();
+ fault(); // runs with connection->lock still held
+ connection->lock.unlock();
+ return;
+ }
+ }
+ connection->write_lock.unlock();
+ connection->lock.unlock();
+ }
+}
+
+bool ProtocolV2::is_queued() {
+  // True when messages wait in `out_queue`, or when the connection itself
+  // reports queued data.
+  if (!out_queue.empty()) {
+    return true;
+  }
+  return connection->is_queued();
+}
+
+CtPtr ProtocolV2::read(CONTINUATION_RXBPTR_TYPE<ProtocolV2> &next,
+                       rx_buffer_t &&buffer) {
+  // Fetch length and data pointer first: ownership of the buffer moves into
+  // the continuation right after.
+  const auto want = buffer->length();
+  const auto dest = buffer->c_str();
+  next.node = std::move(buffer);
+
+  ssize_t r = connection->read(want, dest,
+    [&next, this](char *, int rc) {
+      // While pre-auth capture is on, keep a copy of everything received.
+      if (unlikely(pre_auth.enabled) && rc >= 0) {
+        pre_auth.rxbuf.append(*next.node);
+        ceph_assert(!cct->_conf->ms_die_on_bug ||
+                    pre_auth.rxbuf.length() < 20000000);
+      }
+      next.r = rc;
+      run_continuation(next);
+    });
+  if (r > 0) {
+    // Neither failed nor finished yet; the callback above takes over.
+    return nullptr;
+  }
+
+  // error or done synchronously
+  if (unlikely(pre_auth.enabled) && r >= 0) {
+    pre_auth.rxbuf.append(*next.node);
+    ceph_assert(!cct->_conf->ms_die_on_bug ||
+                pre_auth.rxbuf.length() < 20000000);
+  }
+  next.r = r;
+  return &next;
+}
+
+template <class F>
+CtPtr ProtocolV2::write(const std::string &desc,
+                        CONTINUATION_TYPE<ProtocolV2> &next,
+                        F &frame) {
+  // Serializes `frame` with the tx frame assembler and forwards the bytes to
+  // the bufferlist overload. An assembly error is handled by _fault().
+  ceph::bufferlist assembled;
+  try {
+    assembled = frame.get_buffer(tx_frame_asm);
+  } catch (const ceph::crypto::onwire::TxHandlerError &e) {
+    ldout(cct, 1) << __func__ << " " << e.what() << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 25) << __func__ << " assembled frame " << assembled.length()
+                 << " bytes " << tx_frame_asm << dendl;
+  return write(desc, next, assembled);
+}
+
+CtPtr ProtocolV2::write(const std::string &desc,
+                        CONTINUATION_TYPE<ProtocolV2> &next,
+                        bufferlist &buffer) {
+  // While pre-auth capture is on, keep a copy of everything sent.
+  if (unlikely(pre_auth.enabled)) {
+    pre_auth.txbuf.append(buffer);
+    ceph_assert(!cct->_conf->ms_die_on_bug ||
+                pre_auth.txbuf.length() < 20000000);
+  }
+
+  // Completion callback for a write that does not finish synchronously.
+  // NOTE(review): `next` is run even after _fault() handled a failure -
+  // confirm that this is intended.
+  const auto on_written = [&next, desc, this](int r) {
+    if (r < 0) {
+      ldout(cct, 1) << __func__ << " " << desc << " write failed r=" << r
+                    << " (" << cpp_strerror(r) << ")" << dendl;
+      connection->inject_delay();
+      _fault();
+    }
+    run_continuation(next);
+  };
+
+  const ssize_t r = connection->write(buffer, on_written);
+  if (r > 0) {
+    // Not finished yet; the callback continues later.
+    return nullptr;
+  }
+  if (r == 0) {
+    // Written synchronously: the caller can run `next` directly.
+    next.setParams();
+    return &next;
+  }
+
+  ldout(cct, 1) << __func__ << " " << desc << " write failed r=" << r
+                << " (" << cpp_strerror(r) << ")" << dendl;
+  return _fault();
+}
+
+CtPtr ProtocolV2::_banner_exchange(CtRef callback) {
+  ldout(cct, 20) << __func__ << dendl;
+  // Remembered here and handed back by handle_hello() once it has finished.
+  bannerExchangeCallback = &callback;
+
+  // Payload: our supported feature mask followed by our required one.
+  bufferlist features_bl;
+  encode((uint64_t)CEPH_MSGR2_SUPPORTED_FEATURES, features_bl, 0);
+  encode((uint64_t)CEPH_MSGR2_REQUIRED_FEATURES, features_bl, 0);
+
+  // Banner: fixed prefix, 16-bit payload length, then the payload.
+  bufferlist bl;
+  bl.append(CEPH_BANNER_V2_PREFIX, strlen(CEPH_BANNER_V2_PREFIX));
+  const uint16_t features_len = static_cast<uint16_t>(features_bl.length());
+  encode(features_len, bl, 0);
+  bl.claim_append(features_bl);
+
+  INTERCEPT(state == BANNER_CONNECTING ? 3 : 4);
+
+  return WRITE(bl, "banner", _wait_for_peer_banner);
+}
+
+CtPtr ProtocolV2::_wait_for_peer_banner() {
+  // First read only the fixed part of the banner: the prefix plus the
+  // 16-bit length of the payload that follows.
+  const unsigned prefix_len = strlen(CEPH_BANNER_V2_PREFIX);
+  const unsigned banner_len = prefix_len + sizeof(__le16);
+  return READ(banner_len, _handle_peer_banner);
+}
+
+CtPtr ProtocolV2::_handle_peer_banner(rx_buffer_t &&buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read peer banner failed r=" << r << " ("
+                  << cpp_strerror(r) << ")" << dendl;
+    return _fault();
+  }
+
+  // Anything that does not start with the v2 prefix is a fault; a v1 banner
+  // merely gets a more specific error message.
+  const unsigned banner_prefix_len = strlen(CEPH_BANNER_V2_PREFIX);
+  const char *received = buffer->c_str();
+  if (memcmp(received, CEPH_BANNER_V2_PREFIX, banner_prefix_len) != 0) {
+    if (memcmp(received, CEPH_BANNER, strlen(CEPH_BANNER)) == 0) {
+      lderr(cct) << __func__ << " peer " << *connection->peer_addrs
+                 << " is using msgr V1 protocol" << dendl;
+    } else {
+      ldout(cct, 1) << __func__ << " accept peer sent bad banner" << dendl;
+    }
+    return _fault();
+  }
+
+  // Narrow the buffer to the 16-bit payload length that follows the prefix
+  // and decode it.
+  buffer->set_offset(banner_prefix_len);
+  buffer->set_length(sizeof(__le16));
+  bufferlist len_bl;
+  len_bl.push_back(std::move(buffer));
+  uint16_t payload_len;
+  auto ti = len_bl.cbegin();
+  try {
+    decode(payload_len, ti);
+  } catch (const buffer::error &e) {
+    lderr(cct) << __func__ << " decode banner payload len failed " << dendl;
+    return _fault();
+  }
+
+  INTERCEPT(state == BANNER_CONNECTING ? 5 : 6);
+
+  return READ(payload_len, _handle_peer_banner_payload);
+}
+
+CtPtr ProtocolV2::_handle_peer_banner_payload(rx_buffer_t &&buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read peer banner payload failed r=" << r
+                  << " (" << cpp_strerror(r) << ")" << dendl;
+    return _fault();
+  }
+
+  // The payload holds the peer's supported and required feature masks.
+  uint64_t peer_supported;
+  uint64_t peer_required;
+
+  bufferlist payload_bl;
+  payload_bl.push_back(std::move(buffer));
+  auto ti = payload_bl.cbegin();
+  try {
+    decode(peer_supported, ti);
+    decode(peer_required, ti);
+  } catch (const buffer::error &e) {
+    lderr(cct) << __func__ << " decode banner payload failed " << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 1) << __func__ << " supported=" << std::hex
+                << peer_supported << " required=" << std::hex
+                << peer_required << std::dec << dendl;
+
+  // Check feature bit compatibility in both directions. A mismatch stops the
+  // connection and queues a reset.
+  const uint64_t our_supported = CEPH_MSGR2_SUPPORTED_FEATURES;
+  const uint64_t our_required = CEPH_MSGR2_REQUIRED_FEATURES;
+
+  const bool peer_lacks_ours = (our_required & peer_supported) != our_required;
+  if (peer_lacks_ours) {
+    ldout(cct, 1) << __func__ << " peer does not support all required features"
+                  << " required=" << std::hex << our_required
+                  << " supported=" << std::hex << peer_supported
+                  << std::dec << dendl;
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;
+  }
+  const bool we_lack_theirs = (our_supported & peer_required) != peer_required;
+  if (we_lack_theirs) {
+    ldout(cct, 1) << __func__ << " we do not support all peer required features"
+                  << " required=" << std::hex << peer_required
+                  << " supported=" << our_supported << std::dec << dendl;
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;
+  }
+
+  this->peer_supported_features = peer_supported;
+  if (peer_required == 0) {
+    this->connection_features = msgr2_required;
+  }
+
+  // if the peer supports msgr2.1, switch to it
+  const bool is_rev1 = HAVE_MSGR2_FEATURE(peer_supported, REVISION_1);
+  tx_frame_asm.set_is_rev1(is_rev1);
+  rx_frame_asm.set_is_rev1(is_rev1);
+
+  // Banner exchange is complete; move on to the hello exchange on the same
+  // side (connecting or accepting).
+  if (state == BANNER_CONNECTING) {
+    state = HELLO_CONNECTING;
+  } else {
+    ceph_assert(state == BANNER_ACCEPTING);
+    state = HELLO_ACCEPTING;
+  }
+
+  auto hello = HelloFrame::Encode(messenger->get_mytype(),
+                                  connection->target_addr);
+
+  INTERCEPT(state == HELLO_CONNECTING ? 7 : 8);
+
+  return WRITE(hello, "hello frame", read_frame);
+}
+
+CtPtr ProtocolV2::handle_hello(ceph::bufferlist &payload)
+{
+  // Handles the peer's HELLO frame: validates the state, records or checks
+  // the peer entity type, optionally learns our own address, and finally
+  // returns the continuation stored by _banner_exchange().
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != HELLO_CONNECTING && state != HELLO_ACCEPTING) {
+    lderr(cct) << __func__ << " not in hello exchange state!" << dendl;
+    return _fault();
+  }
+
+  auto hello = HelloFrame::Decode(payload);
+
+  ldout(cct, 5) << __func__ << " received hello:"
+                << " peer_type=" << (int)hello.entity_type()
+                << " peer_addr_for_me=" << hello.peer_addr() << dendl;
+
+  // Zero-initialized so that a failing getsockname() can never expose
+  // indeterminate bytes to the logging and address-learning code below.
+  sockaddr_storage ss{};
+  socklen_t len = sizeof(ss);
+  if (getsockname(connection->cs.fd(), (sockaddr *)&ss, &len) < 0) {
+    ldout(cct, 1) << __func__ << " getsockname failed, local socket address"
+                  << " is unknown" << dendl;
+  }
+  ldout(cct, 5) << __func__ << " getsockname says I am " << (sockaddr *)&ss
+                << " when talking to " << connection->target_addr << dendl;
+
+  if (connection->get_peer_type() == -1) {
+    // Peer type not known yet: this must be the accepting side. Adopt the
+    // advertised type and the policy configured for it.
+    connection->set_peer_type(hello.entity_type());
+
+    ceph_assert(state == HELLO_ACCEPTING);
+    connection->policy = messenger->get_policy(hello.entity_type());
+    ldout(cct, 10) << __func__ << " accept of host_type "
+                   << (int)hello.entity_type()
+                   << ", policy.lossy=" << connection->policy.lossy
+                   << " policy.server=" << connection->policy.server
+                   << " policy.standby=" << connection->policy.standby
+                   << " policy.resetcheck=" << connection->policy.resetcheck
+                   << dendl;
+  } else {
+    // Connecting side: the peer must be the entity type we expected.
+    ceph_assert(state == HELLO_CONNECTING);
+    if (connection->get_peer_type() != hello.entity_type()) {
+      ldout(cct, 1) << __func__ << " connection peer type does not match what"
+                    << " peer advertises " << connection->get_peer_type()
+                    << " != " << (int)hello.entity_type() << dendl;
+      stop();
+      connection->dispatch_queue->queue_reset(connection);
+      return nullptr;
+    }
+  }
+
+  if (messenger->get_myaddrs().empty() ||
+      messenger->get_myaddrs().front().is_blank_ip()) {
+    // Our own address is still unknown: learn it either from what the peer
+    // reports or from the local end of the socket.
+    entity_addr_t a;
+    if (cct->_conf->ms_learn_addr_from_peer) {
+      ldout(cct, 1) << __func__ << " peer " << connection->target_addr
+                    << " says I am " << hello.peer_addr() << " (socket says "
+                    << (sockaddr*)&ss << ")" << dendl;
+      a = hello.peer_addr();
+    } else {
+      ldout(cct, 1) << __func__ << " socket to " << connection->target_addr
+                    << " says I am " << (sockaddr*)&ss
+                    << " (peer says " << hello.peer_addr() << ")" << dendl;
+      a.set_sockaddr((sockaddr *)&ss);
+    }
+    a.set_type(entity_addr_t::TYPE_MSGR2); // anything but NONE; learned_addr ignores this
+    a.set_port(0);
+    // connection->lock is released around learned_addr() (and the optional
+    // injected delay), so the state has to be checked again afterwards.
+    connection->lock.unlock();
+    messenger->learned_addr(a);
+    if (cct->_conf->ms_inject_internal_delays &&
+        cct->_conf->ms_inject_socket_failures) {
+      if (rand() % cct->_conf->ms_inject_socket_failures == 0) {
+        ldout(cct, 10) << __func__ << " sleep for "
+                       << cct->_conf->ms_inject_internal_delays << dendl;
+        utime_t t;
+        t.set_from_double(cct->_conf->ms_inject_internal_delays);
+        t.sleep();
+      }
+    }
+    connection->lock.lock();
+    if (state != HELLO_CONNECTING) {
+      ldout(cct, 1) << __func__
+                    << " state changed while learned_addr, mark_down or "
+                    << " replacing must be happened just now" << dendl;
+      return nullptr;
+    }
+  }
+
+  // Hand control back to whoever started the banner exchange.
+  CtPtr callback;
+  callback = bannerExchangeCallback;
+  bannerExchangeCallback = nullptr;
+  ceph_assert(callback);
+  return callback;
+}
+
+CtPtr ProtocolV2::read_frame() {
+  // Nothing to read on a closed connection.
+  if (state == CLOSED) {
+    return nullptr;
+  }
+
+  ldout(cct, 20) << __func__ << dendl;
+
+  // Drop whatever is left from the previous frame before reading the next
+  // preamble.
+  rx_segments_data.clear();
+  rx_epilogue.clear();
+  rx_preamble.clear();
+
+  return READ(rx_frame_asm.get_preamble_onwire_len(),
+              handle_read_frame_preamble_main);
+}
+
+CtPtr ProtocolV2::handle_read_frame_preamble_main(rx_buffer_t &&buffer, int r) {
+  // Completion of the preamble read: disassembles the preamble to learn the
+  // frame tag, then either enters message throttling (MESSAGE frames) or
+  // goes on to read the frame segments.
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read frame preamble failed r=" << r
+                  << " (" << cpp_strerror(r) << ")" << dendl;
+    return _fault();
+  }
+
+  rx_preamble.push_back(std::move(buffer));
+
+  ldout(cct, 30) << __func__ << " preamble\n";
+  rx_preamble.hexdump(*_dout);
+  *_dout << dendl;
+
+  try {
+    next_tag = rx_frame_asm.disassemble_preamble(rx_preamble);
+  } catch (FrameError& e) {
+    ldout(cct, 1) << __func__ << " " << e.what() << dendl;
+    return _fault();
+  } catch (ceph::crypto::onwire::MsgAuthError&) {
+    // The leading space keeps the text apart from the function name.
+    ldout(cct, 1) << __func__ << " bad auth tag" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 25) << __func__ << " disassembled preamble " << rx_frame_asm
+                 << dendl;
+
+  if (session_stream_handlers.rx) {
+    ldout(cct, 30) << __func__ << " preamble after decrypt\n";
+    rx_preamble.hexdump(*_dout);
+    *_dout << dendl;
+  }
+
+  // does it need throttle?
+  if (next_tag == Tag::MESSAGE) {
+    // Message frames are only accepted in the READY state.
+    if (state != READY) {
+      lderr(cct) << __func__ << " not in ready state!" << dendl;
+      return _fault();
+    }
+    state = THROTTLE_MESSAGE;
+    return CONTINUE(throttle_message);
+  } else {
+    return read_frame_segment();
+  }
+}
+
+CtPtr ProtocolV2::handle_read_frame_dispatch() {
+  ldout(cct, 10) << __func__
+                 << " tag=" << static_cast<uint32_t>(next_tag) << dendl;
+
+  // Messages have a handler of their own; every other known tag is a control
+  // frame served by handle_frame_payload().
+  if (next_tag == Tag::MESSAGE) {
+    return handle_message();
+  }
+
+  switch (next_tag) {
+    case Tag::HELLO:
+    case Tag::AUTH_REQUEST:
+    case Tag::AUTH_BAD_METHOD:
+    case Tag::AUTH_REPLY_MORE:
+    case Tag::AUTH_REQUEST_MORE:
+    case Tag::AUTH_DONE:
+    case Tag::AUTH_SIGNATURE:
+    case Tag::CLIENT_IDENT:
+    case Tag::SERVER_IDENT:
+    case Tag::IDENT_MISSING_FEATURES:
+    case Tag::SESSION_RECONNECT:
+    case Tag::SESSION_RESET:
+    case Tag::SESSION_RETRY:
+    case Tag::SESSION_RETRY_GLOBAL:
+    case Tag::SESSION_RECONNECT_OK:
+    case Tag::KEEPALIVE2:
+    case Tag::KEEPALIVE2_ACK:
+    case Tag::ACK:
+    case Tag::WAIT:
+      return handle_frame_payload();
+    default:
+      break;
+  }
+
+  // Unknown tag: treat it as a protocol fault.
+  lderr(cct) << __func__
+             << " received unknown tag=" << static_cast<uint32_t>(next_tag)
+             << dendl;
+  return _fault();
+}
+
+CtPtr ProtocolV2::read_frame_segment() {
+  // The next segment to read is the one after those collected so far; an
+  // empty slot is added for it up front.
+  const size_t seg_idx = rx_segments_data.size();
+  ldout(cct, 20) << __func__ << " seg_idx=" << seg_idx << dendl;
+  rx_segments_data.emplace_back();
+
+  // Empty segments need no read at all.
+  const uint32_t onwire_len = rx_frame_asm.get_segment_onwire_len(seg_idx);
+  if (onwire_len == 0) {
+    return _handle_read_frame_segment();
+  }
+
+  const uint16_t align = rx_frame_asm.get_segment_align(seg_idx);
+  rx_buffer_t rx_buffer;
+  try {
+    rx_buffer = buffer::ptr_node::create(buffer::create_aligned(
+        onwire_len, align));
+  } catch (std::bad_alloc&) {
+    // Catching because of potential issues with satisfying alignment.
+    ldout(cct, 1) << __func__ << " can't allocate aligned rx_buffer"
+                  << " len=" << onwire_len
+                  << " align=" << align
+                  << dendl;
+    return _fault();
+  }
+
+  return READ_RXBUF(std::move(rx_buffer), handle_read_frame_segment);
+}
+
+CtPtr ProtocolV2::handle_read_frame_segment(rx_buffer_t &&rx_buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read frame segment failed r=" << r << " ("
+                  << cpp_strerror(r) << ")" << dendl;
+    return _fault();
+  }
+
+  // Store the data in the slot that read_frame_segment() added for it.
+  auto& current_segment = rx_segments_data.back();
+  current_segment.push_back(std::move(rx_buffer));
+  return _handle_read_frame_segment();
+}
+
+CtPtr ProtocolV2::_handle_read_frame_segment() {
+  if (rx_segments_data.size() != rx_frame_asm.get_num_segments()) {
+    // More segments to go.
+    // TODO: for makeshift only. This will be more generic and throttled
+    return read_frame_segment();
+  }
+
+  // OK, all segments planned to read are read. Can go with epilogue, which
+  // is only read when it has any bytes on the wire.
+  const uint32_t epilogue_onwire_len = rx_frame_asm.get_epilogue_onwire_len();
+  if (epilogue_onwire_len != 0) {
+    return READ(epilogue_onwire_len, handle_read_frame_epilogue_main);
+  }
+  return _handle_read_frame_epilogue_main();
+}
+
+CtPtr ProtocolV2::handle_frame_payload() {
+  // Dispatches a control frame to the handler for its tag. The handler is
+  // given the last segment that was read.
+  ceph_assert(!rx_segments_data.empty());
+  auto& frame_payload = rx_segments_data.back();
+
+  ldout(cct, 30) << __func__ << "\n";
+  frame_payload.hexdump(*_dout);
+  *_dout << dendl;
+
+  switch (next_tag) {
+    // hello exchange
+    case Tag::HELLO:
+      return handle_hello(frame_payload);
+
+    // authentication
+    case Tag::AUTH_REQUEST:
+      return handle_auth_request(frame_payload);
+    case Tag::AUTH_BAD_METHOD:
+      return handle_auth_bad_method(frame_payload);
+    case Tag::AUTH_REPLY_MORE:
+      return handle_auth_reply_more(frame_payload);
+    case Tag::AUTH_REQUEST_MORE:
+      return handle_auth_request_more(frame_payload);
+    case Tag::AUTH_DONE:
+      return handle_auth_done(frame_payload);
+    case Tag::AUTH_SIGNATURE:
+      return handle_auth_signature(frame_payload);
+
+    // identification
+    case Tag::CLIENT_IDENT:
+      return handle_client_ident(frame_payload);
+    case Tag::SERVER_IDENT:
+      return handle_server_ident(frame_payload);
+    case Tag::IDENT_MISSING_FEATURES:
+      return handle_ident_missing_features(frame_payload);
+
+    // session handling
+    case Tag::SESSION_RECONNECT:
+      return handle_reconnect(frame_payload);
+    case Tag::SESSION_RESET:
+      return handle_session_reset(frame_payload);
+    case Tag::SESSION_RETRY:
+      return handle_session_retry(frame_payload);
+    case Tag::SESSION_RETRY_GLOBAL:
+      return handle_session_retry_global(frame_payload);
+    case Tag::SESSION_RECONNECT_OK:
+      return handle_reconnect_ok(frame_payload);
+
+    // keepalive, acks and wait
+    case Tag::KEEPALIVE2:
+      return handle_keepalive2(frame_payload);
+    case Tag::KEEPALIVE2_ACK:
+      return handle_keepalive2_ack(frame_payload);
+    case Tag::ACK:
+      return handle_message_ack(frame_payload);
+    case Tag::WAIT:
+      return handle_wait(frame_payload);
+
+    default:
+      // Any other tag reaching this function aborts the process.
+      ceph_abort();
+  }
+  return nullptr;
+}
+
+CtPtr ProtocolV2::ready() {
+  ldout(cct, 25) << __func__ << dendl;
+
+  // The session is established: no replace or reconnect is pending anymore.
+  replacing = false;
+  reconnecting = false;
+
+  // make sure no pending tick timer, then arm a fresh one
+  if (connection->last_tick_id) {
+    connection->center->delete_time_event(connection->last_tick_id);
+  }
+  connection->last_tick_id = connection->center->create_time_event(
+      connection->inactive_timeout_us, connection->tick_handler);
+
+  // Open the write side; if messages piled up meanwhile, wake the writer.
+  {
+    std::lock_guard<std::mutex> wl(connection->write_lock);
+    can_write = true;
+    const bool has_backlog = !out_queue.empty();
+    if (has_backlog) {
+      connection->center->dispatch_event_external(connection->write_handler);
+    }
+  }
+
+  connection->maybe_start_delay_thread();
+
+  state = READY;
+  ldout(cct, 1) << __func__ << " entity=" << peer_name << " client_cookie="
+                << std::hex << client_cookie << " server_cookie="
+                << server_cookie << std::dec << " in_seq=" << in_seq
+                << " out_seq=" << out_seq << dendl;
+
+  INTERCEPT(15);
+
+  return CONTINUE(read_frame);
+}
+
+CtPtr ProtocolV2::handle_read_frame_epilogue_main(rx_buffer_t &&buffer, int r)
+{
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  const bool read_failed = r < 0;
+  if (read_failed) {
+    ldout(cct, 1) << __func__ << " read frame epilogue failed r=" << r
+                  << " (" << cpp_strerror(r) << ")" << dendl;
+    return _fault();
+  }
+
+  // Keep the epilogue bytes for the frame disassembly that follows.
+  rx_epilogue.push_back(std::move(buffer));
+  return _handle_read_frame_epilogue_main();
+}
+
+CtPtr ProtocolV2::_handle_read_frame_epilogue_main() {
+  // Runs once all parts of a frame have been read: disassembles the
+  // segments, then either drops an aborted frame and goes back to reading
+  // or dispatches the frame by its tag.
+  bool aborted;
+  try {
+    rx_frame_asm.disassemble_first_segment(rx_preamble, rx_segments_data[0]);
+    aborted = !rx_frame_asm.disassemble_remaining_segments(
+        rx_segments_data.data(), rx_epilogue);
+  } catch (FrameError& e) {
+    ldout(cct, 1) << __func__ << " " << e.what() << dendl;
+    return _fault();
+  } catch (ceph::crypto::onwire::MsgAuthError&) {
+    // The leading space keeps the text apart from the function name.
+    ldout(cct, 1) << __func__ << " bad auth tag" << dendl;
+    return _fault();
+  }
+
+  // we do have a mechanism that allows transmitter to start sending message
+  // and abort after putting entire data field on wire. This will be used by
+  // the kernel client to avoid unnecessary buffering.
+  if (aborted) {
+    // Release the throttle budget taken for the frame and wait for the
+    // next one.
+    reset_throttle();
+    state = READY;
+    return CONTINUE(read_frame);
+  }
+  return handle_read_frame_dispatch();
+}
+
+CtPtr ProtocolV2::handle_message() {  // Decode the received message frame, check its seq against in_seq and hand it to delay/fast/queued dispatch.
+ ldout(cct, 20) << __func__ << dendl;
+ ceph_assert(state == THROTTLE_DONE);  // THROTTLE_DONE is set by throttle_dispatch_queue() once throttling succeeded
+
+#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
+ ltt_recv_stamp = ceph_clock_now();
+#endif
+ recv_stamp = ceph_clock_now();
+
+ const size_t cur_msg_size = get_current_msg_size();
+ auto msg_frame = MessageFrame::Decode(rx_segments_data);
+
+ // XXX: paranoid copy (by value) of the frame's header just to avoid oops
+ ceph_msg_header2 current_header = msg_frame.header();
+
+ ldout(cct, 5) << __func__
+ << " got " << msg_frame.front_len()
+ << " + " << msg_frame.middle_len()
+ << " + " << msg_frame.data_len()
+ << " byte message."
+ << " envelope type=" << current_header.type
+ << " src " << peer_name
+ << " off " << current_header.data_off
+ << dendl;
+
+ INTERCEPT(16);
+ ceph_msg_header header{current_header.seq,  // build a ceph_msg_header from the ceph_msg_header2 fields and the frame's segment lengths
+ current_header.tid,
+ current_header.type,
+ current_header.priority,
+ current_header.version,
+ init_le32(msg_frame.front_len()),
+ init_le32(msg_frame.middle_len()),
+ init_le32(msg_frame.data_len()),
+ current_header.data_off,
+ peer_name,
+ current_header.compat_version,
+ current_header.reserved,
+ init_le32(0)};
+ ceph_msg_footer footer{init_le32(0), init_le32(0),  // footer is all zeroes except for the flags copied from the header
+ init_le32(0), init_le64(0), current_header.flags};
+
+ Message *message = decode_message(cct, 0, header, footer,
+ msg_frame.front(),
+ msg_frame.middle(),
+ msg_frame.data(),
+ connection);
+ if (!message) {
+ ldout(cct, 1) << __func__ << " decode message failed " << dendl;
+ return _fault();
+ } else {
+ state = READ_MESSAGE_COMPLETE;
+ }
+
+ INTERCEPT(17);
+
+ message->set_byte_throttler(connection->policy.throttler_bytes);
+ message->set_message_throttler(connection->policy.throttler_messages);
+
+ // store reservation size in message, so we don't get confused
+ // by messages entering the dispatch queue through other paths.
+ message->set_dispatch_throttle_size(cur_msg_size);
+
+ message->set_recv_stamp(recv_stamp);
+ message->set_throttle_stamp(throttle_stamp);
+ message->set_recv_complete_stamp(ceph_clock_now());
+
+ // check received seq#. if it is old, drop the message.
+ // note that incoming messages may skip ahead. this is convenient for the
+ // client side queueing because messages can't be renumbered, but the (kernel)
+ // client will occasionally pull a message out of the sent queue to send
+ // elsewhere. in that case it doesn't matter if we "got" it or not.
+ uint64_t cur_seq = in_seq;
+ if (message->get_seq() <= cur_seq) {
+ ldout(cct, 0) << __func__ << " got old message " << message->get_seq()
+ << " <= " << cur_seq << " " << message << " " << *message
+ << ", discarding" << dendl;
+ message->put();
+ if (connection->has_feature(CEPH_FEATURE_RECONNECT_SEQ) &&
+ cct->_conf->ms_die_on_old_message) {
+ ceph_assert(0 == "old msgs despite reconnect_seq feature");
+ }
+ return nullptr;  // no continuation is returned on the old-message path
+ }
+ if (message->get_seq() > cur_seq + 1) {  // a gap is only logged, unless ms_die_on_skipped_message is set
+ ldout(cct, 0) << __func__ << " missed message? skipped from seq "
+ << cur_seq << " to " << message->get_seq() << dendl;
+ if (cct->_conf->ms_die_on_skipped_message) {
+ ceph_assert(0 == "skipped incoming seq");
+ }
+ }
+
+#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
+ if (message->get_type() == CEPH_MSG_OSD_OP ||
+ message->get_type() == CEPH_MSG_OSD_OPREPLY) {
+ utime_t ltt_processed_stamp = ceph_clock_now();
+ double usecs_elapsed =
+ (ltt_processed_stamp.to_nsec() - ltt_recv_stamp.to_nsec()) / 1000;
+ ostringstream buf;
+ if (message->get_type() == CEPH_MSG_OSD_OP)
+ OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OP",
+ false);
+ else
+ OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OPREPLY",
+ false);
+ }
+#endif
+
+ // note last received message.
+ in_seq = message->get_seq();
+ ldout(cct, 5) << __func__ << " received message m=" << message
+ << " seq=" << message->get_seq()
+ << " from=" << message->get_source() << " type=" << header.type
+ << " " << *message << dendl;
+
+ bool need_dispatch_writer = false;
+ if (!connection->policy.lossy) {  // non-lossy policy: bump ack_left and schedule the write handler below
+ ack_left++;
+ need_dispatch_writer = true;
+ }
+
+ state = READY;
+
+ connection->logger->inc(l_msgr_recv_messages);
+ connection->logger->inc(l_msgr_recv_bytes,
+ rx_frame_asm.get_frame_onwire_len());
+
+ messenger->ms_fast_preprocess(message);
+ auto fast_dispatch_time = ceph::mono_clock::now();
+ connection->logger->tinc(l_msgr_running_recv_time,
+ fast_dispatch_time - connection->recv_start_time);
+ if (connection->delay_state) {  // delay_state present: queue through it, with a random delay_period chosen per ms_inject_delay_* settings
+ double delay_period = 0;
+ if (rand() % 10000 < cct->_conf->ms_inject_delay_probability * 10000.0) {
+ delay_period =
+ cct->_conf->ms_inject_delay_max * (double)(rand() % 10000) / 10000.0;
+ ldout(cct, 1) << "queue_received will delay after "
+ << (ceph_clock_now() + delay_period) << " on " << message
+ << " " << *message << dendl;
+ }
+ connection->delay_state->queue(delay_period, message);
+ } else if (messenger->ms_can_fast_dispatch(message)) {
+ connection->lock.unlock();  // connection->lock is released around fast_dispatch()
+ connection->dispatch_queue->fast_dispatch(message);
+ connection->recv_start_time = ceph::mono_clock::now();
+ connection->logger->tinc(l_msgr_running_fast_dispatch_time,
+ connection->recv_start_time - fast_dispatch_time);
+ connection->lock.lock();
+ } else {
+ connection->dispatch_queue->enqueue(message, message->get_priority(),
+ connection->conn_id);
+ }
+
+ handle_message_ack(current_header.ack_seq);  // process the ack_seq carried in this message's header
+
+ // we might have been reused by another connection
+ // let's check if that is the case
+ if (state != READY) {
+ // yes, that was the case, let's do nothing
+ return nullptr;
+ }
+
+ if (need_dispatch_writer && connection->is_connected()) {
+ connection->center->dispatch_event_external(connection->write_handler);
+ }
+
+ return CONTINUE(read_frame);
+}
+
+
+// Take one unit from the policy's message throttler, if the policy has one.
+//
+// When the throttler refuses, a time event (1000, wakeup_handler) is created
+// unless register_time_events is already non-empty, and nullptr is returned
+// so no continuation runs now.  Otherwise state becomes THROTTLE_BYTES and
+// processing continues with throttle_bytes.
+CtPtr ProtocolV2::throttle_message() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  auto &msg_throttler = connection->policy.throttler_messages;
+  if (msg_throttler) {
+    ldout(cct, 10) << __func__ << " wants " << 1
+                   << " message from policy throttler "
+                   << msg_throttler->get_current() << "/"
+                   << msg_throttler->get_max() << dendl;
+    const bool acquired = msg_throttler->get_or_fail();
+    if (!acquired) {
+      ldout(cct, 10) << __func__ << " wants 1 message from policy throttle "
+                     << msg_throttler->get_current() << "/"
+                     << msg_throttler->get_max()
+                     << " failed, just wait." << dendl;
+      // the thread pool that drains a full message queue does not finish
+      // quickly, so it is fine to wait a little before retrying.
+      auto &pending_timers = connection->register_time_events;
+      if (pending_timers.empty()) {
+        pending_timers.insert(connection->center->create_time_event(
+            1000, connection->wakeup_handler));
+      }
+      return nullptr;
+    }
+  }
+
+  state = THROTTLE_BYTES;
+  return CONTINUE(throttle_bytes);
+}
+
+// Reserve get_current_msg_size() bytes from the policy's byte throttler.
+//
+// Nothing is reserved when the size is zero or the policy has no byte
+// throttler.  When the throttler refuses, a time event (1000, wakeup_handler)
+// is created unless register_time_events is already non-empty, and nullptr
+// is returned.  Otherwise state becomes THROTTLE_DISPATCH_QUEUE and
+// processing continues with throttle_dispatch_queue.
+CtPtr ProtocolV2::throttle_bytes() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const size_t cur_msg_size = get_current_msg_size();
+  auto &byte_throttler = connection->policy.throttler_bytes;
+  if (cur_msg_size && byte_throttler) {
+    ldout(cct, 10) << __func__ << " wants " << cur_msg_size
+                   << " bytes from policy throttler "
+                   << byte_throttler->get_current() << "/"
+                   << byte_throttler->get_max() << dendl;
+    const bool acquired = byte_throttler->get_or_fail(cur_msg_size);
+    if (!acquired) {
+      ldout(cct, 10) << __func__ << " wants " << cur_msg_size
+                     << " bytes from policy throttler "
+                     << byte_throttler->get_current()
+                     << "/" << byte_throttler->get_max()
+                     << " failed, just wait." << dendl;
+      // the thread pool that drains a full message queue does not finish
+      // quickly, so it is fine to wait a little before retrying.
+      auto &pending_timers = connection->register_time_events;
+      if (pending_timers.empty()) {
+        pending_timers.insert(connection->center->create_time_event(
+            1000, connection->wakeup_handler));
+      }
+      return nullptr;
+    }
+  }
+
+  state = THROTTLE_DISPATCH_QUEUE;
+  return CONTINUE(throttle_dispatch_queue);
+}
+
+// Reserve get_current_msg_size() bytes from the dispatch queue's throttler.
+//
+// When the size is non-zero and the throttler refuses, a time event
+// (1000, wakeup_handler) is created unless register_time_events is already
+// non-empty, and nullptr is returned.  Otherwise throttle_stamp is recorded,
+// state becomes THROTTLE_DONE and the frame segment is read.
+CtPtr ProtocolV2::throttle_dispatch_queue() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const size_t cur_msg_size = get_current_msg_size();
+  auto &dq_throttler = connection->dispatch_queue->dispatch_throttler;
+  if (cur_msg_size && !dq_throttler.get_or_fail(cur_msg_size)) {
+    ldout(cct, 10)
+        << __func__ << " wants " << cur_msg_size
+        << " bytes from dispatch throttle "
+        << dq_throttler.get_current() << "/"
+        << dq_throttler.get_max()
+        << " failed, just wait." << dendl;
+    // the thread pool that drains a full message queue does not finish
+    // quickly, so it is fine to wait a little before retrying.
+    auto &pending_timers = connection->register_time_events;
+    if (pending_timers.empty()) {
+      pending_timers.insert(connection->center->create_time_event(
+          1000, connection->wakeup_handler));
+    }
+    return nullptr;
+  }
+
+  throttle_stamp = ceph_clock_now();
+  state = THROTTLE_DONE;
+
+  return read_frame_segment();
+}
+
+// Handle a KEEPALIVE2 frame: queue a KeepAliveFrameAck echoing the received
+// timestamp, record the time of this keepalive and wake the write handler.
+//
+// payload: the encoded KeepAliveFrame.
+//
+// Faults the connection when not in READY state or when the ack frame cannot
+// be appended; otherwise continues with read_frame.
+CtPtr ProtocolV2::handle_keepalive2(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != READY) {
+    lderr(cct) << __func__ << " not in ready state!" << dendl;
+    return _fault();
+  }
+
+  auto keepalive_frame = KeepAliveFrame::Decode(payload);
+
+  ldout(cct, 30) << __func__ << " got KEEPALIVE2 tag ..." << dendl;
+
+  bool ack_queued;
+  {
+    // scoped guard: write_lock is released on every exit from this block,
+    // and strictly before _fault() is called, as in the manual version.
+    std::lock_guard<std::mutex> wl(connection->write_lock);
+    auto keepalive_ack_frame =
+        KeepAliveFrameAck::Encode(keepalive_frame.timestamp());
+    ack_queued = append_frame(keepalive_ack_frame);
+  }
+  if (!ack_queued) {
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " got KEEPALIVE2 "
+                 << keepalive_frame.timestamp() << dendl;
+  connection->set_last_keepalive(ceph_clock_now());
+
+  if (is_connected()) {
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+
+  return CONTINUE(read_frame);
+}
+
+// Handle a keepalive ack frame: store the timestamp it carries via
+// set_last_keepalive_ack() and go on reading frames.  Faults the connection
+// when not in READY state.
+CtPtr ProtocolV2::handle_keepalive2_ack(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  const bool is_ready = (state == READY);
+  if (!is_ready) {
+    lderr(cct) << __func__ << " not in ready state!" << dendl;
+    return _fault();
+  }
+
+  auto ack_frame = KeepAliveFrameAck::Decode(payload);
+  connection->set_last_keepalive_ack(ack_frame.timestamp());
+  ldout(cct, 20) << __func__ << " got KEEPALIVE_ACK" << dendl;
+
+  return CONTINUE(read_frame);
+}
+
+// Handle an ACK frame: decode it and pass its sequence number to the
+// seq-based handle_message_ack() overload, then go on reading frames.
+// Faults the connection when not in READY state.
+CtPtr ProtocolV2::handle_message_ack(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  const bool is_ready = (state == READY);
+  if (!is_ready) {
+    lderr(cct) << __func__ << " not in ready state!" << dendl;
+    return _fault();
+  }
+
+  auto ack_frame = AckFrame::Decode(payload);
+  handle_message_ack(ack_frame.seq());
+  return CONTINUE(read_frame);
+}
+
+/* Client Protocol Methods */
+
+// Client entry point: fetch a fresh global_seq from the messenger, enter
+// BANNER_CONNECTING and run the banner exchange, continuing with
+// post_client_banner_exchange when it completes.
+CtPtr ProtocolV2::start_client_banner_exchange() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  INTERCEPT(1);
+
+  global_seq = messenger->get_global_seq();
+  state = BANNER_CONNECTING;
+
+  return _banner_exchange(CONTINUATION(post_client_banner_exchange));
+}
+
+// Runs after the client banner exchange: switch to AUTH_CONNECTING and send
+// the first auth request.
+CtPtr ProtocolV2::post_client_banner_exchange() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  // the state must be updated first: send_auth_request() checks for it
+  state = AUTH_CONNECTING;
+  return send_auth_request();
+}
+
+// Ask the messenger's auth client for an authentication request and send it
+// to the peer as an AuthRequest frame.
+//
+// allowed_methods: not consulted by this function body.
+//
+// connection->lock is released around the auth_client call, so state is
+// re-checked afterwards and the connection is faulted if it is no longer
+// AUTH_CONNECTING.  If the auth client returns an error the protocol is
+// stopped, a reset is queued on the dispatch queue and nullptr is returned.
+CtPtr ProtocolV2::send_auth_request(std::vector<uint32_t> &allowed_methods) {
+  ldout(cct, 20) << __func__ << " peer_type " << (int)connection->peer_type
+                 << " auth_client " << messenger->auth_client << dendl;
+  ceph_assert(messenger->auth_client);
+
+  bufferlist bl;
+  std::vector<uint32_t> preferred_modes;
+  // local copy of auth_meta, used while connection->lock is not held
+  auto am = auth_meta;
+  connection->lock.unlock();
+  int r = messenger->auth_client->get_auth_request(
+    connection, am.get(),
+    &am->auth_method, &preferred_modes, &bl);
+  connection->lock.lock();
+  if (state != AUTH_CONNECTING) {
+    ldout(cct, 1) << __func__ << " state changed!" << dendl;
+    return _fault();
+  }
+  if (r < 0) {
+    // name the function that was actually called
+    ldout(cct, 0) << __func__ << " get_auth_request returned " << r
+                  << dendl;
+    stop();
+    connection->dispatch_queue->queue_reset(connection);
+    return nullptr;
+  }
+
+  INTERCEPT(9);
+
+  auto frame = AuthRequestFrame::Encode(auth_meta->auth_method, preferred_modes,
+                                        bl);
+  return WRITE(frame, "auth request", read_frame);
+}
+
+CtPtr ProtocolV2::handle_auth_bad_method(ceph::bufferlist &payload) {  // Handle an AuthBadMethod frame: let auth_client react to it, then send a new auth request.
+ ldout(cct, 20) << __func__
+ << " payload.length()=" << payload.length() << dendl;
+
+ if (state != AUTH_CONNECTING) {
+ lderr(cct) << __func__ << " not in auth connect state!" << dendl;
+ return _fault();
+ }
+
+ auto bad_method = AuthBadMethodFrame::Decode(payload);
+ ldout(cct, 1) << __func__ << " method=" << bad_method.method()
+ << " result " << cpp_strerror(bad_method.result())
+ << ", allowed methods=" << bad_method.allowed_methods()
+ << ", allowed modes=" << bad_method.allowed_modes()
+ << dendl;
+ ceph_assert(messenger->auth_client);
+ auto am = auth_meta;  // local copy of auth_meta, used while connection->lock is not held
+ connection->lock.unlock();  // auth_client is called without connection->lock held
+ int r = messenger->auth_client->handle_auth_bad_method(
+ connection,
+ am.get(),
+ bad_method.method(), bad_method.result(),
+ bad_method.allowed_methods(),
+ bad_method.allowed_modes());
+ connection->lock.lock();
+ if (state != AUTH_CONNECTING || r < 0) {  // state is re-checked because the lock was released above
+ return _fault();
+ }
+ return send_auth_request(bad_method.allowed_methods());
+}
+
+CtPtr ProtocolV2::handle_auth_reply_more(ceph::bufferlist &payload)  // Handle an AuthReplyMore frame: pass its payload to auth_client and send the reply as AuthRequestMore.
+{
+ ldout(cct, 20) << __func__
+ << " payload.length()=" << payload.length() << dendl;
+
+ if (state != AUTH_CONNECTING) {
+ lderr(cct) << __func__ << " not in auth connect state!" << dendl;
+ return _fault();
+ }
+
+ auto auth_more = AuthReplyMoreFrame::Decode(payload);
+ ldout(cct, 5) << __func__
+ << " auth reply more len=" << auth_more.auth_payload().length()
+ << dendl;
+ ceph_assert(messenger->auth_client);
+ ceph::bufferlist reply;
+ auto am = auth_meta;  // local copy of auth_meta, used while connection->lock is not held
+ connection->lock.unlock();  // auth_client is called without connection->lock held
+ int r = messenger->auth_client->handle_auth_reply_more(
+ connection, am.get(), auth_more.auth_payload(), &reply);
+ connection->lock.lock();
+ if (state != AUTH_CONNECTING) {  // state is re-checked because the lock was released above
+ ldout(cct, 1) << __func__ << " state changed!" << dendl;
+ return _fault();
+ }
+ if (r < 0) {
+ lderr(cct) << __func__ << " auth_client handle_auth_reply_more returned "
+ << r << dendl;
+ return _fault();
+ }
+ auto more_reply = AuthRequestMoreFrame::Encode(reply);
+ return WRITE(more_reply, "auth request more", read_frame);
+}
+
+CtPtr ProtocolV2::handle_auth_done(ceph::bufferlist &payload)  // Handle an AuthDone frame: finish auth via auth_client, create the stream handlers and send our pre-auth signature.
+{
+ ldout(cct, 20) << __func__
+ << " payload.length()=" << payload.length() << dendl;
+
+ if (state != AUTH_CONNECTING) {
+ lderr(cct) << __func__ << " not in auth connect state!" << dendl;
+ return _fault();
+ }
+
+ auto auth_done = AuthDoneFrame::Decode(payload);
+
+ ceph_assert(messenger->auth_client);
+ auto am = auth_meta;  // local copy of auth_meta, used while connection->lock is not held
+ connection->lock.unlock();  // auth_client is called without connection->lock held
+ int r = messenger->auth_client->handle_auth_done(
+ connection,
+ am.get(),
+ auth_done.global_id(),
+ auth_done.con_mode(),
+ auth_done.auth_payload(),
+ &am->session_key,
+ &am->connection_secret);
+ connection->lock.lock();
+ if (state != AUTH_CONNECTING) {  // state is re-checked because the lock was released above
+ ldout(cct, 1) << __func__ << " state changed!" << dendl;
+ return _fault();
+ }
+ if (r < 0) {
+ return _fault();
+ }
+ auth_meta->con_mode = auth_done.con_mode();
+ bool is_rev1 = HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1);
+ session_stream_handlers = ceph::crypto::onwire::rxtx_t::create_handler_pair(
+ cct, *auth_meta, /*new_nonce_format=*/is_rev1, /*crossed=*/false);
+
+ state = AUTH_CONNECTING_SIGN;
+
+ const auto sig = auth_meta->session_key.empty() ? sha256_digest_t() :  // default digest without a session key, else HMAC-SHA256 over pre_auth.rxbuf
+ auth_meta->session_key.hmac_sha256(cct, pre_auth.rxbuf);
+ auto sig_frame = AuthSignatureFrame::Encode(sig);
+ pre_auth.enabled = false;  // pre-auth recording is switched off and its rx buffer dropped
+ pre_auth.rxbuf.clear();
+ return WRITE(sig_frame, "auth signature", read_frame);
+}
+
+// Client side, after authentication: with a non-zero server_cookie try to
+// reconnect to the previous session, otherwise start a new session by
+// sending our client ident.
+CtPtr ProtocolV2::finish_client_auth() {
+  if (server_cookie) {
+    // reconnecting to previous session
+    state = SESSION_RECONNECTING;
+    ceph_assert(connect_seq > 0);
+    return send_reconnect();
+  }
+
+  ceph_assert(connect_seq == 0);
+  state = SESSION_CONNECTING;
+  return send_client_ident();
+}
+
+// Build and send the ClientIdent frame that opens a new session.
+//
+// For a non-lossy policy a random non-zero client_cookie is generated if
+// none is set yet; for a lossy policy CEPH_MSG_CONNECT_LOSSY is put in the
+// flags.  The reply is awaited via read_frame.
+CtPtr ProtocolV2::send_client_ident() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const bool lossy = connection->policy.lossy;
+  if (!lossy && !client_cookie) {
+    client_cookie = ceph::util::generate_random_number<uint64_t>(1, -1ll);
+  }
+
+  uint64_t flags = 0;
+  if (lossy) {
+    flags |= CEPH_MSG_CONNECT_LOSSY;
+  }
+
+  auto ident_frame = ClientIdentFrame::Encode(
+      messenger->get_myaddrs(),
+      connection->target_addr,
+      messenger->get_myname().num(),
+      global_seq,
+      connection->policy.features_supported,
+      connection->policy.features_required | msgr2_required,
+      flags,
+      client_cookie);
+
+  ldout(cct, 5) << __func__ << " sending identification: "
+                << "addrs=" << messenger->get_myaddrs()
+                << " target=" << connection->target_addr
+                << " gid=" << messenger->get_myname().num()
+                << " global_seq=" << global_seq
+                << " features_supported=" << std::hex
+                << connection->policy.features_supported
+                << " features_required="
+                << (connection->policy.features_required | msgr2_required)
+                << " flags=" << flags
+                << " cookie=" << client_cookie << std::dec << dendl;
+
+  INTERCEPT(11);
+
+  return WRITE(ident_frame, "client ident", read_frame);
+}
+
+// Build and send a Reconnect frame carrying our addresses, both session
+// cookies, global_seq, connect_seq and the last received message seq
+// (in_seq).  The reply is awaited via read_frame.
+CtPtr ProtocolV2::send_reconnect() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  auto reconnect_frame = ReconnectFrame::Encode(
+      messenger->get_myaddrs(),
+      client_cookie,
+      server_cookie,
+      global_seq,
+      connect_seq,
+      in_seq);
+
+  ldout(cct, 5) << __func__ << " reconnect to session: client_cookie="
+                << std::hex << client_cookie << " server_cookie="
+                << server_cookie << std::dec
+                << " gs=" << global_seq << " cs=" << connect_seq
+                << " ms=" << in_seq << dendl;
+
+  INTERCEPT(13);
+
+  return WRITE(reconnect_frame, "reconnect", read_frame);
+}
+
+// Handle an IdentMissingFeatures frame: log the feature bits reported in the
+// frame and fault the connection.  Also faults (with a different message)
+// when not in SESSION_CONNECTING state.
+CtPtr ProtocolV2::handle_ident_missing_features(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  const bool connecting = (state == SESSION_CONNECTING);
+  if (!connecting) {
+    lderr(cct) << __func__ << " not in session connect state!" << dendl;
+    return _fault();
+  }
+
+  auto missing_frame = IdentMissingFeaturesFrame::Decode(payload);
+  lderr(cct) << __func__
+             << " client does not support all server features: " << std::hex
+             << missing_frame.features() << std::dec << dendl;
+
+  return _fault();
+}
+
+// Handle a session Reset frame received while reconnecting.
+//
+// A full reset calls reset_session(); a partial one only zeroes
+// server_cookie, connect_seq and in_seq.  Either way a new session is then
+// started by sending our client ident.
+CtPtr ProtocolV2::handle_session_reset(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  const bool reconnecting = (state == SESSION_RECONNECTING);
+  if (!reconnecting) {
+    lderr(cct) << __func__ << " not in session reconnect state!" << dendl;
+    return _fault();
+  }
+
+  auto reset_frame = ResetFrame::Decode(payload);
+
+  ldout(cct, 1) << __func__ << " received session reset full="
+                << reset_frame.full() << dendl;
+  if (!reset_frame.full()) {
+    server_cookie = 0;
+    connect_seq = 0;
+    in_seq = 0;
+  } else {
+    reset_session();
+  }
+
+  state = SESSION_CONNECTING;
+  return send_client_ident();
+}
+
+// Handle a Retry frame received while reconnecting: set connect_seq to one
+// more than the value in the frame and send the reconnect again.
+CtPtr ProtocolV2::handle_session_retry(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  const bool reconnecting = (state == SESSION_RECONNECTING);
+  if (!reconnecting) {
+    lderr(cct) << __func__ << " not in session reconnect state!" << dendl;
+    return _fault();
+  }
+
+  auto retry_frame = RetryFrame::Decode(payload);
+  connect_seq = retry_frame.connect_seq() + 1;
+
+  ldout(cct, 1) << __func__
+                << " received session retry connect_seq="
+                << retry_frame.connect_seq()
+                << ", inc to cs=" << connect_seq << dendl;
+
+  return send_reconnect();
+}
+
+// Handle a RetryGlobal frame received while reconnecting: obtain a new
+// global_seq from the messenger, passing it the value from the frame, and
+// send the reconnect again.
+CtPtr ProtocolV2::handle_session_retry_global(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  const bool reconnecting = (state == SESSION_RECONNECTING);
+  if (!reconnecting) {
+    lderr(cct) << __func__ << " not in session reconnect state!" << dendl;
+    return _fault();
+  }
+
+  auto retry_frame = RetryGlobalFrame::Decode(payload);
+  global_seq = messenger->get_global_seq(retry_frame.global_seq());
+
+  ldout(cct, 1) << __func__ << " received session retry global global_seq="
+                << retry_frame.global_seq()
+                << ", choose new gs=" << global_seq
+                << dendl;
+
+  return send_reconnect();
+}
+
+// Handle a Wait frame received during session connect or reconnect: move to
+// WAIT state, decode the frame and fault the connection.  Also faults (with
+// an error log) when the frame arrives in any other state.
+CtPtr ProtocolV2::handle_wait(ceph::bufferlist &payload) {
+  ldout(cct, 20) << __func__
+                 << " received WAIT (connection race)"
+                 << " payload.length()=" << payload.length()
+                 << dendl;
+
+  const bool session_pending =
+      state == SESSION_CONNECTING || state == SESSION_RECONNECTING;
+  if (!session_pending) {
+    lderr(cct) << __func__ << " not in session (re)connect state!" << dendl;
+    return _fault();
+  }
+
+  state = WAIT;
+  WaitFrame::Decode(payload);
+  return _fault();
+}
+
+// Handle a ReconnectOk frame: the peer accepted our reconnect.
+//
+// out_seq is updated from discard_requeued_up_to() using the message seq in
+// the frame, the backoff is cleared, the connect event is delivered to the
+// dispatch queue and the messenger, and the connection becomes ready.
+CtPtr ProtocolV2::handle_reconnect_ok(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  const bool reconnecting = (state == SESSION_RECONNECTING);
+  if (!reconnecting) {
+    lderr(cct) << __func__ << " not in session reconnect state!" << dendl;
+    return _fault();
+  }
+
+  auto ok_frame = ReconnectOkFrame::Decode(payload);
+  ldout(cct, 5) << __func__
+                << " reconnect accepted: sms=" << ok_frame.msg_seq()
+                << dendl;
+
+  out_seq = discard_requeued_up_to(out_seq, ok_frame.msg_seq());
+
+  backoff = utime_t();
+  ldout(cct, 10) << __func__ << " reconnect success " << connect_seq
+                 << ", lossy = " << connection->policy.lossy << ", features "
+                 << connection->get_features() << dendl;
+
+  if (connection->delay_state) {
+    ceph_assert(connection->delay_state->ready());
+  }
+
+  connection->dispatch_queue->queue_connect(connection);
+  messenger->ms_deliver_handle_fast_connect(connection);
+
+  return ready();
+}
+
+// Handle a ServerIdent frame: the peer accepted our new session.
+//
+// Faults when the addresses the peer identifies with do not contain our
+// target_addr.  Otherwise the server cookie, peer addresses, peer name,
+// negotiated features, peer global_seq and lossy flag are recorded, the
+// connect event is delivered and the connection becomes ready.
+CtPtr ProtocolV2::handle_server_ident(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  const bool connecting = (state == SESSION_CONNECTING);
+  if (!connecting) {
+    lderr(cct) << __func__ << " not in session connect state!" << dendl;
+    return _fault();
+  }
+
+  auto ident = ServerIdentFrame::Decode(payload);
+  ldout(cct, 5) << __func__ << " received server identification:"
+                << " addrs=" << ident.addrs()
+                << " gid=" << ident.gid()
+                << " global_seq=" << ident.global_seq()
+                << " features_supported=" << std::hex
+                << ident.supported_features()
+                << " features_required=" << ident.required_features()
+                << " flags=" << ident.flags()
+                << " cookie=" << ident.cookie() << std::dec << dendl;
+
+  // is this who we intended to talk to?
+  // be a bit forgiving here, since we may be connecting based on addresses
+  // parsed out of mon_host or something.
+  const bool is_intended_peer = ident.addrs().contains(connection->target_addr);
+  if (!is_intended_peer) {
+    ldout(cct,1) << __func__ << " peer identifies as " << ident.addrs()
+                 << ", does not include " << connection->target_addr << dendl;
+    return _fault();
+  }
+
+  server_cookie = ident.cookie();
+
+  connection->set_peer_addrs(ident.addrs());
+  peer_name = entity_name_t(connection->get_peer_type(), ident.gid());
+  connection->set_features(ident.supported_features() &
+                           connection->policy.features_supported);
+  peer_global_seq = ident.global_seq();
+
+  connection->policy.lossy = ident.flags() & CEPH_MSG_CONNECT_LOSSY;
+
+  backoff = utime_t();
+  ldout(cct, 10) << __func__ << " connect success " << connect_seq
+                 << ", lossy = " << connection->policy.lossy << ", features "
+                 << connection->get_features() << dendl;
+
+  if (connection->delay_state) {
+    ceph_assert(connection->delay_state->ready());
+  }
+
+  connection->dispatch_queue->queue_connect(connection);
+  messenger->ms_deliver_handle_fast_connect(connection);
+
+  return ready();
+}
+
+/* Server Protocol Methods */
+
+// Server entry point: enter BANNER_ACCEPTING and run the banner exchange,
+// continuing with post_server_banner_exchange when it completes.
+CtPtr ProtocolV2::start_server_banner_exchange() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  INTERCEPT(2);
+
+  state = BANNER_ACCEPTING;
+  return _banner_exchange(CONTINUATION(post_server_banner_exchange));
+}
+
+// Runs after the server banner exchange: switch to AUTH_ACCEPTING and wait
+// for the next frame from the client.
+CtPtr ProtocolV2::post_server_banner_exchange() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  state = AUTH_ACCEPTING;
+  return CONTINUE(read_frame);
+}
+
+// Handle the initial AuthRequest frame from a client.
+//
+// Records the requested auth method, lets the auth server pick a connection
+// mode from the client's preferred modes, and replies with a bad-method
+// frame (-EOPNOTSUPP) when no mode could be picked.  Otherwise the auth
+// payload is passed on to _handle_auth_request().
+CtPtr ProtocolV2::handle_auth_request(ceph::bufferlist &payload) {
+  ldout(cct, 20) << __func__ << " payload.length()=" << payload.length()
+                 << dendl;
+
+  const bool accepting = (state == AUTH_ACCEPTING);
+  if (!accepting) {
+    lderr(cct) << __func__ << " not in auth accept state!" << dendl;
+    return _fault();
+  }
+
+  auto auth_req = AuthRequestFrame::Decode(payload);
+  ldout(cct, 10) << __func__ << " AuthRequest(method=" << auth_req.method()
+                 << ", preferred_modes=" << auth_req.preferred_modes()
+                 << ", payload_len=" << auth_req.auth_payload().length() << ")"
+                 << dendl;
+
+  auth_meta->auth_method = auth_req.method();
+  auth_meta->con_mode = messenger->auth_server->pick_con_mode(
+      connection->get_peer_type(), auth_meta->auth_method,
+      auth_req.preferred_modes());
+
+  if (auth_meta->con_mode != CEPH_CON_MODE_UNKNOWN) {
+    return _handle_auth_request(auth_req.auth_payload(), false);
+  }
+  return _auth_bad_method(-EOPNOTSUPP);
+}
+
+// Reply to the client with an AuthBadMethod frame.
+//
+// r: negative error code to report (asserted to be < 0).
+//
+// The frame carries the rejected method, r, and the methods and modes the
+// auth server reports as supported for this peer type.
+CtPtr ProtocolV2::_auth_bad_method(int r)
+{
+  ceph_assert(r < 0);
+
+  std::vector<uint32_t> methods;
+  std::vector<uint32_t> modes;
+  messenger->auth_server->get_supported_auth_methods(
+      connection->get_peer_type(), &methods, &modes);
+
+  ldout(cct, 1) << __func__ << " auth_method " << auth_meta->auth_method
+                << " r " << cpp_strerror(r)
+                << ", allowed_methods " << methods
+                << ", allowed_modes " << modes
+                << dendl;
+
+  auto bad_method_frame =
+      AuthBadMethodFrame::Encode(auth_meta->auth_method, r, methods, modes);
+  return WRITE(bad_method_frame, "bad auth method", read_frame);
+}
+
+// Pass an auth payload from the client to the auth server and act on the
+// result.
+//
+// auth_payload: payload from an AuthRequest or AuthRequestMore frame.
+// more:         forwarded to the auth server; false for the initial request,
+//               true for follow-up payloads.
+//
+// connection->lock is released around the auth server call, so state is
+// re-checked afterwards.  Result handling:
+//   r == 1      -> AUTH_ACCEPTING_SIGN, send AuthDone, then finish_auth
+//   r == 0      -> AUTH_ACCEPTING_MORE, send AuthReplyMore, read next frame
+//   r == -EBUSY -> fault the connection
+//   otherwise   -> reply with a bad-method frame carrying r
+CtPtr ProtocolV2::_handle_auth_request(bufferlist& auth_payload, bool more)
+{
+  if (!messenger->auth_server) {
+    return _fault();
+  }
+  bufferlist reply;
+  // local copy of auth_meta, used while connection->lock is not held
+  auto am = auth_meta;
+  connection->lock.unlock();
+  int r = messenger->auth_server->handle_auth_request(
+    connection, am.get(),
+    more, am->auth_method, auth_payload,
+    &reply);
+  connection->lock.lock();
+  if (state != AUTH_ACCEPTING && state != AUTH_ACCEPTING_MORE) {
+    ldout(cct, 1) << __func__
+                  << " state changed while accept, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED);
+    return _fault();
+  }
+  if (r == 1) {
+    INTERCEPT(10);
+    state = AUTH_ACCEPTING_SIGN;
+
+    auto auth_done = AuthDoneFrame::Encode(connection->peer_global_id,
+                                           auth_meta->con_mode,
+                                           reply);
+    return WRITE(auth_done, "auth done", finish_auth);
+  } else if (r == 0) {
+    state = AUTH_ACCEPTING_MORE;
+
+    // named reply_more so it does not shadow the `more` parameter
+    auto reply_more = AuthReplyMoreFrame::Encode(reply);
+    return WRITE(reply_more, "auth reply more", read_frame);
+  } else if (r == -EBUSY) {
+    // kick the client and maybe they'll come back later
+    return _fault();
+  } else {
+    return _auth_bad_method(r);
+  }
+}
+
+// Server side, after AuthDone has been written: create the stream handler
+// pair (crossed) and send our pre-auth signature.
+//
+// The signature is a default sha256_digest_t when there is no session key,
+// otherwise an HMAC-SHA256 over pre_auth.rxbuf.  Pre-auth recording is
+// disabled and its rx buffer cleared before the frame is written.
+CtPtr ProtocolV2::finish_auth()
+{
+  ceph_assert(auth_meta);
+  // TODO: having a possibility to check whether we're server or client could
+  // allow reusing finish_auth().
+  bool peer_has_rev1 = HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1);
+  session_stream_handlers = ceph::crypto::onwire::rxtx_t::create_handler_pair(
+      cct, *auth_meta, /*new_nonce_format=*/peer_has_rev1, /*crossed=*/true);
+
+  const auto signature = auth_meta->session_key.empty()
+      ? sha256_digest_t()
+      : auth_meta->session_key.hmac_sha256(cct, pre_auth.rxbuf);
+  auto signature_frame = AuthSignatureFrame::Encode(signature);
+
+  pre_auth.enabled = false;
+  pre_auth.rxbuf.clear();
+  return WRITE(signature_frame, "auth signature", read_frame);
+}
+
+// Handle an AuthRequestMore frame: pass its payload to
+// _handle_auth_request() with more=true.  Faults the connection when not in
+// AUTH_ACCEPTING_MORE state.
+CtPtr ProtocolV2::handle_auth_request_more(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  const bool expecting_more = (state == AUTH_ACCEPTING_MORE);
+  if (!expecting_more) {
+    lderr(cct) << __func__ << " not in auth accept more state!" << dendl;
+    return _fault();
+  }
+
+  auto more_frame = AuthRequestMoreFrame::Decode(payload);
+  return _handle_auth_request(more_frame.auth_payload(), true);
+}
+
+// Handle the peer's AuthSignature frame, on either side of the connection.
+//
+// The received signature is compared with one computed locally over
+// pre_auth.txbuf (a default digest when there is no session key).  A
+// mismatch faults the connection.  On success the server side moves to
+// SESSION_ACCEPTING and reads the next frame, while the client side goes on
+// with finish_client_auth().
+CtPtr ProtocolV2::handle_auth_signature(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  const bool awaiting_signature =
+      state == AUTH_ACCEPTING_SIGN || state == AUTH_CONNECTING_SIGN;
+  if (!awaiting_signature) {
+    lderr(cct) << __func__
+               << " pre-auth verification signature seen in wrong state!"
+               << dendl;
+    return _fault();
+  }
+
+  auto sig_frame = AuthSignatureFrame::Decode(payload);
+
+  const auto actual_tx_sig = auth_meta->session_key.empty()
+      ? sha256_digest_t()
+      : auth_meta->session_key.hmac_sha256(cct, pre_auth.txbuf);
+  if (sig_frame.signature() != actual_tx_sig) {
+    ldout(cct, 2) << __func__ << " pre-auth signature mismatch"
+                  << " actual_tx_sig=" << actual_tx_sig
+                  << " sig_frame.signature()=" << sig_frame.signature()
+                  << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " pre-auth signature success"
+                 << " sig_frame.signature()=" << sig_frame.signature()
+                 << dendl;
+  pre_auth.txbuf.clear();
+
+  if (state == AUTH_ACCEPTING_SIGN) {
+    // server had sent AuthDone and client responded with correct pre-auth
+    // signature. we can start accepting new sessions/reconnects.
+    state = SESSION_ACCEPTING;
+    return CONTINUE(read_frame);
+  } else if (state == AUTH_CONNECTING_SIGN) {
+    // this happened at client side
+    return finish_client_auth();
+  } else {
+    ceph_assert_always("state corruption" == nullptr);
+  }
+}
+
+CtPtr ProtocolV2::handle_client_ident(ceph::bufferlist &payload)  // Handle a ClientIdent frame: validate it, record the peer's identity and answer with server ident, missing features, or existing-connection handling.
+{
+ ldout(cct, 20) << __func__
+ << " payload.length()=" << payload.length() << dendl;
+
+ if (state != SESSION_ACCEPTING) {
+ lderr(cct) << __func__ << " not in session accept state!" << dendl;
+ return _fault();
+ }
+
+ auto client_ident = ClientIdentFrame::Decode(payload);
+
+ ldout(cct, 5) << __func__ << " received client identification:"
+ << " addrs=" << client_ident.addrs()
+ << " target=" << client_ident.target_addr()
+ << " gid=" << client_ident.gid()
+ << " global_seq=" << client_ident.global_seq()
+ << " features_supported=" << std::hex
+ << client_ident.supported_features()
+ << " features_required=" << client_ident.required_features()
+ << " flags=" << client_ident.flags()
+ << " cookie=" << client_ident.cookie() << std::dec << dendl;
+
+ if (client_ident.addrs().empty() ||
+ client_ident.addrs().front() == entity_addr_t()) {
+ ldout(cct,5) << __func__ << " oops, client_ident.addrs() is empty" << dendl;
+ return _fault(); // a v2 peer should never do this
+ }
+ if (!messenger->get_myaddrs().contains(client_ident.target_addr())) {  // the address the client wants to reach must be one of ours
+ ldout(cct,5) << __func__ << " peer is trying to reach "
+ << client_ident.target_addr()
+ << " which is not us (" << messenger->get_myaddrs() << ")"
+ << dendl;
+ return _fault();
+ }
+
+ connection->set_peer_addrs(client_ident.addrs());
+ connection->target_addr = connection->_infer_target_addr(client_ident.addrs());
+
+ peer_name = entity_name_t(connection->get_peer_type(), client_ident.gid());
+ connection->set_peer_id(client_ident.gid());
+
+ client_cookie = client_ident.cookie();
+
+ uint64_t feat_missing =  // required feature bits (policy plus msgr2_required) absent from the client's supported set
+ (connection->policy.features_required | msgr2_required) &
+ ~(uint64_t)client_ident.supported_features();
+ if (feat_missing) {
+ ldout(cct, 1) << __func__ << " peer missing required features " << std::hex
+ << feat_missing << std::dec << dendl;
+ auto ident_missing_features =
+ IdentMissingFeaturesFrame::Encode(feat_missing);
+
+ return WRITE(ident_missing_features, "ident missing features", read_frame);
+ }
+
+ connection_features =
+ client_ident.supported_features() & connection->policy.features_supported;
+
+ peer_global_seq = client_ident.global_seq();
+
+ // Looks good so far, let's check if there is already an existing connection
+ // to this peer.
+
+ connection->lock.unlock();  // connection->lock is released for the lookup, mark_down and inject_delay below
+ AsyncConnectionRef existing = messenger->lookup_conn(*connection->peer_addrs);
+
+ if (existing &&
+ existing->protocol->proto_type != 2) {  // an existing connection with another proto_type is marked down and ignored
+ ldout(cct,1) << __func__ << " existing " << existing << " proto "
+ << existing->protocol.get() << " version is "
+ << existing->protocol->proto_type << ", marking down" << dendl;
+ existing->mark_down();
+ existing = nullptr;
+ }
+
+ connection->inject_delay();
+
+ connection->lock.lock();
+ if (state != SESSION_ACCEPTING) {  // state is re-checked because the lock was released above
+ ldout(cct, 1) << __func__
+ << " state changed while accept, it must be mark_down"
+ << dendl;
+ ceph_assert(state == CLOSED);
+ return _fault();
+ }
+
+ if (existing) {
+ return handle_existing_connection(existing);
+ }
+
+ // if everything is OK reply with server identification
+ return send_server_ident();
+}
+
+// Accept-side handler for a RECONNECT frame.
+//
+// Decodes the frame, stores the peer's addresses and global_seq on this
+// connection and looks up a connection already known to the messenger for
+// the same peer addresses. Depending on what is found, the peer is told to
+// reset its session, to retry (with a newer global_seq or connect_seq) or to
+// wait; otherwise this connection's socket is handed to the existing
+// connection through reuse_connection().
+//
+// Returns whatever _fault(), WRITE() or reuse_connection() returns for the
+// path taken.
+CtPtr ProtocolV2::handle_reconnect(ceph::bufferlist &payload)
+{
+ ldout(cct, 20) << __func__
+ << " payload.length()=" << payload.length() << dendl;
+
+ // a reconnect frame is only acceptable while accepting a session
+ if (state != SESSION_ACCEPTING) {
+ lderr(cct) << __func__ << " not in session accept state!" << dendl;
+ return _fault();
+ }
+
+ auto reconnect = ReconnectFrame::Decode(payload);
+
+ ldout(cct, 5) << __func__
+ << " received reconnect:"
+ << " client_cookie=" << std::hex << reconnect.client_cookie()
+ << " server_cookie=" << reconnect.server_cookie() << std::dec
+ << " gs=" << reconnect.global_seq()
+ << " cs=" << reconnect.connect_seq()
+ << " ms=" << reconnect.msg_seq()
+ << dendl;
+
+ // Should we check if one of the ident.addrs match connection->target_addr
+ // as we do in ProtocolV1?
+ connection->set_peer_addrs(reconnect.addrs());
+ connection->target_addr = connection->_infer_target_addr(reconnect.addrs());
+ peer_global_seq = reconnect.global_seq();
+
+ // connection->lock is released around the lookup/mark_down below and
+ // re-acquired afterwards; the state is re-checked once it is held again
+ connection->lock.unlock();
+ AsyncConnectionRef existing = messenger->lookup_conn(*connection->peer_addrs);
+
+ // an existing connection that is not speaking protocol type 2 is marked
+ // down and then treated as if there were no existing connection
+ if (existing &&
+ existing->protocol->proto_type != 2) {
+ ldout(cct,1) << __func__ << " existing " << existing << " proto "
+ << existing->protocol.get() << " version is "
+ << existing->protocol->proto_type << ", marking down" << dendl;
+ existing->mark_down();
+ existing = nullptr;
+ }
+
+ connection->inject_delay();
+
+ connection->lock.lock();
+ if (state != SESSION_ACCEPTING) {
+ ldout(cct, 1) << __func__
+ << " state changed while accept, it must be mark_down"
+ << dendl;
+ ceph_assert(state == CLOSED);
+ return _fault();
+ }
+
+ if (!existing) {
+ // there is no existing connection therefore cannot reconnect to previous
+ // session
+ ldout(cct, 0) << __func__
+ << " no existing connection exists, reseting client" << dendl;
+ auto reset = ResetFrame::Encode(true);
+ return WRITE(reset, "session reset", read_frame);
+ }
+
+ // existing->lock is held for the rest of the function, i.e. for every
+ // check below and for the call to reuse_connection()
+ std::lock_guard<std::mutex> l(existing->lock);
+
+ ProtocolV2 *exproto = dynamic_cast<ProtocolV2 *>(existing->protocol.get());
+ if (!exproto) {
+ ldout(cct, 1) << __func__ << " existing=" << existing << dendl;
+ ceph_assert(false);
+ }
+
+ if (exproto->state == CLOSED) {
+ ldout(cct, 5) << __func__ << " existing " << existing
+ << " already closed. Reseting client" << dendl;
+ auto reset = ResetFrame::Encode(true);
+ return WRITE(reset, "session reset", read_frame);
+ }
+
+ // the existing connection is already in the middle of being replaced:
+ // answer with a global retry carrying the existing peer_global_seq
+ if (exproto->replacing) {
+ ldout(cct, 1) << __func__
+ << " existing racing replace happened while replacing."
+ << " existing=" << existing << dendl;
+ auto retry = RetryGlobalFrame::Encode(exproto->peer_global_seq);
+ return WRITE(retry, "session retry", read_frame);
+ }
+
+ if (exproto->client_cookie != reconnect.client_cookie()) {
+ ldout(cct, 1) << __func__ << " existing=" << existing
+ << " client cookie mismatch, I must have reseted:"
+ << " cc=" << std::hex << exproto->client_cookie
+ << " rcc=" << reconnect.client_cookie()
+ << ", reseting client." << std::dec
+ << dendl;
+ auto reset = ResetFrame::Encode(connection->policy.resetcheck);
+ return WRITE(reset, "session reset", read_frame);
+ } else if (exproto->server_cookie == 0) {
+ // this happens when:
+ // - a connects to b
+ // - a sends client_ident
+ // - b gets client_ident, sends server_ident and sets cookie X
+ // - connection fault
+ // - b reconnects to a with cookie X, connect_seq=1
+ // - a has cookie==0
+ ldout(cct, 1) << __func__ << " I was a client and didn't received the"
+ << " server_ident. Asking peer to resume session"
+ << " establishment" << dendl;
+ auto reset = ResetFrame::Encode(false);
+ return WRITE(reset, "session reset", read_frame);
+ }
+
+ // the peer's global_seq is older than the one recorded on the existing
+ // connection: send back the recorded value so it can retry
+ if (exproto->peer_global_seq > reconnect.global_seq()) {
+ ldout(cct, 5) << __func__
+ << " stale global_seq: sgs=" << exproto->peer_global_seq
+ << " cgs=" << reconnect.global_seq()
+ << ", ask client to retry global" << dendl;
+ auto retry = RetryGlobalFrame::Encode(exproto->peer_global_seq);
+
+ INTERCEPT(18);
+
+ return WRITE(retry, "session retry", read_frame);
+ }
+
+ // same idea for connect_seq
+ if (exproto->connect_seq > reconnect.connect_seq()) {
+ ldout(cct, 5) << __func__
+ << " stale connect_seq scs=" << exproto->connect_seq
+ << " ccs=" << reconnect.connect_seq()
+ << " , ask client to retry" << dendl;
+ auto retry = RetryFrame::Encode(exproto->connect_seq);
+ return WRITE(retry, "session retry", read_frame);
+ }
+
+ if (exproto->connect_seq == reconnect.connect_seq()) {
+ // reconnect race: both peers are sending reconnect messages
+ // tie-break: the existing connection wins only when the peer's msgr2
+ // address compares greater than ours and existing->policy.server is false
+ if (existing->peer_addrs->msgr2_addr() >
+ messenger->get_myaddrs().msgr2_addr() &&
+ !existing->policy.server) {
+ // the existing connection wins
+ ldout(cct, 1)
+ << __func__
+ << " reconnect race detected, this connection loses to existing="
+ << existing << dendl;
+
+ auto wait = WaitFrame::Encode();
+ return WRITE(wait, "wait", read_frame);
+ } else {
+ // this connection wins
+ ldout(cct, 1) << __func__
+ << " reconnect race detected, replacing existing="
+ << existing << " socket by this connection's socket"
+ << dendl;
+ }
+ }
+
+ ldout(cct, 1) << __func__ << " reconnect to existing=" << existing << dendl;
+
+ // read by reuse_connection(), which copies it to exproto->reconnecting
+ reconnecting = true;
+
+ // everything looks good
+ exproto->connect_seq = reconnect.connect_seq();
+ exproto->message_seq = reconnect.msg_seq();
+
+ return reuse_connection(existing, exproto);
+}
+
+// Decide what to do when a client_ident arrives for a peer that already has
+// a connection (`existing`) known to the messenger.
+//
+// With existing->lock held, the checks below run in order and the first one
+// that matches decides the outcome: reply with server_ident on this
+// connection, ask the peer to wait, stop this connection, or move this
+// connection's socket into `existing` via reuse_connection().
+//
+// Returns the result of send_server_ident(), WRITE() or reuse_connection(),
+// or nullptr when this connection is stopped as stale.
+CtPtr ProtocolV2::handle_existing_connection(AsyncConnectionRef existing) {
+ ldout(cct, 20) << __func__ << " existing=" << existing << dendl;
+
+ std::lock_guard<std::mutex> l(existing->lock);
+
+ ProtocolV2 *exproto = dynamic_cast<ProtocolV2 *>(existing->protocol.get());
+ if (!exproto) {
+ ldout(cct, 1) << __func__ << " existing=" << existing << dendl;
+ ceph_assert(false);
+ }
+
+ // a closed existing connection does not get in the way
+ if (exproto->state == CLOSED) {
+ ldout(cct, 1) << __func__ << " existing " << existing << " already closed."
+ << dendl;
+ return send_server_ident();
+ }
+
+ // existing is already being replaced: tell the peer to wait
+ if (exproto->replacing) {
+ ldout(cct, 1) << __func__
+ << " existing racing replace happened while replacing."
+ << " existing=" << existing << dendl;
+ auto wait = WaitFrame::Encode();
+ return WRITE(wait, "wait", read_frame);
+ }
+
+ // existing has seen a newer global_seq from this peer than the one this
+ // connection carries: this connection is stale, stop it and queue a reset
+ if (exproto->peer_global_seq > peer_global_seq) {
+ ldout(cct, 1) << __func__ << " this is a stale connection, peer_global_seq="
+ << peer_global_seq
+ << " existing->peer_global_seq=" << exproto->peer_global_seq
+ << ", stopping this connection." << dendl;
+ stop();
+ connection->dispatch_queue->queue_reset(connection);
+ return nullptr;
+ }
+
+ if (existing->policy.lossy) {
+ // existing connection can be thrown out in favor of this one
+ ldout(cct, 1)
+ << __func__ << " existing=" << existing
+ << " is a lossy channel. Stopping existing in favor of this connection"
+ << dendl;
+ existing->protocol->stop();
+ existing->dispatch_queue->queue_reset(existing.get());
+ return send_server_ident();
+ }
+
+ if (exproto->server_cookie && exproto->client_cookie &&
+ exproto->client_cookie != client_cookie) {
+ // Found previous session
+ // peer has reseted and we're going to reuse the existing connection
+ // by replacing the communication socket
+ ldout(cct, 1) << __func__ << " found previous session existing=" << existing
+ << ", peer must have reseted." << dendl;
+ // the old session state is only reset when the policy asks for it
+ if (connection->policy.resetcheck) {
+ exproto->reset_session();
+ }
+ return reuse_connection(existing, exproto);
+ }
+
+ if (exproto->client_cookie == client_cookie) {
+ // session establishment interrupted between client_ident and server_ident,
+ // continuing...
+ ldout(cct, 1) << __func__ << " found previous session existing=" << existing
+ << ", continuing session establishment." << dendl;
+ return reuse_connection(existing, exproto);
+ }
+
+ if (exproto->state == READY || exproto->state == STANDBY) {
+ ldout(cct, 1) << __func__ << " existing=" << existing
+ << " is READY/STANDBY, lets reuse it" << dendl;
+ return reuse_connection(existing, exproto);
+ }
+
+ // Looks like a connection race: server and client are both connecting to
+ // each other at the same time.
+ // tie-break: this connection wins when the peer's msgr2 address compares
+ // lower than ours, or when existing->policy.server is set
+ if (connection->peer_addrs->msgr2_addr() <
+ messenger->get_myaddrs().msgr2_addr() ||
+ existing->policy.server) {
+ // this connection wins
+ ldout(cct, 1) << __func__
+ << " connection race detected, replacing existing="
+ << existing << " socket by this connection's socket" << dendl;
+ return reuse_connection(existing, exproto);
+ } else {
+ // the existing connection wins
+ ldout(cct, 1)
+ << __func__
+ << " connection race detected, this connection loses to existing="
+ << existing << dendl;
+ // note: the branch condition only rules out "<"; this assert additionally
+ // requires the two addresses to differ
+ ceph_assert(connection->peer_addrs->msgr2_addr() >
+ messenger->get_myaddrs().msgr2_addr());
+
+ // make sure we follow through with opening the existing
+ // connection (if it isn't yet open) since we know the peer
+ // has something to send to us.
+ existing->send_keepalive();
+ auto wait = WaitFrame::Encode();
+ return WRITE(wait, "wait", read_frame);
+ }
+}
+
+// Hand this (newly accepted) connection's socket and negotiated session
+// parameters over to `existing`, then stop this connection.
+//
+// Steps, in order:
+//  1. under existing->write_lock: remove this connection's file events, copy
+//     the negotiated state into `exproto`, take this connection's socket and
+//     stream handlers, stop this connection and put `existing` into the
+//     NONE / replacing state;
+//  2. `deactivate_existing` is submitted to existing's current event center:
+//     it requeues sent messages, installs the stolen socket, worker and
+//     center on `existing`;
+//  3. `transfer_existing` runs in the event center `existing` now points at
+//     and resumes the handshake with send_server_ident() or
+//     send_reconnect_ok().
+//
+// The callers visible in this file invoke it with existing->lock held.
+// Always returns nullptr.
+CtPtr ProtocolV2::reuse_connection(AsyncConnectionRef existing,
+ ProtocolV2 *exproto) {
+ ldout(cct, 20) << __func__ << " existing=" << existing
+ << " reconnect=" << reconnecting << dendl;
+
+ connection->inject_delay();
+
+ std::lock_guard<std::mutex> l(existing->write_lock);
+
+ // stop receiving events for the socket that is about to be moved
+ connection->center->delete_file_event(connection->cs.fd(),
+ EVENT_READABLE | EVENT_WRITABLE);
+
+ if (existing->delay_state) {
+ existing->delay_state->flush();
+ ceph_assert(!connection->delay_state);
+ }
+ exproto->reset_recv_state();
+ exproto->pre_auth.enabled = false;
+
+ // on a fresh session (not a reconnect) the values negotiated on this
+ // connection replace the ones stored on the existing protocol object
+ if (!reconnecting) {
+ exproto->peer_supported_features = peer_supported_features;
+ exproto->tx_frame_asm.set_is_rev1(tx_frame_asm.get_is_rev1());
+ exproto->rx_frame_asm.set_is_rev1(rx_frame_asm.get_is_rev1());
+
+ exproto->client_cookie = client_cookie;
+ exproto->peer_name = peer_name;
+ exproto->connection_features = connection_features;
+ existing->set_features(connection_features);
+ }
+ exproto->peer_global_seq = peer_global_seq;
+
+ ceph_assert(connection->center->in_thread());
+ auto temp_cs = std::move(connection->cs);
+ EventCenter *new_center = connection->center;
+ Worker *new_worker = connection->worker;
+ // we can steal the session_stream_handlers under the assumption
+ // this happens in the event center's thread as there should be
+ // no user outside its boundaries (similarly to e.g. outgoing_bl).
+ auto temp_stream_handlers = std::move(session_stream_handlers);
+ exproto->auth_meta = auth_meta;
+
+ ldout(messenger->cct, 5) << __func__ << " stop myself to swap existing"
+ << dendl;
+
+ // avoid _stop shutdown replacing socket
+ // queue a reset on the new connection, which we're dumping for the old
+ stop();
+
+ connection->dispatch_queue->queue_reset(connection);
+
+ exproto->can_write = false;
+ exproto->write_in_progress = false;
+ exproto->reconnecting = reconnecting;
+ exproto->replacing = true;
+ existing->state_offset = 0;
+ // avoid previous thread modify event
+ exproto->state = NONE;
+ existing->state = AsyncConnection::STATE_NONE;
+ // Discard existing prefetch buffer in `recv_buf`
+ existing->recv_start = existing->recv_end = 0;
+ // there shouldn't exist any buffer
+ ceph_assert(connection->recv_start == connection->recv_end);
+
+ // std::bind is used so the moved-from socket travels with the callback and
+ // is passed to the lambda as its `cs` argument
+ auto deactivate_existing = std::bind(
+ [ existing,
+ new_worker,
+ new_center,
+ exproto,
+ temp_stream_handlers=std::move(temp_stream_handlers)
+ ](ConnectedSocket &cs) mutable {
+ // we need to delete time event in original thread
+ {
+ std::lock_guard<std::mutex> l(existing->lock);
+ existing->write_lock.lock();
+ exproto->requeue_sent();
+ // XXX: do we really need the locking for `outgoing_bl`? There is
+ // a comment just above its definition saying "lockfree, only used
+ // in own thread". I'm following lockfull schema just in the case.
+ // From performance point of view it should be fine - this happens
+ // far away from hot paths.
+ existing->outgoing_bl.clear();
+ existing->open_write = false;
+ exproto->session_stream_handlers = std::move(temp_stream_handlers);
+ existing->write_lock.unlock();
+ if (exproto->state == NONE) {
+ // still in the state set by reuse_connection(): install the new
+ // socket and move `existing` to the new worker / event center
+ existing->shutdown_socket();
+ existing->cs = std::move(cs);
+ existing->worker->references--;
+ new_worker->references++;
+ existing->logger = new_worker->get_perf_counter();
+ existing->worker = new_worker;
+ existing->center = new_center;
+ if (existing->delay_state)
+ existing->delay_state->set_center(new_center);
+ } else if (exproto->state == CLOSED) {
+ // existing was closed in the meantime: close the stolen socket from
+ // new_center (the center it was taken from) and give up
+ auto back_to_close = std::bind(
+ [](ConnectedSocket &cs) mutable { cs.close(); }, std::move(cs));
+ new_center->submit_to(new_center->get_id(),
+ std::move(back_to_close), true);
+ return;
+ } else {
+ ceph_abort();
+ }
+ }
+
+ // Before changing existing->center, it may already exists some
+ // events in existing->center's queue. Then if we mark down
+ // `existing`, it will execute in another thread and clean up
+ // connection. Previous event will result in segment fault
+ auto transfer_existing = [existing, exproto]() mutable {
+ std::lock_guard<std::mutex> l(existing->lock);
+ if (exproto->state == CLOSED) return;
+ ceph_assert(exproto->state == NONE);
+
+ exproto->state = SESSION_ACCEPTING;
+ // we have called shutdown_socket above
+ ceph_assert(existing->last_tick_id == 0);
+ // restart timer since we are going to re-build connection
+ existing->last_connect_started = ceph::coarse_mono_clock::now();
+ existing->last_tick_id = existing->center->create_time_event(
+ existing->connect_timeout_us, existing->tick_handler);
+ existing->state = AsyncConnection::STATE_CONNECTION_ESTABLISHED;
+ existing->center->create_file_event(existing->cs.fd(), EVENT_READABLE,
+ existing->read_handler);
+ // finish the handshake on the existing connection
+ if (!exproto->reconnecting) {
+ exproto->run_continuation(exproto->send_server_ident());
+ } else {
+ exproto->run_continuation(exproto->send_reconnect_ok());
+ }
+ };
+ // existing->center is now new_center; run inline when already on its
+ // thread, otherwise submit to it
+ if (existing->center->in_thread())
+ transfer_existing();
+ else
+ existing->center->submit_to(existing->center->get_id(),
+ std::move(transfer_existing), true);
+ },
+ std::move(temp_cs));
+
+ existing->center->submit_to(existing->center->get_id(),
+ std::move(deactivate_existing), true);
+ return nullptr;
+}
+
+// Build and send the server_ident frame that completes session acceptance.
+//
+// Resets the outgoing/incoming sequence numbers, picks a fresh random
+// server_cookie for non-lossy policies, then - with connection->lock
+// temporarily released - calls messenger->accept_conn(). If that call fails
+// or the state changed in the meantime the connection faults; otherwise the
+// accept is announced to the dispatch queue and the frame is written with
+// server_ready as the continuation.
+//
+// Returns the result of _fault() or WRITE().
+CtPtr ProtocolV2::send_server_ident() {
+ ldout(cct, 20) << __func__ << dendl;
+
+ // this is required for the case when this connection is being replaced
+ out_seq = discard_requeued_up_to(out_seq, 0);
+ in_seq = 0;
+
+ // server_cookie is left untouched for lossy policies
+ if (!connection->policy.lossy) {
+ server_cookie = ceph::util::generate_random_number<uint64_t>(1, -1ll);
+ }
+
+ uint64_t flags = 0;
+ if (connection->policy.lossy) {
+ flags = flags | CEPH_MSG_CONNECT_LOSSY;
+ }
+
+ uint64_t gs = messenger->get_global_seq();
+ auto server_ident = ServerIdentFrame::Encode(
+ messenger->get_myaddrs(),
+ messenger->get_myname().num(),
+ gs,
+ connection->policy.features_supported,
+ connection->policy.features_required | msgr2_required,
+ flags,
+ server_cookie);
+
+ ldout(cct, 5) << __func__ << " sending identification:"
+ << " addrs=" << messenger->get_myaddrs()
+ << " gid=" << messenger->get_myname().num()
+ << " global_seq=" << gs << " features_supported=" << std::hex
+ << connection->policy.features_supported
+ << " features_required="
+ << (connection->policy.features_required | msgr2_required)
+ << " flags=" << flags
+ << " cookie=" << server_cookie << std::dec << dendl;
+
+ connection->lock.unlock();
+ // Because "replacing" will prevent other connections preempt this addr,
+ // it's safe that here we don't acquire Connection's lock
+ ssize_t r = messenger->accept_conn(connection);
+
+ connection->inject_delay();
+
+ connection->lock.lock();
+
+ if (r < 0) {
+ ldout(cct, 1) << __func__ << " existing race replacing process for addr = "
+ << connection->peer_addrs->msgr2_addr()
+ << " just fail later one(this)" << dendl;
+ connection->inject_delay();
+ return _fault();
+ }
+ // the lock was dropped above, so the state may have changed; in that case
+ // undo the accept_conn() call before faulting
+ if (state != SESSION_ACCEPTING) {
+ ldout(cct, 1) << __func__
+ << " state changed while accept_conn, it must be mark_down"
+ << dendl;
+ ceph_assert(state == CLOSED || state == NONE);
+ messenger->unregister_conn(connection);
+ connection->inject_delay();
+ return _fault();
+ }
+
+ connection->set_features(connection_features);
+
+ // notify
+ connection->dispatch_queue->queue_accept(connection);
+ messenger->ms_deliver_handle_fast_accept(connection);
+
+ INTERCEPT(12);
+
+ return WRITE(server_ident, "server ident", server_ready);
+}
+
+// Continuation run once the server_ident / reconnect_ok frame has been
+// written: checks the optional delay queue and hands over to ready().
+CtPtr ProtocolV2::server_ready() {
+ ldout(cct, 20) << __func__ << dendl;
+
+ // a delay queue is optional; when one exists it must report ready() here
+ if (connection->delay_state)
+ ceph_assert(connection->delay_state->ready());
+
+ return ready();
+}
+
+// Build and send the reconnect_ok frame that completes a session reconnect.
+//
+// Discards requeued messages up to message_seq (the value taken from the
+// peer's reconnect frame), reports our in_seq to the peer and - with
+// connection->lock temporarily released - calls messenger->accept_conn().
+// If that call fails or the state changed in the meantime the connection
+// faults; otherwise the accept is announced to the dispatch queue and the
+// frame is written with server_ready as the continuation.
+//
+// Returns the result of _fault() or WRITE().
+CtPtr ProtocolV2::send_reconnect_ok() {
+ ldout(cct, 20) << __func__ << dendl;
+
+ out_seq = discard_requeued_up_to(out_seq, message_seq);
+
+ // the frame carries the current value of in_seq
+ uint64_t ms = in_seq;
+ auto reconnect_ok = ReconnectOkFrame::Encode(ms);
+
+ ldout(cct, 5) << __func__ << " sending reconnect_ok: msg_seq=" << ms << dendl;
+
+ connection->lock.unlock();
+ // Because "replacing" will prevent other connections preempt this addr,
+ // it's safe that here we don't acquire Connection's lock
+ ssize_t r = messenger->accept_conn(connection);
+
+ connection->inject_delay();
+
+ connection->lock.lock();
+
+ if (r < 0) {
+ ldout(cct, 1) << __func__ << " existing race replacing process for addr = "
+ << connection->peer_addrs->msgr2_addr()
+ << " just fail later one(this)" << dendl;
+ connection->inject_delay();
+ return _fault();
+ }
+ // the lock was dropped above, so the state may have changed; in that case
+ // undo the accept_conn() call before faulting
+ if (state != SESSION_ACCEPTING) {
+ ldout(cct, 1) << __func__
+ << " state changed while accept_conn, it must be mark_down"
+ << dendl;
+ ceph_assert(state == CLOSED || state == NONE);
+ messenger->unregister_conn(connection);
+ connection->inject_delay();
+ return _fault();
+ }
+
+ // notify
+ connection->dispatch_queue->queue_accept(connection);
+ messenger->ms_deliver_handle_fast_accept(connection);
+
+ INTERCEPT(14);
+
+ return WRITE(reconnect_ok, "reconnect ok", server_ready);
+}
diff --git a/src/msg/async/ProtocolV2.h b/src/msg/async/ProtocolV2.h
new file mode 100644
index 00000000..4941cea5
--- /dev/null
+++ b/src/msg/async/ProtocolV2.h
@@ -0,0 +1,259 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#ifndef _MSG_ASYNC_PROTOCOL_V2_
+#define _MSG_ASYNC_PROTOCOL_V2_
+
+#include "Protocol.h"
+#include "crypto_onwire.h"
+#include "frames_v2.h"
+
+// Protocol implementation selected for connections whose proto_type is 2.
+// It is written as a chain of continuations (Ct<ProtocolV2>): each step
+// returns the next step to run, or nullptr when there is nothing to run
+// right now.
+class ProtocolV2 : public Protocol {
+private:
+ // Connection state machine. The order of the enumerators must match the
+ // name table in get_state_name().
+ enum State {
+ NONE,
+ START_CONNECT,
+ BANNER_CONNECTING,
+ HELLO_CONNECTING,
+ AUTH_CONNECTING,
+ AUTH_CONNECTING_SIGN,
+ SESSION_CONNECTING,
+ SESSION_RECONNECTING,
+ START_ACCEPT,
+ BANNER_ACCEPTING,
+ HELLO_ACCEPTING,
+ AUTH_ACCEPTING,
+ AUTH_ACCEPTING_MORE,
+ AUTH_ACCEPTING_SIGN,
+ SESSION_ACCEPTING,
+ READY,
+ THROTTLE_MESSAGE,
+ THROTTLE_BYTES,
+ THROTTLE_DISPATCH_QUEUE,
+ THROTTLE_DONE,
+ READ_MESSAGE_COMPLETE,
+ STANDBY,
+ WAIT,
+ CLOSED
+ };
+
+ // Map a State value to its printable name.
+ // Returns "UNKNOWN" for values outside the enum range instead of reading
+ // past the end of the table.
+ static const char *get_state_name(int state) {
+ static const char *const statenames[] = {"NONE",
+ "START_CONNECT",
+ "BANNER_CONNECTING",
+ "HELLO_CONNECTING",
+ "AUTH_CONNECTING",
+ "AUTH_CONNECTING_SIGN",
+ "SESSION_CONNECTING",
+ "SESSION_RECONNECTING",
+ "START_ACCEPT",
+ "BANNER_ACCEPTING",
+ "HELLO_ACCEPTING",
+ "AUTH_ACCEPTING",
+ "AUTH_ACCEPTING_MORE",
+ "AUTH_ACCEPTING_SIGN",
+ "SESSION_ACCEPTING",
+ "READY",
+ "THROTTLE_MESSAGE",
+ "THROTTLE_BYTES",
+ "THROTTLE_DISPATCH_QUEUE",
+ "THROTTLE_DONE",
+ "READ_MESSAGE_COMPLETE",
+ "STANDBY",
+ "WAIT",
+ "CLOSED"};
+ constexpr int num_states =
+ static_cast<int>(sizeof(statenames) / sizeof(statenames[0]));
+ // catch a State enumerator added without a matching name at compile time
+ static_assert(num_states == CLOSED + 1,
+ "statenames must have exactly one entry per State enumerator");
+ if (state < 0 || state >= num_states) {
+ return "UNKNOWN";
+ }
+ return statenames[state];
+ }
+
+ // TODO: move into auth_meta?
+ ceph::crypto::onwire::rxtx_t session_stream_handlers;
+
+ entity_name_t peer_name;
+ State state;
+ uint64_t peer_supported_features; // CEPH_MSGR2_FEATURE_*
+
+ // session identity and sequencing
+ uint64_t client_cookie;
+ uint64_t server_cookie;
+ uint64_t global_seq;
+ uint64_t connect_seq;
+ uint64_t peer_global_seq;
+ uint64_t message_seq;
+ bool reconnecting;
+ bool replacing;
+ bool can_write;
+ struct out_queue_entry_t {
+ bool is_prepared {false};
+ Message* m {nullptr};
+ };
+ // outgoing messages, grouped under an int key
+ // NOTE(review): the key looks like a message priority - confirm in the .cc
+ std::map<int, std::list<out_queue_entry_t>> out_queue;
+ std::list<Message *> sent;
+ std::atomic<uint64_t> out_seq{0};
+ std::atomic<uint64_t> in_seq{0};
+ std::atomic<uint64_t> ack_left{0};
+
+ using ProtFuncPtr = void (ProtocolV2::*)();
+ Ct<ProtocolV2> *bannerExchangeCallback;
+
+ ceph::msgr::v2::FrameAssembler tx_frame_asm;
+ ceph::msgr::v2::FrameAssembler rx_frame_asm;
+
+ ceph::bufferlist rx_preamble;
+ ceph::bufferlist rx_epilogue;
+ ceph::msgr::v2::segment_bls_t rx_segments_data;
+ ceph::msgr::v2::Tag next_tag;
+ utime_t backoff; // backoff time
+ utime_t recv_stamp;
+ utime_t throttle_stamp;
+
+ // buffers and enable flag used before authentication completes
+ // (going by the member name; see the .cc for the exact use)
+ struct {
+ ceph::bufferlist rxbuf;
+ ceph::bufferlist txbuf;
+ bool enabled {true};
+ } pre_auth;
+
+ bool keepalive;
+ bool write_in_progress = false;
+
+ ostream &_conn_prefix(std::ostream *_dout);
+ void run_continuation(Ct<ProtocolV2> *pcontinuation);
+ void run_continuation(Ct<ProtocolV2> &continuation);
+
+ // low-level read/write steps; `next` is the continuation passed along
+ Ct<ProtocolV2> *read(CONTINUATION_RXBPTR_TYPE<ProtocolV2> &next,
+ rx_buffer_t&& buffer);
+ template <class F>
+ Ct<ProtocolV2> *write(const std::string &desc,
+ CONTINUATION_TYPE<ProtocolV2> &next,
+ F &frame);
+ Ct<ProtocolV2> *write(const std::string &desc,
+ CONTINUATION_TYPE<ProtocolV2> &next,
+ bufferlist &buffer);
+
+ template <class F>
+ bool append_frame(F& frame);
+
+ void requeue_sent();
+ uint64_t discard_requeued_up_to(uint64_t out_seq, uint64_t seq);
+ void reset_recv_state();
+ void reset_security();
+ void reset_throttle();
+ Ct<ProtocolV2> *_fault();
+ void discard_out_queue();
+ void reset_session();
+ void prepare_send_message(uint64_t features, Message *m);
+ out_queue_entry_t _get_next_outgoing();
+ ssize_t write_message(Message *m, bool more);
+ void handle_message_ack(uint64_t seq);
+
+ // banner exchange
+ CONTINUATION_DECL(ProtocolV2, _wait_for_peer_banner);
+ READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, _handle_peer_banner);
+ READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, _handle_peer_banner_payload);
+
+ Ct<ProtocolV2> *_banner_exchange(Ct<ProtocolV2> &callback);
+ Ct<ProtocolV2> *_wait_for_peer_banner();
+ Ct<ProtocolV2> *_handle_peer_banner(rx_buffer_t &&buffer, int r);
+ Ct<ProtocolV2> *_handle_peer_banner_payload(rx_buffer_t &&buffer, int r);
+ Ct<ProtocolV2> *handle_hello(ceph::bufferlist &payload);
+
+ // frame reading and message throttling
+ CONTINUATION_DECL(ProtocolV2, read_frame);
+ CONTINUATION_DECL(ProtocolV2, finish_auth);
+ READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, handle_read_frame_preamble_main);
+ READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, handle_read_frame_segment);
+ READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, handle_read_frame_epilogue_main);
+ CONTINUATION_DECL(ProtocolV2, throttle_message);
+ CONTINUATION_DECL(ProtocolV2, throttle_bytes);
+ CONTINUATION_DECL(ProtocolV2, throttle_dispatch_queue);
+
+ Ct<ProtocolV2> *read_frame();
+ Ct<ProtocolV2> *finish_auth();
+ Ct<ProtocolV2> *finish_client_auth();
+ Ct<ProtocolV2> *handle_read_frame_preamble_main(rx_buffer_t &&buffer, int r);
+ Ct<ProtocolV2> *read_frame_segment();
+ Ct<ProtocolV2> *handle_read_frame_segment(rx_buffer_t &&rx_buffer, int r);
+ Ct<ProtocolV2> *_handle_read_frame_segment();
+ Ct<ProtocolV2> *handle_read_frame_epilogue_main(rx_buffer_t &&buffer, int r);
+ Ct<ProtocolV2> *_handle_read_frame_epilogue_main();
+ Ct<ProtocolV2> *handle_read_frame_dispatch();
+ Ct<ProtocolV2> *handle_frame_payload();
+
+ Ct<ProtocolV2> *ready();
+
+ Ct<ProtocolV2> *handle_message();
+ Ct<ProtocolV2> *throttle_message();
+ Ct<ProtocolV2> *throttle_bytes();
+ Ct<ProtocolV2> *throttle_dispatch_queue();
+ Ct<ProtocolV2> *read_message_data_prepare();
+
+ Ct<ProtocolV2> *handle_keepalive2(ceph::bufferlist &payload);
+ Ct<ProtocolV2> *handle_keepalive2_ack(ceph::bufferlist &payload);
+
+ Ct<ProtocolV2> *handle_message_ack(ceph::bufferlist &payload);
+
+public:
+ uint64_t connection_features;
+
+ ProtocolV2(AsyncConnection *connection);
+ virtual ~ProtocolV2();
+
+ // Protocol interface
+ virtual void connect() override;
+ virtual void accept() override;
+ virtual bool is_connected() override;
+ virtual void stop() override;
+ virtual void fault() override;
+ virtual void send_message(Message *m) override;
+ virtual void send_keepalive() override;
+
+ virtual void read_event() override;
+ virtual void write_event() override;
+ virtual bool is_queued() override;
+
+private:
+ // Client Protocol
+ CONTINUATION_DECL(ProtocolV2, start_client_banner_exchange);
+ CONTINUATION_DECL(ProtocolV2, post_client_banner_exchange);
+
+ Ct<ProtocolV2> *start_client_banner_exchange();
+ Ct<ProtocolV2> *post_client_banner_exchange();
+ // convenience overload: forwards an empty vector of allowed methods
+ inline Ct<ProtocolV2> *send_auth_request() {
+ std::vector<uint32_t> empty;
+ return send_auth_request(empty);
+ }
+ Ct<ProtocolV2> *send_auth_request(std::vector<uint32_t> &allowed_methods);
+ Ct<ProtocolV2> *handle_auth_bad_method(ceph::bufferlist &payload);
+ Ct<ProtocolV2> *handle_auth_reply_more(ceph::bufferlist &payload);
+ Ct<ProtocolV2> *handle_auth_done(ceph::bufferlist &payload);
+ Ct<ProtocolV2> *handle_auth_signature(ceph::bufferlist &payload);
+ Ct<ProtocolV2> *send_client_ident();
+ Ct<ProtocolV2> *send_reconnect();
+ Ct<ProtocolV2> *handle_ident_missing_features(ceph::bufferlist &payload);
+ Ct<ProtocolV2> *handle_session_reset(ceph::bufferlist &payload);
+ Ct<ProtocolV2> *handle_session_retry(ceph::bufferlist &payload);
+ Ct<ProtocolV2> *handle_session_retry_global(ceph::bufferlist &payload);
+ Ct<ProtocolV2> *handle_wait(ceph::bufferlist &payload);
+ Ct<ProtocolV2> *handle_reconnect_ok(ceph::bufferlist &payload);
+ Ct<ProtocolV2> *handle_server_ident(ceph::bufferlist &payload);
+
+ // Server Protocol
+ CONTINUATION_DECL(ProtocolV2, start_server_banner_exchange);
+ CONTINUATION_DECL(ProtocolV2, post_server_banner_exchange);
+ CONTINUATION_DECL(ProtocolV2, server_ready);
+
+ Ct<ProtocolV2> *start_server_banner_exchange();
+ Ct<ProtocolV2> *post_server_banner_exchange();
+ Ct<ProtocolV2> *handle_auth_request(ceph::bufferlist &payload);
+ Ct<ProtocolV2> *handle_auth_request_more(ceph::bufferlist &payload);
+ Ct<ProtocolV2> *_handle_auth_request(bufferlist& auth_payload, bool more);
+ Ct<ProtocolV2> *_auth_bad_method(int r);
+ Ct<ProtocolV2> *handle_client_ident(ceph::bufferlist &payload);
+ Ct<ProtocolV2> *handle_ident_missing_features_write(int r);
+ Ct<ProtocolV2> *handle_reconnect(ceph::bufferlist &payload);
+ Ct<ProtocolV2> *handle_existing_connection(AsyncConnectionRef existing);
+ Ct<ProtocolV2> *reuse_connection(AsyncConnectionRef existing,
+ ProtocolV2 *exproto);
+ Ct<ProtocolV2> *send_server_ident();
+ Ct<ProtocolV2> *send_reconnect_ok();
+ Ct<ProtocolV2> *server_ready();
+
+ size_t get_current_msg_size() const;
+};
+
+#endif /* _MSG_ASYNC_PROTOCOL_V2_ */
diff --git a/src/msg/async/Stack.cc b/src/msg/async/Stack.cc
new file mode 100644
index 00000000..8976c3cc
--- /dev/null
+++ b/src/msg/async/Stack.cc
@@ -0,0 +1,217 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <mutex>
+
+#include "include/compat.h"
+#include "common/Cond.h"
+#include "common/errno.h"
+#include "PosixStack.h"
+#ifdef HAVE_RDMA
+#include "rdma/RDMAStack.h"
+#endif
+#ifdef HAVE_DPDK
+#include "dpdk/DPDKStack.h"
+#endif
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "stack "
+
+// Build the function that worker thread `i` will run.
+//
+// The returned callable names the thread "msgr-worker-<id>", claims
+// ownership of the worker's event center, initializes the worker and then
+// loops on process_events() until w->done is set; afterwards the worker is
+// reset and destroyed.
+//
+// @param i index into `workers`
+// @return the thread body; it captures `this` and the Worker pointer
+std::function<void ()> NetworkStack::add_thread(unsigned i)
+{
+ Worker *w = workers[i];
+ return [this, w]() {
+ char tp_name[16];
+ // bounded formatting: the name is truncated rather than overflowing
+ // tp_name if the id ever needs more digits than the buffer has room for
+ snprintf(tp_name, sizeof(tp_name), "msgr-worker-%u", w->id);
+ ceph_pthread_setname(pthread_self(), tp_name);
+ // upper bound passed to each process_events() call, in microseconds
+ const unsigned EventMaxWaitUs = 30000000;
+ w->center.set_owner();
+ ldout(cct, 10) << __func__ << " starting" << dendl;
+ w->initialize();
+ w->init_done();
+ while (!w->done) {
+ ldout(cct, 30) << __func__ << " calling event process" << dendl;
+
+ ceph::timespan dur;
+ int r = w->center.process_events(EventMaxWaitUs, &dur);
+ if (r < 0) {
+ ldout(cct, 20) << __func__ << " process events failed: "
+ << cpp_strerror(errno) << dendl;
+ // TODO do something?
+ }
+ w->perf_logger->tinc(l_msgr_running_total_time, dur);
+ }
+ w->reset();
+ w->destroy();
+ };
+}
+
+// Factory for the network stack matching transport type `t`.
+// "posix" is always available; "rdma" and "dpdk" only when compiled in.
+// Any other value is logged as an error and aborts the process.
+std::shared_ptr<NetworkStack> NetworkStack::create(CephContext *c, const string &t)
+{
+ if (t == "posix") {
+ return std::make_shared<PosixNetworkStack>(c, t);
+ }
+#ifdef HAVE_RDMA
+ if (t == "rdma") {
+ return std::make_shared<RDMAStack>(c, t);
+ }
+#endif
+#ifdef HAVE_DPDK
+ if (t == "dpdk") {
+ return std::make_shared<DPDKStack>(c, t);
+ }
+#endif
+
+ // no stack matched the requested type
+ lderr(c) << __func__ << " ms_async_transport_type " << t
+ << " is not supported! " << dendl;
+ ceph_abort();
+ return nullptr;
+}
+
+// Allocate the Worker flavour matching transport `type` for slot `i`.
+// Mirrors NetworkStack::create(): an unsupported type is logged as an error
+// and aborts the process.
+Worker* NetworkStack::create_worker(CephContext *c, const string &type, unsigned i)
+{
+ if (type == "posix") {
+ return new PosixWorker(c, i);
+ }
+#ifdef HAVE_RDMA
+ if (type == "rdma") {
+ return new RDMAWorker(c, i);
+ }
+#endif
+#ifdef HAVE_DPDK
+ if (type == "dpdk") {
+ return new DPDKWorker(c, i);
+ }
+#endif
+
+ // no worker implementation matched the requested type
+ lderr(c) << __func__ << " ms_async_transport_type " << type
+ << " is not supported! " << dendl;
+ ceph_abort();
+ return nullptr;
+}
+
+// Create the stack and its workers.
+// The worker count comes from ms_async_op_threads (asserted > 0) and is
+// capped at EventCenter::MAX_EVENTCENTER. Each worker's event center is
+// initialized here; threads are not started until start().
+NetworkStack::NetworkStack(CephContext *c, const string &t): type(t), started(false), cct(c)
+{
+ ceph_assert(cct->_conf->ms_async_op_threads > 0);
+
+ // event count handed to every EventCenter::init() call below
+ const int InitEventNumber = 5000;
+
+ num_workers = cct->_conf->ms_async_op_threads;
+ const bool at_or_over_limit = num_workers >= EventCenter::MAX_EVENTCENTER;
+ if (at_or_over_limit) {
+ ldout(cct, 0) << __func__ << " max thread limit is "
+ << EventCenter::MAX_EVENTCENTER << ", switching to this now. "
+ << "Higher thread values are unnecessary and currently unsupported."
+ << dendl;
+ num_workers = EventCenter::MAX_EVENTCENTER;
+ }
+
+ unsigned idx = 0;
+ while (idx < num_workers) {
+ Worker *worker = create_worker(cct, type, idx);
+ worker->center.init(InitEventNumber, idx, type);
+ workers.push_back(worker);
+ ++idx;
+ }
+}
+
+// Spawn a thread for every worker that is not initialized yet, then wait
+// (without holding pool_spin) until all workers report initialization.
+// Does nothing if the stack is already started.
+void NetworkStack::start()
+{
+ std::unique_lock<decltype(pool_spin)> lk(pool_spin);
+
+ if (started)
+ return;
+
+ for (unsigned i = 0; i < num_workers; ++i) {
+ if (!workers[i]->is_init()) {
+ spawn_worker(i, add_thread(i));
+ }
+ }
+ started = true;
+ // release the lock before blocking on worker initialization
+ lk.unlock();
+
+ for (unsigned i = 0; i < num_workers; ++i) {
+ workers[i]->wait_for_init();
+ }
+}
+
+// Pick the worker with the smallest reference count and take a reference
+// on it. The scan runs under pool_spin; the increment happens after the
+// lock is released.
+Worker* NetworkStack::get_worker()
+{
+ ldout(cct, 30) << __func__ << dendl;
+
+ Worker* current_best = nullptr;
+ {
+ std::lock_guard<decltype(pool_spin)> guard(pool_spin);
+
+ // start with some reasonably large number
+ unsigned min_load = std::numeric_limits<int>::max();
+
+ // find worker with least references
+ // tempting case is returning on references == 0, but in reality
+ // this will happen so rarely that there's no need for special case.
+ for (unsigned i = 0; i < num_workers; ++i) {
+ Worker *candidate = workers[i];
+ const unsigned worker_load = candidate->references.load();
+ if (worker_load >= min_load)
+ continue;
+ current_best = candidate;
+ min_load = worker_load;
+ }
+ }
+
+ ceph_assert(current_best);
+ ++current_best->references;
+ return current_best;
+}
+
+// Ask every worker to finish (done flag + wakeup of its event center) and
+// join its thread, one worker at a time, while holding pool_spin.
+void NetworkStack::stop()
+{
+ std::lock_guard<decltype(pool_spin)> guard(pool_spin);
+ unsigned idx = 0;
+ while (idx < num_workers) {
+ Worker *worker = workers[idx];
+ worker->done = true;
+ worker->center.wakeup();
+ join_worker(idx);
+ ++idx;
+ }
+ started = false;
+}
+
+// Countdown callback used by NetworkStack::drain(): each do_request() call
+// decrements the counter, and wait() blocks until it reaches zero.
+class C_drain : public EventCallback {
+ Mutex drain_lock;
+ Cond drain_cond;
+ unsigned drain_count;  // do_request() calls still outstanding
+
+ public:
+ explicit C_drain(size_t c)
+ : drain_lock("C_drain::drain_lock"),
+ drain_count(c) {}
+
+ // one more caller checked in; wake the waiter on the last one
+ void do_request(uint64_t id) override {
+ Mutex::Locker guard(drain_lock);
+ if (--drain_count == 0) {
+ drain_cond.Signal();
+ }
+ }
+
+ // block until every expected do_request() call has happened
+ void wait() {
+ Mutex::Locker guard(drain_lock);
+ while (drain_count != 0) {
+ drain_cond.Wait(drain_lock);
+ }
+ }
+};
+
+// Dispatch one C_drain callback to every worker's event center and block
+// until each of them has run it. Must not be called from a worker thread
+// (asserted below), since that thread would wait on itself.
+void NetworkStack::drain()
+{
+ ldout(cct, 30) << __func__ << " started." << dendl;
+ pthread_t cur = pthread_self();
+
+ std::unique_lock<decltype(pool_spin)> lk(pool_spin);
+ C_drain drain(num_workers);
+ for (unsigned i = 0; i < num_workers; ++i) {
+ ceph_assert(cur != workers[i]->center.get_owner());
+ workers[i]->center.dispatch_event_external(EventCallbackRef(&drain));
+ }
+ // wait without holding pool_spin
+ lk.unlock();
+
+ drain.wait();
+ ldout(cct, 30) << __func__ << " end." << dendl;
+}
diff --git a/src/msg/async/Stack.h b/src/msg/async/Stack.h
new file mode 100644
index 00000000..a093dadb
--- /dev/null
+++ b/src/msg/async/Stack.h
@@ -0,0 +1,356 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_ASYNC_STACK_H
+#define CEPH_MSG_ASYNC_STACK_H
+
+#include "include/spinlock.h"
+#include "common/perf_counters.h"
+#include "msg/msg_types.h"
+#include "msg/async/Event.h"
+
+class Worker;
+/// Backend interface behind \c ConnectedSocket.  Every public method of
+/// \c ConnectedSocket forwards to the same-named method declared here.
+class ConnectedSocketImpl {
+ public:
+  virtual ~ConnectedSocketImpl() = default;
+
+  virtual int is_connected() = 0;
+
+  // data path
+  virtual ssize_t read(char*, size_t) = 0;
+  virtual ssize_t zero_copy_read(bufferptr&) = 0;
+  virtual ssize_t send(bufferlist &bl, bool more) = 0;
+
+  // teardown
+  virtual void shutdown() = 0;
+  virtual void close() = 0;
+
+  // descriptors
+  virtual int fd() const = 0;
+  virtual int socket_fd() const = 0;
+};
+
+class ConnectedSocket;
+struct SocketOptions { // per-socket settings passed to Worker::listen()/connect() and ServerSocketImpl::accept()
+ bool nonblock = true; // NOTE(review): presumably requests a non-blocking socket -- confirm in the backend
+ bool nodelay = true; // NOTE(review): presumably TCP_NODELAY -- confirm in the backend
+ int rcbuf_size = 0; // NOTE(review): presumably the receive buffer size, 0 = leave untouched -- confirm
+ int priority = -1; // NOTE(review): presumably a socket priority, -1 = unset -- confirm
+ entity_addr_t connect_bind_addr; // NOTE(review): presumably the local address to bind before connecting -- confirm
+};
+
+/// \cond internal
+/// Backend interface behind \c ServerSocket; it also records which of the
+/// messenger's addresses the listening socket belongs to.
+class ServerSocketImpl {
+ public:
+  unsigned addr_type; ///< entity_addr_t::TYPE_*
+  unsigned addr_slot; ///< position of our addr in myaddrs().v
+
+  ServerSocketImpl(unsigned type, unsigned slot)
+    : addr_type(type),
+      addr_slot(slot) {}
+  virtual ~ServerSocketImpl() = default;
+
+  /// Called through \c ServerSocket::accept() with the same arguments.
+  virtual int accept(ConnectedSocket *sock, const SocketOptions &opt,
+                     entity_addr_t *out, Worker *w) = 0;
+  /// Called through \c ServerSocket::abort_accept() and its destructor.
+  virtual void abort_accept() = 0;
+  /// Get file descriptor
+  virtual int fd() const = 0;
+};
+/// \endcond
+
+/// \addtogroup networking-module
+/// @{
+
+/// A TCP (or other stream-based protocol) connection.
+///
+/// A \c ConnectedSocket represents a full-duplex stream between
+/// two endpoints, a local endpoint and a remote endpoint.
+///
+/// The object owns a backend-specific \c ConnectedSocketImpl and forwards
+/// every call to it.  Except for \c close(), the destructor and
+/// \c operator bool, the methods require an attached implementation.
+class ConnectedSocket {
+  std::unique_ptr<ConnectedSocketImpl> _csi;
+
+ public:
+  /// Constructs a \c ConnectedSocket not corresponding to a connection
+  ConnectedSocket() {}
+  /// \cond internal
+  explicit ConnectedSocket(std::unique_ptr<ConnectedSocketImpl> csi)
+    : _csi(std::move(csi)) {}
+  /// \endcond
+  /// Closes the implementation if one is still attached.
+  ~ConnectedSocket() {
+    if (_csi)
+      _csi->close();
+  }
+  /// Moves a \c ConnectedSocket object.
+  ConnectedSocket(ConnectedSocket&& cs) = default;
+  /// Move-assigns a \c ConnectedSocket object.
+  ConnectedSocket& operator=(ConnectedSocket&& cs) = default;
+
+  /// Forwards to the implementation's \c is_connected().
+  int is_connected() {
+    return _csi->is_connected();
+  }
+  /// Read the input stream with copy.
+  ///
+  /// Forwards \c buf and \c len to the implementation's \c read() and
+  /// returns its result unchanged.
+  ssize_t read(char* buf, size_t len) {
+    return _csi->read(buf, len);
+  }
+  /// Read the input stream without copy.
+  ///
+  /// Forwards to the implementation's \c zero_copy_read() and returns its
+  /// result unchanged.
+  ssize_t zero_copy_read(bufferptr &data) {
+    return _csi->zero_copy_read(data);
+  }
+  /// Write to the output stream.
+  ///
+  /// Forwards \c bl and \c more to the implementation's \c send() and
+  /// returns its result unchanged.
+  ssize_t send(bufferlist &bl, bool more) {
+    return _csi->send(bl, more);
+  }
+  /// Shuts the connection down.
+  ///
+  /// Forwards to the implementation's \c shutdown(); the implementation
+  /// stays attached, so the socket still evaluates to true afterwards.
+  void shutdown() {
+    _csi->shutdown();
+  }
+  /// Closes the connection and releases the implementation.
+  ///
+  /// Afterwards the socket evaluates to false.  Calling \c close() on a
+  /// socket without an implementation (default constructed or already
+  /// closed) is a no-op.
+  void close() {
+    if (!_csi)
+      return;
+    _csi->close();
+    _csi.reset();
+  }
+
+  /// Get file descriptor
+  int fd() const {
+    return _csi->fd();
+  }
+  /// Forwards to the implementation's \c socket_fd().
+  int socket_fd() const {
+    return _csi->socket_fd();
+  }
+
+  /// True while an implementation is attached.
+  explicit operator bool() const {
+    return _csi.get();
+  }
+};
+/// @}
+
+/// \addtogroup networking-module
+/// @{
+
+/// A listening socket, waiting to accept incoming network connections.
+///
+/// Owns a backend-specific \c ServerSocketImpl and forwards to it.  Except
+/// for \c abort_accept(), the destructor and \c operator bool, the methods
+/// require an attached implementation.
+class ServerSocket {
+  std::unique_ptr<ServerSocketImpl> _ssi;
+ public:
+  /// Constructs a \c ServerSocket without an implementation attached
+  ServerSocket() {}
+  /// \cond internal
+  explicit ServerSocket(std::unique_ptr<ServerSocketImpl> ssi)
+    : _ssi(std::move(ssi)) {}
+  /// \endcond
+  /// Aborts accepting if an implementation is still attached.
+  ~ServerSocket() {
+    if (_ssi)
+      _ssi->abort_accept();
+  }
+  /// Moves a \c ServerSocket object.
+  ServerSocket(ServerSocket&& ss) = default;
+  /// Move-assigns a \c ServerSocket object.
+  ServerSocket& operator=(ServerSocket&& cs) = default;
+
+  /// Accepts the next connection to successfully connect to this socket.
+  ///
+  /// Fills in a \ref ConnectedSocket representing the connection, and
+  /// a \ref entity_addr_t describing the remote endpoint.  The arguments
+  /// and the return value are passed through to and from the implementation.
+  int accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w) {
+    return _ssi->accept(sock, opt, out, w);
+  }
+
+  /// Stops any \ref accept() in progress and releases the implementation.
+  ///
+  /// Afterwards the socket evaluates to false.  Calling it on a socket
+  /// without an implementation is a no-op.
+  void abort_accept() {
+    if (!_ssi)
+      return;
+    _ssi->abort_accept();
+    _ssi.reset();
+  }
+
+  /// Get file descriptor
+  int fd() const {
+    return _ssi->fd();
+  }
+
+  /// get listen/bind addr
+  unsigned get_addr_slot() const {
+    return _ssi->addr_slot;
+  }
+
+  /// True while an implementation is attached.
+  explicit operator bool() const {
+    return _ssi.get();
+  }
+};
+/// @}
+
+class NetworkStack;
+// Indices of the per-worker perf counters that the Worker constructor registers.
+enum {
+ l_msgr_first = 94000, // lower bound handed to PerfCountersBuilder
+ l_msgr_recv_messages, // "Network received messages"
+ l_msgr_send_messages, // "Network sent messages"
+ l_msgr_recv_bytes, // "Network received bytes"
+ l_msgr_send_bytes, // "Network sent bytes"
+ l_msgr_created_connections, // "Created connection number"
+ l_msgr_active_connections, // "Active connection number"
+
+ l_msgr_running_total_time, // "The total time of thread running"
+ l_msgr_running_send_time, // "The total time of message sending"
+ l_msgr_running_recv_time, // "The total time of message receiving"
+ l_msgr_running_fast_dispatch_time, // "The total time of fast dispatch"
+
+ l_msgr_last, // upper bound handed to PerfCountersBuilder
+};
+
+/**
+ * One messenger worker: bundles an EventCenter with its perf counters, a
+ * reference count used for load balancing, and an "initialized" flag that
+ * other threads can wait on.  Backends derive from it and provide
+ * listen() and connect().
+ */
+class Worker {
+  std::mutex init_lock;
+  std::condition_variable init_cond;
+  bool init = false;  // guarded by init_lock
+
+ public:
+  bool done = false;  // set by NetworkStack::stop(), cleared by reset()
+
+  CephContext *cct;
+  PerfCounters *perf_logger;
+  unsigned id;
+
+  std::atomic_uint references;  // bumped by NetworkStack::get_worker()
+  EventCenter center;
+
+  Worker(const Worker&) = delete;
+  Worker& operator=(const Worker&) = delete;
+
+  /// Creates the worker and registers its perf counters under the name
+  /// "AsyncMessenger::Worker-<i>".
+  Worker(CephContext *c, unsigned i)
+    : cct(c), perf_logger(nullptr), id(i), references(0), center(c) {
+    char name[128];
+    // bounded formatting: never writes past the end of name
+    snprintf(name, sizeof(name), "AsyncMessenger::Worker-%u", id);
+    // initialize perf_logger
+    PerfCountersBuilder plb(cct, name, l_msgr_first, l_msgr_last);
+
+    plb.add_u64_counter(l_msgr_recv_messages, "msgr_recv_messages", "Network received messages");
+    plb.add_u64_counter(l_msgr_send_messages, "msgr_send_messages", "Network sent messages");
+    plb.add_u64_counter(l_msgr_recv_bytes, "msgr_recv_bytes", "Network received bytes", NULL, 0, unit_t(UNIT_BYTES));
+    plb.add_u64_counter(l_msgr_send_bytes, "msgr_send_bytes", "Network sent bytes", NULL, 0, unit_t(UNIT_BYTES));
+    plb.add_u64_counter(l_msgr_active_connections, "msgr_active_connections", "Active connection number");
+    plb.add_u64_counter(l_msgr_created_connections, "msgr_created_connections", "Created connection number");
+
+    plb.add_time(l_msgr_running_total_time, "msgr_running_total_time", "The total time of thread running");
+    plb.add_time(l_msgr_running_send_time, "msgr_running_send_time", "The total time of message sending");
+    plb.add_time(l_msgr_running_recv_time, "msgr_running_recv_time", "The total time of message receiving");
+    plb.add_time(l_msgr_running_fast_dispatch_time, "msgr_running_fast_dispatch_time", "The total time of fast dispatch");
+
+    perf_logger = plb.create_perf_counters();
+    cct->get_perfcounters_collection()->add(perf_logger);
+  }
+  /// Unregisters and frees the perf counters.
+  virtual ~Worker() {
+    if (perf_logger) {
+      cct->get_perfcounters_collection()->remove(perf_logger);
+      delete perf_logger;
+    }
+  }
+
+  virtual int listen(entity_addr_t &addr, unsigned addr_slot,
+                     const SocketOptions &opts, ServerSocket *) = 0;
+  virtual int connect(const entity_addr_t &addr,
+                      const SocketOptions &opts, ConnectedSocket *socket) = 0;
+  virtual void destroy() {}
+
+  virtual void initialize() {}
+  PerfCounters *get_perf_counter() { return perf_logger; }
+  /// Drops one reference; asserts that there was one to drop.
+  void release_worker() {
+    // keep the value unsigned so a large count is not misread as negative
+    const unsigned oldref = references.fetch_sub(1);
+    ceph_assert(oldref > 0);
+  }
+  /// Marks the worker initialized and wakes every wait_for_init() caller.
+  void init_done() {
+    std::lock_guard<std::mutex> l(init_lock);
+    init = true;
+    init_cond.notify_all();
+  }
+  bool is_init() {
+    std::lock_guard<std::mutex> l(init_lock);
+    return init;
+  }
+  /// Blocks until init_done() has been called.
+  void wait_for_init() {
+    std::unique_lock<std::mutex> l(init_lock);
+    while (!init)
+      init_cond.wait(l);
+  }
+  /// Returns the worker to its pre-init state and clears `done`.
+  void reset() {
+    {
+      std::lock_guard<std::mutex> l(init_lock);
+      init = false;
+      init_cond.notify_all();
+    }
+    done = false;
+  }
+};
+
+/**
+ * Owns the pool of Worker objects of one messenger backend and hands them
+ * out to callers.  Concrete backends derive from it and implement thread
+ * spawning and joining.
+ */
+class NetworkStack {
+  std::string type;
+  unsigned num_workers = 0;
+  ceph::spinlock pool_spin;
+  bool started = false;
+
+  std::function<void ()> add_thread(unsigned i);
+
+ protected:
+  CephContext *cct;
+  vector<Worker*> workers;
+
+  explicit NetworkStack(CephContext *c, const string &t);
+
+ public:
+  NetworkStack(const NetworkStack &) = delete;
+  NetworkStack& operator=(const NetworkStack &) = delete;
+
+  // the stack owns its workers and frees them here
+  virtual ~NetworkStack() {
+    for (Worker *w : workers) {
+      delete w;
+    }
+  }
+
+  static std::shared_ptr<NetworkStack> create(
+    CephContext *c, const string &type);
+
+  static Worker* create_worker(
+    CephContext *c, const string &t, unsigned i);
+
+  // A backend that supports zero copy read overrides this to return true.
+  virtual bool support_zero_copy_read() const { return false; }
+
+  // A backend without a shared listen table overrides this to return true.
+  // The posix backend, for example, relies on the kernel's global listen
+  // table: once one thread has bound a port, all other threads see it.
+  // The dpdk backend keeps a listen table per thread, so every thread has
+  // to bind the port itself.
+  virtual bool support_local_listen_table() const { return false; }
+  virtual bool nonblock_connect_need_writable_event() const { return true; }
+
+  void start();
+  void stop();
+  virtual Worker *get_worker();
+  // direct access by index; i is not range-checked
+  Worker *get_worker(unsigned i) {
+    return workers[i];
+  }
+  void drain();
+  unsigned get_num_worker() const {
+    return num_workers;
+  }
+
+  // direct is used in tests only
+  virtual void spawn_worker(unsigned i, std::function<void ()> &&) = 0;
+  virtual void join_worker(unsigned i) = 0;
+
+  virtual bool is_ready() { return true; }
+  virtual void ready() { }
+};
+
+#endif //CEPH_MSG_ASYNC_STACK_H
diff --git a/src/msg/async/crypto_onwire.cc b/src/msg/async/crypto_onwire.cc
new file mode 100644
index 00000000..4e423406
--- /dev/null
+++ b/src/msg/async/crypto_onwire.cc
@@ -0,0 +1,311 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <array>
+#include <openssl/evp.h>
+
+#include "crypto_onwire.h"
+
+#include "common/debug.h"
+#include "common/ceph_crypto.h"
+#include "include/types.h"
+
+#define dout_subsys ceph_subsys_ms
+
+namespace ceph::crypto::onwire {
+
+static constexpr const std::size_t AESGCM_KEY_LEN{16}; // key size in bytes (128 bits, see key_t)
+static constexpr const std::size_t AESGCM_IV_LEN{12}; // IV size in bytes; sizeof(nonce_t) must match
+static constexpr const std::size_t AESGCM_TAG_LEN{16}; // authentication tag size in bytes
+static constexpr const std::size_t AESGCM_BLOCK_LEN{16}; // size of the hole appended at the -final step
+/*
+ * The 12-byte value handed to OpenSSL as the GCM IV.  It consists of a
+ * 32-bit and a 64-bit field, packed so that no padding is present; the
+ * handlers increment either `fixed` or `counter` once per reset, depending
+ * on their new_nonce_format flag.
+ */
+struct nonce_t {
+ ceph_le32 fixed;
+ ceph_le64 counter;
+
+ bool operator==(const nonce_t& rhs) const { // bytewise comparison of the whole struct
+ return !memcmp(this, &rhs, sizeof(*this));
+ }
+} __attribute__((packed));
+static_assert(sizeof(nonce_t) == AESGCM_IV_LEN);
+
+using key_t = std::array<std::uint8_t, AESGCM_KEY_LEN>; // raw key bytes
+/*
+ * Transmit-side AES-128-GCM handler.
+ *
+ * The constructor creates one EVP cipher context and loads the key into it.
+ * Each message then goes through reset_tx_handler(), one or more
+ * authenticated_encrypt_update() calls and authenticated_encrypt_final(),
+ * which returns the ciphertext followed by the authentication tag.
+ *
+ * Background reading:
+ */
+// http://www.mindspring.com/~dmcgrew/gcm-nist-6.pdf
+// https://www.openssl.org/docs/man1.0.2/crypto/EVP_aes_128_gcm.html#GCM-mode
+// https://wiki.openssl.org/index.php/EVP_Authenticated_Encryption_and_Decryption
+// https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pdf
+class AES128GCM_OnWireTxHandler : public ceph::crypto::onwire::TxHandler {
+ CephContext* const cct;
+ std::unique_ptr<EVP_CIPHER_CTX, decltype(&::EVP_CIPHER_CTX_free)> ectx;
+ ceph::bufferlist buffer; // output of the message currently being encrypted
+ nonce_t nonce, initial_nonce; // nonce for the next reset / nonce given at construction
+ bool used_initial_nonce; // set once initial_nonce has been consumed by a reset
+ bool new_nonce_format; // 64-bit counter?
+ static_assert(sizeof(nonce) == AESGCM_IV_LEN);
+
+public:
+ AES128GCM_OnWireTxHandler(CephContext* const cct,
+ const key_t& key,
+ const nonce_t& nonce,
+ bool new_nonce_format)
+ : cct(cct),
+ ectx(EVP_CIPHER_CTX_new(), EVP_CIPHER_CTX_free),
+ nonce(nonce), initial_nonce(nonce), used_initial_nonce(false),
+ new_nonce_format(new_nonce_format) {
+ ceph_assert_always(ectx);
+ ceph_assert_always(key.size() * CHAR_BIT == 128);
+ // first call selects the cipher only, the second one supplies the key; the IV is set per message in reset_tx_handler()
+ if (1 != EVP_EncryptInit_ex(ectx.get(), EVP_aes_128_gcm(),
+ nullptr, nullptr, nullptr)) {
+ throw std::runtime_error("EVP_EncryptInit_ex failed");
+ }
+
+ if(1 != EVP_EncryptInit_ex(ectx.get(), nullptr, nullptr,
+ key.data(), nullptr)) {
+ throw std::runtime_error("EVP_EncryptInit_ex failed");
+ }
+ }
+
+ ~AES128GCM_OnWireTxHandler() override { // wipes both nonce copies
+ ::ceph::crypto::zeroize_for_security(&nonce, sizeof(nonce));
+ ::ceph::crypto::zeroize_for_security(&initial_nonce, sizeof(initial_nonce));
+ }
+
+ void reset_tx_handler(const uint32_t* first, const uint32_t* last) override;
+
+ void authenticated_encrypt_update(const ceph::bufferlist& plaintext) override;
+ ceph::bufferlist authenticated_encrypt_final() override;
+};
+/*
+ * Prepares the handler for encrypting the next message.
+ *
+ * Loads the current nonce into the cipher context as IV, reserves output
+ * space for the announced update sizes plus the tag, and then advances the
+ * nonce for the message after this one.
+ *
+ * first/last delimit the sizes of the upcoming
+ * authenticated_encrypt_update() calls.
+ *
+ * Throws TxHandlerError when the nonce is back at its initial value after
+ * that value has already been used, std::runtime_error when
+ * EVP_EncryptInit_ex fails.
+ */
+void AES128GCM_OnWireTxHandler::reset_tx_handler(const uint32_t* first,
+ const uint32_t* last)
+{
+ if (nonce == initial_nonce) { // true on the very first reset and again after a wrap-around
+ if (used_initial_nonce) {
+ throw ceph::crypto::onwire::TxHandlerError("out of nonces");
+ }
+ used_initial_nonce = true;
+ }
+ // only the IV argument is passed here; cipher and key come from the constructor
+ if(1 != EVP_EncryptInit_ex(ectx.get(), nullptr, nullptr, nullptr,
+ reinterpret_cast<const unsigned char*>(&nonce))) {
+ throw std::runtime_error("EVP_EncryptInit_ex failed");
+ }
+
+ ceph_assert(buffer.get_append_buffer_unused_tail_length() == 0); // no reserved space may be left over from an earlier round
+ buffer.reserve(std::accumulate(first, last, AESGCM_TAG_LEN)); // sum of all update sizes + room for the tag
+
+ if (!new_nonce_format) {
+ // msgr2.0: 32-bit counter followed by 64-bit fixed field,
+ // susceptible to overflow!
+ nonce.fixed = nonce.fixed + 1;
+ } else {
+ nonce.counter = nonce.counter + 1;
+ }
+}
+/*
+ * Encrypts `plaintext` and appends the ciphertext to the internal buffer.
+ *
+ * The space must have been reserved by reset_tx_handler() (asserted).
+ * Throws std::runtime_error when EVP_EncryptUpdate fails.
+ */
+void AES128GCM_OnWireTxHandler::authenticated_encrypt_update(
+ const ceph::bufferlist& plaintext)
+{
+ ceph_assert(buffer.get_append_buffer_unused_tail_length() >=
+ plaintext.length());
+ auto filler = buffer.append_hole(plaintext.length()); // write cursor into the reserved space
+ // encrypt segment by segment straight into the hole
+ for (const auto& plainbuf : plaintext.buffers()) {
+ int update_len = 0;
+
+ if(1 != EVP_EncryptUpdate(ectx.get(),
+ reinterpret_cast<unsigned char*>(filler.c_str()),
+ &update_len,
+ reinterpret_cast<const unsigned char*>(plainbuf.c_str()),
+ plainbuf.length())) {
+ throw std::runtime_error("EVP_EncryptUpdate failed");
+ }
+ ceph_assert_always(update_len >= 0);
+ ceph_assert(static_cast<unsigned>(update_len) == plainbuf.length()); // output length must equal input length
+ filler.advance(update_len);
+ }
+
+ ldout(cct, 15) << __func__
+ << " plaintext.length()=" << plaintext.length()
+ << " buffer.length()=" << buffer.length()
+ << dendl;
+}
+/*
+ * Finishes the current message: finalizes the cipher, appends the
+ * authentication tag to the buffer and returns the buffer, leaving the
+ * member in a moved-from state.
+ *
+ * Throws std::runtime_error when an OpenSSL call fails.
+ */
+ceph::bufferlist AES128GCM_OnWireTxHandler::authenticated_encrypt_final()
+{
+ int final_len = 0;
+ ceph_assert(buffer.get_append_buffer_unused_tail_length() ==
+ AESGCM_BLOCK_LEN); // exactly the space reserved for the tag must be left
+ auto filler = buffer.append_hole(AESGCM_BLOCK_LEN);
+ if(1 != EVP_EncryptFinal_ex(ectx.get(),
+ reinterpret_cast<unsigned char*>(filler.c_str()),
+ &final_len)) {
+ throw std::runtime_error("EVP_EncryptFinal_ex failed");
+ }
+ ceph_assert_always(final_len == 0); // the final step must not emit any ciphertext
+ // the hole was sized as one block but receives the tag, hence the static_assert
+ static_assert(AESGCM_BLOCK_LEN == AESGCM_TAG_LEN);
+ if(1 != EVP_CIPHER_CTX_ctrl(ectx.get(),
+ EVP_CTRL_GCM_GET_TAG, AESGCM_TAG_LEN,
+ filler.c_str())) {
+ throw std::runtime_error("EVP_CIPHER_CTX_ctrl failed");
+ }
+
+ ldout(cct, 15) << __func__
+ << " buffer.length()=" << buffer.length()
+ << " final_len=" << final_len
+ << dendl;
+ return std::move(buffer); // buffer is a member, so the move has to be explicit
+}
+/*
+ * Receive-side AES-128-GCM handler, the counterpart of
+ * AES128GCM_OnWireTxHandler.  Decryption happens in place inside the
+ * bufferlists handed to the -update / -update_final methods.
+ */
+// RX PART
+class AES128GCM_OnWireRxHandler : public ceph::crypto::onwire::RxHandler {
+ CephContext* const cct;
+ std::unique_ptr<EVP_CIPHER_CTX, decltype(&::EVP_CIPHER_CTX_free)> ectx;
+ nonce_t nonce; // nonce for the next reset
+ bool new_nonce_format; // 64-bit counter?
+ static_assert(sizeof(nonce) == AESGCM_IV_LEN);
+
+public:
+ AES128GCM_OnWireRxHandler(CephContext* const cct,
+ const key_t& key,
+ const nonce_t& nonce,
+ bool new_nonce_format)
+ : cct(cct),
+ ectx(EVP_CIPHER_CTX_new(), EVP_CIPHER_CTX_free),
+ nonce(nonce), new_nonce_format(new_nonce_format) {
+ ceph_assert_always(ectx);
+ ceph_assert_always(key.size() * CHAR_BIT == 128);
+ // first call selects the cipher only, the second one supplies the key; the IV is set per message in reset_rx_handler()
+ if (1 != EVP_DecryptInit_ex(ectx.get(), EVP_aes_128_gcm(),
+ nullptr, nullptr, nullptr)) {
+ throw std::runtime_error("EVP_DecryptInit_ex failed");
+ }
+
+ if(1 != EVP_DecryptInit_ex(ectx.get(), nullptr, nullptr,
+ key.data(), nullptr)) {
+ throw std::runtime_error("EVP_DecryptInit_ex failed");
+ }
+ }
+
+ ~AES128GCM_OnWireRxHandler() override { // wipes the nonce copy
+ ::ceph::crypto::zeroize_for_security(&nonce, sizeof(nonce));
+ }
+
+ std::uint32_t get_extra_size_at_final() override { // the tag is the only extra data at the -final step
+ return AESGCM_TAG_LEN;
+ }
+ void reset_rx_handler() override;
+ void authenticated_decrypt_update(ceph::bufferlist& bl) override;
+ void authenticated_decrypt_update_final(ceph::bufferlist& bl) override;
+};
+/*
+ * Prepares the handler for decrypting the next message: loads the current
+ * nonce as IV and then advances it, mirroring reset_tx_handler().
+ * Throws std::runtime_error when EVP_DecryptInit_ex fails.
+ */
+void AES128GCM_OnWireRxHandler::reset_rx_handler()
+{
+ if(1 != EVP_DecryptInit_ex(ectx.get(), nullptr, nullptr, nullptr,
+ reinterpret_cast<const unsigned char*>(&nonce))) {
+ throw std::runtime_error("EVP_DecryptInit_ex failed");
+ }
+
+ if (!new_nonce_format) {
+ // msgr2.0: 32-bit counter followed by 64-bit fixed field,
+ // susceptible to overflow!
+ nonce.fixed = nonce.fixed + 1;
+ } else {
+ nonce.counter = nonce.counter + 1;
+ }
+}
+/*
+ * Decrypts `bl` in place, one buffer segment at a time.
+ * Throws std::runtime_error when EVP_DecryptUpdate fails.
+ */
+void AES128GCM_OnWireRxHandler::authenticated_decrypt_update(
+ ceph::bufferlist& bl)
+{
+ // discard cached crcs as we will be writing through c_str()
+ bl.invalidate_crc();
+ for (auto& buf : bl.buffers()) {
+ auto p = reinterpret_cast<unsigned char*>(const_cast<char*>(buf.c_str())); // same pointer serves as input and output
+ int update_len = 0;
+
+ if (1 != EVP_DecryptUpdate(ectx.get(), p, &update_len, p, buf.length())) {
+ throw std::runtime_error("EVP_DecryptUpdate failed");
+ }
+ ceph_assert_always(update_len >= 0);
+ ceph_assert(static_cast<unsigned>(update_len) == buf.length()); // output length must equal input length
+ }
+}
+/*
+ * Decrypts the last portion of a message and verifies its authentication tag.
+ *
+ * `bl` must end with the AESGCM_TAG_LEN-byte tag (asserted); on return the
+ * tag has been removed from `bl` and the remaining bytes are plaintext.
+ *
+ * Throws MsgAuthError when the tag does not verify, std::runtime_error when
+ * another OpenSSL call fails.
+ */
+void AES128GCM_OnWireRxHandler::authenticated_decrypt_update_final(
+ ceph::bufferlist& bl)
+{
+ unsigned orig_len = bl.length();
+ ceph_assert(orig_len >= AESGCM_TAG_LEN);
+
+ // decrypt optional data. Caller is obliged to provide only signature but it
+ // may supply ciphertext as well, in which case the update and the final
+ // step are both performed by this single call.
+ ceph::bufferlist auth_tag;
+ bl.splice(orig_len - AESGCM_TAG_LEN, AESGCM_TAG_LEN, &auth_tag); // move the trailing tag out of bl
+ if (bl.length() > 0) {
+ authenticated_decrypt_update(bl);
+ }
+
+ // we need to ensure the tag is stored in continuous memory.
+ if (1 != EVP_CIPHER_CTX_ctrl(ectx.get(), EVP_CTRL_GCM_SET_TAG,
+ AESGCM_TAG_LEN, auth_tag.c_str())) {
+ throw std::runtime_error("EVP_CIPHER_CTX_ctrl failed");
+ }
+
+ // I expect that 0 bytes will be appended. The call is supposed solely to
+ // authenticate the message.
+ {
+ int final_len = 0;
+ if (0 >= EVP_DecryptFinal_ex(ectx.get(), nullptr, &final_len)) {
+ throw MsgAuthError();
+ }
+ ceph_assert_always(final_len == 0);
+ ceph_assert(bl.length() + AESGCM_TAG_LEN == orig_len);
+ }
+}
+
+/**
+ * Builds the RX/TX handler pair for a connection.
+ *
+ * In secure mode the connection secret is consumed as
+ * [ key | nonce | nonce ]; the first nonce goes to the RX handler and the
+ * second to the TX handler, or the other way round when `crossed` is set.
+ * Outside secure mode both handlers are null.
+ *
+ * The key and nonce copies made on the stack are wiped before returning,
+ * in line with the handlers wiping their own nonce copies on destruction.
+ * (They are not wiped if a handler constructor throws.)
+ *
+ * @param cct               context handed to both handlers
+ * @param auth_meta         supplies the mode and the connection secret
+ * @param new_nonce_format  forwarded to both handlers
+ * @param crossed           swap the RX and TX nonces
+ */
+ceph::crypto::onwire::rxtx_t ceph::crypto::onwire::rxtx_t::create_handler_pair(
+  CephContext* cct,
+  const AuthConnectionMeta& auth_meta,
+  bool new_nonce_format,
+  bool crossed)
+{
+  if (!auth_meta.is_mode_secure()) {
+    return { nullptr, nullptr };
+  }
+
+  ceph_assert_always(auth_meta.connection_secret.length() >=
+    sizeof(key_t) + 2 * sizeof(nonce_t));
+  const char* secbuf = auth_meta.connection_secret.c_str();
+
+  key_t key;
+  ::memcpy(key.data(), secbuf, sizeof(key));
+  secbuf += sizeof(key);
+
+  nonce_t rx_nonce;
+  ::memcpy(&rx_nonce, secbuf, sizeof(rx_nonce));
+  secbuf += sizeof(rx_nonce);
+
+  nonce_t tx_nonce;
+  ::memcpy(&tx_nonce, secbuf, sizeof(tx_nonce));
+
+  rxtx_t handlers{
+    std::make_unique<AES128GCM_OnWireRxHandler>(
+      cct, key, crossed ? tx_nonce : rx_nonce, new_nonce_format),
+    std::make_unique<AES128GCM_OnWireTxHandler>(
+      cct, key, crossed ? rx_nonce : tx_nonce, new_nonce_format)
+  };
+
+  // the handlers hold their own copies now; do not leave secrets on the stack
+  ::ceph::crypto::zeroize_for_security(key.data(), sizeof(key));
+  ::ceph::crypto::zeroize_for_security(&rx_nonce, sizeof(rx_nonce));
+  ::ceph::crypto::zeroize_for_security(&tx_nonce, sizeof(tx_nonce));
+
+  return handlers;
+}
+
+} // namespace ceph::crypto::onwire
diff --git a/src/msg/async/crypto_onwire.h b/src/msg/async/crypto_onwire.h
new file mode 100644
index 00000000..55f75508
--- /dev/null
+++ b/src/msg/async/crypto_onwire.h
@@ -0,0 +1,130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2009 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_CRYPTO_ONWIRE_H
+#define CEPH_CRYPTO_ONWIRE_H
+
+#include <cstdint>
+#include <memory>
+
+#include "auth/Auth.h"
+#include "include/buffer.h"
+
+namespace ceph::math {
+
+/* TODO
+ *
+ * Placeholder: wraps a single T that is constructed by perfect-forwarding
+ * the constructor arguments.  Nothing here enforces alignment yet.
+ * NOTE(review): the constructor sits in the private section (class default
+ * access), so the type cannot be instantiated from outside as written.
+ */
+template <typename T>
+class always_aligned_t {
+ T val;
+
+ template <class... Args>
+ always_aligned_t(Args&&... args)
+ : val(std::forward<Args>(args)...) {
+ }
+};
+
+} // namespace ceph::math
+
+namespace ceph::crypto::onwire {
+
+// Reports that a received message failed its authentication check.
+struct MsgAuthError : public std::runtime_error {
+  MsgAuthError() : std::runtime_error("message signature mismatch") {}
+};
+
+struct TxHandlerError : public std::runtime_error {
+ TxHandlerError(const char* what)
+ : std::runtime_error(std::string("tx handler error: ") + what) {}
+};
+
+// Interface of the transmit-side on-wire crypto handlers.
+struct TxHandler {
+  virtual ~TxHandler() = default;
+
+  // A TxHandler must be reset before any encrypt-update step.  That also
+  // holds after encrypt-final: another update-...-update-final round needs
+  // a fresh reset first.
+  //
+  // The [first, last) range tells the implementation how the -update
+  // sequence is fragmented, so it can make a conscious decision about
+  // allocating memory or reusing what it was given.  One implementation
+  // could encrypt in place while another might prefer one huge output
+  // buffer.
+  //
+  // It's undefined what will happen if client doesn't follow the order.
+  //
+  // TODO: switch to always_aligned_t
+  virtual void reset_tx_handler(const uint32_t* first,
+                                const uint32_t* last) = 0;
+
+  // Convenience overload: passes the list's element range on to the
+  // pointer-based overload, or (nullptr, nullptr) for an empty list.
+  void reset_tx_handler(std::initializer_list<uint32_t> update_size_sequence) {
+    if (update_size_sequence.size() == 0) {
+      reset_tx_handler(nullptr, nullptr);
+      return;
+    }
+    const uint32_t* const first = &*update_size_sequence.begin();
+    const uint32_t* const last = first + update_size_sequence.size();
+    reset_tx_handler(first, last);
+  }
+
+  // Perform encryption. Client gives full ownership right to provided
+  // bufferlist. The method MUST NOT be called after _final() if there
+  // was no call to _reset().
+  virtual void authenticated_encrypt_update(
+    const ceph::bufferlist& plaintext) = 0;
+
+  // Generates the authentication signature and returns the bufferlist
+  // built from the plaintext of the preceding _update() calls.
+  virtual ceph::bufferlist authenticated_encrypt_final() = 0;
+};
+
+class RxHandler {
+public:
+ virtual ~RxHandler() = default;
+
+ // Transmitter can append extra bytes of ciphertext at the -final step.
+ // This method returns how much was added, and thus lets the client
+ // translate plaintext size into ciphertext size to grab from wire.
+ virtual std::uint32_t get_extra_size_at_final() = 0;
+
+ // Instance of RxHandler must be reset before doing any decrypt-update
+ // step. This applies also to situation when decrypt-final was already
+ // called and another round of update-...-update-final will take place.
+ virtual void reset_rx_handler() = 0;
+
+ // Perform decryption. Ciphertext must ALWAYS be aligned to 16 bytes.
+ virtual void authenticated_decrypt_update(ceph::bufferlist& bl) = 0;
+
+ // Perform decryption of the last ciphertext portion and verify the
+ // signature for the overall decryption sequence.
+ // Throws when the integrity/authenticity check fails.
+ virtual void authenticated_decrypt_update_final(ceph::bufferlist& bl) = 0;
+};
+
+struct rxtx_t {
+ //rxtx_t(rxtx_t&& r) : rx(std::move(rx)), tx(std::move(tx)) {}
+ // Each peer can use different handlers.
+ // Hmm, isn't that too much flexibility?
+ std::unique_ptr<RxHandler> rx; // null when create_handler_pair() is called outside secure mode
+ std::unique_ptr<TxHandler> tx; // null when create_handler_pair() is called outside secure mode
+
+ static rxtx_t create_handler_pair(
+ CephContext* ctx,
+ const class AuthConnectionMeta& auth_meta,
+ bool new_nonce_format,
+ bool crossed);
+};
+
+} // namespace ceph::crypto::onwire
+
+#endif // CEPH_CRYPTO_ONWIRE_H
diff --git a/src/msg/async/dpdk/ARP.cc b/src/msg/async/dpdk/ARP.cc
new file mode 100644
index 00000000..dedc9e3c
--- /dev/null
+++ b/src/msg/async/dpdk/ARP.cc
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include "ARP.h"
+
+// Registers this handler with the owning arp object under proto_num.
+arp_for_protocol::arp_for_protocol(arp& a, uint16_t proto_num)
+  : _arp(a),
+    _proto_num(proto_num)
+{
+  _arp.add(_proto_num, this);
+}
+
+// Removes the registration made by the constructor.
+arp_for_protocol::~arp_for_protocol()
+{
+  _arp.del(_proto_num);
+}
+
+// Hooks ARP into the interface: outgoing packets are pulled via get_packet(),
+// incoming ones are delivered to process_packet(), and forward() is
+// installed as the forwarding hook.
+arp::arp(interface* netif)
+  : _netif(netif),
+    _proto(netif, eth_protocol_num::arp, [this] { return get_packet(); }),
+    _rx_packets(
+      _proto.receive(
+        [this] (Packet pkt, ethernet_address src) {
+          return process_packet(std::move(pkt), src);
+        },
+        [this] (forward_hash& hash_data, Packet& pkt, size_t offset) {
+          return forward(hash_data, pkt, offset);
+        }))
+{
+}
+
+// Pops the oldest queued outgoing packet; returns an empty Tub when the
+// queue is empty.
+Tub<l3_protocol::l3packet> arp::get_packet()
+{
+  Tub<l3_protocol::l3packet> next;
+  if (_packetq.empty()) {
+    return next;
+  }
+  next = std::move(_packetq.front());
+  _packetq.pop_front();
+  return next;
+}
+
+// Forwarding hook: hands the decision to the handler registered for the
+// packet's protocol type.
+//
+// Returns false when the packet carries no complete ARP header at `off` or
+// when no handler is registered for its protocol type.
+bool arp::forward(forward_hash& out_hash_data, Packet& p, size_t off)
+{
+  auto ah = p.get_header<arp_hdr>(off);
+  if (!ah) {
+    // header not available (arp_for::received() guards against the same case)
+    return false;
+  }
+  auto i = _arp_for_protocol.find(ntoh(ah->ptype));
+  if (i != _arp_for_protocol.end()) {
+    return i->second->forward(out_hash_data, p, off);
+  }
+  return false;
+}
+
+// Registers (or replaces) the handler for proto_num.
+void arp::add(uint16_t proto_num, arp_for_protocol* afp)
+{
+  auto& slot = _arp_for_protocol[proto_num];
+  slot = afp;
+}
+
+// Drops the handler registered for proto_num, if there is one.
+void arp::del(uint16_t proto_num)
+{
+  auto entry = _arp_for_protocol.find(proto_num);
+  if (entry != _arp_for_protocol.end()) {
+    _arp_for_protocol.erase(entry);
+  }
+}
+
+// Receive hook: passes the packet to the handler registered for its protocol
+// type.  Packets without a complete ARP header, or for which no handler is
+// registered, are dropped.  Always returns 0.
+int arp::process_packet(Packet p, ethernet_address from)
+{
+  auto raw = p.get_header<arp_hdr>();
+  if (!raw) {
+    // header not available (arp_for::received() guards against the same case)
+    return 0;
+  }
+  auto ah = raw->ntoh();
+  auto i = _arp_for_protocol.find(ah.ptype);
+  if (i != _arp_for_protocol.end()) {
+    i->second->received(std::move(p));
+  }
+  return 0;
+}
diff --git a/src/msg/async/dpdk/ARP.h b/src/msg/async/dpdk/ARP.h
new file mode 100644
index 00000000..54569564
--- /dev/null
+++ b/src/msg/async/dpdk/ARP.h
@@ -0,0 +1,301 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+
+#ifndef CEPH_MSG_ARP_H_
+#define CEPH_MSG_ARP_H_
+
+#include <errno.h>
+
+#include <unordered_map>
+#include <functional>
+
+#include "msg/async/Event.h"
+
+#include "ethernet.h"
+#include "circular_buffer.h"
+#include "ip_types.h"
+#include "net.h"
+#include "Packet.h"
+
+class arp;
+template <typename L3>
+class arp_for;
+
+// Base class of the per-protocol ARP handlers.  An instance registers itself
+// with the given arp object on construction and unregisters on destruction.
+class arp_for_protocol {
+ protected:
+  arp& _arp;
+  uint16_t _proto_num;
+
+ public:
+  arp_for_protocol(arp& a, uint16_t proto_num);
+  virtual ~arp_for_protocol();
+
+  // Called by arp::process_packet() for packets of this protocol type.
+  virtual int received(Packet p) = 0;
+
+  // Called by arp::forward(); the default implementation declines.
+  virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) {
+    return false;
+  }
+};
+
+class interface;
+
+// Protocol-independent part of ARP: keeps the table of per-protocol handlers
+// and the queue of outgoing packets, and dispatches incoming packets by the
+// protocol type found in their header.
+class arp {
+  interface* _netif;
+  l3_protocol _proto;
+  subscription<Packet, ethernet_address> _rx_packets;
+  std::unordered_map<uint16_t, arp_for_protocol*> _arp_for_protocol;
+  circular_buffer<l3_protocol::l3packet> _packetq;
+
+ private:
+  // Leading part of the ARP header; enough to find the protocol type.
+  struct arp_hdr {
+    uint16_t htype;
+    uint16_t ptype;
+
+    // copy of this header with both fields passed through ::ntoh
+    arp_hdr ntoh() {
+      arp_hdr converted = *this;
+      converted.htype = ::ntoh(htype);
+      converted.ptype = ::ntoh(ptype);
+      return converted;
+    }
+    // copy of this header with both fields passed through ::hton
+    arp_hdr hton() {
+      arp_hdr converted = *this;
+      converted.htype = ::hton(htype);
+      converted.ptype = ::hton(ptype);
+      return converted;
+    }
+  };
+
+ public:
+  explicit arp(interface* netif);
+  void add(uint16_t proto_num, arp_for_protocol* afp);
+  void del(uint16_t proto_num);
+
+ private:
+  ethernet_address l2self() { return _netif->hw_address(); }
+  int process_packet(Packet p, ethernet_address from);
+  bool forward(forward_hash& out_hash_data, Packet& p, size_t off);
+  Tub<l3_protocol::l3packet> get_packet();
+
+  template <class l3_proto>
+  friend class arp_for;
+};
+
+// ARP handler for one L3 protocol: keeps the resolved-address table and the
+// resolutions still in progress, answers requests for our own address and
+// retries unanswered queries from a timer.
+template <typename L3>
+class arp_for : public arp_for_protocol {
+ public:
+  using l2addr = ethernet_address;
+  using l3addr = typename L3::address_type;
+ private:
+  // upper bound on the packets parked per unresolved address
+  static constexpr auto max_waiters = 512;
+  enum oper {
+    op_request = 1,
+    op_reply = 2,
+  };
+  struct arp_hdr {
+    uint16_t htype;
+    uint16_t ptype;
+    uint8_t hlen;
+    uint8_t plen;
+    uint16_t oper;
+    l2addr sender_hwaddr;
+    l3addr sender_paddr;
+    l2addr target_hwaddr;
+    l3addr target_paddr;
+
+    // copy of this header with every multi-byte field passed through ntoh
+    arp_hdr ntoh() {
+      arp_hdr hdr = *this;
+      hdr.htype = ::ntoh(htype);
+      hdr.ptype = ::ntoh(ptype);
+      hdr.oper = ::ntoh(oper);
+      hdr.sender_hwaddr = sender_hwaddr.ntoh();
+      hdr.sender_paddr = sender_paddr.ntoh();
+      hdr.target_hwaddr = target_hwaddr.ntoh();
+      hdr.target_paddr = target_paddr.ntoh();
+      return hdr;
+    }
+
+    // copy of this header with every multi-byte field passed through hton
+    arp_hdr hton() {
+      arp_hdr hdr = *this;
+      hdr.htype = ::hton(htype);
+      hdr.ptype = ::hton(ptype);
+      hdr.oper = ::hton(oper);
+      hdr.sender_hwaddr = sender_hwaddr.hton();
+      hdr.sender_paddr = sender_paddr.hton();
+      hdr.target_hwaddr = target_hwaddr.hton();
+      hdr.target_paddr = target_paddr.hton();
+      return hdr;
+    }
+  };
+  // State of one address resolution that has not completed yet.
+  struct resolution {
+    std::vector<std::pair<resolution_cb, Packet>> _waiters;
+    uint64_t timeout_fd;  // id returned by EventCenter::create_time_event()
+  };
+  // Timer callback: re-sends the query, fails every packet currently waiting
+  // with -ETIMEDOUT and re-arms itself for another second.
+  // NOTE(review): instances are allocated with new in wait() and nothing in
+  // this header deletes them -- confirm who owns them.
+  class C_handle_arp_timeout : public EventCallback {
+    arp_for *arp;
+    l3addr paddr;
+    bool first_request;
+
+   public:
+    C_handle_arp_timeout(arp_for *a, l3addr addr, bool first):
+      arp(a), paddr(addr), first_request(first) {}
+    void do_request(uint64_t r) override {
+      arp->send_query(paddr);
+      auto &res = arp->_in_progress[paddr];
+
+      for (auto& p : res._waiters) {
+        p.first(ethernet_address(), std::move(p.second), -ETIMEDOUT);
+      }
+      res._waiters.clear();
+      res.timeout_fd = arp->center->create_time_event(
+        1*1000*1000, this);
+    }
+  };
+  friend class C_handle_arp_timeout;
+
+ private:
+  CephContext *cct;
+  EventCenter *center;
+  l3addr _l3self = L3::broadcast_address();  // our own address; broadcast until set_self_addr()
+  std::unordered_map<l3addr, l2addr> _table;  // resolved addresses
+  std::unordered_map<l3addr, resolution> _in_progress;  // pending resolutions
+ private:
+  Packet make_query_packet(l3addr paddr);
+  virtual int received(Packet p) override;
+  int handle_request(arp_hdr* ah);
+  l2addr l2self() { return _arp.l2self(); }
+  void send(l2addr to, Packet &&p);
+ public:
+  void send_query(const l3addr& paddr);
+  explicit arp_for(CephContext *c, arp& a, EventCenter *cen)
+    : arp_for_protocol(a, L3::arp_protocol_type()), cct(c), center(cen) {
+    // the broadcast address always resolves to the ethernet broadcast address
+    _table[L3::broadcast_address()] = ethernet::broadcast_address();
+  }
+  // cancels the timers of all resolutions still in progress
+  ~arp_for() override {
+    for (auto && p : _in_progress)
+      center->delete_time_event(p.second.timeout_fd);
+  }
+  void wait(const l3addr& addr, Packet p, resolution_cb cb);
+  void learn(l2addr l2, l3addr l3);
+  void run();
+  // Replaces our own address and keeps its table entry in sync.
+  void set_self_addr(l3addr addr) {
+    _table.erase(_l3self);
+    _table[addr] = l2self();
+    _l3self = addr;
+  }
+  friend class arp;
+};
+
+// Queues p on the shared ARP output queue, addressed to `to`.
+template <typename L3>
+void arp_for<L3>::send(l2addr to, Packet &&p)
+{
+  _arp._packetq.push_back(
+    l3_protocol::l3packet{eth_protocol_num::arp, to, std::move(p)});
+}
+
+// Builds an ARP request for paddr: sender is our own l2/l3 address, the
+// target hardware address is the ethernet broadcast address.  The header is
+// converted with hton() before it is copied into the packet.
+template <typename L3>
+Packet arp_for<L3>::make_query_packet(l3addr paddr)
+{
+  arp_hdr query;
+  query.htype = ethernet::arp_hardware_type();
+  query.ptype = L3::arp_protocol_type();
+  query.hlen = sizeof(l2addr);
+  query.plen = sizeof(l3addr);
+  query.oper = op_request;
+  query.sender_hwaddr = l2self();
+  query.sender_paddr = _l3self;
+  query.target_hwaddr = ethernet::broadcast_address();
+  query.target_paddr = paddr;
+
+  query = query.hton();
+  return Packet(reinterpret_cast<char*>(&query), sizeof(query));
+}
+
+// Sends an ARP request for paddr to the ethernet broadcast address.
+template <typename L3>
+void arp_for<L3>::send_query(const l3addr& paddr)
+{
+  send(ethernet::broadcast_address(), make_query_packet(paddr));
+}
+
+// Records paddr -> hwaddr.  If a resolution for paddr was in progress, its
+// timer is cancelled, every waiting packet is handed to its callback with
+// status 0, and the pending entry is removed.
+template <typename L3>
+void arp_for<L3>::learn(l2addr hwaddr, l3addr paddr)
+{
+  _table[paddr] = hwaddr;
+
+  auto pending = _in_progress.find(paddr);
+  if (pending == _in_progress.end()) {
+    return;
+  }
+
+  resolution& res = pending->second;
+  center->delete_time_event(res.timeout_fd);
+  for (auto& waiter : res._waiters) {
+    waiter.first(hwaddr, std::move(waiter.second), 0);
+  }
+  _in_progress.erase(pending);
+}
+
+// Delivers p to cb once paddr is resolved.
+//
+// A known address is answered immediately with status 0.  Otherwise the
+// packet is parked; the first packet for an address also arms the
+// one-second timeout and sends the query.  When max_waiters packets are
+// already parked for the address, cb is called with -EBUSY instead.
+template <typename L3>
+void arp_for<L3>::wait(const l3addr& paddr, Packet p, resolution_cb cb)
+{
+  auto known = _table.find(paddr);
+  if (known != _table.end()) {
+    cb(known->second, std::move(p), 0);
+    return;
+  }
+
+  auto pending = _in_progress.find(paddr);
+  auto first_request = pending == _in_progress.end();
+  auto& res = first_request ? _in_progress[paddr] : pending->second;
+
+  if (first_request) {
+    res.timeout_fd = center->create_time_event(
+      1*1000*1000, new C_handle_arp_timeout(this, paddr, first_request));
+    send_query(paddr);
+  }
+
+  if (res._waiters.size() < max_waiters) {
+    res._waiters.emplace_back(cb, std::move(p));
+    return;
+  }
+
+  cb(ethernet_address(), std::move(p), -EBUSY);
+}
+
+// Handles one incoming ARP packet.  Packets without a full header or with
+// unexpected address lengths are ignored.  Requests go to handle_request(),
+// replies are reported to the interface via arp_learn(); everything else is
+// dropped.  Returns handle_request()'s result for requests, 0 otherwise.
+template <typename L3>
+int arp_for<L3>::received(Packet p)
+{
+  auto ah = p.get_header<arp_hdr>();
+  if (!ah) {
+    return 0;
+  }
+
+  auto h = ah->ntoh();
+  const bool lengths_ok =
+    h.hlen == sizeof(l2addr) && h.plen == sizeof(l3addr);
+  if (!lengths_ok) {
+    return 0;
+  }
+
+  if (h.oper == op_request) {
+    return handle_request(&h);
+  }
+  if (h.oper == op_reply) {
+    _arp._netif->arp_learn(h.sender_hwaddr, h.sender_paddr);
+  }
+  return 0;
+}
+
+// Answers an ARP request aimed at our own address.  The header is rewritten
+// in place into a reply (sender and target swapped, our addresses filled in)
+// and queued for sending.  Requests for other addresses, and any request
+// while our address is still the broadcast address, are ignored.
+// Always returns 0.
+template <typename L3>
+int arp_for<L3>::handle_request(arp_hdr* ah)
+{
+  const bool for_us =
+    ah->target_paddr == _l3self && _l3self != L3::broadcast_address();
+  if (!for_us) {
+    return 0;
+  }
+
+  ah->oper = op_reply;
+  ah->target_hwaddr = ah->sender_hwaddr;
+  ah->target_paddr = ah->sender_paddr;
+  ah->sender_hwaddr = l2self();
+  ah->sender_paddr = _l3self;
+  *ah = ah->hton();
+  send(ah->target_hwaddr, Packet(reinterpret_cast<char*>(ah), sizeof(*ah)));
+  return 0;
+}
+
+#endif /* CEPH_MSG_ARP_H_ */
diff --git a/src/msg/async/dpdk/DPDK.cc b/src/msg/async/dpdk/DPDK.cc
new file mode 100644
index 00000000..278efe9e
--- /dev/null
+++ b/src/msg/async/dpdk/DPDK.cc
@@ -0,0 +1,1267 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include <atomic>
+#include <vector>
+#include <queue>
+
+#include <rte_config.h>
+#include <rte_common.h>
+#include <rte_eal.h>
+#include <rte_pci.h>
+#include <rte_ethdev.h>
+#include <rte_cycles.h>
+#include <rte_memzone.h>
+
+#include "include/page.h"
+#include "align.h"
+#include "IP.h"
+#include "const.h"
+#include "dpdk_rte.h"
+#include "DPDK.h"
+#include "toeplitz.h"
+
+#include "common/Cycles.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdk "
+
+
+ // Expose a pool-private descriptor as the untyped cookie pointer that is
+ // passed as the init argument of rte_mempool_create().
+ void* as_cookie(struct rte_pktmbuf_pool_private& p) {
+   return static_cast<void*>(&p);
+ }
+
+ #ifndef MARKER
+ typedef void *MARKER[0]; /**< generic marker for a point in a structure */
+ #endif
+
+ /******************* Net device related constants *****************************/
+ // Number of descriptors requested for every Rx and Tx ring at queue setup.
+ static constexpr uint16_t default_ring_size = 512;
+
+ //
+ // We need 2 times the ring size of buffers because of the way PMDs
+ // refill the ring.
+ //
+ static constexpr uint16_t mbufs_per_queue_rx = 2 * default_ring_size;
+ // rx_gc() starts recycling once at least this many Rx segments are pending.
+ static constexpr uint16_t rx_gc_thresh = 64;
+
+ //
+ // No need to keep more descriptors in the air than can be sent in a single
+ // rte_eth_tx_burst() call.
+ //
+ static constexpr uint16_t mbufs_per_queue_tx = 2 * default_ring_size;
+
+ // Cache size passed to rte_mempool_create() for both the Rx and Tx pools.
+ static constexpr uint16_t mbuf_cache_size = 512;
+ //
+ // Size of the data buffer in the non-inline case.
+ //
+ // We may want to change (increase) this value in future, while the
+ // inline_mbuf_data_size value will unlikely change due to reasons described
+ // below.
+ //
+ static constexpr size_t mbuf_data_size = 4096;
+
+ // Per-mbuf bookkeeping bytes: the rte_mbuf header plus the packet headroom.
+ static constexpr uint16_t mbuf_overhead =
+ sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM;
+ //
+ // We'll allocate 2K data buffers for an inline case because this would require
+ // a single page per mbuf. If we used 4K data buffers here it would require 2
+ // pages for a single buffer (due to "mbuf_overhead") and this is a much more
+ // demanding memory constraint.
+ //
+ static constexpr size_t inline_mbuf_data_size = 2048;
+
+
+ // (INLINE_MBUF_DATA_SIZE(2K)*32 = 64K = Max TSO/LRO size) + 1 mbuf for headers
+ static constexpr uint8_t max_frags = 32 + 1;
+
+ //
+ // Intel's 40G NIC HW limit for a number of fragments in an xmit segment.
+ //
+ // See Chapter 8.4.1 "Transmit Packet in System Memory" of the xl710 devices
+ // spec. for more details.
+ //
+ static constexpr uint8_t i40e_max_xmit_segment_frags = 8;
+
+ //
+ // VMWare's virtual NIC limit for a number of fragments in an xmit segment.
+ //
+ // see drivers/net/vmxnet3/base/vmxnet3_defs.h VMXNET3_MAX_TXD_PER_PKT
+ //
+ static constexpr uint8_t vmxnet3_max_xmit_segment_frags = 16;
+
+ // Element size of the Tx (inline) mempool: data area plus mbuf overhead.
+ static constexpr uint16_t inline_mbuf_size = inline_mbuf_data_size + mbuf_overhead;
+
+ // Alignment unit used by qp_mempool_obj_size().
+ // NOTE(review): presumably 2MB, i.e. assumes CEPH_PAGE_SIZE is 4K - confirm.
+ static size_t huge_page_size = 512 * CEPH_PAGE_SIZE;
+
+ //
+ // Sum of the Rx and Tx mempool object sizes, each rounded up to
+ // huge_page_size.
+ //
+ // We will align each size to huge page size because DPDK allocates
+ // physically contiguous memory region for each pool object.
+ //
+ uint32_t qp_mempool_obj_size()
+ {
+   // Object size for one element size plus the pool-private area, rounded up
+   // to a whole number of huge pages. A fresh zeroed rte_mempool_objsz is
+   // used for every query.
+   auto aligned_obj_bytes = [](uint32_t elt_size) {
+     struct rte_mempool_objsz obj_sz = {};
+     return align_up(rte_mempool_calc_obj_size(elt_size, 0, &obj_sz) +
+                     sizeof(struct rte_pktmbuf_pool_private),
+                     huge_page_size);
+   };
+
+   uint32_t mp_size = 0;
+   mp_size += aligned_obj_bytes(mbuf_overhead);     // Rx
+   mp_size += aligned_obj_bytes(inline_mbuf_size);  // Tx
+   return mp_size;
+ }
+
+ // Common prefix of the per-queue mempool names; the queue id and an "_rx" or
+ // "_tx" suffix are appended where the pools are created.
+ static constexpr const char* pktmbuf_pool_name = "dpdk_net_pktmbuf_pool";
+
+ /*
+ * When doing reads from the NIC queues, use this batch size
+ * (the maximum number of mbufs requested per rte_eth_rx_burst() call).
+ */
+ static constexpr uint8_t packet_read_size = 32;
+/******************************************************************************/
+
+ //
+ // First stage of port bring-up.
+ //
+ // Queries the device capabilities, applies driver-specific workarounds,
+ // derives the Tx queue flags, the queue count, the RSS key and redirection
+ // table size and the HW offload feature flags, and finally applies the
+ // resulting configuration with rte_eth_dev_configure().
+ //
+ // Returns 0 on success, or the non-zero value reported by
+ // rte_eth_dev_configure() on failure. An unsupported RSS key size terminates
+ // the process through rte_exit().
+ //
+ int DPDKDevice::init_port_start()
+ {
+   ceph_assert(_port_idx < rte_eth_dev_count());
+
+   rte_eth_dev_info_get(_port_idx, &_dev_info);
+
+   // Build the driver name once; it is compared several times below.
+   const std::string driver_name(_dev_info.driver_name);
+
+   //
+   // This is a workaround for a missing handling of a HW limitation in the
+   // DPDK i40e driver. This and all related to _is_i40e_device code should be
+   // removed once this handling is added.
+   //
+   if (driver_name == "rte_i40evf_pmd" || driver_name == "rte_i40e_pmd") {
+     ldout(cct, 1) << __func__ << " Device is an Intel's 40G NIC. Enabling 8 fragments hack!" << dendl;
+     _is_i40e_device = true;
+   }
+
+   if (driver_name == "rte_vmxnet3_pmd") {
+     ldout(cct, 1) << __func__ << " Device is a VMWare Virtual NIC. Enabling 16 fragments hack!" << dendl;
+     _is_vmxnet3_device = true;
+   }
+
+   //
+   // Another workaround: this time for a lack of number of RSS bits.
+   // ixgbe PF NICs support up to 16 RSS queues.
+   // ixgbe VF NICs support up to 4 RSS queues.
+   // i40e PF NICs support up to 64 RSS queues.
+   // i40e VF NICs support up to 16 RSS queues.
+   //
+   if (driver_name == "rte_ixgbe_pmd") {
+     _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)16);
+   } else if (driver_name == "rte_ixgbevf_pmd") {
+     _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)4);
+   } else if (driver_name == "rte_i40e_pmd") {
+     _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)64);
+   } else if (driver_name == "rte_i40evf_pmd") {
+     _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)16);
+   }
+
+   // Clear txq_flags - we want to support all available offload features
+   // except for multi-mempool and refcnt'ing which we don't need
+   _dev_info.default_txconf.txq_flags =
+     ETH_TXQ_FLAGS_NOMULTMEMP | ETH_TXQ_FLAGS_NOREFCOUNT;
+
+   //
+   // Disable features that are not supported by port's HW
+   //
+   if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
+     _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
+   }
+
+   if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
+     _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
+   }
+
+   if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
+     _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
+   }
+
+   // The original performed this identical check twice; once is enough.
+   if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
+     _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
+   }
+
+   if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO)) {
+     _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
+   }
+
+   /* for port configuration all features are off by default */
+   rte_eth_conf port_conf = { 0 };
+
+   ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": max_rx_queues "
+                 << _dev_info.max_rx_queues << " max_tx_queues "
+                 << _dev_info.max_tx_queues << dendl;
+
+   _num_queues = std::min({_num_queues, _dev_info.max_rx_queues, _dev_info.max_tx_queues});
+
+   ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": using "
+                 << _num_queues << " queues" << dendl;
+
+   // Set RSS mode: enable RSS only when more than one queue is in use;
+   // with a single queue the port is configured without multi-queue Rx.
+   if (_num_queues > 1) {
+     if (_dev_info.hash_key_size == 40) {
+       _rss_key = default_rsskey_40bytes;
+     } else if (_dev_info.hash_key_size == 52) {
+       _rss_key = default_rsskey_52bytes;
+     } else if (_dev_info.hash_key_size != 0) {
+       // Neither a supported key size nor "unspecified": give up.
+       rte_exit(EXIT_FAILURE,
+                "Port %d: We support only 40 or 52 bytes RSS hash keys, %d bytes key requested",
+                _port_idx, _dev_info.hash_key_size);
+     } else {
+       // The device did not report a key size: fall back to the 40-byte key.
+       _rss_key = default_rsskey_40bytes;
+       _dev_info.hash_key_size = 40;
+     }
+
+     port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
+     port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
+     if (_dev_info.hash_key_size) {
+       port_conf.rx_adv_conf.rss_conf.rss_key = const_cast<uint8_t *>(_rss_key.data());
+       port_conf.rx_adv_conf.rss_conf.rss_key_len = _dev_info.hash_key_size;
+     }
+   } else {
+     port_conf.rxmode.mq_mode = ETH_MQ_RX_NONE;
+   }
+
+   if (_num_queues > 1) {
+     if (_dev_info.reta_size) {
+       // RETA size should be a power of 2
+       ceph_assert((_dev_info.reta_size & (_dev_info.reta_size - 1)) == 0);
+
+       // Set the RSS table to the correct size
+       _redir_table.resize(_dev_info.reta_size);
+       _rss_table_bits = std::lround(std::log2(_dev_info.reta_size));
+       ldout(cct, 5) << __func__ << " Port " << int(_port_idx)
+                     << ": RSS table size is " << _dev_info.reta_size << dendl;
+     } else {
+       // FIXME: same with sw_reta
+       _redir_table.resize(128);
+       _rss_table_bits = std::lround(std::log2(128));
+     }
+   } else {
+     _redir_table.push_back(0);
+   }
+
+   // Set Rx VLAN stripping
+   if (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
+     port_conf.rxmode.hw_vlan_strip = 1;
+   }
+
+   // Enable HW CRC stripping
+   port_conf.rxmode.hw_strip_crc = 1;
+
+ #ifdef RTE_ETHDEV_HAS_LRO_SUPPORT
+   // Enable LRO
+   if (_use_lro && (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO)) {
+     ldout(cct, 1) << __func__ << " LRO is on" << dendl;
+     port_conf.rxmode.enable_lro = 1;
+     _hw_features.rx_lro = true;
+   } else
+ #endif
+     ldout(cct, 1) << __func__ << " LRO is off" << dendl;
+
+   // Check that all CSUM features are either all set all together or not set
+   // all together. If this assumption breaks we need to rework the below logic
+   // by splitting the csum offload feature bit into separate bits for IPv4,
+   // TCP.
+   ceph_assert(((_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
+                (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) ||
+               (!(_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
+                !(_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)));
+
+   // Set Rx checksum checking
+   if ((_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
+       (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
+     ldout(cct, 1) << __func__ << " RX checksum offload supported" << dendl;
+     port_conf.rxmode.hw_ip_checksum = 1;
+     _hw_features.rx_csum_offload = 1;
+   }
+
+   if ((_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
+     ldout(cct, 1) << __func__ << " TX ip checksum offload supported" << dendl;
+     _hw_features.tx_csum_ip_offload = 1;
+   }
+
+   // TSO is supported starting from DPDK v1.8
+   if (_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) {
+     ldout(cct, 1) << __func__ << " TSO is supported" << dendl;
+     _hw_features.tx_tso = 1;
+   }
+
+   // The Tx L4 checksum feature bit is driven by the TCP checksum capability
+   // alone. (The original asserted "TCP_CKSUM || !TCP_CKSUM" here, which is
+   // always true and therefore checked nothing; it has been dropped.)
+   if (_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) {
+     ldout(cct, 1) << __func__ << " TX TCP checksum offload supported" << dendl;
+     _hw_features.tx_csum_l4_offload = 1;
+   }
+
+   int retval;
+
+   ldout(cct, 1) << __func__ << " Port " << int(_port_idx) << " init ... " << dendl;
+
+   /*
+    * Standard DPDK port initialisation - config port, then set up
+    * rx and tx rings.
+    */
+   if ((retval = rte_eth_dev_configure(_port_idx, _num_queues, _num_queues,
+                                       &port_conf)) != 0) {
+     lderr(cct) << __func__ << " failed to configure port " << (int)_port_idx
+                << " rx/tx queues " << _num_queues << " error " << cpp_strerror(retval) << dendl;
+     return retval;
+   }
+
+   //rte_eth_promiscuous_enable(port_num);
+   ldout(cct, 1) << __func__ << " done." << dendl;
+
+   return 0;
+ }
+
+ //
+ // Switch the port's HW flow control fully on or fully off according to
+ // _enable_fc.
+ //
+ // The current settings are read first and only the mode is changed. If the
+ // driver reports -ENOTSUP for either the read or the write, this is logged
+ // and the function returns without changing anything. Any other negative
+ // result aborts the process.
+ //
+ void DPDKDevice::set_hw_flow_control()
+ {
+   // Read the port's current/default flow control settings
+   struct rte_eth_fc_conf fc_conf;
+   auto rc = rte_eth_dev_flow_ctrl_get(_port_idx, &fc_conf);
+
+   if (rc == -ENOTSUP) {
+     ldout(cct, 1) << __func__ << " port " << int(_port_idx)
+                   << ": not support to get hardware flow control settings: " << rc << dendl;
+     ldout(cct, 1) << __func__ << " port " << int(_port_idx) << ": changing HW FC settings is not supported" << dendl;
+     return;
+   }
+   if (rc < 0) {
+     lderr(cct) << __func__ << " port " << int(_port_idx)
+                << ": failed to get hardware flow control settings: " << rc << dendl;
+     ceph_abort();
+   }
+
+   fc_conf.mode = _enable_fc ? RTE_FC_FULL : RTE_FC_NONE;
+
+   rc = rte_eth_dev_flow_ctrl_set(_port_idx, &fc_conf);
+   if (rc == -ENOTSUP) {
+     ldout(cct, 1) << __func__ << " port " << int(_port_idx)
+                   << ": not support to set hardware flow control settings: " << rc << dendl;
+     ldout(cct, 1) << __func__ << " port " << int(_port_idx) << ": changing HW FC settings is not supported" << dendl;
+     return;
+   }
+   if (rc < 0) {
+     lderr(cct) << __func__ << " port " << int(_port_idx)
+                << ": failed to set hardware flow control settings: " << rc << dendl;
+     ceph_abort();
+   }
+
+   ldout(cct, 1) << __func__ << " port " << int(_port_idx) << ": HW FC " << _enable_fc << dendl;
+ }
+
+ //
+ // Second stage of port bring-up: apply flow control, start the port, set up
+ // the RSS hash function and redirection table when more than one queue is
+ // used, and wait for the link to come up.
+ //
+ // Returns 0 on success and -1 if the port cannot be started, the hash
+ // function cannot be set, or the link does not come up.
+ //
+ int DPDKDevice::init_port_fini()
+ {
+   // Changing FC requires HW reset, so set it before the port is started.
+   set_hw_flow_control();
+
+   // The port index is cast to int in every log line below, matching the rest
+   // of this file, so that it is printed as a number.
+   if (rte_eth_dev_start(_port_idx) != 0) {
+     lderr(cct) << __func__ << " can't start port " << int(_port_idx) << dendl;
+     return -1;
+   }
+
+   if (_num_queues > 1) {
+     // rte_eth_dev_filter_supported() returns 0 when the filter type is
+     // supported, hence the negation.
+     if (!rte_eth_dev_filter_supported(_port_idx, RTE_ETH_FILTER_HASH)) {
+       ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": HASH FILTER configuration is supported" << dendl;
+
+       // Setup HW to use the TOEPLITZ hash function as an RSS hash function
+       struct rte_eth_hash_filter_info info = {};
+
+       info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
+       info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;
+
+       if (rte_eth_dev_filter_ctrl(_port_idx, RTE_ETH_FILTER_HASH,
+                                   RTE_ETH_FILTER_SET, &info) < 0) {
+         lderr(cct) << __func__ << " cannot set hash function on a port " << int(_port_idx) << dendl;
+         return -1;
+       }
+     }
+
+     set_rss_table();
+   }
+
+   // Wait for a link
+   if (check_port_link_status() < 0) {
+     lderr(cct) << __func__ << " port link up failed " << int(_port_idx) << dendl;
+     return -1;
+   }
+
+   ldout(cct, 5) << __func__ << " created DPDK device" << dendl;
+   return 0;
+ }
+
+ //
+ // Prepare this queue pair to send packets on behalf of the CPUs listed in
+ // `cpu_weights` (CPU id -> weight; must not be empty).
+ //
+ // Unless the only entry is this queue itself, a packet provider that drains
+ // _proxy_packetq is registered and the software redirection table is built
+ // from the weights.
+ //
+ void DPDKQueuePair::configure_proxies(const std::map<unsigned, float>& cpu_weights) {
+   ceph_assert(!cpu_weights.empty());
+
+   // special case queue sending to self only, to avoid requiring a hash value
+   const bool self_only =
+     cpu_weights.size() == 1 && cpu_weights.begin()->first == _qid;
+   if (self_only) {
+     return;
+   }
+
+   register_packet_provider([this] {
+     // An empty Tub signals "nothing to send".
+     Tub<Packet> next;
+     if (_proxy_packetq.empty()) {
+       return next;
+     }
+     next = std::move(_proxy_packetq.front());
+     _proxy_packetq.pop_front();
+     return next;
+   });
+   build_sw_reta(cpu_weights);
+ }
+
+ //
+ // Build the 128-entry software redirection table from `cpu_weights`
+ // (CPU id -> weight) and store it in _sw_reta.
+ //
+ // CPUs are visited in map order and each one receives a contiguous run of
+ // slots whose length is proportional to its share of the total weight.
+ //
+ void DPDKQueuePair::build_sw_reta(const std::map<unsigned, float>& cpu_weights) {
+   float total_weight = 0;
+   for (auto&& x : cpu_weights) {
+     total_weight += x.second;
+   }
+   float accum = 0;
+   unsigned idx = 0;
+   // Value-initialise the table: if the fill loop below never runs (for
+   // example when the total weight is zero, so the bound is NaN), the slots
+   // hold 0 instead of indeterminate values.
+   std::array<uint8_t, 128> reta{};
+   for (auto&& entry : cpu_weights) {
+     auto cpu = entry.first;
+     auto weight = entry.second;
+     accum += weight;
+     // Fill up to this CPU's cumulative share, rounded to the nearest slot.
+     while (idx < (accum / total_weight * reta.size() - 0.5)) {
+       reta[idx++] = cpu;
+     }
+   }
+   _sw_reta = reta;
+ }
+
+
+ //
+ // Look up this queue's Rx mbuf pool and, if it does not exist yet, create
+ // it, reserve a memzone with additional data buffers (recorded in
+ // _alloc_bufs) and set up the Rx queue.
+ //
+ // Returns true when the pool is available, false if the pool cannot be
+ // created or the Rx queue cannot be initialised. A failed memzone
+ // reservation trips ceph_assert().
+ //
+ bool DPDKQueuePair::init_rx_mbuf_pool()
+ {
+   std::string name = std::string(pktmbuf_pool_name) + std::to_string(_qid) + "_rx";
+
+   // reserve the memory for Rx buffers containers
+   _rx_free_pkts.reserve(mbufs_per_queue_rx);
+   _rx_free_bufs.reserve(mbufs_per_queue_rx);
+
+   _pktmbuf_pool_rx = rte_mempool_lookup(name.c_str());
+   if (!_pktmbuf_pool_rx) {
+     ldout(cct, 1) << __func__ << " Creating Rx mbuf pool '" << name.c_str()
+                   << "' [" << mbufs_per_queue_rx << " mbufs] ..."<< dendl;
+
+     //
+     // Don't pass single-producer/single-consumer flags to mbuf create as it
+     // seems faster to use a cache instead.
+     //
+     struct rte_pktmbuf_pool_private roomsz = {};
+     roomsz.mbuf_data_room_size = mbuf_data_size + RTE_PKTMBUF_HEADROOM;
+     _pktmbuf_pool_rx = rte_mempool_create(
+         name.c_str(),
+         mbufs_per_queue_rx, mbuf_overhead + mbuf_data_size,
+         mbuf_cache_size,
+         sizeof(struct rte_pktmbuf_pool_private),
+         rte_pktmbuf_pool_init, as_cookie(roomsz),
+         rte_pktmbuf_init, nullptr,
+         rte_socket_id(), 0);
+     if (!_pktmbuf_pool_rx) {
+       lderr(cct) << __func__ << " Failed to create mempool for rx" << dendl;
+       return false;
+     }
+
+     //
+     // allocate more data buffer
+     //
+     // NOTE(review): bufs_count is not checked for being positive; this
+     // presumably relies on ms_dpdk_rx_buffer_count_per_core being larger
+     // than mbufs_per_queue_rx - confirm.
+     int bufs_count = cct->_conf->ms_dpdk_rx_buffer_count_per_core - mbufs_per_queue_rx;
+     int mz_flags = RTE_MEMZONE_1GB|RTE_MEMZONE_SIZE_HINT_ONLY;
+     std::string mz_name = "rx_buffer_data" + std::to_string(_qid);
+     const struct rte_memzone *mz = rte_memzone_reserve_aligned(mz_name.c_str(),
+         mbuf_data_size*bufs_count, _pktmbuf_pool_rx->socket_id, mz_flags, mbuf_data_size);
+     ceph_assert(mz);
+     // Step through the zone with a char* cursor: arithmetic on void* is a
+     // compiler extension, not standard C++.
+     char* m = static_cast<char*>(mz->addr);
+     for (int i = 0; i < bufs_count; i++) {
+       ceph_assert(m);
+       _alloc_bufs.push_back(m);
+       m += mbuf_data_size;
+     }
+
+     if (rte_eth_rx_queue_setup(_dev_port_idx, _qid, default_ring_size,
+                                rte_eth_dev_socket_id(_dev_port_idx),
+                                _dev->def_rx_conf(), _pktmbuf_pool_rx) < 0) {
+       lderr(cct) << __func__ << " cannot initialize rx queue" << dendl;
+       return false;
+     }
+   }
+
+   return _pktmbuf_pool_rx != nullptr;
+ }
+
+ //
+ // Poll the port's link state until the link is up.
+ //
+ // The state is sampled without blocking; between samples the thread sleeps
+ // for sleep_time microseconds, at most max_check_time times.
+ //
+ // Returns 0 once the link is up, -1 if it is still down after the last
+ // retry.
+ //
+ int DPDKDevice::check_port_link_status()
+ {
+   int count = 0;
+
+   ldout(cct, 20) << __func__ << dendl;
+   const int sleep_time = 100 * 1000;
+   const int max_check_time = 90;  /* 9s (90 * 100ms) in total */
+   while (true) {
+     struct rte_eth_link link = {};
+     rte_eth_link_get_nowait(_port_idx, &link);
+
+     if (link.link_status) {
+       // No newline inside the duplex token: dendl terminates the entry.
+       ldout(cct, 5) << __func__ << " done port "
+                     << static_cast<unsigned>(_port_idx)
+                     << " link Up - speed " << link.link_speed
+                     << " Mbps - "
+                     << ((link.link_duplex == ETH_LINK_FULL_DUPLEX) ? ("full-duplex") : ("half-duplex"))
+                     << dendl;
+       break;
+     } else if (count++ < max_check_time) {
+       ldout(cct, 20) << __func__ << " not ready, continue to wait." << dendl;
+       usleep(sleep_time);
+     } else {
+       // Print the port index as a number, as the success path does.
+       lderr(cct) << __func__ << " done port " << static_cast<unsigned>(_port_idx) << " link down" << dendl;
+       return -1;
+     }
+   }
+   return 0;
+ }
+
+ // Timer callback that triggers a device statistics refresh on the queue
+ // pair it was created for.
+ class C_handle_dev_stats : public EventCallback {
+   DPDKQueuePair *_queue_pair;
+
+  public:
+   C_handle_dev_stats(DPDKQueuePair *qp) : _queue_pair(qp) {}
+
+   // The event id is not used.
+   void do_request(uint64_t) {
+     _queue_pair->handle_stats();
+   }
+ };
+
+ //
+ // Construct the queue pair `qid` of device `dev`.
+ //
+ // Initialises the Rx mbuf pool (aborting the process on failure), registers
+ // the per-queue perf counters under the name "queue<qid>", and - on queue 0
+ // only - schedules the first device statistics timer event.
+ //
+ DPDKQueuePair::DPDKQueuePair(CephContext *c, EventCenter *cen, DPDKDevice* dev, uint8_t qid)
+   : cct(c), _dev(dev), _dev_port_idx(dev->port_idx()), center(cen), _qid(qid),
+     _tx_poller(this), _rx_gc_poller(this), _tx_buf_factory(c, dev, qid),
+     _tx_gc_poller(this)
+ {
+   if (!init_rx_mbuf_pool()) {
+     lderr(cct) << __func__ << " cannot initialize mbuf pools" << dendl;
+     ceph_abort();
+   }
+
+   static_assert(offsetof(tx_buf, private_end) -
+                 offsetof(tx_buf, private_start) <= RTE_PKTMBUF_HEADROOM,
+                 "RTE_PKTMBUF_HEADROOM is less than DPDKQueuePair::tx_buf size! "
+                 "Increase the headroom size in the DPDK configuration");
+   static_assert(offsetof(tx_buf, _mbuf) == 0,
+                 "There is a pad at the beginning of the tx_buf before _mbuf "
+                 "field!");
+   static_assert((inline_mbuf_data_size & (inline_mbuf_data_size - 1)) == 0,
+                 "inline_mbuf_data_size has to be a power of two!");
+
+   std::string name(std::string("queue") + std::to_string(qid));
+   PerfCountersBuilder plb(cct, name, l_dpdk_qp_first, l_dpdk_qp_last);
+
+   // Counter names are unchanged; only the misspelled "sendd" in the
+   // human-readable descriptions has been corrected to "sent".
+   plb.add_u64_counter(l_dpdk_qp_rx_packets, "dpdk_receive_packets", "DPDK received packets");
+   plb.add_u64_counter(l_dpdk_qp_tx_packets, "dpdk_send_packets", "DPDK sent packets");
+   plb.add_u64_counter(l_dpdk_qp_rx_bad_checksum_errors, "dpdk_receive_bad_checksum_errors", "DPDK received bad checksum packets");
+   plb.add_u64_counter(l_dpdk_qp_rx_no_memory_errors, "dpdk_receive_no_memory_errors", "DPDK received no memory packets");
+   plb.add_u64_counter(l_dpdk_qp_rx_bytes, "dpdk_receive_bytes", "DPDK received bytes", NULL, 0, unit_t(UNIT_BYTES));
+   plb.add_u64_counter(l_dpdk_qp_tx_bytes, "dpdk_send_bytes", "DPDK sent bytes", NULL, 0, unit_t(UNIT_BYTES));
+   plb.add_u64_counter(l_dpdk_qp_rx_last_bunch, "dpdk_receive_last_bunch", "DPDK last received bunch");
+   plb.add_u64_counter(l_dpdk_qp_tx_last_bunch, "dpdk_send_last_bunch", "DPDK last send bunch");
+   plb.add_u64_counter(l_dpdk_qp_rx_fragments, "dpdk_receive_fragments", "DPDK received total fragments");
+   plb.add_u64_counter(l_dpdk_qp_tx_fragments, "dpdk_send_fragments", "DPDK sent total fragments");
+   plb.add_u64_counter(l_dpdk_qp_rx_copy_ops, "dpdk_receive_copy_ops", "DPDK received copy operations");
+   plb.add_u64_counter(l_dpdk_qp_tx_copy_ops, "dpdk_send_copy_ops", "DPDK sent copy operations");
+   plb.add_u64_counter(l_dpdk_qp_rx_copy_bytes, "dpdk_receive_copy_bytes", "DPDK received copy bytes", NULL, 0, unit_t(UNIT_BYTES));
+   plb.add_u64_counter(l_dpdk_qp_tx_copy_bytes, "dpdk_send_copy_bytes", "DPDK send copy bytes", NULL, 0, unit_t(UNIT_BYTES));
+   plb.add_u64_counter(l_dpdk_qp_rx_linearize_ops, "dpdk_receive_linearize_ops", "DPDK received linearize operations");
+   plb.add_u64_counter(l_dpdk_qp_tx_linearize_ops, "dpdk_send_linearize_ops", "DPDK send linearize operations");
+   plb.add_u64_counter(l_dpdk_qp_tx_queue_length, "dpdk_send_queue_length", "DPDK send queue length");
+
+   perf_logger = plb.create_perf_counters();
+   cct->get_perfcounters_collection()->add(perf_logger);
+
+   // Only queue 0 polls the device-wide statistics.
+   if (!_qid)
+     device_stat_time_fd = center->create_time_event(1000*1000, new C_handle_dev_stats(this));
+ }
+
+ //
+ // Timer handler: copy the port statistics reported by rte_eth_stats_get()
+ // into the device's perf counters and schedule the next run.
+ //
+ // NOTE(review): when reading the statistics fails, the function returns
+ // without scheduling another timer event, so the periodic refresh stops -
+ // confirm this is intended.
+ //
+ void DPDKQueuePair::handle_stats()
+ {
+   ldout(cct, 20) << __func__ << " started." << dendl;
+
+   rte_eth_stats port_stats = {};
+   int err = rte_eth_stats_get(_dev_port_idx, &port_stats);
+   if (err) {
+     ldout(cct, 0) << __func__ << " failed to get port statistics: " << cpp_strerror(err) << dendl;
+     return;
+   }
+
+   auto dev_counters = _dev->perf_logger;
+ #if RTE_VERSION < RTE_VERSION_NUM(16,7,0,0)
+   dev_counters->set(l_dpdk_dev_rx_mcast, port_stats.imcasts);
+   dev_counters->set(l_dpdk_dev_rx_badcrc_errors, port_stats.ibadcrc);
+ #endif
+   dev_counters->set(l_dpdk_dev_rx_dropped_errors, port_stats.imissed);
+   dev_counters->set(l_dpdk_dev_rx_nombuf_errors, port_stats.rx_nombuf);
+   dev_counters->set(l_dpdk_dev_rx_total_errors, port_stats.ierrors);
+   dev_counters->set(l_dpdk_dev_tx_total_errors, port_stats.oerrors);
+
+   // Re-arm the timer for the next refresh.
+   device_stat_time_fd = center->create_time_event(1000*1000, new C_handle_dev_stats(this));
+ }
+
+ //
+ // One Tx polling step: top up the send queue from the registered packet
+ // providers when it is running low, then hand the queue to send().
+ //
+ // Returns true if the send queue was non-empty (send() was called), false
+ // if there was nothing to transmit.
+ //
+ bool DPDKQueuePair::poll_tx() {
+ bool nonloopback = !cct->_conf->ms_dpdk_debug_allow_loopback;
+ #ifdef CEPH_PERF_DEV
+ uint64_t start = Cycles::rdtsc();
+ #endif
+ uint32_t total_work = 0;
+ // Only refill when fewer than 16 packets are queued.
+ if (_tx_packetq.size() < 16) {
+ // refill send queue from upper layers
+ uint32_t work;
+ do {
+ work = 0;
+ // Ask every provider for at most one packet per pass.
+ for (auto&& pr : _pkt_providers) {
+ auto p = pr();
+ if (p) {
+ work++;
+ if (likely(nonloopback)) {
+ // ldout(cct, 0) << __func__ << " len: " << p->len() << " frags: " << p->nr_frags() << dendl;
+ _tx_packetq.push_back(std::move(*p));
+ } else {
+ // Debug loopback mode: a frame whose destination MAC equals its source
+ // MAC is delivered straight to the receive path instead of being queued.
+ auto th = p->get_header<eth_hdr>(0);
+ if (th->dst_mac == th->src_mac) {
+ _dev->l2receive(_qid, std::move(*p));
+ } else {
+ _tx_packetq.push_back(std::move(*p));
+ }
+ }
+ // Stop this pass once the queue holds 128 packets; the loop condition
+ // below then ends the refill.
+ if (_tx_packetq.size() == 128) {
+ break;
+ }
+ }
+ }
+ total_work += work;
+ // Continue while the last pass produced packets, fewer than 256 packets
+ // were pulled in total, and the queue is below 128 packets.
+ } while (work && total_work < 256 && _tx_packetq.size() < 128);
+ }
+ if (!_tx_packetq.empty()) {
+ // c is the value returned by send(); it is added to the packet counter.
+ uint64_t c = send(_tx_packetq);
+ perf_logger->inc(l_dpdk_qp_tx_packets, c);
+ perf_logger->set(l_dpdk_qp_tx_last_bunch, c);
+ #ifdef CEPH_PERF_DEV
+ tx_count += total_work;
+ tx_cycles += Cycles::rdtsc() - start;
+ #endif
+ return true;
+ }
+
+ return false;
+ }
+
+ //
+ // Build a multi-fragment Packet from the mbuf chain starting at `m`.
+ //
+ // Each segment contributes one fragment that points at the segment's data.
+ // The packet's deleter pushes every segment's data pointer onto _alloc_bufs.
+ //
+ inline Tub<Packet> DPDKQueuePair::from_mbuf_lro(rte_mbuf* m)
+ {
+ // _frags and _bufs are member scratch vectors; start from empty.
+ _frags.clear();
+ _bufs.clear();
+
+ for (; m != nullptr; m = m->next) {
+ char* data = rte_pktmbuf_mtod(m, char*);
+
+ _frags.emplace_back(fragment{data, rte_pktmbuf_data_len(m)});
+ _bufs.push_back(data);
+ }
+
+ // The pointer list is moved into the bound deleter, so the deleter owns its
+ // own copy; _bufs is left moved-from and is cleared again on the next call.
+ auto del = std::bind(
+ [this](std::vector<char*> &bufs) {
+ for (auto&& b : bufs) { _alloc_bufs.push_back(b); }
+ }, std::move(_bufs));
+ return Packet(
+ _frags.begin(), _frags.end(), make_deleter(std::move(del)));
+ }
+
+ //
+ // Wrap a received mbuf in a Packet without copying its data.
+ //
+ // The mbuf is recorded in _rx_free_pkts and its segment count is added to
+ // _num_rx_free_segs. A chained mbuf on an LRO-enabled device goes through
+ // from_mbuf_lro(); otherwise a single-fragment Packet is returned whose
+ // deleter pushes the data pointer onto _alloc_bufs.
+ //
+ inline Tub<Packet> DPDKQueuePair::from_mbuf(rte_mbuf* m)
+ {
+   _rx_free_pkts.push_back(m);
+   _num_rx_free_segs += m->nb_segs;
+
+   const bool chained_lro_packet =
+     _dev->hw_features_ref().rx_lro && !rte_pktmbuf_is_contiguous(m);
+   if (chained_lro_packet) {
+     return from_mbuf_lro(m);
+   }
+
+   char* payload = rte_pktmbuf_mtod(m, char*);
+   return Packet(fragment{payload, rte_pktmbuf_data_len(m)},
+                 make_deleter([this, payload] { _alloc_bufs.push_back(payload); }));
+ }
+
+ //
+ // Give every segment of the mbuf chain starting at `head` a fresh data
+ // buffer from _alloc_bufs and queue the segment on _rx_free_bufs.
+ //
+ // Returns true when the whole chain was refilled. On the first failure the
+ // remaining part of the chain is pushed back onto _rx_free_pkts for a later
+ // retry and false is returned.
+ //
+ inline bool DPDKQueuePair::refill_one_cluster(rte_mbuf* head)
+ {
+   rte_mbuf* seg = head;
+   while (seg != nullptr) {
+     if (!refill_rx_mbuf(seg, mbuf_data_size, _alloc_bufs)) {
+       // No buffer available: keep the unprocessed tail for the next attempt.
+       _rx_free_pkts.push_back(seg);
+       return false;
+     }
+     _rx_free_bufs.push_back(seg);
+     seg = seg->next;
+   }
+
+   return true;
+ }
+
+ //
+ // Recycle received mbufs: refill the segments of the packets queued on
+ // _rx_free_pkts with fresh data buffers and return them to the Rx mempool.
+ //
+ // Runs when at least rx_gc_thresh segments are pending or when `force` is
+ // set. Returns true if, afterwards, the number of pending segments is still
+ // at or above the threshold.
+ //
+ bool DPDKQueuePair::rx_gc(bool force)
+ {
+ if (_num_rx_free_segs >= rx_gc_thresh || force) {
+ ldout(cct, 10) << __func__ << " free segs " << _num_rx_free_segs
+ << " thresh " << rx_gc_thresh
+ << " free pkts " << _rx_free_pkts.size()
+ << dendl;
+
+ while (!_rx_free_pkts.empty()) {
+ //
+ // Use back() + pop_back() semantics to avoid an extra
+ // _rx_free_pkts.clear() at the end of the function - clear() has a
+ // linear complexity.
+ //
+ auto m = _rx_free_pkts.back();
+ _rx_free_pkts.pop_back();
+
+ // On failure refill_one_cluster() has already pushed the unprocessed part
+ // of the chain back onto _rx_free_pkts; stop and retry on a later call.
+ if (!refill_one_cluster(m)) {
+ ldout(cct, 1) << __func__ << " get new mbuf failed " << dendl;
+ break;
+ }
+ }
+ for (auto&& m : _rx_free_bufs) {
+ rte_pktmbuf_prefree_seg(m);
+ }
+
+ if (_rx_free_bufs.size()) {
+ // Return all refilled segments to the Rx pool in a single bulk call.
+ rte_mempool_put_bulk(_pktmbuf_pool_rx,
+ (void **)_rx_free_bufs.data(),
+ _rx_free_bufs.size());
+
+ // TODO: ceph_assert() in a fast path! Remove me ASAP!
+ ceph_assert(_num_rx_free_segs >= _rx_free_bufs.size());
+
+ _num_rx_free_segs -= _rx_free_bufs.size();
+ _rx_free_bufs.clear();
+
+ // Invariant: pending packets and pending segments are either both empty
+ // or both non-empty.
+ // TODO: ceph_assert() in a fast path! Remove me ASAP!
+ ceph_assert((_rx_free_pkts.empty() && !_num_rx_free_segs) ||
+ (!_rx_free_pkts.empty() && _num_rx_free_segs));
+ }
+ }
+
+ return _num_rx_free_segs >= rx_gc_thresh;
+ }
+
+
+ //
+ // Convert `count` received mbufs from `bufs` into Packets and pass them to
+ // the device's L2 receive path, updating the Rx perf counters.
+ //
+ // Packets that cannot be translated, or that the HW flagged with a bad
+ // IP/L4 checksum (when Rx checksum offload is enabled), are dropped and
+ // counted in the corresponding error counter.
+ //
+ void DPDKQueuePair::process_packets(
+ struct rte_mbuf **bufs, uint16_t count)
+ {
+ uint64_t nr_frags = 0, bytes = 0;
+
+ for (uint16_t i = 0; i < count; i++) {
+ struct rte_mbuf *m = bufs[i];
+ offload_info oi;
+
+ Tub<Packet> p = from_mbuf(m);
+
+ // Drop the packet if translation above has failed
+ if (!p) {
+ perf_logger->inc(l_dpdk_qp_rx_no_memory_errors);
+ continue;
+ }
+ // ldout(cct, 0) << __func__ << " len " << p->len() << " " << dendl;
+
+ // Note: fragments and bytes are accumulated before the checksum check
+ // below, so packets dropped there are still included in these totals.
+ nr_frags += m->nb_segs;
+ bytes += m->pkt_len;
+
+ // Set stripped VLAN value if available
+ if ((_dev->_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) &&
+ (m->ol_flags & PKT_RX_VLAN_STRIPPED)) {
+ oi.vlan_tci = m->vlan_tci;
+ }
+
+ if (_dev->get_hw_features().rx_csum_offload) {
+ if (m->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
+ // Packet with bad checksum, just drop it.
+ perf_logger->inc(l_dpdk_qp_rx_bad_checksum_errors);
+ continue;
+ }
+ // Note that when _hw_features.rx_csum_offload is on, the receive
+ // code for ip, tcp and udp will assume they don't need to check
+ // the checksum again, because we did this here.
+ }
+
+ p->set_offload_info(oi);
+ // Forward the HW-computed RSS hash when the mbuf carries one.
+ if (m->ol_flags & PKT_RX_RSS_HASH) {
+ p->set_rss_hash(m->hash.rss);
+ }
+
+ _dev->l2receive(_qid, std::move(*p));
+ }
+
+ // The packet counters use the full burst size, including dropped packets.
+ perf_logger->inc(l_dpdk_qp_rx_packets, count);
+ perf_logger->set(l_dpdk_qp_rx_last_bunch, count);
+ perf_logger->inc(l_dpdk_qp_rx_fragments, nr_frags);
+ perf_logger->inc(l_dpdk_qp_rx_bytes, bytes);
+ }
+
+ //
+ // One Rx polling step: read a burst of up to packet_read_size mbufs from
+ // this queue and process them.
+ //
+ // Returns true if at least one packet was received.
+ //
+ // With CEPH_PERF_DEV defined, the cycles spent receiving are accumulated
+ // and, on an idle poll after more than 10000 received packets, the average
+ // Rx/Tx cost per packet is logged and the accumulators are reset.
+ //
+ bool DPDKQueuePair::poll_rx_once()
+ {
+   struct rte_mbuf *buf[packet_read_size];
+
+   /* read a port */
+ #ifdef CEPH_PERF_DEV
+   uint64_t start = Cycles::rdtsc();
+ #endif
+   uint16_t count = rte_eth_rx_burst(_dev_port_idx, _qid,
+                                     buf, packet_read_size);
+
+   /* Now process the NIC packets read */
+   if (likely(count > 0)) {
+     process_packets(buf, count);
+ #ifdef CEPH_PERF_DEV
+     // Accumulate (not overwrite): the average logged below divides the
+     // cycle total by the accumulated rx_count.
+     rx_cycles += Cycles::rdtsc() - start;
+     rx_count += count;
+ #endif
+   }
+ #ifdef CEPH_PERF_DEV
+   else {
+     if (rx_count > 10000 && tx_count) {
+       ldout(cct, 0) << __func__ << " rx count=" << rx_count << " avg rx=" << Cycles::to_nanoseconds(rx_cycles)/rx_count << "ns "
+                     << " tx count=" << tx_count << " avg tx=" << Cycles::to_nanoseconds(tx_cycles)/tx_count << "ns"
+                     << dendl;
+       rx_count = rx_cycles = tx_count = tx_cycles = 0;
+     }
+   }
+ #endif
+
+   return count;
+ }
+
+ //
+ // Set up the Tx buffer factory for queue `qid` of device `dev`.
+ //
+ // Looks up the queue's Tx mbuf pool; if it does not exist yet, the pool is
+ // created and the Tx queue is set up (either failure aborts the process).
+ // Finally the factory is filled from the pool via init_factory().
+ //
+ DPDKQueuePair::tx_buf_factory::tx_buf_factory(CephContext *c,
+ DPDKDevice *dev, uint8_t qid): cct(c)
+ {
+ std::string name = std::string(pktmbuf_pool_name) + std::to_string(qid) + "_tx";
+
+ _pool = rte_mempool_lookup(name.c_str());
+ if (!_pool) {
+ ldout(cct, 0) << __func__ << " Creating Tx mbuf pool '" << name.c_str()
+ << "' [" << mbufs_per_queue_tx << " mbufs] ..." << dendl;
+ //
+ // We are going to push the buffers from the mempool into
+ // the circular_buffer and then poll them from there anyway, so
+ // we prefer to make a mempool non-atomic in this case.
+ //
+ // NOTE(review): the last argument below is 0, and the DPDK API reference
+ // lists that position as the flags, so no single-producer/single-consumer
+ // flags appear to be requested despite the comment above - confirm intent.
+ //
+ _pool = rte_mempool_create(name.c_str(),
+ mbufs_per_queue_tx, inline_mbuf_size,
+ mbuf_cache_size,
+ sizeof(struct rte_pktmbuf_pool_private),
+ rte_pktmbuf_pool_init, nullptr,
+ rte_pktmbuf_init, nullptr,
+ rte_socket_id(), 0);
+
+ if (!_pool) {
+ lderr(cct) << __func__ << " Failed to create mempool for Tx" << dendl;
+ ceph_abort();
+ }
+ // The Tx queue is only set up on the path that created the pool.
+ if (rte_eth_tx_queue_setup(dev->port_idx(), qid, default_ring_size,
+ rte_eth_dev_socket_id(dev->port_idx()),
+ dev->def_tx_conf()) < 0) {
+ lderr(cct) << __func__ << " cannot initialize tx queue" << dendl;
+ ceph_abort();
+ }
+ }
+
+ //
+ // Fill the factory with the buffers from the mempool allocated
+ // above.
+ //
+ init_factory();
+ }
+
+ //
+ // Decide whether the mbuf chain `head` exceeds the i40e limit of
+ // i40e_max_xmit_segment_frags fragments per transmitted segment and must
+ // therefore be linearized before sending.
+ //
+ // Returns true if the chain has to be linearized, false if it can be sent
+ // as is.
+ //
+ bool DPDKQueuePair::tx_buf::i40e_should_linearize(rte_mbuf *head)
+ {
+ bool is_tso = head->ol_flags & PKT_TX_TCP_SEG;
+
+ // For a non-TSO case: number of fragments should not exceed 8
+ if (!is_tso){
+ return head->nb_segs > i40e_max_xmit_segment_frags;
+ }
+
+ //
+ // For a TSO case each MSS window should not include more than 8
+ // fragments including headers.
+ //
+
+ // Calculate the number of frags containing headers.
+ //
+ // Note: we support neither VLAN nor tunneling thus headers size
+ // accounting is super simple.
+ //
+ size_t headers_size = head->l2_len + head->l3_len + head->l4_len;
+ unsigned hdr_frags = 0;
+ size_t cur_payload_len = 0;
+ rte_mbuf *cur_seg = head;
+
+ // Walk segments until the accumulated length covers all headers.
+ while (cur_seg && cur_payload_len < headers_size) {
+ cur_payload_len += cur_seg->data_len;
+ cur_seg = cur_seg->next;
+ hdr_frags++;
+ }
+
+ //
+ // Header fragments will be used for each TSO segment, thus the
+ // maximum number of data segments will be 8 minus the number of
+ // header fragments.
+ //
+ // It's unclear from the spec how the first TSO segment is treated
+ // if the last fragment with headers contains some data bytes:
+ // whether this fragment will be accounted as a single fragment or
+ // as two separate fragments. We prefer to play it safe and assume
+ // that this fragment will be accounted as two separate fragments.
+ //
+ // NOTE(review): if the headers span more than 8 fragments this unsigned
+ // subtraction wraps around - presumably headers always fit in fewer
+ // fragments; confirm.
+ size_t max_win_size = i40e_max_xmit_segment_frags - hdr_frags;
+
+ if (head->nb_segs <= max_win_size) {
+ return false;
+ }
+
+ // Get the data (without headers) part of the first data fragment
+ size_t prev_frag_data = cur_payload_len - headers_size;
+ auto mss = head->tso_segsz;
+
+ // Slide over the payload one MSS-sized window at a time, counting how many
+ // fragments each window touches.
+ while (cur_seg) {
+ unsigned frags_in_seg = 0;
+ size_t cur_seg_size = 0;
+
+ // Data left over from the fragment that ended the previous window counts
+ // as the first fragment of this one.
+ if (prev_frag_data) {
+ cur_seg_size = prev_frag_data;
+ frags_in_seg++;
+ prev_frag_data = 0;
+ }
+
+ while (cur_seg_size < mss && cur_seg) {
+ cur_seg_size += cur_seg->data_len;
+ cur_seg = cur_seg->next;
+ frags_in_seg++;
+
+ if (frags_in_seg > max_win_size) {
+ return true;
+ }
+ }
+
+ // Carry the bytes beyond this window over to the next one.
+ if (cur_seg_size > mss) {
+ prev_frag_data = cur_seg_size - mss;
+ }
+ }
+
+ return false;
+ }
+
+void DPDKQueuePair::tx_buf::set_cluster_offload_info(const Packet& p, const DPDKQueuePair& qp, rte_mbuf* head)
+{
+  // Offload requests attached to the packet.
+  auto offload = p.offload_info();
+
+  // IP header checksum offload.
+  if (offload.needs_ip_csum) {
+    head->ol_flags |= PKT_TX_IP_CKSUM;
+    // TODO: Take a VLAN header into an account here
+    head->l2_len = sizeof(struct ether_hdr);
+    head->l3_len = offload.ip_hdr_len;
+  }
+
+  // The L4 offloads below are only programmed when the port reports
+  // tx_csum_l4_offload and the packet is a TCP one.
+  if (!qp.port().get_hw_features().tx_csum_l4_offload) {
+    return;
+  }
+  if (!(offload.protocol == ip_protocol_num::tcp)) {
+    return;
+  }
+
+  // Handle TCP checksum offload
+  head->ol_flags |= PKT_TX_TCP_CKSUM;
+  // TODO: Take a VLAN header into an account here
+  head->l2_len = sizeof(struct ether_hdr);
+  head->l3_len = offload.ip_hdr_len;
+
+  // TSO is requested through a non-zero segment size.
+  if (!offload.tso_seg_size) {
+    return;
+  }
+
+  ceph_assert(offload.needs_ip_csum);
+  head->ol_flags |= PKT_TX_TCP_SEG;
+  head->l4_len = offload.tcp_hdr_len;
+  head->tso_segsz = offload.tso_seg_size;
+}
+
+// Builds the tx_buf cluster for packet `p` on queue pair `qp`: one chain of
+// mbufs per packet fragment, linked together, with the packet length, the
+// segment count and the offload info stored in the head mbuf. Returns the
+// head tx_buf, or nullptr when a buffer could not be obtained (everything
+// allocated so far is recycled first). On success the Packet itself is moved
+// into the tx_buf of the last segment.
+DPDKQueuePair::tx_buf* DPDKQueuePair::tx_buf::from_packet_zc(
+ CephContext *cct, Packet&& p, DPDKQueuePair& qp)
+{
+ // Too fragmented - linearize
+ if (p.nr_frags() > max_frags) {
+ p.linearize();
+ qp.perf_logger->inc(l_dpdk_qp_tx_linearize_ops);
+ }
+
+ // Re-entry point: the code at the bottom jumps back here after it had to
+ // recycle the cluster and linearize the packet.
+ build_mbuf_cluster:
+ rte_mbuf *head = nullptr, *last_seg = nullptr;
+ unsigned nsegs = 0;
+
+ //
+ // Create a HEAD of the fragmented packet: check if frag0 has to be
+ // copied and if yes - send it in a copy way
+ //
+ if (!check_frag0(p)) {
+ if (!copy_one_frag(qp, p.frag(0), head, last_seg, nsegs)) {
+ ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(0).size << dendl;
+ return nullptr;
+ }
+ } else if (!translate_one_frag(qp, p.frag(0), head, last_seg, nsegs)) {
+ ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(0).size << dendl;
+ return nullptr;
+ }
+
+ unsigned total_nsegs = nsegs;
+
+ for (unsigned i = 1; i < p.nr_frags(); i++) {
+ rte_mbuf *h = nullptr, *new_last_seg = nullptr;
+ // nsegs is overwritten with the segment count of this fragment only.
+ if (!translate_one_frag(qp, p.frag(i), h, new_last_seg, nsegs)) {
+ ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(i).size << dendl;
+ // Give back the part of the cluster that has been linked so far.
+ me(head)->recycle();
+ return nullptr;
+ }
+
+ total_nsegs += nsegs;
+
+ // Attach a new buffers' chain to the packet chain
+ last_seg->next = h;
+ last_seg = new_last_seg;
+ }
+
+ // Update the HEAD buffer with the packet info
+ head->pkt_len = p.len();
+ head->nb_segs = total_nsegs;
+
+ set_cluster_offload_info(p, qp, head);
+
+ //
+ // If a packet hasn't been linearized already and the resulting
+ // cluster requires the linearisation due to HW limitation:
+ //
+ // - Recycle the cluster.
+ // - Linearize the packet.
+ // - Build the cluster once again
+ //
+ // NOTE(review): the retry terminates only if the rebuilt cluster no longer
+ // satisfies any of these conditions; presumably linearize() leaves a
+ // single fragment - confirm against Packet.
+ if (head->nb_segs > max_frags ||
+ (p.nr_frags() > 1 && qp.port().is_i40e_device() && i40e_should_linearize(head)) ||
+ (p.nr_frags() > vmxnet3_max_xmit_segment_frags && qp.port().is_vmxnet3_device())) {
+ me(head)->recycle();
+ p.linearize();
+ qp.perf_logger->inc(l_dpdk_qp_tx_linearize_ops);
+
+ goto build_mbuf_cluster;
+ }
+
+ // Park the packet in the last segment's tx_buf; reset_zc() destroys it
+ // when that buffer is reset.
+ me(last_seg)->set_packet(std::move(p));
+
+ return me(head);
+}
+
+void DPDKQueuePair::tx_buf::copy_packet_to_cluster(const Packet& p, rte_mbuf* head)
+{
+  // Write cursor: destination mbuf and byte offset inside its data area.
+  rte_mbuf* dst_seg = head;
+  size_t dst_off = 0;
+  // Read cursor: source fragment index and byte offset inside that fragment.
+  unsigned frag_idx = 0;
+  size_t frag_off = 0;
+
+  for (;;) {
+    // Copy as much as fits into both the rest of the current fragment and
+    // the rest of the current mbuf.
+    size_t chunk = std::min(p.frag(frag_idx).size - frag_off,
+                            inline_mbuf_data_size - dst_off);
+
+    memcpy(rte_pktmbuf_mtod_offset(dst_seg, void*, dst_off),
+           p.frag(frag_idx).base + frag_off, chunk);
+
+    frag_off += chunk;
+    dst_off += chunk;
+
+    // Current fragment fully consumed: move on to the next one, or finish.
+    if (frag_off >= p.frag(frag_idx).size) {
+      ++frag_idx;
+      if (frag_idx >= p.nr_frags()) {
+        //
+        // We are done - set the data size of the last segment
+        // of the cluster.
+        //
+        dst_seg->data_len = dst_off;
+        return;
+      }
+
+      frag_off = 0;
+    }
+
+    // Current mbuf is full: close it and continue with the next one.
+    if (dst_off >= inline_mbuf_data_size) {
+      dst_seg->data_len = inline_mbuf_data_size;
+      dst_seg = dst_seg->next;
+      dst_off = 0;
+
+      // FIXME: assert in a fast-path - remove!!!
+      ceph_assert(dst_seg);
+    }
+  }
+}
+
+DPDKQueuePair::tx_buf* DPDKQueuePair::tx_buf::from_packet_copy(Packet&& p, DPDKQueuePair& qp)
+{
+  // sanity: an empty packet is not turned into a cluster
+  if (!p.len()) {
+    return nullptr;
+  }
+
+  /*
+   * The cluster is allocated first and the data is copied only once the
+   * whole allocation has succeeded. The segment count is the packet length
+   * rounded up to a whole number of inline data areas (this uses the fact
+   * that the inline data size is a power of two).
+   */
+  auto padded_len = align_up((size_t)p.len(), inline_mbuf_data_size);
+  unsigned nsegs = padded_len / inline_mbuf_data_size;
+
+  tx_buf* first = qp.get_tx_buf();
+  if (!first) {
+    return nullptr;
+  }
+
+  rte_mbuf* head = first->rte_mbuf_p();
+  rte_mbuf* tail = head;
+  for (unsigned seg = 1; seg < nsegs; ++seg) {
+    tx_buf* extra = qp.get_tx_buf();
+    if (!extra) {
+      // Out of buffers: give back what has been chained so far.
+      me(head)->recycle();
+      return nullptr;
+    }
+
+    tail->next = extra->rte_mbuf_p();
+    tail = tail->next;
+  }
+
+  //
+  // Allocation succeeded: fill in the head buffer, copy the payload and
+  // set the offload info.
+  //
+  head->pkt_len = p.len();
+  head->nb_segs = nsegs;
+
+  copy_packet_to_cluster(p, head);
+  set_cluster_offload_info(p, qp, head);
+
+  return me(head);
+}
+
+size_t DPDKQueuePair::tx_buf::copy_one_data_buf(
+  DPDKQueuePair& qp, rte_mbuf*& m, char* data, size_t buf_len)
+{
+  tx_buf* fresh = qp.get_tx_buf();
+  if (fresh == nullptr) {
+    // No buffer available: nothing copied.
+    return 0;
+  }
+
+  // A single mbuf takes at most inline_mbuf_data_size bytes.
+  const size_t copied = std::min(buf_len, inline_mbuf_data_size);
+
+  m = fresh->rte_mbuf_p();
+  memcpy(rte_pktmbuf_mtod(m, void*), data, copied);
+
+  // mbuf_put()
+  m->data_len = copied;
+  m->pkt_len = copied;
+
+  qp.perf_logger->inc(l_dpdk_qp_tx_copy_ops);
+  qp.perf_logger->inc(l_dpdk_qp_tx_copy_bytes, copied);
+
+  return copied;
+}
+
+void DPDKDevice::set_rss_table()
+{
+  // always fill our local indirection table: entries are assigned to the
+  // queues round-robin.
+  unsigned i = 0;
+  for (auto& r : _redir_table) {
+    r = i++ % _num_queues;
+  }
+
+  // The device reports no HW redirection table: nothing more to program.
+  if (_dev_info.reta_size == 0)
+    return;
+
+  // One rte_eth_rss_reta_entry64 per RTE_RETA_GROUP_SIZE table entries, and
+  // never fewer than one. A std::vector is used instead of a variable-length
+  // array, which is not standard C++.
+  const int reta_conf_size = std::max(1, _dev_info.reta_size / RTE_RETA_GROUP_SIZE);
+  std::vector<rte_eth_rss_reta_entry64> reta_conf(reta_conf_size);
+
+  // Configure the HW indirection table
+  i = 0;
+  for (auto& x : reta_conf) {
+    // Mark every entry of the group as valid for the update.
+    x.mask = ~0ULL;
+    for (auto& r: x.reta) {
+      r = i++ % _num_queues;
+    }
+  }
+
+  if (rte_eth_dev_rss_reta_update(_port_idx, reta_conf.data(), _dev_info.reta_size)) {
+    rte_exit(EXIT_FAILURE, "Port %d: Failed to update an RSS indirection table", _port_idx);
+  }
+}
+
+/******************************** Interface functions *************************/
+
+std::unique_ptr<DPDKDevice> create_dpdk_net_device(
+  CephContext *cct,
+  unsigned cores,
+  uint8_t port_idx,
+  bool use_lro,
+  bool enable_fc)
+{
+  // Check that we have at least one DPDK-able port
+  if (rte_eth_dev_count() != 0) {
+    ldout(cct, 10) << __func__ << " ports number: " << int(rte_eth_dev_count()) << dendl;
+  } else {
+    rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n");
+  }
+
+  // `cores` is handed to the device as its number of queues.
+  return std::make_unique<DPDKDevice>(cct, port_idx, cores, use_lro, enable_fc);
+}
diff --git a/src/msg/async/dpdk/DPDK.h b/src/msg/async/dpdk/DPDK.h
new file mode 100644
index 00000000..fa12af6b
--- /dev/null
+++ b/src/msg/async/dpdk/DPDK.h
@@ -0,0 +1,918 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_DPDK_DEV_H
+#define CEPH_DPDK_DEV_H
+
+#include <memory>
+#include <functional>
+#include <rte_config.h>
+#include <rte_common.h>
+#include <rte_ethdev.h>
+#include <rte_malloc.h>
+#include <rte_version.h>
+
+#include "include/page.h"
+#include "common/Tub.h"
+#include "common/perf_counters.h"
+#include "msg/async/Event.h"
+#include "const.h"
+#include "circular_buffer.h"
+#include "ethernet.h"
+#include "Packet.h"
+#include "stream.h"
+#include "net.h"
+#include "toeplitz.h"
+
+
+struct free_deleter {
+  // Releases `ptr` with ::free(); usable as a smart-pointer deleter.
+  void operator()(void* ptr) {
+    ::free(ptr);
+  }
+};
+
+
+// Perf counter indices of the per-port counters. l_dpdk_dev_first and
+// l_dpdk_dev_last bound the range handed to PerfCountersBuilder in the
+// DPDKDevice constructor; the entries in between are the counters it adds.
+enum {
+ l_dpdk_dev_first = 58800,
+ l_dpdk_dev_rx_mcast,
+ l_dpdk_dev_rx_total_errors,
+ l_dpdk_dev_tx_total_errors,
+ l_dpdk_dev_rx_badcrc_errors,
+ l_dpdk_dev_rx_dropped_errors,
+ l_dpdk_dev_rx_nombuf_errors,
+ l_dpdk_dev_last
+};
+
+// Perf counter indices updated through DPDKQueuePair::perf_logger (for
+// example l_dpdk_qp_tx_copy_ops/l_dpdk_qp_tx_copy_bytes in
+// tx_buf::copy_one_data_buf() and l_dpdk_qp_tx_fragments/l_dpdk_qp_tx_bytes
+// in DPDKQueuePair::_send()).
+// NOTE(review): l_dpdk_qp_first/l_dpdk_qp_last presumably bound the builder
+// range like the device counters do - the builder call is not in this header.
+enum {
+ l_dpdk_qp_first = 58900,
+ l_dpdk_qp_rx_packets,
+ l_dpdk_qp_tx_packets,
+ l_dpdk_qp_rx_bad_checksum_errors,
+ l_dpdk_qp_rx_no_memory_errors,
+ l_dpdk_qp_rx_bytes,
+ l_dpdk_qp_tx_bytes,
+ l_dpdk_qp_rx_last_bunch,
+ l_dpdk_qp_tx_last_bunch,
+ l_dpdk_qp_rx_fragments,
+ l_dpdk_qp_tx_fragments,
+ l_dpdk_qp_rx_copy_ops,
+ l_dpdk_qp_tx_copy_ops,
+ l_dpdk_qp_rx_copy_bytes,
+ l_dpdk_qp_tx_copy_bytes,
+ l_dpdk_qp_rx_linearize_ops,
+ l_dpdk_qp_tx_linearize_ops,
+ l_dpdk_qp_tx_queue_length,
+ l_dpdk_qp_last
+};
+
+class DPDKDevice;
+class DPDKWorker;
+
+class DPDKQueuePair {
+ using packet_provider_type = std::function<Tub<Packet> ()>;
+ public:
+ void configure_proxies(const std::map<unsigned, float>& cpu_weights);
+ // build REdirection TAble for cpu_weights map: target cpu -> weight
+ void build_sw_reta(const std::map<unsigned, float>& cpu_weights);
+ // Appends the packet to this queue pair's proxy packet queue
+ // (_proxy_packetq); nothing is transmitted here.
+ void proxy_send(Packet p) {
+ _proxy_packetq.push_back(std::move(p));
+ }
+ // Stores a callable returning Tub<Packet> in the list of packet providers
+ // (_pkt_providers).
+ void register_packet_provider(packet_provider_type func) {
+ _pkt_providers.push_back(std::move(func));
+ }
+ bool poll_tx();
+ friend class DPDKDevice;
+
+ class tx_buf_factory;
+
+ class tx_buf {
+ friend class DPDKQueuePair;
+ public:
+ // Reinterprets an rte_mbuf pointer as a pointer to the tx_buf wrapping it.
+ // This relies on _mbuf being the first data member of tx_buf and on the
+ // tx_buf having been constructed in place over that mbuf (see
+ // tx_buf_factory::init_factory()).
+ static tx_buf* me(rte_mbuf* mbuf) {
+ return reinterpret_cast<tx_buf*>(mbuf);
+ }
+
+ private:
+ /**
+ * Checks if the original packet of a given cluster should be linearized
+ * due to HW limitations.
+ *
+ * @param head head of a cluster to check
+ *
+ * @return TRUE if a packet should be linearized.
+ */
+ static bool i40e_should_linearize(rte_mbuf *head);
+
+ /**
+ * Sets the offload info in the head buffer of an rte_mbufs cluster.
+ *
+ * @param p an original packet the cluster is built for
+ * @param qp QP handle
+ * @param head a head of an rte_mbufs cluster
+ */
+ static void set_cluster_offload_info(const Packet& p, const DPDKQueuePair& qp, rte_mbuf* head);
+
+ /**
+ * Creates a tx_buf cluster representing a given packet in a "zero-copy"
+ * way.
+ *
+ * @param p packet to translate
+ * @param qp DPDKQueuePair handle
+ *
+ * @return the HEAD tx_buf of the cluster or nullptr in case of a
+ * failure
+ */
+ static tx_buf* from_packet_zc(
+ CephContext *cct, Packet&& p, DPDKQueuePair& qp);
+
+ /**
+ * Copy the contents of the "packet" into the given cluster of
+ * rte_mbuf's.
+ *
+ * @note Size of the cluster has to be big enough to accommodate all the
+ * contents of the given packet.
+ *
+ * @param p packet to copy
+ * @param head head of the rte_mbuf's cluster
+ */
+ static void copy_packet_to_cluster(const Packet& p, rte_mbuf* head);
+
+ /**
+ * Creates a tx_buf cluster representing a given packet in a "copy" way.
+ *
+ * @param p packet to translate
+ * @param qp DPDKQueuePair handle
+ *
+ * @return the HEAD tx_buf of the cluster or nullptr in case of a
+ * failure
+ */
+ static tx_buf* from_packet_copy(Packet&& p, DPDKQueuePair& qp);
+
+ /**
+ * Zero-copy handling of a single fragment.
+ *
+ * @param do_one_buf Functor responsible for a single rte_mbuf
+ * handling
+ * @param qp DPDKQueuePair handle (in)
+ * @param frag Fragment to copy (in)
+ * @param head Head of the cluster (out)
+ * @param last_seg Last segment of the cluster (out)
+ * @param nsegs Number of segments in the cluster (out)
+ *
+ * @return TRUE in case of success
+ */
+ template <class DoOneBufFunc>
+ static bool do_one_frag(DoOneBufFunc do_one_buf, DPDKQueuePair& qp,
+                         fragment& frag, rte_mbuf*& head,
+                         rte_mbuf*& last_seg, unsigned& nsegs) {
+   // TODO: ceph_assert() in a fast path! Remove me ASAP!
+   ceph_assert(frag.size);
+
+   // Position inside the fragment and the number of bytes still to place.
+   char* cursor = frag.base;
+   size_t remaining = frag.size;
+
+   // The first mbuf becomes the HEAD of the cluster; do_one_buf reports how
+   // many bytes it took, 0 meaning that no mbuf could be obtained.
+   size_t consumed = do_one_buf(qp, head, cursor, remaining);
+   if (!consumed) {
+     return false;
+   }
+
+   cursor += consumed;
+   remaining -= consumed;
+   nsegs = 1;
+
+   //
+   // Place the rest of the data into further mbufs and chain each of them
+   // behind the current tail of the cluster.
+   //
+   rte_mbuf* tail = head;
+   while (remaining) {
+     rte_mbuf* seg;
+     consumed = do_one_buf(qp, seg, cursor, remaining);
+     if (!consumed) {
+       // Give back everything chained so far.
+       me(head)->recycle();
+       return false;
+     }
+
+     cursor += consumed;
+     remaining -= consumed;
+     ++nsegs;
+
+     tail->next = seg;
+     tail = seg;
+   }
+
+   // Return the last mbuf in the cluster
+   last_seg = tail;
+
+   return true;
+ }
+
+ /**
+ * Zero-copy handling of a single fragment.
+ *
+ * @param qp DPDKQueuePair handle (in)
+ * @param frag Fragment to copy (in)
+ * @param head Head of the cluster (out)
+ * @param last_seg Last segment of the cluster (out)
+ * @param nsegs Number of segments in the cluster (out)
+ *
+ * @return TRUE in case of success
+ */
+ static bool translate_one_frag(DPDKQueuePair& qp, fragment& frag,
+                                rte_mbuf*& head, rte_mbuf*& last_seg,
+                                unsigned& nsegs) {
+   // Build the fragment's chain with set_one_data_buf() as the per-mbuf step.
+   const bool built = do_one_frag(set_one_data_buf, qp, frag, head,
+                                  last_seg, nsegs);
+   return built;
+ }
+
+ /**
+ * Copies one fragment into the cluster of rte_mbuf's.
+ *
+ * @param qp DPDKQueuePair handle (in)
+ * @param frag Fragment to copy (in)
+ * @param head Head of the cluster (out)
+ * @param last_seg Last segment of the cluster (out)
+ * @param nsegs Number of segments in the cluster (out)
+ *
+ * We return the "last_seg" to avoid traversing the cluster in order to get
+ * it.
+ *
+ * @return TRUE in case of success
+ */
+ static bool copy_one_frag(DPDKQueuePair& qp, fragment& frag,
+                           rte_mbuf*& head, rte_mbuf*& last_seg,
+                           unsigned& nsegs) {
+   // Build the fragment's chain with copy_one_data_buf() as the per-mbuf step.
+   const bool built = do_one_frag(copy_one_data_buf, qp, frag, head,
+                                  last_seg, nsegs);
+   return built;
+ }
+
+ /**
+ * Allocates a single rte_mbuf and sets it to point to a given data
+ * buffer.
+ *
+ * @param qp DPDKQueuePair handle (in)
+ * @param m New allocated rte_mbuf (out)
+ * @param va virtual address of a data buffer (in)
+ * @param buf_len length of the data to copy (in)
+ *
+ * @return The actual number of bytes that has been set in the mbuf
+ */
+ // NOTE: as written this function always takes the copy path - it returns
+ // the result of copy_one_data_buf() right away, so the zero-copy code
+ // below that return is currently unreachable.
+ static size_t set_one_data_buf(
+ DPDKQueuePair& qp, rte_mbuf*& m, char* va, size_t buf_len) {
+ static constexpr size_t max_frag_len = 15 * 1024; // 15K
+
+ // FIXME: current all tx buf is allocated without rte_malloc
+ return copy_one_data_buf(qp, m, va, buf_len);
+ // ---- unreachable from here on (kept for the zero-copy flow) ----
+ //
+ // Currently we break a buffer on a 15K boundary because 82599
+ // devices have a 15.5K limitation on a maximum single fragment
+ // size.
+ //
+ rte_iova_t pa = rte_malloc_virt2iova(va);
+ // No IO address for this buffer: fall back to copying.
+ if (!pa)
+ return copy_one_data_buf(qp, m, va, buf_len);
+
+ ceph_assert(buf_len);
+ tx_buf* buf = qp.get_tx_buf();
+ if (!buf) {
+ return 0;
+ }
+
+ size_t len = std::min(buf_len, max_frag_len);
+
+ // Point the mbuf at the caller's data instead of copying it.
+ buf->set_zc_info(va, pa, len);
+ m = buf->rte_mbuf_p();
+
+ return len;
+ }
+
+ /**
+ * Allocates a single rte_mbuf and copies a given data into it.
+ *
+ * @param qp DPDKQueuePair handle (in)
+ * @param m New allocated rte_mbuf (out)
+ * @param data Data to copy from (in)
+ * @param buf_len length of the data to copy (in)
+ *
+ * @return The actual number of bytes that has been copied
+ */
+ static size_t copy_one_data_buf(
+ DPDKQueuePair& qp, rte_mbuf*& m, char* data, size_t buf_len);
+
+ /**
+ * Checks if the first fragment of the given packet satisfies the
+ * zero-copy flow requirement: its first 128 bytes should not cross the
+ * 4K page boundary. This is required in order to avoid splitting packet
+ * headers.
+ *
+ * @note The current implementation only verifies that the first fragment
+ * is at least 128 bytes long; no page-boundary check is performed.
+ *
+ * @param p packet to check
+ *
+ * @return TRUE if packet is ok and FALSE otherwise.
+ */
+ static bool check_frag0(Packet& p)
+ {
+   //
+   // First frag is special - it has headers that should not be split.
+   // A first fragment shorter than 128 bytes makes the caller send it in
+   // a (non-zero) copy flow; anything else is accepted. Only the size is
+   // examined here.
+   //
+   return !(p.frag(0).size < 128);
+ }
+
+ public:
+ // Remembers the owning factory and snapshots the mbuf's buf_physaddr and
+ // data_off so that reset_zc() can restore them after set_zc_info() has
+ // overwritten them. _mbuf is not initialized here: the object is
+ // constructed in place over an already allocated mbuf (see
+ // tx_buf_factory::init_factory()).
+ tx_buf(tx_buf_factory& fc) : _fc(fc) {
+
+ _buf_physaddr = _mbuf.buf_physaddr;
+ _data_off = _mbuf.data_off;
+ }
+
+ // Returns the address of the embedded rte_mbuf.
+ rte_mbuf* rte_mbuf_p() { return &_mbuf; }
+
+ void set_zc_info(void* va, phys_addr_t pa, size_t len) {
+   // Set the mbuf to point to our data
+   _mbuf.buf_addr = va;
+   _mbuf.buf_physaddr = pa;
+   _mbuf.data_off = 0;
+
+   // mbuf_put()
+   _mbuf.data_len = len;
+   _mbuf.pkt_len = len;
+
+   // Remember that reset_zc() has fields to restore.
+   _is_zc = true;
+ }
+
+ void reset_zc() {
+   if (!_p) {
+     // No packet attached and the mbuf was not used in the zero-copy
+     // flow: there is nothing to undo.
+     if (!_is_zc) {
+       return;
+     }
+   } else {
+     //
+     // This mbuf was the last in a cluster and holds the original
+     // packet object: destroy() runs the packet's destructor and leaves
+     // the Tub disengaged.
+     //
+     _p.destroy();
+   }
+
+   // Restore the rte_mbuf fields we trashed in set_zc_info()
+   _mbuf.buf_physaddr = _buf_physaddr;
+   _mbuf.buf_addr = rte_mbuf_to_baddr(&_mbuf);
+   _mbuf.data_off = _data_off;
+
+   _is_zc = false;
+ }
+
+ void recycle() {
+   // Walk the chain starting at this buffer; every mbuf is reset and its
+   // tx_buf handed back to the factory. The successor is read before the
+   // reset.
+   for (struct rte_mbuf *seg = &_mbuf, *following = nullptr;
+        seg != nullptr; seg = following) {
+     following = seg->next;
+     rte_pktmbuf_reset(seg);
+     _fc.put(me(seg));
+   }
+ }
+
+ // Moves the packet into this buffer's Tub (_p); reset_zc() destroys it.
+ void set_packet(Packet&& p) {
+ _p = std::move(p);
+ }
+
+ private:
+ struct rte_mbuf _mbuf;
+ MARKER private_start;
+ Tub<Packet> _p;
+ phys_addr_t _buf_physaddr;
+ uint16_t _data_off;
+ // TRUE if underlying mbuf has been used in the zero-copy flow
+ bool _is_zc = false;
+ // buffers' factory the buffer came from
+ tx_buf_factory& _fc;
+ MARKER private_end;
+ };
+
+ class tx_buf_factory {
+ //
+ // Number of buffers to free in each GC iteration:
+ // We want the buffers to be allocated from the mempool as many as
+ // possible.
+ //
+ // On the other hand if there is no Tx for some time we want the
+ // completions to be eventually handled. Thus we choose the smallest
+ // possible packets count number here.
+ //
+ static constexpr int gc_count = 1;
+ public:
+ tx_buf_factory(CephContext *c, DPDKDevice *dev, uint8_t qid);
+ ~tx_buf_factory() {
+   // put all mbuf back into mempool in order to make the next factory work:
+   // first drain the completed buffers into the cache ...
+   while (gc()) {
+   }
+   // ... then return the whole cache to the mempool in one call.
+   rte_mempool_put_bulk(_pool, reinterpret_cast<void**>(_ring.data()),
+                        _ring.size());
+ }
+
+
+ /**
+ * @note Should not be called if there are no free tx_buf's
+ *
+ * @return a free tx_buf object
+ */
+ tx_buf* get() {
+   // Take completed from the HW first
+   if (tx_buf* completed = get_one_completed()) {
+     completed->reset_zc();
+     return completed;
+   }
+
+   //
+   // Nothing completed at the moment - fall back to the factory's cache,
+   // which may be empty as well.
+   //
+   if (_ring.empty()) {
+     return nullptr;
+   }
+
+   tx_buf* cached = _ring.back();
+   _ring.pop_back();
+   return cached;
+ }
+
+ // Resets the buffer's zero-copy state and stores it in the cache (_ring).
+ void put(tx_buf* buf) {
+ buf->reset_zc();
+ _ring.push_back(buf);
+ }
+
+ bool gc() {
+   // Move up to gc_count completed buffers into the cache. Returns false as
+   // soon as no completed buffer is available, true when the full quota was
+   // collected.
+   int reclaimed = 0;
+   while (reclaimed < gc_count) {
+     tx_buf* done = get_one_completed();
+     if (done == nullptr) {
+       return false;
+     }
+
+     put(done);
+     ++reclaimed;
+   }
+
+   return true;
+ }
+ private:
+ /**
+ * Fill the mbufs circular buffer: after this the _pool will become
+ * empty. We will use it to catch the completed buffers:
+ *
+ * - Underlying PMD drivers will "free" the mbufs once they are
+ * completed.
+ * - We will poll the _pktmbuf_pool_tx till it's empty and release
+ * all the buffers from the freed mbufs.
+ */
+ void init_factory() {
+   // Allocate until the pool runs dry; a tx_buf is constructed in place over
+   // every mbuf obtained and stored in the cache.
+   for (rte_mbuf* mbuf = rte_pktmbuf_alloc(_pool); mbuf != nullptr;
+        mbuf = rte_pktmbuf_alloc(_pool)) {
+     _ring.push_back(new(tx_buf::me(mbuf)) tx_buf{*this});
+   }
+ }
+
+ /**
+ * PMD puts the completed buffers back into the mempool they have
+ * originally come from.
+ *
+ * @note rte_pktmbuf_alloc() resets the mbuf so there is no need to call
+ * rte_pktmbuf_reset() here again.
+ *
+ * @return a single tx_buf that has been completed by HW.
+ */
+ tx_buf* get_one_completed() {
+   // Whatever can be allocated from the pool is a buffer that has been put
+   // back into it; nullptr when the pool is empty.
+   rte_mbuf* completed = rte_pktmbuf_alloc(_pool);
+   return tx_buf::me(completed);
+ }
+
+ private:
+ CephContext *cct;
+ std::vector<tx_buf*> _ring;
+ rte_mempool* _pool = nullptr;
+ };
+
+ public:
+ explicit DPDKQueuePair(CephContext *c, EventCenter *cen, DPDKDevice* dev, uint8_t qid);
+ ~DPDKQueuePair() {
+   // A non-zero id means a time event is registered with the event center.
+   if (device_stat_time_fd != 0) {
+     center->delete_time_event(device_stat_time_fd);
+   }
+   // Forced rx garbage collection.
+   rx_gc(true);
+ }
+
+ // Constructs the rx poller (a DPDKRXPoller bound to this queue pair)
+ // inside its Tub.
+ void rx_start() {
+ _rx_poller.construct(this);
+ }
+
+ uint32_t send(circular_buffer<Packet>& pb) {
+   // Zero-copy send: every packet is turned into a tx_buf cluster by
+   // tx_buf::from_packet_zc().
+   auto to_tx_buf = [this] (Packet&& pkt) {
+     return tx_buf::from_packet_zc(cct, std::move(pkt), *this);
+   };
+   return _send(pb, to_tx_buf);
+ }
+
+ // The device this queue pair belongs to.
+ DPDKDevice& port() const { return *_dev; }
+ // Fetches a buffer from the tx buffer factory; may be nullptr (see
+ // tx_buf_factory::get()).
+ tx_buf* get_tx_buf() { return _tx_buf_factory.get(); }
+
+ void handle_stats();
+
+ private:
+ // Transmits packets from `pb`.
+ //
+ // When no burst is pending, every packet of `pb` is converted with
+ // packet_to_tx_buf_p (stopping at the first conversion that yields nullptr)
+ // and the head mbufs are collected in _tx_burst. The part of the burst that
+ // has not been sent yet is then handed to rte_eth_tx_burst(). One entry is
+ // popped from `pb` per mbuf the device accepted, and the fragment/byte
+ // counters are updated.
+ //
+ // @return number of packets accepted by rte_eth_tx_burst() in this call
+ template <class Func>
+ uint32_t _send(circular_buffer<Packet>& pb, Func &&packet_to_tx_buf_p) {
+ if (_tx_burst.size() == 0) {
+ for (auto&& p : pb) {
+ // TODO: ceph_assert() in a fast path! Remove me ASAP!
+ ceph_assert(p.len());
+
+ tx_buf* buf = packet_to_tx_buf_p(std::move(p));
+ if (!buf) {
+ break;
+ }
+
+ _tx_burst.push_back(buf->rte_mbuf_p());
+ }
+ }
+
+ // Send whatever is left of the burst, starting at _tx_burst_idx.
+ uint16_t sent = rte_eth_tx_burst(_dev_port_idx, _qid,
+ _tx_burst.data() + _tx_burst_idx,
+ _tx_burst.size() - _tx_burst_idx);
+
+ uint64_t nr_frags = 0, bytes = 0;
+
+ for (int i = 0; i < sent; i++) {
+ rte_mbuf* m = _tx_burst[_tx_burst_idx + i];
+ bytes += m->pkt_len;
+ nr_frags += m->nb_segs;
+ pb.pop_front();
+ }
+
+ perf_logger->inc(l_dpdk_qp_tx_fragments, nr_frags);
+ perf_logger->inc(l_dpdk_qp_tx_bytes, bytes);
+
+ _tx_burst_idx += sent;
+
+ // The whole burst has been accepted: start a fresh one on the next call.
+ if (_tx_burst_idx == _tx_burst.size()) {
+ _tx_burst_idx = 0;
+ _tx_burst.clear();
+ }
+
+ return sent;
+ }
+
+ /**
+ * Allocate a new data buffer and set the mbuf to point to it.
+ *
+ * Do some DPDK hacks to work on PMD: it assumes that the buf_addr
+ * points to the private data of RTE_PKTMBUF_HEADROOM before the actual
+ * data buffer.
+ *
+ * @param m mbuf to update
+ */
+ static bool refill_rx_mbuf(rte_mbuf* m, size_t size,
+                            std::vector<void*> &datas) {
+   // Note: `size` is not used by this implementation.
+   if (datas.empty()) {
+     return false;
+   }
+
+   // Take the most recently stored data buffer.
+   void *payload = datas.back();
+   datas.pop_back();
+
+   //
+   // Set the mbuf to point to our data.
+   //
+   // Do some DPDK hacks to work on PMD: it assumes that the buf_addr
+   // points to the private data of RTE_PKTMBUF_HEADROOM before the
+   // actual data buffer.
+   //
+   m->buf_addr = static_cast<char*>(payload) - RTE_PKTMBUF_HEADROOM;
+   m->buf_physaddr = rte_mem_virt2phy(payload) - RTE_PKTMBUF_HEADROOM;
+   return true;
+ }
+
+ bool init_rx_mbuf_pool();
+ bool rx_gc(bool force=false);
+ bool refill_one_cluster(rte_mbuf* head);
+
+ /**
+ * Polls for a burst of incoming packets. This function will not block and
+ * will immediately return after processing all available packets.
+ *
+ */
+ bool poll_rx_once();
+
+ /**
+ * Translates an rte_mbuf's into packet and feeds them to _rx_stream.
+ *
+ * @param bufs An array of received rte_mbuf's
+ * @param count Number of buffers in the bufs[]
+ */
+ void process_packets(struct rte_mbuf **bufs, uint16_t count);
+
+ /**
+ * Translate rte_mbuf into the "packet".
+ * @param m mbuf to translate
+ *
+ * @return a "optional" object representing the newly received data if in an
+ * "engaged" state or an error if in a "disengaged" state.
+ */
+ Tub<Packet> from_mbuf(rte_mbuf* m);
+
+ /**
+ * Transform an LRO rte_mbuf cluster into the "packet" object.
+ * @param m HEAD of the mbufs' cluster to transform
+ *
+ * @return a "optional" object representing the newly received LRO packet if
+ * in an "engaged" state or an error if in a "disengaged" state.
+ */
+ Tub<Packet> from_mbuf_lro(rte_mbuf* m);
+
+ private:
+ CephContext *cct;
+ std::vector<packet_provider_type> _pkt_providers;
+ Tub<std::array<uint8_t, 128>> _sw_reta;
+ circular_buffer<Packet> _proxy_packetq;
+ stream<Packet> _rx_stream;
+ circular_buffer<Packet> _tx_packetq;
+ std::vector<void*> _alloc_bufs;
+
+ PerfCounters *perf_logger;
+ DPDKDevice* _dev;
+ uint8_t _dev_port_idx;
+ EventCenter *center;
+ uint8_t _qid;
+ rte_mempool *_pktmbuf_pool_rx;
+ std::vector<rte_mbuf*> _rx_free_pkts;
+ std::vector<rte_mbuf*> _rx_free_bufs;
+ std::vector<fragment> _frags;
+ std::vector<char*> _bufs;
+ size_t _num_rx_free_segs = 0;
+ uint64_t device_stat_time_fd = 0;
+
+#ifdef CEPH_PERF_DEV
+ uint64_t rx_cycles = 0;
+ uint64_t rx_count = 0;
+ uint64_t tx_cycles = 0;
+ uint64_t tx_count = 0;
+#endif
+
+ class DPDKTXPoller : public EventCenter::Poller {
+   DPDKQueuePair *qp;
+
+  public:
+   // Hands the owner's EventCenter and the poller name to the
+   // EventCenter::Poller base.
+   explicit DPDKTXPoller(DPDKQueuePair *owner)
+     : EventCenter::Poller(owner->center, "DPDK::DPDKTXPoller"), qp(owner) {}
+
+   // Delegates to DPDKQueuePair::poll_tx().
+   virtual int poll() {
+     return qp->poll_tx();
+   }
+ } _tx_poller;
+
+ class DPDKRXGCPoller : public EventCenter::Poller {
+   DPDKQueuePair *qp;
+
+  public:
+   // Hands the owner's EventCenter and the poller name to the
+   // EventCenter::Poller base.
+   explicit DPDKRXGCPoller(DPDKQueuePair *owner)
+     : EventCenter::Poller(owner->center, "DPDK::DPDKRXGCPoller"), qp(owner) {}
+
+   // Delegates to DPDKQueuePair::rx_gc() with its default argument.
+   virtual int poll() {
+     return qp->rx_gc();
+   }
+ } _rx_gc_poller;
+ tx_buf_factory _tx_buf_factory;
+ class DPDKRXPoller : public EventCenter::Poller {
+   DPDKQueuePair *qp;
+
+  public:
+   // Hands the owner's EventCenter and the poller name to the
+   // EventCenter::Poller base.
+   explicit DPDKRXPoller(DPDKQueuePair *owner)
+     : EventCenter::Poller(owner->center, "DPDK::DPDKRXPoller"), qp(owner) {}
+
+   // Delegates to DPDKQueuePair::poll_rx_once().
+   virtual int poll() {
+     return qp->poll_rx_once();
+   }
+ };
+ Tub<DPDKRXPoller> _rx_poller;
+ class DPDKTXGCPoller : public EventCenter::Poller {
+   DPDKQueuePair *qp;
+
+  public:
+   // Hands the owner's EventCenter and the poller name to the
+   // EventCenter::Poller base.
+   explicit DPDKTXGCPoller(DPDKQueuePair *owner)
+     : EventCenter::Poller(owner->center, "DPDK::DPDKTXGCPoller"), qp(owner) {}
+
+   // Delegates to the tx buffer factory's gc().
+   virtual int poll() {
+     return qp->_tx_buf_factory.gc();
+   }
+ } _tx_gc_poller;
+ std::vector<rte_mbuf*> _tx_burst;
+ uint16_t _tx_burst_idx = 0;
+};
+
+class DPDKDevice {
+ public:
+ CephContext *cct;
+ PerfCounters *perf_logger;
+ std::vector<std::unique_ptr<DPDKQueuePair>> _queues;
+ std::vector<DPDKWorker*> workers;
+ size_t _rss_table_bits = 0;
+ uint8_t _port_idx;
+ uint16_t _num_queues;
+ unsigned cores;
+ hw_features _hw_features;
+ uint8_t _queues_ready = 0;
+ unsigned _home_cpu;
+ bool _use_lro;
+ bool _enable_fc;
+ std::vector<uint8_t> _redir_table;
+ rss_key_type _rss_key;
+ bool _is_i40e_device = false;
+ bool _is_vmxnet3_device = false;
+
+ public:
+ rte_eth_dev_info _dev_info = {};
+
+ /**
+ * The final stage of a port initialization.
+ * @note Must be called *after* all queues from stage (2) have been
+ * initialized.
+ */
+ int init_port_fini();
+
+ private:
+ /**
+ * Port initialization consists of 3 main stages:
+ * 1) General port initialization which ends with a call to
+ * rte_eth_dev_configure() where we request the needed number of Rx and
+ * Tx queues.
+ * 2) Individual queues initialization. This is done in the constructor of
+ * DPDKQueuePair class. In particular the memory pools for queues are allocated
+ * in this stage.
+ * 3) The final stage of the initialization which starts with the call of
+ * rte_eth_dev_start() after which the port becomes fully functional. We
+ * will also wait for a link to get up in this stage.
+ */
+
+
+ /**
+ * First stage of the port initialization.
+ *
+ * @return 0 in case of success and an appropriate error code in case of an
+ * error.
+ */
+ int init_port_start();
+
+ /**
+ * Check the link status of our port for up to 9s, and print it at the end.
+ */
+ int check_port_link_status();
+
+ /**
+ * Configures the HW Flow Control
+ */
+ void set_hw_flow_control();
+
+ public:
+ DPDKDevice(CephContext *c, uint8_t port_idx, uint16_t num_queues, bool use_lro, bool enable_fc):
+   cct(c), _port_idx(port_idx), _num_queues(num_queues),
+   _home_cpu(0), _use_lro(use_lro),
+   _enable_fc(enable_fc) {
+   // One initially empty slot per queue; set_local_queue() fills them in.
+   _queues.resize(_num_queues);
+
+   /* now initialise the port we will use */
+   if (init_port_start() != 0) {
+     rte_exit(EXIT_FAILURE, "Cannot initialise port %u\n", _port_idx);
+   }
+
+   // Per-port perf counters, registered under the name "port<port_idx>".
+   const std::string name = "port" + std::to_string(port_idx);
+   PerfCountersBuilder plb(cct, name, l_dpdk_dev_first, l_dpdk_dev_last);
+
+   plb.add_u64_counter(l_dpdk_dev_rx_mcast, "dpdk_device_receive_multicast_packets", "DPDK received multicast packets");
+   plb.add_u64_counter(l_dpdk_dev_rx_badcrc_errors, "dpdk_device_receive_badcrc_errors", "DPDK received bad crc errors");
+
+   plb.add_u64_counter(l_dpdk_dev_rx_total_errors, "dpdk_device_receive_total_errors", "DPDK received total_errors");
+   plb.add_u64_counter(l_dpdk_dev_tx_total_errors, "dpdk_device_send_total_errors", "DPDK sendd total_errors");
+   plb.add_u64_counter(l_dpdk_dev_rx_dropped_errors, "dpdk_device_receive_dropped_errors", "DPDK received dropped errors");
+   plb.add_u64_counter(l_dpdk_dev_rx_nombuf_errors, "dpdk_device_receive_nombuf_errors", "DPDK received RX mbuf allocation errors");
+
+   perf_logger = plb.create_perf_counters();
+   cct->get_perfcounters_collection()->add(perf_logger);
+ }
+
+ // Stops the port.
+ // NOTE(review): perf_logger, which the constructor adds to the
+ // CephContext's perf counters collection, is neither removed from it nor
+ // deleted here - confirm whether that cleanup happens elsewhere.
+ ~DPDKDevice() {
+ rte_eth_dev_stop(_port_idx);
+ }
+
+ // Queue pair stored in slot `cpu`; the slot is dereferenced without a
+ // check, so it must have been populated (see set_local_queue()).
+ DPDKQueuePair& queue_for_cpu(unsigned cpu) { return *_queues[cpu]; }
+ // Feeds a packet into the rx stream of queue `qid`.
+ void l2receive(int qid, Packet p) {
+ _queues[qid]->_rx_stream.produce(std::move(p));
+ }
+ // Subscribes `next_packet` to the rx stream of the queue pair in slot
+ // `cpuid` and starts that queue pair's rx poller.
+ //
+ // @param cpuid       index of the queue pair to listen on
+ // @param next_packet callback handed to the stream's listen()
+ //
+ // @return the subscription returned by the stream
+ subscription<Packet> receive(unsigned cpuid, std::function<int (Packet)> next_packet) {
+   auto sub = _queues[cpuid]->_rx_stream.listen(std::move(next_packet));
+   _queues[cpuid]->rx_start();
+   // Returned by name: wrapping a local in std::move() only prevents copy
+   // elision, the local is treated as an rvalue here anyway.
+   return sub;
+ }
+ ethernet_address hw_address() {
+   // Ask DPDK for the MAC address of our port and convert its raw bytes.
+   struct ether_addr port_mac;
+   rte_eth_macaddr_get(_port_idx, &port_mac);
+
+   return port_mac.addr_bytes;
+ }
+ // Copy of the HW feature set recorded for this port.
+ hw_features get_hw_features() {
+ return _hw_features;
+ }
+ // RSS key stored for this port.
+ const rss_key_type& rss_key() const { return _rss_key; }
+ // Number of queues the device was constructed with.
+ uint16_t hw_queues_count() { return _num_queues; }
+ // Creates the queue pair `qid` of this device, bound to the given event
+ // center. The new queue pair is only returned, not stored; see
+ // set_local_queue().
+ //
+ // @param c         Ceph context passed to the queue pair
+ // @param center    event center passed to the queue pair
+ // @param hugepages not used by this implementation
+ // @param qid       queue id
+ //
+ // @return owner of the newly created queue pair
+ std::unique_ptr<DPDKQueuePair> init_local_queue(CephContext *c, EventCenter *center, string hugepages, uint16_t qid) {
+   // Returned as a prvalue: no std::move() is needed (it would only prevent
+   // copy elision).
+   return std::make_unique<DPDKQueuePair>(c, center, this, qid);
+ }
+ unsigned hash2qid(uint32_t hash) {
+   // return hash % hw_queues_count();
+   // The hash is masked with (table size - 1) to pick a redirection entry.
+   // NOTE(review): the mask only covers the whole table when its size is a
+   // power of two - confirm where _redir_table is sized.
+   const auto mask = _redir_table.size() - 1;
+   return _redir_table[hash & mask];
+ }
+ void set_local_queue(unsigned i, std::unique_ptr<DPDKQueuePair> qp) {
+   // The slot must still be empty; it then takes ownership of the queue pair.
+   auto& slot = _queues[i];
+   ceph_assert(!slot);
+   slot = std::move(qp);
+ }
+ void unset_local_queue(unsigned i) {
+   // The slot must be populated; resetting it destroys the queue pair.
+   auto& slot = _queues[i];
+   ceph_assert(slot);
+   slot.reset();
+ }
+ // Picks the destination cpu for a packet that arrived on `src_cpuid`.
+ //
+ // If the source queue pair has no software redirection table the packet
+ // stays on `src_cpuid`. Otherwise the hash produced by `hashfn`, shifted
+ // right by _rss_table_bits, indexes the table (modulo its size).
+ //
+ // @param src_cpuid cpu/queue the packet arrived on
+ // @param hashfn    callable returning the packet hash; only invoked when a
+ //                  software redirection table is present
+ //
+ // @return destination cpu id
+ template <typename Func>
+ unsigned forward_dst(unsigned src_cpuid, Func&& hashfn) {
+   auto& qp = queue_for_cpu(src_cpuid);
+   if (!qp._sw_reta)
+     return src_cpuid;
+
+   // The guard above guarantees that the table is engaged from here on.
+   // (An assertion of the opposite at this point would abort every time a
+   // table is configured.)
+   auto hash = hashfn() >> _rss_table_bits;
+   auto& reta = *qp._sw_reta;
+   return reta[hash % reta.size()];
+ }
+ unsigned hash2cpu(uint32_t hash) {
+   // there is an assumption here that qid == get_id() which will
+   // not necessary be true in the future
+   auto same_hash = [hash] { return hash; };
+   return forward_dst(hash2qid(hash), same_hash);
+ }
+
+ // Mutable access to the HW feature set recorded for this port.
+ hw_features& hw_features_ref() { return _hw_features; }
+
+ // Default rx queue configuration taken from _dev_info.
+ const rte_eth_rxconf* def_rx_conf() const {
+ return &_dev_info.default_rxconf;
+ }
+
+ // Default tx queue configuration taken from _dev_info.
+ const rte_eth_txconf* def_tx_conf() const {
+ return &_dev_info.default_txconf;
+ }
+
+ /**
+ * Set the RSS table in the device and store it in the internal vector.
+ */
+ void set_rss_table();
+
+ // DPDK port index this device was constructed for.
+ uint8_t port_idx() { return _port_idx; }
+ // Value of the _is_i40e_device flag (false unless set elsewhere).
+ bool is_i40e_device() const {
+ return _is_i40e_device;
+ }
+ // Value of the _is_vmxnet3_device flag (false unless set elsewhere).
+ bool is_vmxnet3_device() const {
+ return _is_vmxnet3_device;
+ }
+};
+
+
+std::unique_ptr<DPDKDevice> create_dpdk_net_device(
+ CephContext *c, unsigned cores, uint8_t port_idx = 0,
+ bool use_lro = true, bool enable_fc = true);
+
+
+/**
+ * @return Number of bytes needed for mempool objects of each QP.
+ */
+uint32_t qp_mempool_obj_size();
+
+#endif // CEPH_DPDK_DEV_H
diff --git a/src/msg/async/dpdk/DPDKStack.cc b/src/msg/async/dpdk/DPDKStack.cc
new file mode 100644
index 00000000..3101ae57
--- /dev/null
+++ b/src/msg/async/dpdk/DPDKStack.cc
@@ -0,0 +1,281 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <memory>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <tuple>
+
+#include "common/ceph_argparse.h"
+#include "dpdk_rte.h"
+#include "DPDKStack.h"
+#include "DPDK.h"
+#include "IP.h"
+#include "TCP-Stack.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+#include "common/Cond.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdkstack "
+
+// Trampoline with the int(void*) shape expected by rte_eal_remote_launch():
+// `f` points at a std::function<void ()>, which is invoked once. Always
+// returns 0.
+static int dpdk_thread_adaptor(void* f)
+{
+  auto* entry = static_cast<std::function<void ()>*>(f);
+  (*entry)();
+  return 0;
+}
+
+/**
+ * Per-worker bring-up of the shared DPDK device.
+ *
+ * Worker 0 creates the device and later finishes port initialization; every
+ * worker sets up its own hardware queue pair. The function-local statics
+ * below (guarded by `lock`, signalled through `cond`) sequence the workers:
+ *   WAIT_DEVICE_STAGE   -> worker 0 has not published the device yet
+ *   WAIT_PORT_FIN_STAGE -> device exists, queues/port are being set up
+ *   DONE                -> init_port_fini() succeeded
+ * Aborts when a worker id has no hardware queue or init_port_fini() fails.
+ * The last worker to finish drops the shared device pointer and rewinds the
+ * stage so the statics are back in their initial state.
+ */
+void DPDKWorker::initialize()
+{
+  static enum {
+    WAIT_DEVICE_STAGE,
+    WAIT_PORT_FIN_STAGE,
+    DONE
+  } create_stage = WAIT_DEVICE_STAGE;
+  static Mutex lock("DPDKStack::lock");
+  static Cond cond;
+  static unsigned queue_init_done = 0;
+  static unsigned cores = 0;
+  static std::shared_ptr<DPDKDevice> sdev;
+
+  unsigned i = center.get_id();
+  if (i == 0) {
+    // Worker 0 creates the device. Port id, LRO and flow control are taken
+    // from the ms_dpdk_* options.
+    cores = cct->_conf->ms_async_op_threads;
+    std::unique_ptr<DPDKDevice> dev = create_dpdk_net_device(
+        cct, cores, cct->_conf->ms_dpdk_port_id,
+        cct->_conf->ms_dpdk_lro,
+        cct->_conf->ms_dpdk_hw_flow_control);
+    sdev = std::shared_ptr<DPDKDevice>(dev.release());
+    sdev->workers.resize(cores);
+    ldout(cct, 1) << __func__ << " using " << cores << " cores " << dendl;
+
+    // Publish the device to the other workers.
+    Mutex::Locker l(lock);
+    create_stage = WAIT_PORT_FIN_STAGE;
+    cond.Signal();
+  } else {
+    Mutex::Locker l(lock);
+    while (create_stage <= WAIT_DEVICE_STAGE)
+      cond.Wait(lock);
+  }
+  ceph_assert(sdev);
+  if (i < sdev->hw_queues_count()) {
+    auto qp = sdev->init_local_queue(cct, &center, cct->_conf->ms_dpdk_hugepages, i);
+    std::map<unsigned, float> cpu_weights;
+    // Every CPU beyond the hardware queues that maps onto this queue gets
+    // weight 1. The entry is keyed by the loop variable j; it was keyed by i
+    // before, so those CPUs never received an entry of their own.
+    for (unsigned j = sdev->hw_queues_count() + i % sdev->hw_queues_count();
+         j < cores; j+= sdev->hw_queues_count())
+      cpu_weights[j] = 1;
+    cpu_weights[i] = cct->_conf->ms_dpdk_hw_queue_weight;
+    qp->configure_proxies(cpu_weights);
+    sdev->set_local_queue(i, std::move(qp));
+    Mutex::Locker l(lock);
+    ++queue_init_done;
+    cond.Signal();
+  } else {
+    // Workers without a hardware queue are not supported.
+    // auto master = qid % sdev->hw_queues_count();
+    // sdev->set_local_queue(create_proxy_net_device(master, sdev.get()));
+    ceph_abort();
+  }
+  if (i == 0) {
+    {
+      // Wait until every worker has registered its queue.
+      Mutex::Locker l(lock);
+      while (queue_init_done < cores)
+        cond.Wait(lock);
+    }
+
+    if (sdev->init_port_fini() < 0) {
+      lderr(cct) << __func__ << " init_port_fini failed " << dendl;
+      ceph_abort();
+    }
+    Mutex::Locker l(lock);
+    create_stage = DONE;
+    cond.Signal();
+  } else {
+    Mutex::Locker l(lock);
+    while (create_stage <= WAIT_PORT_FIN_STAGE)
+      cond.Wait(lock);
+  }
+
+  sdev->workers[i] = this;
+  _impl = std::unique_ptr<DPDKWorker::Impl>(
+      new DPDKWorker::Impl(cct, i, &center, sdev));
+  {
+    // queue_init_done is counted back down here; the worker that brings it
+    // to zero resets the shared statics.
+    Mutex::Locker l(lock);
+    if (!--queue_init_done) {
+      create_stage = WAIT_DEVICE_STAGE;
+      sdev.reset();
+    }
+  }
+}
+
+using AvailableIPAddress = std::tuple<string, string, string>;
+// Split the three configured lists (host addresses, gateways, netmasks) with
+// string_to_vec() and append one AvailableIPAddress per position to `res`.
+// Returns false, leaving `res` untouched, when the host list is empty or the
+// three lists differ in length.
+static bool parse_available_address(
+    const string &ips, const string &gates, const string &masks, vector<AvailableIPAddress> &res)
+{
+  vector<string> hosts, gateways, netmasks;
+  string_to_vec(hosts, ips);
+  string_to_vec(gateways, gates);
+  string_to_vec(netmasks, masks);
+
+  const size_t count = hosts.size();
+  if (count == 0 || gateways.size() != count || netmasks.size() != count)
+    return false;
+
+  for (size_t n = 0; n != count; ++n)
+    res.emplace_back(hosts[n], gateways[n], netmasks[n]);
+  return true;
+}
+
+// Find the first entry of `avails` whose host address parses and is on the
+// same host as `ip`. On success its index is written to `res` and true is
+// returned; entries that fail to parse are skipped.
+static bool match_available_address(const vector<AvailableIPAddress> &avails,
+                                    const entity_addr_t &ip, int &res)
+{
+  for (size_t pos = 0; pos != avails.size(); ++pos) {
+    entity_addr_t candidate;
+    const bool parsed = candidate.parse(std::get<0>(avails[pos]).c_str());
+    if (parsed && candidate.is_same_host(ip)) {
+      res = pos;
+      return true;
+    }
+  }
+  return false;
+}
+
+DPDKWorker::Impl::Impl(CephContext *cct, unsigned i, EventCenter *c, std::shared_ptr<DPDKDevice> dev) // Build the per-worker interface and IPv4 stack, then apply the configured addresses.
+ : id(i), _netif(cct, dev, c), _dev(dev), _inet(cct, c, &_netif)
+{
+ vector<AvailableIPAddress> tuples;
+ bool parsed = parse_available_address(cct->_conf.get_val<std::string>("ms_dpdk_host_ipv4_addr"),
+ cct->_conf.get_val<std::string>("ms_dpdk_gateway_ipv4_addr"),
+ cct->_conf.get_val<std::string>("ms_dpdk_netmask_ipv4_addr"), tuples);
+ if (!parsed) { // empty or mismatched address lists are fatal
+ lderr(cct) << __func__ << " no available address "
+ << cct->_conf.get_val<std::string>("ms_dpdk_host_ipv4_addr") << ", "
+ << cct->_conf.get_val<std::string>("ms_dpdk_gateway_ipv4_addr") << ", "
+ << cct->_conf.get_val<std::string>("ms_dpdk_netmask_ipv4_addr") << ", "
+ << dendl;
+ ceph_abort();
+ }
+ _inet.set_host_address(ipv4_address(std::get<0>(tuples[0]))); // only the first configured triple is used
+ _inet.set_gw_address(ipv4_address(std::get<1>(tuples[0])));
+ _inet.set_netmask_address(ipv4_address(std::get<2>(tuples[0])));
+}
+
+DPDKWorker::Impl::~Impl() // Releases this worker's queue slot on the device.
+{
+ _dev->unset_local_queue(id);
+}
+
+int DPDKWorker::listen(entity_addr_t &sa, const SocketOptions &opt, // Open a TCP listener on sa's port via this worker's IPv4 stack.
+ ServerSocket *sock)
+{
+ ceph_assert(sa.get_family() == AF_INET); // only AF_INET addresses are accepted
+ ceph_assert(sock);
+
+ ldout(cct, 10) << __func__ << " addr " << sa << dendl;
+ // vector<AvailableIPAddress> tuples;
+ // bool parsed = parse_available_address(cct->_conf->ms_dpdk_host_ipv4_addr,
+ // cct->_conf->ms_dpdk_gateway_ipv4_addr,
+ // cct->_conf->ms_dpdk_netmask_ipv4_addr, tuples);
+ // if (!parsed) {
+ // lderr(cct) << __func__ << " no available address "
+ // << cct->_conf->ms_dpdk_host_ipv4_addr << ", "
+ // << cct->_conf->ms_dpdk_gateway_ipv4_addr << ", "
+ // << cct->_conf->ms_dpdk_netmask_ipv4_addr << ", "
+ // << dendl;
+ // return -EINVAL;
+ // }
+ // int idx;
+ // parsed = match_available_address(tuples, sa, idx);
+ // if (!parsed) {
+ // lderr(cct) << __func__ << " no matched address for " << sa << dendl;
+ // return -EINVAL;
+ // }
+ // _inet.set_host_address(ipv4_address(std::get<0>(tuples[idx])));
+ // _inet.set_gw_address(ipv4_address(std::get<1>(tuples[idx])));
+ // _inet.set_netmask_address(ipv4_address(std::get<2>(tuples[idx])));
+ return tcpv4_listen(_impl->_inet.get_tcp(), sa.get_port(), opt, sa.get_type(), // only the port and type of sa are passed on; returns tcpv4_listen()'s result
+ sock);
+}
+
+int DPDKWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) // Start a TCP connection to addr; opts is not used in this body.
+{
+ // ceph_assert(addr.get_family() == AF_INET);
+ int r = tcpv4_connect(_impl->_inet.get_tcp(), addr, socket);
+ ldout(cct, 10) << __func__ << " addr " << addr << dendl; // logged after the attempt, whatever its result
+ return r; // result of tcpv4_connect()
+}
+
+void DPDKStack::spawn_worker(unsigned i, std::function<void ()> &&func) // Launch worker i's function on a DPDK slave lcore.
+{
+ // create a extra master thread
+ //
+ funcs[i] = std::move(func); // stored in funcs; its address is handed to the launched lcore below
+ int r = 0;
+ r = dpdk::eal::init(cct);
+ if (r < 0) {
+ lderr(cct) << __func__ << " init dpdk rte failed, r=" << r << dendl;
+ ceph_abort();
+ }
+ // if dpdk::eal::init already called by NVMEDevice, we will select 1..n
+ // cores
+ ceph_assert(rte_lcore_count() >= i + 1); // NOTE(review): confirm this bound guarantees the loop below finds a slave lcore for index i
+ unsigned core_id;
+ int j = i; // keep the worker index; i is counted down by the loop
+ RTE_LCORE_FOREACH_SLAVE(core_id) { // stop at the i-th slave lcore
+ if (i-- == 0) {
+ break;
+ }
+ }
+ dpdk::eal::execute_on_master([&]() { // captures locals by reference -- presumably execute_on_master() runs the callable before returning; TODO confirm
+ r = rte_eal_remote_launch(dpdk_thread_adaptor, static_cast<void*>(&funcs[j]), core_id);
+ if (r < 0) {
+ lderr(cct) << __func__ << " remote launch failed, r=" << r << dendl;
+ ceph_abort();
+ }
+ });
+}
+
+void DPDKStack::join_worker(unsigned i) // Wait for the lcore running worker i to finish.
+{
+ dpdk::eal::execute_on_master([&]() {
+ rte_eal_wait_lcore(i+1); // NOTE(review): uses lcore id i+1, while spawn_worker() picks the i-th slave lcore by iteration -- confirm the two always coincide
+ });
+}
diff --git a/src/msg/async/dpdk/DPDKStack.h b/src/msg/async/dpdk/DPDKStack.h
new file mode 100644
index 00000000..a44ae383
--- /dev/null
+++ b/src/msg/async/dpdk/DPDKStack.h
@@ -0,0 +1,257 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_MSG_DPDKSTACK_H
+#define CEPH_MSG_DPDKSTACK_H
+
+#include <functional>
+
+#include "common/ceph_context.h"
+#include "common/Tub.h"
+
+#include "msg/async/Stack.h"
+#include "net.h"
+#include "const.h"
+#include "IP.h"
+#include "Packet.h"
+
+class interface;
+
+template <typename Protocol>
+class NativeConnectedSocketImpl;
+
+// DPDKServerSocketImpl
+template <typename Protocol> // ServerSocketImpl that delegates to a Protocol::listener.
+class DPDKServerSocketImpl : public ServerSocketImpl {
+ typename Protocol::listener _listener; // listener obtained from the protocol in the constructor
+ public:
+ DPDKServerSocketImpl(Protocol& proto, uint16_t port, const SocketOptions &opt,
+ int type);
+ int listen() { // forwards to the listener and returns its result
+ return _listener.listen();
+ }
+ virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+ virtual void abort_accept() override;
+ virtual int fd() const override { // the listener's fd
+ return _listener.fd();
+ }
+};
+
+// NativeConnectedSocketImpl
+/**
+ * ConnectedSocketImpl backed by a Protocol::connection.
+ *
+ * Incoming data is pulled from the connection one Packet at a time (_buf) and
+ * handed out fragment by fragment; _cur_frag/_cur_off track the position
+ * inside the current packet. read() additionally keeps the not yet consumed
+ * part of a fragment in _cache_ptr.
+ */
+template <typename Protocol>
+class NativeConnectedSocketImpl : public ConnectedSocketImpl {
+  typename Protocol::connection _conn;
+  uint32_t _cur_frag = 0;     // index of the next fragment of _buf to hand out
+  uint32_t _cur_off = 0;      // byte offset of that fragment within _buf
+  Tub<Packet> _buf;           // packet currently being consumed, if any
+  Tub<bufferptr> _cache_ptr;  // unread remainder of the fragment last fetched by read()
+
+ public:
+  explicit NativeConnectedSocketImpl(typename Protocol::connection conn)
+    : _conn(std::move(conn)) {}
+  // Move constructor: takes over the connection together with the whole read
+  // state, so a partially consumed packet continues at the right fragment.
+  // The initializer used to name a non-existent member (`rhs.buf`) and left
+  // the cursors and the cached fragment behind.
+  NativeConnectedSocketImpl(NativeConnectedSocketImpl &&rhs)
+    : _conn(std::move(rhs._conn)),
+      _cur_frag(rhs._cur_frag), _cur_off(rhs._cur_off),
+      _buf(std::move(rhs._buf)), _cache_ptr(std::move(rhs._cache_ptr)) {}
+  virtual int is_connected() override {
+    return _conn.is_connected();
+  }
+
+  // Copy up to `len` bytes into `buf`, draining _cache_ptr first and refilling
+  // it through zero_copy_read(). Returns the number of bytes copied, -EAGAIN
+  // when nothing was copied, or zero_copy_read()'s result when that is <= 0
+  // and not -EAGAIN.
+  virtual ssize_t read(char *buf, size_t len) override {
+    size_t left = len;
+    ssize_t r = 0;
+    size_t off = 0;
+    while (left > 0) {
+      if (!_cache_ptr) {
+        _cache_ptr.construct();
+        r = zero_copy_read(*_cache_ptr);
+        if (r <= 0) {
+          _cache_ptr.destroy();
+          if (r == -EAGAIN)
+            break;
+          return r;
+        }
+      }
+      if (_cache_ptr->length() <= left) {
+        // The cached fragment fits completely; consume and release it.
+        _cache_ptr->copy_out(0, _cache_ptr->length(), buf+off);
+        left -= _cache_ptr->length();
+        off += _cache_ptr->length();
+        _cache_ptr.destroy();
+      } else {
+        // Only part of the fragment fits; keep the rest for the next call.
+        _cache_ptr->copy_out(0, left, buf+off);
+        _cache_ptr->set_offset(_cache_ptr->offset() + left);
+        _cache_ptr->set_length(_cache_ptr->length() - left);
+        left = 0;
+        break;
+      }
+    }
+    return len - left ? len - left : -EAGAIN;
+  }
+
+  // Hand out the next fragment of the current packet as a bufferptr that
+  // refers to the fragment's memory; a share of the packet is bound into the
+  // deleter so it lives as long as the bufferptr. Returns the fragment
+  // length, -EAGAIN when the connection has no packet, or the connection's
+  // get_errno() value when that is <= 0.
+  virtual ssize_t zero_copy_read(bufferptr &data) override {
+    auto err = _conn.get_errno();
+    if (err <= 0)
+      return err;
+
+    if (!_buf) {
+      _buf = std::move(_conn.read());
+      if (!_buf)
+        return -EAGAIN;
+    }
+
+    fragment &f = _buf->frag(_cur_frag);
+    Packet p = _buf->share(_cur_off, f.size);
+    auto del = std::bind(
+        [](Packet &p) {}, std::move(p));
+    data = buffer::claim_buffer(
+        f.size, f.base, make_deleter(std::move(del)));
+    if (++_cur_frag == _buf->nr_frags()) {
+      // Last fragment handed out: drop the packet and rewind the cursors.
+      _cur_frag = 0;
+      _cur_off = 0;
+      _buf.destroy();
+    } else {
+      _cur_off += f.size;
+    }
+    ceph_assert(data.length());
+    return data.length();
+  }
+  // Send as many leading buffers of `bl` as the connection currently has room
+  // for. The sent part is spliced out of `bl` (or `bl` is moved from entirely
+  // when everything fits) and bound into the packet's deleter. Returns 0 when
+  // there is no room, a negative get_errno() value on error, otherwise the
+  // result of _conn.send().
+  virtual ssize_t send(bufferlist &bl, bool more) override {
+    auto err = _conn.get_errno();
+    if (err < 0)
+      return (ssize_t)err;
+
+    size_t available = _conn.peek_sent_available();
+    if (available == 0) {
+      return 0;
+    }
+
+    std::vector<fragment> frags;
+    std::list<bufferptr>::const_iterator pb = bl.buffers().begin();
+    uint64_t left_pbrs = bl.buffers().size();
+    uint64_t len = 0;
+    uint64_t seglen = 0;
+    while (len < available && left_pbrs--) {
+      seglen = pb->length();
+      if (len + seglen > available) {
+        // don't continue if we enough at least 1 fragment since no available
+        // space for next ptr.
+        if (len > 0)
+          break;
+        seglen = std::min(seglen, available);
+      }
+      len += seglen;
+      frags.push_back(fragment{(char*)pb->c_str(), seglen});
+      ++pb;
+    }
+
+    if (len != bl.length()) {
+      bufferlist swapped;
+      bl.splice(0, len, &swapped);
+      auto del = std::bind(
+          [](bufferlist &bl) {}, std::move(swapped));
+      return _conn.send(Packet(std::move(frags), make_deleter(std::move(del))));
+    } else {
+      auto del = std::bind(
+          [](bufferlist &bl) {}, std::move(bl));
+
+      return _conn.send(Packet(std::move(frags), make_deleter(std::move(del))));
+    }
+  }
+  virtual void shutdown() override {
+    _conn.close_write();
+  }
+  // FIXME need to impl close
+  virtual void close() override {
+    _conn.close_write();
+  }
+  virtual int fd() const override {
+    return _conn.fd();
+  }
+  virtual int socket_fd() const override {
+    return _conn.fd();
+  }
+
+};
+
+template <typename Protocol> // Constructor: obtains the listener for `port` from the protocol; `opt` is not used here.
+DPDKServerSocketImpl<Protocol>::DPDKServerSocketImpl(
+ Protocol& proto, uint16_t port, const SocketOptions &opt, int type)
+ : ServerSocketImpl(type), _listener(proto.listen(port)) {}
+
+template <typename Protocol> // Accept one pending connection into *s; returns 0, -EAGAIN if none is pending, or the listener's negative errno.
+int DPDKServerSocketImpl<Protocol>::accept(ConnectedSocket *s, const SocketOptions &options, entity_addr_t *out, Worker *w) {
+ if (_listener.get_errno() < 0)
+ return _listener.get_errno();
+ auto c = _listener.accept();
+ if (!c) // nothing to accept right now
+ return -EAGAIN;
+
+ if (out) { // report the peer address, tagged with this socket's addr_type
+ *out = c->remote_addr();
+ out->set_type(addr_type);
+ }
+ std::unique_ptr<NativeConnectedSocketImpl<Protocol>> csi(
+ new NativeConnectedSocketImpl<Protocol>(std::move(*c)));
+ *s = ConnectedSocket(std::move(csi));
+ return 0;
+}
+
+template <typename Protocol> // Forwards abort_accept() to the listener.
+void DPDKServerSocketImpl<Protocol>::abort_accept() {
+ _listener.abort_accept();
+}
+
+class DPDKWorker : public Worker { // Worker whose sockets run on the per-worker DPDK IPv4/TCP stack held in Impl.
+ struct Impl { // per-worker network state; created in initialize(), dropped in destroy()
+ unsigned id;
+ interface _netif;
+ std::shared_ptr<DPDKDevice> _dev;
+ ipv4 _inet;
+ Impl(CephContext *cct, unsigned i, EventCenter *c, std::shared_ptr<DPDKDevice> dev);
+ ~Impl();
+ };
+ std::unique_ptr<Impl> _impl; // empty until initialize() has run
+
+ virtual void initialize() override;
+ void set_ipv4_packet_filter(ip_packet_filter* filter) { // forwards to the IPv4 stack; requires _impl
+ _impl->_inet.set_packet_filter(filter);
+ }
+ using tcp4 = tcp<ipv4_traits>;
+
+ public:
+ explicit DPDKWorker(CephContext *c, unsigned i): Worker(c, i) {}
+ virtual int listen(entity_addr_t &addr, const SocketOptions &opts, ServerSocket *) override;
+ virtual int connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) override;
+ void arp_learn(ethernet_address l2, ipv4_address l3) { // forwards the l2/l3 pair to the IPv4 stack's learn()
+ _impl->_inet.learn(l2, l3);
+ }
+ virtual void destroy() override { // destroys Impl
+ _impl.reset();
+ }
+
+ friend class DPDKServerSocketImpl<tcp4>;
+};
+
+class DPDKStack : public NetworkStack { // NetworkStack whose workers are launched through DPDK (see spawn_worker/join_worker).
+ vector<std::function<void()> > funcs; // one slot per possible worker
+ public:
+ explicit DPDKStack(CephContext *cct, const string &t): NetworkStack(cct, t) {
+ funcs.resize(cct->_conf->ms_async_max_op_threads); // sized from the configured maximum number of op threads
+ }
+ virtual bool support_zero_copy_read() const override { return true; }
+ virtual bool support_local_listen_table() const override { return true; }
+
+ virtual void spawn_worker(unsigned i, std::function<void ()> &&func) override;
+ virtual void join_worker(unsigned i) override;
+};
+
+#endif
diff --git a/src/msg/async/dpdk/EventDPDK.cc b/src/msg/async/dpdk/EventDPDK.cc
new file mode 100644
index 00000000..5d291716
--- /dev/null
+++ b/src/msg/async/dpdk/EventDPDK.cc
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include "DPDKStack.h"
+#include "EventDPDK.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "DPDKDriver."
+
+int DPDKDriver::init(EventCenter *c, int nevent) // Nothing to set up: both arguments are ignored and 0 is returned.
+{
+ return 0;
+}
+
+/**
+ * Register interest in `add_mask` for `fd` with the userspace event manager.
+ *
+ * @param fd        descriptor handed to manager.listen()
+ * @param cur_mask  mask currently registered; only logged
+ * @param add_mask  events to start listening for
+ * @return 0 on success, otherwise the negative value returned by
+ *         manager.listen()
+ */
+int DPDKDriver::add_event(int fd, int cur_mask, int add_mask)
+{
+  ldout(cct, 20) << __func__ << " add event fd=" << fd << " cur_mask=" << cur_mask
+                 << " add_mask=" << add_mask << dendl;
+
+  int r = manager.listen(fd, add_mask);
+  if (r < 0) {
+    lderr(cct) << __func__ << " add fd=" << fd << " failed. "
+               << cpp_strerror(-r) << dendl;
+    // Propagate the code that was just logged, as del_event() does. The
+    // global errno is not known to be set by the manager and could be stale
+    // or even zero here.
+    return r;
+  }
+
+  return 0;
+}
+
+// Stop listening for `delmask` on `fd`. An EVENT_NONE mask is a no-op.
+// Returns 0 on success or the negative value from manager.unlisten();
+// `cur_mask` is only logged.
+int DPDKDriver::del_event(int fd, int cur_mask, int delmask)
+{
+  ldout(cct, 20) << __func__ << " del event fd=" << fd << " cur_mask=" << cur_mask
+                 << " delmask=" << delmask << dendl;
+
+  if (delmask == EVENT_NONE)
+    return 0;
+
+  const int rc = manager.unlisten(fd, delmask);
+  if (rc < 0) {
+    lderr(cct) << __func__ << " delete fd=" << fd << " delmask=" << delmask
+               << " failed." << cpp_strerror(-rc) << dendl;
+    return rc;
+  }
+  return 0;
+}
+
+int DPDKDriver::resize_events(int newsize) // No-op: newsize is ignored and 0 is returned.
+{
+ return 0;
+}
+
+/**
+ * Poll the userspace event manager and report what fired.
+ *
+ * @param fired_events  resized to the number of fired events and filled with
+ *                      their fd/mask pairs; left untouched when nothing fired
+ * @param tvp           timeout passed through to manager.poll()
+ * @return the value returned by manager.poll()
+ */
+int DPDKDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp)
+{
+  // Compile-time bound, so the scratch buffers are ordinary arrays instead
+  // of variable-length arrays (which are a compiler extension in C++).
+  constexpr int num_events = 512;
+  int events[num_events];
+  int masks[num_events];
+
+  int retval = manager.poll(events, masks, num_events, tvp);
+  if (retval > 0) {
+    fired_events.resize(retval);
+    for (int i = 0; i < retval; i++) {
+      fired_events[i].fd = events[i];
+      fired_events[i].mask = masks[i];
+    }
+  }
+  return retval;
+}
diff --git a/src/msg/async/dpdk/EventDPDK.h b/src/msg/async/dpdk/EventDPDK.h
new file mode 100644
index 00000000..541c2210
--- /dev/null
+++ b/src/msg/async/dpdk/EventDPDK.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_EVENTDPDK_H
+#define CEPH_EVENTDPDK_H
+
+#include "msg/async/Event.h"
+#include "msg/async/Stack.h"
+#include "UserspaceEvent.h"
+
+class DPDKDriver : public EventDriver { // EventDriver implemented on top of a UserspaceEventManager.
+ CephContext *cct;
+
+ public:
+ UserspaceEventManager manager; // public member; add_event/del_event/event_wait delegate to it
+
+ explicit DPDKDriver(CephContext *c): cct(c), manager(c) {}
+ virtual ~DPDKDriver() { }
+
+ int init(EventCenter *c, int nevent) override;
+ int add_event(int fd, int cur_mask, int add_mask) override;
+ int del_event(int fd, int cur_mask, int del_mask) override;
+ int resize_events(int newsize) override;
+ int event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tp) override;
+ bool need_wakeup() override { return false; } // this driver always reports that no wakeup is needed
+};
+
+#endif //CEPH_EVENTDPDK_H
diff --git a/src/msg/async/dpdk/IP.cc b/src/msg/async/dpdk/IP.cc
new file mode 100644
index 00000000..f730cded
--- /dev/null
+++ b/src/msg/async/dpdk/IP.cc
@@ -0,0 +1,470 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/perf_counters.h"
+
+#include "capture.h"
+#include "IP.h"
+#include "toeplitz.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdk "
+
+// Print an IPv4 address in dotted-decimal form, most significant byte of
+// `a.ip` first.
+std::ostream& operator<<(std::ostream& os, const ipv4_address& a) {
+  auto addr = a.ip;
+  for (int shift = 24; shift >= 0; shift -= 8) {
+    os << ((addr >> shift) & 0xff);
+    if (shift != 0)
+      os << ".";
+  }
+  return os;
+}
+
+utime_t ipv4::_frag_timeout = utime_t(30, 0); // initialized from utime_t(30, 0), presumably 30 seconds; used by frag_timeout() to expire incomplete datagrams
+constexpr uint32_t ipv4::_frag_low_thresh; // out-of-class definition; the value is set at the in-class declaration
+constexpr uint32_t ipv4::_frag_high_thresh; // out-of-class definition; the value is set at the in-class declaration
+
+class C_handle_frag_timeout : public EventCallback { // Event callback that runs ipv4::frag_timeout() on the stack it was created for.
+ ipv4 *_ipv4; // not owned
+
+ public:
+ C_handle_frag_timeout(ipv4 *i): _ipv4(i) {}
+ void do_request(uint64_t fd_or_id) { // the id argument is ignored
+ _ipv4->frag_timeout();
+ }
+};
+
+enum { // perf counter ids registered by the ipv4 constructor
+ l_dpdk_qp_first = 99000, // lower bound passed to PerfCountersBuilder
+ l_dpdk_total_linearize_operations,
+ l_dpdk_qp_last // upper bound passed to PerfCountersBuilder
+};
+
+ipv4::ipv4(CephContext *c, EventCenter *cen, interface* netif) // Wire the IPv4 layer to the interface: ARP, L3 receive/forward hooks, TCP and ICMP handlers, perf counters.
+ : cct(c), center(cen), _netif(netif), _global_arp(netif),
+ _arp(c, _global_arp, cen),
+ _host_address(0), _gw_address(0), _netmask(0), // addresses start at 0
+ _l3(netif, eth_protocol_num::ipv4, [this] { return get_packet(); }), // outgoing packets are pulled through get_packet()
+ _rx_packets(
+ _l3.receive(
+ [this] (Packet p, ethernet_address ea) {
+ return handle_received_packet(std::move(p), ea);
+ },
+ [this] (forward_hash& out_hash_data, Packet& p, size_t off) {
+ return forward(out_hash_data, p, off);
+ }
+ )
+ ),
+ _tcp(*this, cen), _icmp(c, *this),
+ _l4({{ uint8_t(ip_protocol_num::tcp), &_tcp }, // protocol number -> L4 handler; only TCP and ICMP are registered
+ { uint8_t(ip_protocol_num::icmp), &_icmp }}),
+ _packet_filter(nullptr)
+{
+ PerfCountersBuilder plb(cct, "ipv4", l_dpdk_qp_first, l_dpdk_qp_last);
+ plb.add_u64_counter(l_dpdk_total_linearize_operations, "dpdk_ip_linearize_operations", "DPDK IP Packet linearization operations");
+ perf_logger = plb.create_perf_counters();
+ cct->get_perfcounters_collection()->add(perf_logger);
+ frag_handler = new C_handle_frag_timeout(this); // raw allocation; its release is not visible in this function
+}
+
+bool ipv4::forward(forward_hash& out_hash_data, Packet& p, size_t off) // Append this packet's forwarding-hash inputs to out_hash_data; always returns true.
+{
+ auto iph = p.get_header<ip_hdr>(off);
+
+ out_hash_data.push_back(iph->src_ip.ip); // addresses are pushed as found in the header, before ntoh()
+ out_hash_data.push_back(iph->dst_ip.ip);
+
+ auto h = iph->ntoh();
+ auto l4 = _l4[h.ip_proto];
+ if (l4) {
+ if (h.mf() == false && h.offset() == 0) {
+ // This IP datagram is atomic, forward according to tcp connection hash
+ l4->forward(out_hash_data, p, off + sizeof(ip_hdr));
+ }
+ // else forward according to ip fields only
+ }
+ return true;
+}
+
+/**
+ * Process one IPv4 packet received from the link layer.
+ *
+ * Steps: validate the header checksum (unless offloaded or the packet was
+ * reassembled), trim or reject on length mismatch, learn the sender's MAC
+ * for on-link sources, run the optional packet filter, drop packets not
+ * addressed to this host, reassemble fragments, and finally hand the
+ * payload to the registered L4 handler.
+ *
+ * @param p     packet starting at the IP header
+ * @param from  source MAC address of the frame
+ * @return always 0; packets that cannot be processed are dropped silently
+ */
+int ipv4::handle_received_packet(Packet p, ethernet_address from)
+{
+  auto iph = p.get_header<ip_hdr>(0);
+  if (!iph) {
+    return 0;
+  }
+
+  // Skip checking csum of reassembled IP datagram
+  if (!get_hw_features().rx_csum_offload && !p.offload_info_ref().reassembled) {
+    checksummer csum;
+    csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph));
+    if (csum.get() != 0) {
+      return 0;
+    }
+  }
+
+  auto h = iph->ntoh();
+  unsigned ip_len = h.len;
+  unsigned ip_hdr_len = h.ihl * 4;
+  unsigned pkt_len = p.len();
+  auto offset = h.offset();
+
+  ldout(cct, 10) << __func__ << " get " << std::hex << int(h.ip_proto)
+                 << std::dec << " packet from "
+                 << h.src_ip << " -> " << h.dst_ip << " id=" << h.id
+                 << " ip_len=" << ip_len << " ip_hdr_len=" << ip_hdr_len
+                 << " pkt_len=" << pkt_len << " offset=" << offset << dendl;
+
+  if (pkt_len > ip_len) {
+    // Trim extra data in the packet beyond IP total length
+    p.trim_back(pkt_len - ip_len);
+  } else if (pkt_len < ip_len) {
+    // Drop if it contains less than IP total length
+    return 0;
+  }
+  // Drop if the reassembled datagram will be larger than maximum IP size
+  if (offset + p.len() > ip_packet_len_max) {
+    return 0;
+  }
+
+  // FIXME: process options
+  if (in_my_netmask(h.src_ip) && h.src_ip != _host_address) {
+    ldout(cct, 20) << __func__ << " learn mac " << from << " with " << h.src_ip << dendl;
+    _arp.learn(from, h.src_ip);
+  }
+
+  if (_packet_filter) {
+    bool handled = false;
+    _packet_filter->handle(p, &h, from, handled);
+    if (handled) {
+      return 0;
+    }
+  }
+
+  if (h.dst_ip != _host_address) {
+    // FIXME: forward
+    return 0;
+  }
+
+  // Does this IP datagram need reassembly
+  auto mf = h.mf();
+  if (mf == true || offset != 0) {
+    frag_limit_mem();
+    auto frag_id = ipv4_frag_id{h.src_ip, h.dst_ip, h.id, h.ip_proto};
+    auto& frag = _frags[frag_id];
+    if (mf == false) {
+      frag.last_frag_received = true;
+    }
+    // This is a newly created frag_id
+    if (frag.mem_size == 0) {
+      _frags_age.push_back(frag_id);
+      frag.rx_time = ceph_clock_now();
+    }
+    auto added_size = frag.merge(h, offset, std::move(p));
+    _frag_mem += added_size;
+    if (frag.is_complete()) {
+      // All the fragments are received
+      auto dropped_size = frag.mem_size;
+      auto& ip_data = frag.data.map.begin()->second;
+      // Choose a cpu to forward this packet
+      auto cpu_id = center->get_id();
+      auto l4 = _l4[h.ip_proto];
+      if (l4) {
+        size_t l4_offset = 0;
+        forward_hash hash_data;
+        hash_data.push_back(hton(h.src_ip.ip));
+        hash_data.push_back(hton(h.dst_ip.ip));
+        l4->forward(hash_data, ip_data, l4_offset);
+        cpu_id = _netif->hash2cpu(toeplitz_hash(_netif->rss_key(), hash_data));
+      }
+
+      // No need to forward if the dst cpu is the current cpu
+      if (cpu_id == center->get_id()) {
+        // With no handler registered for this protocol cpu_id is left
+        // unchanged, so this branch is the one taken; the datagram is then
+        // dropped instead of dereferencing a null handler.
+        if (l4) {
+          l4->received(std::move(ip_data), h.src_ip, h.dst_ip);
+        }
+      } else {
+        auto to = _netif->hw_address();
+        auto pkt = frag.get_assembled_packet(from, to);
+        _netif->forward(center, cpu_id, std::move(pkt));
+      }
+
+      // Delete this frag from _frags and _frags_age
+      frag_drop(frag_id, dropped_size);
+      _frags_age.remove(frag_id);
+      perf_logger->set(l_dpdk_total_linearize_operations,
+                       ipv4_packet_merger::linearizations());
+    } else {
+      // Some of the fragments are missing
+      // NOTE(review): this re-arms only when frag_timefd is already set --
+      // confirm the condition is not meant to be the opposite.
+      if (frag_timefd) {
+        frag_arm();
+      }
+    }
+    return 0;
+  }
+
+  auto l4 = _l4[h.ip_proto];
+  if (l4) {
+    // Trim IP header and pass to upper layer
+    p.trim_front(ip_hdr_len);
+    l4->received(std::move(p), h.src_ip, h.dst_ip);
+  }
+  return 0;
+}
+
+// Resolve the link-layer destination for a packet to `to` and hand packet and
+// callback to ARP. Destinations inside the local netmask are resolved
+// directly; everything else is resolved via the default gateway.
+void ipv4::wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb) {
+  ipv4_address next_hop = in_my_netmask(to) ? to : _gw_address;
+  _arp.wait(std::move(next_hop), std::move(p), std::move(cb));
+}
+
+const hw_features& ipv4::get_hw_features() const // Hardware feature set reported by the underlying interface.
+{
+ return _netif->get_hw_features();
+}
+
+void ipv4::send(ipv4_address to, ip_protocol_num proto_num, // Prepend an IPv4 header to p (fragmenting first if needed) and queue the result(s) on _packetq.
+ Packet p, ethernet_address e_dst) {
+ auto needs_frag = this->needs_frag(p, proto_num, get_hw_features());
+
+ auto send_pkt = [this, to, proto_num, needs_frag, e_dst] (Packet& pkt, uint16_t remaining, uint16_t offset) mutable { // builds the header for one packet/fragment and enqueues it
+ static uint16_t id = 0; // function-local static: one counter for every call. NOTE(review): unsynchronized -- confirm workers cannot race here
+ auto iph = pkt.prepend_header<ip_hdr>();
+ iph->ihl = sizeof(*iph) / 4; // header length in 32-bit words; no options are emitted
+ iph->ver = 4;
+ iph->dscp = 0;
+ iph->ecn = 0;
+ iph->len = pkt.len(); // total length including the header just prepended
+ // FIXME: a proper id
+ iph->id = id++;
+ if (needs_frag) {
+ uint16_t mf = remaining > 0; // "more fragments" is set on all but the last piece
+ // The fragment offset is measured in units of 8 octets (64 bits)
+ auto off = offset / 8;
+ iph->frag = (mf << uint8_t(ip_hdr::frag_bits::mf)) | off;
+ } else {
+ iph->frag = 0;
+ }
+ iph->ttl = 64;
+ iph->ip_proto = (uint8_t)proto_num;
+ iph->csum = 0;
+ iph->src_ip = _host_address;
+ iph->dst_ip = to;
+ ldout(cct, 20) << " ipv4::send " << " id=" << iph->id << " " << _host_address << " -> " << to
+ << " len " << pkt.len() << dendl;
+ *iph = iph->hton(); // convert the header with hton() before checksumming
+
+ if (get_hw_features().tx_csum_ip_offload) {
+ iph->csum = 0;
+ pkt.offload_info_ref().needs_ip_csum = true; // leave the checksum to the NIC
+ } else {
+ checksummer csum;
+ csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph));
+ iph->csum = csum.get();
+ }
+
+ _packetq.push_back(
+ l3_protocol::l3packet{eth_protocol_num::ipv4, e_dst, std::move(pkt)});
+ };
+
+ if (needs_frag) {
+ uint16_t offset = 0;
+ uint16_t remaining = p.len();
+ auto mtu = get_hw_features().mtu;
+
+ while (remaining) {
+ auto can_send = std::min(uint16_t(mtu - ipv4_hdr_len_min), remaining); // payload bytes that fit next to a minimal header. NOTE(review): offsets are divided by 8 above -- confirm this size is always a multiple of 8
+ remaining -= can_send;
+ auto pkt = p.share(offset, can_send);
+ send_pkt(pkt, remaining, offset);
+ offset += can_send;
+ }
+ } else {
+ // The whole packet can be send in one shot
+ send_pkt(p, 0, 0);
+ }
+}
+
+Tub<l3_protocol::l3packet> ipv4::get_packet() { // Return the next L3 packet to transmit, or an empty Tub when there is none.
+ // _packetq will be mostly empty here unless it hold remnants of previously
+ // fragmented packet
+ if (_packetq.empty()) {
+ for (size_t i = 0; i < _pkt_providers.size(); i++) { // ask each provider at most once, starting where the previous call stopped
+ auto l4p = _pkt_providers[_pkt_provider_idx++]();
+ if (_pkt_provider_idx == _pkt_providers.size()) {
+ _pkt_provider_idx = 0; // wrap around
+ }
+ if (l4p) {
+ ldout(cct, 20) << " ipv4::get_packet len " << l4p->p.len() << dendl;
+ send(l4p->to, l4p->proto_num, std::move(l4p->p), l4p->e_dst); // send() fills _packetq with one or more packets
+ break;
+ }
+ }
+ }
+
+ Tub<l3_protocol::l3packet> p;
+ if (!_packetq.empty()) {
+ p = std::move(_packetq.front());
+ _packetq.pop_front();
+ }
+ return p;
+}
+
+void ipv4::frag_limit_mem() { // When fragment memory exceeds the high threshold, drop the oldest datagrams until usage is back at the low threshold.
+ if (_frag_mem <= _frag_high_thresh) {
+ return;
+ }
+ auto drop = _frag_mem - _frag_low_thresh; // number of bytes that still have to be released
+ while (drop) {
+ if (_frags_age.empty()) {
+ return;
+ }
+ // Drop the oldest frag (first element) from _frags_age
+ auto frag_id = _frags_age.front();
+ _frags_age.pop_front();
+
+ // Drop from _frags as well
+ auto& frag = _frags[frag_id];
+ auto dropped_size = frag.mem_size;
+ frag_drop(frag_id, dropped_size);
+
+ drop -= std::min(drop, dropped_size); // clamp so the counter cannot wrap below zero
+ }
+}
+
+void ipv4::frag_timeout() { // Timer callback: discard incomplete datagrams older than _frag_timeout, then re-arm if any remain.
+ if (_frags.empty()) {
+ return;
+ }
+ auto now = ceph_clock_now();
+ for (auto it = _frags_age.begin(); it != _frags_age.end();) { // _frags_age is ordered oldest first
+ auto frag_id = *it;
+ auto& frag = _frags[frag_id];
+ if (now > frag.rx_time + _frag_timeout) {
+ auto dropped_size = frag.mem_size;
+ // Drop from _frags
+ frag_drop(frag_id, dropped_size);
+ // Drop from _frags_age
+ it = _frags_age.erase(it);
+ } else {
+ // The further items can only be younger
+ break;
+ }
+ }
+ if (_frags.size() != 0) {
+ frag_arm(now);
+ } else {
+ _frag_mem = 0; // nothing left: reset the memory counter
+ }
+}
+
+int32_t ipv4::frag::merge(ip_hdr &h, uint16_t offset, Packet p) { // Add one fragment to this datagram; returns the change in tracked memory size.
+ uint32_t old = mem_size;
+ unsigned ip_hdr_len = h.ihl * 4;
+ // Store IP header
+ if (offset == 0) { // the header is kept from the fragment at offset 0
+ header = p.share(0, ip_hdr_len);
+ }
+ // Store IP payload
+ p.trim_front(ip_hdr_len);
+ data.merge(offset, std::move(p));
+ // Update mem size
+ mem_size = header.memory(); // recomputed from scratch: header plus every stored payload piece
+ for (const auto& x : data.map) {
+ mem_size += x.second.memory();
+ }
+ auto added_size = mem_size - old;
+ return added_size;
+}
+
+bool ipv4::frag::is_complete() { // True once the last fragment was seen and the payload is a single piece starting at offset 0.
+ // If all the fragments are received, ipv4::frag::merge() should merge all
+ // the fragments into a single packet
+ auto offset = data.map.begin()->first; // dereferences begin(), so data.map must not be empty when this is called
+ auto nr_packet = data.map.size();
+ return last_frag_received && nr_packet == 1 && offset == 0;
+}
+
+Packet ipv4::frag::get_assembled_packet(ethernet_address from, ethernet_address to) { // Build an Ethernet frame (eth header + IP header + payload) from the reassembled datagram; consumes header and data.
+ auto& ip_header = header;
+ auto& ip_data = data.map.begin()->second;
+ // Append a ethernet header, needed for forwarding
+ auto eh = ip_header.prepend_header<eth_hdr>();
+ eh->src_mac = from;
+ eh->dst_mac = to;
+ eh->eth_proto = uint16_t(eth_protocol_num::ipv4);
+ *eh = eh->hton();
+ // Prepare a packet contains both ethernet header, ip header and ip data
+ ip_header.append(std::move(ip_data));
+ auto pkt = std::move(ip_header);
+ auto iph = pkt.get_header<ip_hdr>(sizeof(eth_hdr));
+ // len is the sum of each fragment
+ iph->len = hton(uint16_t(pkt.len() - sizeof(eth_hdr)));
+ // No fragmentation for the assembled datagram
+ iph->frag = 0;
+ // Since each fragment's csum is checked, no need to csum
+ // again for the assembled datagram
+ offload_info oi;
+ oi.reassembled = true; // lets handle_received_packet() skip the checksum test
+ pkt.set_offload_info(oi);
+ return pkt;
+}
+
+void icmp::received(Packet p, ipaddr from, ipaddr to) { // Answer ICMP echo requests; every other ICMP message is ignored.
+ auto hdr = p.get_header<icmp_hdr>(0);
+ if (!hdr || hdr->type != icmp_hdr::msg_type::echo_request) {
+ return;
+ }
+ hdr->type = icmp_hdr::msg_type::echo_reply; // the request packet is rewritten in place into the reply
+ hdr->code = 0;
+ hdr->csum = 0; // must be zero while the checksum is computed
+ checksummer csum;
+ csum.sum(reinterpret_cast<char*>(hdr), p.len());
+ hdr->csum = csum.get();
+
+ if (_queue_space.get_or_fail(p.len())) { // drop packets that do not fit the queue
+ auto cb = [this, from] (const ethernet_address e_dst, Packet p, int r) mutable { // runs when resolution finishes; queues the reply only when r == 0
+ if (r == 0) {
+ _packetq.emplace_back(ipv4_traits::l4packet{from, std::move(p), e_dst, ip_protocol_num::icmp});
+ }
+ };
+ _inet.wait_l2_dst_address(from, std::move(p), cb);
+ }
+}
diff --git a/src/msg/async/dpdk/IP.h b/src/msg/async/dpdk/IP.h
new file mode 100644
index 00000000..480b4b95
--- /dev/null
+++ b/src/msg/async/dpdk/IP.h
@@ -0,0 +1,414 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+
+#ifndef CEPH_MSG_IP_H_
+#define CEPH_MSG_IP_H_
+
+#include <arpa/inet.h>
+#include <unordered_map>
+#include <cstdint>
+#include <array>
+#include <map>
+#include <list>
+#include <chrono>
+
+#include "msg/async/Event.h"
+#include "common/Throttle.h"
+
+#include "array_map.h"
+#include "ARP.h"
+#include "IPChecksum.h"
+#include "ip_types.h"
+#include "const.h"
+#include "net.h"
+#include "PacketUtil.h"
+#include "toeplitz.h"
+
+class ipv4;
+template <ip_protocol_num ProtoNum>
+class ipv4_l4;
+
+template <typename InetTraits>
+class tcp;
+
+// Types and helpers describing the IPv4 flavour of the stack; passed as the
+// InetTraits argument of the protocol templates in this header.
+struct ipv4_traits {
+  using address_type = ipv4_address;
+  using inet_type = ipv4_l4<ip_protocol_num::tcp>;
+
+  // An L4 payload together with the addressing needed to send it.
+  struct l4packet {
+    ipv4_address to;
+    Packet p;
+    ethernet_address e_dst;
+    ip_protocol_num proto_num;
+  };
+
+  // Callable asked for the next packet to transmit; the Tub may come back
+  // empty (the provider installed by icmp does so when its queue is empty).
+  using packet_provider_type = std::function<Tub<l4packet> ()>;
+
+  // Feeds source address, destination address, a zero byte, the TCP protocol
+  // number and len into csum, in that order.
+  static void tcp_pseudo_header_checksum(checksummer& csum, ipv4_address src, ipv4_address dst, uint16_t len) {
+    const uint8_t zero_pad = 0;
+    const uint8_t proto = uint8_t(ip_protocol_num::tcp);
+    csum.sum_many(src.ip, dst.ip, zero_pad, proto, len);
+  }
+
+  static constexpr uint8_t ip_hdr_len_min = ipv4_hdr_len_min;
+};
+
+// Per-protocol facade over the shared ipv4 object. Both member functions
+// forward to `_inet`; their definitions follow the ipv4 class further down.
+template <ip_protocol_num ProtoNum>
+class ipv4_l4 {
+ public:
+  ipv4& _inet;
+
+  ipv4_l4(ipv4& inet) : _inet(inet) {}
+
+  // Registers `func` with the ipv4 layer; packets it yields get ProtoNum
+  // stamped as their protocol number.
+  void register_packet_provider(ipv4_traits::packet_provider_type func);
+  // Forwards to ipv4::wait_l2_dst_address().
+  void wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb);
+};
+
+// Interface implemented by the L4 protocol handlers in this header
+// (ipv4_tcp, ipv4_icmp).
+class ip_protocol {
+ public:
+  virtual ~ip_protocol() = default;
+
+  // Called with an inbound packet and its source/destination addresses.
+  virtual void received(Packet p, ipv4_address from, ipv4_address to) = 0;
+
+  // Default implementation leaves out_hash_data untouched and returns true.
+  virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) {
+    return true;
+  }
+};
+
+// Connection identifier: the local/foreign address and port 4-tuple.
+template <typename InetTraits>
+struct l4connid {
+  using ipaddr = typename InetTraits::address_type;
+  using inet_type = typename InetTraits::inet_type;
+  // Hash functor for unordered containers; defined at the end of the header.
+  struct connid_hash;
+
+  ipaddr local_ip;
+  ipaddr foreign_ip;
+  uint16_t local_port;
+  uint16_t foreign_port;
+
+  // Two ids are equal when all four components match.
+  bool operator==(const l4connid& x) const {
+    if (!(local_ip == x.local_ip)) {
+      return false;
+    }
+    if (!(foreign_ip == x.foreign_ip)) {
+      return false;
+    }
+    if (!(local_port == x.local_port)) {
+      return false;
+    }
+    return foreign_port == x.foreign_port;
+  }
+
+  // Toeplitz hash over (foreign ip, local ip, foreign port, local port),
+  // each converted with hton() before hashing.
+  uint32_t hash(const rss_key_type& rss_key) {
+    forward_hash tuple;
+    tuple.push_back(hton(foreign_ip.ip));
+    tuple.push_back(hton(local_ip.ip));
+    tuple.push_back(hton(foreign_port));
+    tuple.push_back(hton(local_port));
+    return toeplitz_hash(rss_key, tuple);
+  }
+};
+
+// ip_protocol handler for TCP; owns the tcp<ipv4_traits> instance that
+// ipv4::get_tcp() exposes.
+class ipv4_tcp final : public ip_protocol {
+  ipv4_l4<ip_protocol_num::tcp> _inet_l4;
+  std::unique_ptr<tcp<ipv4_traits>> _tcp;
+
+ public:
+  ipv4_tcp(ipv4& inet, EventCenter *c);
+  ~ipv4_tcp();
+
+  void received(Packet p, ipv4_address from, ipv4_address to) override;
+  bool forward(forward_hash& out_hash_data, Packet& p, size_t off) override;
+
+  friend class ipv4;
+};
+
+// ICMP header overlay; packed so the fields carry no padding.
+struct icmp_hdr {
+ // The two message types this stack handles (see icmp::received).
+ enum class msg_type : uint8_t {
+ echo_reply = 0,
+ echo_request = 8,
+ };
+ msg_type type;
+ uint8_t code;
+ // Filled from checksummer::get() when a reply is built.
+ uint16_t csum;
+ // Remaining four header bytes; not interpreted by the code in this file.
+ uint32_t rest;
+} __attribute__((packed));
+
+
+// ICMP endpoint: builds echo replies (see received()) and hands them to the
+// ipv4 layer through a registered packet provider.
+class icmp {
+ public:
+  using ipaddr = ipv4_address;
+  using inet_type = ipv4_l4<ip_protocol_num::icmp>;
+
+  // Registers a provider that pops one queued reply per call and gives the
+  // popped packet's length back to the _queue_space throttle.
+  explicit icmp(CephContext *c, inet_type& inet)
+    : cct(c), _inet(inet), _queue_space(c, "DPDK::icmp::_queue_space", 212992) {
+    _inet.register_packet_provider([this] {
+      Tub<ipv4_traits::l4packet> next;
+      if (_packetq.empty()) {
+        return next;
+      }
+      next = std::move(_packetq.front());
+      _packetq.pop_front();
+      _queue_space.put(next->p.len());
+      return next;
+    });
+  }
+
+  void received(Packet p, ipaddr from, ipaddr to);
+
+ private:
+  CephContext *cct;
+  // ipv4_l4<ip_protocol_num::icmp>
+  inet_type& _inet;
+  // Replies waiting to be picked up by the provider above.
+  circular_buffer<ipv4_traits::l4packet> _packetq;
+  // Byte budget for queued replies.
+  Throttle _queue_space;
+};
+
+// ip_protocol handler for ICMP; passes received packets on to the embedded
+// icmp object.
+class ipv4_icmp final : public ip_protocol {
+  CephContext *cct;
+  ipv4_l4<ip_protocol_num::icmp> _inet_l4;
+  icmp _icmp;
+
+ public:
+  ipv4_icmp(CephContext *c, ipv4& inet)
+    : cct(c),
+      _inet_l4(inet),
+      _icmp(c, _inet_l4) {}
+
+  void received(Packet p, ipv4_address from, ipv4_address to) override {
+    _icmp.received(std::move(p), from, to);
+  }
+
+  friend class ipv4;
+};
+
+struct ip_hdr;
+
+// Hook interface for the single packet filter that can be installed with
+// ipv4::set_packet_filter().
+struct ip_packet_filter {
+  virtual ~ip_packet_filter() = default;
+  // `handled` is an in/out flag passed by reference.
+  virtual void handle(Packet& p, ip_hdr* iph, ethernet_address from, bool & handled) = 0;
+};
+
+// Key for the fragment reassembly table (ipv4::_frags).
+struct ipv4_frag_id {
+  // Hash functor; defined right after this struct.
+  struct hash;
+
+  ipv4_address src_ip;
+  ipv4_address dst_ip;
+  uint16_t identification;
+  uint8_t protocol;
+
+  // Equal when all four fields match.
+  bool operator==(const ipv4_frag_id& x) const {
+    if (!(src_ip == x.src_ip)) {
+      return false;
+    }
+    if (!(dst_ip == x.dst_ip)) {
+      return false;
+    }
+    if (!(identification == x.identification)) {
+      return false;
+    }
+    return protocol == x.protocol;
+  }
+};
+
+// XOR of the std::hash values of the four key fields.
+struct ipv4_frag_id::hash : private std::hash<ipv4_address>,
+  private std::hash<uint16_t>, private std::hash<uint8_t> {
+  size_t operator()(const ipv4_frag_id& id) const noexcept {
+    using addr_hash = std::hash<ipv4_address>;
+    using ident_hash = std::hash<uint16_t>;
+    using proto_hash = std::hash<uint8_t>;
+    size_t h = addr_hash::operator()(id.src_ip);
+    h ^= addr_hash::operator()(id.dst_ip);
+    h ^= ident_hash::operator()(id.identification);
+    h ^= proto_hash::operator()(id.protocol);
+    return h;
+  }
+};
+
+struct ipv4_tag {};
+using ipv4_packet_merger = packet_merger<uint32_t, ipv4_tag>;
+
+class interface;
+
+// IPv4 layer: address configuration, ARP, the L4 handlers (TCP, ICMP),
+// fragment reassembly bookkeeping and the transmit queue.
+class ipv4 {
+ public:
+  using address_type = ipv4_address;
+  using proto_type = uint16_t;
+  static address_type broadcast_address() { return ipv4_address(0xffffffff); }
+  static proto_type arp_protocol_type() { return proto_type(eth_protocol_num::ipv4); }
+  CephContext *cct;
+  EventCenter *center;
+
+ private:
+  interface* _netif;
+  std::vector<ipv4_traits::packet_provider_type> _pkt_providers;
+  // Id returned by create_time_event() for the fragment timer, if armed.
+  Tub<uint64_t> frag_timefd;
+  // Deleted in the destructor.
+  EventCallbackRef frag_handler;
+  arp _global_arp;
+  arp_for<ipv4> _arp;
+  ipv4_address _host_address;
+  ipv4_address _gw_address;
+  ipv4_address _netmask;
+  l3_protocol _l3;
+  subscription<Packet, ethernet_address> _rx_packets;
+  ipv4_tcp _tcp;
+  ipv4_icmp _icmp;
+  array_map<ip_protocol*, 256> _l4;
+  ip_packet_filter *_packet_filter;
+
+  // Reassembly state for one fragmented datagram.
+  struct frag {
+    Packet header;
+    ipv4_packet_merger data;
+    utime_t rx_time;
+    uint32_t mem_size = 0;
+    // a fragment with MF == 0 indicates it is the last fragment
+    bool last_frag_received = false;
+
+    Packet get_assembled_packet(ethernet_address from, ethernet_address to);
+    int32_t merge(ip_hdr &h, uint16_t offset, Packet p);
+    bool is_complete();
+  };
+  std::unordered_map<ipv4_frag_id, frag, ipv4_frag_id::hash> _frags;
+  std::list<ipv4_frag_id> _frags_age;
+  static utime_t _frag_timeout;
+  static constexpr uint32_t _frag_low_thresh{3 * 1024 * 1024};
+  static constexpr uint32_t _frag_high_thresh{4 * 1024 * 1024};
+  // Running total maintained by frag_drop().
+  uint32_t _frag_mem = 0;
+  circular_buffer<l3_protocol::l3packet> _packetq;
+  unsigned _pkt_provider_idx = 0;
+  PerfCounters *perf_logger;
+
+ private:
+  int handle_received_packet(Packet p, ethernet_address from);
+  bool forward(forward_hash& out_hash_data, Packet& p, size_t off);
+  Tub<l3_protocol::l3packet> get_packet();
+
+  // True when `a` agrees with the host address on every bit of the netmask.
+  bool in_my_netmask(ipv4_address a) const {
+    return ((a.ip ^ _host_address.ip) & _netmask.ip) == 0;
+  }
+
+  void frag_limit_mem();
+
+  // Forgets the reassembly entry and subtracts its size from _frag_mem.
+  void frag_drop(ipv4_frag_id frag_id, uint32_t dropped_size) {
+    _frags.erase(frag_id);
+    _frag_mem -= dropped_size;
+  }
+
+  // Creates the fragment time event from now + _frag_timeout (microseconds).
+  // NOTE(review): the value passed is derived from an absolute utime_t;
+  // confirm create_time_event() expects that rather than a relative delay.
+  void frag_arm(utime_t now) {
+    const uint64_t usec = (now + _frag_timeout).to_nsec() / 1000;
+    frag_timefd.construct(center->create_time_event(usec, frag_handler));
+  }
+
+  // Same as above but uses the current time only; _frag_timeout is not
+  // added here.
+  void frag_arm() {
+    const uint64_t usec = ceph_clock_now().to_nsec() / 1000;
+    frag_timefd.construct(center->create_time_event(usec, frag_handler));
+  }
+
+ public:
+  void frag_timeout();
+
+ public:
+  explicit ipv4(CephContext *c, EventCenter *cen, interface* netif);
+  ~ipv4() {
+    delete frag_handler;
+  }
+
+  // Also tells the ARP layer about the new address.
+  void set_host_address(ipv4_address ip) {
+    _host_address = ip;
+    _arp.set_self_addr(ip);
+  }
+  ipv4_address host_address() {
+    return _host_address;
+  }
+  void set_gw_address(ipv4_address ip) {
+    _gw_address = ip;
+  }
+  ipv4_address gw_address() const {
+    return _gw_address;
+  }
+  void set_netmask_address(ipv4_address ip) {
+    _netmask = ip;
+  }
+  ipv4_address netmask_address() const {
+    return _netmask;
+  }
+  interface *netif() const {
+    return _netif;
+  }
+  // TODO: this should perhaps become an ordered list of filters; for now a
+  // single raw pointer suffices.
+  void set_packet_filter(ip_packet_filter *f) {
+    _packet_filter = f;
+  }
+  ip_packet_filter * packet_filter() const {
+    return _packet_filter;
+  }
+  void send(ipv4_address to, ip_protocol_num proto_num, Packet p, ethernet_address e_dst);
+  tcp<ipv4_traits>& get_tcp() { return *_tcp._tcp; }
+  void register_l4(proto_type id, ip_protocol* handler);
+  const hw_features& get_hw_features() const;
+
+  // A packet needs software fragmentation unless it fits the MTU together
+  // with a minimal IP header, or it is TCP and the hardware offers TSO.
+  static bool needs_frag(Packet& p, ip_protocol_num proto_num, hw_features hw_features) {
+    const bool fits_mtu = p.len() + ipv4_hdr_len_min <= hw_features.mtu;
+    const bool tso_handles_it = proto_num == ip_protocol_num::tcp && hw_features.tx_tso;
+    return !(fits_mtu || tso_handles_it);
+  }
+
+  void learn(ethernet_address l2, ipv4_address l3) {
+    _arp.learn(l2, l3);
+  }
+  void register_packet_provider(ipv4_traits::packet_provider_type&& func) {
+    _pkt_providers.push_back(std::move(func));
+  }
+  void wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb);
+};
+
+// Wraps `func` so that every packet it yields is tagged with ProtoNum, then
+// registers the wrapper with the ipv4 layer.
+template <ip_protocol_num ProtoNum>
+inline void ipv4_l4<ProtoNum>::register_packet_provider(
+  ipv4_traits::packet_provider_type func) {
+  auto tagging_provider = [func] {
+    auto next = func();
+    if (next) {
+      next->proto_num = ProtoNum;
+    }
+    return next;
+  };
+  _inet.register_packet_provider(tagging_provider);
+}
+
+// Plain forwarder to ipv4::wait_l2_dst_address().
+template <ip_protocol_num ProtoNum>
+inline void ipv4_l4<ProtoNum>::wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb) {
+  _inet.wait_l2_dst_address(
+    to,
+    std::move(p),
+    std::move(cb));
+}
+
+// IPv4 header overlay (packed). hton()/ntoh() return a converted copy and
+// leave *this untouched.
+struct ip_hdr {
+  uint8_t ihl : 4;
+  uint8_t ver : 4;
+  uint8_t dscp : 6;
+  uint8_t ecn : 2;
+  uint16_t len;
+  uint16_t id;
+  uint16_t frag;
+  // Bit positions inside `frag`, plus the shift used by offset().
+  enum class frag_bits : uint8_t { mf = 13, df = 14, reserved = 15, offset_shift = 3 };
+  uint8_t ttl;
+  uint8_t ip_proto;
+  uint16_t csum;
+  ipv4_address src_ip;
+  ipv4_address dst_ip;
+  uint8_t options[0];
+
+  // Copy with the multi-byte fields converted by ::hton().
+  ip_hdr hton() {
+    ip_hdr net = *this;
+    net.len = ::hton(len);
+    net.id = ::hton(id);
+    net.frag = ::hton(frag);
+    net.csum = ::hton(csum);
+    net.src_ip.ip = ::hton(src_ip.ip);
+    net.dst_ip.ip = ::hton(dst_ip.ip);
+    return net;
+  }
+
+  // Copy with the multi-byte fields converted by ::ntoh(); the addresses go
+  // through ipv4_address::ntoh().
+  ip_hdr ntoh() {
+    ip_hdr host = *this;
+    host.len = ::ntoh(len);
+    host.id = ::ntoh(id);
+    host.frag = ::ntoh(frag);
+    host.csum = ::ntoh(csum);
+    host.src_ip = src_ip.ntoh();
+    host.dst_ip = dst_ip.ntoh();
+    return host;
+  }
+
+  // "More fragments" flag.
+  bool mf() { return (frag & (1 << uint8_t(frag_bits::mf))) != 0; }
+  // "Don't fragment" flag.
+  bool df() { return (frag & (1 << uint8_t(frag_bits::df))) != 0; }
+  // frag shifted left by three; the uint16_t return type drops the flag bits.
+  uint16_t offset() { return frag << uint8_t(frag_bits::offset_shift); }
+} __attribute__((packed));
+
+// XOR of the std::hash values of both addresses and both ports.
+template <typename InetTraits>
+struct l4connid<InetTraits>::connid_hash : private std::hash<ipaddr>, private std::hash<uint16_t> {
+  size_t operator()(const l4connid<InetTraits>& id) const noexcept {
+    using addr_hash = std::hash<ipaddr>;
+    using port_hash = std::hash<uint16_t>;
+    size_t h = addr_hash::operator()(id.local_ip);
+    h ^= addr_hash::operator()(id.foreign_ip);
+    h ^= port_hash::operator()(id.local_port);
+    h ^= port_hash::operator()(id.foreign_port);
+    return h;
+  }
+};
+
+#endif /* CEPH_MSG_IP_H_ */
diff --git a/src/msg/async/dpdk/IPChecksum.cc b/src/msg/async/dpdk/IPChecksum.cc
new file mode 100644
index 00000000..7a3253c1
--- /dev/null
+++ b/src/msg/async/dpdk/IPChecksum.cc
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include <arpa/inet.h>
+#include "net.h"
+#include "IPChecksum.h"
+
+// Adds `len` bytes starting at `data` to the running sum. The bytes are
+// treated as a continuation of whatever was summed before: `odd` records
+// whether an odd number of bytes has been consumed so far, so buffers may be
+// fed in arbitrary pieces. An empty buffer leaves the state unchanged.
+void checksummer::sum(const char* data, size_t len) {
+  auto orig_len = len;
+  // A dangling byte from an earlier call pairs up with our first byte, which
+  // therefore goes into the low half of its 16-bit word. This must only
+  // happen when there is a byte to take: with len == 0 the read would be out
+  // of bounds and --len would wrap around.
+  if (odd && len) {
+    csum += uint8_t(*data++);
+    --len;
+  }
+  // Bulk of the data, eight bytes at a time ...
+  auto p64 = reinterpret_cast<const uint64_t*>(data);
+  while (len >= 8) {
+    csum += ntohq(*p64++);
+    len -= 8;
+  }
+  // ... then whole 16-bit words ...
+  auto p16 = reinterpret_cast<const uint16_t*>(p64);
+  while (len >= 2) {
+    csum += ntohs(*p16++);
+    len -= 2;
+  }
+  // ... and finally a single trailing byte, which is the high half of a word
+  // whose low half may arrive with the next call.
+  if (len) {
+    csum += *reinterpret_cast<const uint8_t*>(p16) << 8;
+  }
+  odd ^= orig_len & 1;
+}
+
+// Folds the 128-bit accumulator down to 16 bits, complements it and returns
+// the result passed through htons().
+uint16_t checksummer::get() const {
+  const __int128 low64 = 0xffffffffffffffff;
+  // 128 -> 64 bits; done twice so the carry of the first fold is absorbed.
+  __int128 wide = (csum & low64) + (csum >> 64);
+  uint64_t acc = (wide & low64) + (wide >> 64);
+  // 64 -> 16 bits: add up the four 16-bit lanes ...
+  acc = (acc & 0xffff) + ((acc >> 16) & 0xffff) + ((acc >> 32) & 0xffff) + (acc >> 48);
+  // ... and fold the carries back in, twice.
+  for (int pass = 0; pass < 2; ++pass) {
+    acc = (acc & 0xffff) + (acc >> 16);
+  }
+  return htons(~acc);
+}
+
+// Sums every fragment of the packet in order.
+void checksummer::sum(const Packet& p) {
+  const unsigned count = p.nr_frags();
+  for (unsigned i = 0; i < count; ++i) {
+    const fragment piece = p.frag(i);
+    sum(piece.base, piece.size);
+  }
+}
+
+// One-shot checksum over a single contiguous buffer.
+uint16_t ip_checksum(const void* data, size_t len) {
+  checksummer one_shot;
+  one_shot.sum(static_cast<const char*>(data), len);
+  return one_shot.get();
+}
diff --git a/src/msg/async/dpdk/IPChecksum.h b/src/msg/async/dpdk/IPChecksum.h
new file mode 100644
index 00000000..9af4a86b
--- /dev/null
+++ b/src/msg/async/dpdk/IPChecksum.h
@@ -0,0 +1,72 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_CHECKSUM_H_
+#define CEPH_MSG_CHECKSUM_H_
+
+#include <cstdint>
+#include <cstddef>
+#include <arpa/inet.h>
+
+#include "Packet.h"
+
+uint16_t ip_checksum(const void* data, size_t len);
+
+// Incremental checksum accumulator. Data can be added in pieces of any
+// size; `odd` remembers whether the next byte is the low half of a 16-bit
+// word. get() folds the accumulator into the final 16-bit value.
+struct checksummer {
+  __int128 csum = 0;
+  bool odd = false;
+
+  void sum(const char* data, size_t len);
+  void sum(const Packet& p);
+
+  // One byte: high half of a word at an even position, low half otherwise.
+  void sum(uint8_t data) {
+    csum += odd ? data : data << 8;
+    odd = !odd;
+  }
+
+  // 16-bit value: added directly at an even position, otherwise split into
+  // bytes (high byte first).
+  void sum(uint16_t data) {
+    if (!odd) {
+      csum += data;
+      return;
+    }
+    sum(uint8_t(data >> 8));
+    sum(uint8_t(data));
+  }
+
+  // 32-bit value: added directly at an even position, otherwise split into
+  // two 16-bit halves (low half first).
+  void sum(uint32_t data) {
+    if (!odd) {
+      csum += data;
+      return;
+    }
+    sum(uint16_t(data));
+    sum(uint16_t(data >> 16));
+  }
+
+  // Recursion end for the variadic helper below.
+  void sum_many() {}
+
+  // Sums each argument in order through the matching sum() overload.
+  template <typename T0, typename... T>
+  void sum_many(T0 data, T... rest) {
+    sum(data);
+    sum_many(rest...);
+  }
+
+  uint16_t get() const;
+};
+
+#endif /* CEPH_MSG_CHECKSUM_H_ */
diff --git a/src/msg/async/dpdk/Packet.cc b/src/msg/async/dpdk/Packet.cc
new file mode 100644
index 00000000..6c2320a0
--- /dev/null
+++ b/src/msg/async/dpdk/Packet.cc
@@ -0,0 +1,146 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <iostream>
+#include <algorithm>
+#include <cctype>
+
+#include "capture.h"
+#include "Packet.h"
+
+constexpr size_t Packet::internal_data_size;
+constexpr size_t Packet::default_nr_frags;
+
+// Collapses consecutive fragments, starting at index at_frag, into a single
+// newly allocated fragment. Fragments are taken until their combined size
+// reaches desired_size; the new fragment holds all of their bytes.
+void Packet::linearize(size_t at_frag, size_t desired_size) {
+ // make sure frags[0] no longer points into the impl's internal buffer
+ _impl->unuse_internal_data();
+ size_t nr_frags = 0;
+ size_t accum_size = 0;
+ // count the fragments needed to cover desired_size
+ // NOTE(review): nothing here checks against _nr_frags - presumably callers
+ // guarantee the packet holds desired_size bytes from at_frag on; confirm
+ while (accum_size < desired_size) {
+ accum_size += _impl->frags[at_frag + nr_frags].size;
+ ++nr_frags;
+ }
+ // copy the selected fragments back to back into one buffer
+ char *new_frag = new char[accum_size];
+ auto p = new_frag;
+ for (size_t i = 0; i < nr_frags; ++i) {
+ auto& f = _impl->frags[at_frag + i];
+ p = std::copy(f.base, f.base + f.size, p);
+ }
+ // collapse nr_frags into one fragment
+ std::copy(_impl->frags + at_frag + nr_frags, _impl->frags + _impl->_nr_frags,
+ _impl->frags + at_frag + 1);
+ _impl->_nr_frags -= nr_frags - 1;
+ _impl->frags[at_frag] = fragment{new_frag, accum_size};
+ if (at_frag == 0 && desired_size == len()) {
+ // We can drop the old buffer safely
+ auto x = std::move(_impl->_deleter);
+ _impl->_deleter = make_deleter([new_frag] { delete []new_frag; });
+ } else {
+ // keep the previous deleter alive by binding it into the new one, which
+ // additionally frees new_frag
+ auto del = std::bind(
+ [new_frag](deleter &d) { delete []new_frag; }, std::move(_impl->_deleter));
+ _impl->_deleter = make_deleter(std::move(del));
+ }
+}
+
+// Event callback that destroys a deleter and then runs a completion
+// callback when the event is processed. Deletes itself afterwards.
+class C_free_on_cpu : public EventCallback {
+  deleter del;
+  std::function<void()> cb;
+
+ public:
+  C_free_on_cpu(deleter &&d, std::function<void()> &&c)
+    : del(std::move(d)), cb(std::move(c)) {}
+
+  void do_request(uint64_t fd) {
+    // Move the deleter into a local so that it is destroyed by this call;
+    // otherwise its destructor would run on the cpu that created the
+    // external event, when the work item itself is destroyed.
+    deleter released_here(std::move(del));
+    cb();
+    delete this;
+  }
+};
+
+// Returns a copy of this packet whose resources are released through
+// `center`: the current deleter is wrapped so that, when it would run, a
+// C_free_on_cpu event carrying it (and `cb`) is dispatched to that
+// EventCenter instead.
+Packet Packet::free_on_cpu(EventCenter *center, std::function<void()> cb)
+{
+  auto hand_over = std::bind(
+    [center, cb] (deleter &orig) mutable {
+      center->dispatch_event_external(new C_free_on_cpu(std::move(orig), std::move(cb)));
+    }, std::move(_impl->_deleter));
+  // make new deleter that runs old deleter on an origin cpu
+  _impl->_deleter = make_deleter(deleter(), std::move(hand_over));
+
+  return Packet(impl::copy(_impl.get()));
+}
+
+// Debug formatter. Prints "Packet{...}" with one entry per fragment,
+// separated by ", ". A fragment whose bytes all lie in [9, 0x7f] is shown as
+// a quoted string with \r, \n, \t and \xNN escapes; any other fragment is
+// shown as {..} holding space-separated two-digit hex bytes.
+std::ostream& operator<<(std::ostream& os, const Packet& p) {
+  // Hex digits are looked up by hand so the stream's formatting flags are
+  // neither relied upon nor modified.
+  static const char hex_digits[] = "0123456789abcdef";
+  os << "Packet{";
+  bool first = true;
+  for (auto&& frag : p.fragments()) {
+    if (!first) {
+      os << ", ";
+    }
+    first = false;
+    if (std::all_of(frag.base, frag.base + frag.size, [] (int c) { return c >= 9 && c <= 0x7f; })) {
+      os << '"';
+      for (auto it = frag.base; it != frag.base + frag.size; ++it) {
+        auto c = *it;
+        if (isprint(static_cast<unsigned char>(c))) {
+          os << c;
+        } else if (c == '\r') {
+          os << "\\r";
+        } else if (c == '\n') {
+          os << "\\n";
+        } else if (c == '\t') {
+          os << "\\t";
+        } else {
+          // two hex digits; streaming the nibbles as integers would print
+          // them in decimal (0x0b would come out as \x011)
+          uint8_t b = c;
+          os << "\\x" << hex_digits[b >> 4] << hex_digits[b & 0xf];
+        }
+      }
+      os << '"';
+    } else {
+      os << "{";
+      bool nfirst = true;
+      for (auto it = frag.base; it != frag.base + frag.size; ++it) {
+        if (!nfirst) {
+          os << " ";
+        }
+        nfirst = false;
+        // streaming a uint8_t directly would emit the raw byte as a character
+        uint8_t b = *it;
+        os << hex_digits[b >> 4] << hex_digits[b & 0xf];
+      }
+      os << "}";
+    }
+  }
+  os << "}";
+  return os;
+}
diff --git a/src/msg/async/dpdk/Packet.h b/src/msg/async/dpdk/Packet.h
new file mode 100644
index 00000000..db9cd2a7
--- /dev/null
+++ b/src/msg/async/dpdk/Packet.h
@@ -0,0 +1,550 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_PACKET_H_
+#define CEPH_MSG_PACKET_H_
+
+#include <vector>
+#include <algorithm>
+#include <iosfwd>
+
+#include "include/types.h"
+#include "common/Tub.h"
+#include "common/deleter.h"
+#include "msg/async/Event.h"
+
+#include "const.h"
+
+// One contiguous piece of packet data: start address and length in bytes.
+// Initialised positionally throughout (e.g. fragment{ptr, len}), so the
+// member order matters.
+struct fragment {
+ char* base;
+ size_t size;
+};
+
+// Per-packet offload metadata stored alongside the packet data (see
+// Packet::offload_info() / set_offload_info()).
+struct offload_info {
+ ip_protocol_num protocol = ip_protocol_num::unused;
+ bool needs_csum = false;
+ // Default header lengths in bytes.
+ uint8_t ip_hdr_len = 20;
+ uint8_t tcp_hdr_len = 20;
+ uint8_t udp_hdr_len = 8;
+ bool needs_ip_csum = false;
+ // Set on datagrams produced by ipv4::frag::get_assembled_packet().
+ bool reassembled = false;
+ uint16_t tso_seg_size = 0;
+ // HW stripped VLAN header (CPU order)
+ Tub<uint16_t> vlan_tci;
+};
+
+// Zero-copy friendly packet class
+//
+// For implementing zero-copy, we need a flexible destructor that can
+// destroy packet data in different ways: decrementing a reference count,
+// or calling a free()-like function.
+//
+// Moreover, we need different destructors for each set of fragments within
+// a single packet. For example, a header and trailer might need delete[]
+// to be called, while the internal data needs a reference count to be
+// released. Matters are complicated in that fragments can be split
+// (due to virtual/physical translation).
+//
+// To implement this, we associate each packet with a single destructor,
+// but allow composing a packet from another packet plus a fragment to
+// be added, with its own destructor, causing the destructors to be chained.
+//
+// The downside is that the data needed for the destructor is duplicated,
+// if it is already available in the fragment itself.
+//
+// As an optimization, when we allocate small fragments, we allocate some
+// extra space, so prepending to the packet does not require extra
+// allocations. This is useful when adding headers.
+//
+// Move-only packet made of a list of fragments plus one deleter that
+// releases whatever the fragments point to. All state lives in a
+// heap-allocated `impl` whose trailing flexible array holds the fragment
+// descriptors.
+class Packet {
+  // enough for lots of headers, not quite two cache lines:
+  static constexpr size_t internal_data_size = 128 - 16;
+  static constexpr size_t default_nr_frags = 4;
+
+  // Minimal begin()/end()/operator[] view over a fragment array, enough for
+  // range-for loops.
+  struct pseudo_vector {
+    fragment* _start;
+    fragment* _finish;
+    pseudo_vector(fragment* start, size_t nr)
+      : _start(start), _finish(_start + nr) {}
+    fragment* begin() { return _start; }
+    fragment* end() { return _finish; }
+    fragment& operator[](size_t idx) { return _start[idx]; }
+  };
+
+  struct impl {
+    // when destroyed, virtual destructor will reclaim resources
+    deleter _deleter;
+    // total number of data bytes across all fragments
+    unsigned _len = 0;
+    // fragments in use / fragment slots allocated behind this object
+    uint16_t _nr_frags = 0;
+    uint16_t _allocated_frags;
+    offload_info _offload_info;
+    Tub<uint32_t> rss_hash;
+    char data[internal_data_size]; // only frags[0] may use
+    unsigned headroom = internal_data_size; // in data
+    // FIXME: share data/frags space
+
+    fragment frags[];
+
+    explicit impl(size_t nr_frags = default_nr_frags);
+    impl(const impl&) = delete;
+    impl(fragment frag, size_t nr_frags = default_nr_frags);
+
+    pseudo_vector fragments() { return { frags, _nr_frags }; }
+
+    // Allocates an empty impl with room for at least default_nr_frags
+    // fragment descriptors.
+    static std::unique_ptr<impl> allocate(size_t nr_frags) {
+      nr_frags = std::max(nr_frags, default_nr_frags);
+      return std::unique_ptr<impl>(new (nr_frags) impl(nr_frags));
+    }
+
+    // Builds a new impl with room for `nr` fragments and transfers the
+    // contents of `old` into it. The deleter is moved, so `old` must not be
+    // used to release the data afterwards.
+    static std::unique_ptr<impl> copy(impl* old, size_t nr) {
+      auto n = allocate(nr);
+      n->_deleter = std::move(old->_deleter);
+      n->_len = old->_len;
+      n->_nr_frags = old->_nr_frags;
+      n->headroom = old->headroom;
+      n->_offload_info = old->_offload_info;
+      // Carry the RSS hash over only when the source has one, and pass the
+      // stored value. construct() forwards its arguments to the element's
+      // constructor, so handing it the Tub itself would presumably store the
+      // Tub's truthiness instead of the hash - verify against Tub.h.
+      if (old->rss_hash) {
+        n->rss_hash.construct(*old->rss_hash);
+      }
+      std::copy(old->frags, old->frags + old->_nr_frags, n->frags);
+      old->copy_internal_fragment_to(n.get());
+      return n;
+    }
+
+    static std::unique_ptr<impl> copy(impl* old) {
+      return copy(old, old->_nr_frags);
+    }
+
+    // Returns `old` unchanged when it still has `extra_frags` free slots,
+    // otherwise a copy with at least doubled fragment capacity.
+    static std::unique_ptr<impl> allocate_if_needed(std::unique_ptr<impl> old, size_t extra_frags) {
+      if (old->_allocated_frags >= old->_nr_frags + extra_frags) {
+        return old;
+      }
+      return copy(old.get(), std::max<size_t>(old->_nr_frags + extra_frags, 2 * old->_nr_frags));
+    }
+
+    // Allocates the object plus nr_frags trailing fragment descriptors.
+    void* operator new(size_t size, size_t nr_frags = default_nr_frags) {
+      ceph_assert(nr_frags == uint16_t(nr_frags));
+      return ::operator new(size + nr_frags * sizeof(fragment));
+    }
+    // Matching the operator new above
+    void operator delete(void* ptr, size_t nr_frags) {
+      return ::operator delete(ptr);
+    }
+    // Since the above "placement delete" hides the global one, expose it
+    void operator delete(void* ptr) {
+      return ::operator delete(ptr);
+    }
+
+    // True when frags[0] points into the embedded `data` buffer.
+    bool using_internal_data() const {
+      return _nr_frags
+        && frags[0].base >= data
+        && frags[0].base < data + internal_data_size;
+    }
+
+    // Moves frags[0] out of the embedded buffer into malloc'ed memory (freed
+    // through the deleter chain) and resets the headroom. No-op when the
+    // embedded buffer is not in use. Throws std::bad_alloc on failure.
+    void unuse_internal_data() {
+      if (!using_internal_data()) {
+        return;
+      }
+      auto buf = static_cast<char*>(::malloc(frags[0].size));
+      if (!buf) {
+        throw std::bad_alloc();
+      }
+      deleter d = make_free_deleter(buf);
+      std::copy(frags[0].base, frags[0].base + frags[0].size, buf);
+      frags[0].base = buf;
+      _deleter.append(std::move(d));
+      headroom = internal_data_size;
+    }
+
+    // When frags[0] lives in the embedded buffer, copies its bytes into the
+    // embedded buffer of `to` (at the same headroom) and repoints
+    // to->frags[0] there.
+    void copy_internal_fragment_to(impl* to) {
+      if (!using_internal_data()) {
+        return;
+      }
+      to->frags[0].base = to->data + headroom;
+      std::copy(frags[0].base, frags[0].base + frags[0].size,
+                to->frags[0].base);
+    }
+  };
+  explicit Packet(std::unique_ptr<impl>&& impl) : _impl(std::move(impl)) {}
+  std::unique_ptr<impl> _impl;
+public:
+  // Wraps `data` without copying and with an empty deleter.
+  static Packet from_static_data(const char* data, size_t len) {
+    return {fragment{const_cast<char*>(data), len}, deleter()};
+  }
+
+  // build empty Packet
+  Packet();
+  // build empty Packet with nr_frags allocated
+  explicit Packet(size_t nr_frags);
+  // move existing Packet
+  Packet(Packet&& x) noexcept;
+  // copy data into Packet
+  Packet(const char* data, size_t len);
+  // copy data into Packet
+  explicit Packet(fragment frag);
+  // zero-copy single fragment
+  Packet(fragment frag, deleter del);
+  // zero-copy multiple fragments
+  Packet(std::vector<fragment> frag, deleter del);
+  // build Packet with iterator
+  template <typename Iterator>
+  Packet(Iterator begin, Iterator end, deleter del);
+  // append fragment (copying new fragment)
+  Packet(Packet&& x, fragment frag);
+  // prepend fragment (copying new fragment, with header optimization)
+  Packet(fragment frag, Packet&& x);
+  // prepend fragment (zero-copy)
+  Packet(fragment frag, deleter del, Packet&& x);
+  // append fragment (zero-copy)
+  Packet(Packet&& x, fragment frag, deleter d);
+  // append deleter
+  Packet(Packet&& x, deleter d);
+
+  // Move assignment: destroys the current contents and move-constructs from
+  // x in place. Self-assignment is a no-op.
+  Packet& operator=(Packet&& x) {
+    if (this != &x) {
+      this->~Packet();
+      new (this) Packet(std::move(x));
+    }
+    return *this;
+  }
+
+  unsigned len() const { return _impl->_len; }
+  unsigned memory() const { return len() + sizeof(Packet::impl); }
+
+  fragment frag(unsigned idx) const { return _impl->frags[idx]; }
+  fragment& frag(unsigned idx) { return _impl->frags[idx]; }
+
+  unsigned nr_frags() const { return _impl->_nr_frags; }
+  pseudo_vector fragments() const { return { _impl->frags, _impl->_nr_frags }; }
+  fragment* fragment_array() const { return _impl->frags; }
+
+  // share Packet data (reference counted, non COW)
+  Packet share();
+  Packet share(size_t offset, size_t len);
+
+  void append(Packet&& p);
+
+  void trim_front(size_t how_much);
+  void trim_back(size_t how_much);
+
+  // get a header pointer, linearizing if necessary
+  template <typename Header>
+  Header* get_header(size_t offset = 0);
+
+  // get a header pointer, linearizing if necessary
+  char* get_header(size_t offset, size_t size);
+
+  // prepend a header (default-initializing it)
+  template <typename Header>
+  Header* prepend_header(size_t extra_size = 0);
+
+  // prepend a header (uninitialized!)
+  char* prepend_uninitialized_header(size_t size);
+
+  Packet free_on_cpu(EventCenter *c, std::function<void()> cb = []{});
+
+  // collapse the whole packet into a single fragment
+  void linearize() { return linearize(0, len()); }
+
+  // drop the impl; the Packet must not be used afterwards except to be
+  // assigned to or destroyed
+  void reset() { _impl.reset(); }
+
+  // make sure there is room for n_frags fragment descriptors
+  void reserve(int n_frags) {
+    if (n_frags > _impl->_nr_frags) {
+      auto extra = n_frags - _impl->_nr_frags;
+      _impl = impl::allocate_if_needed(std::move(_impl), extra);
+    }
+  }
+  Tub<uint32_t> rss_hash() {
+    return _impl->rss_hash;
+  }
+  void set_rss_hash(uint32_t hash) {
+    _impl->rss_hash.construct(hash);
+  }
+private:
+  void linearize(size_t at_frag, size_t desired_size);
+  bool allocate_headroom(size_t size);
+public:
+  // `class` disambiguates the type from the member functions of the same name
+  class offload_info offload_info() const { return _impl->_offload_info; }
+  class offload_info& offload_info_ref() { return _impl->_offload_info; }
+  void set_offload_info(class offload_info oi) { _impl->_offload_info = oi; }
+};
+
+std::ostream& operator<<(std::ostream& os, const Packet& p);
+
+// Move constructor: takes over x's impl, leaving x without one.
+inline Packet::Packet(Packet&& x) noexcept
+  : _impl(std::move(x._impl)) {}
+
+// Empty impl; nr_frags is the number of fragment slots that were allocated
+// behind the object.
+inline Packet::impl::impl(size_t nr_frags)
+  : _len(0),
+    _allocated_frags(nr_frags) {}
+
+// Impl holding a private copy of `frag` as its only fragment. Data that fits
+// is stored at the tail of the embedded buffer (consuming headroom); larger
+// data goes to malloc'ed memory released through the deleter chain.
+// Throws std::bad_alloc when that allocation fails.
+inline Packet::impl::impl(fragment frag, size_t nr_frags)
+  : _len(frag.size), _allocated_frags(nr_frags) {
+  ceph_assert(_allocated_frags > _nr_frags);
+  const bool fits_inline = frag.size <= internal_data_size;
+  if (fits_inline) {
+    headroom -= frag.size;
+    frags[0] = { data + headroom, frag.size };
+  } else {
+    auto heap = static_cast<char*>(::malloc(frag.size));
+    if (!heap) {
+      throw std::bad_alloc();
+    }
+    deleter free_heap = make_free_deleter(heap);
+    frags[0] = { heap, frag.size };
+    _deleter.append(std::move(free_heap));
+  }
+  std::copy(frag.base, frag.base + frag.size, frags[0].base);
+  ++_nr_frags;
+}
+
+// Empty packet; impl::allocate() rounds the capacity up to default_nr_frags.
+inline Packet::Packet()
+  : _impl(impl::allocate(1)) {}
+
+// Empty packet with room for at least nr_frags fragments.
+inline Packet::Packet(size_t nr_frags)
+  : _impl(impl::allocate(nr_frags)) {}
+
+// Packet holding a private copy of frag's bytes.
+inline Packet::Packet(fragment frag)
+  : _impl(new impl(frag)) {}
+
+// Packet holding a private copy of `size` bytes at `data`; delegates to the
+// copying fragment constructor.
+inline Packet::Packet(const char* data, size_t size)
+  : Packet(fragment{const_cast<char*>(data), size}) {}
+
+// Zero-copy: adopts `frag` as the only fragment; `d` releases its memory.
+inline Packet::Packet(fragment frag, deleter d)
+  : _impl(impl::allocate(1)) {
+  impl& self = *_impl;
+  self._deleter = std::move(d);
+  self.frags[self._nr_frags] = frag;
+  ++self._nr_frags;
+  self._len = frag.size;
+}
+
+// Zero-copy: adopts every fragment in `frag`; `d` releases all of them.
+inline Packet::Packet(std::vector<fragment> frag, deleter d)
+  : _impl(impl::allocate(frag.size())) {
+  _impl->_deleter = std::move(d);
+  std::copy(frag.begin(), frag.end(), _impl->frags);
+  _impl->_nr_frags = frag.size();
+  // total length is the sum over the adopted fragments
+  _impl->_len = 0;
+  for (unsigned i = 0; i < _impl->_nr_frags; ++i) {
+    _impl->_len += _impl->frags[i].size;
+  }
+}
+
+// Zero-copy: adopts the fragments in [begin, end); `del` releases them.
+template <typename Iterator>
+inline Packet::Packet(Iterator begin, Iterator end, deleter del) {
+  unsigned count = std::distance(begin, end);
+  unsigned total = 0;
+  for (Iterator it = begin; it != end; ++it) {
+    fragment& f = *it;
+    total += f.size;
+  }
+  _impl = impl::allocate(count);
+  _impl->_deleter = std::move(del);
+  _impl->_len = total;
+  _impl->_nr_frags = count;
+  std::copy(begin, end, _impl->frags);
+}
+
+// Appends a private copy of `frag` to the packet taken over from `x`.
+// The copy lives in a new[]-allocated buffer that is released by a deleter
+// chained in front of the packet's previous deleter.
+inline Packet::Packet(Packet&& x, fragment frag)
+  : _impl(impl::allocate_if_needed(std::move(x._impl), 1)) {
+  auto& self = *_impl;
+  self._len += frag.size;
+  char* copy = new char[frag.size];
+  std::copy(frag.base, frag.base + frag.size, copy);
+  self.frags[self._nr_frags] = {copy, frag.size};
+  ++self._nr_frags;
+  self._deleter = make_deleter(std::move(self._deleter), [copy] {
+    delete[] copy;
+  });
+}
+
+// Reserves `size` bytes directly in front of the packet's data, inside the
+// impl's internal buffer, and grows the packet length accordingly.
+// Returns false, leaving the packet untouched, when the remaining headroom
+// is smaller than `size`; returns true otherwise.
+inline bool Packet::allocate_headroom(size_t size) {
+  if (_impl->headroom < size) {
+    return false;
+  }
+  _impl->_len += size;
+  if (!_impl->using_internal_data()) {
+    // The front fragment is not in the internal buffer: make sure there is
+    // a spare slot, shift every fragment up by one and install an empty
+    // fragment that sits at the very end of the internal buffer.
+    _impl = impl::allocate_if_needed(std::move(_impl), 1);
+    auto* const first = _impl->frags;
+    auto* const last = first + _impl->_nr_frags;
+    std::copy_backward(first, last, last + 1);
+    *first = { _impl->data + internal_data_size, 0 };
+    ++_impl->_nr_frags;
+  }
+  // Grow the front fragment downwards into the headroom.
+  auto& front = _impl->frags[0];
+  _impl->headroom -= size;
+  front.base -= size;
+  front.size += size;
+  return true;
+}
+
+
+// Prepends a private copy of `frag` to the packet taken over from `x`.
+// The bytes go into the internal headroom when they fit; otherwise they are
+// copied into a new[]-allocated buffer that becomes the new first fragment.
+inline Packet::Packet(fragment frag, Packet&& x)
+  : _impl(std::move(x._impl)) {
+  // Fast path: the headroom in front of the data is large enough.
+  if (allocate_headroom(frag.size)) {
+    std::copy(frag.base, frag.base + frag.size, _impl->frags[0].base);
+    return;
+  }
+  // Slow path: move any internal data out, make room for one more fragment
+  // and put a heap copy of `frag` in front of the existing fragments.
+  _impl->unuse_internal_data();
+  _impl = impl::allocate_if_needed(std::move(_impl), 1);
+  _impl->_len += frag.size;
+  char* copy = new char[frag.size];
+  std::copy(frag.base, frag.base + frag.size, copy);
+  auto* const first = _impl->frags;
+  auto* const last = first + _impl->_nr_frags;
+  std::copy_backward(first, last, last + 1);
+  ++_impl->_nr_frags;
+  *first = {copy, frag.size};
+  _impl->_deleter = make_deleter(
+    std::move(_impl->_deleter), [copy] { delete []copy; });
+}
+
+// Appends the externally owned buffer `frag` (no copy) to the packet taken
+// over from `x`.  The packet's previous deleter is appended to `d`, and the
+// combined deleter becomes the packet's deleter.
+inline Packet::Packet(Packet&& x, fragment frag, deleter d)
+  : _impl(impl::allocate_if_needed(std::move(x._impl), 1)) {
+  auto& self = *_impl;
+  self.frags[self._nr_frags] = frag;
+  ++self._nr_frags;
+  self._len += frag.size;
+  d.append(std::move(self._deleter));
+  self._deleter = std::move(d);
+}
+
+// Takes over x's implementation object and appends `d` to its deleter; the
+// packet's fragments and length are left as they were.
+inline Packet::Packet(Packet&& x, deleter d): _impl(std::move(x._impl)) {
+ _impl->_deleter.append(std::move(d));
+}
+
+// Appends the contents of `p` to this packet without copying payload bytes
+// (apart from moving p's internally stored data out of its impl).  When this
+// packet is empty it simply becomes `p`.
+inline void Packet::append(Packet&& p) {
+  if (_impl->_len == 0) {
+    *this = std::move(p);
+    return;
+  }
+  // Make room for p's fragment descriptors; this may replace _impl, so
+  // references are only taken afterwards.
+  _impl = impl::allocate_if_needed(std::move(_impl), p._impl->_nr_frags);
+  auto& dst = *_impl;
+  auto& src = *p._impl;
+  dst._len += src._len;
+  // p's impl will go away, so its first fragment must not point into it.
+  src.unuse_internal_data();
+  std::copy(src.frags, src.frags + src._nr_frags, dst.frags + dst._nr_frags);
+  dst._nr_frags += src._nr_frags;
+  // Chain both deleters so every referenced buffer stays alive.
+  src._deleter.append(std::move(dst._deleter));
+  dst._deleter = std::move(src._deleter);
+}
+
+// Returns a pointer to `size` contiguous bytes located `offset` bytes into
+// the packet, or nullptr when the packet does not hold offset + size bytes.
+// When the requested range crosses a fragment boundary, linearize() is
+// called on the fragment that contains `offset` before the pointer is
+// returned.
+inline char* Packet::get_header(size_t offset, size_t size) {
+  if (offset + size > _impl->_len) {
+    return nullptr;
+  }
+  // Find the fragment that contains `offset`, making `offset` relative to
+  // that fragment along the way.
+  size_t idx = 0;
+  for (; idx != _impl->_nr_frags && offset >= _impl->frags[idx].size; ++idx) {
+    offset -= _impl->frags[idx].size;
+  }
+  if (idx == _impl->_nr_frags) {
+    return nullptr;
+  }
+  const bool crosses_fragment = offset + size > _impl->frags[idx].size;
+  if (crosses_fragment) {
+    linearize(idx, offset + size);
+  }
+  return _impl->frags[idx].base + offset;
+}
+
+// Typed convenience wrapper: asks get_header(offset, sizeof(Header)) for the
+// bytes and reinterprets the result as Header*.  The result is nullptr
+// whenever the untyped overload returns nullptr.
+template <typename Header>
+inline Header* Packet::get_header(size_t offset) {
+ return reinterpret_cast<Header*>(get_header(offset, sizeof(Header)));
+}
+
+// Drops the first `how_much` bytes of the packet.  Asserts that the packet
+// holds at least that many bytes.
+inline void Packet::trim_front(size_t how_much) {
+ ceph_assert(how_much <= _impl->_len);
+ _impl->_len -= how_much;
+ size_t i = 0;
+ // Count the leading fragments that are consumed entirely.
+ while (how_much && how_much >= _impl->frags[i].size) {
+ how_much -= _impl->frags[i++].size;
+ }
+ // Shift the surviving fragments down so that they start at slot 0.
+ std::copy(_impl->frags + i, _impl->frags + _impl->_nr_frags, _impl->frags);
+ _impl->_nr_frags -= i;
+ // If the (new) front fragment does not live in the internal buffer, the
+ // whole internal buffer is available as headroom again.
+ if (!_impl->using_internal_data()) {
+ _impl->headroom = internal_data_size;
+ }
+ // The front fragment is only partially consumed: advance its start.  When
+ // it lives in the internal buffer the skipped bytes become headroom.
+ if (how_much) {
+ if (_impl->using_internal_data()) {
+ _impl->headroom += how_much;
+ }
+ _impl->frags[0].base += how_much;
+ _impl->frags[0].size -= how_much;
+ }
+}
+
+// Drops the last `how_much` bytes of the packet.  Asserts that the packet
+// holds at least that many bytes.
+//
+// Trimming the tail never changes `headroom`.  Headroom is the free space in
+// front of frags[0].base inside the internal buffer (allocate_headroom()
+// and trim_front() move base and headroom together), and trimming the back
+// of a fragment leaves its base where it is.  Crediting the trimmed bytes to
+// headroom would let a later prepend move frags[0].base below the start of
+// the internal buffer.
+inline void Packet::trim_back(size_t how_much) {
+  ceph_assert(how_much <= _impl->_len);
+  _impl->_len -= how_much;
+  // Walk backwards over the fragments that are consumed entirely.  `i` may
+  // wrap below zero when every fragment is dropped; `i + 1` is then 0.
+  size_t i = _impl->_nr_frags - 1;
+  while (how_much && how_much >= _impl->frags[i].size) {
+    how_much -= _impl->frags[i--].size;
+  }
+  _impl->_nr_frags = i + 1;
+  // The new last fragment is only partially consumed: shorten it.
+  if (how_much) {
+    _impl->frags[i].size -= how_much;
+  }
+}
+
+// Prepends sizeof(Header) + extra_size bytes to the packet and
+// value-initializes a Header at the start of that space with placement new.
+// Only the Header itself is initialized; the extra_size bytes that follow
+// it are left as prepend_uninitialized_header() returned them.
+template <typename Header>
+Header* Packet::prepend_header(size_t extra_size) {
+ auto h = prepend_uninitialized_header(sizeof(Header) + extra_size);
+ return new (h) Header{};
+}
+
+// Prepends `size` uninitialized bytes to the packet and returns a pointer to
+// them (the base of the new first fragment).  The internal headroom is used
+// when possible; otherwise a new[]-allocated buffer becomes the first
+// fragment and its release is chained onto the packet's deleter.
+inline char* Packet::prepend_uninitialized_header(size_t size) {
+  // First attempt: the current headroom may already be large enough.
+  if (allocate_headroom(size)) {
+    return _impl->frags[0].base;
+  }
+  // Second attempt: after unuse_internal_data() we may have space after all.
+  _impl->unuse_internal_data();
+  if (allocate_headroom(size)) {
+    return _impl->frags[0].base;
+  }
+  // Still no room: put the header into a separately allocated fragment.
+  _impl->_len += size;
+  _impl = impl::allocate_if_needed(std::move(_impl), 1);
+  char* header = new char[size];
+  auto* const first = _impl->frags;
+  auto* const last = first + _impl->_nr_frags;
+  std::copy_backward(first, last, last + 1);
+  ++_impl->_nr_frags;
+  *first = {header, size};
+  _impl->_deleter = make_deleter(std::move(_impl->_deleter),
+                                 [header] { delete []header; });
+  return _impl->frags[0].base;
+}
+
+// Returns a packet that shares this packet's whole payload; equivalent to
+// share(0, <current length>).
+inline Packet Packet::share() {
+ return share(0, _impl->_len);
+}
+
+// Returns a new packet that references (does not copy) `len` bytes of this
+// packet starting at `offset`.  The new packet gets a copy of the offload
+// info and a shared handle on this packet's deleter.
+//
+// No bounds checking is done here: the fragment walk assumes that
+// offset + len does not exceed the packet length.
+inline Packet Packet::share(size_t offset, size_t len) {
+  _impl->unuse_internal_data(); // FIXME: eliminate?
+  Packet view;
+  view._impl = impl::allocate_if_needed(std::move(view._impl), _impl->_nr_frags);
+  // Skip the fragments that lie entirely before `offset`.
+  size_t src = 0;
+  for (; offset > 0 && offset >= _impl->frags[src].size; ++src) {
+    offset -= _impl->frags[src].size;
+  }
+  // Reference source fragments until `len` bytes are covered.  Only the
+  // first referenced fragment starts at a non-zero offset.
+  for (; view._impl->_len < len; ++src) {
+    const auto& from = _impl->frags[src];
+    const auto take = std::min(len - view._impl->_len, from.size - offset);
+    view._impl->frags[view._impl->_nr_frags++] = { from.base + offset, take };
+    view._impl->_len += take;
+    offset = 0;
+  }
+  view._impl->_offload_info = _impl->_offload_info;
+  ceph_assert(!view._impl->_deleter);
+  view._impl->_deleter = _impl->_deleter.share();
+  return view;
+}
+
+#endif /* CEPH_MSG_PACKET_H_ */
diff --git a/src/msg/async/dpdk/PacketUtil.h b/src/msg/async/dpdk/PacketUtil.h
new file mode 100644
index 00000000..118218e6
--- /dev/null
+++ b/src/msg/async/dpdk/PacketUtil.h
@@ -0,0 +1,154 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_PACKET_UTIL_H_
+#define CEPH_MSG_PACKET_UTIL_H_
+
+#include <map>
+#include <iostream>
+
+#include "Packet.h"
+
+// Keeps a set of received byte ranges ("segments"), keyed by their start
+// offset, and merges newly arriving packets into it.
+//
+// Offset is the position type used as map key; Tag is not used by the class
+// body itself.
+template <typename Offset, typename Tag>
+class packet_merger {
+ private:
+  // Per-thread counter of linearize() calls made by merge().
+  static uint64_t& linearizations_ref() {
+    static thread_local uint64_t linearization_count;
+    return linearization_count;
+  }
+ public:
+  // Segments held so far: start offset -> payload.
+  std::map<Offset, Packet> map;
+
+  // Number of linearizations merge() performed on the calling thread.
+  static uint64_t linearizations() {
+    return linearizations_ref();
+  }
+
+  // Merges packet `p`, which covers [offset, offset + p.len()), into the
+  // map.  Aborts the process if the map is found in a state the merge logic
+  // cannot explain.
+  void merge(Offset offset, Packet p) {
+    bool insert = true;
+    auto beg = offset;
+    auto end = beg + p.len();
+
+    // Pass 1: combine the packet with the first existing segment it
+    // overlaps or touches.  Each segment falls into one of six cases.
+    for (auto it = map.begin(); it != map.end();) {
+      auto& seg_pkt = it->second;
+      auto seg_beg = it->first;
+      auto seg_end = seg_beg + seg_pkt.len();
+
+      if (seg_beg <= beg && end <= seg_end) {
+        // 1) seg_beg beg end seg_end
+        // The segment already holds all of the packet's data.
+        return;
+      }
+      if (beg <= seg_beg && seg_end <= end) {
+        // 2) beg seg_beg seg_end end
+        // The packet covers the whole segment: drop the segment and insert
+        // the packet instead.
+        map.erase(it);
+        break;
+      }
+      if (beg < seg_beg && seg_beg <= end && end <= seg_end) {
+        // 3) beg seg_beg end seg_end
+        // The packet's tail overlaps the segment's head: trim the segment's
+        // head, append it to the packet, drop the segment and insert the
+        // combined packet.
+        seg_pkt.trim_front(end - seg_beg);
+        p.append(std::move(seg_pkt));
+        map.erase(it);
+        break;
+      }
+      if (seg_beg <= beg && beg <= seg_end && seg_end < end) {
+        // 4) seg_beg beg seg_end end
+        // The packet's head overlaps the segment's tail: trim the packet's
+        // head and append the rest to the segment, which stays in the map.
+        p.trim_front(seg_end - beg);
+        seg_pkt.append(std::move(p));
+        seg_pkt.linearize();
+        ++linearizations_ref();
+        insert = false;
+        break;
+      }
+      // 5) beg end < seg_beg seg_end
+      // or
+      // 6) seg_beg seg_end < beg end
+      // Nothing in common with this segment, keep looking.
+      ++it;
+    }
+
+    if (insert) {
+      p.linearize();
+      ++linearizations_ref();
+      map.emplace(beg, std::move(p));
+    }
+
+    // Pass 2: the packet may have filled a hole, making neighbouring
+    // segments mergeable.  Walk the map and fold each segment's successor
+    // into it for as long as the two overlap or touch.
+    for (auto it = map.begin(); it != map.end();) {
+      // The first segment
+      auto& seg_pkt = it->second;
+      auto seg_beg = it->first;
+      auto seg_end = seg_beg + seg_pkt.len();
+
+      // The second segment
+      auto it_next = it;
+      ++it_next;
+      if (it_next == map.end()) {
+        break;
+      }
+      auto& next_pkt = it_next->second;
+      auto next_beg = it_next->first;
+      auto next_end = next_beg + next_pkt.len();
+
+      if (seg_beg <= next_beg && next_beg <= seg_end && seg_end < next_end) {
+        // The second segment extends the first: trim the overlap off its
+        // front, append it to the first and drop it.  `it` is not advanced
+        // so the first segment is compared with its new successor next.
+        next_pkt.trim_front(seg_end - next_beg);
+        seg_pkt.append(std::move(next_pkt));
+        map.erase(it_next);
+      } else if (next_end <= seg_end) {
+        // The first segment already contains the second: drop the second.
+        map.erase(it_next);
+      } else if (seg_end < next_beg) {
+        // There is a gap between the two: move on to the second segment.
+        it = it_next;
+      } else {
+        // If we reach here, we have a bug with merge.
+        std::cout << "packet_merger: merge error\n";
+        abort();
+      }
+    }
+  }
+};
+
+#endif
diff --git a/src/msg/async/dpdk/TCP-Stack.h b/src/msg/async/dpdk/TCP-Stack.h
new file mode 100644
index 00000000..996ae93c
--- /dev/null
+++ b/src/msg/async/dpdk/TCP-Stack.h
@@ -0,0 +1,40 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+// tcp/network-stack integration
+
+#ifndef CEPH_MSG_DPDK_TCP_STACK_H
+#define CEPH_MSG_DPDK_TCP_STACK_H
+
+class ServerSocket;
+class ConnectedSocket;
+
+class ipv4_traits;
+template <typename InetTraits>
+class tcp;
+
+int tcpv4_listen(tcp<ipv4_traits>& tcpv4, uint16_t port, const SocketOptions &opts,
+ int type, ServerSocket *sa);
+
+int tcpv4_connect(tcp<ipv4_traits>& tcpv4, const entity_addr_t &addr,
+ ConnectedSocket *sa);
+
+#endif
diff --git a/src/msg/async/dpdk/TCP.cc b/src/msg/async/dpdk/TCP.cc
new file mode 100644
index 00000000..c6397709
--- /dev/null
+++ b/src/msg/async/dpdk/TCP.cc
@@ -0,0 +1,840 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include "align.h"
+#include "TCP.h"
+#include "IP.h"
+#include "DPDKStack.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "tcp "
+
+// Parses the TCP option bytes in [beg, end) and records what the peer sent:
+// the MSS value, the window-scale shift and whether SACK was offered.
+//
+// The bytes come from the network, so nothing is read before it is known to
+// lie inside the buffer.  Parsing stops silently at the first malformed or
+// truncated option and at the end-of-list option.
+void tcp_option::parse(uint8_t* beg, uint8_t* end)
+{
+  while (beg < end) {
+    auto kind = option_kind(*beg);
+    if (kind != option_kind::nop && kind != option_kind::eol) {
+      // Every option except NOP and EOL has a length octet right after the
+      // kind octet; make sure that octet is inside the buffer.
+      if (end - beg < 2) {
+        return;
+      }
+      auto len = *(beg + 1);
+      // The length counts the kind and length octets themselves, so a value
+      // below 2 is malformed.  The option must also fit into what is left.
+      if (len < 2 || beg + len > end) {
+        return;
+      }
+    }
+    switch (kind) {
+    case option_kind::mss:
+      // The fixed-size read below needs the whole option to be present,
+      // whatever the length octet claimed.
+      if (end - beg < uint8_t(option_len::mss)) {
+        return;
+      }
+      _mss_received = true;
+      _remote_mss = ntoh(reinterpret_cast<mss*>(beg)->mss);
+      beg += option_len::mss;
+      break;
+    case option_kind::win_scale:
+      if (end - beg < uint8_t(option_len::win_scale)) {
+        return;
+      }
+      _win_scale_received = true;
+      _remote_win_scale = reinterpret_cast<win_scale*>(beg)->shift;
+      // We can turn on win_scale option, 7 is Linux's default win scale size
+      _local_win_scale = 7;
+      beg += option_len::win_scale;
+      break;
+    case option_kind::sack:
+      _sack_received = true;
+      beg += option_len::sack;
+      break;
+    case option_kind::nop:
+      beg += option_len::nop;
+      break;
+    case option_kind::eol:
+      return;
+    default:
+      // Ignore options we do not understand.  The length was validated
+      // above to be at least 2, so this always makes progress.
+      beg += *(beg + 1);
+      break;
+    }
+  }
+}
+
+// Writes the TCP options for the segment described by `th` into the bytes
+// that directly follow the fixed header and returns the number of option
+// bytes written.  `options_size` is the size the caller expects; the
+// function asserts that the written size equals it.
+//
+// Options are only written on SYN segments.  Each of MSS and window scale
+// is written when ACK is clear, or when the corresponding option was
+// received from the peer.
+uint8_t tcp_option::fill(tcp_hdr* th, uint8_t options_size)
+{
+ auto hdr = reinterpret_cast<uint8_t*>(th);
+ // First byte after the fixed TCP header.
+ auto off = hdr + sizeof(tcp_hdr);
+ uint8_t size = 0;
+ bool syn_on = th->f_syn;
+ bool ack_on = th->f_ack;
+
+ if (syn_on) {
+ if (_mss_received || !ack_on) {
+ // Construct the option in place, then convert it with hton().
+ auto mss = new (off) tcp_option::mss;
+ mss->mss = _local_mss;
+ off += mss->len;
+ size += mss->len;
+ *mss = mss->hton();
+ }
+ if (_win_scale_received || !ack_on) {
+ auto win_scale = new (off) tcp_option::win_scale;
+ win_scale->shift = _local_win_scale;
+ off += win_scale->len;
+ size += win_scale->len;
+ }
+ }
+ if (size > 0) {
+ // Insert NOP option
+ // size_max is the option size plus one EOL byte, rounded up to the
+ // option alignment; NOPs fill everything except the last byte.
+ auto size_max = align_up(uint8_t(size + 1), tcp_option::align);
+ while (size < size_max - uint8_t(option_len::eol)) {
+ new (off) tcp_option::nop;
+ off += option_len::nop;
+ size += option_len::nop;
+ }
+ // The last byte is the end-of-list option.
+ new (off) tcp_option::eol;
+ size += option_len::eol;
+ }
+ ceph_assert(size == options_size);
+
+ return size;
+}
+
+// Returns the number of option bytes for a segment with the given SYN/ACK
+// flags.  Options are only sent on SYN segments: each of MSS and window
+// scale is counted when ACK is clear or when the peer sent that option.
+// A non-empty option list also gets one end-of-list byte and is rounded up
+// to tcp_option::align; an empty list yields 0.
+uint8_t tcp_option::get_size(bool syn_on, bool ack_on)
+{
+  uint8_t total = 0;
+  if (syn_on) {
+    const bool no_ack = !ack_on;
+    if (_mss_received || no_ack) {
+      total += option_len::mss;
+    }
+    if (_win_scale_received || no_ack) {
+      total += option_len::win_scale;
+    }
+  }
+  if (total == 0) {
+    return total;
+  }
+  total += option_len::eol;
+  // Insert NOP option to align on 32-bit
+  total = align_up(total, tcp_option::align);
+  return total;
+}
+
+// Initializes _inet_l4 from `inet` and creates the owned tcp<ipv4_traits>
+// instance, passing it inet.cct, _inet_l4 and the event center `c`.
+// NOTE(review): _tcp's initializer reads _inet_l4, so this relies on
+// _inet_l4 being declared before _tcp in the class -- confirm in the header.
+ipv4_tcp::ipv4_tcp(ipv4& inet, EventCenter *c)
+ : _inet_l4(inet), _tcp(std::unique_ptr<tcp<ipv4_traits>>(new tcp<ipv4_traits>(inet.cct, _inet_l4, c)))
+{ }
+
+// Out-of-line destructor with an empty body; the members clean up after
+// themselves.
+ipv4_tcp::~ipv4_tcp() { }
+
+// Passes a received packet, together with its source and destination
+// addresses, on to the owned tcp instance.
+void ipv4_tcp::received(Packet p, ipv4_address from, ipv4_address to)
+{
+ _tcp->received(std::move(p), from, to);
+}
+
+// Forwards the call to the owned tcp instance and returns its result
+// unchanged.
+bool ipv4_tcp::forward(forward_hash& out_hash_data, Packet& p, size_t off)
+{
+ return _tcp->forward(out_hash_data, p, off);
+}
+
+// Creates a DPDK server socket listening on `port` and stores it in *sock.
+//
+// Returns 0 on success.  When listen() fails, its negative result is
+// returned and *sock is left untouched.
+//
+// The implementation object is owned by a unique_ptr from the moment it is
+// created, so it is released on the error return and also if listen()
+// throws.
+int tcpv4_listen(tcp<ipv4_traits>& tcpv4, uint16_t port, const SocketOptions &opts,
+                 int type, ServerSocket *sock)
+{
+  using impl_type = DPDKServerSocketImpl<tcp<ipv4_traits>>;
+  std::unique_ptr<impl_type> impl(new impl_type(tcpv4, port, opts, type));
+  int r = impl->listen();
+  if (r < 0) {
+    return r;
+  }
+  *sock = ServerSocket(std::unique_ptr<ServerSocketImpl>(std::move(impl)));
+  return 0;
+}
+
+// Starts a connection to `addr` through tcpv4.connect() and stores a
+// ConnectedSocket wrapping it in *sock.  Always returns 0; no failure is
+// reported from here.
+int tcpv4_connect(tcp<ipv4_traits>& tcpv4, const entity_addr_t &addr,
+ ConnectedSocket *sock)
+{
+ auto conn = tcpv4.connect(addr);
+ *sock = ConnectedSocket(std::unique_ptr<ConnectedSocketImpl>(
+ new NativeConnectedSocketImpl<tcp<ipv4_traits>>(std::move(conn))));
+ return 0;
+}
+
+// Answers the segment whose header is `rth` with a RST segment, sent
+// without a tcb from local_ip to foreign_ip.  Nothing is sent when the
+// incoming segment itself carries RST.
+template <typename InetTraits>
+void tcp<InetTraits>::respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip)
+{
+  ldout(cct, 20) << __func__ << " tcp header rst=" << bool(rth->f_rst) << " fin=" << bool(rth->f_fin)
+                 << " syn=" << bool(rth->f_syn) << dendl;
+  if (rth->f_rst) {
+    return;
+  }
+
+  // Build the reply header: ports swapped relative to the incoming segment.
+  Packet rst_pkt;
+  auto reply = rst_pkt.prepend_header<tcp_hdr>();
+  reply->src_port = rth->dst_port;
+  reply->dst_port = rth->src_port;
+  if (rth->f_ack) {
+    reply->seq = rth->ack;
+  }
+  // If this RST packet is in response to a SYN packet. We ACK the ISN.
+  if (rth->f_syn) {
+    reply->ack = rth->seq + 1;
+    reply->f_ack = true;
+  }
+  reply->f_rst = true;
+  reply->data_offset = sizeof(*reply) / 4;
+  reply->checksum = 0;
+  *reply = reply->hton();
+
+  // Checksum: with L4 checksum offload only the complemented pseudo-header
+  // sum is stored and the packet is flagged as needing a checksum;
+  // otherwise the packet is summed here.
+  checksummer csum;
+  InetTraits::tcp_pseudo_header_checksum(csum, local_ip, foreign_ip, sizeof(*reply));
+  const bool hw_csum = get_hw_features().tx_csum_l4_offload;
+  offload_info oi;
+  if (hw_csum) {
+    reply->checksum = ~csum.get();
+  } else {
+    csum.sum(rst_pkt);
+    reply->checksum = csum.get();
+  }
+  oi.needs_csum = hw_csum;
+  oi.protocol = ip_protocol_num::tcp;
+  oi.tcp_hdr_len = sizeof(tcp_hdr);
+  rst_pkt.set_offload_info(oi);
+
+  send_packet_without_tcb(local_ip, foreign_ip, std::move(rst_pkt));
+}
+
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+// Writes the log-line prefix for this tcb to *_dout and returns the stream:
+// local and foreign address/port, the tcb's address, its fd and its state.
+template<typename InetTraits>
+ostream& tcp<InetTraits>::tcb::_prefix(std::ostream *_dout) {
+ return *_dout << "tcp " << _local_ip << ":" << _local_port << " -> " << _foreign_ip << ":" << _foreign_port
+ << " tcb(" << this << " fd=" << fd << " s=" << _state << ").";
+}
+
+// Handles a segment received for this tcb in the LISTEN state.
+// `th` is the segment's TCP header and `p` the segment including that
+// header.  Records the peer's initial sequence number, sets up the local
+// ISN, hands the option bytes to init_from_options() and calls
+// do_syn_received().
+template<typename InetTraits>
+void tcp<InetTraits>::tcb::input_handle_listen_state(tcp_hdr* th, Packet p)
+{
+ // Options occupy the part of the header beyond the fixed tcp_hdr.
+ // NOTE(review): Packet::get_header() returns nullptr when the packet holds
+ // fewer than data_offset * 4 bytes, and opt_len wraps around when
+ // data_offset * 4 is smaller than sizeof(tcp_hdr).  Neither is checked
+ // here -- presumably the caller validates the header length; confirm.
+ auto opt_len = th->data_offset * 4 - sizeof(tcp_hdr);
+ auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, th->data_offset * 4)) + sizeof(tcp_hdr);
+ auto opt_end = opt_start + opt_len;
+ // Strip the TCP header (including options) from the packet.
+ p.trim_front(th->data_offset * 4);
+ tcp_sequence seg_seq = th->seq;
+
+ // Set RCV.NXT to SEG.SEQ+1, IRS is set to SEG.SEQ
+ _rcv.next = seg_seq + 1;
+ _rcv.initial = seg_seq;
+
+ // ISS should be selected and a SYN segment sent of the form:
+ // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK>
+ // SND.NXT is set to ISS+1 and SND.UNA to ISS
+ // NOTE: In previous code, _snd.next is set to ISS + 1 only when SYN is
+ // ACKed. Now, we set _snd.next to ISS + 1 here, so in output_one(): we
+ // have
+ // th->seq = syn_on ? _snd.initial : _snd.next
+ // to make sure retransmitted SYN has correct SEQ number.
+ do_setup_isn();
+
+ _rcv.urgent = _rcv.next;
+
+ ldout(_tcp.cct, 10) << __func__ << " listen: LISTEN -> SYN_RECEIVED" << dendl;
+ init_from_options(th, opt_start, opt_end);
+ do_syn_received();
+}
+
+// Handles a segment received for this tcb in the SYN_SENT state, following
+// the numbered checks in the comments below (ACK bit, RST bit, SYN bit).
+// `th` is the segment's TCP header and `p` the segment including that
+// header.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::input_handle_syn_sent_state(tcp_hdr* th, Packet p)
+{
+ // Options occupy the part of the header beyond the fixed tcp_hdr.
+ // NOTE(review): Packet::get_header() returns nullptr when the packet holds
+ // fewer than data_offset * 4 bytes, and opt_len wraps around when
+ // data_offset * 4 is smaller than sizeof(tcp_hdr).  Neither is checked
+ // here -- presumably the caller validates the header length; confirm.
+ auto opt_len = th->data_offset * 4 - sizeof(tcp_hdr);
+ auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, th->data_offset * 4)) + sizeof(tcp_hdr);
+ auto opt_end = opt_start + opt_len;
+ // Strip the TCP header (including options) from the packet.
+ p.trim_front(th->data_offset * 4);
+ tcp_sequence seg_seq = th->seq;
+ auto seg_ack = th->ack;
+
+ ldout(_tcp.cct, 20) << __func__ << " tcp header seq " << seg_seq.raw << " ack " << seg_ack.raw
+ << " fin=" << bool(th->f_fin) << " syn=" << bool(th->f_syn) << dendl;
+
+ bool acceptable = false;
+ // 3.1 first check the ACK bit
+ if (th->f_ack) {
+ // If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless the
+ // RST bit is set, if so drop the segment and return)
+ if (seg_ack <= _snd.initial || seg_ack > _snd.next) {
+ return respond_with_reset(th);
+ }
+
+ // If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable.
+ acceptable = _snd.unacknowledged <= seg_ack && seg_ack <= _snd.next;
+ }
+
+ // 3.2 second check the RST bit
+ if (th->f_rst) {
+ // If the ACK was acceptable then signal the user "error: connection
+ // reset", drop the segment, enter CLOSED state, delete TCB, and
+ // return. Otherwise (no ACK) drop the segment and return.
+ if (acceptable) {
+ return do_reset();
+ } else {
+ return;
+ }
+ }
+
+ // 3.3 third check the security and precedence
+ // NOTE: Ignored for now
+
+ // 3.4 fourth check the SYN bit
+ if (th->f_syn) {
+ // RCV.NXT is set to SEG.SEQ+1, IRS is set to SEG.SEQ. SND.UNA should
+ // be advanced to equal SEG.ACK (if there is an ACK), and any segments
+ // on the retransmission queue which are thereby acknowledged should be
+ // removed.
+ _rcv.next = seg_seq + 1;
+ _rcv.initial = seg_seq;
+ if (th->f_ack) {
+ // TODO: clean retransmission queue
+ _snd.unacknowledged = seg_ack;
+ }
+ if (_snd.unacknowledged > _snd.initial) {
+ // If SND.UNA > ISS (our SYN has been ACKed), change the connection
+ // state to ESTABLISHED, form an ACK segment
+ // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+ ldout(_tcp.cct, 20) << __func__ << " syn: SYN_SENT -> ESTABLISHED" << dendl;
+ init_from_options(th, opt_start, opt_end);
+ do_established();
+ output();
+ } else {
+ // Otherwise enter SYN_RECEIVED, form a SYN,ACK segment
+ // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK>
+ // NOTE(review): unlike the ESTABLISHED branch above, this branch does
+ // not call init_from_options() -- confirm that this is intended.
+ ldout(_tcp.cct, 20) << __func__ << " syn: SYN_SENT -> SYN_RECEIVED" << dendl;
+ do_syn_received();
+ }
+ }
+
+ // 3.5 fifth, if neither of the SYN or RST bits is set then drop the
+ // segment and return.
+ return;
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::input_handle_other_state(tcp_hdr* th, Packet p)
+{
+ p.trim_front(th->data_offset * 4);
+ bool do_output = false;
+ bool do_output_data = false;
+ tcp_sequence seg_seq = th->seq;
+ auto seg_ack = th->ack;
+ auto seg_len = p.len();
+ ldout(_tcp.cct, 20) << __func__ << " tcp header seq " << seg_seq.raw << " ack " << seg_ack.raw
+ << " snd next " << _snd.next.raw << " unack " << _snd.unacknowledged.raw
+ << " rcv next " << _rcv.next.raw << " len " << seg_len
+ << " fin=" << bool(th->f_fin) << " syn=" << bool(th->f_syn) << dendl;
+
+ // 4.1 first check sequence number
+ if (!segment_acceptable(seg_seq, seg_len)) {
+ //<SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+ return output();
+ }
+
+ // In the following it is assumed that the segment is the idealized
+ // segment that begins at RCV.NXT and does not exceed the window.
+ if (seg_seq < _rcv.next) {
+ // ignore already acknowledged data
+ auto dup = std::min(uint32_t(_rcv.next - seg_seq), seg_len);
+ ldout(_tcp.cct, 10) << __func__ << " dup segment len " << dup << dendl;
+ p.trim_front(dup);
+ seg_len -= dup;
+ seg_seq += dup;
+ }
+ // FIXME: We should trim data outside the right edge of the receive window as well
+
+ if (seg_seq != _rcv.next) {
+ ldout(_tcp.cct, 10) << __func__ << " out of order, expect " << _rcv.next.raw
+ << " actual " << seg_seq.raw
+ << " out of order size " << _rcv.out_of_order.map.size()
+ << dendl;
+ insert_out_of_order(seg_seq, std::move(p));
+ // A TCP receiver SHOULD send an immediate duplicate ACK
+ // when an out-of-order segment arrives.
+ return output();
+ }
+
+ // 4.2 second check the RST bit
+ if (th->f_rst) {
+ if (in_state(SYN_RECEIVED)) {
+ // If this connection was initiated with a passive OPEN (i.e.,
+ // came from the LISTEN state), then return this connection to
+ // LISTEN state and return. The user need not be informed. If
+ // this connection was initiated with an active OPEN (i.e., came
+ // from SYN_SENT state) then the connection was refused, signal
+ // the user "connection refused". In either case, all segments
+ // on the retransmission queue should be removed. And in the
+ // active OPEN case, enter the CLOSED state and delete the TCB,
+ // and return.
+ errno = -ECONNREFUSED;
+ return do_reset();
+ }
+ if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2 | CLOSE_WAIT)) {
+ // If the RST bit is set then, any outstanding RECEIVEs and SEND
+ // should receive "reset" responses. All segment queues should be
+ // flushed. Users should also receive an unsolicited general
+ // "connection reset" signal. Enter the CLOSED state, delete the
+ // TCB, and return.
+ return do_reset();
+ }
+ if (in_state(CLOSING | LAST_ACK | TIME_WAIT)) {
+ // If the RST bit is set then, enter the CLOSED state, delete the
+ // TCB, and return.
+ return do_closed();
+ }
+ }
+
+ // 4.3 third check security and precedence
+ // NOTE: Ignored for now
+
+ // 4.4 fourth, check the SYN bit
+ if (th->f_syn) {
+ // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2
+ // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT
+
+ // If the SYN is in the window it is an error, send a reset, any
+ // outstanding RECEIVEs and SEND should receive "reset" responses,
+ // all segment queues should be flushed, the user should also
+ // receive an unsolicited general "connection reset" signal, enter
+ // the CLOSED state, delete the TCB, and return.
+ respond_with_reset(th);
+ return do_reset();
+
+ // If the SYN is not in the window this step would not be reached
+ // and an ack would have been sent in the first step (sequence
+ // number check).
+ }
+
+ // 4.5 fifth check the ACK field
+ if (!th->f_ack) {
+ // if the ACK bit is off drop the segment and return
+ return;
+ } else {
+ // SYN_RECEIVED STATE
+ if (in_state(SYN_RECEIVED)) {
+ // If SND.UNA =< SEG.ACK =< SND.NXT then enter ESTABLISHED state
+ // and continue processing.
+ if (_snd.unacknowledged <= seg_ack && seg_ack <= _snd.next) {
+ ldout(_tcp.cct, 20) << __func__ << " SYN_RECEIVED -> ESTABLISHED" << dendl;
+ do_established();
+ if (_tcp.push_listen_queue(_local_port, this)) {
+ ldout(_tcp.cct, 20) << __func__ << " successfully accepting socket" << dendl;
+ } else {
+ ldout(_tcp.cct, 5) << __func__ << " not exist listener or full queue, reset" << dendl;
+ return respond_with_reset(th);
+ }
+ } else {
+ // <SEQ=SEG.ACK><CTL=RST>
+ return respond_with_reset(th);
+ }
+ }
+ auto update_window = [this, th, seg_seq, seg_ack] {
+ ldout(_tcp.cct, 20) << __func__ << " window update seg_seq=" << seg_seq
+ << " seg_ack=" << seg_ack << " old window=" << th->window
+ << " new window=" << int(_snd.window_scale) << dendl;
+ _snd.window = th->window << _snd.window_scale;
+ _snd.wl1 = seg_seq;
+ _snd.wl2 = seg_ack;
+ if (_snd.window == 0) {
+ _persist_time_out = _rto;
+ start_persist_timer();
+ } else {
+ stop_persist_timer();
+ }
+ };
+ // ESTABLISHED STATE or
+ // CLOSE_WAIT STATE: Do the same processing as for the ESTABLISHED state.
+ if (in_state(ESTABLISHED | CLOSE_WAIT)) {
+ // If SND.UNA < SEG.ACK =< SND.NXT then, set SND.UNA <- SEG.ACK.
+ if (_snd.unacknowledged < seg_ack && seg_ack <= _snd.next) {
+ // Remote ACKed data we sent
+ auto acked_bytes = data_segment_acked(seg_ack);
+
+ // If SND.UNA < SEG.ACK =< SND.NXT, the send window should be updated.
+ if (_snd.wl1 < seg_seq || (_snd.wl1 == seg_seq && _snd.wl2 <= seg_ack)) {
+ update_window();
+ }
+
+ // some data is acked, try send more data
+ do_output_data = true;
+
+ auto set_retransmit_timer = [this] {
+ if (_snd.data.empty()) {
+ // All outstanding segments are acked, turn off the timer.
+ stop_retransmit_timer();
+ // Signal the waiter of this event
+ signal_all_data_acked();
+ } else {
+ // Restart the timer because new data is acked.
+ start_retransmit_timer();
+ }
+ };
+
+ if (_snd.dupacks >= 3) {
+ // We are in fast retransmit / fast recovery phase
+ uint32_t smss = _snd.mss;
+ if (seg_ack > _snd.recover) {
+ ldout(_tcp.cct, 20) << __func__ << " ack: full_ack" << dendl;
+ // Set cwnd to min (ssthresh, max(FlightSize, SMSS) + SMSS)
+ _snd.cwnd = std::min(_snd.ssthresh, std::max(flight_size(), smss) + smss);
+ // Exit the fast recovery procedure
+ exit_fast_recovery();
+ set_retransmit_timer();
+ } else {
+ ldout(_tcp.cct, 20) << __func__ << " ack: partial_ack" << dendl;
+ // Retransmit the first unacknowledged segment
+ fast_retransmit();
+ // Deflate the congestion window by the amount of new data
+ // acknowledged by the Cumulative Acknowledgment field
+ _snd.cwnd -= acked_bytes;
+ // If the partial ACK acknowledges at least one SMSS of new
+ // data, then add back SMSS bytes to the congestion window
+ if (acked_bytes >= smss) {
+ _snd.cwnd += smss;
+ }
+ // Send a new segment if permitted by the new value of
+ // cwnd. Do not exit the fast recovery procedure For
+ // the first partial ACK that arrives during fast
+ // recovery, also reset the retransmit timer.
+ if (++_snd.partial_ack == 1) {
+ start_retransmit_timer();
+ }
+ }
+ } else {
+ // RFC5681: The fast retransmit algorithm uses the arrival
+ // of 3 duplicate ACKs (as defined in section 2, without
+ // any intervening ACKs which move SND.UNA) as an
+ // indication that a segment has been lost.
+ //
+ // So, here we reset dupacks to zero because this ACK moves
+ // SND.UNA.
+ exit_fast_recovery();
+ set_retransmit_timer();
+ }
+ } else if (!_snd.data.empty() && seg_len == 0 &&
+ th->f_fin == 0 && th->f_syn == 0 &&
+ th->ack == _snd.unacknowledged &&
+ uint32_t(th->window << _snd.window_scale) == _snd.window) {
+ // Note:
+ // RFC793 states:
+ // If the ACK is a duplicate (SEG.ACK < SND.UNA), it can be ignored
+ // RFC5681 states:
+ // The TCP sender SHOULD use the "fast retransmit" algorithm to detect
+ // and repair loss, based on incoming duplicate ACKs.
+ // Here, We follow RFC5681.
+ _snd.dupacks++;
+ uint32_t smss = _snd.mss;
+ // 3 duplicated ACKs trigger a fast retransmit
+ if (_snd.dupacks == 1 || _snd.dupacks == 2) {
+ // RFC5681 Step 3.1
+ // Send cwnd + 2 * smss per RFC3042
+ do_output_data = true;
+ } else if (_snd.dupacks == 3) {
+ // RFC6582 Step 3.2
+ if (seg_ack - 1 > _snd.recover) {
+ _snd.recover = _snd.next - 1;
+ // RFC5681 Step 3.2
+ _snd.ssthresh = std::max((flight_size() - _snd.limited_transfer) / 2, 2 * smss);
+ fast_retransmit();
+ } else {
+ // Do not enter fast retransmit and do not reset ssthresh
+ }
+ // RFC5681 Step 3.3
+ _snd.cwnd = _snd.ssthresh + 3 * smss;
+ } else if (_snd.dupacks > 3) {
+ // RFC5681 Step 3.4
+ _snd.cwnd += smss;
+ // RFC5681 Step 3.5
+ do_output_data = true;
+ }
+ } else if (seg_ack > _snd.next) {
+ // If the ACK acks something not yet sent (SEG.ACK > SND.NXT)
+ // then send an ACK, drop the segment, and return
+ return output();
+ } else if (_snd.window == 0 && th->window > 0) {
+ update_window();
+ do_output_data = true;
+ }
+ }
+ // FIN_WAIT_1 STATE
+ if (in_state(FIN_WAIT_1)) {
+ // In addition to the processing for the ESTABLISHED state, if
+ // our FIN is now acknowledged then enter FIN-WAIT-2 and continue
+ // processing in that state.
+ if (seg_ack == _snd.next + 1) {
+ ldout(_tcp.cct, 20) << __func__ << " ack: FIN_WAIT_1 -> FIN_WAIT_2" << dendl;
+ _state = FIN_WAIT_2;
+ do_local_fin_acked();
+ }
+ }
+ // FIN_WAIT_2 STATE
+ if (in_state(FIN_WAIT_2)) {
+ // In addition to the processing for the ESTABLISHED state, if
+ // the retransmission queue is empty, the user’s CLOSE can be
+ // acknowledged ("ok") but do not delete the TCB.
+ // TODO
+ }
+ // CLOSING STATE
+ if (in_state(CLOSING)) {
+ if (seg_ack == _snd.next + 1) {
+ ldout(_tcp.cct, 20) << __func__ << " ack: CLOSING -> TIME_WAIT" << dendl;
+ do_local_fin_acked();
+ return do_time_wait();
+ } else {
+ return;
+ }
+ }
+ // LAST_ACK STATE
+ if (in_state(LAST_ACK)) {
+ if (seg_ack == _snd.next + 1) {
+ ldout(_tcp.cct, 20) << __func__ << " ack: LAST_ACK -> CLOSED" << dendl;
+ do_local_fin_acked();
+ return do_closed();
+ }
+ }
+ // TIME_WAIT STATE
+ if (in_state(TIME_WAIT)) {
+ // The only thing that can arrive in this state is a
+ // retransmission of the remote FIN. Acknowledge it, and restart
+ // the 2 MSL timeout.
+ // TODO
+ }
+ }
+
+ // 4.6 sixth, check the URG bit
+ if (th->f_urg) {
+ // TODO
+ }
+
+ // 4.7 seventh, process the segment text
+ if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2)) {
+ if (p.len()) {
+ // Once the TCP takes responsibility for the data it advances
+ // RCV.NXT over the data accepted, and adjusts RCV.WND as
+ // apporopriate to the current buffer availability. The total of
+ // RCV.NXT and RCV.WND should not be reduced.
+ _rcv.data.push_back(std::move(p));
+ _rcv.next += seg_len;
+ auto merged = merge_out_of_order();
+ signal_data_received();
+ // Send an acknowledgment of the form:
+ // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+ // This acknowledgment should be piggybacked on a segment being
+ // transmitted if possible without incurring undue delay.
+ if (merged) {
+ // TCP receiver SHOULD send an immediate ACK when the
+ // incoming segment fills in all or part of a gap in the
+ // sequence space.
+ do_output = true;
+ } else {
+ do_output = should_send_ack(seg_len);
+ }
+ ldout(_tcp.cct, 20) << __func__ << " merged=" << merged << " do_output=" << do_output << dendl;
+ }
+ } else if (in_state(CLOSE_WAIT | CLOSING | LAST_ACK | TIME_WAIT)) {
+ // This should not occur, since a FIN has been received from the
+ // remote side. Ignore the segment text.
+ return;
+ }
+
+ // 4.8 eighth, check the FIN bit
+ if (th->f_fin) {
+ if (in_state(CLOSED | LISTEN | SYN_SENT)) {
+ // Do not process the FIN if the state is CLOSED, LISTEN or SYN-SENT
+ // since the SEG.SEQ cannot be validated; drop the segment and return.
+ return;
+ }
+ auto fin_seq = seg_seq + seg_len;
+ if (fin_seq == _rcv.next) {
+ _rcv.next = fin_seq + 1;
+
+ // If this <FIN> packet contains data as well, we can ACK both data
+ // and <FIN> in a single packet, so canncel the previous ACK.
+ clear_delayed_ack();
+ do_output = false;
+ // Send ACK for the FIN!
+ output();
+ signal_data_received();
+ _errno = 0;
+
+ if (in_state(SYN_RECEIVED | ESTABLISHED)) {
+ ldout(_tcp.cct, 20) << __func__ << " fin: SYN_RECEIVED or ESTABLISHED -> CLOSE_WAIT" << dendl;
+ _state = CLOSE_WAIT;
+ // EOF
+ }
+ if (in_state(FIN_WAIT_1)) {
+ // If our FIN has been ACKed (perhaps in this segment), then
+ // enter TIME-WAIT, start the time-wait timer, turn off the other
+ // timers; otherwise enter the CLOSING state.
+ // Note: If our FIN has been ACKed, we should be in FIN_WAIT_2
+ // not FIN_WAIT_1 if we reach here.
+ ldout(_tcp.cct, 20) << __func__ << " fin: FIN_WAIT_1 -> CLOSING" << dendl;
+ _state = CLOSING;
+ }
+ if (in_state(FIN_WAIT_2)) {
+ ldout(_tcp.cct, 20) << __func__ << " fin: FIN_WAIT_2 -> TIME_WAIT" << dendl;
+ return do_time_wait();
+ }
+ }
+ }
+ if (do_output || (do_output_data && can_send())) {
+ // Since we will do output, we can canncel scheduled delayed ACK.
+ clear_delayed_ack();
+ output();
+ }
+}
+
+ template <typename InetTraits>
+ void tcp<InetTraits>::tcb::connect()
+ {
+   // Active open: choose the initial send sequence number (SND.UNA = ISS,
+   // SND.NXT = ISS + 1), record the receive parameters we will advertise,
+   // then send <SEQ=ISS><CTL=SYN> and enter SYN-SENT.
+   ldout(_tcp.cct, 20) << __func__ << dendl;
+   do_setup_isn();
+
+   // Local receive window scale factor
+   const uint8_t local_scale = 7;
+   _option._local_win_scale = local_scale;
+   _rcv.window_scale = local_scale;
+
+   // Maximum segment size local can receive
+   const uint16_t max_segment = local_mss();
+   _option._local_mss = max_segment;
+   _rcv.mss = max_segment;
+
+   // Linux's default window size, shifted by our own scale factor
+   _rcv.window = 29200 << _rcv.window_scale;
+
+   do_syn_sent();
+ }
+
+ template <typename InetTraits>
+ void tcp<InetTraits>::tcb::close_final_cleanup()
+ {
+   // Final stage of a local close (invoked from C_all_data_acked): release
+   // the "all data acked" notification fd, mark the send side closed, move
+   // the state machine forward and push out our FIN.
+   const int acked_fd = _snd._all_data_acked_fd;
+   if (acked_fd >= 0) {
+     center->delete_file_event(acked_fd, EVENT_READABLE);
+     _tcp.manager.close(acked_fd);
+     _snd._all_data_acked_fd = -1;
+   }
+
+   _snd.closed = true;
+   signal_data_received();
+   ldout(_tcp.cct, 20) << __func__ << " unsent_len=" << _snd.unsent_len << dendl;
+
+   // Only CLOSE_WAIT and ESTABLISHED change state here; any other state is
+   // left untouched.
+   if (in_state(CLOSE_WAIT)) {
+     ldout(_tcp.cct, 20) << __func__ << " CLOSE_WAIT -> LAST_ACK" << dendl;
+     _state = LAST_ACK;
+   } else if (in_state(ESTABLISHED)) {
+     ldout(_tcp.cct, 20) << __func__ << " ESTABLISHED -> FIN_WAIT_1" << dendl;
+     _state = FIN_WAIT_1;
+   }
+
+   // Send <FIN> to remote.
+   // output_one() is called explicitly so that a packet carrying the FIN is
+   // really generated: with output() alone and a non-empty _packetq,
+   // tcp::tcb::get_packet() would not build it.
+   output_one();
+   output();
+   center->delete_file_event(fd, EVENT_READABLE|EVENT_WRITABLE);
+ }
+
+ template <typename InetTraits>
+ void tcp<InetTraits>::tcb::retransmit()
+ {
+   // Retransmission-timer handler. In order it:
+   //  1. re-sends a pending SYN (bounded by _max_nr_retransmit),
+   //  2. re-sends a pending FIN (bounded by _max_nr_retransmit),
+   //  3. if unacknowledged data is queued, resets the congestion state and
+   //     re-sends the oldest unacknowledged segment.
+   // Whenever a retry budget is exhausted the connection is cleaned up with
+   // _errno = -ETIMEDOUT.
+
+   // Emit pending output, double the RTO (capped at _rto_max) and re-arm the
+   // retransmit timer.
+   auto output_update_rto = [this] {
+     output();
+     // According to RFC6298, Update RTO <- RTO * 2 to perform binary exponential back-off
+     this->_rto = std::min(this->_rto * 2, this->_rto_max);
+     start_retransmit_timer();
+   };
+
+   // Retransmit SYN
+   if (syn_needs_on()) {
+     if (_snd.syn_retransmit++ < _max_nr_retransmit) {
+       output_update_rto();
+     } else {
+       ldout(_tcp.cct, 5) << __func__ << " syn retransmit exceed max "
+                          << _max_nr_retransmit << dendl;
+       // Only -ETIMEDOUT is ever observable here; an earlier assignment of
+       // -ECONNABORTED was overwritten before anyone could read it and has
+       // been dropped.
+       _errno = -ETIMEDOUT;
+       cleanup();
+       return;
+     }
+   }
+
+   // Retransmit FIN
+   if (fin_needs_on()) {
+     if (_snd.fin_retransmit++ < _max_nr_retransmit) {
+       output_update_rto();
+     } else {
+       ldout(_tcp.cct, 5) << __func__ << " fin retransmit exceed max "
+                          << _max_nr_retransmit << dendl;
+       _errno = -ETIMEDOUT;
+       cleanup();
+       return;
+     }
+   }
+
+   // Retransmit Data
+   if (_snd.data.empty()) {
+     return;
+   }
+
+   // If there are unacked data, retransmit the earliest segment
+   auto& unacked_seg = _snd.data.front();
+
+   // According to RFC5681
+   // Update ssthresh only for the first retransmit
+   uint32_t smss = _snd.mss;
+   if (unacked_seg.nr_transmits == 0) {
+     _snd.ssthresh = std::max(flight_size() / 2, 2 * smss);
+   }
+   // RFC6582 Step 4
+   _snd.recover = _snd.next - 1;
+   // Start the slow start process
+   _snd.cwnd = smss;
+   // End fast recovery
+   exit_fast_recovery();
+
+   ldout(_tcp.cct, 20) << __func__ << " unack data size " << _snd.data.size()
+                       << " nr=" << unacked_seg.nr_transmits << dendl;
+   if (unacked_seg.nr_transmits < _max_nr_retransmit) {
+     unacked_seg.nr_transmits++;
+   } else {
+     // Delete connection when max num of retransmission is reached
+     ldout(_tcp.cct, 5) << __func__ << " seg retransmit exceed max "
+                        << _max_nr_retransmit << dendl;
+     _errno = -ETIMEDOUT;
+     cleanup();
+     return;
+   }
+   retransmit_one();
+
+   output_update_rto();
+ }
+
+ template <typename InetTraits>
+ void tcp<InetTraits>::tcb::persist()
+ {
+   // Persist-timer handler: probe the peer's window, then back off and re-arm.
+   ldout(_tcp.cct, 20) << __func__ << " persist timer fired" << dendl;
+
+   // Send 1 byte packet to probe peer's window size; the probe flag is only
+   // raised around this single output_one() call.
+   _snd.window_probe = true;
+   output_one();
+   _snd.window_probe = false;
+
+   output();
+
+   // Perform binary exponential back-off per RFC1122, capped at _rto_max
+   const auto backed_off = _persist_time_out * 2;
+   _persist_time_out = std::min(backed_off, _rto_max);
+   start_persist_timer();
+ }
diff --git a/src/msg/async/dpdk/TCP.h b/src/msg/async/dpdk/TCP.h
new file mode 100644
index 00000000..b7bd7132
--- /dev/null
+++ b/src/msg/async/dpdk/TCP.h
@@ -0,0 +1,1503 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_DPDK_TCP_H_
+#define CEPH_DPDK_TCP_H_
+
+#include <unordered_map>
+#include <map>
+#include <queue>
+#include <functional>
+#include <deque>
+#include <chrono>
+#include <stdexcept>
+#include <system_error>
+
+#include "msg/async/dpdk/EventDPDK.h"
+
+#include "include/utime.h"
+#include "common/Throttle.h"
+#include "common/ceph_time.h"
+#include "common/ceph_crypto.h"
+#include "msg/async/Event.h"
+#include "IPChecksum.h"
+#include "IP.h"
+#include "const.h"
+#include "byteorder.h"
+#include "shared_ptr.h"
+#include "PacketUtil.h"
+
+#include "include/random.h"
+
+struct tcp_hdr;
+
+ enum class tcp_state : uint16_t { // TCP connection states; each uses its own bit so several states can be OR-ed into one mask
+ CLOSED = (1 << 0),
+ LISTEN = (1 << 1),
+ SYN_SENT = (1 << 2),
+ SYN_RECEIVED = (1 << 3),
+ ESTABLISHED = (1 << 4),
+ FIN_WAIT_1 = (1 << 5),
+ FIN_WAIT_2 = (1 << 6),
+ CLOSE_WAIT = (1 << 7),
+ CLOSING = (1 << 8),
+ LAST_ACK = (1 << 9),
+ TIME_WAIT = (1 << 10)
+ };
+
+ inline tcp_state operator|(tcp_state s1, tcp_state s2) {
+   // Merge two states into a single bit mask; the result is usually not one
+   // of the named enumerators.
+   const uint16_t mask = static_cast<uint16_t>(s1) | static_cast<uint16_t>(s2);
+   return static_cast<tcp_state>(mask);
+ }
+
+ inline std::ostream & operator<<(std::ostream & str, const tcp_state& s) {
+   // Stream the symbolic name of a single state. Any value that is not
+   // exactly one named enumerator (for example an OR-ed mask) prints as
+   // "UNKNOWN".
+   const char* name = "UNKNOWN";
+   switch (s) {
+   case tcp_state::CLOSED:       name = "CLOSED"; break;
+   case tcp_state::LISTEN:       name = "LISTEN"; break;
+   case tcp_state::SYN_SENT:     name = "SYN_SENT"; break;
+   case tcp_state::SYN_RECEIVED: name = "SYN_RECEIVED"; break;
+   case tcp_state::ESTABLISHED:  name = "ESTABLISHED"; break;
+   case tcp_state::FIN_WAIT_1:   name = "FIN_WAIT_1"; break;
+   case tcp_state::FIN_WAIT_2:   name = "FIN_WAIT_2"; break;
+   case tcp_state::CLOSE_WAIT:   name = "CLOSE_WAIT"; break;
+   case tcp_state::CLOSING:      name = "CLOSING"; break;
+   case tcp_state::LAST_ACK:     name = "LAST_ACK"; break;
+   case tcp_state::TIME_WAIT:    name = "TIME_WAIT"; break;
+   default: break;
+   }
+   return str << name;
+ }
+
+ struct tcp_option { // TCP option wire layouts plus the option values negotiated for one connection
+ // The kind and len fields are fixed values defined by the TCP protocol
+ enum class option_kind: uint8_t { mss = 2, win_scale = 3, sack = 4, timestamps = 8, nop = 1, eol = 0 };
+ enum class option_len: uint8_t { mss = 4, win_scale = 3, sack = 2, timestamps = 10, nop = 1, eol = 1 };
+ struct mss { // maximum-segment-size option as laid out in the header
+ option_kind kind = option_kind::mss;
+ option_len len = option_len::mss;
+ uint16_t mss;
+ struct mss hton() { // returns a copy whose mss field has been passed through ::hton()
+ struct mss m = *this;
+ m.mss = ::hton(m.mss);
+ return m;
+ }
+ } __attribute__((packed));
+ struct win_scale { // window-scale option as laid out in the header
+ option_kind kind = option_kind::win_scale;
+ option_len len = option_len::win_scale;
+ uint8_t shift;
+ } __attribute__((packed));
+ struct sack { // option with kind = sack and len = 2; carries no payload
+ option_kind kind = option_kind::sack;
+ option_len len = option_len::sack;
+ } __attribute__((packed));
+ struct timestamps { // timestamps option: kind, len and two 32-bit values
+ option_kind kind = option_kind::timestamps;
+ option_len len = option_len::timestamps;
+ uint32_t t1;
+ uint32_t t2;
+ } __attribute__((packed));
+ struct nop { // single-byte option, kind only
+ option_kind kind = option_kind::nop;
+ } __attribute__((packed));
+ struct eol { // single-byte option, kind only
+ option_kind kind = option_kind::eol;
+ } __attribute__((packed));
+ static const uint8_t align = 4; // NOTE(review): presumably the byte alignment the option area is padded to - confirm in fill()/get_size()
+
+ void parse(uint8_t* beg, uint8_t* end); // defined elsewhere; takes the [beg, end) byte range of the option area
+ uint8_t fill(tcp_hdr* th, uint8_t option_size); // defined elsewhere
+ uint8_t get_size(bool syn_on, bool ack_on); // defined elsewhere
+
+ // For option negotiation
+ bool _mss_received = false;
+ bool _win_scale_received = false;
+ bool _timestamps_received = false;
+ bool _sack_received = false;
+
+ // Option data
+ uint16_t _remote_mss = 536; // value used until something else is stored here
+ uint16_t _local_mss; // no default initializer; tcb::connect() assigns it from local_mss()
+ uint8_t _remote_win_scale = 0;
+ uint8_t _local_win_scale = 0;
+ };
+ inline uint8_t*& operator+=(uint8_t*& x, tcp_option::option_len len) {
+   // Advance a byte pointer by the length value of one option.
+   x += static_cast<uint8_t>(len);
+   return x;
+ }
+ inline uint8_t& operator+=(uint8_t& x, tcp_option::option_len len) {
+   // Add the length value of one option to a running byte counter.
+   x += static_cast<uint8_t>(len);
+   return x;
+ }
+
+ struct tcp_sequence { // 32-bit TCP sequence number kept in its own type
+ uint32_t raw; // the plain numeric value
+ };
+
+ // Return ts with its raw value passed through ::ntoh().
+ // Declared inline because this definition lives in a header: without it,
+ // every translation unit including the header would emit its own external
+ // definition and the program would violate the one-definition rule.
+ inline tcp_sequence ntoh(tcp_sequence ts) {
+   return tcp_sequence { ::ntoh(ts.raw) };
+ }
+
+ // Return ts with its raw value passed through ::hton().
+ // Declared inline because this definition lives in a header: without it,
+ // every translation unit including the header would emit its own external
+ // definition and the program would violate the one-definition rule.
+ inline tcp_sequence hton(tcp_sequence ts) {
+   return tcp_sequence { ::hton(ts.raw) };
+ }
+
+ inline std::ostream& operator<<(std::ostream& os, const tcp_sequence& s) {
+   // A sequence number prints as its raw unsigned value.
+   os << s.raw;
+   return os;
+ }
+
+ // Build a sequence number from a plain 32-bit value.
+ inline tcp_sequence make_seq(uint32_t raw) {
+   tcp_sequence seq;
+   seq.raw = raw;
+   return seq;
+ }
+ // In-place offset arithmetic; the unsigned raw value wraps around.
+ inline tcp_sequence& operator+=(tcp_sequence& s, int32_t n) {
+   s.raw += n;
+   return s;
+ }
+ inline tcp_sequence& operator-=(tcp_sequence& s, int32_t n) {
+   s.raw -= n;
+   return s;
+ }
+ // Offset arithmetic on a copy.
+ inline tcp_sequence operator+(tcp_sequence s, int32_t n) {
+   s.raw += n;
+   return s;
+ }
+ inline tcp_sequence operator-(tcp_sequence s, int32_t n) {
+   s.raw -= n;
+   return s;
+ }
+ // Signed distance from q to s, computed on the unsigned raw values and
+ // then converted to int32_t.
+ inline int32_t operator-(tcp_sequence s, tcp_sequence q) {
+   return static_cast<int32_t>(s.raw - q.raw);
+ }
+ inline bool operator==(tcp_sequence s, tcp_sequence q) {
+   return s.raw == q.raw;
+ }
+ inline bool operator!=(tcp_sequence s, tcp_sequence q) {
+   return s.raw != q.raw;
+ }
+ // Ordering is defined through the sign of the signed distance above, not
+ // through a plain comparison of the raw values.
+ inline bool operator<(tcp_sequence s, tcp_sequence q) {
+   return (s - q) < 0;
+ }
+ inline bool operator>(tcp_sequence s, tcp_sequence q) {
+   return (q - s) < 0;
+ }
+ inline bool operator<=(tcp_sequence s, tcp_sequence q) {
+   return !((q - s) < 0);
+ }
+ inline bool operator>=(tcp_sequence s, tcp_sequence q) {
+   return !((s - q) < 0);
+ }
+
+ struct tcp_hdr { // packed layout of the fixed part of a TCP header (options are not part of this struct)
+ uint16_t src_port;
+ uint16_t dst_port;
+ tcp_sequence seq;
+ tcp_sequence ack;
+ uint8_t rsvd1 : 4;
+ uint8_t data_offset : 4; // header length in 4-byte units (tcp::received() multiplies it by 4)
+ uint8_t f_fin : 1;
+ uint8_t f_syn : 1;
+ uint8_t f_rst : 1;
+ uint8_t f_psh : 1;
+ uint8_t f_ack : 1;
+ uint8_t f_urg : 1;
+ uint8_t rsvd2 : 2;
+ uint16_t window;
+ uint16_t checksum;
+ uint16_t urgent;
+
+ tcp_hdr hton() { // copy of *this with every multi-byte field passed through ::hton(); bit-fields are copied unchanged
+ tcp_hdr hdr = *this;
+ hdr.src_port = ::hton(src_port);
+ hdr.dst_port = ::hton(dst_port);
+ hdr.seq = ::hton(seq);
+ hdr.ack = ::hton(ack);
+ hdr.window = ::hton(window);
+ hdr.checksum = ::hton(checksum);
+ hdr.urgent = ::hton(urgent);
+ return hdr;
+ }
+
+ tcp_hdr ntoh() { // copy of *this with every multi-byte field passed through ::ntoh(); bit-fields are copied unchanged
+ tcp_hdr hdr = *this;
+ hdr.src_port = ::ntoh(src_port);
+ hdr.dst_port = ::ntoh(dst_port);
+ hdr.seq = ::ntoh(seq);
+ hdr.ack = ::ntoh(ack);
+ hdr.window = ::ntoh(window);
+ hdr.checksum = ::ntoh(checksum);
+ hdr.urgent = ::ntoh(urgent);
+ return hdr;
+ }
+ } __attribute__((packed));
+
+struct tcp_tag {};
+using tcp_packet_merger = packet_merger<tcp_sequence, tcp_tag>;
+
+template <typename InetTraits>
+class tcp {
+ public:
+ using ipaddr = typename InetTraits::address_type;
+ using inet_type = typename InetTraits::inet_type;
+ using connid = l4connid<InetTraits>;
+ using connid_hash = typename connid::connid_hash;
+ class connection;
+ class listener;
+ private:
+ class tcb;
+
+ class C_handle_delayed_ack : public EventCallback {
+   // Event callback that clears the tcb's full-segment counter and asks the
+   // tcb to produce output. The tcb is held as a plain, non-owning pointer.
+   tcb *tc;
+
+  public:
+   C_handle_delayed_ack(tcb *t): tc(t) { }
+   // `override` makes the compiler verify this really replaces
+   // EventCallback::do_request; the argument is not used.
+   void do_request(uint64_t r) override {
+     tc->_nr_full_seg_received = 0;
+     tc->output();
+   }
+ };
+
+ class C_handle_retransmit : public EventCallback {
+   // Event callback that forwards to tcb::retransmit(). The tcb is held as a
+   // plain, non-owning pointer.
+   tcb *tc;
+
+  public:
+   C_handle_retransmit(tcb *t): tc(t) { }
+   // `override` makes the compiler verify this really replaces
+   // EventCallback::do_request; the argument is not used.
+   void do_request(uint64_t r) override {
+     tc->retransmit();
+   }
+ };
+
+ class C_handle_persist : public EventCallback {
+   // Event callback that forwards to tcb::persist(). The tcb is held as a
+   // plain, non-owning pointer.
+   tcb *tc;
+
+  public:
+   C_handle_persist(tcb *t): tc(t) { }
+   // `override` makes the compiler verify this really replaces
+   // EventCallback::do_request; the argument is not used.
+   void do_request(uint64_t r) override {
+     tc->persist();
+   }
+ };
+
+ class C_all_data_acked : public EventCallback {
+   // Event callback that forwards to tcb::close_final_cleanup(). The tcb is
+   // held as a plain, non-owning pointer.
+   tcb *tc;
+
+  public:
+   C_all_data_acked(tcb *t): tc(t) {}
+   // `override` makes the compiler verify this really replaces
+   // EventCallback::do_request; the argument is not used.
+   void do_request(uint64_t fd_or_id) override {
+     tc->close_final_cleanup();
+   }
+ };
+
+ class C_actual_remove_tcb : public EventCallback {
+   // One-shot event callback that holds a shared reference to a tcb. When it
+   // fires it deletes itself, which releases that reference.
+   lw_shared_ptr<tcb> tc;
+  public:
+   C_actual_remove_tcb(tcb *t): tc(t->shared_from_this()) {}
+   // `override` makes the compiler verify this really replaces
+   // EventCallback::do_request; the argument is not used.
+   void do_request(uint64_t r) override {
+     delete this;
+   }
+ };
+
+ class tcb : public enable_lw_shared_from_this<tcb> {
+ using clock_type = ceph::coarse_real_clock;
+ static constexpr tcp_state CLOSED = tcp_state::CLOSED;
+ static constexpr tcp_state LISTEN = tcp_state::LISTEN;
+ static constexpr tcp_state SYN_SENT = tcp_state::SYN_SENT;
+ static constexpr tcp_state SYN_RECEIVED = tcp_state::SYN_RECEIVED;
+ static constexpr tcp_state ESTABLISHED = tcp_state::ESTABLISHED;
+ static constexpr tcp_state FIN_WAIT_1 = tcp_state::FIN_WAIT_1;
+ static constexpr tcp_state FIN_WAIT_2 = tcp_state::FIN_WAIT_2;
+ static constexpr tcp_state CLOSE_WAIT = tcp_state::CLOSE_WAIT;
+ static constexpr tcp_state CLOSING = tcp_state::CLOSING;
+ static constexpr tcp_state LAST_ACK = tcp_state::LAST_ACK;
+ static constexpr tcp_state TIME_WAIT = tcp_state::TIME_WAIT;
+ tcp_state _state = CLOSED;
+ tcp& _tcp;
+ UserspaceEventManager &manager;
+ connection* _conn = nullptr;
+ bool _connect_done = false;
+ ipaddr _local_ip;
+ ipaddr _foreign_ip;
+ uint16_t _local_port;
+ uint16_t _foreign_port;
+ struct unacked_segment {
+ Packet p;
+ uint16_t data_len;
+ unsigned nr_transmits;
+ clock_type::time_point tx_time;
+ };
+ struct send {
+ tcp_sequence unacknowledged;
+ tcp_sequence next;
+ uint32_t window;
+ uint8_t window_scale;
+ uint16_t mss;
+ tcp_sequence urgent;
+ tcp_sequence wl1;
+ tcp_sequence wl2;
+ tcp_sequence initial;
+ std::deque<unacked_segment> data;
+ std::deque<Packet> unsent;
+ uint32_t unsent_len = 0;
+ uint32_t queued_len = 0;
+ bool closed = false;
+ // Wait for all data are acked
+ int _all_data_acked_fd = -1;
+ // Limit number of data queued into send queue
+ Throttle user_queue_space;
+ // Round-trip time variation
+ std::chrono::microseconds rttvar;
+ // Smoothed round-trip time
+ std::chrono::microseconds srtt;
+ bool first_rto_sample = true;
+ clock_type::time_point syn_tx_time;
+ // Congestion window
+ uint32_t cwnd;
+ // Slow start threshold
+ uint32_t ssthresh;
+ // Duplicated ACKs
+ uint16_t dupacks = 0;
+ unsigned syn_retransmit = 0;
+ unsigned fin_retransmit = 0;
+ uint32_t limited_transfer = 0;
+ uint32_t partial_ack = 0;
+ tcp_sequence recover;
+ bool window_probe = false;
+ send(CephContext *c): user_queue_space(c, "DPDK::tcp::tcb::user_queue_space", 81920) {}
+ } _snd;
+ struct receive {
+ tcp_sequence next;
+ uint32_t window;
+ uint8_t window_scale;
+ uint16_t mss;
+ tcp_sequence urgent;
+ tcp_sequence initial;
+ std::deque<Packet> data;
+ tcp_packet_merger out_of_order;
+ } _rcv;
+ EventCenter *center;
+ int fd;
+ // positive means no errno, 0 means eof, nagetive means error
+ int16_t _errno = 1;
+ tcp_option _option;
+ EventCallbackRef delayed_ack_event;
+ Tub<uint64_t> _delayed_ack_fd;
+ // Retransmission timeout
+ std::chrono::microseconds _rto{1000*1000};
+ std::chrono::microseconds _persist_time_out{1000*1000};
+ static constexpr std::chrono::microseconds _rto_min{1000*1000};
+ static constexpr std::chrono::microseconds _rto_max{60000*1000};
+ // Clock granularity
+ static constexpr std::chrono::microseconds _rto_clk_granularity{1000};
+ static constexpr uint16_t _max_nr_retransmit{5};
+ EventCallbackRef retransmit_event;
+ Tub<uint64_t> retransmit_fd;
+ EventCallbackRef persist_event;
+ EventCallbackRef all_data_ack_event;
+ Tub<uint64_t> persist_fd;
+ uint16_t _nr_full_seg_received = 0;
+ struct isn_secret {
+ // 512 bits secretkey for ISN generating
+ uint32_t key[16];
+ isn_secret () {
+ for (auto& k : key) {
+ k = ceph::util::generate_random_number<uint32_t>(0, std::numeric_limits<uint32_t>::max());
+ }
+ }
+ };
+ static isn_secret _isn_secret;
+ tcp_sequence get_isn();
+ circular_buffer<typename InetTraits::l4packet> _packetq;
+ bool _poll_active = false;
+ public:
+ // callback
+ void close_final_cleanup();
+ ostream& _prefix(std::ostream *_dout);
+
+ public:
+ tcb(tcp& t, connid id);
+ ~tcb();
+ void input_handle_listen_state(tcp_hdr* th, Packet p);
+ void input_handle_syn_sent_state(tcp_hdr* th, Packet p);
+ void input_handle_other_state(tcp_hdr* th, Packet p);
+ void output_one(bool data_retransmit = false);
+ bool is_all_data_acked();
+ int send(Packet p);
+ void connect();
+ Tub<Packet> read();
+ void close();
+ void remove_from_tcbs() {
+   // Erase the entry keyed by this connection's address/port 4-tuple from
+   // the owning tcp's tcb table.
+   _tcp._tcbs.erase(connid{_local_ip, _foreign_ip, _local_port, _foreign_port});
+ }
+ Tub<typename InetTraits::l4packet> get_packet();
+ void output() {
+   // Ask the tcp layer to poll this tcb for packets once the L2 address of
+   // the peer is known. _poll_active ensures only one request is pending.
+   if (_poll_active) {
+     return;
+   }
+   _poll_active = true;
+
+   auto self = this->shared_from_this();
+   _tcp._inet.wait_l2_dst_address(_foreign_ip, Packet(), [self] (const ethernet_address &dst, Packet p, int r) {
+     if (r == 0) {
+       // Address resolved: queue this tcb for polling.
+       self->_tcp.poll_tcb(dst, std::move(self));
+       return;
+     }
+     if (r == -ETIMEDOUT) {
+       // in other states connection should time out
+       if (self->in_state(SYN_SENT)) {
+         self->_errno = -ETIMEDOUT;
+         self->cleanup();
+       }
+       return;
+     }
+     if (r == -EBUSY) {
+       // retry later
+       self->_poll_active = false;
+       self->start_retransmit_timer();
+     }
+   });
+ }
+
+ int16_t get_errno() const {
+ return _errno;
+ }
+
+ tcp_state& state() {
+ return _state;
+ }
+
+ uint64_t peek_sent_available() {
+   // Remaining room in the user send-queue throttle (max - current); always
+   // 0 unless the connection is ESTABLISHED.
+   if (in_state(ESTABLISHED)) {
+     return _snd.user_queue_space.get_max() - _snd.user_queue_space.get_current();
+   }
+   return 0;
+ }
+
+ int is_connected() const {
+   // While _errno is zero or negative it is returned as-is; otherwise the
+   // result is 1 once _connect_done has been set and 0 before that.
+   return _errno <= 0 ? int(_errno) : int(_connect_done);
+ }
+
+ private:
+ void respond_with_reset(tcp_hdr* th);
+ bool merge_out_of_order();
+ void insert_out_of_order(tcp_sequence seq, Packet p);
+ void trim_receive_data_after_window();
+ bool should_send_ack(uint16_t seg_len);
+ void clear_delayed_ack();
+ Packet get_transmit_packet();
+ void retransmit_one() {
+ bool data_retransmit = true;
+ output_one(data_retransmit);
+ }
+ void start_retransmit_timer() {
+   // (Re)arm the retransmit timer: cancel a pending event if one exists,
+   // then create a new time event with _rto.count() as its delay.
+   if (retransmit_fd) {
+     center->delete_time_event(*retransmit_fd);
+   }
+   const auto event_id = center->create_time_event(_rto.count(), retransmit_event);
+   retransmit_fd.construct(event_id);
+ }
+ void stop_retransmit_timer() {
+   // Cancel the retransmit timer and forget its id; no-op when not armed.
+   if (!retransmit_fd) {
+     return;
+   }
+   center->delete_time_event(*retransmit_fd);
+   retransmit_fd.destroy();
+ }
+ void start_persist_timer() {
+   // (Re)arm the persist timer: cancel a pending event if one exists, then
+   // create a new time event with _persist_time_out.count() as its delay.
+   if (persist_fd) {
+     center->delete_time_event(*persist_fd);
+   }
+   const auto event_id = center->create_time_event(_persist_time_out.count(), persist_event);
+   persist_fd.construct(event_id);
+ }
+ void stop_persist_timer() {
+   // Cancel the persist timer and forget its id; no-op when not armed.
+   if (!persist_fd) {
+     return;
+   }
+   center->delete_time_event(*persist_fd);
+   persist_fd.destroy();
+ }
+ void persist();
+ void retransmit();
+ void fast_retransmit();
+ void update_rto(clock_type::time_point tx_time);
+ void update_cwnd(uint32_t acked_bytes);
+ void cleanup();
+ uint32_t can_send() { // number of bytes that may be sent right now; NOTE: not a pure query, see limited_transfer below
+ if (_snd.window_probe) {
+ return 1; // a window probe always sends exactly one byte
+ }
+ // Can not send more than the advertised window allows, nor more than is waiting unsent
+ auto x = std::min(uint32_t(_snd.unacknowledged + _snd.window - _snd.next), _snd.unsent_len);
+ // Can not send more than congestion window allows
+ x = std::min(_snd.cwnd, x);
+ if (_snd.dupacks == 1 || _snd.dupacks == 2) {
+ // RFC5681 Step 3.1
+ // Send cwnd + 2 * smss per RFC3042
+ auto flight = flight_size();
+ auto max = _snd.cwnd + 2 * _snd.mss;
+ x = flight <= max ? std::min(x, max - flight) : 0; // nothing may be sent once the flight size exceeds that bound
+ _snd.limited_transfer += x; // side effect: every call in this branch adds x to the running total
+ } else if (_snd.dupacks >= 3) {
+ // RFC5681 Step 3.5
+ // Send 1 full-sized segment at most
+ x = std::min(uint32_t(_snd.mss), x);
+ }
+ return x;
+ }
+ uint32_t flight_size() {
+   // Sum of the packet lengths of every segment in the unacked queue.
+   uint32_t in_flight = 0;
+   for (unacked_segment& seg : _snd.data) {
+     in_flight += seg.p.len();
+   }
+   return in_flight;
+ }
+ uint16_t local_mss() {
+ return _tcp.get_hw_features().mtu - tcp_hdr_len_min - InetTraits::ip_hdr_len_min;
+ }
+ void queue_packet(Packet p) {
+ _packetq.emplace_back(
+ typename InetTraits::l4packet{_foreign_ip, std::move(p)});
+ }
+ void signal_data_received() {
+ manager.notify(fd, EVENT_READABLE);
+ }
+ void signal_all_data_acked() {
+ if (_snd._all_data_acked_fd >= 0 && _snd.unsent_len == 0 && _snd.queued_len == 0)
+ manager.notify(_snd._all_data_acked_fd, EVENT_READABLE);
+ }
+ void do_syn_sent() {
+ _state = SYN_SENT;
+ _snd.syn_tx_time = clock_type::now();
+ // Send <SYN> to remote
+ output();
+ }
+ void do_syn_received() {
+ _state = SYN_RECEIVED;
+ _snd.syn_tx_time = clock_type::now();
+ // Send <SYN,ACK> to remote
+ output();
+ }
+ void do_established() {
+ _state = ESTABLISHED;
+ update_rto(_snd.syn_tx_time);
+ _connect_done = true;
+ manager.notify(fd, EVENT_READABLE|EVENT_WRITABLE);
+ }
+ void do_reset() {
+ _state = CLOSED;
+ // Free packets to be sent which are waiting for user_queue_space
+ _snd.user_queue_space.reset();
+ cleanup();
+ _errno = -ECONNRESET;
+ manager.notify(fd, EVENT_READABLE);
+
+ if (_snd._all_data_acked_fd >= 0)
+ manager.notify(_snd._all_data_acked_fd, EVENT_READABLE);
+ }
+ void do_time_wait() {
+ // FIXME: Implement TIME_WAIT state timer
+ _state = TIME_WAIT;
+ cleanup();
+ }
+ void do_closed() {
+ _state = CLOSED;
+ cleanup();
+ }
+ void do_setup_isn() {
+ _snd.initial = get_isn();
+ _snd.unacknowledged = _snd.initial;
+ _snd.next = _snd.initial + 1;
+ _snd.recover = _snd.initial;
+ }
+ void do_local_fin_acked() {
+ _snd.unacknowledged += 1;
+ _snd.next += 1;
+ }
+ bool syn_needs_on() {
+ return in_state(SYN_SENT | SYN_RECEIVED);
+ }
+ bool fin_needs_on() {
+ return in_state(FIN_WAIT_1 | CLOSING | LAST_ACK) && _snd.closed &&
+ _snd.unsent_len == 0 && _snd.queued_len == 0;
+ }
+ bool ack_needs_on() {
+ return !in_state(CLOSED | LISTEN | SYN_SENT);
+ }
+ bool foreign_will_not_send() {
+ return in_state(CLOSING | TIME_WAIT | CLOSE_WAIT | LAST_ACK | CLOSED);
+ }
+ bool in_state(tcp_state state) {
+   // True when the current state shares at least one bit with `state`,
+   // which may be a single state or an OR-ed mask of several.
+   const uint16_t current = static_cast<uint16_t>(_state);
+   const uint16_t wanted = static_cast<uint16_t>(state);
+   return (current & wanted) != 0;
+ }
+ void exit_fast_recovery() {
+ _snd.dupacks = 0;
+ _snd.limited_transfer = 0;
+ _snd.partial_ack = 0;
+ }
+ uint32_t data_segment_acked(tcp_sequence seg_ack);
+ bool segment_acceptable(tcp_sequence seg_seq, unsigned seg_len);
+ void init_from_options(tcp_hdr* th, uint8_t* opt_start, uint8_t* opt_end);
+ friend class connection;
+
+ friend class C_handle_delayed_ack;
+ friend class C_handle_retransmit;
+ friend class C_handle_persist;
+ friend class C_all_data_acked;
+ };
+
+ CephContext *cct;
+ // ipv4_l4<ip_protocol_num::tcp>
+ inet_type& _inet;
+ EventCenter *center;
+ UserspaceEventManager &manager;
+ std::unordered_map<connid, lw_shared_ptr<tcb>, connid_hash> _tcbs;
+ std::unordered_map<uint16_t, listener*> _listening;
+ std::random_device _rd;
+ std::default_random_engine _e;
+ std::uniform_int_distribution<uint16_t> _port_dist{41952, 65535};
+ circular_buffer<std::pair<lw_shared_ptr<tcb>, ethernet_address>> _poll_tcbs;
+ // queue for packets that do not belong to any tcb
+ circular_buffer<ipv4_traits::l4packet> _packetq;
+ Throttle _queue_space;
+ // Limit number of data queued into send queue
+ public:
+ class connection { // move-only user-facing handle for one TCP connection; forwards to the tcb it holds a shared reference to
+ lw_shared_ptr<tcb> _tcb;
+ public:
+ explicit connection(lw_shared_ptr<tcb> tcbp) : _tcb(std::move(tcbp)) { _tcb->_conn = this; } // records this handle in the tcb
+ connection(const connection&) = delete;
+ connection(connection&& x) noexcept : _tcb(std::move(x._tcb)) {
+ _tcb->_conn = this; // NOTE(review): dereferences _tcb unconditionally - assumes x still held a tcb
+ }
+ ~connection();
+ void operator=(const connection&) = delete;
+ connection& operator=(connection&& x) { // move-assign by destroying *this and move-constructing in place
+ if (this != &x) {
+ this->~connection();
+ new (this) connection(std::move(x));
+ }
+ return *this;
+ }
+ int fd() const {
+ return _tcb->fd;
+ }
+ int send(Packet p) {
+ return _tcb->send(std::move(p));
+ }
+ Tub<Packet> read() {
+ return _tcb->read();
+ }
+ int16_t get_errno() const {
+ return _tcb->get_errno();
+ }
+ void close_read();
+ void close_write();
+ entity_addr_t remote_addr() const { // fills in only the peer IPv4 address and the address family; no port is set here
+ entity_addr_t addr;
+ auto net_ip = _tcb->_foreign_ip.hton();
+ memcpy((void*)&addr.in4_addr().sin_addr.s_addr,
+ &net_ip, sizeof(addr.in4_addr().sin_addr.s_addr));
+ addr.set_family(AF_INET);
+ return addr;
+ }
+ uint64_t peek_sent_available() {
+ return _tcb->peek_sent_available();
+ }
+ int is_connected() const { return _tcb->is_connected(); }
+ };
+ class listener {
+   // Passive-open endpoint for one local port. After listen() succeeds the
+   // object is registered in tcp::_listening and owns an event fd; accepted
+   // connections are queued in _q (at most _q_max_length of them) until the
+   // user collects them with accept().
+   tcp& _tcp;
+   uint16_t _port;
+   int _fd = -1;              // -1 until listen() succeeds
+   int16_t _errno;
+   queue<connection> _q;
+   size_t _q_max_length;
+
+  private:
+   // Only tcp (a friend) creates listeners, through tcp::listen().
+   listener(tcp& t, uint16_t port, size_t queue_length)
+     : _tcp(t), _port(port), _errno(0), _q(), _q_max_length(queue_length) {
+   }
+  public:
+   listener(const listener&) = delete;
+   void operator=(const listener&) = delete;
+   // Move constructor. Two things matter for correctness:
+   //  - _q_max_length must be carried over, otherwise full() would compare
+   //    against an uninitialized value;
+   //  - the source must give up its fd, otherwise its destructor would
+   //    unregister the port and close the fd this object now owns.
+   listener(listener&& x)
+     : _tcp(x._tcp), _port(x._port), _fd(x._fd), _errno(x._errno),
+       _q(std::move(x._q)), _q_max_length(x._q_max_length) {
+     x._fd = -1;
+     if (_fd >= 0)
+       _tcp._listening[_port] = this;
+   }
+   ~listener() {
+     abort_accept();
+   }
+   // Register this listener for its port and obtain the event fd.
+   // Returns 0 on success or -EADDRINUSE when the port is already registered.
+   int listen() {
+     if (_tcp._listening.find(_port) != _tcp._listening.end())
+       return -EADDRINUSE;
+     _tcp._listening.emplace(_port, this);
+     _fd = _tcp.manager.get_eventfd();
+     return 0;
+   }
+   // Pop the oldest queued connection; the returned Tub is empty when
+   // nothing is queued.
+   Tub<connection> accept() {
+     Tub<connection> c;
+     if (!_q.empty()) {
+       c = std::move(_q.front());
+       _q.pop();
+     }
+     return c;
+   }
+   // Drop every queued connection and, if listening, unregister the port
+   // and close the event fd. Safe to call more than once.
+   void abort_accept() {
+     while (!_q.empty())
+       _q.pop();
+     if (_fd >= 0) {
+       _tcp._listening.erase(_port);
+       _tcp.manager.close(_fd);
+       _fd = -1;
+     }
+   }
+   int16_t get_errno() const {
+     return _errno;
+   }
+   // True when no further connection may be queued. `>=` rather than `==`
+   // so the answer stays correct even if the queue ever exceeds the limit.
+   bool full() const {
+     return _q.size() >= _q_max_length;
+   }
+   int fd() const {
+     return _fd;
+   }
+   friend class tcp;
+ };
+ public:
+ explicit tcp(CephContext *c, inet_type& inet, EventCenter *cen);
+ void received(Packet p, ipaddr from, ipaddr to);
+ bool forward(forward_hash& out_hash_data, Packet& p, size_t off);
+ listener listen(uint16_t port, size_t queue_length = 100);
+ connection connect(const entity_addr_t &addr);
+ const hw_features& get_hw_features() const { return _inet._inet.get_hw_features(); }
+ void poll_tcb(const ethernet_address &dst, lw_shared_ptr<tcb> tcb) {
+ _poll_tcbs.emplace_back(std::move(tcb), dst);
+ }
+ bool push_listen_queue(uint16_t port, tcb *t) {
+   // Queue a connection wrapping `t` on the listener registered for `port`
+   // and signal the listener's fd as readable. Returns false, changing
+   // nothing, when no listener is registered or its queue is full.
+   auto it = _listening.find(port);
+   if (it == _listening.end()) {
+     return false;
+   }
+   auto* target = it->second;
+   if (target->full()) {
+     return false;
+   }
+   target->_q.push(connection(t->shared_from_this()));
+   manager.notify(target->_fd, EVENT_READABLE);
+   return true;
+ }
+
+ private:
+ void send_packet_without_tcb(ipaddr from, ipaddr to, Packet p);
+ void respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip);
+ friend class listener;
+};
+
+ template <typename InetTraits>
+ tcp<InetTraits>::tcp(CephContext *c, inet_type& inet, EventCenter *cen)
+   : cct(c), _inet(inet), center(cen),
+     manager(static_cast<DPDKDriver*>(cen->get_driver())->manager),
+     _e(_rd()), _queue_space(cct, "DPDK::tcp::queue_space", 81920) {
+   // Register the packet provider the network layer pulls outgoing TCP
+   // packets from. Each call yields at most one packet:
+   //  - a packet that belongs to no tcb (_packetq) is chosen when one is
+   //    queued and either no tcb is waiting to be polled or the running
+   //    poll counter is a multiple of 128;
+   //  - otherwise the tcbs queued at entry are polled in order until one of
+   //    them produces a packet.
+   //
+   // The counter is unsigned on purpose: it is incremented for the whole
+   // lifetime of the stack, and a signed counter would eventually overflow,
+   // which is undefined behaviour. Unsigned wrap-around is well defined and
+   // keeps the "every 128th poll" period intact.
+   unsigned tcb_polled = 0u;
+   _inet.register_packet_provider([this, tcb_polled] () mutable {
+     Tub<typename InetTraits::l4packet> l4p;
+     auto c = _poll_tcbs.size();
+     if (!_packetq.empty() && (!(tcb_polled % 128) || c == 0)) {
+       l4p = std::move(_packetq.front());
+       _packetq.pop_front();
+       // Give back the queue space this packet was holding.
+       _queue_space.put(l4p->p.len());
+     } else {
+       while (c--) {
+         tcb_polled++;
+         lw_shared_ptr<tcb> tcb;
+         ethernet_address dst;
+         std::tie(tcb, dst) = std::move(_poll_tcbs.front());
+         _poll_tcbs.pop_front();
+         // get_packet() returns by value, so the result is already an
+         // rvalue; no std::move needed.
+         l4p = tcb->get_packet();
+         if (l4p) {
+           l4p->e_dst = dst;
+           break;
+         }
+       }
+     }
+     return l4p;
+   });
+ }
+
+ template <typename InetTraits>
+ auto tcp<InetTraits>::listen(uint16_t port, size_t queue_length) -> listener {
+ return listener(*this, port, queue_length); // only builds the object; the port is registered later by listener::listen()
+ }
+
+ template <typename InetTraits>
+ typename tcp<InetTraits>::connection tcp<InetTraits>::connect(const entity_addr_t &addr) {
+   // Active open towards `addr`: pick a random local port whose connection
+   // id is unused and, with more than one hardware queue, hashes to this
+   // event center; then create, register and start the tcb.
+   auto src_ip = _inet._inet.host_address();
+   auto dst_ip = ipv4_address(addr);
+   auto dst_port = addr.get_port();
+
+   connid id;
+   for (;;) {
+     const uint16_t src_port = _port_dist(_e);
+     id = connid{src_ip, dst_ip, src_port, (uint16_t)dst_port};
+     if (_tcbs.find(id) != _tcbs.end()) {
+       // Already in use, draw another port.
+       continue;
+     }
+     if (_inet._inet.netif()->hw_queues_count() == 1) {
+       break;
+     }
+     const auto cpu = _inet._inet.netif()->hash2cpu(
+         id.hash(_inet._inet.netif()->rss_key()));
+     if (cpu == center->get_id()) {
+       break;
+     }
+   }
+
+   auto tcbp = make_lw_shared<tcb>(*this, id);
+   _tcbs.insert({id, tcbp});
+   tcbp->connect();
+   return connection(tcbp);
+ }
+
+ template <typename InetTraits>
+ bool tcp<InetTraits>::forward(forward_hash& out_hash_data, Packet& p, size_t off) {
+   // Append the source and destination port of the TCP header found at
+   // offset `off` to the hash input. Always returns true, whether or not a
+   // header could be read.
+   if (auto th = p.get_header<tcp_hdr>(off)) {
+     out_hash_data.push_back(th->src_port);
+     out_hash_data.push_back(th->dst_port);
+   }
+   return true;
+ }
+
+ template <typename InetTraits>
+ void tcp<InetTraits>::received(Packet p, ipaddr from, ipaddr to) {
+   // Entry point for an incoming TCP segment: validate it, find the tcb it
+   // belongs to and dispatch to the handler for that tcb's state.
+   auto th = p.get_header<tcp_hdr>(0);
+   // Drop segments without a readable header or whose data offset is
+   // smaller than the fixed header.
+   // th->data_offset is correct even before ntoh()
+   if (!th || unsigned(th->data_offset * 4) < sizeof(*th)) {
+     return;
+   }
+
+   // Verify the checksum in software unless the hardware already did.
+   if (!get_hw_features().rx_csum_offload) {
+     checksummer csum;
+     InetTraits::tcp_pseudo_header_checksum(csum, from, to, p.len());
+     csum.sum(p);
+     if (csum.get() != 0) {
+       return;
+     }
+   }
+
+   auto h = th->ntoh();
+   auto id = connid{to, from, h.dst_port, h.src_port};
+   auto tcbi = _tcbs.find(id);
+
+   if (tcbi != _tcbs.end()) {
+     // A tcb exists for this connection id.
+     lw_shared_ptr<tcb> tcbp = tcbi->second;
+     if (tcbp->state() == tcp_state::SYN_SENT) {
+       // 3) In SYN_SENT State
+       tcbp->input_handle_syn_sent_state(&h, std::move(p));
+     } else {
+       // 4) In other state, can be one of the following:
+       // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2
+       // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT
+       tcbp->input_handle_other_state(&h, std::move(p));
+     }
+     return;
+   }
+
+   // No tcb: the segment is only of interest if somebody listens on the port.
+   auto lit = _listening.find(id.local_port);
+   if (lit == _listening.end() || lit->second->full()) {
+     // 1) In CLOSE state
+     // 1.1 all data in the incoming segment is discarded. An incoming
+     // segment containing a RST is discarded. An incoming segment not
+     // containing a RST causes a RST to be sent in response.
+     // FIXME:
+     // if ACK off: <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK>
+     // if ACK on: <SEQ=SEG.ACK><CTL=RST>
+     respond_with_reset(&h, id.local_ip, id.foreign_ip);
+     return;
+   }
+
+   // 2) In LISTEN state
+   // 2.1 first check for an RST
+   if (h.f_rst) {
+     // An incoming RST should be ignored
+     return;
+   }
+   // 2.2 second check for an ACK
+   if (h.f_ack) {
+     // Any acknowledgment is bad if it arrives on a connection
+     // still in the LISTEN state.
+     // <SEQ=SEG.ACK><CTL=RST>
+     respond_with_reset(&h, id.local_ip, id.foreign_ip);
+     return;
+   }
+   // 2.3 third check for a SYN
+   if (h.f_syn) {
+     // check the security
+     // NOTE: Ignored for now
+     lw_shared_ptr<tcb> tcbp = make_lw_shared<tcb>(*this, id);
+     _tcbs.insert({id, tcbp});
+     tcbp->input_handle_listen_state(&h, std::move(p));
+     return;
+   }
+   // 2.4 fourth other text or control
+   // So you are unlikely to get here, but if you do, drop the
+   // segment, and return.
+ }
+
+// Send a packet that does not belong to any tcb.
+//
+// p.len() units are reserved from _queue_space; if the reservation fails the
+// packet is dropped. Otherwise the L2 destination address for `to` is
+// awaited, and when that completes with r == 0 the packet is appended to
+// _packetq as a TCP l4packet. A non-zero r drops the packet.
+// NOTE(review): `from` is not referenced anywhere in the body.
+template <typename InetTraits>
+void tcp<InetTraits>::send_packet_without_tcb(ipaddr from, ipaddr to, Packet p) {
+ if (_queue_space.get_or_fail(p.len())) { // drop packets that do not fit the queue
+ _inet.wait_l2_dst_address(to, std::move(p), [this, to] (const ethernet_address &e_dst, Packet p, int r) mutable {
+ if (r == 0)
+ _packetq.emplace_back(ipv4_traits::l4packet{to, std::move(p), e_dst, ip_protocol_num::tcp});
+ });
+ }
+}
+
+// Detach this connection from its tcb (if any) and close both directions.
+// The tcb's back-pointer is cleared before the close calls are made.
+template <typename InetTraits>
+tcp<InetTraits>::connection::~connection() {
+  if (!_tcb) {
+    return;
+  }
+  _tcb->_conn = nullptr;
+  close_read();
+  close_write();
+}
+
+// Build a tcb for connection `id` owned by protocol instance `t`.
+//
+// Copies the four connection-id fields, shares the owner's event manager and
+// event center, obtains a fresh userspace event fd from the manager, and
+// heap-allocates the four callback objects (delayed ACK, retransmit, persist,
+// all-data-acked), each bound to this tcb. The destructor deletes them.
+template <typename InetTraits>
+tcp<InetTraits>::tcb::tcb(tcp& t, connid id)
+ : _tcp(t), manager(t.manager), _local_ip(id.local_ip) , _foreign_ip(id.foreign_ip),
+ _local_port(id.local_port), _foreign_port(id.foreign_port),
+ _snd(_tcp.cct),
+ center(t.center),
+ fd(t.manager.get_eventfd()),
+ delayed_ack_event(new tcp<InetTraits>::C_handle_delayed_ack(this)),
+ retransmit_event(new tcp<InetTraits>::C_handle_retransmit(this)),
+ persist_event(new tcp<InetTraits>::C_handle_persist(this)),
+ all_data_ack_event(new tcp<InetTraits>::C_all_data_acked(this)) {}
+
+// Tear down the tcb: cancel any timer that is still registered with the
+// event center, delete the callback objects allocated by the constructor,
+// and return the userspace event fd to the manager.
+template <typename InetTraits>
+tcp<InetTraits>::tcb::~tcb()
+{
+ // Timers are cancelled before their callback objects are deleted below.
+ if (_delayed_ack_fd)
+ center->delete_time_event(*_delayed_ack_fd);
+ if (retransmit_fd)
+ center->delete_time_event(*retransmit_fd);
+ if (persist_fd)
+ center->delete_time_event(*persist_fd);
+ delete delayed_ack_event;
+ delete retransmit_event;
+ delete persist_event;
+ delete all_data_ack_event;
+ manager.close(fd);
+ fd = -1;
+}
+
+// Convenience forwarder: ask the owning tcp instance to answer header `rth`
+// with a reset, using this tcb's local and foreign IP addresses.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::respond_with_reset(tcp_hdr* rth)
+{
+ _tcp.respond_with_reset(rth, _local_ip, _foreign_ip);
+}
+
+// Process an acknowledgment up to sequence number `seg_ack`.
+//
+// Every queued segment that is covered completely is removed from _snd.data;
+// for each one the RTO is updated (first transmissions only), the congestion
+// window is grown, the segment's data_len is returned to the user queue
+// space and the fd is notified as writable. If `seg_ack` then still lies
+// beyond _snd.unacknowledged, the front segment (if any) is trimmed by the
+// partially acknowledged amount.
+//
+// Returns the total number of bytes acknowledged by this call.
+template <typename InetTraits>
+uint32_t tcp<InetTraits>::tcb::data_segment_acked(tcp_sequence seg_ack) {
+ uint32_t total_acked_bytes = 0;
+ // Full ACK of segment
+ while (!_snd.data.empty()
+ && (_snd.unacknowledged + _snd.data.front().p.len() <= seg_ack)) {
+ auto acked_bytes = _snd.data.front().p.len();
+ _snd.unacknowledged += acked_bytes;
+ // Ignore retransmitted segments when setting the RTO
+ if (_snd.data.front().nr_transmits == 0) {
+ update_rto(_snd.data.front().tx_time);
+ }
+ update_cwnd(acked_bytes);
+ total_acked_bytes += acked_bytes;
+ _snd.user_queue_space.put(_snd.data.front().data_len);
+ manager.notify(fd, EVENT_WRITABLE);
+ _snd.data.pop_front();
+ }
+ // Partial ACK of segment
+ if (_snd.unacknowledged < seg_ack) {
+ auto acked_bytes = seg_ack - _snd.unacknowledged;
+ if (!_snd.data.empty()) {
+ auto& unacked_seg = _snd.data.front();
+ unacked_seg.p.trim_front(acked_bytes);
+ }
+ _snd.unacknowledged = seg_ack;
+ update_cwnd(acked_bytes);
+ total_acked_bytes += acked_bytes;
+ }
+ return total_acked_bytes;
+}
+
+// Decide whether a segment starting at `seg_seq` with `seg_len` payload bytes
+// is acceptable given the current receive window.
+//
+//   SEG.LEN  RCV.WND  acceptable when
+//   =0       =0       SEG.SEQ = RCV.NXT
+//   =0       >0       RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
+//   >0       >0       the first or the last byte of the segment lies in
+//                     [RCV.NXT, RCV.NXT+RCV.WND)
+//   >0       =0       never
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::segment_acceptable(tcp_sequence seg_seq, unsigned seg_len) {
+  const bool empty_segment = (seg_len == 0);
+
+  if (empty_segment && _rcv.window == 0) {
+    return seg_seq == _rcv.next;
+  }
+  if (!(_rcv.window > 0)) {
+    // Payload with a closed window is not acceptable.
+    return false;
+  }
+
+  // True when `s` falls inside [RCV.NXT, RCV.NXT+RCV.WND).
+  auto inside_window = [this](const auto& s) {
+    return (_rcv.next <= s) && (s < _rcv.next + _rcv.window);
+  };
+
+  if (empty_segment) {
+    return inside_window(seg_seq);
+  }
+  return inside_window(seg_seq) || inside_window(seg_seq + seg_len - 1);
+}
+
+// Initialise send/receive parameters from a peer's header `th` and the TCP
+// options found in [opt_start, opt_end).
+//
+// After parsing the options this sets the window scale factors, both MSS
+// values, the initial receive and send windows, the wl1/wl2 bookkeeping
+// values, the initial congestion window (2, 3 or 4 segments depending on the
+// peer's MSS) and the initial slow start threshold.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::init_from_options(tcp_hdr* th, uint8_t* opt_start, uint8_t* opt_end) {
+ // Handle tcp options
+ _option.parse(opt_start, opt_end);
+
+ // Remote receive window scale factor
+ _snd.window_scale = _option._remote_win_scale;
+ // Local receive window scale factor
+ _rcv.window_scale = _option._local_win_scale;
+
+ // Maximum segment size remote can receive
+ _snd.mss = _option._remote_mss;
+ // Maximum segment size local can receive
+ _rcv.mss = _option._local_mss = local_mss();
+
+ // Linux's default window size
+ _rcv.window = 29200 << _rcv.window_scale;
+ _snd.window = th->window << _snd.window_scale;
+
+ // Segment sequence number used for last window update
+ _snd.wl1 = th->seq;
+ // Segment acknowledgment number used for last window update
+ _snd.wl2 = th->ack;
+
+ // Setup initial congestion window: the larger the peer's MSS, the fewer
+ // segments the initial window holds.
+ if (2190 < _snd.mss) {
+ _snd.cwnd = 2 * _snd.mss;
+ } else if (1095 < _snd.mss && _snd.mss <= 2190) {
+ _snd.cwnd = 3 * _snd.mss;
+ } else {
+ _snd.cwnd = 4 * _snd.mss;
+ }
+
+ // Setup initial slow start threshold
+ _snd.ssthresh = th->window << _snd.window_scale;
+}
+
+// Take the next chunk of unsent user data off _snd.unsent.
+//
+// The chunk is limited to min(can_send(), per-packet payload limit), where
+// the payload limit comes from max_packet_len when TSO is available and from
+// min(MTU-derived size, _snd.mss) otherwise. Returns an empty Packet when
+// nothing is queued; _snd.unsent_len is reduced by the returned length.
+template <typename InetTraits>
+Packet tcp<InetTraits>::tcb::get_transmit_packet() {
+ // easy case: empty queue
+ if (_snd.unsent.empty()) {
+ return Packet();
+ }
+ auto can_send = this->can_send();
+ // Max number of TCP payloads we can pass to NIC
+ uint32_t len;
+ if (_tcp.get_hw_features().tx_tso) {
+ // FIXME: Info tap device the size of the split packet
+ len = _tcp.get_hw_features().max_packet_len - tcp_hdr_len_min - InetTraits::ip_hdr_len_min;
+ } else {
+ len = std::min(uint16_t(_tcp.get_hw_features().mtu - tcp_hdr_len_min - InetTraits::ip_hdr_len_min), _snd.mss);
+ }
+ can_send = std::min(can_send, len);
+ // easy case: one small packet
+ if (_snd.unsent.front().len() <= can_send) {
+ auto p = std::move(_snd.unsent.front());
+ _snd.unsent.pop_front();
+ _snd.unsent_len -= p.len();
+ return p;
+ }
+ // moderate case: need to split one packet
+ if (_snd.unsent.front().len() > can_send) {
+ auto p = _snd.unsent.front().share(0, can_send);
+ _snd.unsent.front().trim_front(can_send);
+ _snd.unsent_len -= p.len();
+ return p;
+ }
+ // NOTE(review): the two conditions above (<= and >) cover every value of
+ // the front packet's length, so the code below is never reached as written.
+ // hard case: merge some packets, possibly split last
+ auto p = std::move(_snd.unsent.front());
+ _snd.unsent.pop_front();
+ can_send -= p.len();
+ while (!_snd.unsent.empty()
+ && _snd.unsent.front().len() <= can_send) {
+ can_send -= _snd.unsent.front().len();
+ p.append(std::move(_snd.unsent.front()));
+ _snd.unsent.pop_front();
+ }
+ // FIXME: this would end up calling the packet's "deleter", freeing managed
+ // objects that are still going to be used later
+ // if (!_snd.unsent.empty() && can_send) {
+ // auto& q = _snd.unsent.front();
+ // p.append(q.share(0, can_send));
+ // q.trim_front(can_send);
+ // }
+ _snd.unsent_len -= p.len();
+ return p;
+}
+
+// Build one outgoing segment and hand it to queue_packet().
+//
+// With `data_retransmit` set, the payload is a shared view of the oldest
+// unacknowledged segment and the sequence number is _snd.unacknowledged;
+// otherwise the payload comes from get_transmit_packet() and _snd.next is
+// advanced by its length. The TCP header (flags, seq/ack, window, options)
+// is prepended, the checksum is either prepared for hardware offload or
+// computed in software, and for new data/SYN/FIN the segment is recorded in
+// _snd.data (data only) and the retransmit timer is started if not running.
+// Does nothing when the connection is in CLOSED state.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::output_one(bool data_retransmit) {
+ if (in_state(CLOSED)) {
+ return;
+ }
+
+ Packet p = data_retransmit ? _snd.data.front().p.share() : get_transmit_packet();
+ Packet clone = p.share(); // early clone to prevent share() from calling packet::unuse_internal_data() on header.
+ uint16_t len = p.len();
+ bool syn_on = syn_needs_on();
+ bool ack_on = ack_needs_on();
+
+ auto options_size = _option.get_size(syn_on, ack_on);
+ auto th = p.prepend_header<tcp_hdr>(options_size);
+
+ th->src_port = _local_port;
+ th->dst_port = _foreign_port;
+
+ th->f_syn = syn_on;
+ th->f_ack = ack_on;
+ if (ack_on) {
+ // This segment carries the ACK, so a pending delayed ACK is not needed.
+ clear_delayed_ack();
+ }
+ th->f_urg = false;
+ th->f_psh = false;
+
+ tcp_sequence seq;
+ if (data_retransmit) {
+ seq = _snd.unacknowledged;
+ } else {
+ seq = syn_on ? _snd.initial : _snd.next;
+ _snd.next += len;
+ }
+ th->seq = seq;
+ th->ack = _rcv.next;
+ // data_offset is expressed in 32-bit words.
+ th->data_offset = (sizeof(*th) + options_size) / 4;
+ th->window = _rcv.window >> _rcv.window_scale;
+ th->checksum = 0;
+
+ // FIXME: does the FIN have to fit in the window?
+ bool fin_on = fin_needs_on();
+ th->f_fin = fin_on;
+
+ // Add tcp options
+ _option.fill(th, options_size);
+ *th = th->hton();
+
+ offload_info oi;
+ checksummer csum;
+ uint16_t pseudo_hdr_seg_len = 0;
+
+ oi.tcp_hdr_len = sizeof(tcp_hdr) + options_size;
+
+ if (_tcp.get_hw_features().tx_csum_l4_offload) {
+ oi.needs_csum = true;
+
+ //
+ // tx checksum offloading: both virtio-net's VIRTIO_NET_F_CSUM and dpdk's
+ // PKT_TX_TCP_CKSUM require th->checksum to be initialized to the ones'
+ // complement sum of the pseudo header.
+ //
+ // For TSO the csum should be calculated for a pseudo header with
+ // segment length set to 0. All the rest is the same as for a TCP Tx
+ // CSUM offload case.
+ //
+ if (_tcp.get_hw_features().tx_tso && len > _snd.mss) {
+ oi.tso_seg_size = _snd.mss;
+ } else {
+ pseudo_hdr_seg_len = sizeof(*th) + options_size + len;
+ }
+ } else {
+ pseudo_hdr_seg_len = sizeof(*th) + options_size + len;
+ oi.needs_csum = false;
+ }
+
+ InetTraits::tcp_pseudo_header_checksum(csum, _local_ip, _foreign_ip,
+ pseudo_hdr_seg_len);
+
+ if (_tcp.get_hw_features().tx_csum_l4_offload) {
+ // Offload path: store the complemented pseudo-header sum only.
+ th->checksum = ~csum.get();
+ } else {
+ // Software path: sum the whole packet on top of the pseudo header.
+ csum.sum(p);
+ th->checksum = csum.get();
+ }
+
+ oi.protocol = ip_protocol_num::tcp;
+
+ p.set_offload_info(oi);
+
+ if (!data_retransmit && (len || syn_on || fin_on)) {
+ auto now = clock_type::now();
+ if (len) {
+ // Keep the header-less clone so the payload can be retransmitted.
+ unsigned nr_transmits = 0;
+ _snd.data.emplace_back(unacked_segment{std::move(clone),
+ len, nr_transmits, now});
+ }
+ if (!retransmit_fd) {
+ start_retransmit_timer();
+ }
+ }
+
+ queue_packet(std::move(p));
+}
+
+// True when no segment awaits acknowledgment and both the unsent and the
+// queued byte counters are zero.
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::is_all_data_acked() {
+  return _snd.data.empty() && _snd.unsent_len == 0 && _snd.queued_len == 0;
+}
+
+// Drain everything currently buffered in _rcv.data.
+//
+// Returns an unconstructed Tub when nothing is buffered; otherwise a single
+// Packet made by appending every buffered packet in order. The receive
+// buffer is left empty.
+template <typename InetTraits>
+Tub<Packet> tcp<InetTraits>::tcb::read() {
+  Tub<Packet> merged;
+  if (!_rcv.data.empty()) {
+    merged.construct();
+    for (auto&& segment : _rcv.data) {
+      merged->append(std::move(segment));
+    }
+    _rcv.data.clear();
+  }
+  return merged;
+}
+
+// Queue packet `p` for transmission on this connection.
+//
+// Asserts that the send side has not been closed. Returns -ECONNRESET when
+// the connection is in CLOSED state, otherwise the length of `p`. The caller
+// must have made sure that the user queue has room for `p`: a failed
+// reservation aborts the process.
+template <typename InetTraits>
+int tcp<InetTraits>::tcb::send(Packet p) {
+ // We can not send after the connection is closed
+ ceph_assert(!_snd.closed);
+
+ if (in_state(CLOSED))
+ return -ECONNRESET;
+
+ auto len = p.len();
+ if (!_snd.user_queue_space.get_or_fail(len)) {
+ // note: caller must ensure enough queue space to send
+ ceph_abort();
+ }
+ // TODO: Handle p.len() > max user_queue_space case
+ // NOTE(review): queued_len is incremented and decremented by the same
+ // amount with nothing in between that could observe it, so the pair has no
+ // net effect here.
+ _snd.queued_len += len;
+ _snd.unsent_len += len;
+ _snd.queued_len -= len;
+ _snd.unsent.push_back(std::move(p));
+ if (can_send() > 0) {
+ output();
+ }
+ return len;
+}
+
+// Close the sending side of the connection.
+//
+// No-op when already CLOSED or already closed for sending. Otherwise records
+// -EPIPE in _errno and unregisters the fd's read/write events. If all data
+// has been acknowledged the final cleanup runs immediately; if not, a new
+// event fd is obtained and all_data_ack_event is registered on it for
+// readability.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::close() {
+ if (in_state(CLOSED) || _snd.closed) {
+ return ;
+ }
+ // TODO: We should make this asynchronous
+
+ _errno = -EPIPE;
+ center->delete_file_event(fd, EVENT_READABLE|EVENT_WRITABLE);
+ bool acked = is_all_data_acked();
+ if (!acked) {
+ _snd._all_data_acked_fd = manager.get_eventfd();
+ center->create_file_event(_snd._all_data_acked_fd, EVENT_READABLE, all_data_ack_event);
+ } else {
+ close_final_cleanup();
+ }
+}
+
+// Decide whether a received segment of `seg_len` bytes must be acknowledged
+// right away (true) or may be covered by a delayed ACK (false).
+//
+// An immediate ACK is requested for a segment larger than the local MSS and
+// for every second segment of exactly MSS size; in both cases the full-size
+// counter is reset and any armed delayed-ACK timer is cancelled. In every
+// other case a 200ms delayed-ACK timer is armed unless one is already
+// pending.
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::should_send_ack(uint16_t seg_len) {
+  bool ack_now = false;
+  if (seg_len > _rcv.mss) {
+    // We've received a TSO packet, do ack immediately
+    ack_now = true;
+  } else if (seg_len == _rcv.mss) {
+    // Full sized segment: ack for every second one
+    ack_now = (_nr_full_seg_received++ >= 1);
+  }
+
+  if (ack_now) {
+    _nr_full_seg_received = 0;
+    clear_delayed_ack();
+    return true;
+  }
+
+  if (!_delayed_ack_fd) {
+    // The timer is not armed, schedule a delayed ACK.
+    // The maximum delayed ack timer allowed by RFC1122 is 500ms, most
+    // implementations use 200ms.
+    _delayed_ack_fd.construct(center->create_time_event(200*1000, delayed_ack_event));
+  }
+  // Either the timer was already armed or it has just been armed.
+  return false;
+}
+
+// Cancel the delayed-ACK timer if one is armed and forget its id.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::clear_delayed_ack() {
+  if (!_delayed_ack_fd) {
+    return;
+  }
+  center->delete_time_event(*_delayed_ack_fd);
+  _delayed_ack_fd.destroy();
+}
+
+// Move segments from the out-of-order map into the in-order receive queue
+// wherever they have become contiguous with _rcv.next.
+//
+// Segments that overlap _rcv.next are trimmed at the front and appended to
+// _rcv.data (advancing _rcv.next); segments lying entirely before _rcv.next
+// are discarded; the scan stops at the first segment that starts beyond
+// _rcv.next. Returns true if at least one segment was merged.
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::merge_out_of_order() {
+ bool merged = false;
+ if (_rcv.out_of_order.map.empty()) {
+ return merged;
+ }
+ for (auto it = _rcv.out_of_order.map.begin(); it != _rcv.out_of_order.map.end();) {
+ auto& p = it->second;
+ auto seg_beg = it->first;
+ auto seg_len = p.len();
+ auto seg_end = seg_beg + seg_len;
+ if (seg_beg <= _rcv.next && seg_end > _rcv.next) {
+ // This segment has been received out of order and its previous
+ // segment has been received now
+ auto trim = _rcv.next - seg_beg;
+ if (trim) {
+ p.trim_front(trim);
+ seg_len -= trim;
+ }
+ _rcv.next += seg_len;
+ _rcv.data.push_back(std::move(p));
+ // Since c++11, erase() always returns the value of the following element
+ it = _rcv.out_of_order.map.erase(it);
+ merged = true;
+ } else if (_rcv.next >= seg_end) {
+ // This segment has been received already, drop it
+ it = _rcv.out_of_order.map.erase(it);
+ } else {
+ // seg_beg > _rcv.next, can not merge. Note, seg_beg can grow only,
+ // so we can stop looking here.
+ it++;
+ break;
+ }
+ }
+ return merged;
+}
+
+// Store packet `p`, which starts at sequence number `seg`, in the
+// out-of-order container by delegating to its merge() operation.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::insert_out_of_order(tcp_sequence seg, Packet p) {
+ _rcv.out_of_order.merge(seg, std::move(p));
+}
+
+// Not implemented: calling this terminates the process via abort().
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::trim_receive_data_after_window() {
+ abort();
+}
+
+// Retransmit the oldest unacknowledged segment right away.
+//
+// Does nothing when no segment is awaiting acknowledgment. Otherwise the
+// front segment's transmit counter is bumped, retransmit_one() is invoked
+// and output() is called.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::fast_retransmit() {
+  if (_snd.data.empty()) {
+    return;
+  }
+  _snd.data.front().nr_transmits++;
+  retransmit_one();
+  output();
+}
+
+// Feed one round-trip sample into the retransmission timeout estimator.
+//
+// `tx_time` is the time the acknowledged segment was sent; the sample R is
+// the time elapsed since then, in microseconds. The first sample initialises
+// SRTT and RTTVAR, later samples are blended in with alpha = 1/8 and
+// beta = 1/4. The resulting _rto is clamped to [_rto_min, _rto_max].
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::update_rto(clock_type::time_point tx_time) {
+ // Update RTO according to RFC6298
+ auto R = std::chrono::duration_cast<std::chrono::microseconds>(clock_type::now() - tx_time);
+ if (_snd.first_rto_sample) {
+ _snd.first_rto_sample = false;
+ // RTTVAR <- R/2
+ // SRTT <- R
+ _snd.rttvar = R / 2;
+ _snd.srtt = R;
+ } else {
+ // RTTVAR <- (1 - beta) * RTTVAR + beta * |SRTT - R'|
+ // SRTT <- (1 - alpha) * SRTT + alpha * R'
+ // where alpha = 1/8 and beta = 1/4
+ auto delta = _snd.srtt > R ? (_snd.srtt - R) : (R - _snd.srtt);
+ _snd.rttvar = _snd.rttvar * 3 / 4 + delta / 4;
+ _snd.srtt = _snd.srtt * 7 / 8 + R / 8;
+ }
+ // RTO <- SRTT + max(G, K * RTTVAR)
+ _rto = _snd.srtt + std::max(_rto_clk_granularity, 4 * _snd.rttvar);
+
+ // Keep _rto within [_rto_min, _rto_max]
+ _rto = std::max(_rto, _rto_min);
+ _rto = std::min(_rto, _rto_max);
+}
+
+// Grow the congestion window after `acked_bytes` have been acknowledged.
+//
+// Below ssthresh (slow start) the window grows by min(acked_bytes, MSS);
+// at or above it (congestion avoidance) it grows by MSS*MSS/cwnd, but by at
+// least one byte.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::update_cwnd(uint32_t acked_bytes) {
+  const uint32_t smss = _snd.mss;
+
+  if (_snd.cwnd < _snd.ssthresh) {
+    // In slow start phase
+    _snd.cwnd += std::min(acked_bytes, smss);
+    return;
+  }
+
+  // In congestion avoidance phase
+  const uint32_t min_step = 1;
+  const uint32_t step = smss * smss / _snd.cwnd;
+  _snd.cwnd += std::max(min_step, step);
+}
+
+
+// Release everything the tcb holds for an ending connection.
+//
+// Notifies the fd as readable, marks the send side closed, discards all
+// queued send and receive data (including out-of-order segments), stops the
+// retransmit and delayed-ACK timers, dispatches a C_actual_remove_tcb event
+// for this tcb to the event center and removes the tcb from the owner's
+// connection table.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::cleanup() {
+ manager.notify(fd, EVENT_READABLE);
+ _snd.closed = true;
+ _snd.unsent.clear();
+ _snd.data.clear();
+ _rcv.out_of_order.map.clear();
+ _rcv.data.clear();
+ stop_retransmit_timer();
+ clear_delayed_ack();
+ center->dispatch_event_external(new tcp<InetTraits>::C_actual_remove_tcb(this));
+ remove_from_tcbs();
+}
+
+// Generate the Initial Sequence Number for this connection.
+//
+// Per RFC6528, TCP SHOULD generate its Initial Sequence Numbers with the
+// expression:
+//   ISN = M + F(localip, localport, remoteip, remoteport, secretkey)
+// where M is the 4 microsecond timer. F is implemented here as the first 32
+// bits of an MD5 digest over the connection 4-tuple followed by the secret
+// key.
+template <typename InetTraits>
+tcp_sequence tcp<InetTraits>::tcb::get_isn() {
+  using namespace std::chrono;
+  uint32_t hash[4];
+  hash[0] = _local_ip.ip;
+  hash[1] = _foreign_ip.ip;
+  // Widen before shifting so the port is never shifted as a signed int.
+  hash[2] = (uint32_t(_local_port) << 16) + _foreign_port;
+  hash[3] = _isn_secret.key[15];
+  ceph::crypto::MD5 md5;
+  // The 4-tuple has to be part of the digest input; hashing the secret alone
+  // would give every connection the same offset.
+  md5.Update((const unsigned char*)hash, sizeof(hash));
+  md5.Update((const unsigned char*)_isn_secret.key, sizeof(_isn_secret.key));
+  // The 16-byte digest is written back over the 16-byte hash[] array.
+  md5.Final((unsigned char*)hash);
+  auto seq = hash[0];
+  auto m = duration_cast<microseconds>(clock_type::now().time_since_epoch());
+  seq += m.count() / 4;
+  return make_seq(seq);
+}
+
+// Hand the next queued L4 packet of this connection to the caller.
+//
+// Clears _poll_active, produces a segment via output_one() if the queue is
+// empty, and returns an unconstructed Tub when the connection is CLOSED.
+// Otherwise the front of _packetq is returned (the queue is asserted to be
+// non-empty), and output() is called again if more packets are queued or
+// more data may be sent.
+template <typename InetTraits>
+Tub<typename InetTraits::l4packet> tcp<InetTraits>::tcb::get_packet() {
+ _poll_active = false;
+ if (_packetq.empty()) {
+ output_one();
+ }
+
+ Tub<typename InetTraits::l4packet> p;
+ if (in_state(CLOSED)) {
+ return p;
+ }
+
+ ceph_assert(!_packetq.empty());
+
+ p = std::move(_packetq.front());
+ _packetq.pop_front();
+ if (!_packetq.empty() || (_snd.dupacks < 3 && can_send() > 0)) {
+ // If there are packets to send in the queue or the tcb is allowed to send
+ // more, add the tcb back to the polling set to keep sending. In addition,
+ // dupacks >= 3 is an indication that a segment is lost; stop sending more
+ // in this case.
+ output();
+ }
+ return p;
+}
+
+// Closing the read side is currently a no-op; the readable notification
+// below is intentionally left commented out.
+template <typename InetTraits>
+void tcp<InetTraits>::connection::close_read() {
+ // do nothing
+ // _tcb->manager.notify(_tcb->fd, EVENT_READABLE);
+}
+
+// Close the write side by closing the underlying tcb.
+// NOTE(review): _tcb is dereferenced without a null check here; the
+// destructor only calls this after checking _tcb.
+template <typename InetTraits>
+void tcp<InetTraits>::connection::close_write() {
+ _tcb->close();
+}
+
+// Out-of-class definitions for the static data members of tcb. The constexpr
+// members get a namespace-scope definition here (presumably so they can be
+// odr-used, e.g. bound to the const references taken by std::max/std::min,
+// under pre-C++17 rules); _isn_secret is the shared secret used by get_isn().
+template <typename InetTraits>
+constexpr uint16_t tcp<InetTraits>::tcb::_max_nr_retransmit;
+
+template <typename InetTraits>
+constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_min;
+
+template <typename InetTraits>
+constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_max;
+
+template <typename InetTraits>
+constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_clk_granularity;
+
+template <typename InetTraits>
+typename tcp<InetTraits>::tcb::isn_secret tcp<InetTraits>::tcb::_isn_secret;
+
+
+#endif /* TCP_HH_ */
diff --git a/src/msg/async/dpdk/UserspaceEvent.cc b/src/msg/async/dpdk/UserspaceEvent.cc
new file mode 100644
index 00000000..282dcef1
--- /dev/null
+++ b/src/msg/async/dpdk/UserspaceEvent.cc
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "UserspaceEvent.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdk "
+
+/**
+ * Allocate a userspace event fd.
+ *
+ * A previously released fd from unused_fds is reused when available;
+ * otherwise max_fd is advanced and the fds table is sized to hold it. The
+ * chosen slot must be empty and gets a freshly constructed UserspaceFDImpl.
+ *
+ * @return the allocated fd
+ */
+int UserspaceEventManager::get_eventfd()
+{
+  int fd;
+  if (unused_fds.empty()) {
+    fd = ++max_fd;
+    fds.resize(fd + 1);
+  } else {
+    fd = unused_fds.front();
+    unused_fds.pop_front();
+  }
+
+  Tub<UserspaceFDImpl> &slot = fds[fd];
+  ceph_assert(!slot);
+  slot.construct();
+  ldout(cct, 20) << __func__ << " fd=" << fd << dendl;
+  return fd;
+}
+
+/**
+ * Record that the events in `mask` have become active on `fd`.
+ *
+ * The bits are always added to the fd's activating_mask. If the fd is not
+ * already in the waiting list and it is listening for any of these events,
+ * it is appended to waiting_fds so that poll() reports it.
+ *
+ * @param fd   fd previously returned by get_eventfd()
+ * @param mask event bits that fired
+ * @return 0 on success, -ENOENT if `fd` is out of range or not open
+ */
+int UserspaceEventManager::notify(int fd, int mask)
+{
+  ldout(cct, 20) << __func__ << " fd=" << fd << " mask=" << mask << dendl;
+  if ((size_t)fd >= fds.size())
+    return -ENOENT;
+
+  Tub<UserspaceFDImpl> &impl = fds[fd];
+  if (!impl)
+    return -ENOENT;
+
+  ldout(cct, 20) << __func__ << " activing=" << int(impl->activating_mask)
+                 << " listening=" << int(impl->listening_mask)
+                 << " waiting_idx=" << int(impl->waiting_idx) << dendl;
+
+  impl->activating_mask |= mask;
+  if (impl->waiting_idx)
+    return 0;
+
+  if (impl->listening_mask & mask) {
+    // The entry is stored at index max_wait_idx + 1 (slot 0 is reserved), so
+    // it is that index, not max_wait_idx, that must fit inside the vector.
+    if (waiting_fds.size() <= (size_t)max_wait_idx + 1)
+      waiting_fds.resize(waiting_fds.size()*2);
+    impl->waiting_idx = ++max_wait_idx;
+    waiting_fds[max_wait_idx] = fd;
+  }
+
+  ldout(cct, 20) << __func__ << " activing=" << int(impl->activating_mask)
+                 << " listening=" << int(impl->listening_mask)
+                 << " waiting_idx=" << int(impl->waiting_idx) << " done " << dendl;
+  return 0;
+}
+
+/**
+ * Release `fd`.
+ *
+ * Unknown or already closed fds are ignored. The fd number is either given
+ * back by lowering max_fd (when it is the highest one) or remembered in
+ * unused_fds for reuse. If the fd has active events, its waiting_fds slot is
+ * invalidated (and max_wait_idx lowered when it occupied the last slot)
+ * before the per-fd state is destroyed.
+ */
+void UserspaceEventManager::close(int fd)
+{
+  ldout(cct, 20) << __func__ << " fd=" << fd << dendl;
+  if ((size_t)fd >= fds.size())
+    return ;
+
+  Tub<UserspaceFDImpl> &state = fds[fd];
+  if (!state)
+    return ;
+
+  if (fd != max_fd) {
+    unused_fds.push_back(fd);
+  } else {
+    --max_fd;
+  }
+
+  if (state->activating_mask) {
+    const bool occupies_last_slot = (waiting_fds[max_wait_idx] == fd);
+    if (occupies_last_slot) {
+      ceph_assert(state->waiting_idx == max_wait_idx);
+      --max_wait_idx;
+    }
+    waiting_fds[state->waiting_idx] = -1;
+  }
+  state.destroy();
+}
+
+/**
+ * Collect up to `num_events` ready fds from the waiting list.
+ *
+ * For every reported fd, events[k] receives the fd and masks[k] the events
+ * that are both listened for and active; those bits are cleared from the
+ * fd's activating_mask and the fd leaves the waiting list. Entries that were
+ * not consumed are compacted to the front of waiting_fds and their
+ * waiting_idx is updated to the new position.
+ *
+ * @param events     output array with room for at least num_events fds
+ * @param masks      output array with room for at least num_events masks
+ * @param num_events capacity of the output arrays; must be non-zero
+ * @param tp         not used by this implementation
+ * @return number of entries written to events/masks
+ */
+int UserspaceEventManager::poll(int *events, int *masks, int num_events, struct timeval *tp)
+{
+  int fd;
+  uint32_t i = 0;
+  int count = 0;
+  ceph_assert(num_events);
+  // leave zero slot for waiting_fds
+  while (i < max_wait_idx) {
+    fd = waiting_fds[++i];
+    if (fd == -1)
+      continue;
+
+    events[count] = fd;
+    Tub<UserspaceFDImpl> &impl = fds[fd];
+    ceph_assert(impl);
+    masks[count] = impl->listening_mask & impl->activating_mask;
+    ceph_assert(masks[count]);
+    ldout(cct, 20) << __func__ << " fd=" << fd << " mask=" << masks[count] << dendl;
+    impl->activating_mask &= (~masks[count]);
+    impl->waiting_idx = 0;
+    if (++count >= num_events)
+      break;
+  }
+
+  const uint32_t remaining = max_wait_idx - i;
+  if (remaining) {
+    memmove(&waiting_fds[1], &waiting_fds[i+1], sizeof(int)*remaining);
+    // The surviving entries changed position: refresh their waiting_idx so
+    // that close()/unlisten() keep addressing the right slot.
+    for (uint32_t slot = 1; slot <= remaining; ++slot) {
+      const int moved_fd = waiting_fds[slot];
+      if (moved_fd == -1)
+        continue;
+      Tub<UserspaceFDImpl> &moved = fds[moved_fd];
+      ceph_assert(moved);
+      moved->waiting_idx = slot;
+    }
+  }
+  max_wait_idx = remaining;
+  return count;
+}
diff --git a/src/msg/async/dpdk/UserspaceEvent.h b/src/msg/async/dpdk/UserspaceEvent.h
new file mode 100644
index 00000000..7e89517d
--- /dev/null
+++ b/src/msg/async/dpdk/UserspaceEvent.h
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_USERSPACEEVENT_H
+#define CEPH_USERSPACEEVENT_H
+
+#include <cstddef>
+#include <errno.h>
+#include <string.h>
+
+#include <vector>
+#include <list>
+
+#include "include/ceph_assert.h"
+#include "include/int_types.h"
+#include "common/Tub.h"
+
+class CephContext;
+
+/**
+ * Bookkeeping for event "fds" that exist purely in userspace.
+ *
+ * Each open fd has a UserspaceFDImpl holding the events a consumer listens
+ * for (listening_mask) and the events that have fired (activating_mask).
+ * An fd whose two masks overlap is placed in waiting_fds, from which poll()
+ * reports it. Slot 0 of waiting_fds is never used, so waiting_idx == 0
+ * means "not in the waiting list".
+ */
+class UserspaceEventManager {
+  struct UserspaceFDImpl {
+    uint32_t waiting_idx = 0;    // position in waiting_fds, 0 if not waiting
+    int16_t read_errno = 0;
+    int16_t write_errno = 0;
+    int8_t listening_mask = 0;   // events the consumer asked for
+    int8_t activating_mask = 0;  // events that fired and were not consumed
+    uint32_t magic = 4921;       // sanity marker verified by check()
+  };
+  CephContext *cct;
+  int max_fd = 0;
+  uint32_t max_wait_idx = 0;     // index of the last used waiting_fds slot
+  std::vector<Tub<UserspaceFDImpl> > fds;
+  std::vector<int> waiting_fds;
+  std::list<uint32_t> unused_fds;
+
+ public:
+  explicit UserspaceEventManager(CephContext *c): cct(c) {
+    waiting_fds.resize(1024);
+  }
+
+  int get_eventfd();
+
+  /**
+   * Start listening for the events in `mask` on `fd`. If any listened event
+   * is already active the fd is queued for poll() right away.
+   *
+   * @return 0 on success, -ENOENT if `fd` is out of range or not open
+   */
+  int listen(int fd, int mask) {
+    if ((size_t)fd >= fds.size())
+      return -ENOENT;
+
+    Tub<UserspaceFDImpl> &impl = fds[fd];
+    if (!impl)
+      return -ENOENT;
+
+    impl->listening_mask |= mask;
+    if ((impl->activating_mask & impl->listening_mask) && !impl->waiting_idx) {
+      // The entry is stored at index max_wait_idx + 1, so that index must
+      // fit inside the vector before the store.
+      if (waiting_fds.size() <= (size_t)max_wait_idx + 1)
+        waiting_fds.resize(waiting_fds.size()*2);
+      impl->waiting_idx = ++max_wait_idx;
+      waiting_fds[max_wait_idx] = fd;
+    }
+    return 0;
+  }
+
+  /**
+   * Stop listening for the events in `mask` on `fd`. If no listened event
+   * remains active the fd is taken out of the waiting list.
+   *
+   * @return 0 on success, -ENOENT if `fd` is out of range or not open
+   */
+  int unlisten(int fd, int mask) {
+    if ((size_t)fd >= fds.size())
+      return -ENOENT;
+
+    Tub<UserspaceFDImpl> &impl = fds[fd];
+    if (!impl)
+      return -ENOENT;
+
+    impl->listening_mask &= (~mask);
+    if (!(impl->activating_mask & impl->listening_mask) && impl->waiting_idx) {
+      if (waiting_fds[max_wait_idx] == fd) {
+        ceph_assert(impl->waiting_idx == max_wait_idx);
+        --max_wait_idx;
+      }
+      waiting_fds[impl->waiting_idx] = -1;
+      impl->waiting_idx = 0;
+    }
+    return 0;
+  }
+
+  int notify(int fd, int mask);
+  void close(int fd);
+  int poll(int *events, int *masks, int num_events, struct timeval *tp);
+
+  /// Returns false if any open fd's magic marker has been overwritten.
+  bool check() {
+    for (auto &&m : fds) {
+      if (m && m->magic != 4921)
+        return false;
+    }
+    return true;
+  }
+};
+
+#endif //CEPH_USERSPACEEVENT_H
diff --git a/src/msg/async/dpdk/align.h b/src/msg/async/dpdk/align.h
new file mode 100644
index 00000000..3b48f789
--- /dev/null
+++ b/src/msg/async/dpdk/align.h
@@ -0,0 +1,50 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_DPDK_ALIGN_HH_
+#define CEPH_MSG_DPDK_ALIGN_HH_
+
+#include <cstdint>
+#include <cstdlib>
+
+// Round `v` up to the next multiple of `align`.
+// The mask arithmetic assumes `align` is a power of two.
+template <typename T>
+inline constexpr T align_up(T v, T align) {
+  const auto low_bits = align - 1;
+  return (v + low_bits) & ~low_bits;
+}
+
+// Pointer flavour of align_up(): rounds the address held in `v` up to a
+// multiple of `align`. Only byte-sized pointee types are accepted.
+template <typename T>
+inline constexpr T* align_up(T* v, size_t align) {
+  static_assert(sizeof(T) == 1, "align byte pointers only");
+  const auto address = reinterpret_cast<uintptr_t>(v);
+  return reinterpret_cast<T*>(align_up(address, align));
+}
+
+// Round `v` down to a multiple of `align`.
+// The mask arithmetic assumes `align` is a power of two.
+template <typename T>
+inline constexpr T align_down(T v, T align) {
+  const auto low_bits = align - 1;
+  return v & ~low_bits;
+}
+
+// Pointer flavour of align_down(): rounds the address held in `v` down to a
+// multiple of `align`. Only byte-sized pointee types are accepted.
+template <typename T>
+inline constexpr T* align_down(T* v, size_t align) {
+  static_assert(sizeof(T) == 1, "align byte pointers only");
+  const auto address = reinterpret_cast<uintptr_t>(v);
+  return reinterpret_cast<T*>(align_down(address, align));
+}
+
+#endif /* CEPH_MSG_DPDK_ALIGN_HH_ */
diff --git a/src/msg/async/dpdk/array_map.h b/src/msg/async/dpdk/array_map.h
new file mode 100644
index 00000000..40f7728d
--- /dev/null
+++ b/src/msg/async/dpdk/array_map.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_ARRAY_MAP_HH_
+#define CEPH_ARRAY_MAP_HH_
+
+#include <array>
+#include <cstddef>
+#include <initializer_list>
+#include <stdexcept>
+#include <string>
+#include <utility>
+
+// unordered_map implemented as a simple array
+
+// A fixed-capacity map from small integer keys to values, stored as a plain
+// array of Max value-initialized slots: key k lives in slot k.
+//
+// operator[] performs no bounds checking; at() throws std::out_of_range for
+// keys >= Max. The initializer-list constructor validates its keys the same
+// way at() does.
+template <typename Value, size_t Max>
+class array_map {
+  std::array<Value, Max> _a {};
+
+  // Throws std::out_of_range unless `key` addresses a valid slot.
+  static void check_key(size_t key) {
+    if (key >= Max) {
+      throw std::out_of_range(std::to_string(key) + " >= " + std::to_string(Max));
+    }
+  }
+ public:
+  // Fill the listed slots; slots that are not mentioned stay
+  // value-initialized. Throws std::out_of_range for a key >= Max.
+  array_map(std::initializer_list<std::pair<size_t, Value>> i) {
+    for (const auto& kv : i) {
+      check_key(kv.first);
+      _a[kv.first] = kv.second;
+    }
+  }
+  Value& operator[](size_t key) { return _a[key]; }
+  const Value& operator[](size_t key) const { return _a[key]; }
+
+  // Bounds-checked access.
+  Value& at(size_t key) {
+    check_key(key);
+    return _a[key];
+  }
+  const Value& at(size_t key) const {
+    check_key(key);
+    return _a[key];
+  }
+};
+
+#endif /* CEPH_ARRAY_MAP_HH_ */
diff --git a/src/msg/async/dpdk/byteorder.h b/src/msg/async/dpdk/byteorder.h
new file mode 100644
index 00000000..a996ec07
--- /dev/null
+++ b/src/msg/async/dpdk/byteorder.h
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_BYTEORDER_H_
+#define CEPH_MSG_BYTEORDER_H_
+
+#include <arpa/inet.h> // for ntohs() and friends
+#include <iosfwd>
+#include <utility>
+
+inline uint64_t ntohq(uint64_t v) {
+ return __builtin_bswap64(v);
+}
+inline uint64_t htonq(uint64_t v) {
+ return __builtin_bswap64(v);
+}
+
+inline void ntoh() {}
+inline void hton() {}
+
+inline uint8_t ntoh(uint8_t x) { return x; }
+inline uint8_t hton(uint8_t x) { return x; }
+inline uint16_t ntoh(uint16_t x) { return ntohs(x); }
+inline uint16_t hton(uint16_t x) { return htons(x); }
+inline uint32_t ntoh(uint32_t x) { return ntohl(x); }
+inline uint32_t hton(uint32_t x) { return htonl(x); }
+inline uint64_t ntoh(uint64_t x) { return ntohq(x); }
+inline uint64_t hton(uint64_t x) { return htonq(x); }
+
+inline int8_t ntoh(int8_t x) { return x; }
+inline int8_t hton(int8_t x) { return x; }
+inline int16_t ntoh(int16_t x) { return ntohs(x); }
+inline int16_t hton(int16_t x) { return htons(x); }
+inline int32_t ntoh(int32_t x) { return ntohl(x); }
+inline int32_t hton(int32_t x) { return htonl(x); }
+inline int64_t ntoh(int64_t x) { return ntohq(x); }
+inline int64_t hton(int64_t x) { return htonq(x); }
+
+#endif /* CEPH_MSG_BYTEORDER_H_ */
diff --git a/src/msg/async/dpdk/capture.h b/src/msg/async/dpdk/capture.h
new file mode 100644
index 00000000..1ace8eeb
--- /dev/null
+++ b/src/msg/async/dpdk/capture.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_DPDK_CAPTURE_H
+#define CEPH_MSG_DPDK_CAPTURE_H
+
+#include <utility>
+
+// Callable that stores a value `x` together with a functor `f` and, when
+// invoked, calls f(x, args...). It lets a callback own a value and receive
+// it as its first argument.
+//
+// Copy construction from a non-const lvalue is explicitly deleted.
+template <typename T, typename F>
+class capture_impl {
+ T x;
+ F f;
+ public:
+ capture_impl(capture_impl &) = delete;
+ capture_impl( T && x, F && f )
+ : x{std::forward<T>(x)}, f{std::forward<F>(f)}
+ {}
+
+ // Invoke the stored functor with the stored value followed by `args`.
+ template <typename ...Ts> auto operator()( Ts&&...args )
+ -> decltype(f( x, std::forward<Ts>(args)... ))
+ {
+ return f( x, std::forward<Ts>(args)... );
+ }
+
+ // Const-qualified flavour of the call operator.
+ template <typename ...Ts> auto operator()( Ts&&...args ) const
+ -> decltype(f( x, std::forward<Ts>(args)... ))
+ {
+ return f( x, std::forward<Ts>(args)... );
+ }
+};
+
+// Factory for capture_impl: bundles `x` and `f` into one callable, deducing
+// both template arguments from the call and forwarding them unchanged.
+template <typename T, typename F>
+capture_impl<T,F> capture( T && x, F && f ) {
+  using bundle_type = capture_impl<T,F>;
+  return bundle_type( std::forward<T>(x), std::forward<F>(f) );
+}
+
+#endif //CEPH_MSG_DPDK_CAPTURE_H
diff --git a/src/msg/async/dpdk/circular_buffer.h b/src/msg/async/dpdk/circular_buffer.h
new file mode 100644
index 00000000..2c92c120
--- /dev/null
+++ b/src/msg/async/dpdk/circular_buffer.h
@@ -0,0 +1,347 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_CIRCULAR_BUFFER_HH_
+#define CEPH_CIRCULAR_BUFFER_HH_
+
+// A growable double-ended queue container that can be efficiently
+// extended (and shrunk) from both ends. Implementation is a single
+// storage vector.
+//
+// Similar to libstdc++'s std::deque, except that it uses a single level
+// store, and so is more efficient for simple stored items.
+// Similar to boost::circular_buffer_space_optimized, except it uses
+// uninitialized storage for unoccupied elements (and thus move/copy
+// constructors instead of move/copy assignments, which are less efficient).
+
+#include <memory>
+#include <algorithm>
+
+#include "transfer.h"
+
+// Double-ended queue backed by one contiguous allocation.
+// `begin` and `end` are free-running counters; the slot for a counter value
+// is found with mask(), i.e. counter & (capacity - 1).
+template <typename T, typename Alloc = std::allocator<T>>
+class circular_buffer {
+ // Inherits from Alloc so the allocator and the bookkeeping live in one object.
+ struct impl : Alloc {
+ T* storage = nullptr;
+ // begin, end interpreted (mod capacity)
+ size_t begin = 0;
+ size_t end = 0;
+ size_t capacity = 0;
+ };
+ impl _impl;
+ public:
+ using value_type = T;
+ using size_type = size_t;
+ using reference = T&;
+ using pointer = T*;
+ using const_reference = const T&;
+ using const_pointer = const T*;
+ public:
+ // Move-constructible only: copy construction and both assignments are deleted.
+ circular_buffer() = default;
+ circular_buffer(circular_buffer&& X);
+ circular_buffer(const circular_buffer& X) = delete;
+ ~circular_buffer();
+ circular_buffer& operator=(const circular_buffer&) = delete;
+ circular_buffer& operator=(circular_buffer&&) = delete;
+ // Insertion at the front; each call may trigger expand().
+ void push_front(const T& data);
+ void push_front(T&& data);
+ template <typename... A>
+ void emplace_front(A&&... args);
+ // Insertion at the back; each call may trigger expand().
+ void push_back(const T& data);
+ void push_back(T&& data);
+ template <typename... A>
+ void emplace_back(A&&... args);
+ // Element access and removal; none of these check for an empty buffer.
+ T& front();
+ T& back();
+ void pop_front();
+ void pop_back();
+ bool empty() const;
+ size_t size() const;
+ size_t capacity() const;
+ // idx is relative to the current front element.
+ T& operator[](size_t idx);
+ // Calls func on every stored element, front to back.
+ template <typename Func>
+ void for_each(Func func);
+ // access an element, may return wrong or destroyed element
+ // only useful if you do not rely on data accuracy (e.g. prefetch)
+ T& access_element_unsafe(size_t idx);
+ private:
+ void expand();
+ void maybe_expand(size_t nr = 1);
+ size_t mask(size_t idx) const;
+
+ // Random-access iterator over the buffer. It stores the owning buffer and a
+ // free-running index; dereferencing maps the index to a slot via mask().
+ // CB is `circular_buffer` or `const circular_buffer`, ValueType is T or const T.
+ template<typename CB, typename ValueType>
+ struct cbiterator : std::iterator<std::random_access_iterator_tag, ValueType> {
+ typedef std::iterator<std::random_access_iterator_tag, ValueType> super_t;
+
+ ValueType& operator*() const { return cb->_impl.storage[cb->mask(idx)]; }
+ ValueType* operator->() const { return &cb->_impl.storage[cb->mask(idx)]; }
+ // prefix
+ cbiterator<CB, ValueType>& operator++() {
+ idx++;
+ return *this;
+ }
+ // postfix
+ cbiterator<CB, ValueType> operator++(int unused) {
+ auto v = *this;
+ idx++;
+ return v;
+ }
+ // prefix
+ cbiterator<CB, ValueType>& operator--() {
+ idx--;
+ return *this;
+ }
+ // postfix
+ cbiterator<CB, ValueType> operator--(int unused) {
+ auto v = *this;
+ idx--;
+ return v;
+ }
+ cbiterator<CB, ValueType> operator+(typename super_t::difference_type n) const {
+ return cbiterator<CB, ValueType>(cb, idx + n);
+ }
+ cbiterator<CB, ValueType> operator-(typename super_t::difference_type n) const {
+ return cbiterator<CB, ValueType>(cb, idx - n);
+ }
+ cbiterator<CB, ValueType>& operator+=(typename super_t::difference_type n) {
+ idx += n;
+ return *this;
+ }
+ cbiterator<CB, ValueType>& operator-=(typename super_t::difference_type n) {
+ idx -= n;
+ return *this;
+ }
+ // All comparisons look at idx only; the owning buffer is not compared.
+ bool operator==(const cbiterator<CB, ValueType>& rhs) const {
+ return idx == rhs.idx;
+ }
+ bool operator!=(const cbiterator<CB, ValueType>& rhs) const {
+ return idx != rhs.idx;
+ }
+ bool operator<(const cbiterator<CB, ValueType>& rhs) const {
+ return idx < rhs.idx;
+ }
+ bool operator>(const cbiterator<CB, ValueType>& rhs) const {
+ return idx > rhs.idx;
+ }
+ bool operator>=(const cbiterator<CB, ValueType>& rhs) const {
+ return idx >= rhs.idx;
+ }
+ bool operator<=(const cbiterator<CB, ValueType>& rhs) const {
+ return idx <= rhs.idx;
+ }
+ // Distance between two iterators, computed from the raw indices.
+ typename super_t::difference_type operator-(const cbiterator<CB, ValueType>& rhs) const {
+ return idx - rhs.idx;
+ }
+ private:
+ CB* cb;
+ size_t idx;
+ // Only circular_buffer (a friend) can create iterators.
+ cbiterator<CB, ValueType>(CB* b, size_t i) : cb(b), idx(i) {}
+ friend class circular_buffer;
+ };
+ // NOTE(review): the `iterator` typedef is declared further down; confirm this
+ // friend declaration names the type that was intended.
+ friend class iterator;
+
+ public:
+ typedef cbiterator<circular_buffer, T> iterator;
+ typedef cbiterator<const circular_buffer, const T> const_iterator;
+
+ // begin()/end() wrap the raw begin/end counters of the buffer.
+ iterator begin() {
+ return iterator(this, _impl.begin);
+ }
+ const_iterator begin() const {
+ return const_iterator(this, _impl.begin);
+ }
+ iterator end() {
+ return iterator(this, _impl.end);
+ }
+ const_iterator end() const {
+ return const_iterator(this, _impl.end);
+ }
+ const_iterator cbegin() const {
+ return const_iterator(this, _impl.begin);
+ }
+ const_iterator cend() const {
+ return const_iterator(this, _impl.end);
+ }
+};
+
+// Maps a free-running index to a storage slot by AND-ing with capacity - 1.
+// expand() grows the capacity by doubling, starting from 1.
+template <typename T, typename Alloc>
+inline size_t circular_buffer<T, Alloc>::mask(size_t idx) const {
+ return idx & (_impl.capacity - 1);
+}
+
+// True when the begin and end counters coincide.
+template <typename T, typename Alloc>
+inline bool circular_buffer<T, Alloc>::empty() const {
+ return _impl.begin == _impl.end;
+}
+
+// Number of stored elements: the difference of the end and begin counters.
+template <typename T, typename Alloc>
+inline size_t circular_buffer<T, Alloc>::size() const {
+ return _impl.end - _impl.begin;
+}
+
+// Number of slots currently allocated.
+template <typename T, typename Alloc>
+inline size_t circular_buffer<T, Alloc>::capacity() const {
+ return _impl.capacity;
+}
+
+// Move constructor: takes over x's allocator, storage pointer and counters,
+// then resets x's state to a default-constructed impl so that x no longer
+// refers to the storage.
+template <typename T, typename Alloc>
+inline circular_buffer<T, Alloc>::circular_buffer(circular_buffer&& x)
+ : _impl(std::move(x._impl)) {
+ x._impl = {};
+}
+
+// Invokes func on each stored element, walking from the front counter to the
+// end counter. The storage pointer and the wrap mask are read once up front.
+template <typename T, typename Alloc>
+template <typename Func>
+inline void circular_buffer<T, Alloc>::for_each(Func func) {
+  T* const slots = _impl.storage;
+  const size_t wrap = _impl.capacity - 1;
+  size_t pos = _impl.begin;
+  while (pos != _impl.end) {
+    func(slots[pos & wrap]);
+    ++pos;
+  }
+}
+
+// Destroys every stored element and releases the storage.
+// Destruction goes through std::allocator_traits: a member destroy() is
+// optional for allocators and std::allocator no longer has one in C++20.
+template <typename T, typename Alloc>
+inline circular_buffer<T, Alloc>::~circular_buffer() {
+  for_each([this] (T& obj) {
+    std::allocator_traits<Alloc>::destroy(_impl, &obj);
+  });
+  _impl.deallocate(_impl.storage, _impl.capacity);
+}
+
+// Doubles the capacity (minimum 1) and relocates all elements into the new
+// storage so that they start at slot 0.
+//
+// Relocation is done in two passes with transfer_pass1()/transfer_pass2()
+// (declared in transfer.h, not shown here). If the first pass throws, the
+// elements already created in the new storage are destroyed, the new storage
+// is released and the exception is rethrown.
+template <typename T, typename Alloc>
+void circular_buffer<T, Alloc>::expand() {
+  auto new_cap = std::max<size_t>(_impl.capacity * 2, 1);
+  auto new_storage = _impl.allocate(new_cap);
+  auto p = new_storage;
+  try {
+    for_each([this, &p] (T& obj) {
+      transfer_pass1(_impl, &obj, p);
+      p++;
+    });
+  } catch (...) {
+    // Roll back through allocator_traits: a member destroy() is optional for
+    // allocators and was removed from std::allocator in C++20.
+    while (p != new_storage) {
+      std::allocator_traits<Alloc>::destroy(_impl, --p);
+    }
+    _impl.deallocate(new_storage, new_cap);
+    throw;
+  }
+  p = new_storage;
+  for_each([this, &p] (T& obj) {
+    transfer_pass2(_impl, &obj, p++);
+  });
+  // After the swaps new_storage/new_cap hold the OLD buffer, which is released below.
+  std::swap(_impl.storage, new_storage);
+  std::swap(_impl.capacity, new_cap);
+  _impl.begin = 0;
+  _impl.end = p - _impl.storage;
+  _impl.deallocate(new_storage, new_cap);
+}
+
+// Makes room for `nr` additional elements.
+// expand() only doubles the capacity, so it is repeated until the request
+// fits. For the default nr == 1 a single doubling is always enough.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::maybe_expand(size_t nr) {
+  while (_impl.end - _impl.begin + nr > _impl.capacity) {
+    expand();
+  }
+}
+
+// Front insertion. All three variants make room first, construct the new
+// element in the slot just before the current front, and only then move the
+// begin counter, so a throwing constructor leaves the buffer unchanged.
+// Construction goes through std::allocator_traits because a member
+// construct() is optional for allocators (std::allocator lost it in C++20).
+
+// Copy `data` to the front.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::push_front(const T& data) {
+  maybe_expand();
+  auto p = &_impl.storage[mask(_impl.begin - 1)];
+  std::allocator_traits<Alloc>::construct(_impl, p, data);
+  --_impl.begin;
+}
+
+// Move `data` to the front.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::push_front(T&& data) {
+  maybe_expand();
+  auto p = &_impl.storage[mask(_impl.begin - 1)];
+  std::allocator_traits<Alloc>::construct(_impl, p, std::move(data));
+  --_impl.begin;
+}
+
+// Construct a new front element in place from `args`.
+template <typename T, typename Alloc>
+template <typename... Args>
+inline void circular_buffer<T, Alloc>::emplace_front(Args&&... args) {
+  maybe_expand();
+  auto p = &_impl.storage[mask(_impl.begin - 1)];
+  std::allocator_traits<Alloc>::construct(_impl, p, std::forward<Args>(args)...);
+  --_impl.begin;
+}
+
+// Back insertion. All three variants make room first, construct the new
+// element in the slot addressed by the end counter, and only then advance
+// that counter, so a throwing constructor leaves the buffer unchanged.
+// Construction goes through std::allocator_traits because a member
+// construct() is optional for allocators (std::allocator lost it in C++20).
+
+// Copy `data` to the back.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::push_back(const T& data) {
+  maybe_expand();
+  auto p = &_impl.storage[mask(_impl.end)];
+  std::allocator_traits<Alloc>::construct(_impl, p, data);
+  ++_impl.end;
+}
+
+// Move `data` to the back.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::push_back(T&& data) {
+  maybe_expand();
+  auto p = &_impl.storage[mask(_impl.end)];
+  std::allocator_traits<Alloc>::construct(_impl, p, std::move(data));
+  ++_impl.end;
+}
+
+// Construct a new back element in place from `args`.
+template <typename T, typename Alloc>
+template <typename... Args>
+inline void circular_buffer<T, Alloc>::emplace_back(Args&&... args) {
+  maybe_expand();
+  auto p = &_impl.storage[mask(_impl.end)];
+  std::allocator_traits<Alloc>::construct(_impl, p, std::forward<Args>(args)...);
+  ++_impl.end;
+}
+
+// Reference to the first element (slot of the begin counter).
+// No emptiness check is performed here.
+template <typename T, typename Alloc>
+inline T& circular_buffer<T, Alloc>::front() {
+ return _impl.storage[mask(_impl.begin)];
+}
+
+// Reference to the last element (slot just before the end counter).
+// No emptiness check is performed here.
+template <typename T, typename Alloc>
+inline T& circular_buffer<T, Alloc>::back() {
+ return _impl.storage[mask(_impl.end - 1)];
+}
+
+// Removal at either end: destroy the element, then move the matching counter.
+// Neither function checks for an empty buffer.
+// Destruction goes through std::allocator_traits because a member destroy()
+// is optional for allocators (std::allocator lost it in C++20).
+
+// Destroy the first element and advance the begin counter.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::pop_front() {
+  std::allocator_traits<Alloc>::destroy(_impl, &front());
+  ++_impl.begin;
+}
+
+// Destroy the last element and step the end counter back.
+template <typename T, typename Alloc>
+inline void circular_buffer<T, Alloc>::pop_back() {
+  std::allocator_traits<Alloc>::destroy(_impl, &back());
+  --_impl.end;
+}
+
+// Element at offset idx from the front. idx is not range-checked.
+template <typename T, typename Alloc>
+inline T& circular_buffer<T, Alloc>::operator[](size_t idx) {
+ return _impl.storage[mask(_impl.begin + idx)];
+}
+
+// Same address computation as operator[]. Per the class declaration this is
+// meant for callers that tolerate a wrong or destroyed element (e.g. prefetch).
+template <typename T, typename Alloc>
+inline T& circular_buffer<T, Alloc>::access_element_unsafe(size_t idx) {
+ return _impl.storage[mask(_impl.begin + idx)];
+}
+
+#endif /* CEPH_CIRCULAR_BUFFER_HH_ */
diff --git a/src/msg/async/dpdk/const.h b/src/msg/async/dpdk/const.h
new file mode 100644
index 00000000..ea5dc49e
--- /dev/null
+++ b/src/msg/async/dpdk/const.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_CONST_H_
+#define CEPH_MSG_CONST_H_
+
+#include <stdint.h>
+
+// Protocol identifiers for the layer above IP, stored in one byte.
+// NOTE(review): presumably the on-wire IP protocol numbers -- confirm.
+enum class ip_protocol_num : uint8_t {
+ icmp = 1, tcp = 6, unused = 255
+};
+
+// Ethernet frame type identifiers, stored in two bytes (host byte order here;
+// eth_hdr::hton() converts the header field for the wire).
+enum class eth_protocol_num : uint16_t {
+ ipv4 = 0x0800, arp = 0x0806, ipv6 = 0x86dd
+};
+
+// Header sizes and limits, in bytes.
+const uint8_t eth_hdr_len = 14;
+const uint8_t tcp_hdr_len_min = 20;
+const uint8_t ipv4_hdr_len_min = 20;
+const uint8_t ipv6_hdr_len_min = 40;
+const uint16_t ip_packet_len_max = 65535;
+
+#endif
diff --git a/src/msg/async/dpdk/dpdk_rte.cc b/src/msg/async/dpdk/dpdk_rte.cc
new file mode 100644
index 00000000..9f9d343b
--- /dev/null
+++ b/src/msg/async/dpdk/dpdk_rte.cc
@@ -0,0 +1,154 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <bitset>
+
+#include <rte_config.h>
+#include <rte_common.h>
+#include <rte_ethdev.h>
+#include <rte_version.h>
+
+#include "DPDK.h"
+#include "dpdk_rte.h"
+
+namespace dpdk {
+
+ // Copies `str` into a char vector and appends a terminating NUL, giving a
+ // mutable C string (rte_eal_init() takes non-const argv entries).
+ static inline std::vector<char> string2vector(std::string str) {
+   std::vector<char> bytes;
+   bytes.reserve(str.size() + 1);
+   bytes.insert(bytes.end(), str.begin(), str.end());
+   bytes.push_back('\0');
+   return bytes;
+ }
+
+ // Definitions of eal's static state.
+ // `initialized` is set by the EAL thread once rte_eal_init() has succeeded.
+ bool eal::initialized = false;
+ // The thread started by eal::init().
+ std::thread eal::t;
+ // `lock` and `cond` guard and signal `funcs`, the queue of callables that
+ // the EAL thread runs on behalf of execute_on_master().
+ std::mutex eal::lock;
+ std::condition_variable eal::cond;
+ std::list<std::function<void()>> eal::funcs;
+
+ // Number of set bits in `n`.
+ static int bitcount(unsigned long long n)
+ {
+   // One bitset position per binary digit of the argument type.
+   constexpr std::size_t width = CHAR_BIT * sizeof(unsigned long long);
+   const std::bitset<width> bits(n);
+   return bits.count();
+ }
+
+ /**
+  * Start the DPDK EAL on a dedicated, detached thread and wait for the
+  * outcome of rte_eal_init().
+  *
+  * After a successful initialisation the thread stays alive and runs the
+  * callables queued in `funcs` (see execute_on_master()).
+  *
+  * @param c  context whose configuration supplies the core mask, memory
+  *           channel count, hugepage directory, PMD setting and entity name
+  * @return 1 if the EAL was already initialised, 0 on success, or the
+  *         negative value returned by rte_eal_init() on failure
+  */
+ int eal::init(CephContext *c)
+ {
+   if (initialized) {
+     return 1;
+   }
+
+   bool done = false;
+   // Outcome of rte_eal_init(); written by the EAL thread while holding `lock`.
+   int init_ret = 0;
+   auto num = std::stoull(c->_conf.get_val<std::string>("ms_dpdk_coremask"),
+                          nullptr, 16);
+   unsigned int coremaskbit = bitcount(num);
+
+   ceph_assert(coremaskbit > c->_conf->ms_async_op_threads);
+
+   // The lambda captures locals of this function by reference. That is safe
+   // only because this function does not return before `done` is set, and the
+   // thread does not touch the captured locals after setting it.
+   t = std::thread([&]() {
+     // TODO: Inherit these from the app parameters - "opts"
+     std::vector<std::vector<char>> args {
+       string2vector("ceph"),
+       string2vector("-c"), string2vector(c->_conf.get_val<std::string>("ms_dpdk_coremask")),
+       string2vector("-n"), string2vector(c->_conf->ms_dpdk_memory_channel),
+     };
+
+     Tub<std::string> hugepages_path;
+     if (!c->_conf->ms_dpdk_hugepages.empty()) {
+       hugepages_path.construct(c->_conf->ms_dpdk_hugepages);
+     }
+
+     // If a hugepages directory is configured, pass it to DPDK together with
+     // an explicit memory size. Otherwise, if PMD mode is requested, run
+     // without hugepages.
+     if (hugepages_path) {
+       args.push_back(string2vector("--huge-dir"));
+       args.push_back(string2vector(*hugepages_path));
+
+       //
+       // We don't know what our networking configuration is going to be, so
+       // we assume a queue per CPU for the fixed mask below. mem_size() adds
+       // a further 64MB for "other stuff".
+       //
+       const unsigned int assumed_coremask = 0xfffefffe;
+       size_t size_MB = mem_size(bitcount(assumed_coremask)) >> 20;
+
+       args.push_back(string2vector("-m"));
+       args.push_back(string2vector(std::to_string(size_MB)));
+     } else if (!c->_conf->ms_dpdk_pmd.empty()) {
+       args.push_back(string2vector("--no-huge"));
+     }
+
+     std::string rte_file_prefix;
+     rte_file_prefix = "rte_";
+     rte_file_prefix += c->_conf->name.to_str();
+     args.push_back(string2vector("--file-prefix"));
+     args.push_back(string2vector(rte_file_prefix));
+
+     std::vector<char*> cargs;
+
+     for (auto&& a: args) {
+       cargs.push_back(a.data());
+     }
+     /* initialise the EAL for all */
+     int ret = rte_eal_init(cargs.size(), cargs.data());
+
+     std::unique_lock<std::mutex> l(lock);
+     if (ret < 0) {
+       // Report the failure and wake the caller. Without this the caller
+       // would wait on `done` forever.
+       init_ret = ret;
+       done = true;
+       cond.notify_all();
+       return;
+     }
+     initialized = true;
+     done = true;
+     cond.notify_all();
+     // Serve queued callables for the rest of the process lifetime.
+     while (true) {
+       if (!funcs.empty()) {
+         auto f = std::move(funcs.front());
+         funcs.pop_front();
+         f();
+         cond.notify_all();
+       } else {
+         cond.wait(l);
+       }
+     }
+   });
+   t.detach();
+   std::unique_lock<std::mutex> l(lock);
+   while (!done)
+     cond.wait(l);
+   return init_ret;
+ }
+
+ // Memory, in bytes, to request from DPDK for `num_cpus` CPUs.
+ size_t eal::mem_size(int num_cpus)
+ {
+   // PMD mempool memory: the networking configuration is not known yet, so
+   // one queue per CPU is assumed.
+   const size_t pmd_mempools = num_cpus * qp_mempool_obj_size();
+
+   // Fixed 64MB allowance for "other stuff" inside DPDK.
+   const size_t other_stuff = 64UL << 20;
+
+   return pmd_mempools + other_stuff;
+ }
+
+} // namespace dpdk
diff --git a/src/msg/async/dpdk/dpdk_rte.h b/src/msg/async/dpdk/dpdk_rte.h
new file mode 100644
index 00000000..4aa83899
--- /dev/null
+++ b/src/msg/async/dpdk/dpdk_rte.h
@@ -0,0 +1,74 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef CEPH_DPDK_RTE_H_
+#define CEPH_DPDK_RTE_H_
+
+
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+
+#include <bitset>
+#include <rte_config.h>
+#include <rte_version.h>
+#include <boost/program_options.hpp>
+
+/*********************** Compat section ***************************************/
+// We currently support only versions 2.0 and above.
+#if (RTE_VERSION < RTE_VERSION_NUM(2,0,0,0))
+#error "DPDK version above 2.0.0 is required"
+#endif
+
+#if defined(RTE_MBUF_REFCNT_ATOMIC)
+#warning "CONFIG_RTE_MBUF_REFCNT_ATOMIC should be disabled in DPDK's " \
+ "config/common_linuxapp"
+#endif
+/******************************************************************************/
+
+namespace dpdk {
+
+// DPDK Environment Abstraction Layer
+// All state is static: there is one EAL per process. init() starts a thread
+// that initialises DPDK and afterwards runs the callables queued in `funcs`.
+class eal {
+ public:
+ using cpuset = std::bitset<RTE_MAX_LCORE>;
+
+ // `lock` protects `funcs`; `cond` is signalled both when work is queued
+ // and when a queued callable has finished.
+ static std::mutex lock;
+ static std::condition_variable cond;
+ static std::list<std::function<void()>> funcs;
+ static int init(CephContext *c);
+ // Queue `f` for the EAL thread and block until it has run.
+ // `done` and `f` are captured by reference; that is safe because this
+ // function does not return until the queued wrapper has set `done`.
+ static void execute_on_master(std::function<void()> &&f) {
+ bool done = false;
+ std::unique_lock<std::mutex> l(lock);
+ funcs.emplace_back([&]() { f(); done = true; });
+ cond.notify_all();
+ while (!done)
+ cond.wait(l);
+ }
+ /**
+ * Returns the amount of memory needed for DPDK
+ * @param num_cpus Number of CPUs the application is going to use
+ *
+ * @return number of bytes
+ */
+ static size_t mem_size(int num_cpus);
+ // Set once rte_eal_init() has succeeded.
+ static bool initialized;
+ // The thread started by init().
+ static std::thread t;
+};
+
+} // namespace dpdk
+#endif // CEPH_DPDK_RTE_H_
diff --git a/src/msg/async/dpdk/ethernet.cc b/src/msg/async/dpdk/ethernet.cc
new file mode 100644
index 00000000..9aca5078
--- /dev/null
+++ b/src/msg/async/dpdk/ethernet.cc
@@ -0,0 +1,16 @@
+#include <iomanip>
+
+#include "ethernet.h"
+
+// Prints a MAC address as six colon-separated, zero-padded, two-digit hex
+// octets (e.g. "00:1b:21:0a:ff:03").
+//
+// std::setw() applies to the next insertion only, so it is repeated for every
+// octet. The stream's format flags and fill character are restored before
+// returning, so the caller's numeric formatting is not affected.
+std::ostream& operator<<(std::ostream& os, const ethernet_address& ea) {
+  const std::ios_base::fmtflags saved_flags = os.flags();
+  const char saved_fill = os.fill('0');
+  os << std::hex;
+  for (size_t i = 0; i < ea.mac.size(); ++i) {
+    if (i != 0) {
+      os << ':';
+    }
+    // Widen to an integer so the octet prints as a number, not as a char.
+    os << std::setw(2) << uint32_t(ea.mac[i]);
+  }
+  os.fill(saved_fill);
+  os.flags(saved_flags);
+  return os;
+}
diff --git a/src/msg/async/dpdk/ethernet.h b/src/msg/async/dpdk/ethernet.h
new file mode 100644
index 00000000..b007425f
--- /dev/null
+++ b/src/msg/async/dpdk/ethernet.h
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_ETHERNET_H_
+#define CEPH_MSG_ETHERNET_H_
+
+#include <array>
+#include <sstream>
+
+#include "include/ceph_assert.h"
+#include "byteorder.h"
+
+// A 6-byte Ethernet (MAC) address. The struct is packed so it can be laid
+// directly over the address fields of a frame header (see eth_hdr).
+struct ethernet_address {
+  // Default-constructed addresses are all zero rather than indeterminate.
+  ethernet_address() : mac{} {}
+
+  // Copies 6 bytes starting at `eaddr`; the caller must provide at least 6.
+  ethernet_address(const uint8_t *eaddr) {
+    std::copy(eaddr, eaddr + 6, mac.begin());
+  }
+
+  // Builds an address from exactly 6 listed bytes (asserted).
+  ethernet_address(std::initializer_list<uint8_t> eaddr) {
+    ceph_assert(eaddr.size() == mac.size());
+    std::copy(eaddr.begin(), eaddr.end(), mac.begin());
+  }
+
+  // A MAC address is a byte sequence, so both conversions return an
+  // unchanged copy. They are const so they can be used on const headers.
+  ethernet_address ntoh() const {
+    return *this;
+  }
+  ethernet_address hton() const {
+    return *this;
+  }
+  std::array<uint8_t, 6> mac;
+} __attribute__((packed));
+
+// Two addresses are equal when all six bytes match.
+inline bool operator==(const ethernet_address& a, const ethernet_address& b) {
+  const auto& lhs_bytes = a.mac;
+  const auto& rhs_bytes = b.mac;
+  return lhs_bytes == rhs_bytes;
+}
+std::ostream& operator<<(std::ostream& os, const ethernet_address& ea);
+
+// Link-layer traits for Ethernet: the address type, the all-ones broadcast
+// address, and the hardware type value used for ARP.
+struct ethernet {
+ using address = ethernet_address;
+ static address broadcast_address() {
+ return {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ }
+ static constexpr uint16_t arp_hardware_type() { return 1; }
+};
+
+// Ethernet frame header: destination MAC, source MAC and the protocol field.
+// Packed so it can be overlaid on packet data.
+struct eth_hdr {
+  ethernet_address dst_mac;
+  ethernet_address src_mac;
+  uint16_t eth_proto;
+  // Copy of this header with eth_proto converted to network byte order.
+  // const: conversion does not modify the header it is called on.
+  eth_hdr hton() const {
+    eth_hdr hdr = *this;
+    hdr.eth_proto = ::hton(eth_proto);
+    return hdr;
+  }
+  // Copy of this header with eth_proto converted to host byte order.
+  eth_hdr ntoh() const {
+    eth_hdr hdr = *this;
+    hdr.eth_proto = ::ntoh(eth_proto);
+    return hdr;
+  }
+} __attribute__((packed));
+
+ethernet_address parse_ethernet_address(std::string addr);
+
+#endif /* CEPH_MSG_ETHERNET_H_ */
diff --git a/src/msg/async/dpdk/ip_types.h b/src/msg/async/dpdk/ip_types.h
new file mode 100644
index 00000000..356d8fd6
--- /dev/null
+++ b/src/msg/async/dpdk/ip_types.h
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_IP_TYPES_H_H
+#define CEPH_IP_TYPES_H_H
+
+#include <boost/asio/ip/address_v4.hpp>
+#include <string>
+
+class Packet;
+class ethernet_address;
+using resolution_cb = std::function<void (const ethernet_address&, Packet, int)>;
+
+// An IPv4 endpoint: address plus port.
+struct ipv4_addr {
+ uint32_t ip;
+ uint16_t port;
+
+ ipv4_addr() : ip(0), port(0) {}
+ ipv4_addr(uint32_t ip, uint16_t port) : ip(ip), port(port) {}
+ // Port only; the address is left as 0.
+ ipv4_addr(uint16_t port) : ip(0), port(port) {}
+ // The two string constructors are defined outside this header.
+ ipv4_addr(const std::string &addr);
+ ipv4_addr(const std::string &addr, uint16_t port);
+
+ // Converts from entity_addr_t: the sockaddr_in address is passed through
+ // ntoh() before being stored, and the port is taken from get_port().
+ ipv4_addr(const entity_addr_t &ad) {
+ ip = ntoh(ad.in4_addr().sin_addr.s_addr);
+ port = ad.get_port();
+ }
+
+ // Delegates to the const-reference overload above.
+ ipv4_addr(entity_addr_t &&addr) : ipv4_addr(addr) {}
+};
+
+// A bare IPv4 address held as a 32-bit integer. Packed so it can be
+// embedded in packed protocol headers.
+struct ipv4_address {
+  ipv4_address() : ip(0) {}
+  explicit ipv4_address(uint32_t ip) : ip(ip) {}
+  // Parses dotted-decimal text via boost::asio and stores the numeric value.
+  explicit ipv4_address(const std::string& addr) {
+    ip = static_cast<uint32_t>(boost::asio::ip::address_v4::from_string(addr).to_ulong());
+  }
+  // Drops the port of an ipv4_addr endpoint.
+  ipv4_address(ipv4_addr addr) {
+    ip = addr.ip;
+  }
+
+  uint32_t ip;
+
+  // Copy with `ip` converted to network byte order.
+  // const: conversion does not modify the address it is called on.
+  ipv4_address hton() const {
+    ipv4_address addr;
+    addr.ip = ::hton(ip);
+    return addr;
+  }
+  // Copy with `ip` converted to host byte order.
+  ipv4_address ntoh() const {
+    ipv4_address addr;
+    addr.ip = ::ntoh(ip);
+    return addr;
+  }
+
+  friend bool operator==(ipv4_address x, ipv4_address y) {
+    return x.ip == y.ip;
+  }
+  friend bool operator!=(ipv4_address x, ipv4_address y) {
+    return x.ip != y.ip;
+  }
+} __attribute__((packed));
+
+// True for the all-zero address.
+static inline bool is_unspecified(ipv4_address addr) { return addr.ip == 0; }
+
+std::ostream& operator<<(std::ostream& os, const ipv4_address& a);
+
+namespace std {
+
+ // Lets ipv4_address be used as a key in unordered containers.
+ // The hash is the 32-bit address value itself.
+ template <>
+ struct hash<ipv4_address> {
+ size_t operator()(ipv4_address a) const { return a.ip; }
+ };
+
+}
+
+#endif //CEPH_IP_TYPES_H_H
diff --git a/src/msg/async/dpdk/net.cc b/src/msg/async/dpdk/net.cc
new file mode 100644
index 00000000..6e361f18
--- /dev/null
+++ b/src/msg/async/dpdk/net.cc
@@ -0,0 +1,205 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ */
+
+#include "net.h"
+#include "DPDK.h"
+#include "DPDKStack.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "net "
+
+// Binds this interface to `dev` for the queue identified by center->get_id().
+//
+// Receive side: subscribes to the device with a callback that hands each
+// packet to dispatch_packet().
+// Transmit side: registers a packet provider on the device queue. Each call
+// polls the registered L3 providers round-robin (continuing after the one
+// polled last) and returns the first packet found with an Ethernet header
+// prepended, or an empty Tub if no provider has anything to send.
+interface::interface(CephContext *cct, std::shared_ptr<DPDKDevice> dev, EventCenter *center)
+ : cct(cct), _dev(dev),
+ _rx(_dev->receive(
+ center->get_id(),
+ [center, this] (Packet p) {
+ return dispatch_packet(center, std::move(p));
+ }
+ )),
+ _hw_address(_dev->hw_address()),
+ _hw_features(_dev->get_hw_features()) {
+ // idx is the round-robin cursor; it lives inside the (mutable) lambda.
+ auto idx = 0u;
+ // NOTE(review): qid is captured below but not used inside the lambda.
+ unsigned qid = center->get_id();
+ dev->queue_for_cpu(center->get_id()).register_packet_provider([this, idx, qid] () mutable {
+ Tub<Packet> p;
+ // At most one full pass over the providers per call.
+ for (size_t i = 0; i < _pkt_providers.size(); i++) {
+ auto l3p = _pkt_providers[idx++]();
+ if (idx == _pkt_providers.size())
+ idx = 0;
+ if (l3p) {
+ auto l3pv = std::move(*l3p);
+ // Fill the header in host order, then convert it in place for the wire.
+ auto eh = l3pv.p.prepend_header<eth_hdr>();
+ eh->dst_mac = l3pv.to;
+ eh->src_mac = _hw_address;
+ eh->eth_proto = uint16_t(l3pv.proto_num);
+ *eh = eh->hton();
+ ldout(this->cct, 10) << "=== tx === proto " << std::hex << uint16_t(l3pv.proto_num)
+ << " " << _hw_address << " -> " << l3pv.to
+ << " length " << std::dec << l3pv.p.len() << dendl;
+ p = std::move(l3pv.p);
+ return p;
+ }
+ }
+ return p;
+ });
+}
+
+// Registers the handler for one Ethernet protocol number.
+// `forward` is stored in a new l3_rx_stream entry keyed by the protocol
+// number; `next` becomes the listener of that entry's packet stream.
+// Registering the same protocol number twice trips the assertion.
+subscription<Packet, ethernet_address> interface::register_l3(
+    eth_protocol_num proto_num,
+    std::function<int (Packet p, ethernet_address from)> next,
+    std::function<bool (forward_hash&, Packet& p, size_t)> forward)
+{
+  auto inserted = _proto_map.emplace(
+      std::piecewise_construct,
+      std::make_tuple(uint16_t(proto_num)),
+      std::forward_as_tuple(std::move(forward)));
+  ceph_assert(inserted.second);
+  l3_rx_stream& stream = inserted.first->second;
+  return stream.packet_stream.listen(std::move(next));
+}
+
+// The three accessors below forward to the underlying DPDKDevice.
+
+// CPU/queue id the device assigns to the given hash value.
+unsigned interface::hash2cpu(uint32_t hash) {
+ return _dev->hash2cpu(hash);
+}
+
+// RSS key of the device (also used for the software hash in dispatch_packet()).
+const rss_key_type& interface::rss_key() const {
+ return _dev->rss_key();
+}
+
+// Number of hardware queues reported by the device.
+uint16_t interface::hw_queues_count() const {
+ return _dev->hw_queues_count();
+}
+
+// One-shot event that delivers a forwarded packet to queue `dst` of the
+// device and then deletes itself. Created by interface::forward().
+class C_handle_l2forward : public EventCallback {
+ std::shared_ptr<DPDKDevice> sdev;
+ // Reference to the in-flight counter owned by the code that created the
+ // event. NOTE(review): interface::forward() binds this to a `static
+ // __thread` counter, while the decrement below runs wherever do_request()
+ // is executed -- confirm this is safe.
+ unsigned &queue_depth;
+ Packet p;
+ unsigned dst;
+
+ public:
+ C_handle_l2forward(std::shared_ptr<DPDKDevice> &p, unsigned &qd, Packet pkt, unsigned target)
+ : sdev(p), queue_depth(qd), p(std::move(pkt)), dst(target) {}
+ // Hands the packet to the device, releases one slot of the counter and
+ // frees this object. `fd` is not used.
+ void do_request(uint64_t fd) {
+ sdev->l2receive(dst, std::move(p));
+ queue_depth--;
+ delete this;
+ }
+};
+
+// Hands packet `p` over to the worker that owns queue `target` by posting a
+// C_handle_l2forward event to that worker's EventCenter.
+// At most 1000 forwards may be outstanding per calling thread; when the limit
+// is reached the packet is dropped without any notification.
+void interface::forward(EventCenter *source, unsigned target, Packet p) {
+ // Per-thread count of posted forwards that have not been processed yet.
+ static __thread unsigned queue_depth;
+
+ if (queue_depth < 1000) {
+ queue_depth++;
+ // FIXME: need ensure this event not be called after EventCenter destruct
+ _dev->workers[target]->center.dispatch_event_external(
+ new C_handle_l2forward(_dev, queue_depth, std::move(p.free_on_cpu(source)), target));
+ }
+}
+
+// Entry point for every received frame on this interface.
+//
+// Looks up the L3 handler registered for the frame's protocol field, decides
+// which queue should process the frame and either forwards it to that queue
+// or, if it belongs here, strips the Ethernet header and feeds it to the
+// handler's packet stream.
+//
+// Returns the result of packet_stream.produce() when the packet is delivered
+// locally, 0 in every other case (no header, unknown protocol, forwarded, or
+// handler not ready).
+int interface::dispatch_packet(EventCenter *center, Packet p) {
+ auto eh = p.get_header<eth_hdr>();
+ if (eh) {
+ auto i = _proto_map.find(ntoh(eh->eth_proto));
+ auto hwrss = p.rss_hash();
+ // The two branches differ only in whether the RSS hash is logged.
+ if (hwrss) {
+ ldout(cct, 10) << __func__ << " === rx === proto " << std::hex << ::ntoh(eh->eth_proto)
+ << " "<< eh->src_mac.ntoh() << " -> " << eh->dst_mac.ntoh()
+ << " length " << std::dec << p.len() << " rss_hash " << *p.rss_hash() << dendl;
+ } else {
+ ldout(cct, 10) << __func__ << " === rx === proto " << std::hex << ::ntoh(eh->eth_proto)
+ << " "<< eh->src_mac.ntoh() << " -> " << eh->dst_mac.ntoh()
+ << " length " << std::dec << p.len() << dendl;
+ }
+ if (i != _proto_map.end()) {
+ l3_rx_stream& l3 = i->second;
+ // The device picks the destination queue from a hash supplied by this
+ // callback: the packet's RSS hash if it has one, otherwise a Toeplitz hash
+ // over the fields selected by the L3 handler, or 0 if it selects none.
+ auto fw = _dev->forward_dst(center->get_id(), [&p, &l3, this] () {
+ auto hwrss = p.rss_hash();
+ if (hwrss) {
+ return *hwrss;
+ } else {
+ forward_hash data;
+ if (l3.forward(data, p, sizeof(eth_hdr))) {
+ return toeplitz_hash(rss_key(), data);
+ }
+ return 0u;
+ }
+ });
+ if (fw != center->get_id()) {
+ ldout(cct, 1) << __func__ << " forward to " << fw << dendl;
+ forward(center, fw, std::move(p));
+ } else {
+ // Take the source MAC before the header is trimmed off the packet.
+ auto h = eh->ntoh();
+ auto from = h.src_mac;
+ p.trim_front(sizeof(*eh));
+ // avoid chaining, since queue length is unlimited
+ // drop instead.
+ if (l3.ready()) {
+ return l3.packet_stream.produce(std::move(p), from);
+ }
+ }
+ }
+ }
+ return 0;
+}
+
+// One-shot event that passes one (MAC, IPv4) mapping to a worker's
+// arp_learn() and then deletes itself. Created by interface::arp_learn().
+class C_arp_learn : public EventCallback {
+ DPDKWorker *worker;
+ ethernet_address l2_addr;
+ ipv4_address l3_addr;
+
+ public:
+ C_arp_learn(DPDKWorker *w, ethernet_address l2, ipv4_address l3)
+ : worker(w), l2_addr(l2), l3_addr(l3) {}
+ // `id` is not used.
+ void do_request(uint64_t id) {
+ worker->arp_learn(l2_addr, l3_addr);
+ delete this;
+ }
+};
+
+// Broadcasts a learned (MAC, IPv4) mapping to every worker of the device by
+// posting one C_arp_learn event to each worker's EventCenter.
+void interface::arp_learn(ethernet_address l2, ipv4_address l3)
+{
+ for (auto &&w : _dev->workers) {
+ w->center.dispatch_event_external(
+ new C_arp_learn(w, l2, l3));
+ }
+}
+
+// Attaches an L3 protocol to `netif`: remembers the interface and protocol
+// number and registers `func` as a packet provider on the interface.
+l3_protocol::l3_protocol(interface* netif, eth_protocol_num proto_num, packet_provider_type func)
+ : _netif(netif), _proto_num(proto_num) {
+ _netif->register_packet_provider(std::move(func));
+}
+
+// Subscribes this protocol's receive path on the interface.
+// `rx_fn` is called for each delivered packet together with the sender's MAC;
+// `forward` is passed on to interface::register_l3() unchanged.
+subscription<Packet, ethernet_address> l3_protocol::receive(
+    std::function<int (Packet, ethernet_address)> rx_fn,
+    std::function<bool (forward_hash &h, Packet &p, size_t s)> forward) {
+  return _netif->register_l3(_proto_num, std::move(rx_fn), std::move(forward));
+}
diff --git a/src/msg/async/dpdk/net.h b/src/msg/async/dpdk/net.h
new file mode 100644
index 00000000..63f0422b
--- /dev/null
+++ b/src/msg/async/dpdk/net.h
@@ -0,0 +1,138 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_DPDK_NET_H
+#define CEPH_MSG_DPDK_NET_H
+
+#include "const.h"
+#include "ethernet.h"
+#include "Packet.h"
+#include "stream.h"
+#include "toeplitz.h"
+
+struct hw_features { // offload switches (all off by default) and packet size limits
+ // Enable tx ip header checksum offload
+ bool tx_csum_ip_offload = false;
+ // Enable tx l4 (TCP or UDP) checksum offload
+ bool tx_csum_l4_offload = false;
+ // Enable rx checksum offload
+ bool rx_csum_offload = false;
+ // LRO (large receive offload) is enabled
+ bool rx_lro = false;
+ // Enable tx TCP segment offload
+ bool tx_tso = false;
+ // Enable tx UDP fragmentation offload
+ bool tx_ufo = false;
+ // Maximum Transmission Unit
+ uint16_t mtu = 1500;
+ // Maximum packet length when TCP/UDP offload is enabled
+ uint16_t max_packet_len = ip_packet_len_max - eth_hdr_len;
+};
+
+class forward_hash {
+  // Fixed 64-byte scratch buffer for the bytes that get hashed.
+  uint8_t _bytes[64];
+  size_t _used = 0;
+ public:
+  size_t size() const { return _used; }
+  // Append one byte; asserts that the buffer still has room.
+  void push_back(uint8_t b) {
+    ceph_assert(_used < sizeof(_bytes));
+    _bytes[_used++] = b;
+  }
+  // Append a 16-bit value, low byte first.
+  void push_back(uint16_t b) {
+    push_back(static_cast<uint8_t>(b & 0xff));
+    push_back(static_cast<uint8_t>(b >> 8));
+  }
+  // Append a 32-bit value, low 16 bits first.
+  void push_back(uint32_t b) {
+    push_back(static_cast<uint16_t>(b & 0xffff));
+    push_back(static_cast<uint16_t>(b >> 16));
+  }
+  const uint8_t& operator[](size_t idx) const { return _bytes[idx]; }
+};
+
+class interface;
+
+class l3_protocol { // binds one ethernet protocol number to an interface
+ public:
+ struct l3packet { // a packet together with its ethernet protocol number and the MAC in `to`
+ eth_protocol_num proto_num;
+ ethernet_address to;
+ Packet p;
+ };
+ using packet_provider_type = std::function<Tub<l3packet> ()>; // callable that yields a Tub-wrapped l3packet
+ // NOTE(review): presumably the Tub is empty when the provider has nothing to send -- confirm.
+ private:
+ interface* _netif; // not owned; set by the constructor
+ eth_protocol_num _proto_num; // protocol number passed to interface::register_l3()
+ // The constructor registers func with the interface as a packet provider.
+ public:
+ explicit l3_protocol(interface* netif, eth_protocol_num proto_num, packet_provider_type func);
+ subscription<Packet, ethernet_address> receive(
+ std::function<int (Packet, ethernet_address)> rx_fn,
+ std::function<bool (forward_hash &h, Packet &p, size_t s)> forward);
+ // receive() forwards both callbacks to interface::register_l3() for _proto_num.
+ private:
+ friend class interface;
+};
+
+class DPDKDevice;
+struct ipv4_address;
+
+class interface {
+  CephContext *cct;
+  struct l3_rx_stream {  // per-protocol receive stream plus its forwarding-hash callback
+    stream<Packet, ethernet_address> packet_stream;
+    std::function<bool (forward_hash&, Packet&, size_t)> forward;
+    bool ready() { return packet_stream.started(); }
+    explicit l3_rx_stream(std::function<bool (forward_hash&, Packet&, size_t)>&& fw) : forward(std::move(fw)) {}  // move the rvalue argument, do not copy it
+  };
+  std::unordered_map<uint16_t, l3_rx_stream> _proto_map;  // looked up by dispatch_packet()
+  std::shared_ptr<DPDKDevice> _dev;
+  subscription<Packet> _rx;
+  ethernet_address _hw_address;
+  struct hw_features _hw_features;
+  std::vector<l3_protocol::packet_provider_type> _pkt_providers;  // filled by register_packet_provider()
+  // dispatch_packet() handles one received packet: deliver locally or forward to another worker.
+ private:
+  int dispatch_packet(EventCenter *c, Packet p);
+ public:
+  explicit interface(CephContext *cct, std::shared_ptr<DPDKDevice> dev, EventCenter *center);
+  ethernet_address hw_address() { return _hw_address; }
+  const struct hw_features& get_hw_features() const { return _hw_features; }
+  subscription<Packet, ethernet_address> register_l3(
+      eth_protocol_num proto_num,
+      std::function<int (Packet, ethernet_address)> next,
+      std::function<bool (forward_hash&, Packet&, size_t)> forward);
+  void forward(EventCenter *source, unsigned target, Packet p);
+  unsigned hash2cpu(uint32_t hash);
+  void register_packet_provider(l3_protocol::packet_provider_type func) {
+    _pkt_providers.push_back(std::move(func));
+  }
+  const rss_key_type& rss_key() const;
+  uint16_t hw_queues_count() const;
+  void arp_learn(ethernet_address l2, ipv4_address l3);  // queues the mapping on every worker of _dev
+  friend class l3_protocol;
+};
+
+#endif //CEPH_MSG_DPDK_NET_H
diff --git a/src/msg/async/dpdk/queue.h b/src/msg/async/dpdk/queue.h
new file mode 100644
index 00000000..984ddca1
--- /dev/null
+++ b/src/msg/async/dpdk/queue.h
@@ -0,0 +1,96 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_DPDK_QUEUE_H_
+#define CEPH_MSG_DPDK_QUEUE_H_
+
+#include <queue>
+
+#include "circular_buffer.h"
+
+template <typename T>
+class queue {
+ std::queue<T, circular_buffer<T>> _q; // FIFO storage backed by circular_buffer
+ size_t _max; // capacity limit enforced by push()
+
+ public:
+ explicit queue(size_t size): _max(size) {}
+
+ // Push an item.
+ //
+ // Returns false if the queue was full and the item was not pushed.
+ bool push(T&& a);
+
+ // Pops and returns the front item; the queue must not be empty (not checked).
+ T pop();
+
+ // Consumes items from the queue, passing them to @func, until @func
+ // returns false or the queue is empty
+ // NOTE(review): no definition of consume() appears in this header.
+ // Returns false if func returned false.
+ template <typename Func>
+ bool consume(Func&& func);
+
+ // Returns true when the queue is empty.
+ bool empty() const;
+
+ // Returns true when the queue holds exactly _max items.
+ bool full() const;
+
+ size_t size() const { return _q.size(); }
+
+ // Destroy any items in the queue
+ void clear() {
+ while (!_q.empty()) {
+ _q.pop();
+ }
+ }
+};
+
+template <typename T>
+inline bool queue<T>::push(T&& data) {
+  // Reject the item (leaving @data untouched) once _max items are queued.
+  if (_q.size() >= _max) {
+    return false;
+  }
+  // No notification step: this header declares nothing to notify.
+  _q.push(std::move(data));
+  return true;
+}
+
+template <typename T>
+inline T queue<T>::pop() {
+ T data = std::move(_q.front()); // move the front item out before removing its slot
+ _q.pop();
+ return data;
+}
+// True when the underlying container reports no items.
+template <typename T>
+inline bool queue<T>::empty() const {
+ return _q.empty();
+}
+// True when the item count equals the capacity given to the constructor.
+template <typename T>
+inline bool queue<T>::full() const {
+ return _q.size() == _max;
+}
+
+#endif /* CEPH_MSG_DPDK_QUEUE_H_ */
diff --git a/src/msg/async/dpdk/shared_ptr.h b/src/msg/async/dpdk/shared_ptr.h
new file mode 100644
index 00000000..d078063b
--- /dev/null
+++ b/src/msg/async/dpdk/shared_ptr.h
@@ -0,0 +1,391 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:4; indent-tabs-mode:nil -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_LW_SHARED_PTR_H_
+#define CEPH_LW_SHARED_PTR_H_
+
+#include <utility>
+#include <type_traits>
+#include <functional>
+#include <iostream>
+
+// This header defines a shared pointer facility, lw_shared_ptr<>,
+// modeled after std::shared_ptr<>.
+//
+// Unlike std::shared_ptr<>, this implementation is not thread
+// safe, and two pointers sharing the same object must not be used in
+// different threads.
+//
+// lw_shared_ptr<> is the more lightweight variant, with a lw_shared_ptr<>
+// occupying just one machine word, and adding just one word to the shared
+// object. However, it does not support polymorphism.
+//
+// It supports shared_from_this() via the CRTP base class
+// enable_lw_shared_from_this<>.
+//
+
+template <typename T>
+class lw_shared_ptr;
+
+template <typename T>
+class enable_lw_shared_from_this;
+
+template <typename T>
+class enable_shared_from_this;
+
+template <typename T, typename... A>
+lw_shared_ptr<T> make_lw_shared(A&&... a);
+
+template <typename T>
+lw_shared_ptr<T> make_lw_shared(T&& a);
+
+template <typename T>
+lw_shared_ptr<T> make_lw_shared(T& a);
+
+struct lw_shared_ptr_counter_base { // common base that carries the reference count
+ long _count = 0; // incremented/decremented by lw_shared_ptr; plain (non-atomic) long
+};
+
+
+namespace internal {
+
+template <class T, class U>
+struct lw_shared_ptr_accessors;
+
+template <class T>
+struct lw_shared_ptr_accessors_esft;
+
+template <class T>
+struct lw_shared_ptr_accessors_no_esft;
+
+}
+
+
+// We want to support two use cases for lw_shared_ptr<T>:
+//
+// 1. T is any type (primitive or class type)
+//
+// 2. T is a class type that inherits from enable_lw_shared_from_this<T>.
+//
+// In the first case, we must wrap T in an object containing the counter,
+// since T may be a primitive type and cannot be a base class.
+//
+// In the second case, we want T to reach the counter through its
+// enable_lw_shared_from_this<> base class, so that we can implement
+// shared_from_this().
+//
+// To implement those two conflicting requirements (T alongside its counter;
+// T inherits from an object containing the counter) we use std::conditional<>
+// and some accessor functions to select between two implementations.
+
+
+// CRTP base: derive T from enable_lw_shared_from_this<T> to get shared_from_this().
+template <typename T>
+class enable_lw_shared_from_this : private lw_shared_ptr_counter_base {
+ using ctor = T;
+protected:
+ enable_lw_shared_from_this() noexcept {}
+ enable_lw_shared_from_this(enable_lw_shared_from_this&&) noexcept {} // does not copy the source's _count
+ enable_lw_shared_from_this(const enable_lw_shared_from_this&) noexcept {} // does not copy the source's _count
+ enable_lw_shared_from_this& operator=(const enable_lw_shared_from_this&) noexcept { return *this; } // leaves _count untouched
+ enable_lw_shared_from_this& operator=(enable_lw_shared_from_this&&) noexcept { return *this; } // leaves _count untouched
+public:
+ lw_shared_ptr<T> shared_from_this();
+ lw_shared_ptr<const T> shared_from_this() const;
+ // The friends below need access to the private counter base.
+ template <typename X>
+ friend class lw_shared_ptr;
+ template <typename X>
+ friend class ::internal::lw_shared_ptr_accessors_esft;
+ template <typename X, class Y>
+ friend class ::internal::lw_shared_ptr_accessors;
+};
+
+template <typename T>
+struct shared_ptr_no_esft : private lw_shared_ptr_counter_base { // wrapper storing a T next to its counter
+ T _value;
+ // Constructors: default, copy/move from a T, or forward arbitrary arguments to T's constructor.
+ shared_ptr_no_esft() = default;
+ shared_ptr_no_esft(const T& x) : _value(x) {}
+ shared_ptr_no_esft(T&& x) : _value(std::move(x)) {}
+ template <typename... A>
+ shared_ptr_no_esft(A&&... a) : _value(std::forward<A>(a)...) {}
+ // The friends below need access to the private counter base.
+ template <typename X>
+ friend class lw_shared_ptr;
+ template <typename X>
+ friend class ::internal::lw_shared_ptr_accessors_no_esft;
+ template <typename X, class Y>
+ friend class ::internal::lw_shared_ptr_accessors;
+};
+
+
+/// Extension point: the user may override this to change how \ref lw_shared_ptr objects are destroyed,
+/// primarily so that incomplete classes can be used.
+///
+/// Customizing the deleter requires that \c T be derived from \c enable_lw_shared_from_this<T>.
+/// The specialization must be visible for all uses of \c lw_shared_ptr<T>.
+///
+/// To customize, the template must have a `static void dispose(T*)` operator that disposes of
+/// the object.
+template <typename T>
+struct lw_shared_ptr_deleter; // No generic implementation
+
+namespace internal {
+// Accessors used when T itself derives from the counter (enable_lw_shared_from_this<T>).
+template <typename T>
+struct lw_shared_ptr_accessors_esft {
+ using concrete_type = std::remove_const_t<T>;
+ static T* to_value(lw_shared_ptr_counter_base* counter) {
+ return static_cast<T*>(counter); // the counter is a base sub-object of T
+ }
+ static void dispose(lw_shared_ptr_counter_base* counter) {
+ delete static_cast<T*>(counter);
+ }
+ static void instantiate_to_value(lw_shared_ptr_counter_base* p) {
+ // since to_value() is defined above, we don't need to do anything special
+ // to force-instantiate it
+ }
+};
+// Accessors used when T is wrapped in shared_ptr_no_esft<T>.
+template <typename T>
+struct lw_shared_ptr_accessors_no_esft {
+ using concrete_type = shared_ptr_no_esft<T>;
+ static T* to_value(lw_shared_ptr_counter_base* counter) {
+ return &static_cast<concrete_type*>(counter)->_value; // address of the wrapped value
+ }
+ static void dispose(lw_shared_ptr_counter_base* counter) {
+ delete static_cast<concrete_type*>(counter); // deletes the wrapper, and T with it
+ }
+ static void instantiate_to_value(lw_shared_ptr_counter_base* p) {
+ // since to_value() is defined above, we don't need to do anything special
+ // to force-instantiate it
+ }
+};
+
+// Generic case: lw_shared_ptr_deleter<T> is not specialized, select
+// implementation based on whether T inherits from enable_lw_shared_from_this<T>.
+template <typename T, typename U = void>
+struct lw_shared_ptr_accessors : std::conditional_t<
+ std::is_base_of<enable_lw_shared_from_this<T>, T>::value,
+ lw_shared_ptr_accessors_esft<T>,
+ lw_shared_ptr_accessors_no_esft<T>> {
+};
+
+// Partial specialization chosen when lw_shared_ptr_deleter<T> is specialized
+template <typename T>
+struct lw_shared_ptr_accessors<T, std::void_t<decltype(lw_shared_ptr_deleter<T>{})>> {
+ using concrete_type = T;
+ static T* to_value(lw_shared_ptr_counter_base* counter); // declared only; no definition in this header
+ static void dispose(lw_shared_ptr_counter_base* counter) {
+ lw_shared_ptr_deleter<T>::dispose(to_value(counter)); // destruction is delegated to the user's deleter
+ }
+ static void instantiate_to_value(lw_shared_ptr_counter_base* p) {
+ // instantiate to_value(); must be defined by shared_ptr_incomplete.hh
+ to_value(p);
+ }
+};
+
+}
+
+template <typename T>
+class lw_shared_ptr {
+ using accessors = ::internal::lw_shared_ptr_accessors<std::remove_const_t<T>>;
+ using concrete_type = typename accessors::concrete_type;
+ mutable lw_shared_ptr_counter_base* _p = nullptr; // the only data member; null means "empty"
+private:
+ lw_shared_ptr(lw_shared_ptr_counter_base* p) noexcept : _p(p) { // adopt a counter and take one reference
+ if (_p) {
+ ++_p->_count;
+ }
+ }
+ template <typename... A>
+ static lw_shared_ptr make(A&&... a) { // heap-allocates the concrete object; result has count 1
+ auto p = new concrete_type(std::forward<A>(a)...);
+ accessors::instantiate_to_value(p);
+ return lw_shared_ptr(p);
+ }
+public:
+ using element_type = T;
+ // Construction: empty, copy (adds a reference) or move (steals the source's pointer).
+ lw_shared_ptr() noexcept = default;
+ lw_shared_ptr(std::nullptr_t) noexcept : lw_shared_ptr() {}
+ lw_shared_ptr(const lw_shared_ptr& x) noexcept : _p(x._p) {
+ if (_p) {
+ ++_p->_count;
+ }
+ }
+ lw_shared_ptr(lw_shared_ptr&& x) noexcept : _p(x._p) {
+ x._p = nullptr;
+ }
+ [[gnu::always_inline]]
+ ~lw_shared_ptr() { // drops one reference; disposes the object when the count reaches zero
+ if (_p && !--_p->_count) {
+ accessors::dispose(_p);
+ }
+ }
+ lw_shared_ptr& operator=(const lw_shared_ptr& x) noexcept {
+ if (_p != x._p) { // same target: nothing to do
+ this->~lw_shared_ptr(); // release the current reference ...
+ new (this) lw_shared_ptr(x); // ... then copy-construct in place
+ }
+ return *this;
+ }
+ lw_shared_ptr& operator=(lw_shared_ptr&& x) noexcept {
+ if (_p != x._p) { // same target: x keeps its reference
+ this->~lw_shared_ptr();
+ new (this) lw_shared_ptr(std::move(x));
+ }
+ return *this;
+ }
+ lw_shared_ptr& operator=(std::nullptr_t) noexcept {
+ return *this = lw_shared_ptr();
+ }
+ lw_shared_ptr& operator=(T&& x) noexcept { // point at a newly allocated object moved from x
+ this->~lw_shared_ptr();
+ new (this) lw_shared_ptr(make_lw_shared<T>(std::move(x)));
+ return *this;
+ }
+ // operator* and operator-> pass _p to to_value() without a null check.
+ T& operator*() const noexcept { return *accessors::to_value(_p); }
+ T* operator->() const noexcept { return accessors::to_value(_p); }
+ T* get() const noexcept { // null-safe: returns nullptr for an empty pointer
+ if (_p) {
+ return accessors::to_value(_p);
+ } else {
+ return nullptr;
+ }
+ }
+ // Current reference count, or 0 for an empty pointer.
+ long int use_count() const noexcept {
+ if (_p) {
+ return _p->_count;
+ } else {
+ return 0;
+ }
+ }
+ // Implicit conversion to pointer-to-const; the result holds its own reference.
+ operator lw_shared_ptr<const T>() const noexcept {
+ return lw_shared_ptr<const T>(_p);
+ }
+ // True when the pointer is non-empty.
+ explicit operator bool() const noexcept {
+ return _p;
+ }
+ // True when this is the only reference; dereferences _p without a null check.
+ bool owned() const noexcept {
+ return _p->_count == 1;
+ }
+ // Comparisons are on the counter pointer, i.e. identity of the shared object.
+ bool operator==(const lw_shared_ptr<const T>& x) const {
+ return _p == x._p;
+ }
+
+ bool operator!=(const lw_shared_ptr<const T>& x) const {
+ return !operator==(x);
+ }
+
+ bool operator==(const lw_shared_ptr<std::remove_const_t<T>>& x) const {
+ return _p == x._p;
+ }
+
+ bool operator!=(const lw_shared_ptr<std::remove_const_t<T>>& x) const {
+ return !operator==(x);
+ }
+
+ bool operator<(const lw_shared_ptr<const T>& x) const {
+ return _p < x._p;
+ }
+
+ bool operator<(const lw_shared_ptr<std::remove_const_t<T>>& x) const {
+ return _p < x._p;
+ }
+ // Friends that need the private constructor, make() or _p.
+ template <typename U>
+ friend class lw_shared_ptr;
+
+ template <typename X, typename... A>
+ friend lw_shared_ptr<X> make_lw_shared(A&&...);
+
+ template <typename U>
+ friend lw_shared_ptr<U> make_lw_shared(U&&);
+
+ template <typename U>
+ friend lw_shared_ptr<U> make_lw_shared(U&);
+
+ template <typename U>
+ friend class enable_lw_shared_from_this;
+};
+
+template <typename T, typename... A>
+inline
+lw_shared_ptr<T> make_lw_shared(A&&... a) { // construct a T from forwarded arguments
+ return lw_shared_ptr<T>::make(std::forward<A>(a)...);
+}
+// Overload taking an rvalue T: the new object is move-constructed from a.
+template <typename T>
+inline
+lw_shared_ptr<T> make_lw_shared(T&& a) {
+ return lw_shared_ptr<T>::make(std::move(a));
+}
+// Overload taking an lvalue T: the new object is copy-constructed from a.
+template <typename T>
+inline
+lw_shared_ptr<T> make_lw_shared(T& a) {
+ return lw_shared_ptr<T>::make(a);
+}
+
+template <typename T>
+inline
+lw_shared_ptr<T>
+enable_lw_shared_from_this<T>::shared_from_this() {
+ return lw_shared_ptr<T>(this); // private constructor: adds one reference to this object's counter
+}
+// Const variant: yields a pointer-to-const that shares the same counter.
+template <typename T>
+inline
+lw_shared_ptr<const T>
+enable_lw_shared_from_this<T>::shared_from_this() const {
+ return lw_shared_ptr<const T>(const_cast<enable_lw_shared_from_this*>(this)); // cast away const so the counter can be incremented
+}
+
+template <typename T>
+static inline
+std::ostream& operator<<(std::ostream& out, const lw_shared_ptr<T>& p) {
+  // Stream the pointee when there is one; an empty pointer prints as "null".
+  if (p)
+    return out << *p;
+  return out << "null";
+}
+
+namespace std {
+ // Hash support so lw_shared_ptr<T> can be a key in unordered containers.
+ template <typename T>
+ struct hash<lw_shared_ptr<T>> : private hash<T*> {
+ size_t operator()(const lw_shared_ptr<T>& p) const {
+ return hash<T*>::operator()(p.get()); // hash of the raw pointer, not of the pointee's value
+ }
+ };
+
+}
+
+#endif /* CEPH_LW_SHARED_PTR_H_ */
diff --git a/src/msg/async/dpdk/stream.h b/src/msg/async/dpdk/stream.h
new file mode 100644
index 00000000..1898e8f8
--- /dev/null
+++ b/src/msg/async/dpdk/stream.h
@@ -0,0 +1,155 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_STREAM_H_
+#define CEPH_MSG_STREAM_H_
+
+#include <exception>
+#include <cassert>
+
+// A stream<> is the producer side. It may call produce() as long
+// as the value returned from the previous invocation is ready.
+// To signify no more data is available, call close().
+//
+// A subscription<> is the consumer side. It is created by a call
+// to stream::listen(). Calling subscription::start(),
+// which registers the data processing callback, starts processing
+// events. It may check for end-of-stream notifications by
+// calling done(), which also delivers error
+// events (as integer error codes).
+//
+// The consumer can pause generation of new data by returning
+// positive integer; when it becomes ready, the producer
+// will resume processing.
+
+template <typename... T>
+class subscription;
+
+template <typename... T>
+class stream {
+  subscription<T...>* _sub = nullptr;
+  int done = 0;        // 0 until close() (sets 1) or set_exception() (sets the error code)
+  bool ready = false;  // set by subscription::start(); was read uninitialised by started()
+ public:
+  using next_fn = std::function<int (T...)>;
+  stream() = default;
+  stream(const stream&) = delete;
+  stream(stream&&) = delete;
+  ~stream() {
+    if (_sub) {
+      _sub->_stream = nullptr;  // detach so the subscription never touches a dead stream
+    }
+  }
+
+  void operator=(const stream&) = delete;
+  void operator=(stream&&) = delete;
+
+  // Returns a subscription that reads value from this
+  // stream.
+  subscription<T...> listen() {
+    return subscription<T...>(this);
+  }
+
+  // Returns a subscription that reads value from this
+  // stream, and also sets up the listen function.
+  subscription<T...> listen(next_fn next) {
+    auto sub = subscription<T...>(this);
+    sub.start(std::move(next));
+    return sub;
+  }
+
+  // Returns true once a subscription has registered its
+  // callback through start(); false before that. Producers
+  // check this before calling produce().
+  bool started() {
+    return ready;
+  }
+
+  // Produce a value. Call only after started() returns true:
+  // the subscription's callback is invoked without any check.
+  int produce(T... data) {
+    return _sub->_next(std::move(data)...);
+  }
+
+  // End the stream. Call only after started(), and after
+  // a previous produce() is ready. No functions may be called
+  // after this.
+  void close() {
+    done = 1;
+  }
+
+  // Signal an error. Call only after started(), and after
+  // a previous produce() is ready. No functions may be called
+  // after this.
+  void set_exception(int error) {
+    done = error;
+  }
+ private:
+  void start();
+  friend class subscription<T...>;
+};
+
+template <typename... T>
+class subscription {
+ public:
+ using next_fn = typename stream<T...>::next_fn;
+ private:
+ stream<T...>* _stream; // producer this subscription is linked to; null once detached or moved from
+ next_fn _next; // callback invoked by stream::produce()
+ private:
+ explicit subscription(stream<T...>* s): _stream(s) { // only stream<> (a friend) creates subscriptions
+ ceph_assert(!_stream->_sub); // a stream accepts a single subscription at a time
+ _stream->_sub = this;
+ }
+ // Moving re-points the stream's back-pointer at the new object.
+ public:
+ subscription(subscription&& x)
+ : _stream(x._stream), _next(std::move(x._next)) {
+ x._stream = nullptr;
+ if (_stream) {
+ _stream->_sub = this;
+ }
+ }
+ ~subscription() {
+ if (_stream) {
+ _stream->_sub = nullptr; // unlink so the stream does not keep a dangling pointer
+ }
+ }
+
+ /// \brief Start receiving events from the stream.
+ ///
+ /// \param next Callback to call for each event
+ void start(std::function<int (T...)> next) {
+ _next = std::move(next);
+ _stream->ready = true; // makes stream::started() return true; _stream is not null-checked
+ }
+
+ // Returns the stream's done value: 1 after close(), or the error code
+ // passed to set_exception(). _stream is not null-checked.
+ int done() {
+ return _stream->done;
+ }
+
+ friend class stream<T...>;
+};
+
+#endif /* CEPH_MSG_STREAM_H_ */
diff --git a/src/msg/async/dpdk/toeplitz.h b/src/msg/async/dpdk/toeplitz.h
new file mode 100644
index 00000000..3ca38808
--- /dev/null
+++ b/src/msg/async/dpdk/toeplitz.h
@@ -0,0 +1,92 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*-
+ * Copyright (c) 2010 David Malone <dwmalone@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef CEPH_MSG_TOEPLITZ_H_
+#define CEPH_MSG_TOEPLITZ_H_
+
+#include <vector>
+
+using rss_key_type = std::vector<uint8_t>;
+
+// Mellanox Linux's driver key
+static const rss_key_type default_rsskey_40bytes = {
+  0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
+  0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
+  0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
+  0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
+  0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
+};
+
+// Intel's i40e PMD default RSS key
+static const rss_key_type default_rsskey_52bytes = {
+  0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
+  0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
+  0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
+  0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
+  0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
+  0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
+  0x81, 0x15, 0x03, 0x66
+};
+
+template<typename T>
+static inline uint32_t toeplitz_hash(const rss_key_type& key, const T& data)
+{
+  // Toeplitz hash of data[0 .. data.size()) under key; absent key bytes count as 0.
+  using idx_t = rss_key_type::size_type;
+  uint32_t hash = 0, v = 0;
+  // Seed the 32-bit key window in unsigned arithmetic (no signed-shift overflow).
+  for (idx_t k = 0; k < 4; k++)
+    v = (v << 8) | (k < key.size() ? uint32_t{key[k]} : 0u);
+  for (idx_t i = 0; i < data.size(); i++) {
+    for (unsigned b = 0; b < 8; b++) {
+      const unsigned mask = 1u << (7 - b);
+      if (data[i] & mask)
+        hash ^= v;
+      v <<= 1;  // slide the window; pull in the next key bit when one exists
+      if ((i + 4) < key.size() && (key[i + 4] & mask))
+        v |= 1;
+    }
+  }
+  return hash;
+}
+#endif
diff --git a/src/msg/async/dpdk/transfer.h b/src/msg/async/dpdk/transfer.h
new file mode 100644
index 00000000..599db5bd
--- /dev/null
+++ b/src/msg/async/dpdk/transfer.h
@@ -0,0 +1,64 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_TRANSFER_H_
+#define CEPH_TRANSFER_H_
+
+// Helper functions for copying or moving multiple objects in an exception
+// safe manner, then destroying the sources.
+//
+// To transfer, call transfer_pass1(allocator, &from, &to) on all object pairs,
+// (this copies the object from @from to @to). If no exceptions are encountered,
+// call transfer_pass2(allocator, &from, &to). This destroys the object at the
+// origin. If exceptions were encountered, simply destroy all copied objects.
+//
+// As an optimization, if the objects are moveable without throwing (noexcept)
+// transfer_pass1() simply moves the objects and destroys the source, and
+// transfer_pass2() does nothing.
+
+#include <type_traits>
+#include <utility>
+
+template <typename T, typename Alloc>
+inline void transfer_pass1(Alloc& a, T* from, T* to,
+    std::enable_if_t<std::is_nothrow_move_constructible<T>::value>* = nullptr) {
+  a.construct(to, std::move(*from));  // nothrow move: finish the transfer right away
+  a.destroy(from);
+}
+// Nothrow-movable T: pass 1 already destroyed the source, so pass 2 is a no-op.
+template <typename T, typename Alloc>
+inline void transfer_pass2(Alloc& a, T* from, T* to,
+    std::enable_if_t<std::is_nothrow_move_constructible<T>::value>* = nullptr) {
+}
+// Move may throw: copy in pass 1 and leave the source intact.
+template <typename T, typename Alloc>
+inline void transfer_pass1(Alloc& a, T* from, T* to,
+    std::enable_if_t<!std::is_nothrow_move_constructible<T>::value>* = nullptr) {
+  a.construct(to, *from);
+}
+// Move may throw: the source is destroyed only here, in pass 2.
+template <typename T, typename Alloc>
+inline void transfer_pass2(Alloc& a, T* from, T* to,
+    std::enable_if_t<!std::is_nothrow_move_constructible<T>::value>* = nullptr) {
+  a.destroy(from);
+}
+
+#endif /* CEPH_TRANSFER_H_ */
diff --git a/src/msg/async/frames_v2.cc b/src/msg/async/frames_v2.cc
new file mode 100644
index 00000000..f047eb18
--- /dev/null
+++ b/src/msg/async/frames_v2.cc
@@ -0,0 +1,480 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2020 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "frames_v2.h"
+
+#include <ostream>
+
+#undef FMT_HEADER_ONLY
+#define FMT_HEADER_ONLY 1
+#include "seastar/fmt/include/fmt/format.h"
+
+namespace ceph::msgr::v2 {
+
+// Cuts bl down to exactly unpadded_len bytes by dropping whatever follows
+// that offset. bl must be at least unpadded_len bytes long.
+static void unpad_zero(bufferlist& bl, uint32_t unpadded_len) {
+  ceph_assert(bl.length() >= unpadded_len);
+  const auto excess = bl.length() - unpadded_len;
+  if (excess != 0) {
+    bl.splice(unpadded_len, excess);
+  }
+}
+
+// Returns how many leading segments are worth sending: trailing empty
+// segments are not counted, but the result is never below one because a
+// frame always carries at least one (possibly empty) segment.
+static size_t calc_num_segments(const bufferlist segment_bls[],
+                                size_t segment_count) {
+  ceph_assert(segment_count > 0 && segment_count <= MAX_NUM_SEGMENTS);
+  size_t kept = segment_count;
+  // Walk back from the end while the last kept segment is empty, but always
+  // leave the first segment in place.
+  while (kept > 1 && segment_bls[kept - 1].length() == 0) {
+    --kept;
+  }
+  return kept;
+}
+
+// Recomputes the crc32c of segment_bl (seed -1) and throws FrameError if it
+// differs from expected_crc.
+static void check_segment_crc(const bufferlist& segment_bl,
+                              uint32_t expected_crc) {
+  uint32_t actual_crc = segment_bl.crc32c(-1);
+  if (actual_crc == expected_crc) {
+    return;
+  }
+  throw FrameError(fmt::format(
+      "bad segment crc calculated={} expected={}", actual_crc, expected_crc));
+}
+
+// Interprets the "aborted" nibble of a msgr2.1 late_status byte.
+//
+// Returns true when the nibble says the frame is complete (ready for
+// dispatching) and false when the sender aborted it (must be dropped).
+// Any other code word throws FrameError.
+static bool check_epilogue_late_status(__u8 late_status) {
+  switch (late_status & FRAME_LATE_STATUS_ABORTED_MASK) {
+  case FRAME_LATE_STATUS_COMPLETE:
+    return true;
+  case FRAME_LATE_STATUS_ABORTED:
+    return false;
+  default:
+    throw FrameError(fmt::format("bad late_status"));
+  }
+}
+
+// Populates a preamble block from the current segment descriptors: the
+// block is zeroed, tag / segment count / per-segment length and alignment
+// are written, and finally the crc over everything but the crc field itself
+// is stored.
+void FrameAssembler::fill_preamble(Tag tag,
+                                   preamble_block_t& preamble) const {
+  const size_t num_descs = m_descs.size();
+
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  ::memset(&preamble, 0, sizeof(preamble));
+
+  preamble.tag = static_cast<__u8>(tag);
+  preamble.num_segments = num_descs;
+  for (size_t idx = 0; idx != num_descs; ++idx) {
+    const auto& desc = m_descs[idx];
+    preamble.segments[idx].length = desc.logical_len;
+    preamble.segments[idx].alignment = desc.align;
+  }
+
+  // Must come last: the crc covers all of the fields written above.
+  const auto* raw = reinterpret_cast<const unsigned char*>(&preamble);
+  preamble.crc = ceph_crc32c(0, raw, sizeof(preamble) - sizeof(preamble.crc));
+}
+
+// Sum of the logical lengths of all segments described for the current
+// frame. Requires that a frame has been described (m_descs not empty).
+uint64_t FrameAssembler::get_frame_logical_len() const {
+  ceph_assert(!m_descs.empty());
+  uint64_t total = 0;
+  for (const auto& desc : m_descs) {
+    total += desc.logical_len;
+  }
+  return total;
+}
+
+// Total number of bytes the current frame occupies on the wire: preamble,
+// every segment and the epilogue, each measured by its own
+// get_*_onwire_len() helper.
+uint64_t FrameAssembler::get_frame_onwire_len() const {
+  ceph_assert(!m_descs.empty());
+  uint64_t total = get_preamble_onwire_len();
+  for (size_t idx = 0; idx != m_descs.size(); ++idx) {
+    total += get_segment_onwire_len(idx);
+  }
+  return total + get_epilogue_onwire_len();
+}
+
+// Builds a msgr2.0 crc-mode frame: preamble, then the payload of every
+// non-empty segment, then an epilogue holding one crc per segment. The
+// contents of the non-empty segment bufferlists are claimed (moved) into
+// the returned frame.
+bufferlist FrameAssembler::asm_crc_rev0(const preamble_block_t& preamble,
+                                        bufferlist segment_bls[]) const {
+  const size_t num_segments = m_descs.size();
+
+  epilogue_crc_rev0_block_t epilogue;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  ::memset(&epilogue, 0, sizeof(epilogue));
+
+  bufferlist frame_bl(sizeof(preamble) + sizeof(epilogue));
+  frame_bl.append(reinterpret_cast<const char*>(&preamble), sizeof(preamble));
+
+  for (size_t i = 0; i != num_segments; ++i) {
+    ceph_assert(segment_bls[i].length() == m_descs[i].logical_len);
+    // The crc has to be taken before the payload is claimed away below.
+    epilogue.crc_values[i] = segment_bls[i].crc32c(-1);
+    if (segment_bls[i].length() == 0) {
+      continue;
+    }
+    frame_bl.claim_append(segment_bls[i]);
+  }
+
+  frame_bl.append(reinterpret_cast<const char*>(&epilogue), sizeof(epilogue));
+  return frame_bl;
+}
+
+// Builds a msgr2.0 secure-mode frame.
+//
+// The preamble, every non-empty segment and an all-zero epilogue block are
+// fed, in that order, through one tx handler session; the value returned by
+// authenticated_encrypt_final() is the frame. The caller is expected to
+// have padded the segments already (see the "already padded" note below).
+bufferlist FrameAssembler::asm_secure_rev0(const preamble_block_t& preamble,
+ bufferlist segment_bls[]) const {
+ bufferlist preamble_bl(sizeof(preamble));
+ preamble_bl.append(reinterpret_cast<const char*>(&preamble),
+ sizeof(preamble));
+
+ // The epilogue is left zeroed, i.e. late_flags has no bit set.
+ epilogue_secure_rev0_block_t epilogue;
+ // FIPS zeroization audit 20191115: this memset is not security related.
+ ::memset(&epilogue, 0, sizeof(epilogue));
+ bufferlist epilogue_bl(sizeof(epilogue));
+ epilogue_bl.append(reinterpret_cast<const char*>(&epilogue),
+ sizeof(epilogue));
+
+ // The tx handler is told the length of every piece up front; only the
+ // first m_descs.size() + 2 slots of the array are filled and passed on.
+ // preamble + MAX_NUM_SEGMENTS + epilogue
+ uint32_t onwire_lens[MAX_NUM_SEGMENTS + 2];
+ onwire_lens[0] = preamble_bl.length();
+ for (size_t i = 0; i < m_descs.size(); i++) {
+ onwire_lens[i + 1] = segment_bls[i].length(); // already padded
+ }
+ onwire_lens[m_descs.size() + 1] = epilogue_bl.length();
+ m_crypto->tx->reset_tx_handler(onwire_lens,
+ onwire_lens + m_descs.size() + 2);
+ m_crypto->tx->authenticated_encrypt_update(preamble_bl);
+ for (size_t i = 0; i < m_descs.size(); i++) {
+ // Empty segments are announced with length 0 above but never fed in.
+ if (segment_bls[i].length() > 0) {
+ m_crypto->tx->authenticated_encrypt_update(segment_bls[i]);
+ }
+ }
+ m_crypto->tx->authenticated_encrypt_update(epilogue_bl);
+ return m_crypto->tx->authenticated_encrypt_final();
+}
+
+// Builds a msgr2.1 crc-mode frame.
+//
+// Layout: preamble, first segment followed by its own crc (both omitted
+// when the first segment is empty) and, only for frames with more than one
+// segment, the remaining segment payloads plus an epilogue that carries
+// late_status and the crcs of segments 1..N-1. Non-empty segment
+// bufferlists are claimed (moved) into the returned frame.
+bufferlist FrameAssembler::asm_crc_rev1(const preamble_block_t& preamble,
+                                        bufferlist segment_bls[]) const {
+  bufferlist frame_bl(sizeof(preamble) + FRAME_CRC_SIZE +
+                      sizeof(epilogue_crc_rev1_block_t));
+  frame_bl.append(reinterpret_cast<const char*>(&preamble), sizeof(preamble));
+
+  ceph_assert(segment_bls[0].length() == m_descs[0].logical_len);
+  if (segment_bls[0].length() > 0) {
+    // Take the crc first: claim_append() empties the source bufferlist.
+    uint32_t first_crc = segment_bls[0].crc32c(-1);
+    frame_bl.claim_append(segment_bls[0]);
+    encode(first_crc, frame_bl);
+  }
+
+  const size_t num_segments = m_descs.size();
+  if (num_segments == 1) {
+    return frame_bl;  // no epilogue if only one segment
+  }
+
+  epilogue_crc_rev1_block_t epilogue;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  ::memset(&epilogue, 0, sizeof(epilogue));
+  epilogue.late_status |= FRAME_LATE_STATUS_COMPLETE;
+
+  for (size_t i = 1; i != num_segments; ++i) {
+    ceph_assert(segment_bls[i].length() == m_descs[i].logical_len);
+    epilogue.crc_values[i - 1] = segment_bls[i].crc32c(-1);
+    if (segment_bls[i].length() == 0) {
+      continue;
+    }
+    frame_bl.claim_append(segment_bls[i]);
+  }
+  frame_bl.append(reinterpret_cast<const char*>(&epilogue), sizeof(epilogue));
+  return frame_bl;
+}
+
+// Builds a msgr2.1 secure-mode frame out of up to three separately
+// finalized tx handler sessions, appended in this order:
+//
+//  1. the preamble block followed by the inline buffer, which holds the
+//     first FRAME_PREAMBLE_INLINE_SIZE bytes of the first segment
+//     (zero-padded if the segment is shorter);
+//  2. whatever is left of the first segment, if anything;
+//  3. segments 1..N-1 plus the epilogue -- only when the frame has more
+//     than one segment.
+//
+// segment_bls[0] is modified: its inlined part is moved into the preamble
+// buffer.
+bufferlist FrameAssembler::asm_secure_rev1(const preamble_block_t& preamble,
+ bufferlist segment_bls[]) const {
+ bufferlist preamble_bl;
+ if (segment_bls[0].length() > FRAME_PREAMBLE_INLINE_SIZE) {
+ // first segment is partially inlined, inline buffer is full
+ preamble_bl.reserve(sizeof(preamble));
+ preamble_bl.append(reinterpret_cast<const char*>(&preamble),
+ sizeof(preamble));
+ // Moves the first FRAME_PREAMBLE_INLINE_SIZE bytes of the segment behind
+ // the preamble; the remainder stays in segment_bls[0].
+ segment_bls[0].splice(0, FRAME_PREAMBLE_INLINE_SIZE, &preamble_bl);
+ } else {
+ // first segment is fully inlined, inline buffer may need padding
+ uint32_t pad_len = FRAME_PREAMBLE_INLINE_SIZE - segment_bls[0].length();
+ preamble_bl.reserve(sizeof(preamble) + pad_len);
+ preamble_bl.append(reinterpret_cast<const char*>(&preamble),
+ sizeof(preamble));
+ // After this claim segment_bls[0] is empty, so session 2 is skipped.
+ preamble_bl.claim_append(segment_bls[0]);
+ if (pad_len > 0) {
+ preamble_bl.append_zero(pad_len);
+ }
+ }
+
+ // Session 1: preamble + inline buffer.
+ m_crypto->tx->reset_tx_handler({preamble_bl.length()});
+ m_crypto->tx->authenticated_encrypt_update(preamble_bl);
+ auto frame_bl = m_crypto->tx->authenticated_encrypt_final();
+
+ // Session 2: the part of the first segment that did not fit inline.
+ if (segment_bls[0].length() > 0) {
+ m_crypto->tx->reset_tx_handler({segment_bls[0].length()});
+ m_crypto->tx->authenticated_encrypt_update(segment_bls[0]);
+ auto tmp = m_crypto->tx->authenticated_encrypt_final();
+ frame_bl.claim_append(tmp);
+ }
+ if (m_descs.size() == 1) {
+ return frame_bl; // no epilogue if only one segment
+ }
+
+ epilogue_secure_rev1_block_t epilogue;
+ // FIPS zeroization audit 20191115: this memset is not security related.
+ ::memset(&epilogue, 0, sizeof(epilogue));
+ epilogue.late_status |= FRAME_LATE_STATUS_COMPLETE;
+ bufferlist epilogue_bl(sizeof(epilogue));
+ epilogue_bl.append(reinterpret_cast<const char*>(&epilogue),
+ sizeof(epilogue));
+
+ // Session 3: remaining segments followed by the epilogue. Slot i - 1 holds
+ // the length of segment i and the slot after the last segment holds the
+ // epilogue length, i.e. m_descs.size() slots are used in total.
+ // MAX_NUM_SEGMENTS - 1 + epilogue
+ uint32_t onwire_lens[MAX_NUM_SEGMENTS];
+ for (size_t i = 1; i < m_descs.size(); i++) {
+ onwire_lens[i - 1] = segment_bls[i].length(); // already padded
+ }
+ onwire_lens[m_descs.size() - 1] = epilogue_bl.length();
+ m_crypto->tx->reset_tx_handler(onwire_lens, onwire_lens + m_descs.size());
+ for (size_t i = 1; i < m_descs.size(); i++) {
+ if (segment_bls[i].length() > 0) {
+ m_crypto->tx->authenticated_encrypt_update(segment_bls[i]);
+ }
+ }
+ m_crypto->tx->authenticated_encrypt_update(epilogue_bl);
+ auto tmp = m_crypto->tx->authenticated_encrypt_final();
+ frame_bl.claim_append(tmp);
+ return frame_bl;
+}
+
+// Assembles one frame for sending.
+//
+// Records a descriptor (logical length, alignment) for every segment that
+// survives calc_num_segments(), fills in the preamble and hands over to the
+// assembler matching the current mode (crc or secure, msgr2.0 or msgr2.1).
+// In secure mode the segments are first zero-padded to their padded length.
+//
+// @tag             frame tag stored in the preamble
+// @segment_bls     segment payloads; modified (padded and/or consumed)
+// @segment_aligns  alignment recorded for each segment
+// @segment_count   number of entries in the two arrays
+bufferlist FrameAssembler::assemble_frame(Tag tag, bufferlist segment_bls[],
+                                          const uint16_t segment_aligns[],
+                                          size_t segment_count) {
+  const size_t num_segments = calc_num_segments(segment_bls, segment_count);
+  m_descs.resize(num_segments);
+  for (size_t i = 0; i != num_segments; ++i) {
+    auto& desc = m_descs[i];
+    desc.logical_len = segment_bls[i].length();
+    desc.align = segment_aligns[i];
+  }
+
+  preamble_block_t preamble;
+  fill_preamble(tag, preamble);
+
+  if (!m_crypto->rx) {
+    // crc mode: segments are sent as they are
+    return m_is_rev1 ? asm_crc_rev1(preamble, segment_bls)
+                     : asm_crc_rev0(preamble, segment_bls);
+  }
+
+  // secure mode
+  for (size_t i = 0; i != num_segments; ++i) {
+    ceph_assert(segment_bls[i].length() == m_descs[i].logical_len);
+    // We're padding segments to biggest cipher's block size. Although
+    // AES-GCM can live without that as it's a stream cipher, we don't
+    // want to be fixed to stream ciphers only.
+    const uint32_t padded_len = get_segment_padded_len(i);
+    if (padded_len <= segment_bls[i].length()) {
+      continue;
+    }
+    const uint32_t pad_len = padded_len - segment_bls[i].length();
+    segment_bls[i].reserve(pad_len);
+    segment_bls[i].append_zero(pad_len);
+  }
+  return m_is_rev1 ? asm_secure_rev1(preamble, segment_bls)
+                   : asm_secure_rev0(preamble, segment_bls);
+}
+
+// Parses a received preamble.
+//
+// In secure mode the buffer is decrypted in place first (msgr2.1 finalizes
+// the rx session right here, msgr2.0 leaves it open). The preamble crc and
+// the segment count are then validated -- FrameError is thrown on a bad crc,
+// a segment count outside 1..MAX_NUM_SEGMENTS, or an empty last segment in a
+// multi-segment frame -- and the segment descriptors are recorded.
+//
+// Returns the frame tag found in the preamble.
+Tag FrameAssembler::disassemble_preamble(bufferlist& preamble_bl) {
+  if (!m_crypto->rx) {
+    ceph_assert(preamble_bl.length() == sizeof(preamble_block_t));
+  } else {
+    m_crypto->rx->reset_rx_handler();
+    if (!m_is_rev1) {
+      ceph_assert(preamble_bl.length() == sizeof(preamble_block_t));
+      m_crypto->rx->authenticated_decrypt_update(preamble_bl);
+    } else {
+      ceph_assert(preamble_bl.length() == FRAME_PREAMBLE_WITH_INLINE_SIZE +
+                                          get_auth_tag_len());
+      m_crypto->rx->authenticated_decrypt_update_final(preamble_bl);
+    }
+  }
+
+  // I expect ceph_le32 will make the endian conversion for me. Passing
+  // everything through ::Decode is unnecessary.
+  auto preamble = reinterpret_cast<const preamble_block_t*>(
+      preamble_bl.c_str());
+
+  // check preamble crc before any further processing
+  const uint32_t crc = ceph_crc32c(
+      0, reinterpret_cast<const unsigned char*>(preamble),
+      sizeof(*preamble) - sizeof(preamble->crc));
+  if (crc != preamble->crc) {
+    throw FrameError(fmt::format(
+        "bad preamble crc calculated={} expected={}", crc, preamble->crc));
+  }
+
+  // see calc_num_segments()
+  const size_t num_segments = preamble->num_segments;
+  if (num_segments < 1 || num_segments > MAX_NUM_SEGMENTS) {
+    throw FrameError(fmt::format(
+        "bad number of segments num_segments={}", preamble->num_segments));
+  }
+  if (num_segments > 1 && preamble->segments[num_segments - 1].length == 0) {
+    throw FrameError("last segment empty");
+  }
+
+  m_descs.resize(num_segments);
+  for (size_t idx = 0; idx != num_segments; ++idx) {
+    m_descs[idx].logical_len = preamble->segments[idx].length;
+    m_descs[idx].align = preamble->segments[idx].alignment;
+  }
+  return static_cast<Tag>(preamble->tag);
+}
+
+// msgr2.0 crc mode: checks every segment against the crc stored for it in
+// the epilogue (check_segment_crc() throws FrameError on mismatch).
+//
+// Returns true unless the epilogue has FRAME_LATE_FLAG_ABORTED set.
+bool FrameAssembler::disasm_all_crc_rev0(bufferlist segment_bls[],
+                                         bufferlist& epilogue_bl) const {
+  ceph_assert(epilogue_bl.length() == sizeof(epilogue_crc_rev0_block_t));
+  const auto* epilogue = reinterpret_cast<const epilogue_crc_rev0_block_t*>(
+      epilogue_bl.c_str());
+
+  const size_t num_segments = m_descs.size();
+  for (size_t i = 0; i != num_segments; ++i) {
+    ceph_assert(segment_bls[i].length() == m_descs[i].logical_len);
+    check_segment_crc(segment_bls[i], epilogue->crc_values[i]);
+  }
+
+  const bool aborted = (epilogue->late_flags & FRAME_LATE_FLAG_ABORTED) != 0;
+  return !aborted;
+}
+
+// msgr2.0 secure mode: decrypts every non-empty segment in place, strips its
+// padding and then finalizes the rx session with the epilogue.
+//
+// No reset_rx_handler() here: for msgr2.0 disassemble_preamble() resets the
+// handler and only calls authenticated_decrypt_update(), so this presumably
+// continues that same session -- NOTE(review): confirm against the rx
+// handler implementation.
+//
+// Returns true unless the epilogue has FRAME_LATE_FLAG_ABORTED set.
+bool FrameAssembler::disasm_all_secure_rev0(bufferlist segment_bls[],
+ bufferlist& epilogue_bl) const {
+ for (size_t i = 0; i < m_descs.size(); i++) {
+ // Segments arrive padded; they shrink to logical_len after decryption.
+ ceph_assert(segment_bls[i].length() == get_segment_padded_len(i));
+ if (segment_bls[i].length() > 0) {
+ m_crypto->rx->authenticated_decrypt_update(segment_bls[i]);
+ unpad_zero(segment_bls[i], m_descs[i].logical_len);
+ }
+ }
+
+ // The epilogue buffer is expected to include the auth tag.
+ ceph_assert(epilogue_bl.length() == sizeof(epilogue_secure_rev0_block_t) +
+ get_auth_tag_len());
+ m_crypto->rx->authenticated_decrypt_update_final(epilogue_bl);
+ auto epilogue = reinterpret_cast<const epilogue_secure_rev0_block_t*>(
+ epilogue_bl.c_str());
+ return !(epilogue->late_flags & FRAME_LATE_FLAG_ABORTED);
+}
+
+// msgr2.1 crc mode, first segment: a non-empty first segment is followed on
+// the wire by its own crc. The crc is decoded from the tail of segment_bl,
+// cut off, and the remaining payload verified against it (FrameError on
+// mismatch). An empty first segment carries no crc and must arrive empty.
+void FrameAssembler::disasm_first_crc_rev1(bufferlist& preamble_bl,
+                                           bufferlist& segment_bl) const {
+  ceph_assert(preamble_bl.length() == sizeof(preamble_block_t));
+
+  const uint32_t payload_len = m_descs[0].logical_len;
+  if (payload_len == 0) {
+    ceph_assert(segment_bl.length() == 0);
+    return;
+  }
+
+  ceph_assert(segment_bl.length() == m_descs[0].logical_len +
+                                     FRAME_CRC_SIZE);
+  uint32_t expected_crc;
+  bufferlist::const_iterator crc_it(&segment_bl, payload_len);
+  decode(expected_crc, crc_it);
+  segment_bl.splice(payload_len, FRAME_CRC_SIZE);
+  check_segment_crc(segment_bl, expected_crc);
+}
+
+// msgr2.1 crc mode, segments after the first: checks each of them against
+// the crc stored in the epilogue (slot i - 1 belongs to segment i), then
+// evaluates late_status.
+//
+// Returns true if the frame is complete, false if the sender aborted it.
+bool FrameAssembler::disasm_remaining_crc_rev1(bufferlist segment_bls[],
+                                               bufferlist& epilogue_bl) const {
+  ceph_assert(epilogue_bl.length() == sizeof(epilogue_crc_rev1_block_t));
+  const auto* epilogue = reinterpret_cast<const epilogue_crc_rev1_block_t*>(
+      epilogue_bl.c_str());
+
+  const size_t num_segments = m_descs.size();
+  for (size_t i = 1; i != num_segments; ++i) {
+    ceph_assert(segment_bls[i].length() == m_descs[i].logical_len);
+    check_segment_crc(segment_bls[i], epilogue->crc_values[i - 1]);
+  }
+  return check_epilogue_late_status(epilogue->late_status);
+}
+
+// msgr2.1 secure mode, first segment: rebuilds the first segment from the
+// inline buffer that follows the preamble block in preamble_bl plus, when
+// the padded segment is larger than the inline buffer, the separately
+// encrypted remainder in segment_bl.
+//
+// preamble_bl must already be decrypted and have its auth tag removed (its
+// length is asserted to be FRAME_PREAMBLE_WITH_INLINE_SIZE). On return
+// segment_bl holds exactly logical_len bytes and the inline buffer has been
+// spliced out of preamble_bl.
+void FrameAssembler::disasm_first_secure_rev1(bufferlist& preamble_bl,
+ bufferlist& segment_bl) const {
+ ceph_assert(preamble_bl.length() == FRAME_PREAMBLE_WITH_INLINE_SIZE);
+ uint32_t padded_len = get_segment_padded_len(0);
+ if (padded_len > FRAME_PREAMBLE_INLINE_SIZE) {
+ // segment_bl carries the non-inlined remainder followed by its auth tag
+ ceph_assert(segment_bl.length() == padded_len + get_auth_tag_len() -
+ FRAME_PREAMBLE_INLINE_SIZE);
+ m_crypto->rx->reset_rx_handler();
+ m_crypto->rx->authenticated_decrypt_update_final(segment_bl);
+ // prepend the inline buffer (already decrypted) to segment_bl
+ bufferlist tmp;
+ segment_bl.swap(tmp);
+ preamble_bl.splice(sizeof(preamble_block_t), FRAME_PREAMBLE_INLINE_SIZE,
+ &segment_bl);
+ segment_bl.claim_append(tmp);
+ } else {
+ // the whole first segment was inlined; nothing else was received for it
+ ceph_assert(segment_bl.length() == 0);
+ preamble_bl.splice(sizeof(preamble_block_t), FRAME_PREAMBLE_INLINE_SIZE,
+ &segment_bl);
+ }
+ // drop padding (of the inline buffer or of the segment itself)
+ unpad_zero(segment_bl, m_descs[0].logical_len);
+ ceph_assert(segment_bl.length() == m_descs[0].logical_len);
+}
+
+// msgr2.1 secure mode, segments after the first: starts a fresh rx session,
+// decrypts every non-empty segment 1..N-1 in place and strips its padding,
+// then finalizes the session with the epilogue and evaluates late_status.
+//
+// Returns true if the frame is complete, false if the sender aborted it;
+// check_epilogue_late_status() throws FrameError on an invalid late_status.
+bool FrameAssembler::disasm_remaining_secure_rev1(
+ bufferlist segment_bls[], bufferlist& epilogue_bl) const {
+ m_crypto->rx->reset_rx_handler();
+ for (size_t i = 1; i < m_descs.size(); i++) {
+ // Segments arrive padded; they shrink to logical_len after decryption.
+ ceph_assert(segment_bls[i].length() == get_segment_padded_len(i));
+ if (segment_bls[i].length() > 0) {
+ m_crypto->rx->authenticated_decrypt_update(segment_bls[i]);
+ unpad_zero(segment_bls[i], m_descs[i].logical_len);
+ }
+ }
+
+ // The epilogue buffer is expected to include the auth tag.
+ ceph_assert(epilogue_bl.length() == sizeof(epilogue_secure_rev1_block_t) +
+ get_auth_tag_len());
+ m_crypto->rx->authenticated_decrypt_update_final(epilogue_bl);
+ auto epilogue = reinterpret_cast<const epilogue_secure_rev1_block_t*>(
+ epilogue_bl.c_str());
+ return check_epilogue_late_status(epilogue->late_status);
+}
+
+// Processes the first segment of a received frame. Only msgr2.1 does any
+// work here, dispatching to the secure or crc variant; for msgr2.0 this is
+// a noop because everything is handled in disassemble_remaining_segments().
+void FrameAssembler::disassemble_first_segment(bufferlist& preamble_bl,
+                                               bufferlist& segment_bl) const {
+  ceph_assert(!m_descs.empty());
+  if (!m_is_rev1) {
+    // noop, everything is handled in disassemble_remaining_segments()
+    return;
+  }
+  if (m_crypto->rx) {
+    disasm_first_secure_rev1(preamble_bl, segment_bl);
+    return;
+  }
+  disasm_first_crc_rev1(preamble_bl, segment_bl);
+}
+
+// Processes the rest of a received frame by dispatching to the variant for
+// the current mode. For msgr2.0 this covers all segments; for msgr2.1 it
+// covers the segments after the first, and a single-segment frame has no
+// epilogue at all.
+//
+// Returns true if the frame is ready for dispatching, false if it was
+// aborted by the sender and must be dropped.
+bool FrameAssembler::disassemble_remaining_segments(
+    bufferlist segment_bls[], bufferlist& epilogue_bl) const {
+  ceph_assert(!m_descs.empty());
+
+  if (!m_is_rev1) {
+    return m_crypto->rx ? disasm_all_secure_rev0(segment_bls, epilogue_bl)
+                        : disasm_all_crc_rev0(segment_bls, epilogue_bl);
+  }
+
+  if (m_descs.size() == 1) {
+    // no epilogue if only one segment
+    ceph_assert(epilogue_bl.length() == 0);
+    return true;
+  }
+  return m_crypto->rx ? disasm_remaining_secure_rev1(segment_bls, epilogue_bl)
+                      : disasm_remaining_crc_rev1(segment_bls, epilogue_bl);
+}
+
+// Debug printer. If a frame has been described, prints its on-wire
+// breakdown ("preamble + segment (logical len/align) ... + epilogue"),
+// then always the protocol revision and the rx/tx handler pointers.
+std::ostream& operator<<(std::ostream& os, const FrameAssembler& frame_asm) {
+  const auto& descs = frame_asm.m_descs;
+  if (!descs.empty()) {
+    os << frame_asm.get_preamble_onwire_len();
+    for (size_t idx = 0; idx != descs.size(); ++idx) {
+      os << " + " << frame_asm.get_segment_onwire_len(idx);
+      os << " (logical " << descs[idx].logical_len
+         << "/" << descs[idx].align << ")";
+    }
+    os << " + " << frame_asm.get_epilogue_onwire_len() << " ";
+  }
+  os << "rev1=" << frame_asm.m_is_rev1;
+  os << " rx=" << frame_asm.m_crypto->rx.get();
+  os << " tx=" << frame_asm.m_crypto->tx.get();
+  return os;
+}
+
+} // namespace ceph::msgr::v2
diff --git a/src/msg/async/frames_v2.h b/src/msg/async/frames_v2.h
new file mode 100644
index 00000000..88fa4e1b
--- /dev/null
+++ b/src/msg/async/frames_v2.h
@@ -0,0 +1,842 @@
+#ifndef _MSG_ASYNC_FRAMES_V2_
+#define _MSG_ASYNC_FRAMES_V2_
+
+#include "include/types.h"
+#include "common/Clock.h"
+#include "crypto_onwire.h"
+#include <array>
+#include <iosfwd>
+#include <utility>
+
+#include <boost/container/static_vector.hpp>
+
+/**
+ * Protocol V2 Frame Structures
+ *
+ * Documentation in: doc/dev/msgr2.rst
+ **/
+
+namespace ceph::msgr::v2 {
+
+// We require these features from any peer, period, in order to encode
+// an entity_addrvec_t.
+const uint64_t msgr2_required = CEPH_FEATUREMASK_MSG_ADDR2;
+
+// We additionally assume the peer has the below features *purely for
+// the purpose of encoding the frames themselves*. The only complex
+// types in the frames are entity_addr_t and entity_addrvec_t, and we
+// specifically want the peer to understand the (new in nautilus)
+// TYPE_ANY. We narrow this assumption to frames because we
+// expect there may be future clients (the kernel) that understand
+// msgr v2 and understand this encoding but don't necessarily have
+// everything else that SERVER_NAUTILUS implies. Yes, a fresh feature
+// bit would be a cleaner approach, but those are scarce these days.
+const uint64_t msgr2_frame_assumed =
+ msgr2_required |
+ CEPH_FEATUREMASK_SERVER_NAUTILUS;
+
+// Frame type identifier; stored as a single byte in preamble_block_t::tag.
+// The numeric values are spelled out because they travel on the wire.
+enum class Tag : __u8 {
+  HELLO = 1,
+  AUTH_REQUEST = 2,
+  AUTH_BAD_METHOD = 3,
+  AUTH_REPLY_MORE = 4,
+  AUTH_REQUEST_MORE = 5,
+  AUTH_DONE = 6,
+  AUTH_SIGNATURE = 7,
+  CLIENT_IDENT = 8,
+  SERVER_IDENT = 9,
+  IDENT_MISSING_FEATURES = 10,
+  SESSION_RECONNECT = 11,
+  SESSION_RESET = 12,
+  SESSION_RETRY = 13,
+  SESSION_RETRY_GLOBAL = 14,
+  SESSION_RECONNECT_OK = 15,
+  WAIT = 16,
+  MESSAGE = 17,
+  KEEPALIVE2 = 18,
+  KEEPALIVE2_ACK = 19,
+  ACK = 20
+};
+
+// Per-segment entry of a preamble block: length and requested alignment of
+// one segment. Packed, since it is read from and written to raw preamble
+// bytes (see preamble_block_t).
+struct segment_t {
+ // TODO: this will be dropped with support for `allocation policies`.
+ // We need them because of the rx_buffers zero-copy optimization.
+ static constexpr __le16 PAGE_SIZE_ALIGNMENT{4096};
+
+ // Alignment used by control frames for their single payload segment.
+ static constexpr __le16 DEFAULT_ALIGNMENT = sizeof(void *);
+
+ // Logical (unpadded) length of the segment.
+ ceph_le32 length;
+ // Alignment recorded for the segment.
+ ceph_le16 alignment;
+} __attribute__((packed));
+
+// Symbolic names for segment positions within a frame.
+struct SegmentIndex {
+  // Positions of the four segments of a message frame.
+  struct Msg {
+    static constexpr std::size_t HEADER{0};
+    static constexpr std::size_t FRONT{1};
+    static constexpr std::size_t MIDDLE{2};
+    static constexpr std::size_t DATA{3};
+  };
+
+  // A control frame has a single payload segment.
+  struct Control {
+    static constexpr std::size_t PAYLOAD{0};
+  };
+};
+
+// Granularity, in bytes, to which segments are padded in secure mode; the
+// secure epilogue blocks and the preamble block are sized as multiples of it.
+static constexpr uint8_t CRYPTO_BLOCK_SIZE { 16 };
+
+// Upper bound on the number of segments described by one preamble block.
+static constexpr std::size_t MAX_NUM_SEGMENTS = 4;
+
+// V2 preamble consists of one or more preamble blocks depending on
+// the number of segments a particular frame needs. Each block holds
+// up to MAX_NUM_SEGMENTS segments and has its own CRC.
+//
+// XXX: currently the multi-segment facility is NOT implemented.
+struct preamble_block_t {
+ // Tag. For multi-segmented frames the value is the same
+ // between subsequent preamble blocks.
+ __u8 tag;
+
+ // Number of segments to go in entire frame. The first preamble block
+ // sets this to just #segments, the second to #segments - MAX_NUM_SEGMENTS,
+ // and so on.
+ // NOTE(review): the original comment gave "#segments - MAX_NUM_SEGMENTS"
+ // for the third block as well; presumably 2 * MAX_NUM_SEGMENTS was meant.
+ // Moot for now, see the XXX above.
+ __u8 num_segments;
+
+ // Length and alignment of each segment; only the first num_segments
+ // entries are meaningful.
+ std::array<segment_t, MAX_NUM_SEGMENTS> segments;
+ __u8 _reserved[2];
+
+ // CRC32 for this single preamble block.
+ ceph_le32 crc;
+} __attribute__((packed));
+// The block is encrypted as is in secure mode, hence the size constraint.
+static_assert(sizeof(preamble_block_t) % CRYPTO_BLOCK_SIZE == 0);
+static_assert(std::is_standard_layout<preamble_block_t>::value);
+
+// Epilogue of a msgr2.0 crc-mode frame: late_flags plus one crc per segment
+// (slot i belongs to segment i).
+struct epilogue_crc_rev0_block_t {
+ __u8 late_flags; // FRAME_LATE_FLAG_ABORTED
+ std::array<ceph_le32, MAX_NUM_SEGMENTS> crc_values;
+} __attribute__((packed));
+static_assert(std::is_standard_layout_v<epilogue_crc_rev0_block_t>);
+
+// Epilogue of a msgr2.1 crc-mode frame: late_status plus the crcs of the
+// segments after the first (slot i - 1 belongs to segment i); the first
+// segment carries its own crc right after its payload.
+struct epilogue_crc_rev1_block_t {
+ __u8 late_status; // FRAME_LATE_STATUS_*
+ ceph_le32 crc_values[MAX_NUM_SEGMENTS - 1];
+} __attribute__((packed));
+static_assert(std::is_standard_layout_v<epilogue_crc_rev1_block_t>);
+
+// Epilogue of a msgr2.0 secure-mode frame: late_flags padded out to one
+// full CRYPTO_BLOCK_SIZE block.
+struct epilogue_secure_rev0_block_t {
+ __u8 late_flags; // FRAME_LATE_FLAG_ABORTED
+ __u8 padding[CRYPTO_BLOCK_SIZE - sizeof(late_flags)];
+} __attribute__((packed));
+static_assert(sizeof(epilogue_secure_rev0_block_t) % CRYPTO_BLOCK_SIZE == 0);
+static_assert(std::is_standard_layout_v<epilogue_secure_rev0_block_t>);
+
+// Epilogue of a msgr2.1 secure-mode frame: same layout as
+// epilogue_secure_rev0_block_t, with late_flags changed to late_status.
+struct epilogue_secure_rev1_block_t {
+ __u8 late_status; // FRAME_LATE_STATUS_*
+ __u8 padding[CRYPTO_BLOCK_SIZE - sizeof(late_status)];
+} __attribute__((packed));
+static_assert(sizeof(epilogue_secure_rev1_block_t) % CRYPTO_BLOCK_SIZE == 0);
+static_assert(std::is_standard_layout_v<epilogue_secure_rev1_block_t>);
+
+// Size of the crc that follows a non-empty first segment in msgr2.1 crc mode.
+static constexpr uint32_t FRAME_CRC_SIZE = 4;
+// Size of the preamble inline buffer that carries the beginning of the
+// first segment in msgr2.1 secure mode.
+static constexpr uint32_t FRAME_PREAMBLE_INLINE_SIZE = 48;
+static_assert(FRAME_PREAMBLE_INLINE_SIZE % CRYPTO_BLOCK_SIZE == 0);
+// just for performance, nothing should break otherwise
+static_assert(sizeof(ceph_msg_header2) <= FRAME_PREAMBLE_INLINE_SIZE);
+// Preamble block plus inline buffer, without the auth tag.
+static constexpr uint32_t FRAME_PREAMBLE_WITH_INLINE_SIZE =
+ sizeof(preamble_block_t) + FRAME_PREAMBLE_INLINE_SIZE;
+
+// A frame can be aborted by the sender after transmitting the
+// preamble and the first segment. The remainder of the frame
+// is filled with zeros, up until the epilogue.
+//
+// This flag is for msgr2.0. Note that in crc mode, late_flags
+// is not covered by any crc -- a single bit flip can result in
+// a completed frame being dropped or in an aborted frame with
+// garbage segment payloads being dispatched.
+#define FRAME_LATE_FLAG_ABORTED (1<<0)
+
+// For msgr2.1, FRAME_LATE_STATUS_ABORTED has the same meaning
+// as FRAME_LATE_FLAG_ABORTED and late_status replaces late_flags.
+// Bit error detection in crc mode is achieved by using a 4-bit
+// nibble per flag with two code words that are far apart in terms
+// of Hamming Distance (HD=4, same as provided by CRC32-C for
+// input lengths over ~5K).
+#define FRAME_LATE_STATUS_ABORTED 0x1
+#define FRAME_LATE_STATUS_COMPLETE 0xe
+#define FRAME_LATE_STATUS_ABORTED_MASK 0xf
+
+// Code words and mask for the upper nibble of late_status.
+// NOTE(review): presumably reserved for a future flag -- the late_status
+// check in frames_v2.cc only looks at the FRAME_LATE_STATUS_ABORTED_MASK
+// nibble; confirm nothing else consumes these.
+#define FRAME_LATE_STATUS_RESERVED_TRUE 0x10
+#define FRAME_LATE_STATUS_RESERVED_FALSE 0xe0
+#define FRAME_LATE_STATUS_RESERVED_MASK 0xf0
+
+// Exception thrown when a received frame fails validation (bad crc, bad
+// segment count, bad late_status, ...). Constructed like std::runtime_error.
+struct FrameError : public std::runtime_error {
+  using std::runtime_error::runtime_error;
+};
+
+// Assembles outgoing frames and disassembles incoming ones for both
+// revisions of the protocol (msgr2.0 and msgr2.1) and both modes (crc and
+// secure). The mode is derived from the crypto handlers passed in: secure
+// mode is used whenever crypto->rx is set.
+//
+// The assembler keeps the segment descriptors (logical length, alignment)
+// of the frame it last assembled or whose preamble it last disassembled;
+// the get_*() accessors and the disassemble_*() calls refer to that frame.
+class FrameAssembler {
+public:
+  // crypto must be non-null
+  FrameAssembler(const ceph::crypto::onwire::rxtx_t* crypto, bool is_rev1)
+    : m_crypto(crypto), m_is_rev1(is_rev1) {}
+
+  // Switches the protocol revision and forgets the current frame's segment
+  // descriptors.
+  void set_is_rev1(bool is_rev1) {
+    m_descs.clear();
+    m_is_rev1 = is_rev1;
+  }
+
+  // True for msgr2.1. Const-qualified so that it can also be queried
+  // through a const FrameAssembler.
+  bool get_is_rev1() const {
+    return m_is_rev1;
+  }
+
+  // Number of segments of the current frame; a frame must have been
+  // described first.
+  size_t get_num_segments() const {
+    ceph_assert(!m_descs.empty());
+    return m_descs.size();
+  }
+
+  // Logical (unpadded) length of the given segment.
+  uint32_t get_segment_logical_len(size_t seg_idx) const {
+    ceph_assert(seg_idx < m_descs.size());
+    return m_descs[seg_idx].logical_len;
+  }
+
+  // Alignment recorded for the given segment.
+  uint16_t get_segment_align(size_t seg_idx) const {
+    ceph_assert(seg_idx < m_descs.size());
+    return m_descs[seg_idx].align;
+  }
+
+  // Preamble:
+  //
+  //   preamble_block_t
+  //   [preamble inline buffer + auth tag -- only in msgr2.1 secure mode]
+  //
+  // The preamble is generated unconditionally.
+  //
+  // In msgr2.1 secure mode, the first segment is inlined into the
+  // preamble inline buffer, either fully or partially.
+  uint32_t get_preamble_onwire_len() const {
+    if (m_is_rev1 && m_crypto->rx) {
+      return FRAME_PREAMBLE_WITH_INLINE_SIZE + get_auth_tag_len();
+    }
+    return sizeof(preamble_block_t);
+  }
+
+  // Segment:
+  //
+  //   segment payload
+  //   [zero padding -- only in secure mode]
+  //   [crc or auth tag -- only in msgr2.1, only for the first segment]
+  //
+  // For an empty segment, nothing is generated.  In msgr2.1 secure
+  // mode, if the first segment gets fully inlined into the preamble
+  // inline buffer, it is considered empty.
+  uint32_t get_segment_onwire_len(size_t seg_idx) const {
+    ceph_assert(seg_idx < m_descs.size());
+    if (m_crypto->rx) {
+      uint32_t padded_len = get_segment_padded_len(seg_idx);
+      if (m_is_rev1 && seg_idx == 0) {
+        if (padded_len > FRAME_PREAMBLE_INLINE_SIZE) {
+          return padded_len + get_auth_tag_len() - FRAME_PREAMBLE_INLINE_SIZE;
+        }
+        return 0;
+      }
+      return padded_len;
+    }
+    if (m_is_rev1 && seg_idx == 0 && m_descs[0].logical_len > 0) {
+      return m_descs[0].logical_len + FRAME_CRC_SIZE;
+    }
+    return m_descs[seg_idx].logical_len;
+  }
+
+  // Epilogue:
+  //
+  //   epilogue_*_block_t
+  //   [auth tag -- only in secure mode]
+  //
+  // For msgr2.0, the epilogue is generated unconditionally.  In
+  // crc mode, it stores crcs for all segments; the preamble is
+  // covered by its own crc.  In secure mode, the epilogue auth tag
+  // covers the whole frame.
+  //
+  // For msgr2.1, the epilogue is generated only if the frame has
+  // more than one segment (i.e. at least one of second to fourth
+  // segments is not empty).  In crc mode, it stores crcs for
+  // second to fourth segments; the preamble and the first segment
+  // are covered by their own crcs.  In secure mode, the epilogue
+  // auth tag covers second to fourth segments; the preamble and the
+  // first segment (if not fully inlined into the preamble inline
+  // buffer) are covered by their own auth tags.
+  //
+  // Note that the auth tag format is an implementation detail of a
+  // particular cipher.  FrameAssembler is concerned only with where
+  // the auth tag is placed (at the end of the ciphertext) and how
+  // long it is (RxHandler::get_extra_size_at_final()).  This is to
+  // provide room for other encryption algorithms: currently we use
+  // AES-128-GCM with 16-byte tags, but it is possible to switch to
+  // e.g. AES-128-CBC + HMAC-SHA512 without affecting the protocol
+  // (except for the cipher negotiation, of course).
+  //
+  // Additionally, each variant of the epilogue contains either
+  // late_flags or late_status field that directs handling of frames
+  // with more than one segment.
+  uint32_t get_epilogue_onwire_len() const {
+    ceph_assert(!m_descs.empty());
+    if (m_is_rev1 && m_descs.size() == 1) {
+      return 0;
+    }
+    if (m_crypto->rx) {
+      return (m_is_rev1 ? sizeof(epilogue_secure_rev1_block_t) :
+                  sizeof(epilogue_secure_rev0_block_t)) + get_auth_tag_len();
+    }
+    return m_is_rev1 ? sizeof(epilogue_crc_rev1_block_t) :
+                       sizeof(epilogue_crc_rev0_block_t);
+  }
+
+  // Sum of the logical segment lengths / total on-wire size of the
+  // current frame.
+  uint64_t get_frame_logical_len() const;
+  uint64_t get_frame_onwire_len() const;
+
+  // Builds the on-wire representation of a frame from its segments.
+  // segment_bls is modified in the process (padded and/or consumed).
+  bufferlist assemble_frame(Tag tag, bufferlist segment_bls[],
+                            const uint16_t segment_aligns[],
+                            size_t segment_count);
+
+  // Validates a received preamble (decrypting it first in secure mode),
+  // records its segment descriptors and returns the frame tag.  Throws
+  // FrameError on a malformed preamble.
+  Tag disassemble_preamble(bufferlist& preamble_bl);
+
+  // Like msgr1, and unlike msgr2.0, msgr2.1 allows interpreting the
+  // first segment before reading in the rest of the frame.
+  //
+  // For msgr2.1 (set_is_rev1(true)), you may:
+  //
+  // - read in the first segment
+  // - call disassemble_first_segment()
+  // - use the contents of the first segment, for example to
+  //   look up user-provided buffers based on ceph_msg_header2::tid
+  // - read in the remaining segments, possibly directly into
+  //   user-provided buffers
+  // - read in epilogue
+  // - call disassemble_remaining_segments()
+  //
+  // For msgr2.0 (set_is_rev1(false)), disassemble_first_segment() is
+  // a noop.  To accommodate, disassemble_remaining_segments() always
+  // takes all segments and skips over the first segment in msgr2.1
+  // case.  You must:
+  //
+  // - read in all segments
+  // - read in epilogue
+  // - call disassemble_remaining_segments()
+  //
+  // disassemble_remaining_segments() returns true if the frame is
+  // ready for dispatching, or false if it was aborted by the sender
+  // and must be dropped.
+  void disassemble_first_segment(bufferlist& preamble_bl,
+                                 bufferlist& segment_bl) const;
+  bool disassemble_remaining_segments(bufferlist segment_bls[],
+                                      bufferlist& epilogue_bl) const;
+
+private:
+  // What the assembler remembers about one segment of the current frame.
+  struct segment_desc_t {
+    uint32_t logical_len;
+    uint16_t align;
+  };
+
+  // Logical length of the segment rounded up to CRYPTO_BLOCK_SIZE.
+  uint32_t get_segment_padded_len(size_t seg_idx) const {
+    return p2roundup<uint32_t>(m_descs[seg_idx].logical_len,
+                               CRYPTO_BLOCK_SIZE);
+  }
+
+  // Length of the auth tag appended at the end of a ciphertext.  Only
+  // meaningful in secure mode: m_crypto->rx is dereferenced unchecked.
+  uint32_t get_auth_tag_len() const {
+    return m_crypto->rx->get_extra_size_at_final();
+  }
+
+  // Mode-specific assemblers used by assemble_frame().
+  bufferlist asm_crc_rev0(const preamble_block_t& preamble,
+                          bufferlist segment_bls[]) const;
+  bufferlist asm_secure_rev0(const preamble_block_t& preamble,
+                             bufferlist segment_bls[]) const;
+  bufferlist asm_crc_rev1(const preamble_block_t& preamble,
+                          bufferlist segment_bls[]) const;
+  bufferlist asm_secure_rev1(const preamble_block_t& preamble,
+                             bufferlist segment_bls[]) const;
+
+  // Mode-specific disassemblers used by disassemble_first_segment() and
+  // disassemble_remaining_segments().
+  bool disasm_all_crc_rev0(bufferlist segment_bls[],
+                           bufferlist& epilogue_bl) const;
+  bool disasm_all_secure_rev0(bufferlist segment_bls[],
+                              bufferlist& epilogue_bl) const;
+  void disasm_first_crc_rev1(bufferlist& preamble_bl,
+                             bufferlist& segment_bl) const;
+  bool disasm_remaining_crc_rev1(bufferlist segment_bls[],
+                                 bufferlist& epilogue_bl) const;
+  void disasm_first_secure_rev1(bufferlist& preamble_bl,
+                                bufferlist& segment_bl) const;
+  bool disasm_remaining_secure_rev1(bufferlist segment_bls[],
+                                    bufferlist& epilogue_bl) const;
+
+  void fill_preamble(Tag tag, preamble_block_t& preamble) const;
+  friend std::ostream& operator<<(std::ostream& os,
+                                  const FrameAssembler& frame_asm);
+
+  boost::container::static_vector<segment_desc_t, MAX_NUM_SEGMENTS> m_descs;
+  const ceph::crypto::onwire::rxtx_t* m_crypto;
+  bool m_is_rev1;  // msgr2.1?
+};
+
+// Common base of all frame types. T is the concrete frame class (it must
+// provide T::tag); the template arguments after it give the alignment of
+// each segment and thereby also the number of segments.
+template <class T, uint16_t... SegmentAlignmentVs>
+struct Frame {
+  static constexpr size_t SegmentsNumV = sizeof...(SegmentAlignmentVs);
+  static_assert(SegmentsNumV > 0);
+  static_assert(SegmentsNumV <= MAX_NUM_SEGMENTS);
+protected:
+  // Payload of each segment, filled in by the derived frame class.
+  std::array<ceph::bufferlist, SegmentsNumV> segments;
+
+private:
+  static constexpr std::array<uint16_t, SegmentsNumV> alignments {
+    SegmentAlignmentVs...
+  };
+
+public:
+  // Assembles the frame for sending with the given assembler and checks
+  // that the result has the on-wire length the assembler reports.
+  ceph::bufferlist get_buffer(FrameAssembler& tx_frame_asm) {
+    ceph::bufferlist bl = tx_frame_asm.assemble_frame(
+        T::tag, segments.data(), alignments.data(), SegmentsNumV);
+    ceph_assert(bl.length() == tx_frame_asm.get_frame_onwire_len());
+    return bl;
+  }
+};
+
+// ControlFrames are used to manage transceiver state (like connections) and
+// orchestrate transfers of MessageFrames. They use a single segment with
+// marshalling facilities: a derived class C lists its field types in Args,
+// and ControlFrame supplies the shared encode/decode machinery on top of
+// that list.
+template <class C, typename... Args>
+class ControlFrame : public Frame<C, segment_t::DEFAULT_ALIGNMENT /* single segment */> {
+protected:
+  ceph::bufferlist &get_payload_segment() {
+    return this->segments[SegmentIndex::Control::PAYLOAD];
+  }
+
+  // this tuple is only used when decoding values from a payload segment
+  std::tuple<Args...> _values;
+
+  // FIXME: for now, we assume specific features for the purposes of encoding
+  // the frames themselves (*not* messages in message frames!).
+  uint64_t features = msgr2_frame_assumed;
+
+  // Encodes one field into the payload segment. A vector of uint32_t is
+  // written as a 32-bit element count followed by the elements.
+  template <typename T>
+  void _encode_payload_each(T &t) {
+    auto &payload = this->get_payload_segment();
+    if constexpr (std::is_same_v<T, std::vector<uint32_t> const>) {
+      encode(static_cast<uint32_t>(t.size()), payload, features);
+      for (const auto &elem : t) {
+        encode(elem, payload, features);
+      }
+    } else {
+      encode(t, payload, features);
+    }
+  }
+
+  // Decodes one field from the iterator; mirrors _encode_payload_each().
+  template <typename T>
+  void _decode_payload_each(T &t, bufferlist::const_iterator &ti) const {
+    if constexpr (std::is_same_v<T, std::vector<uint32_t>>) {
+      uint32_t size;
+      decode(size, ti);
+      t.resize(size);
+      for (auto &elem : t) {
+        decode(elem, ti);
+      }
+    } else {
+      decode(t, ti);
+    }
+  }
+
+  // Decodes all fields, in declaration order, into _values. The method is
+  // const, so constness of the tuple elements is cast away here.
+  template <std::size_t... Is>
+  void _decode_payload(bufferlist::const_iterator &ti,
+                       std::index_sequence<Is...>) const {
+    (_decode_payload_each(const_cast<Args &>(std::get<Is>(_values)), ti), ...);
+  }
+
+  // Access to the N-th decoded field.
+  template <std::size_t N>
+  decltype(auto) get_val() {
+    return std::get<N>(_values);
+  }
+
+  ControlFrame()
+    : Frame<C, segment_t::DEFAULT_ALIGNMENT /* single segment */>() {
+  }
+
+  void _encode(const Args &... args) {
+    (_encode_payload_each(args), ...);
+  }
+
+  void _decode(const ceph::bufferlist &bl) {
+    auto ti = bl.cbegin();
+    _decode_payload(ti, std::index_sequence_for<Args...>());
+  }
+
+public:
+  // Creates a frame whose payload segment holds the encoded args.
+  static C Encode(const Args &... args) {
+    C frame;
+    frame._encode(args...);
+    return frame;
+  }
+
+  // Creates a frame whose fields are decoded from the given payload.
+  static C Decode(const ceph::bufferlist &payload) {
+    C frame;
+    frame._decode(payload);
+    return frame;
+  }
+};
+
+// Control frame tagged Tag::HELLO. Payload: entity type, peer address.
+// The accessors return references into the tuple filled by Decode().
+struct HelloFrame : public ControlFrame<HelloFrame,
+ uint8_t, // entity type
+ entity_addr_t> { // peer address
+ static const Tag tag = Tag::HELLO;
+ using ControlFrame::Encode;
+ using ControlFrame::Decode;
+
+ inline uint8_t &entity_type() { return get_val<0>(); }
+ inline entity_addr_t &peer_addr() { return get_val<1>(); }
+
+protected:
+ using ControlFrame::ControlFrame;
+};
+
+// Control frame tagged Tag::AUTH_REQUEST. Payload: auth method, list of
+// preferred modes, opaque auth payload. The accessors return references into
+// the tuple filled by Decode().
+//
+// std::vector is spelled with its namespace (as AuthBadMethodFrame does) so
+// this header does not depend on a using-directive being in effect.
+struct AuthRequestFrame : public ControlFrame<AuthRequestFrame,
+                                              uint32_t, // auth method
+                                              std::vector<uint32_t>, // preferred modes
+                                              bufferlist> { // auth payload
+  static const Tag tag = Tag::AUTH_REQUEST;
+  using ControlFrame::Encode;
+  using ControlFrame::Decode;
+
+  inline uint32_t &method() { return get_val<0>(); }
+  inline std::vector<uint32_t> &preferred_modes() { return get_val<1>(); }
+  inline bufferlist &auth_payload() { return get_val<2>(); }
+
+protected:
+  using ControlFrame::ControlFrame;
+};
+
+// Control frame tagged Tag::AUTH_BAD_METHOD. Payload: method, result code,
+// allowed methods, allowed modes. The accessors return references into the
+// tuple filled by Decode().
+struct AuthBadMethodFrame : public ControlFrame<AuthBadMethodFrame,
+ uint32_t, // method
+ int32_t, // result
+ std::vector<uint32_t>, // allowed methods
+ std::vector<uint32_t>> { // allowed modes
+ static const Tag tag = Tag::AUTH_BAD_METHOD;
+ using ControlFrame::Encode;
+ using ControlFrame::Decode;
+
+ inline uint32_t &method() { return get_val<0>(); }
+ inline int32_t &result() { return get_val<1>(); }
+ inline std::vector<uint32_t> &allowed_methods() { return get_val<2>(); }
+ inline std::vector<uint32_t> &allowed_modes() { return get_val<3>(); }
+
+protected:
+ using ControlFrame::ControlFrame;
+};
+
+// Control frame tagged Tag::AUTH_REPLY_MORE. Payload: a single opaque auth
+// bufferlist, readable through auth_payload() after Decode().
+struct AuthReplyMoreFrame : public ControlFrame<AuthReplyMoreFrame,
+ bufferlist> { // auth payload
+ static const Tag tag = Tag::AUTH_REPLY_MORE;
+ using ControlFrame::Encode;
+ using ControlFrame::Decode;
+
+ inline bufferlist &auth_payload() { return get_val<0>(); }
+
+protected:
+ using ControlFrame::ControlFrame;
+};
+
+// Control frame tagged Tag::AUTH_REQUEST_MORE. Payload: a single opaque auth
+// bufferlist, readable through auth_payload() after Decode().
+struct AuthRequestMoreFrame : public ControlFrame<AuthRequestMoreFrame,
+ bufferlist> { // auth payload
+ static const Tag tag = Tag::AUTH_REQUEST_MORE;
+ using ControlFrame::Encode;
+ using ControlFrame::Decode;
+
+ inline bufferlist &auth_payload() { return get_val<0>(); }
+
+protected:
+ using ControlFrame::ControlFrame;
+};
+
+// Control frame tagged Tag::AUTH_DONE. Payload: global id, connection mode,
+// auth method payload. The accessors return references into the tuple filled
+// by Decode().
+struct AuthDoneFrame : public ControlFrame<AuthDoneFrame,
+ uint64_t, // global id
+ uint32_t, // connection mode
+ bufferlist> { // auth method payload
+ static const Tag tag = Tag::AUTH_DONE;
+ using ControlFrame::Encode;
+ using ControlFrame::Decode;
+
+ inline uint64_t &global_id() { return get_val<0>(); }
+ inline uint32_t &con_mode() { return get_val<1>(); }
+ inline bufferlist &auth_payload() { return get_val<2>(); }
+
+protected:
+ using ControlFrame::ControlFrame;
+};
+
+// Control frame tagged Tag::AUTH_SIGNATURE. Payload: one sha256_digest_t,
+// readable through signature() after Decode().
+struct AuthSignatureFrame
+ : public ControlFrame<AuthSignatureFrame,
+ sha256_digest_t> {
+ static const Tag tag = Tag::AUTH_SIGNATURE;
+ using ControlFrame::Encode;
+ using ControlFrame::Decode;
+
+ inline sha256_digest_t &signature() { return get_val<0>(); }
+
+protected:
+ using ControlFrame::ControlFrame;
+};
+
+// Control frame tagged Tag::CLIENT_IDENT. Payload, in wire order: sender
+// addresses, target address, global id, global seq, supported features,
+// required features, flags, client cookie. The accessors return references
+// into the tuple filled by Decode(); their indices follow that same order.
+struct ClientIdentFrame
+ : public ControlFrame<ClientIdentFrame,
+ entity_addrvec_t, // my addresses
+ entity_addr_t, // target address
+ int64_t, // global_id
+ uint64_t, // global seq
+ uint64_t, // supported features
+ uint64_t, // required features
+ uint64_t, // flags
+ uint64_t> { // client cookie
+ static const Tag tag = Tag::CLIENT_IDENT;
+ using ControlFrame::Encode;
+ using ControlFrame::Decode;
+
+ inline entity_addrvec_t &addrs() { return get_val<0>(); }
+ inline entity_addr_t &target_addr() { return get_val<1>(); }
+ inline int64_t &gid() { return get_val<2>(); }
+ inline uint64_t &global_seq() { return get_val<3>(); }
+ inline uint64_t &supported_features() { return get_val<4>(); }
+ inline uint64_t &required_features() { return get_val<5>(); }
+ inline uint64_t &flags() { return get_val<6>(); }
+ inline uint64_t &cookie() { return get_val<7>(); }
+
+protected:
+ using ControlFrame::ControlFrame;
+};
+
+// Control frame tagged Tag::SERVER_IDENT. Payload, in wire order: sender
+// addresses, global id, global seq, supported features, required features,
+// flags, server cookie. The accessors return references into the tuple
+// filled by Decode(); their indices follow that same order.
+struct ServerIdentFrame
+ : public ControlFrame<ServerIdentFrame,
+ entity_addrvec_t, // my addresses
+ int64_t, // global_id
+ uint64_t, // global seq
+ uint64_t, // supported features
+ uint64_t, // required features
+ uint64_t, // flags
+ uint64_t> { // server cookie
+ static const Tag tag = Tag::SERVER_IDENT;
+ using ControlFrame::Encode;
+ using ControlFrame::Decode;
+
+ inline entity_addrvec_t &addrs() { return get_val<0>(); }
+ inline int64_t &gid() { return get_val<1>(); }
+ inline uint64_t &global_seq() { return get_val<2>(); }
+ inline uint64_t &supported_features() { return get_val<3>(); }
+ inline uint64_t &required_features() { return get_val<4>(); }
+ inline uint64_t &flags() { return get_val<5>(); }
+ inline uint64_t &cookie() { return get_val<6>(); }
+
+protected:
+ using ControlFrame::ControlFrame;
+};
+
+// Control frame tagged Tag::SESSION_RECONNECT. Payload, in wire order:
+// sender addresses, client cookie, server cookie, global sequence, connect
+// sequence, message sequence. The accessors return references into the tuple
+// filled by Decode().
+struct ReconnectFrame
+ : public ControlFrame<ReconnectFrame,
+ entity_addrvec_t, // my addresses
+ uint64_t, // client cookie
+ uint64_t, // server cookie
+ uint64_t, // global sequence
+ uint64_t, // connect sequence
+ uint64_t> { // message sequence
+ static const Tag tag = Tag::SESSION_RECONNECT;
+ using ControlFrame::Encode;
+ using ControlFrame::Decode;
+
+ inline entity_addrvec_t &addrs() { return get_val<0>(); }
+ inline uint64_t &client_cookie() { return get_val<1>(); }
+ inline uint64_t &server_cookie() { return get_val<2>(); }
+ inline uint64_t &global_seq() { return get_val<3>(); }
+ inline uint64_t &connect_seq() { return get_val<4>(); }
+ inline uint64_t &msg_seq() { return get_val<5>(); }
+
+protected:
+ using ControlFrame::ControlFrame;
+};
+
+// Control frame tagged Tag::SESSION_RESET. Payload: one bool ("full reset"),
+// readable through full() after Decode().
+struct ResetFrame : public ControlFrame<ResetFrame,
+ bool> { // full reset
+ static const Tag tag = Tag::SESSION_RESET;
+ using ControlFrame::Encode;
+ using ControlFrame::Decode;
+
+ inline bool &full() { return get_val<0>(); }
+
+protected:
+ using ControlFrame::ControlFrame;
+};
+
+// Control frame tagged Tag::SESSION_RETRY. Payload: one uint64_t connection
+// sequence, readable through connect_seq() after Decode().
+struct RetryFrame : public ControlFrame<RetryFrame,
+ uint64_t> { // connection seq
+ static const Tag tag = Tag::SESSION_RETRY;
+ using ControlFrame::Encode;
+ using ControlFrame::Decode;
+
+ inline uint64_t &connect_seq() { return get_val<0>(); }
+
+protected:
+ using ControlFrame::ControlFrame;
+};
+
+// Control frame tagged Tag::SESSION_RETRY_GLOBAL. Payload: one uint64_t
+// global sequence, readable through global_seq() after Decode().
+struct RetryGlobalFrame : public ControlFrame<RetryGlobalFrame,
+ uint64_t> { // global seq
+ static const Tag tag = Tag::SESSION_RETRY_GLOBAL;
+ using ControlFrame::Encode;
+ using ControlFrame::Decode;
+
+ inline uint64_t &global_seq() { return get_val<0>(); }
+
+protected:
+ using ControlFrame::ControlFrame;
+};
+
+// Control frame tagged Tag::WAIT. It has an empty Args pack, so nothing is
+// encoded into or decoded from the payload segment.
+struct WaitFrame : public ControlFrame<WaitFrame> {
+ static const Tag tag = Tag::WAIT;
+ using ControlFrame::Encode;
+ using ControlFrame::Decode;
+
+protected:
+ using ControlFrame::ControlFrame;
+};
+
+// Control frame tagged Tag::SESSION_RECONNECT_OK. Payload: one uint64_t
+// message sequence, readable through msg_seq() after Decode().
+struct ReconnectOkFrame : public ControlFrame<ReconnectOkFrame,
+ uint64_t> { // message seq
+ static const Tag tag = Tag::SESSION_RECONNECT_OK;
+ using ControlFrame::Encode;
+ using ControlFrame::Decode;
+
+ inline uint64_t &msg_seq() { return get_val<0>(); }
+
+protected:
+ using ControlFrame::ControlFrame;
+};
+
+// Control frame tagged Tag::IDENT_MISSING_FEATURES. Payload: one uint64_t
+// mask of missing features, readable through features() after Decode().
+// Note: within this struct the features() accessor hides the `features` data
+// member that ControlFrame declares for encoding.
+struct IdentMissingFeaturesFrame
+ : public ControlFrame<IdentMissingFeaturesFrame,
+ uint64_t> { // missing features mask
+ static const Tag tag = Tag::IDENT_MISSING_FEATURES;
+ using ControlFrame::Encode;
+ using ControlFrame::Decode;
+
+ inline uint64_t &features() { return get_val<0>(); }
+
+protected:
+ using ControlFrame::ControlFrame;
+};
+
+// Control frame tagged Tag::KEEPALIVE2. Payload: one utime_t timestamp,
+// readable through timestamp() after Decode().
+struct KeepAliveFrame : public ControlFrame<KeepAliveFrame,
+ utime_t> { // timestamp
+ static const Tag tag = Tag::KEEPALIVE2;
+ using ControlFrame::Encode;
+ using ControlFrame::Decode;
+
+ // Convenience overload: encodes the value of ceph_clock_now() at call time.
+ static KeepAliveFrame Encode() {
+ return KeepAliveFrame::Encode(ceph_clock_now());
+ }
+
+ inline utime_t &timestamp() { return get_val<0>(); }
+
+protected:
+ using ControlFrame::ControlFrame;
+};
+
+// Control frame tagged Tag::KEEPALIVE2_ACK. Payload: one utime_t (the
+// acknowledged timestamp), readable through timestamp() after Decode().
+struct KeepAliveFrameAck : public ControlFrame<KeepAliveFrameAck,
+ utime_t> { // ack timestamp
+ static const Tag tag = Tag::KEEPALIVE2_ACK;
+ using ControlFrame::Encode;
+ using ControlFrame::Decode;
+
+ inline utime_t &timestamp() { return get_val<0>(); }
+
+protected:
+ using ControlFrame::ControlFrame;
+};
+
+// Control frame tagged Tag::ACK. Payload: one uint64_t message sequence,
+// readable through seq() after Decode().
+struct AckFrame : public ControlFrame<AckFrame,
+ uint64_t> { // message sequence
+ static const Tag tag = Tag::ACK;
+ using ControlFrame::Encode;
+ using ControlFrame::Decode;
+
+ inline uint64_t &seq() { return get_val<0>(); }
+
+protected:
+ using ControlFrame::ControlFrame;
+};
+
+using segment_bls_t =
+ boost::container::static_vector<bufferlist, MAX_NUM_SEGMENTS>;
+
+// This class is used for encoding/decoding header of the message frame.
+// Body is processed almost independently with the sole junction point
+// being the `extra_payload_len` passed to get_buffer().
+// NOTE(review): Frame::get_buffer() as declared in this file takes only the
+// FrameAssembler; the `extra_payload_len` reference above looks stale.
+//
+// The frame has four segments (header, front, middle, data); the first three
+// use the default alignment and the last one is page-aligned.
+struct MessageFrame : public Frame<MessageFrame,
+ /* four segments */
+ segment_t::DEFAULT_ALIGNMENT,
+ segment_t::DEFAULT_ALIGNMENT,
+ segment_t::DEFAULT_ALIGNMENT,
+ segment_t::PAGE_SIZE_ALIGNMENT> {
+ static const Tag tag = Tag::MESSAGE;
+
+ // Builds a frame for sending: the header struct is appended byte-for-byte
+ // to the HEADER segment and the three bufferlists are assigned to the
+ // FRONT, MIDDLE and DATA segments.
+ static MessageFrame Encode(const ceph_msg_header2 &msg_header,
+ const ceph::bufferlist &front,
+ const ceph::bufferlist &middle,
+ const ceph::bufferlist &data) {
+ MessageFrame f;
+ f.segments[SegmentIndex::Msg::HEADER].append(
+ reinterpret_cast<const char*>(&msg_header), sizeof(msg_header));
+
+ f.segments[SegmentIndex::Msg::FRONT] = front;
+ f.segments[SegmentIndex::Msg::MIDDLE] = middle;
+ f.segments[SegmentIndex::Msg::DATA] = data;
+
+ return f;
+ }
+
+ // Builds a frame from received segments; recv_segments' bufferlists are
+ // moved from.
+ static MessageFrame Decode(segment_bls_t& recv_segments) {
+ MessageFrame f;
+ // transfer segments' bufferlists. If recv_segments holds fewer than
+ // SegmentsNumV entries, the remaining segments stay default-constructed.
+ for (__u8 idx = 0; idx < std::size(recv_segments); idx++) {
+ f.segments[idx] = std::move(recv_segments[idx]);
+ }
+ return f;
+ }
+
+ // Reinterprets the start of the HEADER segment as a ceph_msg_header2.
+ // NOTE(review): the segment length is not checked here -- confirm callers
+ // validate it before calling.
+ inline const ceph_msg_header2 &header() {
+ auto& hdrbl = segments[SegmentIndex::Msg::HEADER];
+ return reinterpret_cast<const ceph_msg_header2&>(*hdrbl.c_str());
+ }
+
+ ceph::bufferlist &front() {
+ return segments[SegmentIndex::Msg::FRONT];
+ }
+
+ ceph::bufferlist &middle() {
+ return segments[SegmentIndex::Msg::MIDDLE];
+ }
+
+ ceph::bufferlist &data() {
+ return segments[SegmentIndex::Msg::DATA];
+ }
+
+ // lengths of the individual body segments, in bytes
+ uint32_t front_len() const {
+ return segments[SegmentIndex::Msg::FRONT].length();
+ }
+
+ uint32_t middle_len() const {
+ return segments[SegmentIndex::Msg::MIDDLE].length();
+ }
+
+ uint32_t data_len() const {
+ return segments[SegmentIndex::Msg::DATA].length();
+ }
+
+protected:
+ using Frame::Frame;
+};
+
+} // namespace ceph::msgr::v2
+
+#endif // _MSG_ASYNC_FRAMES_V2_
diff --git a/src/msg/async/net_handler.cc b/src/msg/async/net_handler.cc
new file mode 100644
index 00000000..2b4e646d
--- /dev/null
+++ b/src/msg/async/net_handler.cc
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+
+#include "net_handler.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "NetHandler "
+
+namespace ceph{
+
+/**
+ * Create a close-on-exec TCP (SOCK_STREAM) socket.
+ *
+ * @param domain      address family passed to socket_cloexec()
+ * @param reuse_addr  when true, set SO_REUSEADDR (skipped on FreeBSD)
+ * @return the new descriptor on success, -errno on failure; the socket is
+ *         closed again if setting SO_REUSEADDR fails
+ */
+int NetHandler::create_socket(int domain, bool reuse_addr)
+{
+  const int s = socket_cloexec(domain, SOCK_STREAM, 0);
+  if (s == -1) {
+    const int r = errno;
+    lderr(cct) << __func__ << " couldn't create socket " << cpp_strerror(r) << dendl;
+    return -r;
+  }
+
+#if !defined(__FreeBSD__)
+  /* Make sure connection-intensive things like the benchmark
+   * will be able to close/open sockets a zillion of times */
+  if (reuse_addr) {
+    int on = 1;
+    if (::setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) {
+      const int r = errno;
+      // cpp_strerror() rather than strerror(): the latter is not required to
+      // be thread-safe, and the rest of this file already uses the former.
+      lderr(cct) << __func__ << " setsockopt SO_REUSEADDR failed: "
+                 << cpp_strerror(r) << dendl;
+      close(s);
+      return -r;
+    }
+  }
+#endif
+
+  return s;
+}
+
+/**
+ * Switch descriptor `sd` to non-blocking mode by adding O_NONBLOCK to its
+ * file status flags.
+ *
+ * @return 0 on success, -errno if either fcntl() call fails
+ */
+int NetHandler::set_nonblock(int sd)
+{
+  // Note that fcntl(2) for F_GETFL and F_SETFL can't be interrupted
+  // by a signal.
+  const int cur_flags = fcntl(sd, F_GETFL);
+  if (cur_flags < 0) {
+    const int err = errno;
+    lderr(cct) << __func__ << " fcntl(F_GETFL) failed: " << cpp_strerror(err) << dendl;
+    return -err;
+  }
+
+  const int rc = fcntl(sd, F_SETFL, cur_flags | O_NONBLOCK);
+  if (rc < 0) {
+    const int err = errno;
+    lderr(cct) << __func__ << " fcntl(F_SETFL,O_NONBLOCK): " << cpp_strerror(err) << dendl;
+    return -err;
+  }
+
+  return 0;
+}
+
+/**
+ * Apply optional TCP_NODELAY, SO_RCVBUF and (where CEPH_USE_SO_NOSIGPIPE is
+ * defined) SO_NOSIGPIPE to socket `sd`. Each failure is logged and the
+ * remaining options are still attempted.
+ *
+ * @param sd       socket descriptor
+ * @param nodelay  set TCP_NODELAY when true
+ * @param size     receive buffer size for SO_RCVBUF; 0 leaves it untouched
+ * @return -r, where r reflects only the LAST setsockopt() attempted (0 if it
+ *         succeeded, its errno otherwise).
+ *         NOTE(review): an earlier failure is overwritten by a later
+ *         successful call, so it never reaches the caller -- confirm this
+ *         best-effort behaviour is intended.
+ */
+int NetHandler::set_socket_options(int sd, bool nodelay, int size)
+{
+ int r = 0;
+ // disable Nagle algorithm?
+ if (nodelay) {
+ int flag = 1;
+ r = ::setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(flag));
+ if (r < 0) {
+ r = errno;
+ ldout(cct, 0) << "couldn't set TCP_NODELAY: " << cpp_strerror(r) << dendl;
+ }
+ }
+ if (size) {
+ r = ::setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (void*)&size, sizeof(size));
+ if (r < 0) {
+ r = errno;
+ ldout(cct, 0) << "couldn't set SO_RCVBUF to " << size << ": " << cpp_strerror(r) << dendl;
+ }
+ }
+
+ // block SIGPIPE
+#ifdef CEPH_USE_SO_NOSIGPIPE
+ int val = 1;
+ r = ::setsockopt(sd, SOL_SOCKET, SO_NOSIGPIPE, (void*)&val, sizeof(val));
+ if (r) {
+ r = errno;
+ ldout(cct,0) << "couldn't set SO_NOSIGPIPE: " << cpp_strerror(r) << dendl;
+ }
+#endif
+ return -r;
+}
+
+/**
+ * Best-effort socket prioritisation; a no-op when SO_PRIORITY is not
+ * defined or when `prio` is negative.
+ *
+ * Where IPTOS_CLASS_CS6 is defined, first sets that class as IP_TOS
+ * (AF_INET) or IPV6_TCLASS (AF_INET6); any other `domain` is logged and the
+ * function returns without touching SO_PRIORITY. Afterwards SO_PRIORITY is
+ * set to `prio`. Failures are only logged; nothing is reported to the caller.
+ */
+void NetHandler::set_priority(int sd, int prio, int domain)
+{
+#ifdef SO_PRIORITY
+ if (prio < 0) {
+ return;
+ }
+ int r = -1;
+#ifdef IPTOS_CLASS_CS6
+ int iptos = IPTOS_CLASS_CS6;
+ switch (domain) {
+ case AF_INET:
+ r = ::setsockopt(sd, IPPROTO_IP, IP_TOS, &iptos, sizeof(iptos));
+ break;
+ case AF_INET6:
+ r = ::setsockopt(sd, IPPROTO_IPV6, IPV6_TCLASS, &iptos, sizeof(iptos));
+ break;
+ default:
+ lderr(cct) << "couldn't set ToS of unknown family (" << domain << ")"
+ << " to " << iptos << dendl;
+ return;
+ }
+ if (r < 0) {
+ r = errno;
+ ldout(cct,0) << "couldn't set TOS to " << iptos
+ << ": " << cpp_strerror(r) << dendl;
+ }
+
+#endif // IPTOS_CLASS_CS6
+ // setsockopt(IPTOS_CLASS_CS6) sets the priority of the socket as 0.
+ // See http://goo.gl/QWhvsD and http://goo.gl/laTbjT
+ // We need to call setsockopt(SO_PRIORITY) after it.
+ r = ::setsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio));
+ if (r < 0) {
+ r = errno;
+ ldout(cct, 0) << __func__ << " couldn't set SO_PRIORITY to " << prio
+ << ": " << cpp_strerror(r) << dendl;
+ }
+#else
+ return;
+#endif // SO_PRIORITY
+}
+
+/**
+ * Shared implementation of connect() and nonblock_connect().
+ *
+ * Creates a socket for addr's family, optionally makes it non-blocking,
+ * applies the ms_tcp_nodelay / ms_tcp_rcvbuf options (their result is not
+ * checked), optionally binds to bind_addr with port 0 when
+ * ms_bind_before_connect is set and bind_addr is not blank, then connects.
+ *
+ * @return the socket descriptor on success -- including a non-blocking
+ *         connect still in progress (EINPROGRESS) -- or a negative errno,
+ *         in which case the socket has been closed
+ */
+int NetHandler::generic_connect(const entity_addr_t& addr, const entity_addr_t &bind_addr, bool nonblock)
+{
+  const int s = create_socket(addr.get_family());
+  if (s < 0)
+    return s;
+
+  if (nonblock) {
+    const int rc = set_nonblock(s);
+    if (rc < 0) {
+      close(s);
+      return rc;
+    }
+  }
+
+  set_socket_options(s, cct->_conf->ms_tcp_nodelay, cct->_conf->ms_tcp_rcvbuf);
+
+  // work on a copy so the port can be cleared without touching bind_addr
+  entity_addr_t local_addr = bind_addr;
+  if (cct->_conf->ms_bind_before_connect && (!local_addr.is_blank_ip())) {
+    local_addr.set_port(0);
+    if (::bind(s, local_addr.get_sockaddr(), local_addr.get_sockaddr_len()) < 0) {
+      const int err = errno;
+      ldout(cct, 2) << __func__ << " client bind error " << ", " << cpp_strerror(err) << dendl;
+      close(s);
+      return -err;
+    }
+  }
+
+  if (::connect(s, addr.get_sockaddr(), addr.get_sockaddr_len()) < 0) {
+    const int err = errno;
+    // a non-blocking connect that has merely not completed yet is a success
+    if (err == EINPROGRESS && nonblock)
+      return s;
+
+    ldout(cct, 10) << __func__ << " connect: " << cpp_strerror(err) << dendl;
+    close(s);
+    return -err;
+  }
+
+  return s;
+}
+
+/**
+ * Call connect() again on an existing socket, e.g. to learn the outcome of
+ * a non-blocking connect.
+ *
+ * @return 0 if the socket is connected (connect() succeeded or reported
+ *         EISCONN), 1 if the attempt is still pending (EINPROGRESS or
+ *         EALREADY), -errno for any other failure
+ */
+int NetHandler::reconnect(const entity_addr_t &addr, int sd)
+{
+  const int ret = ::connect(sd, addr.get_sockaddr(), addr.get_sockaddr_len());
+  if (ret >= 0)
+    return 0;
+
+  // read errno exactly once, before anything else can overwrite it
+  const int r = errno;
+  if (r == EISCONN)
+    return 0;
+
+  // cpp_strerror() rather than strerror(): the latter is not required to be
+  // thread-safe, and the rest of this file already uses the former.
+  ldout(cct, 10) << __func__ << " reconnect: " << cpp_strerror(r) << dendl;
+  if (r == EINPROGRESS || r == EALREADY)
+    return 1;
+  return -r;
+}
+
+// Connect to `addr` without switching the new socket to non-blocking mode;
+// forwards to generic_connect() and returns its result (descriptor or
+// negative errno).
+int NetHandler::connect(const entity_addr_t &addr, const entity_addr_t& bind_addr)
+{
+ return generic_connect(addr, bind_addr, false);
+}
+
+// Connect to `addr` with the new socket in non-blocking mode; forwards to
+// generic_connect(), which returns the descriptor even when the connect is
+// still in progress, or a negative errno on failure.
+int NetHandler::nonblock_connect(const entity_addr_t &addr, const entity_addr_t& bind_addr)
+{
+ return generic_connect(addr, bind_addr, true);
+}
+
+
+}
diff --git a/src/msg/async/net_handler.h b/src/msg/async/net_handler.h
new file mode 100644
index 00000000..19042377
--- /dev/null
+++ b/src/msg/async/net_handler.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_NET_UTILS_H
+#define CEPH_COMMON_NET_UTILS_H
+#include "common/config.h"
+
+namespace ceph {
+ // Thin helper around the socket calls used by the async messenger. It holds
+ // only the CephContext used for configuration lookups and logging.
+ class NetHandler {
+ // common implementation behind connect() and nonblock_connect()
+ int generic_connect(const entity_addr_t& addr, const entity_addr_t& bind_addr, bool nonblock);
+
+ CephContext *cct;
+ public:
+ // returns a new stream socket descriptor, or a negative errno
+ int create_socket(int domain, bool reuse_addr=false);
+ explicit NetHandler(CephContext *c): cct(c) {}
+ // returns 0 on success, or a negative errno
+ int set_nonblock(int sd);
+ // applies TCP_NODELAY / SO_RCVBUF (size == 0 skips the latter)
+ int set_socket_options(int sd, bool nodelay, int size);
+ // returns a connected socket descriptor, or a negative errno
+ int connect(const entity_addr_t &addr, const entity_addr_t& bind_addr);
+
+ /**
+ * Try to reconnect the socket.
+ *
+ * @return 0 success
+ * > 0 just break, and wait for event
+ * < 0 need to goto fail
+ */
+ int reconnect(const entity_addr_t &addr, int sd);
+ // like connect(), but the socket is non-blocking and the descriptor is
+ // also returned while the connect is still in progress
+ int nonblock_connect(const entity_addr_t &addr, const entity_addr_t& bind_addr);
+ // best effort; failures are only logged
+ void set_priority(int sd, int priority, int domain);
+ };
+}
+
+#endif
diff --git a/src/msg/async/rdma/Infiniband.cc b/src/msg/async/rdma/Infiniband.cc
new file mode 100644
index 00000000..34299975
--- /dev/null
+++ b/src/msg/async/rdma/Infiniband.cc
@@ -0,0 +1,1234 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "Infiniband.h"
+#include "common/errno.h"
+#include "common/debug.h"
+#include "RDMAStack.h"
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "Infiniband "
+
+static const uint32_t MAX_SHARED_RX_SGE_COUNT = 1;
+static const uint32_t MAX_INLINE_DATA = 0;
+static const uint32_t TCP_MSG_LEN = sizeof("0000:00000000:00000000:00000000:00000000000000000000000000000000");
+static const uint32_t CQ_DEPTH = 30000;
+
+/**
+ * Query port `ipn` of the given verbs context and record its attributes,
+ * LID and GID. Any query failure aborts the process.
+ *
+ * With HAVE_IBV_EXP the GID table is searched for the GID configured in
+ * ms_async_rdma_local_gid with RoCE type ms_async_rdma_roce_ver; if that
+ * option does not parse as a GID, index 0 is used. Without HAVE_IBV_EXP GID
+ * index 0 is always used.
+ *
+ * ibv_query_port() reports failure by returning an errno value rather than
+ * -1, so any non-zero result is treated as an error.
+ */
+Port::Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn): ctxt(ictxt), port_num(ipn), port_attr(new ibv_port_attr), gid_idx(0)
+{
+#ifdef HAVE_IBV_EXP
+  union ibv_gid cgid;
+  struct ibv_exp_gid_attr gid_attr;
+  bool malformed = false;
+
+  ldout(cct,1) << __func__ << " using experimental verbs for gid" << dendl;
+  int r = ibv_query_port(ctxt, port_num, port_attr);
+  if (r) {
+    lderr(cct) << __func__ << " query port failed " << cpp_strerror(r) << dendl;
+    ceph_abort();
+  }
+
+  lid = port_attr->lid;
+
+  // search for requested GID in GIDs table
+  ldout(cct, 1) << __func__ << " looking for local GID " << (cct->_conf->ms_async_rdma_local_gid)
+                << " of type " << (cct->_conf->ms_async_rdma_roce_ver) << dendl;
+  r = sscanf(cct->_conf->ms_async_rdma_local_gid.c_str(),
+             "%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx"
+             ":%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx",
+             &cgid.raw[ 0], &cgid.raw[ 1],
+             &cgid.raw[ 2], &cgid.raw[ 3],
+             &cgid.raw[ 4], &cgid.raw[ 5],
+             &cgid.raw[ 6], &cgid.raw[ 7],
+             &cgid.raw[ 8], &cgid.raw[ 9],
+             &cgid.raw[10], &cgid.raw[11],
+             &cgid.raw[12], &cgid.raw[13],
+             &cgid.raw[14], &cgid.raw[15]);
+
+  if (r != 16) {
+    ldout(cct, 1) << __func__ << " malformed or no GID supplied, using GID index 0" << dendl;
+    malformed = true;
+  }
+
+  gid_attr.comp_mask = IBV_EXP_QUERY_GID_ATTR_TYPE;
+
+  for (gid_idx = 0; gid_idx < port_attr->gid_tbl_len; gid_idx++) {
+    r = ibv_query_gid(ctxt, port_num, gid_idx, &gid);
+    if (r) {
+      lderr(cct) << __func__ << " query gid of port " << port_num << " index " << gid_idx << " failed " << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+    r = ibv_exp_query_gid_attr(ctxt, port_num, gid_idx, &gid_attr);
+    if (r) {
+      lderr(cct) << __func__ << " query gid attributes of port " << port_num << " index " << gid_idx << " failed " << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+
+    if (malformed) break; // stay with gid_idx=0
+    if ( (gid_attr.type == cct->_conf->ms_async_rdma_roce_ver) &&
+         (memcmp(&gid, &cgid, 16) == 0) ) {
+      ldout(cct, 1) << __func__ << " found at index " << gid_idx << dendl;
+      break;
+    }
+  }
+
+  // the loop ran off the end of the table without a match
+  if (gid_idx == port_attr->gid_tbl_len) {
+    lderr(cct) << __func__ << " Requested local GID was not found in GID table" << dendl;
+    ceph_abort();
+  }
+#else
+  int r = ibv_query_port(ctxt, port_num, port_attr);
+  if (r) {
+    lderr(cct) << __func__ << " query port failed " << cpp_strerror(r) << dendl;
+    ceph_abort();
+  }
+
+  lid = port_attr->lid;
+  r = ibv_query_gid(ctxt, port_num, 0, &gid);
+  if (r) {
+    lderr(cct) << __func__ << " query gid failed " << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+#endif
+}
+
+
+/**
+ * Wrap an ibv_device: remember its name, obtain a verbs context and query
+ * the device attributes. Any failure aborts the process.
+ *
+ * When ms_async_rdma_cm is set the caller-supplied context `dc` is used,
+ * otherwise the device is opened here with ibv_open_device().
+ *
+ * ibv_query_device() reports failure by returning an errno value rather
+ * than -1, so any non-zero result is treated as an error.
+ */
+Device::Device(CephContext *cct, ibv_device* d, struct ibv_context *dc)
+  : device(d), device_attr(new ibv_device_attr), active_port(nullptr)
+{
+  if (device == NULL) {
+    lderr(cct) << __func__ << " device == NULL" << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+  name = ibv_get_device_name(device);
+  if (cct->_conf->ms_async_rdma_cm) {
+    ctxt = dc;
+  } else {
+    ctxt = ibv_open_device(device);
+  }
+  if (ctxt == NULL) {
+    lderr(cct) << __func__ << " open rdma device failed. " << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+  int r = ibv_query_device(ctxt, device_attr);
+  if (r) {
+    lderr(cct) << __func__ << " failed to query rdma device. " << cpp_strerror(r) << dendl;
+    ceph_abort();
+  }
+}
+
+/**
+ * Select physical port `port_num` (1-based) as the active port, provided it
+ * is in state IBV_PORT_ACTIVE. Ports are probed in ascending order; probed
+ * ports that are not selected are destroyed again. Asserts if no port ends
+ * up selected.
+ */
+void Device::binding_port(CephContext *cct, int port_num) {
+  port_cnt = device_attr->phys_port_cnt;
+  for (uint8_t idx = 0; idx < port_cnt; ++idx) {
+    const int candidate_num = idx + 1;   // verbs port numbers start at 1
+    Port *candidate = new Port(cct, ctxt, candidate_num);
+    const bool wanted = candidate_num == port_num &&
+                        candidate->get_port_attr()->state == IBV_PORT_ACTIVE;
+    if (wanted) {
+      active_port = candidate;
+      ldout(cct, 1) << __func__ << " found active port " << candidate_num << dendl;
+      break;
+    }
+
+    ldout(cct, 10) << __func__ << " port " << candidate_num << " is not what we want. state: " << candidate->get_port_attr()->state << ")"<< dendl;
+    delete candidate;
+  }
+
+  if (active_port == nullptr) {
+    lderr(cct) << __func__ << " port not found" << dendl;
+    ceph_assert(active_port);
+  }
+}
+
+
+// Records the parameters needed to create the queue pair later in init();
+// no verbs object is created here (qp starts out NULL). Aborts if `type` is
+// not RC, UD or RAW_PACKET. initial_psn is set to a random 24-bit value.
+Infiniband::QueuePair::QueuePair(
+ CephContext *c, Infiniband& infiniband, ibv_qp_type type,
+ int port, ibv_srq *srq,
+ Infiniband::CompletionQueue* txcq, Infiniband::CompletionQueue* rxcq,
+ uint32_t tx_queue_len, uint32_t rx_queue_len, struct rdma_cm_id *cid, uint32_t q_key)
+: cct(c), infiniband(infiniband),
+ type(type),
+ ctxt(infiniband.device->ctxt),
+ ib_physical_port(port),
+ pd(infiniband.pd->pd),
+ srq(srq),
+ qp(NULL),
+ cm_id(cid),
+ txcq(txcq),
+ rxcq(rxcq),
+ initial_psn(0),
+ max_send_wr(tx_queue_len),
+ max_recv_wr(rx_queue_len),
+ q_key(q_key),
+ dead(false)
+{
+ initial_psn = lrand48() & 0xffffff;
+ if (type != IBV_QPT_RC && type != IBV_QPT_UD && type != IBV_QPT_RAW_PACKET) {
+ // NOTE(review): no system call precedes this check, so the errno text
+ // appended to the message is probably unrelated to the failure.
+ lderr(cct) << __func__ << " invalid queue pair type" << cpp_strerror(errno) << dendl;
+ ceph_abort();
+ }
+ // NOTE(review): repeats the pd initialisation from the member init list.
+ pd = infiniband.pd->pd;
+}
+
+/**
+ * Create the underlying verbs queue pair.
+ *
+ * Without ms_async_rdma_cm the QP is created with ibv_create_qp() and then
+ * moved from RESET to INIT. With ms_async_rdma_cm it is created through
+ * rdma_create_qp() on cm_id and no state transition is done here.
+ *
+ * @return 0 on success, -1 on failure. After a failed transition to INIT the
+ *         QP has been destroyed and `qp` is reset to nullptr, so no stale
+ *         handle is left behind for later code to use or destroy again.
+ */
+int Infiniband::QueuePair::init()
+{
+  ldout(cct, 20) << __func__ << " started." << dendl;
+  ibv_qp_init_attr qpia;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&qpia, 0, sizeof(qpia));
+  qpia.send_cq = txcq->get_cq();
+  qpia.recv_cq = rxcq->get_cq();
+  if (srq) {
+    qpia.srq = srq;                      // use the same shared receive queue
+  } else {
+    qpia.cap.max_recv_wr = max_recv_wr;
+    qpia.cap.max_recv_sge = 1;
+  }
+  qpia.cap.max_send_wr  = max_send_wr;   // max outstanding send requests
+  qpia.cap.max_send_sge = 1;             // max send scatter-gather elements
+  qpia.cap.max_inline_data = MAX_INLINE_DATA; // max bytes of immediate data on send q
+  qpia.qp_type = type;                   // RC, UC, UD, or XRC
+  qpia.sq_sig_all = 0;                   // only generate CQEs on requested WQEs
+
+  if (!cct->_conf->ms_async_rdma_cm) {
+    qp = ibv_create_qp(pd, &qpia);
+    if (qp == NULL) {
+      // capture errno first: the logging below may overwrite it before the
+      // ENOMEM test is reached
+      const int err = errno;
+      lderr(cct) << __func__ << " failed to create queue pair" << cpp_strerror(err) << dendl;
+      if (err == ENOMEM) {
+        lderr(cct) << __func__ << " try reducing ms_async_rdma_receive_queue_length, "
+                      " ms_async_rdma_send_buffers or"
+                      " ms_async_rdma_buffer_size" << dendl;
+      }
+      return -1;
+    }
+  } else {
+    ceph_assert(cm_id->verbs == pd->context);
+    if (rdma_create_qp(cm_id, pd, &qpia)) {
+      lderr(cct) << __func__ << " failed to create queue pair with rdmacm library"
+                 << cpp_strerror(errno) << dendl;
+      return -1;
+    }
+    qp = cm_id->qp;
+  }
+  ldout(cct, 20) << __func__ << " successfully create queue pair: "
+                 << "qp=" << qp << dendl;
+
+  if (cct->_conf->ms_async_rdma_cm)
+    return 0;
+
+  // move from RESET to INIT state
+  ibv_qp_attr qpa;
+  memset(&qpa, 0, sizeof(qpa));
+  qpa.qp_state   = IBV_QPS_INIT;
+  qpa.pkey_index = 0;
+  qpa.port_num   = (uint8_t)(ib_physical_port);
+  qpa.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE;
+  qpa.qkey       = q_key;
+
+  int mask = IBV_QP_STATE | IBV_QP_PORT;
+  switch (type) {
+    case IBV_QPT_RC:
+      mask |= IBV_QP_ACCESS_FLAGS;
+      mask |= IBV_QP_PKEY_INDEX;
+      break;
+    case IBV_QPT_UD:
+      mask |= IBV_QP_QKEY;
+      mask |= IBV_QP_PKEY_INDEX;
+      break;
+    case IBV_QPT_RAW_PACKET:
+      break;
+    default:
+      ceph_abort();
+  }
+
+  int ret = ibv_modify_qp(qp, &qpa, mask);
+  if (ret) {
+    // ibv_modify_qp() returns the error code itself; report it before the
+    // cleanup call below gets a chance to disturb errno
+    lderr(cct) << __func__ << " failed to transition to INIT state: "
+               << cpp_strerror(ret) << dendl;
+    ibv_destroy_qp(qp);
+    qp = nullptr;   // the handle is gone; do not leave it dangling
+    return -1;
+  }
+  ldout(cct, 20) << __func__ << " successfully change queue pair to INIT:"
+                 << " qp=" << qp << dendl;
+  return 0;
+}
+
+/**
+ * Change the QueuePair into the ERROR state. It is necessary to move the
+ * Queue Pair into the Error state and poll all of the relevant Work
+ * Completions prior to destroying it, because destroying a Queue Pair does
+ * not guarantee that its Work Completions are removed from the CQ. Even if
+ * the Work Completions are already in the CQ, it might not be possible to
+ * retrieve them. If the Queue Pair is associated with an SRQ, it is
+ * recommended to wait for the affiliated event IBV_EVENT_QP_LAST_WQE_REACHED.
+ *
+ * Calling this again after a successful transition is a no-op.
+ *
+ * \return
+ *      a negative error code if the QueuePair can't switch to ERROR
+ *      0 for success.
+ */
+int Infiniband::QueuePair::to_dead()
+{
+  if (dead)
+    return 0;
+  ibv_qp_attr qpa;
+  memset(&qpa, 0, sizeof(qpa));
+  qpa.qp_state = IBV_QPS_ERR;
+
+  int mask = IBV_QP_STATE;
+  int ret = ibv_modify_qp(qp, &qpa, mask);
+  if (ret) {
+    // ibv_modify_qp() hands back the error code directly. Relying on errno
+    // after the log statement could yield a stale value -- possibly 0, which
+    // the caller would read as success.
+    lderr(cct) << __func__ << " failed to transition to ERROR state: "
+               << cpp_strerror(ret) << dendl;
+    return ret > 0 ? -ret : ret;
+  }
+  dead = true;
+  return ret;
+}
+
+/**
+ * Look up the destination QP number of this queue pair via ibv_query_qp().
+ *
+ * @param rqp  receives the number when non-null
+ * @return 0 on success, -1 if the query fails
+ */
+int Infiniband::QueuePair::get_remote_qp_number(uint32_t *rqp) const
+{
+  ibv_qp_attr attr;
+  ibv_qp_init_attr init_attr;
+
+  if (ibv_query_qp(qp, &attr, IBV_QP_DEST_QPN, &init_attr)) {
+    lderr(cct) << __func__ << " failed to query qp: "
+               << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+
+  if (rqp != nullptr)
+    *rqp = attr.dest_qp_num;
+  return 0;
+}
+
+/**
+ * Fetch the remote LID of this QueuePair from its address vector.
+ * LIDs ("local IDs" in infiniband terminology) are short, locally routable
+ * addresses.
+ *
+ * @param lid  receives the destination LID when non-null
+ * @return 0 on success, -1 if the query fails
+ */
+int Infiniband::QueuePair::get_remote_lid(uint16_t *lid) const
+{
+  ibv_qp_attr attr;
+  ibv_qp_init_attr init_attr;
+
+  if (ibv_query_qp(qp, &attr, IBV_QP_AV, &init_attr)) {
+    lderr(cct) << __func__ << " failed to query qp: "
+               << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+
+  if (lid != nullptr)
+    *lid = attr.ah_attr.dlid;
+  return 0;
+}
+
+/**
+ * Query the current state of the QueuePair.
+ *
+ * @return the qp_state value reported by ibv_query_qp(), or -1 if the
+ *         query fails
+ */
+int Infiniband::QueuePair::get_state() const
+{
+  ibv_qp_attr attr;
+  ibv_qp_init_attr init_attr;
+
+  const int rc = ibv_query_qp(qp, &attr, IBV_QP_STATE, &init_attr);
+  if (rc == 0)
+    return attr.qp_state;
+
+  lderr(cct) << __func__ << " failed to get state: "
+             << cpp_strerror(errno) << dendl;
+  return -1;
+}
+
+/**
+ * Return true if the queue pair is in an error state, false otherwise.
+ * A failed query is also reported as true.
+ */
+bool Infiniband::QueuePair::is_error() const
+{
+ ibv_qp_attr qpa;
+ ibv_qp_init_attr qpia;
+
+ // mask -1 has every attribute bit set
+ int r = ibv_query_qp(qp, &qpa, -1, &qpia);
+ if (r) {
+ lderr(cct) << __func__ << " failed to get state: "
+ << cpp_strerror(errno) << dendl;
+ return true;
+ }
+ // NOTE(review): get_state() reads qp_state while this reads cur_qp_state;
+ // confirm both fields are filled in identically by ibv_query_qp().
+ return qpa.cur_qp_state == IBV_QPS_ERR;
+}
+
+
+// Only stores the context and the Infiniband reference; the verbs channel is
+// created later by init(). channel and cq start out NULL and the pending-ack
+// counter at 0.
+Infiniband::CompletionChannel::CompletionChannel(CephContext *c, Infiniband &ib)
+ : cct(c), infiniband(ib), channel(NULL), cq(NULL), cq_events_that_need_ack(0)
+{
+}
+
+// Destroys the verbs completion channel if one was created; a failure to do
+// so is logged and then trips an assertion.
+Infiniband::CompletionChannel::~CompletionChannel()
+{
+  if (channel) {
+    int r = ibv_destroy_comp_channel(channel);
+    // failure is reported as a non-zero errno value, not as a negative
+    // number, so test for "not zero" to make sure the reason gets logged
+    if (r != 0)
+      lderr(cct) << __func__ << " failed to destroy cc: " << cpp_strerror(r) << dendl;
+    ceph_assert(r == 0);
+  }
+}
+
+/**
+ * Create the verbs completion channel and switch its file descriptor to
+ * non-blocking mode.
+ *
+ * @return 0 on success, -1 on failure. On failure no channel is left behind:
+ *         `channel` is null again, so the destructor will not try to destroy
+ *         it a second time.
+ */
+int Infiniband::CompletionChannel::init()
+{
+  ldout(cct, 20) << __func__ << " started." << dendl;
+  channel = ibv_create_comp_channel(infiniband.device->ctxt);
+  if (!channel) {
+    lderr(cct) << __func__ << " failed to create receive completion channel: "
+               << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+  int rc = NetHandler(cct).set_nonblock(channel->fd);
+  if (rc < 0) {
+    ibv_destroy_comp_channel(channel);
+    // the destructor destroys any non-null channel, so a stale pointer here
+    // would lead to a double destroy
+    channel = nullptr;
+    return -1;
+  }
+  return 0;
+}
+
+// Acknowledges all CQ events counted so far on the member `cq` and resets
+// the counter. NOTE(review): `cq` is NULL until something binds one to this
+// channel -- confirm this is never called before that happens.
+void Infiniband::CompletionChannel::ack_events()
+{
+ ibv_ack_cq_events(cq, cq_events_that_need_ack);
+ cq_events_that_need_ack = 0;
+}
+
+/**
+ * Fetch one completion event from the channel.
+ *
+ * @return true if an event was retrieved, false otherwise. EAGAIN and EINTR
+ *         are not logged; any other failure is.
+ */
+bool Infiniband::CompletionChannel::get_cq_event()
+{
+  ibv_cq *event_cq = NULL;
+  void *event_ctx;
+  if (ibv_get_cq_event(channel, &event_cq, &event_ctx) != 0) {
+    const bool benign = (errno == EAGAIN || errno == EINTR);
+    if (!benign)
+      lderr(cct) << __func__ << " failed to retrieve CQ event: "
+                 << cpp_strerror(errno) << dendl;
+    return false;
+  }
+
+  // Events are not acked one by one: count them and ack a whole batch once
+  // MAX_ACK_EVENT of them have accumulated.
+  ++cq_events_that_need_ack;
+  if (cq_events_that_need_ack == MAX_ACK_EVENT) {
+    ldout(cct, 20) << __func__ << " ack aq events." << dendl;
+    ibv_ack_cq_events(event_cq, MAX_ACK_EVENT);
+    cq_events_that_need_ack = 0;
+  }
+
+  return true;
+}
+
+
+// Destroy the verbs completion queue if init() created one.
+Infiniband::CompletionQueue::~CompletionQueue()
+{
+  if (!cq)
+    return;
+
+  // ibv_destroy_cq() returns 0 on success or a positive errno value on
+  // failure, so test for non-zero (a "< 0" test never fires).
+  int r = ibv_destroy_cq(cq);
+  if (r != 0)
+    lderr(cct) << __func__ << " failed to destroy cq: " << cpp_strerror(r) << dendl;
+  ceph_assert(r == 0);
+}
+
+/**
+ * Create the completion queue on the owning channel, arm it for
+ * notification and bind it to the channel.
+ *
+ * \return
+ *      0 on success, -1 on failure (cq is left NULL in that case).
+ */
+int Infiniband::CompletionQueue::init()
+{
+  ibv_comp_channel *verbs_channel = channel->get_channel();
+  cq = ibv_create_cq(infiniband.device->ctxt, queue_depth, this, verbs_channel, 0);
+  if (cq == nullptr) {
+    lderr(cct) << __func__ << " failed to create receive completion queue: "
+               << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+
+  const int notify_rc = ibv_req_notify_cq(cq, 0);
+  if (notify_rc != 0) {
+    lderr(cct) << __func__ << " ibv_req_notify_cq failed: " << cpp_strerror(errno) << dendl;
+    // Roll back so the destructor does not see a half-initialised CQ.
+    ibv_destroy_cq(cq);
+    cq = nullptr;
+    return -1;
+  }
+
+  channel->bind_cq(cq);
+  ldout(cct, 20) << __func__ << " successfully create cq=" << cq << dendl;
+  return 0;
+}
+
+/**
+ * Re-arm completion notification on this CQ.
+ *
+ * \param[in] solicite_only
+ *      Currently ignored: notification is always requested for every
+ *      completion (solicited_only = 0).
+ * \return
+ *      0 on success, otherwise the errno value returned by verbs.
+ */
+int Infiniband::CompletionQueue::rearm_notify(bool solicite_only)
+{
+  ldout(cct, 20) << __func__ << " started." << dendl;
+  int r = ibv_req_notify_cq(cq, 0);
+  // ibv_req_notify_cq() returns a positive errno value on failure, so a
+  // "< 0" test would never log anything.
+  if (r != 0)
+    lderr(cct) << __func__ << " failed to notify cq: " << cpp_strerror(r) << dendl;
+  return r;
+}
+
+/**
+ * Poll up to num_entries work completions into ret_wc_array.
+ *
+ * \return
+ *      Number of completions written (possibly 0), or -1 on failure.
+ */
+int Infiniband::CompletionQueue::poll_cq(int num_entries, ibv_wc *ret_wc_array) {
+  const int polled = ibv_poll_cq(cq, num_entries, ret_wc_array);
+  if (polled >= 0)
+    return polled;
+
+  lderr(cct) << __func__ << " poll_completion_queue occur met error: "
+             << cpp_strerror(errno) << dendl;
+  return -1;
+}
+
+
+// Allocate a protection domain on the device's context; aborts the process
+// if the allocation fails.
+Infiniband::ProtectionDomain::ProtectionDomain(CephContext *cct, Device *device)
+  : pd(ibv_alloc_pd(device->ctxt))
+{
+  if (pd != NULL)
+    return;
+
+  lderr(cct) << __func__ << " failed to allocate infiniband protection domain: " << cpp_strerror(errno) << dendl;
+  ceph_abort();
+}
+
+// Release the protection domain allocated by the constructor.
+// NOTE(review): the return value of ibv_dealloc_pd() is not checked here.
+Infiniband::ProtectionDomain::~ProtectionDomain()
+{
+ ibv_dealloc_pd(pd);
+}
+
+
+// Describe one buffer of `len` bytes at `b` that belongs to memory region
+// `m`. The cursor starts at 0.
+Infiniband::MemoryManager::Chunk::Chunk(ibv_mr* m, uint32_t len, char* b)
+  : mr(m),
+    bytes(len),
+    offset(0),
+    buffer(b)
+{
+}
+
+Infiniband::MemoryManager::Chunk::~Chunk()
+{
+  // Intentionally empty: this destructor releases nothing.
+}
+
+// Move the read/write cursor to position `o`.
+void Infiniband::MemoryManager::Chunk::set_offset(uint32_t o)
+{
+  this->offset = o;
+}
+
+// Current position of the read/write cursor.
+uint32_t Infiniband::MemoryManager::Chunk::get_offset()
+{
+  return this->offset;
+}
+
+// Set the bound, i.e. the position at which read() stops.
+void Infiniband::MemoryManager::Chunk::set_bound(uint32_t b)
+{
+  this->bound = b;
+}
+
+// Rewind the cursor to the start and set the bound to `b`, so that a
+// following read() consumes bytes [0, b).
+void Infiniband::MemoryManager::Chunk::prepare_read(uint32_t b)
+{
+  this->bound = b;
+  this->offset = 0;
+}
+
+// Current bound (the position at which read() stops).
+uint32_t Infiniband::MemoryManager::Chunk::get_bound()
+{
+  return this->bound;
+}
+
+/**
+ * Copy up to `len` bytes from the cursor position into `buf`.
+ *
+ * If fewer than `len` bytes remain before the bound, all of them are
+ * copied and the chunk is reset (offset = bound = 0).
+ *
+ * \return
+ *      Number of bytes copied.
+ */
+uint32_t Infiniband::MemoryManager::Chunk::read(char* buf, uint32_t len)
+{
+  const uint32_t available = bound - offset;
+  if (len > available) {
+    // Drain whatever is left and mark the chunk as fully consumed.
+    memcpy(buf, buffer + offset, available);
+    offset = 0;
+    bound = 0;
+    return available;
+  }
+
+  memcpy(buf, buffer + offset, len);
+  offset += len;
+  return len;
+}
+
+/**
+ * Copy up to `len` bytes from `buf` into the chunk at the cursor position.
+ *
+ * If less than `len` bytes of capacity remain, only that much is copied
+ * and the cursor ends up at `bytes` (chunk full).
+ *
+ * \return
+ *      Number of bytes copied.
+ */
+uint32_t Infiniband::MemoryManager::Chunk::write(char* buf, uint32_t len)
+{
+  const uint32_t room = bytes - offset;
+  if (len > room) {
+    memcpy(buffer + offset, buf, room);
+    offset = bytes;
+    return room;
+  }
+
+  memcpy(buffer + offset, buf, len);
+  offset += len;
+  return len;
+}
+
+// True when the cursor has reached the chunk capacity.
+bool Infiniband::MemoryManager::Chunk::full()
+{
+  return bytes == offset;
+}
+
+// True when the cursor has reached the bound (nothing left to read).
+bool Infiniband::MemoryManager::Chunk::over()
+{
+  return offset == bound;
+}
+
+// Reset both the cursor and the bound to 0.
+void Infiniband::MemoryManager::Chunk::clear()
+{
+  bound = 0;
+  offset = 0;
+}
+
+// Create an empty cluster of `s`-byte buffers; memory is obtained later
+// by fill().
+Infiniband::MemoryManager::Cluster::Cluster(MemoryManager& m, uint32_t s)
+  : manager(m),
+    buffer_size(s),
+    lock("cluster_lock")
+{
+}
+
+// Deregister the memory region, destroy the chunk descriptors and return
+// the backing memory. Safe on a cluster that was never fill()ed.
+Infiniband::MemoryManager::Cluster::~Cluster()
+{
+  // fill() is what sets chunk_base; without it there is no MR to
+  // deregister and no descriptor to destroy.
+  if (chunk_base) {
+    int r = ibv_dereg_mr(chunk_base->mr);
+    ceph_assert(r == 0);
+    const auto chunk_end = chunk_base + num_chunk;
+    for (auto chunk = chunk_base; chunk != chunk_end; chunk++) {
+      chunk->~Chunk();
+    }
+    ::free(chunk_base);
+  }
+
+  manager.free(base);
+}
+
+/**
+ * Allocate and register memory for `num` buffers of buffer_size bytes and
+ * put a Chunk descriptor for each of them on the free list.
+ *
+ * Must be called on a cluster that has not been filled yet. Allocation or
+ * registration failure is fatal (assert).
+ *
+ * \return
+ *      Always 0.
+ */
+int Infiniband::MemoryManager::Cluster::fill(uint32_t num)
+{
+  ceph_assert(!base);
+  num_chunk = num;
+  uint32_t bytes = buffer_size * num;
+  // The total size is kept in 32 bits; refuse a request that wrapped.
+  ceph_assert(num == 0 || bytes / num == buffer_size);
+
+  base = (char*)manager.malloc(bytes);
+  // Validate before doing pointer arithmetic on the result.
+  ceph_assert(base);
+  end = base + bytes;
+  chunk_base = static_cast<Chunk*>(::malloc(sizeof(Chunk) * num));
+  ceph_assert(chunk_base);
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(static_cast<void*>(chunk_base), 0, sizeof(Chunk) * num);
+  free_chunks.reserve(num);
+  ibv_mr* m = ibv_reg_mr(manager.pd->pd, base, bytes, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
+  ceph_assert(m);
+  // Placement-construct one descriptor per buffer; every descriptor shares
+  // the single memory region registered above.
+  Chunk* chunk = chunk_base;
+  for (uint32_t offset = 0; offset < bytes; offset += buffer_size){
+    new(chunk) Chunk(m, buffer_size, base+offset);
+    free_chunks.push_back(chunk);
+    chunk++;
+  }
+  return 0;
+}
+
+// Return the given chunks to the free list, resetting each one first.
+void Infiniband::MemoryManager::Cluster::take_back(std::vector<Chunk*> &ck)
+{
+  Mutex::Locker l(lock);
+  for (size_t idx = 0; idx < ck.size(); ++idx) {
+    Chunk *returned = ck[idx];
+    returned->clear();
+    free_chunks.push_back(returned);
+  }
+}
+
+/**
+ * Hand out free chunks able to hold `bytes` bytes.
+ *
+ * bytes == 0 means "give me every free chunk". When fewer chunks are free
+ * than needed, all remaining free chunks are handed out.
+ *
+ * \param[out] chunks
+ *      Receives the chunks (appended).
+ * \return
+ *      Number of chunks appended; 0 if the free list is empty.
+ */
+int Infiniband::MemoryManager::Cluster::get_buffers(std::vector<Chunk*> &chunks, size_t bytes)
+{
+  // ceil(bytes / buffer_size)
+  uint32_t wanted = bytes / buffer_size;
+  if (bytes % buffer_size != 0)
+    ++wanted;
+
+  Mutex::Locker l(lock);
+  if (free_chunks.empty())
+    return 0;
+
+  if (bytes == 0) {
+    const int handed = free_chunks.size();
+    chunks.insert(chunks.end(), free_chunks.begin(), free_chunks.end());
+    free_chunks.clear();
+    return handed;
+  }
+
+  if (free_chunks.size() < wanted)
+    wanted = free_chunks.size();
+
+  // Chunks are taken from the back of the free list.
+  for (uint32_t taken = 0; taken < wanted; ++taken) {
+    chunks.push_back(free_chunks.back());
+    free_chunks.pop_back();
+  }
+  const int handed = wanted;
+  return handed;
+}
+
+/**
+ * Tell whether `nbufs` more receive buffers may be allocated.
+ *
+ * A non-positive ms_async_rdma_receive_buffers means "unlimited".
+ * Exceeding the limit is logged and refused.
+ */
+bool Infiniband::MemoryManager::MemPoolContext::can_alloc(unsigned nbufs)
+{
+  CephContext *cct = manager->cct;
+
+  /* unlimited */
+  if (cct->_conf->ms_async_rdma_receive_buffers <= 0)
+    return true;
+
+  const bool within_limit =
+    !(n_bufs_allocated + nbufs > (unsigned)cct->_conf->ms_async_rdma_receive_buffers);
+  if (within_limit)
+    return true;
+
+  lderr(cct) << __func__ << " WARNING: OUT OF RX BUFFERS: allocated: " <<
+    n_bufs_allocated << " requested: " << nbufs <<
+    " limit: " << cct->_conf->ms_async_rdma_receive_buffers << dendl;
+  return false;
+}
+
+// Attach (or detach, with nullptr) the perf counters and publish the
+// number of buffers allocated so far.
+void Infiniband::MemoryManager::MemPoolContext::set_stat_logger(PerfCounters *logger) {
+  perf_logger = logger;
+  if (perf_logger == nullptr)
+    return;
+  perf_logger->set(l_msgr_rdma_rx_bufs_total, n_bufs_allocated);
+}
+
+// Adjust the allocated-buffer count by `nbufs` (may be negative) and
+// mirror the change into the perf counter when a logger is attached.
+void Infiniband::MemoryManager::MemPoolContext::update_stats(int nbufs)
+{
+  n_bufs_allocated += nbufs;
+
+  if (perf_logger == nullptr)
+    return;
+
+  if (nbufs <= 0) {
+    perf_logger->dec(l_msgr_rdma_rx_bufs_total, -nbufs);
+    return;
+  }
+  perf_logger->inc(l_msgr_rdma_rx_bufs_total, nbufs);
+}
+
+// Slow path of mem_pool::malloc(). PoolAllocator::malloc() is static and
+// finds its pool context through PoolAllocator::g_ctx, so the context is
+// published under PoolAllocator::lock for the duration of the call.
+void *Infiniband::MemoryManager::mem_pool::slow_malloc()
+{
+  Mutex::Locker l(PoolAllocator::lock);
+  PoolAllocator::g_ctx = ctx;
+  // this will trigger pool expansion via PoolAllocator::malloc()
+  void *block = boost::pool<PoolAllocator>::malloc();
+  PoolAllocator::g_ctx = nullptr;
+  return block;
+}
+
+// Pool context for the allocation in progress; set and cleared by
+// mem_pool::slow_malloc() while it holds PoolAllocator::lock.
+Infiniband::MemoryManager::MemPoolContext *Infiniband::MemoryManager::PoolAllocator::g_ctx = nullptr;
+// Taken by mem_pool::slow_malloc() and PoolAllocator::free().
+Mutex Infiniband::MemoryManager::PoolAllocator::lock("pool-alloc-lock");
+
+/**
+ * boost::pool user-allocator hook: allocate and register one block of
+ * `bytes` bytes, preceded by a mem_info header.
+ *
+ * lock is taken by mem_pool::slow_malloc(), which also sets g_ctx.
+ *
+ * \return
+ *      Pointer to the first chunk of the block, or NULL when the buffer
+ *      limit is reached, the allocation fails or registration fails.
+ */
+char *Infiniband::MemoryManager::PoolAllocator::malloc(const size_type bytes)
+{
+  ceph_assert(g_ctx);
+  MemoryManager *manager = g_ctx->manager;
+  CephContext *cct = manager->cct;
+
+  // Each rx buffer is a Chunk header immediately followed by its payload.
+  const size_t rx_buf_size = sizeof(Chunk) + cct->_conf->ms_async_rdma_buffer_size;
+  const unsigned nbufs = bytes/rx_buf_size;
+
+  if (!g_ctx->can_alloc(nbufs))
+    return NULL;
+
+  mem_info *info = static_cast<mem_info *>(manager->malloc(bytes + sizeof(*info)));
+  if (info == NULL) {
+    lderr(cct) << __func__ << " failed to allocate " <<
+      bytes << " + " << sizeof(*info) << " bytes of memory for " << nbufs << dendl;
+    return NULL;
+  }
+
+  info->mr = ibv_reg_mr(manager->pd->pd, info->chunks, bytes, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
+  if (info->mr == NULL) {
+    lderr(cct) << __func__ << " failed to register " <<
+      bytes << " + " << sizeof(*info) << " bytes of memory for " << nbufs << dendl;
+    manager->free(info);
+    return NULL;
+  }
+
+  info->nbufs = nbufs;
+  // save this chunk context
+  info->ctx = g_ctx;
+
+  // note that the memory can be allocated before perf logger is set
+  g_ctx->update_stats(nbufs);
+
+  /* initialize chunks */
+  char *cursor = reinterpret_cast<char *>(info->chunks);
+  for (unsigned idx = 0; idx < nbufs; ++idx, cursor += rx_buf_size) {
+    Chunk *chunk = reinterpret_cast<Chunk *>(cursor);
+    chunk->lkey = info->mr->lkey;
+    chunk->bytes = cct->_conf->ms_async_rdma_buffer_size;
+    chunk->offset = 0;
+    chunk->buffer = chunk->data; // TODO: refactor tx and remove buffer
+  }
+
+  return reinterpret_cast<char *>(info->chunks);
+}
+
+
+// boost::pool user-allocator hook: undo PoolAllocator::malloc() for the
+// block starting at `block` (statistics, registration, memory).
+void Infiniband::MemoryManager::PoolAllocator::free(char * const block)
+{
+  Mutex::Locker l(lock);
+
+  // The mem_info header sits immediately in front of the chunk area.
+  mem_info *info = reinterpret_cast<mem_info *>(block) - 1;
+  MemPoolContext *owner = info->ctx;
+  owner->update_stats(-info->nbufs);
+  ibv_dereg_mr(info->mr);
+  owner->manager->free(info);
+}
+
+// Set up the manager and its receive-buffer pool. Each pool element is a
+// Chunk header plus ms_async_rdma_buffer_size bytes of payload. The third
+// pool argument (mem_pool's nnext_size) is
+// min(ms_async_rdma_receive_buffers, 2 * ms_async_rdma_receive_queue_len)
+// when a buffer limit is configured, else 2 * ms_async_rdma_receive_queue_len.
+// The TX pool is not created here; see create_tx_pool().
+Infiniband::MemoryManager::MemoryManager(CephContext *c, Device *d, ProtectionDomain *p)
+ : cct(c), device(d), pd(p),
+ rxbuf_pool_ctx(this),
+ rxbuf_pool(&rxbuf_pool_ctx, sizeof(Chunk) + c->_conf->ms_async_rdma_buffer_size,
+ c->_conf->ms_async_rdma_receive_buffers > 0 ?
+ // if possible make initial pool size 2 * receive_queue_len
+ // that way there will be no pool expansion upon receive of the
+ // first packet.
+ (c->_conf->ms_async_rdma_receive_buffers < 2 * c->_conf->ms_async_rdma_receive_queue_len ?
+ c->_conf->ms_async_rdma_receive_buffers : 2 * c->_conf->ms_async_rdma_receive_queue_len) :
+ // rx pool is infinite, we can set any initial size that we want
+ 2 * c->_conf->ms_async_rdma_receive_queue_len)
+{
+}
+
+// Destroy the TX cluster, if create_tx_pool() ever created one.
+Infiniband::MemoryManager::~MemoryManager()
+{
+  // delete on a null pointer is a no-op, so no explicit check is needed.
+  delete send;
+}
+
+/**
+ * Allocate `size` bytes, preferably backed by huge pages.
+ *
+ * One extra HUGE_PAGE_SIZE is reserved in front of the returned pointer;
+ * its first size_t records the mapping length, or 0 when the memory came
+ * from std::malloc() because mmap() failed. huge_pages_free() relies on
+ * that record.
+ *
+ * \return
+ *      Usable pointer, or NULL if both mmap() and std::malloc() fail.
+ */
+void* Infiniband::MemoryManager::huge_pages_malloc(size_t size)
+{
+  size_t real_size = ALIGN_TO_PAGE_SIZE(size + HUGE_PAGE_SIZE);
+  const int prot = PROT_READ | PROT_WRITE;
+  const int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE | MAP_HUGETLB;
+
+  char *ptr = (char *)mmap(NULL, real_size, prot, flags, -1, 0);
+  if (ptr == MAP_FAILED) {
+    // Fall back to the regular heap; a recorded size of 0 marks this case.
+    ptr = (char *)std::malloc(real_size);
+    if (ptr == NULL)
+      return NULL;
+    real_size = 0;
+  }
+
+  *((size_t *)ptr) = real_size;
+  return ptr + HUGE_PAGE_SIZE;
+}
+
+// Release memory obtained from huge_pages_malloc(). The size recorded one
+// HUGE_PAGE_SIZE in front of `ptr` selects munmap() (non-zero) or
+// std::free() (zero). NULL is ignored.
+void Infiniband::MemoryManager::huge_pages_free(void *ptr)
+{
+  if (ptr == NULL)
+    return;
+
+  void *real_ptr = (char *)ptr - HUGE_PAGE_SIZE;
+  const size_t real_size = *((size_t *)real_ptr);
+  ceph_assert(real_size % HUGE_PAGE_SIZE == 0);
+
+  if (real_size == 0) {
+    std::free(real_ptr);
+    return;
+  }
+  munmap(real_ptr, real_size);
+}
+
+
+// Allocate `size` bytes, from huge pages when
+// ms_async_rdma_enable_hugepage is set, otherwise from the heap.
+void* Infiniband::MemoryManager::malloc(size_t size)
+{
+  if (!cct->_conf->ms_async_rdma_enable_hugepage)
+    return std::malloc(size);
+  return huge_pages_malloc(size);
+}
+
+// Release memory obtained from MemoryManager::malloc(). The release path
+// is chosen from the current value of ms_async_rdma_enable_hugepage.
+void Infiniband::MemoryManager::free(void *ptr)
+{
+  if (!cct->_conf->ms_async_rdma_enable_hugepage) {
+    std::free(ptr);
+    return;
+  }
+  huge_pages_free(ptr);
+}
+
+// Create the TX cluster: `tx_num` send buffers of `size` bytes each.
+void Infiniband::MemoryManager::create_tx_pool(uint32_t size, uint32_t tx_num)
+{
+  ceph_assert(device);
+  ceph_assert(pd);
+
+  Cluster *tx_cluster = new Cluster(*this, size);
+  send = tx_cluster;
+  tx_cluster->fill(tx_num);
+}
+
+// Give send chunks back to the TX cluster.
+void Infiniband::MemoryManager::return_tx(std::vector<Chunk*> &chunks)
+{
+  Cluster *tx_cluster = send;
+  tx_cluster->take_back(chunks);
+}
+
+// Fetch send chunks for `bytes` bytes from the TX cluster; returns the
+// number of chunks appended to `c` (see Cluster::get_buffers()).
+int Infiniband::MemoryManager::get_send_buffers(std::vector<Chunk*> &c, size_t bytes)
+{
+  const int handed = send->get_buffers(c, bytes);
+  return handed;
+}
+
+// Set once verify_prereq() has completed.
+static std::atomic<bool> init_prereq = {false};
+
+/**
+ * Check and establish the process-wide prerequisites for RDMA:
+ *  - call ibv_fork_init() (fatal on failure);
+ *  - export RDMAV_HUGEPAGES_SAFE=1 when huge pages are enabled (fatal on
+ *    failure);
+ *  - warn when RLIMIT_MEMLOCK is not unlimited.
+ */
+void Infiniband::verify_prereq(CephContext *cct) {
+
+  //On RDMA MUST be called before fork
+  int rc = ibv_fork_init();
+  if (rc) {
+    lderr(cct) << __func__ << " failed to call ibv_fork_init(). On RDMA must be called before fork. Application aborts." << dendl;
+    ceph_abort();
+  }
+
+  ldout(cct, 20) << __func__ << " ms_async_rdma_enable_hugepage value is: " << cct->_conf->ms_async_rdma_enable_hugepage << dendl;
+  if (cct->_conf->ms_async_rdma_enable_hugepage){
+    rc = setenv("RDMAV_HUGEPAGES_SAFE","1",1);
+    // Check the result first: if setenv() failed, getenv() may return NULL
+    // and streaming a NULL char pointer is undefined behaviour.
+    if (rc) {
+      lderr(cct) << __func__ << " failed to export RDMAV_HUGEPAGES_SAFE. On RDMA must be exported before using huge pages. Application aborts." << dendl;
+      ceph_abort();
+    }
+    ldout(cct, 0) << __func__ << " RDMAV_HUGEPAGES_SAFE is set as: " << getenv("RDMAV_HUGEPAGES_SAFE") << dendl;
+  }
+
+  //Check ulimit
+  struct rlimit limit;
+  if (getrlimit(RLIMIT_MEMLOCK, &limit) != 0) {
+    // Do not inspect `limit`: it is uninitialised when the call fails.
+    lderr(cct) << __func__ << " failed to query RLIMIT_MEMLOCK: " << cpp_strerror(errno) << dendl;
+  } else if (limit.rlim_cur != RLIM_INFINITY || limit.rlim_max != RLIM_INFINITY) {
+    lderr(cct) << __func__ << "!!! WARNING !!! For RDMA to work properly user memlock (ulimit -l) must be big enough to allow large amount of registered memory."
+                                  " We recommend setting this parameter to infinity" << dendl;
+  }
+  init_prereq = true;
+}
+
+// Record the configured device name and port, and run the process-wide
+// prerequisite checks if nobody has done so yet. Device setup itself is
+// deferred to init().
+Infiniband::Infiniband(CephContext *cct)
+  : cct(cct),
+    lock("IB lock"),
+    device_name(cct->_conf->ms_async_rdma_device_name),
+    port_num( cct->_conf->ms_async_rdma_port_num)
+{
+  if (!init_prereq) {
+    verify_prereq(cct);
+  }
+  ldout(cct, 20) << __func__ << " constructing Infiniband..." << dendl;
+}
+
+/**
+ * One-time initialisation; later calls return immediately.
+ *
+ * Opens the configured device and port, allocates the protection domain,
+ * derives the rx/tx queue lengths from the device limits and the
+ * configuration, creates the memory manager with its TX pool and, when
+ * SRQ support is enabled, creates the shared receive queue and posts the
+ * initial receive buffers to it. Unrecoverable problems abort or assert.
+ */
+void Infiniband::init()
+{
+  Mutex::Locker l(lock);
+
+  if (initialized)
+    return;
+
+  device_list = new DeviceList(cct);
+  initialized = true;
+
+  device = device_list->get_device(device_name.c_str());
+  ceph_assert(device);
+  device->binding_port(cct, port_num);
+  ib_physical_port = device->active_port->get_port_num();
+  pd = new ProtectionDomain(cct, device);
+  ceph_assert(NetHandler(cct).set_nonblock(device->ctxt->async_fd) == 0);
+
+  // The receive queue length is the device limit, capped by configuration.
+  support_srq = cct->_conf->ms_async_rdma_support_srq;
+  if (support_srq)
+    rx_queue_len = device->device_attr->max_srq_wr;
+  else
+    rx_queue_len = device->device_attr->max_qp_wr;
+  if (rx_queue_len > cct->_conf->ms_async_rdma_receive_queue_len) {
+    rx_queue_len = cct->_conf->ms_async_rdma_receive_queue_len;
+    ldout(cct, 1) << __func__ << " receive queue length is " << rx_queue_len << " receive buffers" << dendl;
+  } else {
+    ldout(cct, 0) << __func__ << " requested receive queue length " <<
+                  cct->_conf->ms_async_rdma_receive_queue_len <<
+                  " is too big. Setting " << rx_queue_len << dendl;
+  }
+
+  // check for the misconfiguration
+  if (cct->_conf->ms_async_rdma_receive_buffers > 0 &&
+      rx_queue_len > (unsigned)cct->_conf->ms_async_rdma_receive_buffers) {
+    lderr(cct) << __func__ << " rdma_receive_queue_len (" <<
+                  rx_queue_len << ") > ms_async_rdma_receive_buffers(" <<
+                  cct->_conf->ms_async_rdma_receive_buffers << ")." << dendl;
+    ceph_abort();
+  }
+
+  // Same capping for the send side.
+  tx_queue_len = device->device_attr->max_qp_wr;
+  if (tx_queue_len > cct->_conf->ms_async_rdma_send_buffers) {
+    tx_queue_len = cct->_conf->ms_async_rdma_send_buffers;
+    ldout(cct, 1) << __func__ << " assigning: " << tx_queue_len << " send buffers" << dendl;
+  } else {
+    ldout(cct, 0) << __func__ << " using the max allowed send buffers: " << tx_queue_len << dendl;
+  }
+
+  ldout(cct, 1) << __func__ << " device allow " << device->device_attr->max_cqe
+                << " completion entries" << dendl;
+
+  memory_manager = new MemoryManager(cct, device, pd);
+  memory_manager->create_tx_pool(cct->_conf->ms_async_rdma_buffer_size, tx_queue_len);
+
+  if (support_srq) {
+    srq = create_shared_receive_queue(rx_queue_len, MAX_SHARED_RX_SGE_COUNT);
+    // create_shared_receive_queue() returns NULL on error; fail here with
+    // a clear assert instead of posting buffers to a NULL SRQ.
+    ceph_assert(srq);
+    post_chunks_to_rq(rx_queue_len, NULL); //add to srq
+  }
+}
+
+// Tear down everything init() created. Nothing to do if init() never ran.
+Infiniband::~Infiniband()
+{
+  if (!initialized)
+    return;
+  if (support_srq)
+    ibv_destroy_srq(srq);
+  delete memory_manager;
+  delete pd;
+  // init() allocates the device list with new; release it last, after the
+  // objects created on top of its devices, so it is not leaked.
+  delete device_list;
+}
+
+/**
+ * Create a shared receive queue on this instance's protection domain.
+ * This is a thin wrapper around ibv_create_srq().
+ *
+ * \param[in] max_wr
+ *      The max number of outstanding work requests in the SRQ.
+ * \param[in] max_sge
+ *      The max number of scatter elements per WR.
+ * \return
+ *      A valid ibv_srq pointer, or NULL on error.
+ */
+ibv_srq* Infiniband::create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge)
+{
+  ibv_srq_init_attr init_attr;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&init_attr, 0, sizeof(init_attr));
+  init_attr.attr.max_sge = max_sge;
+  init_attr.attr.max_wr = max_wr;
+  init_attr.srq_context = device->ctxt;
+
+  ibv_srq *created = ibv_create_srq(pd->pd, &init_attr);
+  return created;
+}
+
+// Fetch send chunks for `bytes` bytes; forwards to the memory manager and
+// returns the number of chunks appended to `c`.
+int Infiniband::get_tx_buffers(std::vector<Chunk*> &c, size_t bytes)
+{
+  const int handed = memory_manager->get_send_buffers(c, bytes);
+  return handed;
+}
+
+/**
+ * Factory for QueuePair objects. Prefer this over calling the QueuePair
+ * constructor directly: a derivative of Infiniband (e.g. a MockInfiniband,
+ * if one existed) could then return mocked-out QueuePair derivatives.
+ *
+ * \return
+ *      QueuePair on success or NULL if init fails
+ * See QueuePair::QueuePair for parameter documentation.
+ */
+Infiniband::QueuePair* Infiniband::create_queue_pair(CephContext *cct, CompletionQueue *tx,
+    CompletionQueue* rx, ibv_qp_type type, struct rdma_cm_id *cm_id)
+{
+  Infiniband::QueuePair *pair = new QueuePair(
+      cct, *this, type, ib_physical_port, srq, tx, rx, tx_queue_len, rx_queue_len, cm_id);
+  if (pair->init() == 0)
+    return pair;
+
+  // init() failed: do not hand out a half-built queue pair.
+  delete pair;
+  return NULL;
+}
+
+/**
+ * Post up to `num` receive buffers to the SRQ (when SRQ support is on) or
+ * to `qp`.
+ *
+ * \param[in] num
+ *      Number of receive buffers to post; values <= 0 post nothing.
+ * \param[in] qp
+ *      Target queue pair; only used (and then required) without SRQ.
+ * \return
+ *      Number of buffers actually posted, which is smaller than `num` when
+ *      the rx pool runs out of buffers.
+ */
+int Infiniband::post_chunks_to_rq(int num, ibv_qp *qp)
+{
+  // Nothing to post. Without this guard a request for 0 buffers would
+  // hand an uninitialised work request to verbs.
+  if (num <= 0)
+    return 0;
+
+  // Heap storage instead of variable-length arrays: VLAs are not standard
+  // C++ and `num` can be as large as the receive queue length.
+  std::vector<ibv_sge> isge(num);
+  std::vector<ibv_recv_wr> rx_work_request(num);
+
+  int i = 0;
+  while (i < num) {
+    Chunk *chunk = get_memory_manager()->get_rx_buffer();
+    if (chunk == NULL) {
+      lderr(cct) << __func__ << " WARNING: out of memory. Requested " << num <<
+                    " rx buffers. Got " << i << dendl;
+      if (i == 0)
+        return 0;
+      // if we got some buffers post them and hope for the best
+      rx_work_request[i-1].next = 0;
+      break;
+    }
+
+    isge[i].addr = reinterpret_cast<uint64_t>(chunk->data);
+    isge[i].length = chunk->bytes;
+    isge[i].lkey = chunk->lkey;
+
+    memset(&rx_work_request[i], 0, sizeof(rx_work_request[i]));
+    rx_work_request[i].wr_id = reinterpret_cast<uint64_t>(chunk);// stash descriptor ptr
+    // Chain the requests so a single verbs call posts all of them.
+    if (i == num - 1) {
+      rx_work_request[i].next = 0;
+    } else {
+      rx_work_request[i].next = &rx_work_request[i+1];
+    }
+    rx_work_request[i].sg_list = &isge[i];
+    rx_work_request[i].num_sge = 1;
+    i++;
+  }
+
+  int ret;
+  ibv_recv_wr *badworkrequest;
+  if (support_srq) {
+    ret = ibv_post_srq_recv(srq, &rx_work_request[0], &badworkrequest);
+    ceph_assert(ret == 0);
+  } else {
+    ceph_assert(qp);
+    ret = ibv_post_recv(qp, &rx_work_request[0], &badworkrequest);
+    ceph_assert(ret == 0);
+  }
+  return i;
+}
+
+// Factory: allocate and initialise a CompletionChannel.
+// Returns NULL if initialisation fails.
+Infiniband::CompletionChannel* Infiniband::create_comp_channel(CephContext *c)
+{
+  Infiniband::CompletionChannel *created = new Infiniband::CompletionChannel(c, *this);
+  if (created->init() == 0)
+    return created;
+
+  delete created;
+  return NULL;
+}
+
+// Factory: allocate and initialise a CompletionQueue of CQ_DEPTH entries
+// on channel `cc`. Returns NULL if initialisation fails.
+Infiniband::CompletionQueue* Infiniband::create_comp_queue(
+    CephContext *cct, CompletionChannel *cc)
+{
+  Infiniband::CompletionQueue *created = new Infiniband::CompletionQueue(
+      cct, *this, CQ_DEPTH, cc);
+  if (created->init() == 0)
+    return created;
+
+  delete created;
+  return NULL;
+}
+
+/**
+ * Read one handshake message from socket `sd` and decode it into `im`.
+ *
+ * \return
+ *      > 0 (the message length) when a complete, well-formed message was
+ *      decoded; 0 when the peer sent a zero-length disconnect message;
+ *      < 0 (negative errno) on read error, short read, malformed message
+ *      or injected failure.
+ */
+int Infiniband::recv_msg(CephContext *cct, int sd, IBSYNMsg& im)
+{
+  char msg[TCP_MSG_LEN];
+  char gid[33];
+  ssize_t r = ::read(sd, &msg, sizeof(msg));
+  // Drop incoming qpt
+  if (cct->_conf->ms_inject_socket_failures && sd >= 0) {
+    if (rand() % cct->_conf->ms_inject_socket_failures == 0) {
+      ldout(cct, 0) << __func__ << " injecting socket failure" << dendl;
+      return -EINVAL;
+    }
+  }
+  if (r < 0) {
+    r = -errno;
+    lderr(cct) << __func__ << " got error " << r << ": "
+               << cpp_strerror(r) << dendl;
+  } else if (r == 0) { // valid disconnect message of length 0
+    ldout(cct, 10) << __func__ << " got disconnect message " << dendl;
+  } else if ((size_t)r != sizeof(msg)) { // invalid message
+    ldout(cct, 1) << __func__ << " got bad length (" << r << ") " << dendl;
+    r = -EINVAL;
+  } else { // full-length message
+    // The bytes come from the network: terminate the buffer so sscanf()
+    // cannot run past it, and bound %s so it cannot overflow gid[].
+    msg[sizeof(msg) - 1] = '\0';
+    // Parse into aligned locals; IBSYNMsg is packed, so pointers to its
+    // members may be misaligned for sscanf().
+    unsigned short lid = 0;
+    unsigned int qpn = 0, psn = 0, peer_qpn = 0;
+    int fields = sscanf(msg, "%hx:%x:%x:%x:%32s", &lid, &qpn, &psn, &peer_qpn, gid);
+    // wire_gid_to_gid() consumes exactly 32 characters of gid.
+    if (fields != 5 || strlen(gid) != 32) {
+      ldout(cct, 1) << __func__ << " got malformed message" << dendl;
+      return -EINVAL;
+    }
+    im.lid = lid;
+    im.qpn = qpn;
+    im.psn = psn;
+    im.peer_qpn = peer_qpn;
+    wire_gid_to_gid(gid, &(im.gid));
+    ldout(cct, 5) << __func__ << " recevd: " << lid << ", " << qpn << ", " << psn << ", " << peer_qpn << ", " << gid << dendl;
+  }
+  return r;
+}
+
+/**
+ * Encode `im` and write it to socket `sd` as one handshake message.
+ * A write failing with EINTR or EAGAIN is retried up to 3 times.
+ *
+ * \return
+ *      0 when the whole message was written; a negative errno on write
+ *      error or injected failure; -EIO on a short write.
+ */
+int Infiniband::send_msg(CephContext *cct, int sd, IBSYNMsg& im)
+{
+  int retry = 0;
+  ssize_t r;
+
+  char msg[TCP_MSG_LEN];
+  char gid[33];
+
+  // The encoded message does not change between retries.
+  gid_to_wire_gid(&(im.gid), gid);
+  snprintf(msg, sizeof(msg), "%04x:%08x:%08x:%08x:%s", im.lid, im.qpn, im.psn, im.peer_qpn, gid);
+
+  while (true) {
+    ldout(cct, 10) << __func__ << " sending: " << im.lid << ", " << im.qpn << ", " << im.psn
+                   << ", " << im.peer_qpn << ", " <<  gid  << dendl;
+    r = ::write(sd, msg, sizeof(msg));
+    // Capture errno now; the logging below may overwrite it.
+    const int write_errno = errno;
+    // Drop incoming qpt
+    if (cct->_conf->ms_inject_socket_failures && sd >= 0) {
+      if (rand() % cct->_conf->ms_inject_socket_failures == 0) {
+        ldout(cct, 0) << __func__ << " injecting socket failure" << dendl;
+        return -EINVAL;
+      }
+    }
+
+    if ((size_t)r == sizeof(msg))
+      return 0;
+
+    // FIXME need to handle EAGAIN instead of retry
+    if (r < 0 && (write_errno == EINTR || write_errno == EAGAIN) && retry < 3) {
+      retry++;
+      continue;
+    }
+    if (r < 0) {
+      lderr(cct) << __func__ << " send returned error " << write_errno << ": "
+                 << cpp_strerror(write_errno) << dendl;
+      return -write_errno;
+    }
+    // Short write: errno is not meaningful here (it may even be 0, which
+    // would look like success), so report a fixed error code.
+    lderr(cct) << __func__ << " send got bad length (" << r << ") " << dendl;
+    return -EIO;
+  }
+}
+
+/**
+ * Decode the 32-hex-digit wire form of a GID into `gid`.
+ *
+ * \param[in] wgid
+ *      At least 32 characters: four groups of 8 hex digits.
+ * \param[out] gid
+ *      Receives the 16 raw GID bytes. A group that fails to parse decodes
+ *      as 0.
+ */
+void Infiniband::wire_gid_to_gid(const char *wgid, union ibv_gid *gid)
+{
+  char tmp[9];
+  tmp[8] = 0;
+
+  for (int i = 0; i < 4; ++i) {
+    uint32_t v32 = 0;  // defined value even if sscanf() matches nothing
+    memcpy(tmp, wgid + i * 8, 8);
+    sscanf(tmp, "%x", &v32);
+    // Store with memcpy(): `gid` may live inside the packed IBSYNMsg and
+    // therefore be misaligned for a direct uint32_t store.
+    const uint32_t word = ntohl(v32);
+    memcpy(&gid->raw[i * 4], &word, sizeof(word));
+  }
+}
+
+/**
+ * Encode `gid` into its wire form: 32 hex digits plus a terminating NUL.
+ *
+ * \param[out] wgid
+ *      Must have room for at least 33 bytes.
+ */
+void Infiniband::gid_to_wire_gid(const union ibv_gid *gid, char wgid[])
+{
+  for (int i = 0; i < 4; ++i) {
+    // Load with memcpy(): `gid` may live inside the packed IBSYNMsg and
+    // therefore be misaligned for a direct uint32_t load.
+    uint32_t word;
+    memcpy(&word, gid->raw + i * 4, sizeof(word));
+    // 8 hex digits + NUL per group; each group overwrites the NUL of the
+    // previous one.
+    snprintf(&wgid[i * 8], 9, "%08x", htonl(word));
+  }
+}
+
+// Destroy the verbs queue pair, if one was created; failure to destroy it
+// is fatal.
+Infiniband::QueuePair::~QueuePair()
+{
+  if (!qp)
+    return;
+
+  ldout(cct, 20) << __func__ << " destroy qp=" << qp << dendl;
+  ceph_assert(!ibv_destroy_qp(qp));
+}
+
+/**
+ * Return a string representation of the `status' field of the Verbs
+ * struct `ibv_wc'.
+ *
+ * \param[in] status
+ *      The integer status obtained in ibv_wc.status.
+ * \return
+ *      A string corresponding to the given status, or a fixed
+ *      out-of-range marker for values outside
+ *      [IBV_WC_SUCCESS, IBV_WC_GENERAL_ERR].
+ */
+const char* Infiniband::wc_status_to_string(int status)
+{
+  // Indexed by status value.
+  static const char *names[] = {
+      "SUCCESS",
+      "LOC_LEN_ERR",
+      "LOC_QP_OP_ERR",
+      "LOC_EEC_OP_ERR",
+      "LOC_PROT_ERR",
+      "WR_FLUSH_ERR",
+      "MW_BIND_ERR",
+      "BAD_RESP_ERR",
+      "LOC_ACCESS_ERR",
+      "REM_INV_REQ_ERR",
+      "REM_ACCESS_ERR",
+      "REM_OP_ERR",
+      "RETRY_EXC_ERR",
+      "RNR_RETRY_EXC_ERR",
+      "LOC_RDD_VIOL_ERR",
+      "REM_INV_RD_REQ_ERR",
+      "REM_ABORT_ERR",
+      "INV_EECN_ERR",
+      "INV_EEC_STATE_ERR",
+      "FATAL_ERR",
+      "RESP_TIMEOUT_ERR",
+      "GENERAL_ERR"
+  };
+
+  const bool in_range = (status >= IBV_WC_SUCCESS && status <= IBV_WC_GENERAL_ERR);
+  return in_range ? names[status] : "<status out of range!>";
+}
+
+// Map an ibv_qp_state value to the name of its enumerator; values that
+// match none of the states below yield " out of range.".
+const char* Infiniband::qp_state_string(int status) {
+  if (status == IBV_QPS_RESET)
+    return "IBV_QPS_RESET";
+  if (status == IBV_QPS_INIT)
+    return "IBV_QPS_INIT";
+  if (status == IBV_QPS_RTR)
+    return "IBV_QPS_RTR";
+  if (status == IBV_QPS_RTS)
+    return "IBV_QPS_RTS";
+  if (status == IBV_QPS_SQD)
+    return "IBV_QPS_SQD";
+  if (status == IBV_QPS_SQE)
+    return "IBV_QPS_SQE";
+  if (status == IBV_QPS_ERR)
+    return "IBV_QPS_ERR";
+  return " out of range.";
+}
diff --git a/src/msg/async/rdma/Infiniband.h b/src/msg/async/rdma/Infiniband.h
new file mode 100644
index 00000000..2889cdfc
--- /dev/null
+++ b/src/msg/async/rdma/Infiniband.h
@@ -0,0 +1,529 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_INFINIBAND_H
+#define CEPH_INFINIBAND_H
+
+#include <boost/pool/pool.hpp>
+// need this because boost messes with ceph log/assert definitions
+#include "include/ceph_assert.h"
+
+#include <infiniband/verbs.h>
+#include <rdma/rdma_cma.h>
+
+#include <atomic>
+#include <string>
+#include <vector>
+
+#include "include/int_types.h"
+#include "include/page.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/Mutex.h"
+#include "common/perf_counters.h"
+#include "msg/msg_types.h"
+#include "msg/async/net_handler.h"
+
+// Huge page size used by the RDMA memory manager: 2 MiB.
+#define HUGE_PAGE_SIZE (2 * 1024 * 1024)
+// Round x up to the next multiple of HUGE_PAGE_SIZE.
+#define ALIGN_TO_PAGE_SIZE(x) \
+ (((x) + HUGE_PAGE_SIZE -1) / HUGE_PAGE_SIZE * HUGE_PAGE_SIZE)
+
+// Connection parameters exchanged by Infiniband::send_msg()/recv_msg().
+// The struct is packed, so members after `lid` are not naturally aligned.
+struct IBSYNMsg {
+ uint16_t lid;
+ uint32_t qpn;
+ uint32_t psn;
+ uint32_t peer_qpn;
+ union ibv_gid gid;
+} __attribute__((packed));
+
+class RDMAStack;
+class CephContext;
+
+// One port of an RDMA device, with accessors for its number, attributes,
+// LID and GID. The constructor is defined outside this header.
+class Port {
+ struct ibv_context* ctxt;
+ int port_num;
+ struct ibv_port_attr* port_attr;
+ uint16_t lid;
+ int gid_idx = 0;
+ union ibv_gid gid;
+
+ public:
+ explicit Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn);
+ uint16_t get_lid() { return lid; }
+ ibv_gid get_gid() { return gid; }
+ int get_port_num() { return port_num; }
+ ibv_port_attr* get_port_attr() { return port_attr; }
+ int get_gid_idx() { return gid_idx; }
+};
+
+
+// One RDMA device together with its verbs context and, once
+// binding_port() has run, its active port. The LID/GID accessors forward
+// to active_port and therefore require it to be set.
+class Device {
+ ibv_device *device;
+ const char* name;
+ uint8_t port_cnt = 0;
+ public:
+ explicit Device(CephContext *c, ibv_device* d, struct ibv_context *dc);
+ // The device context is closed only when an active port exists.
+ ~Device() {
+ if (active_port) {
+ delete active_port;
+ ceph_assert(ibv_close_device(ctxt) == 0);
+ }
+ }
+ const char* get_name() { return name;}
+ uint16_t get_lid() { return active_port->get_lid(); }
+ ibv_gid get_gid() { return active_port->get_gid(); }
+ int get_gid_idx() { return active_port->get_gid_idx(); }
+ void binding_port(CephContext *c, int port_num);
+ struct ibv_context *ctxt;
+ ibv_device_attr *device_attr;
+ Port* active_port;
+};
+
+
+// Enumerates the RDMA devices of the host and owns one Device wrapper per
+// entry. Construction aborts the process when no device is available.
+class DeviceList {
+  struct ibv_device ** device_list;
+  struct ibv_context ** device_context_list;
+  int num;
+  Device** devices;
+ public:
+  explicit DeviceList(CephContext *cct): device_list(nullptr),
+    device_context_list(nullptr), num(0), devices(nullptr) {
+    // Query the two lists with separate counters; sharing one counter let
+    // the second call silently overwrite the first count.
+    int num_devices = 0;
+    int num_contexts = 0;
+    device_list = ibv_get_device_list(&num_devices);
+    device_context_list = rdma_get_devices(&num_contexts);
+    // Only index entries that exist in both lists.
+    num = num_devices < num_contexts ? num_devices : num_contexts;
+    if (device_list == NULL || device_context_list == NULL || num == 0) {
+      lderr(cct) << __func__ << " failed to get rdma device list.  " << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+    devices = new Device*[num];
+
+    // NOTE(review): entry i of both lists is assumed to describe the same
+    // device, as in the original code - confirm.
+    for (int i = 0;i < num; ++i) {
+      devices[i] = new Device(cct, device_list[i], device_context_list[i]);
+    }
+  }
+  ~DeviceList() {
+    for (int i=0; i < num; ++i) {
+      delete devices[i];
+    }
+    delete []devices;
+    ibv_free_device_list(device_list);
+    // The array returned by rdma_get_devices() must be released too.
+    rdma_free_devices(device_context_list);
+  }
+
+  // Return the device called `device_name`, or the first device when the
+  // name is empty; NULL if no device matches.
+  Device* get_device(const char* device_name) {
+    ceph_assert(devices);
+    const bool any_device = (device_name[0] == '\0');
+    for (int i = 0; i < num; ++i) {
+      if (any_device || !strcmp(device_name, devices[i]->get_name())) {
+        return devices[i];
+      }
+    }
+    return NULL;
+  }
+};
+
+// stat counters
+// Perf counter indices for the RDMA dispatcher; the _first and _last
+// entries delimit the range.
+enum {
+ l_msgr_rdma_dispatcher_first = 94000,
+
+ l_msgr_rdma_polling,
+ l_msgr_rdma_inflight_tx_chunks,
+ l_msgr_rdma_rx_bufs_in_use,
+ l_msgr_rdma_rx_bufs_total,
+
+ l_msgr_rdma_tx_total_wc,
+ l_msgr_rdma_tx_total_wc_errors,
+ l_msgr_rdma_tx_wc_retry_errors,
+ l_msgr_rdma_tx_wc_wr_flush_errors,
+
+ l_msgr_rdma_rx_total_wc,
+ l_msgr_rdma_rx_total_wc_errors,
+ l_msgr_rdma_rx_fin,
+
+ l_msgr_rdma_handshake_errors,
+
+ l_msgr_rdma_total_async_events,
+ l_msgr_rdma_async_last_wqe_events,
+
+ l_msgr_rdma_created_queue_pair,
+ l_msgr_rdma_active_queue_pair,
+
+ l_msgr_rdma_dispatcher_last,
+};
+
+// Second group of RDMA perf counter indices (tx/rx chunk, byte and
+// failure counts); the _first and _last entries delimit the range.
+// NOTE(review): presumably registered per worker rather than per
+// dispatcher - confirm against the code that builds the counters.
+enum {
+ l_msgr_rdma_first = 95000,
+
+ l_msgr_rdma_tx_no_mem,
+ l_msgr_rdma_tx_parital_mem,
+ l_msgr_rdma_tx_failed,
+
+ l_msgr_rdma_tx_chunks,
+ l_msgr_rdma_tx_bytes,
+ l_msgr_rdma_rx_chunks,
+ l_msgr_rdma_rx_bytes,
+ l_msgr_rdma_pending_sent_conns,
+
+ l_msgr_rdma_last,
+};
+
+class RDMADispatcher;
+
+class Infiniband {
+ public:
+ // Owns one verbs protection domain: allocated in the constructor (which
+ // aborts on failure) and deallocated in the destructor.
+ class ProtectionDomain {
+ public:
+ explicit ProtectionDomain(CephContext *cct, Device *device);
+ ~ProtectionDomain();
+
+ ibv_pd* const pd;
+ };
+
+
+ // Owns the registered memory used for RDMA transfers: a Cluster of
+ // send (TX) buffers and a boost::pool based pool of receive (RX)
+ // buffers.
+ class MemoryManager {
+ public:
+ // Descriptor of one buffer, with a cursor (offset) and a read limit
+ // (bound). RX chunks are laid out as this header followed directly by
+ // the payload (data[]); TX chunks point at their payload via `buffer`.
+ class Chunk {
+ public:
+ Chunk(ibv_mr* m, uint32_t len, char* b);
+ ~Chunk();
+
+ void set_offset(uint32_t o);
+ uint32_t get_offset();
+ void set_bound(uint32_t b);
+ void prepare_read(uint32_t b);
+ uint32_t get_bound();
+ uint32_t read(char* buf, uint32_t len);
+ uint32_t write(char* buf, uint32_t len);
+ bool full();
+ bool over();
+ void clear();
+
+ public:
+ ibv_mr* mr;
+ uint32_t lkey = 0;
+ uint32_t bytes;
+ uint32_t bound = 0;
+ uint32_t offset;
+ char* buffer; // TODO: remove buffer/refactor TX
+ char data[0];
+ };
+
+ // A contiguous, registered region split into num_chunk buffers of
+ // buffer_size bytes, plus the free list of their Chunk descriptors.
+ class Cluster {
+ public:
+ Cluster(MemoryManager& m, uint32_t s);
+ ~Cluster();
+
+ int fill(uint32_t num);
+ void take_back(std::vector<Chunk*> &ck);
+ int get_buffers(std::vector<Chunk*> &chunks, size_t bytes);
+ // Map an address inside the region to the descriptor of the buffer
+ // containing it. No range check: callers use is_my_buffer() first.
+ Chunk *get_chunk_by_buffer(const char *c) {
+ uint32_t idx = (c - base) / buffer_size;
+ Chunk *chunk = chunk_base + idx;
+ return chunk;
+ }
+ // True if c points into [base, end).
+ bool is_my_buffer(const char *c) const {
+ return c >= base && c < end;
+ }
+
+ MemoryManager& manager;
+ uint32_t buffer_size;
+ uint32_t num_chunk = 0;
+ Mutex lock;
+ std::vector<Chunk*> free_chunks;
+ char *base = nullptr;
+ char *end = nullptr;
+ Chunk* chunk_base = nullptr;
+ };
+
+ // Per-pool bookkeeping: owning manager, number of buffers allocated
+ // and the optional perf counters that mirror that number.
+ class MemPoolContext {
+ PerfCounters *perf_logger;
+
+ public:
+ MemoryManager *manager;
+ unsigned n_bufs_allocated;
+ explicit MemPoolContext(MemoryManager *m) :
+ perf_logger(nullptr),
+ manager(m),
+ n_bufs_allocated(0) {}
+ // true if it is possible to alloc
+ // more memory for the pool
+ bool can_alloc(unsigned nbufs);
+ void update_stats(int val);
+ void set_stat_logger(PerfCounters *logger);
+ };
+
+ // boost::pool user allocator. malloc()/free() are static, so the pool
+ // context is passed through g_ctx, guarded by lock.
+ class PoolAllocator {
+ // Header stored in front of every block handed to boost::pool.
+ struct mem_info {
+ ibv_mr *mr;
+ MemPoolContext *ctx;
+ unsigned nbufs;
+ Chunk chunks[0];
+ };
+ public:
+ typedef std::size_t size_type;
+ typedef std::ptrdiff_t difference_type;
+
+ static char * malloc(const size_type bytes);
+ static void free(char * const block);
+
+ static MemPoolContext *g_ctx;
+ static Mutex lock;
+ };
+
+ /**
+ * modify boost pool so that it is possible to
+ * have a thread safe 'context' when allocating/freeing
+ * the memory. It is needed to allow a different pool
+ * configurations and bookkeeping per CephContext and
+ * also to be able to use same allocator to deal with
+ * RX and TX pool.
+ * TODO: use boost pool to allocate TX chunks too
+ */
+ class mem_pool : public boost::pool<PoolAllocator> {
+ private:
+ MemPoolContext *ctx;
+ void *slow_malloc();
+
+ public:
+ explicit mem_pool(MemPoolContext *ctx, const size_type nrequested_size,
+ const size_type nnext_size = 32,
+ const size_type nmax_size = 0) :
+ pool(nrequested_size, nnext_size, nmax_size),
+ ctx(ctx) { }
+
+ // Fast path: take an element from the free store when one is
+ // available; otherwise grow the pool through slow_malloc().
+ void *malloc() {
+ if (!store().empty())
+ return (store().malloc)();
+ // need to alloc more memory...
+ // slow path code
+ return slow_malloc();
+ }
+ };
+
+ MemoryManager(CephContext *c, Device *d, ProtectionDomain *p);
+ ~MemoryManager();
+
+ void* malloc(size_t size);
+ void free(void *ptr);
+
+ // TX side. These forward to the `send` cluster, which exists only
+ // after create_tx_pool() has been called.
+ void create_tx_pool(uint32_t size, uint32_t tx_num);
+ void return_tx(std::vector<Chunk*> &chunks);
+ int get_send_buffers(std::vector<Chunk*> &c, size_t bytes);
+ bool is_tx_buffer(const char* c) { return send->is_my_buffer(c); }
+ Chunk *get_tx_chunk_by_buffer(const char *c) {
+ return send->get_chunk_by_buffer(c);
+ }
+ uint32_t get_tx_buffer_size() const {
+ return send->buffer_size;
+ }
+
+ // RX side: take a chunk from / return a chunk to the receive pool.
+ Chunk *get_rx_buffer() {
+ return reinterpret_cast<Chunk *>(rxbuf_pool.malloc());
+ }
+
+ void release_rx_buffer(Chunk *chunk) {
+ rxbuf_pool.free(chunk);
+ }
+
+ void set_rx_stat_logger(PerfCounters *logger) {
+ rxbuf_pool_ctx.set_stat_logger(logger);
+ }
+
+ CephContext *cct;
+ private:
+ // TODO: Cluster -> TxPool txbuf_pool
+ // chunk layout fix
+ //
+ Cluster* send = nullptr;// SEND
+ Device *device;
+ ProtectionDomain *pd;
+ MemPoolContext rxbuf_pool_ctx;
+ mem_pool rxbuf_pool;
+
+
+ void* huge_pages_malloc(size_t size);
+ void huge_pages_free(void *ptr);
+ };
+
+ private:
+ uint32_t tx_queue_len = 0;
+ uint32_t rx_queue_len = 0;
+ uint32_t max_sge = 0;
+ uint8_t ib_physical_port = 0;
+ MemoryManager* memory_manager = nullptr;
+ ibv_srq* srq = nullptr; // shared receive work queue
+ Device *device = NULL;
+ ProtectionDomain *pd = NULL;
+ DeviceList *device_list = nullptr;
+ void wire_gid_to_gid(const char *wgid, union ibv_gid *gid);
+ void gid_to_wire_gid(const union ibv_gid *gid, char wgid[]);
+ CephContext *cct;
+ Mutex lock;
+ bool initialized = false;
+ const std::string &device_name;
+ uint8_t port_num;
+ bool support_srq = false;
+
+ public:
+ explicit Infiniband(CephContext *c);
+ ~Infiniband();
+ void init();
+ static void verify_prereq(CephContext *cct);
+
+ // Wraps a verbs completion event channel. init() creates the channel,
+ // bind_cq() records the CQ whose events ack_events() acknowledges, and
+ // get_cq_event() retrieves events and acknowledges them in batches of
+ // MAX_ACK_EVENT.
+ class CompletionChannel {
+ static const uint32_t MAX_ACK_EVENT = 5000;
+ CephContext *cct;
+ Infiniband& infiniband;
+ ibv_comp_channel *channel;
+ ibv_cq *cq;
+ uint32_t cq_events_that_need_ack;
+
+ public:
+ CompletionChannel(CephContext *c, Infiniband &ib);
+ ~CompletionChannel();
+ int init();
+ bool get_cq_event();
+ // Both accessors below require a successful init().
+ int get_fd() { return channel->fd; }
+ ibv_comp_channel* get_channel() { return channel; }
+ void bind_cq(ibv_cq *c) { cq = c; }
+ void ack_events();
+ };
+
+ // This class encapsulates the creation, use, and destruction of an RC
+ // completion queue.
+ //
+ // You need to call init(), which creates a cq and associates it with the comp channel.
+ class CompletionQueue {
+ public:
+ CompletionQueue(CephContext *c, Infiniband &ib,
+ const uint32_t qd, CompletionChannel *cc)
+ : cct(c), infiniband(ib), channel(cc), cq(NULL), queue_depth(qd) {} // cq stays NULL after construction
+ ~CompletionQueue();
+ int init();
+ int poll_cq(int num_entries, ibv_wc *ret_wc_array);
+
+ ibv_cq* get_cq() const { return cq; }
+ int rearm_notify(bool solicited_only=true);
+ CompletionChannel* get_cc() const { return channel; }
+ private:
+ CephContext *cct;
+ Infiniband& infiniband; // Infiniband to which this CQ belongs
+ CompletionChannel *channel; // completion channel passed to the constructor
+ ibv_cq *cq;
+ uint32_t queue_depth; // depth requested at construction (qd)
+ };
+
+ // This class encapsulates the creation, use, and destruction of an RC
+ // queue pair.
+ //
+ // You need to call init(), which creates a qp and brings it to the INIT state.
+ // After obtaining the lid, qpn, and psn of a remote queue pair, one
+ // must call plumb() to bring the queue pair to the RTS state.
+ class QueuePair {
+ public:
+ QueuePair(CephContext *c, Infiniband& infiniband, ibv_qp_type type,
+ int ib_physical_port, ibv_srq *srq,
+ Infiniband::CompletionQueue* txcq,
+ Infiniband::CompletionQueue* rxcq,
+ uint32_t tx_queue_len, uint32_t max_recv_wr, struct rdma_cm_id *cid, uint32_t q_key = 0);
+ ~QueuePair();
+
+ int init();
+
+ /**
+ * Get the initial packet sequence number for this QueuePair.
+ * This is randomly generated on creation. It should not be confused
+ * with the remote side's PSN, which is set in #plumb().
+ */
+ uint32_t get_initial_psn() const { return initial_psn; };
+ /**
+ * Get the local queue pair number for this QueuePair.
+ * QPNs are analogous to UDP/TCP port numbers.
+ */
+ uint32_t get_local_qp_number() const { return qp->qp_num; }; // dereferences qp, so qp must be non-null
+ /**
+ * Get the remote queue pair number for this QueuePair, as set in #plumb().
+ * QPNs are analogous to UDP/TCP port numbers.
+ */
+ int get_remote_qp_number(uint32_t *rqp) const;
+ /**
+ * Get the remote infiniband address for this QueuePair, as set in #plumb().
+ * LIDs are "local IDs" in infiniband terminology. They are short, locally
+ * routable addresses.
+ */
+ int get_remote_lid(uint16_t *lid) const;
+ /**
+ * Get the state of a QueuePair.
+ */
+ int get_state() const;
+ /**
+ * Return true if the queue pair is in an error state, false otherwise.
+ */
+ bool is_error() const;
+ void add_tx_wr(uint32_t amt) { tx_wr_inflight += amt; } // bumps the atomic in-flight Tx WQE counter
+ void dec_tx_wr(uint32_t amt) { tx_wr_inflight -= amt; } // lowers the atomic in-flight Tx WQE counter
+ uint32_t get_tx_wr() const { return tx_wr_inflight; }
+ ibv_qp* get_qp() const { return qp; }
+ Infiniband::CompletionQueue* get_tx_cq() const { return txcq; }
+ Infiniband::CompletionQueue* get_rx_cq() const { return rxcq; }
+ int to_dead();
+ bool is_dead() const { return dead; }
+
+ private:
+ CephContext *cct;
+ Infiniband& infiniband; // Infiniband to which this QP belongs
+ ibv_qp_type type; // QP type (IBV_QPT_RC, etc.)
+ ibv_context* ctxt; // device context of the HCA to use
+ int ib_physical_port;
+ ibv_pd* pd; // protection domain
+ ibv_srq* srq; // shared receive queue
+ ibv_qp* qp; // infiniband verbs QP handle
+ struct rdma_cm_id *cm_id;
+ Infiniband::CompletionQueue* txcq;
+ Infiniband::CompletionQueue* rxcq;
+ uint32_t initial_psn; // initial packet sequence number
+ uint32_t max_send_wr;
+ uint32_t max_recv_wr;
+ uint32_t q_key;
+ bool dead; // reported by is_dead()
+ std::atomic<uint32_t> tx_wr_inflight = {0}; // counter for inflight Tx WQEs
+ };
+
+ public:
+ typedef MemoryManager::Cluster Cluster;
+ typedef MemoryManager::Chunk Chunk;
+ QueuePair* create_queue_pair(CephContext *c, CompletionQueue*, CompletionQueue*,
+ ibv_qp_type type, struct rdma_cm_id *cm_id);
+ ibv_srq* create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge);
+ // post rx buffers to srq, return number of buffers actually posted
+ int post_chunks_to_rq(int num, ibv_qp *qp=NULL);
+ void post_chunk_to_pool(Chunk* chunk) { // returns an rx chunk to the memory manager's rx pool
+ get_memory_manager()->release_rx_buffer(chunk);
+ }
+ int get_tx_buffers(std::vector<Chunk*> &c, size_t bytes);
+ CompletionChannel *create_comp_channel(CephContext *c);
+ CompletionQueue *create_comp_queue(CephContext *c, CompletionChannel *cc=NULL);
+ uint8_t get_ib_physical_port() { return ib_physical_port; }
+ int send_msg(CephContext *cct, int sd, IBSYNMsg& msg);
+ int recv_msg(CephContext *cct, int sd, IBSYNMsg& msg);
+ uint16_t get_lid() { return device->get_lid(); } // dereferences device, which defaults to NULL - presumably valid only after init(); TODO confirm
+ ibv_gid get_gid() { return device->get_gid(); } // same device precondition as get_lid()
+ MemoryManager* get_memory_manager() { return memory_manager; }
+ Device* get_device() { return device; }
+ int get_async_fd() { return device->ctxt->async_fd; } // fd taken from the device's verbs context
+ bool is_tx_buffer(const char* c) { return memory_manager->is_tx_buffer(c);} // dereferences memory_manager, which defaults to nullptr
+ Chunk *get_tx_chunk_by_buffer(const char *c) { return memory_manager->get_tx_chunk_by_buffer(c); }
+ static const char* wc_status_to_string(int status);
+ static const char* qp_state_string(int status);
+ uint32_t get_rx_queue_len() const { return rx_queue_len; }
+};
+
+#endif
diff --git a/src/msg/async/rdma/RDMAConnectedSocketImpl.cc b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc
new file mode 100644
index 00000000..89be7428
--- /dev/null
+++ b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc
@@ -0,0 +1,743 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "RDMAStack.h"
+
+// Event callback that forwards to
+// RDMAConnectedSocketImpl::handle_connection_established() for as long as it
+// has not been deactivated with close().
+class C_handle_connection_established : public EventCallback {
+  RDMAConnectedSocketImpl *csi;  // socket to notify; not owned
+  bool active = true;            // cleared by close(); events are dropped afterwards
+ public:
+  // explicit, like the sibling C_handle_connection_read, so a socket pointer
+  // is never silently converted into a callback object.
+  explicit C_handle_connection_established(RDMAConnectedSocketImpl *w) : csi(w) {}
+  void do_request(uint64_t fd) final {
+    if (active)
+      csi->handle_connection_established();
+  }
+  // Stop forwarding events to the socket.
+  void close() {
+    active = false;
+  }
+};
+
+// Event callback that hands readable events to
+// RDMAConnectedSocketImpl::handle_connection() until close() is called.
+class C_handle_connection_read : public EventCallback {
+ public:
+  explicit C_handle_connection_read(RDMAConnectedSocketImpl *w): csi(w) {}
+
+  // Deactivate the callback; later events are ignored.
+  void close() { active = false; }
+
+  void do_request(uint64_t fd) final {
+    if (!active)
+      return;
+    csi->handle_connection();
+  }
+
+ private:
+  RDMAConnectedSocketImpl *csi;  // target socket; not owned
+  bool active = true;
+};
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAConnectedSocketImpl "
+
+RDMAConnectedSocketImpl::RDMAConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s,
+ RDMAWorker *w)
+ : cct(cct), connected(0), error(0), infiniband(ib),
+ dispatcher(s), worker(w), lock("RDMAConnectedSocketImpl::lock"),
+ is_server(false), read_handler(new C_handle_connection_read(this)),
+ established_handler(new C_handle_connection_established(this)),
+ active(false), pending(false)
+{
+ if (!cct->_conf->ms_async_rdma_cm) { // without rdma_cm the queue pair and the handshake message are prepared here
+ qp = infiniband->create_queue_pair(cct, s->get_tx_cq(), s->get_rx_cq(), IBV_QPT_RC, NULL);
+ my_msg.qpn = qp->get_local_qp_number(); // NOTE(review): qp is used without a null check
+ my_msg.psn = qp->get_initial_psn();
+ my_msg.lid = infiniband->get_lid();
+ my_msg.peer_qpn = 0;
+ my_msg.gid = infiniband->get_gid();
+ notify_fd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK); // NOTE(review): result is not checked here
+ dispatcher->register_qp(qp, this);
+ dispatcher->perf_logger->inc(l_msgr_rdma_created_queue_pair);
+ dispatcher->perf_logger->inc(l_msgr_rdma_active_queue_pair);
+ }
+}
+
+// Tear the socket down: drop the handshake handlers, unregister from the
+// worker and dispatcher, return every chunk still held (unread completions
+// and partially read buffers) to the pool, and close both descriptors.
+RDMAConnectedSocketImpl::~RDMAConnectedSocketImpl()
+{
+  ldout(cct, 20) << __func__ << " destruct." << dendl;
+  cleanup();
+  worker->remove_pending_conn(this);
+  dispatcher->erase_qpn(my_msg.qpn);
+
+  // Completions that were never consumed still reference an rx chunk.
+  for (const auto &completion : wc) {
+    dispatcher->post_chunk_to_pool(reinterpret_cast<Chunk*>(completion.wr_id));
+  }
+  for (auto *leftover : buffers) {
+    dispatcher->post_chunk_to_pool(leftover);
+  }
+
+  Mutex::Locker l(lock);
+  if (notify_fd >= 0) {
+    ::close(notify_fd);
+  }
+  if (tcp_fd >= 0) {
+    ::close(tcp_fd);
+  }
+  error = ECONNRESET;
+}
+
+// Queue work completions for this socket (under the lock) and signal the
+// reader through notify(). An empty queue simply takes over the incoming
+// vector; otherwise the new entries are appended.
+void RDMAConnectedSocketImpl::pass_wc(std::vector<ibv_wc> &&v)
+{
+  Mutex::Locker l(lock);
+  if (!wc.empty()) {
+    wc.insert(wc.end(), v.begin(), v.end());
+  } else {
+    wc = std::move(v);
+  }
+  notify();
+}
+
+// Hand the queued work completions to the caller by swapping them into `w`
+// under the lock. `w` is left untouched when nothing is queued.
+void RDMAConnectedSocketImpl::get_wc(std::vector<ibv_wc> &w)
+{
+  Mutex::Locker l(lock);
+  if (!wc.empty()) {
+    w.swap(wc);
+  }
+}
+
+// Move the queue pair to RTR and then to RTS using the peer parameters held
+// in peer_msg.
+//
+// Returns 0 on success and -1 if either ibv_modify_qp() transition fails.
+// On the client side a successful activation also marks the socket as
+// connected and flushes pending_bl through submit().
+int RDMAConnectedSocketImpl::activate()
+{
+  ibv_qp_attr qpa;
+  int r;
+
+  // now connect up the qps and switch to RTR
+  memset(&qpa, 0, sizeof(qpa));
+  qpa.qp_state = IBV_QPS_RTR;
+  qpa.path_mtu = IBV_MTU_1024;
+  qpa.dest_qp_num = peer_msg.qpn;
+  qpa.rq_psn = peer_msg.psn;
+  qpa.max_dest_rd_atomic = 1;
+  qpa.min_rnr_timer = 12;
+  //qpa.ah_attr.is_global = 0;
+  qpa.ah_attr.is_global = 1;
+  qpa.ah_attr.grh.hop_limit = 6;
+  qpa.ah_attr.grh.dgid = peer_msg.gid;
+
+  qpa.ah_attr.grh.sgid_index = infiniband->get_device()->get_gid_idx();
+
+  qpa.ah_attr.dlid = peer_msg.lid;
+  qpa.ah_attr.sl = cct->_conf->ms_async_rdma_sl;
+  qpa.ah_attr.grh.traffic_class = cct->_conf->ms_async_rdma_dscp;
+  qpa.ah_attr.src_path_bits = 0;
+  qpa.ah_attr.port_num = (uint8_t)(infiniband->get_ib_physical_port());
+
+  ldout(cct, 20) << __func__ << " Choosing gid_index " << (int)qpa.ah_attr.grh.sgid_index << ", sl " << (int)qpa.ah_attr.sl << dendl;
+
+  r = ibv_modify_qp(qp->get_qp(), &qpa, IBV_QP_STATE |
+                    IBV_QP_AV |
+                    IBV_QP_PATH_MTU |
+                    IBV_QP_DEST_QPN |
+                    IBV_QP_RQ_PSN |
+                    IBV_QP_MIN_RNR_TIMER |
+                    IBV_QP_MAX_DEST_RD_ATOMIC);
+  if (r) {
+    // ibv_modify_qp() reports the failure reason through its return value,
+    // so log that code rather than whatever errno currently holds.
+    lderr(cct) << __func__ << " failed to transition to RTR state: "
+               << cpp_strerror(r > 0 ? r : -r) << dendl;
+    return -1;
+  }
+
+  ldout(cct, 20) << __func__ << " transition to RTR state successfully." << dendl;
+
+  // now move to RTS
+  qpa.qp_state = IBV_QPS_RTS;
+
+  // How long to wait before retrying if packet lost or server dead.
+  // Supposedly the timeout is 4.096us*2^timeout. However, the actual
+  // timeout appears to be 4.096us*2^(timeout+1), so the setting
+  // below creates a 135ms timeout.
+  qpa.timeout = 14;
+
+  // How many times to retry after timeouts before giving up.
+  qpa.retry_cnt = 7;
+
+  // How many times to retry after RNR (receiver not ready) condition
+  // before giving up. Occurs when the remote side has not yet posted
+  // a receive request.
+  qpa.rnr_retry = 7; // 7 is infinite retry.
+  qpa.sq_psn = my_msg.psn;
+  qpa.max_rd_atomic = 1;
+
+  r = ibv_modify_qp(qp->get_qp(), &qpa, IBV_QP_STATE |
+                    IBV_QP_TIMEOUT |
+                    IBV_QP_RETRY_CNT |
+                    IBV_QP_RNR_RETRY |
+                    IBV_QP_SQ_PSN |
+                    IBV_QP_MAX_QP_RD_ATOMIC);
+  if (r) {
+    // Same as above: the error code is the return value, not errno.
+    lderr(cct) << __func__ << " failed to transition to RTS state: "
+               << cpp_strerror(r > 0 ? r : -r) << dendl;
+    return -1;
+  }
+
+  // the queue pair should be ready to use once the client has finished
+  // setting up their end.
+  ldout(cct, 20) << __func__ << " transition to RTS state successfully." << dendl;
+  ldout(cct, 20) << __func__ << " QueuePair: " << qp << " with qp:" << qp->get_qp() << dendl;
+
+  if (!is_server) {
+    connected = 1; //indicate successfully
+    ldout(cct, 20) << __func__ << " handle fake send, wake it up. QP: " << my_msg.qpn << dendl;
+    submit(false);
+  }
+  active = true;
+
+  return 0;
+}
+
+// Open the TCP side channel used to exchange the IB handshake messages.
+//
+// With opts.nonblock the connect is started asynchronously and
+// established_handler is registered for the socket; otherwise the handshake
+// message is sent right away through handle_connection_established(false).
+// Returns 0 on success or a negative value on failure.
+int RDMAConnectedSocketImpl::try_connect(const entity_addr_t& peer_addr, const SocketOptions &opts) {
+  ldout(cct, 20) << __func__ << " nonblock:" << opts.nonblock << ", nodelay:"
+                 << opts.nodelay << ", rbuf_size: " << opts.rcbuf_size << dendl;
+  NetHandler net(cct);
+
+  // we construct a socket to transport ib sync message
+  // but we shouldn't block in tcp connecting
+  if (opts.nonblock) {
+    tcp_fd = net.nonblock_connect(peer_addr, opts.connect_bind_addr);
+  } else {
+    tcp_fd = net.connect(peer_addr, opts.connect_bind_addr);
+  }
+
+  if (tcp_fd < 0) {
+    return -errno;
+  }
+
+  int r = net.set_socket_options(tcp_fd, opts.nodelay, opts.rcbuf_size);
+  if (r < 0) {
+    // Capture errno before ::close(), which may overwrite it and hide the
+    // real reason for the failure.
+    const int saved_errno = errno;
+    ::close(tcp_fd);
+    tcp_fd = -1;
+    return -saved_errno;
+  }
+
+  ldout(cct, 20) << __func__ << " tcp_fd: " << tcp_fd << dendl;
+  net.set_priority(tcp_fd, opts.priority, peer_addr.get_family());
+  r = 0;
+  if (opts.nonblock) {
+    worker->center.create_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE , established_handler);
+  } else {
+    r = handle_connection_established(false);
+  }
+  return r;
+}
+
+// Called once the TCP side channel is usable: drops the connect-time file
+// event, sends our handshake message (with peer_qpn cleared) and registers
+// read_handler for the reply.
+//
+// Returns 0 on success, -1 if the socket is already marked connected, or the
+// negative result of send_msg(). fault() is raised on failure only when
+// need_set_fault is true.
+int RDMAConnectedSocketImpl::handle_connection_established(bool need_set_fault) {
+  ldout(cct, 20) << __func__ << " start " << dendl;
+  // delete read event
+  worker->center.delete_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE);
+
+  if (connected == 1) {
+    ldout(cct, 1) << __func__ << " warnning: logic failed " << dendl;
+    if (need_set_fault)
+      fault();
+    return -1;
+  }
+
+  // send handshake msg to server
+  my_msg.peer_qpn = 0;
+  const int sent = infiniband->send_msg(cct, tcp_fd, my_msg);
+  if (sent < 0) {
+    ldout(cct, 1) << __func__ << " send handshake msg failed." << sent << dendl;
+    if (need_set_fault)
+      fault();
+    return sent;
+  }
+
+  worker->center.create_file_event(tcp_fd, EVENT_READABLE, read_handler);
+  ldout(cct, 20) << __func__ << " finish " << dendl;
+  return 0;
+}
+
+void RDMAConnectedSocketImpl::handle_connection() { // reads one handshake message from tcp_fd into peer_msg and advances the handshake
+ ldout(cct, 20) << __func__ << " QP: " << my_msg.qpn << " tcp_fd: " << tcp_fd << " notify_fd: " << notify_fd << dendl;
+ int r = infiniband->recv_msg(cct, tcp_fd, peer_msg);
+ if (r <= 0) {
+ if (r != -EAGAIN) { // -EAGAIN just returns and waits for the next event; 0 or any other error is a fault
+ dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors);
+ ldout(cct, 1) << __func__ << " recv handshake msg failed." << dendl;
+ fault();
+ }
+ return;
+ }
+
+ if (1 == connected) { // a handshake message on an already connected socket is treated as a fault
+ ldout(cct, 1) << __func__ << " warnning: logic failed: read len: " << r << dendl;
+ fault();
+ return;
+ }
+
+ if (!is_server) {// syn + ack from server
+ my_msg.peer_qpn = peer_msg.qpn; // echo the server's qpn back in our ack
+ ldout(cct, 20) << __func__ << " peer msg : < " << peer_msg.qpn << ", " << peer_msg.psn
+ << ", " << peer_msg.lid << ", " << peer_msg.peer_qpn << "> " << dendl;
+ if (!connected) {
+ r = activate();
+ ceph_assert(!r); // a failed queue pair transition aborts here
+ }
+ notify();
+ r = infiniband->send_msg(cct, tcp_fd, my_msg);
+ if (r < 0) {
+ ldout(cct, 1) << __func__ << " send client ack failed." << dendl;
+ dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors);
+ fault();
+ }
+ } else {
+ if (peer_msg.peer_qpn == 0) {// syn from client
+ if (active) { // already activated: ignore the repeated syn
+ ldout(cct, 10) << __func__ << " server is already active." << dendl;
+ return ;
+ }
+ r = activate();
+ ceph_assert(!r); // a failed queue pair transition aborts here
+ r = infiniband->send_msg(cct, tcp_fd, my_msg);
+ if (r < 0) {
+ ldout(cct, 1) << __func__ << " server ack failed." << dendl;
+ dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors);
+ fault();
+ return ;
+ }
+ } else { // ack from client
+ connected = 1;
+ ldout(cct, 10) << __func__ << " handshake of rdma is done. server connected: " << connected << dendl;
+ //cleanup();
+ submit(false); // flush anything queued in pending_bl while not connected
+ notify();
+ }
+ }
+}
+
+ssize_t RDMAConnectedSocketImpl::read(char* buf, size_t len) // copies up to len received bytes into buf; returns the byte count, -EAGAIN when nothing is available, or -error
+{
+ uint64_t i = 0;
+ int r = ::read(notify_fd, &i, sizeof(i)); // consumes the notify_fd event; the result is only logged
+ ldout(cct, 20) << __func__ << " notify_fd : " << i << " in " << my_msg.qpn << " r = " << r << dendl;
+
+ if (!active) {
+ ldout(cct, 1) << __func__ << " when ib not active. len: " << len << dendl;
+ return -EAGAIN;
+ }
+
+ if (0 == connected) {
+ ldout(cct, 1) << __func__ << " when ib not connected. len: " << len <<dendl;
+ return -EAGAIN;
+ }
+ ssize_t read = 0;
+ if (!buffers.empty()) // serve data left over from earlier chunks first
+ read = read_buffers(buf,len);
+
+ std::vector<ibv_wc> cqe;
+ get_wc(cqe); // takes over all queued completions
+ if (cqe.empty()) {
+ if (!buffers.empty()) { // leftover data remains: re-signal notify_fd so the caller reads again
+ notify();
+ }
+ if (read > 0) {
+ return read;
+ }
+ if (error) {
+ return -error;
+ } else {
+ return -EAGAIN;
+ }
+ }
+
+ ldout(cct, 20) << __func__ << " poll queue got " << cqe.size() << " responses. QP: " << my_msg.qpn << dendl;
+ for (size_t i = 0; i < cqe.size(); ++i) {
+ ibv_wc* response = &cqe[i];
+ ceph_assert(response->status == IBV_WC_SUCCESS);
+ Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id); // wr_id carries the Chunk pointer
+ ldout(cct, 25) << __func__ << " chunk length: " << response->byte_len << " bytes." << chunk << dendl;
+ chunk->prepare_read(response->byte_len);
+ worker->perf_logger->inc(l_msgr_rdma_rx_bytes, response->byte_len);
+ if (response->byte_len == 0) { // a zero-length completion is treated as the peer's close message
+ dispatcher->perf_logger->inc(l_msgr_rdma_rx_fin);
+ if (connected) {
+ error = ECONNRESET;
+ ldout(cct, 20) << __func__ << " got remote close msg..." << dendl;
+ }
+ dispatcher->post_chunk_to_pool(chunk);
+ } else {
+ if (read == (ssize_t)len) { // caller's buffer already full: keep the whole chunk for later
+ buffers.push_back(chunk);
+ ldout(cct, 25) << __func__ << " buffers add a chunk: " << response->byte_len << dendl;
+ } else if (read + response->byte_len > (ssize_t)len) { // chunk only partly fits: copy what fits, keep the rest
+ read += chunk->read(buf+read, (ssize_t)len-read);
+ buffers.push_back(chunk);
+ ldout(cct, 25) << __func__ << " buffers add a chunk: " << chunk->get_offset() << ":" << chunk->get_bound() << dendl;
+ } else { // chunk fits entirely: copy it and return it to the pool
+ read += chunk->read(buf+read, response->byte_len);
+ dispatcher->post_chunk_to_pool(chunk);
+ update_post_backlog();
+ }
+ }
+ }
+
+ worker->perf_logger->inc(l_msgr_rdma_rx_chunks, cqe.size());
+ if (is_server && connected == 0) {
+ ldout(cct, 20) << __func__ << " we do not need last handshake, QP: " << my_msg.qpn << " peer QP: " << peer_msg.qpn << dendl;
+ connected = 1; //if so, we don't need the last handshake
+ cleanup();
+ submit(false);
+ }
+
+ if (!buffers.empty()) { // unread data remains: re-signal notify_fd
+ notify();
+ }
+
+ if (read == 0 && error)
+ return -error;
+ return read == 0 ? -EAGAIN : read;
+}
+
+// Copy up to `len` bytes into `buf` from the chunks held in `buffers`, in
+// order. Chunks that report over() are returned to the pool and dropped from
+// the list; a chunk that still has unread data stays at the front.
+// Returns the number of bytes copied.
+ssize_t RDMAConnectedSocketImpl::read_buffers(char* buf, size_t len)
+{
+  size_t copied = 0;
+  auto cur = buffers.begin();
+  while (cur != buffers.end()) {
+    const size_t got = (*cur)->read(buf + copied, len - copied);
+    copied += got;
+    ldout(cct, 25) << __func__ << " this iter read: " << got << " bytes." << " offset: " << (*cur)->get_offset() << " ,bound: " << (*cur)->get_bound() << ". Chunk:" << *cur << dendl;
+    if ((*cur)->over()) {
+      dispatcher->post_chunk_to_pool(*cur);
+      update_post_backlog();
+      ldout(cct, 25) << __func__ << " one chunk over." << dendl;
+    }
+    if (copied == len)
+      break;
+    ++cur;
+  }
+
+  // When the loop stopped on a fully consumed chunk, erase that one as well.
+  if (cur != buffers.end() && (*cur)->over()) {
+    ++cur;
+  }
+  buffers.erase(buffers.begin(), cur);
+  ldout(cct, 25) << __func__ << " got " << copied << " bytes, buffers size: " << buffers.size() << dendl;
+  return copied;
+}
+
+// Take one received chunk (a buffered one first, otherwise the first new
+// completion) and report its size; every other new completion is queued in
+// `buffers`.
+//
+// Returns the chunk's bound, -EAGAIN when nothing is available, or -error.
+// NOTE(review): `data` is never filled in and the selected chunk is not
+// released (see the FIXMEs), so this path looks unfinished.
+ssize_t RDMAConnectedSocketImpl::zero_copy_read(bufferptr &data)
+{
+  if (error)
+    return -error;
+  ssize_t size = 0;
+
+  bool loaded = false;
+  auto iter = buffers.begin();
+  if (iter != buffers.end()) {
+    Chunk *chunk = *iter;
+    // FIXME need to handle release
+    // auto del = std::bind(&Chunk::post_srq, std::move(chunk), infiniband);
+    buffers.erase(iter);
+    loaded = true;
+    size = chunk->bound;
+  }
+
+  std::vector<ibv_wc> cqe;
+  get_wc(cqe);
+  if (cqe.empty())
+    return size == 0 ? -EAGAIN : size;
+
+  ldout(cct, 20) << __func__ << " pool completion queue got " << cqe.size() << " responses."<< dendl;
+
+  for (size_t i = 0; i < cqe.size(); ++i) {
+    // Read the completions that get_wc() actually returned. The previous
+    // code indexed an uninitialized local array here, and could run past its
+    // 16 entries.
+    ibv_wc *response = &cqe[i];
+    Chunk *chunk = reinterpret_cast<Chunk*>(response->wr_id);
+    chunk->prepare_read(response->byte_len);
+    if (!loaded && i == 0) {
+      // FIXME need to handle release
+      // auto del = std::bind(&Chunk::post_srq, std::move(chunk), infiniband);
+      size = chunk->bound;
+      continue;
+    }
+    buffers.push_back(chunk);
+  }
+
+  if (size == 0)
+    return -EAGAIN;
+  return size;
+}
+
+// Queue `bl` for transmission and try to push it out.
+//
+// The data is always moved into pending_bl. While the socket is not yet
+// connected the call only pretends to send; otherwise submit() is invoked.
+// Returns the number of bytes accepted, 0 for an empty list, or a negative
+// error (-EPIPE when an error is set and the socket is not active).
+ssize_t RDMAConnectedSocketImpl::send(bufferlist &bl, bool more)
+{
+  if (error) {
+    return active ? -error : -EPIPE;
+  }
+
+  size_t bytes = bl.length();
+  if (bytes == 0) {
+    return 0;
+  }
+
+  {
+    Mutex::Locker l(lock);
+    pending_bl.claim_append(bl);
+    if (!connected) {
+      ldout(cct, 20) << __func__ << " fake send to upper, QP: " << my_msg.qpn << dendl;
+      return bytes;
+    }
+  }
+
+  ldout(cct, 20) << __func__ << " QP: " << my_msg.qpn << dendl;
+  const ssize_t rc = submit(more);
+  // -EAGAIN only means data is still queued; it is not an error for callers.
+  const bool hard_failure = (rc < 0) && (rc != -EAGAIN);
+  if (hard_failure) {
+    return rc;
+  }
+  return bytes;
+}
+
+ssize_t RDMAConnectedSocketImpl::submit(bool more) // posts as much of pending_bl as tx chunks allow; returns 0 when everything was posted, -EAGAIN when data remains, or a negative error
+{
+ if (error)
+ return -error;
+ Mutex::Locker l(lock);
+ size_t bytes = pending_bl.length();
+ ldout(cct, 20) << __func__ << " we need " << bytes << " bytes. iov size: "
+ << pending_bl.buffers().size() << dendl;
+ if (!bytes)
+ return 0;
+
+ auto fill_tx_via_copy = [this](std::vector<Chunk*> &tx_buffers, // copies [start, end) into newly reserved tx chunks appended to tx_buffers; returns the bytes copied
+ unsigned bytes,
+ auto& start,
+ const auto& end) -> unsigned {
+ ceph_assert(start != end);
+ auto chunk_idx = tx_buffers.size(); // index of the first chunk this call appends
+ int ret = worker->get_reged_mem(this, tx_buffers, bytes);
+ if (ret == 0) { // nothing could be reserved
+ ldout(cct, 1) << __func__ << " no enough buffers in worker " << worker << dendl;
+ worker->perf_logger->inc(l_msgr_rdma_tx_no_mem);
+ return 0;
+ }
+
+ unsigned total_copied = 0;
+ Chunk *current_chunk = tx_buffers[chunk_idx];
+ while (start != end) {
+ const uintptr_t addr = reinterpret_cast<uintptr_t>(start->c_str());
+ unsigned copied = 0;
+ while (copied < start->length()) {
+ uint32_t r = current_chunk->write((char*)addr+copied, start->length() - copied);
+ copied += r;
+ total_copied += r;
+ bytes -= r;
+ if (current_chunk->full()){
+ if (++chunk_idx == tx_buffers.size()) // ran out of reserved chunks: report the partial copy
+ return total_copied;
+ current_chunk = tx_buffers[chunk_idx];
+ }
+ }
+ ++start;
+ }
+ ceph_assert(bytes == 0);
+ return total_copied;
+ };
+
+ std::vector<Chunk*> tx_buffers;
+ auto it = std::cbegin(pending_bl.buffers());
+ auto copy_it = it; // start of the run of buffers that still has to be copied
+ unsigned total = 0;
+ unsigned need_reserve_bytes = 0; // size of the pending run [copy_it, it)
+ while (it != pending_bl.buffers().end()) {
+ if (infiniband->is_tx_buffer(it->raw_c_str())) { // buffer already lives in a tx chunk: send that chunk directly
+ if (need_reserve_bytes) { // first copy the run that precedes it, to keep ordering
+ unsigned copied = fill_tx_via_copy(tx_buffers, need_reserve_bytes, copy_it, it);
+ total += copied;
+ if (copied < need_reserve_bytes) // partial copy: send what we have and stop
+ goto sending;
+ need_reserve_bytes = 0;
+ }
+ ceph_assert(copy_it == it);
+ tx_buffers.push_back(infiniband->get_tx_chunk_by_buffer(it->raw_c_str()));
+ total += it->length();
+ ++copy_it;
+ } else {
+ need_reserve_bytes += it->length();
+ }
+ ++it;
+ }
+ if (need_reserve_bytes)
+ total += fill_tx_via_copy(tx_buffers, need_reserve_bytes, copy_it, it);
+
+ sending:
+ if (total == 0)
+ return -EAGAIN;
+ ceph_assert(total <= pending_bl.length());
+ bufferlist swapped;
+ if (total < pending_bl.length()) { // keep only the unsent tail in pending_bl
+ worker->perf_logger->inc(l_msgr_rdma_tx_parital_mem);
+ pending_bl.splice(total, pending_bl.length()-total, &swapped);
+ pending_bl.swap(swapped);
+ } else {
+ pending_bl.clear();
+ }
+
+ ldout(cct, 20) << __func__ << " left bytes: " << pending_bl.length() << " in buffers "
+ << pending_bl.buffers().size() << " tx chunks " << tx_buffers.size() << dendl;
+
+ int r = post_work_request(tx_buffers);
+ if (r < 0)
+ return r;
+
+ ldout(cct, 20) << __func__ << " finished sending " << bytes << " bytes." << dendl; // note: bytes is the length pending on entry, not the amount posted
+ return pending_bl.length() ? -EAGAIN : 0;
+}
+
+// Build one signaled IBV_WR_SEND work request per tx chunk, chain them in
+// order and post the chain to the queue pair.
+//
+// tx_buffers must not be empty (its first element is logged).
+// Returns 0 on success or a negative errno value if ibv_post_send() fails.
+int RDMAConnectedSocketImpl::post_work_request(std::vector<Chunk*> &tx_buffers)
+{
+  ldout(cct, 20) << __func__ << " QP: " << my_msg.qpn << " " << tx_buffers[0] << dendl;
+  vector<Chunk*>::iterator current_buffer = tx_buffers.begin();
+  ibv_sge isge[tx_buffers.size()];
+  uint32_t current_sge = 0;
+  ibv_send_wr iswr[tx_buffers.size()];
+  uint32_t current_swr = 0;
+  ibv_send_wr* pre_wr = NULL;
+  uint32_t num = 0;
+
+  // FIPS zeroization audit 20191115: these memsets are not security related.
+  memset(iswr, 0, sizeof(iswr));
+  memset(isge, 0, sizeof(isge));
+
+  while (current_buffer != tx_buffers.end()) {
+    isge[current_sge].addr = reinterpret_cast<uint64_t>((*current_buffer)->buffer);
+    isge[current_sge].length = (*current_buffer)->get_offset();
+    isge[current_sge].lkey = (*current_buffer)->mr->lkey;
+    ldout(cct, 25) << __func__ << " sending buffer: " << *current_buffer << " length: " << isge[current_sge].length << dendl;
+
+    // wr_id carries the chunk pointer so the completion can be mapped back.
+    iswr[current_swr].wr_id = reinterpret_cast<uint64_t>(*current_buffer);
+    iswr[current_swr].next = NULL;
+    iswr[current_swr].sg_list = &isge[current_sge];
+    iswr[current_swr].num_sge = 1;
+    iswr[current_swr].opcode = IBV_WR_SEND;
+    iswr[current_swr].send_flags = IBV_SEND_SIGNALED;
+    /*if (isge[current_sge].length < infiniband->max_inline_data) {
+      iswr[current_swr].send_flags = IBV_SEND_INLINE;
+      ldout(cct, 20) << __func__ << " send_inline." << dendl;
+    }*/
+
+    num++;
+    worker->perf_logger->inc(l_msgr_rdma_tx_bytes, isge[current_sge].length);
+    if (pre_wr)
+      pre_wr->next = &iswr[current_swr];
+    pre_wr = &iswr[current_swr];
+    ++current_sge;
+    ++current_swr;
+    ++current_buffer;
+  }
+
+  ibv_send_wr *bad_tx_work_request;
+  const int r = ibv_post_send(qp->get_qp(), iswr, &bad_tx_work_request);
+  if (r) {
+    // ibv_post_send() hands back the error code as its return value. errno
+    // may be stale (or 0, which would have been reported as success), so it
+    // is not consulted here.
+    const int err = r > 0 ? r : -r;
+    ldout(cct, 1) << __func__ << " failed to send data"
+                  << " (most probably should be peer not ready): "
+                  << cpp_strerror(err) << dendl;
+    worker->perf_logger->inc(l_msgr_rdma_tx_failed);
+    return -err;
+  }
+  qp->add_tx_wr(num);
+  worker->perf_logger->inc(l_msgr_rdma_tx_chunks, tx_buffers.size());
+  ldout(cct, 20) << __func__ << " qp state is " << Infiniband::qp_state_string(qp->get_state()) << dendl;
+  return 0;
+}
+
+// Post a zero-length signaled send on the queue pair. The receive path in
+// this file treats a zero-length completion as the peer's close message.
+// Failures are logged and counted; nothing is returned to the caller.
+void RDMAConnectedSocketImpl::fin() {
+  ibv_send_wr wr;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&wr, 0, sizeof(wr));
+
+  wr.wr_id = reinterpret_cast<uint64_t>(qp);
+  wr.num_sge = 0;
+  wr.opcode = IBV_WR_SEND;
+  wr.send_flags = IBV_SEND_SIGNALED;
+  ibv_send_wr* bad_tx_work_request;
+  const int r = ibv_post_send(qp->get_qp(), &wr, &bad_tx_work_request);
+  if (r) {
+    // The failure reason is the return value of ibv_post_send(), not errno.
+    ldout(cct, 1) << __func__ << " failed to send message="
+                  << " ibv_post_send failed(most probably should be peer not ready): "
+                  << cpp_strerror(r > 0 ? r : -r) << dendl;
+    worker->perf_logger->inc(l_msgr_rdma_tx_failed);
+    return ;
+  }
+  qp->add_tx_wr(1);
+}
+
+// Deactivate and free the two handshake callbacks. The read handler is only
+// torn down while a TCP descriptor exists; in that case the file event for
+// tcp_fd is also removed through the worker's event center.
+void RDMAConnectedSocketImpl::cleanup() {
+  if (tcp_fd >= 0 && read_handler) {
+    auto *reader = static_cast<C_handle_connection_read*>(read_handler);
+    reader->close();
+    worker->center.submit_to(worker->center.get_id(), [this]() {
+      worker->center.delete_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE);
+    }, false);
+    delete read_handler;
+    read_handler = nullptr;
+  }
+
+  if (!established_handler)
+    return;
+  auto *establisher = static_cast<C_handle_connection_established*>(established_handler);
+  establisher->close();
+  delete established_handler;
+  established_handler = nullptr;
+}
+
+// Signal notify_fd so that whoever polls it runs the read path.
+void RDMAConnectedSocketImpl::notify()
+{
+  // note: notify_fd is an event fd (man eventfd)
+  // write argument must be a 64bit integer
+  uint64_t i = 1;
+
+  // Do the write outside the assertion so the side effect never depends on
+  // how the assertion macro is built; only the result is asserted.
+  const ssize_t written = write(notify_fd, &i, sizeof(i));
+  ceph_assert(written == static_cast<ssize_t>(sizeof(i)));
+}
+
+void RDMAConnectedSocketImpl::shutdown()
+{
+ if (!error)
+ fin();
+ error = ECONNRESET;
+ active = false;
+}
+
+void RDMAConnectedSocketImpl::close()
+{
+ if (!error)
+ fin();
+ error = ECONNRESET;
+ active = false;
+}
+
+// Put the socket into the failed state: record ECONNRESET, set connected to
+// 1 and signal notify_fd so the error is picked up by the read path.
+void RDMAConnectedSocketImpl::fault()
+{
+  ldout(cct, 1) << __func__ << " tcp fd " << tcp_fd << dendl;
+  // Killing the queue pair here is currently disabled:
+  //   if (qp) { qp->to_dead(); qp = NULL; }
+  error = ECONNRESET;
+  connected = 1;
+  notify();
+}
+
+// Adopt an accepted TCP descriptor: the socket becomes the server side and
+// read_handler is registered for readable events on the descriptor through
+// the worker's event center.
+void RDMAConnectedSocketImpl::set_accept_fd(int sd)
+{
+  is_server = true;
+  tcp_fd = sd;
+  auto register_reader = [this]() {
+    worker->center.create_file_event(tcp_fd, EVENT_READABLE, read_handler);
+  };
+  worker->center.submit_to(worker->center.get_id(), register_reader, true);
+}
+
+// Ask for `num` receive buffers to be posted on this queue pair and add the
+// shortfall (requested minus posted) to post_backlog.
+void RDMAConnectedSocketImpl::post_chunks_to_rq(int num)
+{
+  const int posted = infiniband->post_chunks_to_rq(num, qp->get_qp());
+  post_backlog += num - posted;
+}
+
+// Retry posting the receive buffers that could not be posted earlier and
+// reduce post_backlog by the number that went through this time.
+void RDMAConnectedSocketImpl::update_post_backlog()
+{
+  // The backlog counts buffers still owed to the receive queue (see
+  // post_chunks_to_rq), so it must shrink by the posted count. The previous
+  // arithmetic set the backlog *to* the posted count: a fully successful
+  // retry left it unchanged and a failed retry cleared it.
+  if (post_backlog)
+    post_backlog -= dispatcher->post_chunks_to_rq(post_backlog, qp->get_qp());
+}
diff --git a/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc b/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc
new file mode 100644
index 00000000..432c2d2b
--- /dev/null
+++ b/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc
@@ -0,0 +1,183 @@
+#include "RDMAStack.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAIWARPConnectedSocketImpl "
+
+#define TIMEOUT_MS 3000
+#define RETRY_COUNT 7
+
+RDMAIWARPConnectedSocketImpl::RDMAIWARPConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s,
+ RDMAWorker *w, RDMACMInfo *info)
+ : RDMAConnectedSocketImpl(cct, ib, s, w), cm_con_handler(new C_handle_cm_connection(this))
+{
+ status = IDLE;
+ notify_fd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK); // NOTE(review): result is not checked here
+ if (info) { // non-null info: server side, adopt the cm id/channel supplied by the caller
+ is_server = true;
+ cm_id = info->cm_id;
+ cm_channel = info->cm_channel;
+ status = RDMA_ID_CREATED;
+ remote_qpn = info->qp_num;
+ if (alloc_resource()) { // non-zero: no queue pair could be created
+ close_notify();
+ return;
+ }
+ worker->center.submit_to(worker->center.get_id(), [this]() {
+ worker->center.create_file_event(cm_channel->fd, EVENT_READABLE, cm_con_handler);
+ status = CHANNEL_FD_CREATED;
+ }, false);
+ status = RESOURCE_ALLOCATED; // NOTE(review): status is also written by the lambda above; the final value depends on when the lambda runs - confirm intended
+ local_qpn = qp->get_local_qp_number();
+ my_msg.qpn = local_qpn;
+ } else { // client side: create our own event channel and cm id
+ is_server = false;
+ cm_channel = rdma_create_event_channel(); // NOTE(review): result is not checked
+ rdma_create_id(cm_channel, &cm_id, NULL, RDMA_PS_TCP); // NOTE(review): result is not checked
+ status = RDMA_ID_CREATED;
+ ldout(cct, 20) << __func__ << " successfully created cm id: " << cm_id << dendl;
+ }
+}
+
+// Block until close_notify() has flagged the socket as closed, then destroy
+// the cm id and event channel if they were ever created.
+RDMAIWARPConnectedSocketImpl::~RDMAIWARPConnectedSocketImpl() {
+  ldout(cct, 20) << __func__ << " destruct." << dendl;
+  std::unique_lock guard(close_mtx);
+  while (!closed) {
+    close_condition.wait(guard);
+  }
+  if (status < RDMA_ID_CREATED) {
+    return;
+  }
+  rdma_destroy_id(cm_id);
+  rdma_destroy_event_channel(cm_channel);
+}
+
+// Start an rdma_cm connection attempt: register cm_con_handler for the event
+// channel and kick off address resolution for the peer.
+// Returns 0 when resolution was started and -1 when rdma_resolve_addr() fails.
+int RDMAIWARPConnectedSocketImpl::try_connect(const entity_addr_t& peer_addr, const SocketOptions &opts) {
+  worker->center.create_file_event(cm_channel->fd, EVENT_READABLE, cm_con_handler);
+  status = CHANNEL_FD_CREATED;
+
+  auto *dst = const_cast<struct sockaddr*>(peer_addr.get_sockaddr());
+  const int rc = rdma_resolve_addr(cm_id, NULL, dst, TIMEOUT_MS);
+  if (rc == 0) {
+    return 0;
+  }
+  lderr(cct) << __func__ << " failed to resolve addr" << dendl;
+  return -1;
+}
+
+// Mark the socket reset and inactive, disconnect the cm id if the connection
+// had reached CONNECTED, and release anyone waiting on close_notify().
+void RDMAIWARPConnectedSocketImpl::close() {
+  error = ECONNRESET;
+  active = false;
+  const bool was_connected = (status >= CONNECTED);
+  if (was_connected)
+    rdma_disconnect(cm_id);
+  close_notify();
+}
+
+// Flag the socket as reset and inactive; nothing is sent to the peer here.
+void RDMAIWARPConnectedSocketImpl::shutdown()
+{
+  error = ECONNRESET;
+  active = false;
+}
+
+// Fetch one rdma_cm event from cm_channel and advance the connection state
+// machine accordingly (address resolved -> route resolved -> established, or
+// one of the error/disconnect events). The event is acknowledged before
+// returning.
+void RDMAIWARPConnectedSocketImpl::handle_cm_connection() {
+  struct rdma_cm_event *event = nullptr;
+  if (rdma_get_cm_event(cm_channel, &event)) {
+    // No event was retrieved, so there is nothing to inspect or acknowledge;
+    // continuing would dereference an unset pointer.
+    lderr(cct) << __func__ << " failed to get cm event: "
+               << cpp_strerror(errno) << dendl;
+    return;
+  }
+  ldout(cct, 20) << __func__ << " event name: " << rdma_event_str(event->event)
+                 << " (cm id: " << cm_id << ")" << dendl;
+  struct rdma_conn_param cm_params;
+  switch (event->event) {
+    case RDMA_CM_EVENT_ADDR_RESOLVED:
+      status = ADDR_RESOLVED;
+      if (rdma_resolve_route(cm_id, TIMEOUT_MS)) {
+        lderr(cct) << __func__ << " failed to resolve rdma addr" << dendl;
+        notify();
+      }
+      break;
+
+    case RDMA_CM_EVENT_ROUTE_RESOLVED:
+      status = ROUTE_RESOLVED;
+      if (alloc_resource()) {
+        lderr(cct) << __func__ << " failed to alloc resource while resolving the route" << dendl;
+        connected = -ECONNREFUSED;
+        notify();
+        break;
+      }
+      local_qpn = qp->get_local_qp_number();
+      my_msg.qpn = local_qpn;
+
+      // FIPS zeroization audit 20191115: this memset is not security related.
+      memset(&cm_params, 0, sizeof(cm_params));
+      cm_params.retry_count = RETRY_COUNT;
+      cm_params.qp_num = local_qpn;
+      if (rdma_connect(cm_id, &cm_params)) {
+        lderr(cct) << __func__ << " failed to connect remote rdma port" << dendl;
+        connected = -ECONNREFUSED;
+        notify();
+      }
+      break;
+
+    case RDMA_CM_EVENT_ESTABLISHED:
+      ldout(cct, 20) << __func__ << " qp_num=" << cm_id->qp->qp_num << dendl;
+      status = CONNECTED;
+      if (!is_server) {
+        remote_qpn = event->param.conn.qp_num;
+        activate();
+        notify();
+      }
+      break;
+
+    case RDMA_CM_EVENT_ADDR_ERROR:
+    case RDMA_CM_EVENT_ROUTE_ERROR:
+    case RDMA_CM_EVENT_CONNECT_ERROR:
+    case RDMA_CM_EVENT_UNREACHABLE:
+    case RDMA_CM_EVENT_REJECTED:
+      lderr(cct) << __func__ << " rdma connection rejected" << dendl;
+      connected = -ECONNREFUSED;
+      notify();
+      break;
+
+    case RDMA_CM_EVENT_DISCONNECTED:
+      status = DISCONNECTED;
+      close_notify();
+      if (!error) {
+        error = ECONNRESET;
+        notify();
+      }
+      break;
+
+    case RDMA_CM_EVENT_DEVICE_REMOVAL:
+      break;
+
+    default:
+      ceph_abort_msg("unhandled event");
+      break;
+  }
+  rdma_ack_cm_event(event);
+}
+
+// Mark the socket as active and connected. Unlike the base-class version in
+// this diff, no queue pair state change is performed here.
+void RDMAIWARPConnectedSocketImpl::activate()
+{
+  ldout(cct, 30) << __func__ << dendl;
+  active = true;
+  connected = 1;
+}
+
+// Create the RC queue pair bound to cm_id, pre-post receive buffers when the
+// shared receive queue is disabled, and register the queue pair with the
+// dispatcher. Returns 0 on success and -1 if no queue pair was created.
+int RDMAIWARPConnectedSocketImpl::alloc_resource() {
+  ldout(cct, 30) << __func__ << dendl;
+  qp = infiniband->create_queue_pair(cct, dispatcher->get_tx_cq(),
+                                     dispatcher->get_rx_cq(), IBV_QPT_RC, cm_id);
+  if (!qp)
+    return -1;
+
+  if (!cct->_conf->ms_async_rdma_support_srq) {
+    dispatcher->post_chunks_to_rq(infiniband->get_rx_queue_len(), qp->get_qp());
+  }
+
+  dispatcher->register_qp(qp, this);
+  dispatcher->perf_logger->inc(l_msgr_rdma_created_queue_pair);
+  dispatcher->perf_logger->inc(l_msgr_rdma_active_queue_pair);
+  return 0;
+}
+
+// Remove the cm channel file event (if it was registered) and wake up anyone
+// waiting on close_condition. Only the first call flips `closed`.
+void RDMAIWARPConnectedSocketImpl::close_notify() {
+  ldout(cct, 30) << __func__ << dendl;
+  if (status >= CHANNEL_FD_CREATED)
+    worker->center.delete_file_event(cm_channel->fd, EVENT_READABLE);
+
+  std::unique_lock guard(close_mtx);
+  if (closed)
+    return;
+  closed = true;
+  close_condition.notify_all();
+}
diff --git a/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc b/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc
new file mode 100644
index 00000000..210eaf00
--- /dev/null
+++ b/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc
@@ -0,0 +1,107 @@
+#include <poll.h>
+
+#include "msg/async/net_handler.h"
+#include "RDMAStack.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAIWARPServerSocketImpl "
+
+/// Forwards every argument unchanged to the RDMAServerSocketImpl constructor.
+RDMAIWARPServerSocketImpl::RDMAIWARPServerSocketImpl(CephContext *cct,
+                                                     Infiniband* i,
+                                                     RDMADispatcher *s,
+                                                     RDMAWorker *w,
+                                                     entity_addr_t& a,
+                                                     unsigned addr_slot)
+  : RDMAServerSocketImpl(cct, i, s, w, a, addr_slot)
+{}
+
+/**
+ * Create an RDMA CM event channel and id, bind the id to \a sa and start
+ * listening on it. On success the channel's fd is stored in
+ * server_setup_socket.
+ *
+ * \param[in] sa
+ *   Address to bind to.
+ * \param[in] opt
+ *   Socket options (not used by this implementation).
+ * \return
+ *   0 on success, a negative errno value otherwise. On failure every CM
+ *   object created here is destroyed again and server_setup_socket is -1.
+ */
+int RDMAIWARPServerSocketImpl::listen(entity_addr_t &sa,
+                                      const SocketOptions &opt)
+{
+  ldout(cct, 20) << __func__ << " bind to rdma point" << dendl;
+  int rc = 0;
+
+  // Both creation calls can fail; using their results unchecked would hand a
+  // NULL channel / id to the calls below.
+  cm_channel = rdma_create_event_channel();
+  if (!cm_channel) {
+    rc = -errno;
+    lderr(cct) << __func__ << " failed to create cm event channel: "
+               << cpp_strerror(errno) << dendl;
+    server_setup_socket = -1;
+    return rc;
+  }
+  if (rdma_create_id(cm_channel, &cm_id, NULL, RDMA_PS_TCP)) {
+    rc = -errno;
+    lderr(cct) << __func__ << " failed to create cm id: "
+               << cpp_strerror(errno) << dendl;
+    rdma_destroy_event_channel(cm_channel);
+    server_setup_socket = -1;
+    return rc;
+  }
+  ldout(cct, 20) << __func__ << " successfully created cm id: " << cm_id << dendl;
+
+  rc = rdma_bind_addr(cm_id, const_cast<struct sockaddr*>(sa.get_sockaddr()));
+  if (rc < 0) {
+    rc = -errno;
+    ldout(cct, 10) << __func__ << " unable to bind to " << sa.get_sockaddr()
+                   << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl;
+    goto err;
+  }
+  rc = rdma_listen(cm_id, 128);
+  if (rc < 0) {
+    rc = -errno;
+    ldout(cct, 10) << __func__ << " unable to listen to " << sa.get_sockaddr()
+                   << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl;
+    goto err;
+  }
+  server_setup_socket = cm_channel->fd;
+  ldout(cct, 20) << __func__ << " fd of cm_channel is " << server_setup_socket << dendl;
+  return 0;
+
+err:
+  // Shared cleanup for failures after both CM objects exist.
+  server_setup_socket = -1;
+  rdma_destroy_id(cm_id);
+  rdma_destroy_event_channel(cm_channel);
+  return rc;
+}
+
+/**
+ * Accept one pending RDMA CM connection request, if any.
+ *
+ * Polls the listening CM channel without blocking, fetches the next event,
+ * migrates the new cm id onto its own event channel, builds the connected
+ * socket around it and answers the request with rdma_accept().
+ *
+ * \param[out] sock
+ *   Receives the newly connected socket on success. Must not be NULL.
+ * \param[in] opt
+ *   Socket options (not used by this implementation).
+ * \param[out] out
+ *   Receives the peer address taken from the new cm id's route.
+ * \param[in] w
+ *   Worker that will own the connection; expected to be an RDMAWorker.
+ * \return
+ *   0 on success; -EAGAIN when nothing is pending or when any step of the
+ *   accept sequence fails.
+ */
+int RDMAIWARPServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt,
+                                      entity_addr_t *out, Worker *w)
+{
+  ldout(cct, 15) << __func__ << dendl;
+
+  ceph_assert(sock);
+  struct pollfd pfd = {
+    .fd = cm_channel->fd,
+    .events = POLLIN,
+  };
+  int ret = poll(&pfd, 1, 0);
+  ceph_assert(ret >= 0);
+  if (!ret)
+    return -EAGAIN;
+
+  struct rdma_cm_event *cm_event = nullptr;
+  if (rdma_get_cm_event(cm_channel, &cm_event)) {
+    lderr(cct) << __func__ << " failed to fetch cm event: "
+               << cpp_strerror(errno) << dendl;
+    return -EAGAIN;
+  }
+  ldout(cct, 20) << __func__ << " event name: " << rdma_event_str(cm_event->event) << dendl;
+
+  // Only a connect request carries a freshly created cm id. For any other
+  // event cm_event->id is not a new connection and must not be migrated.
+  if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
+    ldout(cct, 1) << __func__ << " ignoring event that is not a connect request" << dendl;
+    rdma_ack_cm_event(cm_event);
+    return -EAGAIN;
+  }
+
+  struct rdma_cm_id *new_cm_id = cm_event->id;
+  struct rdma_event_channel *event_channel = rdma_create_event_channel();
+  if (!event_channel) {
+    lderr(cct) << __func__ << " failed to create event channel: "
+               << cpp_strerror(errno) << dendl;
+    rdma_ack_cm_event(cm_event);
+    rdma_destroy_id(new_cm_id);
+    return -EAGAIN;
+  }
+
+  if (rdma_migrate_id(new_cm_id, event_channel)) {
+    lderr(cct) << __func__ << " failed to migrate cm id: "
+               << cpp_strerror(errno) << dendl;
+    rdma_ack_cm_event(cm_event);
+    rdma_destroy_id(new_cm_id);
+    rdma_destroy_event_channel(event_channel);
+    return -EAGAIN;
+  }
+
+  // The event's connection parameters are read here, so the event is
+  // acknowledged only after rdma_accept() below.
+  RDMACMInfo info(new_cm_id, event_channel, cm_event->param.conn.qp_num);
+  RDMAIWARPConnectedSocketImpl* server =
+    new RDMAIWARPConnectedSocketImpl(cct, infiniband, dispatcher, dynamic_cast<RDMAWorker*>(w), &info);
+
+  struct rdma_conn_param local_conn_param;
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&local_conn_param, 0, sizeof(local_conn_param));
+  local_conn_param.qp_num = server->get_local_qpn();
+
+  if (rdma_accept(new_cm_id, &local_conn_param)) {
+    // Every fetched event has to be acknowledged, on the failure path too.
+    rdma_ack_cm_event(cm_event);
+    // NOTE(review): `server` (and the cm id / channel handed to it) is not
+    // released here because the connected socket's teardown contract is not
+    // visible from this file -- confirm and release it appropriately.
+    return -EAGAIN;
+  }
+  server->activate();
+  ldout(cct, 20) << __func__ << " accepted a new QP" << dendl;
+
+  rdma_ack_cm_event(cm_event);
+
+  std::unique_ptr<RDMAConnectedSocketImpl> csi(server);
+  *sock = ConnectedSocket(std::move(csi));
+  struct sockaddr *addr = &new_cm_id->route.addr.dst_addr;
+  out->set_sockaddr(addr);
+
+  return 0;
+}
+
+/**
+ * Stop listening: destroy the CM id and its event channel.
+ *
+ * server_setup_socket is reset to -1 afterwards (as listen()'s error path
+ * does) so that a repeated call is a no-op instead of destroying the same
+ * objects twice.
+ */
+void RDMAIWARPServerSocketImpl::abort_accept()
+{
+  if (server_setup_socket >= 0) {
+    rdma_destroy_id(cm_id);
+    rdma_destroy_event_channel(cm_channel);
+    server_setup_socket = -1;
+  }
+}
diff --git a/src/msg/async/rdma/RDMAServerSocketImpl.cc b/src/msg/async/rdma/RDMAServerSocketImpl.cc
new file mode 100644
index 00000000..98402cfd
--- /dev/null
+++ b/src/msg/async/rdma/RDMAServerSocketImpl.cc
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "msg/async/net_handler.h"
+#include "RDMAStack.h"
+
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAServerSocketImpl "
+
+/**
+ * Store the collaborators for a not-yet-listening server socket.
+ * server_setup_socket starts at -1 until listen() succeeds.
+ */
+RDMAServerSocketImpl::RDMAServerSocketImpl(CephContext *cct,
+                                           Infiniband* i,
+                                           RDMADispatcher *s,
+                                           RDMAWorker *w,
+                                           entity_addr_t& a,
+                                           unsigned slot)
+  : ServerSocketImpl(a.get_type(), slot),
+    cct(cct),
+    net(cct),
+    server_setup_socket(-1),
+    infiniband(i),
+    dispatcher(s),
+    worker(w),
+    sa(a)
+{}
+
+/**
+ * Create the TCP setup socket, make it non-blocking, apply socket options,
+ * bind it to \a sa and start listening.
+ *
+ * \param[in] sa
+ *   Address to bind to.
+ * \param[in] opt
+ *   Supplies the nodelay and receive-buffer settings.
+ * \return
+ *   0 on success, a negative error code otherwise. After a failure past
+ *   socket creation the socket is closed and server_setup_socket is -1.
+ */
+int RDMAServerSocketImpl::listen(entity_addr_t &sa, const SocketOptions &opt)
+{
+  server_setup_socket = net.create_socket(sa.get_family(), true);
+  if (server_setup_socket < 0) {
+    int err = -errno;
+    lderr(cct) << __func__ << " failed to create server socket: "
+               << cpp_strerror(errno) << dendl;
+    return err;
+  }
+
+  // Common failure exit: drop the half-configured socket, pass the code on.
+  auto bail = [this](int err) {
+    ::close(server_setup_socket);
+    server_setup_socket = -1;
+    return err;
+  };
+
+  int rc = net.set_nonblock(server_setup_socket);
+  if (rc < 0)
+    return bail(rc);
+
+  rc = net.set_socket_options(server_setup_socket, opt.nodelay, opt.rcbuf_size);
+  if (rc < 0)
+    return bail(rc);
+
+  if (::bind(server_setup_socket, sa.get_sockaddr(), sa.get_sockaddr_len()) < 0) {
+    rc = -errno;
+    ldout(cct, 10) << __func__ << " unable to bind to " << sa.get_sockaddr()
+                   << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl;
+    return bail(rc);
+  }
+
+  if (::listen(server_setup_socket, cct->_conf->ms_tcp_listen_backlog) < 0) {
+    rc = -errno;
+    lderr(cct) << __func__ << " unable to listen on " << sa << ": " << cpp_strerror(errno) << dendl;
+    return bail(rc);
+  }
+
+  ldout(cct, 20) << __func__ << " bind to " << sa.get_sockaddr() << " on port " << sa.get_port() << dendl;
+  return 0;
+}
+
+/**
+ * Accept one pending TCP connection on the setup socket and wrap it in a new
+ * RDMAConnectedSocketImpl.
+ *
+ * \param[out] sock
+ *   Receives the new connected socket on success. Must not be NULL.
+ * \param[in] opt
+ *   Supplies nodelay, receive-buffer size and priority for the accepted fd.
+ * \param[out] out
+ *   Receives the peer address. Must not be NULL.
+ * \param[in] w
+ *   Worker that will own the connection; expected to be an RDMAWorker.
+ * \return
+ *   0 on success, a negative error code otherwise. The accepted fd is closed
+ *   on every failure path.
+ */
+int RDMAServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w)
+{
+  ldout(cct, 15) << __func__ << dendl;
+
+  ceph_assert(sock);
+
+  sockaddr_storage ss;
+  socklen_t slen = sizeof(ss);
+  int sd = accept_cloexec(server_setup_socket, (sockaddr*)&ss, &slen);
+  if (sd < 0) {
+    return -errno;
+  }
+
+  // Return the helper's own error code, as listen() does. Reading errno after
+  // ::close() is unreliable: it may have been overwritten, and a value of 0
+  // would make this function report success without producing a socket.
+  int r = net.set_nonblock(sd);
+  if (r < 0) {
+    ::close(sd);
+    return r;
+  }
+
+  r = net.set_socket_options(sd, opt.nodelay, opt.rcbuf_size);
+  if (r < 0) {
+    ::close(sd);
+    return r;
+  }
+
+  ceph_assert(NULL != out); //out should not be NULL in accept connection
+
+  out->set_type(addr_type);
+  out->set_sockaddr((sockaddr*)&ss);
+  net.set_priority(sd, opt.priority, out->get_family());
+
+  // Owned by a unique_ptr from the moment it is created.
+  std::unique_ptr<RDMAConnectedSocketImpl> csi(
+    new RDMAConnectedSocketImpl(cct, infiniband, dispatcher, dynamic_cast<RDMAWorker*>(w)));
+  csi->set_accept_fd(sd);
+  ldout(cct, 20) << __func__ << " accepted a new QP, tcp_fd: " << sd << dendl;
+  *sock = ConnectedSocket(std::move(csi));
+
+  return 0;
+}
+
+/**
+ * Stop listening by closing the setup socket.
+ *
+ * The descriptor is reset to -1 afterwards (as listen()'s error path does) so
+ * a repeated call cannot close the same fd number a second time.
+ */
+void RDMAServerSocketImpl::abort_accept()
+{
+  if (server_setup_socket >= 0) {
+    ::close(server_setup_socket);
+    server_setup_socket = -1;
+  }
+}
diff --git a/src/msg/async/rdma/RDMAStack.cc b/src/msg/async/rdma/RDMAStack.cc
new file mode 100644
index 00000000..f63a8e7d
--- /dev/null
+++ b/src/msg/async/rdma/RDMAStack.cc
@@ -0,0 +1,610 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <poll.h>
+#include <errno.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "include/str_list.h"
+#include "include/compat.h"
+#include "common/Cycles.h"
+#include "common/deleter.h"
+#include "common/Tub.h"
+#include "RDMAStack.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "RDMAStack "
+
+// Tear down the dispatcher: stop the polling thread, check that no queue
+// pairs or connections are still tracked, then free the async handler.
+RDMADispatcher::~RDMADispatcher()
+{
+ ldout(cct, 20) << __func__ << " destructing rdma dispatcher" << dendl;
+ // Must come first: polling_stop() joins the polling thread if it is running.
+ polling_stop();
+
+ // Everything registered via register_qp() must have been erased and reaped
+ // by now; these abort otherwise.
+ ceph_assert(qp_conns.empty());
+ ceph_assert(num_qp_conn == 0);
+ ceph_assert(dead_queue_pairs.empty());
+ ceph_assert(num_dead_queue_pair == 0);
+
+ delete async_handler;
+}
+
+/**
+ * Build the dispatcher and register its perf counters.
+ *
+ * Creates the async-event handler, builds the
+ * "AsyncMessenger::RDMADispatcher" counter set, adds it to the context's
+ * perf-counter collection and initialises the Cycles timer. The polling
+ * thread itself is not started here (see polling_start()).
+ *
+ * \param[in] c
+ *   Ceph context used for configuration, logging and perf counters.
+ * \param[in] s
+ *   Owning RDMA stack.
+ */
+RDMADispatcher::RDMADispatcher(CephContext* c, RDMAStack* s)
+  : cct(c), async_handler(new C_handle_cq_async(this)), lock("RDMADispatcher::lock"),
+    w_lock("RDMADispatcher::for worker pending list"), stack(s)
+{
+  PerfCountersBuilder plb(cct, "AsyncMessenger::RDMADispatcher", l_msgr_rdma_dispatcher_first, l_msgr_rdma_dispatcher_last);
+
+  plb.add_u64_counter(l_msgr_rdma_polling, "polling", "Whether dispatcher thread is polling");
+  plb.add_u64_counter(l_msgr_rdma_inflight_tx_chunks, "inflight_tx_chunks", "The number of inflight tx chunks");
+  plb.add_u64_counter(l_msgr_rdma_rx_bufs_in_use, "rx_bufs_in_use", "The number of rx buffers that are holding data and being processed");
+  plb.add_u64_counter(l_msgr_rdma_rx_bufs_total, "rx_bufs_total", "The total number of rx buffers");
+
+  plb.add_u64_counter(l_msgr_rdma_tx_total_wc, "tx_total_wc", "The number of tx work comletions");
+  plb.add_u64_counter(l_msgr_rdma_tx_total_wc_errors, "tx_total_wc_errors", "The number of tx errors");
+  plb.add_u64_counter(l_msgr_rdma_tx_wc_retry_errors, "tx_retry_errors", "The number of tx retry errors");
+  plb.add_u64_counter(l_msgr_rdma_tx_wc_wr_flush_errors, "tx_wr_flush_errors", "The number of tx work request flush errors");
+
+  plb.add_u64_counter(l_msgr_rdma_rx_total_wc, "rx_total_wc", "The number of total rx work completion");
+  plb.add_u64_counter(l_msgr_rdma_rx_total_wc_errors, "rx_total_wc_errors", "The number of total rx error work completion");
+  plb.add_u64_counter(l_msgr_rdma_rx_fin, "rx_fin", "The number of rx finish work request");
+
+  plb.add_u64_counter(l_msgr_rdma_total_async_events, "total_async_events", "The number of async events");
+  plb.add_u64_counter(l_msgr_rdma_async_last_wqe_events, "async_last_wqe_events", "The number of last wqe events");
+
+  plb.add_u64_counter(l_msgr_rdma_handshake_errors, "handshake_errors", "The number of handshake errors");
+
+
+  // The two descriptions below used to be attached to the wrong counter.
+  plb.add_u64_counter(l_msgr_rdma_created_queue_pair, "created_queue_pair", "Created queue pair number");
+  plb.add_u64_counter(l_msgr_rdma_active_queue_pair, "active_queue_pair", "Active queue pair number");
+
+  perf_logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perf_logger);
+  Cycles::init();
+}
+
+/**
+ * Start the dispatcher thread if it is not running yet.
+ *
+ * Creates the tx/rx completion channels and completion queues (asserting on
+ * each), then launches polling() in a thread named "rdma-polling".
+ */
+void RDMADispatcher::polling_start()
+{
+  // listen/connect may arrive here from different worker threads
+  Mutex::Locker l(lock);
+
+  if (t.joinable()) {
+    // dispatcher thread already running
+    return;
+  }
+
+  auto &ib = get_stack()->get_infiniband();
+  ib.get_memory_manager()->set_rx_stat_logger(perf_logger);
+
+  tx_cc = ib.create_comp_channel(cct);
+  ceph_assert(tx_cc);
+  rx_cc = ib.create_comp_channel(cct);
+  ceph_assert(rx_cc);
+  tx_cq = ib.create_comp_queue(cct, tx_cc);
+  ceph_assert(tx_cq);
+  rx_cq = ib.create_comp_queue(cct, rx_cc);
+  ceph_assert(rx_cq);
+
+  t = std::thread(&RDMADispatcher::polling, this);
+  ceph_pthread_setname(t.native_handle(), "rdma-polling");
+}
+
+/**
+ * Ask the polling thread to finish and, if it was started, join it and
+ * release the completion queues and channels.
+ */
+void RDMADispatcher::polling_stop()
+{
+  {
+    Mutex::Locker l(lock);
+    done = true;
+  }
+
+  if (t.joinable()) {
+    t.join();
+
+    // Acknowledge outstanding channel events before the objects go away.
+    tx_cc->ack_events();
+    rx_cc->ack_events();
+
+    delete tx_cq;
+    delete rx_cq;
+    delete tx_cc;
+    delete rx_cc;
+  }
+}
+
+/**
+ * Drain the device's asynchronous verbs events.
+ *
+ * Loops until ibv_get_async_event() fails (an error is logged unless errno is
+ * EAGAIN). A "last WQE reached" event faults the matching connection and,
+ * unless RDMA CM is in use, moves its queue pair to the dead list; any other
+ * event is only logged. Every fetched event is acknowledged.
+ */
+void RDMADispatcher::handle_async_event()
+{
+  ldout(cct, 30) << __func__ << dendl;
+  for (;;) {
+    ibv_async_event async_event;
+    const int rc = ibv_get_async_event(get_stack()->get_infiniband().get_device()->ctxt, &async_event);
+    if (rc) {
+      if (errno != EAGAIN)
+        lderr(cct) << __func__ << " ibv_get_async_event failed. (errno=" << errno
+                   << " " << cpp_strerror(errno) << ")" << dendl;
+      return;
+    }
+    perf_logger->inc(l_msgr_rdma_total_async_events);
+
+    // FIXME: Currently we must ensure no other factor make QP in ERROR state,
+    // otherwise this qp can't be deleted in current cleanup flow.
+    if (async_event.event_type != IBV_EVENT_QP_LAST_WQE_REACHED) {
+      ldout(cct, 1) << __func__ << " ibv_get_async_event: dev=" << get_stack()->get_infiniband().get_device()->ctxt
+                    << " evt: " << ibv_event_type_str(async_event.event_type)
+                    << dendl;
+    } else {
+      perf_logger->inc(l_msgr_rdma_async_last_wqe_events);
+      uint64_t qpn = async_event.element.qp->qp_num;
+      ldout(cct, 10) << __func__ << " event associated qp=" << async_event.element.qp
+                     << " evt: " << ibv_event_type_str(async_event.event_type) << dendl;
+
+      Mutex::Locker l(lock);
+      RDMAConnectedSocketImpl *conn = get_conn_lockless(qpn);
+      if (conn) {
+        ldout(cct, 1) << __func__ << " it's not forwardly stopped by us, reenable=" << conn << dendl;
+        conn->fault();
+        if (!cct->_conf->ms_async_rdma_cm)
+          erase_qpn_lockless(qpn);
+      } else {
+        ldout(cct, 1) << __func__ << " missing qp_num=" << qpn << " discard event" << dendl;
+      }
+    }
+    ibv_ack_async_event(&async_event);
+  }
+}
+
+/// Under the dispatcher lock, give \a chunk back to the Infiniband pool and
+/// decrement the "rx buffers in use" counter.
+void RDMADispatcher::post_chunk_to_pool(Chunk* chunk)
+{
+  Mutex::Locker l(lock);
+
+  auto &ib = get_stack()->get_infiniband();
+  ib.post_chunk_to_pool(chunk);
+  perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+}
+
+/// Under the dispatcher lock, forward to Infiniband::post_chunks_to_rq() for
+/// \a num chunks on \a qp and return its result.
+int RDMADispatcher::post_chunks_to_rq(int num, ibv_qp *qp)
+{
+  Mutex::Locker l(lock);
+
+  auto &ib = get_stack()->get_infiniband();
+  return ib.post_chunks_to_rq(num, qp);
+}
+
+/**
+ * Main loop of the dispatcher thread (started by polling_start()).
+ *
+ * Each iteration:
+ *  - polls the TX completion queue and hands completions to handle_tx_event();
+ *  - polls the RX completion queue; successful receives are grouped per
+ *    connection and delivered through pass_wc(), failed ones return their
+ *    chunk to the pool and may fault the connection;
+ *  - when both queues were empty: reaps dead queue pairs that have no
+ *    outstanding TX work requests, leaves the loop once `done` is set and
+ *    nothing is tracked any more, and, after being idle for longer than
+ *    ms_async_rdma_polling_us, handles async events, re-arms CQ notification
+ *    and blocks in poll() on the two completion-channel fds.
+ */
+void RDMADispatcher::polling()
+{
+  // Compile-time constant, so wc[] is an ordinary fixed-size array instead of
+  // a variable-length array (which standard C++ does not have).
+  static constexpr int MAX_COMPLETIONS = 32;
+  ibv_wc wc[MAX_COMPLETIONS];
+
+  std::map<RDMAConnectedSocketImpl*, std::vector<ibv_wc> > polled;
+  ldout(cct, 20) << __func__ << " going to poll tx cq: " << tx_cq << " rx cq: " << rx_cq << dendl;
+  RDMAConnectedSocketImpl *conn = nullptr;
+  uint64_t last_inactive = Cycles::rdtsc();
+  bool rearmed = false;
+  int r = 0;
+
+  while (true) {
+    int tx_ret = tx_cq->poll_cq(MAX_COMPLETIONS, wc);
+    if (tx_ret > 0) {
+      ldout(cct, 20) << __func__ << " tx completion queue got " << tx_ret
+                     << " responses."<< dendl;
+      handle_tx_event(wc, tx_ret);
+    }
+
+    int rx_ret = rx_cq->poll_cq(MAX_COMPLETIONS, wc);
+    if (rx_ret > 0) {
+      ldout(cct, 20) << __func__ << " rx completion queue got " << rx_ret
+                     << " responses."<< dendl;
+      perf_logger->inc(l_msgr_rdma_rx_total_wc, rx_ret);
+      perf_logger->inc(l_msgr_rdma_rx_bufs_in_use, rx_ret);
+
+      Mutex::Locker l(lock);//make sure connected socket alive when pass wc
+
+      for (int i = 0; i < rx_ret; ++i) {
+        ibv_wc* response = &wc[i];
+        // wr_id of a receive completion carries the Chunk pointer.
+        Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id);
+
+        if (response->status == IBV_WC_SUCCESS) {
+          ceph_assert(wc[i].opcode == IBV_WC_RECV);
+          conn = get_conn_lockless(response->qp_num);
+          if (!conn) {
+            ldout(cct, 1) << __func__ << " csi with qpn " << response->qp_num << " may be dead. chunk " << chunk << " will be back ? " << r << dendl;
+            get_stack()->get_infiniband().post_chunk_to_pool(chunk);
+            perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+          } else {
+            conn->post_chunks_to_rq(1);
+            polled[conn].push_back(*response);
+          }
+        } else {
+          perf_logger->inc(l_msgr_rdma_rx_total_wc_errors);
+          ldout(cct, 1) << __func__ << " work request returned error for buffer(" << chunk
+                        << ") status(" << response->status << ":"
+                        << get_stack()->get_infiniband().wc_status_to_string(response->status) << ")" << dendl;
+          // Flush errors do not fault the connection here.
+          if (response->status != IBV_WC_WR_FLUSH_ERR) {
+            conn = get_conn_lockless(response->qp_num);
+            if (conn && conn->is_connected())
+              conn->fault();
+          }
+          get_stack()->get_infiniband().post_chunk_to_pool(chunk);
+          perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+        }
+      }
+      for (auto &&i : polled)
+        i.first->pass_wc(std::move(i.second));
+      polled.clear();
+    }
+
+    if (!tx_ret && !rx_ret) {
+      // NOTE: Has TX just transitioned to idle? We should do it when idle!
+      // It's now safe to delete queue pairs (see comment by declaration
+      // for dead_queue_pairs).
+      // Additionally, don't delete qp while outstanding_buffers isn't empty,
+      // because we need to check qp's state before sending
+      perf_logger->set(l_msgr_rdma_inflight_tx_chunks, inflight);
+      if (num_dead_queue_pair) {
+        Mutex::Locker l(lock); // FIXME reuse dead qp because creating one qp costs 1 ms
+        auto it = dead_queue_pairs.begin();
+        while (it != dead_queue_pairs.end()) {
+          auto i = *it;
+          // Bypass QPs that do not collect all Tx completions yet.
+          if (i->get_tx_wr()) {
+            ldout(cct, 20) << __func__ << " bypass qp=" << i << " tx_wr=" << i->get_tx_wr() << dendl;
+            ++it;
+          } else {
+            ldout(cct, 10) << __func__ << " finally delete qp=" << i << dendl;
+            delete i;
+            it = dead_queue_pairs.erase(it);
+            perf_logger->dec(l_msgr_rdma_active_queue_pair);
+            --num_dead_queue_pair;
+          }
+        }
+      }
+      // Exit only once stop was requested and nothing is tracked any more.
+      if (!num_qp_conn && done && dead_queue_pairs.empty())
+        break;
+
+      uint64_t now = Cycles::rdtsc();
+      if (Cycles::to_microseconds(now - last_inactive) > cct->_conf->ms_async_rdma_polling_us) {
+        handle_async_event();
+        if (!rearmed) {
+          // Clean up cq events after rearm notify ensure no new incoming event
+          // arrived between polling and rearm
+          tx_cq->rearm_notify();
+          rx_cq->rearm_notify();
+          rearmed = true;
+          continue;
+        }
+
+        struct pollfd channel_poll[2];
+        channel_poll[0].fd = tx_cc->get_fd();
+        channel_poll[0].events = POLLIN | POLLERR | POLLNVAL | POLLHUP;
+        channel_poll[0].revents = 0;
+        channel_poll[1].fd = rx_cc->get_fd();
+        channel_poll[1].events = POLLIN | POLLERR | POLLNVAL | POLLHUP;
+        channel_poll[1].revents = 0;
+        r = 0;
+        perf_logger->set(l_msgr_rdma_polling, 0);
+        // Sleep in 100 ms slices so a stop request is noticed.
+        while (!done && r == 0) {
+          r = TEMP_FAILURE_RETRY(poll(channel_poll, 2, 100));
+          if (r < 0) {
+            r = -errno;
+            lderr(cct) << __func__ << " poll failed " << r << dendl;
+            ceph_abort();
+          }
+        }
+        if (r > 0 && tx_cc->get_cq_event())
+          ldout(cct, 20) << __func__ << " got tx cq event." << dendl;
+        if (r > 0 && rx_cc->get_cq_event())
+          ldout(cct, 20) << __func__ << " got rx cq event." << dendl;
+        last_inactive = Cycles::rdtsc();
+        perf_logger->set(l_msgr_rdma_polling, 1);
+        rearmed = false;
+      }
+    }
+  }
+}
+
+/**
+ * Wake at most one worker that is waiting for TX buffers.
+ *
+ * The first pending worker (if any) is removed from the list under w_lock and
+ * notified after the lock has been released.
+ */
+void RDMADispatcher::notify_pending_workers()
+{
+  if (!num_pending_workers)
+    return;
+
+  RDMAWorker *next = nullptr;
+  {
+    Mutex::Locker l(w_lock);
+    if (!pending_workers.empty()) {
+      next = pending_workers.front();
+      pending_workers.pop_front();
+      --num_pending_workers;
+    }
+  }
+
+  if (next)
+    next->notify_worker();
+}
+
+/// Record the (queue pair, connection) pair under the QP's local number.
+/// Asserts that the number is not registered yet.
+void RDMADispatcher::register_qp(QueuePair *qp, RDMAConnectedSocketImpl* csi)
+{
+  Mutex::Locker l(lock);
+
+  const auto qpn = qp->get_local_qp_number();
+  ceph_assert(!qp_conns.count(qpn));
+  qp_conns[qpn] = std::make_pair(qp, csi);
+  ++num_qp_conn;
+}
+
+/**
+ * Look up the connection registered for queue pair number \a qp.
+ * Takes no lock itself.
+ *
+ * \return
+ *   The connection, or nullptr when the number is unknown or its queue pair
+ *   reports is_dead().
+ */
+RDMAConnectedSocketImpl* RDMADispatcher::get_conn_lockless(uint32_t qp)
+{
+  const auto entry = qp_conns.find(qp);
+  if (entry == qp_conns.end() || entry->second.first->is_dead())
+    return nullptr;
+
+  return entry->second.second;
+}
+
+/**
+ * Find the queue pair with local number \a qp, taking the dispatcher lock.
+ *
+ * Registered queue pairs are searched first, then the dead list.
+ *
+ * \return
+ *   The queue pair, or nullptr when neither container holds it.
+ */
+Infiniband::QueuePair* RDMADispatcher::get_qp(uint32_t qp)
+{
+  Mutex::Locker l(lock);
+
+  // Registered (live) queue pairs first.
+  const auto live = qp_conns.find(qp);
+  if (live != qp_conns.end()) {
+    return live->second.first;
+  }
+
+  // Otherwise it may already have been moved to the dead list.
+  for (auto &candidate : dead_queue_pairs) {
+    if (candidate->get_local_qp_number() == qp) {
+      return candidate;
+    }
+  }
+
+  return nullptr;
+}
+
+/**
+ * Move the queue pair registered under \a qpn from qp_conns to the dead list
+ * and update both counters. Takes no lock itself; does nothing when \a qpn
+ * is not registered.
+ */
+void RDMADispatcher::erase_qpn_lockless(uint32_t qpn)
+{
+  const auto entry = qp_conns.find(qpn);
+  if (entry != qp_conns.end()) {
+    ++num_dead_queue_pair;
+    dead_queue_pairs.push_back(entry->second.first);
+    qp_conns.erase(entry);
+    --num_qp_conn;
+  }
+}
+
+/// Same as erase_qpn_lockless(), but acquires the dispatcher lock first.
+void RDMADispatcher::erase_qpn(uint32_t qpn)
+{
+  Mutex::Locker guard(lock);
+  erase_qpn_lockless(qpn);
+}
+
+// Process `n` TX work completions from `cqe`.
+//
+// For each completion: decrement the owning queue pair's outstanding TX
+// work-request count, update the error counters and fault the connection on
+// an unexpected error status, then sort out what wr_id refers to. Chunks that
+// belong to the TX buffer pool are collected and handed to post_tx_buffer()
+// in one batch at the end.
+void RDMADispatcher::handle_tx_event(ibv_wc *cqe, int n)
+{
+ std::vector<Chunk*> tx_chunks;
+
+ for (int i = 0; i < n; ++i) {
+ ibv_wc* response = &cqe[i];
+ // wr_id is interpreted as a Chunk pointer here; see the 'fin' case below.
+ Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id);
+ ldout(cct, 25) << __func__ << " QP: " << response->qp_num
+ << " len: " << response->byte_len << " , addr:" << chunk
+ << " " << get_stack()->get_infiniband().wc_status_to_string(response->status) << dendl;
+
+ // get_qp() also searches the dead list, so completions of a QP that was
+ // already erased are still accounted for.
+ QueuePair *qp = get_qp(response->qp_num);
+ if (qp)
+ qp->dec_tx_wr(1);
+
+ if (response->status != IBV_WC_SUCCESS) {
+ perf_logger->inc(l_msgr_rdma_tx_total_wc_errors);
+ // Retry-exceeded and flush errors are only counted and logged; any other
+ // error status faults the connection if it is still connected.
+ if (response->status == IBV_WC_RETRY_EXC_ERR) {
+ ldout(cct, 1) << __func__ << " connection between server and client not working. Disconnect this now" << dendl;
+ perf_logger->inc(l_msgr_rdma_tx_wc_retry_errors);
+ } else if (response->status == IBV_WC_WR_FLUSH_ERR) {
+ ldout(cct, 1) << __func__ << " Work Request Flushed Error: this connection's qp="
+ << response->qp_num << " should be down while this WR=" << response->wr_id
+ << " still in flight." << dendl;
+ perf_logger->inc(l_msgr_rdma_tx_wc_wr_flush_errors);
+ } else {
+ ldout(cct, 1) << __func__ << " send work request returned error for buffer("
+ << response->wr_id << ") status(" << response->status << "): "
+ << get_stack()->get_infiniband().wc_status_to_string(response->status) << dendl;
+ Mutex::Locker l(lock);//make sure connected socket alive when pass wc
+ RDMAConnectedSocketImpl *conn = get_conn_lockless(response->qp_num);
+
+ if (conn && conn->is_connected()) {
+ ldout(cct, 25) << __func__ << " qp state is : " << conn->get_qp_state() << dendl;
+ conn->fault();
+ } else {
+ // Also reached when the connection exists but is not connected.
+ ldout(cct, 1) << __func__ << " missing qp_num=" << response->qp_num << " discard event" << dendl;
+ }
+ }
+ }
+
+ //TX completion may come either from regular send message or from 'fin' message.
+ //In the case of 'fin' wr_id points to the QueuePair.
+ // NOTE(review): chunk->buffer is read before it is known whether wr_id is
+ // really a Chunk -- confirm this is safe for the 'fin' case.
+ if (get_stack()->get_infiniband().get_memory_manager()->is_tx_buffer(chunk->buffer)) {
+ tx_chunks.push_back(chunk);
+ } else if (reinterpret_cast<QueuePair*>(response->wr_id)->get_local_qp_number() == response->qp_num ) {
+ ldout(cct, 1) << __func__ << " sending of the disconnect msg completed" << dendl;
+ } else {
+ // Neither a TX chunk nor a matching queue pair: treated as fatal.
+ ldout(cct, 1) << __func__ << " not tx buffer, chunk " << chunk << dendl;
+ ceph_abort();
+ }
+ }
+
+ perf_logger->inc(l_msgr_rdma_tx_total_wc, n);
+ post_tx_buffer(tx_chunks);
+}
+
+/**
+ * Give completed TX chunks back to the memory manager.
+ *
+ * Lowers the in-flight counter by the number of chunks, returns them through
+ * the memory manager's return_tx() and then notifies pending workers.
+ * Nothing happens when \a chunks is empty.
+ *
+ * \param[in] chunks
+ *   The Chunks to release.
+ */
+void RDMADispatcher::post_tx_buffer(std::vector<Chunk*> &chunks)
+{
+  if (!chunks.empty()) {
+    inflight -= chunks.size();
+
+    auto *mem_mgr = get_stack()->get_infiniband().get_memory_manager();
+    mem_mgr->return_tx(chunks);
+
+    ldout(cct, 30) << __func__ << " release " << chunks.size()
+                   << " chunks, inflight " << inflight << dendl;
+    notify_pending_workers();
+  }
+}
+
+
+/**
+ * Build a worker and register its perf counters.
+ *
+ * The counter set is named "AsyncMessenger::RDMAWorker-<id>" and added to
+ * the context's perf-counter collection. The stack pointer starts out null.
+ *
+ * \param[in] c
+ *   Ceph context.
+ * \param[in] i
+ *   Worker index, passed on to the Worker base class.
+ */
+RDMAWorker::RDMAWorker(CephContext *c, unsigned i)
+  : Worker(c, i), stack(nullptr),
+    tx_handler(new C_handle_cq_tx(this)), lock("RDMAWorker::lock")
+{
+  // initialize perf_logger
+  char name[128];
+  // Bounded formatting: cannot write past the end of `name`.
+  snprintf(name, sizeof(name), "AsyncMessenger::RDMAWorker-%u", id);
+  PerfCountersBuilder plb(cct, name, l_msgr_rdma_first, l_msgr_rdma_last);
+
+  plb.add_u64_counter(l_msgr_rdma_tx_no_mem, "tx_no_mem", "The count of no tx buffer");
+  plb.add_u64_counter(l_msgr_rdma_tx_parital_mem, "tx_parital_mem", "The count of parital tx buffer");
+  plb.add_u64_counter(l_msgr_rdma_tx_failed, "tx_failed_post", "The number of tx failed posted");
+
+  plb.add_u64_counter(l_msgr_rdma_tx_chunks, "tx_chunks", "The number of tx chunks transmitted");
+  plb.add_u64_counter(l_msgr_rdma_tx_bytes, "tx_bytes", "The bytes of tx chunks transmitted", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_msgr_rdma_rx_chunks, "rx_chunks", "The number of rx chunks transmitted");
+  plb.add_u64_counter(l_msgr_rdma_rx_bytes, "rx_bytes", "The bytes of rx chunks transmitted", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_msgr_rdma_pending_sent_conns, "pending_sent_conns", "The count of pending sent conns");
+
+  perf_logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perf_logger);
+}
+
+// Release the TX completion handler allocated in the constructor.
+RDMAWorker::~RDMAWorker()
+{
+ delete tx_handler;
+}
+
+/// Cache a pointer to the stack's dispatcher the first time this is called.
+void RDMAWorker::initialize()
+{
+  if (dispatcher)
+    return;
+
+  dispatcher = &stack->get_dispatcher();
+}
+
+/**
+ * Create a listening server socket on \a sa.
+ *
+ * Initialises Infiniband and starts the dispatcher thread first, then builds
+ * the iWARP or the default server-socket implementation depending on
+ * ms_async_rdma_type and calls its listen().
+ *
+ * \param[in] sa
+ *   Address to listen on.
+ * \param[in] addr_slot
+ *   Address slot, forwarded to the server socket.
+ * \param[in] opt
+ *   Socket options, forwarded to the implementation's listen().
+ * \param[out] sock
+ *   Receives the server socket on success.
+ * \return
+ *   0 on success, otherwise the negative value returned by listen().
+ */
+int RDMAWorker::listen(entity_addr_t &sa, unsigned addr_slot,
+                       const SocketOptions &opt,ServerSocket *sock)
+{
+  get_stack()->get_infiniband().init();
+  dispatcher->polling_start();
+
+  std::unique_ptr<RDMAServerSocketImpl> impl;
+  if (cct->_conf->ms_async_rdma_type == "iwarp") {
+    impl.reset(new RDMAIWARPServerSocketImpl(cct,
+                                             &get_stack()->get_infiniband(),
+                                             &get_stack()->get_dispatcher(),
+                                             this, sa, addr_slot));
+  } else {
+    impl.reset(new RDMAServerSocketImpl(cct,
+                                        &get_stack()->get_infiniband(),
+                                        &get_stack()->get_dispatcher(),
+                                        this, sa, addr_slot));
+  }
+
+  const int r = impl->listen(sa, opt);
+  if (r < 0) {
+    // impl is destroyed on the way out
+    return r;
+  }
+
+  *sock = ServerSocket(std::unique_ptr<ServerSocketImpl>(impl.release()));
+  return 0;
+}
+
+/**
+ * Start an outgoing connection to \a addr.
+ *
+ * Initialises Infiniband and starts the dispatcher thread first, then builds
+ * the iWARP or the default connected-socket implementation depending on
+ * ms_async_rdma_type and calls its try_connect().
+ *
+ * \param[in] addr
+ *   Peer address.
+ * \param[in] opts
+ *   Socket options, forwarded to try_connect().
+ * \param[out] socket
+ *   Receives the connected socket on success.
+ * \return
+ *   0 on success, otherwise the negative value returned by try_connect().
+ */
+int RDMAWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket)
+{
+  get_stack()->get_infiniband().init();
+  dispatcher->polling_start();
+
+  std::unique_ptr<RDMAConnectedSocketImpl> csi;
+  if (cct->_conf->ms_async_rdma_type == "iwarp") {
+    csi.reset(new RDMAIWARPConnectedSocketImpl(cct, &get_stack()->get_infiniband(), &get_stack()->get_dispatcher(), this));
+  } else {
+    csi.reset(new RDMAConnectedSocketImpl(cct, &get_stack()->get_infiniband(), &get_stack()->get_dispatcher(), this));
+  }
+
+  const int r = csi->try_connect(addr, opts);
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " try connecting failed." << dendl;
+    // csi is destroyed on the way out
+    return r;
+  }
+
+  *socket = ConnectedSocket(std::move(csi));
+  return 0;
+}
+
+/**
+ * Reserve registered TX chunks for \a bytes of payload.
+ *
+ * Must run on this worker's event-center thread (asserted). When fewer bytes
+ * than requested could be reserved and \a o is given, the connection is put
+ * on the pending list (once) and this worker is registered with the
+ * dispatcher as pending.
+ *
+ * \param[in] o
+ *   Connection asking for buffers; may be null.
+ * \param[out] c
+ *   Receives the reserved chunks.
+ * \param[in] bytes
+ *   Number of payload bytes wanted.
+ * \return
+ *   Number of chunks obtained from get_tx_buffers().
+ */
+int RDMAWorker::get_reged_mem(RDMAConnectedSocketImpl *o, std::vector<Chunk*> &c, size_t bytes)
+{
+  ceph_assert(center.in_thread());
+
+  int r = get_stack()->get_infiniband().get_tx_buffers(c, bytes);
+  ceph_assert(r >= 0);
+
+  size_t got = get_stack()->get_infiniband().get_memory_manager()->get_tx_buffer_size() * r;
+  ldout(cct, 30) << __func__ << " need " << bytes << " bytes, reserve " << got << " registered bytes, inflight " << dispatcher->inflight << dendl;
+  stack->get_dispatcher().inflight += r;
+
+  const bool came_up_short = got < bytes;
+  if (came_up_short && o) {
+    if (!o->is_pending()) {
+      pending_sent_conns.push_back(o);
+      perf_logger->inc(l_msgr_rdma_pending_sent_conns, 1);
+      o->set_pending(1);
+    }
+    dispatcher->make_pending_worker(this);
+  }
+  return r;
+}
+
+
+/**
+ * Retry sending for every connection on the pending list.
+ *
+ * Connections are taken from the front one by one and submitted. On -EAGAIN
+ * the connection is queued again at the back, this worker is re-registered
+ * as pending and processing stops. Any other negative result faults the
+ * connection. Once the list is drained, pending workers are notified.
+ */
+void RDMAWorker::handle_pending_message()
+{
+  ldout(cct, 20) << __func__ << " pending conns " << pending_sent_conns.size() << dendl;
+
+  while (!pending_sent_conns.empty()) {
+    RDMAConnectedSocketImpl *csi = pending_sent_conns.front();
+    pending_sent_conns.pop_front();
+
+    ssize_t r = csi->submit(false);
+    ldout(cct, 20) << __func__ << " sent pending bl socket=" << csi << " r=" << r << dendl;
+
+    if (r == -EAGAIN) {
+      // still no buffers: park the connection again and wait for a wake-up
+      pending_sent_conns.push_back(csi);
+      dispatcher->make_pending_worker(this);
+      return ;
+    }
+    if (r < 0) {
+      csi->fault();
+    }
+
+    csi->set_pending(0);
+    perf_logger->dec(l_msgr_rdma_pending_sent_conns, 1);
+  }
+
+  dispatcher->notify_pending_workers();
+}
+
+/**
+ * Build the RDMA network stack: the Infiniband object, the dispatcher, and a
+ * back-pointer from every worker to this stack.
+ */
+RDMAStack::RDMAStack(CephContext *cct, const string &t)
+  : NetworkStack(cct, t), ib(cct), dispatcher(cct, this)
+{
+  ldout(cct, 20) << __func__ << " constructing RDMAStack..." << dendl;
+
+  const unsigned worker_count = get_num_worker();
+  for (unsigned idx = 0; idx != worker_count; ++idx) {
+    auto *rdma_worker = dynamic_cast<RDMAWorker*>(get_worker(idx));
+    rdma_worker->set_stack(this);
+  }
+
+  ldout(cct, 20) << " creating RDMAStack:" << this << " with dispatcher:" << &dispatcher << dendl;
+}
+
+/// Remove RDMAV_HUGEPAGES_SAFE from the environment when huge-page support
+/// is enabled in the configuration.
+RDMAStack::~RDMAStack()
+{
+  if (!cct->_conf->ms_async_rdma_enable_hugepage)
+    return;
+
+  unsetenv("RDMAV_HUGEPAGES_SAFE"); //remove env variable on destruction
+}
+
+/**
+ * Start the thread for worker \a i, running \a func.
+ *
+ * The thread table only ever grows. Shrinking it (which an unconditional
+ * resize(i+1) does whenever \a i is below the current size) would destroy
+ * std::thread objects that are still joinable, and that calls
+ * std::terminate().
+ *
+ * \param[in] i
+ *   Worker index; also the slot used in the thread table.
+ * \param[in] func
+ *   Thread body; moved into the new thread.
+ */
+void RDMAStack::spawn_worker(unsigned i, std::function<void ()> &&func)
+{
+  if (threads.size() <= i)
+    threads.resize(i+1);
+  threads[i] = std::thread(std::move(func));
+}
+
+/// Join the thread of worker \a i. Asserts that the slot exists and that its
+/// thread is joinable.
+void RDMAStack::join_worker(unsigned i)
+{
+  ceph_assert(i < threads.size() && threads[i].joinable());
+
+  auto &worker_thread = threads[i];
+  worker_thread.join();
+}
diff --git a/src/msg/async/rdma/RDMAStack.h b/src/msg/async/rdma/RDMAStack.h
new file mode 100644
index 00000000..e4d34ee0
--- /dev/null
+++ b/src/msg/async/rdma/RDMAStack.h
@@ -0,0 +1,348 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_RDMASTACK_H
+#define CEPH_MSG_RDMASTACK_H
+
+ #include <sys/eventfd.h>
+
+ #include <list>
+ #include <thread>
+ #include <utility>
+ #include <vector>
+
+ #include "common/ceph_context.h"
+ #include "common/debug.h"
+ #include "common/errno.h"
+ #include "msg/async/Stack.h"
+ #include "Infiniband.h"
+
+class RDMAConnectedSocketImpl;
+class RDMAServerSocketImpl;
+class RDMAStack;
+class RDMAWorker;
+
+ /**
+  * Dispatcher owned by an RDMAStack and used by its RDMA workers.
+  *
+  * It holds the tx/rx completion queues and channels, the table mapping
+  * queue pair numbers to their connection, the queue pairs waiting to be
+  * destroyed, and the list of workers waiting to be notified.
+  */
+ class RDMADispatcher {
+   typedef Infiniband::MemoryManager::Chunk Chunk;
+   typedef Infiniband::QueuePair QueuePair;
+
+   std::thread t;  // NOTE(review): presumably the thread running polling() -- confirm in the .cc
+   CephContext *cct;
+   Infiniband::CompletionQueue* tx_cq = nullptr;
+   Infiniband::CompletionQueue* rx_cq = nullptr;
+   Infiniband::CompletionChannel *tx_cc = nullptr, *rx_cc = nullptr;
+   EventCallbackRef async_handler;
+   bool done = false;
+   std::atomic<uint64_t> num_dead_queue_pair = {0};
+   std::atomic<uint64_t> num_qp_conn = {0};
+   Mutex lock; // protect `qp_conns`, `dead_queue_pairs`
+
+   /**
+    * qp_num -> (QueuePair, connection).
+    *
+    * The main usage of `qp_conns` is looking up a connection by qp_num,
+    * so the lifecycle of an element in `qp_conns` is the lifecycle of the qp.
+    *
+    * Moving a queue pair into the dead state:
+    * 1. Connection calls mark_down
+    * 2. Move the Queue Pair into the Error state (QueuePair::to_dead)
+    * 3. Wait for the affiliated event IBV_EVENT_QP_LAST_WQE_REACHED (handle_async_event)
+    * 4. Wait for the CQ to be empty (handle_tx_event)
+    * 5. Destroy the QP by calling ibv_destroy_qp() (handle_tx_event)
+    */
+   ceph::unordered_map<uint32_t, std::pair<QueuePair*, RDMAConnectedSocketImpl*> > qp_conns;
+
+   /// if a queue pair is closed when transmit buffers are active
+   /// on it, the transmit buffers never get returned via tx_cq. To
+   /// work around this problem, don't delete queue pairs immediately. Instead,
+   /// save them in this vector and delete them at a safe time, when there are
+   /// no outstanding transmit buffers to be lost.
+   std::vector<QueuePair*> dead_queue_pairs;
+
+   std::atomic<uint64_t> num_pending_workers = {0};
+   Mutex w_lock; // protect pending workers
+   // fixme: lockfree
+   std::list<RDMAWorker*> pending_workers;
+   RDMAStack* stack;
+
+   /// Event callback that forwards to RDMADispatcher::handle_async_event().
+   class C_handle_cq_async : public EventCallback {
+     RDMADispatcher *dispatcher;
+    public:
+     explicit C_handle_cq_async(RDMADispatcher *w): dispatcher(w) {}
+     void do_request(uint64_t fd) override {
+       dispatcher->handle_async_event();
+     }
+   };
+
+  public:
+   PerfCounters *perf_logger;
+
+   explicit RDMADispatcher(CephContext* c, RDMAStack* s);
+   virtual ~RDMADispatcher();
+   void handle_async_event();
+
+   void polling_start();
+   void polling_stop();
+   void polling();
+   void register_qp(QueuePair *qp, RDMAConnectedSocketImpl* csi);
+
+   /**
+    * Queue `w` in `pending_workers` unless it is already queued.
+    * `num_pending_workers` is incremented only when `w` is actually added.
+    * Takes `w_lock` for the duration of the call.
+    */
+   void make_pending_worker(RDMAWorker* w) {
+     Mutex::Locker l(w_lock);
+     auto it = std::find(pending_workers.begin(), pending_workers.end(), w);
+     if (it != pending_workers.end())
+       return;
+     pending_workers.push_back(w);
+     ++num_pending_workers;
+   }
+   RDMAStack* get_stack() { return stack; }
+   RDMAConnectedSocketImpl* get_conn_lockless(uint32_t qp);
+   QueuePair* get_qp(uint32_t qp);
+   void erase_qpn_lockless(uint32_t qpn);
+   void erase_qpn(uint32_t qpn);
+   Infiniband::CompletionQueue* get_tx_cq() const { return tx_cq; }
+   Infiniband::CompletionQueue* get_rx_cq() const { return rx_cq; }
+   void notify_pending_workers();
+   void handle_tx_event(ibv_wc *cqe, int n);
+   void post_tx_buffer(std::vector<Chunk*> &chunks);
+
+   std::atomic<uint64_t> inflight = {0};
+
+   void post_chunk_to_pool(Chunk* chunk);
+   int post_chunks_to_rq(int num, ibv_qp *qp=nullptr);
+ };
+
+ /**
+  * Worker specialisation for the RDMA stack.
+  *
+  * Besides the Worker interface (listen/connect/initialize) it keeps the
+  * list of connections whose sends are pending and retries them from
+  * handle_pending_message(), which is scheduled through `tx_handler`.
+  */
+ class RDMAWorker : public Worker {
+   typedef Infiniband::CompletionQueue CompletionQueue;
+   typedef Infiniband::CompletionChannel CompletionChannel;
+   typedef Infiniband::MemoryManager::Chunk Chunk;
+   typedef Infiniband::MemoryManager MemoryManager;
+   typedef std::vector<Chunk*>::iterator ChunkIter;
+   RDMAStack *stack;
+   EventCallbackRef tx_handler;
+   // connections waiting for handle_pending_message() to resubmit them
+   std::list<RDMAConnectedSocketImpl*> pending_sent_conns;
+   RDMADispatcher* dispatcher = nullptr;
+   Mutex lock;
+
+   /// Event callback that forwards to RDMAWorker::handle_pending_message().
+   class C_handle_cq_tx : public EventCallback {
+     RDMAWorker *worker;
+    public:
+     explicit C_handle_cq_tx(RDMAWorker *w): worker(w) {}
+     void do_request(uint64_t fd) override {
+       worker->handle_pending_message();
+     }
+   };
+
+  public:
+   PerfCounters *perf_logger;
+   explicit RDMAWorker(CephContext *c, unsigned i);
+   virtual ~RDMAWorker();
+   virtual int listen(entity_addr_t &addr,
+                      unsigned addr_slot,
+                      const SocketOptions &opts, ServerSocket *) override;
+   virtual int connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) override;
+   virtual void initialize() override;
+   RDMAStack *get_stack() { return stack; }
+   int get_reged_mem(RDMAConnectedSocketImpl *o, std::vector<Chunk*> &c, size_t bytes);
+   /// Drop `o` from the pending list; must be called on this worker's event thread.
+   void remove_pending_conn(RDMAConnectedSocketImpl *o) {
+     ceph_assert(center.in_thread());
+     pending_sent_conns.remove(o);
+   }
+   void handle_pending_message();
+   void set_stack(RDMAStack *s) { stack = s; }
+   /// Schedule `tx_handler` on this worker's event center from another thread.
+   void notify_worker() {
+     center.dispatch_event_external(tx_handler);
+   }
+ };
+
+ // Plain bundle of the RDMA connection-manager handles for one connection:
+ // the cm id, its event channel and a queue pair number.
+ struct RDMACMInfo {
+   rdma_cm_id *cm_id;
+   rdma_event_channel *cm_channel;
+   uint32_t qp_num;
+
+   RDMACMInfo(rdma_cm_id *cid, rdma_event_channel *cm_channel_, uint32_t qp_num_)
+     : cm_id(cid),
+       cm_channel(cm_channel_),
+       qp_num(qp_num_)
+   {}
+ };
+
+ // ConnectedSocketImpl implementation on top of an Infiniband queue pair.
+ // It implements the socket interface (read/zero_copy_read/send/shutdown/
+ // close/fd/socket_fd) and adds RDMA specific entry points such as submit(),
+ // activate(), fault() and the work-completion hand-off pass_wc()/get_wc().
+ class RDMAConnectedSocketImpl : public ConnectedSocketImpl {
+ public:
+ typedef Infiniband::MemoryManager::Chunk Chunk;
+ typedef Infiniband::CompletionChannel CompletionChannel;
+ typedef Infiniband::CompletionQueue CompletionQueue;
+
+ protected:
+ CephContext *cct;
+ Infiniband::QueuePair *qp;
+ IBSYNMsg peer_msg;
+ IBSYNMsg my_msg;
+ // value returned by is_connected()
+ int connected;
+ int error;
+ Infiniband* infiniband;
+ RDMADispatcher* dispatcher;
+ RDMAWorker* worker;
+ std::vector<Chunk*> buffers;
+ // descriptor returned by fd(); -1 until one is assigned
+ int notify_fd = -1;
+ bufferlist pending_bl;
+
+ Mutex lock;
+ std::vector<ibv_wc> wc;
+ bool is_server;
+ EventCallbackRef read_handler;
+ EventCallbackRef established_handler;
+ // descriptor returned by socket_fd(); -1 until one is assigned
+ int tcp_fd = -1;
+ bool active;// qp is active ?
+ // flag read/written through is_pending()/set_pending(); RDMAWorker sets it
+ // while this socket sits in its pending_sent_conns list
+ bool pending;
+ int post_backlog = 0;
+
+ void notify();
+ ssize_t read_buffers(char* buf, size_t len);
+ int post_work_request(std::vector<Chunk*>&);
+
+ public:
+ RDMAConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s,
+ RDMAWorker *w);
+ virtual ~RDMAConnectedSocketImpl();
+
+ void pass_wc(std::vector<ibv_wc> &&v);
+ void get_wc(std::vector<ibv_wc> &w);
+ virtual int is_connected() override { return connected; }
+
+ virtual ssize_t read(char* buf, size_t len) override;
+ virtual ssize_t zero_copy_read(bufferptr &data) override;
+ virtual ssize_t send(bufferlist &bl, bool more) override;
+ virtual void shutdown() override;
+ virtual void close() override;
+ virtual int fd() const override { return notify_fd; }
+ virtual int socket_fd() const override { return tcp_fd; }
+ void fault();
+ // Human readable name of the queue pair's current state; dereferences `qp`.
+ const char* get_qp_state() { return Infiniband::qp_state_string(qp->get_state()); }
+ // Called by RDMAWorker::handle_pending_message(), which treats a return of
+ // -EAGAIN as "retry later" and any other negative value as a failure.
+ ssize_t submit(bool more);
+ int activate();
+ void fin();
+ void handle_connection();
+ int handle_connection_established(bool need_set_fault = true);
+ void cleanup();
+ void set_accept_fd(int sd);
+ virtual int try_connect(const entity_addr_t&, const SocketOptions &opt);
+ bool is_pending() {return pending;}
+ void set_pending(bool val) {pending = val;}
+ void post_chunks_to_rq(int num);
+ void update_post_backlog();
+ };
+
+ // Status values stored in RDMAIWARPConnectedSocketImpl::status.
+ // Numbering starts at 1 and increases by one per enumerator.
+ enum RDMA_CM_STATUS {
+   IDLE               = 1,
+   RDMA_ID_CREATED    = 2,
+   CHANNEL_FD_CREATED = 3,
+   RESOURCE_ALLOCATED = 4,
+   ADDR_RESOLVED      = 5,
+   ROUTE_RESOLVED     = 6,
+   CONNECTED          = 7,
+   DISCONNECTED       = 8,
+   ERROR              = 9
+ };
+
+ /**
+  * Connected socket variant that establishes its connection through the
+  * RDMA connection manager (rdma_cm_id / rdma_event_channel) and tracks
+  * progress in an RDMA_CM_STATUS value.
+  */
+ class RDMAIWARPConnectedSocketImpl : public RDMAConnectedSocketImpl {
+  public:
+   RDMAIWARPConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s,
+                                RDMAWorker *w, RDMACMInfo *info = nullptr);
+   ~RDMAIWARPConnectedSocketImpl();
+   virtual int try_connect(const entity_addr_t&, const SocketOptions &opt) override;
+   virtual void close() override;
+   virtual void shutdown() override;
+   virtual void handle_cm_connection();
+   uint32_t get_local_qpn() const { return local_qpn; }
+   // NOTE(review): hides the non-virtual `int activate()` declared in
+   // RDMAConnectedSocketImpl; calls through a base pointer reach the base one.
+   void activate();
+   int alloc_resource();
+   void close_notify();
+
+  private:
+   rdma_cm_id *cm_id;
+   rdma_event_channel *cm_channel;
+   uint32_t local_qpn;
+   uint32_t remote_qpn;
+   EventCallbackRef cm_con_handler;
+   // NOTE(review): shadows the protected `is_server` member of
+   // RDMAConnectedSocketImpl; base-class code reads its own copy.
+   bool is_server;
+   std::mutex close_mtx;
+   std::condition_variable close_condition;
+   bool closed;
+   RDMA_CM_STATUS status;
+
+
+   /// Event callback that forwards to handle_cm_connection().
+   class C_handle_cm_connection : public EventCallback {
+     RDMAIWARPConnectedSocketImpl *csi;
+    public:
+     explicit C_handle_cm_connection(RDMAIWARPConnectedSocketImpl *w): csi(w) {}
+     void do_request(uint64_t fd) override {
+       csi->handle_cm_connection();
+     }
+   };
+ };
+
+ // Listening-side socket of the RDMA stack. The descriptor exposed through
+ // fd()/get_fd() is `server_setup_socket`.
+ class RDMAServerSocketImpl : public ServerSocketImpl {
+  protected:
+   CephContext *cct;
+   NetHandler net;
+   int server_setup_socket;
+   Infiniband* infiniband;
+   RDMADispatcher *dispatcher;
+   RDMAWorker *worker;
+   entity_addr_t sa;
+
+  public:
+   RDMAServerSocketImpl(CephContext *cct, Infiniband* i, RDMADispatcher *s,
+                        RDMAWorker *w, entity_addr_t& a, unsigned slot);
+
+   virtual int listen(entity_addr_t &sa, const SocketOptions &opt);
+
+   // ServerSocketImpl interface
+   int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+   void abort_accept() override;
+   int fd() const override { return server_setup_socket; }
+
+   // non-const accessor for the same descriptor as fd()
+   int get_fd() { return server_setup_socket; }
+ };
+
+ // Listening-side socket that overrides listen/accept/abort_accept of
+ // RDMAServerSocketImpl and keeps its own rdma_cm_id and event channel.
+ class RDMAIWARPServerSocketImpl : public RDMAServerSocketImpl {
+  public:
+   RDMAIWARPServerSocketImpl(CephContext *cct, Infiniband *i,
+                             RDMADispatcher *s, RDMAWorker *w,
+                             entity_addr_t& addr, unsigned addr_slot);
+
+   int listen(entity_addr_t &sa, const SocketOptions &opt) override;
+   int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+   void abort_accept() override;
+
+  private:
+   rdma_cm_id *cm_id;
+   rdma_event_channel *cm_channel;
+ };
+
+ /**
+  * NetworkStack implementation for RDMA.
+  *
+  * Owns the Infiniband wrapper, the dispatcher and one std::thread per
+  * spawned worker. `ib` is declared before `dispatcher`, so it is
+  * constructed first.
+  */
+ class RDMAStack : public NetworkStack {
+   // worker threads, indexed by worker id (see spawn_worker/join_worker)
+   std::vector<std::thread> threads;
+   // not set by the constructor; start from a well-defined null value
+   PerfCounters *perf_counter = nullptr;
+   Infiniband ib;
+   RDMADispatcher dispatcher;
+
+   // value reported by is_ready(); set to true by ready()
+   std::atomic<bool> fork_finished = {false};
+
+  public:
+   explicit RDMAStack(CephContext *cct, const std::string &t);
+   virtual ~RDMAStack();
+   virtual bool support_zero_copy_read() const override { return false; }
+   virtual bool nonblock_connect_need_writable_event() const override { return false; }
+
+   virtual void spawn_worker(unsigned i, std::function<void ()> &&func) override;
+   virtual void join_worker(unsigned i) override;
+   RDMADispatcher &get_dispatcher() { return dispatcher; }
+   Infiniband &get_infiniband() { return ib; }
+   virtual bool is_ready() override { return fork_finished.load(); }
+   virtual void ready() override { fork_finished = true; }
+ };
+
+
+#endif