author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000
---|---|---
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000
commit | 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch) |
tree | e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/msg |
parent | Initial commit. (diff) |
Adding upstream version 14.2.21.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/msg')
112 files changed, 41000 insertions, 0 deletions
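This commit introduces the messenger core: Connection (per-connection state), DispatchQueue (priority-ordered message delivery), Dispatcher (the callback interface daemons implement), and Message (the wire envelope). For orientation before the raw diff, here is a minimal sketch of a Dispatcher consumer written against the interface added in src/msg/Dispatcher.h below. ExampleDispatcher and its handling of CEPH_MSG_PING are invented for illustration; only the ms_* virtuals and their reference-counting contract come from the headers in this diff.

// A minimal sketch, assuming the Dispatcher interface added below in
// src/msg/Dispatcher.h. ExampleDispatcher is hypothetical, not part of
// this commit.
#include "msg/Dispatcher.h"
#include "msg/Message.h"

class ExampleDispatcher : public Dispatcher {
public:
  explicit ExampleDispatcher(CephContext *cct) : Dispatcher(cct) {}

  // Regular dispatch: the Messenger hands us a single reference on m;
  // if we claim the message we must consume that reference.
  bool ms_dispatch(Message *m) override {
    switch (m->get_type()) {
    case CEPH_MSG_PING:
      // A reply would go out via m->get_connection()->send_message(...),
      // which likewise consumes a reference on the message passed in.
      m->put();     // consume the reference we were given
      return true;  // handled; remaining Dispatchers are not consulted
    default:
      return false; // not ours; the Messenger tries the next Dispatcher
    }
  }

  // Fault notifications; see the doc comments in Dispatcher.h below for
  // the exact ordered+reliable-delivery semantics of each.
  bool ms_handle_reset(Connection *con) override { return false; }
  void ms_handle_remote_reset(Connection *con) override {}
  bool ms_handle_refused(Connection *con) override { return false; }
};

A fast-dispatch-capable consumer would additionally override ms_can_fast_dispatch() and ms_fast_dispatch(), subject to the locking constraints spelled out in the Dispatcher.h doc comments.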
diff --git a/src/msg/CMakeLists.txt b/src/msg/CMakeLists.txt
new file mode 100644
index 00000000..1ad34615
--- /dev/null
+++ b/src/msg/CMakeLists.txt
@@ -0,0 +1,76 @@
+set(msg_srcs
+  DispatchQueue.cc
+  Message.cc
+  Messenger.cc
+  QueueStrategy.cc
+  msg_types.cc
+  simple/Accepter.cc
+  simple/Pipe.cc
+  simple/PipeConnection.cc
+  simple/SimpleMessenger.cc)
+
+if(HAVE_XIO)
+  list(APPEND msg_srcs
+    xio/XioConnection.cc
+    xio/XioMsg.cc
+    xio/XioPool.cc
+    xio/XioMessenger.cc
+    xio/XioPortal.cc)
+endif(HAVE_XIO)
+
+list(APPEND msg_srcs
+  async/AsyncConnection.cc
+  async/AsyncMessenger.cc
+  async/Protocol.cc
+  async/ProtocolV1.cc
+  async/ProtocolV2.cc
+  async/Event.cc
+  async/EventSelect.cc
+  async/PosixStack.cc
+  async/Stack.cc
+  async/crypto_onwire.cc
+  async/frames_v2.cc
+  async/net_handler.cc)
+
+if(LINUX)
+  list(APPEND msg_srcs
+    async/EventEpoll.cc)
+elseif(FREEBSD OR APPLE)
+  list(APPEND msg_srcs
+    async/EventKqueue.cc)
+endif(LINUX)
+
+if(HAVE_RDMA)
+  list(APPEND msg_srcs
+    async/rdma/Infiniband.cc
+    async/rdma/RDMAConnectedSocketImpl.cc
+    async/rdma/RDMAIWARPConnectedSocketImpl.cc
+    async/rdma/RDMAServerSocketImpl.cc
+    async/rdma/RDMAIWARPServerSocketImpl.cc
+    async/rdma/RDMAStack.cc)
+endif()
+
+add_library(common-msg-objs OBJECT ${msg_srcs})
+
+if(WITH_DPDK)
+  set(async_dpdk_srcs
+    async/dpdk/ARP.cc
+    async/dpdk/DPDK.cc
+    async/dpdk/dpdk_rte.cc
+    async/dpdk/DPDKStack.cc
+    async/dpdk/EventDPDK.cc
+    async/dpdk/IP.cc
+    async/dpdk/net.cc
+    async/dpdk/IPChecksum.cc
+    async/dpdk/Packet.cc
+    async/dpdk/TCP.cc
+    async/dpdk/UserspaceEvent.cc
+    async/dpdk/ethernet.cc)
+  add_library(common_async_dpdk STATIC
+    ${async_dpdk_srcs})
+  target_link_libraries(common_async_dpdk PRIVATE
+    dpdk::dpdk)
+  # Stack.cc includes DPDKStack.h, which includes rte_config.h indirectly
+  target_include_directories(common-msg-objs PRIVATE
+    $<TARGET_PROPERTY:dpdk::dpdk,INTERFACE_INCLUDE_DIRECTORIES>)
+endif(WITH_DPDK)
diff --git a/src/msg/Connection.h b/src/msg/Connection.h
new file mode 100644
index 00000000..4eea5ff0
--- /dev/null
+++ b/src/msg/Connection.h
@@ -0,0 +1,253 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_CONNECTION_H
+#define CEPH_CONNECTION_H
+
+#include <stdlib.h>
+#include <ostream>
+
+#include <boost/intrusive_ptr.hpp>
+
+#include "auth/Auth.h"
+#include "common/RefCountedObj.h"
+#include "common/config.h"
+#include "common/debug.h"
+#include "common/Mutex.h"
+#include "include/ceph_assert.h" // Because intusive_ptr clobbers our assert...
+#include "include/buffer.h" +#include "include/types.h" +#include "common/item_history.h" +#include "msg/MessageRef.h" + + +// ====================================================== + +// abstract Connection, for keeping per-connection state + +class Messenger; + +#ifdef UNIT_TESTS_BUILT +class Interceptor; +#endif + +struct Connection : public RefCountedObject { + mutable Mutex lock; + Messenger *msgr; + RefCountedPtr priv; + int peer_type; + int64_t peer_id = -1; // [msgr2 only] the 0 of osd.0, 4567 or client.4567 + safe_item_history<entity_addrvec_t> peer_addrs; + utime_t last_keepalive, last_keepalive_ack; +private: + uint64_t features; +public: + bool failed; // true if we are a lossy connection that has failed. + + int rx_buffers_version; + map<ceph_tid_t,pair<bufferlist,int> > rx_buffers; + + // authentication state + // FIXME make these private after ms_handle_authorizer is removed +public: + AuthCapsInfo peer_caps_info; + EntityName peer_name; + uint64_t peer_global_id = 0; + +#ifdef UNIT_TESTS_BUILT + Interceptor *interceptor; +#endif + + friend class boost::intrusive_ptr<Connection>; + friend class PipeConnection; + +public: + Connection(CephContext *cct, Messenger *m) + // we are managed exclusively by ConnectionRef; make it so you can + // ConnectionRef foo = new Connection; + : RefCountedObject(cct, 0), + lock("Connection::lock"), + msgr(m), + peer_type(-1), + features(0), + failed(false), + rx_buffers_version(0) { + } + + ~Connection() override { + //generic_dout(0) << "~Connection " << this << dendl; + } + + void set_priv(const RefCountedPtr& o) { + Mutex::Locker l(lock); + priv = o; + } + + RefCountedPtr get_priv() { + Mutex::Locker l(lock); + return priv; + } + + /** + * Used to judge whether this connection is ready to send. Usually, the + * implementation need to build a own shakehand or sesson then it can be + * ready to send. + * + * @return true if ready to send, or false otherwise + */ + virtual bool is_connected() = 0; + + Messenger *get_messenger() { + return msgr; + } + + /** + * Queue the given Message to send out on the given Connection. + * Success in this function does not guarantee Message delivery, only + * success in queueing the Message. Other guarantees may be provided based + * on the Connection policy. + * + * @param m The Message to send. The Messenger consumes a single reference + * when you pass it in. + * + * @return 0 on success, or -errno on failure. + */ + virtual int send_message(Message *m) = 0; + + virtual int send_message2(MessageRef m) + { + return send_message(m.detach()); /* send_message(Message *m) consumes a reference */ + } + + /** + * Send a "keepalive" ping along the given Connection, if it's working. + * If the underlying connection has broken, this function does nothing. + * + * @return 0, or implementation-defined error numbers. + */ + virtual void send_keepalive() = 0; + /** + * Mark down the given Connection. + * + * This will cause us to discard its outgoing queue, and if reset + * detection is enabled in the policy and the endpoint tries to + * reconnect they will discard their queue when we inform them of + * the session reset. + * + * It does not generate any notifications to the Dispatcher. + */ + virtual void mark_down() = 0; + + /** + * Mark a Connection as "disposable", setting it to lossy + * (regardless of initial Policy). 
This does not immediately close + * the Connection once Messages have been delivered, so as long as + * there are no errors you can continue to receive responses; but it + * will not attempt to reconnect for message delivery or preserve + * your old delivery semantics, either. + * + * TODO: There's some odd stuff going on in our SimpleMessenger + * implementation during connect that looks unused; is there + * more of a contract that that's enforcing? + */ + virtual void mark_disposable() = 0; + + // WARNING / FIXME: this is not populated for loopback connections + AuthCapsInfo& get_peer_caps_info() { + return peer_caps_info; + } + const EntityName& get_peer_entity_name() { + return peer_name; + } + uint64_t get_peer_global_id() { + return peer_global_id; + } + + int get_peer_type() const { return peer_type; } + void set_peer_type(int t) { peer_type = t; } + + // peer_id is only defined for msgr2 + int64_t get_peer_id() const { return peer_id; } + void set_peer_id(int64_t t) { peer_id = t; } + + bool peer_is_mon() const { return peer_type == CEPH_ENTITY_TYPE_MON; } + bool peer_is_mgr() const { return peer_type == CEPH_ENTITY_TYPE_MGR; } + bool peer_is_mds() const { return peer_type == CEPH_ENTITY_TYPE_MDS; } + bool peer_is_osd() const { return peer_type == CEPH_ENTITY_TYPE_OSD; } + bool peer_is_client() const { return peer_type == CEPH_ENTITY_TYPE_CLIENT; } + + /// which of the peer's addrs is actually in use for this connection + virtual entity_addr_t get_peer_socket_addr() const = 0; + + entity_addr_t get_peer_addr() const { + return peer_addrs->front(); + } + const entity_addrvec_t& get_peer_addrs() const { + return *peer_addrs; + } + void set_peer_addr(const entity_addr_t& a) { + peer_addrs = entity_addrvec_t(a); + } + void set_peer_addrs(const entity_addrvec_t& av) { peer_addrs = av; } + + uint64_t get_features() const { return features; } + bool has_feature(uint64_t f) const { return features & f; } + bool has_features(uint64_t f) const { + return (features & f) == f; + } + void set_features(uint64_t f) { features = f; } + void set_feature(uint64_t f) { features |= f; } + + virtual int get_con_mode() const { + return CEPH_CON_MODE_CRC; + } + + void post_rx_buffer(ceph_tid_t tid, bufferlist& bl) { +#if 0 + Mutex::Locker l(lock); + ++rx_buffers_version; + rx_buffers[tid] = pair<bufferlist,int>(bl, rx_buffers_version); +#endif + } + + void revoke_rx_buffer(ceph_tid_t tid) { +#if 0 + Mutex::Locker l(lock); + rx_buffers.erase(tid); +#endif + } + + utime_t get_last_keepalive() const { + Mutex::Locker l(lock); + return last_keepalive; + } + void set_last_keepalive(utime_t t) { + Mutex::Locker l(lock); + last_keepalive = t; + } + utime_t get_last_keepalive_ack() const { + Mutex::Locker l(lock); + return last_keepalive_ack; + } + void set_last_keepalive_ack(utime_t t) { + Mutex::Locker l(lock); + last_keepalive_ack = t; + } + +}; + +typedef boost::intrusive_ptr<Connection> ConnectionRef; + + +#endif /* CEPH_CONNECTION_H */ diff --git a/src/msg/DispatchQueue.cc b/src/msg/DispatchQueue.cc new file mode 100644 index 00000000..587a2dbe --- /dev/null +++ b/src/msg/DispatchQueue.cc @@ -0,0 +1,259 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * 
Foundation. See file COPYING. + * + */ + +#include "msg/Message.h" +#include "DispatchQueue.h" +#include "Messenger.h" +#include "common/ceph_context.h" + +#define dout_subsys ceph_subsys_ms +#include "common/debug.h" + + +/******************* + * DispatchQueue + */ + +#undef dout_prefix +#define dout_prefix *_dout << "-- " << msgr->get_myaddrs() << " " + +double DispatchQueue::get_max_age(utime_t now) const { + Mutex::Locker l(lock); + if (marrival.empty()) + return 0; + else + return (now - marrival.begin()->first); +} + +uint64_t DispatchQueue::pre_dispatch(const Message::ref& m) +{ + ldout(cct,1) << "<== " << m->get_source_inst() + << " " << m->get_seq() + << " ==== " << *m + << " ==== " << m->get_payload().length() + << "+" << m->get_middle().length() + << "+" << m->get_data().length() + << " (" << ceph_con_mode_name(m->get_connection()->get_con_mode()) + << " " << m->get_footer().front_crc << " " + << m->get_footer().middle_crc + << " " << m->get_footer().data_crc << ")" + << " " << m << " con " << m->get_connection() + << dendl; + uint64_t msize = m->get_dispatch_throttle_size(); + m->set_dispatch_throttle_size(0); // clear it out, in case we requeue this message. + return msize; +} + +void DispatchQueue::post_dispatch(const Message::ref& m, uint64_t msize) +{ + dispatch_throttle_release(msize); + ldout(cct,20) << "done calling dispatch on " << m << dendl; +} + +bool DispatchQueue::can_fast_dispatch(const Message::const_ref &m) const +{ + return msgr->ms_can_fast_dispatch(m); +} + +void DispatchQueue::fast_dispatch(const Message::ref& m) +{ + uint64_t msize = pre_dispatch(m); + msgr->ms_fast_dispatch(m); + post_dispatch(m, msize); +} + +void DispatchQueue::fast_preprocess(const Message::ref& m) +{ + msgr->ms_fast_preprocess(m); +} + +void DispatchQueue::enqueue(const Message::ref& m, int priority, uint64_t id) +{ + Mutex::Locker l(lock); + if (stop) { + return; + } + ldout(cct,20) << "queue " << m << " prio " << priority << dendl; + add_arrival(m); + if (priority >= CEPH_MSG_PRIO_LOW) { + mqueue.enqueue_strict(id, priority, QueueItem(m)); + } else { + mqueue.enqueue(id, priority, m->get_cost(), QueueItem(m)); + } + cond.Signal(); +} + +void DispatchQueue::local_delivery(const Message::ref& m, int priority) +{ + m->set_recv_stamp(ceph_clock_now()); + Mutex::Locker l(local_delivery_lock); + if (local_messages.empty()) + local_delivery_cond.Signal(); + local_messages.emplace(m, priority); + return; +} + +void DispatchQueue::run_local_delivery() +{ + local_delivery_lock.Lock(); + while (true) { + if (stop_local_delivery) + break; + if (local_messages.empty()) { + local_delivery_cond.Wait(local_delivery_lock); + continue; + } + auto p = std::move(local_messages.front()); + local_messages.pop(); + local_delivery_lock.Unlock(); + const Message::ref& m = p.first; + int priority = p.second; + fast_preprocess(m); + if (can_fast_dispatch(m)) { + fast_dispatch(m); + } else { + enqueue(m, priority, 0); + } + local_delivery_lock.Lock(); + } + local_delivery_lock.Unlock(); +} + +void DispatchQueue::dispatch_throttle_release(uint64_t msize) +{ + if (msize) { + ldout(cct,10) << __func__ << " " << msize << " to dispatch throttler " + << dispatch_throttler.get_current() << "/" + << dispatch_throttler.get_max() << dendl; + dispatch_throttler.put(msize); + } +} + +/* + * This function delivers incoming messages to the Messenger. 
+ * Connections with messages are kept in queues; when beginning a message + * delivery the highest-priority queue is selected, the connection from the + * front of the queue is removed, and its message read. If the connection + * has remaining messages at that priority level, it is re-placed on to the + * end of the queue. If the queue is empty; it's removed. + * The message is then delivered and the process starts again. + */ +void DispatchQueue::entry() +{ + lock.Lock(); + while (true) { + while (!mqueue.empty()) { + QueueItem qitem = mqueue.dequeue(); + if (!qitem.is_code()) + remove_arrival(qitem.get_message()); + lock.Unlock(); + + if (qitem.is_code()) { + if (cct->_conf->ms_inject_internal_delays && + cct->_conf->ms_inject_delay_probability && + (rand() % 10000)/10000.0 < cct->_conf->ms_inject_delay_probability) { + utime_t t; + t.set_from_double(cct->_conf->ms_inject_internal_delays); + ldout(cct, 1) << "DispatchQueue::entry inject delay of " << t + << dendl; + t.sleep(); + } + switch (qitem.get_code()) { + case D_BAD_REMOTE_RESET: + msgr->ms_deliver_handle_remote_reset(qitem.get_connection()); + break; + case D_CONNECT: + msgr->ms_deliver_handle_connect(qitem.get_connection()); + break; + case D_ACCEPT: + msgr->ms_deliver_handle_accept(qitem.get_connection()); + break; + case D_BAD_RESET: + msgr->ms_deliver_handle_reset(qitem.get_connection()); + break; + case D_CONN_REFUSED: + msgr->ms_deliver_handle_refused(qitem.get_connection()); + break; + default: + ceph_abort(); + } + } else { + const Message::ref& m = qitem.get_message(); + if (stop) { + ldout(cct,10) << " stop flag set, discarding " << m << " " << *m << dendl; + } else { + uint64_t msize = pre_dispatch(m); + msgr->ms_deliver_dispatch(m); + post_dispatch(m, msize); + } + } + + lock.Lock(); + } + if (stop) + break; + + // wait for something to be put on queue + cond.Wait(lock); + } + lock.Unlock(); +} + +void DispatchQueue::discard_queue(uint64_t id) { + Mutex::Locker l(lock); + list<QueueItem> removed; + mqueue.remove_by_class(id, &removed); + for (list<QueueItem>::iterator i = removed.begin(); + i != removed.end(); + ++i) { + ceph_assert(!(i->is_code())); // We don't discard id 0, ever! 
+ const Message::ref& m = i->get_message(); + remove_arrival(m); + dispatch_throttle_release(m->get_dispatch_throttle_size()); + } +} + +void DispatchQueue::start() +{ + ceph_assert(!stop); + ceph_assert(!dispatch_thread.is_started()); + dispatch_thread.create("ms_dispatch"); + local_delivery_thread.create("ms_local"); +} + +void DispatchQueue::wait() +{ + local_delivery_thread.join(); + dispatch_thread.join(); +} + +void DispatchQueue::discard_local() +{ + decltype(local_messages)().swap(local_messages); +} + +void DispatchQueue::shutdown() +{ + // stop my local delivery thread + local_delivery_lock.Lock(); + stop_local_delivery = true; + local_delivery_cond.Signal(); + local_delivery_lock.Unlock(); + + // stop my dispatch thread + lock.Lock(); + stop = true; + cond.Signal(); + lock.Unlock(); +} diff --git a/src/msg/DispatchQueue.h b/src/msg/DispatchQueue.h new file mode 100644 index 00000000..2d90d82c --- /dev/null +++ b/src/msg/DispatchQueue.h @@ -0,0 +1,243 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_DISPATCHQUEUE_H +#define CEPH_DISPATCHQUEUE_H + +#include <atomic> +#include <map> +#include <queue> +#include <boost/intrusive_ptr.hpp> +#include "include/ceph_assert.h" +#include "common/Throttle.h" +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/Thread.h" +#include "common/PrioritizedQueue.h" + +#include "Message.h" + +class CephContext; +class Messenger; +struct Connection; + +/** + * The DispatchQueue contains all the connections which have Messages + * they want to be dispatched, carefully organized by Message priority + * and permitted to deliver in a round-robin fashion. + * See Messenger::dispatch_entry for details. + */ +class DispatchQueue { + class QueueItem { + int type; + ConnectionRef con; + Message::ref m; + public: + explicit QueueItem(const Message::ref& m) : type(-1), con(0), m(m) {} + QueueItem(int type, Connection *con) : type(type), con(con), m(0) {} + bool is_code() const { + return type != -1; + } + int get_code () const { + ceph_assert(is_code()); + return type; + } + const Message::ref& get_message() { + ceph_assert(!is_code()); + return m; + } + Connection *get_connection() { + ceph_assert(is_code()); + return con.get(); + } + }; + + CephContext *cct; + Messenger *msgr; + mutable Mutex lock; + Cond cond; + + PrioritizedQueue<QueueItem, uint64_t> mqueue; + + std::set<pair<double, Message::ref>> marrival; + map<Message::ref, decltype(marrival)::iterator> marrival_map; + void add_arrival(const Message::ref& m) { + marrival_map.insert( + make_pair( + m, + marrival.insert(make_pair(m->get_recv_stamp(), m)).first + ) + ); + } + void remove_arrival(const Message::ref& m) { + auto it = marrival_map.find(m); + ceph_assert(it != marrival_map.end()); + marrival.erase(it->second); + marrival_map.erase(it); + } + + std::atomic<uint64_t> next_id; + + enum { D_CONNECT = 1, D_ACCEPT, D_BAD_REMOTE_RESET, D_BAD_RESET, D_CONN_REFUSED, D_NUM_CODES }; + + /** + * The DispatchThread runs dispatch_entry to empty out the dispatch_queue. 
+ */ + class DispatchThread : public Thread { + DispatchQueue *dq; + public: + explicit DispatchThread(DispatchQueue *dq) : dq(dq) {} + void *entry() override { + dq->entry(); + return 0; + } + } dispatch_thread; + + Mutex local_delivery_lock; + Cond local_delivery_cond; + bool stop_local_delivery; + std::queue<pair<Message::ref, int>> local_messages; + class LocalDeliveryThread : public Thread { + DispatchQueue *dq; + public: + explicit LocalDeliveryThread(DispatchQueue *dq) : dq(dq) {} + void *entry() override { + dq->run_local_delivery(); + return 0; + } + } local_delivery_thread; + + uint64_t pre_dispatch(const Message::ref& m); + void post_dispatch(const Message::ref& m, uint64_t msize); + + public: + + /// Throttle preventing us from building up a big backlog waiting for dispatch + Throttle dispatch_throttler; + + bool stop; + void local_delivery(const Message::ref& m, int priority); + void local_delivery(Message* m, int priority) { + return local_delivery(Message::ref(m, false), priority); /* consume ref */ + } + void run_local_delivery(); + + double get_max_age(utime_t now) const; + + int get_queue_len() const { + Mutex::Locker l(lock); + return mqueue.length(); + } + + /** + * Release memory accounting back to the dispatch throttler. + * + * @param msize The amount of memory to release. + */ + void dispatch_throttle_release(uint64_t msize); + + void queue_connect(Connection *con) { + Mutex::Locker l(lock); + if (stop) + return; + mqueue.enqueue_strict( + 0, + CEPH_MSG_PRIO_HIGHEST, + QueueItem(D_CONNECT, con)); + cond.Signal(); + } + void queue_accept(Connection *con) { + Mutex::Locker l(lock); + if (stop) + return; + mqueue.enqueue_strict( + 0, + CEPH_MSG_PRIO_HIGHEST, + QueueItem(D_ACCEPT, con)); + cond.Signal(); + } + void queue_remote_reset(Connection *con) { + Mutex::Locker l(lock); + if (stop) + return; + mqueue.enqueue_strict( + 0, + CEPH_MSG_PRIO_HIGHEST, + QueueItem(D_BAD_REMOTE_RESET, con)); + cond.Signal(); + } + void queue_reset(Connection *con) { + Mutex::Locker l(lock); + if (stop) + return; + mqueue.enqueue_strict( + 0, + CEPH_MSG_PRIO_HIGHEST, + QueueItem(D_BAD_RESET, con)); + cond.Signal(); + } + void queue_refused(Connection *con) { + Mutex::Locker l(lock); + if (stop) + return; + mqueue.enqueue_strict( + 0, + CEPH_MSG_PRIO_HIGHEST, + QueueItem(D_CONN_REFUSED, con)); + cond.Signal(); + } + + bool can_fast_dispatch(const Message::const_ref &m) const; + void fast_dispatch(const Message::ref& m); + void fast_dispatch(Message* m) { + return fast_dispatch(Message::ref(m, false)); /* consume ref */ + } + void fast_preprocess(const Message::ref& m); + void enqueue(const Message::ref& m, int priority, uint64_t id); + void enqueue(Message* m, int priority, uint64_t id) { + return enqueue(Message::ref(m, false), priority, id); /* consume ref */ + } + void discard_queue(uint64_t id); + void discard_local(); + uint64_t get_id() { + return next_id++; + } + void start(); + void entry(); + void wait(); + void shutdown(); + bool is_started() const {return dispatch_thread.is_started();} + + DispatchQueue(CephContext *cct, Messenger *msgr, string &name) + : cct(cct), msgr(msgr), + lock("Messenger::DispatchQueue::lock" + name), + mqueue(cct->_conf->ms_pq_max_tokens_per_priority, + cct->_conf->ms_pq_min_cost), + next_id(1), + dispatch_thread(this), + local_delivery_lock("Messenger::DispatchQueue::local_delivery_lock" + name), + stop_local_delivery(false), + local_delivery_thread(this), + dispatch_throttler(cct, string("msgr_dispatch_throttler-") + name, + 
cct->_conf->ms_dispatch_throttle_bytes), + stop(false) + {} + ~DispatchQueue() { + ceph_assert(mqueue.empty()); + ceph_assert(marrival.empty()); + ceph_assert(local_messages.empty()); + } +}; + +#endif diff --git a/src/msg/DispatchStrategy.h b/src/msg/DispatchStrategy.h new file mode 100644 index 00000000..4c9726ed --- /dev/null +++ b/src/msg/DispatchStrategy.h @@ -0,0 +1,37 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 CohortFS, LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef DISPATCH_STRATEGY_H +#define DISPATCH_STRATEGY_H + +#include "msg/Message.h" + +class Messenger; + +class DispatchStrategy +{ +protected: + Messenger *msgr = nullptr; +public: + DispatchStrategy() {} + Messenger *get_messenger() { return msgr; } + void set_messenger(Messenger *_msgr) { msgr = _msgr; } + virtual void ds_dispatch(Message *m) = 0; + virtual void shutdown() = 0; + virtual void start() = 0; + virtual void wait() = 0; + virtual ~DispatchStrategy() {} +}; + +#endif /* DISPATCH_STRATEGY_H */ diff --git a/src/msg/Dispatcher.h b/src/msg/Dispatcher.h new file mode 100644 index 00000000..fef5e320 --- /dev/null +++ b/src/msg/Dispatcher.h @@ -0,0 +1,264 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_DISPATCHER_H +#define CEPH_DISPATCHER_H + +#include <memory> +#include "include/buffer_fwd.h" +#include "include/ceph_assert.h" +#include "msg/MessageRef.h" + +class Messenger; +class Connection; +class AuthAuthorizer; +class CryptoKey; +class CephContext; +class AuthAuthorizerChallenge; +class KeyStore; + +class Dispatcher { +public: + explicit Dispatcher(CephContext *cct_) + : cct(cct_) + { + } + virtual ~Dispatcher() { } + + /** + * The Messenger calls this function to query if you are capable + * of "fast dispatch"ing a message. Indicating that you can fast + * dispatch it requires that you: + * 1) Handle the Message quickly and without taking long-term contended + * locks. (This function is likely to be called in-line with message + * receipt.) + * 2) Be able to accept the Message even if you have not yet received + * an ms_handle_accept() notification for the Connection it is associated + * with, and even if you *have* called mark_down() or received an + * ms_handle_reset() (or similar) call on the Connection. You will + * not receive more than one dead "message" (and should generally be + * prepared for that circumstance anyway, since the normal dispatch can begin, + * then trigger Connection failure before it's percolated through your system). + * We provide ms_handle_fast_[connect|accept] calls if you need them, under + * similar speed and state constraints as fast_dispatch itself. 
+ * 3) Be able to make a determination on fast_dispatch without relying + * on particular system state -- the ms_can_fast_dispatch() call might + * be called multiple times on a single message; the state might change between + * calling ms_can_fast_dispatch and ms_fast_dispatch; etc. + * + * @param m The message we want to fast dispatch. + * @returns True if the message can be fast dispatched; false otherwise. + */ + virtual bool ms_can_fast_dispatch(const Message *m) const { return false; } + virtual bool ms_can_fast_dispatch2(const MessageConstRef& m) const { + return ms_can_fast_dispatch(m.get()); + } + /** + * This function determines if a dispatcher is included in the + * list of fast-dispatch capable Dispatchers. + * @returns True if the Dispatcher can handle any messages via + * fast dispatch; false otherwise. + */ + virtual bool ms_can_fast_dispatch_any() const { return false; } + /** + * Perform a "fast dispatch" on a given message. See + * ms_can_fast_dispatch() for the requirements. + * + * @param m The Message to fast dispatch. + */ + virtual void ms_fast_dispatch(Message *m) { ceph_abort(); } + + /* ms_fast_dispatch2 because otherwise the child must define both */ + virtual void ms_fast_dispatch2(const MessageRef &m) { + /* allow old style dispatch handling that expects a Message * with a floating ref */ + return ms_fast_dispatch(MessageRef(m).detach()); /* XXX N.B. always consumes ref */ + } + + /** + * Let the Dispatcher preview a Message before it is dispatched. This + * function is called on *every* Message, prior to the fast/regular dispatch + * decision point, but it is only used on fast-dispatch capable systems. An + * implementation of ms_fast_preprocess must be essentially lock-free in the + * same way as the ms_fast_dispatch function is (in particular, ms_fast_preprocess + * may be called while the Messenger holds internal locks that prevent progress from + * other threads, so any locks it takes must be at the very bottom of the hierarchy). + * Messages are delivered in receipt order within a single Connection, but there are + * no guarantees across Connections. This makes it useful for some limited + * coordination between Messages which can be fast_dispatch'ed and those which must + * go through normal dispatch. + * + * @param m A message which has been received + */ + virtual void ms_fast_preprocess(Message *m) {} + + /* ms_fast_preprocess2 because otherwise the child must define both */ + virtual void ms_fast_preprocess2(const MessageRef &m) { + /* allow old style dispatch handling that expects a Message* */ + return ms_fast_preprocess(m.get()); + } + + /** + * The Messenger calls this function to deliver a single message. + * + * @param m The message being delivered. You (the Dispatcher) + * are given a single reference count on it. + */ + virtual bool ms_dispatch(Message *m) { + ceph_abort(); + } + + /* ms_dispatch2 because otherwise the child must define both */ + virtual bool ms_dispatch2(const MessageRef &m) { + /* allow old style dispatch handling that expects a Message * with a floating ref */ + MessageRef mr(m); + if (ms_dispatch(mr.get())) { + mr.detach(); /* dispatcher consumed ref */ + return true; + } + return false; + } + + /** + * This function will be called whenever a Connection is newly-created + * or reconnects in the Messenger. + * + * @param con The new Connection which has been established. You are not + * granted a reference to it -- take one if you need one! 
+ */ + virtual void ms_handle_connect(Connection *con) {} + + /** + * This function will be called synchronously whenever a Connection is + * newly-created or reconnects in the Messenger, if you support fast + * dispatch. It is guaranteed to be called before any messages are + * dispatched. + * + * @param con The new Connection which has been established. You are not + * granted a reference to it -- take one if you need one! + */ + virtual void ms_handle_fast_connect(Connection *con) {} + + /** + * Callback indicating we have accepted an incoming connection. + * + * @param con The (new or existing) Connection associated with the session + */ + virtual void ms_handle_accept(Connection *con) {} + + /** + * Callback indicating we have accepted an incoming connection, if you + * support fast dispatch. It is guaranteed to be called before any messages + * are dispatched. + * + * @param con The (new or existing) Connection associated with the session + */ + virtual void ms_handle_fast_accept(Connection *con) {} + + /* + * this indicates that the ordered+reliable delivery semantics have + * been violated. Messages may have been lost due to a fault + * in the network connection. + * Only called on lossy Connections. + * + * @param con The Connection which broke. You are not granted + * a reference to it. + */ + virtual bool ms_handle_reset(Connection *con) = 0; + + /** + * This indicates that the ordered+reliable delivery semantics + * have been violated because the remote somehow reset. + * It implies that incoming messages were dropped, and + * probably some of our previous outgoing messages were too. + * + * @param con The Connection which broke. You are not granted + * a reference to it. + */ + virtual void ms_handle_remote_reset(Connection *con) = 0; + + /** + * This indicates that the connection is both broken and further + * connection attempts are failing because other side refuses + * it. + * + * @param con The Connection which broke. You are not granted + * a reference to it. + */ + virtual bool ms_handle_refused(Connection *con) = 0; + + /** + * @defgroup Authentication + * @{ + */ + + /** + * handle successful authentication (msgr2) + * + * Authenticated result/state will be attached to the Connection. + * + * return 1 for success + * return 0 for no action (let another Dispatcher handle it) + * return <0 for failure (failure to parse caps, for instance) + */ + virtual int ms_handle_authentication(Connection *con) { + return 0; + } + + /** + * get authentication keyring + * + * Return the keyring to use for authentication with msgr1. Remove me + * someday. + */ + virtual KeyStore* ms_get_auth1_authorizer_keystore() { + return nullptr; + } + + /** + * Retrieve the AuthAuthorizer for the given peer type. It might not + * provide one if it knows there is no AuthAuthorizer for that type. + * + * @param dest_type The peer type we want the authorizer for. + * @param a Double pointer to an AuthAuthorizer. The Dispatcher will fill + * in *a with the correct AuthAuthorizer, if it can. Make sure that you have + * set *a to NULL before calling in. + * @param force_new Force the Dispatcher to wait for a new set of keys before + * returning the authorizer. + * + * @return True if this function call properly filled in *a, false otherwise. 
+ */ + virtual bool ms_get_authorizer(int dest_type, AuthAuthorizer **a) { + return false; + } + /** + * @} //Authentication + */ + + void ms_set_require_authorizer(bool b) { + require_authorizer = b; + } +protected: + CephContext *cct; +public: + // allow unauthenticated connections. This is needed for + // compatibility with pre-nautilus OSDs, which do not authenticate + // the heartbeat sessions. + bool require_authorizer = true; +private: + explicit Dispatcher(const Dispatcher &rhs); + Dispatcher& operator=(const Dispatcher &rhs); +}; + +#endif diff --git a/src/msg/FastStrategy.h b/src/msg/FastStrategy.h new file mode 100644 index 00000000..001ff400 --- /dev/null +++ b/src/msg/FastStrategy.h @@ -0,0 +1,35 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 CohortFS, LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef FAST_STRATEGY_H +#define FAST_STRATEGY_H +#include "DispatchStrategy.h" + +class FastStrategy : public DispatchStrategy { +public: + FastStrategy() {} + void ds_dispatch(Message *m) override { + msgr->ms_fast_preprocess(m); + if (msgr->ms_can_fast_dispatch(m)) + msgr->ms_fast_dispatch(m); + else + msgr->ms_deliver_dispatch(m); + } + void shutdown() override {} + void start() override {} + void wait() override {} + virtual ~FastStrategy() {} +}; +#endif /* FAST_STRATEGY_H */ diff --git a/src/msg/Message.cc b/src/msg/Message.cc new file mode 100644 index 00000000..d36a95eb --- /dev/null +++ b/src/msg/Message.cc @@ -0,0 +1,1000 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifdef ENCODE_DUMP +# include <typeinfo> +# include <cxxabi.h> +#endif + +#include <iostream> + +#include "include/types.h" + +#include "global/global_context.h" + +#include "Message.h" + +#include "messages/MPGStats.h" + +#include "messages/MGenericMessage.h" + +#include "messages/MPGStatsAck.h" + +#include "messages/MStatfs.h" +#include "messages/MStatfsReply.h" + +#include "messages/MGetPoolStats.h" +#include "messages/MGetPoolStatsReply.h" + + +#include "messages/MPoolOp.h" +#include "messages/MPoolOpReply.h" + +#include "messages/PaxosServiceMessage.h" +#include "messages/MMonCommand.h" +#include "messages/MMonCommandAck.h" +#include "messages/MMonPaxos.h" +#include "messages/MConfig.h" +#include "messages/MGetConfig.h" + +#include "messages/MMonProbe.h" +#include "messages/MMonJoin.h" +#include "messages/MMonElection.h" +#include "messages/MMonSync.h" +#include "messages/MMonScrub.h" + +#include "messages/MLog.h" +#include "messages/MLogAck.h" + +#include "messages/MPing.h" + +#include "messages/MCommand.h" +#include "messages/MCommandReply.h" +#include "messages/MBackfillReserve.h" +#include "messages/MRecoveryReserve.h" + +#include "messages/MRoute.h" +#include "messages/MForward.h" + +#include "messages/MOSDBoot.h" +#include "messages/MOSDAlive.h" +#include "messages/MOSDBeacon.h" +#include "messages/MOSDPGTemp.h" +#include "messages/MOSDFailure.h" +#include "messages/MOSDMarkMeDown.h" +#include "messages/MOSDFull.h" +#include "messages/MOSDPing.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "messages/MOSDRepOp.h" +#include "messages/MOSDRepOpReply.h" +#include "messages/MOSDMap.h" +#include 
"messages/MMonGetOSDMap.h" + +#include "messages/MOSDPGCreated.h" +#include "messages/MOSDPGNotify.h" +#include "messages/MOSDPGQuery.h" +#include "messages/MOSDPGLog.h" +#include "messages/MOSDPGRemove.h" +#include "messages/MOSDPGInfo.h" +#include "messages/MOSDPGCreate.h" +#include "messages/MOSDPGCreate2.h" +#include "messages/MOSDPGTrim.h" +#include "messages/MOSDScrub.h" +#include "messages/MOSDScrub2.h" +#include "messages/MOSDScrubReserve.h" +#include "messages/MOSDRepScrub.h" +#include "messages/MOSDRepScrubMap.h" +#include "messages/MOSDForceRecovery.h" +#include "messages/MOSDPGScan.h" +#include "messages/MOSDPGBackfill.h" +#include "messages/MOSDBackoff.h" +#include "messages/MOSDPGBackfillRemove.h" +#include "messages/MOSDPGRecoveryDelete.h" +#include "messages/MOSDPGRecoveryDeleteReply.h" +#include "messages/MOSDPGReadyToMerge.h" + +#include "messages/MRemoveSnaps.h" + +#include "messages/MMonMap.h" +#include "messages/MMonGetMap.h" +#include "messages/MMonGetVersion.h" +#include "messages/MMonGetVersionReply.h" +#include "messages/MMonHealth.h" +#include "messages/MMonHealthChecks.h" +#include "messages/MMonMetadata.h" +#include "messages/MDataPing.h" +#include "messages/MAuth.h" +#include "messages/MAuthReply.h" +#include "messages/MMonSubscribe.h" +#include "messages/MMonSubscribeAck.h" +#include "messages/MMonGlobalID.h" +#include "messages/MClientSession.h" +#include "messages/MClientReconnect.h" +#include "messages/MClientRequest.h" +#include "messages/MClientRequestForward.h" +#include "messages/MClientReply.h" +#include "messages/MClientReclaim.h" +#include "messages/MClientReclaimReply.h" +#include "messages/MClientCaps.h" +#include "messages/MClientCapRelease.h" +#include "messages/MClientLease.h" +#include "messages/MClientSnap.h" +#include "messages/MClientQuota.h" + +#include "messages/MMDSSlaveRequest.h" + +#include "messages/MMDSMap.h" +#include "messages/MFSMap.h" +#include "messages/MFSMapUser.h" +#include "messages/MMDSBeacon.h" +#include "messages/MMDSLoadTargets.h" +#include "messages/MMDSResolve.h" +#include "messages/MMDSResolveAck.h" +#include "messages/MMDSCacheRejoin.h" +#include "messages/MMDSFindIno.h" +#include "messages/MMDSFindInoReply.h" +#include "messages/MMDSOpenIno.h" +#include "messages/MMDSOpenInoReply.h" +#include "messages/MMDSSnapUpdate.h" + +#include "messages/MDirUpdate.h" +#include "messages/MDiscover.h" +#include "messages/MDiscoverReply.h" + +#include "messages/MMDSFragmentNotify.h" +#include "messages/MMDSFragmentNotifyAck.h" + +#include "messages/MExportDirDiscover.h" +#include "messages/MExportDirDiscoverAck.h" +#include "messages/MExportDirCancel.h" +#include "messages/MExportDirPrep.h" +#include "messages/MExportDirPrepAck.h" +#include "messages/MExportDir.h" +#include "messages/MExportDirAck.h" +#include "messages/MExportDirNotify.h" +#include "messages/MExportDirNotifyAck.h" +#include "messages/MExportDirFinish.h" + +#include "messages/MExportCaps.h" +#include "messages/MExportCapsAck.h" +#include "messages/MGatherCaps.h" + + +#include "messages/MDentryUnlink.h" +#include "messages/MDentryLink.h" + +#include "messages/MHeartbeat.h" + +#include "messages/MMDSTableRequest.h" + +//#include "messages/MInodeUpdate.h" +#include "messages/MCacheExpire.h" +#include "messages/MInodeFileCaps.h" + +#include "messages/MMgrBeacon.h" +#include "messages/MMgrMap.h" +#include "messages/MMgrDigest.h" +#include "messages/MMgrReport.h" +#include "messages/MMgrOpen.h" +#include "messages/MMgrClose.h" +#include "messages/MMgrConfigure.h" 
+#include "messages/MMonMgrReport.h" +#include "messages/MServiceMap.h" + +#include "messages/MLock.h" + +#include "messages/MWatchNotify.h" +#include "messages/MTimeCheck.h" +#include "messages/MTimeCheck2.h" + +#include "common/config.h" + +#include "messages/MOSDPGPush.h" +#include "messages/MOSDPGPushReply.h" +#include "messages/MOSDPGPull.h" + +#include "messages/MOSDECSubOpWrite.h" +#include "messages/MOSDECSubOpWriteReply.h" +#include "messages/MOSDECSubOpRead.h" +#include "messages/MOSDECSubOpReadReply.h" + +#include "messages/MOSDPGUpdateLogMissing.h" +#include "messages/MOSDPGUpdateLogMissingReply.h" + +#define DEBUGLVL 10 // debug level of output + +#define dout_subsys ceph_subsys_ms + +void Message::encode(uint64_t features, int crcflags) +{ + // encode and copy out of *m + if (empty_payload()) { + ceph_assert(middle.length() == 0); + encode_payload(features); + + if (byte_throttler) { + byte_throttler->take(payload.length() + middle.length()); + } + + // if the encoder didn't specify past compatibility, we assume it + // is incompatible. + if (header.compat_version == 0) + header.compat_version = header.version; + } + if (crcflags & MSG_CRC_HEADER) + calc_front_crc(); + + // update envelope + header.front_len = get_payload().length(); + header.middle_len = get_middle().length(); + header.data_len = get_data().length(); + if (crcflags & MSG_CRC_HEADER) + calc_header_crc(); + + footer.flags = CEPH_MSG_FOOTER_COMPLETE; + + if (crcflags & MSG_CRC_DATA) { + calc_data_crc(); + +#ifdef ENCODE_DUMP + bufferlist bl; + encode(get_header(), bl); + + // dump the old footer format + ceph_msg_footer_old old_footer; + old_footer.front_crc = footer.front_crc; + old_footer.middle_crc = footer.middle_crc; + old_footer.data_crc = footer.data_crc; + old_footer.flags = footer.flags; + encode(old_footer, bl); + + encode(get_payload(), bl); + encode(get_middle(), bl); + encode(get_data(), bl); + + // this is almost an exponential backoff, except because we count + // bits we tend to sample things we encode later, which should be + // more representative. 
+ static int i = 0; + i++; + int bits = 0; + for (unsigned t = i; t; bits++) + t &= t - 1; + if (bits <= 2) { + char fn[200]; + int status; + snprintf(fn, sizeof(fn), ENCODE_STRINGIFY(ENCODE_DUMP) "/%s__%d.%x", + abi::__cxa_demangle(typeid(*this).name(), 0, 0, &status), + getpid(), i++); + int fd = ::open(fn, O_WRONLY|O_TRUNC|O_CREAT|O_CLOEXEC, 0644); + if (fd >= 0) { + bl.write_fd(fd); + ::close(fd); + } + } +#endif + } else { + footer.flags = (unsigned)footer.flags | CEPH_MSG_FOOTER_NOCRC; + } +} + +void Message::dump(Formatter *f) const +{ + stringstream ss; + print(ss); + f->dump_string("summary", ss.str()); +} + +Message *decode_message(CephContext *cct, int crcflags, + ceph_msg_header& header, + ceph_msg_footer& footer, + bufferlist& front, bufferlist& middle, + bufferlist& data, Connection* conn) +{ + // verify crc + if (crcflags & MSG_CRC_HEADER) { + __u32 front_crc = front.crc32c(0); + __u32 middle_crc = middle.crc32c(0); + + if (front_crc != footer.front_crc) { + if (cct) { + ldout(cct, 0) << "bad crc in front " << front_crc << " != exp " << footer.front_crc + << " from " << conn->get_peer_addr() << dendl; + ldout(cct, 20) << " "; + front.hexdump(*_dout); + *_dout << dendl; + } + return 0; + } + if (middle_crc != footer.middle_crc) { + if (cct) { + ldout(cct, 0) << "bad crc in middle " << middle_crc << " != exp " << footer.middle_crc + << " from " << conn->get_peer_addr() << dendl; + ldout(cct, 20) << " "; + middle.hexdump(*_dout); + *_dout << dendl; + } + return 0; + } + } + if (crcflags & MSG_CRC_DATA) { + if ((footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0) { + __u32 data_crc = data.crc32c(0); + if (data_crc != footer.data_crc) { + if (cct) { + ldout(cct, 0) << "bad crc in data " << data_crc << " != exp " << footer.data_crc + << " from " << conn->get_peer_addr() << dendl; + ldout(cct, 20) << " "; + data.hexdump(*_dout); + *_dout << dendl; + } + return 0; + } + } + } + + // make message + Message::ref m; + int type = header.type; + switch (type) { + + // -- with payload -- + + case MSG_PGSTATS: + m = MPGStats::create(); + break; + case MSG_PGSTATSACK: + m = MPGStatsAck::create(); + break; + + case CEPH_MSG_STATFS: + m = MStatfs::create(); + break; + case CEPH_MSG_STATFS_REPLY: + m = MStatfsReply::create(); + break; + case MSG_GETPOOLSTATS: + m = MGetPoolStats::create(); + break; + case MSG_GETPOOLSTATSREPLY: + m = MGetPoolStatsReply::create(); + break; + case CEPH_MSG_POOLOP: + m = MPoolOp::create(); + break; + case CEPH_MSG_POOLOP_REPLY: + m = MPoolOpReply::create(); + break; + case MSG_MON_COMMAND: + m = MMonCommand::create(); + break; + case MSG_MON_COMMAND_ACK: + m = MMonCommandAck::create(); + break; + case MSG_MON_PAXOS: + m = MMonPaxos::create(); + break; + case MSG_CONFIG: + m = MConfig::create(); + break; + case MSG_GET_CONFIG: + m = MGetConfig::create(); + break; + + case MSG_MON_PROBE: + m = MMonProbe::create(); + break; + case MSG_MON_JOIN: + m = MMonJoin::create(); + break; + case MSG_MON_ELECTION: + m = MMonElection::create(); + break; + case MSG_MON_SYNC: + m = MMonSync::create(); + break; + case MSG_MON_SCRUB: + m = MMonScrub::create(); + break; + + case MSG_LOG: + m = MLog::create(); + break; + case MSG_LOGACK: + m = MLogAck::create(); + break; + + case CEPH_MSG_PING: + m = MPing::create(); + break; + case MSG_COMMAND: + m = MCommand::create(); + break; + case MSG_COMMAND_REPLY: + m = MCommandReply::create(); + break; + case MSG_OSD_BACKFILL_RESERVE: + m = MBackfillReserve::create(); + break; + case MSG_OSD_RECOVERY_RESERVE: + m = MRecoveryReserve::create(); + 
break; + case MSG_OSD_FORCE_RECOVERY: + m = MOSDForceRecovery::create(); + break; + + case MSG_ROUTE: + m = MRoute::create(); + break; + case MSG_FORWARD: + m = MForward::create(); + break; + + case CEPH_MSG_MON_MAP: + m = MMonMap::create(); + break; + case CEPH_MSG_MON_GET_MAP: + m = MMonGetMap::create(); + break; + case CEPH_MSG_MON_GET_OSDMAP: + m = MMonGetOSDMap::create(); + break; + case CEPH_MSG_MON_GET_VERSION: + m = MMonGetVersion::create(); + break; + case CEPH_MSG_MON_GET_VERSION_REPLY: + m = MMonGetVersionReply::create(); + break; + case CEPH_MSG_MON_METADATA: + m = MMonMetadata::create(); + break; + + case MSG_OSD_BOOT: + m = MOSDBoot::create(); + break; + case MSG_OSD_ALIVE: + m = MOSDAlive::create(); + break; + case MSG_OSD_BEACON: + m = MOSDBeacon::create(); + break; + case MSG_OSD_PGTEMP: + m = MOSDPGTemp::create(); + break; + case MSG_OSD_FAILURE: + m = MOSDFailure::create(); + break; + case MSG_OSD_MARK_ME_DOWN: + m = MOSDMarkMeDown::create(); + break; + case MSG_OSD_FULL: + m = MOSDFull::create(); + break; + case MSG_OSD_PING: + m = MOSDPing::create(); + break; + case CEPH_MSG_OSD_OP: + m = MOSDOp::create(); + break; + case CEPH_MSG_OSD_OPREPLY: + m = MOSDOpReply::create(); + break; + case MSG_OSD_REPOP: + m = MOSDRepOp::create(); + break; + case MSG_OSD_REPOPREPLY: + m = MOSDRepOpReply::create(); + break; + case MSG_OSD_PG_CREATED: + m = MOSDPGCreated::create(); + break; + case MSG_OSD_PG_UPDATE_LOG_MISSING: + m = MOSDPGUpdateLogMissing::create(); + break; + case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY: + m = MOSDPGUpdateLogMissingReply::create(); + break; + case CEPH_MSG_OSD_BACKOFF: + m = MOSDBackoff::create(); + break; + + case CEPH_MSG_OSD_MAP: + m = MOSDMap::create(); + break; + + case CEPH_MSG_WATCH_NOTIFY: + m = MWatchNotify::create(); + break; + + case MSG_OSD_PG_NOTIFY: + m = MOSDPGNotify::create(); + break; + case MSG_OSD_PG_QUERY: + m = MOSDPGQuery::create(); + break; + case MSG_OSD_PG_LOG: + m = MOSDPGLog::create(); + break; + case MSG_OSD_PG_REMOVE: + m = MOSDPGRemove::create(); + break; + case MSG_OSD_PG_INFO: + m = MOSDPGInfo::create(); + break; + case MSG_OSD_PG_CREATE: + m = MOSDPGCreate::create(); + break; + case MSG_OSD_PG_CREATE2: + m = MOSDPGCreate2::create(); + break; + case MSG_OSD_PG_TRIM: + m = MOSDPGTrim::create(); + break; + + case MSG_OSD_SCRUB: + m = MOSDScrub::create(); + break; + case MSG_OSD_SCRUB2: + m = MOSDScrub2::create(); + break; + case MSG_OSD_SCRUB_RESERVE: + m = MOSDScrubReserve::create(); + break; + case MSG_REMOVE_SNAPS: + m = MRemoveSnaps::create(); + break; + case MSG_OSD_REP_SCRUB: + m = MOSDRepScrub::create(); + break; + case MSG_OSD_REP_SCRUBMAP: + m = MOSDRepScrubMap::create(); + break; + case MSG_OSD_PG_SCAN: + m = MOSDPGScan::create(); + break; + case MSG_OSD_PG_BACKFILL: + m = MOSDPGBackfill::create(); + break; + case MSG_OSD_PG_BACKFILL_REMOVE: + m = MOSDPGBackfillRemove::create(); + break; + case MSG_OSD_PG_PUSH: + m = MOSDPGPush::create(); + break; + case MSG_OSD_PG_PULL: + m = MOSDPGPull::create(); + break; + case MSG_OSD_PG_PUSH_REPLY: + m = MOSDPGPushReply::create(); + break; + case MSG_OSD_PG_RECOVERY_DELETE: + m = MOSDPGRecoveryDelete::create(); + break; + case MSG_OSD_PG_RECOVERY_DELETE_REPLY: + m = MOSDPGRecoveryDeleteReply::create(); + break; + case MSG_OSD_PG_READY_TO_MERGE: + m = MOSDPGReadyToMerge::create(); + break; + case MSG_OSD_EC_WRITE: + m = MOSDECSubOpWrite::create(); + break; + case MSG_OSD_EC_WRITE_REPLY: + m = MOSDECSubOpWriteReply::create(); + break; + case MSG_OSD_EC_READ: + m = 
MOSDECSubOpRead::create(); + break; + case MSG_OSD_EC_READ_REPLY: + m = MOSDECSubOpReadReply::create(); + break; + // auth + case CEPH_MSG_AUTH: + m = MAuth::create(); + break; + case CEPH_MSG_AUTH_REPLY: + m = MAuthReply::create(); + break; + + case MSG_MON_GLOBAL_ID: + m = MMonGlobalID::create(); + break; + + // clients + case CEPH_MSG_MON_SUBSCRIBE: + m = MMonSubscribe::create(); + break; + case CEPH_MSG_MON_SUBSCRIBE_ACK: + m = MMonSubscribeAck::create(); + break; + case CEPH_MSG_CLIENT_SESSION: + m = MClientSession::create(); + break; + case CEPH_MSG_CLIENT_RECONNECT: + m = MClientReconnect::create(); + break; + case CEPH_MSG_CLIENT_REQUEST: + m = MClientRequest::create(); + break; + case CEPH_MSG_CLIENT_REQUEST_FORWARD: + m = MClientRequestForward::create(); + break; + case CEPH_MSG_CLIENT_REPLY: + m = MClientReply::create(); + break; + case CEPH_MSG_CLIENT_RECLAIM: + m = MClientReclaim::create(); + break; + case CEPH_MSG_CLIENT_RECLAIM_REPLY: + m = MClientReclaimReply::create(); + break; + case CEPH_MSG_CLIENT_CAPS: + m = MClientCaps::create(); + break; + case CEPH_MSG_CLIENT_CAPRELEASE: + m = MClientCapRelease::create(); + break; + case CEPH_MSG_CLIENT_LEASE: + m = MClientLease::create(); + break; + case CEPH_MSG_CLIENT_SNAP: + m = MClientSnap::create(); + break; + case CEPH_MSG_CLIENT_QUOTA: + m = MClientQuota::create(); + break; + + // mds + case MSG_MDS_SLAVE_REQUEST: + m = MMDSSlaveRequest::create(); + break; + + case CEPH_MSG_MDS_MAP: + m = MMDSMap::create(); + break; + case CEPH_MSG_FS_MAP: + m = MFSMap::create(); + break; + case CEPH_MSG_FS_MAP_USER: + m = MFSMapUser::create(); + break; + case MSG_MDS_BEACON: + m = MMDSBeacon::create(); + break; + case MSG_MDS_OFFLOAD_TARGETS: + m = MMDSLoadTargets::create(); + break; + case MSG_MDS_RESOLVE: + m = MMDSResolve::create(); + break; + case MSG_MDS_RESOLVEACK: + m = MMDSResolveAck::create(); + break; + case MSG_MDS_CACHEREJOIN: + m = MMDSCacheRejoin::create(); + break; + + case MSG_MDS_DIRUPDATE: + m = MDirUpdate::create(); + break; + + case MSG_MDS_DISCOVER: + m = MDiscover::create(); + break; + case MSG_MDS_DISCOVERREPLY: + m = MDiscoverReply::create(); + break; + + case MSG_MDS_FINDINO: + m = MMDSFindIno::create(); + break; + case MSG_MDS_FINDINOREPLY: + m = MMDSFindInoReply::create(); + break; + + case MSG_MDS_OPENINO: + m = MMDSOpenIno::create(); + break; + case MSG_MDS_OPENINOREPLY: + m = MMDSOpenInoReply::create(); + break; + + case MSG_MDS_SNAPUPDATE: + m = MMDSSnapUpdate::create(); + break; + + case MSG_MDS_FRAGMENTNOTIFY: + m = MMDSFragmentNotify::create(); + break; + + case MSG_MDS_FRAGMENTNOTIFYACK: + m = MMDSFragmentNotifyAck::create(); + break; + + case MSG_MDS_EXPORTDIRDISCOVER: + m = MExportDirDiscover::create(); + break; + case MSG_MDS_EXPORTDIRDISCOVERACK: + m = MExportDirDiscoverAck::create(); + break; + case MSG_MDS_EXPORTDIRCANCEL: + m = MExportDirCancel::create(); + break; + + case MSG_MDS_EXPORTDIR: + m = MExportDir::create(); + break; + case MSG_MDS_EXPORTDIRACK: + m = MExportDirAck::create(); + break; + case MSG_MDS_EXPORTDIRFINISH: + m = MExportDirFinish::create(); + break; + + case MSG_MDS_EXPORTDIRNOTIFY: + m = MExportDirNotify::create(); + break; + + case MSG_MDS_EXPORTDIRNOTIFYACK: + m = MExportDirNotifyAck::create(); + break; + + case MSG_MDS_EXPORTDIRPREP: + m = MExportDirPrep::create(); + break; + + case MSG_MDS_EXPORTDIRPREPACK: + m = MExportDirPrepAck::create(); + break; + + case MSG_MDS_EXPORTCAPS: + m = MExportCaps::create(); + break; + case MSG_MDS_EXPORTCAPSACK: + m = 
MExportCapsAck::create(); + break; + case MSG_MDS_GATHERCAPS: + m = MGatherCaps::create(); + break; + + + case MSG_MDS_DENTRYUNLINK: + m = MDentryUnlink::create(); + break; + case MSG_MDS_DENTRYLINK: + m = MDentryLink::create(); + break; + + case MSG_MDS_HEARTBEAT: + m = MHeartbeat::create(); + break; + + case MSG_MDS_CACHEEXPIRE: + m = MCacheExpire::create(); + break; + + case MSG_MDS_TABLE_REQUEST: + m = MMDSTableRequest::create(); + break; + + /* case MSG_MDS_INODEUPDATE: + m = MInodeUpdate::create(); + break; + */ + + case MSG_MDS_INODEFILECAPS: + m = MInodeFileCaps::create(); + break; + + case MSG_MDS_LOCK: + m = MLock::create(); + break; + + case MSG_MGR_BEACON: + m = MMgrBeacon::create(); + break; + + case MSG_MON_MGR_REPORT: + m = MMonMgrReport::create(); + break; + + case MSG_SERVICE_MAP: + m = MServiceMap::create(); + break; + + case MSG_MGR_MAP: + m = MMgrMap::create(); + break; + + case MSG_MGR_DIGEST: + m = MMgrDigest::create(); + break; + + case MSG_MGR_OPEN: + m = MMgrOpen::create(); + break; + + case MSG_MGR_CLOSE: + m = MMgrClose::create(); + break; + + case MSG_MGR_REPORT: + m = MMgrReport::create(); + break; + + case MSG_MGR_CONFIGURE: + m = MMgrConfigure::create(); + break; + + case MSG_TIMECHECK: + m = MTimeCheck::create(); + break; + case MSG_TIMECHECK2: + m = MTimeCheck2::create(); + break; + + case MSG_MON_HEALTH: + m = MMonHealth::create(); + break; + + case MSG_MON_HEALTH_CHECKS: + m = MMonHealthChecks::create(); + break; + +#if defined(HAVE_XIO) + case MSG_DATA_PING: + m = MDataPing::create(); + break; +#endif + // -- simple messages without payload -- + + case CEPH_MSG_SHUTDOWN: + m = MGenericMessage::create(type); + break; + + default: + if (cct) { + ldout(cct, 0) << "can't decode unknown message type " << type << " MSG_AUTH=" << CEPH_MSG_AUTH << dendl; + if (cct->_conf->ms_die_on_bad_msg) + ceph_abort(); + } + return 0; + } + + m->set_cct(cct); + + // m->header.version, if non-zero, should be populated with the + // newest version of the encoding the code supports. If set, check + // it against compat_version. + if (m->get_header().version && + m->get_header().version < header.compat_version) { + if (cct) { + ldout(cct, 0) << "will not decode message of type " << type + << " version " << header.version + << " because compat_version " << header.compat_version + << " > supported version " << m->get_header().version << dendl; + if (cct->_conf->ms_die_on_bad_msg) + ceph_abort(); + } + return 0; + } + + m->set_connection(conn); + m->set_header(header); + m->set_footer(footer); + m->set_payload(front); + m->set_middle(middle); + m->set_data(data); + + try { + m->decode_payload(); + } + catch (const buffer::error &e) { + if (cct) { + lderr(cct) << "failed to decode message of type " << type + << " v" << header.version + << ": " << e.what() << dendl; + ldout(cct, ceph::dout::need_dynamic( + cct->_conf->ms_dump_corrupt_message_level)) << "dump: \n"; + m->get_payload().hexdump(*_dout); + *_dout << dendl; + if (cct->_conf->ms_die_on_bad_msg) + ceph_abort(); + } + return 0; + } + + // done! 
+ return m.detach(); +} + +void Message::encode_trace(bufferlist &bl, uint64_t features) const +{ + using ceph::encode; + auto p = trace.get_info(); + static const blkin_trace_info empty = { 0, 0, 0 }; + if (!p) { + p = ∅ + } + encode(*p, bl); +} + +void Message::decode_trace(bufferlist::const_iterator &p, bool create) +{ + blkin_trace_info info = {}; + decode(info, p); + +#ifdef WITH_BLKIN + if (!connection) + return; + + const auto msgr = connection->get_messenger(); + const auto endpoint = msgr->get_trace_endpoint(); + if (info.trace_id) { + trace.init(get_type_name(), endpoint, &info, true); + trace.event("decoded trace"); + } else if (create || (msgr->get_myname().is_osd() && + msgr->cct->_conf->osd_blkin_trace_all)) { + // create a trace even if we didn't get one on the wire + trace.init(get_type_name(), endpoint); + trace.event("created trace"); + } + trace.keyval("tid", get_tid()); + trace.keyval("entity type", get_source().type_str()); + trace.keyval("entity num", get_source().num()); +#endif +} + + +// This routine is not used for ordinary messages, but only when encapsulating a message +// for forwarding and routing. It's also used in a backward compatibility test, which only +// effectively tests backward compability for those functions. To avoid backward compatibility +// problems, we currently always encode and decode using the old footer format that doesn't +// allow for message authentication. Eventually we should fix that. PLR + +void encode_message(Message *msg, uint64_t features, bufferlist& payload) +{ + bufferlist front, middle, data; + ceph_msg_footer_old old_footer; + ceph_msg_footer footer; + msg->encode(features, MSG_CRC_ALL); + encode(msg->get_header(), payload); + + // Here's where we switch to the old footer format. PLR + + footer = msg->get_footer(); + old_footer.front_crc = footer.front_crc; + old_footer.middle_crc = footer.middle_crc; + old_footer.data_crc = footer.data_crc; + old_footer.flags = footer.flags; + encode(old_footer, payload); + + encode(msg->get_payload(), payload); + encode(msg->get_middle(), payload); + encode(msg->get_data(), payload); +} + +// See above for somewhat bogus use of the old message footer. We switch to the current footer +// after decoding the old one so the other form of decode_message() doesn't have to change. +// We've slipped in a 0 signature at this point, so any signature checking after this will +// fail. PLR + +Message *decode_message(CephContext *cct, int crcflags, bufferlist::const_iterator& p) +{ + ceph_msg_header h; + ceph_msg_footer_old fo; + ceph_msg_footer f; + bufferlist fr, mi, da; + decode(h, p); + decode(fo, p); + f.front_crc = fo.front_crc; + f.middle_crc = fo.middle_crc; + f.data_crc = fo.data_crc; + f.flags = fo.flags; + f.sig = 0; + decode(fr, p); + decode(mi, p); + decode(da, p); + return decode_message(cct, crcflags, h, f, fr, mi, da, nullptr); +} diff --git a/src/msg/Message.h b/src/msg/Message.h new file mode 100644 index 00000000..42405ed3 --- /dev/null +++ b/src/msg/Message.h @@ -0,0 +1,577 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MESSAGE_H +#define CEPH_MESSAGE_H + +#include <stdlib.h> +#include <ostream> +#include <string_view> + +#include <boost/intrusive/list.hpp> + +#include "include/Context.h" +#include "common/RefCountedObj.h" +#include "common/ThrottleInterface.h" +#include "common/config.h" +#include "common/debug.h" +#include "common/zipkin_trace.h" +#include "include/ceph_assert.h" // Because intrusive_ptr clobbers our assert... +#include "include/buffer.h" +#include "include/types.h" +#include "msg/Connection.h" +#include "msg/MessageRef.h" +#include "msg_types.h" + +// monitor internal +#define MSG_MON_SCRUB 64 +#define MSG_MON_ELECTION 65 +#define MSG_MON_PAXOS 66 +#define MSG_MON_PROBE 67 +#define MSG_MON_JOIN 68 +#define MSG_MON_SYNC 69 + +/* monitor <-> mon admin tool */ +#define MSG_MON_COMMAND 50 +#define MSG_MON_COMMAND_ACK 51 +#define MSG_LOG 52 +#define MSG_LOGACK 53 + +#define MSG_GETPOOLSTATS 58 +#define MSG_GETPOOLSTATSREPLY 59 + +#define MSG_MON_GLOBAL_ID 60 + +#define MSG_ROUTE 47 +#define MSG_FORWARD 46 + +#define MSG_PAXOS 40 + +#define MSG_CONFIG 62 +#define MSG_GET_CONFIG 63 + + +// osd internal +#define MSG_OSD_PING 70 +#define MSG_OSD_BOOT 71 +#define MSG_OSD_FAILURE 72 +#define MSG_OSD_ALIVE 73 +#define MSG_OSD_MARK_ME_DOWN 74 +#define MSG_OSD_FULL 75 + +// removed right after luminous +//#define MSG_OSD_SUBOP 76 +//#define MSG_OSD_SUBOPREPLY 77 + +#define MSG_OSD_PGTEMP 78 + +#define MSG_OSD_BEACON 79 + +#define MSG_OSD_PG_NOTIFY 80 +#define MSG_OSD_PG_QUERY 81 +#define MSG_OSD_PG_LOG 83 +#define MSG_OSD_PG_REMOVE 84 +#define MSG_OSD_PG_INFO 85 +#define MSG_OSD_PG_TRIM 86 + +#define MSG_PGSTATS 87 +#define MSG_PGSTATSACK 88 + +#define MSG_OSD_PG_CREATE 89 +#define MSG_REMOVE_SNAPS 90 + +#define MSG_OSD_SCRUB 91 +#define MSG_OSD_SCRUB_RESERVE 92 // previous PG_MISSING +#define MSG_OSD_REP_SCRUB 93 + +#define MSG_OSD_PG_SCAN 94 +#define MSG_OSD_PG_BACKFILL 95 +#define MSG_OSD_PG_BACKFILL_REMOVE 96 + +#define MSG_COMMAND 97 +#define MSG_COMMAND_REPLY 98 + +#define MSG_OSD_BACKFILL_RESERVE 99 +#define MSG_OSD_RECOVERY_RESERVE 150 +#define MSG_OSD_FORCE_RECOVERY 151 + +#define MSG_OSD_PG_PUSH 105 +#define MSG_OSD_PG_PULL 106 +#define MSG_OSD_PG_PUSH_REPLY 107 + +#define MSG_OSD_EC_WRITE 108 +#define MSG_OSD_EC_WRITE_REPLY 109 +#define MSG_OSD_EC_READ 110 +#define MSG_OSD_EC_READ_REPLY 111 + +#define MSG_OSD_REPOP 112 +#define MSG_OSD_REPOPREPLY 113 +#define MSG_OSD_PG_UPDATE_LOG_MISSING 114 +#define MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY 115 + +#define MSG_OSD_PG_CREATED 116 +#define MSG_OSD_REP_SCRUBMAP 117 +#define MSG_OSD_PG_RECOVERY_DELETE 118 +#define MSG_OSD_PG_RECOVERY_DELETE_REPLY 119 +#define MSG_OSD_PG_CREATE2 120 +#define MSG_OSD_SCRUB2 121 + +#define MSG_OSD_PG_READY_TO_MERGE 122 + +// *** MDS *** + +#define MSG_MDS_BEACON 100 // to monitor +#define MSG_MDS_SLAVE_REQUEST 101 +#define MSG_MDS_TABLE_REQUEST 102 + + // 150 already in use (MSG_OSD_RECOVERY_RESERVE) + +#define MSG_MDS_RESOLVE 0x200 +#define MSG_MDS_RESOLVEACK 0x201 +#define MSG_MDS_CACHEREJOIN 0x202 +#define MSG_MDS_DISCOVER 0x203 +#define MSG_MDS_DISCOVERREPLY 0x204 +#define MSG_MDS_INODEUPDATE 0x205 +#define MSG_MDS_DIRUPDATE 0x206 +#define MSG_MDS_CACHEEXPIRE 0x207 +#define MSG_MDS_DENTRYUNLINK 0x208 +#define MSG_MDS_FRAGMENTNOTIFY 0x209 +#define MSG_MDS_OFFLOAD_TARGETS 0x20a +#define MSG_MDS_DENTRYLINK 0x20c +#define MSG_MDS_FINDINO 0x20d +#define MSG_MDS_FINDINOREPLY 0x20e +#define MSG_MDS_OPENINO 0x20f +#define MSG_MDS_OPENINOREPLY 0x210 +#define MSG_MDS_SNAPUPDATE 0x211 +#define 
MSG_MDS_FRAGMENTNOTIFYACK 0x212 +#define MSG_MDS_LOCK 0x300 +#define MSG_MDS_INODEFILECAPS 0x301 + +#define MSG_MDS_EXPORTDIRDISCOVER 0x449 +#define MSG_MDS_EXPORTDIRDISCOVERACK 0x450 +#define MSG_MDS_EXPORTDIRCANCEL 0x451 +#define MSG_MDS_EXPORTDIRPREP 0x452 +#define MSG_MDS_EXPORTDIRPREPACK 0x453 +#define MSG_MDS_EXPORTDIRWARNING 0x454 +#define MSG_MDS_EXPORTDIRWARNINGACK 0x455 +#define MSG_MDS_EXPORTDIR 0x456 +#define MSG_MDS_EXPORTDIRACK 0x457 +#define MSG_MDS_EXPORTDIRNOTIFY 0x458 +#define MSG_MDS_EXPORTDIRNOTIFYACK 0x459 +#define MSG_MDS_EXPORTDIRFINISH 0x460 + +#define MSG_MDS_EXPORTCAPS 0x470 +#define MSG_MDS_EXPORTCAPSACK 0x471 +#define MSG_MDS_GATHERCAPS 0x472 + +#define MSG_MDS_HEARTBEAT 0x500 // for mds load balancer + +// *** generic *** +#define MSG_TIMECHECK 0x600 +#define MSG_MON_HEALTH 0x601 + +// *** Message::encode() crcflags bits *** +#define MSG_CRC_DATA (1 << 0) +#define MSG_CRC_HEADER (1 << 1) +#define MSG_CRC_ALL (MSG_CRC_DATA | MSG_CRC_HEADER) + +// Xio Testing +#define MSG_DATA_PING 0x602 + +// Xio intends to define messages 0x603..0x606 + +// Special +#define MSG_NOP 0x607 + +#define MSG_MON_HEALTH_CHECKS 0x608 +#define MSG_TIMECHECK2 0x609 + +// *** ceph-mgr <-> OSD/MDS daemons *** +#define MSG_MGR_OPEN 0x700 +#define MSG_MGR_CONFIGURE 0x701 +#define MSG_MGR_REPORT 0x702 + +// *** ceph-mgr <-> ceph-mon *** +#define MSG_MGR_BEACON 0x703 + +// *** ceph-mon(MgrMonitor) -> OSD/MDS daemons *** +#define MSG_MGR_MAP 0x704 + +// *** ceph-mon(MgrMonitor) -> ceph-mgr +#define MSG_MGR_DIGEST 0x705 +// *** cephmgr -> ceph-mon +#define MSG_MON_MGR_REPORT 0x706 +#define MSG_SERVICE_MAP 0x707 + +#define MSG_MGR_CLOSE 0x708 + +// ====================================================== + +// abstract Message class + +namespace bi = boost::intrusive; + +// XioMessenger conditional trace flags +#define MSG_MAGIC_XIO 0x0002 +#define MSG_MAGIC_TRACE_XCON 0x0004 +#define MSG_MAGIC_TRACE_DTOR 0x0008 +#define MSG_MAGIC_TRACE_HDR 0x0010 +#define MSG_MAGIC_TRACE_XIO 0x0020 +#define MSG_MAGIC_TRACE_XMSGR 0x0040 +#define MSG_MAGIC_TRACE_CTR 0x0080 + +// XioMessenger diagnostic "ping pong" flag (resend msg when send completes) +#define MSG_MAGIC_REDUPE 0x0100 + +class Message : public RefCountedObject { +protected: + ceph_msg_header header; // headerelope + ceph_msg_footer footer; + bufferlist payload; // "front" unaligned blob + bufferlist middle; // "middle" unaligned blob + bufferlist data; // data payload (page-alignment will be preserved where possible) + + /* recv_stamp is set when the Messenger starts reading the + * Message off the wire */ + utime_t recv_stamp; + /* dispatch_stamp is set when the Messenger starts calling dispatch() on + * its endpoints */ + utime_t dispatch_stamp; + /* throttle_stamp is the point at which we got throttle */ + utime_t throttle_stamp; + /* time at which message was fully read */ + utime_t recv_complete_stamp; + + ConnectionRef connection; + + uint32_t magic = 0; + + bi::list_member_hook<> dispatch_q; + +public: + using ref = MessageRef; + using const_ref = MessageConstRef; + + // zipkin tracing + ZTracer::Trace trace; + void encode_trace(bufferlist &bl, uint64_t features) const; + void decode_trace(bufferlist::const_iterator &p, bool create = false); + + class CompletionHook : public Context { + protected: + Message *m; + friend class Message; + public: + explicit CompletionHook(Message *_m) : m(_m) {} + virtual void set_message(Message *_m) { m = _m; } + }; + + typedef bi::list< Message, + bi::member_hook< Message, + bi::list_member_hook<>, + 
&Message::dispatch_q > > Queue; + +protected: + CompletionHook* completion_hook = nullptr; // owned by Messenger + + // release our size in bytes back to this throttler when our payload + // is adjusted or when we are destroyed. + ThrottleInterface *byte_throttler = nullptr; + + // release a count back to this throttler when we are destroyed + ThrottleInterface *msg_throttler = nullptr; + + // keep track of how big this message was when we reserved space in + // the msgr dispatch_throttler, so that we can properly release it + // later. this is necessary because messages can enter the dispatch + // queue locally (not via read_message()), and those are not + // currently throttled. + uint64_t dispatch_throttle_size = 0; + + friend class Messenger; + +public: + Message() { + memset(&header, 0, sizeof(header)); + memset(&footer, 0, sizeof(footer)); + } + Message(int t, int version=1, int compat_version=0) { + memset(&header, 0, sizeof(header)); + header.type = t; + header.version = version; + header.compat_version = compat_version; + header.priority = 0; // undef + header.data_off = 0; + memset(&footer, 0, sizeof(footer)); + } + + Message *get() { + return static_cast<Message *>(RefCountedObject::get()); + } + +protected: + ~Message() override { + if (byte_throttler) + byte_throttler->put(payload.length() + middle.length() + data.length()); + release_message_throttle(); + trace.event("message destructed"); + /* call completion hooks (if any) */ + if (completion_hook) + completion_hook->complete(0); + } +public: + const ConnectionRef& get_connection() const { return connection; } + void set_connection(const ConnectionRef& c) { + connection = c; + } + CompletionHook* get_completion_hook() { return completion_hook; } + void set_completion_hook(CompletionHook *hook) { completion_hook = hook; } + void set_byte_throttler(ThrottleInterface *t) { + byte_throttler = t; + } + void set_message_throttler(ThrottleInterface *t) { + msg_throttler = t; + } + + void set_dispatch_throttle_size(uint64_t s) { dispatch_throttle_size = s; } + uint64_t get_dispatch_throttle_size() const { return dispatch_throttle_size; } + + const ceph_msg_header &get_header() const { return header; } + ceph_msg_header &get_header() { return header; } + void set_header(const ceph_msg_header &e) { header = e; } + void set_footer(const ceph_msg_footer &e) { footer = e; } + const ceph_msg_footer &get_footer() const { return footer; } + ceph_msg_footer &get_footer() { return footer; } + void set_src(const entity_name_t& src) { header.src = src; } + + uint32_t get_magic() const { return magic; } + void set_magic(int _magic) { magic = _magic; } + + /* + * If you use get_[data, middle, payload] you shouldn't + * use it to change those bufferlists unless you KNOW + * there is no throttle being used. The other + * functions are throttling-aware as appropriate. 
+ */ + + void clear_payload() { + if (byte_throttler) { + byte_throttler->put(payload.length() + middle.length()); + } + payload.clear(); + middle.clear(); + } + + virtual void clear_buffers() {} + void clear_data() { + if (byte_throttler) + byte_throttler->put(data.length()); + data.clear(); + clear_buffers(); // let subclass drop buffers as well + } + void release_message_throttle() { + if (msg_throttler) + msg_throttler->put(); + msg_throttler = nullptr; + } + + bool empty_payload() const { return payload.length() == 0; } + bufferlist& get_payload() { return payload; } + const bufferlist& get_payload() const { return payload; } + void set_payload(bufferlist& bl) { + if (byte_throttler) + byte_throttler->put(payload.length()); + payload.claim(bl, buffer::list::CLAIM_ALLOW_NONSHAREABLE); + if (byte_throttler) + byte_throttler->take(payload.length()); + } + + void set_middle(bufferlist& bl) { + if (byte_throttler) + byte_throttler->put(middle.length()); + middle.claim(bl, buffer::list::CLAIM_ALLOW_NONSHAREABLE); + if (byte_throttler) + byte_throttler->take(middle.length()); + } + bufferlist& get_middle() { return middle; } + + void set_data(const bufferlist &bl) { + if (byte_throttler) + byte_throttler->put(data.length()); + data.share(bl); + if (byte_throttler) + byte_throttler->take(data.length()); + } + + const bufferlist& get_data() const { return data; } + bufferlist& get_data() { return data; } + void claim_data(bufferlist& bl, + unsigned int flags = buffer::list::CLAIM_DEFAULT) { + if (byte_throttler) + byte_throttler->put(data.length()); + bl.claim(data, flags); + } + off_t get_data_len() const { return data.length(); } + + void set_recv_stamp(utime_t t) { recv_stamp = t; } + const utime_t& get_recv_stamp() const { return recv_stamp; } + void set_dispatch_stamp(utime_t t) { dispatch_stamp = t; } + const utime_t& get_dispatch_stamp() const { return dispatch_stamp; } + void set_throttle_stamp(utime_t t) { throttle_stamp = t; } + const utime_t& get_throttle_stamp() const { return throttle_stamp; } + void set_recv_complete_stamp(utime_t t) { recv_complete_stamp = t; } + const utime_t& get_recv_complete_stamp() const { return recv_complete_stamp; } + + void calc_header_crc() { + header.crc = ceph_crc32c(0, (unsigned char*)&header, + sizeof(header) - sizeof(header.crc)); + } + void calc_front_crc() { + footer.front_crc = payload.crc32c(0); + footer.middle_crc = middle.crc32c(0); + } + void calc_data_crc() { + footer.data_crc = data.crc32c(0); + } + + virtual int get_cost() const { + return data.length(); + } + + // type + int get_type() const { return header.type; } + void set_type(int t) { header.type = t; } + + uint64_t get_tid() const { return header.tid; } + void set_tid(uint64_t t) { header.tid = t; } + + uint64_t get_seq() const { return header.seq; } + void set_seq(uint64_t s) { header.seq = s; } + + unsigned get_priority() const { return header.priority; } + void set_priority(__s16 p) { header.priority = p; } + + // source/dest + entity_inst_t get_source_inst() const { + return entity_inst_t(get_source(), get_source_addr()); + } + entity_name_t get_source() const { + return entity_name_t(header.src); + } + entity_addr_t get_source_addr() const { + if (connection) + return connection->get_peer_addr(); + return entity_addr_t(); + } + entity_addrvec_t get_source_addrs() const { + if (connection) + return connection->get_peer_addrs(); + return entity_addrvec_t(); + } + + // forwarded? 
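Each throttling-aware setter above follows the same ordering: put the old length back, install the new buffer, then take the new length. A toy sketch of that discipline, with a plain counter standing in for Ceph's `ThrottleInterface`:

```cpp
#include <cstdint>
#include <string>
#include <utility>

// Toy throttle with the take()/put() contract the byte_throttler assumes.
struct ToyThrottle {
  uint64_t held = 0;
  void take(uint64_t n) { held += n; }
  void put(uint64_t n)  { held -= n; }
};

// The set_payload()/set_middle()/set_data() pattern: put old, swap, take
// new, so `held` always equals the bytes this message currently pins.
void replace_buffer(ToyThrottle* t, std::string& slot, std::string next) {
  if (t) t->put(slot.size());
  slot = std::move(next);
  if (t) t->take(slot.size());
}
```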
+ entity_inst_t get_orig_source_inst() const { + return get_source_inst(); + } + entity_name_t get_orig_source() const { + return get_source(); + } + entity_addr_t get_orig_source_addr() const { + return get_source_addr(); + } + entity_addrvec_t get_orig_source_addrs() const { + return get_source_addrs(); + } + + // virtual bits + virtual void decode_payload() = 0; + virtual void encode_payload(uint64_t features) = 0; + virtual std::string_view get_type_name() const = 0; + virtual void print(ostream& out) const { + out << get_type_name() << " magic: " << magic; + } + + virtual void dump(Formatter *f) const; + + void encode(uint64_t features, int crcflags); +}; + +extern Message *decode_message(CephContext *cct, int crcflags, + ceph_msg_header &header, + ceph_msg_footer& footer, bufferlist& front, + bufferlist& middle, bufferlist& data, + Connection* conn); +inline ostream& operator<<(ostream& out, const Message& m) { + m.print(out); + if (m.get_header().version) + out << " v" << m.get_header().version; + return out; +} + +extern void encode_message(Message *m, uint64_t features, bufferlist& bl); +extern Message *decode_message(CephContext *cct, int crcflags, + bufferlist::const_iterator& bl); + +template <class MessageType> +class MessageFactory { +public: +template<typename... Args> + static typename MessageType::ref build(Args&&... args) { + return typename MessageType::ref(new MessageType(std::forward<Args>(args)...), false); + } +}; + +template<class T, class M = Message> +class MessageSubType : public M { +public: + typedef boost::intrusive_ptr<T> ref; + typedef boost::intrusive_ptr<T const> const_ref; + + static auto msgref_cast(typename M::ref const& m) { + return boost::static_pointer_cast<typename T::const_ref::element_type, typename std::remove_reference<decltype(m)>::type::element_type>(m); + } + static auto msgref_cast(typename M::const_ref const& m) { + return boost::static_pointer_cast<typename T::ref::element_type, typename std::remove_reference<decltype(m)>::type::element_type>(m); + } + +protected: +template<typename... Args> + MessageSubType(Args&&... args) : M(std::forward<Args>(args)...) {} + virtual ~MessageSubType() override {} +}; + + +template<class T, class M = Message> +class MessageInstance : public MessageSubType<T, M> { +public: + using factory = MessageFactory<T>; + + template<typename... Args> + static auto create(Args&&... args) { + return MessageFactory<T>::build(std::forward<Args>(args)...); + } + static auto msgref_cast(typename Message::ref const& m) { + return boost::static_pointer_cast<typename T::ref::element_type, typename std::remove_reference<decltype(m)>::type::element_type>(m); + } + static auto msgref_cast(typename Message::const_ref const& m) { + return boost::static_pointer_cast<typename T::const_ref::element_type, typename std::remove_reference<decltype(m)>::type::element_type>(m); + } + +protected: +template<typename... Args> + MessageInstance(Args&&... args) : MessageSubType<T,M>(std::forward<Args>(args)...) {} + virtual ~MessageInstance() override {} +}; + +#endif diff --git a/src/msg/MessageRef.h b/src/msg/MessageRef.h new file mode 100644 index 00000000..c2bd3152 --- /dev/null +++ b/src/msg/MessageRef.h @@ -0,0 +1,186 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2018 Red Hat, Inc. 
<contact@redhat.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MESSAGEREF_H +#define CEPH_MESSAGEREF_H + +#include <boost/intrusive_ptr.hpp> + +template<typename T> +using MRef = boost::intrusive_ptr<T>; +template<typename T> +using MConstRef = boost::intrusive_ptr<T const>; + +using MessageRef = MRef<class Message>; +using MessageConstRef = MConstRef<class Message>; + +/* cd src/messages/ && for f in *; do printf 'class '; basename "$f" .h | tr -d '\n'; printf ';\n'; done >> ../msg/MessageRef.h */ + +class MAuth; +class MAuthReply; +class MBackfillReserve; +class MCacheExpire; +class MClientCapRelease; +class MClientCaps; +class MClientLease; +class MClientQuota; +class MClientReclaim; +class MClientReclaimReply; +class MClientReconnect; +class MClientReply; +class MClientRequestForward; +class MClientRequest; +class MClientSession; +class MClientSnap; +class MCommand; +class MCommandReply; +class MConfig; +class MDataPing; +class MDentryLink; +class MDentryUnlink; +class MDirUpdate; +class MDiscover; +class MDiscoverReply; +class MExportCapsAck; +class MExportCaps; +class MExportDirAck; +class MExportDirCancel; +class MExportDirDiscoverAck; +class MExportDirDiscover; +class MExportDirFinish; +class MExportDir; +class MExportDirNotifyAck; +class MExportDirNotify; +class MExportDirPrepAck; +class MExportDirPrep; +class MForward; +class MFSMap; +class MFSMapUser; +class MGatherCaps; +class MGenericMessage; +class MGetConfig; +class MGetPoolStats; +class MGetPoolStatsReply; +class MHeartbeat; +class MInodeFileCaps; +class MLock; +class MLogAck; +class MLog; +class MMDSBeacon; +class MMDSCacheRejoin; +class MMDSFindIno; +class MMDSFindInoReply; +class MMDSFragmentNotifyAck; +class MMDSFragmentNotify; +class MMDSLoadTargets; +class MMDSMap; +class MMDSOpenIno; +class MMDSOpenInoReply; +class MMDSResolveAck; +class MMDSResolve; +class MMDSSlaveRequest; +class MMDSSnapUpdate; +class MMDSTableRequest; +class MMgrBeacon; +class MMgrClose; +class MMgrConfigure; +class MMgrDigest; +class MMgrMap; +class MMgrOpen; +class MMgrReport; +class MMonCommandAck; +class MMonCommand; +class MMonElection; +class MMonGetMap; +class MMonGetOSDMap; +class MMonGetVersion; +class MMonGetVersionReply; +class MMonGlobalID; +class MMonHealthChecks; +class MMonHealth; +class MMonJoin; +class MMonMap; +class MMonMetadata; +class MMonMgrReport; +class MMonPaxos; +class MMonProbe; +class MMonQuorumService; +class MMonScrub; +class MMonSubscribeAck; +class MMonSubscribe; +class MMonSync; +class MNop; +class MOSDAlive; +class MOSDBackoff; +class MOSDBeacon; +class MOSDBoot; +class MOSDECSubOpRead; +class MOSDECSubOpReadReply; +class MOSDECSubOpWrite; +class MOSDECSubOpWriteReply; +class MOSDFailure; +class MOSDFastDispatchOp; +class MOSDForceRecovery; +class MOSDFull; +class MOSDMap; +class MOSDMarkMeDown; +class MOSDOp; +class MOSDOpReply; +class MOSDPeeringOp; +class MOSDPGBackfill; +class MOSDPGBackfillRemove; +class MOSDPGCreate2; +class MOSDPGCreated; +class MOSDPGCreate; +class MOSDPGInfo; +class MOSDPGLog; +class MOSDPGNotify; +class MOSDPGPull; +class MOSDPGPush; +class MOSDPGPushReply; +class MOSDPGQuery; +class MOSDPGReadyToMerge; +class MOSDPGRecoveryDelete; +class MOSDPGRecoveryDeleteReply; +class MOSDPGRemove; +class MOSDPGScan; +class MOSDPGTemp; +class MOSDPGTrim; +class MOSDPGUpdateLogMissing; +class 
MOSDPGUpdateLogMissingReply; +class MOSDPing; +class MOSDRepOp; +class MOSDRepOpReply; +class MOSDRepScrub; +class MOSDRepScrubMap; +class MOSDScrub2; +class MOSDScrub; +class MOSDScrubReserve; +class MPGStatsAck; +class MPGStats; +class MPing; +class MPoolOp; +class MPoolOpReply; +class MRecoveryReserve; +class MRemoveSnaps; +class MRoute; +class MServiceMap; +class MStatfs; +class MStatfsReply; +class MTimeCheck2; +class MTimeCheck; +class MWatchNotify; +class PaxosServiceMessage; + +#endif diff --git a/src/msg/Messenger.cc b/src/msg/Messenger.cc new file mode 100644 index 00000000..efeab390 --- /dev/null +++ b/src/msg/Messenger.cc @@ -0,0 +1,166 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <netdb.h> + +#include "include/types.h" +#include "include/random.h" + +#include "Messenger.h" + +#include "msg/simple/SimpleMessenger.h" +#include "msg/async/AsyncMessenger.h" +#ifdef HAVE_XIO +#include "msg/xio/XioMessenger.h" +#endif + +Messenger *Messenger::create_client_messenger(CephContext *cct, string lname) +{ + std::string public_msgr_type = cct->_conf->ms_public_type.empty() ? cct->_conf.get_val<std::string>("ms_type") : cct->_conf->ms_public_type; + auto nonce = ceph::util::generate_random_number<uint64_t>(); + return Messenger::create(cct, public_msgr_type, entity_name_t::CLIENT(), + std::move(lname), nonce, 0); +} + +Messenger *Messenger::create(CephContext *cct, const string &type, + entity_name_t name, string lname, + uint64_t nonce, uint64_t cflags) +{ + int r = -1; + if (type == "random") { + r = ceph::util::generate_random_number(0, 1); + } + if (r == 0 || type == "simple") + return new SimpleMessenger(cct, name, std::move(lname), nonce); + else if (r == 1 || type.find("async") != std::string::npos) + return new AsyncMessenger(cct, name, type, std::move(lname), nonce); +#ifdef HAVE_XIO + else if ((type == "xio") && + cct->check_experimental_feature_enabled("ms-type-xio")) + return new XioMessenger(cct, name, std::move(lname), nonce, cflags); +#endif + lderr(cct) << "unrecognized ms_type '" << type << "'" << dendl; + return nullptr; +} + +/** + * Get the default crc flags for this messenger. + */ +static int get_default_crc_flags(const ConfigProxy&); + +Messenger::Messenger(CephContext *cct_, entity_name_t w) + : trace_endpoint("0.0.0.0", 0, "Messenger"), + my_name(w), + default_send_priority(CEPH_MSG_PRIO_DEFAULT), + started(false), + magic(0), + socket_priority(-1), + cct(cct_), + crcflags(get_default_crc_flags(cct->_conf)), + auth_registry(cct) +{ + auth_registry.refresh_config(); +} + +void Messenger::set_endpoint_addr(const entity_addr_t& a, + const entity_name_t &name) +{ + size_t hostlen; + if (a.get_family() == AF_INET) + hostlen = sizeof(struct sockaddr_in); + else if (a.get_family() == AF_INET6) + hostlen = sizeof(struct sockaddr_in6); + else + hostlen = 0; + + if (hostlen) { + char buf[NI_MAXHOST] = { 0 }; + getnameinfo(a.get_sockaddr(), hostlen, buf, sizeof(buf), + NULL, 0, NI_NUMERICHOST); + + trace_endpoint.copy_ip(buf); + } + trace_endpoint.set_port(a.get_port()); +} + +/** + * Get the default crc flags for this messenger. + * + * Pre-calculate desired software CRC settings. CRC computation may + * be disabled by default for some transports (e.g., those with strong + * hardware checksum support).
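A self-contained sketch of the flag computation this comment describes; the `MSG_CRC_*` values mirror the ones defined in Message.h, while `Conf` is a hypothetical stand-in for `ConfigProxy`:

```cpp
// MSG_CRC_* bit values as defined in Message.h.
constexpr int MSG_CRC_DATA   = 1 << 0;
constexpr int MSG_CRC_HEADER = 1 << 1;
constexpr int MSG_CRC_ALL    = MSG_CRC_DATA | MSG_CRC_HEADER;

// Hypothetical config stand-in for the two ms_crc_* options read here.
struct Conf { bool ms_crc_data = true; bool ms_crc_header = true; };

int default_crc_flags(const Conf& conf) {
  int r = 0;
  if (conf.ms_crc_data)   r |= MSG_CRC_DATA;
  if (conf.ms_crc_header) r |= MSG_CRC_HEADER;
  return r;  // 0 means "trust the transport's own checksumming"
}
```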
+ */ +int get_default_crc_flags(const ConfigProxy& conf) +{ + int r = 0; + if (conf->ms_crc_data) + r |= MSG_CRC_DATA; + if (conf->ms_crc_header) + r |= MSG_CRC_HEADER; + return r; +} + +int Messenger::bindv(const entity_addrvec_t& addrs) +{ + return bind(addrs.legacy_addr()); +} + +bool Messenger::ms_deliver_verify_authorizer( + Connection *con, + int peer_type, + int protocol, + bufferlist& authorizer, + bufferlist& authorizer_reply, + bool& isvalid, + CryptoKey& session_key, + std::string *connection_secret, + std::unique_ptr<AuthAuthorizerChallenge> *challenge) +{ + if (authorizer.length() == 0) { + for (auto dis : dispatchers) { + if (!dis->require_authorizer) { + //ldout(cct,10) << __func__ << " tolerating missing authorizer" << dendl; + isvalid = true; + return true; + } + } + } + AuthAuthorizeHandler *ah = auth_registry.get_handler(peer_type, protocol); + if (get_mytype() == CEPH_ENTITY_TYPE_MON && + peer_type != CEPH_ENTITY_TYPE_MON) { + // the monitor doesn't do authenticators for msgr1. + isvalid = true; + return true; + } + if (!ah) { + lderr(cct) << __func__ << " no AuthAuthorizeHandler found for protocol " + << protocol << dendl; + isvalid = false; + return false; + } + + for (auto dis : dispatchers) { + KeyStore *ks = dis->ms_get_auth1_authorizer_keystore(); + if (ks) { + isvalid = ah->verify_authorizer( + cct, + ks, + authorizer, + 0, + &authorizer_reply, + &con->peer_name, + &con->peer_global_id, + &con->peer_caps_info, + &session_key, + connection_secret, + challenge); + if (isvalid) { + return dis->ms_handle_authentication(con)>=0; + } + return true; + } + } + return false; +} diff --git a/src/msg/Messenger.h b/src/msg/Messenger.h new file mode 100644 index 00000000..2602765c --- /dev/null +++ b/src/msg/Messenger.h @@ -0,0 +1,837 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#ifndef CEPH_MESSENGER_H +#define CEPH_MESSENGER_H + +#include <map> +#include <deque> + +#include <errno.h> +#include <sstream> +#include <memory> + +#include "Message.h" +#include "Dispatcher.h" +#include "Policy.h" +#include "common/Cond.h" +#include "common/Mutex.h" +#include "common/Throttle.h" +#include "include/Context.h" +#include "include/types.h" +#include "include/ceph_features.h" +#include "auth/Crypto.h" +#include "common/item_history.h" +#include "auth/AuthRegistry.h" +#include "include/ceph_assert.h" + +#include <errno.h> +#include <sstream> +#include <signal.h> + +#define SOCKET_PRIORITY_MIN_DELAY 6 + +class Timer; + +class AuthClient; +class AuthServer; + +#ifdef UNIT_TESTS_BUILT + +struct Interceptor { + std::mutex lock; + std::condition_variable cond_var; + + enum ACTION : uint32_t { + CONTINUE = 0, + FAIL, + STOP + }; + + virtual ~Interceptor() {} + virtual ACTION intercept(Connection *conn, uint32_t step) = 0; +}; + +#endif + +class Messenger { +private: + std::deque<Dispatcher*> dispatchers; + std::deque<Dispatcher*> fast_dispatchers; + ZTracer::Endpoint trace_endpoint; + +protected: + void set_endpoint_addr(const entity_addr_t& a, + const entity_name_t &name); + +protected: + /// the "name" of the local daemon. 
eg client.99 + entity_name_t my_name; + + /// my addr + safe_item_history<entity_addrvec_t> my_addrs; + + int default_send_priority; + /// set to true once the Messenger has started, and set to false on shutdown + bool started; + uint32_t magic; + int socket_priority; + +public: + AuthClient *auth_client = 0; + AuthServer *auth_server = 0; + +#ifdef UNIT_TESTS_BUILT + Interceptor *interceptor = nullptr; +#endif + + /** + * Various Messenger conditional config/type flags to allow + * different "transport" Messengers to tune themselves + */ + static const int HAS_HEAVY_TRAFFIC = 0x0001; + static const int HAS_MANY_CONNECTIONS = 0x0002; + static const int HEARTBEAT = 0x0004; + + /** + * The CephContext this Messenger uses. Many other components initialize themselves + * from this value. + */ + CephContext *cct; + int crcflags; + + using Policy = ceph::net::Policy<Throttle>; + +protected: + // for authentication + AuthRegistry auth_registry; + +public: + /** + * Messenger constructor. Call this from your implementation. + * Messenger users should construct full implementations directly, + * or use the create() function. + */ + Messenger(CephContext *cct_, entity_name_t w); + virtual ~Messenger() {} + + /** + * create a new messenger + * + * Create a new messenger instance, with whatever implementation is + * available or specified via the configuration in cct. + * + * @param cct context + * @param type name of messenger type + * @param name entity name to register + * @param lname logical name of the messenger in this process (e.g., "client") + * @param nonce nonce value to uniquely identify this instance on the current host + * @param features bits for the local connection + * @param cflags general set of flags to configure transport resources + */ + static Messenger *create(CephContext *cct, + const string &type, + entity_name_t name, + string lname, + uint64_t nonce, + uint64_t cflags); + + /** + * create a new messenger + * + * Create a new messenger instance. + * Same as the above, but a slightly simpler interface for clients: + * - Generate a random nonce + * - use the default feature bits + * - get the messenger type from cct + * - use the client entity_type + * + * @param cct context + * @param lname logical name of the messenger in this process (e.g., "client") + */ + static Messenger *create_client_messenger(CephContext *cct, string lname); + + /** + * @defgroup Accessors + * @{ + */ + int get_mytype() const { return my_name.type(); } + + /** + * Retrieve the Messenger's name + * + * @return A const reference to the name this Messenger + * currently believes to be its own. + */ + const entity_name_t& get_myname() { return my_name; } + + /** + * Retrieve the Messenger's address. + * + * @return A const reference to the address this Messenger + * currently believes to be its own. + */ + const entity_addrvec_t& get_myaddrs() { + return *my_addrs; + } + + /** + * get legacy addr for myself, suitable for protocol v1 + * + * Note that myaddrs might be a proper addrvec with v1 in it, or it might be an + * ANY addr (if i am a pure client). 
+ */ + entity_addr_t get_myaddr_legacy() { + return my_addrs->as_legacy_addr(); + } + + + /** + * set messenger's instance + */ + uint32_t get_magic() { return magic; } + void set_magic(int _magic) { magic = _magic; } + + void set_auth_client(AuthClient *ac) { + auth_client = ac; + } + void set_auth_server(AuthServer *as) { + auth_server = as; + } + +protected: + /** + * set messenger's address + */ + virtual void set_myaddrs(const entity_addrvec_t& a) { + my_addrs = a; + set_endpoint_addr(a.front(), my_name); + } +public: + /** + * @return the zipkin trace endpoint + */ + const ZTracer::Endpoint* get_trace_endpoint() const { + return &trace_endpoint; + } + + /** + * Set the name of the local entity. The name is reported to others and + * can be changed while the system is running, but doing so at incorrect + * times may have bad results. + * + * @param m The name to set. + */ + void set_myname(const entity_name_t& m) { my_name = m; } + + /** + * Set the unknown address components for this Messenger. + * This is useful if the Messenger doesn't know its full address just by + * binding, but another Messenger on the same interface has already learned + * its full address. This function does not fill in known address elements, + * cause a rebind, or do anything of that sort. + * + * @param addr The address to use as a template. + */ + virtual bool set_addr_unknowns(const entity_addrvec_t &addrs) = 0; + /** + * Set the address for this Messenger. This is useful if the Messenger + * binds to a specific address but advertises a different address on the + * the network. + * + * @param addr The address to use. + */ + virtual void set_addrs(const entity_addrvec_t &addr) = 0; + /// Get the default send priority. + int get_default_send_priority() { return default_send_priority; } + /** + * Get the number of Messages which the Messenger has received + * but not yet dispatched. + */ + virtual int get_dispatch_queue_len() = 0; + + /** + * Get age of oldest undelivered message + * (0 if the queue is empty) + */ + virtual double get_dispatch_queue_max_age(utime_t now) = 0; + + /** + * @} // Accessors + */ + + /** + * @defgroup Configuration + * @{ + */ + /** + * Set the cluster protocol in use by this daemon. + * This is an init-time function and cannot be called after calling + * start() or bind(). + * + * @param p The cluster protocol to use. Defined externally. + */ + virtual void set_cluster_protocol(int p) = 0; + /** + * Set a policy which is applied to all peers who do not have a type-specific + * Policy. + * This is an init-time function and cannot be called after calling + * start() or bind(). + * + * @param p The Policy to apply. + */ + virtual void set_default_policy(Policy p) = 0; + /** + * Set a policy which is applied to all peers of the given type. + * This is an init-time function and cannot be called after calling + * start() or bind(). + * + * @param type The peer type this policy applies to. + * @param p The policy to apply. + */ + virtual void set_policy(int type, Policy p) = 0; + /** + * Set the Policy associated with a type of peer. + * + * This can be called either on initial setup, or after connections + * are already established. However, the policies for existing + * connections will not be affected; the new policy will only apply + * to future connections. + * + * @param t The peer type to get the default policy for. + * @return A const Policy reference. + */ + virtual Policy get_policy(int t) = 0; + /** + * Get the default Policy + * + * @return A const Policy reference. 
+ */ + virtual Policy get_default_policy() = 0; + /** + * Set Throttlers applied to all Messages from the given type of peer + * + * This is an init-time function and cannot be called after calling + * start() or bind(). + * + * @param type The peer type the Throttlers will apply to. + * @param bytes The Throttle for the number of bytes carried by the message + * @param msgs The Throttle for the number of messages for this @p type + * @note The Messenger does not take ownership of the Throttle pointers, but + * you must not destroy them before you destroy the Messenger. + */ + virtual void set_policy_throttlers(int type, Throttle *bytes, Throttle *msgs=NULL) = 0; + /** + * Set the default send priority + * + * This is an init-time function and must be called *before* calling + * start(). + * + * @param p The default send priority. + */ + void set_default_send_priority(int p) { + ceph_assert(!started); + default_send_priority = p; + } + /** + * Set the priority (SO_PRIORITY) for all packets to be sent on this socket. + * + * Linux uses this value to order the networking queues: packets with a higher + * priority may be processed first depending on the selected device queueing + * discipline. + * + * @param prio The priority. Setting a priority outside the range 0 to 6 + * requires the CAP_NET_ADMIN capability. + */ + void set_socket_priority(int prio) { + socket_priority = prio; + } + /** + * Get the socket priority + * + * @return the socket priority + */ + int get_socket_priority() { + return socket_priority; + } + /** + * Add a new Dispatcher to the front of the list. If you add + * a Dispatcher which is already included, it will get a duplicate + * entry. This will reduce efficiency but not break anything. + * + * @param d The Dispatcher to insert into the list. + */ + void add_dispatcher_head(Dispatcher *d) { + bool first = dispatchers.empty(); + dispatchers.push_front(d); + if (d->ms_can_fast_dispatch_any()) + fast_dispatchers.push_front(d); + if (first) + ready(); + } + /** + * Add a new Dispatcher to the end of the list. If you add + * a Dispatcher which is already included, it will get a duplicate + * entry. This will reduce efficiency but not break anything. + * + * @param d The Dispatcher to insert into the list. + */ + void add_dispatcher_tail(Dispatcher *d) { + bool first = dispatchers.empty(); + dispatchers.push_back(d); + if (d->ms_can_fast_dispatch_any()) + fast_dispatchers.push_back(d); + if (first) + ready(); + } + /** + * Bind the Messenger to a specific address. If bind_addr + * is not completely filled in the system will use the + * valid portions and cycle through the unset ones (eg, the port) + * in an unspecified order. + * + * @param bind_addr The address to bind to. + * @return 0 on success, or -1 on error, or -errno if + * we can be more specific about the failure. + */ + virtual int bind(const entity_addr_t& bind_addr) = 0; + + /** + * This function performs a full restart of the Messenger component, + * whatever that means. Other entities who connect to this + * Messenger post-rebind() should perceive it as a new entity which + * they have not previously contacted, and it MUST bind to a + * different address than it did previously. + * + * @param avoid_ports Additional ports to avoid binding to. + */ + virtual int rebind(const set<int>& avoid_ports) { return -EOPNOTSUPP; } + /** + * Bind the 'client' Messenger to a specific address. The Messenger will bind + * to this address before connecting to others when the option + * ms_bind_before_connect is true.
+ * @param bind_addr The address to bind to. + * @return 0 on success, or -1 on error, or -errno if + */ + virtual int client_bind(const entity_addr_t& bind_addr) = 0; + + virtual int bindv(const entity_addrvec_t& addrs); + + + virtual bool should_use_msgr2() { + return false; + } + + /** + * @} // Configuration + */ + + /** + * @defgroup Startup/Shutdown + * @{ + */ + /** + * Perform any resource allocation, thread startup, etc + * that is required before attempting to connect to other + * Messengers or transmit messages. + * Once this function completes, started shall be set to true. + * + * @return 0 on success; -errno on failure. + */ + virtual int start() { started = true; return 0; } + + // shutdown + /** + * Block until the Messenger has finished shutting down (according + * to the shutdown() function). + * It is valid to call this after calling shutdown(), but it must + * be called before deleting the Messenger. + */ + virtual void wait() = 0; + /** + * Initiate a shutdown of the Messenger. + * + * @return 0 on success, -errno otherwise. + */ + virtual int shutdown() { started = false; return 0; } + /** + * @} // Startup/Shutdown + */ + + /** + * @defgroup Messaging + * @{ + */ + /** + * Queue the given Message for the given entity. + * Success in this function does not guarantee Message delivery, only + * success in queueing the Message. Other guarantees may be provided based + * on the Connection policy associated with the dest. + * + * @param m The Message to send. The Messenger consumes a single reference + * when you pass it in. + * @param dest The entity to send the Message to. + * + * DEPRECATED: please do not use this interface for any new code; + * use the Connection* variant. + * + * @return 0 on success, or -errno on failure. + */ + virtual int send_to( + Message *m, + int type, + const entity_addrvec_t& addr) = 0; + int send_to_mon( + Message *m, const entity_addrvec_t& addrs) { + return send_to(m, CEPH_ENTITY_TYPE_MON, addrs); + } + int send_to_mds( + Message *m, const entity_addrvec_t& addrs) { + return send_to(m, CEPH_ENTITY_TYPE_MDS, addrs); + } + int send_to_osd( + Message *m, const entity_addrvec_t& addrs) { + return send_to(m, CEPH_ENTITY_TYPE_OSD, addrs); + } + int send_to_mgr( + Message *m, const entity_addrvec_t& addrs) { + return send_to(m, CEPH_ENTITY_TYPE_MGR, addrs); + } + + /** + * @} // Messaging + */ + /** + * @defgroup Connection Management + * @{ + */ + /** + * Get the Connection object associated with a given entity. If a + * Connection does not exist, create one and establish a logical connection. + * The caller owns a reference when this returns. Call ->put() when you're + * done! + * + * @param dest The entity to get a connection for. + */ + virtual ConnectionRef connect_to( + int type, const entity_addrvec_t& dest) = 0; + ConnectionRef connect_to_mon(const entity_addrvec_t& dest) { + return connect_to(CEPH_ENTITY_TYPE_MON, dest); + } + ConnectionRef connect_to_mds(const entity_addrvec_t& dest) { + return connect_to(CEPH_ENTITY_TYPE_MDS, dest); + } + ConnectionRef connect_to_osd(const entity_addrvec_t& dest) { + return connect_to(CEPH_ENTITY_TYPE_OSD, dest); + } + ConnectionRef connect_to_mgr(const entity_addrvec_t& dest) { + return connect_to(CEPH_ENTITY_TYPE_MGR, dest); + } + + /** + * Get the Connection object associated with ourselves. + */ + virtual ConnectionRef get_loopback_connection() = 0; + /** + * Mark down a Connection to a remote. 
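connect_to() above promises lookup-or-create semantics with a shared handle either way. A minimal sketch of that contract, with `std::shared_ptr` standing in for the intrusive `ConnectionRef` and integer keys standing in for entity addresses:

```cpp
#include <map>
#include <memory>
#include <utility>

struct ToyConn { int type; int dest; };
using ConnRef = std::shared_ptr<ToyConn>;  // stands in for ConnectionRef

// Sketch of the connect_to() contract: one logical Connection per
// (type, dest), created on first use and shared thereafter.
class ToyMessenger {
  std::map<std::pair<int, int>, ConnRef> conns;
public:
  ConnRef connect_to(int type, int dest) {
    auto key = std::make_pair(type, dest);
    auto it = conns.find(key);
    if (it != conns.end())
      return it->second;                       // existing logical session
    auto c = std::make_shared<ToyConn>(ToyConn{type, dest});
    conns.emplace(key, c);                     // create and remember
    return c;
  }
};
```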
+ * + * This will cause us to discard our outgoing queue for them, and if + * reset detection is enabled in the policy and the endpoint tries + * to reconnect they will discard their queue when we inform them of + * the session reset. + * + * If there is no Connection to the given dest, it is a no-op. + * + * This generates a RESET notification to the Dispatcher. + * + * DEPRECATED: please do not use this interface for any new code; + * use the Connection* variant. + * + * @param a The address to mark down. + */ + virtual void mark_down(const entity_addr_t& a) = 0; + virtual void mark_down_addrs(const entity_addrvec_t& a) { + mark_down(a.legacy_addr()); + } + /** + * Mark all the existing Connections down. This is equivalent + * to iterating over all Connections and calling mark_down() + * on each. + * + * This will generate a RESET event for each closed connections. + */ + virtual void mark_down_all() = 0; + /** + * @} // Connection Management + */ +protected: + /** + * @defgroup Subclass Interfacing + * @{ + */ + /** + * A courtesy function for Messenger implementations which + * will be called when we receive our first Dispatcher. + */ + virtual void ready() { } + /** + * @} // Subclass Interfacing + */ +public: +#ifdef CEPH_USE_SIGPIPE_BLOCKER + /** + * We need to disable SIGPIPE on all platforms, and if they + * don't give us a better mechanism (read: are on Solaris) that + * means blocking the signal whenever we do a send or sendmsg... + * That means any implementations must invoke MSGR_SIGPIPE_STOPPER in-scope + * whenever doing so. On most systems that's blank, but on systems where + * it's needed we construct an RAII object to plug and un-plug the SIGPIPE. + * See http://www.microhowto.info/howto/ignore_sigpipe_without_affecting_other_threads_in_a_process.html + */ + struct sigpipe_stopper { + bool blocked; + sigset_t existing_mask; + sigset_t pipe_mask; + sigpipe_stopper() { + sigemptyset(&pipe_mask); + sigaddset(&pipe_mask, SIGPIPE); + sigset_t signals; + sigemptyset(&signals); + sigpending(&signals); + if (sigismember(&signals, SIGPIPE)) { + blocked = false; + } else { + blocked = true; + int r = pthread_sigmask(SIG_BLOCK, &pipe_mask, &existing_mask); + ceph_assert(r == 0); + } + } + ~sigpipe_stopper() { + if (blocked) { + struct timespec nowait{0}; + int r = sigtimedwait(&pipe_mask, 0, &nowait); + ceph_assert(r == EAGAIN || r == 0); + r = pthread_sigmask(SIG_SETMASK, &existing_mask, 0); + ceph_assert(r == 0); + } + } + }; +# define MSGR_SIGPIPE_STOPPER Messenger::sigpipe_stopper stopper(); +#else +# define MSGR_SIGPIPE_STOPPER +#endif + /** + * @defgroup Dispatcher Interfacing + * @{ + */ + /** + * Determine whether a message can be fast-dispatched. We will + * query each Dispatcher in sequence to determine if they are + * capable of handling a particular message via "fast dispatch". + * + * @param m The Message we are testing. + */ + bool ms_can_fast_dispatch(const Message::const_ref& m) { + for (const auto &dispatcher : fast_dispatchers) { + if (dispatcher->ms_can_fast_dispatch2(m)) + return true; + } + return false; + } + + /** + * Deliver a single Message via "fast dispatch". + * + * @param m The Message we are fast dispatching. + * If none of our Dispatchers can handle it, ceph_abort(). 
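A minimal sketch of the claim-or-abort contract just described, with `std::function` pairs standing in for a Dispatcher's `ms_can_fast_dispatch2()`/`ms_fast_dispatch2()` hooks:

```cpp
#include <cstdlib>
#include <functional>
#include <vector>

struct Msg { int type = 0; };

// Stand-in for the two fast-dispatch hooks a Dispatcher exposes.
struct FastDispatcher {
  std::function<bool(const Msg&)> can_fast_dispatch;
  std::function<void(const Msg&)> fast_dispatch;
};

// Same shape as ms_fast_dispatch(): the first dispatcher that claims the
// message handles it; falling off the end is a programming error because
// the caller is expected to have checked ms_can_fast_dispatch() first.
void fast_dispatch(const std::vector<FastDispatcher>& ds, const Msg& m) {
  for (const auto& d : ds) {
    if (d.can_fast_dispatch(m)) {
      d.fast_dispatch(m);
      return;
    }
  }
  std::abort();  // mirrors ceph_abort()
}
```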
+ */ + void ms_fast_dispatch(const Message::ref &m) { + m->set_dispatch_stamp(ceph_clock_now()); + for (const auto &dispatcher : fast_dispatchers) { + if (dispatcher->ms_can_fast_dispatch2(m)) { + dispatcher->ms_fast_dispatch2(m); + return; + } + } + ceph_abort(); + } + void ms_fast_dispatch(Message *m) { + return ms_fast_dispatch(Message::ref(m, false)); /* consume ref */ + } + /** + * + */ + void ms_fast_preprocess(const Message::ref &m) { + for (const auto &dispatcher : fast_dispatchers) { + dispatcher->ms_fast_preprocess2(m); + } + } + /** + * Deliver a single Message. Send it to each Dispatcher + * in sequence until one of them handles it. + * If none of our Dispatchers can handle it, ceph_abort(). + * + * @param m The Message to deliver. + */ + void ms_deliver_dispatch(const Message::ref &m) { + m->set_dispatch_stamp(ceph_clock_now()); + for (const auto &dispatcher : dispatchers) { + if (dispatcher->ms_dispatch2(m)) + return; + } + lsubdout(cct, ms, 0) << "ms_deliver_dispatch: unhandled message " << m << " " << *m << " from " + << m->get_source_inst() << dendl; + ceph_assert(!cct->_conf->ms_die_on_unhandled_msg); + } + void ms_deliver_dispatch(Message *m) { + return ms_deliver_dispatch(Message::ref(m, false)); /* consume ref */ + } + /** + * Notify each Dispatcher of a new Connection. Call + * this function whenever a new Connection is initiated or + * reconnects. + * + * @param con Pointer to the new Connection. + */ + void ms_deliver_handle_connect(Connection *con) { + for (const auto& dispatcher : dispatchers) { + dispatcher->ms_handle_connect(con); + } + } + + /** + * Notify each fast Dispatcher of a new Connection. Call + * this function whenever a new Connection is initiated or + * reconnects. + * + * @param con Pointer to the new Connection. + */ + void ms_deliver_handle_fast_connect(Connection *con) { + for (const auto& dispatcher : fast_dispatchers) { + dispatcher->ms_handle_fast_connect(con); + } + } + + /** + * Notify each Dispatcher of a new incoming Connection. Call + * this function whenever a new Connection is accepted. + * + * @param con Pointer to the new Connection. + */ + void ms_deliver_handle_accept(Connection *con) { + for (const auto& dispatcher : dispatchers) { + dispatcher->ms_handle_accept(con); + } + } + + /** + * Notify each fast Dispatcher of a new incoming Connection. Call + * this function whenever a new Connection is accepted. + * + * @param con Pointer to the new Connection. + */ + void ms_deliver_handle_fast_accept(Connection *con) { + for (const auto& dispatcher : fast_dispatchers) { + dispatcher->ms_handle_fast_accept(con); + } + } + + /** + * Notify each Dispatcher of a Connection which may have lost + * Messages. Call this function whenever you detect that a lossy Connection + * has been disconnected. + * + * @param con Pointer to the broken Connection. + */ + void ms_deliver_handle_reset(Connection *con) { + for (const auto& dispatcher : dispatchers) { + if (dispatcher->ms_handle_reset(con)) + return; + } + } + /** + * Notify each Dispatcher of a Connection which has been "forgotten" about + * by the remote end, implying that messages have probably been lost. + * Call this function whenever you detect a reset. + * + * @param con Pointer to the broken Connection. + */ + void ms_deliver_handle_remote_reset(Connection *con) { + for (const auto& dispatcher : dispatchers) { + dispatcher->ms_handle_remote_reset(con); + } + } + + /** + * Notify each Dispatcher of a Connection for which reconnection + * attempts are being refused. 
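Two delivery shapes appear in the helpers above: reset and refused notifications stop at the first Dispatcher that claims the event, while remote-reset notifications go to every Dispatcher unconditionally. A side-by-side sketch with stand-in handler types:

```cpp
#include <functional>
#include <vector>

struct Conn {};

// ms_deliver_handle_reset() / ms_deliver_handle_refused() shape: stop at
// the first Dispatcher that claims the event by returning true.
void deliver_until_claimed(const std::vector<std::function<bool(Conn*)>>& hs,
                           Conn* c) {
  for (const auto& h : hs)
    if (h(c))
      return;
}

// ms_deliver_handle_remote_reset() shape: every Dispatcher is notified
// and return values play no part.
void deliver_to_all(const std::vector<std::function<void(Conn*)>>& hs,
                    Conn* c) {
  for (const auto& h : hs)
    h(c);
}
```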
Call this function whenever you + * detect that a lossy Connection has been disconnected and it's + * impossible to reconnect. + * + * @param con Pointer to the broken Connection. + */ + void ms_deliver_handle_refused(Connection *con) { + for (const auto& dispatcher : dispatchers) { + if (dispatcher->ms_handle_refused(con)) + return; + } + } + + /** + * Get the AuthAuthorizer for a new outgoing Connection. + * + * @param peer_type The peer type for the new Connection + * @return A pointer to the AuthAuthorizer, if we have one; NULL otherwise + */ + AuthAuthorizer *ms_deliver_get_authorizer(int peer_type) { + AuthAuthorizer *a = 0; + for (const auto& dispatcher : dispatchers) { + if (dispatcher->ms_get_authorizer(peer_type, &a)) + return a; + } + return NULL; + } + /** + * Verify that the authorizer on a new incoming Connection is correct. + * + * @param con The new incoming Connection + * @param peer_type The type of the endpoint on the new Connection + * @param protocol The ID of the protocol in use (at time of writing, cephx or none) + * @param authorizer The authorization string supplied by the remote + * @param authorizer_reply Output param: The string we should send back to + * the remote to authorize ourselves. Only filled in if isvalid + * @param isvalid Output param: True if authorizer is valid, false otherwise + * + * @return True if we were able to prove or disprove correctness of + * authorizer, false otherwise. + */ + bool ms_deliver_verify_authorizer( + Connection *con, int peer_type, + int protocol, bufferlist& authorizer, bufferlist& authorizer_reply, + bool& isvalid, + CryptoKey& session_key, + std::string *connection_secret, + std::unique_ptr<AuthAuthorizerChallenge> *challenge); + + /** + * @} // Dispatcher Interfacing + */ +}; + + + +#endif diff --git a/src/msg/Policy.h b/src/msg/Policy.h new file mode 100644 index 00000000..5d13ffb8 --- /dev/null +++ b/src/msg/Policy.h @@ -0,0 +1,117 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#pragma once + +#include "include/ceph_features.h" + +namespace ceph::net { + +using peer_type_t = int; + +/** + * A Policy describes the rules of a Connection. Is there a limit on how + * much data this Connection can have locally? When the underlying connection + * experiences an error, does the Connection disappear? Can this Messenger + * re-establish the underlying connection? + */ +template<class ThrottleType> +struct Policy { + /// If true, the Connection is tossed out on errors. + bool lossy; + /// If true, the underlying connection can't be re-established from this end. + bool server; + /// If true, we will standby when idle + bool standby; + /// If true, we will try to detect session resets + bool resetcheck; + /** + * The throttler is used to limit how much data is held by Messages from + * the associated Connection(s). When reading in a new Message, the Messenger + * will call throttler->throttle() for the size of the new Message. + */ + ThrottleType* throttler_bytes; + ThrottleType* throttler_messages; + + /// Specify features supported locally by the endpoint. + uint64_t features_supported; + /// Specify features any remotes must have to talk to this endpoint.
+ uint64_t features_required; + + Policy() + : lossy(false), server(false), standby(false), resetcheck(true), + throttler_bytes(NULL), + throttler_messages(NULL), + features_supported(CEPH_FEATURES_SUPPORTED_DEFAULT), + features_required(0) {} +private: + Policy(bool l, bool s, bool st, bool r, uint64_t req) + : lossy(l), server(s), standby(st), resetcheck(r), + throttler_bytes(NULL), + throttler_messages(NULL), + features_supported(CEPH_FEATURES_SUPPORTED_DEFAULT), + features_required(req) {} + +public: + static Policy stateful_server(uint64_t req) { + return Policy(false, true, true, true, req); + } + static Policy stateless_server(uint64_t req) { + return Policy(true, true, false, false, req); + } + static Policy lossless_peer(uint64_t req) { + return Policy(false, false, true, false, req); + } + static Policy lossless_peer_reuse(uint64_t req) { + return Policy(false, false, true, true, req); + } + static Policy lossy_client(uint64_t req) { + return Policy(true, false, false, false, req); + } + static Policy lossless_client(uint64_t req) { + return Policy(false, false, false, true, req); + } +}; + +template<class ThrottleType> +class PolicySet { + using policy_t = Policy<ThrottleType> ; + /// the default Policy we use for Pipes + policy_t default_policy; + /// map specifying different Policies for specific peer types + std::map<int, policy_t> policy_map; // entity_name_t::type -> Policy + +public: + const policy_t& get(peer_type_t peer_type) const { + if (auto found = policy_map.find(peer_type); found != policy_map.end()) { + return found->second; + } else { + return default_policy; + } + } + policy_t& get(peer_type_t peer_type) { + if (auto found = policy_map.find(peer_type); found != policy_map.end()) { + return found->second; + } else { + return default_policy; + } + } + void set(peer_type_t peer_type, const policy_t& p) { + policy_map[peer_type] = p; + } + const policy_t& get_default() const { + return default_policy; + } + void set_default(const policy_t& p) { + default_policy = p; + } + void set_throttlers(peer_type_t peer_type, + ThrottleType* byte_throttle, + ThrottleType* msg_throttle) { + auto& policy = get(peer_type); + policy.throttler_bytes = byte_throttle; + policy.throttler_messages = msg_throttle; + } +}; + +} diff --git a/src/msg/QueueStrategy.cc b/src/msg/QueueStrategy.cc new file mode 100644 index 00000000..9356e5c5 --- /dev/null +++ b/src/msg/QueueStrategy.cc @@ -0,0 +1,112 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 CohortFS, LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#include <string> +#include "QueueStrategy.h" +#define dout_subsys ceph_subsys_ms +#include "common/debug.h" + +QueueStrategy::QueueStrategy(int _n_threads) + : lock("QueueStrategy::lock"), + n_threads(_n_threads), + stop(false), + mqueue(), + disp_threads() +{ +} + +void QueueStrategy::ds_dispatch(Message *m) { + msgr->ms_fast_preprocess(m); + if (msgr->ms_can_fast_dispatch(m)) { + msgr->ms_fast_dispatch(m); + return; + } + lock.Lock(); + mqueue.push_back(*m); + if (disp_threads.size()) { + if (! 
disp_threads.empty()) { + QSThread *thrd = &disp_threads.front(); + disp_threads.pop_front(); + thrd->cond.Signal(); + } + } + lock.Unlock(); +} + +void QueueStrategy::entry(QSThread *thrd) +{ + for (;;) { + Message::ref m; + lock.Lock(); + for (;;) { + if (! mqueue.empty()) { + m = Message::ref(&mqueue.front(), false); + mqueue.pop_front(); + break; + } + if (stop) + break; + disp_threads.push_front(*thrd); + thrd->cond.Wait(lock); + } + lock.Unlock(); + if (stop) { + if (!m) break; + continue; + } + get_messenger()->ms_deliver_dispatch(m); + } +} + +void QueueStrategy::shutdown() +{ + QSThread *thrd; + lock.Lock(); + stop = true; + while (disp_threads.size()) { + thrd = &(disp_threads.front()); + disp_threads.pop_front(); + thrd->cond.Signal(); + } + lock.Unlock(); +} + +void QueueStrategy::wait() +{ + lock.Lock(); + ceph_assert(stop); + for (auto& thread : threads) { + lock.Unlock(); + + // join outside of lock + thread->join(); + + lock.Lock(); + } + lock.Unlock(); +} + +void QueueStrategy::start() +{ + ceph_assert(!stop); + lock.Lock(); + threads.reserve(n_threads); + for (int ix = 0; ix < n_threads; ++ix) { + string thread_name = "ms_xio_qs_"; + thread_name.append(std::to_string(ix)); + auto thrd = std::make_unique<QSThread>(this); + thrd->create(thread_name.c_str()); + threads.emplace_back(std::move(thrd)); + } + lock.Unlock(); +} diff --git a/src/msg/QueueStrategy.h b/src/msg/QueueStrategy.h new file mode 100644 index 00000000..a531cd77 --- /dev/null +++ b/src/msg/QueueStrategy.h @@ -0,0 +1,63 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 CohortFS, LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef QUEUE_STRATEGY_H +#define QUEUE_STRATEGY_H + +#include <vector> +#include <memory> +#include <boost/intrusive/list.hpp> +#include "DispatchStrategy.h" +#include "msg/Messenger.h" + +namespace bi = boost::intrusive; + +class QueueStrategy : public DispatchStrategy { + Mutex lock; + const int n_threads; + bool stop; + + Message::Queue mqueue; + + class QSThread : public Thread { + public: + bi::list_member_hook<> thread_q; + QueueStrategy *dq; + Cond cond; + explicit QSThread(QueueStrategy *dq) : thread_q(), dq(dq), cond() {} + void* entry() { + dq->entry(this); + return NULL; + } + + typedef bi::list< QSThread, + bi::member_hook< QSThread, + bi::list_member_hook<>, + &QSThread::thread_q > > Queue; + }; + + std::vector<std::unique_ptr<QSThread>> threads; //< all threads + QSThread::Queue disp_threads; //< waiting threads + +public: + explicit QueueStrategy(int n_threads); + void ds_dispatch(Message *m) override; + void shutdown() override; + void start() override; + void wait() override; + void entry(QSThread *thrd); + virtual ~QueueStrategy() {} +}; +#endif /* QUEUE_STRATEGY_H */ diff --git a/src/msg/SimplePolicyMessenger.h b/src/msg/SimplePolicyMessenger.h new file mode 100644 index 00000000..2e9b84ec --- /dev/null +++ b/src/msg/SimplePolicyMessenger.h @@ -0,0 +1,100 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Portions Copyright (C) 2013 CohortFS, LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef SIMPLE_POLICY_MESSENGER_H +#define SIMPLE_POLICY_MESSENGER_H + +#include "Messenger.h" +#include "Policy.h" + +class SimplePolicyMessenger : public Messenger +{ +private: + /// lock protecting policy + Mutex policy_lock; + // entity_name_t::type -> Policy + ceph::net::PolicySet<Throttle> policy_set; + +public: + + SimplePolicyMessenger(CephContext *cct, entity_name_t name, + string mname, uint64_t _nonce) + : Messenger(cct, name), + policy_lock("SimplePolicyMessenger::policy_lock") + { + } + + /** + * Get the Policy associated with a type of peer. + * @param t The peer type to get the default policy for. + * + * @return A const Policy reference. + */ + Policy get_policy(int t) override { + Mutex::Locker l(policy_lock); + return policy_set.get(t); + } + + Policy get_default_policy() override { + Mutex::Locker l(policy_lock); + return policy_set.get_default(); + } + + /** + * Set a policy which is applied to all peers who do not have a type-specific + * Policy. + * This is an init-time function and cannot be called after calling + * start() or bind(). + * + * @param p The Policy to apply. + */ + void set_default_policy(Policy p) override { + Mutex::Locker l(policy_lock); + policy_set.set_default(p); + } + /** + * Set a policy which is applied to all peers of the given type. + * This is an init-time function and cannot be called after calling + * start() or bind(). + * + * @param type The peer type this policy applies to. + * @param p The policy to apply. + */ + void set_policy(int type, Policy p) override { + Mutex::Locker l(policy_lock); + policy_set.set(type, p); + } + + /** + * Set a Throttler which is applied to all Messages from the given + * type of peer. 
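SimplePolicyMessenger's accessors above copy the Policy out under policy_lock rather than returning a reference into the shared set. A condensed sketch of that locking pattern using standard-library types, with a stand-in `ToyPolicy`:

```cpp
#include <map>
#include <mutex>

struct ToyPolicy { bool lossy = false; };

// Same locking discipline as SimplePolicyMessenger: get() returns a copy
// made under the mutex, so no caller ever holds a reference into the map
// while another thread calls set().
class PolicyTable {
  std::mutex lock;
  ToyPolicy default_policy;
  std::map<int, ToyPolicy> by_type;  // peer type -> policy
public:
  ToyPolicy get(int type) {
    std::lock_guard<std::mutex> l(lock);
    auto it = by_type.find(type);
    return it == by_type.end() ? default_policy : it->second;
  }
  void set(int type, const ToyPolicy& p) {
    std::lock_guard<std::mutex> l(lock);
    by_type[type] = p;
  }
};
```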
+ * This is an init-time function and cannot be called after calling + * start() or bind(). + * + * @param type The peer type this Throttler will apply to. + * @param t The Throttler to apply. SimpleMessenger does not take + * ownership of this pointer, but you must not destroy it before + * you destroy SimpleMessenger. + */ + void set_policy_throttlers(int type, + Throttle* byte_throttle, + Throttle* msg_throttle) override { + Mutex::Locker l(policy_lock); + policy_set.set_throttlers(type, byte_throttle, msg_throttle); + } + +}; /* SimplePolicyMessenger */ + +#endif /* SIMPLE_POLICY_MESSENGER_H */ diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc new file mode 100644 index 00000000..b78d84a3 --- /dev/null +++ b/src/msg/async/AsyncConnection.cc @@ -0,0 +1,771 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <unistd.h> + +#include "include/Context.h" +#include "include/random.h" +#include "common/errno.h" +#include "AsyncMessenger.h" +#include "AsyncConnection.h" + +#include "ProtocolV1.h" +#include "ProtocolV2.h" + +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "common/EventTrace.h" + +// Constant to limit starting sequence number to 2^31. Nothing special about it, just a big number. PLR +#define SEQ_MASK 0x7fffffff + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix _conn_prefix(_dout) +ostream& AsyncConnection::_conn_prefix(std::ostream *_dout) { + return *_dout << "-- " << async_msgr->get_myaddrs() << " >> " + << *peer_addrs << " conn(" << this + << (msgr2 ? " msgr2=" : " legacy=") + << protocol.get() + << " " << ceph_con_mode_name(protocol->auth_meta->con_mode) + << " :" << port + << " s=" << get_state_name(state) + << " l=" << policy.lossy + << ")."; +} + +// Notes: +// 1. Don't dispatch any event when closed! 
It may cause AsyncConnection alive even if AsyncMessenger dead + +const uint32_t AsyncConnection::TCP_PREFETCH_MIN_SIZE = 512; + +class C_time_wakeup : public EventCallback { + AsyncConnectionRef conn; + + public: + explicit C_time_wakeup(AsyncConnectionRef c): conn(c) {} + void do_request(uint64_t fd_or_id) override { + conn->wakeup_from(fd_or_id); + } +}; + +class C_handle_read : public EventCallback { + AsyncConnectionRef conn; + + public: + explicit C_handle_read(AsyncConnectionRef c): conn(c) {} + void do_request(uint64_t fd_or_id) override { + conn->process(); + } +}; + +class C_handle_write : public EventCallback { + AsyncConnectionRef conn; + + public: + explicit C_handle_write(AsyncConnectionRef c): conn(c) {} + void do_request(uint64_t fd) override { + conn->handle_write(); + } +}; + +class C_handle_write_callback : public EventCallback { + AsyncConnectionRef conn; + +public: + explicit C_handle_write_callback(AsyncConnectionRef c) : conn(c) {} + void do_request(uint64_t fd) override { conn->handle_write_callback(); } +}; + +class C_clean_handler : public EventCallback { + AsyncConnectionRef conn; + public: + explicit C_clean_handler(AsyncConnectionRef c): conn(c) {} + void do_request(uint64_t id) override { + conn->cleanup(); + delete this; + } +}; + +class C_tick_wakeup : public EventCallback { + AsyncConnectionRef conn; + + public: + explicit C_tick_wakeup(AsyncConnectionRef c): conn(c) {} + void do_request(uint64_t fd_or_id) override { + conn->tick(fd_or_id); + } +}; + + +AsyncConnection::AsyncConnection(CephContext *cct, AsyncMessenger *m, DispatchQueue *q, + Worker *w, bool m2, bool local) + : Connection(cct, m), delay_state(NULL), async_msgr(m), conn_id(q->get_id()), + logger(w->get_perf_counter()), + state(STATE_NONE), port(-1), + dispatch_queue(q), recv_buf(NULL), + recv_max_prefetch(std::max<int64_t>(msgr->cct->_conf->ms_tcp_prefetch_max_size, TCP_PREFETCH_MIN_SIZE)), + recv_start(0), recv_end(0), + last_active(ceph::coarse_mono_clock::now()), + connect_timeout_us(cct->_conf->ms_connection_ready_timeout*1000*1000), + inactive_timeout_us(cct->_conf->ms_connection_idle_timeout*1000*1000), + msgr2(m2), state_offset(0), + worker(w), center(&w->center),read_buffer(nullptr) +{ +#ifdef UNIT_TESTS_BUILT + this->interceptor = m->interceptor; +#endif + read_handler = new C_handle_read(this); + write_handler = new C_handle_write(this); + write_callback_handler = new C_handle_write_callback(this); + wakeup_handler = new C_time_wakeup(this); + tick_handler = new C_tick_wakeup(this); + // double recv_max_prefetch see "read_until" + recv_buf = new char[2*recv_max_prefetch]; + if (local) { + protocol = std::unique_ptr<Protocol>(new LoopbackProtocolV1(this)); + } else if (m2) { + protocol = std::unique_ptr<Protocol>(new ProtocolV2(this)); + } else { + protocol = std::unique_ptr<Protocol>(new ProtocolV1(this)); + } + logger->inc(l_msgr_created_connections); +} + +AsyncConnection::~AsyncConnection() +{ + if (recv_buf) + delete[] recv_buf; + ceph_assert(!delay_state); +} + +int AsyncConnection::get_con_mode() const { + return protocol->get_con_mode(); +} + +void AsyncConnection::maybe_start_delay_thread() +{ + if (!delay_state) { + async_msgr->cct->_conf.with_val<std::string>( + "ms_inject_delay_type", + [this](const string& s) { + if (s.find(ceph_entity_type_name(peer_type)) != string::npos) { + ldout(msgr->cct, 1) << __func__ << " setting up a delay queue" + << dendl; + delay_state = new DelayedDelivery(async_msgr, center, dispatch_queue, + conn_id); + } + }); + } +} + + +ssize_t 
AsyncConnection::read(unsigned len, char *buffer,
+                      std::function<void(char *, ssize_t)> callback) {
+  ldout(async_msgr->cct, 20) << __func__
+                             << (pendingReadLen ? " continue" : " start")
+                             << " len=" << len << dendl;
+  ssize_t r = read_until(len, buffer);
+  if (r > 0) {
+    readCallback = callback;
+    pendingReadLen = len;
+    read_buffer = buffer;
+  }
+  return r;
+}
+
+// This function may be called multiple times to populate the needed
+// buffer, so the same buffer pointer must be passed in on each call.
+// Normally only "read_message" passes an existing bufferptr in.
+//
+// It uses readahead to reduce the overhead of small reads; prefetched
+// data is staged in "recv_buf".
+//
+// Returns the remaining byte count: 0 means this buffer is complete,
+// < 0 means an error occurred.
+ssize_t AsyncConnection::read_until(unsigned len, char *p)
+{
+  ldout(async_msgr->cct, 25) << __func__ << " len is " << len << " state_offset is "
+                             << state_offset << dendl;
+
+  if (async_msgr->cct->_conf->ms_inject_socket_failures && cs) {
+    if (rand() % async_msgr->cct->_conf->ms_inject_socket_failures == 0) {
+      ldout(async_msgr->cct, 0) << __func__ << " injecting socket failure" << dendl;
+      cs.shutdown();
+    }
+  }
+
+  ssize_t r = 0;
+  uint64_t left = len - state_offset;
+  if (recv_end > recv_start) {
+    uint64_t to_read = std::min<uint64_t>(recv_end - recv_start, left);
+    memcpy(p, recv_buf+recv_start, to_read);
+    recv_start += to_read;
+    left -= to_read;
+    ldout(async_msgr->cct, 25) << __func__ << " got " << to_read << " in buffer "
+                               << " left is " << left << " buffer still has "
+                               << recv_end - recv_start << dendl;
+    if (left == 0) {
+      return 0;
+    }
+    state_offset += to_read;
+  }
+
+  recv_end = recv_start = 0;
+  /* nothing left in the prefetch buffer */
+  if (left > (uint64_t)recv_max_prefetch) {
+    /* this was a large read, we don't prefetch for these */
+    do {
+      r = read_bulk(p+state_offset, left);
+      ldout(async_msgr->cct, 25) << __func__ << " read_bulk left is " << left << " got " << r << dendl;
+      if (r < 0) {
+        ldout(async_msgr->cct, 1) << __func__ << " read failed" << dendl;
+        return -1;
+      } else if (r == static_cast<int>(left)) {
+        state_offset = 0;
+        return 0;
+      }
+      state_offset += r;
+      left -= r;
+    } while (r > 0);
+  } else {
+    do {
+      r = read_bulk(recv_buf+recv_end, recv_max_prefetch);
+      ldout(async_msgr->cct, 25) << __func__ << " read_bulk recv_end is " << recv_end
+                                 << " left is " << left << " got " << r << dendl;
+      if (r < 0) {
+        ldout(async_msgr->cct, 1) << __func__ << " read failed" << dendl;
+        return -1;
+      }
+      recv_end += r;
+      if (r >= static_cast<int>(left)) {
+        recv_start = len - state_offset;
+        memcpy(p+state_offset, recv_buf, recv_start);
+        state_offset = 0;
+        return 0;
+      }
+      left -= r;
+    } while (r > 0);
+    memcpy(p+state_offset, recv_buf, recv_end-recv_start);
+    state_offset += (recv_end - recv_start);
+    recv_end = recv_start = 0;
+  }
+  ldout(async_msgr->cct, 25) << __func__ << " need len " << len << " remaining "
+                             << len - state_offset << " bytes" << dendl;
+  return len - state_offset;
+}
+
+/* return -1 if `fd` hit an error or was closed (the caller should then
+ * close it); return 0 on EAGAIN or EINTR */
+ssize_t AsyncConnection::read_bulk(char *buf, unsigned len)
+{
+  ssize_t nread;
+ again:
+  nread = cs.read(buf, len);
+  if (nread < 0) {
+    if (nread == -EAGAIN) {
+      nread = 0;
+    } else if (nread == -EINTR) {
+      goto again;
+    } else {
+      ldout(async_msgr->cct, 1) << __func__ << " reading from fd=" << cs.fd()
+                                << " : "<< strerror(nread) << dendl;
+      return -1;
+    }
+  } else if (nread == 0) {
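+    // nread == 0 means the peer performed an orderly shutdown; treat it
+    // as fatal so the connection is closed.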
ldout(async_msgr->cct, 1) << __func__ << " peer close file descriptor " + << cs.fd() << dendl; + return -1; + } + return nread; +} + +ssize_t AsyncConnection::write(bufferlist &bl, + std::function<void(ssize_t)> callback, + bool more) { + + std::unique_lock<std::mutex> l(write_lock); + outgoing_bl.claim_append(bl); + ssize_t r = _try_send(more); + if (r > 0) { + writeCallback = callback; + } + return r; +} + +// return the remaining bytes, it may larger than the length of ptr +// else return < 0 means error +ssize_t AsyncConnection::_try_send(bool more) +{ + if (async_msgr->cct->_conf->ms_inject_socket_failures && cs) { + if (rand() % async_msgr->cct->_conf->ms_inject_socket_failures == 0) { + ldout(async_msgr->cct, 0) << __func__ << " injecting socket failure" << dendl; + cs.shutdown(); + } + } + + ceph_assert(center->in_thread()); + ldout(async_msgr->cct, 25) << __func__ << " cs.send " << outgoing_bl.length() + << " bytes" << dendl; + ssize_t r = cs.send(outgoing_bl, more); + if (r < 0) { + ldout(async_msgr->cct, 1) << __func__ << " send error: " << cpp_strerror(r) << dendl; + return r; + } + + ldout(async_msgr->cct, 10) << __func__ << " sent bytes " << r + << " remaining bytes " << outgoing_bl.length() << dendl; + + if (!open_write && is_queued()) { + center->create_file_event(cs.fd(), EVENT_WRITABLE, write_handler); + open_write = true; + } + + if (open_write && !is_queued()) { + center->delete_file_event(cs.fd(), EVENT_WRITABLE); + open_write = false; + if (writeCallback) { + center->dispatch_event_external(write_callback_handler); + } + } + + return outgoing_bl.length(); +} + +void AsyncConnection::inject_delay() { + if (async_msgr->cct->_conf->ms_inject_internal_delays) { + ldout(async_msgr->cct, 10) << __func__ << " sleep for " << + async_msgr->cct->_conf->ms_inject_internal_delays << dendl; + utime_t t; + t.set_from_double(async_msgr->cct->_conf->ms_inject_internal_delays); + t.sleep(); + } +} + +void AsyncConnection::process() { + std::lock_guard<std::mutex> l(lock); + last_active = ceph::coarse_mono_clock::now(); + recv_start_time = ceph::mono_clock::now(); + + ldout(async_msgr->cct, 20) << __func__ << dendl; + + switch (state) { + case STATE_NONE: { + ldout(async_msgr->cct, 20) << __func__ << " enter none state" << dendl; + return; + } + case STATE_CLOSED: { + ldout(async_msgr->cct, 20) << __func__ << " socket closed" << dendl; + return; + } + case STATE_CONNECTING: { + ceph_assert(!policy.server); + + // clear timer (if any) since we are connecting/re-connecting + if (last_tick_id) { + center->delete_time_event(last_tick_id); + last_tick_id = 0; + } + + if (cs) { + center->delete_file_event(cs.fd(), EVENT_READABLE | EVENT_WRITABLE); + cs.close(); + } + + SocketOptions opts; + opts.priority = async_msgr->get_socket_priority(); + opts.connect_bind_addr = msgr->get_myaddrs().front(); + ssize_t r = worker->connect(target_addr, opts, &cs); + if (r < 0) { + protocol->fault(); + return; + } + + center->create_file_event(cs.fd(), EVENT_READABLE, read_handler); + state = STATE_CONNECTING_RE; + } + case STATE_CONNECTING_RE: { + ssize_t r = cs.is_connected(); + if (r < 0) { + ldout(async_msgr->cct, 1) << __func__ << " reconnect failed to " + << target_addr << dendl; + if (r == -ECONNREFUSED) { + ldout(async_msgr->cct, 2) + << __func__ << " connection refused!" 
<< dendl; + dispatch_queue->queue_refused(this); + } + protocol->fault(); + return; + } else if (r == 0) { + ldout(async_msgr->cct, 10) + << __func__ << " nonblock connect inprogress" << dendl; + if (async_msgr->get_stack()->nonblock_connect_need_writable_event()) { + center->create_file_event(cs.fd(), EVENT_WRITABLE, + read_handler); + } + logger->tinc(l_msgr_running_recv_time, + ceph::mono_clock::now() - recv_start_time); + return; + } + + center->delete_file_event(cs.fd(), EVENT_WRITABLE); + ldout(async_msgr->cct, 10) + << __func__ << " connect successfully, ready to send banner" << dendl; + state = STATE_CONNECTION_ESTABLISHED; + ceph_assert(last_tick_id == 0); + // exclude TCP nonblock connect time + last_connect_started = ceph::coarse_mono_clock::now(); + last_tick_id = center->create_time_event( + connect_timeout_us, tick_handler); + break; + } + + case STATE_ACCEPTING: { + center->create_file_event(cs.fd(), EVENT_READABLE, read_handler); + state = STATE_CONNECTION_ESTABLISHED; + + break; + } + + case STATE_CONNECTION_ESTABLISHED: { + if (pendingReadLen) { + ssize_t r = read(*pendingReadLen, read_buffer, readCallback); + if (r <= 0) { // read all bytes, or an error occured + pendingReadLen.reset(); + char *buf_tmp = read_buffer; + read_buffer = nullptr; + readCallback(buf_tmp, r); + } + return; + } + break; + } + } + + protocol->read_event(); + + logger->tinc(l_msgr_running_recv_time, + ceph::mono_clock::now() - recv_start_time); +} + +bool AsyncConnection::is_connected() { + return protocol->is_connected(); +} + +void AsyncConnection::connect(const entity_addrvec_t &addrs, int type, + entity_addr_t &target) { + + std::lock_guard<std::mutex> l(lock); + set_peer_type(type); + set_peer_addrs(addrs); + policy = msgr->get_policy(type); + target_addr = target; + _connect(); +} + +void AsyncConnection::_connect() +{ + ldout(async_msgr->cct, 10) << __func__ << dendl; + + state = STATE_CONNECTING; + protocol->connect(); + // rescheduler connection in order to avoid lock dep + // may called by external thread(send_message) + center->dispatch_event_external(read_handler); +} + +void AsyncConnection::accept(ConnectedSocket socket, + const entity_addr_t &listen_addr, + const entity_addr_t &peer_addr) +{ + ldout(async_msgr->cct, 10) << __func__ << " sd=" << socket.fd() + << " listen_addr " << listen_addr + << " peer_addr " << peer_addr << dendl; + ceph_assert(socket.fd() >= 0); + + std::lock_guard<std::mutex> l(lock); + cs = std::move(socket); + socket_addr = listen_addr; + target_addr = peer_addr; // until we know better + state = STATE_ACCEPTING; + protocol->accept(); + // rescheduler connection in order to avoid lock dep + center->dispatch_event_external(read_handler); +} + +int AsyncConnection::send_message(Message *m) +{ + FUNCTRACE(async_msgr->cct); + lgeneric_subdout(async_msgr->cct, ms, + 1) << "-- " << async_msgr->get_myaddrs() << " --> " + << get_peer_addrs() << " -- " + << *m << " -- " << m << " con " + << this + << dendl; + + // optimistic think it's ok to encode(actually may broken now) + if (!m->get_priority()) + m->set_priority(async_msgr->get_default_send_priority()); + + m->get_header().src = async_msgr->get_myname(); + m->set_connection(this); + + if (m->get_type() == CEPH_MSG_OSD_OP) + OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OP_BEGIN", true); + else if (m->get_type() == CEPH_MSG_OSD_OPREPLY) + OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OPREPLY_BEGIN", true); + + if (async_msgr->get_myaddrs() == get_peer_addrs()) { //loopback connection + ldout(async_msgr->cct, 20) << __func__ 
<< " " << *m << " local" << dendl; + std::lock_guard<std::mutex> l(write_lock); + if (protocol->is_connected()) { + dispatch_queue->local_delivery(m, m->get_priority()); + } else { + ldout(async_msgr->cct, 10) << __func__ << " loopback connection closed." + << " Drop message " << m << dendl; + m->put(); + } + return 0; + } + + // we don't want to consider local message here, it's too lightweight which + // may disturb users + logger->inc(l_msgr_send_messages); + + protocol->send_message(m); + return 0; +} + +entity_addr_t AsyncConnection::_infer_target_addr(const entity_addrvec_t& av) +{ + // pick the first addr of the same address family as socket_addr. it could be + // an any: or v2: addr, we don't care. it should not be a v1 addr. + for (auto& i : av.v) { + if (i.is_legacy()) { + continue; + } + if (i.get_family() == socket_addr.get_family()) { + ldout(async_msgr->cct,10) << __func__ << " " << av << " -> " << i << dendl; + return i; + } + } + ldout(async_msgr->cct,10) << __func__ << " " << av << " -> nothing to match " + << socket_addr << dendl; + return {}; +} + +void AsyncConnection::fault() +{ + shutdown_socket(); + open_write = false; + + // queue delayed items immediately + if (delay_state) + delay_state->flush(); + + recv_start = recv_end = 0; + state_offset = 0; + outgoing_bl.clear(); +} + +void AsyncConnection::_stop() { + writeCallback.reset(); + dispatch_queue->discard_queue(conn_id); + async_msgr->unregister_conn(this); + worker->release_worker(); + + state = STATE_CLOSED; + open_write = false; + + state_offset = 0; + // Make sure in-queue events will been processed + center->dispatch_event_external(EventCallbackRef(new C_clean_handler(this))); +} + +bool AsyncConnection::is_queued() const { + return outgoing_bl.length(); +} + +void AsyncConnection::shutdown_socket() { + for (auto &&t : register_time_events) center->delete_time_event(t); + register_time_events.clear(); + if (last_tick_id) { + center->delete_time_event(last_tick_id); + last_tick_id = 0; + } + if (cs) { + center->delete_file_event(cs.fd(), EVENT_READABLE | EVENT_WRITABLE); + cs.shutdown(); + cs.close(); + } +} + +void AsyncConnection::DelayedDelivery::do_request(uint64_t id) +{ + Message *m = nullptr; + { + std::lock_guard<std::mutex> l(delay_lock); + register_time_events.erase(id); + if (stop_dispatch) + return ; + if (delay_queue.empty()) + return ; + m = delay_queue.front(); + delay_queue.pop_front(); + } + if (msgr->ms_can_fast_dispatch(m)) { + dispatch_queue->fast_dispatch(m); + } else { + dispatch_queue->enqueue(m, m->get_priority(), conn_id); + } +} + +void AsyncConnection::DelayedDelivery::discard() { + stop_dispatch = true; + center->submit_to(center->get_id(), + [this]() mutable { + std::lock_guard<std::mutex> l(delay_lock); + while (!delay_queue.empty()) { + Message *m = delay_queue.front(); + dispatch_queue->dispatch_throttle_release( + m->get_dispatch_throttle_size()); + m->put(); + delay_queue.pop_front(); + } + for (auto i : register_time_events) + center->delete_time_event(i); + register_time_events.clear(); + stop_dispatch = false; + }, + true); +} + +void AsyncConnection::DelayedDelivery::flush() { + stop_dispatch = true; + center->submit_to( + center->get_id(), [this] () mutable { + std::lock_guard<std::mutex> l(delay_lock); + while (!delay_queue.empty()) { + Message *m = delay_queue.front(); + if (msgr->ms_can_fast_dispatch(m)) { + dispatch_queue->fast_dispatch(m); + } else { + dispatch_queue->enqueue(m, m->get_priority(), conn_id); + } + delay_queue.pop_front(); + } + for (auto i : 
register_time_events) + center->delete_time_event(i); + register_time_events.clear(); + stop_dispatch = false; + }, true); +} + +void AsyncConnection::send_keepalive() +{ + protocol->send_keepalive(); +} + +void AsyncConnection::mark_down() +{ + ldout(async_msgr->cct, 1) << __func__ << dendl; + std::lock_guard<std::mutex> l(lock); + protocol->stop(); +} + +void AsyncConnection::handle_write() +{ + ldout(async_msgr->cct, 10) << __func__ << dendl; + protocol->write_event(); +} + +void AsyncConnection::handle_write_callback() { + std::lock_guard<std::mutex> l(lock); + last_active = ceph::coarse_mono_clock::now(); + recv_start_time = ceph::mono_clock::now(); + write_lock.lock(); + if (writeCallback) { + auto callback = *writeCallback; + writeCallback.reset(); + write_lock.unlock(); + callback(0); + return; + } + write_lock.unlock(); +} + +void AsyncConnection::stop(bool queue_reset) { + lock.lock(); + bool need_queue_reset = (state != STATE_CLOSED) && queue_reset; + protocol->stop(); + lock.unlock(); + if (need_queue_reset) dispatch_queue->queue_reset(this); +} + +void AsyncConnection::cleanup() { + shutdown_socket(); + delete read_handler; + delete write_handler; + delete write_callback_handler; + delete wakeup_handler; + delete tick_handler; + if (delay_state) { + delete delay_state; + delay_state = NULL; + } +} + +void AsyncConnection::wakeup_from(uint64_t id) +{ + lock.lock(); + register_time_events.erase(id); + lock.unlock(); + process(); +} + +void AsyncConnection::tick(uint64_t id) +{ + auto now = ceph::coarse_mono_clock::now(); + ldout(async_msgr->cct, 20) << __func__ << " last_id=" << last_tick_id + << " last_active=" << last_active << dendl; + std::lock_guard<std::mutex> l(lock); + last_tick_id = 0; + if (!is_connected()) { + if (connect_timeout_us <= + (uint64_t)std::chrono::duration_cast<std::chrono::microseconds> + (now - last_connect_started).count()) { + ldout(async_msgr->cct, 1) << __func__ << " see no progress in more than " + << connect_timeout_us + << " us during connecting, fault." + << dendl; + protocol->fault(); + } else { + last_tick_id = center->create_time_event(connect_timeout_us, tick_handler); + } + } else { + auto idle_period = std::chrono::duration_cast<std::chrono::microseconds> + (now - last_active).count(); + if (inactive_timeout_us < (uint64_t)idle_period) { + ldout(async_msgr->cct, 1) << __func__ << " idle (" << idle_period + << ") for more than " << inactive_timeout_us + << " us, fault." + << dendl; + protocol->fault(); + } else { + last_tick_id = center->create_time_event(inactive_timeout_us, tick_handler); + } + } +} diff --git a/src/msg/async/AsyncConnection.h b/src/msg/async/AsyncConnection.h new file mode 100644 index 00000000..0c2512c8 --- /dev/null +++ b/src/msg/async/AsyncConnection.h @@ -0,0 +1,238 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MSG_ASYNCCONNECTION_H +#define CEPH_MSG_ASYNCCONNECTION_H + +#include <atomic> +#include <pthread.h> +#include <climits> +#include <list> +#include <mutex> +#include <map> +#include <functional> +#include <optional> + +#include "auth/AuthSessionHandler.h" +#include "common/ceph_time.h" +#include "common/perf_counters.h" +#include "include/buffer.h" +#include "msg/Connection.h" +#include "msg/Messenger.h" + +#include "Event.h" +#include "Stack.h" + +class AsyncMessenger; +class DispatchQueue; +class Worker; +class Protocol; + +static const int ASYNC_IOV_MAX = (IOV_MAX >= 1024 ? IOV_MAX / 4 : IOV_MAX); + +/* + * AsyncConnection maintains a logic session between two endpoints. In other + * word, a pair of addresses can find the only AsyncConnection. AsyncConnection + * will handle with network fault or read/write transactions. If one file + * descriptor broken, AsyncConnection will maintain the message queue and + * sequence, try to reconnect peer endpoint. + */ +class AsyncConnection : public Connection { + + ssize_t read(unsigned len, char *buffer, + std::function<void(char *, ssize_t)> callback); + ssize_t read_until(unsigned needed, char *p); + ssize_t read_bulk(char *buf, unsigned len); + + ssize_t write(bufferlist &bl, std::function<void(ssize_t)> callback, + bool more=false); + ssize_t _try_send(bool more=false); + + void _connect(); + void _stop(); + void fault(); + void inject_delay(); + + bool is_queued() const; + void shutdown_socket(); + + /** + * The DelayedDelivery is for injecting delays into Message delivery off + * the socket. It is only enabled if delays are requested, and if they + * are then it pulls Messages off the DelayQueue and puts them into the + * AsyncMessenger event queue. + */ + class DelayedDelivery : public EventCallback { + std::set<uint64_t> register_time_events; // need to delete it if stop + std::deque<Message*> delay_queue; + std::mutex delay_lock; + AsyncMessenger *msgr; + EventCenter *center; + DispatchQueue *dispatch_queue; + uint64_t conn_id; + std::atomic_bool stop_dispatch; + + public: + explicit DelayedDelivery(AsyncMessenger *omsgr, EventCenter *c, + DispatchQueue *q, uint64_t cid) + : msgr(omsgr), center(c), dispatch_queue(q), conn_id(cid), + stop_dispatch(false) { } + ~DelayedDelivery() override { + ceph_assert(register_time_events.empty()); + ceph_assert(delay_queue.empty()); + } + void set_center(EventCenter *c) { center = c; } + void do_request(uint64_t id) override; + void queue(double delay_period, Message *m) { + std::lock_guard<std::mutex> l(delay_lock); + delay_queue.push_back(m); + register_time_events.insert(center->create_time_event(delay_period*1000000, this)); + } + void discard(); + bool ready() const { return !stop_dispatch && delay_queue.empty() && register_time_events.empty(); } + void flush(); + } *delay_state; + + public: + AsyncConnection(CephContext *cct, AsyncMessenger *m, DispatchQueue *q, + Worker *w, bool is_msgr2, bool local); + ~AsyncConnection() override; + void maybe_start_delay_thread(); + + ostream& _conn_prefix(std::ostream *_dout); + + bool is_connected() override; + + // Only call when AsyncConnection first construct + void connect(const entity_addrvec_t& addrs, int type, entity_addr_t& target); + + // Only call when AsyncConnection first construct + void accept(ConnectedSocket socket, + const entity_addr_t &listen_addr, + const entity_addr_t &peer_addr); + int send_message(Message *m) override; + + void send_keepalive() override; + void mark_down() override; + void mark_disposable() 
override {
+    std::lock_guard<std::mutex> l(lock);
+    policy.lossy = true;
+  }
+
+  entity_addr_t get_peer_socket_addr() const override {
+    return target_addr;
+  }
+
+  int get_con_mode() const override;
+
+ private:
+  enum {
+    STATE_NONE,
+    STATE_CONNECTING,
+    STATE_CONNECTING_RE,
+    STATE_ACCEPTING,
+    STATE_CONNECTION_ESTABLISHED,
+    STATE_CLOSED
+  };
+
+  static const uint32_t TCP_PREFETCH_MIN_SIZE;
+  static const char *get_state_name(int state) {
+    const char* const statenames[] = {"STATE_NONE",
+                                      "STATE_CONNECTING",
+                                      "STATE_CONNECTING_RE",
+                                      "STATE_ACCEPTING",
+                                      "STATE_CONNECTION_ESTABLISHED",
+                                      "STATE_CLOSED"};
+    return statenames[state];
+  }
+
+  AsyncMessenger *async_msgr;
+  uint64_t conn_id;
+  PerfCounters *logger;
+  int state;
+  ConnectedSocket cs;
+  int port;
+  Messenger::Policy policy;
+
+  DispatchQueue *dispatch_queue;
+
+  // lock-free; only used in the owning thread
+  bufferlist outgoing_bl;
+  bool open_write = false;
+
+  std::mutex write_lock;
+
+  std::mutex lock;
+  EventCallbackRef read_handler;
+  EventCallbackRef write_handler;
+  EventCallbackRef write_callback_handler;
+  EventCallbackRef wakeup_handler;
+  EventCallbackRef tick_handler;
+  char *recv_buf;
+  uint32_t recv_max_prefetch;
+  uint32_t recv_start;
+  uint32_t recv_end;
+  set<uint64_t> register_time_events; // must be deleted on stop
+  ceph::coarse_mono_clock::time_point last_connect_started;
+  ceph::coarse_mono_clock::time_point last_active;
+  ceph::mono_clock::time_point recv_start_time;
+  uint64_t last_tick_id = 0;
+  const uint64_t connect_timeout_us;
+  const uint64_t inactive_timeout_us;
+
+  // This section holds temporary variables used by state transitions
+
+  // Accepting state
+  bool msgr2 = false;
+  entity_addr_t socket_addr;  ///< local socket addr
+  entity_addr_t target_addr;  ///< which of the peer_addrs we're connecting to (as client) or should reconnect to (as peer)
+
+  entity_addr_t _infer_target_addr(const entity_addrvec_t& av);
+
+  // used only by "read_until"
+  uint64_t state_offset;
+  Worker *worker;
+  EventCenter *center;
+
+  std::unique_ptr<Protocol> protocol;
+
+  std::optional<std::function<void(ssize_t)>> writeCallback;
+  std::function<void(char *, ssize_t)> readCallback;
+  std::optional<unsigned> pendingReadLen;
+  char *read_buffer;
+
+ public:
+  // used by event callbacks
+  void handle_write();
+  void handle_write_callback();
+  void process();
+  void wakeup_from(uint64_t id);
+  void tick(uint64_t id);
+  void local_deliver();
+  void stop(bool queue_reset);
+  void cleanup();
+  PerfCounters *get_perf_counter() {
+    return logger;
+  }
+
+  friend class Protocol;
+  friend class ProtocolV1;
+  friend class ProtocolV2;
+}; /* AsyncConnection */
+
+typedef boost::intrusive_ptr<AsyncConnection> AsyncConnectionRef;
+
+#endif
diff --git a/src/msg/async/AsyncMessenger.cc b/src/msg/async/AsyncMessenger.cc
new file mode 100644
index 00000000..2b1488c4
--- /dev/null
+++ b/src/msg/async/AsyncMessenger.cc
@@ -0,0 +1,949 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * + */ + +#include "acconfig.h" + +#include <iostream> +#include <fstream> + +#include "AsyncMessenger.h" + +#include "common/config.h" +#include "common/Timer.h" +#include "common/errno.h" + +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "common/EventTrace.h" + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) +static ostream& _prefix(std::ostream *_dout, AsyncMessenger *m) { + return *_dout << "-- " << m->get_myaddrs() << " "; +} + +static ostream& _prefix(std::ostream *_dout, Processor *p) { + return *_dout << " Processor -- "; +} + + +/******************* + * Processor + */ + +class Processor::C_processor_accept : public EventCallback { + Processor *pro; + + public: + explicit C_processor_accept(Processor *p): pro(p) {} + void do_request(uint64_t id) override { + pro->accept(); + } +}; + +Processor::Processor(AsyncMessenger *r, Worker *w, CephContext *c) + : msgr(r), net(c), worker(w), + listen_handler(new C_processor_accept(this)) {} + +int Processor::bind(const entity_addrvec_t &bind_addrs, + const set<int>& avoid_ports, + entity_addrvec_t* bound_addrs) +{ + const auto& conf = msgr->cct->_conf; + // bind to socket(s) + ldout(msgr->cct, 10) << __func__ << " " << bind_addrs << dendl; + + SocketOptions opts; + opts.nodelay = msgr->cct->_conf->ms_tcp_nodelay; + opts.rcbuf_size = msgr->cct->_conf->ms_tcp_rcvbuf; + + listen_sockets.resize(bind_addrs.v.size()); + *bound_addrs = bind_addrs; + + for (unsigned k = 0; k < bind_addrs.v.size(); ++k) { + auto& listen_addr = bound_addrs->v[k]; + + /* bind to port */ + int r = -1; + + for (int i = 0; i < conf->ms_bind_retry_count; i++) { + if (i > 0) { + lderr(msgr->cct) << __func__ << " was unable to bind. Trying again in " + << conf->ms_bind_retry_delay << " seconds " << dendl; + sleep(conf->ms_bind_retry_delay); + } + + if (listen_addr.get_port()) { + worker->center.submit_to( + worker->center.get_id(), + [this, k, &listen_addr, &opts, &r]() { + r = worker->listen(listen_addr, k, opts, &listen_sockets[k]); + }, false); + if (r < 0) { + lderr(msgr->cct) << __func__ << " unable to bind to " << listen_addr + << ": " << cpp_strerror(r) << dendl; + continue; + } + } else { + // try a range of ports + for (int port = msgr->cct->_conf->ms_bind_port_min; + port <= msgr->cct->_conf->ms_bind_port_max; + port++) { + if (avoid_ports.count(port)) + continue; + + listen_addr.set_port(port); + worker->center.submit_to( + worker->center.get_id(), + [this, k, &listen_addr, &opts, &r]() { + r = worker->listen(listen_addr, k, opts, &listen_sockets[k]); + }, false); + if (r == 0) + break; + } + if (r < 0) { + lderr(msgr->cct) << __func__ << " unable to bind to " << listen_addr + << " on any port in range " + << msgr->cct->_conf->ms_bind_port_min + << "-" << msgr->cct->_conf->ms_bind_port_max << ": " + << cpp_strerror(r) << dendl; + listen_addr.set_port(0); // Clear port before retry, otherwise we shall fail again. 
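+          // At this point every port in [ms_bind_port_min,
+          // ms_bind_port_max] (6800..7300 with the stock defaults) was
+          // tried without success; the continue below re-enters the
+          // outer ms_bind_retry_count loop, which sleeps
+          // ms_bind_retry_delay seconds before retrying.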
+          continue;
+        }
+        ldout(msgr->cct, 10) << __func__ << " bound on random port "
+                             << listen_addr << dendl;
+      }
+      if (r == 0) {
+        break;
+      }
+    }
+
+    // binding failed completely; return that exit status
+    if (r < 0) {
+      lderr(msgr->cct) << __func__ << " was unable to bind after "
+                       << conf->ms_bind_retry_count
+                       << " attempts: " << cpp_strerror(r) << dendl;
+      for (unsigned j = 0; j < k; ++j) {
+        // clean up previous bind
+        listen_sockets[j].abort_accept();
+      }
+      return r;
+    }
+  }
+
+  ldout(msgr->cct, 10) << __func__ << " bound to " << *bound_addrs << dendl;
+  return 0;
+}
+
+void Processor::start()
+{
+  ldout(msgr->cct, 1) << __func__ << dendl;
+
+  // start thread
+  worker->center.submit_to(worker->center.get_id(), [this]() {
+    for (auto& l : listen_sockets) {
+      if (l) {
+        worker->center.create_file_event(l.fd(), EVENT_READABLE,
+                                         listen_handler);
+      }
+    }
+  }, false);
+}
+
+void Processor::accept()
+{
+  SocketOptions opts;
+  opts.nodelay = msgr->cct->_conf->ms_tcp_nodelay;
+  opts.rcbuf_size = msgr->cct->_conf->ms_tcp_rcvbuf;
+  opts.priority = msgr->get_socket_priority();
+
+  for (auto& listen_socket : listen_sockets) {
+    ldout(msgr->cct, 10) << __func__ << " listen_fd=" << listen_socket.fd()
+                         << dendl;
+    unsigned accept_error_num = 0;
+
+    while (true) {
+      entity_addr_t addr;
+      ConnectedSocket cli_socket;
+      Worker *w = worker;
+      if (!msgr->get_stack()->support_local_listen_table())
+        w = msgr->get_stack()->get_worker();
+      else
+        ++w->references;
+      int r = listen_socket.accept(&cli_socket, opts, &addr, w);
+      if (r == 0) {
+        ldout(msgr->cct, 10) << __func__ << " accepted incoming on sd "
+                             << cli_socket.fd() << dendl;
+
+        msgr->add_accept(
+          w, std::move(cli_socket),
+          msgr->get_myaddrs().v[listen_socket.get_addr_slot()],
+          addr);
+        accept_error_num = 0;
+        continue;
+      } else {
+        --w->references;
+        if (r == -EINTR) {
+          continue;
+        } else if (r == -EAGAIN) {
+          break;
+        } else if (r == -EMFILE || r == -ENFILE) {
+          lderr(msgr->cct) << __func__ << " open file descriptors limit reached, sd = " << listen_socket.fd()
+                           << " errno " << r << " " << cpp_strerror(r) << dendl;
+          if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) {
+            lderr(msgr->cct) << "Processor accept has encountered too many errors; calling ceph_abort()." << dendl;
+            ceph_abort();
+          }
+          continue;
+        } else if (r == -ECONNABORTED) {
+          ldout(msgr->cct, 0) << __func__ << " connection was closed because an RST arrived, sd = " << listen_socket.fd()
+                              << " errno " << r << " " << cpp_strerror(r) << dendl;
+          continue;
+        } else {
+          lderr(msgr->cct) << __func__ << " no incoming connection?"
+                           << " errno " << r << " " << cpp_strerror(r) << dendl;
+          if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) {
+            lderr(msgr->cct) << "Processor accept has encountered too many errors; calling ceph_abort()."
<< dendl; + ceph_abort(); + } + continue; + } + } + } + } +} + +void Processor::stop() +{ + ldout(msgr->cct,10) << __func__ << dendl; + + worker->center.submit_to(worker->center.get_id(), [this]() { + for (auto& listen_socket : listen_sockets) { + if (listen_socket) { + worker->center.delete_file_event(listen_socket.fd(), EVENT_READABLE); + listen_socket.abort_accept(); + } + } + }, false); +} + + +struct StackSingleton { + CephContext *cct; + std::shared_ptr<NetworkStack> stack; + + explicit StackSingleton(CephContext *c): cct(c) {} + void ready(std::string &type) { + if (!stack) + stack = NetworkStack::create(cct, type); + } + ~StackSingleton() { + stack->stop(); + } +}; + + +class C_handle_reap : public EventCallback { + AsyncMessenger *msgr; + + public: + explicit C_handle_reap(AsyncMessenger *m): msgr(m) {} + void do_request(uint64_t id) override { + // judge whether is a time event + msgr->reap_dead(); + } +}; + +/******************* + * AsyncMessenger + */ + +AsyncMessenger::AsyncMessenger(CephContext *cct, entity_name_t name, + const std::string &type, string mname, uint64_t _nonce) + : SimplePolicyMessenger(cct, name,mname, _nonce), + dispatch_queue(cct, this, mname), + lock("AsyncMessenger::lock"), + nonce(_nonce), need_addr(true), did_bind(false), + global_seq(0), deleted_lock("AsyncMessenger::deleted_lock"), + cluster_protocol(0), stopped(true) +{ + std::string transport_type = "posix"; + if (type.find("rdma") != std::string::npos) + transport_type = "rdma"; + else if (type.find("dpdk") != std::string::npos) + transport_type = "dpdk"; + + auto single = &cct->lookup_or_create_singleton_object<StackSingleton>( + "AsyncMessenger::NetworkStack::" + transport_type, true, cct); + single->ready(transport_type); + stack = single->stack.get(); + stack->start(); + local_worker = stack->get_worker(); + local_connection = new AsyncConnection(cct, this, &dispatch_queue, + local_worker, true, true); + init_local_connection(); + reap_handler = new C_handle_reap(this); + unsigned processor_num = 1; + if (stack->support_local_listen_table()) + processor_num = stack->get_num_worker(); + for (unsigned i = 0; i < processor_num; ++i) + processors.push_back(new Processor(this, stack->get_worker(i), cct)); +} + +/** + * Destroy the AsyncMessenger. Pretty simple since all the work is done + * elsewhere. + */ +AsyncMessenger::~AsyncMessenger() +{ + delete reap_handler; + ceph_assert(!did_bind); // either we didn't bind or we shut down the Processor + for (auto &&p : processors) + delete p; +} + +void AsyncMessenger::ready() +{ + ldout(cct,10) << __func__ << " " << get_myaddrs() << dendl; + + stack->ready(); + if (pending_bind) { + int err = bindv(pending_bind_addrs); + if (err) { + lderr(cct) << __func__ << " postponed bind failed" << dendl; + ceph_abort(); + } + } + + Mutex::Locker l(lock); + for (auto &&p : processors) + p->start(); + dispatch_queue.start(); +} + +int AsyncMessenger::shutdown() +{ + ldout(cct,10) << __func__ << " " << get_myaddrs() << dendl; + + // done! clean up. + for (auto &&p : processors) + p->stop(); + mark_down_all(); + // break ref cycles on the loopback connection + local_connection->set_priv(NULL); + local_connection->mark_down(); + did_bind = false; + lock.Lock(); + stop_cond.Signal(); + stopped = true; + lock.Unlock(); + stack->drain(); + return 0; +} + +int AsyncMessenger::bind(const entity_addr_t &bind_addr) +{ + ldout(cct,10) << __func__ << " " << bind_addr << dendl; + // old bind() can take entity_addr_t(). 
new bindv() can take a + // 0.0.0.0-like address but needs type and family to be set. + auto a = bind_addr; + if (a == entity_addr_t()) { + a.set_type(entity_addr_t::TYPE_LEGACY); + if (cct->_conf->ms_bind_ipv6) { + a.set_family(AF_INET6); + } else { + a.set_family(AF_INET); + } + } + return bindv(entity_addrvec_t(a)); +} + +int AsyncMessenger::bindv(const entity_addrvec_t &bind_addrs) +{ + lock.Lock(); + + if (!pending_bind && started) { + ldout(cct,10) << __func__ << " already started" << dendl; + lock.Unlock(); + return -1; + } + + ldout(cct,10) << __func__ << " " << bind_addrs << dendl; + + if (!stack->is_ready()) { + ldout(cct, 10) << __func__ << " Network Stack is not ready for bind yet - postponed" << dendl; + pending_bind_addrs = bind_addrs; + pending_bind = true; + lock.Unlock(); + return 0; + } + + lock.Unlock(); + + // bind to a socket + set<int> avoid_ports; + entity_addrvec_t bound_addrs; + unsigned i = 0; + for (auto &&p : processors) { + int r = p->bind(bind_addrs, avoid_ports, &bound_addrs); + if (r) { + // Note: this is related to local tcp listen table problem. + // Posix(default kernel implementation) backend shares listen table + // in the kernel, so all threads can use the same listen table naturally + // and only one thread need to bind. But other backends(like dpdk) uses local + // listen table, we need to bind/listen tcp port for each worker. So if the + // first worker failed to bind, it could be think the normal error then handle + // it, like port is used case. But if the first worker successfully to bind + // but the second worker failed, it's not expected and we need to assert + // here + ceph_assert(i == 0); + return r; + } + ++i; + } + _finish_bind(bind_addrs, bound_addrs); + return 0; +} + +int AsyncMessenger::rebind(const set<int>& avoid_ports) +{ + ldout(cct,1) << __func__ << " rebind avoid " << avoid_ports << dendl; + ceph_assert(did_bind); + + for (auto &&p : processors) + p->stop(); + mark_down_all(); + + // adjust the nonce; we want our entity_addr_t to be truly unique. 
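+  // Illustrative (hypothetical values): an entity previously advertised
+  // as v1:10.0.0.1:6801/42 would, after rebind, come back as something
+  // like v1:10.0.0.1:6803/1000042 -- same IP, a fresh port from the
+  // bind range, and the bumped nonce.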
+ nonce += 1000000; + ldout(cct, 10) << __func__ << " new nonce " << nonce + << " and addr " << get_myaddrs() << dendl; + + entity_addrvec_t bound_addrs; + entity_addrvec_t bind_addrs = get_myaddrs(); + set<int> new_avoid(avoid_ports); + for (auto& a : bind_addrs.v) { + new_avoid.insert(a.get_port()); + a.set_port(0); + } + ldout(cct, 10) << __func__ << " will try " << bind_addrs + << " and avoid ports " << new_avoid << dendl; + unsigned i = 0; + for (auto &&p : processors) { + int r = p->bind(bind_addrs, avoid_ports, &bound_addrs); + if (r) { + ceph_assert(i == 0); + return r; + } + ++i; + } + _finish_bind(bind_addrs, bound_addrs); + for (auto &&p : processors) { + p->start(); + } + return 0; +} + +int AsyncMessenger::client_bind(const entity_addr_t &bind_addr) +{ + if (!cct->_conf->ms_bind_before_connect) + return 0; + Mutex::Locker l(lock); + if (did_bind) { + return 0; + } + if (started) { + ldout(cct, 10) << __func__ << " already started" << dendl; + return -1; + } + ldout(cct, 10) << __func__ << " " << bind_addr << dendl; + + set_myaddrs(entity_addrvec_t(bind_addr)); + return 0; +} + +void AsyncMessenger::_finish_bind(const entity_addrvec_t& bind_addrs, + const entity_addrvec_t& listen_addrs) +{ + set_myaddrs(bind_addrs); + for (auto& a : bind_addrs.v) { + if (!a.is_blank_ip()) { + learned_addr(a); + } + } + + if (get_myaddrs().front().get_port() == 0) { + set_myaddrs(listen_addrs); + } + entity_addrvec_t newaddrs = *my_addrs; + for (auto& a : newaddrs.v) { + a.set_nonce(nonce); + } + set_myaddrs(newaddrs); + + init_local_connection(); + + ldout(cct,1) << __func__ << " bind my_addrs is " << get_myaddrs() << dendl; + did_bind = true; +} + +int AsyncMessenger::start() +{ + lock.Lock(); + ldout(cct,1) << __func__ << " start" << dendl; + + // register at least one entity, first! + ceph_assert(my_name.type() >= 0); + + ceph_assert(!started); + started = true; + stopped = false; + + if (!did_bind) { + entity_addrvec_t newaddrs = *my_addrs; + for (auto& a : newaddrs.v) { + a.nonce = nonce; + } + set_myaddrs(newaddrs); + _init_local_connection(); + } + + lock.Unlock(); + return 0; +} + +void AsyncMessenger::wait() +{ + lock.Lock(); + if (!started) { + lock.Unlock(); + return; + } + if (!stopped) + stop_cond.Wait(lock); + + lock.Unlock(); + + dispatch_queue.shutdown(); + if (dispatch_queue.is_started()) { + ldout(cct, 10) << __func__ << ": waiting for dispatch queue" << dendl; + dispatch_queue.wait(); + dispatch_queue.discard_local(); + ldout(cct, 10) << __func__ << ": dispatch queue is stopped" << dendl; + } + + // close all connections + shutdown_connections(false); + stack->drain(); + + ldout(cct, 10) << __func__ << ": done." << dendl; + ldout(cct, 1) << __func__ << " complete." << dendl; + started = false; +} + +void AsyncMessenger::add_accept(Worker *w, ConnectedSocket cli_socket, + const entity_addr_t &listen_addr, + const entity_addr_t &peer_addr) +{ + lock.Lock(); + AsyncConnectionRef conn = new AsyncConnection(cct, this, &dispatch_queue, w, + listen_addr.is_msgr2(), false); + conn->accept(std::move(cli_socket), listen_addr, peer_addr); + accepting_conns.insert(conn); + lock.Unlock(); +} + +AsyncConnectionRef AsyncMessenger::create_connect( + const entity_addrvec_t& addrs, int type) +{ + ceph_assert(lock.is_locked()); + + ldout(cct, 10) << __func__ << " " << addrs + << ", creating connection and registering" << dendl; + + // here is where we decide which of the addrs to connect to. always prefer + // the first one, if we support it. 
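+  // e.g. for [v2:10.0.0.2:3300/0, v1:10.0.0.2:6789/0] (illustrative
+  // addresses) the loop below picks the v2 addr: the first entry that
+  // is either msgr2 or legacy wins.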
+ entity_addr_t target; + for (auto& a : addrs.v) { + if (!a.is_msgr2() && !a.is_legacy()) { + continue; + } + // FIXME: for ipv4 vs ipv6, check whether local host can handle ipv6 before + // trying it? for now, just pick whichever is listed first. + target = a; + break; + } + + // create connection + Worker *w = stack->get_worker(); + AsyncConnectionRef conn = new AsyncConnection(cct, this, &dispatch_queue, w, + target.is_msgr2(), false); + conn->connect(addrs, type, target); + ceph_assert(!conns.count(addrs)); + ldout(cct, 10) << __func__ << " " << conn << " " << addrs << " " + << *conn->peer_addrs << dendl; + conns[addrs] = conn; + w->get_perf_counter()->inc(l_msgr_active_connections); + + return conn; +} + + +ConnectionRef AsyncMessenger::get_loopback_connection() +{ + return local_connection; +} + +bool AsyncMessenger::should_use_msgr2() +{ + // if we are bound to v1 only, and we are connecting to a v2 peer, + // we cannot use the peer's v2 address. otherwise the connection + // is assymetrical, because they would have to use v1 to connect + // to us, and we would use v2, and connection race detection etc + // would totally break down (among other things). or, the other + // end will be confused that we advertise ourselve with a v1 + // address only (that we bound to) but connected with protocol v2. + return !did_bind || get_myaddrs().has_msgr2(); +} + +entity_addrvec_t AsyncMessenger::_filter_addrs(int type, + const entity_addrvec_t& addrs) +{ + if (!should_use_msgr2()) { + ldout(cct, 10) << __func__ << " " << addrs << " type " << type + << " limiting to v1 ()" << dendl; + entity_addrvec_t r; + for (auto& i : addrs.v) { + if (i.is_msgr2()) { + continue; + } + r.v.push_back(i); + } + return r; + } else { + return addrs; + } +} + +int AsyncMessenger::send_to(Message *m, int type, const entity_addrvec_t& addrs) +{ + Mutex::Locker l(lock); + + FUNCTRACE(cct); + ceph_assert(m); + + if (m->get_type() == CEPH_MSG_OSD_OP) + OID_EVENT_TRACE(((MOSDOp *)m)->get_oid().name.c_str(), "SEND_MSG_OSD_OP"); + else if (m->get_type() == CEPH_MSG_OSD_OPREPLY) + OID_EVENT_TRACE(((MOSDOpReply *)m)->get_oid().name.c_str(), "SEND_MSG_OSD_OP_REPLY"); + + ldout(cct, 1) << __func__ << "--> " << ceph_entity_type_name(type) << " " + << addrs << " -- " << *m << " -- ?+" + << m->get_data().length() << " " << m << dendl; + + if (addrs.empty()) { + ldout(cct,0) << __func__ << " message " << *m + << " with empty dest " << addrs << dendl; + m->put(); + return -EINVAL; + } + + auto av = _filter_addrs(type, addrs); + AsyncConnectionRef conn = _lookup_conn(av); + submit_message(m, conn, av, type); + return 0; +} + +ConnectionRef AsyncMessenger::connect_to(int type, const entity_addrvec_t& addrs) +{ + Mutex::Locker l(lock); + if (*my_addrs == addrs || + (addrs.v.size() == 1 && + my_addrs->contains(addrs.front()))) { + // local + return local_connection; + } + + auto av = _filter_addrs(type, addrs); + + AsyncConnectionRef conn = _lookup_conn(av); + if (conn) { + ldout(cct, 10) << __func__ << " " << av << " existing " << conn << dendl; + } else { + conn = create_connect(av, type); + ldout(cct, 10) << __func__ << " " << av << " new " << conn << dendl; + } + + return conn; +} + +void AsyncMessenger::submit_message(Message *m, AsyncConnectionRef con, + const entity_addrvec_t& dest_addrs, + int dest_type) +{ + if (cct->_conf->ms_dump_on_send) { + m->encode(-1, MSG_CRC_ALL); + ldout(cct, 0) << __func__ << " submit_message " << *m << "\n"; + m->get_payload().hexdump(*_dout); + if (m->get_data().length() > 0) { + *_dout << " 
data:\n"; + m->get_data().hexdump(*_dout); + } + *_dout << dendl; + m->clear_payload(); + } + + // existing connection? + if (con) { + con->send_message(m); + return ; + } + + // local? + if (*my_addrs == dest_addrs || + (dest_addrs.v.size() == 1 && + my_addrs->contains(dest_addrs.front()))) { + // local + local_connection->send_message(m); + return ; + } + + // remote, no existing connection. + const Policy& policy = get_policy(dest_type); + if (policy.server) { + ldout(cct, 20) << __func__ << " " << *m << " remote, " << dest_addrs + << ", lossy server for target type " + << ceph_entity_type_name(dest_type) << ", no session, dropping." << dendl; + m->put(); + } else { + ldout(cct,20) << __func__ << " " << *m << " remote, " << dest_addrs + << ", new connection." << dendl; + con = create_connect(dest_addrs, dest_type); + con->send_message(m); + } +} + +/** + * If my_addr doesn't have an IP set, this function + * will fill it in from the passed addr. Otherwise it does nothing and returns. + */ +bool AsyncMessenger::set_addr_unknowns(const entity_addrvec_t &addrs) +{ + ldout(cct,1) << __func__ << " " << addrs << dendl; + bool ret = false; + Mutex::Locker l(lock); + + entity_addrvec_t newaddrs = *my_addrs; + for (auto& a : newaddrs.v) { + if (a.is_blank_ip()) { + int type = a.get_type(); + int port = a.get_port(); + uint32_t nonce = a.get_nonce(); + for (auto& b : addrs.v) { + if (a.get_family() == b.get_family()) { + ldout(cct,1) << __func__ << " assuming my addr " << a + << " matches provided addr " << b << dendl; + a = b; + a.set_nonce(nonce); + a.set_type(type); + a.set_port(port); + ret = true; + break; + } + } + } + } + set_myaddrs(newaddrs); + if (ret) { + _init_local_connection(); + } + ldout(cct,1) << __func__ << " now " << *my_addrs << dendl; + return ret; +} + +void AsyncMessenger::set_addrs(const entity_addrvec_t &addrs) +{ + Mutex::Locker l(lock); + auto t = addrs; + for (auto& a : t.v) { + a.set_nonce(nonce); + } + set_myaddrs(t); + _init_local_connection(); +} + +void AsyncMessenger::shutdown_connections(bool queue_reset) +{ + ldout(cct,1) << __func__ << " " << dendl; + lock.Lock(); + for (set<AsyncConnectionRef>::iterator q = accepting_conns.begin(); + q != accepting_conns.end(); ++q) { + AsyncConnectionRef p = *q; + ldout(cct, 5) << __func__ << " accepting_conn " << p.get() << dendl; + p->stop(queue_reset); + } + accepting_conns.clear(); + + while (!conns.empty()) { + auto it = conns.begin(); + AsyncConnectionRef p = it->second; + ldout(cct, 5) << __func__ << " mark down " << it->first << " " << p << dendl; + conns.erase(it); + p->get_perf_counter()->dec(l_msgr_active_connections); + p->stop(queue_reset); + } + + { + Mutex::Locker l(deleted_lock); + while (!deleted_conns.empty()) { + set<AsyncConnectionRef>::iterator it = deleted_conns.begin(); + AsyncConnectionRef p = *it; + ldout(cct, 5) << __func__ << " delete " << p << dendl; + deleted_conns.erase(it); + } + } + lock.Unlock(); +} + +void AsyncMessenger::mark_down_addrs(const entity_addrvec_t& addrs) +{ + lock.Lock(); + AsyncConnectionRef p = _lookup_conn(addrs); + if (p) { + ldout(cct, 1) << __func__ << " " << addrs << " -- " << p << dendl; + p->stop(true); + } else { + ldout(cct, 1) << __func__ << " " << addrs << " -- connection dne" << dendl; + } + lock.Unlock(); +} + +int AsyncMessenger::get_proto_version(int peer_type, bool connect) const +{ + int my_type = my_name.type(); + + // set reply protocol version + if (peer_type == my_type) { + // internal + return cluster_protocol; + } else { + // public + switch (connect ? 
peer_type : my_type) { + case CEPH_ENTITY_TYPE_OSD: return CEPH_OSDC_PROTOCOL; + case CEPH_ENTITY_TYPE_MDS: return CEPH_MDSC_PROTOCOL; + case CEPH_ENTITY_TYPE_MON: return CEPH_MONC_PROTOCOL; + } + } + return 0; +} + +int AsyncMessenger::accept_conn(AsyncConnectionRef conn) +{ + Mutex::Locker l(lock); + auto it = conns.find(*conn->peer_addrs); + if (it != conns.end()) { + AsyncConnectionRef existing = it->second; + + // lazy delete, see "deleted_conns" + // If conn already in, we will return 0 + Mutex::Locker l(deleted_lock); + if (deleted_conns.erase(existing)) { + conns.erase(it); + } else if (conn != existing) { + return -1; + } + } + ldout(cct, 10) << __func__ << " " << conn << " " << *conn->peer_addrs << dendl; + conns[*conn->peer_addrs] = conn; + conn->get_perf_counter()->inc(l_msgr_active_connections); + accepting_conns.erase(conn); + return 0; +} + + +bool AsyncMessenger::learned_addr(const entity_addr_t &peer_addr_for_me) +{ + // be careful here: multiple threads may block here, and readers of + // my_addr do NOT hold any lock. + + // this always goes from true -> false under the protection of the + // mutex. if it is already false, we need not retake the mutex at + // all. + if (!need_addr) + return false; + std::lock_guard l(lock); + if (need_addr) { + if (my_addrs->empty()) { + auto a = peer_addr_for_me; + a.set_type(entity_addr_t::TYPE_ANY); + a.set_nonce(nonce); + if (!did_bind) { + a.set_port(0); + } + set_myaddrs(entity_addrvec_t(a)); + ldout(cct,10) << __func__ << " had no addrs" << dendl; + } else { + // fix all addrs of the same family, regardless of type (msgr2 vs legacy) + entity_addrvec_t newaddrs = *my_addrs; + for (auto& a : newaddrs.v) { + if (a.is_blank_ip() && + a.get_family() == peer_addr_for_me.get_family()) { + entity_addr_t t = peer_addr_for_me; + if (!did_bind) { + t.set_type(entity_addr_t::TYPE_ANY); + t.set_port(0); + } else { + t.set_type(a.get_type()); + t.set_port(a.get_port()); + } + t.set_nonce(a.get_nonce()); + ldout(cct,10) << __func__ << " " << a << " -> " << t << dendl; + a = t; + } + } + set_myaddrs(newaddrs); + } + ldout(cct, 1) << __func__ << " learned my addr " << *my_addrs + << " (peer_addr_for_me " << peer_addr_for_me << ")" << dendl; + _init_local_connection(); + need_addr = false; + return true; + } + return false; +} + +int AsyncMessenger::reap_dead() +{ + ldout(cct, 1) << __func__ << " start" << dendl; + int num = 0; + + Mutex::Locker l1(lock); + Mutex::Locker l2(deleted_lock); + + while (!deleted_conns.empty()) { + auto it = deleted_conns.begin(); + AsyncConnectionRef p = *it; + ldout(cct, 5) << __func__ << " delete " << p << dendl; + auto conns_it = conns.find(*p->peer_addrs); + if (conns_it != conns.end() && conns_it->second == p) + conns.erase(conns_it); + accepting_conns.erase(p); + deleted_conns.erase(it); + ++num; + } + + return num; +} diff --git a/src/msg/async/AsyncMessenger.h b/src/msg/async/AsyncMessenger.h new file mode 100644 index 00000000..98bf9d52 --- /dev/null +++ b/src/msg/async/AsyncMessenger.h @@ -0,0 +1,426 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_ASYNCMESSENGER_H +#define CEPH_ASYNCMESSENGER_H + +#include <map> +#include <mutex> + +#include "include/types.h" +#include "include/xlist.h" +#include "include/spinlock.h" +#include "include/unordered_map.h" +#include "include/unordered_set.h" + +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/Thread.h" + +#include "msg/SimplePolicyMessenger.h" +#include "msg/DispatchQueue.h" +#include "AsyncConnection.h" +#include "Event.h" + +#include "include/ceph_assert.h" + +class AsyncMessenger; + +/** + * If the Messenger binds to a specific address, the Processor runs + * and listens for incoming connections. + */ +class Processor { + AsyncMessenger *msgr; + NetHandler net; + Worker *worker; + vector<ServerSocket> listen_sockets; + EventCallbackRef listen_handler; + + class C_processor_accept; + + public: + Processor(AsyncMessenger *r, Worker *w, CephContext *c); + ~Processor() { delete listen_handler; }; + + void stop(); + int bind(const entity_addrvec_t &bind_addrs, + const set<int>& avoid_ports, + entity_addrvec_t* bound_addrs); + void start(); + void accept(); +}; + +/* + * AsyncMessenger is represented for maintaining a set of asynchronous connections, + * it may own a bind address and the accepted connections will be managed by + * AsyncMessenger. + * + */ + +class AsyncMessenger : public SimplePolicyMessenger { + // First we have the public Messenger interface implementation... +public: + /** + * Initialize the AsyncMessenger! + * + * @param cct The CephContext to use + * @param name The name to assign ourselves + * _nonce A unique ID to use for this AsyncMessenger. It should not + * be a value that will be repeated if the daemon restarts. + */ + AsyncMessenger(CephContext *cct, entity_name_t name, const std::string &type, + string mname, uint64_t _nonce); + + /** + * Destroy the AsyncMessenger. Pretty simple since all the work is done + * elsewhere. 
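+   * (It only asserts that we either never bound or already shut down
+   * the Processors, i.e. that did_bind is false.)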
+ */ + ~AsyncMessenger() override; + + /** @defgroup Accessors + * @{ + */ + bool set_addr_unknowns(const entity_addrvec_t &addr) override; + void set_addrs(const entity_addrvec_t &addrs) override; + + int get_dispatch_queue_len() override { + return dispatch_queue.get_queue_len(); + } + + double get_dispatch_queue_max_age(utime_t now) override { + return dispatch_queue.get_max_age(now); + } + /** @} Accessors */ + + /** + * @defgroup Configuration functions + * @{ + */ + void set_cluster_protocol(int p) override { + ceph_assert(!started && !did_bind); + cluster_protocol = p; + } + + int bind(const entity_addr_t& bind_addr) override; + int rebind(const set<int>& avoid_ports) override; + int client_bind(const entity_addr_t& bind_addr) override; + + int bindv(const entity_addrvec_t& bind_addrs) override; + + bool should_use_msgr2() override; + + /** @} Configuration functions */ + + /** + * @defgroup Startup/Shutdown + * @{ + */ + int start() override; + void wait() override; + int shutdown() override; + + /** @} // Startup/Shutdown */ + + /** + * @defgroup Messaging + * @{ + */ + int send_to(Message *m, int type, const entity_addrvec_t& addrs) override; + + /** @} // Messaging */ + + /** + * @defgroup Connection Management + * @{ + */ + ConnectionRef connect_to(int type, + const entity_addrvec_t& addrs) override; + ConnectionRef get_loopback_connection() override; + void mark_down(const entity_addr_t& addr) override { + mark_down_addrs(entity_addrvec_t(addr)); + } + void mark_down_addrs(const entity_addrvec_t& addrs) override; + void mark_down_all() override { + shutdown_connections(true); + } + /** @} // Connection Management */ + + /** + * @defgroup Inner classes + * @{ + */ + + /** + * @} // Inner classes + */ + +protected: + /** + * @defgroup Messenger Interfaces + * @{ + */ + /** + * Start up the DispatchQueue thread once we have somebody to dispatch to. + */ + void ready() override; + /** @} // Messenger Interfaces */ + +private: + + /** + * @defgroup Utility functions + * @{ + */ + + /** + * Create a connection associated with the given entity (of the given type). + * Initiate the connection. (This function returning does not guarantee + * connection success.) + * + * @param addrs The address(es) of the entity to connect to. + * @param type The peer type of the entity at the address. + * + * @return a pointer to the newly-created connection. Caller does not own a + * reference; take one if you need it. + */ + AsyncConnectionRef create_connect(const entity_addrvec_t& addrs, int type); + + /** + * Queue up a Message for delivery to the entity specified + * by addr and dest_type. + * submit_message() is responsible for creating + * new AsyncConnection (and closing old ones) as necessary. + * + * @param m The Message to queue up. This function eats a reference. + * @param con The existing Connection to use, or NULL if you don't know of one. + * @param dest_addr The address to send the Message to. + * @param dest_type The peer type of the address we're sending to + * just drop silently under failure. 
+   */
+  void submit_message(Message *m, AsyncConnectionRef con,
+                      const entity_addrvec_t& dest_addrs, int dest_type);
+
+  void _finish_bind(const entity_addrvec_t& bind_addrs,
+                    const entity_addrvec_t& listen_addrs);
+
+  entity_addrvec_t _filter_addrs(int type,
+                                 const entity_addrvec_t& addrs);
+
+ private:
+  static const uint64_t ReapDeadConnectionThreshold = 5;
+
+  NetworkStack *stack;
+  std::vector<Processor*> processors;
+  friend class Processor;
+  DispatchQueue dispatch_queue;
+
+  // the worker that runs the messenger's cron jobs
+  Worker *local_worker;
+
+  std::string ms_type;
+
+  /// overall lock used for AsyncMessenger data structures
+  Mutex lock;
+  // AsyncMessenger stuff
+  /// approximately unique ID set by the constructor for use in entity_addr_t
+  uint64_t nonce;
+
+  /// true while we have not yet learned our addr; set to false once we find it.
+  // maybe this should be protected by the lock?
+  bool need_addr;
+
+  /**
+   * set to the bind addresses if bind was called before the NetworkStack was
+   * ready to bind
+   */
+  entity_addrvec_t pending_bind_addrs;
+
+  /**
+   * initially false; set to true if a pending bind exists
+   */
+  bool pending_bind = false;
+
+  /**
+   * The following aren't lock-protected since you shouldn't be able to race
+   * the only writers.
+   */
+
+  /**
+   * initially false; set to true when the AsyncMessenger binds to a specific
+   * address, and set back to false by Accepter::stop().
+   */
+  bool did_bind;
+  /// counter for the global seq our connection protocol uses
+  __u32 global_seq;
+  /// lock to protect the global_seq
+  ceph::spinlock global_seq_lock;
+
+  /**
+   * hash map of addresses to AsyncConnection
+   *
+   * NOTE: an AsyncConnection* with state CLOSED may still be in the map but
+   * is considered invalid and can be replaced by anyone holding the msgr lock
+   */
+  ceph::unordered_map<entity_addrvec_t, AsyncConnectionRef> conns;
+
+  /**
+   * set of connections that are in the process of being accepted
+   *
+   * These are not yet in the conns map.
+   */
+  set<AsyncConnectionRef> accepting_conns;
+
+  /**
+   * set of closed connections that still need to be cleaned up
+   *
+   * AsyncMessenger and AsyncConnection follow a locking rule: we may take
+   * AsyncMessenger::lock first and then AsyncConnection::lock, but never in
+   * the reverse order. The rule exists to avoid deadlock. So when an
+   * AsyncConnection wants to unregister itself from the AsyncMessenger, it
+   * just queues itself into this set and is lazily deleted later.
+   * "_lookup_conn" must make sure it never returns an AsyncConnection that
+   * is in this set.
+   */
+  Mutex deleted_lock;
+  set<AsyncConnectionRef> deleted_conns;
+
+  EventCallbackRef reap_handler;
+
+  /// internal cluster protocol version, if any, for talking to entities of the same type.
+ int cluster_protocol; + + Cond stop_cond; + bool stopped; + + AsyncConnectionRef _lookup_conn(const entity_addrvec_t& k) { + ceph_assert(lock.is_locked()); + auto p = conns.find(k); + if (p == conns.end()) + return NULL; + + // lazy delete, see "deleted_conns" + Mutex::Locker l(deleted_lock); + if (deleted_conns.erase(p->second)) { + conns.erase(p); + return NULL; + } + + return p->second; + } + + void _init_local_connection() { + ceph_assert(lock.is_locked()); + local_connection->peer_addrs = *my_addrs; + local_connection->peer_type = my_name.type(); + local_connection->set_features(CEPH_FEATURES_ALL); + ms_deliver_handle_fast_connect(local_connection.get()); + } + + void shutdown_connections(bool queue_reset); + +public: + + /// con used for sending messages to ourselves + AsyncConnectionRef local_connection; + + /** + * @defgroup AsyncMessenger internals + * @{ + */ + /** + * This wraps _lookup_conn. + */ + AsyncConnectionRef lookup_conn(const entity_addrvec_t& k) { + Mutex::Locker l(lock); + return _lookup_conn(k); + } + + int accept_conn(AsyncConnectionRef conn); + bool learned_addr(const entity_addr_t &peer_addr_for_me); + void add_accept(Worker *w, ConnectedSocket cli_socket, + const entity_addr_t &listen_addr, + const entity_addr_t &peer_addr); + NetworkStack *get_stack() { + return stack; + } + + uint64_t get_nonce() const { + return nonce; + } + + /** + * Increment the global sequence for this AsyncMessenger and return it. + * This is for the connect protocol, although it doesn't hurt if somebody + * else calls it. + * + * @return a global sequence ID that nobody else has seen. + */ + __u32 get_global_seq(__u32 old=0) { + std::lock_guard<ceph::spinlock> lg(global_seq_lock); + + if (old > global_seq) + global_seq = old; + __u32 ret = ++global_seq; + + return ret; + } + /** + * Get the protocol version we support for the given peer type: either + * a peer protocol (if it matches our own), the protocol version for the + * peer (if we're connecting), or our protocol version (if we're accepting). + */ + int get_proto_version(int peer_type, bool connect) const; + + /** + * Fill in the address and peer type for the local connection, which + * is used for delivering messages back to ourself. 
+ */ + void init_local_connection() { + Mutex::Locker l(lock); + _init_local_connection(); + } + + /** + * Unregister connection from `conns` + * + * See "deleted_conns" + */ + void unregister_conn(AsyncConnectionRef conn) { + Mutex::Locker l(deleted_lock); + conn->get_perf_counter()->dec(l_msgr_active_connections); + deleted_conns.emplace(std::move(conn)); + + if (deleted_conns.size() >= ReapDeadConnectionThreshold) { + local_worker->center.dispatch_event_external(reap_handler); + } + } + + /** + * Reap dead connection from `deleted_conns` + * + * @return the number of dead connections + * + * See "deleted_conns" + */ + int reap_dead(); + + /** + * @} // AsyncMessenger Internals + */ +} ; + +#endif /* CEPH_ASYNCMESSENGER_H */ diff --git a/src/msg/async/Event.cc b/src/msg/async/Event.cc new file mode 100644 index 00000000..6b5e4c7c --- /dev/null +++ b/src/msg/async/Event.cc @@ -0,0 +1,471 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/compat.h" +#include "common/errno.h" +#include "Event.h" + +#ifdef HAVE_DPDK +#include "dpdk/EventDPDK.h" +#endif + +#ifdef HAVE_EPOLL +#include "EventEpoll.h" +#else +#ifdef HAVE_KQUEUE +#include "EventKqueue.h" +#else +#include "EventSelect.h" +#endif +#endif + +#define dout_subsys ceph_subsys_ms + +#undef dout_prefix +#define dout_prefix *_dout << "EventCallback " +class C_handle_notify : public EventCallback { + EventCenter *center; + CephContext *cct; + + public: + C_handle_notify(EventCenter *c, CephContext *cc): center(c), cct(cc) {} + void do_request(uint64_t fd_or_id) override { + char c[256]; + int r = 0; + do { + r = read(fd_or_id, c, sizeof(c)); + if (r < 0) { + if (errno != EAGAIN) + ldout(cct, 1) << __func__ << " read notify pipe failed: " << cpp_strerror(errno) << dendl; + } + } while (r > 0); + } +}; + +#undef dout_prefix +#define dout_prefix _event_prefix(_dout) + +/** + * Construct a Poller. + * + * \param center + * EventCenter object through which the poller will be invoked (defaults + * to the global #RAMCloud::center object). + * \param pollerName + * Human readable name that can be printed out in debugging messages + * about the poller. The name of the superclass is probably sufficient + * for most cases. + */ +EventCenter::Poller::Poller(EventCenter* center, const string& name) + : owner(center), poller_name(name), slot(owner->pollers.size()) +{ + owner->pollers.push_back(this); +} + +/** + * Destroy a Poller. + */ +EventCenter::Poller::~Poller() +{ + // Erase this Poller from the vector by overwriting it with the + // poller that used to be the last one in the vector. + // + // Note: this approach is reentrant (it is safe to delete a + // poller from a poller callback, which means that the poll + // method is in the middle of scanning the list of all pollers; + // the worst that will happen is that the poller that got moved + // may not be invoked in the current scan). 
+  owner->pollers[slot] = owner->pollers.back();
+  owner->pollers[slot]->slot = slot;
+  owner->pollers.pop_back();
+  slot = -1;
+}
+
+ostream& EventCenter::_event_prefix(std::ostream *_dout)
+{
+  return *_dout << "Event(" << this << " nevent=" << nevent
+                << " time_id=" << time_event_next_id << ").";
+}
+
+int EventCenter::init(int n, unsigned i, const std::string &t)
+{
+  // must not be initialized more than once
+  ceph_assert(nevent == 0);
+
+  type = t;
+  idx = i;
+
+  if (t == "dpdk") {
+#ifdef HAVE_DPDK
+    driver = new DPDKDriver(cct);
+#endif
+  } else {
+#ifdef HAVE_EPOLL
+    driver = new EpollDriver(cct);
+#else
+#ifdef HAVE_KQUEUE
+    driver = new KqueueDriver(cct);
+#else
+    driver = new SelectDriver(cct);
+#endif
+#endif
+  }
+
+  if (!driver) {
+    lderr(cct) << __func__ << " failed to create event driver" << dendl;
+    return -1;
+  }
+
+  int r = driver->init(this, n);
+  if (r < 0) {
+    lderr(cct) << __func__ << " failed to init event driver." << dendl;
+    return r;
+  }
+
+  file_events.resize(n);
+  nevent = n;
+
+  if (!driver->need_wakeup())
+    return 0;
+
+  int fds[2];
+  if (pipe_cloexec(fds) < 0) {
+    int e = errno;
+    lderr(cct) << __func__ << " can't create notify pipe: " << cpp_strerror(e) << dendl;
+    return -e;
+  }
+
+  notify_receive_fd = fds[0];
+  notify_send_fd = fds[1];
+  r = net.set_nonblock(notify_receive_fd);
+  if (r < 0) {
+    return r;
+  }
+  r = net.set_nonblock(notify_send_fd);
+  if (r < 0) {
+    return r;
+  }
+
+  return r;
+}
+
+EventCenter::~EventCenter()
+{
+  {
+    std::lock_guard<std::mutex> l(external_lock);
+    while (!external_events.empty()) {
+      EventCallbackRef e = external_events.front();
+      if (e)
+        e->do_request(0);
+      external_events.pop_front();
+    }
+  }
+  time_events.clear();
+  //assert(time_events.empty());
+
+  if (notify_receive_fd >= 0)
+    ::close(notify_receive_fd);
+  if (notify_send_fd >= 0)
+    ::close(notify_send_fd);
+
+  delete driver;
+  if (notify_handler)
+    delete notify_handler;
+}
+
+
+void EventCenter::set_owner()
+{
+  owner = pthread_self();
+  ldout(cct, 2) << __func__ << " idx=" << idx << " owner=" << owner << dendl;
+  if (!global_centers) {
+    global_centers = &cct->lookup_or_create_singleton_object<
+      EventCenter::AssociatedCenters>(
+        "AsyncMessenger::EventCenter::global_center::" + type, true);
+    ceph_assert(global_centers);
+    global_centers->centers[idx] = this;
+    if (driver->need_wakeup()) {
+      notify_handler = new C_handle_notify(this, cct);
+      int r = create_file_event(notify_receive_fd, EVENT_READABLE, notify_handler);
+      ceph_assert(r == 0);
+    }
+  }
+}
+
+int EventCenter::create_file_event(int fd, int mask, EventCallbackRef ctxt)
+{
+  ceph_assert(in_thread());
+  int r = 0;
+  if (fd >= nevent) {
+    int new_size = nevent << 2;
+    while (fd >= new_size)
+      new_size <<= 2;
+    ldout(cct, 20) << __func__ << " event count exceeds " << nevent << ", expanding to " << new_size << dendl;
+    r = driver->resize_events(new_size);
+    if (r < 0) {
+      lderr(cct) << __func__ << " failed to resize events to " << new_size << dendl;
+      return -ERANGE;
+    }
+    file_events.resize(new_size);
+    nevent = new_size;
+  }
+
+  EventCenter::FileEvent *event = _get_file_event(fd);
+  ldout(cct, 20) << __func__ << " create event started fd=" << fd << " mask=" << mask
+                 << " original mask is " << event->mask << dendl;
+  if (event->mask == mask)
+    return 0;
+
+  r = driver->add_event(fd, event->mask, mask);
+  if (r < 0) {
+    // We don't allow any error here because the caller is not prepared to
+    // handle an error status. So we need to assert failure here.
+    // In practice, add_event shouldn't report an error; if it does, it must
+    // be an internal bug!
+    lderr(cct) << __func__ << " add event failed, ret=" << r << " fd=" << fd
+               << " mask=" << mask << " original mask is " << event->mask << dendl;
+    ceph_abort_msg("BUG!");
+    return r;
+  }
+
+  event->mask |= mask;
+  if (mask & EVENT_READABLE) {
+    event->read_cb = ctxt;
+  }
+  if (mask & EVENT_WRITABLE) {
+    event->write_cb = ctxt;
+  }
+  ldout(cct, 20) << __func__ << " create event end fd=" << fd << " mask=" << mask
+                 << " original mask is " << event->mask << dendl;
+  return 0;
+}
+
+void EventCenter::delete_file_event(int fd, int mask)
+{
+  ceph_assert(in_thread() && fd >= 0);
+  if (fd >= nevent) {
+    ldout(cct, 1) << __func__ << " delete event fd=" << fd
+                  << " is greater than or equal to nevent=" << nevent
+                  << " mask=" << mask << dendl;
+    return;
+  }
+  EventCenter::FileEvent *event = _get_file_event(fd);
+  ldout(cct, 30) << __func__ << " delete event started fd=" << fd << " mask=" << mask
+                 << " original mask is " << event->mask << dendl;
+  if (!event->mask)
+    return;
+
+  int r = driver->del_event(fd, event->mask, mask);
+  if (r < 0) {
+    // see create_file_event
+    ceph_abort_msg("BUG!");
+  }
+
+  if (mask & EVENT_READABLE && event->read_cb) {
+    event->read_cb = nullptr;
+  }
+  if (mask & EVENT_WRITABLE && event->write_cb) {
+    event->write_cb = nullptr;
+  }
+
+  event->mask = event->mask & (~mask);
+  ldout(cct, 30) << __func__ << " delete event end fd=" << fd << " mask=" << mask
+                 << " original mask is " << event->mask << dendl;
+}
+
+uint64_t EventCenter::create_time_event(uint64_t microseconds, EventCallbackRef ctxt)
+{
+  ceph_assert(in_thread());
+  uint64_t id = time_event_next_id++;
+
+  ldout(cct, 30) << __func__ << " id=" << id << " trigger after " << microseconds << "us" << dendl;
+  EventCenter::TimeEvent event;
+  clock_type::time_point expire = clock_type::now() + std::chrono::microseconds(microseconds);
+  event.id = id;
+  event.time_cb = ctxt;
+  std::multimap<clock_type::time_point, TimeEvent>::value_type s_val(expire, event);
+  auto it = time_events.insert(std::move(s_val));
+  event_map[id] = it;
+
+  return id;
+}
+
+void EventCenter::delete_time_event(uint64_t id)
+{
+  ceph_assert(in_thread());
+  ldout(cct, 30) << __func__ << " id=" << id << dendl;
+  if (id >= time_event_next_id || id == 0)
+    return;
+
+  auto it = event_map.find(id);
+  if (it == event_map.end()) {
+    ldout(cct, 10) << __func__ << " id=" << id << " not found" << dendl;
+    return;
+  }
+
+  time_events.erase(it->second);
+  event_map.erase(it);
+}
+
+void EventCenter::wakeup()
+{
+  // No need to wake up since we never sleep
+  if (!pollers.empty() || !driver->need_wakeup())
+    return;
+
+  ldout(cct, 20) << __func__ << dendl;
+  char buf = 'c';
+  // wake up "event_wait"
+  int n = write(notify_send_fd, &buf, sizeof(buf));
+  if (n < 0) {
+    if (errno != EAGAIN) {
+      ldout(cct, 1) << __func__ << " write notify pipe failed: " << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+  }
+}
+
+int EventCenter::process_time_events()
+{
+  int processed = 0;
+  clock_type::time_point now = clock_type::now();
+  ldout(cct, 30) << __func__ << " cur time is " << now << dendl;
+
+  while (!time_events.empty()) {
+    auto it = time_events.begin();
+    if (now >= it->first) {
+      TimeEvent &e = it->second;
+      EventCallbackRef cb = e.time_cb;
+      uint64_t id = e.id;
+      time_events.erase(it);
+      event_map.erase(id);
+      ldout(cct, 30) << __func__ << " process time event: id=" << id << dendl;
+      processed++;
+      cb->do_request(id);
+    } else {
+      break;
+    }
+  }
+
+  return processed;
+}
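// A minimal usage sketch of the time-event API above (an editorial
// illustration, not part of the upstream file). It relies only on
// EventCenter::create_time_event() and EventCallback as declared in Event.h;
// the C_tick class, the example_arm_tick() helper and the 100ms period are
// hypothetical names used for illustration.
class C_tick : public EventCallback {
  EventCenter *center;
 public:
  explicit C_tick(EventCenter *c) : center(c) {}
  void do_request(uint64_t id) override {
    // time events are one-shot: process_time_events() erases the event
    // before invoking the callback, so re-arm here for periodic behavior
    center->create_time_event(100 * 1000 /* us */, this);
  }
};

void example_arm_tick(EventCenter *center) {
  // create_time_event() asserts in_thread(): call from the owning thread,
  // e.g. from within another event callback
  center->create_time_event(100 * 1000, new C_tick(center));
}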
+ +int EventCenter::process_events(unsigned timeout_microseconds, ceph::timespan *working_dur) +{ + struct timeval tv; + int numevents; + bool trigger_time = false; + auto now = clock_type::now(); + + auto it = time_events.begin(); + bool blocking = pollers.empty() && !external_num_events.load(); + // If exists external events or poller, don't block + if (!blocking) { + if (it != time_events.end() && now >= it->first) + trigger_time = true; + tv.tv_sec = 0; + tv.tv_usec = 0; + } else { + clock_type::time_point shortest; + shortest = now + std::chrono::microseconds(timeout_microseconds); + + if (it != time_events.end() && shortest >= it->first) { + ldout(cct, 30) << __func__ << " shortest is " << shortest << " it->first is " << it->first << dendl; + shortest = it->first; + trigger_time = true; + if (shortest > now) { + timeout_microseconds = std::chrono::duration_cast<std::chrono::microseconds>( + shortest - now).count(); + } else { + shortest = now; + timeout_microseconds = 0; + } + } + tv.tv_sec = timeout_microseconds / 1000000; + tv.tv_usec = timeout_microseconds % 1000000; + } + + ldout(cct, 30) << __func__ << " wait second " << tv.tv_sec << " usec " << tv.tv_usec << dendl; + vector<FiredFileEvent> fired_events; + numevents = driver->event_wait(fired_events, &tv); + auto working_start = ceph::mono_clock::now(); + for (int j = 0; j < numevents; j++) { + int rfired = 0; + FileEvent *event; + EventCallbackRef cb; + event = _get_file_event(fired_events[j].fd); + + /* note the event->mask & mask & ... code: maybe an already processed + * event removed an element that fired and we still didn't + * processed, so we check if the event is still valid. */ + if (event->mask & fired_events[j].mask & EVENT_READABLE) { + rfired = 1; + cb = event->read_cb; + cb->do_request(fired_events[j].fd); + } + + if (event->mask & fired_events[j].mask & EVENT_WRITABLE) { + if (!rfired || event->read_cb != event->write_cb) { + cb = event->write_cb; + cb->do_request(fired_events[j].fd); + } + } + + ldout(cct, 30) << __func__ << " event_wq process is " << fired_events[j].fd << " mask is " << fired_events[j].mask << dendl; + } + + if (trigger_time) + numevents += process_time_events(); + + if (external_num_events.load()) { + external_lock.lock(); + deque<EventCallbackRef> cur_process; + cur_process.swap(external_events); + external_num_events.store(0); + external_lock.unlock(); + numevents += cur_process.size(); + while (!cur_process.empty()) { + EventCallbackRef e = cur_process.front(); + ldout(cct, 30) << __func__ << " do " << e << dendl; + e->do_request(0); + cur_process.pop_front(); + } + } + + if (!numevents && !blocking) { + for (uint32_t i = 0; i < pollers.size(); i++) + numevents += pollers[i]->poll(); + } + + if (working_dur) + *working_dur = ceph::mono_clock::now() - working_start; + return numevents; +} + +void EventCenter::dispatch_event_external(EventCallbackRef e) +{ + uint64_t num = 0; + { + std::lock_guard lock{external_lock}; + if (external_num_events > 0 && *external_events.rbegin() == e) { + return; + } + external_events.push_back(e); + num = ++external_num_events; + } + if (num == 1 && !in_thread()) + wakeup(); + + ldout(cct, 30) << __func__ << " " << e << " pending " << num << dendl; +} diff --git a/src/msg/async/Event.h b/src/msg/async/Event.h new file mode 100644 index 00000000..6736060e --- /dev/null +++ b/src/msg/async/Event.h @@ -0,0 +1,266 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + 
* + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MSG_EVENT_H +#define CEPH_MSG_EVENT_H + +#ifdef __APPLE__ +#include <AvailabilityMacros.h> +#endif + +// We use epoll, kqueue, evport, select in descending order by performance. +#if defined(__linux__) +#define HAVE_EPOLL 1 +#endif + +#if (defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined (__NetBSD__) +#define HAVE_KQUEUE 1 +#endif + +#ifdef __sun +#include <sys/feature_tests.h> +#ifdef _DTRACE_VERSION +#define HAVE_EVPORT 1 +#endif +#endif + +#include <atomic> +#include <mutex> +#include <condition_variable> + +#include "common/ceph_time.h" +#include "common/dout.h" +#include "net_handler.h" + +#define EVENT_NONE 0 +#define EVENT_READABLE 1 +#define EVENT_WRITABLE 2 + +class EventCenter; + +class EventCallback { + + public: + virtual void do_request(uint64_t fd_or_id) = 0; + virtual ~EventCallback() {} // we want a virtual destructor!!! +}; + +typedef EventCallback* EventCallbackRef; + +struct FiredFileEvent { + int fd; + int mask; +}; + +/* + * EventDriver is a wrap of event mechanisms depends on different OS. + * For example, Linux will use epoll(2), BSD will use kqueue(2) and select will + * be used for worst condition. + */ +class EventDriver { + public: + virtual ~EventDriver() {} // we want a virtual destructor!!! + virtual int init(EventCenter *center, int nevent) = 0; + virtual int add_event(int fd, int cur_mask, int mask) = 0; + virtual int del_event(int fd, int cur_mask, int del_mask) = 0; + virtual int event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tp) = 0; + virtual int resize_events(int newsize) = 0; + virtual bool need_wakeup() { return true; } +}; + +/* + * EventCenter maintain a set of file descriptor and handle registered events. + */ +class EventCenter { + public: + // should be enough; + static const int MAX_EVENTCENTER = 24; + + private: + using clock_type = ceph::coarse_mono_clock; + + struct AssociatedCenters { + EventCenter *centers[MAX_EVENTCENTER]; + AssociatedCenters() { + // FIPS zeroization audit 20191115: this memset is not security related. + memset(centers, 0, MAX_EVENTCENTER * sizeof(EventCenter*)); + } + }; + + struct FileEvent { + int mask; + EventCallbackRef read_cb; + EventCallbackRef write_cb; + FileEvent(): mask(0), read_cb(NULL), write_cb(NULL) {} + }; + + struct TimeEvent { + uint64_t id; + EventCallbackRef time_cb; + + TimeEvent(): id(0), time_cb(NULL) {} + }; + + public: + /** + * A Poller object is invoked once each time through the dispatcher's + * inner polling loop. + */ + class Poller { + public: + explicit Poller(EventCenter* center, const string& pollerName); + virtual ~Poller(); + + /** + * This method is defined by a subclass and invoked once by the + * center during each pass through its inner polling loop. + * + * \return + * 1 means that this poller did useful work during this call. + * 0 means that the poller found no work to do. + */ + virtual int poll() = 0; + + private: + /// The EventCenter object that owns this Poller. NULL means the + /// EventCenter has been deleted. + EventCenter* owner; + + /// Human-readable string name given to the poller to make it + /// easy to identify for debugging. 
For most pollers just passing + /// in the subclass name probably makes sense. + string poller_name; + + /// Index of this Poller in EventCenter::pollers. Allows deletion + /// without having to scan all the entries in pollers. -1 means + /// this poller isn't currently in EventCenter::pollers (happens + /// after EventCenter::reset). + int slot; + }; + + private: + CephContext *cct; + std::string type; + int nevent; + // Used only to external event + pthread_t owner = 0; + std::mutex external_lock; + std::atomic_ulong external_num_events; + deque<EventCallbackRef> external_events; + vector<FileEvent> file_events; + EventDriver *driver; + std::multimap<clock_type::time_point, TimeEvent> time_events; + // Keeps track of all of the pollers currently defined. We don't + // use an intrusive list here because it isn't reentrant: we need + // to add/remove elements while the center is traversing the list. + std::vector<Poller*> pollers; + std::map<uint64_t, std::multimap<clock_type::time_point, TimeEvent>::iterator> event_map; + uint64_t time_event_next_id; + int notify_receive_fd; + int notify_send_fd; + NetHandler net; + EventCallbackRef notify_handler; + unsigned idx; + AssociatedCenters *global_centers = nullptr; + + int process_time_events(); + FileEvent *_get_file_event(int fd) { + ceph_assert(fd < nevent); + return &file_events[fd]; + } + + public: + explicit EventCenter(CephContext *c): + cct(c), nevent(0), + external_num_events(0), + driver(NULL), time_event_next_id(1), + notify_receive_fd(-1), notify_send_fd(-1), net(c), + notify_handler(NULL), idx(0) { } + ~EventCenter(); + ostream& _event_prefix(std::ostream *_dout); + + int init(int nevent, unsigned idx, const std::string &t); + void set_owner(); + pthread_t get_owner() const { return owner; } + unsigned get_id() const { return idx; } + + EventDriver *get_driver() { return driver; } + + // Used by internal thread + int create_file_event(int fd, int mask, EventCallbackRef ctxt); + uint64_t create_time_event(uint64_t milliseconds, EventCallbackRef ctxt); + void delete_file_event(int fd, int mask); + void delete_time_event(uint64_t id); + int process_events(unsigned timeout_microseconds, ceph::timespan *working_dur = nullptr); + void wakeup(); + + // Used by external thread + void dispatch_event_external(EventCallbackRef e); + inline bool in_thread() const { + return pthread_equal(pthread_self(), owner); + } + + private: + template <typename func> + class C_submit_event : public EventCallback { + std::mutex lock; + std::condition_variable cond; + bool done = false; + func f; + bool nonwait; + public: + C_submit_event(func &&_f, bool nw) + : f(std::move(_f)), nonwait(nw) {} + void do_request(uint64_t id) override { + f(); + lock.lock(); + cond.notify_all(); + done = true; + bool del = nonwait; + lock.unlock(); + if (del) + delete this; + } + void wait() { + ceph_assert(!nonwait); + std::unique_lock<std::mutex> l(lock); + while (!done) + cond.wait(l); + } + }; + + public: + template <typename func> + void submit_to(int i, func &&f, bool nowait = false) { + ceph_assert(i < MAX_EVENTCENTER && global_centers); + EventCenter *c = global_centers->centers[i]; + ceph_assert(c); + if (!nowait && c->in_thread()) { + f(); + return ; + } + if (nowait) { + C_submit_event<func> *event = new C_submit_event<func>(std::move(f), true); + c->dispatch_event_external(event); + } else { + C_submit_event<func> event(std::move(f), false); + c->dispatch_event_external(&event); + event.wait(); + } + }; +}; + +#endif diff --git a/src/msg/async/EventEpoll.cc 
b/src/msg/async/EventEpoll.cc new file mode 100644 index 00000000..37b46973 --- /dev/null +++ b/src/msg/async/EventEpoll.cc @@ -0,0 +1,142 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/errno.h" +#include <fcntl.h> +#include "EventEpoll.h" + +#define dout_subsys ceph_subsys_ms + +#undef dout_prefix +#define dout_prefix *_dout << "EpollDriver." + +int EpollDriver::init(EventCenter *c, int nevent) +{ + events = (struct epoll_event*)malloc(sizeof(struct epoll_event)*nevent); + if (!events) { + lderr(cct) << __func__ << " unable to malloc memory. " << dendl; + return -ENOMEM; + } + memset(events, 0, sizeof(struct epoll_event)*nevent); + + epfd = epoll_create(1024); /* 1024 is just an hint for the kernel */ + if (epfd == -1) { + lderr(cct) << __func__ << " unable to do epoll_create: " + << cpp_strerror(errno) << dendl; + return -errno; + } + if (::fcntl(epfd, F_SETFD, FD_CLOEXEC) == -1) { + int e = errno; + ::close(epfd); + lderr(cct) << __func__ << " unable to set cloexec: " + << cpp_strerror(e) << dendl; + + return -e; + } + + size = nevent; + + return 0; +} + +int EpollDriver::add_event(int fd, int cur_mask, int add_mask) +{ + ldout(cct, 20) << __func__ << " add event fd=" << fd << " cur_mask=" << cur_mask + << " add_mask=" << add_mask << " to " << epfd << dendl; + struct epoll_event ee; + /* If the fd was already monitored for some event, we need a MOD + * operation. Otherwise we need an ADD operation. */ + int op; + op = cur_mask == EVENT_NONE ? EPOLL_CTL_ADD: EPOLL_CTL_MOD; + + ee.events = EPOLLET; + add_mask |= cur_mask; /* Merge old events */ + if (add_mask & EVENT_READABLE) + ee.events |= EPOLLIN; + if (add_mask & EVENT_WRITABLE) + ee.events |= EPOLLOUT; + ee.data.u64 = 0; /* avoid valgrind warning */ + ee.data.fd = fd; + if (epoll_ctl(epfd, op, fd, &ee) == -1) { + lderr(cct) << __func__ << " epoll_ctl: add fd=" << fd << " failed. " + << cpp_strerror(errno) << dendl; + return -errno; + } + + return 0; +} + +int EpollDriver::del_event(int fd, int cur_mask, int delmask) +{ + ldout(cct, 20) << __func__ << " del event fd=" << fd << " cur_mask=" << cur_mask + << " delmask=" << delmask << " to " << epfd << dendl; + struct epoll_event ee; + int mask = cur_mask & (~delmask); + int r = 0; + + ee.events = 0; + if (mask & EVENT_READABLE) ee.events |= EPOLLIN; + if (mask & EVENT_WRITABLE) ee.events |= EPOLLOUT; + ee.data.u64 = 0; /* avoid valgrind warning */ + ee.data.fd = fd; + if (mask != EVENT_NONE) { + if ((r = epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ee)) < 0) { + lderr(cct) << __func__ << " epoll_ctl: modify fd=" << fd << " mask=" << mask + << " failed." << cpp_strerror(errno) << dendl; + return -errno; + } + } else { + /* Note, Kernel < 2.6.9 requires a non null event pointer even for + * EPOLL_CTL_DEL. */ + if ((r = epoll_ctl(epfd, EPOLL_CTL_DEL, fd, &ee)) < 0) { + lderr(cct) << __func__ << " epoll_ctl: delete fd=" << fd + << " failed." 
<< cpp_strerror(errno) << dendl; + return -errno; + } + } + return 0; +} + +int EpollDriver::resize_events(int newsize) +{ + return 0; +} + +int EpollDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp) +{ + int retval, numevents = 0; + + retval = epoll_wait(epfd, events, size, + tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1); + if (retval > 0) { + int j; + + numevents = retval; + fired_events.resize(numevents); + for (j = 0; j < numevents; j++) { + int mask = 0; + struct epoll_event *e = events + j; + + if (e->events & EPOLLIN) mask |= EVENT_READABLE; + if (e->events & EPOLLOUT) mask |= EVENT_WRITABLE; + if (e->events & EPOLLERR) mask |= EVENT_READABLE|EVENT_WRITABLE; + if (e->events & EPOLLHUP) mask |= EVENT_READABLE|EVENT_WRITABLE; + fired_events[j].fd = e->data.fd; + fired_events[j].mask = mask; + } + } + return numevents; +} diff --git a/src/msg/async/EventEpoll.h b/src/msg/async/EventEpoll.h new file mode 100644 index 00000000..abc4b8bb --- /dev/null +++ b/src/msg/async/EventEpoll.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MSG_EVENTEPOLL_H +#define CEPH_MSG_EVENTEPOLL_H + +#include <unistd.h> +#include <sys/epoll.h> + +#include "Event.h" + +class EpollDriver : public EventDriver { + int epfd; + struct epoll_event *events; + CephContext *cct; + int size; + + public: + explicit EpollDriver(CephContext *c): epfd(-1), events(NULL), cct(c), size(0) {} + ~EpollDriver() override { + if (epfd != -1) + close(epfd); + + if (events) + free(events); + } + + int init(EventCenter *c, int nevent) override; + int add_event(int fd, int cur_mask, int add_mask) override; + int del_event(int fd, int cur_mask, int del_mask) override; + int resize_events(int newsize) override; + int event_wait(vector<FiredFileEvent> &fired_events, + struct timeval *tp) override; +}; + +#endif diff --git a/src/msg/async/EventKqueue.cc b/src/msg/async/EventKqueue.cc new file mode 100644 index 00000000..d6ba4a3d --- /dev/null +++ b/src/msg/async/EventKqueue.cc @@ -0,0 +1,267 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/errno.h" +#include "EventKqueue.h" + +#define dout_subsys ceph_subsys_ms + +#undef dout_prefix +#define dout_prefix *_dout << "KqueueDriver." 
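// Illustrative sketch (an editorial addition, not part of the upstream
// file): every EventDriver in this directory (epoll, kqueue, select) is
// driven the same way by EventCenter -- register interest with add_event(),
// wait with event_wait(), then dispatch on the fired masks. The
// example_drive() helper name is hypothetical.
static int example_drive(EventDriver *driver, int fd)
{
  // fd has no registered events yet, hence cur_mask == EVENT_NONE
  int r = driver->add_event(fd, EVENT_NONE, EVENT_READABLE);
  if (r < 0)
    return r;
  vector<FiredFileEvent> fired;
  struct timeval tv = {1, 0};              // block for at most one second
  int n = driver->event_wait(fired, &tv);  // the driver fills in "fired"
  for (int i = 0; i < n; i++) {
    if (fired[i].mask & EVENT_READABLE) {
      // fired[i].fd is ready for reading
    }
  }
  return driver->del_event(fd, EVENT_READABLE, EVENT_READABLE);
}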
+
+#define KEVENT_NOWAIT 0
+
+int KqueueDriver::test_kqfd() {
+  struct kevent ke[1];
+  if (kevent(kqfd, ke, 0, NULL, 0, KEVENT_NOWAIT) == -1) {
+    ldout(cct,0) << __func__ << " invalid kqfd = " << kqfd
+                 << cpp_strerror(errno) << dendl;
+    return -errno;
+  }
+  return kqfd;
+}
+
+int KqueueDriver::restore_events() {
+  struct kevent ke[2];
+  int i;
+
+  ldout(cct,30) << __func__ << " on kqfd = " << kqfd << dendl;
+  for (i = 0; i < size; i++) {
+    int num = 0;
+    if (sav_events[i].mask == 0)
+      continue;
+    ldout(cct,30) << __func__ << " restore kqfd = " << kqfd
+                  << " fd = " << i << " mask " << sav_events[i].mask << dendl;
+    if (sav_events[i].mask & EVENT_READABLE)
+      EV_SET(&ke[num++], i, EVFILT_READ, EV_ADD, 0, 0, NULL);
+    if (sav_events[i].mask & EVENT_WRITABLE)
+      EV_SET(&ke[num++], i, EVFILT_WRITE, EV_ADD, 0, 0, NULL);
+    if (num) {
+      if (kevent(kqfd, ke, num, NULL, 0, KEVENT_NOWAIT) == -1) {
+        ldout(cct,0) << __func__ << " unable to add event: "
+                     << cpp_strerror(errno) << dendl;
+        return -errno;
+      }
+    }
+  }
+  return 0;
+}
+
+int KqueueDriver::test_thread_change(const char* funcname) {
+  // check to see if we changed thread, because that invalidates
+  // the kqfd and we need to recreate it
+  int oldkqfd = kqfd;
+
+  if (!pthread_equal(mythread, pthread_self())) {
+    ldout(cct,20) << funcname << " We changed thread from " << mythread
+                  << " to " << pthread_self() << dendl;
+    mythread = pthread_self();
+    kqfd = -1;
+  } else if ((kqfd != -1) && (test_kqfd() < 0)) {
+    // should this ever happen?
+    // It would be strange for the kqfd to become invalid without a thread
+    // change. Might need to change this into a ceph_assert() in the future.
+    ldout(cct,0) << funcname << " Warning: Recreating old kqfd. "
+                 << "This should not happen!!!" << dendl;
+    kqfd = -1;
+  }
+  if (kqfd == -1) {
+    kqfd = kqueue();
+    ldout(cct,30) << funcname << " kqueue: new kqfd = " << kqfd
+                  << " (was: " << oldkqfd << ")"
+                  << dendl;
+    if (kqfd < 0) {
+      lderr(cct) << funcname << " unable to do kqueue: "
+                 << cpp_strerror(errno) << dendl;
+      return -errno;
+    }
+    if (restore_events() < 0) {
+      lderr(cct) << funcname << " unable to restore all events: "
+                 << cpp_strerror(errno) << dendl;
+      return -errno;
+    }
+  }
+  return 0;
+}
+
+int KqueueDriver::init(EventCenter *c, int nevent)
+{
+  // keep track of possible changes of our thread,
+  // because a change of thread kills the kqfd
+  mythread = pthread_self();
+
+  // Reserve the space to accept the kevent return events.
+  res_events = (struct kevent*)malloc(sizeof(struct kevent)*nevent);
+  if (!res_events) {
+    lderr(cct) << __func__ << " unable to malloc memory: "
+               << cpp_strerror(errno) << dendl;
+    return -ENOMEM;
+  }
+  memset(res_events, 0, sizeof(struct kevent)*nevent);
+  size = nevent;
+
+  // Reserve the space to keep the full set of registered events, so they
+  // can be re-registered when we change thread ID.
+  sav_events = (struct SaveEvent*)malloc(sizeof(struct SaveEvent)*nevent);
+  if (!sav_events) {
+    lderr(cct) << __func__ << " unable to malloc memory: "
+               << cpp_strerror(errno) << dendl;
+    return -ENOMEM;
+  }
+  memset(sav_events, 0, sizeof(struct SaveEvent)*nevent);
+  sav_max = nevent;
+
+  // Delay assigning a descriptor until it is really needed.
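+  // Creation is deferred to the first add_event()/del_event()/event_wait()
+  // call: each of these goes through test_thread_change(), which creates
+  // the kqueue descriptor on the thread that actually uses it.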
+ // kqfd = kqueue(); + kqfd = -1; + return 0; +} + +int KqueueDriver::add_event(int fd, int cur_mask, int add_mask) +{ + struct kevent ke[2]; + int num = 0; + + ldout(cct,30) << __func__ << " add event kqfd = " << kqfd << " fd = " << fd + << " cur_mask = " << cur_mask << " add_mask = " << add_mask + << dendl; + + int r = test_thread_change(__func__); + if ( r < 0 ) + return r; + + if (add_mask & EVENT_READABLE) + EV_SET(&ke[num++], fd, EVFILT_READ, EV_ADD|EV_CLEAR, 0, 0, NULL); + if (add_mask & EVENT_WRITABLE) + EV_SET(&ke[num++], fd, EVFILT_WRITE, EV_ADD|EV_CLEAR, 0, 0, NULL); + + if (num) { + if (kevent(kqfd, ke, num, NULL, 0, KEVENT_NOWAIT) == -1) { + lderr(cct) << __func__ << " unable to add event: " + << cpp_strerror(errno) << dendl; + return -errno; + } + } + // keep what we set + if (fd >= sav_max) + resize_events(sav_max+5000); + sav_events[fd].mask = cur_mask | add_mask; + return 0; +} + +int KqueueDriver::del_event(int fd, int cur_mask, int del_mask) +{ + struct kevent ke[2]; + int num = 0; + int mask = cur_mask & del_mask; + + ldout(cct,30) << __func__ << " delete event kqfd = " << kqfd + << " fd = " << fd << " cur_mask = " << cur_mask + << " del_mask = " << del_mask << dendl; + + int r = test_thread_change(__func__); + if ( r < 0 ) + return r; + + if (mask & EVENT_READABLE) + EV_SET(&ke[num++], fd, EVFILT_READ, EV_DELETE, 0, 0, NULL); + if (mask & EVENT_WRITABLE) + EV_SET(&ke[num++], fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL); + + if (num) { + int r = 0; + if ((r = kevent(kqfd, ke, num, NULL, 0, KEVENT_NOWAIT)) < 0) { + lderr(cct) << __func__ << " kevent: delete fd=" << fd << " mask=" << mask + << " failed." << cpp_strerror(errno) << dendl; + return -errno; + } + } + // keep the administration + sav_events[fd].mask = cur_mask & ~del_mask; + return 0; +} + +int KqueueDriver::resize_events(int newsize) +{ + ldout(cct,30) << __func__ << " kqfd = " << kqfd << "newsize = " << newsize + << dendl; + if (newsize > sav_max) { + sav_events = (struct SaveEvent*)realloc(sav_events, sizeof(struct SaveEvent)*newsize); + if (!sav_events) { + lderr(cct) << __func__ << " unable to realloc memory: " + << cpp_strerror(errno) << dendl; + ceph_assert(sav_events); + return -ENOMEM; + } + memset(&sav_events[size], 0, sizeof(struct SaveEvent)*(newsize-sav_max)); + sav_max = newsize; + } + return 0; +} + +int KqueueDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp) +{ + int retval, numevents = 0; + struct timespec timeout; + + ldout(cct,10) << __func__ << " kqfd = " << kqfd << dendl; + + int r = test_thread_change(__func__); + if ( r < 0 ) + return r; + + if (tvp != NULL) { + timeout.tv_sec = tvp->tv_sec; + timeout.tv_nsec = tvp->tv_usec * 1000; + ldout(cct,20) << __func__ << " " + << timeout.tv_sec << " sec " + << timeout.tv_nsec << " nsec" + << dendl; + retval = kevent(kqfd, NULL, 0, res_events, size, &timeout); + } else { + ldout(cct,30) << __func__ << " event_wait: " << " NULL" << dendl; + retval = kevent(kqfd, NULL, 0, res_events, size, KEVENT_NOWAIT); + } + + ldout(cct,25) << __func__ << " kevent retval: " << retval << dendl; + if (retval < 0) { + lderr(cct) << __func__ << " kqueue error: " + << cpp_strerror(errno) << dendl; + return -errno; + } else if (retval == 0) { + ldout(cct,5) << __func__ << " Hit timeout(" + << timeout.tv_sec << " sec " + << timeout.tv_nsec << " nsec" + << ")." 
<< dendl; + } else { + int j; + + numevents = retval; + fired_events.resize(numevents); + for (j = 0; j < numevents; j++) { + int mask = 0; + struct kevent *e = res_events + j; + + if (e->filter == EVFILT_READ) mask |= EVENT_READABLE; + if (e->filter == EVFILT_WRITE) mask |= EVENT_WRITABLE; + if (e->flags & EV_ERROR) mask |= EVENT_READABLE|EVENT_WRITABLE; + fired_events[j].fd = (int)e->ident; + fired_events[j].mask = mask; + + } + } + return numevents; +} diff --git a/src/msg/async/EventKqueue.h b/src/msg/async/EventKqueue.h new file mode 100644 index 00000000..24863a93 --- /dev/null +++ b/src/msg/async/EventKqueue.h @@ -0,0 +1,67 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MSG_EVENTKQUEUE_H +#define CEPH_MSG_EVENTKQUEUE_H + +#include <sys/types.h> +#include <sys/event.h> +#include <unistd.h> + +#include "Event.h" + +class KqueueDriver : public EventDriver { + int kqfd; + pthread_t mythread; + struct kevent *res_events; + CephContext *cct; + int size; + + // Keep what we set on the kqfd + struct SaveEvent{ + int fd; + int mask; + }; + struct SaveEvent *sav_events; + int sav_max; + int restore_events(); + int test_kqfd(); + int test_thread_change(const char* funcname); + + public: + explicit KqueueDriver(CephContext *c): kqfd(-1), res_events(NULL), cct(c), + size(0), sav_max(0) {} + virtual ~KqueueDriver() { + if (kqfd != -1) + close(kqfd); + + if (res_events) + free(res_events); + size = 0; + if (sav_events) + free(sav_events); + sav_max = 0; + } + + int init(EventCenter *c, int nevent) override; + int add_event(int fd, int cur_mask, int add_mask) override; + int del_event(int fd, int cur_mask, int del_mask) override; + int resize_events(int newsize) override; + int event_wait(vector<FiredFileEvent> &fired_events, + struct timeval *tp) override; +}; + +#endif diff --git a/src/msg/async/EventSelect.cc b/src/msg/async/EventSelect.cc new file mode 100644 index 00000000..fdee6ebc --- /dev/null +++ b/src/msg/async/EventSelect.cc @@ -0,0 +1,95 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/errno.h" +#include "EventSelect.h" + +#include <unistd.h> +#include <sys/select.h> +#define dout_subsys ceph_subsys_ms + +#undef dout_prefix +#define dout_prefix *_dout << "SelectDriver." 
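// Illustrative sketch (an editorial addition, not part of the upstream
// file): why event_wait() below copies rfds/wfds before calling select(2).
// select() modifies its fd_set arguments in place, leaving only the ready
// descriptors set, so passing the registration sets in directly would
// destroy them. The example_select_once() helper name is hypothetical.
static int example_select_once(int sock_fd, fd_set *registered)
{
  fd_set scratch;
  memcpy(&scratch, registered, sizeof(fd_set)); // select() clobbers its args
  // "registered" survives; "scratch" holds only the ready fds afterwards
  return select(sock_fd + 1, &scratch, NULL, NULL, NULL);
}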
+
+int SelectDriver::init(EventCenter *c, int nevent)
+{
+  ldout(cct, 0) << "select(2) isn't suitable for production environments; "
+                << "it exists only to avoid compile errors on platforms "
+                << "without epoll/kqueue, and for special purposes" << dendl;
+  FD_ZERO(&rfds);
+  FD_ZERO(&wfds);
+  max_fd = 0;
+  return 0;
+}
+
+int SelectDriver::add_event(int fd, int cur_mask, int add_mask)
+{
+  ldout(cct, 10) << __func__ << " add event to fd=" << fd << " mask=" << add_mask
+                 << dendl;
+
+  int mask = cur_mask | add_mask;
+  if (mask & EVENT_READABLE)
+    FD_SET(fd, &rfds);
+  if (mask & EVENT_WRITABLE)
+    FD_SET(fd, &wfds);
+  if (fd > max_fd)
+    max_fd = fd;
+
+  return 0;
+}
+
+int SelectDriver::del_event(int fd, int cur_mask, int delmask)
+{
+  ldout(cct, 10) << __func__ << " del event fd=" << fd << " cur mask=" << cur_mask
+                 << dendl;
+
+  if (delmask & EVENT_READABLE)
+    FD_CLR(fd, &rfds);
+  if (delmask & EVENT_WRITABLE)
+    FD_CLR(fd, &wfds);
+  return 0;
+}
+
+int SelectDriver::resize_events(int newsize)
+{
+  return 0;
+}
+
+int SelectDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp)
+{
+  int retval, numevents = 0;
+
+  memcpy(&_rfds, &rfds, sizeof(fd_set));
+  memcpy(&_wfds, &wfds, sizeof(fd_set));
+
+  retval = select(max_fd+1, &_rfds, &_wfds, NULL, tvp);
+  if (retval > 0) {
+    for (int j = 0; j <= max_fd; j++) {
+      int mask = 0;
+      struct FiredFileEvent fe;
+      if (FD_ISSET(j, &_rfds))
+        mask |= EVENT_READABLE;
+      if (FD_ISSET(j, &_wfds))
+        mask |= EVENT_WRITABLE;
+      if (mask) {
+        fe.fd = j;
+        fe.mask = mask;
+        fired_events.push_back(fe);
+        numevents++;
+      }
+    }
+  }
+  return numevents;
+}
diff --git a/src/msg/async/EventSelect.h b/src/msg/async/EventSelect.h
new file mode 100644
index 00000000..1b75da0b
--- /dev/null
+++ b/src/msg/async/EventSelect.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_EVENTSELECT_H
+#define CEPH_MSG_EVENTSELECT_H
+
+#include "Event.h"
+
+class SelectDriver : public EventDriver {
+  fd_set rfds, wfds;
+  /* We need to keep a copy of the fd sets as it's not safe to reuse
+   * FD sets after select().
*/ + fd_set _rfds, _wfds; + int max_fd; + CephContext *cct; + + public: + explicit SelectDriver(CephContext *c): max_fd(0), cct(c) {} + ~SelectDriver() override {} + + int init(EventCenter *c, int nevent) override; + int add_event(int fd, int cur_mask, int add_mask) override; + int del_event(int fd, int cur_mask, int del_mask) override; + int resize_events(int newsize) override; + int event_wait(vector<FiredFileEvent> &fired_events, + struct timeval *tp) override; +}; + +#endif diff --git a/src/msg/async/PosixStack.cc b/src/msg/async/PosixStack.cc new file mode 100644 index 00000000..e9c8d404 --- /dev/null +++ b/src/msg/async/PosixStack.cc @@ -0,0 +1,293 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 XSKY <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <sys/socket.h> +#include <netinet/tcp.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <errno.h> + +#include <algorithm> + +#include "PosixStack.h" + +#include "include/buffer.h" +#include "include/str_list.h" +#include "common/errno.h" +#include "common/strtol.h" +#include "common/dout.h" +#include "msg/Messenger.h" +#include "include/compat.h" +#include "include/sock_compat.h" + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix *_dout << "PosixStack " + +class PosixConnectedSocketImpl final : public ConnectedSocketImpl { + NetHandler &handler; + int _fd; + entity_addr_t sa; + bool connected; + + public: + explicit PosixConnectedSocketImpl(NetHandler &h, const entity_addr_t &sa, int f, bool connected) + : handler(h), _fd(f), sa(sa), connected(connected) {} + + int is_connected() override { + if (connected) + return 1; + + int r = handler.reconnect(sa, _fd); + if (r == 0) { + connected = true; + return 1; + } else if (r < 0) { + return r; + } else { + return 0; + } + } + + ssize_t zero_copy_read(bufferptr&) override { + return -EOPNOTSUPP; + } + + ssize_t read(char *buf, size_t len) override { + ssize_t r = ::read(_fd, buf, len); + if (r < 0) + r = -errno; + return r; + } + + // return the sent length + // < 0 means error occurred + static ssize_t do_sendmsg(int fd, struct msghdr &msg, unsigned len, bool more) + { + size_t sent = 0; + while (1) { + MSGR_SIGPIPE_STOPPER; + ssize_t r; + r = ::sendmsg(fd, &msg, MSG_NOSIGNAL | (more ? MSG_MORE : 0)); + if (r < 0) { + if (errno == EINTR) { + continue; + } else if (errno == EAGAIN) { + break; + } + return -errno; + } + + sent += r; + if (len == sent) break; + + while (r > 0) { + if (msg.msg_iov[0].iov_len <= (size_t)r) { + // drain this whole item + r -= msg.msg_iov[0].iov_len; + msg.msg_iov++; + msg.msg_iovlen--; + } else { + msg.msg_iov[0].iov_base = (char *)msg.msg_iov[0].iov_base + r; + msg.msg_iov[0].iov_len -= r; + break; + } + } + } + return (ssize_t)sent; + } + + ssize_t send(bufferlist &bl, bool more) override { + size_t sent_bytes = 0; + auto pb = std::cbegin(bl.buffers()); + uint64_t left_pbrs = std::size(bl.buffers()); + while (left_pbrs) { + struct msghdr msg; + struct iovec msgvec[IOV_MAX]; + uint64_t size = std::min<uint64_t>(left_pbrs, IOV_MAX); + left_pbrs -= size; + // FIPS zeroization audit 20191115: this memset is not security related. 
+      memset(&msg, 0, sizeof(msg));
+      msg.msg_iovlen = size;
+      msg.msg_iov = msgvec;
+      unsigned msglen = 0;
+      for (auto iov = msgvec; iov != msgvec + size; iov++) {
+        iov->iov_base = (void*)(pb->c_str());
+        iov->iov_len = pb->length();
+        msglen += pb->length();
+        ++pb;
+      }
+      ssize_t r = do_sendmsg(_fd, msg, msglen, left_pbrs || more);
+      if (r < 0)
+        return r;
+
+      // "r" is the number of bytes sent in this batch
+      sent_bytes += r;
+      if (static_cast<unsigned>(r) < msglen)
+        break;
+      // continue only if the whole batch was sent
+    }
+
+    if (sent_bytes) {
+      bufferlist swapped;
+      if (sent_bytes < bl.length()) {
+        bl.splice(sent_bytes, bl.length()-sent_bytes, &swapped);
+        bl.swap(swapped);
+      } else {
+        bl.clear();
+      }
+    }
+
+    return static_cast<ssize_t>(sent_bytes);
+  }
+  void shutdown() override {
+    ::shutdown(_fd, SHUT_RDWR);
+  }
+  void close() override {
+    ::close(_fd);
+  }
+  int fd() const override {
+    return _fd;
+  }
+  int socket_fd() const override {
+    return _fd;
+  }
+  friend class PosixServerSocketImpl;
+  friend class PosixNetworkStack;
+};
+
+class PosixServerSocketImpl : public ServerSocketImpl {
+  NetHandler &handler;
+  int _fd;
+
+ public:
+  explicit PosixServerSocketImpl(NetHandler &h, int f,
+                                 const entity_addr_t& listen_addr, unsigned slot)
+    : ServerSocketImpl(listen_addr.get_type(), slot),
+      handler(h), _fd(f) {}
+  int accept(ConnectedSocket *sock, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+  void abort_accept() override {
+    ::close(_fd);
+  }
+  int fd() const override {
+    return _fd;
+  }
+};
+
+int PosixServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w) {
+  ceph_assert(sock);
+  sockaddr_storage ss;
+  socklen_t slen = sizeof(ss);
+  int sd = accept_cloexec(_fd, (sockaddr*)&ss, &slen);
+  if (sd < 0) {
+    return -errno;
+  }
+
+  int r = handler.set_nonblock(sd);
+  if (r < 0) {
+    ::close(sd);
+    return -errno;
+  }
+
+  r = handler.set_socket_options(sd, opt.nodelay, opt.rcbuf_size);
+  if (r < 0) {
+    ::close(sd);
+    return -errno;
+  }
+
+  ceph_assert(NULL != out); // out should not be NULL when accepting a connection
+
+  out->set_type(addr_type);
+  out->set_sockaddr((sockaddr*)&ss);
+  handler.set_priority(sd, opt.priority, out->get_family());
+
+  std::unique_ptr<PosixConnectedSocketImpl> csi(new PosixConnectedSocketImpl(handler, *out, sd, true));
+  *sock = ConnectedSocket(std::move(csi));
+  return 0;
+}
+
+void PosixWorker::initialize()
+{
+}
+
+int PosixWorker::listen(entity_addr_t &sa,
+                        unsigned addr_slot,
+                        const SocketOptions &opt,
+                        ServerSocket *sock)
+{
+  int listen_sd = net.create_socket(sa.get_family(), true);
+  if (listen_sd < 0) {
+    return -errno;
+  }
+
+  int r = net.set_nonblock(listen_sd);
+  if (r < 0) {
+    ::close(listen_sd);
+    return -errno;
+  }
+
+  r = net.set_socket_options(listen_sd, opt.nodelay, opt.rcbuf_size);
+  if (r < 0) {
+    ::close(listen_sd);
+    return -errno;
+  }
+
+  r = ::bind(listen_sd, sa.get_sockaddr(), sa.get_sockaddr_len());
+  if (r < 0) {
+    r = -errno;
+    ldout(cct, 10) << __func__ << " unable to bind to " << sa.get_sockaddr()
+                   << ": " << cpp_strerror(r) << dendl;
+    ::close(listen_sd);
+    return r;
+  }
+
+  r = ::listen(listen_sd, cct->_conf->ms_tcp_listen_backlog);
+  if (r < 0) {
+    r = -errno;
+    lderr(cct) << __func__ << " unable to listen on " << sa << ": " << cpp_strerror(r) << dendl;
+    ::close(listen_sd);
+    return r;
+  }
+
+  *sock = ServerSocket(
+    std::unique_ptr<PosixServerSocketImpl>(
+      new PosixServerSocketImpl(net, listen_sd, sa, addr_slot)));
+  return 0;
+}
+
+int PosixWorker::connect(const entity_addr_t &addr, const
SocketOptions &opts, ConnectedSocket *socket) { + int sd; + + if (opts.nonblock) { + sd = net.nonblock_connect(addr, opts.connect_bind_addr); + } else { + sd = net.connect(addr, opts.connect_bind_addr); + } + + if (sd < 0) { + return -errno; + } + + net.set_priority(sd, opts.priority, addr.get_family()); + *socket = ConnectedSocket( + std::unique_ptr<PosixConnectedSocketImpl>(new PosixConnectedSocketImpl(net, addr, sd, !opts.nonblock))); + return 0; +} + +PosixNetworkStack::PosixNetworkStack(CephContext *c, const string &t) + : NetworkStack(c, t) +{ +} diff --git a/src/msg/async/PosixStack.h b/src/msg/async/PosixStack.h new file mode 100644 index 00000000..f1aaccd4 --- /dev/null +++ b/src/msg/async/PosixStack.h @@ -0,0 +1,56 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 XSKY <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MSG_ASYNC_POSIXSTACK_H +#define CEPH_MSG_ASYNC_POSIXSTACK_H + +#include <thread> + +#include "msg/msg_types.h" +#include "msg/async/net_handler.h" + +#include "Stack.h" + +class PosixWorker : public Worker { + NetHandler net; + void initialize() override; + public: + PosixWorker(CephContext *c, unsigned i) + : Worker(c, i), net(c) {} + int listen(entity_addr_t &sa, + unsigned addr_slot, + const SocketOptions &opt, + ServerSocket *socks) override; + int connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) override; +}; + +class PosixNetworkStack : public NetworkStack { + vector<std::thread> threads; + + public: + explicit PosixNetworkStack(CephContext *c, const string &t); + + void spawn_worker(unsigned i, std::function<void ()> &&func) override { + threads.resize(i+1); + threads[i] = std::thread(func); + } + void join_worker(unsigned i) override { + ceph_assert(threads.size() > i && threads[i].joinable()); + threads[i].join(); + } +}; + +#endif //CEPH_MSG_ASYNC_POSIXSTACK_H diff --git a/src/msg/async/Protocol.cc b/src/msg/async/Protocol.cc new file mode 100644 index 00000000..4bdc065e --- /dev/null +++ b/src/msg/async/Protocol.cc @@ -0,0 +1,14 @@ +#include "Protocol.h" + +#include "AsyncConnection.h" +#include "AsyncMessenger.h" + +Protocol::Protocol(int type, AsyncConnection *connection) + : proto_type(type), + connection(connection), + messenger(connection->async_msgr), + cct(connection->async_msgr->cct) { + auth_meta.reset(new AuthConnectionMeta()); +} + +Protocol::~Protocol() {} diff --git a/src/msg/async/Protocol.h b/src/msg/async/Protocol.h new file mode 100644 index 00000000..cccba183 --- /dev/null +++ b/src/msg/async/Protocol.h @@ -0,0 +1,140 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef _MSG_ASYNC_PROTOCOL_ +#define _MSG_ASYNC_PROTOCOL_ + +#include <list> +#include <map> + +#include "AsyncConnection.h" +#include "include/buffer.h" +#include "include/msgr.h" + +/* + * Continuation Helper Classes + */ + +#include <memory> +#include <tuple> + +template <class C> +class Ct { +public: + virtual ~Ct() {} + virtual Ct<C> *call(C *foo) const = 0; +}; + +template <class C, typename... 
Args> +class CtFun : public Ct<C> { +private: + using fn_t = Ct<C> *(C::*)(Args...); + fn_t _f; + std::tuple<Args...> _params; + + template <std::size_t... Is> + inline Ct<C> *_call(C *foo, std::index_sequence<Is...>) const { + return (foo->*_f)(std::get<Is>(_params)...); + } + +public: + CtFun(fn_t f) : _f(f) {} + + inline void setParams(Args... args) { _params = std::make_tuple(args...); } + inline Ct<C> *call(C *foo) const override { + return _call(foo, std::index_sequence_for<Args...>()); + } +}; + +using rx_buffer_t = + std::unique_ptr<buffer::ptr_node, buffer::ptr_node::disposer>; + +template <class C> +class CtRxNode : public Ct<C> { + using fn_t = Ct<C> *(C::*)(rx_buffer_t&&, int r); + fn_t _f; + +public: + mutable rx_buffer_t node; + int r; + + CtRxNode(fn_t f) : _f(f) {} + void setParams(rx_buffer_t &&node, int r) { + this->node = std::move(node); + this->r = r; + } + inline Ct<C> *call(C *foo) const override { + return (foo->*_f)(std::move(node), r); + } +}; + +template <class C> using CONTINUATION_TYPE = CtFun<C>; +template <class C> using CONTINUATION_TX_TYPE = CtFun<C, int>; +template <class C> using CONTINUATION_RX_TYPE = CtFun<C, char*, int>; +template <class C> using CONTINUATION_RXBPTR_TYPE = CtRxNode<C>; + +#define CONTINUATION_DECL(C, F, ...) \ + CtFun<C, ##__VA_ARGS__> F##_cont { (&C::F) }; + +#define CONTINUATION(F) F##_cont +#define CONTINUE(F, ...) (F##_cont.setParams(__VA_ARGS__), &F##_cont) + +#define CONTINUATION_RUN(CT) \ + { \ + Ct<std::remove_reference<decltype(*this)>::type> *_cont = &CT;\ + do { \ + _cont = _cont->call(this); \ + } while (_cont); \ + } + +#define READ_HANDLER_CONTINUATION_DECL(C, F) \ + CONTINUATION_DECL(C, F, char *, int) + +#define READ_BPTR_HANDLER_CONTINUATION_DECL(C, F) \ + CtRxNode<C> F##_cont { (&C::F) }; + +#define WRITE_HANDLER_CONTINUATION_DECL(C, F) CONTINUATION_DECL(C, F, int) + +////////////////////////////////////////////////////////////////////// + +class AsyncMessenger; + +class Protocol { +public: + const int proto_type; +protected: + AsyncConnection *connection; + AsyncMessenger *messenger; + CephContext *cct; +public: + std::shared_ptr<AuthConnectionMeta> auth_meta; + +public: + Protocol(int type, AsyncConnection *connection); + virtual ~Protocol(); + + // prepare protocol for connecting to peer + virtual void connect() = 0; + // prepare protocol for accepting peer connections + virtual void accept() = 0; + // true -> protocol is ready for sending messages + virtual bool is_connected() = 0; + // stop connection + virtual void stop() = 0; + // signal and handle connection failure + virtual void fault() = 0; + // send message + virtual void send_message(Message *m) = 0; + // send keepalive + virtual void send_keepalive() = 0; + + virtual void read_event() = 0; + virtual void write_event() = 0; + virtual bool is_queued() = 0; + + int get_con_mode() const { + return auth_meta->con_mode; + } +}; + +#endif /* _MSG_ASYNC_PROTOCOL_ */ diff --git a/src/msg/async/ProtocolV1.cc b/src/msg/async/ProtocolV1.cc new file mode 100644 index 00000000..9a7ab9d4 --- /dev/null +++ b/src/msg/async/ProtocolV1.cc @@ -0,0 +1,2547 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ProtocolV1.h" + +#include "common/errno.h" + +#include "AsyncConnection.h" +#include "AsyncMessenger.h" +#include "common/EventTrace.h" +#include "include/random.h" + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix _conn_prefix(_dout) +ostream 
&ProtocolV1::_conn_prefix(std::ostream *_dout) { + return *_dout << "--1- " << messenger->get_myaddrs() << " >> " + << *connection->peer_addrs + << " conn(" + << connection << " " << this + << " :" << connection->port << " s=" << get_state_name(state) + << " pgs=" << peer_global_seq << " cs=" << connect_seq + << " l=" << connection->policy.lossy << ")."; +} + +#define WRITE(B, C) write(CONTINUATION(C), B) + +#define READ(L, C) read(CONTINUATION(C), L) + +#define READB(L, B, C) read(CONTINUATION(C), L, B) + +// Constant to limit starting sequence number to 2^31. Nothing special about +// it, just a big number. PLR +#define SEQ_MASK 0x7fffffff + +const int ASYNC_COALESCE_THRESHOLD = 256; + +using namespace std; + +static void alloc_aligned_buffer(bufferlist &data, unsigned len, unsigned off) { + // create a buffer to read into that matches the data alignment + unsigned alloc_len = 0; + unsigned left = len; + unsigned head = 0; + if (off & ~CEPH_PAGE_MASK) { + // head + alloc_len += CEPH_PAGE_SIZE; + head = std::min<uint64_t>(CEPH_PAGE_SIZE - (off & ~CEPH_PAGE_MASK), left); + left -= head; + } + alloc_len += left; + bufferptr ptr(buffer::create_small_page_aligned(alloc_len)); + if (head) ptr.set_offset(CEPH_PAGE_SIZE - head); + data.push_back(std::move(ptr)); +} + +/** + * Protocol V1 + **/ + +ProtocolV1::ProtocolV1(AsyncConnection *connection) + : Protocol(1, connection), + temp_buffer(nullptr), + can_write(WriteStatus::NOWRITE), + keepalive(false), + connect_seq(0), + peer_global_seq(0), + msg_left(0), + cur_msg_size(0), + replacing(false), + is_reset_from_peer(false), + once_ready(false), + state(NONE), + global_seq(0), + authorizer(nullptr), + wait_for_seq(false) { + temp_buffer = new char[4096]; +} + +ProtocolV1::~ProtocolV1() { + ceph_assert(out_q.empty()); + ceph_assert(sent.empty()); + + delete[] temp_buffer; + + if (authorizer) { + delete authorizer; + } +} + +void ProtocolV1::connect() { + this->state = START_CONNECT; + + // reset connect state variables + if (authorizer) { + delete authorizer; + authorizer = nullptr; + } + authorizer_buf.clear(); + // FIPS zeroization audit 20191115: these memsets are not security related. 
+ memset(&connect_msg, 0, sizeof(connect_msg)); + memset(&connect_reply, 0, sizeof(connect_reply)); + + global_seq = messenger->get_global_seq(); +} + +void ProtocolV1::accept() { this->state = START_ACCEPT; } + +bool ProtocolV1::is_connected() { + return can_write.load() == WriteStatus::CANWRITE; +} + +void ProtocolV1::stop() { + ldout(cct, 20) << __func__ << dendl; + if (state == CLOSED) { + return; + } + + if (connection->delay_state) connection->delay_state->flush(); + + ldout(cct, 2) << __func__ << dendl; + std::lock_guard<std::mutex> l(connection->write_lock); + + reset_recv_state(); + discard_out_queue(); + + connection->_stop(); + + can_write = WriteStatus::CLOSED; + state = CLOSED; +} + +void ProtocolV1::fault() { + ldout(cct, 20) << __func__ << dendl; + + if (state == CLOSED || state == NONE) { + ldout(cct, 10) << __func__ << " connection is already closed" << dendl; + return; + } + + if (connection->policy.lossy && state != START_CONNECT && + state != CONNECTING) { + ldout(cct, 1) << __func__ << " on lossy channel, failing" << dendl; + stop(); + connection->dispatch_queue->queue_reset(connection); + return; + } + + connection->write_lock.lock(); + can_write = WriteStatus::NOWRITE; + is_reset_from_peer = false; + + // requeue sent items + requeue_sent(); + + if (!once_ready && out_q.empty() && state >= START_ACCEPT && + state <= ACCEPTING_WAIT_CONNECT_MSG_AUTH && !replacing) { + ldout(cct, 10) << __func__ << " with nothing to send and in the half " + << " accept state just closed" << dendl; + connection->write_lock.unlock(); + stop(); + connection->dispatch_queue->queue_reset(connection); + return; + } + replacing = false; + + connection->fault(); + + reset_recv_state(); + + if (connection->policy.standby && out_q.empty() && !keepalive && + state != WAIT) { + ldout(cct, 10) << __func__ << " with nothing to send, going to standby" + << dendl; + state = STANDBY; + connection->write_lock.unlock(); + return; + } + + connection->write_lock.unlock(); + + if ((state >= START_CONNECT && state <= CONNECTING_SEND_CONNECT_MSG) || + state == WAIT) { + // backoff! 
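+  // Reconnect backoff (summarizing the branch below): the first retry
+  // waits ms_initial_backoff, each later failure doubles the delay, and
+  // the delay is clamped at ms_max_backoff; a WAIT from the peer jumps
+  // straight to the maximum. With the usual defaults (0.2s initial,
+  // 15s max) that gives roughly 0.2, 0.4, 0.8, ... capped at 15s.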
+ if (state == WAIT) { + backoff.set_from_double(cct->_conf->ms_max_backoff); + } else if (backoff == utime_t()) { + backoff.set_from_double(cct->_conf->ms_initial_backoff); + } else { + backoff += backoff; + if (backoff > cct->_conf->ms_max_backoff) + backoff.set_from_double(cct->_conf->ms_max_backoff); + } + + global_seq = messenger->get_global_seq(); + state = START_CONNECT; + connection->state = AsyncConnection::STATE_CONNECTING; + ldout(cct, 10) << __func__ << " waiting " << backoff << dendl; + // woke up again; + connection->register_time_events.insert( + connection->center->create_time_event(backoff.to_nsec() / 1000, + connection->wakeup_handler)); + } else { + // policy maybe empty when state is in accept + if (connection->policy.server) { + ldout(cct, 0) << __func__ << " server, going to standby" << dendl; + state = STANDBY; + } else { + ldout(cct, 0) << __func__ << " initiating reconnect" << dendl; + connect_seq++; + global_seq = messenger->get_global_seq(); + state = START_CONNECT; + connection->state = AsyncConnection::STATE_CONNECTING; + } + backoff = utime_t(); + connection->center->dispatch_event_external(connection->read_handler); + } +} + +void ProtocolV1::send_message(Message *m) { + bufferlist bl; + uint64_t f = connection->get_features(); + + // TODO: Currently not all messages supports reencode like MOSDMap, so here + // only let fast dispatch support messages prepare message + bool can_fast_prepare = messenger->ms_can_fast_dispatch(m); + if (can_fast_prepare) { + prepare_send_message(f, m, bl); + } + + std::lock_guard<std::mutex> l(connection->write_lock); + // "features" changes will change the payload encoding + if (can_fast_prepare && + (can_write == WriteStatus::NOWRITE || connection->get_features() != f)) { + // ensure the correctness of message encoding + bl.clear(); + m->clear_payload(); + ldout(cct, 5) << __func__ << " clear encoded buffer previous " << f + << " != " << connection->get_features() << dendl; + } + if (can_write == WriteStatus::CLOSED) { + ldout(cct, 10) << __func__ << " connection closed." 
+ << " Drop message " << m << dendl; + m->put(); + } else { + m->trace.event("async enqueueing message"); + out_q[m->get_priority()].emplace_back(std::move(bl), m); + ldout(cct, 15) << __func__ << " inline write is denied, reschedule m=" << m + << dendl; + if (can_write != WriteStatus::REPLACING && !write_in_progress) { + write_in_progress = true; + connection->center->dispatch_event_external(connection->write_handler); + } + } +} + +void ProtocolV1::prepare_send_message(uint64_t features, Message *m, + bufferlist &bl) { + ldout(cct, 20) << __func__ << " m " << *m << dendl; + + // associate message with Connection (for benefit of encode_payload) + if (m->empty_payload()) { + ldout(cct, 20) << __func__ << " encoding features " << features << " " << m + << " " << *m << dendl; + } else { + ldout(cct, 20) << __func__ << " half-reencoding features " << features + << " " << m << " " << *m << dendl; + } + + // encode and copy out of *m + m->encode(features, messenger->crcflags); + + bl.append(m->get_payload()); + bl.append(m->get_middle()); + bl.append(m->get_data()); +} + +void ProtocolV1::send_keepalive() { + ldout(cct, 10) << __func__ << dendl; + std::lock_guard<std::mutex> l(connection->write_lock); + if (can_write != WriteStatus::CLOSED) { + keepalive = true; + connection->center->dispatch_event_external(connection->write_handler); + } +} + +void ProtocolV1::read_event() { + ldout(cct, 20) << __func__ << dendl; + switch (state) { + case START_CONNECT: + CONTINUATION_RUN(CONTINUATION(send_client_banner)); + break; + case START_ACCEPT: + CONTINUATION_RUN(CONTINUATION(send_server_banner)); + break; + case OPENED: + CONTINUATION_RUN(CONTINUATION(wait_message)); + break; + case THROTTLE_MESSAGE: + CONTINUATION_RUN(CONTINUATION(throttle_message)); + break; + case THROTTLE_BYTES: + CONTINUATION_RUN(CONTINUATION(throttle_bytes)); + break; + case THROTTLE_DISPATCH_QUEUE: + CONTINUATION_RUN(CONTINUATION(throttle_dispatch_queue)); + break; + default: + break; + } +} + +void ProtocolV1::write_event() { + ldout(cct, 10) << __func__ << dendl; + ssize_t r = 0; + + connection->write_lock.lock(); + if (can_write == WriteStatus::CANWRITE) { + if (keepalive) { + append_keepalive_or_ack(); + keepalive = false; + } + + auto start = ceph::mono_clock::now(); + bool more; + do { + bufferlist data; + Message *m = _get_next_outgoing(&data); + if (!m) { + break; + } + + if (!connection->policy.lossy) { + // put on sent list + sent.push_back(m); + m->get(); + } + more = !out_q.empty(); + connection->write_lock.unlock(); + + // send_message or requeue messages may not encode message + if (!data.length()) { + prepare_send_message(connection->get_features(), m, data); + } + + r = write_message(m, data, more); + + connection->write_lock.lock(); + if (r == 0) { + ; + } else if (r < 0) { + ldout(cct, 1) << __func__ << " send msg failed" << dendl; + break; + } else if (r > 0) + break; + } while (can_write == WriteStatus::CANWRITE); + write_in_progress = false; + connection->write_lock.unlock(); + + // if r > 0 mean data still lefted, so no need _try_send. 
+ if (r == 0) { + uint64_t left = ack_left; + if (left) { + ceph_le64 s; + s = in_seq; + connection->outgoing_bl.append(CEPH_MSGR_TAG_ACK); + connection->outgoing_bl.append((char *)&s, sizeof(s)); + ldout(cct, 10) << __func__ << " try send msg ack, acked " << left + << " messages" << dendl; + ack_left -= left; + left = ack_left; + r = connection->_try_send(left); + } else if (is_queued()) { + r = connection->_try_send(); + } + } + + connection->logger->tinc(l_msgr_running_send_time, + ceph::mono_clock::now() - start); + if (r < 0) { + ldout(cct, 1) << __func__ << " send msg failed" << dendl; + connection->lock.lock(); + fault(); + connection->lock.unlock(); + return; + } + } else { + write_in_progress = false; + connection->write_lock.unlock(); + connection->lock.lock(); + connection->write_lock.lock(); + if (state == STANDBY && !connection->policy.server && is_queued()) { + ldout(cct, 10) << __func__ << " policy.server is false" << dendl; + connection->_connect(); + } else if (connection->cs && state != NONE && state != CLOSED && + state != START_CONNECT) { + r = connection->_try_send(); + if (r < 0) { + ldout(cct, 1) << __func__ << " send outcoming bl failed" << dendl; + connection->write_lock.unlock(); + fault(); + connection->lock.unlock(); + return; + } + } + connection->write_lock.unlock(); + connection->lock.unlock(); + } +} + +bool ProtocolV1::is_queued() { + return !out_q.empty() || connection->is_queued(); +} + +void ProtocolV1::run_continuation(CtPtr pcontinuation) { + if (pcontinuation) { + CONTINUATION_RUN(*pcontinuation); + } +} + +CtPtr ProtocolV1::read(CONTINUATION_RX_TYPE<ProtocolV1> &next, + int len, char *buffer) { + if (!buffer) { + buffer = temp_buffer; + } + ssize_t r = connection->read(len, buffer, + [&next, this](char *buffer, int r) { + next.setParams(buffer, r); + CONTINUATION_RUN(next); + }); + if (r <= 0) { + next.setParams(buffer, r); + return &next; + } + + return nullptr; +} + +CtPtr ProtocolV1::write(CONTINUATION_TX_TYPE<ProtocolV1> &next, + bufferlist &buffer) { + ssize_t r = connection->write(buffer, [&next, this](int r) { + next.setParams(r); + CONTINUATION_RUN(next); + }); + if (r <= 0) { + next.setParams(r); + return &next; + } + + return nullptr; +} + +CtPtr ProtocolV1::ready() { + ldout(cct, 25) << __func__ << dendl; + + // make sure no pending tick timer + if (connection->last_tick_id) { + connection->center->delete_time_event(connection->last_tick_id); + } + connection->last_tick_id = connection->center->create_time_event( + connection->inactive_timeout_us, connection->tick_handler); + + connection->write_lock.lock(); + can_write = WriteStatus::CANWRITE; + if (is_queued()) { + connection->center->dispatch_event_external(connection->write_handler); + } + connection->write_lock.unlock(); + connection->maybe_start_delay_thread(); + + state = OPENED; + return wait_message(); +} + +CtPtr ProtocolV1::wait_message() { + if (state != OPENED) { // must have changed due to a replace + return nullptr; + } + + ldout(cct, 20) << __func__ << dendl; + + return READ(sizeof(char), handle_message); +} + +CtPtr ProtocolV1::handle_message(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read tag failed" << dendl; + return _fault(); + } + + char tag = buffer[0]; + ldout(cct, 20) << __func__ << " process tag " << (int)tag << dendl; + + if (tag == CEPH_MSGR_TAG_KEEPALIVE) { + ldout(cct, 20) << __func__ << " got KEEPALIVE" << dendl; + connection->set_last_keepalive(ceph_clock_now()); + } else if 
(tag == CEPH_MSGR_TAG_KEEPALIVE2) {
+    return READ(sizeof(ceph_timespec), handle_keepalive2);
+  } else if (tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
+    return READ(sizeof(ceph_timespec), handle_keepalive2_ack);
+  } else if (tag == CEPH_MSGR_TAG_ACK) {
+    return READ(sizeof(ceph_le64), handle_tag_ack);
+  } else if (tag == CEPH_MSGR_TAG_MSG) {
+#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
+    ltt_recv_stamp = ceph_clock_now();
+#endif
+    recv_stamp = ceph_clock_now();
+    ldout(cct, 20) << __func__ << " begin MSG" << dendl;
+    return READ(sizeof(ceph_msg_header), handle_message_header);
+  } else if (tag == CEPH_MSGR_TAG_CLOSE) {
+    ldout(cct, 20) << __func__ << " got CLOSE" << dendl;
+    stop();
+  } else {
+    ldout(cct, 0) << __func__ << " bad tag " << (int)tag << dendl;
+    return _fault();
+  }
+  return nullptr;
+}
+
+CtPtr ProtocolV1::handle_keepalive2(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read keepalive timespec failed" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 30) << __func__ << " got KEEPALIVE2 tag ..." << dendl;
+
+  ceph_timespec *t;
+  t = (ceph_timespec *)buffer;
+  utime_t kp_t = utime_t(*t);
+  connection->write_lock.lock();
+  append_keepalive_or_ack(true, &kp_t);
+  connection->write_lock.unlock();
+
+  ldout(cct, 20) << __func__ << " got KEEPALIVE2 " << kp_t << dendl;
+  connection->set_last_keepalive(ceph_clock_now());
+
+  if (is_connected()) {
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+
+  return CONTINUE(wait_message);
+}
+
+void ProtocolV1::append_keepalive_or_ack(bool ack, utime_t *tp) {
+  ldout(cct, 10) << __func__ << dendl;
+  if (ack) {
+    ceph_assert(tp);
+    struct ceph_timespec ts;
+    tp->encode_timeval(&ts);
+    connection->outgoing_bl.append(CEPH_MSGR_TAG_KEEPALIVE2_ACK);
+    connection->outgoing_bl.append((char *)&ts, sizeof(ts));
+  } else if (connection->has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) {
+    struct ceph_timespec ts;
+    utime_t t = ceph_clock_now();
+    t.encode_timeval(&ts);
+    connection->outgoing_bl.append(CEPH_MSGR_TAG_KEEPALIVE2);
+    connection->outgoing_bl.append((char *)&ts, sizeof(ts));
+  } else {
+    connection->outgoing_bl.append(CEPH_MSGR_TAG_KEEPALIVE);
+  }
+}
+
+CtPtr ProtocolV1::handle_keepalive2_ack(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read keepalive timespec failed" << dendl;
+    return _fault();
+  }
+
+  ceph_timespec *t;
+  t = (ceph_timespec *)buffer;
+  connection->set_last_keepalive_ack(utime_t(*t));
+  ldout(cct, 20) << __func__ << " got KEEPALIVE_ACK" << dendl;
+
+  return CONTINUE(wait_message);
+}
+
+CtPtr ProtocolV1::handle_tag_ack(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read ack seq failed" << dendl;
+    return _fault();
+  }
+
+  ceph_le64 seq;
+  seq = *(ceph_le64 *)buffer;
+  ldout(cct, 20) << __func__ << " got ACK" << dendl;
+
+  ldout(cct, 15) << __func__ << " got ack seq " << seq << dendl;
+  // trim sent list
+  static const int max_pending = 128;
+  int i = 0;
+  Message *pending[max_pending];
+  connection->write_lock.lock();
+  while (!sent.empty() && sent.front()->get_seq() <= seq && i < max_pending) {
+    Message *m = sent.front();
+    sent.pop_front();
+    pending[i++] = m;
+    ldout(cct, 10) << __func__ << " got ack seq " << seq
+                   << " >= " << m->get_seq() << " on " << m << " " << *m
+                   << dendl;
+  }
+  connection->write_lock.unlock();
+  for (int k = 0; k < i; k++) {
+    pending[k]->put();
+  }
+
+  return CONTINUE(wait_message);
+}
+
+CtPtr ProtocolV1::handle_message_header(char *buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read message header failed" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 20) << __func__ << " got MSG header" << dendl;
+
+  current_header = *((ceph_msg_header *)buffer);
+
+  ldout(cct, 20) << __func__ << " got envelope type=" << current_header.type << " src "
+                 << entity_name_t(current_header.src) << " front=" << current_header.front_len
+                 << " data=" << current_header.data_len << " off " << current_header.data_off
+                 << dendl;
+
+  if (messenger->crcflags & MSG_CRC_HEADER) {
+    __u32 header_crc = 0;
+    header_crc = ceph_crc32c(0, (unsigned char *)&current_header,
+                             sizeof(current_header) - sizeof(current_header.crc));
+    // verify header crc
+    if (header_crc != current_header.crc) {
+      ldout(cct, 0) << __func__ << " got bad header crc " << header_crc
+                    << " != " << current_header.crc << dendl;
+      return _fault();
+    }
+  }
+
+  // Reset state
+  data_buf.clear();
+  front.clear();
+  middle.clear();
+  data.clear();
+
+  state = THROTTLE_MESSAGE;
+  return CONTINUE(throttle_message);
+}
+
+CtPtr ProtocolV1::throttle_message() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  if (connection->policy.throttler_messages) {
+    ldout(cct, 10) << __func__ << " wants " << 1
+                   << " message from policy throttler "
+                   << connection->policy.throttler_messages->get_current()
+                   << "/" << connection->policy.throttler_messages->get_max()
+                   << dendl;
+    if (!connection->policy.throttler_messages->get_or_fail()) {
+      ldout(cct, 10) << __func__ << " wants 1 message from policy throttle "
+                     << connection->policy.throttler_messages->get_current()
+                     << "/" << connection->policy.throttler_messages->get_max()
+                     << " failed, just wait." << dendl;
+      // the thread pool may take a while to drain a full message queue,
+      // so wait a millisecond and retry.
+      if (connection->register_time_events.empty()) {
+        connection->register_time_events.insert(
+            connection->center->create_time_event(1000,
+                                                  connection->wakeup_handler));
+      }
+      return nullptr;
+    }
+  }
+
+  state = THROTTLE_BYTES;
+  return CONTINUE(throttle_bytes);
+}
+
+CtPtr ProtocolV1::throttle_bytes() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  cur_msg_size = current_header.front_len + current_header.middle_len +
+                 current_header.data_len;
+  if (cur_msg_size) {
+    if (connection->policy.throttler_bytes) {
+      ldout(cct, 10) << __func__ << " wants " << cur_msg_size
+                     << " bytes from policy throttler "
+                     << connection->policy.throttler_bytes->get_current() << "/"
+                     << connection->policy.throttler_bytes->get_max() << dendl;
+      if (!connection->policy.throttler_bytes->get_or_fail(cur_msg_size)) {
+        ldout(cct, 10) << __func__ << " wants " << cur_msg_size
+                       << " bytes from policy throttler "
+                       << connection->policy.throttler_bytes->get_current()
+                       << "/" << connection->policy.throttler_bytes->get_max()
+                       << " failed, just wait." << dendl;
+        // the thread pool may take a while to drain a full message queue,
+        // so wait a millisecond and retry.
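+        // only one pending wakeup per connection is needed, hence the
+        // empty() check before registering another time event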
+ if (connection->register_time_events.empty()) { + connection->register_time_events.insert( + connection->center->create_time_event( + 1000, connection->wakeup_handler)); + } + return nullptr; + } + } + } + + state = THROTTLE_DISPATCH_QUEUE; + return CONTINUE(throttle_dispatch_queue); +} + +CtPtr ProtocolV1::throttle_dispatch_queue() { + ldout(cct, 20) << __func__ << dendl; + + if (cur_msg_size) { + if (!connection->dispatch_queue->dispatch_throttler.get_or_fail( + cur_msg_size)) { + ldout(cct, 10) + << __func__ << " wants " << cur_msg_size + << " bytes from dispatch throttle " + << connection->dispatch_queue->dispatch_throttler.get_current() << "/" + << connection->dispatch_queue->dispatch_throttler.get_max() + << " failed, just wait." << dendl; + // following thread pool deal with th full message queue isn't a + // short time, so we can wait a ms. + if (connection->register_time_events.empty()) { + connection->register_time_events.insert( + connection->center->create_time_event(1000, + connection->wakeup_handler)); + } + return nullptr; + } + } + + throttle_stamp = ceph_clock_now(); + + state = READ_MESSAGE_FRONT; + return read_message_front(); +} + +CtPtr ProtocolV1::read_message_front() { + ldout(cct, 20) << __func__ << dendl; + + unsigned front_len = current_header.front_len; + if (front_len) { + if (!front.length()) { + front.push_back(buffer::create(front_len)); + } + return READB(front_len, front.c_str(), handle_message_front); + } + return read_message_middle(); +} + +CtPtr ProtocolV1::handle_message_front(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read message front failed" << dendl; + return _fault(); + } + + ldout(cct, 20) << __func__ << " got front " << front.length() << dendl; + + return read_message_middle(); +} + +CtPtr ProtocolV1::read_message_middle() { + ldout(cct, 20) << __func__ << dendl; + + if (current_header.middle_len) { + if (!middle.length()) { + middle.push_back(buffer::create(current_header.middle_len)); + } + return READB(current_header.middle_len, middle.c_str(), + handle_message_middle); + } + + return read_message_data_prepare(); +} + +CtPtr ProtocolV1::handle_message_middle(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read message middle failed" << dendl; + return _fault(); + } + + ldout(cct, 20) << __func__ << " got middle " << middle.length() << dendl; + + return read_message_data_prepare(); +} + +CtPtr ProtocolV1::read_message_data_prepare() { + ldout(cct, 20) << __func__ << dendl; + + unsigned data_len = le32_to_cpu(current_header.data_len); + unsigned data_off = le32_to_cpu(current_header.data_off); + + if (data_len) { + // get a buffer +#if 0 + // rx_buffers is broken by design... 
see + // http://tracker.ceph.com/issues/22480 + map<ceph_tid_t, pair<bufferlist, int> >::iterator p = + connection->rx_buffers.find(current_header.tid); + if (p != connection->rx_buffers.end()) { + ldout(cct, 10) << __func__ << " seleting rx buffer v " << p->second.second + << " at offset " << data_off << " len " + << p->second.first.length() << dendl; + data_buf = p->second.first; + // make sure it's big enough + if (data_buf.length() < data_len) + data_buf.push_back(buffer::create(data_len - data_buf.length())); + data_blp = data_buf.begin(); + } else { + ldout(cct, 20) << __func__ << " allocating new rx buffer at offset " + << data_off << dendl; + alloc_aligned_buffer(data_buf, data_len, data_off); + data_blp = data_buf.begin(); + } +#else + ldout(cct, 20) << __func__ << " allocating new rx buffer at offset " + << data_off << dendl; + alloc_aligned_buffer(data_buf, data_len, data_off); + data_blp = data_buf.begin(); +#endif + } + + msg_left = data_len; + + return CONTINUE(read_message_data); +} + +CtPtr ProtocolV1::read_message_data() { + ldout(cct, 20) << __func__ << " msg_left=" << msg_left << dendl; + + if (msg_left > 0) { + bufferptr bp = data_blp.get_current_ptr(); + unsigned read_len = std::min(bp.length(), msg_left); + + return READB(read_len, bp.c_str(), handle_message_data); + } + + return read_message_footer(); +} + +CtPtr ProtocolV1::handle_message_data(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read data error " << dendl; + return _fault(); + } + + bufferptr bp = data_blp.get_current_ptr(); + unsigned read_len = std::min(bp.length(), msg_left); + ceph_assert(read_len < std::numeric_limits<int>::max()); + data_blp.advance(read_len); + data.append(bp, 0, read_len); + msg_left -= read_len; + + return CONTINUE(read_message_data); +} + +CtPtr ProtocolV1::read_message_footer() { + ldout(cct, 20) << __func__ << dendl; + + state = READ_FOOTER_AND_DISPATCH; + + unsigned len; + if (connection->has_feature(CEPH_FEATURE_MSG_AUTH)) { + len = sizeof(ceph_msg_footer); + } else { + len = sizeof(ceph_msg_footer_old); + } + + return READ(len, handle_message_footer); +} + +CtPtr ProtocolV1::handle_message_footer(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read footer data error " << dendl; + return _fault(); + } + + ceph_msg_footer footer; + ceph_msg_footer_old old_footer; + + if (connection->has_feature(CEPH_FEATURE_MSG_AUTH)) { + footer = *((ceph_msg_footer *)buffer); + } else { + old_footer = *((ceph_msg_footer_old *)buffer); + footer.front_crc = old_footer.front_crc; + footer.middle_crc = old_footer.middle_crc; + footer.data_crc = old_footer.data_crc; + footer.sig = 0; + footer.flags = old_footer.flags; + } + + int aborted = (footer.flags & CEPH_MSG_FOOTER_COMPLETE) == 0; + ldout(cct, 10) << __func__ << " aborted = " << aborted << dendl; + if (aborted) { + ldout(cct, 0) << __func__ << " got " << front.length() << " + " + << middle.length() << " + " << data.length() + << " byte message.. 
ABORTED" << dendl; + return _fault(); + } + + ldout(cct, 20) << __func__ << " got " << front.length() << " + " + << middle.length() << " + " << data.length() << " byte message" + << dendl; + Message *message = decode_message(cct, messenger->crcflags, current_header, + footer, front, middle, data, connection); + if (!message) { + ldout(cct, 1) << __func__ << " decode message failed " << dendl; + return _fault(); + } + + // + // Check the signature if one should be present. A zero return indicates + // success. PLR + // + + if (session_security.get() == NULL) { + ldout(cct, 10) << __func__ << " no session security set" << dendl; + } else { + if (session_security->check_message_signature(message)) { + ldout(cct, 0) << __func__ << " Signature check failed" << dendl; + message->put(); + return _fault(); + } + } + message->set_byte_throttler(connection->policy.throttler_bytes); + message->set_message_throttler(connection->policy.throttler_messages); + + // store reservation size in message, so we don't get confused + // by messages entering the dispatch queue through other paths. + message->set_dispatch_throttle_size(cur_msg_size); + + message->set_recv_stamp(recv_stamp); + message->set_throttle_stamp(throttle_stamp); + message->set_recv_complete_stamp(ceph_clock_now()); + + // check received seq#. if it is old, drop the message. + // note that incoming messages may skip ahead. this is convenient for the + // client side queueing because messages can't be renumbered, but the (kernel) + // client will occasionally pull a message out of the sent queue to send + // elsewhere. in that case it doesn't matter if we "got" it or not. + uint64_t cur_seq = in_seq; + if (message->get_seq() <= cur_seq) { + ldout(cct, 0) << __func__ << " got old message " << message->get_seq() + << " <= " << cur_seq << " " << message << " " << *message + << ", discarding" << dendl; + message->put(); + if (connection->has_feature(CEPH_FEATURE_RECONNECT_SEQ) && + cct->_conf->ms_die_on_old_message) { + ceph_assert(0 == "old msgs despite reconnect_seq feature"); + } + return nullptr; + } + if (message->get_seq() > cur_seq + 1) { + ldout(cct, 0) << __func__ << " missed message? skipped from seq " + << cur_seq << " to " << message->get_seq() << dendl; + if (cct->_conf->ms_die_on_skipped_message) { + ceph_assert(0 == "skipped incoming seq"); + } + } + +#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE) + if (message->get_type() == CEPH_MSG_OSD_OP || + message->get_type() == CEPH_MSG_OSD_OPREPLY) { + utime_t ltt_processed_stamp = ceph_clock_now(); + double usecs_elapsed = + (ltt_processed_stamp.to_nsec() - ltt_recv_stamp.to_nsec()) / 1000; + ostringstream buf; + if (message->get_type() == CEPH_MSG_OSD_OP) + OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OP", + false); + else + OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OPREPLY", + false); + } +#endif + + // note last received message. 
+ in_seq = message->get_seq(); + ldout(cct, 5) << " rx " << message->get_source() << " seq " + << message->get_seq() << " " << message << " " << *message + << dendl; + + bool need_dispatch_writer = false; + if (!connection->policy.lossy) { + ack_left++; + need_dispatch_writer = true; + } + + state = OPENED; + + connection->logger->inc(l_msgr_recv_messages); + connection->logger->inc( + l_msgr_recv_bytes, + cur_msg_size + sizeof(ceph_msg_header) + sizeof(ceph_msg_footer)); + + messenger->ms_fast_preprocess(message); + auto fast_dispatch_time = ceph::mono_clock::now(); + connection->logger->tinc(l_msgr_running_recv_time, + fast_dispatch_time - connection->recv_start_time); + if (connection->delay_state) { + double delay_period = 0; + if (rand() % 10000 < cct->_conf->ms_inject_delay_probability * 10000.0) { + delay_period = + cct->_conf->ms_inject_delay_max * (double)(rand() % 10000) / 10000.0; + ldout(cct, 1) << "queue_received will delay after " + << (ceph_clock_now() + delay_period) << " on " << message + << " " << *message << dendl; + } + connection->delay_state->queue(delay_period, message); + } else if (messenger->ms_can_fast_dispatch(message)) { + connection->lock.unlock(); + connection->dispatch_queue->fast_dispatch(message); + connection->recv_start_time = ceph::mono_clock::now(); + connection->logger->tinc(l_msgr_running_fast_dispatch_time, + connection->recv_start_time - fast_dispatch_time); + connection->lock.lock(); + } else { + connection->dispatch_queue->enqueue(message, message->get_priority(), + connection->conn_id); + } + + // clean up local buffer references + data_buf.clear(); + front.clear(); + middle.clear(); + data.clear(); + + if (need_dispatch_writer && connection->is_connected()) { + connection->center->dispatch_event_external(connection->write_handler); + } + + return CONTINUE(wait_message); +} + +void ProtocolV1::session_reset() { + ldout(cct, 10) << __func__ << " started" << dendl; + + std::lock_guard<std::mutex> l(connection->write_lock); + if (connection->delay_state) { + connection->delay_state->discard(); + } + + connection->dispatch_queue->discard_queue(connection->conn_id); + discard_out_queue(); + // note: we need to clear outgoing_bl here, but session_reset may be + // called by other thread, so let caller clear this itself! + // outgoing_bl.clear(); + + connection->dispatch_queue->queue_remote_reset(connection); + + randomize_out_seq(); + + in_seq = 0; + connect_seq = 0; + // it's safe to directly set 0, double locked + ack_left = 0; + once_ready = false; + can_write = WriteStatus::NOWRITE; +} + +void ProtocolV1::randomize_out_seq() { + if (connection->get_features() & CEPH_FEATURE_MSG_AUTH) { + // Set out_seq to a random value, so CRC won't be predictable. + auto rand_seq = ceph::util::generate_random_number<uint64_t>(0, SEQ_MASK); + ldout(cct, 10) << __func__ << " randomize_out_seq " << rand_seq << dendl; + out_seq = rand_seq; + } else { + // previously, seq #'s always started at 0. + out_seq = 0; + } +} + +ssize_t ProtocolV1::write_message(Message *m, bufferlist &bl, bool more) { + FUNCTRACE(cct); + ceph_assert(connection->center->in_thread()); + m->set_seq(++out_seq); + + if (messenger->crcflags & MSG_CRC_HEADER) { + m->calc_header_crc(); + } + + ceph_msg_header &header = m->get_header(); + ceph_msg_footer &footer = m->get_footer(); + + // TODO: let sign_message could be reentry? + // Now that we have all the crcs calculated, handle the + // digital signature for the message, if the AsyncConnection has session + // security set up. 
+  // Some session security options do not
+  // actually calculate and check the signature, but they should
+  // handle the calls to sign_message and check_signature.  PLR
+  if (session_security.get() == NULL) {
+    ldout(cct, 20) << __func__ << " no session security" << dendl;
+  } else {
+    if (session_security->sign_message(m)) {
+      ldout(cct, 20) << __func__ << " failed to sign m=" << m
+                     << ", sig = " << footer.sig << dendl;
+    } else {
+      ldout(cct, 20) << __func__ << " signed m=" << m
+                     << ", sig = " << footer.sig << dendl;
+    }
+  }
+
+  connection->outgoing_bl.append(CEPH_MSGR_TAG_MSG);
+  connection->outgoing_bl.append((char *)&header, sizeof(header));
+
+  ldout(cct, 20) << __func__ << " sending message type=" << header.type
+                 << " src " << entity_name_t(header.src)
+                 << " front=" << header.front_len << " data=" << header.data_len
+                 << " off " << header.data_off << dendl;
+
+  if ((bl.length() <= ASYNC_COALESCE_THRESHOLD) && (bl.buffers().size() > 1)) {
+    for (const auto &pb : bl.buffers()) {
+      connection->outgoing_bl.append((char *)pb.c_str(), pb.length());
+    }
+  } else {
+    connection->outgoing_bl.claim_append(bl);
+  }
+
+  // send footer; if receiver doesn't support signatures, use the old footer
+  // format
+  ceph_msg_footer_old old_footer;
+  if (connection->has_feature(CEPH_FEATURE_MSG_AUTH)) {
+    connection->outgoing_bl.append((char *)&footer, sizeof(footer));
+  } else {
+    if (messenger->crcflags & MSG_CRC_HEADER) {
+      old_footer.front_crc = footer.front_crc;
+      old_footer.middle_crc = footer.middle_crc;
+      old_footer.data_crc = footer.data_crc;
+    } else {
+      old_footer.front_crc = old_footer.middle_crc = 0;
+    }
+    old_footer.data_crc =
+        messenger->crcflags & MSG_CRC_DATA ? footer.data_crc : 0;
+    old_footer.flags = footer.flags;
+    connection->outgoing_bl.append((char *)&old_footer, sizeof(old_footer));
+  }
+
+  m->trace.event("async writing message");
+  ldout(cct, 20) << __func__ << " sending " << m->get_seq() << " " << m
+                 << dendl;
+  ssize_t total_send_size = connection->outgoing_bl.length();
+  ssize_t rc = connection->_try_send(more);
+  if (rc < 0) {
+    ldout(cct, 1) << __func__ << " error sending " << m << ", "
+                  << cpp_strerror(rc) << dendl;
+  } else {
+    connection->logger->inc(
+        l_msgr_send_bytes, total_send_size - connection->outgoing_bl.length());
+    ldout(cct, 10) << __func__ << " sending " << m
+                   << (rc ? " continues." : " done.") << dendl;
+  }
+  if (m->get_type() == CEPH_MSG_OSD_OP)
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OP_END", false);
+  else if (m->get_type() == CEPH_MSG_OSD_OPREPLY)
+    OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OPREPLY_END", false);
+  m->put();
+
+  return rc;
+}
+
+void ProtocolV1::requeue_sent() {
+  write_in_progress = false;
+  if (sent.empty()) {
+    return;
+  }
+
+  list<pair<bufferlist, Message *> > &rq = out_q[CEPH_MSG_PRIO_HIGHEST];
+  out_seq -= sent.size();
+  while (!sent.empty()) {
+    Message *m = sent.back();
+    sent.pop_back();
+    ldout(cct, 10) << __func__ << " " << *m << " for resend "
+                   << " (" << m->get_seq() << ")" << dendl;
+    rq.push_front(make_pair(bufferlist(), m));
+  }
+}
+
+uint64_t ProtocolV1::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) {
+  ldout(cct, 10) << __func__ << " " << seq << dendl;
+  std::lock_guard<std::mutex> l(connection->write_lock);
+  if (out_q.count(CEPH_MSG_PRIO_HIGHEST) == 0) {
+    return seq;
+  }
+  list<pair<bufferlist, Message *> > &rq = out_q[CEPH_MSG_PRIO_HIGHEST];
+  uint64_t count = out_seq;
+  while (!rq.empty()) {
+    pair<bufferlist, Message *> p = rq.front();
+    if (p.second->get_seq() == 0 || p.second->get_seq() > seq) break;
+    ldout(cct, 10) << __func__ << " " << *(p.second) << " for resend seq "
+                   << p.second->get_seq() << " <= " << seq << ", discarding"
+                   << dendl;
+    p.second->put();
+    rq.pop_front();
+    count++;
+  }
+  if (rq.empty()) out_q.erase(CEPH_MSG_PRIO_HIGHEST);
+  return count;
+}
+
+/*
+ * Tears down the message queues, and removes them from the
+ * DispatchQueue. Must hold write_lock prior to calling.
+ */
+void ProtocolV1::discard_out_queue() {
+  ldout(cct, 10) << __func__ << " started" << dendl;
+
+  for (list<Message *>::iterator p = sent.begin(); p != sent.end(); ++p) {
+    ldout(cct, 20) << __func__ << " discard " << *p << dendl;
+    (*p)->put();
+  }
+  sent.clear();
+  for (map<int, list<pair<bufferlist, Message *> > >::iterator p =
+           out_q.begin();
+       p != out_q.end(); ++p) {
+    for (list<pair<bufferlist, Message *> >::iterator r = p->second.begin();
+         r != p->second.end(); ++r) {
+      ldout(cct, 20) << __func__ << " discard " << r->second << dendl;
+      r->second->put();
+    }
+  }
+  out_q.clear();
+  write_in_progress = false;
+}
+
+void ProtocolV1::reset_security()
+{
+  ldout(cct, 5) << __func__ << dendl;
+
+  // clean up internal state variables
+  if (state == CONNECTING_SEND_CONNECT_MSG) {
+    if (authorizer) {
+      delete authorizer;
+    }
+    authorizer = nullptr;
+  }
+}
+
+void ProtocolV1::reset_recv_state() {
+  ldout(cct, 5) << __func__ << dendl;
+
+  // execute in the same thread that uses the `session_security`.
+  // We need to do the wrap because holding `write_lock` is not
+  // enough as `write_event()` releases it just before calling
+  // `write_message()`. `submit_to()` here is NOT blocking.
+  if (!connection->center->in_thread()) {
+    connection->center->submit_to(connection->center->get_id(), [this] {
+      ldout(cct, 5) << "reset_recv_state (wrapped) resetting security handlers"
+                    << dendl;
+      // Possibly unnecessary. See the comment in `deactivate_existing`.
+ std::lock_guard<std::mutex> l(connection->lock); + std::lock_guard<std::mutex> wl(connection->write_lock); + reset_security(); + }, /* nowait = */true); + } else { + reset_security(); + } + + // clean read and write callbacks + connection->pendingReadLen.reset(); + connection->writeCallback.reset(); + + if (state > THROTTLE_MESSAGE && state <= READ_FOOTER_AND_DISPATCH && + connection->policy.throttler_messages) { + ldout(cct, 10) << __func__ << " releasing " << 1 + << " message to policy throttler " + << connection->policy.throttler_messages->get_current() + << "/" << connection->policy.throttler_messages->get_max() + << dendl; + connection->policy.throttler_messages->put(); + } + if (state > THROTTLE_BYTES && state <= READ_FOOTER_AND_DISPATCH) { + if (connection->policy.throttler_bytes) { + ldout(cct, 10) << __func__ << " releasing " << cur_msg_size + << " bytes to policy throttler " + << connection->policy.throttler_bytes->get_current() << "/" + << connection->policy.throttler_bytes->get_max() << dendl; + connection->policy.throttler_bytes->put(cur_msg_size); + } + } + if (state > THROTTLE_DISPATCH_QUEUE && state <= READ_FOOTER_AND_DISPATCH) { + ldout(cct, 10) + << __func__ << " releasing " << cur_msg_size + << " bytes to dispatch_queue throttler " + << connection->dispatch_queue->dispatch_throttler.get_current() << "/" + << connection->dispatch_queue->dispatch_throttler.get_max() << dendl; + connection->dispatch_queue->dispatch_throttle_release(cur_msg_size); + } +} + +Message *ProtocolV1::_get_next_outgoing(bufferlist *bl) { + Message *m = 0; + if (!out_q.empty()) { + map<int, list<pair<bufferlist, Message *> > >::reverse_iterator it = + out_q.rbegin(); + ceph_assert(!it->second.empty()); + list<pair<bufferlist, Message *> >::iterator p = it->second.begin(); + m = p->second; + if (bl) bl->swap(p->first); + it->second.erase(p); + if (it->second.empty()) out_q.erase(it->first); + } + return m; +} + +/** + * Client Protocol V1 + **/ + +CtPtr ProtocolV1::send_client_banner() { + ldout(cct, 20) << __func__ << dendl; + state = CONNECTING; + + bufferlist bl; + bl.append(CEPH_BANNER, strlen(CEPH_BANNER)); + return WRITE(bl, handle_client_banner_write); +} + +CtPtr ProtocolV1::handle_client_banner_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " write client banner failed" << dendl; + return _fault(); + } + ldout(cct, 10) << __func__ << " connect write banner done: " + << connection->get_peer_addr() << dendl; + + return wait_server_banner(); +} + +CtPtr ProtocolV1::wait_server_banner() { + state = CONNECTING_WAIT_BANNER_AND_IDENTIFY; + + ldout(cct, 20) << __func__ << dendl; + + bufferlist myaddrbl; + unsigned banner_len = strlen(CEPH_BANNER); + unsigned need_len = banner_len + sizeof(ceph_entity_addr) * 2; + return READ(need_len, handle_server_banner_and_identify); +} + +CtPtr ProtocolV1::handle_server_banner_and_identify(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read banner and identify addresses failed" + << dendl; + return _fault(); + } + + unsigned banner_len = strlen(CEPH_BANNER); + if (memcmp(buffer, CEPH_BANNER, banner_len)) { + ldout(cct, 0) << __func__ << " connect protocol error (bad banner) on peer " + << connection->get_peer_addr() << dendl; + return _fault(); + } + + bufferlist bl; + entity_addr_t paddr, peer_addr_for_me; + + bl.append(buffer + banner_len, sizeof(ceph_entity_addr) * 2); + auto p = bl.cbegin(); + try { + 
decode(paddr, p); + decode(peer_addr_for_me, p); + } catch (const buffer::error &e) { + lderr(cct) << __func__ << " decode peer addr failed " << dendl; + return _fault(); + } + ldout(cct, 20) << __func__ << " connect read peer addr " << paddr + << " on socket " << connection->cs.fd() << dendl; + + entity_addr_t peer_addr = connection->peer_addrs->legacy_addr(); + if (peer_addr != paddr) { + if (paddr.is_blank_ip() && peer_addr.get_port() == paddr.get_port() && + peer_addr.get_nonce() == paddr.get_nonce()) { + ldout(cct, 0) << __func__ << " connect claims to be " << paddr << " not " + << peer_addr << " - presumably this is the same node!" + << dendl; + } else { + ldout(cct, 10) << __func__ << " connect claims to be " << paddr << " not " + << peer_addr << dendl; + return _fault(); + } + } + + ldout(cct, 20) << __func__ << " connect peer addr for me is " + << peer_addr_for_me << dendl; + if (messenger->get_myaddrs().empty() || + messenger->get_myaddrs().front().is_blank_ip()) { + sockaddr_storage ss; + socklen_t len = sizeof(ss); + getsockname(connection->cs.fd(), (sockaddr *)&ss, &len); + entity_addr_t a; + if (cct->_conf->ms_learn_addr_from_peer) { + ldout(cct, 1) << __func__ << " peer " << connection->target_addr + << " says I am " << peer_addr_for_me << " (socket says " + << (sockaddr*)&ss << ")" << dendl; + a = peer_addr_for_me; + } else { + ldout(cct, 1) << __func__ << " socket to " << connection->target_addr + << " says I am " << (sockaddr*)&ss + << " (peer says " << peer_addr_for_me << ")" << dendl; + a.set_sockaddr((sockaddr *)&ss); + } + a.set_type(entity_addr_t::TYPE_LEGACY); // anything but NONE; learned_addr ignores this + a.set_port(0); + connection->lock.unlock(); + messenger->learned_addr(a); + if (cct->_conf->ms_inject_internal_delays && + cct->_conf->ms_inject_socket_failures) { + if (rand() % cct->_conf->ms_inject_socket_failures == 0) { + ldout(cct, 10) << __func__ << " sleep for " + << cct->_conf->ms_inject_internal_delays << dendl; + utime_t t; + t.set_from_double(cct->_conf->ms_inject_internal_delays); + t.sleep(); + } + } + connection->lock.lock(); + if (state != CONNECTING_WAIT_BANNER_AND_IDENTIFY) { + ldout(cct, 1) << __func__ + << " state changed while learned_addr, mark_down or " + << " replacing must be happened just now" << dendl; + return nullptr; + } + } + + bufferlist myaddrbl; + encode(messenger->get_myaddr_legacy(), myaddrbl, 0); // legacy + return WRITE(myaddrbl, handle_my_addr_write); +} + +CtPtr ProtocolV1::handle_my_addr_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 2) << __func__ << " connect couldn't write my addr, " + << cpp_strerror(r) << dendl; + return _fault(); + } + ldout(cct, 10) << __func__ << " connect sent my addr " + << messenger->get_myaddr_legacy() << dendl; + + return CONTINUE(send_connect_message); +} + +CtPtr ProtocolV1::send_connect_message() { + state = CONNECTING_SEND_CONNECT_MSG; + + ldout(cct, 20) << __func__ << dendl; + + if (!authorizer) { + authorizer = messenger->ms_deliver_get_authorizer(connection->peer_type); + } + + ceph_msg_connect connect; + connect.features = connection->policy.features_supported; + connect.host_type = messenger->get_myname().type(); + connect.global_seq = global_seq; + connect.connect_seq = connect_seq; + connect.protocol_version = + messenger->get_proto_version(connection->peer_type, true); + connect.authorizer_protocol = authorizer ? authorizer->protocol : 0; + connect.authorizer_len = authorizer ? 
authorizer->bl.length() : 0; + + if (authorizer) { + ldout(cct, 10) << __func__ + << " connect_msg.authorizer_len=" << connect.authorizer_len + << " protocol=" << connect.authorizer_protocol << dendl; + } + + connect.flags = 0; + if (connection->policy.lossy) { + connect.flags |= + CEPH_MSG_CONNECT_LOSSY; // this is fyi, actually, server decides! + } + + bufferlist bl; + bl.append((char *)&connect, sizeof(connect)); + if (authorizer) { + bl.append(authorizer->bl.c_str(), authorizer->bl.length()); + } + + ldout(cct, 10) << __func__ << " connect sending gseq=" << global_seq + << " cseq=" << connect_seq + << " proto=" << connect.protocol_version << dendl; + + return WRITE(bl, handle_connect_message_write); +} + +CtPtr ProtocolV1::handle_connect_message_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 2) << __func__ << " connect couldn't send reply " + << cpp_strerror(r) << dendl; + return _fault(); + } + + ldout(cct, 20) << __func__ + << " connect wrote (self +) cseq, waiting for reply" << dendl; + + return wait_connect_reply(); +} + +CtPtr ProtocolV1::wait_connect_reply() { + ldout(cct, 20) << __func__ << dendl; + + // FIPS zeroization audit 20191115: this memset is not security related. + memset(&connect_reply, 0, sizeof(connect_reply)); + return READ(sizeof(connect_reply), handle_connect_reply_1); +} + +CtPtr ProtocolV1::handle_connect_reply_1(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read connect reply failed" << dendl; + return _fault(); + } + + connect_reply = *((ceph_msg_connect_reply *)buffer); + + ldout(cct, 20) << __func__ << " connect got reply tag " + << (int)connect_reply.tag << " connect_seq " + << connect_reply.connect_seq << " global_seq " + << connect_reply.global_seq << " proto " + << connect_reply.protocol_version << " flags " + << (int)connect_reply.flags << " features " + << connect_reply.features << dendl; + + if (connect_reply.authorizer_len) { + return wait_connect_reply_auth(); + } + + return handle_connect_reply_2(); +} + +CtPtr ProtocolV1::wait_connect_reply_auth() { + ldout(cct, 20) << __func__ << dendl; + + ldout(cct, 10) << __func__ + << " reply.authorizer_len=" << connect_reply.authorizer_len + << dendl; + + ceph_assert(connect_reply.authorizer_len < 4096); + + return READ(connect_reply.authorizer_len, handle_connect_reply_auth); +} + +CtPtr ProtocolV1::handle_connect_reply_auth(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read connect reply authorizer failed" + << dendl; + return _fault(); + } + + bufferlist authorizer_reply; + authorizer_reply.append(buffer, connect_reply.authorizer_len); + + if (connect_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { + ldout(cct, 10) << __func__ << " connect got auth challenge" << dendl; + authorizer->add_challenge(cct, authorizer_reply); + return CONTINUE(send_connect_message); + } + + auto iter = authorizer_reply.cbegin(); + if (authorizer && !authorizer->verify_reply(iter, + nullptr /* connection_secret */)) { + ldout(cct, 0) << __func__ << " failed verifying authorize reply" << dendl; + return _fault(); + } + + return handle_connect_reply_2(); +} + +CtPtr ProtocolV1::handle_connect_reply_2() { + ldout(cct, 20) << __func__ << dendl; + + if (connect_reply.tag == CEPH_MSGR_TAG_FEATURES) { + ldout(cct, 0) << __func__ << " connect protocol feature mismatch, my " + << std::hex << 
connection->policy.features_supported + << " < peer " << connect_reply.features << " missing " + << (connect_reply.features & + ~connection->policy.features_supported) + << std::dec << dendl; + return _fault(); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_BADPROTOVER) { + ldout(cct, 0) << __func__ << " connect protocol version mismatch, my " + << messenger->get_proto_version(connection->peer_type, true) + << " != " << connect_reply.protocol_version << dendl; + return _fault(); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_BADAUTHORIZER) { + ldout(cct, 0) << __func__ << " connect got BADAUTHORIZER" << dendl; + return _fault(); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_RESETSESSION) { + ldout(cct, 0) << __func__ << " connect got RESETSESSION" << dendl; + session_reset(); + connect_seq = 0; + + // see session_reset + connection->outgoing_bl.clear(); + + return CONTINUE(send_connect_message); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_RETRY_GLOBAL) { + global_seq = messenger->get_global_seq(connect_reply.global_seq); + ldout(cct, 5) << __func__ << " connect got RETRY_GLOBAL " + << connect_reply.global_seq << " chose new " << global_seq + << dendl; + return CONTINUE(send_connect_message); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_RETRY_SESSION) { + ceph_assert(connect_reply.connect_seq > connect_seq); + ldout(cct, 5) << __func__ << " connect got RETRY_SESSION " << connect_seq + << " -> " << connect_reply.connect_seq << dendl; + connect_seq = connect_reply.connect_seq; + return CONTINUE(send_connect_message); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_WAIT) { + ldout(cct, 1) << __func__ << " connect got WAIT (connection race)" << dendl; + state = WAIT; + return _fault(); + } + + uint64_t feat_missing; + feat_missing = + connection->policy.features_required & ~(uint64_t)connect_reply.features; + if (feat_missing) { + ldout(cct, 1) << __func__ << " missing required features " << std::hex + << feat_missing << std::dec << dendl; + return _fault(); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_SEQ) { + ldout(cct, 10) + << __func__ + << " got CEPH_MSGR_TAG_SEQ, reading acked_seq and writing in_seq" + << dendl; + + return wait_ack_seq(); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_READY) { + ldout(cct, 10) << __func__ << " got CEPH_MSGR_TAG_READY " << dendl; + } + + return client_ready(); +} + +CtPtr ProtocolV1::wait_ack_seq() { + ldout(cct, 20) << __func__ << dendl; + + return READ(sizeof(uint64_t), handle_ack_seq); +} + +CtPtr ProtocolV1::handle_ack_seq(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read connect ack seq failed" << dendl; + return _fault(); + } + + uint64_t newly_acked_seq = 0; + + newly_acked_seq = *((uint64_t *)buffer); + ldout(cct, 2) << __func__ << " got newly_acked_seq " << newly_acked_seq + << " vs out_seq " << out_seq << dendl; + out_seq = discard_requeued_up_to(out_seq, newly_acked_seq); + + bufferlist bl; + uint64_t s = in_seq; + bl.append((char *)&s, sizeof(s)); + + return WRITE(bl, handle_in_seq_write); +} + +CtPtr ProtocolV1::handle_in_seq_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 10) << __func__ << " failed to send in_seq " << dendl; + return _fault(); + } + + ldout(cct, 10) << __func__ << " send in_seq done " << dendl; + + return client_ready(); +} + +CtPtr ProtocolV1::client_ready() { + ldout(cct, 20) << __func__ << dendl; + + // hooray! 
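+  // The handshake has succeeded: record the server's global_seq, adopt
+  // the server's lossy decision, and intersect the advertised features
+  // with what this side supports before ready() is entered below.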
+ peer_global_seq = connect_reply.global_seq; + connection->policy.lossy = connect_reply.flags & CEPH_MSG_CONNECT_LOSSY; + + once_ready = true; + connect_seq += 1; + ceph_assert(connect_seq == connect_reply.connect_seq); + backoff = utime_t(); + connection->set_features((uint64_t)connect_reply.features & + (uint64_t)connection->policy.features_supported); + ldout(cct, 10) << __func__ << " connect success " << connect_seq + << ", lossy = " << connection->policy.lossy << ", features " + << connection->get_features() << dendl; + + // If we have an authorizer, get a new AuthSessionHandler to deal with + // ongoing security of the connection. PLR + if (authorizer != NULL) { + ldout(cct, 10) << __func__ << " setting up session_security with auth " + << authorizer << dendl; + session_security.reset(get_auth_session_handler( + cct, authorizer->protocol, + authorizer->session_key, + connection->get_features())); + } else { + // We have no authorizer, so we shouldn't be applying security to messages + // in this AsyncConnection. PLR + ldout(cct, 10) << __func__ << " no authorizer, clearing session_security" + << dendl; + session_security.reset(); + } + + if (connection->delay_state) { + ceph_assert(connection->delay_state->ready()); + } + connection->dispatch_queue->queue_connect(connection); + messenger->ms_deliver_handle_fast_connect(connection); + + return ready(); +} + +/** + * Server Protocol V1 + **/ + +CtPtr ProtocolV1::send_server_banner() { + ldout(cct, 20) << __func__ << dendl; + state = ACCEPTING; + + bufferlist bl; + + bl.append(CEPH_BANNER, strlen(CEPH_BANNER)); + + // as a server, we should have a legacy addr if we accepted this connection. + auto legacy = messenger->get_myaddrs().legacy_addr(); + encode(legacy, bl, 0); // legacy + connection->port = legacy.get_port(); + encode(connection->target_addr, bl, 0); // legacy + + ldout(cct, 1) << __func__ << " sd=" << connection->cs.fd() + << " legacy " << legacy + << " socket_addr " << connection->socket_addr + << " target_addr " << connection->target_addr + << dendl; + + return WRITE(bl, handle_server_banner_write); +} + +CtPtr ProtocolV1::handle_server_banner_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << " write server banner failed" << dendl; + return _fault(); + } + ldout(cct, 10) << __func__ << " write banner and addr done: " + << connection->get_peer_addr() << dendl; + + return wait_client_banner(); +} + +CtPtr ProtocolV1::wait_client_banner() { + ldout(cct, 20) << __func__ << dendl; + + return READ(strlen(CEPH_BANNER) + sizeof(ceph_entity_addr), + handle_client_banner); +} + +CtPtr ProtocolV1::handle_client_banner(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read peer banner and addr failed" << dendl; + return _fault(); + } + + if (memcmp(buffer, CEPH_BANNER, strlen(CEPH_BANNER))) { + ldout(cct, 1) << __func__ << " accept peer sent bad banner '" << buffer + << "' (should be '" << CEPH_BANNER << "')" << dendl; + return _fault(); + } + + bufferlist addr_bl; + entity_addr_t peer_addr; + + addr_bl.append(buffer + strlen(CEPH_BANNER), sizeof(ceph_entity_addr)); + try { + auto ti = addr_bl.cbegin(); + decode(peer_addr, ti); + } catch (const buffer::error &e) { + lderr(cct) << __func__ << " decode peer_addr failed " << dendl; + return _fault(); + } + + ldout(cct, 10) << __func__ << " accept peer addr is " << peer_addr << dendl; + if (peer_addr.is_blank_ip()) { + // peer apparently doesn't know what ip 
they have; figure it out for them. + int port = peer_addr.get_port(); + peer_addr.set_sockaddr(connection->target_addr.get_sockaddr()); + peer_addr.set_port(port); + + ldout(cct, 0) << __func__ << " accept peer addr is really " << peer_addr + << " (socket is " << connection->target_addr << ")" << dendl; + } + connection->set_peer_addr(peer_addr); // so that connection_state gets set up + connection->target_addr = peer_addr; + + return CONTINUE(wait_connect_message); +} + +CtPtr ProtocolV1::wait_connect_message() { + ldout(cct, 20) << __func__ << dendl; + + // FIPS zeroization audit 20191115: this memset is not security related. + memset(&connect_msg, 0, sizeof(connect_msg)); + return READ(sizeof(connect_msg), handle_connect_message_1); +} + +CtPtr ProtocolV1::handle_connect_message_1(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read connect msg failed" << dendl; + return _fault(); + } + + connect_msg = *((ceph_msg_connect *)buffer); + + state = ACCEPTING_WAIT_CONNECT_MSG_AUTH; + + if (connect_msg.authorizer_len) { + return wait_connect_message_auth(); + } + + return handle_connect_message_2(); +} + +CtPtr ProtocolV1::wait_connect_message_auth() { + ldout(cct, 20) << __func__ << dendl; + authorizer_buf.clear(); + authorizer_buf.push_back(buffer::create(connect_msg.authorizer_len)); + return READB(connect_msg.authorizer_len, authorizer_buf.c_str(), + handle_connect_message_auth); +} + +CtPtr ProtocolV1::handle_connect_message_auth(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read connect authorizer failed" << dendl; + return _fault(); + } + + return handle_connect_message_2(); +} + +CtPtr ProtocolV1::handle_connect_message_2() { + ldout(cct, 20) << __func__ << dendl; + + ldout(cct, 20) << __func__ << " accept got peer connect_seq " + << connect_msg.connect_seq << " global_seq " + << connect_msg.global_seq << dendl; + + connection->set_peer_type(connect_msg.host_type); + connection->policy = messenger->get_policy(connect_msg.host_type); + + ldout(cct, 10) << __func__ << " accept of host_type " << connect_msg.host_type + << ", policy.lossy=" << connection->policy.lossy + << " policy.server=" << connection->policy.server + << " policy.standby=" << connection->policy.standby + << " policy.resetcheck=" << connection->policy.resetcheck + << " features 0x" << std::hex << (uint64_t)connect_msg.features + << std::dec + << dendl; + + ceph_msg_connect_reply reply; + bufferlist authorizer_reply; + + // FIPS zeroization audit 20191115: this memset is not security related. + memset(&reply, 0, sizeof(reply)); + reply.protocol_version = + messenger->get_proto_version(connection->peer_type, false); + + // mismatch? + ldout(cct, 10) << __func__ << " accept my proto " << reply.protocol_version + << ", their proto " << connect_msg.protocol_version << dendl; + + if (connect_msg.protocol_version != reply.protocol_version) { + return send_connect_message_reply(CEPH_MSGR_TAG_BADPROTOVER, reply, + authorizer_reply); + } + + // require signatures for cephx? 
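+  // The block below tightens policy.features_required from the cephx
+  // configuration *before* the feature check that follows it.  A rough
+  // sketch of the effect (config option names as used below):
+  //   cephx_require_signatures (or the cluster/service variant)
+  //       -> policy.features_required |= CEPH_FEATURE_MSG_AUTH
+  //   cephx_require_version >= 2 (or the cluster/service variant)
+  //       -> policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2
+  // The "cluster" variants apply to OSD/MDS/MGR peers; every other peer
+  // type uses the "service" variants.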
+ if (connect_msg.authorizer_protocol == CEPH_AUTH_CEPHX) { + if (connection->peer_type == CEPH_ENTITY_TYPE_OSD || + connection->peer_type == CEPH_ENTITY_TYPE_MDS || + connection->peer_type == CEPH_ENTITY_TYPE_MGR) { + if (cct->_conf->cephx_require_signatures || + cct->_conf->cephx_cluster_require_signatures) { + ldout(cct, 10) + << __func__ + << " using cephx, requiring MSG_AUTH feature bit for cluster" + << dendl; + connection->policy.features_required |= CEPH_FEATURE_MSG_AUTH; + } + if (cct->_conf->cephx_require_version >= 2 || + cct->_conf->cephx_cluster_require_version >= 2) { + ldout(cct, 10) + << __func__ + << " using cephx, requiring cephx v2 feature bit for cluster" + << dendl; + connection->policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2; + } + } else { + if (cct->_conf->cephx_require_signatures || + cct->_conf->cephx_service_require_signatures) { + ldout(cct, 10) + << __func__ + << " using cephx, requiring MSG_AUTH feature bit for service" + << dendl; + connection->policy.features_required |= CEPH_FEATURE_MSG_AUTH; + } + if (cct->_conf->cephx_require_version >= 2 || + cct->_conf->cephx_service_require_version >= 2) { + ldout(cct, 10) + << __func__ + << " using cephx, requiring cephx v2 feature bit for service" + << dendl; + connection->policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2; + } + } + } + + uint64_t feat_missing = + connection->policy.features_required & ~(uint64_t)connect_msg.features; + if (feat_missing) { + ldout(cct, 1) << __func__ << " peer missing required features " << std::hex + << feat_missing << std::dec << dendl; + return send_connect_message_reply(CEPH_MSGR_TAG_FEATURES, reply, + authorizer_reply); + } + + bufferlist auth_bl_copy = authorizer_buf; + connection->lock.unlock(); + ldout(cct,10) << __func__ << " authorizor_protocol " + << connect_msg.authorizer_protocol + << " len " << auth_bl_copy.length() + << dendl; + bool authorizer_valid; + bool need_challenge = HAVE_FEATURE(connect_msg.features, CEPHX_V2); + bool had_challenge = (bool)authorizer_challenge; + if (!messenger->ms_deliver_verify_authorizer( + connection, connection->peer_type, connect_msg.authorizer_protocol, + auth_bl_copy, authorizer_reply, authorizer_valid, session_key, + nullptr /* connection_secret */, + need_challenge ? &authorizer_challenge : nullptr) || + !authorizer_valid) { + connection->lock.lock(); + if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) { + ldout(cct, 1) << __func__ + << " state changed while accept, it must be mark_down" + << dendl; + ceph_assert(state == CLOSED); + return _fault(); + } + + if (need_challenge && !had_challenge && authorizer_challenge) { + ldout(cct, 10) << __func__ << ": challenging authorizer" << dendl; + ceph_assert(authorizer_reply.length()); + return send_connect_message_reply(CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER, + reply, authorizer_reply); + } else { + ldout(cct, 0) << __func__ << ": got bad authorizer, auth_reply_len=" + << authorizer_reply.length() << dendl; + session_security.reset(); + return send_connect_message_reply(CEPH_MSGR_TAG_BADAUTHORIZER, reply, + authorizer_reply); + } + } + + // We've verified the authorizer for this AsyncConnection, so set up the + // session security structure. PLR + ldout(cct, 10) << __func__ << " accept setting up session_security." << dendl; + + // existing? 
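+  // Rough decision ladder for the code that follows (the code itself is
+  // authoritative about ordering and edge cases):
+  //   existing is CLOSED                       -> open() a fresh session
+  //   existing mid-replace or stale global_seq -> TAG_RETRY_GLOBAL
+  //   existing is lossy                        -> session_reset() + replace()
+  //   peer cseq == 0 while ours > 0            -> hard reset: replace()
+  //   peer cseq <  existing cseq               -> TAG_RETRY_SESSION
+  //   peer cseq == existing cseq               -> connection race (WAIT/replace)
+  //   peer cseq >  existing cseq               -> reconnect: replace()
+  //                                               (or RESETSESSION if we reset)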
+  AsyncConnectionRef existing = messenger->lookup_conn(*connection->peer_addrs);
+
+  connection->inject_delay();
+
+  connection->lock.lock();
+  if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) {
+    ldout(cct, 1) << __func__
+                  << " state changed while accept, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED);
+    return _fault();
+  }
+
+  if (existing == connection) {
+    existing = nullptr;
+  }
+  if (existing && existing->protocol->proto_type != 1) {
+    ldout(cct,1) << __func__ << " existing " << existing << " proto "
+                 << existing->protocol.get() << " version is "
+                 << existing->protocol->proto_type << ", marking down" << dendl;
+    existing->mark_down();
+    existing = nullptr;
+  }
+
+  if (existing) {
+    // It is not possible for the existing connection to acquire this
+    // connection's lock
+    existing->lock.lock();  // skip lockdep check (we are locking a second
+                            // AsyncConnection here)
+
+    ldout(cct,10) << __func__ << " existing=" << existing << " exproto="
+                  << existing->protocol.get() << dendl;
+    ProtocolV1 *exproto = dynamic_cast<ProtocolV1 *>(existing->protocol.get());
+    ceph_assert(exproto);
+    ceph_assert(exproto->proto_type == 1);
+
+    if (exproto->state == CLOSED) {
+      ldout(cct, 1) << __func__ << " existing " << existing
+                    << " already closed." << dendl;
+      existing->lock.unlock();
+      existing = nullptr;
+
+      return open(reply, authorizer_reply);
+    }
+
+    if (exproto->replacing) {
+      ldout(cct, 1) << __func__
+                    << " existing racing replace happened while replacing."
+                    << " existing_state="
+                    << connection->get_state_name(existing->state) << dendl;
+      reply.global_seq = exproto->peer_global_seq;
+      existing->lock.unlock();
+      return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_GLOBAL, reply,
+                                        authorizer_reply);
+    }
+
+    if (connect_msg.global_seq < exproto->peer_global_seq) {
+      ldout(cct, 10) << __func__ << " accept existing " << existing << ".gseq "
+                     << exproto->peer_global_seq << " > "
+                     << connect_msg.global_seq << ", RETRY_GLOBAL" << dendl;
+      reply.global_seq = exproto->peer_global_seq;  // so we can send it below..
+      existing->lock.unlock();
+      return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_GLOBAL, reply,
+                                        authorizer_reply);
+    } else {
+      ldout(cct, 10) << __func__ << " accept existing " << existing << ".gseq "
+                     << exproto->peer_global_seq
+                     << " <= " << connect_msg.global_seq << ", looks ok"
+                     << dendl;
+    }
+
+    if (existing->policy.lossy) {
+      ldout(cct, 0)
+          << __func__
+          << " accept replacing existing (lossy) channel (new one lossy="
+          << connection->policy.lossy << ")" << dendl;
+      exproto->session_reset();
+      return replace(existing, reply, authorizer_reply);
+    }
+
+    ldout(cct, 1) << __func__ << " accept connect_seq "
+                  << connect_msg.connect_seq
+                  << " vs existing csq=" << exproto->connect_seq
+                  << " existing_state="
+                  << connection->get_state_name(existing->state) << dendl;
+
+    if (connect_msg.connect_seq == 0 && exproto->connect_seq > 0) {
+      ldout(cct, 0)
+          << __func__
+          << " accept peer reset, then tried to connect to us, replacing"
+          << dendl;
+      // this is a hard reset from peer
+      is_reset_from_peer = true;
+      if (connection->policy.resetcheck) {
+        exproto->session_reset();  // this resets out_queue, msg_ and
+                                   // connect_seq #'s
+      }
+      return replace(existing, reply, authorizer_reply);
+    }
+
+    if (connect_msg.connect_seq < exproto->connect_seq) {
+      // old attempt, or we sent READY but they didn't get it.
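+      // Illustrative numbers for the reply below: if the peer sent
+      // connect_seq=2 while our existing session is at exproto->connect_seq=5,
+      // we answer RETRY_SESSION with reply.connect_seq=6 and expect the peer
+      // to retry its connect_msg carrying that value.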
+      ldout(cct, 10) << __func__ << " accept existing " << existing << ".cseq "
+                     << exproto->connect_seq << " > " << connect_msg.connect_seq
+                     << ", RETRY_SESSION" << dendl;
+      reply.connect_seq = exproto->connect_seq + 1;
+      existing->lock.unlock();
+      return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_SESSION, reply,
+                                        authorizer_reply);
+    }
+
+    if (connect_msg.connect_seq == exproto->connect_seq) {
+      // if the existing connection successfully opened, and/or
+      // subsequently went to standby, then the peer should bump
+      // their connect_seq and retry: this is not a connection race
+      // we need to resolve here.
+      if (exproto->state == OPENED || exproto->state == STANDBY) {
+        ldout(cct, 10) << __func__ << " accept connection race, existing "
+                       << existing << ".cseq " << exproto->connect_seq
+                       << " == " << connect_msg.connect_seq
+                       << ", OPEN|STANDBY, RETRY_SESSION " << dendl;
+        // if both connect_seqs are zero, don't get stuck in a deadlock:
+        // it's ok to replace
+        if (connection->policy.resetcheck && exproto->connect_seq == 0) {
+          return replace(existing, reply, authorizer_reply);
+        }
+
+        reply.connect_seq = exproto->connect_seq + 1;
+        existing->lock.unlock();
+        return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_SESSION, reply,
+                                          authorizer_reply);
+      }
+
+      // connection race?
+      if (connection->peer_addrs->legacy_addr() < messenger->get_myaddr_legacy() ||
+          existing->policy.server) {
+        // incoming wins
+        ldout(cct, 10) << __func__ << " accept connection race, existing "
+                       << existing << ".cseq " << exproto->connect_seq
+                       << " == " << connect_msg.connect_seq
+                       << ", or we are server, replacing my attempt" << dendl;
+        return replace(existing, reply, authorizer_reply);
+      } else {
+        // our existing outgoing wins
+        ldout(messenger->cct, 10)
+            << __func__ << " accept connection race, existing " << existing
+            << ".cseq " << exproto->connect_seq
+            << " == " << connect_msg.connect_seq << ", sending WAIT" << dendl;
+        ceph_assert(connection->peer_addrs->legacy_addr() >
+                    messenger->get_myaddr_legacy());
+        existing->lock.unlock();
+        // make sure we follow through with opening the existing
+        // connection (if it isn't yet open) since we know the peer
+        // has something to send to us.
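+        // Tie-break recap: with equal connect_seqs, if the peer's legacy
+        // address is lower than ours (or our policy marks us a server) the
+        // incoming attempt wins and replaces our outgoing connect; otherwise
+        // our outgoing attempt wins and the peer is told to WAIT.  The
+        // keepalive below makes sure the surviving connection really opens.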
+ existing->send_keepalive(); + return send_connect_message_reply(CEPH_MSGR_TAG_WAIT, reply, + authorizer_reply); + } + } + + ceph_assert(connect_msg.connect_seq > exproto->connect_seq); + ceph_assert(connect_msg.global_seq >= exproto->peer_global_seq); + if (connection->policy.resetcheck && // RESETSESSION only used by servers; + // peers do not reset each other + exproto->connect_seq == 0) { + ldout(cct, 0) << __func__ << " accept we reset (peer sent cseq " + << connect_msg.connect_seq << ", " << existing + << ".cseq = " << exproto->connect_seq + << "), sending RESETSESSION " << dendl; + existing->lock.unlock(); + return send_connect_message_reply(CEPH_MSGR_TAG_RESETSESSION, reply, + authorizer_reply); + } + + // reconnect + ldout(cct, 10) << __func__ << " accept peer sent cseq " + << connect_msg.connect_seq << " > " << exproto->connect_seq + << dendl; + return replace(existing, reply, authorizer_reply); + } // existing + else if (!replacing && connect_msg.connect_seq > 0) { + // we reset, and they are opening a new session + ldout(cct, 0) << __func__ << " accept we reset (peer sent cseq " + << connect_msg.connect_seq << "), sending RESETSESSION" + << dendl; + return send_connect_message_reply(CEPH_MSGR_TAG_RESETSESSION, reply, + authorizer_reply); + } else { + // new session + ldout(cct, 10) << __func__ << " accept new session" << dendl; + existing = nullptr; + return open(reply, authorizer_reply); + } +} + +CtPtr ProtocolV1::send_connect_message_reply(char tag, + ceph_msg_connect_reply &reply, + bufferlist &authorizer_reply) { + ldout(cct, 20) << __func__ << dendl; + bufferlist reply_bl; + reply.tag = tag; + reply.features = + ((uint64_t)connect_msg.features & connection->policy.features_supported) | + connection->policy.features_required; + reply.authorizer_len = authorizer_reply.length(); + reply_bl.append((char *)&reply, sizeof(reply)); + + ldout(cct, 10) << __func__ << " reply features 0x" << std::hex + << reply.features << " = (policy sup 0x" + << connection->policy.features_supported + << " & connect 0x" << (uint64_t)connect_msg.features + << ") | policy req 0x" + << connection->policy.features_required + << dendl; + + if (reply.authorizer_len) { + reply_bl.append(authorizer_reply.c_str(), authorizer_reply.length()); + authorizer_reply.clear(); + } + + return WRITE(reply_bl, handle_connect_message_reply_write); +} + +CtPtr ProtocolV1::handle_connect_message_reply_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << " write connect message reply failed" << dendl; + connection->inject_delay(); + return _fault(); + } + + return CONTINUE(wait_connect_message); +} + +CtPtr ProtocolV1::replace(AsyncConnectionRef existing, + ceph_msg_connect_reply &reply, + bufferlist &authorizer_reply) { + ldout(cct, 10) << __func__ << " accept replacing " << existing << dendl; + + connection->inject_delay(); + if (existing->policy.lossy) { + // disconnect from the Connection + ldout(cct, 1) << __func__ << " replacing on lossy channel, failing existing" + << dendl; + existing->protocol->stop(); + existing->dispatch_queue->queue_reset(existing.get()); + } else { + ceph_assert(can_write == WriteStatus::NOWRITE); + existing->write_lock.lock(); + + ProtocolV1 *exproto = dynamic_cast<ProtocolV1 *>(existing->protocol.get()); + + // reset the in_seq if this is a hard reset from peer, + // otherwise we respect our original connection's value + if (is_reset_from_peer) { + exproto->is_reset_from_peer = true; + } + + 
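+  // How the handover below proceeds, in brief: this accept's socket is
+  // first detached from its event center so no callback can fire on a
+  // half-moved socket; `deactivate_existing` then re-homes the
+  // ConnectedSocket, Worker and EventCenter onto `existing` under
+  // existing->lock, shutting down the socket it replaces.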
connection->center->delete_file_event(connection->cs.fd(),
+                                        EVENT_READABLE | EVENT_WRITABLE);
+
+  if (existing->delay_state) {
+    existing->delay_state->flush();
+    ceph_assert(!connection->delay_state);
+  }
+  exproto->reset_recv_state();
+
+  exproto->connect_msg.features = connect_msg.features;
+
+  auto temp_cs = std::move(connection->cs);
+  EventCenter *new_center = connection->center;
+  Worker *new_worker = connection->worker;
+  // avoid _stop() shutting down the socket we are handing over;
+  // queue a reset on the new connection, which we're dumping for the old
+  stop();
+
+  connection->dispatch_queue->queue_reset(connection);
+  ldout(messenger->cct, 1)
+      << __func__ << " stop myself to swap existing" << dendl;
+  exproto->can_write = WriteStatus::REPLACING;
+  exproto->replacing = true;
+  exproto->write_in_progress = false;
+  existing->state_offset = 0;
+  // prevent the previous thread from modifying events
+  exproto->state = NONE;
+  existing->state = AsyncConnection::STATE_NONE;
+  // Discard existing prefetch buffer in `recv_buf`
+  existing->recv_start = existing->recv_end = 0;
+  // there shouldn't exist any buffer
+  ceph_assert(connection->recv_start == connection->recv_end);
+
+  exproto->authorizer_challenge.reset();
+
+  auto deactivate_existing = std::bind(
+      [existing, new_worker, new_center, exproto, reply,
+       authorizer_reply](ConnectedSocket &cs) mutable {
+        // we need to delete the time event in the original thread
+        {
+          std::lock_guard<std::mutex> l(existing->lock);
+          existing->write_lock.lock();
+          exproto->requeue_sent();
+          existing->outgoing_bl.clear();
+          existing->open_write = false;
+          existing->write_lock.unlock();
+          if (exproto->state == NONE) {
+            existing->shutdown_socket();
+            existing->cs = std::move(cs);
+            existing->worker->references--;
+            new_worker->references++;
+            existing->logger = new_worker->get_perf_counter();
+            existing->worker = new_worker;
+            existing->center = new_center;
+            if (existing->delay_state)
+              existing->delay_state->set_center(new_center);
+          } else if (exproto->state == CLOSED) {
+            auto back_to_close =
+                std::bind([](ConnectedSocket &cs) mutable { cs.close(); },
+                          std::move(cs));
+            new_center->submit_to(new_center->get_id(),
+                                  std::move(back_to_close), true);
+            return;
+          } else {
+            ceph_abort();
+          }
+        }
+
+        // Before changing existing->center, there may already be events
+        // in existing->center's queue. Then if we mark down `existing`,
+        // it will execute in another thread and clean up the
+        // connection.
Previous event will result in segment fault + auto transfer_existing = [existing, exproto, reply, + authorizer_reply]() mutable { + std::lock_guard<std::mutex> l(existing->lock); + if (exproto->state == CLOSED) return; + ceph_assert(exproto->state == NONE); + + // we have called shutdown_socket above + ceph_assert(existing->last_tick_id == 0); + // restart timer since we are going to re-build connection + existing->last_connect_started = ceph::coarse_mono_clock::now(); + existing->last_tick_id = existing->center->create_time_event( + existing->connect_timeout_us, existing->tick_handler); + existing->state = AsyncConnection::STATE_CONNECTION_ESTABLISHED; + exproto->state = ACCEPTING; + + existing->center->create_file_event( + existing->cs.fd(), EVENT_READABLE, existing->read_handler); + reply.global_seq = exproto->peer_global_seq; + exproto->run_continuation(exproto->send_connect_message_reply( + CEPH_MSGR_TAG_RETRY_GLOBAL, reply, authorizer_reply)); + }; + if (existing->center->in_thread()) + transfer_existing(); + else + existing->center->submit_to(existing->center->get_id(), + std::move(transfer_existing), true); + }, + std::move(temp_cs)); + + existing->center->submit_to(existing->center->get_id(), + std::move(deactivate_existing), true); + existing->write_lock.unlock(); + existing->lock.unlock(); + return nullptr; + } + existing->lock.unlock(); + + return open(reply, authorizer_reply); +} + +CtPtr ProtocolV1::open(ceph_msg_connect_reply &reply, + bufferlist &authorizer_reply) { + ldout(cct, 20) << __func__ << dendl; + + connect_seq = connect_msg.connect_seq + 1; + peer_global_seq = connect_msg.global_seq; + ldout(cct, 10) << __func__ << " accept success, connect_seq = " << connect_seq + << " in_seq=" << in_seq << ", sending READY" << dendl; + + // if it is a hard reset from peer, we don't need a round-trip to negotiate + // in/out sequence + if ((connect_msg.features & CEPH_FEATURE_RECONNECT_SEQ) && + !is_reset_from_peer) { + reply.tag = CEPH_MSGR_TAG_SEQ; + wait_for_seq = true; + } else { + reply.tag = CEPH_MSGR_TAG_READY; + wait_for_seq = false; + out_seq = discard_requeued_up_to(out_seq, 0); + is_reset_from_peer = false; + in_seq = 0; + } + + // send READY reply + reply.features = connection->policy.features_supported; + reply.global_seq = messenger->get_global_seq(); + reply.connect_seq = connect_seq; + reply.flags = 0; + reply.authorizer_len = authorizer_reply.length(); + if (connection->policy.lossy) { + reply.flags = reply.flags | CEPH_MSG_CONNECT_LOSSY; + } + + connection->set_features((uint64_t)reply.features & + (uint64_t)connect_msg.features); + ldout(cct, 10) << __func__ << " accept features " + << connection->get_features() + << " authorizer_protocol " + << connect_msg.authorizer_protocol << dendl; + + session_security.reset( + get_auth_session_handler(cct, connect_msg.authorizer_protocol, + session_key, + connection->get_features())); + + bufferlist reply_bl; + reply_bl.append((char *)&reply, sizeof(reply)); + + if (reply.authorizer_len) { + reply_bl.append(authorizer_reply.c_str(), authorizer_reply.length()); + } + + if (reply.tag == CEPH_MSGR_TAG_SEQ) { + uint64_t s = in_seq; + reply_bl.append((char *)&s, sizeof(s)); + } + + connection->lock.unlock(); + // Because "replacing" will prevent other connections preempt this addr, + // it's safe that here we don't acquire Connection's lock + ssize_t r = messenger->accept_conn(connection); + + connection->inject_delay(); + + connection->lock.lock(); + replacing = false; + if (r < 0) { + ldout(cct, 1) << __func__ << " existing 
race replacing process for addr = " + << connection->peer_addrs->legacy_addr() + << " just fail later one(this)" << dendl; + ldout(cct, 10) << "accept fault after register" << dendl; + connection->inject_delay(); + return _fault(); + } + if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) { + ldout(cct, 1) << __func__ + << " state changed while accept_conn, it must be mark_down" + << dendl; + ceph_assert(state == CLOSED || state == NONE); + ldout(cct, 10) << "accept fault after register" << dendl; + messenger->unregister_conn(connection); + connection->inject_delay(); + return _fault(); + } + + return WRITE(reply_bl, handle_ready_connect_message_reply_write); +} + +CtPtr ProtocolV1::handle_ready_connect_message_reply_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " write ready connect message reply failed" + << dendl; + return _fault(); + } + + // notify + connection->dispatch_queue->queue_accept(connection); + messenger->ms_deliver_handle_fast_accept(connection); + once_ready = true; + + state = ACCEPTING_HANDLED_CONNECT_MSG; + + if (wait_for_seq) { + return wait_seq(); + } + + return server_ready(); +} + +CtPtr ProtocolV1::wait_seq() { + ldout(cct, 20) << __func__ << dendl; + + return READ(sizeof(uint64_t), handle_seq); +} + +CtPtr ProtocolV1::handle_seq(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read ack seq failed" << dendl; + return _fault(); + } + + uint64_t newly_acked_seq = *(uint64_t *)buffer; + ldout(cct, 2) << __func__ << " accept get newly_acked_seq " << newly_acked_seq + << dendl; + out_seq = discard_requeued_up_to(out_seq, newly_acked_seq); + + return server_ready(); +} + +CtPtr ProtocolV1::server_ready() { + ldout(cct, 20) << __func__ << " session_security is " + << session_security + << dendl; + + ldout(cct, 20) << __func__ << " accept done" << dendl; + // FIPS zeroization audit 20191115: this memset is not security related. 
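+  // server_ready wrap-up: clearing connect_msg below (see the FIPS note
+  // above) keeps stale handshake bytes from being reused by a later accept
+  // on this Protocol instance; ready() then drops both the client and the
+  // server paths into the shared wait_message loop (see the state diagram
+  // in ProtocolV1.h).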
+ memset(&connect_msg, 0, sizeof(connect_msg)); + + if (connection->delay_state) { + ceph_assert(connection->delay_state->ready()); + } + + return ready(); +} diff --git a/src/msg/async/ProtocolV1.h b/src/msg/async/ProtocolV1.h new file mode 100644 index 00000000..070ce73f --- /dev/null +++ b/src/msg/async/ProtocolV1.h @@ -0,0 +1,305 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef _MSG_ASYNC_PROTOCOL_V1_ +#define _MSG_ASYNC_PROTOCOL_V1_ + +#include "Protocol.h" + +class ProtocolV1; +using CtPtr = Ct<ProtocolV1>*; + +class ProtocolV1 : public Protocol { +/* + * ProtocolV1 State Machine + * + + send_server_banner send_client_banner + | | + v v + wait_client_banner wait_server_banner + | | + | v + v handle_server_banner_and_identify + wait_connect_message <---------\ | + | | | v + | wait_connect_message_auth | send_connect_message <----------\ + | | | | | + v v | | | +handle_connect_message_2 | v | + | | | wait_connect_reply | + v v | | | | + replace -> send_connect_message_reply | V | + | | wait_connect_reply_auth | + | | | | + v v v | + open ---\ handle_connect_reply_2 --------/ + | | | + | v v + | wait_seq wait_ack_seq + | | | + v v v + server_ready client_ready + | | + \------------------> wait_message <------------/ + | ^ | ^ + /------------------------/ | | | + | | | \----------------- ------------\ + v /----------/ v | +handle_keepalive2 | handle_message_header read_message_footer +handle_keepalive2_ack | | ^ +handle_tag_ack | v | + | | throttle_message read_message_data + \----------------/ | ^ + v | + read_message_front --> read_message_middle --/ +*/ + +protected: + + enum State { + NONE = 0, + START_CONNECT, + CONNECTING, + CONNECTING_WAIT_BANNER_AND_IDENTIFY, + CONNECTING_SEND_CONNECT_MSG, + START_ACCEPT, + ACCEPTING, + ACCEPTING_WAIT_CONNECT_MSG_AUTH, + ACCEPTING_HANDLED_CONNECT_MSG, + OPENED, + THROTTLE_MESSAGE, + THROTTLE_BYTES, + THROTTLE_DISPATCH_QUEUE, + READ_MESSAGE_FRONT, + READ_FOOTER_AND_DISPATCH, + CLOSED, + WAIT, + STANDBY + }; + + static const char *get_state_name(int state) { + const char *const statenames[] = {"NONE", + "START_CONNECT", + "CONNECTING", + "CONNECTING_WAIT_BANNER_AND_IDENTIFY", + "CONNECTING_SEND_CONNECT_MSG", + "START_ACCEPT", + "ACCEPTING", + "ACCEPTING_WAIT_CONNECT_MSG_AUTH", + "ACCEPTING_HANDLED_CONNECT_MSG", + "OPENED", + "THROTTLE_MESSAGE", + "THROTTLE_BYTES", + "THROTTLE_DISPATCH_QUEUE", + "READ_MESSAGE_FRONT", + "READ_FOOTER_AND_DISPATCH", + "CLOSED", + "WAIT", + "STANDBY"}; + return statenames[state]; + } + + char *temp_buffer; + + enum class WriteStatus { NOWRITE, REPLACING, CANWRITE, CLOSED }; + std::atomic<WriteStatus> can_write; + std::list<Message *> sent; // the first bufferlist need to inject seq + // priority queue for outbound msgs + std::map<int, std::list<std::pair<bufferlist, Message *>>> out_q; + bool keepalive; + bool write_in_progress = false; + + __u32 connect_seq, peer_global_seq; + std::atomic<uint64_t> in_seq{0}; + std::atomic<uint64_t> out_seq{0}; + std::atomic<uint64_t> ack_left{0}; + + CryptoKey session_key; + std::shared_ptr<AuthSessionHandler> session_security; + std::unique_ptr<AuthAuthorizerChallenge> authorizer_challenge; // accept side + + // Open state + ceph_msg_connect connect_msg; + ceph_msg_connect_reply connect_reply; + bufferlist authorizer_buf; + + utime_t backoff; // backoff time + utime_t recv_stamp; + utime_t throttle_stamp; + unsigned msg_left; + uint64_t cur_msg_size; + ceph_msg_header current_header; + bufferlist data_buf; + 
bufferlist::iterator data_blp;
+  bufferlist front, middle, data;
+
+  bool replacing;    // when a replacing process has happened, we reply to the
+                     // connect side with a RETRY tag and the accept side
+                     // clears the replaced connection. So when the connect
+                     // side reissues its connect_msg, no conflicting
+                     // connection exists; we use "replacing" to skip
+                     // RESETSESSION and avoid detecting a false reset
+  bool is_reset_from_peer;
+  bool once_ready;
+
+  State state;
+
+  void run_continuation(CtPtr pcontinuation);
+  CtPtr read(CONTINUATION_RX_TYPE<ProtocolV1> &next, int len,
+             char *buffer = nullptr);
+  CtPtr write(CONTINUATION_TX_TYPE<ProtocolV1> &next, bufferlist &bl);
+  inline CtPtr _fault() {  // helper fault method that stops continuation
+    fault();
+    return nullptr;
+  }
+
+  CONTINUATION_DECL(ProtocolV1, wait_message);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_keepalive2);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_keepalive2_ack);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_tag_ack);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_header);
+  CONTINUATION_DECL(ProtocolV1, throttle_message);
+  CONTINUATION_DECL(ProtocolV1, throttle_bytes);
+  CONTINUATION_DECL(ProtocolV1, throttle_dispatch_queue);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_front);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_middle);
+  CONTINUATION_DECL(ProtocolV1, read_message_data);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_data);
+  READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_footer);
+
+  CtPtr ready();
+  CtPtr wait_message();
+  CtPtr handle_message(char *buffer, int r);
+
+  CtPtr handle_keepalive2(char *buffer, int r);
+  void append_keepalive_or_ack(bool ack = false, utime_t *t = nullptr);
+  CtPtr handle_keepalive2_ack(char *buffer, int r);
+  CtPtr handle_tag_ack(char *buffer, int r);
+
+  CtPtr handle_message_header(char *buffer, int r);
+  CtPtr throttle_message();
+  CtPtr throttle_bytes();
+  CtPtr throttle_dispatch_queue();
+  CtPtr read_message_front();
+  CtPtr handle_message_front(char *buffer, int r);
+  CtPtr read_message_middle();
+  CtPtr handle_message_middle(char *buffer, int r);
+  CtPtr read_message_data_prepare();
+  CtPtr read_message_data();
+  CtPtr handle_message_data(char *buffer, int r);
+  CtPtr read_message_footer();
+  CtPtr handle_message_footer(char *buffer, int r);
+
+  void session_reset();
+  void randomize_out_seq();
+
+  Message *_get_next_outgoing(bufferlist *bl);
+
+  void prepare_send_message(uint64_t features, Message *m, bufferlist &bl);
+  ssize_t write_message(Message *m, bufferlist &bl, bool more);
+
+  void requeue_sent();
+  uint64_t discard_requeued_up_to(uint64_t out_seq, uint64_t seq);
+  void discard_out_queue();
+
+  void reset_recv_state();
+  void reset_security();
+
+  ostream &_conn_prefix(std::ostream *_dout);
+
+public:
+  ProtocolV1(AsyncConnection *connection);
+  virtual ~ProtocolV1();
+
+  virtual void connect() override;
+  virtual void accept() override;
+  virtual bool is_connected() override;
+  virtual void stop() override;
+  virtual void fault() override;
+  virtual void send_message(Message *m) override;
+  virtual void send_keepalive() override;
+
+  virtual void read_event() override;
+  virtual void write_event() override;
+  virtual bool is_queued() override;
+
+  // Client Protocol
+private:
+  int global_seq;
+  AuthAuthorizer *authorizer;
+
+  CONTINUATION_DECL(ProtocolV1, send_client_banner);
+
WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_client_banner_write); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_server_banner_and_identify); + WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_my_addr_write); + CONTINUATION_DECL(ProtocolV1, send_connect_message); + WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_message_write); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_reply_1); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_reply_auth); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_ack_seq); + WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_in_seq_write); + + CtPtr send_client_banner(); + CtPtr handle_client_banner_write(int r); + CtPtr wait_server_banner(); + CtPtr handle_server_banner_and_identify(char *buffer, int r); + CtPtr handle_my_addr_write(int r); + CtPtr send_connect_message(); + CtPtr handle_connect_message_write(int r); + CtPtr wait_connect_reply(); + CtPtr handle_connect_reply_1(char *buffer, int r); + CtPtr wait_connect_reply_auth(); + CtPtr handle_connect_reply_auth(char *buffer, int r); + CtPtr handle_connect_reply_2(); + CtPtr wait_ack_seq(); + CtPtr handle_ack_seq(char *buffer, int r); + CtPtr handle_in_seq_write(int r); + CtPtr client_ready(); + + // Server Protocol +protected: + bool wait_for_seq; + + CONTINUATION_DECL(ProtocolV1, send_server_banner); + WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_server_banner_write); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_client_banner); + CONTINUATION_DECL(ProtocolV1, wait_connect_message); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_message_1); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_message_auth); + WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, + handle_connect_message_reply_write); + WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, + handle_ready_connect_message_reply_write); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_seq); + + CtPtr send_server_banner(); + CtPtr handle_server_banner_write(int r); + CtPtr wait_client_banner(); + CtPtr handle_client_banner(char *buffer, int r); + CtPtr wait_connect_message(); + CtPtr handle_connect_message_1(char *buffer, int r); + CtPtr wait_connect_message_auth(); + CtPtr handle_connect_message_auth(char *buffer, int r); + CtPtr handle_connect_message_2(); + CtPtr send_connect_message_reply(char tag, ceph_msg_connect_reply &reply, + bufferlist &authorizer_reply); + CtPtr handle_connect_message_reply_write(int r); + CtPtr replace(AsyncConnectionRef existing, ceph_msg_connect_reply &reply, + bufferlist &authorizer_reply); + CtPtr open(ceph_msg_connect_reply &reply, bufferlist &authorizer_reply); + CtPtr handle_ready_connect_message_reply_write(int r); + CtPtr wait_seq(); + CtPtr handle_seq(char *buffer, int r); + CtPtr server_ready(); +}; + +class LoopbackProtocolV1 : public ProtocolV1 { +public: + LoopbackProtocolV1(AsyncConnection *connection) : ProtocolV1(connection) { + this->can_write = WriteStatus::CANWRITE; + } +}; + +#endif /* _MSG_ASYNC_PROTOCOL_V1_ */ diff --git a/src/msg/async/ProtocolV2.cc b/src/msg/async/ProtocolV2.cc new file mode 100644 index 00000000..381d42c3 --- /dev/null +++ b/src/msg/async/ProtocolV2.cc @@ -0,0 +1,2870 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <type_traits> + +#include "ProtocolV2.h" +#include "AsyncMessenger.h" + +#include "common/EventTrace.h" +#include "common/ceph_crypto.h" +#include "common/errno.h" +#include "include/random.h" +#include "auth/AuthClient.h" 
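+// AuthClient.h / AuthServer.h (the adjacent includes) supply the msgr2
+// authentication hooks: ProtocolV2 drives an auth exchange whose outcome
+// lands in auth_meta (the negotiated connection mode) and, for secure mode,
+// in session_stream_handlers (the rx/tx crypto handlers printed by
+// _conn_prefix below).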
+#include "auth/AuthServer.h" + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix _conn_prefix(_dout) +ostream &ProtocolV2::_conn_prefix(std::ostream *_dout) { + return *_dout << "--2- " << messenger->get_myaddrs() << " >> " + << *connection->peer_addrs << " conn(" << connection << " " + << this + << " " << ceph_con_mode_name(auth_meta->con_mode) + << " :" << connection->port + << " s=" << get_state_name(state) << " pgs=" << peer_global_seq + << " cs=" << connect_seq << " l=" << connection->policy.lossy + << " rev1=" << HAVE_MSGR2_FEATURE(peer_supported_features, + REVISION_1) + << " rx=" << session_stream_handlers.rx.get() + << " tx=" << session_stream_handlers.tx.get() + << ")."; +} + +using namespace ceph::msgr::v2; + +using CtPtr = Ct<ProtocolV2> *; +using CtRef = Ct<ProtocolV2> &; + +void ProtocolV2::run_continuation(CtPtr pcontinuation) { + if (pcontinuation) { + run_continuation(*pcontinuation); + } +} + +void ProtocolV2::run_continuation(CtRef continuation) { + try { + CONTINUATION_RUN(continuation) + } catch (const buffer::error &e) { + lderr(cct) << __func__ << " failed decoding of frame header: " << e + << dendl; + _fault(); + } catch (const ceph::crypto::onwire::MsgAuthError &e) { + lderr(cct) << __func__ << " " << e.what() << dendl; + _fault(); + } catch (const DecryptionError &) { + lderr(cct) << __func__ << " failed to decrypt frame payload" << dendl; + } +} + +#define WRITE(B, D, C) write(D, CONTINUATION(C), B) + +#define READ(L, C) read(CONTINUATION(C), buffer::ptr_node::create(buffer::create(L))) + +#define READ_RXBUF(B, C) read(CONTINUATION(C), B) + +#ifdef UNIT_TESTS_BUILT + +#define INTERCEPT(S) { \ +if(connection->interceptor) { \ + auto a = connection->interceptor->intercept(connection, (S)); \ + if (a == Interceptor::ACTION::FAIL) { \ + return _fault(); \ + } else if (a == Interceptor::ACTION::STOP) { \ + stop(); \ + connection->dispatch_queue->queue_reset(connection); \ + return nullptr; \ + }}} + +#else +#define INTERCEPT(S) +#endif + +ProtocolV2::ProtocolV2(AsyncConnection *connection) + : Protocol(2, connection), + state(NONE), + peer_supported_features(0), + client_cookie(0), + server_cookie(0), + global_seq(0), + connect_seq(0), + peer_global_seq(0), + message_seq(0), + reconnecting(false), + replacing(false), + can_write(false), + bannerExchangeCallback(nullptr), + tx_frame_asm(&session_stream_handlers, false), + rx_frame_asm(&session_stream_handlers, false), + next_tag(static_cast<Tag>(0)), + keepalive(false) { +} + +ProtocolV2::~ProtocolV2() { +} + +void ProtocolV2::connect() { + ldout(cct, 1) << __func__ << dendl; + state = START_CONNECT; + pre_auth.enabled = true; +} + +void ProtocolV2::accept() { + ldout(cct, 1) << __func__ << dendl; + state = START_ACCEPT; +} + +bool ProtocolV2::is_connected() { return can_write; } + +/* + * Tears down the message queues, and removes them from the + * DispatchQueue Must hold write_lock prior to calling. 
+ */
+void ProtocolV2::discard_out_queue() {
+  ldout(cct, 10) << __func__ << " started" << dendl;
+
+  for (list<Message *>::iterator p = sent.begin(); p != sent.end(); ++p) {
+    ldout(cct, 20) << __func__ << " discard " << *p << dendl;
+    (*p)->put();
+  }
+  sent.clear();
+  for (auto& [ prio, entries ] : out_queue) {
+    static_cast<void>(prio);
+    for (auto& entry : entries) {
+      ldout(cct, 20) << __func__ << " discard " << *entry.m << dendl;
+      entry.m->put();
+    }
+  }
+  out_queue.clear();
+  write_in_progress = false;
+}
+
+void ProtocolV2::reset_session() {
+  ldout(cct, 1) << __func__ << dendl;
+
+  std::lock_guard<std::mutex> l(connection->write_lock);
+  if (connection->delay_state) {
+    connection->delay_state->discard();
+  }
+
+  connection->dispatch_queue->discard_queue(connection->conn_id);
+  discard_out_queue();
+  connection->outgoing_bl.clear();
+
+  connection->dispatch_queue->queue_remote_reset(connection);
+
+  out_seq = 0;
+  in_seq = 0;
+  client_cookie = 0;
+  server_cookie = 0;
+  connect_seq = 0;
+  peer_global_seq = 0;
+  message_seq = 0;
+  ack_left = 0;
+  can_write = false;
+}
+
+void ProtocolV2::stop() {
+  ldout(cct, 1) << __func__ << dendl;
+  if (state == CLOSED) {
+    return;
+  }
+
+  if (connection->delay_state) connection->delay_state->flush();
+
+  std::lock_guard<std::mutex> l(connection->write_lock);
+
+  reset_recv_state();
+  discard_out_queue();
+
+  connection->_stop();
+
+  can_write = false;
+  state = CLOSED;
+}
+
+void ProtocolV2::fault() { _fault(); }
+
+void ProtocolV2::requeue_sent() {
+  write_in_progress = false;
+  if (sent.empty()) {
+    return;
+  }
+
+  auto& rq = out_queue[CEPH_MSG_PRIO_HIGHEST];
+  out_seq -= sent.size();
+  while (!sent.empty()) {
+    Message *m = sent.back();
+    sent.pop_back();
+    ldout(cct, 5) << __func__ << " requeueing message m=" << m
+                  << " seq=" << m->get_seq() << " type=" << m->get_type() << " "
+                  << *m << dendl;
+    rq.emplace_front(out_queue_entry_t{false, m});
+  }
+}
+
+uint64_t ProtocolV2::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) {
+  ldout(cct, 10) << __func__ << " " << seq << dendl;
+  std::lock_guard<std::mutex> l(connection->write_lock);
+  if (out_queue.count(CEPH_MSG_PRIO_HIGHEST) == 0) {
+    return seq;
+  }
+  auto& rq = out_queue[CEPH_MSG_PRIO_HIGHEST];
+  uint64_t count = out_seq;
+  while (!rq.empty()) {
+    Message* const m = rq.front().m;
+    if (m->get_seq() == 0 || m->get_seq() > seq) break;
+    ldout(cct, 5) << __func__ << " discarding message m=" << m
+                  << " seq=" << m->get_seq() << " ack_seq=" << seq << " "
+                  << *m << dendl;
+    m->put();
+    rq.pop_front();
+    count++;
+  }
+  if (rq.empty()) out_queue.erase(CEPH_MSG_PRIO_HIGHEST);
+  return count;
+}
+
+void ProtocolV2::reset_security() {
+  ldout(cct, 5) << __func__ << dendl;
+
+  auth_meta.reset(new AuthConnectionMeta);
+  session_stream_handlers.rx.reset(nullptr);
+  session_stream_handlers.tx.reset(nullptr);
+  pre_auth.rxbuf.clear();
+  pre_auth.txbuf.clear();
+}
+
+// it's expected the `write_lock` is held while calling this method.
+void ProtocolV2::reset_recv_state() {
+  ldout(cct, 5) << __func__ << dendl;
+
+  if (!connection->center->in_thread()) {
+    // execute in the same thread that uses the rx/tx handlers. We need
+    // to do the wrap because holding `write_lock` is not enough as
+    // `write_event()` unlocks it just before calling `write_message()`.
+    // `submit_to()` here is NOT blocking.
+    connection->center->submit_to(connection->center->get_id(), [this] {
+      ldout(cct, 5) << "reset_recv_state (wrapped) resetting crypto handlers"
+                    << dendl;
+      // Possibly unnecessary.
See the comment in `deactivate_existing`. + std::lock_guard<std::mutex> l(connection->lock); + std::lock_guard<std::mutex> wl(connection->write_lock); + reset_security(); + }, /* nowait = */true); + } else { + reset_security(); + } + + // clean read and write callbacks + connection->pendingReadLen.reset(); + connection->writeCallback.reset(); + + next_tag = static_cast<Tag>(0); + + reset_throttle(); +} + +size_t ProtocolV2::get_current_msg_size() const { + ceph_assert(rx_frame_asm.get_num_segments() > 0); + size_t sum = 0; + // we don't include SegmentIndex::Msg::HEADER. + for (size_t i = 1; i < rx_frame_asm.get_num_segments(); i++) { + sum += rx_frame_asm.get_segment_logical_len(i); + } + return sum; +} + +void ProtocolV2::reset_throttle() { + if (state > THROTTLE_MESSAGE && state <= THROTTLE_DONE && + connection->policy.throttler_messages) { + ldout(cct, 10) << __func__ << " releasing " << 1 + << " message to policy throttler " + << connection->policy.throttler_messages->get_current() + << "/" << connection->policy.throttler_messages->get_max() + << dendl; + connection->policy.throttler_messages->put(); + } + if (state > THROTTLE_BYTES && state <= THROTTLE_DONE) { + if (connection->policy.throttler_bytes) { + const size_t cur_msg_size = get_current_msg_size(); + ldout(cct, 10) << __func__ << " releasing " << cur_msg_size + << " bytes to policy throttler " + << connection->policy.throttler_bytes->get_current() << "/" + << connection->policy.throttler_bytes->get_max() << dendl; + connection->policy.throttler_bytes->put(cur_msg_size); + } + } + if (state > THROTTLE_DISPATCH_QUEUE && state <= THROTTLE_DONE) { + const size_t cur_msg_size = get_current_msg_size(); + ldout(cct, 10) + << __func__ << " releasing " << cur_msg_size + << " bytes to dispatch_queue throttler " + << connection->dispatch_queue->dispatch_throttler.get_current() << "/" + << connection->dispatch_queue->dispatch_throttler.get_max() << dendl; + connection->dispatch_queue->dispatch_throttle_release(cur_msg_size); + } +} + +CtPtr ProtocolV2::_fault() { + ldout(cct, 10) << __func__ << dendl; + + if (state == CLOSED || state == NONE) { + ldout(cct, 10) << __func__ << " connection is already closed" << dendl; + return nullptr; + } + + if (connection->policy.lossy && + !(state >= START_CONNECT && state <= SESSION_RECONNECTING)) { + ldout(cct, 2) << __func__ << " on lossy channel, failing" << dendl; + stop(); + connection->dispatch_queue->queue_reset(connection); + return nullptr; + } + + connection->write_lock.lock(); + + can_write = false; + // requeue sent items + requeue_sent(); + + if (out_queue.empty() && state >= START_ACCEPT && + state <= SESSION_ACCEPTING && !replacing) { + ldout(cct, 2) << __func__ << " with nothing to send and in the half " + << " accept state just closed" << dendl; + connection->write_lock.unlock(); + stop(); + connection->dispatch_queue->queue_reset(connection); + return nullptr; + } + + replacing = false; + connection->fault(); + reset_recv_state(); + + reconnecting = false; + + if (connection->policy.standby && out_queue.empty() && !keepalive && + state != WAIT) { + ldout(cct, 1) << __func__ << " with nothing to send, going to standby" + << dendl; + state = STANDBY; + connection->write_lock.unlock(); + return nullptr; + } + if (connection->policy.server) { + ldout(cct, 1) << __func__ << " server, going to standby, even though i have stuff queued" << dendl; + state = STANDBY; + connection->write_lock.unlock(); + return nullptr; + } + + connection->write_lock.unlock(); + + if (!(state >= START_CONNECT 
&& state <= SESSION_RECONNECTING) && + state != WAIT && + state != SESSION_ACCEPTING /* due to connection race */) { + // policy maybe empty when state is in accept + if (connection->policy.server) { + ldout(cct, 1) << __func__ << " server, going to standby" << dendl; + state = STANDBY; + } else { + ldout(cct, 1) << __func__ << " initiating reconnect" << dendl; + connect_seq++; + global_seq = messenger->get_global_seq(); + state = START_CONNECT; + pre_auth.enabled = true; + connection->state = AsyncConnection::STATE_CONNECTING; + } + backoff = utime_t(); + connection->center->dispatch_event_external(connection->read_handler); + } else { + if (state == WAIT) { + backoff.set_from_double(cct->_conf->ms_max_backoff); + } else if (backoff == utime_t()) { + backoff.set_from_double(cct->_conf->ms_initial_backoff); + } else { + backoff += backoff; + if (backoff > cct->_conf->ms_max_backoff) + backoff.set_from_double(cct->_conf->ms_max_backoff); + } + + if (server_cookie) { + connect_seq++; + } + + global_seq = messenger->get_global_seq(); + state = START_CONNECT; + pre_auth.enabled = true; + connection->state = AsyncConnection::STATE_CONNECTING; + ldout(cct, 1) << __func__ << " waiting " << backoff << dendl; + // woke up again; + connection->register_time_events.insert( + connection->center->create_time_event(backoff.to_nsec() / 1000, + connection->wakeup_handler)); + } + return nullptr; +} + +void ProtocolV2::prepare_send_message(uint64_t features, + Message *m) { + ldout(cct, 20) << __func__ << " m=" << *m << dendl; + + // associate message with Connection (for benefit of encode_payload) + if (m->empty_payload()) { + ldout(cct, 20) << __func__ << " encoding features " << features << " " << m + << " " << *m << dendl; + } else { + ldout(cct, 20) << __func__ << " half-reencoding features " << features + << " " << m << " " << *m << dendl; + } + + // encode and copy out of *m + m->encode(features, 0); +} + +void ProtocolV2::send_message(Message *m) { + uint64_t f = connection->get_features(); + + // TODO: Currently not all messages supports reencode like MOSDMap, so here + // only let fast dispatch support messages prepare message + const bool can_fast_prepare = messenger->ms_can_fast_dispatch(m); + if (can_fast_prepare) { + prepare_send_message(f, m); + } + + std::lock_guard<std::mutex> l(connection->write_lock); + bool is_prepared = can_fast_prepare; + // "features" changes will change the payload encoding + if (can_fast_prepare && (!can_write || connection->get_features() != f)) { + // ensure the correctness of message encoding + m->clear_payload(); + is_prepared = false; + ldout(cct, 10) << __func__ << " clear encoded buffer previous " << f + << " != " << connection->get_features() << dendl; + } + if (state == CLOSED) { + ldout(cct, 10) << __func__ << " connection closed." 
+ << " Drop message " << m << dendl; + m->put(); + } else { + ldout(cct, 5) << __func__ << " enqueueing message m=" << m + << " type=" << m->get_type() << " " << *m << dendl; + m->trace.event("async enqueueing message"); + out_queue[m->get_priority()].emplace_back( + out_queue_entry_t{is_prepared, m}); + ldout(cct, 15) << __func__ << " inline write is denied, reschedule m=" << m + << dendl; + if (((!replacing && can_write) || state == STANDBY) && !write_in_progress) { + write_in_progress = true; + connection->center->dispatch_event_external(connection->write_handler); + } + } +} + +void ProtocolV2::send_keepalive() { + ldout(cct, 10) << __func__ << dendl; + std::lock_guard<std::mutex> l(connection->write_lock); + if (state != CLOSED) { + keepalive = true; + connection->center->dispatch_event_external(connection->write_handler); + } +} + +void ProtocolV2::read_event() { + ldout(cct, 20) << __func__ << dendl; + + switch (state) { + case START_CONNECT: + run_continuation(CONTINUATION(start_client_banner_exchange)); + break; + case START_ACCEPT: + run_continuation(CONTINUATION(start_server_banner_exchange)); + break; + case READY: + run_continuation(CONTINUATION(read_frame)); + break; + case THROTTLE_MESSAGE: + run_continuation(CONTINUATION(throttle_message)); + break; + case THROTTLE_BYTES: + run_continuation(CONTINUATION(throttle_bytes)); + break; + case THROTTLE_DISPATCH_QUEUE: + run_continuation(CONTINUATION(throttle_dispatch_queue)); + break; + default: + break; + } +} + +ProtocolV2::out_queue_entry_t ProtocolV2::_get_next_outgoing() { + out_queue_entry_t out_entry; + + if (!out_queue.empty()) { + auto it = out_queue.rbegin(); + auto& entries = it->second; + ceph_assert(!entries.empty()); + out_entry = entries.front(); + entries.pop_front(); + if (entries.empty()) { + out_queue.erase(it->first); + } + } + return out_entry; +} + +ssize_t ProtocolV2::write_message(Message *m, bool more) { + FUNCTRACE(cct); + ceph_assert(connection->center->in_thread()); + m->set_seq(++out_seq); + + connection->lock.lock(); + uint64_t ack_seq = in_seq; + ack_left = 0; + connection->lock.unlock(); + + ceph_msg_header &header = m->get_header(); + ceph_msg_footer &footer = m->get_footer(); + + ceph_msg_header2 header2{header.seq, header.tid, + header.type, header.priority, + header.version, + init_le32(0), header.data_off, + init_le64(ack_seq), + footer.flags, header.compat_version, + header.reserved}; + + auto message = MessageFrame::Encode( + header2, + m->get_payload(), + m->get_middle(), + m->get_data()); + if (!append_frame(message)) { + m->put(); + return -EILSEQ; + } + + ldout(cct, 5) << __func__ << " sending message m=" << m + << " seq=" << m->get_seq() << " " << *m << dendl; + + m->trace.event("async writing message"); + ldout(cct, 20) << __func__ << " sending m=" << m << " seq=" << m->get_seq() + << " src=" << entity_name_t(messenger->get_myname()) + << " off=" << header2.data_off + << dendl; + ssize_t total_send_size = connection->outgoing_bl.length(); + ssize_t rc = connection->_try_send(more); + if (rc < 0) { + ldout(cct, 1) << __func__ << " error sending " << m << ", " + << cpp_strerror(rc) << dendl; + } else { + connection->logger->inc( + l_msgr_send_bytes, total_send_size - connection->outgoing_bl.length()); + ldout(cct, 10) << __func__ << " sending " << m + << (rc ? " continuely." 
: " done.") << dendl; + } + if (m->get_type() == CEPH_MSG_OSD_OP) + OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OP_END", false); + else if (m->get_type() == CEPH_MSG_OSD_OPREPLY) + OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OPREPLY_END", false); + m->put(); + + return rc; +} + +template <class F> +bool ProtocolV2::append_frame(F& frame) { + ceph::bufferlist bl; + try { + bl = frame.get_buffer(tx_frame_asm); + } catch (ceph::crypto::onwire::TxHandlerError &e) { + ldout(cct, 1) << __func__ << " " << e.what() << dendl; + return false; + } + + ldout(cct, 25) << __func__ << " assembled frame " << bl.length() + << " bytes " << tx_frame_asm << dendl; + connection->outgoing_bl.append(bl); + return true; +} + +void ProtocolV2::handle_message_ack(uint64_t seq) { + if (connection->policy.lossy) { // lossy connections don't keep sent messages + return; + } + + ldout(cct, 15) << __func__ << " seq=" << seq << dendl; + + // trim sent list + static const int max_pending = 128; + int i = 0; + Message *pending[max_pending]; + connection->write_lock.lock(); + while (!sent.empty() && sent.front()->get_seq() <= seq && i < max_pending) { + Message *m = sent.front(); + sent.pop_front(); + pending[i++] = m; + ldout(cct, 10) << __func__ << " got ack seq " << seq + << " >= " << m->get_seq() << " on " << m << " " << *m + << dendl; + } + connection->write_lock.unlock(); + for (int k = 0; k < i; k++) { + pending[k]->put(); + } +} + +void ProtocolV2::write_event() { + ldout(cct, 10) << __func__ << dendl; + ssize_t r = 0; + + connection->write_lock.lock(); + if (can_write) { + if (keepalive) { + ldout(cct, 10) << __func__ << " appending keepalive" << dendl; + auto keepalive_frame = KeepAliveFrame::Encode(); + if (!append_frame(keepalive_frame)) { + connection->write_lock.unlock(); + connection->lock.lock(); + fault(); + connection->lock.unlock(); + return; + } + keepalive = false; + } + + auto start = ceph::mono_clock::now(); + bool more; + do { + const auto out_entry = _get_next_outgoing(); + if (!out_entry.m) { + break; + } + + if (!connection->policy.lossy) { + // put on sent list + sent.push_back(out_entry.m); + out_entry.m->get(); + } + more = !out_queue.empty(); + connection->write_lock.unlock(); + + // send_message or requeue messages may not encode message + if (!out_entry.is_prepared) { + prepare_send_message(connection->get_features(), out_entry.m); + } + + r = write_message(out_entry.m, more); + + connection->write_lock.lock(); + if (r == 0) { + ; + } else if (r < 0) { + ldout(cct, 1) << __func__ << " send msg failed" << dendl; + break; + } else if (r > 0) + break; + } while (can_write); + write_in_progress = false; + + // if r > 0 mean data still lefted, so no need _try_send. 
+ if (r == 0) { + uint64_t left = ack_left; + if (left) { + ceph_le64 s; + s = in_seq; + ldout(cct, 10) << __func__ << " try send msg ack, acked " << left + << " messages" << dendl; + auto ack_frame = AckFrame::Encode(in_seq); + if (append_frame(ack_frame)) { + ack_left -= left; + left = ack_left; + r = connection->_try_send(left); + } else { + r = -EILSEQ; + } + } else if (is_queued()) { + r = connection->_try_send(); + } + } + connection->write_lock.unlock(); + + connection->logger->tinc(l_msgr_running_send_time, + ceph::mono_clock::now() - start); + if (r < 0) { + ldout(cct, 1) << __func__ << " send msg failed" << dendl; + connection->lock.lock(); + fault(); + connection->lock.unlock(); + return; + } + } else { + write_in_progress = false; + connection->write_lock.unlock(); + connection->lock.lock(); + connection->write_lock.lock(); + if (state == STANDBY && !connection->policy.server && is_queued()) { + ldout(cct, 10) << __func__ << " policy.server is false" << dendl; + if (server_cookie) { // only increment connect_seq if there is a session + connect_seq++; + } + connection->_connect(); + } else if (connection->cs && state != NONE && state != CLOSED && + state != START_CONNECT) { + r = connection->_try_send(); + if (r < 0) { + ldout(cct, 1) << __func__ << " send outcoming bl failed" << dendl; + connection->write_lock.unlock(); + fault(); + connection->lock.unlock(); + return; + } + } + connection->write_lock.unlock(); + connection->lock.unlock(); + } +} + +bool ProtocolV2::is_queued() { + return !out_queue.empty() || connection->is_queued(); +} + +CtPtr ProtocolV2::read(CONTINUATION_RXBPTR_TYPE<ProtocolV2> &next, + rx_buffer_t &&buffer) { + const auto len = buffer->length(); + const auto buf = buffer->c_str(); + next.node = std::move(buffer); + ssize_t r = connection->read(len, buf, + [&next, this](char *buffer, int r) { + if (unlikely(pre_auth.enabled) && r >= 0) { + pre_auth.rxbuf.append(*next.node); + ceph_assert(!cct->_conf->ms_die_on_bug || + pre_auth.rxbuf.length() < 20000000); + } + next.r = r; + run_continuation(next); + }); + if (r <= 0) { + // error or done synchronously + if (unlikely(pre_auth.enabled) && r >= 0) { + pre_auth.rxbuf.append(*next.node); + ceph_assert(!cct->_conf->ms_die_on_bug || + pre_auth.rxbuf.length() < 20000000); + } + next.r = r; + return &next; + } + + return nullptr; +} + +template <class F> +CtPtr ProtocolV2::write(const std::string &desc, + CONTINUATION_TYPE<ProtocolV2> &next, + F &frame) { + ceph::bufferlist bl; + try { + bl = frame.get_buffer(tx_frame_asm); + } catch (ceph::crypto::onwire::TxHandlerError &e) { + ldout(cct, 1) << __func__ << " " << e.what() << dendl; + return _fault(); + } + + ldout(cct, 25) << __func__ << " assembled frame " << bl.length() + << " bytes " << tx_frame_asm << dendl; + return write(desc, next, bl); +} + +CtPtr ProtocolV2::write(const std::string &desc, + CONTINUATION_TYPE<ProtocolV2> &next, + bufferlist &buffer) { + if (unlikely(pre_auth.enabled)) { + pre_auth.txbuf.append(buffer); + ceph_assert(!cct->_conf->ms_die_on_bug || + pre_auth.txbuf.length() < 20000000); + } + + ssize_t r = + connection->write(buffer, [&next, desc, this](int r) { + if (r < 0) { + ldout(cct, 1) << __func__ << " " << desc << " write failed r=" << r + << " (" << cpp_strerror(r) << ")" << dendl; + connection->inject_delay(); + _fault(); + } + run_continuation(next); + }); + + if (r < 0) { + ldout(cct, 1) << __func__ << " " << desc << " write failed r=" << r + << " (" << cpp_strerror(r) << ")" << dendl; + return _fault(); + } else if (r == 0) { + 
next.setParams(); + return &next; + } + + return nullptr; +} + +CtPtr ProtocolV2::_banner_exchange(CtRef callback) { + ldout(cct, 20) << __func__ << dendl; + bannerExchangeCallback = &callback; + + bufferlist banner_payload; + encode((uint64_t)CEPH_MSGR2_SUPPORTED_FEATURES, banner_payload, 0); + encode((uint64_t)CEPH_MSGR2_REQUIRED_FEATURES, banner_payload, 0); + + bufferlist bl; + bl.append(CEPH_BANNER_V2_PREFIX, strlen(CEPH_BANNER_V2_PREFIX)); + encode((uint16_t)banner_payload.length(), bl, 0); + bl.claim_append(banner_payload); + + INTERCEPT(state == BANNER_CONNECTING ? 3 : 4); + + return WRITE(bl, "banner", _wait_for_peer_banner); +} + +CtPtr ProtocolV2::_wait_for_peer_banner() { + unsigned banner_len = strlen(CEPH_BANNER_V2_PREFIX) + sizeof(__le16); + return READ(banner_len, _handle_peer_banner); +} + +CtPtr ProtocolV2::_handle_peer_banner(rx_buffer_t &&buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read peer banner failed r=" << r << " (" + << cpp_strerror(r) << ")" << dendl; + return _fault(); + } + + unsigned banner_prefix_len = strlen(CEPH_BANNER_V2_PREFIX); + + if (memcmp(buffer->c_str(), CEPH_BANNER_V2_PREFIX, banner_prefix_len)) { + if (memcmp(buffer->c_str(), CEPH_BANNER, strlen(CEPH_BANNER)) == 0) { + lderr(cct) << __func__ << " peer " << *connection->peer_addrs + << " is using msgr V1 protocol" << dendl; + return _fault(); + } + ldout(cct, 1) << __func__ << " accept peer sent bad banner" << dendl; + return _fault(); + } + + uint16_t payload_len; + bufferlist bl; + buffer->set_offset(banner_prefix_len); + buffer->set_length(sizeof(__le16)); + bl.push_back(std::move(buffer)); + auto ti = bl.cbegin(); + try { + decode(payload_len, ti); + } catch (const buffer::error &e) { + lderr(cct) << __func__ << " decode banner payload len failed " << dendl; + return _fault(); + } + + INTERCEPT(state == BANNER_CONNECTING ? 
5 : 6); + + return READ(payload_len, _handle_peer_banner_payload); +} + +CtPtr ProtocolV2::_handle_peer_banner_payload(rx_buffer_t &&buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read peer banner payload failed r=" << r + << " (" << cpp_strerror(r) << ")" << dendl; + return _fault(); + } + + uint64_t peer_supported_features; + uint64_t peer_required_features; + + bufferlist bl; + bl.push_back(std::move(buffer)); + auto ti = bl.cbegin(); + try { + decode(peer_supported_features, ti); + decode(peer_required_features, ti); + } catch (const buffer::error &e) { + lderr(cct) << __func__ << " decode banner payload failed " << dendl; + return _fault(); + } + + ldout(cct, 1) << __func__ << " supported=" << std::hex + << peer_supported_features << " required=" << std::hex + << peer_required_features << std::dec << dendl; + + // Check feature bit compatibility + + uint64_t supported_features = CEPH_MSGR2_SUPPORTED_FEATURES; + uint64_t required_features = CEPH_MSGR2_REQUIRED_FEATURES; + + if ((required_features & peer_supported_features) != required_features) { + ldout(cct, 1) << __func__ << " peer does not support all required features" + << " required=" << std::hex << required_features + << " supported=" << std::hex << peer_supported_features + << std::dec << dendl; + stop(); + connection->dispatch_queue->queue_reset(connection); + return nullptr; + } + if ((supported_features & peer_required_features) != peer_required_features) { + ldout(cct, 1) << __func__ << " we do not support all peer required features" + << " required=" << std::hex << peer_required_features + << " supported=" << supported_features << std::dec << dendl; + stop(); + connection->dispatch_queue->queue_reset(connection); + return nullptr; + } + + this->peer_supported_features = peer_supported_features; + if (peer_required_features == 0) { + this->connection_features = msgr2_required; + } + + // if the peer supports msgr2.1, switch to it + bool is_rev1 = HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1); + tx_frame_asm.set_is_rev1(is_rev1); + rx_frame_asm.set_is_rev1(is_rev1); + + if (state == BANNER_CONNECTING) { + state = HELLO_CONNECTING; + } + else { + ceph_assert(state == BANNER_ACCEPTING); + state = HELLO_ACCEPTING; + } + + auto hello = HelloFrame::Encode(messenger->get_mytype(), + connection->target_addr); + + INTERCEPT(state == HELLO_CONNECTING ? 7 : 8); + + return WRITE(hello, "hello frame", read_frame); +} + +CtPtr ProtocolV2::handle_hello(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != HELLO_CONNECTING && state != HELLO_ACCEPTING) { + lderr(cct) << __func__ << " not in hello exchange state!" 
               << dendl;
+    return _fault();
+  }
+
+  auto hello = HelloFrame::Decode(payload);
+
+  ldout(cct, 5) << __func__ << " received hello:"
+                << " peer_type=" << (int)hello.entity_type()
+                << " peer_addr_for_me=" << hello.peer_addr() << dendl;
+
+  sockaddr_storage ss;
+  socklen_t len = sizeof(ss);
+  getsockname(connection->cs.fd(), (sockaddr *)&ss, &len);
+  ldout(cct, 5) << __func__ << " getsockname says I am " << (sockaddr *)&ss
+                << " when talking to " << connection->target_addr << dendl;
+
+  if (connection->get_peer_type() == -1) {
+    connection->set_peer_type(hello.entity_type());
+
+    ceph_assert(state == HELLO_ACCEPTING);
+    connection->policy = messenger->get_policy(hello.entity_type());
+    ldout(cct, 10) << __func__ << " accept of host_type "
+                   << (int)hello.entity_type()
+                   << ", policy.lossy=" << connection->policy.lossy
+                   << " policy.server=" << connection->policy.server
+                   << " policy.standby=" << connection->policy.standby
+                   << " policy.resetcheck=" << connection->policy.resetcheck
+                   << dendl;
+  } else {
+    ceph_assert(state == HELLO_CONNECTING);
+    if (connection->get_peer_type() != hello.entity_type()) {
+      ldout(cct, 1) << __func__ << " connection peer type does not match what"
+                    << " peer advertises " << connection->get_peer_type()
+                    << " != " << (int)hello.entity_type() << dendl;
+      stop();
+      connection->dispatch_queue->queue_reset(connection);
+      return nullptr;
+    }
+  }
+
+  if (messenger->get_myaddrs().empty() ||
+      messenger->get_myaddrs().front().is_blank_ip()) {
+    entity_addr_t a;
+    if (cct->_conf->ms_learn_addr_from_peer) {
+      ldout(cct, 1) << __func__ << " peer " << connection->target_addr
+                    << " says I am " << hello.peer_addr() << " (socket says "
+                    << (sockaddr*)&ss << ")" << dendl;
+      a = hello.peer_addr();
+    } else {
+      ldout(cct, 1) << __func__ << " socket to " << connection->target_addr
+                    << " says I am " << (sockaddr*)&ss
+                    << " (peer says " << hello.peer_addr() << ")" << dendl;
+      a.set_sockaddr((sockaddr *)&ss);
+    }
+    a.set_type(entity_addr_t::TYPE_MSGR2); // anything but NONE; learned_addr ignores this
+    a.set_port(0);
+    connection->lock.unlock();
+    messenger->learned_addr(a);
+    if (cct->_conf->ms_inject_internal_delays &&
+        cct->_conf->ms_inject_socket_failures) {
+      if (rand() % cct->_conf->ms_inject_socket_failures == 0) {
+        ldout(cct, 10) << __func__ << " sleep for "
+                       << cct->_conf->ms_inject_internal_delays << dendl;
+        utime_t t;
+        t.set_from_double(cct->_conf->ms_inject_internal_delays);
+        t.sleep();
+      }
+    }
+    connection->lock.lock();
+    if (state != HELLO_CONNECTING) {
+      ldout(cct, 1) << __func__
+                    << " state changed while in learned_addr; a mark_down or"
+                    << " replacing must have happened just now" << dendl;
+      return nullptr;
+    }
+  }
+
+  CtPtr callback = bannerExchangeCallback;
+  bannerExchangeCallback = nullptr;
+  ceph_assert(callback);
+  return callback;
+}
+
+CtPtr ProtocolV2::read_frame() {
+  if (state == CLOSED) {
+    return nullptr;
+  }
+
+  ldout(cct, 20) << __func__ << dendl;
+  rx_preamble.clear();
+  rx_epilogue.clear();
+  rx_segments_data.clear();
+
+  return READ(rx_frame_asm.get_preamble_onwire_len(),
+              handle_read_frame_preamble_main);
+}
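+
+// For orientation: the fixed-size block that disassemble_preamble() parses
+// below has roughly this shape. An illustrative sketch only -- field names
+// here are assumptions and the authoritative layout lives in frames_v2.h;
+// on the wire the struct is packed (32 bytes, little-endian, plus an
+// authentication tag in secure mode):
+struct sketch_preamble_block {
+  uint8_t tag;              // frame Tag (HELLO, MESSAGE, ACK, ...)
+  uint8_t num_segments;     // 1..4 logical segments in this frame
+  struct {
+    uint32_t length;        // payload length of the segment
+    uint16_t alignment;     // requested rx buffer alignment
+  } segments[4];
+  uint8_t flags;
+  uint8_t _reserved;
+  uint32_t crc;             // crc32c guarding the bytes above
+};
+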
+CtPtr ProtocolV2::handle_read_frame_preamble_main(rx_buffer_t &&buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read frame preamble failed r=" << r
+                  << " (" << cpp_strerror(r) << ")" << dendl;
+    return _fault();
+  }
+
+  rx_preamble.push_back(std::move(buffer));
+
+  ldout(cct, 30) << __func__ << " preamble\n";
+  rx_preamble.hexdump(*_dout);
+  *_dout << dendl;
+
+  try {
+    next_tag = rx_frame_asm.disassemble_preamble(rx_preamble);
+  } catch (FrameError& e) {
+    ldout(cct, 1) << __func__ << " " << e.what() << dendl;
+    return _fault();
+  } catch (ceph::crypto::onwire::MsgAuthError&) {
+    ldout(cct, 1) << __func__ << " bad auth tag" << dendl;
+    return _fault();
+  }
+
+  ldout(cct, 25) << __func__ << " disassembled preamble " << rx_frame_asm
+                 << dendl;
+
+  if (session_stream_handlers.rx) {
+    ldout(cct, 30) << __func__ << " preamble after decrypt\n";
+    rx_preamble.hexdump(*_dout);
+    *_dout << dendl;
+  }
+
+  // do we need to throttle?
+  if (next_tag == Tag::MESSAGE) {
+    if (state != READY) {
+      lderr(cct) << __func__ << " not in ready state!" << dendl;
+      return _fault();
+    }
+    state = THROTTLE_MESSAGE;
+    return CONTINUE(throttle_message);
+  } else {
+    return read_frame_segment();
+  }
+}
+
+CtPtr ProtocolV2::handle_read_frame_dispatch() {
+  ldout(cct, 10) << __func__
+                 << " tag=" << static_cast<uint32_t>(next_tag) << dendl;
+
+  switch (next_tag) {
+    case Tag::HELLO:
+    case Tag::AUTH_REQUEST:
+    case Tag::AUTH_BAD_METHOD:
+    case Tag::AUTH_REPLY_MORE:
+    case Tag::AUTH_REQUEST_MORE:
+    case Tag::AUTH_DONE:
+    case Tag::AUTH_SIGNATURE:
+    case Tag::CLIENT_IDENT:
+    case Tag::SERVER_IDENT:
+    case Tag::IDENT_MISSING_FEATURES:
+    case Tag::SESSION_RECONNECT:
+    case Tag::SESSION_RESET:
+    case Tag::SESSION_RETRY:
+    case Tag::SESSION_RETRY_GLOBAL:
+    case Tag::SESSION_RECONNECT_OK:
+    case Tag::KEEPALIVE2:
+    case Tag::KEEPALIVE2_ACK:
+    case Tag::ACK:
+    case Tag::WAIT:
+      return handle_frame_payload();
+    case Tag::MESSAGE:
+      return handle_message();
+    default: {
+      lderr(cct) << __func__
+                 << " received unknown tag=" << static_cast<uint32_t>(next_tag)
+                 << dendl;
+      return _fault();
+    }
+  }
+
+  return nullptr;
+}
+
+CtPtr ProtocolV2::read_frame_segment() {
+  size_t seg_idx = rx_segments_data.size();
+  ldout(cct, 20) << __func__ << " seg_idx=" << seg_idx << dendl;
+  rx_segments_data.emplace_back();
+
+  uint32_t onwire_len = rx_frame_asm.get_segment_onwire_len(seg_idx);
+  if (onwire_len == 0) {
+    return _handle_read_frame_segment();
+  }
+
+  rx_buffer_t rx_buffer;
+  uint16_t align = rx_frame_asm.get_segment_align(seg_idx);
+  try {
+    rx_buffer = buffer::ptr_node::create(buffer::create_aligned(
+        onwire_len, align));
+  } catch (std::bad_alloc&) {
+    // Catching because of potential issues with satisfying alignment.
+    ldout(cct, 1) << __func__ << " can't allocate aligned rx_buffer"
+                  << " len=" << onwire_len
+                  << " align=" << align
+                  << dendl;
+    return _fault();
+  }
+
+  return READ_RXBUF(std::move(rx_buffer), handle_read_frame_segment);
+}
+
+CtPtr ProtocolV2::handle_read_frame_segment(rx_buffer_t &&rx_buffer, int r) {
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read frame segment failed r=" << r << " ("
+                  << cpp_strerror(r) << ")" << dendl;
+    return _fault();
+  }
+
+  rx_segments_data.back().push_back(std::move(rx_buffer));
+  return _handle_read_frame_segment();
+}
+
+CtPtr ProtocolV2::_handle_read_frame_segment() {
+  if (rx_segments_data.size() == rx_frame_asm.get_num_segments()) {
+    // OK, all the segments we planned to read have been read. Can go with
+    // the epilogue.
+    uint32_t epilogue_onwire_len = rx_frame_asm.get_epilogue_onwire_len();
+    if (epilogue_onwire_len == 0) {
+      return _handle_read_frame_epilogue_main();
+    }
+    return READ(epilogue_onwire_len, handle_read_frame_epilogue_main);
+  }
+  // TODO: makeshift only; this will be more generic and throttled
+  return read_frame_segment();
+}
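+
+// A MESSAGE frame uses all four segments: segment 0 carries the fixed
+// ceph_msg_header2 and segments 1-3 carry the front/middle/data payloads
+// that handle_message() later hands to decode_message().
+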
+CtPtr ProtocolV2::handle_frame_payload() {
+  ceph_assert(!rx_segments_data.empty());
+  auto& payload = rx_segments_data.back();
+
+  ldout(cct, 30) << __func__ << "\n";
+  payload.hexdump(*_dout);
+  *_dout << dendl;
+
+  switch (next_tag) {
+    case Tag::HELLO:
+      return handle_hello(payload);
+    case Tag::AUTH_REQUEST:
+      return handle_auth_request(payload);
+    case Tag::AUTH_BAD_METHOD:
+      return handle_auth_bad_method(payload);
+    case Tag::AUTH_REPLY_MORE:
+      return handle_auth_reply_more(payload);
+    case Tag::AUTH_REQUEST_MORE:
+      return handle_auth_request_more(payload);
+    case Tag::AUTH_DONE:
+      return handle_auth_done(payload);
+    case Tag::AUTH_SIGNATURE:
+      return handle_auth_signature(payload);
+    case Tag::CLIENT_IDENT:
+      return handle_client_ident(payload);
+    case Tag::SERVER_IDENT:
+      return handle_server_ident(payload);
+    case Tag::IDENT_MISSING_FEATURES:
+      return handle_ident_missing_features(payload);
+    case Tag::SESSION_RECONNECT:
+      return handle_reconnect(payload);
+    case Tag::SESSION_RESET:
+      return handle_session_reset(payload);
+    case Tag::SESSION_RETRY:
+      return handle_session_retry(payload);
+    case Tag::SESSION_RETRY_GLOBAL:
+      return handle_session_retry_global(payload);
+    case Tag::SESSION_RECONNECT_OK:
+      return handle_reconnect_ok(payload);
+    case Tag::KEEPALIVE2:
+      return handle_keepalive2(payload);
+    case Tag::KEEPALIVE2_ACK:
+      return handle_keepalive2_ack(payload);
+    case Tag::ACK:
+      return handle_message_ack(payload);
+    case Tag::WAIT:
+      return handle_wait(payload);
+    default:
+      ceph_abort();
+  }
+  return nullptr;
+}
+
+CtPtr ProtocolV2::ready() {
+  ldout(cct, 25) << __func__ << dendl;
+
+  reconnecting = false;
+  replacing = false;
+
+  // make sure no pending tick timer
+  if (connection->last_tick_id) {
+    connection->center->delete_time_event(connection->last_tick_id);
+  }
+  connection->last_tick_id = connection->center->create_time_event(
+      connection->inactive_timeout_us, connection->tick_handler);
+
+  {
+    std::lock_guard<std::mutex> l(connection->write_lock);
+    can_write = true;
+    if (!out_queue.empty()) {
+      connection->center->dispatch_event_external(connection->write_handler);
+    }
+  }
+
+  connection->maybe_start_delay_thread();
+
+  state = READY;
+  ldout(cct, 1) << __func__ << " entity=" << peer_name << " client_cookie="
+                << std::hex << client_cookie << " server_cookie="
+                << server_cookie << std::dec << " in_seq=" << in_seq
+                << " out_seq=" << out_seq << dendl;
+
+  INTERCEPT(15);
+
+  return CONTINUE(read_frame);
+}
+
+CtPtr ProtocolV2::handle_read_frame_epilogue_main(rx_buffer_t &&buffer, int r)
+{
+  ldout(cct, 20) << __func__ << " r=" << r << dendl;
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " read frame epilogue failed r=" << r
+                  << " (" << cpp_strerror(r) << ")" << dendl;
+    return _fault();
+  }
+
+  rx_epilogue.push_back(std::move(buffer));
+  return _handle_read_frame_epilogue_main();
+}
+
+CtPtr ProtocolV2::_handle_read_frame_epilogue_main() {
+  bool aborted;
+  try {
+    rx_frame_asm.disassemble_first_segment(rx_preamble, rx_segments_data[0]);
+    aborted = !rx_frame_asm.disassemble_remaining_segments(
+        rx_segments_data.data(), rx_epilogue);
+  } catch (FrameError& e) {
+    ldout(cct, 1) << __func__ << " " << e.what() << dendl;
+    return _fault();
+  } catch (ceph::crypto::onwire::MsgAuthError&) {
+    ldout(cct, 1) << __func__ << " bad auth tag" << dendl;
+    return _fault();
+  }
+
+  // we do have a mechanism that allows the transmitter to start sending
+  // a message and abort after putting the entire data
field on wire. This will be used by + // the kernel client to avoid unnecessary buffering. + if (aborted) { + reset_throttle(); + state = READY; + return CONTINUE(read_frame); + } + return handle_read_frame_dispatch(); +} + +CtPtr ProtocolV2::handle_message() { + ldout(cct, 20) << __func__ << dendl; + ceph_assert(state == THROTTLE_DONE); + +#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE) + ltt_recv_stamp = ceph_clock_now(); +#endif + recv_stamp = ceph_clock_now(); + + const size_t cur_msg_size = get_current_msg_size(); + auto msg_frame = MessageFrame::Decode(rx_segments_data); + + // XXX: paranoid copy just to avoid oops + ceph_msg_header2 current_header = msg_frame.header(); + + ldout(cct, 5) << __func__ + << " got " << msg_frame.front_len() + << " + " << msg_frame.middle_len() + << " + " << msg_frame.data_len() + << " byte message." + << " envelope type=" << current_header.type + << " src " << peer_name + << " off " << current_header.data_off + << dendl; + + INTERCEPT(16); + ceph_msg_header header{current_header.seq, + current_header.tid, + current_header.type, + current_header.priority, + current_header.version, + init_le32(msg_frame.front_len()), + init_le32(msg_frame.middle_len()), + init_le32(msg_frame.data_len()), + current_header.data_off, + peer_name, + current_header.compat_version, + current_header.reserved, + init_le32(0)}; + ceph_msg_footer footer{init_le32(0), init_le32(0), + init_le32(0), init_le64(0), current_header.flags}; + + Message *message = decode_message(cct, 0, header, footer, + msg_frame.front(), + msg_frame.middle(), + msg_frame.data(), + connection); + if (!message) { + ldout(cct, 1) << __func__ << " decode message failed " << dendl; + return _fault(); + } else { + state = READ_MESSAGE_COMPLETE; + } + + INTERCEPT(17); + + message->set_byte_throttler(connection->policy.throttler_bytes); + message->set_message_throttler(connection->policy.throttler_messages); + + // store reservation size in message, so we don't get confused + // by messages entering the dispatch queue through other paths. + message->set_dispatch_throttle_size(cur_msg_size); + + message->set_recv_stamp(recv_stamp); + message->set_throttle_stamp(throttle_stamp); + message->set_recv_complete_stamp(ceph_clock_now()); + + // check received seq#. if it is old, drop the message. + // note that incoming messages may skip ahead. this is convenient for the + // client side queueing because messages can't be renumbered, but the (kernel) + // client will occasionally pull a message out of the sent queue to send + // elsewhere. in that case it doesn't matter if we "got" it or not. + uint64_t cur_seq = in_seq; + if (message->get_seq() <= cur_seq) { + ldout(cct, 0) << __func__ << " got old message " << message->get_seq() + << " <= " << cur_seq << " " << message << " " << *message + << ", discarding" << dendl; + message->put(); + if (connection->has_feature(CEPH_FEATURE_RECONNECT_SEQ) && + cct->_conf->ms_die_on_old_message) { + ceph_assert(0 == "old msgs despite reconnect_seq feature"); + } + return nullptr; + } + if (message->get_seq() > cur_seq + 1) { + ldout(cct, 0) << __func__ << " missed message? 
skipped from seq "
+                  << cur_seq << " to " << message->get_seq() << dendl;
+    if (cct->_conf->ms_die_on_skipped_message) {
+      ceph_assert(0 == "skipped incoming seq");
+    }
+  }
+
+#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE)
+  if (message->get_type() == CEPH_MSG_OSD_OP ||
+      message->get_type() == CEPH_MSG_OSD_OPREPLY) {
+    utime_t ltt_processed_stamp = ceph_clock_now();
+    double usecs_elapsed =
+        (ltt_processed_stamp.to_nsec() - ltt_recv_stamp.to_nsec()) / 1000;
+    ostringstream buf;
+    if (message->get_type() == CEPH_MSG_OSD_OP)
+      OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OP",
+                           false);
+    else
+      OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OPREPLY",
+                           false);
+  }
+#endif
+
+  // note last received message.
+  in_seq = message->get_seq();
+  ldout(cct, 5) << __func__ << " received message m=" << message
+                << " seq=" << message->get_seq()
+                << " from=" << message->get_source() << " type=" << header.type
+                << " " << *message << dendl;
+
+  bool need_dispatch_writer = false;
+  if (!connection->policy.lossy) {
+    ack_left++;
+    need_dispatch_writer = true;
+  }
+
+  state = READY;
+
+  connection->logger->inc(l_msgr_recv_messages);
+  connection->logger->inc(l_msgr_recv_bytes,
+                          rx_frame_asm.get_frame_onwire_len());
+
+  messenger->ms_fast_preprocess(message);
+  auto fast_dispatch_time = ceph::mono_clock::now();
+  connection->logger->tinc(l_msgr_running_recv_time,
+                           fast_dispatch_time - connection->recv_start_time);
+  if (connection->delay_state) {
+    double delay_period = 0;
+    if (rand() % 10000 < cct->_conf->ms_inject_delay_probability * 10000.0) {
+      delay_period =
+          cct->_conf->ms_inject_delay_max * (double)(rand() % 10000) / 10000.0;
+      ldout(cct, 1) << "queue_received will delay after "
+                    << (ceph_clock_now() + delay_period) << " on " << message
+                    << " " << *message << dendl;
+    }
+    connection->delay_state->queue(delay_period, message);
+  } else if (messenger->ms_can_fast_dispatch(message)) {
+    connection->lock.unlock();
+    connection->dispatch_queue->fast_dispatch(message);
+    connection->recv_start_time = ceph::mono_clock::now();
+    connection->logger->tinc(l_msgr_running_fast_dispatch_time,
+                             connection->recv_start_time - fast_dispatch_time);
+    connection->lock.lock();
+  } else {
+    connection->dispatch_queue->enqueue(message, message->get_priority(),
+                                        connection->conn_id);
+  }
+
+  handle_message_ack(current_header.ack_seq);
+
+  // we might have been reused by another connection
+  // let's check if that is the case
+  if (state != READY) {
+    // yes, that was the case, let's do nothing
+    return nullptr;
+  }
+
+  if (need_dispatch_writer && connection->is_connected()) {
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+
+  return CONTINUE(read_frame);
+}
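+
+// The three throttle stages below (messages, bytes, dispatch queue) share
+// one try-or-sleep pattern: attempt a non-blocking reservation and, if it
+// fails, arm a ~1 ms wakeup and suspend the continuation by returning
+// nullptr. The same idea in a self-contained form; an illustrative sketch
+// with a toy atomic counter, not the Throttle class these stages actually
+// use (assumes <atomic> is reachable here):
+static inline bool sketch_try_reserve(std::atomic<uint64_t> &used,
+                                      uint64_t max, uint64_t want) {
+  uint64_t cur = used.load();
+  do {
+    if (cur + want > max)
+      return false;  // caller schedules a 1 ms time event and bails out
+  } while (!used.compare_exchange_weak(cur, cur + want));
+  return true;       // reservation held; advance to the next throttle stage
+}
+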
+CtPtr ProtocolV2::throttle_message() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  if (connection->policy.throttler_messages) {
+    ldout(cct, 10) << __func__ << " wants " << 1
+                   << " message from policy throttler "
+                   << connection->policy.throttler_messages->get_current()
+                   << "/" << connection->policy.throttler_messages->get_max()
+                   << dendl;
+    if (!connection->policy.throttler_messages->get_or_fail()) {
+      ldout(cct, 10) << __func__ << " wants 1 message from policy throttle "
+                     << connection->policy.throttler_messages->get_current()
+                     << "/" << connection->policy.throttler_messages->get_max()
+                     << " failed, just wait." << dendl;
+      // draining the full message queue through the dispatch thread pool
+      // can take a while, so wait a millisecond and retry
+      if (connection->register_time_events.empty()) {
+        connection->register_time_events.insert(
+            connection->center->create_time_event(1000,
+                                                  connection->wakeup_handler));
+      }
+      return nullptr;
+    }
+  }
+
+  state = THROTTLE_BYTES;
+  return CONTINUE(throttle_bytes);
+}
+
+CtPtr ProtocolV2::throttle_bytes() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const size_t cur_msg_size = get_current_msg_size();
+  if (cur_msg_size) {
+    if (connection->policy.throttler_bytes) {
+      ldout(cct, 10) << __func__ << " wants " << cur_msg_size
+                     << " bytes from policy throttler "
+                     << connection->policy.throttler_bytes->get_current() << "/"
+                     << connection->policy.throttler_bytes->get_max() << dendl;
+      if (!connection->policy.throttler_bytes->get_or_fail(cur_msg_size)) {
+        ldout(cct, 10) << __func__ << " wants " << cur_msg_size
+                       << " bytes from policy throttler "
+                       << connection->policy.throttler_bytes->get_current()
+                       << "/" << connection->policy.throttler_bytes->get_max()
+                       << " failed, just wait." << dendl;
+        // draining the full message queue through the dispatch thread pool
+        // can take a while, so wait a millisecond and retry
+        if (connection->register_time_events.empty()) {
+          connection->register_time_events.insert(
+              connection->center->create_time_event(
+                  1000, connection->wakeup_handler));
+        }
+        return nullptr;
+      }
+    }
+  }
+
+  state = THROTTLE_DISPATCH_QUEUE;
+  return CONTINUE(throttle_dispatch_queue);
+}
+
+CtPtr ProtocolV2::throttle_dispatch_queue() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  const size_t cur_msg_size = get_current_msg_size();
+  if (cur_msg_size) {
+    if (!connection->dispatch_queue->dispatch_throttler.get_or_fail(
+            cur_msg_size)) {
+      ldout(cct, 10)
+          << __func__ << " wants " << cur_msg_size
+          << " bytes from dispatch throttle "
+          << connection->dispatch_queue->dispatch_throttler.get_current() << "/"
+          << connection->dispatch_queue->dispatch_throttler.get_max()
+          << " failed, just wait." << dendl;
+      // draining the full message queue through the dispatch thread pool
+      // can take a while, so wait a millisecond and retry
+      if (connection->register_time_events.empty()) {
+        connection->register_time_events.insert(
+            connection->center->create_time_event(1000,
+                                                  connection->wakeup_handler));
+      }
+      return nullptr;
+    }
+  }
+
+  throttle_stamp = ceph_clock_now();
+  state = THROTTLE_DONE;
+
+  return read_frame_segment();
+}
+
+CtPtr ProtocolV2::handle_keepalive2(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != READY) {
+    lderr(cct) << __func__ << " not in ready state!" << dendl;
+    return _fault();
+  }
+
+  auto keepalive_frame = KeepAliveFrame::Decode(payload);
+
+  ldout(cct, 30) << __func__ << " got KEEPALIVE2 tag ..." << dendl;
+
+  connection->write_lock.lock();
+  auto keepalive_ack_frame = KeepAliveFrameAck::Encode(keepalive_frame.timestamp());
+  if (!append_frame(keepalive_ack_frame)) {
+    connection->write_lock.unlock();
+    return _fault();
+  }
+  connection->write_lock.unlock();
+
+  ldout(cct, 20) << __func__ << " got KEEPALIVE2 "
+                 << keepalive_frame.timestamp() << dendl;
+  connection->set_last_keepalive(ceph_clock_now());
+
+  if (is_connected()) {
+    connection->center->dispatch_event_external(connection->write_handler);
+  }
+
+  return CONTINUE(read_frame);
+}
+
+CtPtr ProtocolV2::handle_keepalive2_ack(ceph::bufferlist &payload)
+{
+  ldout(cct, 20) << __func__
+                 << " payload.length()=" << payload.length() << dendl;
+
+  if (state != READY) {
+    lderr(cct) << __func__ << " not in ready state!"
<< dendl; + return _fault(); + } + + auto keepalive_ack_frame = KeepAliveFrameAck::Decode(payload); + connection->set_last_keepalive_ack(keepalive_ack_frame.timestamp()); + ldout(cct, 20) << __func__ << " got KEEPALIVE_ACK" << dendl; + + return CONTINUE(read_frame); +} + +CtPtr ProtocolV2::handle_message_ack(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != READY) { + lderr(cct) << __func__ << " not in ready state!" << dendl; + return _fault(); + } + + auto ack = AckFrame::Decode(payload); + handle_message_ack(ack.seq()); + return CONTINUE(read_frame); +} + +/* Client Protocol Methods */ + +CtPtr ProtocolV2::start_client_banner_exchange() { + ldout(cct, 20) << __func__ << dendl; + + INTERCEPT(1); + + state = BANNER_CONNECTING; + + global_seq = messenger->get_global_seq(); + + return _banner_exchange(CONTINUATION(post_client_banner_exchange)); +} + +CtPtr ProtocolV2::post_client_banner_exchange() { + ldout(cct, 20) << __func__ << dendl; + + state = AUTH_CONNECTING; + + return send_auth_request(); +} + +CtPtr ProtocolV2::send_auth_request(std::vector<uint32_t> &allowed_methods) { + ldout(cct, 20) << __func__ << " peer_type " << (int)connection->peer_type + << " auth_client " << messenger->auth_client << dendl; + ceph_assert(messenger->auth_client); + + bufferlist bl; + vector<uint32_t> preferred_modes; + auto am = auth_meta; + connection->lock.unlock(); + int r = messenger->auth_client->get_auth_request( + connection, am.get(), + &am->auth_method, &preferred_modes, &bl); + connection->lock.lock(); + if (state != AUTH_CONNECTING) { + ldout(cct, 1) << __func__ << " state changed!" << dendl; + return _fault(); + } + if (r < 0) { + ldout(cct, 0) << __func__ << " get_initial_auth_request returned " << r + << dendl; + stop(); + connection->dispatch_queue->queue_reset(connection); + return nullptr; + } + + INTERCEPT(9); + + auto frame = AuthRequestFrame::Encode(auth_meta->auth_method, preferred_modes, + bl); + return WRITE(frame, "auth request", read_frame); +} + +CtPtr ProtocolV2::handle_auth_bad_method(ceph::bufferlist &payload) { + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != AUTH_CONNECTING) { + lderr(cct) << __func__ << " not in auth connect state!" << dendl; + return _fault(); + } + + auto bad_method = AuthBadMethodFrame::Decode(payload); + ldout(cct, 1) << __func__ << " method=" << bad_method.method() + << " result " << cpp_strerror(bad_method.result()) + << ", allowed methods=" << bad_method.allowed_methods() + << ", allowed modes=" << bad_method.allowed_modes() + << dendl; + ceph_assert(messenger->auth_client); + auto am = auth_meta; + connection->lock.unlock(); + int r = messenger->auth_client->handle_auth_bad_method( + connection, + am.get(), + bad_method.method(), bad_method.result(), + bad_method.allowed_methods(), + bad_method.allowed_modes()); + connection->lock.lock(); + if (state != AUTH_CONNECTING || r < 0) { + return _fault(); + } + return send_auth_request(bad_method.allowed_methods()); +} + +CtPtr ProtocolV2::handle_auth_reply_more(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != AUTH_CONNECTING) { + lderr(cct) << __func__ << " not in auth connect state!" 
<< dendl; + return _fault(); + } + + auto auth_more = AuthReplyMoreFrame::Decode(payload); + ldout(cct, 5) << __func__ + << " auth reply more len=" << auth_more.auth_payload().length() + << dendl; + ceph_assert(messenger->auth_client); + ceph::bufferlist reply; + auto am = auth_meta; + connection->lock.unlock(); + int r = messenger->auth_client->handle_auth_reply_more( + connection, am.get(), auth_more.auth_payload(), &reply); + connection->lock.lock(); + if (state != AUTH_CONNECTING) { + ldout(cct, 1) << __func__ << " state changed!" << dendl; + return _fault(); + } + if (r < 0) { + lderr(cct) << __func__ << " auth_client handle_auth_reply_more returned " + << r << dendl; + return _fault(); + } + auto more_reply = AuthRequestMoreFrame::Encode(reply); + return WRITE(more_reply, "auth request more", read_frame); +} + +CtPtr ProtocolV2::handle_auth_done(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != AUTH_CONNECTING) { + lderr(cct) << __func__ << " not in auth connect state!" << dendl; + return _fault(); + } + + auto auth_done = AuthDoneFrame::Decode(payload); + + ceph_assert(messenger->auth_client); + auto am = auth_meta; + connection->lock.unlock(); + int r = messenger->auth_client->handle_auth_done( + connection, + am.get(), + auth_done.global_id(), + auth_done.con_mode(), + auth_done.auth_payload(), + &am->session_key, + &am->connection_secret); + connection->lock.lock(); + if (state != AUTH_CONNECTING) { + ldout(cct, 1) << __func__ << " state changed!" << dendl; + return _fault(); + } + if (r < 0) { + return _fault(); + } + auth_meta->con_mode = auth_done.con_mode(); + bool is_rev1 = HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1); + session_stream_handlers = ceph::crypto::onwire::rxtx_t::create_handler_pair( + cct, *auth_meta, /*new_nonce_format=*/is_rev1, /*crossed=*/false); + + state = AUTH_CONNECTING_SIGN; + + const auto sig = auth_meta->session_key.empty() ? 
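+    // no session key was negotiated (e.g. auth_none): fall back to an
+    // all-zero digest; otherwise sign the pre-auth bytes received so far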
sha256_digest_t() : + auth_meta->session_key.hmac_sha256(cct, pre_auth.rxbuf); + auto sig_frame = AuthSignatureFrame::Encode(sig); + pre_auth.enabled = false; + pre_auth.rxbuf.clear(); + return WRITE(sig_frame, "auth signature", read_frame); +} + +CtPtr ProtocolV2::finish_client_auth() { + if (!server_cookie) { + ceph_assert(connect_seq == 0); + state = SESSION_CONNECTING; + return send_client_ident(); + } else { // reconnecting to previous session + state = SESSION_RECONNECTING; + ceph_assert(connect_seq > 0); + return send_reconnect(); + } +} + +CtPtr ProtocolV2::send_client_ident() { + ldout(cct, 20) << __func__ << dendl; + + if (!connection->policy.lossy && !client_cookie) { + client_cookie = ceph::util::generate_random_number<uint64_t>(1, -1ll); + } + + uint64_t flags = 0; + if (connection->policy.lossy) { + flags |= CEPH_MSG_CONNECT_LOSSY; + } + + auto client_ident = ClientIdentFrame::Encode( + messenger->get_myaddrs(), + connection->target_addr, + messenger->get_myname().num(), + global_seq, + connection->policy.features_supported, + connection->policy.features_required | msgr2_required, + flags, + client_cookie); + + ldout(cct, 5) << __func__ << " sending identification: " + << "addrs=" << messenger->get_myaddrs() + << " target=" << connection->target_addr + << " gid=" << messenger->get_myname().num() + << " global_seq=" << global_seq + << " features_supported=" << std::hex + << connection->policy.features_supported + << " features_required=" + << (connection->policy.features_required | msgr2_required) + << " flags=" << flags + << " cookie=" << client_cookie << std::dec << dendl; + + INTERCEPT(11); + + return WRITE(client_ident, "client ident", read_frame); +} + +CtPtr ProtocolV2::send_reconnect() { + ldout(cct, 20) << __func__ << dendl; + + auto reconnect = ReconnectFrame::Encode(messenger->get_myaddrs(), + client_cookie, + server_cookie, + global_seq, + connect_seq, + in_seq); + + ldout(cct, 5) << __func__ << " reconnect to session: client_cookie=" + << std::hex << client_cookie << " server_cookie=" + << server_cookie << std::dec + << " gs=" << global_seq << " cs=" << connect_seq + << " ms=" << in_seq << dendl; + + INTERCEPT(13); + + return WRITE(reconnect, "reconnect", read_frame); +} + +CtPtr ProtocolV2::handle_ident_missing_features(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != SESSION_CONNECTING) { + lderr(cct) << __func__ << " not in session connect state!" << dendl; + return _fault(); + } + + auto ident_missing = + IdentMissingFeaturesFrame::Decode(payload); + lderr(cct) << __func__ + << " client does not support all server features: " << std::hex + << ident_missing.features() << std::dec << dendl; + + return _fault(); +} + +CtPtr ProtocolV2::handle_session_reset(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != SESSION_RECONNECTING) { + lderr(cct) << __func__ << " not in session reconnect state!" 
<< dendl; + return _fault(); + } + + auto reset = ResetFrame::Decode(payload); + + ldout(cct, 1) << __func__ << " received session reset full=" << reset.full() + << dendl; + if (reset.full()) { + reset_session(); + } else { + server_cookie = 0; + connect_seq = 0; + in_seq = 0; + } + + state = SESSION_CONNECTING; + return send_client_ident(); +} + +CtPtr ProtocolV2::handle_session_retry(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != SESSION_RECONNECTING) { + lderr(cct) << __func__ << " not in session reconnect state!" << dendl; + return _fault(); + } + + auto retry = RetryFrame::Decode(payload); + connect_seq = retry.connect_seq() + 1; + + ldout(cct, 1) << __func__ + << " received session retry connect_seq=" << retry.connect_seq() + << ", inc to cs=" << connect_seq << dendl; + + return send_reconnect(); +} + +CtPtr ProtocolV2::handle_session_retry_global(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != SESSION_RECONNECTING) { + lderr(cct) << __func__ << " not in session reconnect state!" << dendl; + return _fault(); + } + + auto retry = RetryGlobalFrame::Decode(payload); + global_seq = messenger->get_global_seq(retry.global_seq()); + + ldout(cct, 1) << __func__ << " received session retry global global_seq=" + << retry.global_seq() << ", choose new gs=" << global_seq + << dendl; + + return send_reconnect(); +} + +CtPtr ProtocolV2::handle_wait(ceph::bufferlist &payload) { + ldout(cct, 20) << __func__ + << " received WAIT (connection race)" + << " payload.length()=" << payload.length() + << dendl; + + if (state != SESSION_CONNECTING && state != SESSION_RECONNECTING) { + lderr(cct) << __func__ << " not in session (re)connect state!" << dendl; + return _fault(); + } + + state = WAIT; + WaitFrame::Decode(payload); + return _fault(); +} + +CtPtr ProtocolV2::handle_reconnect_ok(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != SESSION_RECONNECTING) { + lderr(cct) << __func__ << " not in session reconnect state!" << dendl; + return _fault(); + } + + auto reconnect_ok = ReconnectOkFrame::Decode(payload); + ldout(cct, 5) << __func__ + << " reconnect accepted: sms=" << reconnect_ok.msg_seq() + << dendl; + + out_seq = discard_requeued_up_to(out_seq, reconnect_ok.msg_seq()); + + backoff = utime_t(); + ldout(cct, 10) << __func__ << " reconnect success " << connect_seq + << ", lossy = " << connection->policy.lossy << ", features " + << connection->get_features() << dendl; + + if (connection->delay_state) { + ceph_assert(connection->delay_state->ready()); + } + + connection->dispatch_queue->queue_connect(connection); + messenger->ms_deliver_handle_fast_connect(connection); + + return ready(); +} + +CtPtr ProtocolV2::handle_server_ident(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != SESSION_CONNECTING) { + lderr(cct) << __func__ << " not in session connect state!" 
<< dendl; + return _fault(); + } + + auto server_ident = ServerIdentFrame::Decode(payload); + ldout(cct, 5) << __func__ << " received server identification:" + << " addrs=" << server_ident.addrs() + << " gid=" << server_ident.gid() + << " global_seq=" << server_ident.global_seq() + << " features_supported=" << std::hex + << server_ident.supported_features() + << " features_required=" << server_ident.required_features() + << " flags=" << server_ident.flags() + << " cookie=" << server_ident.cookie() << std::dec << dendl; + + // is this who we intended to talk to? + // be a bit forgiving here, since we may be connecting based on addresses parsed out + // of mon_host or something. + if (!server_ident.addrs().contains(connection->target_addr)) { + ldout(cct,1) << __func__ << " peer identifies as " << server_ident.addrs() + << ", does not include " << connection->target_addr << dendl; + return _fault(); + } + + server_cookie = server_ident.cookie(); + + connection->set_peer_addrs(server_ident.addrs()); + peer_name = entity_name_t(connection->get_peer_type(), server_ident.gid()); + connection->set_features(server_ident.supported_features() & + connection->policy.features_supported); + peer_global_seq = server_ident.global_seq(); + + connection->policy.lossy = server_ident.flags() & CEPH_MSG_CONNECT_LOSSY; + + backoff = utime_t(); + ldout(cct, 10) << __func__ << " connect success " << connect_seq + << ", lossy = " << connection->policy.lossy << ", features " + << connection->get_features() << dendl; + + if (connection->delay_state) { + ceph_assert(connection->delay_state->ready()); + } + + connection->dispatch_queue->queue_connect(connection); + messenger->ms_deliver_handle_fast_connect(connection); + + return ready(); +} + +/* Server Protocol Methods */ + +CtPtr ProtocolV2::start_server_banner_exchange() { + ldout(cct, 20) << __func__ << dendl; + + INTERCEPT(2); + + state = BANNER_ACCEPTING; + + return _banner_exchange(CONTINUATION(post_server_banner_exchange)); +} + +CtPtr ProtocolV2::post_server_banner_exchange() { + ldout(cct, 20) << __func__ << dendl; + + state = AUTH_ACCEPTING; + + return CONTINUE(read_frame); +} + +CtPtr ProtocolV2::handle_auth_request(ceph::bufferlist &payload) { + ldout(cct, 20) << __func__ << " payload.length()=" << payload.length() + << dendl; + + if (state != AUTH_ACCEPTING) { + lderr(cct) << __func__ << " not in auth accept state!" 
<< dendl; + return _fault(); + } + + auto request = AuthRequestFrame::Decode(payload); + ldout(cct, 10) << __func__ << " AuthRequest(method=" << request.method() + << ", preferred_modes=" << request.preferred_modes() + << ", payload_len=" << request.auth_payload().length() << ")" + << dendl; + auth_meta->auth_method = request.method(); + auth_meta->con_mode = messenger->auth_server->pick_con_mode( + connection->get_peer_type(), auth_meta->auth_method, + request.preferred_modes()); + if (auth_meta->con_mode == CEPH_CON_MODE_UNKNOWN) { + return _auth_bad_method(-EOPNOTSUPP); + } + return _handle_auth_request(request.auth_payload(), false); +} + +CtPtr ProtocolV2::_auth_bad_method(int r) +{ + ceph_assert(r < 0); + std::vector<uint32_t> allowed_methods; + std::vector<uint32_t> allowed_modes; + messenger->auth_server->get_supported_auth_methods( + connection->get_peer_type(), &allowed_methods, &allowed_modes); + ldout(cct, 1) << __func__ << " auth_method " << auth_meta->auth_method + << " r " << cpp_strerror(r) + << ", allowed_methods " << allowed_methods + << ", allowed_modes " << allowed_modes + << dendl; + auto bad_method = AuthBadMethodFrame::Encode(auth_meta->auth_method, r, + allowed_methods, allowed_modes); + return WRITE(bad_method, "bad auth method", read_frame); +} + +CtPtr ProtocolV2::_handle_auth_request(bufferlist& auth_payload, bool more) +{ + if (!messenger->auth_server) { + return _fault(); + } + bufferlist reply; + auto am = auth_meta; + connection->lock.unlock(); + int r = messenger->auth_server->handle_auth_request( + connection, am.get(), + more, am->auth_method, auth_payload, + &reply); + connection->lock.lock(); + if (state != AUTH_ACCEPTING && state != AUTH_ACCEPTING_MORE) { + ldout(cct, 1) << __func__ + << " state changed while accept, it must be mark_down" + << dendl; + ceph_assert(state == CLOSED); + return _fault(); + } + if (r == 1) { + INTERCEPT(10); + state = AUTH_ACCEPTING_SIGN; + + auto auth_done = AuthDoneFrame::Encode(connection->peer_global_id, + auth_meta->con_mode, + reply); + return WRITE(auth_done, "auth done", finish_auth); + } else if (r == 0) { + state = AUTH_ACCEPTING_MORE; + + auto more = AuthReplyMoreFrame::Encode(reply); + return WRITE(more, "auth reply more", read_frame); + } else if (r == -EBUSY) { + // kick the client and maybe they'll come back later + return _fault(); + } else { + return _auth_bad_method(r); + } +} + +CtPtr ProtocolV2::finish_auth() +{ + ceph_assert(auth_meta); + // TODO: having a possibility to check whether we're server or client could + // allow reusing finish_auth(). + bool is_rev1 = HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1); + session_stream_handlers = ceph::crypto::onwire::rxtx_t::create_handler_pair( + cct, *auth_meta, /*new_nonce_format=*/is_rev1, /*crossed=*/true); + + const auto sig = auth_meta->session_key.empty() ? sha256_digest_t() : + auth_meta->session_key.hmac_sha256(cct, pre_auth.rxbuf); + auto sig_frame = AuthSignatureFrame::Encode(sig); + pre_auth.enabled = false; + pre_auth.rxbuf.clear(); + return WRITE(sig_frame, "auth signature", read_frame); +} + +CtPtr ProtocolV2::handle_auth_request_more(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != AUTH_ACCEPTING_MORE) { + lderr(cct) << __func__ << " not in auth accept more state!" 
<< dendl; + return _fault(); + } + + auto auth_more = AuthRequestMoreFrame::Decode(payload); + return _handle_auth_request(auth_more.auth_payload(), true); +} + +CtPtr ProtocolV2::handle_auth_signature(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != AUTH_ACCEPTING_SIGN && state != AUTH_CONNECTING_SIGN) { + lderr(cct) << __func__ + << " pre-auth verification signature seen in wrong state!" + << dendl; + return _fault(); + } + + auto sig_frame = AuthSignatureFrame::Decode(payload); + + const auto actual_tx_sig = auth_meta->session_key.empty() ? + sha256_digest_t() : auth_meta->session_key.hmac_sha256(cct, pre_auth.txbuf); + if (sig_frame.signature() != actual_tx_sig) { + ldout(cct, 2) << __func__ << " pre-auth signature mismatch" + << " actual_tx_sig=" << actual_tx_sig + << " sig_frame.signature()=" << sig_frame.signature() + << dendl; + return _fault(); + } else { + ldout(cct, 20) << __func__ << " pre-auth signature success" + << " sig_frame.signature()=" << sig_frame.signature() + << dendl; + pre_auth.txbuf.clear(); + } + + if (state == AUTH_ACCEPTING_SIGN) { + // server had sent AuthDone and client responded with correct pre-auth + // signature. we can start accepting new sessions/reconnects. + state = SESSION_ACCEPTING; + return CONTINUE(read_frame); + } else if (state == AUTH_CONNECTING_SIGN) { + // this happened at client side + return finish_client_auth(); + } else { + ceph_assert_always("state corruption" == nullptr); + } +} + +CtPtr ProtocolV2::handle_client_ident(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != SESSION_ACCEPTING) { + lderr(cct) << __func__ << " not in session accept state!" 
<< dendl; + return _fault(); + } + + auto client_ident = ClientIdentFrame::Decode(payload); + + ldout(cct, 5) << __func__ << " received client identification:" + << " addrs=" << client_ident.addrs() + << " target=" << client_ident.target_addr() + << " gid=" << client_ident.gid() + << " global_seq=" << client_ident.global_seq() + << " features_supported=" << std::hex + << client_ident.supported_features() + << " features_required=" << client_ident.required_features() + << " flags=" << client_ident.flags() + << " cookie=" << client_ident.cookie() << std::dec << dendl; + + if (client_ident.addrs().empty() || + client_ident.addrs().front() == entity_addr_t()) { + ldout(cct,5) << __func__ << " oops, client_ident.addrs() is empty" << dendl; + return _fault(); // a v2 peer should never do this + } + if (!messenger->get_myaddrs().contains(client_ident.target_addr())) { + ldout(cct,5) << __func__ << " peer is trying to reach " + << client_ident.target_addr() + << " which is not us (" << messenger->get_myaddrs() << ")" + << dendl; + return _fault(); + } + + connection->set_peer_addrs(client_ident.addrs()); + connection->target_addr = connection->_infer_target_addr(client_ident.addrs()); + + peer_name = entity_name_t(connection->get_peer_type(), client_ident.gid()); + connection->set_peer_id(client_ident.gid()); + + client_cookie = client_ident.cookie(); + + uint64_t feat_missing = + (connection->policy.features_required | msgr2_required) & + ~(uint64_t)client_ident.supported_features(); + if (feat_missing) { + ldout(cct, 1) << __func__ << " peer missing required features " << std::hex + << feat_missing << std::dec << dendl; + auto ident_missing_features = + IdentMissingFeaturesFrame::Encode(feat_missing); + + return WRITE(ident_missing_features, "ident missing features", read_frame); + } + + connection_features = + client_ident.supported_features() & connection->policy.features_supported; + + peer_global_seq = client_ident.global_seq(); + + // Looks good so far, let's check if there is already an existing connection + // to this peer. + + connection->lock.unlock(); + AsyncConnectionRef existing = messenger->lookup_conn(*connection->peer_addrs); + + if (existing && + existing->protocol->proto_type != 2) { + ldout(cct,1) << __func__ << " existing " << existing << " proto " + << existing->protocol.get() << " version is " + << existing->protocol->proto_type << ", marking down" << dendl; + existing->mark_down(); + existing = nullptr; + } + + connection->inject_delay(); + + connection->lock.lock(); + if (state != SESSION_ACCEPTING) { + ldout(cct, 1) << __func__ + << " state changed while accept, it must be mark_down" + << dendl; + ceph_assert(state == CLOSED); + return _fault(); + } + + if (existing) { + return handle_existing_connection(existing); + } + + // if everything is OK reply with server identification + return send_server_ident(); +} + +CtPtr ProtocolV2::handle_reconnect(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != SESSION_ACCEPTING) { + lderr(cct) << __func__ << " not in session accept state!" 
               << dendl;
+    return _fault();
+  }
+
+  auto reconnect = ReconnectFrame::Decode(payload);
+
+  ldout(cct, 5) << __func__
+                << " received reconnect:"
+                << " client_cookie=" << std::hex << reconnect.client_cookie()
+                << " server_cookie=" << reconnect.server_cookie() << std::dec
+                << " gs=" << reconnect.global_seq()
+                << " cs=" << reconnect.connect_seq()
+                << " ms=" << reconnect.msg_seq()
+                << dendl;
+
+  // Should we check if one of the ident.addrs match connection->target_addr
+  // as we do in ProtocolV1?
+  connection->set_peer_addrs(reconnect.addrs());
+  connection->target_addr = connection->_infer_target_addr(reconnect.addrs());
+  peer_global_seq = reconnect.global_seq();
+
+  connection->lock.unlock();
+  AsyncConnectionRef existing = messenger->lookup_conn(*connection->peer_addrs);
+
+  if (existing &&
+      existing->protocol->proto_type != 2) {
+    ldout(cct,1) << __func__ << " existing " << existing << " proto "
+                 << existing->protocol.get() << " version is "
+                 << existing->protocol->proto_type << ", marking down" << dendl;
+    existing->mark_down();
+    existing = nullptr;
+  }
+
+  connection->inject_delay();
+
+  connection->lock.lock();
+  if (state != SESSION_ACCEPTING) {
+    ldout(cct, 1) << __func__
+                  << " state changed while accept, it must be mark_down"
+                  << dendl;
+    ceph_assert(state == CLOSED);
+    return _fault();
+  }
+
+  if (!existing) {
+    // there is no existing connection, therefore we cannot reconnect to the
+    // previous session
+    ldout(cct, 0) << __func__
+                  << " no existing connection exists, resetting client" << dendl;
+    auto reset = ResetFrame::Encode(true);
+    return WRITE(reset, "session reset", read_frame);
+  }
+
+  std::lock_guard<std::mutex> l(existing->lock);
+
+  ProtocolV2 *exproto = dynamic_cast<ProtocolV2 *>(existing->protocol.get());
+  if (!exproto) {
+    ldout(cct, 1) << __func__ << " existing=" << existing << dendl;
+    ceph_assert(false);
+  }
+
+  if (exproto->state == CLOSED) {
+    ldout(cct, 5) << __func__ << " existing " << existing
+                  << " already closed. Resetting client" << dendl;
+    auto reset = ResetFrame::Encode(true);
+    return WRITE(reset, "session reset", read_frame);
+  }
+
+  if (exproto->replacing) {
+    ldout(cct, 1) << __func__
+                  << " existing racing replace happened while replacing."
+                  << " existing=" << existing << dendl;
+    auto retry = RetryGlobalFrame::Encode(exproto->peer_global_seq);
+    return WRITE(retry, "session retry", read_frame);
+  }
+
+  if (exproto->client_cookie != reconnect.client_cookie()) {
+    ldout(cct, 1) << __func__ << " existing=" << existing
+                  << " client cookie mismatch, I must have reset:"
+                  << " cc=" << std::hex << exproto->client_cookie
+                  << " rcc=" << reconnect.client_cookie()
+                  << ", resetting client." << std::dec
+                  << dendl;
+    auto reset = ResetFrame::Encode(connection->policy.resetcheck);
+    return WRITE(reset, "session reset", read_frame);
+  } else if (exproto->server_cookie == 0) {
+    // this happens when:
+    //   - a connects to b
+    //   - a sends client_ident
+    //   - b gets client_ident, sends server_ident and sets cookie X
+    //   - connection fault
+    //   - b reconnects to a with cookie X, connect_seq=1
+    //   - a has cookie==0
+    ldout(cct, 1) << __func__ << " I was a client and didn't receive the"
+                  << " server_ident. 
Asking peer to resume session" + << " establishment" << dendl; + auto reset = ResetFrame::Encode(false); + return WRITE(reset, "session reset", read_frame); + } + + if (exproto->peer_global_seq > reconnect.global_seq()) { + ldout(cct, 5) << __func__ + << " stale global_seq: sgs=" << exproto->peer_global_seq + << " cgs=" << reconnect.global_seq() + << ", ask client to retry global" << dendl; + auto retry = RetryGlobalFrame::Encode(exproto->peer_global_seq); + + INTERCEPT(18); + + return WRITE(retry, "session retry", read_frame); + } + + if (exproto->connect_seq > reconnect.connect_seq()) { + ldout(cct, 5) << __func__ + << " stale connect_seq scs=" << exproto->connect_seq + << " ccs=" << reconnect.connect_seq() + << " , ask client to retry" << dendl; + auto retry = RetryFrame::Encode(exproto->connect_seq); + return WRITE(retry, "session retry", read_frame); + } + + if (exproto->connect_seq == reconnect.connect_seq()) { + // reconnect race: both peers are sending reconnect messages + if (existing->peer_addrs->msgr2_addr() > + messenger->get_myaddrs().msgr2_addr() && + !existing->policy.server) { + // the existing connection wins + ldout(cct, 1) + << __func__ + << " reconnect race detected, this connection loses to existing=" + << existing << dendl; + + auto wait = WaitFrame::Encode(); + return WRITE(wait, "wait", read_frame); + } else { + // this connection wins + ldout(cct, 1) << __func__ + << " reconnect race detected, replacing existing=" + << existing << " socket by this connection's socket" + << dendl; + } + } + + ldout(cct, 1) << __func__ << " reconnect to existing=" << existing << dendl; + + reconnecting = true; + + // everything looks good + exproto->connect_seq = reconnect.connect_seq(); + exproto->message_seq = reconnect.msg_seq(); + + return reuse_connection(existing, exproto); +} + +CtPtr ProtocolV2::handle_existing_connection(AsyncConnectionRef existing) { + ldout(cct, 20) << __func__ << " existing=" << existing << dendl; + + std::lock_guard<std::mutex> l(existing->lock); + + ProtocolV2 *exproto = dynamic_cast<ProtocolV2 *>(existing->protocol.get()); + if (!exproto) { + ldout(cct, 1) << __func__ << " existing=" << existing << dendl; + ceph_assert(false); + } + + if (exproto->state == CLOSED) { + ldout(cct, 1) << __func__ << " existing " << existing << " already closed." + << dendl; + return send_server_ident(); + } + + if (exproto->replacing) { + ldout(cct, 1) << __func__ + << " existing racing replace happened while replacing." + << " existing=" << existing << dendl; + auto wait = WaitFrame::Encode(); + return WRITE(wait, "wait", read_frame); + } + + if (exproto->peer_global_seq > peer_global_seq) { + ldout(cct, 1) << __func__ << " this is a stale connection, peer_global_seq=" + << peer_global_seq + << " existing->peer_global_seq=" << exproto->peer_global_seq + << ", stopping this connection." << dendl; + stop(); + connection->dispatch_queue->queue_reset(connection); + return nullptr; + } + + if (existing->policy.lossy) { + // existing connection can be thrown out in favor of this one + ldout(cct, 1) + << __func__ << " existing=" << existing + << " is a lossy channel. 
Stopping existing in favor of this connection"
+        << dendl;
+    existing->protocol->stop();
+    existing->dispatch_queue->queue_reset(existing.get());
+    return send_server_ident();
+  }
+
+  if (exproto->server_cookie && exproto->client_cookie &&
+      exproto->client_cookie != client_cookie) {
+    // Found previous session
+    // peer has reset and we're going to reuse the existing connection
+    // by replacing the communication socket
+    ldout(cct, 1) << __func__ << " found previous session existing=" << existing
+                  << ", peer must have reset." << dendl;
+    if (connection->policy.resetcheck) {
+      exproto->reset_session();
+    }
+    return reuse_connection(existing, exproto);
+  }
+
+  if (exproto->client_cookie == client_cookie) {
+    // session establishment interrupted between client_ident and server_ident,
+    // continuing...
+    ldout(cct, 1) << __func__ << " found previous session existing=" << existing
+                  << ", continuing session establishment." << dendl;
+    return reuse_connection(existing, exproto);
+  }
+
+  if (exproto->state == READY || exproto->state == STANDBY) {
+    ldout(cct, 1) << __func__ << " existing=" << existing
+                  << " is READY/STANDBY, let's reuse it" << dendl;
+    return reuse_connection(existing, exproto);
+  }
+
+  // Looks like a connection race: server and client are both connecting to
+  // each other at the same time.
+  if (connection->peer_addrs->msgr2_addr() <
+          messenger->get_myaddrs().msgr2_addr() ||
+      existing->policy.server) {
+    // this connection wins
+    ldout(cct, 1) << __func__
+                  << " connection race detected, replacing existing="
+                  << existing << " socket by this connection's socket" << dendl;
+    return reuse_connection(existing, exproto);
+  } else {
+    // the existing connection wins
+    ldout(cct, 1)
+        << __func__
+        << " connection race detected, this connection loses to existing="
+        << existing << dendl;
+    ceph_assert(connection->peer_addrs->msgr2_addr() >
+                messenger->get_myaddrs().msgr2_addr());
+
+    // make sure we follow through with opening the existing
+    // connection (if it isn't yet open) since we know the peer
+    // has something to send to us.
+    existing->send_keepalive();
+    auto wait = WaitFrame::Encode();
+    return WRITE(wait, "wait", read_frame);
+  }
+}
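+
+// Both race resolutions above (the reconnect race in handle_reconnect() and
+// the connect race here) reduce to one deterministic rule, so the two ends
+// always pick the same winner. Distilled into a pure function; illustrative
+// sketch only -- the real code compares entity_addr_t values, not strings:
+static inline bool sketch_incoming_connection_wins(
+    const std::string &peer_addr, const std::string &my_addr,
+    bool existing_policy_server) {
+  // the side with the "smaller" address wins with its outgoing connection;
+  // if the existing session is to a pure server (which never connects out),
+  // the incoming connection always wins
+  return peer_addr < my_addr || existing_policy_server;
+}
+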
+CtPtr ProtocolV2::reuse_connection(AsyncConnectionRef existing,
+                                   ProtocolV2 *exproto) {
+  ldout(cct, 20) << __func__ << " existing=" << existing
+                 << " reconnect=" << reconnecting << dendl;
+
+  connection->inject_delay();
+
+  std::lock_guard<std::mutex> l(existing->write_lock);
+
+  connection->center->delete_file_event(connection->cs.fd(),
+                                        EVENT_READABLE | EVENT_WRITABLE);
+
+  if (existing->delay_state) {
+    existing->delay_state->flush();
+    ceph_assert(!connection->delay_state);
+  }
+  exproto->reset_recv_state();
+  exproto->pre_auth.enabled = false;
+
+  if (!reconnecting) {
+    exproto->peer_supported_features = peer_supported_features;
+    exproto->tx_frame_asm.set_is_rev1(tx_frame_asm.get_is_rev1());
+    exproto->rx_frame_asm.set_is_rev1(rx_frame_asm.get_is_rev1());
+
+    exproto->client_cookie = client_cookie;
+    exproto->peer_name = peer_name;
+    exproto->connection_features = connection_features;
+    existing->set_features(connection_features);
+  }
+  exproto->peer_global_seq = peer_global_seq;
+
+  ceph_assert(connection->center->in_thread());
+  auto temp_cs = std::move(connection->cs);
+  EventCenter *new_center = connection->center;
+  Worker *new_worker = connection->worker;
+  // we can steal the session_stream_handlers under the assumption
+  // this happens in the event center's thread as there should be
+  // no user outside its boundaries (similarly to e.g. outgoing_bl).
+  auto temp_stream_handlers = std::move(session_stream_handlers);
+  exproto->auth_meta = auth_meta;
+
+  ldout(messenger->cct, 5) << __func__ << " stop myself to swap existing"
+                           << dendl;
+
+  // avoid _stop shutting down the replacing socket;
+  // queue a reset on the new connection, which we're dumping for the old
+  stop();
+
+  connection->dispatch_queue->queue_reset(connection);
+
+  exproto->can_write = false;
+  exproto->write_in_progress = false;
+  exproto->reconnecting = reconnecting;
+  exproto->replacing = true;
+  existing->state_offset = 0;
+  // avoid the previous thread modifying events
+  exproto->state = NONE;
+  existing->state = AsyncConnection::STATE_NONE;
+  // discard the existing prefetch buffer in `recv_buf`
+  existing->recv_start = existing->recv_end = 0;
+  // there shouldn't be any buffered data left
+  ceph_assert(connection->recv_start == connection->recv_end);
+
+  auto deactivate_existing = std::bind(
+      [existing,
+       new_worker,
+       new_center,
+       exproto,
+       temp_stream_handlers=std::move(temp_stream_handlers)
+      ](ConnectedSocket &cs) mutable {
+        // we need to delete the time event in the original thread
+        {
+          std::lock_guard<std::mutex> l(existing->lock);
+          existing->write_lock.lock();
+          exproto->requeue_sent();
+          // XXX: do we really need the locking for `outgoing_bl`? There is
+          // a comment just above its definition saying "lockfree, only used
+          // in own thread". I'm taking the locked path just in case.
+          // From a performance point of view it should be fine – this happens
+          // far away from hot paths.
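+          // dropping partially written bytes is safe here: requeue_sent()
+          // above has already pushed the affected messages back onto the
+          // out queue, so they get re-framed on the replacement socket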
+      existing->outgoing_bl.clear();
+      existing->open_write = false;
+      exproto->session_stream_handlers = std::move(temp_stream_handlers);
+      existing->write_lock.unlock();
+      if (exproto->state == NONE) {
+        existing->shutdown_socket();
+        existing->cs = std::move(cs);
+        existing->worker->references--;
+        new_worker->references++;
+        existing->logger = new_worker->get_perf_counter();
+        existing->worker = new_worker;
+        existing->center = new_center;
+        if (existing->delay_state)
+          existing->delay_state->set_center(new_center);
+      } else if (exproto->state == CLOSED) {
+        auto back_to_close = std::bind(
+            [](ConnectedSocket &cs) mutable { cs.close(); }, std::move(cs));
+        new_center->submit_to(new_center->get_id(),
+                              std::move(back_to_close), true);
+        return;
+      } else {
+        ceph_abort();
+      }
+    }
+
+    // Before we change existing->center, events may already be queued in
+    // that center. If we then marked `existing` down, cleanup would run
+    // in another thread while a previously queued event still referenced
+    // the connection, causing a segfault.
+    auto transfer_existing = [existing, exproto]() mutable {
+      std::lock_guard<std::mutex> l(existing->lock);
+      if (exproto->state == CLOSED) return;
+      ceph_assert(exproto->state == NONE);
+
+      exproto->state = SESSION_ACCEPTING;
+      // we have called shutdown_socket above
+      ceph_assert(existing->last_tick_id == 0);
+      // restart the timer since we are going to re-build the connection
+      existing->last_connect_started = ceph::coarse_mono_clock::now();
+      existing->last_tick_id = existing->center->create_time_event(
+          existing->connect_timeout_us, existing->tick_handler);
+      existing->state = AsyncConnection::STATE_CONNECTION_ESTABLISHED;
+      existing->center->create_file_event(existing->cs.fd(), EVENT_READABLE,
+                                          existing->read_handler);
+      if (!exproto->reconnecting) {
+        exproto->run_continuation(exproto->send_server_ident());
+      } else {
+        exproto->run_continuation(exproto->send_reconnect_ok());
+      }
+    };
+    if (existing->center->in_thread())
+      transfer_existing();
+    else
+      existing->center->submit_to(existing->center->get_id(),
+                                  std::move(transfer_existing), true);
+  },
+  std::move(temp_cs));
+
+  existing->center->submit_to(existing->center->get_id(),
+                              std::move(deactivate_existing), true);
+  return nullptr;
+}
+
+CtPtr ProtocolV2::send_server_ident() {
+  ldout(cct, 20) << __func__ << dendl;
+
+  // this is required for the case when this connection is being replaced
+  out_seq = discard_requeued_up_to(out_seq, 0);
+  in_seq = 0;
+
+  if (!connection->policy.lossy) {
+    server_cookie = ceph::util::generate_random_number<uint64_t>(1, -1ll);
+  }
+
+  uint64_t flags = 0;
+  if (connection->policy.lossy) {
+    flags = flags | CEPH_MSG_CONNECT_LOSSY;
+  }
+
+  uint64_t gs = messenger->get_global_seq();
+  auto server_ident = ServerIdentFrame::Encode(
+      messenger->get_myaddrs(),
+      messenger->get_myname().num(),
+      gs,
+      connection->policy.features_supported,
+      connection->policy.features_required | msgr2_required,
+      flags,
+      server_cookie);
+
+  ldout(cct, 5) << __func__ << " sending identification:"
+                << " addrs=" << messenger->get_myaddrs()
+                << " gid=" << messenger->get_myname().num()
+                << " global_seq=" << gs << " features_supported=" << std::hex
+                << connection->policy.features_supported
+                << " features_required="
+                << (connection->policy.features_required | msgr2_required)
+                << " flags=" << flags
+                << " cookie=" << server_cookie << std::dec << dendl;
+
+  connection->lock.unlock();
+  // Because "replacing" prevents other connections from preempting this
+  // addr, it is safe here not to acquire the
Connection's lock + ssize_t r = messenger->accept_conn(connection); + + connection->inject_delay(); + + connection->lock.lock(); + + if (r < 0) { + ldout(cct, 1) << __func__ << " existing race replacing process for addr = " + << connection->peer_addrs->msgr2_addr() + << " just fail later one(this)" << dendl; + connection->inject_delay(); + return _fault(); + } + if (state != SESSION_ACCEPTING) { + ldout(cct, 1) << __func__ + << " state changed while accept_conn, it must be mark_down" + << dendl; + ceph_assert(state == CLOSED || state == NONE); + messenger->unregister_conn(connection); + connection->inject_delay(); + return _fault(); + } + + connection->set_features(connection_features); + + // notify + connection->dispatch_queue->queue_accept(connection); + messenger->ms_deliver_handle_fast_accept(connection); + + INTERCEPT(12); + + return WRITE(server_ident, "server ident", server_ready); +} + +CtPtr ProtocolV2::server_ready() { + ldout(cct, 20) << __func__ << dendl; + + if (connection->delay_state) { + ceph_assert(connection->delay_state->ready()); + } + + return ready(); +} + +CtPtr ProtocolV2::send_reconnect_ok() { + ldout(cct, 20) << __func__ << dendl; + + out_seq = discard_requeued_up_to(out_seq, message_seq); + + uint64_t ms = in_seq; + auto reconnect_ok = ReconnectOkFrame::Encode(ms); + + ldout(cct, 5) << __func__ << " sending reconnect_ok: msg_seq=" << ms << dendl; + + connection->lock.unlock(); + // Because "replacing" will prevent other connections preempt this addr, + // it's safe that here we don't acquire Connection's lock + ssize_t r = messenger->accept_conn(connection); + + connection->inject_delay(); + + connection->lock.lock(); + + if (r < 0) { + ldout(cct, 1) << __func__ << " existing race replacing process for addr = " + << connection->peer_addrs->msgr2_addr() + << " just fail later one(this)" << dendl; + connection->inject_delay(); + return _fault(); + } + if (state != SESSION_ACCEPTING) { + ldout(cct, 1) << __func__ + << " state changed while accept_conn, it must be mark_down" + << dendl; + ceph_assert(state == CLOSED || state == NONE); + messenger->unregister_conn(connection); + connection->inject_delay(); + return _fault(); + } + + // notify + connection->dispatch_queue->queue_accept(connection); + messenger->ms_deliver_handle_fast_accept(connection); + + INTERCEPT(14); + + return WRITE(reconnect_ok, "reconnect ok", server_ready); +} diff --git a/src/msg/async/ProtocolV2.h b/src/msg/async/ProtocolV2.h new file mode 100644 index 00000000..4941cea5 --- /dev/null +++ b/src/msg/async/ProtocolV2.h @@ -0,0 +1,259 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef _MSG_ASYNC_PROTOCOL_V2_ +#define _MSG_ASYNC_PROTOCOL_V2_ + +#include "Protocol.h" +#include "crypto_onwire.h" +#include "frames_v2.h" + +class ProtocolV2 : public Protocol { +private: + enum State { + NONE, + START_CONNECT, + BANNER_CONNECTING, + HELLO_CONNECTING, + AUTH_CONNECTING, + AUTH_CONNECTING_SIGN, + SESSION_CONNECTING, + SESSION_RECONNECTING, + START_ACCEPT, + BANNER_ACCEPTING, + HELLO_ACCEPTING, + AUTH_ACCEPTING, + AUTH_ACCEPTING_MORE, + AUTH_ACCEPTING_SIGN, + SESSION_ACCEPTING, + READY, + THROTTLE_MESSAGE, + THROTTLE_BYTES, + THROTTLE_DISPATCH_QUEUE, + THROTTLE_DONE, + READ_MESSAGE_COMPLETE, + STANDBY, + WAIT, + CLOSED + }; + + static const char *get_state_name(int state) { + const char *const statenames[] = {"NONE", + "START_CONNECT", + "BANNER_CONNECTING", + "HELLO_CONNECTING", + "AUTH_CONNECTING", + "AUTH_CONNECTING_SIGN", + 
"SESSION_CONNECTING", + "SESSION_RECONNECTING", + "START_ACCEPT", + "BANNER_ACCEPTING", + "HELLO_ACCEPTING", + "AUTH_ACCEPTING", + "AUTH_ACCEPTING_MORE", + "AUTH_ACCEPTING_SIGN", + "SESSION_ACCEPTING", + "READY", + "THROTTLE_MESSAGE", + "THROTTLE_BYTES", + "THROTTLE_DISPATCH_QUEUE", + "THROTTLE_DONE", + "READ_MESSAGE_COMPLETE", + "STANDBY", + "WAIT", + "CLOSED"}; + return statenames[state]; + } + + // TODO: move into auth_meta? + ceph::crypto::onwire::rxtx_t session_stream_handlers; + + entity_name_t peer_name; + State state; + uint64_t peer_supported_features; // CEPH_MSGR2_FEATURE_* + + uint64_t client_cookie; + uint64_t server_cookie; + uint64_t global_seq; + uint64_t connect_seq; + uint64_t peer_global_seq; + uint64_t message_seq; + bool reconnecting; + bool replacing; + bool can_write; + struct out_queue_entry_t { + bool is_prepared {false}; + Message* m {nullptr}; + }; + std::map<int, std::list<out_queue_entry_t>> out_queue; + std::list<Message *> sent; + std::atomic<uint64_t> out_seq{0}; + std::atomic<uint64_t> in_seq{0}; + std::atomic<uint64_t> ack_left{0}; + + using ProtFuncPtr = void (ProtocolV2::*)(); + Ct<ProtocolV2> *bannerExchangeCallback; + + ceph::msgr::v2::FrameAssembler tx_frame_asm; + ceph::msgr::v2::FrameAssembler rx_frame_asm; + + ceph::bufferlist rx_preamble; + ceph::bufferlist rx_epilogue; + ceph::msgr::v2::segment_bls_t rx_segments_data; + ceph::msgr::v2::Tag next_tag; + utime_t backoff; // backoff time + utime_t recv_stamp; + utime_t throttle_stamp; + + struct { + ceph::bufferlist rxbuf; + ceph::bufferlist txbuf; + bool enabled {true}; + } pre_auth; + + bool keepalive; + bool write_in_progress = false; + + ostream &_conn_prefix(std::ostream *_dout); + void run_continuation(Ct<ProtocolV2> *pcontinuation); + void run_continuation(Ct<ProtocolV2> &continuation); + + Ct<ProtocolV2> *read(CONTINUATION_RXBPTR_TYPE<ProtocolV2> &next, + rx_buffer_t&& buffer); + template <class F> + Ct<ProtocolV2> *write(const std::string &desc, + CONTINUATION_TYPE<ProtocolV2> &next, + F &frame); + Ct<ProtocolV2> *write(const std::string &desc, + CONTINUATION_TYPE<ProtocolV2> &next, + bufferlist &buffer); + + template <class F> + bool append_frame(F& frame); + + void requeue_sent(); + uint64_t discard_requeued_up_to(uint64_t out_seq, uint64_t seq); + void reset_recv_state(); + void reset_security(); + void reset_throttle(); + Ct<ProtocolV2> *_fault(); + void discard_out_queue(); + void reset_session(); + void prepare_send_message(uint64_t features, Message *m); + out_queue_entry_t _get_next_outgoing(); + ssize_t write_message(Message *m, bool more); + void handle_message_ack(uint64_t seq); + + CONTINUATION_DECL(ProtocolV2, _wait_for_peer_banner); + READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, _handle_peer_banner); + READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, _handle_peer_banner_payload); + + Ct<ProtocolV2> *_banner_exchange(Ct<ProtocolV2> &callback); + Ct<ProtocolV2> *_wait_for_peer_banner(); + Ct<ProtocolV2> *_handle_peer_banner(rx_buffer_t &&buffer, int r); + Ct<ProtocolV2> *_handle_peer_banner_payload(rx_buffer_t &&buffer, int r); + Ct<ProtocolV2> *handle_hello(ceph::bufferlist &payload); + + CONTINUATION_DECL(ProtocolV2, read_frame); + CONTINUATION_DECL(ProtocolV2, finish_auth); + READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, handle_read_frame_preamble_main); + READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, handle_read_frame_segment); + READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, handle_read_frame_epilogue_main); + CONTINUATION_DECL(ProtocolV2, throttle_message); + 
CONTINUATION_DECL(ProtocolV2, throttle_bytes); + CONTINUATION_DECL(ProtocolV2, throttle_dispatch_queue); + + Ct<ProtocolV2> *read_frame(); + Ct<ProtocolV2> *finish_auth(); + Ct<ProtocolV2> *finish_client_auth(); + Ct<ProtocolV2> *handle_read_frame_preamble_main(rx_buffer_t &&buffer, int r); + Ct<ProtocolV2> *read_frame_segment(); + Ct<ProtocolV2> *handle_read_frame_segment(rx_buffer_t &&rx_buffer, int r); + Ct<ProtocolV2> *_handle_read_frame_segment(); + Ct<ProtocolV2> *handle_read_frame_epilogue_main(rx_buffer_t &&buffer, int r); + Ct<ProtocolV2> *_handle_read_frame_epilogue_main(); + Ct<ProtocolV2> *handle_read_frame_dispatch(); + Ct<ProtocolV2> *handle_frame_payload(); + + Ct<ProtocolV2> *ready(); + + Ct<ProtocolV2> *handle_message(); + Ct<ProtocolV2> *throttle_message(); + Ct<ProtocolV2> *throttle_bytes(); + Ct<ProtocolV2> *throttle_dispatch_queue(); + Ct<ProtocolV2> *read_message_data_prepare(); + + Ct<ProtocolV2> *handle_keepalive2(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_keepalive2_ack(ceph::bufferlist &payload); + + Ct<ProtocolV2> *handle_message_ack(ceph::bufferlist &payload); + +public: + uint64_t connection_features; + + ProtocolV2(AsyncConnection *connection); + virtual ~ProtocolV2(); + + virtual void connect() override; + virtual void accept() override; + virtual bool is_connected() override; + virtual void stop() override; + virtual void fault() override; + virtual void send_message(Message *m) override; + virtual void send_keepalive() override; + + virtual void read_event() override; + virtual void write_event() override; + virtual bool is_queued() override; + +private: + // Client Protocol + CONTINUATION_DECL(ProtocolV2, start_client_banner_exchange); + CONTINUATION_DECL(ProtocolV2, post_client_banner_exchange); + + Ct<ProtocolV2> *start_client_banner_exchange(); + Ct<ProtocolV2> *post_client_banner_exchange(); + inline Ct<ProtocolV2> *send_auth_request() { + std::vector<uint32_t> empty; + return send_auth_request(empty); + } + Ct<ProtocolV2> *send_auth_request(std::vector<uint32_t> &allowed_methods); + Ct<ProtocolV2> *handle_auth_bad_method(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_auth_reply_more(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_auth_done(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_auth_signature(ceph::bufferlist &payload); + Ct<ProtocolV2> *send_client_ident(); + Ct<ProtocolV2> *send_reconnect(); + Ct<ProtocolV2> *handle_ident_missing_features(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_session_reset(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_session_retry(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_session_retry_global(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_wait(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_reconnect_ok(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_server_ident(ceph::bufferlist &payload); + + // Server Protocol + CONTINUATION_DECL(ProtocolV2, start_server_banner_exchange); + CONTINUATION_DECL(ProtocolV2, post_server_banner_exchange); + CONTINUATION_DECL(ProtocolV2, server_ready); + + Ct<ProtocolV2> *start_server_banner_exchange(); + Ct<ProtocolV2> *post_server_banner_exchange(); + Ct<ProtocolV2> *handle_auth_request(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_auth_request_more(ceph::bufferlist &payload); + Ct<ProtocolV2> *_handle_auth_request(bufferlist& auth_payload, bool more); + Ct<ProtocolV2> *_auth_bad_method(int r); + Ct<ProtocolV2> *handle_client_ident(ceph::bufferlist &payload); + Ct<ProtocolV2> 
*handle_ident_missing_features_write(int r); + Ct<ProtocolV2> *handle_reconnect(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_existing_connection(AsyncConnectionRef existing); + Ct<ProtocolV2> *reuse_connection(AsyncConnectionRef existing, + ProtocolV2 *exproto); + Ct<ProtocolV2> *send_server_ident(); + Ct<ProtocolV2> *send_reconnect_ok(); + Ct<ProtocolV2> *server_ready(); + + size_t get_current_msg_size() const; +}; + +#endif /* _MSG_ASYNC_PROTOCOL_V2_ */ diff --git a/src/msg/async/Stack.cc b/src/msg/async/Stack.cc new file mode 100644 index 00000000..8976c3cc --- /dev/null +++ b/src/msg/async/Stack.cc @@ -0,0 +1,217 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <mutex> + +#include "include/compat.h" +#include "common/Cond.h" +#include "common/errno.h" +#include "PosixStack.h" +#ifdef HAVE_RDMA +#include "rdma/RDMAStack.h" +#endif +#ifdef HAVE_DPDK +#include "dpdk/DPDKStack.h" +#endif + +#include "common/dout.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix *_dout << "stack " + +std::function<void ()> NetworkStack::add_thread(unsigned i) +{ + Worker *w = workers[i]; + return [this, w]() { + char tp_name[16]; + sprintf(tp_name, "msgr-worker-%u", w->id); + ceph_pthread_setname(pthread_self(), tp_name); + const unsigned EventMaxWaitUs = 30000000; + w->center.set_owner(); + ldout(cct, 10) << __func__ << " starting" << dendl; + w->initialize(); + w->init_done(); + while (!w->done) { + ldout(cct, 30) << __func__ << " calling event process" << dendl; + + ceph::timespan dur; + int r = w->center.process_events(EventMaxWaitUs, &dur); + if (r < 0) { + ldout(cct, 20) << __func__ << " process events failed: " + << cpp_strerror(errno) << dendl; + // TODO do something? + } + w->perf_logger->tinc(l_msgr_running_total_time, dur); + } + w->reset(); + w->destroy(); + }; +} + +std::shared_ptr<NetworkStack> NetworkStack::create(CephContext *c, const string &t) +{ + if (t == "posix") + return std::make_shared<PosixNetworkStack>(c, t); +#ifdef HAVE_RDMA + else if (t == "rdma") + return std::make_shared<RDMAStack>(c, t); +#endif +#ifdef HAVE_DPDK + else if (t == "dpdk") + return std::make_shared<DPDKStack>(c, t); +#endif + + lderr(c) << __func__ << " ms_async_transport_type " << t << + " is not supported! " << dendl; + ceph_abort(); + return nullptr; +} + +Worker* NetworkStack::create_worker(CephContext *c, const string &type, unsigned i) +{ + if (type == "posix") + return new PosixWorker(c, i); +#ifdef HAVE_RDMA + else if (type == "rdma") + return new RDMAWorker(c, i); +#endif +#ifdef HAVE_DPDK + else if (type == "dpdk") + return new DPDKWorker(c, i); +#endif + + lderr(c) << __func__ << " ms_async_transport_type " << type << + " is not supported! 
" << dendl; + ceph_abort(); + return nullptr; +} + +NetworkStack::NetworkStack(CephContext *c, const string &t): type(t), started(false), cct(c) +{ + ceph_assert(cct->_conf->ms_async_op_threads > 0); + + const int InitEventNumber = 5000; + num_workers = cct->_conf->ms_async_op_threads; + if (num_workers >= EventCenter::MAX_EVENTCENTER) { + ldout(cct, 0) << __func__ << " max thread limit is " + << EventCenter::MAX_EVENTCENTER << ", switching to this now. " + << "Higher thread values are unnecessary and currently unsupported." + << dendl; + num_workers = EventCenter::MAX_EVENTCENTER; + } + + for (unsigned i = 0; i < num_workers; ++i) { + Worker *w = create_worker(cct, type, i); + w->center.init(InitEventNumber, i, type); + workers.push_back(w); + } +} + +void NetworkStack::start() +{ + std::unique_lock<decltype(pool_spin)> lk(pool_spin); + + if (started) { + return ; + } + + for (unsigned i = 0; i < num_workers; ++i) { + if (workers[i]->is_init()) + continue; + std::function<void ()> thread = add_thread(i); + spawn_worker(i, std::move(thread)); + } + started = true; + lk.unlock(); + + for (unsigned i = 0; i < num_workers; ++i) + workers[i]->wait_for_init(); +} + +Worker* NetworkStack::get_worker() +{ + ldout(cct, 30) << __func__ << dendl; + + // start with some reasonably large number + unsigned min_load = std::numeric_limits<int>::max(); + Worker* current_best = nullptr; + + pool_spin.lock(); + // find worker with least references + // tempting case is returning on references == 0, but in reality + // this will happen so rarely that there's no need for special case. + for (unsigned i = 0; i < num_workers; ++i) { + unsigned worker_load = workers[i]->references.load(); + if (worker_load < min_load) { + current_best = workers[i]; + min_load = worker_load; + } + } + + pool_spin.unlock(); + ceph_assert(current_best); + ++current_best->references; + return current_best; +} + +void NetworkStack::stop() +{ + std::lock_guard<decltype(pool_spin)> lk(pool_spin); + for (unsigned i = 0; i < num_workers; ++i) { + workers[i]->done = true; + workers[i]->center.wakeup(); + join_worker(i); + } + started = false; +} + +class C_drain : public EventCallback { + Mutex drain_lock; + Cond drain_cond; + unsigned drain_count; + + public: + explicit C_drain(size_t c) + : drain_lock("C_drain::drain_lock"), + drain_count(c) {} + void do_request(uint64_t id) override { + Mutex::Locker l(drain_lock); + drain_count--; + if (drain_count == 0) drain_cond.Signal(); + } + void wait() { + Mutex::Locker l(drain_lock); + while (drain_count) + drain_cond.Wait(drain_lock); + } +}; + +void NetworkStack::drain() +{ + ldout(cct, 30) << __func__ << " started." << dendl; + pthread_t cur = pthread_self(); + pool_spin.lock(); + C_drain drain(num_workers); + for (unsigned i = 0; i < num_workers; ++i) { + ceph_assert(cur != workers[i]->center.get_owner()); + workers[i]->center.dispatch_event_external(EventCallbackRef(&drain)); + } + pool_spin.unlock(); + drain.wait(); + ldout(cct, 30) << __func__ << " end." 
<< dendl; +} diff --git a/src/msg/async/Stack.h b/src/msg/async/Stack.h new file mode 100644 index 00000000..a093dadb --- /dev/null +++ b/src/msg/async/Stack.h @@ -0,0 +1,356 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 XSKY <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MSG_ASYNC_STACK_H +#define CEPH_MSG_ASYNC_STACK_H + +#include "include/spinlock.h" +#include "common/perf_counters.h" +#include "msg/msg_types.h" +#include "msg/async/Event.h" + +class Worker; +class ConnectedSocketImpl { + public: + virtual ~ConnectedSocketImpl() {} + virtual int is_connected() = 0; + virtual ssize_t read(char*, size_t) = 0; + virtual ssize_t zero_copy_read(bufferptr&) = 0; + virtual ssize_t send(bufferlist &bl, bool more) = 0; + virtual void shutdown() = 0; + virtual void close() = 0; + virtual int fd() const = 0; + virtual int socket_fd() const = 0; +}; + +class ConnectedSocket; +struct SocketOptions { + bool nonblock = true; + bool nodelay = true; + int rcbuf_size = 0; + int priority = -1; + entity_addr_t connect_bind_addr; +}; + +/// \cond internal +class ServerSocketImpl { + public: + unsigned addr_type; ///< entity_addr_t::TYPE_* + unsigned addr_slot; ///< position of our addr in myaddrs().v + ServerSocketImpl(unsigned type, unsigned slot) + : addr_type(type), addr_slot(slot) {} + virtual ~ServerSocketImpl() {} + virtual int accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w) = 0; + virtual void abort_accept() = 0; + /// Get file descriptor + virtual int fd() const = 0; +}; +/// \endcond + +/// \addtogroup networking-module +/// @{ + +/// A TCP (or other stream-based protocol) connection. +/// +/// A \c ConnectedSocket represents a full-duplex stream between +/// two endpoints, a local endpoint and a remote endpoint. +class ConnectedSocket { + std::unique_ptr<ConnectedSocketImpl> _csi; + + public: + /// Constructs a \c ConnectedSocket not corresponding to a connection + ConnectedSocket() {}; + /// \cond internal + explicit ConnectedSocket(std::unique_ptr<ConnectedSocketImpl> csi) + : _csi(std::move(csi)) {} + /// \endcond + ~ConnectedSocket() { + if (_csi) + _csi->close(); + } + /// Moves a \c ConnectedSocket object. + ConnectedSocket(ConnectedSocket&& cs) = default; + /// Move-assigns a \c ConnectedSocket object. + ConnectedSocket& operator=(ConnectedSocket&& cs) = default; + + int is_connected() { + return _csi->is_connected(); + } + /// Read the input stream with copy. + /// + /// Copy an object returning data sent from the remote endpoint. + ssize_t read(char* buf, size_t len) { + return _csi->read(buf, len); + } + /// Gets the input stream. + /// + /// Gets an object returning data sent from the remote endpoint. + ssize_t zero_copy_read(bufferptr &data) { + return _csi->zero_copy_read(data); + } + /// Gets the output stream. + /// + /// Gets an object that sends data to the remote endpoint. + ssize_t send(bufferlist &bl, bool more) { + return _csi->send(bl, more); + } + /// Disables output to the socket. + /// + /// Current or future writes that have not been successfully flushed + /// will immediately fail with an error. 
This is useful to abort + /// operations on a socket that is not making progress due to a + /// peer failure. + void shutdown() { + return _csi->shutdown(); + } + /// Disables input from the socket. + /// + /// Current or future reads will immediately fail with an error. + /// This is useful to abort operations on a socket that is not making + /// progress due to a peer failure. + void close() { + _csi->close(); + _csi.reset(); + } + + /// Get file descriptor + int fd() const { + return _csi->fd(); + } + int socket_fd() const { + return _csi->socket_fd(); + } + + explicit operator bool() const { + return _csi.get(); + } +}; +/// @} + +/// \addtogroup networking-module +/// @{ + +/// A listening socket, waiting to accept incoming network connections. +class ServerSocket { + std::unique_ptr<ServerSocketImpl> _ssi; + public: + /// Constructs a \c ServerSocket not corresponding to a connection + ServerSocket() {} + /// \cond internal + explicit ServerSocket(std::unique_ptr<ServerSocketImpl> ssi) + : _ssi(std::move(ssi)) {} + ~ServerSocket() { + if (_ssi) + _ssi->abort_accept(); + } + /// \endcond + /// Moves a \c ServerSocket object. + ServerSocket(ServerSocket&& ss) = default; + /// Move-assigns a \c ServerSocket object. + ServerSocket& operator=(ServerSocket&& cs) = default; + + /// Accepts the next connection to successfully connect to this socket. + /// + /// \Accepts a \ref ConnectedSocket representing the connection, and + /// a \ref entity_addr_t describing the remote endpoint. + int accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w) { + return _ssi->accept(sock, opt, out, w); + } + + /// Stops any \ref accept() in progress. + /// + /// Current and future \ref accept() calls will terminate immediately + /// with an error. 
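+  ///
+  /// A minimal accept-loop sketch for context (names are illustrative;
+  /// `ss` is a bound ServerSocket, `w` a Worker from the stack):
+  ///
+  ///   ConnectedSocket cs;
+  ///   entity_addr_t peer;
+  ///   SocketOptions opts;              // defaults: nonblock + nodelay
+  ///   while (ss.accept(&cs, opts, &peer, w) == 0) {
+  ///     // hand cs off to a connection; abort_accept() breaks the loop
+  ///   }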
+ void abort_accept() { + _ssi->abort_accept(); + _ssi.reset(); + } + + /// Get file descriptor + int fd() const { + return _ssi->fd(); + } + + /// get listen/bind addr + unsigned get_addr_slot() { + return _ssi->addr_slot; + } + + explicit operator bool() const { + return _ssi.get(); + } +}; +/// @} + +class NetworkStack; + +enum { + l_msgr_first = 94000, + l_msgr_recv_messages, + l_msgr_send_messages, + l_msgr_recv_bytes, + l_msgr_send_bytes, + l_msgr_created_connections, + l_msgr_active_connections, + + l_msgr_running_total_time, + l_msgr_running_send_time, + l_msgr_running_recv_time, + l_msgr_running_fast_dispatch_time, + + l_msgr_last, +}; + +class Worker { + std::mutex init_lock; + std::condition_variable init_cond; + bool init = false; + + public: + bool done = false; + + CephContext *cct; + PerfCounters *perf_logger; + unsigned id; + + std::atomic_uint references; + EventCenter center; + + Worker(const Worker&) = delete; + Worker& operator=(const Worker&) = delete; + + Worker(CephContext *c, unsigned i) + : cct(c), perf_logger(NULL), id(i), references(0), center(c) { + char name[128]; + sprintf(name, "AsyncMessenger::Worker-%u", id); + // initialize perf_logger + PerfCountersBuilder plb(cct, name, l_msgr_first, l_msgr_last); + + plb.add_u64_counter(l_msgr_recv_messages, "msgr_recv_messages", "Network received messages"); + plb.add_u64_counter(l_msgr_send_messages, "msgr_send_messages", "Network sent messages"); + plb.add_u64_counter(l_msgr_recv_bytes, "msgr_recv_bytes", "Network received bytes", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_u64_counter(l_msgr_send_bytes, "msgr_send_bytes", "Network sent bytes", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_u64_counter(l_msgr_active_connections, "msgr_active_connections", "Active connection number"); + plb.add_u64_counter(l_msgr_created_connections, "msgr_created_connections", "Created connection number"); + + plb.add_time(l_msgr_running_total_time, "msgr_running_total_time", "The total time of thread running"); + plb.add_time(l_msgr_running_send_time, "msgr_running_send_time", "The total time of message sending"); + plb.add_time(l_msgr_running_recv_time, "msgr_running_recv_time", "The total time of message receiving"); + plb.add_time(l_msgr_running_fast_dispatch_time, "msgr_running_fast_dispatch_time", "The total time of fast dispatch"); + + perf_logger = plb.create_perf_counters(); + cct->get_perfcounters_collection()->add(perf_logger); + } + virtual ~Worker() { + if (perf_logger) { + cct->get_perfcounters_collection()->remove(perf_logger); + delete perf_logger; + } + } + + virtual int listen(entity_addr_t &addr, unsigned addr_slot, + const SocketOptions &opts, ServerSocket *) = 0; + virtual int connect(const entity_addr_t &addr, + const SocketOptions &opts, ConnectedSocket *socket) = 0; + virtual void destroy() {} + + virtual void initialize() {} + PerfCounters *get_perf_counter() { return perf_logger; } + void release_worker() { + int oldref = references.fetch_sub(1); + ceph_assert(oldref > 0); + } + void init_done() { + init_lock.lock(); + init = true; + init_cond.notify_all(); + init_lock.unlock(); + } + bool is_init() { + std::lock_guard<std::mutex> l(init_lock); + return init; + } + void wait_for_init() { + std::unique_lock<std::mutex> l(init_lock); + while (!init) + init_cond.wait(l); + } + void reset() { + init_lock.lock(); + init = false; + init_cond.notify_all(); + init_lock.unlock(); + done = false; + } +}; + +class NetworkStack { + std::string type; + unsigned num_workers = 0; + ceph::spinlock pool_spin; + bool started = false; + 
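+  // Typical lifecycle, as a sketch (assuming a valid CephContext* cct;
+  // see the method declarations below):
+  //
+  //   auto stack = NetworkStack::create(cct, "posix");
+  //   stack->start();                    // spawns the msgr-worker threads
+  //   Worker *w = stack->get_worker();   // least-referenced worker
+  //   // ... w->connect()/w->listen() to obtain sockets ...
+  //   stack->stop();
+  //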
+  std::function<void ()> add_thread(unsigned i);
+
+ protected:
+  CephContext *cct;
+  vector<Worker*> workers;
+
+  explicit NetworkStack(CephContext *c, const string &t);
+ public:
+  NetworkStack(const NetworkStack &) = delete;
+  NetworkStack& operator=(const NetworkStack &) = delete;
+  virtual ~NetworkStack() {
+    for (auto &&w : workers)
+      delete w;
+  }
+
+  static std::shared_ptr<NetworkStack> create(
+      CephContext *c, const string &type);
+
+  static Worker* create_worker(
+      CephContext *c, const string &t, unsigned i);
+  // backends need to override this method if they support zero-copy read
+  virtual bool support_zero_copy_read() const { return false; }
+  // backends need to override this method if they don't support a shared
+  // listen table.
+  // For example, the posix backend has an in-kernel, global listen table:
+  // if one thread binds a port, the other threads see it too. The dpdk
+  // backend instead maintains a per-thread listen table, so each thread
+  // must bind the port itself.
+  virtual bool support_local_listen_table() const { return false; }
+  virtual bool nonblock_connect_need_writable_event() const { return true; }
+
+  void start();
+  void stop();
+  virtual Worker *get_worker();
+  Worker *get_worker(unsigned i) {
+    return workers[i];
+  }
+  void drain();
+  unsigned get_num_worker() const {
+    return num_workers;
+  }
+
+  // direct is used in tests only
+  virtual void spawn_worker(unsigned i, std::function<void ()> &&) = 0;
+  virtual void join_worker(unsigned i) = 0;
+
+  virtual bool is_ready() { return true; };
+  virtual void ready() { };
+};
+
+#endif //CEPH_MSG_ASYNC_STACK_H
diff --git a/src/msg/async/crypto_onwire.cc b/src/msg/async/crypto_onwire.cc
new file mode 100644
index 00000000..4e423406
--- /dev/null
+++ b/src/msg/async/crypto_onwire.cc
@@ -0,0 +1,311 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include <array>
+#include <openssl/evp.h>
+
+#include "crypto_onwire.h"
+
+#include "common/debug.h"
+#include "common/ceph_crypto.h"
+#include "include/types.h"
+
+#define dout_subsys ceph_subsys_ms
+
+namespace ceph::crypto::onwire {
+
+static constexpr const std::size_t AESGCM_KEY_LEN{16};
+static constexpr const std::size_t AESGCM_IV_LEN{12};
+static constexpr const std::size_t AESGCM_TAG_LEN{16};
+static constexpr const std::size_t AESGCM_BLOCK_LEN{16};
+
+struct nonce_t {
+  ceph_le32 fixed;
+  ceph_le64 counter;
+
+  bool operator==(const nonce_t& rhs) const {
+    return !memcmp(this, &rhs, sizeof(*this));
+  }
+} __attribute__((packed));
+static_assert(sizeof(nonce_t) == AESGCM_IV_LEN);
+
+using key_t = std::array<std::uint8_t, AESGCM_KEY_LEN>;
+
+// http://www.mindspring.com/~dmcgrew/gcm-nist-6.pdf
+// https://www.openssl.org/docs/man1.0.2/crypto/EVP_aes_128_gcm.html#GCM-mode
+// https://wiki.openssl.org/index.php/EVP_Authenticated_Encryption_and_Decryption
+// https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pdf
+class AES128GCM_OnWireTxHandler : public ceph::crypto::onwire::TxHandler {
+  CephContext* const cct;
+  std::unique_ptr<EVP_CIPHER_CTX, decltype(&::EVP_CIPHER_CTX_free)> ectx;
+  ceph::bufferlist buffer;
+  nonce_t nonce, initial_nonce;
+  bool used_initial_nonce;
+  bool new_nonce_format;  // 64-bit counter?
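+  // How the nonce evolves between frames, sketched for both wire
+  // revisions (this mirrors the increments done in reset_tx_handler()
+  // below, it is not extra logic):
+  //
+  //   msgr2.0: nonce.fixed   = nonce.fixed   + 1;  // 32-bit, can wrap!
+  //   msgr2.1: nonce.counter = nonce.counter + 1;  // 64-bit counter
+  //
+  // Reusing a (key, nonce) pair under AES-GCM is catastrophic, which is
+  // why the 2.0 scheme tracks initial_nonce and throws "out of nonces"
+  // once the counter comes full circle.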
+ static_assert(sizeof(nonce) == AESGCM_IV_LEN); + +public: + AES128GCM_OnWireTxHandler(CephContext* const cct, + const key_t& key, + const nonce_t& nonce, + bool new_nonce_format) + : cct(cct), + ectx(EVP_CIPHER_CTX_new(), EVP_CIPHER_CTX_free), + nonce(nonce), initial_nonce(nonce), used_initial_nonce(false), + new_nonce_format(new_nonce_format) { + ceph_assert_always(ectx); + ceph_assert_always(key.size() * CHAR_BIT == 128); + + if (1 != EVP_EncryptInit_ex(ectx.get(), EVP_aes_128_gcm(), + nullptr, nullptr, nullptr)) { + throw std::runtime_error("EVP_EncryptInit_ex failed"); + } + + if(1 != EVP_EncryptInit_ex(ectx.get(), nullptr, nullptr, + key.data(), nullptr)) { + throw std::runtime_error("EVP_EncryptInit_ex failed"); + } + } + + ~AES128GCM_OnWireTxHandler() override { + ::ceph::crypto::zeroize_for_security(&nonce, sizeof(nonce)); + ::ceph::crypto::zeroize_for_security(&initial_nonce, sizeof(initial_nonce)); + } + + void reset_tx_handler(const uint32_t* first, const uint32_t* last) override; + + void authenticated_encrypt_update(const ceph::bufferlist& plaintext) override; + ceph::bufferlist authenticated_encrypt_final() override; +}; + +void AES128GCM_OnWireTxHandler::reset_tx_handler(const uint32_t* first, + const uint32_t* last) +{ + if (nonce == initial_nonce) { + if (used_initial_nonce) { + throw ceph::crypto::onwire::TxHandlerError("out of nonces"); + } + used_initial_nonce = true; + } + + if(1 != EVP_EncryptInit_ex(ectx.get(), nullptr, nullptr, nullptr, + reinterpret_cast<const unsigned char*>(&nonce))) { + throw std::runtime_error("EVP_EncryptInit_ex failed"); + } + + ceph_assert(buffer.get_append_buffer_unused_tail_length() == 0); + buffer.reserve(std::accumulate(first, last, AESGCM_TAG_LEN)); + + if (!new_nonce_format) { + // msgr2.0: 32-bit counter followed by 64-bit fixed field, + // susceptible to overflow! 
+ nonce.fixed = nonce.fixed + 1; + } else { + nonce.counter = nonce.counter + 1; + } +} + +void AES128GCM_OnWireTxHandler::authenticated_encrypt_update( + const ceph::bufferlist& plaintext) +{ + ceph_assert(buffer.get_append_buffer_unused_tail_length() >= + plaintext.length()); + auto filler = buffer.append_hole(plaintext.length()); + + for (const auto& plainbuf : plaintext.buffers()) { + int update_len = 0; + + if(1 != EVP_EncryptUpdate(ectx.get(), + reinterpret_cast<unsigned char*>(filler.c_str()), + &update_len, + reinterpret_cast<const unsigned char*>(plainbuf.c_str()), + plainbuf.length())) { + throw std::runtime_error("EVP_EncryptUpdate failed"); + } + ceph_assert_always(update_len >= 0); + ceph_assert(static_cast<unsigned>(update_len) == plainbuf.length()); + filler.advance(update_len); + } + + ldout(cct, 15) << __func__ + << " plaintext.length()=" << plaintext.length() + << " buffer.length()=" << buffer.length() + << dendl; +} + +ceph::bufferlist AES128GCM_OnWireTxHandler::authenticated_encrypt_final() +{ + int final_len = 0; + ceph_assert(buffer.get_append_buffer_unused_tail_length() == + AESGCM_BLOCK_LEN); + auto filler = buffer.append_hole(AESGCM_BLOCK_LEN); + if(1 != EVP_EncryptFinal_ex(ectx.get(), + reinterpret_cast<unsigned char*>(filler.c_str()), + &final_len)) { + throw std::runtime_error("EVP_EncryptFinal_ex failed"); + } + ceph_assert_always(final_len == 0); + + static_assert(AESGCM_BLOCK_LEN == AESGCM_TAG_LEN); + if(1 != EVP_CIPHER_CTX_ctrl(ectx.get(), + EVP_CTRL_GCM_GET_TAG, AESGCM_TAG_LEN, + filler.c_str())) { + throw std::runtime_error("EVP_CIPHER_CTX_ctrl failed"); + } + + ldout(cct, 15) << __func__ + << " buffer.length()=" << buffer.length() + << " final_len=" << final_len + << dendl; + return std::move(buffer); +} + +// RX PART +class AES128GCM_OnWireRxHandler : public ceph::crypto::onwire::RxHandler { + CephContext* const cct; + std::unique_ptr<EVP_CIPHER_CTX, decltype(&::EVP_CIPHER_CTX_free)> ectx; + nonce_t nonce; + bool new_nonce_format; // 64-bit counter? + static_assert(sizeof(nonce) == AESGCM_IV_LEN); + +public: + AES128GCM_OnWireRxHandler(CephContext* const cct, + const key_t& key, + const nonce_t& nonce, + bool new_nonce_format) + : cct(cct), + ectx(EVP_CIPHER_CTX_new(), EVP_CIPHER_CTX_free), + nonce(nonce), new_nonce_format(new_nonce_format) { + ceph_assert_always(ectx); + ceph_assert_always(key.size() * CHAR_BIT == 128); + + if (1 != EVP_DecryptInit_ex(ectx.get(), EVP_aes_128_gcm(), + nullptr, nullptr, nullptr)) { + throw std::runtime_error("EVP_DecryptInit_ex failed"); + } + + if(1 != EVP_DecryptInit_ex(ectx.get(), nullptr, nullptr, + key.data(), nullptr)) { + throw std::runtime_error("EVP_DecryptInit_ex failed"); + } + } + + ~AES128GCM_OnWireRxHandler() override { + ::ceph::crypto::zeroize_for_security(&nonce, sizeof(nonce)); + } + + std::uint32_t get_extra_size_at_final() override { + return AESGCM_TAG_LEN; + } + void reset_rx_handler() override; + void authenticated_decrypt_update(ceph::bufferlist& bl) override; + void authenticated_decrypt_update_final(ceph::bufferlist& bl) override; +}; + +void AES128GCM_OnWireRxHandler::reset_rx_handler() +{ + if(1 != EVP_DecryptInit_ex(ectx.get(), nullptr, nullptr, nullptr, + reinterpret_cast<const unsigned char*>(&nonce))) { + throw std::runtime_error("EVP_DecryptInit_ex failed"); + } + + if (!new_nonce_format) { + // msgr2.0: 32-bit counter followed by 64-bit fixed field, + // susceptible to overflow! 
+    nonce.fixed = nonce.fixed + 1;
+  } else {
+    nonce.counter = nonce.counter + 1;
+  }
+}
+
+void AES128GCM_OnWireRxHandler::authenticated_decrypt_update(
+  ceph::bufferlist& bl)
+{
+  // discard cached crcs as we will be writing through c_str()
+  bl.invalidate_crc();
+  for (auto& buf : bl.buffers()) {
+    auto p = reinterpret_cast<unsigned char*>(const_cast<char*>(buf.c_str()));
+    int update_len = 0;
+
+    if (1 != EVP_DecryptUpdate(ectx.get(), p, &update_len, p, buf.length())) {
+      throw std::runtime_error("EVP_DecryptUpdate failed");
+    }
+    ceph_assert_always(update_len >= 0);
+    ceph_assert(static_cast<unsigned>(update_len) == buf.length());
+  }
+}
+
+void AES128GCM_OnWireRxHandler::authenticated_decrypt_update_final(
+  ceph::bufferlist& bl)
+{
+  unsigned orig_len = bl.length();
+  ceph_assert(orig_len >= AESGCM_TAG_LEN);
+
+  // Decrypt the optional data. The caller is obliged to provide at least
+  // the auth tag (signature) but may supply ciphertext as well; the
+  // update and final steps are handled together here.
+  ceph::bufferlist auth_tag;
+  bl.splice(orig_len - AESGCM_TAG_LEN, AESGCM_TAG_LEN, &auth_tag);
+  if (bl.length() > 0) {
+    authenticated_decrypt_update(bl);
+  }
+
+  // we need to ensure the tag is stored in contiguous memory.
+  if (1 != EVP_CIPHER_CTX_ctrl(ectx.get(), EVP_CTRL_GCM_SET_TAG,
+                               AESGCM_TAG_LEN, auth_tag.c_str())) {
+    throw std::runtime_error("EVP_CIPHER_CTX_ctrl failed");
+  }
+
+  // No bytes should be appended here; this call solely authenticates the
+  // message.
+  {
+    int final_len = 0;
+    if (0 >= EVP_DecryptFinal_ex(ectx.get(), nullptr, &final_len)) {
+      throw MsgAuthError();
+    }
+    ceph_assert_always(final_len == 0);
+    ceph_assert(bl.length() + AESGCM_TAG_LEN == orig_len);
+  }
+}
+
+ceph::crypto::onwire::rxtx_t ceph::crypto::onwire::rxtx_t::create_handler_pair(
+  CephContext* cct,
+  const AuthConnectionMeta& auth_meta,
+  bool new_nonce_format,
+  bool crossed)
+{
+  if (auth_meta.is_mode_secure()) {
+    ceph_assert_always(auth_meta.connection_secret.length() >= \
+      sizeof(key_t) + 2 * sizeof(nonce_t));
+    const char* secbuf = auth_meta.connection_secret.c_str();
+
+    key_t key;
+    {
+      ::memcpy(key.data(), secbuf, sizeof(key));
+      secbuf += sizeof(key);
+    }
+
+    nonce_t rx_nonce;
+    {
+      ::memcpy(&rx_nonce, secbuf, sizeof(rx_nonce));
+      secbuf += sizeof(rx_nonce);
+    }
+
+    nonce_t tx_nonce;
+    {
+      ::memcpy(&tx_nonce, secbuf, sizeof(tx_nonce));
+      secbuf += sizeof(tx_nonce);
+    }
+
+    return {
+      std::make_unique<AES128GCM_OnWireRxHandler>(
+        cct, key, crossed ? tx_nonce : rx_nonce, new_nonce_format),
+      std::make_unique<AES128GCM_OnWireTxHandler>(
+        cct, key, crossed ? rx_nonce : tx_nonce, new_nonce_format)
+    };
+  } else {
+    return { nullptr, nullptr };
+  }
+}
+
+} // namespace ceph::crypto::onwire
diff --git a/src/msg/async/crypto_onwire.h b/src/msg/async/crypto_onwire.h
new file mode 100644
index 00000000..55f75508
--- /dev/null
+++ b/src/msg/async/crypto_onwire.h
@@ -0,0 +1,130 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2009 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef CEPH_CRYPTO_ONWIRE_H
+#define CEPH_CRYPTO_ONWIRE_H
+
+#include <cstdint>
+#include <memory>
+
+#include "auth/Auth.h"
+#include "include/buffer.h"
+
+namespace ceph::math {
+
+// TODO
+template <typename T>
+class always_aligned_t {
+  T val;
+
+  template <class... Args>
+  always_aligned_t(Args&&... args)
+    : val(std::forward<Args>(args)...) {
+  }
+};
+
+} // namespace ceph::math
+
+namespace ceph::crypto::onwire {
+
+struct MsgAuthError : public std::runtime_error {
+  MsgAuthError()
+    : runtime_error("message signature mismatch") {
+  }
+};
+
+struct TxHandlerError : public std::runtime_error {
+  TxHandlerError(const char* what)
+    : std::runtime_error(std::string("tx handler error: ") + what) {}
+};
+
+struct TxHandler {
+  virtual ~TxHandler() = default;
+
+  // An instance of TxHandler must be reset before any encrypt-update
+  // step. This also applies when encrypt-final has already been called
+  // and another round of update-...-update-final is about to take place.
+  //
+  // The input parameters tell the implementation how the -update sequence
+  // is fragmented, allowing it to make a conscious decision about
+  // allocating or reusing the provided memory. One implementation could
+  // encrypt in place while another might prefer a single huge output
+  // buffer.
+  //
+  // Behaviour is undefined if the client doesn't follow this order.
+  //
+  // TODO: switch to always_aligned_t
+  virtual void reset_tx_handler(const uint32_t* first,
+                                const uint32_t* last) = 0;
+
+  void reset_tx_handler(std::initializer_list<uint32_t> update_size_sequence) {
+    if (update_size_sequence.size() > 0) {
+      const uint32_t* first = &*update_size_sequence.begin();
+      reset_tx_handler(first, first + update_size_sequence.size());
+    } else {
+      reset_tx_handler(nullptr, nullptr);
+    }
+  }
+
+  // Perform encryption. The client hands over full ownership of the
+  // provided bufferlist. The method MUST NOT be called after _final()
+  // without an intervening _reset().
+  virtual void authenticated_encrypt_update(
+    const ceph::bufferlist& plaintext) = 0;
+
+  // Generates the authentication signature and returns a bufferlist
+  // crafted from the plaintext passed to the preceding _update() calls.
+  virtual ceph::bufferlist authenticated_encrypt_final() = 0;
+};
+
+class RxHandler {
+public:
+  virtual ~RxHandler() = default;
+
+  // The transmitter can append extra bytes of ciphertext at the -final
+  // step. This method returns how much was added, letting the client
+  // translate a plaintext size into the ciphertext size to grab from
+  // the wire.
+  virtual std::uint32_t get_extra_size_at_final() = 0;
+
+  // An instance of RxHandler must be reset before any decrypt-update
+  // step. This also applies when decrypt-final has already been called
+  // and another round of update-...-update-final is about to take place.
+  virtual void reset_rx_handler() = 0;
+
+  // Perform decryption. The ciphertext must ALWAYS be aligned to 16 bytes.
+  virtual void authenticated_decrypt_update(ceph::bufferlist& bl) = 0;
+
+  // Perform decryption of the last portion of ciphertext and verify the
+  // signature for the overall decryption sequence.
+  // Throws on integrity/authenticity check failure.
+  virtual void authenticated_decrypt_update_final(ceph::bufferlist& bl) = 0;
+};
+
+struct rxtx_t {
+  //rxtx_t(rxtx_t&& r) : rx(std::move(rx)), tx(std::move(tx)) {}
+  // Each peer can use different handlers.
+  // Hmm, isn't that too much flexibility?
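+  //
+  // A minimal usage sketch for the TX side (assuming `auth_meta` was
+  // negotiated in secure mode and `rev1` reflects the msgr2.1 nonce
+  // format; names here are illustrative only):
+  //
+  //   auto [rx, tx] = rxtx_t::create_handler_pair(cct, auth_meta,
+  //                                               rev1, false /* crossed */);
+  //   tx->reset_tx_handler({payload.length()});
+  //   tx->authenticated_encrypt_update(payload);
+  //   ceph::bufferlist wire = tx->authenticated_encrypt_final();
+  //
+  // The receiving side mirrors this with reset_rx_handler() and the
+  // authenticated_decrypt_update*() calls, throwing MsgAuthError when
+  // the GCM tag does not verify.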
+ std::unique_ptr<RxHandler> rx; + std::unique_ptr<TxHandler> tx; + + static rxtx_t create_handler_pair( + CephContext* ctx, + const class AuthConnectionMeta& auth_meta, + bool new_nonce_format, + bool crossed); +}; + +} // namespace ceph::crypto::onwire + +#endif // CEPH_CRYPTO_ONWIRE_H diff --git a/src/msg/async/dpdk/ARP.cc b/src/msg/async/dpdk/ARP.cc new file mode 100644 index 00000000..dedc9e3c --- /dev/null +++ b/src/msg/async/dpdk/ARP.cc @@ -0,0 +1,89 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#include "ARP.h" + +arp_for_protocol::arp_for_protocol(arp& a, uint16_t proto_num) + : _arp(a), _proto_num(proto_num) +{ + _arp.add(proto_num, this); +} + +arp_for_protocol::~arp_for_protocol() +{ + _arp.del(_proto_num); +} + +arp::arp(interface* netif): + _netif(netif), + _proto(netif, eth_protocol_num::arp, [this] { return get_packet(); }), + _rx_packets( + _proto.receive( + [this] (Packet p, ethernet_address ea) { + return process_packet(std::move(p), ea); + }, + [this](forward_hash& out_hash_data, Packet& p, size_t off) { + return forward(out_hash_data, p, off); + } + ) + ) +{} + +Tub<l3_protocol::l3packet> arp::get_packet() +{ + Tub<l3_protocol::l3packet> p; + if (!_packetq.empty()) { + p = std::move(_packetq.front()); + _packetq.pop_front(); + } + return p; +} + +bool arp::forward(forward_hash& out_hash_data, Packet& p, size_t off) +{ + auto ah = p.get_header<arp_hdr>(off); + auto i = _arp_for_protocol.find(ntoh(ah->ptype)); + if (i != _arp_for_protocol.end()) { + return i->second->forward(out_hash_data, p, off); + } + return false; +} + +void arp::add(uint16_t proto_num, arp_for_protocol* afp) +{ + _arp_for_protocol[proto_num] = afp; +} + +void arp::del(uint16_t proto_num) +{ + _arp_for_protocol.erase(proto_num); +} + +int arp::process_packet(Packet p, ethernet_address from) +{ + auto ah = p.get_header<arp_hdr>()->ntoh(); + auto i = _arp_for_protocol.find(ah.ptype); + if (i != _arp_for_protocol.end()) { + i->second->received(std::move(p)); + } + return 0; +} diff --git a/src/msg/async/dpdk/ARP.h b/src/msg/async/dpdk/ARP.h new file mode 100644 index 00000000..54569564 --- /dev/null +++ b/src/msg/async/dpdk/ARP.h @@ -0,0 +1,301 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. 
+ * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + * + */ + +#ifndef CEPH_MSG_ARP_H_ +#define CEPH_MSG_ARP_H_ + +#include <errno.h> + +#include <unordered_map> +#include <functional> + +#include "msg/async/Event.h" + +#include "ethernet.h" +#include "circular_buffer.h" +#include "ip_types.h" +#include "net.h" +#include "Packet.h" + +class arp; +template <typename L3> +class arp_for; + +class arp_for_protocol { + protected: + arp& _arp; + uint16_t _proto_num; + public: + arp_for_protocol(arp& a, uint16_t proto_num); + virtual ~arp_for_protocol(); + virtual int received(Packet p) = 0; + virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) { return false; } +}; + +class interface; + +class arp { + interface* _netif; + l3_protocol _proto; + subscription<Packet, ethernet_address> _rx_packets; + std::unordered_map<uint16_t, arp_for_protocol*> _arp_for_protocol; + circular_buffer<l3_protocol::l3packet> _packetq; + private: + struct arp_hdr { + uint16_t htype; + uint16_t ptype; + arp_hdr ntoh() { + arp_hdr hdr = *this; + hdr.htype = ::ntoh(htype); + hdr.ptype = ::ntoh(ptype); + return hdr; + } + arp_hdr hton() { + arp_hdr hdr = *this; + hdr.htype = ::hton(htype); + hdr.ptype = ::hton(ptype); + return hdr; + } + }; + public: + explicit arp(interface* netif); + void add(uint16_t proto_num, arp_for_protocol* afp); + void del(uint16_t proto_num); + private: + ethernet_address l2self() { return _netif->hw_address(); } + int process_packet(Packet p, ethernet_address from); + bool forward(forward_hash& out_hash_data, Packet& p, size_t off); + Tub<l3_protocol::l3packet> get_packet(); + template <class l3_proto> + friend class arp_for; +}; + +template <typename L3> +class arp_for : public arp_for_protocol { + public: + using l2addr = ethernet_address; + using l3addr = typename L3::address_type; + private: + static constexpr auto max_waiters = 512; + enum oper { + op_request = 1, + op_reply = 2, + }; + struct arp_hdr { + uint16_t htype; + uint16_t ptype; + uint8_t hlen; + uint8_t plen; + uint16_t oper; + l2addr sender_hwaddr; + l3addr sender_paddr; + l2addr target_hwaddr; + l3addr target_paddr; + + arp_hdr ntoh() { + arp_hdr hdr = *this; + hdr.htype = ::ntoh(htype); + hdr.ptype = ::ntoh(ptype); + hdr.oper = ::ntoh(oper); + hdr.sender_hwaddr = sender_hwaddr.ntoh(); + hdr.sender_paddr = sender_paddr.ntoh(); + hdr.target_hwaddr = target_hwaddr.ntoh(); + hdr.target_paddr = target_paddr.ntoh(); + return hdr; + } + + arp_hdr hton() { + arp_hdr hdr = *this; + hdr.htype = ::hton(htype); + hdr.ptype = ::hton(ptype); + hdr.oper = ::hton(oper); + hdr.sender_hwaddr = sender_hwaddr.hton(); + hdr.sender_paddr = sender_paddr.hton(); + hdr.target_hwaddr = target_hwaddr.hton(); + hdr.target_paddr = target_paddr.hton(); + return hdr; + } + }; + struct resolution { + std::vector<std::pair<resolution_cb, Packet>> _waiters; + uint64_t timeout_fd; + }; + class C_handle_arp_timeout : public EventCallback { + arp_for *arp; + l3addr paddr; + bool first_request; + + public: + C_handle_arp_timeout(arp_for *a, l3addr addr, bool first): + arp(a), paddr(addr), first_request(first) {} + 
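+    // Timer flow (a summary of what do_request() below implements, not
+    // new behaviour): every second the ARP query is re-sent, all waiters
+    // parked on this address are failed with -ETIMEDOUT, and the 1s time
+    // event is re-armed, until learn() resolves the address and deletes
+    // the event.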
void do_request(uint64_t r) { + arp->send_query(paddr); + auto &res = arp->_in_progress[paddr]; + + for (auto& p : res._waiters) { + p.first(ethernet_address(), std::move(p.second), -ETIMEDOUT); + } + res._waiters.clear(); + res.timeout_fd = arp->center->create_time_event( + 1*1000*1000, this); + } + }; + friend class C_handle_arp_timeout; + + private: + CephContext *cct; + EventCenter *center; + l3addr _l3self = L3::broadcast_address(); + std::unordered_map<l3addr, l2addr> _table; + std::unordered_map<l3addr, resolution> _in_progress; + private: + Packet make_query_packet(l3addr paddr); + virtual int received(Packet p) override; + int handle_request(arp_hdr* ah); + l2addr l2self() { return _arp.l2self(); } + void send(l2addr to, Packet &&p); + public: + void send_query(const l3addr& paddr); + explicit arp_for(CephContext *c, arp& a, EventCenter *cen) + : arp_for_protocol(a, L3::arp_protocol_type()), cct(c), center(cen) { + _table[L3::broadcast_address()] = ethernet::broadcast_address(); + } + ~arp_for() { + for (auto && p : _in_progress) + center->delete_time_event(p.second.timeout_fd); + } + void wait(const l3addr& addr, Packet p, resolution_cb cb); + void learn(l2addr l2, l3addr l3); + void run(); + void set_self_addr(l3addr addr) { + _table.erase(_l3self); + _table[addr] = l2self(); + _l3self = addr; + } + friend class arp; +}; + +template <typename L3> +void arp_for<L3>::send(l2addr to, Packet &&p) { + _arp._packetq.push_back(l3_protocol::l3packet{eth_protocol_num::arp, to, std::move(p)}); +} + +template <typename L3> +Packet arp_for<L3>::make_query_packet(l3addr paddr) { + arp_hdr hdr; + hdr.htype = ethernet::arp_hardware_type(); + hdr.ptype = L3::arp_protocol_type(); + hdr.hlen = sizeof(l2addr); + hdr.plen = sizeof(l3addr); + hdr.oper = op_request; + hdr.sender_hwaddr = l2self(); + hdr.sender_paddr = _l3self; + hdr.target_hwaddr = ethernet::broadcast_address(); + hdr.target_paddr = paddr; + hdr = hdr.hton(); + return Packet(reinterpret_cast<char*>(&hdr), sizeof(hdr)); +} + +template <typename L3> +void arp_for<L3>::send_query(const l3addr& paddr) { + send(ethernet::broadcast_address(), make_query_packet(paddr)); +} + +template <typename L3> +void arp_for<L3>::learn(l2addr hwaddr, l3addr paddr) { + _table[paddr] = hwaddr; + auto i = _in_progress.find(paddr); + if (i != _in_progress.end()) { + auto& res = i->second; + center->delete_time_event(res.timeout_fd); + for (auto &&p : res._waiters) { + p.first(hwaddr, std::move(p.second), 0); + } + _in_progress.erase(i); + } +} + +template <typename L3> +void arp_for<L3>::wait(const l3addr& paddr, Packet p, resolution_cb cb) { + auto i = _table.find(paddr); + if (i != _table.end()) { + cb(i->second, std::move(p), 0); + return ; + } + + auto j = _in_progress.find(paddr); + auto first_request = j == _in_progress.end(); + auto& res = first_request ? 
_in_progress[paddr] : j->second; + + if (first_request) { + res.timeout_fd = center->create_time_event( + 1*1000*1000, new C_handle_arp_timeout(this, paddr, first_request)); + send_query(paddr); + } + + if (res._waiters.size() >= max_waiters) { + cb(ethernet_address(), std::move(p), -EBUSY); + return ; + } + + res._waiters.emplace_back(cb, std::move(p)); + return ; +} + +template <typename L3> +int arp_for<L3>::received(Packet p) { + auto ah = p.get_header<arp_hdr>(); + if (!ah) { + return 0; + } + auto h = ah->ntoh(); + if (h.hlen != sizeof(l2addr) || h.plen != sizeof(l3addr)) { + return 0; + } + switch (h.oper) { + case op_request: + return handle_request(&h); + case op_reply: + _arp._netif->arp_learn(h.sender_hwaddr, h.sender_paddr); + return 0; + default: + return 0; + } +} + +template <typename L3> +int arp_for<L3>::handle_request(arp_hdr* ah) { + if (ah->target_paddr == _l3self + && _l3self != L3::broadcast_address()) { + ah->oper = op_reply; + ah->target_hwaddr = ah->sender_hwaddr; + ah->target_paddr = ah->sender_paddr; + ah->sender_hwaddr = l2self(); + ah->sender_paddr = _l3self; + *ah = ah->hton(); + send(ah->target_hwaddr, Packet(reinterpret_cast<char*>(ah), sizeof(*ah))); + } + return 0; +} + +#endif /* CEPH_MSG_ARP_H_ */ diff --git a/src/msg/async/dpdk/DPDK.cc b/src/msg/async/dpdk/DPDK.cc new file mode 100644 index 00000000..278efe9e --- /dev/null +++ b/src/msg/async/dpdk/DPDK.cc @@ -0,0 +1,1267 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#include <atomic> +#include <vector> +#include <queue> + +#include <rte_config.h> +#include <rte_common.h> +#include <rte_eal.h> +#include <rte_pci.h> +#include <rte_ethdev.h> +#include <rte_cycles.h> +#include <rte_memzone.h> + +#include "include/page.h" +#include "align.h" +#include "IP.h" +#include "const.h" +#include "dpdk_rte.h" +#include "DPDK.h" +#include "toeplitz.h" + +#include "common/Cycles.h" +#include "common/dout.h" +#include "common/errno.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_dpdk +#undef dout_prefix +#define dout_prefix *_dout << "dpdk " + + +void* as_cookie(struct rte_pktmbuf_pool_private& p) { + return &p; +}; + +#ifndef MARKER +typedef void *MARKER[0]; /**< generic marker for a point in a structure */ +#endif + +/******************* Net device related constatns *****************************/ +static constexpr uint16_t default_ring_size = 512; + +// +// We need 2 times the ring size of buffers because of the way PMDs +// refill the ring. +// +static constexpr uint16_t mbufs_per_queue_rx = 2 * default_ring_size; +static constexpr uint16_t rx_gc_thresh = 64; + +// +// No need to keep more descriptors in the air than can be sent in a single +// rte_eth_tx_burst() call. 
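+// With the default_ring_size of 512 above, this works out to (a quick
+// sanity check of the constants, not new configuration):
+//
+//   mbufs_per_queue_rx = 2 * 512 = 1024 mbufs
+//   mbufs_per_queue_tx = 2 * 512 = 1024 mbufs
+//   inline_mbuf_size   = 2048 + sizeof(rte_mbuf) + RTE_PKTMBUF_HEADROOM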
+// +static constexpr uint16_t mbufs_per_queue_tx = 2 * default_ring_size; + +static constexpr uint16_t mbuf_cache_size = 512; +// +// Size of the data buffer in the non-inline case. +// +// We may want to change (increase) this value in future, while the +// inline_mbuf_data_size value will unlikely change due to reasons described +// above. +// +static constexpr size_t mbuf_data_size = 4096; + +static constexpr uint16_t mbuf_overhead = + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM; +// +// We'll allocate 2K data buffers for an inline case because this would require +// a single page per mbuf. If we used 4K data buffers here it would require 2 +// pages for a single buffer (due to "mbuf_overhead") and this is a much more +// demanding memory constraint. +// +static constexpr size_t inline_mbuf_data_size = 2048; + + +// (INLINE_MBUF_DATA_SIZE(2K)*32 = 64K = Max TSO/LRO size) + 1 mbuf for headers +static constexpr uint8_t max_frags = 32 + 1; + +// +// Intel's 40G NIC HW limit for a number of fragments in an xmit segment. +// +// See Chapter 8.4.1 "Transmit Packet in System Memory" of the xl710 devices +// spec. for more details. +// +static constexpr uint8_t i40e_max_xmit_segment_frags = 8; + +// +// VMWare's virtual NIC limit for a number of fragments in an xmit segment. +// +// see drivers/net/vmxnet3/base/vmxnet3_defs.h VMXNET3_MAX_TXD_PER_PKT +// +static constexpr uint8_t vmxnet3_max_xmit_segment_frags = 16; + +static constexpr uint16_t inline_mbuf_size = inline_mbuf_data_size + mbuf_overhead; + +static size_t huge_page_size = 512 * CEPH_PAGE_SIZE; + +uint32_t qp_mempool_obj_size() +{ + uint32_t mp_size = 0; + struct rte_mempool_objsz mp_obj_sz = {}; + + // + // We will align each size to huge page size because DPDK allocates + // physically contiguous memory region for each pool object. + // + + // Rx + mp_size += align_up(rte_mempool_calc_obj_size(mbuf_overhead, 0, &mp_obj_sz)+ + sizeof(struct rte_pktmbuf_pool_private), + huge_page_size); + + //Tx + std::memset(&mp_obj_sz, 0, sizeof(mp_obj_sz)); + mp_size += align_up(rte_mempool_calc_obj_size(inline_mbuf_size, 0, + &mp_obj_sz)+ + sizeof(struct rte_pktmbuf_pool_private), + huge_page_size); + return mp_size; +} + +static constexpr const char* pktmbuf_pool_name = "dpdk_net_pktmbuf_pool"; + +/* + * When doing reads from the NIC queues, use this batch size + */ +static constexpr uint8_t packet_read_size = 32; +/******************************************************************************/ + +int DPDKDevice::init_port_start() +{ + ceph_assert(_port_idx < rte_eth_dev_count()); + + rte_eth_dev_info_get(_port_idx, &_dev_info); + + // + // This is a workaround for a missing handling of a HW limitation in the + // DPDK i40e driver. This and all related to _is_i40e_device code should be + // removed once this handling is added. + // + if (std::string("rte_i40evf_pmd") == _dev_info.driver_name || + std::string("rte_i40e_pmd") == _dev_info.driver_name) { + ldout(cct, 1) << __func__ << " Device is an Intel's 40G NIC. Enabling 8 fragments hack!" << dendl; + _is_i40e_device = true; + } + + if (std::string("rte_vmxnet3_pmd") == _dev_info.driver_name) { + ldout(cct, 1) << __func__ << " Device is a VMWare Virtual NIC. Enabling 16 fragments hack!" << dendl; + _is_vmxnet3_device = true; + } + + // + // Another workaround: this time for a lack of number of RSS bits. + // ixgbe PF NICs support up to 16 RSS queues. + // ixgbe VF NICs support up to 4 RSS queues. + // i40e PF NICs support up to 64 RSS queues. + // i40e VF NICs support up to 16 RSS queues. 
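+  // (The clamps below hard-code the limits listed above; the assumption is
+  // that dev_info as reported by these drivers may advertise more queues
+  // than RSS can actually spread traffic across.)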
+  //
+  if (std::string("rte_ixgbe_pmd") == _dev_info.driver_name) {
+    _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)16);
+  } else if (std::string("rte_ixgbevf_pmd") == _dev_info.driver_name) {
+    _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)4);
+  } else if (std::string("rte_i40e_pmd") == _dev_info.driver_name) {
+    _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)64);
+  } else if (std::string("rte_i40evf_pmd") == _dev_info.driver_name) {
+    _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)16);
+  }
+
+  // Clear txq_flags - we want to support all available offload features
+  // except for multi-mempool and refcnt'ing which we don't need
+  _dev_info.default_txconf.txq_flags =
+      ETH_TXQ_FLAGS_NOMULTMEMP | ETH_TXQ_FLAGS_NOREFCOUNT;
+
+  //
+  // Disable features that are not supported by port's HW
+  //
+  if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) {
+    _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP;
+  }
+
+  if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) {
+    _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP;
+  }
+
+  if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) {
+    _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP;
+  }
+
+  if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) {
+    _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL;
+  }
+
+  if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO)) {
+    _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
+  }
+
+  /* for port configuration all features are off by default */
+  rte_eth_conf port_conf = { 0 };
+
+  ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": max_rx_queues "
+                << _dev_info.max_rx_queues << " max_tx_queues "
+                << _dev_info.max_tx_queues << dendl;
+
+  _num_queues = std::min({_num_queues, _dev_info.max_rx_queues, _dev_info.max_tx_queues});
+
+  ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": using "
+                << _num_queues << " queues" << dendl;
+
+  // Set RSS mode: enable RSS if we are configured with more than one queue.
+  // Even if port has a single queue we still want the RSS feature to be
+  // available in order to make HW calculate RSS hash for us.
+  if (_num_queues > 1) {
+    if (_dev_info.hash_key_size == 40) {
+      _rss_key = default_rsskey_40bytes;
+    } else if (_dev_info.hash_key_size == 52) {
+      _rss_key = default_rsskey_52bytes;
+    } else if (_dev_info.hash_key_size != 0) {
+      // Unexpected RSS hash key size - bail out.
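+      // (For background: 40 bytes is the classic Microsoft-RSS Toeplitz key
+      // length used by ixgbe-class NICs, while i40e-class NICs expect a
+      // 52-byte key; any other non-zero size is rejected here.)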
+ rte_exit(EXIT_FAILURE, + "Port %d: We support only 40 or 52 bytes RSS hash keys, %d bytes key requested", + _port_idx, _dev_info.hash_key_size); + } else { + _rss_key = default_rsskey_40bytes; + _dev_info.hash_key_size = 40; + } + + port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS; + port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK; + if (_dev_info.hash_key_size) { + port_conf.rx_adv_conf.rss_conf.rss_key = const_cast<uint8_t *>(_rss_key.data()); + port_conf.rx_adv_conf.rss_conf.rss_key_len = _dev_info.hash_key_size; + } + } else { + port_conf.rxmode.mq_mode = ETH_MQ_RX_NONE; + } + + if (_num_queues > 1) { + if (_dev_info.reta_size) { + // RETA size should be a power of 2 + ceph_assert((_dev_info.reta_size & (_dev_info.reta_size - 1)) == 0); + + // Set the RSS table to the correct size + _redir_table.resize(_dev_info.reta_size); + _rss_table_bits = std::lround(std::log2(_dev_info.reta_size)); + ldout(cct, 5) << __func__ << " Port " << int(_port_idx) + << ": RSS table size is " << _dev_info.reta_size << dendl; + } else { + // FIXME: same with sw_reta + _redir_table.resize(128); + _rss_table_bits = std::lround(std::log2(128)); + } + } else { + _redir_table.push_back(0); + } + + // Set Rx VLAN stripping + if (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) { + port_conf.rxmode.hw_vlan_strip = 1; + } + + // Enable HW CRC stripping + port_conf.rxmode.hw_strip_crc = 1; + +#ifdef RTE_ETHDEV_HAS_LRO_SUPPORT + // Enable LRO + if (_use_lro && (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO)) { + ldout(cct, 1) << __func__ << " LRO is on" << dendl; + port_conf.rxmode.enable_lro = 1; + _hw_features.rx_lro = true; + } else +#endif + ldout(cct, 1) << __func__ << " LRO is off" << dendl; + + // Check that all CSUM features are either all set all together or not set + // all together. If this assumption breaks we need to rework the below logic + // by splitting the csum offload feature bit into separate bits for IPv4, + // TCP. + ceph_assert(((_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) && + (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) || + (!(_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) && + !(_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM))); + + // Set Rx checksum checking + if ((_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) && + (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) { + ldout(cct, 1) << __func__ << " RX checksum offload supported" << dendl; + port_conf.rxmode.hw_ip_checksum = 1; + _hw_features.rx_csum_offload = 1; + } + + if ((_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) { + ldout(cct, 1) << __func__ << " TX ip checksum offload supported" << dendl; + _hw_features.tx_csum_ip_offload = 1; + } + + // TSO is supported starting from DPDK v1.8 + if (_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) { + ldout(cct, 1) << __func__ << " TSO is supported" << dendl; + _hw_features.tx_tso = 1; + } + + // Check that Tx TCP CSUM features are either all set all together + // or not set all together. If this assumption breaks we need to rework the + // below logic by splitting the csum offload feature bit into separate bits + // for TCP. 
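+  // (Note: unlike the Rx capability check above, only the TCP flag appears
+  // on both sides of the assert below, so as written it is always true;
+  // treat it as documentation of the assumption rather than an enforced
+  // check.)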
+ ceph_assert((_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) || + !(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)); + + if (_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) { + ldout(cct, 1) << __func__ << " TX TCP checksum offload supported" << dendl; + _hw_features.tx_csum_l4_offload = 1; + } + + int retval; + + ldout(cct, 1) << __func__ << " Port " << int(_port_idx) << " init ... " << dendl; + + /* + * Standard DPDK port initialisation - config port, then set up + * rx and tx rings. + */ + if ((retval = rte_eth_dev_configure(_port_idx, _num_queues, _num_queues, + &port_conf)) != 0) { + lderr(cct) << __func__ << " failed to configure port " << (int)_port_idx + << " rx/tx queues " << _num_queues << " error " << cpp_strerror(retval) << dendl; + return retval; + } + + //rte_eth_promiscuous_enable(port_num); + ldout(cct, 1) << __func__ << " done." << dendl; + + return 0; +} + +void DPDKDevice::set_hw_flow_control() +{ + // Read the port's current/default flow control settings + struct rte_eth_fc_conf fc_conf; + auto ret = rte_eth_dev_flow_ctrl_get(_port_idx, &fc_conf); + + if (ret == -ENOTSUP) { + ldout(cct, 1) << __func__ << " port " << int(_port_idx) + << ": not support to get hardware flow control settings: " << ret << dendl; + goto not_supported; + } + + if (ret < 0) { + lderr(cct) << __func__ << " port " << int(_port_idx) + << ": failed to get hardware flow control settings: " << ret << dendl; + ceph_abort(); + } + + if (_enable_fc) { + fc_conf.mode = RTE_FC_FULL; + } else { + fc_conf.mode = RTE_FC_NONE; + } + + ret = rte_eth_dev_flow_ctrl_set(_port_idx, &fc_conf); + if (ret == -ENOTSUP) { + ldout(cct, 1) << __func__ << " port " << int(_port_idx) + << ": not support to set hardware flow control settings: " << ret << dendl; + goto not_supported; + } + + if (ret < 0) { + lderr(cct) << __func__ << " port " << int(_port_idx) + << ": failed to set hardware flow control settings: " << ret << dendl; + ceph_abort(); + } + + ldout(cct, 1) << __func__ << " port " << int(_port_idx) << ": HW FC " << _enable_fc << dendl; + return; + +not_supported: + ldout(cct, 1) << __func__ << " port " << int(_port_idx) << ": changing HW FC settings is not supported" << dendl; +} + +int DPDKDevice::init_port_fini() +{ + // Changing FC requires HW reset, so set it before the port is initialized. 
+  set_hw_flow_control();
+
+  if (rte_eth_dev_start(_port_idx) != 0) {
+    lderr(cct) << __func__ << " can't start port " << int(_port_idx) << dendl;
+    return -1;
+  }
+
+  if (_num_queues > 1) {
+    if (!rte_eth_dev_filter_supported(_port_idx, RTE_ETH_FILTER_HASH)) {
+      ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": HASH FILTER configuration is supported" << dendl;
+
+      // Set up the HW to use the TOEPLITZ hash function as an RSS hash function
+      struct rte_eth_hash_filter_info info = {};
+
+      info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG;
+      info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;
+
+      if (rte_eth_dev_filter_ctrl(_port_idx, RTE_ETH_FILTER_HASH,
+                                  RTE_ETH_FILTER_SET, &info) < 0) {
+        lderr(cct) << __func__ << " cannot set hash function on port " << int(_port_idx) << dendl;
+        return -1;
+      }
+    }
+
+    set_rss_table();
+  }
+
+  // Wait for a link
+  if (check_port_link_status() < 0) {
+    lderr(cct) << __func__ << " port link up failed " << int(_port_idx) << dendl;
+    return -1;
+  }
+
+  ldout(cct, 5) << __func__ << " created DPDK device" << dendl;
+  return 0;
+}
+
+void DPDKQueuePair::configure_proxies(const std::map<unsigned, float>& cpu_weights) {
+  ceph_assert(!cpu_weights.empty());
+  if (cpu_weights.size() == 1 && cpu_weights.begin()->first == _qid) {
+    // special case queue sending to self only, to avoid requiring a hash value
+    return;
+  }
+  register_packet_provider([this] {
+    Tub<Packet> p;
+    if (!_proxy_packetq.empty()) {
+      p = std::move(_proxy_packetq.front());
+      _proxy_packetq.pop_front();
+    }
+    return p;
+  });
+  build_sw_reta(cpu_weights);
+}
+
+void DPDKQueuePair::build_sw_reta(const std::map<unsigned, float>& cpu_weights) {
+  float total_weight = 0;
+  for (auto&& x : cpu_weights) {
+    total_weight += x.second;
+  }
+  float accum = 0;
+  unsigned idx = 0;
+  std::array<uint8_t, 128> reta;
+  for (auto&& entry : cpu_weights) {
+    auto cpu = entry.first;
+    auto weight = entry.second;
+    accum += weight;
+    while (idx < (accum / total_weight * reta.size() - 0.5)) {
+      reta[idx++] = cpu;
+    }
+  }
+  _sw_reta = reta;
+}
+
+
+bool DPDKQueuePair::init_rx_mbuf_pool()
+{
+  std::string name = std::string(pktmbuf_pool_name) + std::to_string(_qid) + "_rx";
+
+  // reserve the memory for Rx buffers containers
+  _rx_free_pkts.reserve(mbufs_per_queue_rx);
+  _rx_free_bufs.reserve(mbufs_per_queue_rx);
+
+  _pktmbuf_pool_rx = rte_mempool_lookup(name.c_str());
+  if (!_pktmbuf_pool_rx) {
+    ldout(cct, 1) << __func__ << " Creating Rx mbuf pool '" << name.c_str()
+                  << "' [" << mbufs_per_queue_rx << " mbufs] ..." << dendl;
+
+    //
+    // Don't pass single-producer/single-consumer flags to mbuf create as it
+    // seems faster to use a cache instead.
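+    // (Sizing note: each Rx mbuf below gets a data room of mbuf_data_size
+    // plus RTE_PKTMBUF_HEADROOM - with the 4K mbuf_data_size above and the
+    // usual 128-byte headroom that is 4224 bytes per object, before the
+    // per-object private area is added.)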
+    //
+    struct rte_pktmbuf_pool_private roomsz = {};
+    roomsz.mbuf_data_room_size = mbuf_data_size + RTE_PKTMBUF_HEADROOM;
+    _pktmbuf_pool_rx = rte_mempool_create(
+        name.c_str(),
+        mbufs_per_queue_rx, mbuf_overhead + mbuf_data_size,
+        mbuf_cache_size,
+        sizeof(struct rte_pktmbuf_pool_private),
+        rte_pktmbuf_pool_init, as_cookie(roomsz),
+        rte_pktmbuf_init, nullptr,
+        rte_socket_id(), 0);
+    if (!_pktmbuf_pool_rx) {
+      lderr(cct) << __func__ << " Failed to create mempool for rx" << dendl;
+      return false;
+    }
+
+    //
+    // allocate more data buffers
+    int bufs_count = cct->_conf->ms_dpdk_rx_buffer_count_per_core - mbufs_per_queue_rx;
+    int mz_flags = RTE_MEMZONE_1GB|RTE_MEMZONE_SIZE_HINT_ONLY;
+    std::string mz_name = "rx_buffer_data" + std::to_string(_qid);
+    const struct rte_memzone *mz = rte_memzone_reserve_aligned(mz_name.c_str(),
+        mbuf_data_size*bufs_count, _pktmbuf_pool_rx->socket_id, mz_flags, mbuf_data_size);
+    ceph_assert(mz);
+    char* m = static_cast<char*>(mz->addr);
+    for (int i = 0; i < bufs_count; i++) {
+      ceph_assert(m);
+      _alloc_bufs.push_back(m);
+      m += mbuf_data_size;
+    }
+
+    if (rte_eth_rx_queue_setup(_dev_port_idx, _qid, default_ring_size,
+                               rte_eth_dev_socket_id(_dev_port_idx),
+                               _dev->def_rx_conf(), _pktmbuf_pool_rx) < 0) {
+      lderr(cct) << __func__ << " cannot initialize rx queue" << dendl;
+      return false;
+    }
+  }
+
+  return _pktmbuf_pool_rx != nullptr;
+}
+
+int DPDKDevice::check_port_link_status()
+{
+  int count = 0;
+
+  ldout(cct, 20) << __func__ << dendl;
+  const int sleep_time = 100 * 1000;
+  const int max_check_time = 90;  /* 9s (90 * 100ms) in total */
+  while (true) {
+    struct rte_eth_link link;
+    memset(&link, 0, sizeof(link));
+    rte_eth_link_get_nowait(_port_idx, &link);
+
+    if (link.link_status) {
+      ldout(cct, 5) << __func__ << " done port "
+                    << static_cast<unsigned>(_port_idx)
+                    << " link Up - speed " << link.link_speed
+                    << " Mbps - "
+                    << ((link.link_duplex == ETH_LINK_FULL_DUPLEX) ? ("full-duplex") : ("half-duplex"))
+                    << dendl;
+      break;
+    } else if (count++ < max_check_time) {
+      ldout(cct, 20) << __func__ << " not ready, continue to wait." << dendl;
+      usleep(sleep_time);
+    } else {
+      lderr(cct) << __func__ << " done port " << static_cast<unsigned>(_port_idx) << " link down" << dendl;
+      return -1;
+    }
+  }
+  return 0;
+}
+
+class C_handle_dev_stats : public EventCallback {
+  DPDKQueuePair *_qp;
+ public:
+  C_handle_dev_stats(DPDKQueuePair *qp): _qp(qp) { }
+  void do_request(uint64_t id) {
+    _qp->handle_stats();
+  }
+};
+
+DPDKQueuePair::DPDKQueuePair(CephContext *c, EventCenter *cen, DPDKDevice* dev, uint8_t qid)
+  : cct(c), _dev(dev), _dev_port_idx(dev->port_idx()), center(cen), _qid(qid),
+    _tx_poller(this), _rx_gc_poller(this), _tx_buf_factory(c, dev, qid),
+    _tx_gc_poller(this)
+{
+  if (!init_rx_mbuf_pool()) {
+    lderr(cct) << __func__ << " cannot initialize mbuf pools" << dendl;
+    ceph_abort();
+  }
+
+  static_assert(offsetof(tx_buf, private_end) -
+                offsetof(tx_buf, private_start) <= RTE_PKTMBUF_HEADROOM,
+                "RTE_PKTMBUF_HEADROOM is less than DPDKQueuePair::tx_buf size! "
+                "Increase the headroom size in the DPDK configuration");
+  static_assert(offsetof(tx_buf, _mbuf) == 0,
+                "There is a pad at the beginning of the tx_buf before _mbuf "
+                "field!");
+  static_assert((inline_mbuf_data_size & (inline_mbuf_data_size - 1)) == 0,
+                "inline_mbuf_data_size has to be a power of two!");
+
+  std::string name(std::string("queue") + std::to_string(qid));
+  PerfCountersBuilder plb(cct, name, l_dpdk_qp_first, l_dpdk_qp_last);
+
+  plb.add_u64_counter(l_dpdk_qp_rx_packets, "dpdk_receive_packets", "DPDK received packets");
+  plb.add_u64_counter(l_dpdk_qp_tx_packets, "dpdk_send_packets", "DPDK sent packets");
+  plb.add_u64_counter(l_dpdk_qp_rx_bad_checksum_errors, "dpdk_receive_bad_checksum_errors", "DPDK received bad checksum packets");
+  plb.add_u64_counter(l_dpdk_qp_rx_no_memory_errors, "dpdk_receive_no_memory_errors", "DPDK received no memory packets");
+  plb.add_u64_counter(l_dpdk_qp_rx_bytes, "dpdk_receive_bytes", "DPDK received bytes", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_dpdk_qp_tx_bytes, "dpdk_send_bytes", "DPDK sent bytes", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_dpdk_qp_rx_last_bunch, "dpdk_receive_last_bunch", "DPDK last received bunch");
+  plb.add_u64_counter(l_dpdk_qp_tx_last_bunch, "dpdk_send_last_bunch", "DPDK last sent bunch");
+  plb.add_u64_counter(l_dpdk_qp_rx_fragments, "dpdk_receive_fragments", "DPDK received total fragments");
+  plb.add_u64_counter(l_dpdk_qp_tx_fragments, "dpdk_send_fragments", "DPDK sent total fragments");
+  plb.add_u64_counter(l_dpdk_qp_rx_copy_ops, "dpdk_receive_copy_ops", "DPDK received copy operations");
+  plb.add_u64_counter(l_dpdk_qp_tx_copy_ops, "dpdk_send_copy_ops", "DPDK sent copy operations");
+  plb.add_u64_counter(l_dpdk_qp_rx_copy_bytes, "dpdk_receive_copy_bytes", "DPDK received copy bytes", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_dpdk_qp_tx_copy_bytes, "dpdk_send_copy_bytes", "DPDK sent copy bytes", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_dpdk_qp_rx_linearize_ops, "dpdk_receive_linearize_ops", "DPDK received linearize operations");
+  plb.add_u64_counter(l_dpdk_qp_tx_linearize_ops, "dpdk_send_linearize_ops", "DPDK sent linearize operations");
+  plb.add_u64_counter(l_dpdk_qp_tx_queue_length, "dpdk_send_queue_length", "DPDK send queue length");
+
+  perf_logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perf_logger);
+
+  if (!_qid)
+    device_stat_time_fd = center->create_time_event(1000*1000, new C_handle_dev_stats(this));
+}
+
+void DPDKQueuePair::handle_stats()
+{
+  ldout(cct, 20) << __func__ << " started."
<< dendl; + rte_eth_stats rte_stats = {}; + int rc = rte_eth_stats_get(_dev_port_idx, &rte_stats); + + if (rc) { + ldout(cct, 0) << __func__ << " failed to get port statistics: " << cpp_strerror(rc) << dendl; + return ; + } + +#if RTE_VERSION < RTE_VERSION_NUM(16,7,0,0) + _dev->perf_logger->set(l_dpdk_dev_rx_mcast, rte_stats.imcasts); + _dev->perf_logger->set(l_dpdk_dev_rx_badcrc_errors, rte_stats.ibadcrc); +#endif + _dev->perf_logger->set(l_dpdk_dev_rx_dropped_errors, rte_stats.imissed); + _dev->perf_logger->set(l_dpdk_dev_rx_nombuf_errors, rte_stats.rx_nombuf); + + _dev->perf_logger->set(l_dpdk_dev_rx_total_errors, rte_stats.ierrors); + _dev->perf_logger->set(l_dpdk_dev_tx_total_errors, rte_stats.oerrors); + device_stat_time_fd = center->create_time_event(1000*1000, new C_handle_dev_stats(this)); +} + +bool DPDKQueuePair::poll_tx() { + bool nonloopback = !cct->_conf->ms_dpdk_debug_allow_loopback; +#ifdef CEPH_PERF_DEV + uint64_t start = Cycles::rdtsc(); +#endif + uint32_t total_work = 0; + if (_tx_packetq.size() < 16) { + // refill send queue from upper layers + uint32_t work; + do { + work = 0; + for (auto&& pr : _pkt_providers) { + auto p = pr(); + if (p) { + work++; + if (likely(nonloopback)) { + // ldout(cct, 0) << __func__ << " len: " << p->len() << " frags: " << p->nr_frags() << dendl; + _tx_packetq.push_back(std::move(*p)); + } else { + auto th = p->get_header<eth_hdr>(0); + if (th->dst_mac == th->src_mac) { + _dev->l2receive(_qid, std::move(*p)); + } else { + _tx_packetq.push_back(std::move(*p)); + } + } + if (_tx_packetq.size() == 128) { + break; + } + } + } + total_work += work; + } while (work && total_work < 256 && _tx_packetq.size() < 128); + } + if (!_tx_packetq.empty()) { + uint64_t c = send(_tx_packetq); + perf_logger->inc(l_dpdk_qp_tx_packets, c); + perf_logger->set(l_dpdk_qp_tx_last_bunch, c); +#ifdef CEPH_PERF_DEV + tx_count += total_work; + tx_cycles += Cycles::rdtsc() - start; +#endif + return true; + } + + return false; +} + +inline Tub<Packet> DPDKQueuePair::from_mbuf_lro(rte_mbuf* m) +{ + _frags.clear(); + _bufs.clear(); + + for (; m != nullptr; m = m->next) { + char* data = rte_pktmbuf_mtod(m, char*); + + _frags.emplace_back(fragment{data, rte_pktmbuf_data_len(m)}); + _bufs.push_back(data); + } + + auto del = std::bind( + [this](std::vector<char*> &bufs) { + for (auto&& b : bufs) { _alloc_bufs.push_back(b); } + }, std::move(_bufs)); + return Packet( + _frags.begin(), _frags.end(), make_deleter(std::move(del))); +} + +inline Tub<Packet> DPDKQueuePair::from_mbuf(rte_mbuf* m) +{ + _rx_free_pkts.push_back(m); + _num_rx_free_segs += m->nb_segs; + + if (!_dev->hw_features_ref().rx_lro || rte_pktmbuf_is_contiguous(m)) { + char* data = rte_pktmbuf_mtod(m, char*); + + return Packet(fragment{data, rte_pktmbuf_data_len(m)}, + make_deleter([this, data] { _alloc_bufs.push_back(data); })); + } else { + return from_mbuf_lro(m); + } +} + +inline bool DPDKQueuePair::refill_one_cluster(rte_mbuf* head) +{ + for (; head != nullptr; head = head->next) { + if (!refill_rx_mbuf(head, mbuf_data_size, _alloc_bufs)) { + // + // If we failed to allocate a new buffer - push the rest of the + // cluster back to the free_packets list for a later retry. 
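+      // (Segments already detached into _rx_free_bufs by earlier iterations
+      // of this loop stay there; rx_gc() returns them to the mempool in
+      // bulk.)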
+      //
+      _rx_free_pkts.push_back(head);
+      return false;
+    }
+    _rx_free_bufs.push_back(head);
+  }
+
+  return true;
+}
+
+bool DPDKQueuePair::rx_gc(bool force)
+{
+  if (_num_rx_free_segs >= rx_gc_thresh || force) {
+    ldout(cct, 10) << __func__ << " free segs " << _num_rx_free_segs
+                   << " thresh " << rx_gc_thresh
+                   << " free pkts " << _rx_free_pkts.size()
+                   << dendl;
+
+    while (!_rx_free_pkts.empty()) {
+      //
+      // Use back() + pop_back() semantics to avoid an extra
+      // _rx_free_pkts.clear() at the end of the function - clear() has a
+      // linear complexity.
+      //
+      auto m = _rx_free_pkts.back();
+      _rx_free_pkts.pop_back();
+
+      if (!refill_one_cluster(m)) {
+        ldout(cct, 1) << __func__ << " get new mbuf failed " << dendl;
+        break;
+      }
+    }
+    for (auto&& m : _rx_free_bufs) {
+      rte_pktmbuf_prefree_seg(m);
+    }
+
+    if (_rx_free_bufs.size()) {
+      rte_mempool_put_bulk(_pktmbuf_pool_rx,
+                           (void **)_rx_free_bufs.data(),
+                           _rx_free_bufs.size());
+
+      // TODO: ceph_assert() in a fast path! Remove me ASAP!
+      ceph_assert(_num_rx_free_segs >= _rx_free_bufs.size());
+
+      _num_rx_free_segs -= _rx_free_bufs.size();
+      _rx_free_bufs.clear();
+
+      // TODO: ceph_assert() in a fast path! Remove me ASAP!
+      ceph_assert((_rx_free_pkts.empty() && !_num_rx_free_segs) ||
+                  (!_rx_free_pkts.empty() && _num_rx_free_segs));
+    }
+  }
+
+  return _num_rx_free_segs >= rx_gc_thresh;
+}
+
+
+void DPDKQueuePair::process_packets(
+    struct rte_mbuf **bufs, uint16_t count)
+{
+  uint64_t nr_frags = 0, bytes = 0;
+
+  for (uint16_t i = 0; i < count; i++) {
+    struct rte_mbuf *m = bufs[i];
+    offload_info oi;
+
+    Tub<Packet> p = from_mbuf(m);
+
+    // Drop the packet if translation above has failed
+    if (!p) {
+      perf_logger->inc(l_dpdk_qp_rx_no_memory_errors);
+      continue;
+    }
+    // ldout(cct, 0) << __func__ << " len " << p->len() << " " << dendl;
+
+    nr_frags += m->nb_segs;
+    bytes += m->pkt_len;
+
+    // Set stripped VLAN value if available
+    if ((_dev->_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) &&
+        (m->ol_flags & PKT_RX_VLAN_STRIPPED)) {
+      oi.vlan_tci = m->vlan_tci;
+    }
+
+    if (_dev->get_hw_features().rx_csum_offload) {
+      if (m->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
+        // Packet with bad checksum, just drop it.
+        perf_logger->inc(l_dpdk_qp_rx_bad_checksum_errors);
+        continue;
+      }
+      // Note that when _hw_features.rx_csum_offload is on, the receive
+      // code for ip, tcp and udp will assume they don't need to check
+      // the checksum again, because we did this here.
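+      // (The PKT_RX_*_CKSUM_BAD flags are only consulted here because
+      // rx_csum_offload was set after verifying the port's capabilities in
+      // init_port_start().)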
+ } + + p->set_offload_info(oi); + if (m->ol_flags & PKT_RX_RSS_HASH) { + p->set_rss_hash(m->hash.rss); + } + + _dev->l2receive(_qid, std::move(*p)); + } + + perf_logger->inc(l_dpdk_qp_rx_packets, count); + perf_logger->set(l_dpdk_qp_rx_last_bunch, count); + perf_logger->inc(l_dpdk_qp_rx_fragments, nr_frags); + perf_logger->inc(l_dpdk_qp_rx_bytes, bytes); +} + +bool DPDKQueuePair::poll_rx_once() +{ + struct rte_mbuf *buf[packet_read_size]; + + /* read a port */ +#ifdef CEPH_PERF_DEV + uint64_t start = Cycles::rdtsc(); +#endif + uint16_t count = rte_eth_rx_burst(_dev_port_idx, _qid, + buf, packet_read_size); + + /* Now process the NIC packets read */ + if (likely(count > 0)) { + process_packets(buf, count); +#ifdef CEPH_PERF_DEV + rx_cycles = Cycles::rdtsc() - start; + rx_count += count; +#endif + } +#ifdef CEPH_PERF_DEV + else { + if (rx_count > 10000 && tx_count) { + ldout(cct, 0) << __func__ << " rx count=" << rx_count << " avg rx=" << Cycles::to_nanoseconds(rx_cycles)/rx_count << "ns " + << " tx count=" << tx_count << " avg tx=" << Cycles::to_nanoseconds(tx_cycles)/tx_count << "ns" + << dendl; + rx_count = rx_cycles = tx_count = tx_cycles = 0; + } + } +#endif + + return count; +} + +DPDKQueuePair::tx_buf_factory::tx_buf_factory(CephContext *c, + DPDKDevice *dev, uint8_t qid): cct(c) +{ + std::string name = std::string(pktmbuf_pool_name) + std::to_string(qid) + "_tx"; + + _pool = rte_mempool_lookup(name.c_str()); + if (!_pool) { + ldout(cct, 0) << __func__ << " Creating Tx mbuf pool '" << name.c_str() + << "' [" << mbufs_per_queue_tx << " mbufs] ..." << dendl; + // + // We are going to push the buffers from the mempool into + // the circular_buffer and then poll them from there anyway, so + // we prefer to make a mempool non-atomic in this case. + // + _pool = rte_mempool_create(name.c_str(), + mbufs_per_queue_tx, inline_mbuf_size, + mbuf_cache_size, + sizeof(struct rte_pktmbuf_pool_private), + rte_pktmbuf_pool_init, nullptr, + rte_pktmbuf_init, nullptr, + rte_socket_id(), 0); + + if (!_pool) { + lderr(cct) << __func__ << " Failed to create mempool for Tx" << dendl; + ceph_abort(); + } + if (rte_eth_tx_queue_setup(dev->port_idx(), qid, default_ring_size, + rte_eth_dev_socket_id(dev->port_idx()), + dev->def_tx_conf()) < 0) { + lderr(cct) << __func__ << " cannot initialize tx queue" << dendl; + ceph_abort(); + } + } + + // + // Fill the factory with the buffers from the mempool allocated + // above. + // + init_factory(); +} + +bool DPDKQueuePair::tx_buf::i40e_should_linearize(rte_mbuf *head) +{ + bool is_tso = head->ol_flags & PKT_TX_TCP_SEG; + + // For a non-TSO case: number of fragments should not exceed 8 + if (!is_tso){ + return head->nb_segs > i40e_max_xmit_segment_frags; + } + + // + // For a TSO case each MSS window should not include more than 8 + // fragments including headers. + // + + // Calculate the number of frags containing headers. + // + // Note: we support neither VLAN nor tunneling thus headers size + // accounting is super simple. + // + size_t headers_size = head->l2_len + head->l3_len + head->l4_len; + unsigned hdr_frags = 0; + size_t cur_payload_len = 0; + rte_mbuf *cur_seg = head; + + while (cur_seg && cur_payload_len < headers_size) { + cur_payload_len += cur_seg->data_len; + cur_seg = cur_seg->next; + hdr_frags++; + } + + // + // Header fragments will be used for each TSO segment, thus the + // maximum number of data segments will be 8 minus the number of + // header fragments. 
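+  // (Worked example with made-up numbers: a 54-byte l2+l3+l4 header carried
+  // in a single fragment gives hdr_frags = 1, so each MSS window may span at
+  // most 8 - 1 = 7 data fragments before linearization is required.)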
+ // + // It's unclear from the spec how the first TSO segment is treated + // if the last fragment with headers contains some data bytes: + // whether this fragment will be accounted as a single fragment or + // as two separate fragments. We prefer to play it safe and assume + // that this fragment will be accounted as two separate fragments. + // + size_t max_win_size = i40e_max_xmit_segment_frags - hdr_frags; + + if (head->nb_segs <= max_win_size) { + return false; + } + + // Get the data (without headers) part of the first data fragment + size_t prev_frag_data = cur_payload_len - headers_size; + auto mss = head->tso_segsz; + + while (cur_seg) { + unsigned frags_in_seg = 0; + size_t cur_seg_size = 0; + + if (prev_frag_data) { + cur_seg_size = prev_frag_data; + frags_in_seg++; + prev_frag_data = 0; + } + + while (cur_seg_size < mss && cur_seg) { + cur_seg_size += cur_seg->data_len; + cur_seg = cur_seg->next; + frags_in_seg++; + + if (frags_in_seg > max_win_size) { + return true; + } + } + + if (cur_seg_size > mss) { + prev_frag_data = cur_seg_size - mss; + } + } + + return false; +} + +void DPDKQueuePair::tx_buf::set_cluster_offload_info(const Packet& p, const DPDKQueuePair& qp, rte_mbuf* head) +{ + // Handle TCP checksum offload + auto oi = p.offload_info(); + if (oi.needs_ip_csum) { + head->ol_flags |= PKT_TX_IP_CKSUM; + // TODO: Take a VLAN header into an account here + head->l2_len = sizeof(struct ether_hdr); + head->l3_len = oi.ip_hdr_len; + } + if (qp.port().get_hw_features().tx_csum_l4_offload) { + if (oi.protocol == ip_protocol_num::tcp) { + head->ol_flags |= PKT_TX_TCP_CKSUM; + // TODO: Take a VLAN header into an account here + head->l2_len = sizeof(struct ether_hdr); + head->l3_len = oi.ip_hdr_len; + + if (oi.tso_seg_size) { + ceph_assert(oi.needs_ip_csum); + head->ol_flags |= PKT_TX_TCP_SEG; + head->l4_len = oi.tcp_hdr_len; + head->tso_segsz = oi.tso_seg_size; + } + } + } +} + +DPDKQueuePair::tx_buf* DPDKQueuePair::tx_buf::from_packet_zc( + CephContext *cct, Packet&& p, DPDKQueuePair& qp) +{ + // Too fragmented - linearize + if (p.nr_frags() > max_frags) { + p.linearize(); + qp.perf_logger->inc(l_dpdk_qp_tx_linearize_ops); + } + + build_mbuf_cluster: + rte_mbuf *head = nullptr, *last_seg = nullptr; + unsigned nsegs = 0; + + // + // Create a HEAD of the fragmented packet: check if frag0 has to be + // copied and if yes - send it in a copy way + // + if (!check_frag0(p)) { + if (!copy_one_frag(qp, p.frag(0), head, last_seg, nsegs)) { + ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(0).size << dendl; + return nullptr; + } + } else if (!translate_one_frag(qp, p.frag(0), head, last_seg, nsegs)) { + ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(0).size << dendl; + return nullptr; + } + + unsigned total_nsegs = nsegs; + + for (unsigned i = 1; i < p.nr_frags(); i++) { + rte_mbuf *h = nullptr, *new_last_seg = nullptr; + if (!translate_one_frag(qp, p.frag(i), h, new_last_seg, nsegs)) { + ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(i).size << dendl; + me(head)->recycle(); + return nullptr; + } + + total_nsegs += nsegs; + + // Attach a new buffers' chain to the packet chain + last_seg->next = h; + last_seg = new_last_seg; + } + + // Update the HEAD buffer with the packet info + head->pkt_len = p.len(); + head->nb_segs = total_nsegs; + + set_cluster_offload_info(p, qp, head); + + // + // If a packet hasn't been linearized already and the resulting + // cluster requires the linearisation due to HW limitation: + // + // - Recycle the 
cluster. + // - Linearize the packet. + // - Build the cluster once again + // + if (head->nb_segs > max_frags || + (p.nr_frags() > 1 && qp.port().is_i40e_device() && i40e_should_linearize(head)) || + (p.nr_frags() > vmxnet3_max_xmit_segment_frags && qp.port().is_vmxnet3_device())) { + me(head)->recycle(); + p.linearize(); + qp.perf_logger->inc(l_dpdk_qp_tx_linearize_ops); + + goto build_mbuf_cluster; + } + + me(last_seg)->set_packet(std::move(p)); + + return me(head); +} + +void DPDKQueuePair::tx_buf::copy_packet_to_cluster(const Packet& p, rte_mbuf* head) +{ + rte_mbuf* cur_seg = head; + size_t cur_seg_offset = 0; + unsigned cur_frag_idx = 0; + size_t cur_frag_offset = 0; + + while (true) { + size_t to_copy = std::min(p.frag(cur_frag_idx).size - cur_frag_offset, + inline_mbuf_data_size - cur_seg_offset); + + memcpy(rte_pktmbuf_mtod_offset(cur_seg, void*, cur_seg_offset), + p.frag(cur_frag_idx).base + cur_frag_offset, to_copy); + + cur_frag_offset += to_copy; + cur_seg_offset += to_copy; + + if (cur_frag_offset >= p.frag(cur_frag_idx).size) { + ++cur_frag_idx; + if (cur_frag_idx >= p.nr_frags()) { + // + // We are done - set the data size of the last segment + // of the cluster. + // + cur_seg->data_len = cur_seg_offset; + break; + } + + cur_frag_offset = 0; + } + + if (cur_seg_offset >= inline_mbuf_data_size) { + cur_seg->data_len = inline_mbuf_data_size; + cur_seg = cur_seg->next; + cur_seg_offset = 0; + + // FIXME: assert in a fast-path - remove!!! + ceph_assert(cur_seg); + } + } +} + +DPDKQueuePair::tx_buf* DPDKQueuePair::tx_buf::from_packet_copy(Packet&& p, DPDKQueuePair& qp) +{ + // sanity + if (!p.len()) { + return nullptr; + } + + /* + * Here we are going to use the fact that the inline data size is a + * power of two. + * + * We will first try to allocate the cluster and only if we are + * successful - we will go and copy the data. + */ + auto aligned_len = align_up((size_t)p.len(), inline_mbuf_data_size); + unsigned nsegs = aligned_len / inline_mbuf_data_size; + rte_mbuf *head = nullptr, *last_seg = nullptr; + + tx_buf* buf = qp.get_tx_buf(); + if (!buf) { + return nullptr; + } + + head = buf->rte_mbuf_p(); + last_seg = head; + for (unsigned i = 1; i < nsegs; i++) { + buf = qp.get_tx_buf(); + if (!buf) { + me(head)->recycle(); + return nullptr; + } + + last_seg->next = buf->rte_mbuf_p(); + last_seg = last_seg->next; + } + + // + // If we've got here means that we have succeeded already! + // We only need to copy the data and set the head buffer with the + // relevant info. + // + head->pkt_len = p.len(); + head->nb_segs = nsegs; + + copy_packet_to_cluster(p, head); + set_cluster_offload_info(p, qp, head); + + return me(head); +} + +size_t DPDKQueuePair::tx_buf::copy_one_data_buf( + DPDKQueuePair& qp, rte_mbuf*& m, char* data, size_t buf_len) +{ + tx_buf* buf = qp.get_tx_buf(); + if (!buf) { + return 0; + } + + size_t len = std::min(buf_len, inline_mbuf_data_size); + + m = buf->rte_mbuf_p(); + + // mbuf_put() + m->data_len = len; + m->pkt_len = len; + + qp.perf_logger->inc(l_dpdk_qp_tx_copy_ops); + qp.perf_logger->inc(l_dpdk_qp_tx_copy_bytes, len); + + memcpy(rte_pktmbuf_mtod(m, void*), data, len); + + return len; +} + +void DPDKDevice::set_rss_table() +{ + // always fill our local indirection table. 
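+  // (Example: with _num_queues == 3 and a 128-entry table this produces the
+  // round-robin pattern 0,1,2,0,1,2,... so hash2qid() spreads flows roughly
+  // evenly across the queues.)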
+ unsigned i = 0; + for (auto& r : _redir_table) { + r = i++ % _num_queues; + } + + if (_dev_info.reta_size == 0) + return; + + int reta_conf_size = std::max(1, _dev_info.reta_size / RTE_RETA_GROUP_SIZE); + rte_eth_rss_reta_entry64 reta_conf[reta_conf_size]; + + // Configure the HW indirection table + i = 0; + for (auto& x : reta_conf) { + x.mask = ~0ULL; + for (auto& r: x.reta) { + r = i++ % _num_queues; + } + } + + if (rte_eth_dev_rss_reta_update(_port_idx, reta_conf, _dev_info.reta_size)) { + rte_exit(EXIT_FAILURE, "Port %d: Failed to update an RSS indirection table", _port_idx); + } +} + +/******************************** Interface functions *************************/ + +std::unique_ptr<DPDKDevice> create_dpdk_net_device( + CephContext *cct, + unsigned cores, + uint8_t port_idx, + bool use_lro, + bool enable_fc) +{ + // Check that we have at least one DPDK-able port + if (rte_eth_dev_count() == 0) { + rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n"); + } else { + ldout(cct, 10) << __func__ << " ports number: " << int(rte_eth_dev_count()) << dendl; + } + + return std::unique_ptr<DPDKDevice>( + new DPDKDevice(cct, port_idx, cores, use_lro, enable_fc)); +} diff --git a/src/msg/async/dpdk/DPDK.h b/src/msg/async/dpdk/DPDK.h new file mode 100644 index 00000000..fa12af6b --- /dev/null +++ b/src/msg/async/dpdk/DPDK.h @@ -0,0 +1,918 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
+ */ + +#ifndef CEPH_DPDK_DEV_H +#define CEPH_DPDK_DEV_H + +#include <memory> +#include <functional> +#include <rte_config.h> +#include <rte_common.h> +#include <rte_ethdev.h> +#include <rte_malloc.h> +#include <rte_version.h> + +#include "include/page.h" +#include "common/Tub.h" +#include "common/perf_counters.h" +#include "msg/async/Event.h" +#include "const.h" +#include "circular_buffer.h" +#include "ethernet.h" +#include "Packet.h" +#include "stream.h" +#include "net.h" +#include "toeplitz.h" + + +struct free_deleter { + void operator()(void* p) { ::free(p); } +}; + + +enum { + l_dpdk_dev_first = 58800, + l_dpdk_dev_rx_mcast, + l_dpdk_dev_rx_total_errors, + l_dpdk_dev_tx_total_errors, + l_dpdk_dev_rx_badcrc_errors, + l_dpdk_dev_rx_dropped_errors, + l_dpdk_dev_rx_nombuf_errors, + l_dpdk_dev_last +}; + +enum { + l_dpdk_qp_first = 58900, + l_dpdk_qp_rx_packets, + l_dpdk_qp_tx_packets, + l_dpdk_qp_rx_bad_checksum_errors, + l_dpdk_qp_rx_no_memory_errors, + l_dpdk_qp_rx_bytes, + l_dpdk_qp_tx_bytes, + l_dpdk_qp_rx_last_bunch, + l_dpdk_qp_tx_last_bunch, + l_dpdk_qp_rx_fragments, + l_dpdk_qp_tx_fragments, + l_dpdk_qp_rx_copy_ops, + l_dpdk_qp_tx_copy_ops, + l_dpdk_qp_rx_copy_bytes, + l_dpdk_qp_tx_copy_bytes, + l_dpdk_qp_rx_linearize_ops, + l_dpdk_qp_tx_linearize_ops, + l_dpdk_qp_tx_queue_length, + l_dpdk_qp_last +}; + +class DPDKDevice; +class DPDKWorker; + +class DPDKQueuePair { + using packet_provider_type = std::function<Tub<Packet> ()>; + public: + void configure_proxies(const std::map<unsigned, float>& cpu_weights); + // build REdirection TAble for cpu_weights map: target cpu -> weight + void build_sw_reta(const std::map<unsigned, float>& cpu_weights); + void proxy_send(Packet p) { + _proxy_packetq.push_back(std::move(p)); + } + void register_packet_provider(packet_provider_type func) { + _pkt_providers.push_back(std::move(func)); + } + bool poll_tx(); + friend class DPDKDevice; + + class tx_buf_factory; + + class tx_buf { + friend class DPDKQueuePair; + public: + static tx_buf* me(rte_mbuf* mbuf) { + return reinterpret_cast<tx_buf*>(mbuf); + } + + private: + /** + * Checks if the original packet of a given cluster should be linearized + * due to HW limitations. + * + * @param head head of a cluster to check + * + * @return TRUE if a packet should be linearized. + */ + static bool i40e_should_linearize(rte_mbuf *head); + + /** + * Sets the offload info in the head buffer of an rte_mbufs cluster. + * + * @param p an original packet the cluster is built for + * @param qp QP handle + * @param head a head of an rte_mbufs cluster + */ + static void set_cluster_offload_info(const Packet& p, const DPDKQueuePair& qp, rte_mbuf* head); + + /** + * Creates a tx_buf cluster representing a given packet in a "zero-copy" + * way. + * + * @param p packet to translate + * @param qp DPDKQueuePair handle + * + * @return the HEAD tx_buf of the cluster or nullptr in case of a + * failure + */ + static tx_buf* from_packet_zc( + CephContext *cct, Packet&& p, DPDKQueuePair& qp); + + /** + * Copy the contents of the "packet" into the given cluster of + * rte_mbuf's. + * + * @note Size of the cluster has to be big enough to accommodate all the + * contents of the given packet. + * + * @param p packet to copy + * @param head head of the rte_mbuf's cluster + */ + static void copy_packet_to_cluster(const Packet& p, rte_mbuf* head); + + /** + * Creates a tx_buf cluster representing a given packet in a "copy" way. 
+ * + * @param p packet to translate + * @param qp DPDKQueuePair handle + * + * @return the HEAD tx_buf of the cluster or nullptr in case of a + * failure + */ + static tx_buf* from_packet_copy(Packet&& p, DPDKQueuePair& qp); + + /** + * Zero-copy handling of a single fragment. + * + * @param do_one_buf Functor responsible for a single rte_mbuf + * handling + * @param qp DPDKQueuePair handle (in) + * @param frag Fragment to copy (in) + * @param head Head of the cluster (out) + * @param last_seg Last segment of the cluster (out) + * @param nsegs Number of segments in the cluster (out) + * + * @return TRUE in case of success + */ + template <class DoOneBufFunc> + static bool do_one_frag(DoOneBufFunc do_one_buf, DPDKQueuePair& qp, + fragment& frag, rte_mbuf*& head, + rte_mbuf*& last_seg, unsigned& nsegs) { + size_t len, left_to_set = frag.size; + char* base = frag.base; + + rte_mbuf* m; + + // TODO: ceph_assert() in a fast path! Remove me ASAP! + ceph_assert(frag.size); + + // Create a HEAD of mbufs' cluster and set the first bytes into it + len = do_one_buf(qp, head, base, left_to_set); + if (!len) { + return false; + } + + left_to_set -= len; + base += len; + nsegs = 1; + + // + // Set the rest of the data into the new mbufs and chain them to + // the cluster. + // + rte_mbuf* prev_seg = head; + while (left_to_set) { + len = do_one_buf(qp, m, base, left_to_set); + if (!len) { + me(head)->recycle(); + return false; + } + + left_to_set -= len; + base += len; + nsegs++; + + prev_seg->next = m; + prev_seg = m; + } + + // Return the last mbuf in the cluster + last_seg = prev_seg; + + return true; + } + + /** + * Zero-copy handling of a single fragment. + * + * @param qp DPDKQueuePair handle (in) + * @param frag Fragment to copy (in) + * @param head Head of the cluster (out) + * @param last_seg Last segment of the cluster (out) + * @param nsegs Number of segments in the cluster (out) + * + * @return TRUE in case of success + */ + static bool translate_one_frag(DPDKQueuePair& qp, fragment& frag, + rte_mbuf*& head, rte_mbuf*& last_seg, + unsigned& nsegs) { + return do_one_frag(set_one_data_buf, qp, frag, head, + last_seg, nsegs); + } + + /** + * Copies one fragment into the cluster of rte_mbuf's. + * + * @param qp DPDKQueuePair handle (in) + * @param frag Fragment to copy (in) + * @param head Head of the cluster (out) + * @param last_seg Last segment of the cluster (out) + * @param nsegs Number of segments in the cluster (out) + * + * We return the "last_seg" to avoid traversing the cluster in order to get + * it. + * + * @return TRUE in case of success + */ + static bool copy_one_frag(DPDKQueuePair& qp, fragment& frag, + rte_mbuf*& head, rte_mbuf*& last_seg, + unsigned& nsegs) { + return do_one_frag(copy_one_data_buf, qp, frag, head, + last_seg, nsegs); + } + + /** + * Allocates a single rte_mbuf and sets it to point to a given data + * buffer. 
+ * + * @param qp DPDKQueuePair handle (in) + * @param m New allocated rte_mbuf (out) + * @param va virtual address of a data buffer (in) + * @param buf_len length of the data to copy (in) + * + * @return The actual number of bytes that has been set in the mbuf + */ + static size_t set_one_data_buf( + DPDKQueuePair& qp, rte_mbuf*& m, char* va, size_t buf_len) { + static constexpr size_t max_frag_len = 15 * 1024; // 15K + + // FIXME: current all tx buf is allocated without rte_malloc + return copy_one_data_buf(qp, m, va, buf_len); + // + // Currently we break a buffer on a 15K boundary because 82599 + // devices have a 15.5K limitation on a maximum single fragment + // size. + // + rte_iova_t pa = rte_malloc_virt2iova(va); + if (!pa) + return copy_one_data_buf(qp, m, va, buf_len); + + ceph_assert(buf_len); + tx_buf* buf = qp.get_tx_buf(); + if (!buf) { + return 0; + } + + size_t len = std::min(buf_len, max_frag_len); + + buf->set_zc_info(va, pa, len); + m = buf->rte_mbuf_p(); + + return len; + } + + /** + * Allocates a single rte_mbuf and copies a given data into it. + * + * @param qp DPDKQueuePair handle (in) + * @param m New allocated rte_mbuf (out) + * @param data Data to copy from (in) + * @param buf_len length of the data to copy (in) + * + * @return The actual number of bytes that has been copied + */ + static size_t copy_one_data_buf( + DPDKQueuePair& qp, rte_mbuf*& m, char* data, size_t buf_len); + + /** + * Checks if the first fragment of the given packet satisfies the + * zero-copy flow requirement: its first 128 bytes should not cross the + * 4K page boundary. This is required in order to avoid splitting packet + * headers. + * + * @param p packet to check + * + * @return TRUE if packet is ok and FALSE otherwise. + */ + static bool check_frag0(Packet& p) + { + // + // First frag is special - it has headers that should not be split. + // If the addressing is such that the first fragment has to be + // split, then send this packet in a (non-zero) copy flow. We'll + // check if the first 128 bytes of the first fragment reside in the + // physically contiguous area. If that's the case - we are good to + // go. + // + if (p.frag(0).size < 128) + return false; + + return true; + } + + public: + tx_buf(tx_buf_factory& fc) : _fc(fc) { + + _buf_physaddr = _mbuf.buf_physaddr; + _data_off = _mbuf.data_off; + } + + rte_mbuf* rte_mbuf_p() { return &_mbuf; } + + void set_zc_info(void* va, phys_addr_t pa, size_t len) { + // mbuf_put() + _mbuf.data_len = len; + _mbuf.pkt_len = len; + + // Set the mbuf to point to our data + _mbuf.buf_addr = va; + _mbuf.buf_physaddr = pa; + _mbuf.data_off = 0; + _is_zc = true; + } + + void reset_zc() { + + // + // If this mbuf was the last in a cluster and contains an + // original packet object then call the destructor of the + // original packet object. + // + if (_p) { + // + // Reset the std::optional. This in particular is going + // to call the "packet"'s destructor and reset the + // "optional" state to "nonengaged". 
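+      // (Tub is Ceph's optional-like holder from common/Tub.h; destroying
+      // the Packet here runs its deleter, which in turn releases the
+      // user-supplied buffers.)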
+ // + _p.destroy(); + + } else if (!_is_zc) { + return; + } + + // Restore the rte_mbuf fields we trashed in set_zc_info() + _mbuf.buf_physaddr = _buf_physaddr; + _mbuf.buf_addr = rte_mbuf_to_baddr(&_mbuf); + _mbuf.data_off = _data_off; + + _is_zc = false; + } + + void recycle() { + struct rte_mbuf *m = &_mbuf, *m_next; + + while (m != nullptr) { + m_next = m->next; + rte_pktmbuf_reset(m); + _fc.put(me(m)); + m = m_next; + } + } + + void set_packet(Packet&& p) { + _p = std::move(p); + } + + private: + struct rte_mbuf _mbuf; + MARKER private_start; + Tub<Packet> _p; + phys_addr_t _buf_physaddr; + uint16_t _data_off; + // TRUE if underlying mbuf has been used in the zero-copy flow + bool _is_zc = false; + // buffers' factory the buffer came from + tx_buf_factory& _fc; + MARKER private_end; + }; + + class tx_buf_factory { + // + // Number of buffers to free in each GC iteration: + // We want the buffers to be allocated from the mempool as many as + // possible. + // + // On the other hand if there is no Tx for some time we want the + // completions to be eventually handled. Thus we choose the smallest + // possible packets count number here. + // + static constexpr int gc_count = 1; + public: + tx_buf_factory(CephContext *c, DPDKDevice *dev, uint8_t qid); + ~tx_buf_factory() { + // put all mbuf back into mempool in order to make the next factory work + while (gc()); + rte_mempool_put_bulk(_pool, (void**)_ring.data(), + _ring.size()); + } + + + /** + * @note Should not be called if there are no free tx_buf's + * + * @return a free tx_buf object + */ + tx_buf* get() { + // Take completed from the HW first + tx_buf *pkt = get_one_completed(); + if (pkt) { + pkt->reset_zc(); + return pkt; + } + + // + // If there are no completed at the moment - take from the + // factory's cache. + // + if (_ring.empty()) { + return nullptr; + } + + pkt = _ring.back(); + _ring.pop_back(); + + return pkt; + } + + void put(tx_buf* buf) { + buf->reset_zc(); + _ring.push_back(buf); + } + + bool gc() { + for (int cnt = 0; cnt < gc_count; ++cnt) { + auto tx_buf_p = get_one_completed(); + if (!tx_buf_p) { + return false; + } + + put(tx_buf_p); + } + + return true; + } + private: + /** + * Fill the mbufs circular buffer: after this the _pool will become + * empty. We will use it to catch the completed buffers: + * + * - Underlying PMD drivers will "free" the mbufs once they are + * completed. + * - We will poll the _pktmbuf_pool_tx till it's empty and release + * all the buffers from the freed mbufs. + */ + void init_factory() { + while (rte_mbuf* mbuf = rte_pktmbuf_alloc(_pool)) { + _ring.push_back(new(tx_buf::me(mbuf)) tx_buf{*this}); + } + } + + /** + * PMD puts the completed buffers back into the mempool they have + * originally come from. + * + * @note rte_pktmbuf_alloc() resets the mbuf so there is no need to call + * rte_pktmbuf_reset() here again. + * + * @return a single tx_buf that has been completed by HW. 
+ */ + tx_buf* get_one_completed() { + return tx_buf::me(rte_pktmbuf_alloc(_pool)); + } + + private: + CephContext *cct; + std::vector<tx_buf*> _ring; + rte_mempool* _pool = nullptr; + }; + + public: + explicit DPDKQueuePair(CephContext *c, EventCenter *cen, DPDKDevice* dev, uint8_t qid); + ~DPDKQueuePair() { + if (device_stat_time_fd) { + center->delete_time_event(device_stat_time_fd); + } + rx_gc(true); + } + + void rx_start() { + _rx_poller.construct(this); + } + + uint32_t send(circular_buffer<Packet>& pb) { + // Zero-copy send + return _send(pb, [&] (Packet&& p) { + return tx_buf::from_packet_zc(cct, std::move(p), *this); + }); + } + + DPDKDevice& port() const { return *_dev; } + tx_buf* get_tx_buf() { return _tx_buf_factory.get(); } + + void handle_stats(); + + private: + template <class Func> + uint32_t _send(circular_buffer<Packet>& pb, Func &&packet_to_tx_buf_p) { + if (_tx_burst.size() == 0) { + for (auto&& p : pb) { + // TODO: ceph_assert() in a fast path! Remove me ASAP! + ceph_assert(p.len()); + + tx_buf* buf = packet_to_tx_buf_p(std::move(p)); + if (!buf) { + break; + } + + _tx_burst.push_back(buf->rte_mbuf_p()); + } + } + + uint16_t sent = rte_eth_tx_burst(_dev_port_idx, _qid, + _tx_burst.data() + _tx_burst_idx, + _tx_burst.size() - _tx_burst_idx); + + uint64_t nr_frags = 0, bytes = 0; + + for (int i = 0; i < sent; i++) { + rte_mbuf* m = _tx_burst[_tx_burst_idx + i]; + bytes += m->pkt_len; + nr_frags += m->nb_segs; + pb.pop_front(); + } + + perf_logger->inc(l_dpdk_qp_tx_fragments, nr_frags); + perf_logger->inc(l_dpdk_qp_tx_bytes, bytes); + + _tx_burst_idx += sent; + + if (_tx_burst_idx == _tx_burst.size()) { + _tx_burst_idx = 0; + _tx_burst.clear(); + } + + return sent; + } + + /** + * Allocate a new data buffer and set the mbuf to point to it. + * + * Do some DPDK hacks to work on PMD: it assumes that the buf_addr + * points to the private data of RTE_PKTMBUF_HEADROOM before the actual + * data buffer. + * + * @param m mbuf to update + */ + static bool refill_rx_mbuf(rte_mbuf* m, size_t size, + std::vector<void*> &datas) { + if (datas.empty()) + return false; + void *data = datas.back(); + datas.pop_back(); + + // + // Set the mbuf to point to our data. + // + // Do some DPDK hacks to work on PMD: it assumes that the buf_addr + // points to the private data of RTE_PKTMBUF_HEADROOM before the + // actual data buffer. + // + m->buf_addr = (char*)data - RTE_PKTMBUF_HEADROOM; + m->buf_physaddr = rte_mem_virt2phy(data) - RTE_PKTMBUF_HEADROOM; + return true; + } + + bool init_rx_mbuf_pool(); + bool rx_gc(bool force=false); + bool refill_one_cluster(rte_mbuf* head); + + /** + * Polls for a burst of incoming packets. This function will not block and + * will immediately return after processing all available packets. + * + */ + bool poll_rx_once(); + + /** + * Translates an rte_mbuf's into packet and feeds them to _rx_stream. + * + * @param bufs An array of received rte_mbuf's + * @param count Number of buffers in the bufs[] + */ + void process_packets(struct rte_mbuf **bufs, uint16_t count); + + /** + * Translate rte_mbuf into the "packet". + * @param m mbuf to translate + * + * @return a "optional" object representing the newly received data if in an + * "engaged" state or an error if in a "disengaged" state. + */ + Tub<Packet> from_mbuf(rte_mbuf* m); + + /** + * Transform an LRO rte_mbuf cluster into the "packet" object. 
+ * @param m HEAD of the mbufs' cluster to transform + * + * @return a "optional" object representing the newly received LRO packet if + * in an "engaged" state or an error if in a "disengaged" state. + */ + Tub<Packet> from_mbuf_lro(rte_mbuf* m); + + private: + CephContext *cct; + std::vector<packet_provider_type> _pkt_providers; + Tub<std::array<uint8_t, 128>> _sw_reta; + circular_buffer<Packet> _proxy_packetq; + stream<Packet> _rx_stream; + circular_buffer<Packet> _tx_packetq; + std::vector<void*> _alloc_bufs; + + PerfCounters *perf_logger; + DPDKDevice* _dev; + uint8_t _dev_port_idx; + EventCenter *center; + uint8_t _qid; + rte_mempool *_pktmbuf_pool_rx; + std::vector<rte_mbuf*> _rx_free_pkts; + std::vector<rte_mbuf*> _rx_free_bufs; + std::vector<fragment> _frags; + std::vector<char*> _bufs; + size_t _num_rx_free_segs = 0; + uint64_t device_stat_time_fd = 0; + +#ifdef CEPH_PERF_DEV + uint64_t rx_cycles = 0; + uint64_t rx_count = 0; + uint64_t tx_cycles = 0; + uint64_t tx_count = 0; +#endif + + class DPDKTXPoller : public EventCenter::Poller { + DPDKQueuePair *qp; + + public: + explicit DPDKTXPoller(DPDKQueuePair *qp) + : EventCenter::Poller(qp->center, "DPDK::DPDKTXPoller"), qp(qp) {} + + virtual int poll() { + return qp->poll_tx(); + } + } _tx_poller; + + class DPDKRXGCPoller : public EventCenter::Poller { + DPDKQueuePair *qp; + + public: + explicit DPDKRXGCPoller(DPDKQueuePair *qp) + : EventCenter::Poller(qp->center, "DPDK::DPDKRXGCPoller"), qp(qp) {} + + virtual int poll() { + return qp->rx_gc(); + } + } _rx_gc_poller; + tx_buf_factory _tx_buf_factory; + class DPDKRXPoller : public EventCenter::Poller { + DPDKQueuePair *qp; + + public: + explicit DPDKRXPoller(DPDKQueuePair *qp) + : EventCenter::Poller(qp->center, "DPDK::DPDKRXPoller"), qp(qp) {} + + virtual int poll() { + return qp->poll_rx_once(); + } + }; + Tub<DPDKRXPoller> _rx_poller; + class DPDKTXGCPoller : public EventCenter::Poller { + DPDKQueuePair *qp; + + public: + explicit DPDKTXGCPoller(DPDKQueuePair *qp) + : EventCenter::Poller(qp->center, "DPDK::DPDKTXGCPoller"), qp(qp) {} + + virtual int poll() { + return qp->_tx_buf_factory.gc(); + } + } _tx_gc_poller; + std::vector<rte_mbuf*> _tx_burst; + uint16_t _tx_burst_idx = 0; +}; + +class DPDKDevice { + public: + CephContext *cct; + PerfCounters *perf_logger; + std::vector<std::unique_ptr<DPDKQueuePair>> _queues; + std::vector<DPDKWorker*> workers; + size_t _rss_table_bits = 0; + uint8_t _port_idx; + uint16_t _num_queues; + unsigned cores; + hw_features _hw_features; + uint8_t _queues_ready = 0; + unsigned _home_cpu; + bool _use_lro; + bool _enable_fc; + std::vector<uint8_t> _redir_table; + rss_key_type _rss_key; + bool _is_i40e_device = false; + bool _is_vmxnet3_device = false; + + public: + rte_eth_dev_info _dev_info = {}; + + /** + * The final stage of a port initialization. + * @note Must be called *after* all queues from stage (2) have been + * initialized. + */ + int init_port_fini(); + + private: + /** + * Port initialization consists of 3 main stages: + * 1) General port initialization which ends with a call to + * rte_eth_dev_configure() where we request the needed number of Rx and + * Tx queues. + * 2) Individual queues initialization. This is done in the constructor of + * DPDKQueuePair class. In particular the memory pools for queues are allocated + * in this stage. + * 3) The final stage of the initialization which starts with the call of + * rte_eth_dev_start() after which the port becomes fully functional. 
We
+ * will also wait for a link to get up in this stage.
+ */
+
+
+ /**
+  * First stage of the port initialization.
+  *
+  * @return 0 in case of success and an appropriate error code in case of an
+  * error.
+  */
+ int init_port_start();
+
+ /**
+  * Check the link status of our port for up to 9 seconds, and finally print it.
+  */
+ int check_port_link_status();
+
+ /**
+  * Configures the HW Flow Control.
+  */
+ void set_hw_flow_control();
+
+ public:
+ DPDKDevice(CephContext *c, uint8_t port_idx, uint16_t num_queues, bool use_lro, bool enable_fc):
+   cct(c), _port_idx(port_idx), _num_queues(num_queues),
+   _home_cpu(0), _use_lro(use_lro),
+   _enable_fc(enable_fc) {
+   _queues = std::vector<std::unique_ptr<DPDKQueuePair>>(_num_queues);
+   /* now initialise the port we will use */
+   int ret = init_port_start();
+   if (ret != 0) {
+     rte_exit(EXIT_FAILURE, "Cannot initialise port %u\n", _port_idx);
+   }
+   string name(std::string("port") + std::to_string(port_idx));
+   PerfCountersBuilder plb(cct, name, l_dpdk_dev_first, l_dpdk_dev_last);
+
+   plb.add_u64_counter(l_dpdk_dev_rx_mcast, "dpdk_device_receive_multicast_packets", "DPDK received multicast packets");
+   plb.add_u64_counter(l_dpdk_dev_rx_badcrc_errors, "dpdk_device_receive_badcrc_errors", "DPDK received bad CRC errors");
+
+   plb.add_u64_counter(l_dpdk_dev_rx_total_errors, "dpdk_device_receive_total_errors", "DPDK received total errors");
+   plb.add_u64_counter(l_dpdk_dev_tx_total_errors, "dpdk_device_send_total_errors", "DPDK send total errors");
+   plb.add_u64_counter(l_dpdk_dev_rx_dropped_errors, "dpdk_device_receive_dropped_errors", "DPDK received dropped errors");
+   plb.add_u64_counter(l_dpdk_dev_rx_nombuf_errors, "dpdk_device_receive_nombuf_errors", "DPDK received RX mbuf allocation errors");
+
+   perf_logger = plb.create_perf_counters();
+   cct->get_perfcounters_collection()->add(perf_logger);
+ }
+
+ ~DPDKDevice() {
+   rte_eth_dev_stop(_port_idx);
+ }
+
+ DPDKQueuePair& queue_for_cpu(unsigned cpu) { return *_queues[cpu]; }
+ void l2receive(int qid, Packet p) {
+   _queues[qid]->_rx_stream.produce(std::move(p));
+ }
+ subscription<Packet> receive(unsigned cpuid, std::function<int (Packet)> next_packet) {
+   auto sub = _queues[cpuid]->_rx_stream.listen(std::move(next_packet));
+   _queues[cpuid]->rx_start();
+   return std::move(sub);
+ }
+ ethernet_address hw_address() {
+   struct ether_addr mac;
+   rte_eth_macaddr_get(_port_idx, &mac);
+
+   return mac.addr_bytes;
+ }
+ hw_features get_hw_features() {
+   return _hw_features;
+ }
+ const rss_key_type& rss_key() const { return _rss_key; }
+ uint16_t hw_queues_count() { return _num_queues; }
+ std::unique_ptr<DPDKQueuePair> init_local_queue(CephContext *c, EventCenter *center, string hugepages, uint16_t qid) {
+   std::unique_ptr<DPDKQueuePair> qp;
+   qp = std::unique_ptr<DPDKQueuePair>(new DPDKQueuePair(c, center, this, qid));
+   return std::move(qp);
+ }
+ unsigned hash2qid(uint32_t hash) {
+   // return hash % hw_queues_count();
+   return _redir_table[hash & (_redir_table.size() - 1)];
+ }
+ void set_local_queue(unsigned i, std::unique_ptr<DPDKQueuePair> qp) {
+   ceph_assert(!_queues[i]);
+   _queues[i] = std::move(qp);
+ }
+ void unset_local_queue(unsigned i) {
+   ceph_assert(_queues[i]);
+   _queues[i].reset();
+ }
+ template <typename Func>
+ unsigned forward_dst(unsigned src_cpuid, Func&& hashfn) {
+   auto& qp = queue_for_cpu(src_cpuid);
+   if (!qp._sw_reta)
+     return src_cpuid;
+
+   // _sw_reta is necessarily engaged past the early return above
+   ceph_assert(qp._sw_reta);
+   auto hash = hashfn() >> _rss_table_bits;
+   auto& reta = *qp._sw_reta;
+   return reta[hash % reta.size()];
+ }
+
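+ // Example (a sketch assuming a 128-entry, power-of-two _redir_table,
+ // which is what the masking in hash2qid() above relies on): for an RSS
+ // hash of 0x9e3779b9, hash & 127 == 57, so the packet is steered to the
+ // hardware queue stored in _redir_table[57]. forward_dst() then consults
+ // the per-queue software RETA, if one is engaged, using the *upper*
+ // hash bits (hashfn() >> _rss_table_bits) so that it does not reuse the
+ // low bits that already selected the queue.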
+ unsigned hash2cpu(uint32_t hash) {
+   // there is an assumption here that qid == get_id() which will
+   // not necessarily be true in the future
+   return forward_dst(hash2qid(hash), [hash] { return hash; });
+ }
+
+ hw_features& hw_features_ref() { return _hw_features; }
+
+ const rte_eth_rxconf* def_rx_conf() const {
+   return &_dev_info.default_rxconf;
+ }
+
+ const rte_eth_txconf* def_tx_conf() const {
+   return &_dev_info.default_txconf;
+ }
+
+ /**
+  * Set the RSS table in the device and store it in the internal vector.
+  */
+ void set_rss_table();
+
+ uint8_t port_idx() { return _port_idx; }
+ bool is_i40e_device() const {
+   return _is_i40e_device;
+ }
+ bool is_vmxnet3_device() const {
+   return _is_vmxnet3_device;
+ }
+};
+
+
+std::unique_ptr<DPDKDevice> create_dpdk_net_device(
+    CephContext *c, unsigned cores, uint8_t port_idx = 0,
+    bool use_lro = true, bool enable_fc = true);
+
+
+/**
+ * @return Number of bytes needed for mempool objects of each QP.
+ */
+uint32_t qp_mempool_obj_size();
+
+#endif // CEPH_DPDK_DEV_H
diff --git a/src/msg/async/dpdk/DPDKStack.cc b/src/msg/async/dpdk/DPDKStack.cc
new file mode 100644
index 00000000..3101ae57
--- /dev/null
+++ b/src/msg/async/dpdk/DPDKStack.cc
@@ -0,0 +1,281 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <memory>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <tuple>
+
+#include "common/ceph_argparse.h"
+#include "dpdk_rte.h"
+#include "DPDKStack.h"
+#include "DPDK.h"
+#include "IP.h"
+#include "TCP-Stack.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+#include "common/Cond.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdkstack "
+
+static int dpdk_thread_adaptor(void* f)
+{
+  (*static_cast<std::function<void ()>*>(f))();
+  return 0;
+}
+
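+// dpdk_thread_adaptor() exists because rte_eal_remote_launch() only
+// accepts a plain `int (*)(void *)` entry point, so spawn_worker()
+// (below) smuggles a std::function through the void* argument. A minimal
+// usage sketch (the lcore id and the lambda here are illustrative, not
+// taken from this file):
+//
+//   std::function<void ()> fn = [] { /* per-worker event loop */ };
+//   // run fn on lcore 1; dpdk_thread_adaptor unwraps and calls it
+//   rte_eal_remote_launch(dpdk_thread_adaptor, &fn, 1);
+//   ...
+//   rte_eal_wait_lcore(1);  // join the worker once it returns
+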
+void DPDKWorker::initialize()
+{
+  static enum {
+    WAIT_DEVICE_STAGE,
+    WAIT_PORT_FIN_STAGE,
+    DONE
+  } create_stage = WAIT_DEVICE_STAGE;
+  static Mutex lock("DPDKStack::lock");
+  static Cond cond;
+  static unsigned queue_init_done = 0;
+  static unsigned cores = 0;
+  static std::shared_ptr<DPDKDevice> sdev;
+
+  unsigned i = center.get_id();
+  if (i == 0) {
+    // Hardcoded port index 0.
+    // TODO: Inherit it from the opts
+    cores = cct->_conf->ms_async_op_threads;
+    std::unique_ptr<DPDKDevice> dev = create_dpdk_net_device(
+        cct, cores, cct->_conf->ms_dpdk_port_id,
+        cct->_conf->ms_dpdk_lro,
+        cct->_conf->ms_dpdk_hw_flow_control);
+    sdev = std::shared_ptr<DPDKDevice>(dev.release());
+    sdev->workers.resize(cores);
+    ldout(cct, 1) << __func__ << " using " << cores << " cores " << dendl;
+
+    Mutex::Locker l(lock);
+    create_stage = WAIT_PORT_FIN_STAGE;
+    cond.Signal();
+  } else {
+    Mutex::Locker l(lock);
+    while (create_stage <= WAIT_DEVICE_STAGE)
+      cond.Wait(lock);
+  }
+  ceph_assert(sdev);
+  if (i < sdev->hw_queues_count()) {
+    auto qp = sdev->init_local_queue(cct, &center, cct->_conf->ms_dpdk_hugepages, i);
+    std::map<unsigned, float> cpu_weights;
+    // proxy CPUs served by this queue get the default weight
+    for (unsigned j = sdev->hw_queues_count() + i % sdev->hw_queues_count();
+         j < cores; j += sdev->hw_queues_count())
+      cpu_weights[j] = 1;
+    cpu_weights[i] = cct->_conf->ms_dpdk_hw_queue_weight;
+    qp->configure_proxies(cpu_weights);
+    sdev->set_local_queue(i, std::move(qp));
+    Mutex::Locker l(lock);
+    ++queue_init_done;
+    cond.Signal();
+  } else {
+    // auto master = qid % sdev->hw_queues_count();
+    // sdev->set_local_queue(create_proxy_net_device(master, sdev.get()));
+    ceph_abort();
+  }
+  if (i == 0) {
+    {
+      Mutex::Locker l(lock);
+      while (queue_init_done < cores)
+        cond.Wait(lock);
+    }
+
+    if (sdev->init_port_fini() < 0) {
+      lderr(cct) << __func__ << " init_port_fini failed " << dendl;
+      ceph_abort();
+    }
+    Mutex::Locker l(lock);
+    create_stage = DONE;
+    cond.Signal();
+  } else {
+    Mutex::Locker l(lock);
+    while (create_stage <= WAIT_PORT_FIN_STAGE)
+      cond.Wait(lock);
+  }
+
+  sdev->workers[i] = this;
+  _impl = std::unique_ptr<DPDKWorker::Impl>(
+      new DPDKWorker::Impl(cct, i, &center, sdev));
+  {
+    Mutex::Locker l(lock);
+    if (!--queue_init_done) {
+      create_stage = WAIT_DEVICE_STAGE;
+      sdev.reset();
+    }
+  }
+}
+
+using AvailableIPAddress = std::tuple<string, string, string>;
+static bool parse_available_address(
+    const string &ips, const string &gates, const string &masks, vector<AvailableIPAddress> &res)
+{
+  vector<string> ip_vec, gate_vec, mask_vec;
+  string_to_vec(ip_vec, ips);
+  string_to_vec(gate_vec, gates);
+  string_to_vec(mask_vec, masks);
+  if (ip_vec.empty() || ip_vec.size() != gate_vec.size() || ip_vec.size() != mask_vec.size())
+    return false;
+
+  for (size_t i = 0; i < ip_vec.size(); ++i) {
+    res.push_back(AvailableIPAddress{ip_vec[i], gate_vec[i], mask_vec[i]});
+  }
+  return true;
+}
+
+static bool match_available_address(const vector<AvailableIPAddress> &avails,
+                                    const entity_addr_t &ip, int &res)
+{
+  for (size_t i = 0; i < avails.size(); ++i) {
+    entity_addr_t addr;
+    auto a = std::get<0>(avails[i]).c_str();
+    if (!addr.parse(a))
+      continue;
+    if (addr.is_same_host(ip)) {
+      res = i;
+      return true;
+    }
+  }
+  return false;
+}
+
+DPDKWorker::Impl::Impl(CephContext *cct, unsigned i, EventCenter *c, std::shared_ptr<DPDKDevice> dev)
+    : id(i), _netif(cct, dev, c), _dev(dev), _inet(cct, c, &_netif)
+{
+  vector<AvailableIPAddress> tuples;
+  bool parsed = parse_available_address(cct->_conf.get_val<std::string>("ms_dpdk_host_ipv4_addr"),
+                                        cct->_conf.get_val<std::string>("ms_dpdk_gateway_ipv4_addr"),
+                                        cct->_conf.get_val<std::string>("ms_dpdk_netmask_ipv4_addr"), tuples);
+  if (!parsed) {
+    lderr(cct) << __func__ << " no available address "
+               << cct->_conf.get_val<std::string>("ms_dpdk_host_ipv4_addr") << ", "
+               << cct->_conf.get_val<std::string>("ms_dpdk_gateway_ipv4_addr") << ", "
+               << 
cct->_conf.get_val<std::string>("ms_dpdk_netmask_ipv4_addr") << ", " + << dendl; + ceph_abort(); + } + _inet.set_host_address(ipv4_address(std::get<0>(tuples[0]))); + _inet.set_gw_address(ipv4_address(std::get<1>(tuples[0]))); + _inet.set_netmask_address(ipv4_address(std::get<2>(tuples[0]))); +} + +DPDKWorker::Impl::~Impl() +{ + _dev->unset_local_queue(id); +} + +int DPDKWorker::listen(entity_addr_t &sa, const SocketOptions &opt, + ServerSocket *sock) +{ + ceph_assert(sa.get_family() == AF_INET); + ceph_assert(sock); + + ldout(cct, 10) << __func__ << " addr " << sa << dendl; + // vector<AvailableIPAddress> tuples; + // bool parsed = parse_available_address(cct->_conf->ms_dpdk_host_ipv4_addr, + // cct->_conf->ms_dpdk_gateway_ipv4_addr, + // cct->_conf->ms_dpdk_netmask_ipv4_addr, tuples); + // if (!parsed) { + // lderr(cct) << __func__ << " no available address " + // << cct->_conf->ms_dpdk_host_ipv4_addr << ", " + // << cct->_conf->ms_dpdk_gateway_ipv4_addr << ", " + // << cct->_conf->ms_dpdk_netmask_ipv4_addr << ", " + // << dendl; + // return -EINVAL; + // } + // int idx; + // parsed = match_available_address(tuples, sa, idx); + // if (!parsed) { + // lderr(cct) << __func__ << " no matched address for " << sa << dendl; + // return -EINVAL; + // } + // _inet.set_host_address(ipv4_address(std::get<0>(tuples[idx]))); + // _inet.set_gw_address(ipv4_address(std::get<1>(tuples[idx]))); + // _inet.set_netmask_address(ipv4_address(std::get<2>(tuples[idx]))); + return tcpv4_listen(_impl->_inet.get_tcp(), sa.get_port(), opt, sa.get_type(), + sock); +} + +int DPDKWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) +{ + // ceph_assert(addr.get_family() == AF_INET); + int r = tcpv4_connect(_impl->_inet.get_tcp(), addr, socket); + ldout(cct, 10) << __func__ << " addr " << addr << dendl; + return r; +} + +void DPDKStack::spawn_worker(unsigned i, std::function<void ()> &&func) +{ + // create a extra master thread + // + funcs[i] = std::move(func); + int r = 0; + r = dpdk::eal::init(cct); + if (r < 0) { + lderr(cct) << __func__ << " init dpdk rte failed, r=" << r << dendl; + ceph_abort(); + } + // if dpdk::eal::init already called by NVMEDevice, we will select 1..n + // cores + ceph_assert(rte_lcore_count() >= i + 1); + unsigned core_id; + int j = i; + RTE_LCORE_FOREACH_SLAVE(core_id) { + if (i-- == 0) { + break; + } + } + dpdk::eal::execute_on_master([&]() { + r = rte_eal_remote_launch(dpdk_thread_adaptor, static_cast<void*>(&funcs[j]), core_id); + if (r < 0) { + lderr(cct) << __func__ << " remote launch failed, r=" << r << dendl; + ceph_abort(); + } + }); +} + +void DPDKStack::join_worker(unsigned i) +{ + dpdk::eal::execute_on_master([&]() { + rte_eal_wait_lcore(i+1); + }); +} diff --git a/src/msg/async/dpdk/DPDKStack.h b/src/msg/async/dpdk/DPDKStack.h new file mode 100644 index 00000000..a44ae383 --- /dev/null +++ b/src/msg/async/dpdk/DPDKStack.h @@ -0,0 +1,257 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+#ifndef CEPH_MSG_DPDKSTACK_H
+#define CEPH_MSG_DPDKSTACK_H
+
+#include <functional>
+
+#include "common/ceph_context.h"
+#include "common/Tub.h"
+
+#include "msg/async/Stack.h"
+#include "net.h"
+#include "const.h"
+#include "IP.h"
+#include "Packet.h"
+
+class interface;
+
+template <typename Protocol>
+class NativeConnectedSocketImpl;
+
+// DPDKServerSocketImpl
+template <typename Protocol>
+class DPDKServerSocketImpl : public ServerSocketImpl {
+  typename Protocol::listener _listener;
+ public:
+  DPDKServerSocketImpl(Protocol& proto, uint16_t port, const SocketOptions &opt,
+                       int type);
+  int listen() {
+    return _listener.listen();
+  }
+  virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+  virtual void abort_accept() override;
+  virtual int fd() const override {
+    return _listener.fd();
+  }
+};
+
+// NativeConnectedSocketImpl
+template <typename Protocol>
+class NativeConnectedSocketImpl : public ConnectedSocketImpl {
+  typename Protocol::connection _conn;
+  uint32_t _cur_frag = 0;
+  uint32_t _cur_off = 0;
+  Tub<Packet> _buf;
+  Tub<bufferptr> _cache_ptr;
+
+ public:
+  explicit NativeConnectedSocketImpl(typename Protocol::connection conn)
+      : _conn(std::move(conn)) {}
+  NativeConnectedSocketImpl(NativeConnectedSocketImpl &&rhs)
+      : _conn(std::move(rhs._conn)), _buf(std::move(rhs._buf)) {}
+  virtual int is_connected() override {
+    return _conn.is_connected();
+  }
+
+  virtual ssize_t read(char *buf, size_t len) override {
+    size_t left = len;
+    ssize_t r = 0;
+    size_t off = 0;
+    while (left > 0) {
+      if (!_cache_ptr) {
+        _cache_ptr.construct();
+        r = zero_copy_read(*_cache_ptr);
+        if (r <= 0) {
+          _cache_ptr.destroy();
+          if (r == -EAGAIN)
+            break;
+          return r;
+        }
+      }
+      if (_cache_ptr->length() <= left) {
+        _cache_ptr->copy_out(0, _cache_ptr->length(), buf+off);
+        left -= _cache_ptr->length();
+        off += _cache_ptr->length();
+        _cache_ptr.destroy();
+      } else {
+        _cache_ptr->copy_out(0, left, buf+off);
+        _cache_ptr->set_offset(_cache_ptr->offset() + left);
+        _cache_ptr->set_length(_cache_ptr->length() - left);
+        left = 0;
+        break;
+      }
+    }
+    return len - left ? len - left : -EAGAIN;
+  }
+
+  virtual ssize_t zero_copy_read(bufferptr &data) override {
+    auto err = _conn.get_errno();
+    if (err <= 0)
+      return err;
+
+    if (!_buf) {
+      _buf = std::move(_conn.read());
+      if (!_buf)
+        return -EAGAIN;
+    }
+
+    fragment &f = _buf->frag(_cur_frag);
+    Packet p = _buf->share(_cur_off, f.size);
+    auto del = std::bind(
+        [](Packet &p) {}, std::move(p));
+    data = buffer::claim_buffer(
+        f.size, f.base, make_deleter(std::move(del)));
+    if (++_cur_frag == _buf->nr_frags()) {
+      _cur_frag = 0;
+      _cur_off = 0;
+      _buf.destroy();
+    } else {
+      _cur_off += f.size;
+    }
+    ceph_assert(data.length());
+    return data.length();
+  }
+  virtual ssize_t send(bufferlist &bl, bool more) override {
+    auto err = _conn.get_errno();
+    if (err < 0)
+      return (ssize_t)err;
+
+    size_t available = _conn.peek_sent_available();
+    if (available == 0) {
+      return 0;
+    }
+
+    std::vector<fragment> frags;
+    std::list<bufferptr>::const_iterator pb = bl.buffers().begin();
+    uint64_t left_pbrs = bl.buffers().size();
+    uint64_t len = 0;
+    uint64_t seglen = 0;
+    while (len < available && left_pbrs--) {
+      seglen = pb->length();
+      if (len + seglen > available) {
+        // don't continue if we already have at least 1 fragment, since
+        // there is no available space for the next ptr.
+ if (len > 0) + break; + seglen = std::min(seglen, available); + } + len += seglen; + frags.push_back(fragment{(char*)pb->c_str(), seglen}); + ++pb; + } + + if (len != bl.length()) { + bufferlist swapped; + bl.splice(0, len, &swapped); + auto del = std::bind( + [](bufferlist &bl) {}, std::move(swapped)); + return _conn.send(Packet(std::move(frags), make_deleter(std::move(del)))); + } else { + auto del = std::bind( + [](bufferlist &bl) {}, std::move(bl)); + + return _conn.send(Packet(std::move(frags), make_deleter(std::move(del)))); + } + } + virtual void shutdown() override { + _conn.close_write(); + } + // FIXME need to impl close + virtual void close() override { + _conn.close_write(); + } + virtual int fd() const override { + return _conn.fd(); + } + virtual int socket_fd() const override { + return _conn.fd(); + } + +}; + +template <typename Protocol> +DPDKServerSocketImpl<Protocol>::DPDKServerSocketImpl( + Protocol& proto, uint16_t port, const SocketOptions &opt, int type) + : ServerSocketImpl(type), _listener(proto.listen(port)) {} + +template <typename Protocol> +int DPDKServerSocketImpl<Protocol>::accept(ConnectedSocket *s, const SocketOptions &options, entity_addr_t *out, Worker *w) { + if (_listener.get_errno() < 0) + return _listener.get_errno(); + auto c = _listener.accept(); + if (!c) + return -EAGAIN; + + if (out) { + *out = c->remote_addr(); + out->set_type(addr_type); + } + std::unique_ptr<NativeConnectedSocketImpl<Protocol>> csi( + new NativeConnectedSocketImpl<Protocol>(std::move(*c))); + *s = ConnectedSocket(std::move(csi)); + return 0; +} + +template <typename Protocol> +void DPDKServerSocketImpl<Protocol>::abort_accept() { + _listener.abort_accept(); +} + +class DPDKWorker : public Worker { + struct Impl { + unsigned id; + interface _netif; + std::shared_ptr<DPDKDevice> _dev; + ipv4 _inet; + Impl(CephContext *cct, unsigned i, EventCenter *c, std::shared_ptr<DPDKDevice> dev); + ~Impl(); + }; + std::unique_ptr<Impl> _impl; + + virtual void initialize() override; + void set_ipv4_packet_filter(ip_packet_filter* filter) { + _impl->_inet.set_packet_filter(filter); + } + using tcp4 = tcp<ipv4_traits>; + + public: + explicit DPDKWorker(CephContext *c, unsigned i): Worker(c, i) {} + virtual int listen(entity_addr_t &addr, const SocketOptions &opts, ServerSocket *) override; + virtual int connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) override; + void arp_learn(ethernet_address l2, ipv4_address l3) { + _impl->_inet.learn(l2, l3); + } + virtual void destroy() override { + _impl.reset(); + } + + friend class DPDKServerSocketImpl<tcp4>; +}; + +class DPDKStack : public NetworkStack { + vector<std::function<void()> > funcs; + public: + explicit DPDKStack(CephContext *cct, const string &t): NetworkStack(cct, t) { + funcs.resize(cct->_conf->ms_async_max_op_threads); + } + virtual bool support_zero_copy_read() const override { return true; } + virtual bool support_local_listen_table() const override { return true; } + + virtual void spawn_worker(unsigned i, std::function<void ()> &&func) override; + virtual void join_worker(unsigned i) override; +}; + +#endif diff --git a/src/msg/async/dpdk/EventDPDK.cc b/src/msg/async/dpdk/EventDPDK.cc new file mode 100644 index 00000000..5d291716 --- /dev/null +++ b/src/msg/async/dpdk/EventDPDK.cc @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + 
* + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/errno.h" +#include "DPDKStack.h" +#include "EventDPDK.h" + +#include "common/dout.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_ms + +#undef dout_prefix +#define dout_prefix *_dout << "DPDKDriver." + +int DPDKDriver::init(EventCenter *c, int nevent) +{ + return 0; +} + +int DPDKDriver::add_event(int fd, int cur_mask, int add_mask) +{ + ldout(cct, 20) << __func__ << " add event fd=" << fd << " cur_mask=" << cur_mask + << " add_mask=" << add_mask << dendl; + + int r = manager.listen(fd, add_mask); + if (r < 0) { + lderr(cct) << __func__ << " add fd=" << fd << " failed. " + << cpp_strerror(-r) << dendl; + return -errno; + } + + return 0; +} + +int DPDKDriver::del_event(int fd, int cur_mask, int delmask) +{ + ldout(cct, 20) << __func__ << " del event fd=" << fd << " cur_mask=" << cur_mask + << " delmask=" << delmask << dendl; + int r = 0; + + if (delmask != EVENT_NONE) { + if ((r = manager.unlisten(fd, delmask)) < 0) { + lderr(cct) << __func__ << " delete fd=" << fd << " delmask=" << delmask + << " failed." << cpp_strerror(-r) << dendl; + return r; + } + } + return 0; +} + +int DPDKDriver::resize_events(int newsize) +{ + return 0; +} + +int DPDKDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp) +{ + int num_events = 512; + int events[num_events]; + int masks[num_events]; + + int retval = manager.poll(events, masks, num_events, tvp); + if (retval > 0) { + fired_events.resize(retval); + for (int i = 0; i < retval; i++) { + fired_events[i].fd = events[i]; + fired_events[i].mask = masks[i]; + } + } + return retval; +} diff --git a/src/msg/async/dpdk/EventDPDK.h b/src/msg/async/dpdk/EventDPDK.h new file mode 100644 index 00000000..541c2210 --- /dev/null +++ b/src/msg/async/dpdk/EventDPDK.h @@ -0,0 +1,40 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_EVENTDPDK_H +#define CEPH_EVENTDPDK_H + +#include "msg/async/Event.h" +#include "msg/async/Stack.h" +#include "UserspaceEvent.h" + +class DPDKDriver : public EventDriver { + CephContext *cct; + + public: + UserspaceEventManager manager; + + explicit DPDKDriver(CephContext *c): cct(c), manager(c) {} + virtual ~DPDKDriver() { } + + int init(EventCenter *c, int nevent) override; + int add_event(int fd, int cur_mask, int add_mask) override; + int del_event(int fd, int cur_mask, int del_mask) override; + int resize_events(int newsize) override; + int event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tp) override; + bool need_wakeup() override { return false; } +}; + +#endif //CEPH_EVENTDPDK_H diff --git a/src/msg/async/dpdk/IP.cc b/src/msg/async/dpdk/IP.cc new file mode 100644 index 00000000..f730cded --- /dev/null +++ b/src/msg/async/dpdk/IP.cc @@ -0,0 +1,470 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + * + */ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/perf_counters.h" + +#include "capture.h" +#include "IP.h" +#include "toeplitz.h" + +#include "common/dout.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_dpdk +#undef dout_prefix +#define dout_prefix *_dout << "dpdk " + +std::ostream& operator<<(std::ostream& os, const ipv4_address& a) { + auto ip = a.ip; + return os << ((ip >> 24) & 0xff) << "." << ((ip >> 16) & 0xff) + << "." << ((ip >> 8) & 0xff) << "." 
<< ((ip >> 0) & 0xff); +} + +utime_t ipv4::_frag_timeout = utime_t(30, 0); +constexpr uint32_t ipv4::_frag_low_thresh; +constexpr uint32_t ipv4::_frag_high_thresh; + +class C_handle_frag_timeout : public EventCallback { + ipv4 *_ipv4; + + public: + C_handle_frag_timeout(ipv4 *i): _ipv4(i) {} + void do_request(uint64_t fd_or_id) { + _ipv4->frag_timeout(); + } +}; + +enum { + l_dpdk_qp_first = 99000, + l_dpdk_total_linearize_operations, + l_dpdk_qp_last +}; + +ipv4::ipv4(CephContext *c, EventCenter *cen, interface* netif) + : cct(c), center(cen), _netif(netif), _global_arp(netif), + _arp(c, _global_arp, cen), + _host_address(0), _gw_address(0), _netmask(0), + _l3(netif, eth_protocol_num::ipv4, [this] { return get_packet(); }), + _rx_packets( + _l3.receive( + [this] (Packet p, ethernet_address ea) { + return handle_received_packet(std::move(p), ea); + }, + [this] (forward_hash& out_hash_data, Packet& p, size_t off) { + return forward(out_hash_data, p, off); + } + ) + ), + _tcp(*this, cen), _icmp(c, *this), + _l4({{ uint8_t(ip_protocol_num::tcp), &_tcp }, + { uint8_t(ip_protocol_num::icmp), &_icmp }}), + _packet_filter(nullptr) +{ + PerfCountersBuilder plb(cct, "ipv4", l_dpdk_qp_first, l_dpdk_qp_last); + plb.add_u64_counter(l_dpdk_total_linearize_operations, "dpdk_ip_linearize_operations", "DPDK IP Packet linearization operations"); + perf_logger = plb.create_perf_counters(); + cct->get_perfcounters_collection()->add(perf_logger); + frag_handler = new C_handle_frag_timeout(this); +} + +bool ipv4::forward(forward_hash& out_hash_data, Packet& p, size_t off) +{ + auto iph = p.get_header<ip_hdr>(off); + + out_hash_data.push_back(iph->src_ip.ip); + out_hash_data.push_back(iph->dst_ip.ip); + + auto h = iph->ntoh(); + auto l4 = _l4[h.ip_proto]; + if (l4) { + if (h.mf() == false && h.offset() == 0) { + // This IP datagram is atomic, forward according to tcp connection hash + l4->forward(out_hash_data, p, off + sizeof(ip_hdr)); + } + // else forward according to ip fields only + } + return true; +} + +int ipv4::handle_received_packet(Packet p, ethernet_address from) +{ + auto iph = p.get_header<ip_hdr>(0); + if (!iph) { + return 0; + } + + // Skip checking csum of reassembled IP datagram + if (!get_hw_features().rx_csum_offload && !p.offload_info_ref().reassembled) { + checksummer csum; + csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph)); + if (csum.get() != 0) { + return 0; + } + } + + auto h = iph->ntoh(); + unsigned ip_len = h.len; + unsigned ip_hdr_len = h.ihl * 4; + unsigned pkt_len = p.len(); + auto offset = h.offset(); + + ldout(cct, 10) << __func__ << " get " << std::hex << int(h.ip_proto) + << std::dec << " packet from " + << h.src_ip << " -> " << h.dst_ip << " id=" << h.id + << " ip_len=" << ip_len << " ip_hdr_len=" << ip_hdr_len + << " pkt_len=" << pkt_len << " offset=" << offset << dendl; + + if (pkt_len > ip_len) { + // Trim extra data in the packet beyond IP total length + p.trim_back(pkt_len - ip_len); + } else if (pkt_len < ip_len) { + // Drop if it contains less than IP total length + return 0; + } + // Drop if the reassembled datagram will be larger than maximum IP size + if (offset + p.len() > ip_packet_len_max) { + return 0; + } + + // FIXME: process options + if (in_my_netmask(h.src_ip) && h.src_ip != _host_address) { + ldout(cct, 20) << __func__ << " learn mac " << from << " with " << h.src_ip << dendl; + _arp.learn(from, h.src_ip); + } + + if (_packet_filter) { + bool handled = false; + _packet_filter->handle(p, &h, from, handled); + if (handled) { + return 0; + } + } + + if 
(h.dst_ip != _host_address) { + // FIXME: forward + return 0; + } + + // Does this IP datagram need reassembly + auto mf = h.mf(); + if (mf == true || offset != 0) { + frag_limit_mem(); + auto frag_id = ipv4_frag_id{h.src_ip, h.dst_ip, h.id, h.ip_proto}; + auto& frag = _frags[frag_id]; + if (mf == false) { + frag.last_frag_received = true; + } + // This is a newly created frag_id + if (frag.mem_size == 0) { + _frags_age.push_back(frag_id); + frag.rx_time = ceph_clock_now(); + } + auto added_size = frag.merge(h, offset, std::move(p)); + _frag_mem += added_size; + if (frag.is_complete()) { + // All the fragments are received + auto dropped_size = frag.mem_size; + auto& ip_data = frag.data.map.begin()->second; + // Choose a cpu to forward this packet + auto cpu_id = center->get_id(); + auto l4 = _l4[h.ip_proto]; + if (l4) { + size_t l4_offset = 0; + forward_hash hash_data; + hash_data.push_back(hton(h.src_ip.ip)); + hash_data.push_back(hton(h.dst_ip.ip)); + l4->forward(hash_data, ip_data, l4_offset); + cpu_id = _netif->hash2cpu(toeplitz_hash(_netif->rss_key(), hash_data)); + } + + // No need to forward if the dst cpu is the current cpu + if (cpu_id == center->get_id()) { + l4->received(std::move(ip_data), h.src_ip, h.dst_ip); + } else { + auto to = _netif->hw_address(); + auto pkt = frag.get_assembled_packet(from, to); + _netif->forward(center, cpu_id, std::move(pkt)); + } + + // Delete this frag from _frags and _frags_age + frag_drop(frag_id, dropped_size); + _frags_age.remove(frag_id); + perf_logger->set(l_dpdk_total_linearize_operations, + ipv4_packet_merger::linearizations()); + } else { + // Some of the fragments are missing + if (frag_timefd) { + frag_arm(); + } + } + return 0; + } + + auto l4 = _l4[h.ip_proto]; + if (l4) { + // Trim IP header and pass to upper layer + p.trim_front(ip_hdr_len); + l4->received(std::move(p), h.src_ip, h.dst_ip); + } + return 0; +} + +void ipv4::wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb) { + // Figure out where to send the packet to. If it is a directly connected + // host, send to it directly, otherwise send to the default gateway. 
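+  //
+  // For instance (illustrative numbers only): with a host address of
+  // 192.168.1.5 and a netmask of 255.255.255.0, a destination of
+  // 192.168.1.7 satisfies in_my_netmask() and is resolved via ARP
+  // directly, while 10.0.0.1 differs in the masked bits and is sent to
+  // _gw_address instead.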
+  ipv4_address dst;
+  if (in_my_netmask(to)) {
+    dst = to;
+  } else {
+    dst = _gw_address;
+  }
+
+  _arp.wait(std::move(dst), std::move(p), std::move(cb));
+}
+
+const hw_features& ipv4::get_hw_features() const
+{
+  return _netif->get_hw_features();
+}
+
+void ipv4::send(ipv4_address to, ip_protocol_num proto_num,
+                Packet p, ethernet_address e_dst) {
+  auto needs_frag = this->needs_frag(p, proto_num, get_hw_features());
+
+  auto send_pkt = [this, to, proto_num, needs_frag, e_dst] (Packet& pkt, uint16_t remaining, uint16_t offset) mutable {
+    static uint16_t id = 0;
+    auto iph = pkt.prepend_header<ip_hdr>();
+    iph->ihl = sizeof(*iph) / 4;
+    iph->ver = 4;
+    iph->dscp = 0;
+    iph->ecn = 0;
+    iph->len = pkt.len();
+    // FIXME: a proper id
+    iph->id = id++;
+    if (needs_frag) {
+      uint16_t mf = remaining > 0;
+      // The fragment offset is measured in units of 8 octets (64 bits)
+      auto off = offset / 8;
+      iph->frag = (mf << uint8_t(ip_hdr::frag_bits::mf)) | off;
+    } else {
+      iph->frag = 0;
+    }
+    iph->ttl = 64;
+    iph->ip_proto = (uint8_t)proto_num;
+    iph->csum = 0;
+    iph->src_ip = _host_address;
+    iph->dst_ip = to;
+    ldout(cct, 20) << " ipv4::send " << " id=" << iph->id << " " << _host_address << " -> " << to
+                   << " len " << pkt.len() << dendl;
+    *iph = iph->hton();
+
+    if (get_hw_features().tx_csum_ip_offload) {
+      iph->csum = 0;
+      pkt.offload_info_ref().needs_ip_csum = true;
+    } else {
+      checksummer csum;
+      csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph));
+      iph->csum = csum.get();
+    }
+
+    _packetq.push_back(
+        l3_protocol::l3packet{eth_protocol_num::ipv4, e_dst, std::move(pkt)});
+  };
+
+  if (needs_frag) {
+    uint16_t offset = 0;
+    uint16_t remaining = p.len();
+    auto mtu = get_hw_features().mtu;
+
+    while (remaining) {
+      auto can_send = std::min(uint16_t(mtu - ipv4_hdr_len_min), remaining);
+      remaining -= can_send;
+      auto pkt = p.share(offset, can_send);
+      send_pkt(pkt, remaining, offset);
+      offset += can_send;
+    }
+  } else {
+    // The whole packet can be sent in one shot
+    send_pkt(p, 0, 0);
+  }
+}
+
+Tub<l3_protocol::l3packet> ipv4::get_packet() {
+  // _packetq will be mostly empty here unless it holds remnants of a
+  // previously fragmented packet
+  if (_packetq.empty()) {
+    for (size_t i = 0; i < _pkt_providers.size(); i++) {
+      auto l4p = _pkt_providers[_pkt_provider_idx++]();
+      if (_pkt_provider_idx == _pkt_providers.size()) {
+        _pkt_provider_idx = 0;
+      }
+      if (l4p) {
+        ldout(cct, 20) << " ipv4::get_packet len " << l4p->p.len() << dendl;
+        send(l4p->to, l4p->proto_num, std::move(l4p->p), l4p->e_dst);
+        break;
+      }
+    }
+  }
+
+  Tub<l3_protocol::l3packet> p;
+  if (!_packetq.empty()) {
+    p = std::move(_packetq.front());
+    _packetq.pop_front();
+  }
+  return p;
+}
+
+void ipv4::frag_limit_mem() {
+  if (_frag_mem <= _frag_high_thresh) {
+    return;
+  }
+  auto drop = _frag_mem - _frag_low_thresh;
+  while (drop) {
+    if (_frags_age.empty()) {
+      return;
+    }
+    // Drop the oldest frag (first element) from _frags_age
+    auto frag_id = _frags_age.front();
+    _frags_age.pop_front();
+
+    // Drop from _frags as well
+    auto& frag = _frags[frag_id];
+    auto dropped_size = frag.mem_size;
+    frag_drop(frag_id, dropped_size);
+
+    drop -= std::min(drop, dropped_size);
+  }
+}
+
+void ipv4::frag_timeout() {
+  if (_frags.empty()) {
+    return;
+  }
+  auto now = ceph_clock_now();
+  for (auto it = _frags_age.begin(); it != _frags_age.end();) {
+    auto frag_id = *it;
+    auto& frag = _frags[frag_id];
+    if (now > frag.rx_time + _frag_timeout) {
+      auto dropped_size = frag.mem_size;
+      // Drop from _frags
+      frag_drop(frag_id, dropped_size);
+      // Drop from _frags_age
+      it = _frags_age.erase(it);
+    } else {
+      // The further items can only be younger
+      break;
+    }
+  }
+  if (_frags.size() != 0) {
+    frag_arm(now);
+  } else {
+    _frag_mem = 0;
+  }
+}
+
+int32_t ipv4::frag::merge(ip_hdr &h, uint16_t offset, Packet p) {
+  uint32_t old = mem_size;
+  unsigned ip_hdr_len = h.ihl * 4;
+  // Store IP header
+  if (offset == 0) {
+    header = p.share(0, ip_hdr_len);
+  }
+  // Store IP payload
+  p.trim_front(ip_hdr_len);
+  data.merge(offset, std::move(p));
+  // Update mem size
+  mem_size = header.memory();
+  for (const auto& x : data.map) {
+    mem_size += x.second.memory();
+  }
+  auto added_size = mem_size - old;
+  return added_size;
+}
+
+bool ipv4::frag::is_complete() {
+  // If all the fragments are received, ipv4::frag::merge() should merge all
+  // the fragments into a single packet
+  auto offset = data.map.begin()->first;
+  auto nr_packet = data.map.size();
+  return last_frag_received && nr_packet == 1 && offset == 0;
+}
+
+Packet ipv4::frag::get_assembled_packet(ethernet_address from, ethernet_address to) {
+  auto& ip_header = header;
+  auto& ip_data = data.map.begin()->second;
+  // Append an ethernet header, needed for forwarding
+  auto eh = ip_header.prepend_header<eth_hdr>();
+  eh->src_mac = from;
+  eh->dst_mac = to;
+  eh->eth_proto = uint16_t(eth_protocol_num::ipv4);
+  *eh = eh->hton();
+  // Prepare a packet that contains the ethernet header, the IP header
+  // and the IP data
+  ip_header.append(std::move(ip_data));
+  auto pkt = std::move(ip_header);
+  auto iph = pkt.get_header<ip_hdr>(sizeof(eth_hdr));
+  // len is the sum of each fragment
+  iph->len = hton(uint16_t(pkt.len() - sizeof(eth_hdr)));
+  // No fragmentation for the assembled datagram
+  iph->frag = 0;
+  // Since each fragment's csum is checked, no need to csum
+  // again for the assembled datagram
+  offload_info oi;
+  oi.reassembled = true;
+  pkt.set_offload_info(oi);
+  return pkt;
+}
+
+void icmp::received(Packet p, ipaddr from, ipaddr to) {
+  auto hdr = p.get_header<icmp_hdr>(0);
+  if (!hdr || hdr->type != icmp_hdr::msg_type::echo_request) {
+    return;
+  }
+  hdr->type = icmp_hdr::msg_type::echo_reply;
+  hdr->code = 0;
+  hdr->csum = 0;
+  checksummer csum;
+  csum.sum(reinterpret_cast<char*>(hdr), p.len());
+  hdr->csum = csum.get();
+
+  if (_queue_space.get_or_fail(p.len())) { // drop packets that do not fit the queue
+    auto cb = [this, from] (const ethernet_address e_dst, Packet p, int r) mutable {
+      if (r == 0) {
+        _packetq.emplace_back(ipv4_traits::l4packet{from, std::move(p), e_dst, ip_protocol_num::icmp});
+      }
+    };
+    _inet.wait_l2_dst_address(from, std::move(p), cb);
+  }
+}
diff --git a/src/msg/async/dpdk/IP.h b/src/msg/async/dpdk/IP.h
new file mode 100644
index 00000000..480b4b95
--- /dev/null
+++ b/src/msg/async/dpdk/IP.h
@@ -0,0 +1,414 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + * + */ + +#ifndef CEPH_MSG_IP_H_ +#define CEPH_MSG_IP_H_ + +#include <arpa/inet.h> +#include <unordered_map> +#include <cstdint> +#include <array> +#include <map> +#include <list> +#include <chrono> + +#include "msg/async/Event.h" +#include "common/Throttle.h" + +#include "array_map.h" +#include "ARP.h" +#include "IPChecksum.h" +#include "ip_types.h" +#include "const.h" +#include "net.h" +#include "PacketUtil.h" +#include "toeplitz.h" + +class ipv4; +template <ip_protocol_num ProtoNum> +class ipv4_l4; + +template <typename InetTraits> +class tcp; + +struct ipv4_traits { + using address_type = ipv4_address; + using inet_type = ipv4_l4<ip_protocol_num::tcp>; + struct l4packet { + ipv4_address to; + Packet p; + ethernet_address e_dst; + ip_protocol_num proto_num; + }; + using packet_provider_type = std::function<Tub<l4packet> ()>; + static void tcp_pseudo_header_checksum(checksummer& csum, ipv4_address src, ipv4_address dst, uint16_t len) { + csum.sum_many(src.ip, dst.ip, uint8_t(0), uint8_t(ip_protocol_num::tcp), len); + } + static constexpr uint8_t ip_hdr_len_min = ipv4_hdr_len_min; +}; + +template <ip_protocol_num ProtoNum> +class ipv4_l4 { + public: + ipv4& _inet; + public: + ipv4_l4(ipv4& inet) : _inet(inet) {} + void register_packet_provider(ipv4_traits::packet_provider_type func); + void wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb); +}; + +class ip_protocol { + public: + virtual ~ip_protocol() {} + virtual void received(Packet p, ipv4_address from, ipv4_address to) = 0; + virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) { return true; } +}; + +template <typename InetTraits> +struct l4connid { + using ipaddr = typename InetTraits::address_type; + using inet_type = typename InetTraits::inet_type; + struct connid_hash; + + ipaddr local_ip; + ipaddr foreign_ip; + uint16_t local_port; + uint16_t foreign_port; + + bool operator==(const l4connid& x) const { + return local_ip == x.local_ip + && foreign_ip == x.foreign_ip + && local_port == x.local_port + && foreign_port == x.foreign_port; + } + + uint32_t hash(const rss_key_type& rss_key) { + forward_hash hash_data; + hash_data.push_back(hton(foreign_ip.ip)); + hash_data.push_back(hton(local_ip.ip)); + hash_data.push_back(hton(foreign_port)); + hash_data.push_back(hton(local_port)); + return toeplitz_hash(rss_key, hash_data); + } +}; + +class ipv4_tcp final : public ip_protocol { + ipv4_l4<ip_protocol_num::tcp> _inet_l4; + std::unique_ptr<tcp<ipv4_traits>> _tcp; + public: + ipv4_tcp(ipv4& inet, EventCenter *c); + ~ipv4_tcp(); + virtual void received(Packet p, ipv4_address from, ipv4_address to) override; + virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) override; + friend class ipv4; +}; + +struct icmp_hdr { + enum class msg_type : uint8_t { + echo_reply = 0, + echo_request = 8, + }; + msg_type type; + uint8_t code; + uint16_t csum; + uint32_t rest; +} __attribute__((packed)); + + +class icmp { + public: + using ipaddr = ipv4_address; + using inet_type = ipv4_l4<ip_protocol_num::icmp>; + explicit icmp(CephContext *c, inet_type& inet) + : cct(c), _inet(inet), _queue_space(c, "DPDK::icmp::_queue_space", 212992) { + _inet.register_packet_provider([this] { + Tub<ipv4_traits::l4packet> l4p; + if (!_packetq.empty()) { + l4p = std::move(_packetq.front()); + _packetq.pop_front(); + _queue_space.put(l4p->p.len()); + } + 
return l4p;
+    });
+  }
+  void received(Packet p, ipaddr from, ipaddr to);
+
+ private:
+  CephContext *cct;
+  // ipv4_l4<ip_protocol_num::icmp>
+  inet_type& _inet;
+  circular_buffer<ipv4_traits::l4packet> _packetq;
+  Throttle _queue_space;
+};
+
+class ipv4_icmp final : public ip_protocol {
+  CephContext *cct;
+  ipv4_l4<ip_protocol_num::icmp> _inet_l4;
+  icmp _icmp;
+ public:
+  ipv4_icmp(CephContext *c, ipv4& inet) : cct(c), _inet_l4(inet), _icmp(c, _inet_l4) {}
+  virtual void received(Packet p, ipv4_address from, ipv4_address to) override {
+    _icmp.received(std::move(p), from, to);
+  }
+  friend class ipv4;
+};
+
+struct ip_hdr;
+
+struct ip_packet_filter {
+  virtual ~ip_packet_filter() {}
+  virtual void handle(Packet& p, ip_hdr* iph, ethernet_address from, bool &handled) = 0;
+};
+
+struct ipv4_frag_id {
+  struct hash;
+  ipv4_address src_ip;
+  ipv4_address dst_ip;
+  uint16_t identification;
+  uint8_t protocol;
+  bool operator==(const ipv4_frag_id& x) const {
+    return src_ip == x.src_ip &&
+           dst_ip == x.dst_ip &&
+           identification == x.identification &&
+           protocol == x.protocol;
+  }
+};
+
+struct ipv4_frag_id::hash : private std::hash<ipv4_address>,
+                            private std::hash<uint16_t>, private std::hash<uint8_t> {
+  size_t operator()(const ipv4_frag_id& id) const noexcept {
+    using h1 = std::hash<ipv4_address>;
+    using h2 = std::hash<uint16_t>;
+    using h3 = std::hash<uint8_t>;
+    return h1::operator()(id.src_ip) ^
+           h1::operator()(id.dst_ip) ^
+           h2::operator()(id.identification) ^
+           h3::operator()(id.protocol);
+  }
+};
+
+struct ipv4_tag {};
+using ipv4_packet_merger = packet_merger<uint32_t, ipv4_tag>;
+
+class interface;
+
+class ipv4 {
+ public:
+  using address_type = ipv4_address;
+  using proto_type = uint16_t;
+  static address_type broadcast_address() { return ipv4_address(0xffffffff); }
+  static proto_type arp_protocol_type() { return proto_type(eth_protocol_num::ipv4); }
+  CephContext *cct;
+  EventCenter *center;
+
+ private:
+  interface* _netif;
+  std::vector<ipv4_traits::packet_provider_type> _pkt_providers;
+  Tub<uint64_t> frag_timefd;
+  EventCallbackRef frag_handler;
+  arp _global_arp;
+  arp_for<ipv4> _arp;
+  ipv4_address _host_address;
+  ipv4_address _gw_address;
+  ipv4_address _netmask;
+  l3_protocol _l3;
+  subscription<Packet, ethernet_address> _rx_packets;
+  ipv4_tcp _tcp;
+  ipv4_icmp _icmp;
+  array_map<ip_protocol*, 256> _l4;
+  ip_packet_filter *_packet_filter;
+  struct frag {
+    Packet header;
+    ipv4_packet_merger data;
+    utime_t rx_time;
+    uint32_t mem_size = 0;
+    // a fragment with MF == 0 indicates it is the last fragment
+    bool last_frag_received = false;
+
+    Packet get_assembled_packet(ethernet_address from, ethernet_address to);
+    int32_t merge(ip_hdr &h, uint16_t offset, Packet p);
+    bool is_complete();
+  };
+  std::unordered_map<ipv4_frag_id, frag, ipv4_frag_id::hash> _frags;
+  std::list<ipv4_frag_id> _frags_age;
+  static utime_t _frag_timeout;
+  static constexpr uint32_t _frag_low_thresh{3 * 1024 * 1024};
+  static constexpr uint32_t _frag_high_thresh{4 * 1024 * 1024};
+  uint32_t _frag_mem = 0;
+  circular_buffer<l3_protocol::l3packet> _packetq;
+  unsigned _pkt_provider_idx = 0;
+  PerfCounters *perf_logger;
+
+ private:
+  int handle_received_packet(Packet p, ethernet_address from);
+  bool forward(forward_hash& out_hash_data, Packet& p, size_t off);
+  Tub<l3_protocol::l3packet> get_packet();
+  bool in_my_netmask(ipv4_address a) const {
+    return !((a.ip ^ _host_address.ip) & _netmask.ip);
+  }
+  void frag_limit_mem();
+  void frag_drop(ipv4_frag_id frag_id, uint32_t dropped_size) {
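+    // forget this datagram's partial fragments and return their size to
+    // the global reassembly accounting (_frag_mem), which
+    // frag_limit_mem() checks against _frag_high_thresh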
+ _frags.erase(frag_id); + _frag_mem -= dropped_size; + } + void frag_arm(utime_t now) { + auto tp = now + _frag_timeout; + frag_timefd.construct(center->create_time_event(tp.to_nsec() / 1000, frag_handler)); + } + void frag_arm() { + auto now = ceph_clock_now(); + frag_timefd.construct(center->create_time_event(now.to_nsec() / 1000, frag_handler)); + } + + public: + void frag_timeout(); + + public: + explicit ipv4(CephContext *c, EventCenter *cen, interface* netif); + ~ipv4() { + delete frag_handler; + } + void set_host_address(ipv4_address ip) { + _host_address = ip; + _arp.set_self_addr(ip); + } + ipv4_address host_address() { + return _host_address; + } + void set_gw_address(ipv4_address ip) { + _gw_address = ip; + } + ipv4_address gw_address() const { + return _gw_address; + } + void set_netmask_address(ipv4_address ip) { + _netmask = ip; + } + ipv4_address netmask_address() const { + return _netmask; + } + interface *netif() const { + return _netif; + } + // TODO or something. Should perhaps truly be a list + // of filters. With ordering. And blackjack. Etc. + // But for now, a simple single raw pointer suffices + void set_packet_filter(ip_packet_filter *f) { + _packet_filter = f; + } + ip_packet_filter * packet_filter() const { + return _packet_filter; + } + void send(ipv4_address to, ip_protocol_num proto_num, Packet p, ethernet_address e_dst); + tcp<ipv4_traits>& get_tcp() { return *_tcp._tcp; } + void register_l4(proto_type id, ip_protocol* handler); + const hw_features& get_hw_features() const; + static bool needs_frag(Packet& p, ip_protocol_num proto_num, hw_features hw_features) { + if (p.len() + ipv4_hdr_len_min <= hw_features.mtu) + return false; + + if ((proto_num == ip_protocol_num::tcp && hw_features.tx_tso)) + return false; + + return true; + } + void learn(ethernet_address l2, ipv4_address l3) { + _arp.learn(l2, l3); + } + void register_packet_provider(ipv4_traits::packet_provider_type&& func) { + _pkt_providers.push_back(std::move(func)); + } + void wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb); +}; + +template <ip_protocol_num ProtoNum> +inline void ipv4_l4<ProtoNum>::register_packet_provider( + ipv4_traits::packet_provider_type func) { + _inet.register_packet_provider([func] { + auto l4p = func(); + if (l4p) { + (*l4p).proto_num = ProtoNum; + } + return l4p; + }); +} + +template <ip_protocol_num ProtoNum> +inline void ipv4_l4<ProtoNum>::wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb) { + _inet.wait_l2_dst_address(to, std::move(p), std::move(cb)); +} + +struct ip_hdr { + uint8_t ihl : 4; + uint8_t ver : 4; + uint8_t dscp : 6; + uint8_t ecn : 2; + uint16_t len; + uint16_t id; + uint16_t frag; + enum class frag_bits : uint8_t { mf = 13, df = 14, reserved = 15, offset_shift = 3 }; + uint8_t ttl; + uint8_t ip_proto; + uint16_t csum; + ipv4_address src_ip; + ipv4_address dst_ip; + uint8_t options[0]; + ip_hdr hton() { + ip_hdr hdr = *this; + hdr.len = ::hton(len); + hdr.id = ::hton(id); + hdr.frag = ::hton(frag); + hdr.csum = ::hton(csum); + hdr.src_ip.ip = ::hton(src_ip.ip); + hdr.dst_ip.ip = ::hton(dst_ip.ip); + return hdr; + } + ip_hdr ntoh() { + ip_hdr hdr = *this; + hdr.len = ::ntoh(len); + hdr.id = ::ntoh(id); + hdr.frag = ::ntoh(frag); + hdr.csum = ::ntoh(csum); + hdr.src_ip = src_ip.ntoh(); + hdr.dst_ip = dst_ip.ntoh(); + return hdr; + } + + bool mf() { return frag & (1 << uint8_t(frag_bits::mf)); } + bool df() { return frag & (1 << uint8_t(frag_bits::df)); } + uint16_t offset() { return frag << uint8_t(frag_bits::offset_shift); 
} +} __attribute__((packed)); + +template <typename InetTraits> +struct l4connid<InetTraits>::connid_hash : private std::hash<ipaddr>, private std::hash<uint16_t> { + size_t operator()(const l4connid<InetTraits>& id) const noexcept { + using h1 = std::hash<ipaddr>; + using h2 = std::hash<uint16_t>; + return h1::operator()(id.local_ip) + ^ h1::operator()(id.foreign_ip) + ^ h2::operator()(id.local_port) + ^ h2::operator()(id.foreign_port); + } +}; + +#endif /* CEPH_MSG_IP_H */ diff --git a/src/msg/async/dpdk/IPChecksum.cc b/src/msg/async/dpdk/IPChecksum.cc new file mode 100644 index 00000000..7a3253c1 --- /dev/null +++ b/src/msg/async/dpdk/IPChecksum.cc @@ -0,0 +1,70 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#include <arpa/inet.h> +#include "net.h" +#include "IPChecksum.h" + +void checksummer::sum(const char* data, size_t len) { + auto orig_len = len; + if (odd) { + csum += uint8_t(*data++); + --len; + } + auto p64 = reinterpret_cast<const uint64_t*>(data); + while (len >= 8) { + csum += ntohq(*p64++); + len -= 8; + } + auto p16 = reinterpret_cast<const uint16_t*>(p64); + while (len >= 2) { + csum += ntohs(*p16++); + len -= 2; + } + auto p8 = reinterpret_cast<const uint8_t*>(p16); + if (len) { + csum += *p8++ << 8; + len -= 1; + } + odd ^= orig_len & 1; +} + +uint16_t checksummer::get() const { + __int128 csum1 = (csum & 0xffffffffffffffff) + (csum >> 64); + uint64_t csum = (csum1 & 0xffffffffffffffff) + (csum1 >> 64); + csum = (csum & 0xffff) + ((csum >> 16) & 0xffff) + ((csum >> 32) & 0xffff) + (csum >> 48); + csum = (csum & 0xffff) + (csum >> 16); + csum = (csum & 0xffff) + (csum >> 16); + return htons(~csum); +} + +void checksummer::sum(const Packet& p) { + for (auto&& f : p.fragments()) { + sum(f.base, f.size); + } +} + +uint16_t ip_checksum(const void* data, size_t len) { + checksummer cksum; + cksum.sum(reinterpret_cast<const char*>(data), len); + return cksum.get(); +} diff --git a/src/msg/async/dpdk/IPChecksum.h b/src/msg/async/dpdk/IPChecksum.h new file mode 100644 index 00000000..9af4a86b --- /dev/null +++ b/src/msg/async/dpdk/IPChecksum.h @@ -0,0 +1,72 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. 
+ * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_CHECKSUM_H_ +#define CEPH_MSG_CHECKSUM_H_ + +#include <cstdint> +#include <cstddef> +#include <arpa/inet.h> + +#include "Packet.h" + +uint16_t ip_checksum(const void* data, size_t len); + +struct checksummer { + __int128 csum = 0; + bool odd = false; + void sum(const char* data, size_t len); + void sum(const Packet& p); + void sum(uint8_t data) { + if (!odd) { + csum += data << 8; + } else { + csum += data; + } + odd = !odd; + } + void sum(uint16_t data) { + if (odd) { + sum(uint8_t(data >> 8)); + sum(uint8_t(data)); + } else { + csum += data; + } + } + void sum(uint32_t data) { + if (odd) { + sum(uint16_t(data)); + sum(uint16_t(data >> 16)); + } else { + csum += data; + } + } + void sum_many() {} + template <typename T0, typename... T> + void sum_many(T0 data, T... rest) { + sum(data); + sum_many(rest...); + } + uint16_t get() const; +}; + +#endif /* CEPH_MSG_CHECKSUM_H_ */ diff --git a/src/msg/async/dpdk/Packet.cc b/src/msg/async/dpdk/Packet.cc new file mode 100644 index 00000000..6c2320a0 --- /dev/null +++ b/src/msg/async/dpdk/Packet.cc @@ -0,0 +1,146 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include <iostream> +#include <algorithm> +#include <cctype> + +#include "capture.h" +#include "Packet.h" + +constexpr size_t Packet::internal_data_size; +constexpr size_t Packet::default_nr_frags; + +void Packet::linearize(size_t at_frag, size_t desired_size) { + _impl->unuse_internal_data(); + size_t nr_frags = 0; + size_t accum_size = 0; + while (accum_size < desired_size) { + accum_size += _impl->frags[at_frag + nr_frags].size; + ++nr_frags; + } + char *new_frag = new char[accum_size]; + auto p = new_frag; + for (size_t i = 0; i < nr_frags; ++i) { + auto& f = _impl->frags[at_frag + i]; + p = std::copy(f.base, f.base + f.size, p); + } + // collapse nr_frags into one fragment + std::copy(_impl->frags + at_frag + nr_frags, _impl->frags + _impl->_nr_frags, + _impl->frags + at_frag + 1); + _impl->_nr_frags -= nr_frags - 1; + _impl->frags[at_frag] = fragment{new_frag, accum_size}; + if (at_frag == 0 && desired_size == len()) { + // We can drop the old buffer safely + auto x = std::move(_impl->_deleter); + _impl->_deleter = make_deleter([new_frag] { delete []new_frag; }); + } else { + auto del = std::bind( + [new_frag](deleter &d) { delete []new_frag; }, std::move(_impl->_deleter)); + _impl->_deleter = make_deleter(std::move(del)); + } +} + +class C_free_on_cpu : public EventCallback { + deleter del; + std::function<void()> cb; + public: + C_free_on_cpu(deleter &&d, std::function<void()> &&c): + del(std::move(d)), cb(std::move(c)) {} + void do_request(uint64_t fd) { + // deleter needs to be moved from lambda capture to be destroyed here + // otherwise deleter destructor will be called on a cpu that called + // create_external_event when work_item is destroyed. + deleter xxx(std::move(del)); + cb(); + delete this; + } +}; + +Packet Packet::free_on_cpu(EventCenter *center, std::function<void()> cb) +{ + auto del = std::bind( + [center, cb] (deleter &del) mutable { + center->dispatch_event_external(new C_free_on_cpu(std::move(del), std::move(cb))); + }, std::move(_impl->_deleter)); + // make new deleter that runs old deleter on an origin cpu + _impl->_deleter = make_deleter(deleter(), std::move(del)); + + return Packet(impl::copy(_impl.get())); +} + +std::ostream& operator<<(std::ostream& os, const Packet& p) { + os << "Packet{"; + bool first = true; + for (auto&& frag : p.fragments()) { + if (!first) { + os << ", "; + } + first = false; + if (std::all_of(frag.base, frag.base + frag.size, [] (int c) { return c >= 9 && c <= 0x7f; })) { + os << '"'; + for (auto p = frag.base; p != frag.base + frag.size; ++p) { + auto c = *p; + if (isprint(c)) { + os << c; + } else if (c == '\r') { + os << "\\r"; + } else if (c == '\n') { + os << "\\n"; + } else if (c == '\t') { + os << "\\t"; + } else { + uint8_t b = c; + os << "\\x" << (b / 16) << (b % 16); + } + } + os << '"'; + } else { + os << "{"; + bool nfirst = true; + for (auto p = frag.base; p != frag.base + frag.size; ++p) { + if (!nfirst) { + os << " "; + } + nfirst = false; + uint8_t b = *p; + os << b; + } + os << "}"; + } + } + os << "}"; + return os; +} diff --git a/src/msg/async/dpdk/Packet.h b/src/msg/async/dpdk/Packet.h new file mode 100644 index 00000000..db9cd2a7 --- /dev/null +++ b/src/msg/async/dpdk/Packet.h @@ -0,0 +1,550 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). 
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_PACKET_H_
+#define CEPH_MSG_PACKET_H_
+
+#include <vector>
+#include <algorithm>
+#include <iosfwd>
+
+#include "include/types.h"
+#include "common/Tub.h"
+#include "common/deleter.h"
+#include "msg/async/Event.h"
+
+#include "const.h"
+
+struct fragment {
+  char* base;
+  size_t size;
+};
+
+struct offload_info {
+  ip_protocol_num protocol = ip_protocol_num::unused;
+  bool needs_csum = false;
+  uint8_t ip_hdr_len = 20;
+  uint8_t tcp_hdr_len = 20;
+  uint8_t udp_hdr_len = 8;
+  bool needs_ip_csum = false;
+  bool reassembled = false;
+  uint16_t tso_seg_size = 0;
+  // HW stripped VLAN header (CPU order)
+  Tub<uint16_t> vlan_tci;
+};
+
+// Zero-copy friendly packet class
+//
+// For implementing zero-copy, we need a flexible destructor that can
+// destroy packet data in different ways: decrementing a reference count,
+// or calling a free()-like function.
+//
+// Moreover, we need different destructors for each set of fragments within
+// a single packet. For example, a header and trailer might need delete[]
+// to be called, while the internal data needs a reference count to be
+// released. Matters are complicated in that fragments can be split
+// (due to virtual/physical translation).
+//
+// To implement this, we associate each packet with a single destructor,
+// but allow composing a packet from another packet plus a fragment to
+// be added, with its own destructor, causing the destructors to be chained.
+//
+// The downside is that the data needed for the destructor is duplicated,
+// if it is already available in the fragment itself.
+//
+// As an optimization, when we allocate small fragments, we allocate some
+// extra space, so prepending to the packet does not require extra
+// allocations. This is useful when adding headers.
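+//
+// Illustrative sketch only (not part of this header): composing a packet
+// over an externally owned buffer and then prepending a header. `buf`,
+// `buf_len`, `hdr`, `hdr_len` and `release_buf` are hypothetical names:
+//
+//   Packet p(fragment{buf, buf_len},
+//            make_deleter([buf] { release_buf(buf); }));  // zero-copy
+//   p = Packet(fragment{hdr, hdr_len}, std::move(p));     // prepend, copies hdr
+//
+// The prepend constructor tries the internal headroom first and only
+// falls back to an allocation (chaining a new deleter) when the header
+// does not fit.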
+// +class Packet { + // enough for lots of headers, not quite two cache lines: + static constexpr size_t internal_data_size = 128 - 16; + static constexpr size_t default_nr_frags = 4; + + struct pseudo_vector { + fragment* _start; + fragment* _finish; + pseudo_vector(fragment* start, size_t nr) + : _start(start), _finish(_start + nr) {} + fragment* begin() { return _start; } + fragment* end() { return _finish; } + fragment& operator[](size_t idx) { return _start[idx]; } + }; + + struct impl { + // when destroyed, virtual destructor will reclaim resources + deleter _deleter; + unsigned _len = 0; + uint16_t _nr_frags = 0; + uint16_t _allocated_frags; + offload_info _offload_info; + Tub<uint32_t> rss_hash; + char data[internal_data_size]; // only frags[0] may use + unsigned headroom = internal_data_size; // in data + // FIXME: share data/frags space + + fragment frags[]; + + explicit impl(size_t nr_frags = default_nr_frags); + impl(const impl&) = delete; + impl(fragment frag, size_t nr_frags = default_nr_frags); + + pseudo_vector fragments() { return { frags, _nr_frags }; } + + static std::unique_ptr<impl> allocate(size_t nr_frags) { + nr_frags = std::max(nr_frags, default_nr_frags); + return std::unique_ptr<impl>(new (nr_frags) impl(nr_frags)); + } + + static std::unique_ptr<impl> copy(impl* old, size_t nr) { + auto n = allocate(nr); + n->_deleter = std::move(old->_deleter); + n->_len = old->_len; + n->_nr_frags = old->_nr_frags; + n->headroom = old->headroom; + n->_offload_info = old->_offload_info; + n->rss_hash.construct(old->rss_hash); + std::copy(old->frags, old->frags + old->_nr_frags, n->frags); + old->copy_internal_fragment_to(n.get()); + return std::move(n); + } + + static std::unique_ptr<impl> copy(impl* old) { + return copy(old, old->_nr_frags); + } + + static std::unique_ptr<impl> allocate_if_needed(std::unique_ptr<impl> old, size_t extra_frags) { + if (old->_allocated_frags >= old->_nr_frags + extra_frags) { + return std::move(old); + } + return copy(old.get(), std::max<size_t>(old->_nr_frags + extra_frags, 2 * old->_nr_frags)); + } + void* operator new(size_t size, size_t nr_frags = default_nr_frags) { + ceph_assert(nr_frags == uint16_t(nr_frags)); + return ::operator new(size + nr_frags * sizeof(fragment)); + } + // Matching the operator new above + void operator delete(void* ptr, size_t nr_frags) { + return ::operator delete(ptr); + } + // Since the above "placement delete" hides the global one, expose it + void operator delete(void* ptr) { + return ::operator delete(ptr); + } + + bool using_internal_data() const { + return _nr_frags + && frags[0].base >= data + && frags[0].base < data + internal_data_size; + } + + void unuse_internal_data() { + if (!using_internal_data()) { + return; + } + auto buf = static_cast<char*>(::malloc(frags[0].size)); + if (!buf) { + throw std::bad_alloc(); + } + deleter d = make_free_deleter(buf); + std::copy(frags[0].base, frags[0].base + frags[0].size, buf); + frags[0].base = buf; + _deleter.append(std::move(d)); + headroom = internal_data_size; + } + void copy_internal_fragment_to(impl* to) { + if (!using_internal_data()) { + return; + } + to->frags[0].base = to->data + headroom; + std::copy(frags[0].base, frags[0].base + frags[0].size, + to->frags[0].base); + } + }; + explicit Packet(std::unique_ptr<impl>&& impl) : _impl(std::move(impl)) {} + std::unique_ptr<impl> _impl; +public: + static Packet from_static_data(const char* data, size_t len) { + return {fragment{const_cast<char*>(data), len}, deleter()}; + } + + // build empty Packet + 
Packet(); + // build empty Packet with nr_frags allocated + explicit Packet(size_t nr_frags); + // move existing Packet + Packet(Packet&& x) noexcept; + // copy data into Packet + Packet(const char* data, size_t len); + // copy data into Packet + explicit Packet(fragment frag); + // zero-copy single fragment + Packet(fragment frag, deleter del); + // zero-copy multiple fragments + Packet(std::vector<fragment> frag, deleter del); + // build Packet with iterator + template <typename Iterator> + Packet(Iterator begin, Iterator end, deleter del); + // append fragment (copying new fragment) + Packet(Packet&& x, fragment frag); + // prepend fragment (copying new fragment, with header optimization) + Packet(fragment frag, Packet&& x); + // prepend fragment (zero-copy) + Packet(fragment frag, deleter del, Packet&& x); + // append fragment (zero-copy) + Packet(Packet&& x, fragment frag, deleter d); + // append deleter + Packet(Packet&& x, deleter d); + + Packet& operator=(Packet&& x) { + if (this != &x) { + this->~Packet(); + new (this) Packet(std::move(x)); + } + return *this; + } + + unsigned len() const { return _impl->_len; } + unsigned memory() const { return len() + sizeof(Packet::impl); } + + fragment frag(unsigned idx) const { return _impl->frags[idx]; } + fragment& frag(unsigned idx) { return _impl->frags[idx]; } + + unsigned nr_frags() const { return _impl->_nr_frags; } + pseudo_vector fragments() const { return { _impl->frags, _impl->_nr_frags }; } + fragment* fragment_array() const { return _impl->frags; } + + // share Packet data (reference counted, non COW) + Packet share(); + Packet share(size_t offset, size_t len); + + void append(Packet&& p); + + void trim_front(size_t how_much); + void trim_back(size_t how_much); + + // get a header pointer, linearizing if necessary + template <typename Header> + Header* get_header(size_t offset = 0); + + // get a header pointer, linearizing if necessary + char* get_header(size_t offset, size_t size); + + // prepend a header (default-initializing it) + template <typename Header> + Header* prepend_header(size_t extra_size = 0); + + // prepend a header (uninitialized!) 
+ char* prepend_uninitialized_header(size_t size); + + Packet free_on_cpu(EventCenter *c, std::function<void()> cb = []{}); + + void linearize() { return linearize(0, len()); } + + void reset() { _impl.reset(); } + + void reserve(int n_frags) { + if (n_frags > _impl->_nr_frags) { + auto extra = n_frags - _impl->_nr_frags; + _impl = impl::allocate_if_needed(std::move(_impl), extra); + } + } + Tub<uint32_t> rss_hash() { + return _impl->rss_hash; + } + void set_rss_hash(uint32_t hash) { + _impl->rss_hash.construct(hash); + } +private: + void linearize(size_t at_frag, size_t desired_size); + bool allocate_headroom(size_t size); +public: + class offload_info offload_info() const { return _impl->_offload_info; } + class offload_info& offload_info_ref() { return _impl->_offload_info; } + void set_offload_info(class offload_info oi) { _impl->_offload_info = oi; } +}; + +std::ostream& operator<<(std::ostream& os, const Packet& p); + +inline Packet::Packet(Packet&& x) noexcept + : _impl(std::move(x._impl)) { +} + +inline Packet::impl::impl(size_t nr_frags) + : _len(0), _allocated_frags(nr_frags) { +} + +inline Packet::impl::impl(fragment frag, size_t nr_frags) + : _len(frag.size), _allocated_frags(nr_frags) { + ceph_assert(_allocated_frags > _nr_frags); + if (frag.size <= internal_data_size) { + headroom -= frag.size; + frags[0] = { data + headroom, frag.size }; + } else { + auto buf = static_cast<char*>(::malloc(frag.size)); + if (!buf) { + throw std::bad_alloc(); + } + deleter d = make_free_deleter(buf); + frags[0] = { buf, frag.size }; + _deleter.append(std::move(d)); + } + std::copy(frag.base, frag.base + frag.size, frags[0].base); + ++_nr_frags; +} + +inline Packet::Packet(): _impl(impl::allocate(1)) { +} + +inline Packet::Packet(size_t nr_frags): _impl(impl::allocate(nr_frags)) { +} + +inline Packet::Packet(fragment frag): _impl(new impl(frag)) { +} + +inline Packet::Packet(const char* data, size_t size): + Packet(fragment{const_cast<char*>(data), size}) { +} + +inline Packet::Packet(fragment frag, deleter d) + : _impl(impl::allocate(1)) { + _impl->_deleter = std::move(d); + _impl->frags[_impl->_nr_frags++] = frag; + _impl->_len = frag.size; +} + +inline Packet::Packet(std::vector<fragment> frag, deleter d) + : _impl(impl::allocate(frag.size())) { + _impl->_deleter = std::move(d); + std::copy(frag.begin(), frag.end(), _impl->frags); + _impl->_nr_frags = frag.size(); + _impl->_len = 0; + for (auto&& f : _impl->fragments()) { + _impl->_len += f.size; + } +} + +template <typename Iterator> +inline Packet::Packet(Iterator begin, Iterator end, deleter del) { + unsigned nr_frags = 0, len = 0; + nr_frags = std::distance(begin, end); + std::for_each(begin, end, [&] (fragment& frag) { len += frag.size; }); + _impl = impl::allocate(nr_frags); + _impl->_deleter = std::move(del); + _impl->_len = len; + _impl->_nr_frags = nr_frags; + std::copy(begin, end, _impl->frags); +} + +inline Packet::Packet(Packet&& x, fragment frag) + : _impl(impl::allocate_if_needed(std::move(x._impl), 1)) { + _impl->_len += frag.size; + char* buf = new char[frag.size]; + std::copy(frag.base, frag.base + frag.size, buf); + _impl->frags[_impl->_nr_frags++] = {buf, frag.size}; + _impl->_deleter = make_deleter(std::move(_impl->_deleter), [buf] { + delete[] buf; + }); +} + +inline bool Packet::allocate_headroom(size_t size) { + if (_impl->headroom >= size) { + _impl->_len += size; + if (!_impl->using_internal_data()) { + _impl = impl::allocate_if_needed(std::move(_impl), 1); + std::copy_backward(_impl->frags, _impl->frags + 
_impl->_nr_frags, + _impl->frags + _impl->_nr_frags + 1); + _impl->frags[0] = { _impl->data + internal_data_size, 0 }; + ++_impl->_nr_frags; + } + _impl->headroom -= size; + _impl->frags[0].base -= size; + _impl->frags[0].size += size; + return true; + } else { + return false; + } +} + + +inline Packet::Packet(fragment frag, Packet&& x) + : _impl(std::move(x._impl)) { + // try to prepend into existing internal fragment + if (allocate_headroom(frag.size)) { + std::copy(frag.base, frag.base + frag.size, _impl->frags[0].base); + return; + } else { + // didn't work out, allocate and copy + _impl->unuse_internal_data(); + _impl = impl::allocate_if_needed(std::move(_impl), 1); + _impl->_len += frag.size; + char *buf = new char[frag.size]; + std::copy(frag.base, frag.base + frag.size, buf); + std::copy_backward(_impl->frags, _impl->frags + _impl->_nr_frags, + _impl->frags + _impl->_nr_frags + 1); + ++_impl->_nr_frags; + _impl->frags[0] = {buf, frag.size}; + _impl->_deleter = make_deleter( + std::move(_impl->_deleter), [buf] { delete []buf; }); + } +} + +inline Packet::Packet(Packet&& x, fragment frag, deleter d) + : _impl(impl::allocate_if_needed(std::move(x._impl), 1)) { + _impl->_len += frag.size; + _impl->frags[_impl->_nr_frags++] = frag; + d.append(std::move(_impl->_deleter)); + _impl->_deleter = std::move(d); +} + +inline Packet::Packet(Packet&& x, deleter d): _impl(std::move(x._impl)) { + _impl->_deleter.append(std::move(d)); +} + +inline void Packet::append(Packet&& p) { + if (!_impl->_len) { + *this = std::move(p); + return; + } + _impl = impl::allocate_if_needed(std::move(_impl), p._impl->_nr_frags); + _impl->_len += p._impl->_len; + p._impl->unuse_internal_data(); + std::copy(p._impl->frags, p._impl->frags + p._impl->_nr_frags, + _impl->frags + _impl->_nr_frags); + _impl->_nr_frags += p._impl->_nr_frags; + p._impl->_deleter.append(std::move(_impl->_deleter)); + _impl->_deleter = std::move(p._impl->_deleter); +} + +inline char* Packet::get_header(size_t offset, size_t size) { + if (offset + size > _impl->_len) { + return nullptr; + } + size_t i = 0; + while (i != _impl->_nr_frags && offset >= _impl->frags[i].size) { + offset -= _impl->frags[i++].size; + } + if (i == _impl->_nr_frags) { + return nullptr; + } + if (offset + size > _impl->frags[i].size) { + linearize(i, offset + size); + } + return _impl->frags[i].base + offset; +} + +template <typename Header> +inline Header* Packet::get_header(size_t offset) { + return reinterpret_cast<Header*>(get_header(offset, sizeof(Header))); +} + +inline void Packet::trim_front(size_t how_much) { + ceph_assert(how_much <= _impl->_len); + _impl->_len -= how_much; + size_t i = 0; + while (how_much && how_much >= _impl->frags[i].size) { + how_much -= _impl->frags[i++].size; + } + std::copy(_impl->frags + i, _impl->frags + _impl->_nr_frags, _impl->frags); + _impl->_nr_frags -= i; + if (!_impl->using_internal_data()) { + _impl->headroom = internal_data_size; + } + if (how_much) { + if (_impl->using_internal_data()) { + _impl->headroom += how_much; + } + _impl->frags[0].base += how_much; + _impl->frags[0].size -= how_much; + } +} + +inline void Packet::trim_back(size_t how_much) { + ceph_assert(how_much <= _impl->_len); + _impl->_len -= how_much; + size_t i = _impl->_nr_frags - 1; + while (how_much && how_much >= _impl->frags[i].size) { + how_much -= _impl->frags[i--].size; + } + _impl->_nr_frags = i + 1; + if (how_much) { + _impl->frags[i].size -= how_much; + if (i == 0 && _impl->using_internal_data()) { + _impl->headroom += how_much; + } + } +} + 
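+// Illustrative sketch only: a typical header round-trip over the helpers
+// above (`eth_hdr` is a placeholder type, not defined in this file):
+//
+//   Packet p = ...;
+//   auto h = p.get_header<eth_hdr>(0);      // linearizes if the header spans frags
+//   ...parse *h...
+//   p.trim_front(sizeof(eth_hdr));          // consume the parsed header
+//   auto nh = p.prepend_header<eth_hdr>();  // headroom-backed, value-initialized
+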
+template <typename Header> +Header* Packet::prepend_header(size_t extra_size) { + auto h = prepend_uninitialized_header(sizeof(Header) + extra_size); + return new (h) Header{}; +} + +// prepend a header (uninitialized!) +inline char* Packet::prepend_uninitialized_header(size_t size) { + if (!allocate_headroom(size)) { + // didn't work out, allocate and copy + _impl->unuse_internal_data(); + // try again, after unuse_internal_data we may have space after all + if (!allocate_headroom(size)) { + // failed + _impl->_len += size; + _impl = impl::allocate_if_needed(std::move(_impl), 1); + char *buf = new char[size]; + std::copy_backward(_impl->frags, _impl->frags + _impl->_nr_frags, + _impl->frags + _impl->_nr_frags + 1); + ++_impl->_nr_frags; + _impl->frags[0] = {buf, size}; + _impl->_deleter = make_deleter(std::move(_impl->_deleter), + [buf] { delete []buf; }); + } + } + return _impl->frags[0].base; +} + +inline Packet Packet::share() { + return share(0, _impl->_len); +} + +inline Packet Packet::share(size_t offset, size_t len) { + _impl->unuse_internal_data(); // FIXME: eliminate? + Packet n; + n._impl = impl::allocate_if_needed(std::move(n._impl), _impl->_nr_frags); + size_t idx = 0; + while (offset > 0 && offset >= _impl->frags[idx].size) { + offset -= _impl->frags[idx++].size; + } + while (n._impl->_len < len) { + auto& f = _impl->frags[idx++]; + auto fsize = std::min(len - n._impl->_len, f.size - offset); + n._impl->frags[n._impl->_nr_frags++] = { f.base + offset, fsize }; + n._impl->_len += fsize; + offset = 0; + } + n._impl->_offload_info = _impl->_offload_info; + ceph_assert(!n._impl->_deleter); + n._impl->_deleter = _impl->_deleter.share(); + return n; +} + +#endif /* CEPH_MSG_PACKET_H_ */ diff --git a/src/msg/async/dpdk/PacketUtil.h b/src/msg/async/dpdk/PacketUtil.h new file mode 100644 index 00000000..118218e6 --- /dev/null +++ b/src/msg/async/dpdk/PacketUtil.h @@ -0,0 +1,154 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
+ */
+
+#ifndef CEPH_MSG_PACKET_UTIL_H_
+#define CEPH_MSG_PACKET_UTIL_H_
+
+#include <map>
+#include <iostream>
+
+#include "Packet.h"
+
+template <typename Offset, typename Tag>
+class packet_merger {
+ private:
+  static uint64_t& linearizations_ref() {
+    static thread_local uint64_t linearization_count;
+    return linearization_count;
+  }
+ public:
+  std::map<Offset, Packet> map;
+
+  static uint64_t linearizations() {
+    return linearizations_ref();
+  }
+
+  void merge(Offset offset, Packet p) {
+    bool insert = true;
+    auto beg = offset;
+    auto end = beg + p.len();
+    // First, try to merge the packet with existing segment
+    for (auto it = map.begin(); it != map.end();) {
+      auto& seg_pkt = it->second;
+      auto seg_beg = it->first;
+      auto seg_end = seg_beg + seg_pkt.len();
+      // There are 6 cases:
+      if (seg_beg <= beg && end <= seg_end) {
+        // 1) seg_beg beg end seg_end
+        // We already have data in this packet
+        return;
+      } else if (beg <= seg_beg && seg_end <= end) {
+        // 2) beg seg_beg seg_end end
+        // The new segment contains more data than this old segment
+        // Delete the old one, insert the new one
+        it = map.erase(it);
+        insert = true;
+        break;
+      } else if (beg < seg_beg && seg_beg <= end && end <= seg_end) {
+        // 3) beg seg_beg end seg_end
+        // Merge two segments, trim front of old segment
+        auto trim = end - seg_beg;
+        seg_pkt.trim_front(trim);
+        p.append(std::move(seg_pkt));
+        // Delete the old one, insert the new one
+        it = map.erase(it);
+        insert = true;
+        break;
+      } else if (seg_beg <= beg && beg <= seg_end && seg_end < end) {
+        // 4) seg_beg beg seg_end end
+        // Merge two segments, trim front of new segment
+        auto trim = seg_end - beg;
+        p.trim_front(trim);
+        // Append new data to the old segment, keep the old segment
+        seg_pkt.append(std::move(p));
+        seg_pkt.linearize();
+        ++linearizations_ref();
+        insert = false;
+        break;
+      } else {
+        // 5) beg end < seg_beg seg_end
+        //   or
+        // 6) seg_beg seg_end < beg end
+        // Cannot merge with this segment, keep looking
+        it++;
+        insert = true;
+      }
+    }
+
+    if (insert) {
+      p.linearize();
+      ++linearizations_ref();
+      map.emplace(beg, std::move(p));
+    }
+
+    // Second, merge adjacent segments after this packet has been merged,
+    // because this packet might fill a "hole" and make two adjacent
+    // segments mergeable
+    for (auto it = map.begin(); it != map.end();) {
+      // The first segment
+      auto& seg_pkt = it->second;
+      auto seg_beg = it->first;
+      auto seg_end = seg_beg + seg_pkt.len();
+
+      // The second segment
+      auto it_next = it;
+      it_next++;
+      if (it_next == map.end()) {
+        break;
+      }
+      auto& p = it_next->second;
+      auto beg = it_next->first;
+      auto end = beg + p.len();
+
+      // Merge the second segment into the first segment if possible
+      if (seg_beg <= beg && beg <= seg_end && seg_end < end) {
+        // Merge two segments, trim front of second segment
+        auto trim = seg_end - beg;
+        p.trim_front(trim);
+        // Append new data to the first segment, keep the first segment
+        seg_pkt.append(std::move(p));
+
+        // Delete the second segment
+        map.erase(it_next);
+
+        // Keep merging this first segment with its new next packet
+        // So we do not update the iterator: it
+        continue;
+      } else if (end <= seg_end) {
+        // The first segment has all the data in the second segment
+        // Delete the second segment
+        map.erase(it_next);
+        continue;
+      } else if (seg_end < beg) {
+        // Cannot merge first segment with second segment
+        it = it_next;
+        continue;
+      } else {
+        // If we reach here, we have a bug with merge.
+ std::cout << "packet_merger: merge error\n"; + abort(); + } + } + } +}; + +#endif diff --git a/src/msg/async/dpdk/TCP-Stack.h b/src/msg/async/dpdk/TCP-Stack.h new file mode 100644 index 00000000..996ae93c --- /dev/null +++ b/src/msg/async/dpdk/TCP-Stack.h @@ -0,0 +1,40 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +// tcp/network-stack integration + +#ifndef CEPH_MSG_DPDK_TCP_STACK_H +#define CEPH_MSG_DPDK_TCP_STACK_H + +class ServerSocket; +class ConnectedSocket; + +class ipv4_traits; +template <typename InetTraits> +class tcp; + +int tcpv4_listen(tcp<ipv4_traits>& tcpv4, uint16_t port, const SocketOptions &opts, + int type, ServerSocket *sa); + +int tcpv4_connect(tcp<ipv4_traits>& tcpv4, const entity_addr_t &addr, + ConnectedSocket *sa); + +#endif diff --git a/src/msg/async/dpdk/TCP.cc b/src/msg/async/dpdk/TCP.cc new file mode 100644 index 00000000..c6397709 --- /dev/null +++ b/src/msg/async/dpdk/TCP.cc @@ -0,0 +1,840 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
+ */ + +#include "align.h" +#include "TCP.h" +#include "IP.h" +#include "DPDKStack.h" + +#include "common/dout.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_dpdk +#undef dout_prefix +#define dout_prefix *_dout << "tcp " + +void tcp_option::parse(uint8_t* beg, uint8_t* end) +{ + while (beg < end) { + auto kind = option_kind(*beg); + if (kind != option_kind::nop && kind != option_kind::eol) { + // Make sure there is enough room for this option + auto len = *(beg + 1); + if (beg + len > end) { + return; + } + } + switch (kind) { + case option_kind::mss: + _mss_received = true; + _remote_mss = ntoh(reinterpret_cast<mss*>(beg)->mss); + beg += option_len::mss; + break; + case option_kind::win_scale: + _win_scale_received = true; + _remote_win_scale = reinterpret_cast<win_scale*>(beg)->shift; + // We can turn on win_scale option, 7 is Linux's default win scale size + _local_win_scale = 7; + beg += option_len::win_scale; + break; + case option_kind::sack: + _sack_received = true; + beg += option_len::sack; + break; + case option_kind::nop: + beg += option_len::nop; + break; + case option_kind::eol: + return; + default: + // Ignore options we do not understand + auto len = *(beg + 1); + beg += len; + // Prevent infinite loop + if (len == 0) { + return; + } + break; + } + } +} + +uint8_t tcp_option::fill(tcp_hdr* th, uint8_t options_size) +{ + auto hdr = reinterpret_cast<uint8_t*>(th); + auto off = hdr + sizeof(tcp_hdr); + uint8_t size = 0; + bool syn_on = th->f_syn; + bool ack_on = th->f_ack; + + if (syn_on) { + if (_mss_received || !ack_on) { + auto mss = new (off) tcp_option::mss; + mss->mss = _local_mss; + off += mss->len; + size += mss->len; + *mss = mss->hton(); + } + if (_win_scale_received || !ack_on) { + auto win_scale = new (off) tcp_option::win_scale; + win_scale->shift = _local_win_scale; + off += win_scale->len; + size += win_scale->len; + } + } + if (size > 0) { + // Insert NOP option + auto size_max = align_up(uint8_t(size + 1), tcp_option::align); + while (size < size_max - uint8_t(option_len::eol)) { + new (off) tcp_option::nop; + off += option_len::nop; + size += option_len::nop; + } + new (off) tcp_option::eol; + size += option_len::eol; + } + ceph_assert(size == options_size); + + return size; +} + +uint8_t tcp_option::get_size(bool syn_on, bool ack_on) +{ + uint8_t size = 0; + if (syn_on) { + if (_mss_received || !ack_on) { + size += option_len::mss; + } + if (_win_scale_received || !ack_on) { + size += option_len::win_scale; + } + } + if (size > 0) { + size += option_len::eol; + // Insert NOP option to align on 32-bit + size = align_up(size, tcp_option::align); + } + return size; +} + +ipv4_tcp::ipv4_tcp(ipv4& inet, EventCenter *c) + : _inet_l4(inet), _tcp(std::unique_ptr<tcp<ipv4_traits>>(new tcp<ipv4_traits>(inet.cct, _inet_l4, c))) +{ } + +ipv4_tcp::~ipv4_tcp() { } + +void ipv4_tcp::received(Packet p, ipv4_address from, ipv4_address to) +{ + _tcp->received(std::move(p), from, to); +} + +bool ipv4_tcp::forward(forward_hash& out_hash_data, Packet& p, size_t off) +{ + return _tcp->forward(out_hash_data, p, off); +} + +int tcpv4_listen(tcp<ipv4_traits>& tcpv4, uint16_t port, const SocketOptions &opts, + int type, ServerSocket *sock) +{ + auto p = new DPDKServerSocketImpl<tcp<ipv4_traits>>(tcpv4, port, opts, type); + int r = p->listen(); + if (r < 0) { + delete p; + return r; + } + *sock = ServerSocket(std::unique_ptr<ServerSocketImpl>(p)); + return 0; +} + +int tcpv4_connect(tcp<ipv4_traits>& tcpv4, const entity_addr_t &addr, + ConnectedSocket *sock) +{ 
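+  // Illustrative usage from the stack layer (sketch only; `stack`,
+  // `peer_addr` and the error handling are placeholders, not code in
+  // this file):
+  //
+  //   ConnectedSocket sock;
+  //   int r = tcpv4_connect(stack->tcpv4(), peer_addr, &sock);
+  //   if (r < 0) { /* failed to start the connect */ }
+  //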
+ auto conn = tcpv4.connect(addr); + *sock = ConnectedSocket(std::unique_ptr<ConnectedSocketImpl>( + new NativeConnectedSocketImpl<tcp<ipv4_traits>>(std::move(conn)))); + return 0; +} + +template <typename InetTraits> +void tcp<InetTraits>::respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip) +{ + ldout(cct, 20) << __func__ << " tcp header rst=" << bool(rth->f_rst) << " fin=" << bool(rth->f_fin) + << " syn=" << bool(rth->f_syn) << dendl; + if (rth->f_rst) { + return; + } + Packet p; + auto th = p.prepend_header<tcp_hdr>(); + th->src_port = rth->dst_port; + th->dst_port = rth->src_port; + if (rth->f_ack) { + th->seq = rth->ack; + } + // If this RST packet is in response to a SYN packet. We ACK the ISN. + if (rth->f_syn) { + th->ack = rth->seq + 1; + th->f_ack = true; + } + th->f_rst = true; + th->data_offset = sizeof(*th) / 4; + th->checksum = 0; + *th = th->hton(); + + checksummer csum; + offload_info oi; + InetTraits::tcp_pseudo_header_checksum(csum, local_ip, foreign_ip, sizeof(*th)); + if (get_hw_features().tx_csum_l4_offload) { + th->checksum = ~csum.get(); + oi.needs_csum = true; + } else { + csum.sum(p); + th->checksum = csum.get(); + oi.needs_csum = false; + } + + oi.protocol = ip_protocol_num::tcp; + oi.tcp_hdr_len = sizeof(tcp_hdr); + p.set_offload_info(oi); + + send_packet_without_tcb(local_ip, foreign_ip, std::move(p)); +} + +#undef dout_prefix +#define dout_prefix _prefix(_dout) +template<typename InetTraits> +ostream& tcp<InetTraits>::tcb::_prefix(std::ostream *_dout) { + return *_dout << "tcp " << _local_ip << ":" << _local_port << " -> " << _foreign_ip << ":" << _foreign_port + << " tcb(" << this << " fd=" << fd << " s=" << _state << ")."; +} + +template<typename InetTraits> +void tcp<InetTraits>::tcb::input_handle_listen_state(tcp_hdr* th, Packet p) +{ + auto opt_len = th->data_offset * 4 - sizeof(tcp_hdr); + auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, th->data_offset * 4)) + sizeof(tcp_hdr); + auto opt_end = opt_start + opt_len; + p.trim_front(th->data_offset * 4); + tcp_sequence seg_seq = th->seq; + + // Set RCV.NXT to SEG.SEQ+1, IRS is set to SEG.SEQ + _rcv.next = seg_seq + 1; + _rcv.initial = seg_seq; + + // ISS should be selected and a SYN segment sent of the form: + // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK> + // SND.NXT is set to ISS+1 and SND.UNA to ISS + // NOTE: In previous code, _snd.next is set to ISS + 1 only when SYN is + // ACKed. Now, we set _snd.next to ISS + 1 here, so in output_one(): we + // have + // th->seq = syn_on ? _snd.initial : _snd.next + // to make sure retransmitted SYN has correct SEQ number. 
+ do_setup_isn(); + + _rcv.urgent = _rcv.next; + + ldout(_tcp.cct, 10) << __func__ << " listen: LISTEN -> SYN_RECEIVED" << dendl; + init_from_options(th, opt_start, opt_end); + do_syn_received(); +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::input_handle_syn_sent_state(tcp_hdr* th, Packet p) +{ + auto opt_len = th->data_offset * 4 - sizeof(tcp_hdr); + auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, th->data_offset * 4)) + sizeof(tcp_hdr); + auto opt_end = opt_start + opt_len; + p.trim_front(th->data_offset * 4); + tcp_sequence seg_seq = th->seq; + auto seg_ack = th->ack; + + ldout(_tcp.cct, 20) << __func__ << " tcp header seq " << seg_seq.raw << " ack " << seg_ack.raw + << " fin=" << bool(th->f_fin) << " syn=" << bool(th->f_syn) << dendl; + + bool acceptable = false; + // 3.1 first check the ACK bit + if (th->f_ack) { + // If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless the + // RST bit is set, if so drop the segment and return) + if (seg_ack <= _snd.initial || seg_ack > _snd.next) { + return respond_with_reset(th); + } + + // If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable. + acceptable = _snd.unacknowledged <= seg_ack && seg_ack <= _snd.next; + } + + // 3.2 second check the RST bit + if (th->f_rst) { + // If the ACK was acceptable then signal the user "error: connection + // reset", drop the segment, enter CLOSED state, delete TCB, and + // return. Otherwise (no ACK) drop the segment and return. + if (acceptable) { + return do_reset(); + } else { + return; + } + } + + // 3.3 third check the security and precedence + // NOTE: Ignored for now + + // 3.4 fourth check the SYN bit + if (th->f_syn) { + // RCV.NXT is set to SEG.SEQ+1, IRS is set to SEG.SEQ. SND.UNA should + // be advanced to equal SEG.ACK (if there is an ACK), and any segments + // on the retransmission queue which are thereby acknowledged should be + // removed. + _rcv.next = seg_seq + 1; + _rcv.initial = seg_seq; + if (th->f_ack) { + // TODO: clean retransmission queue + _snd.unacknowledged = seg_ack; + } + if (_snd.unacknowledged > _snd.initial) { + // If SND.UNA > ISS (our SYN has been ACKed), change the connection + // state to ESTABLISHED, form an ACK segment + // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK> + ldout(_tcp.cct, 20) << __func__ << " syn: SYN_SENT -> ESTABLISHED" << dendl; + init_from_options(th, opt_start, opt_end); + do_established(); + output(); + } else { + // Otherwise enter SYN_RECEIVED, form a SYN,ACK segment + // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK> + ldout(_tcp.cct, 20) << __func__ << " syn: SYN_SENT -> SYN_RECEIVED" << dendl; + do_syn_received(); + } + } + + // 3.5 fifth, if neither of the SYN or RST bits is set then drop the + // segment and return. 
+  return;
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::input_handle_other_state(tcp_hdr* th, Packet p)
+{
+  p.trim_front(th->data_offset * 4);
+  bool do_output = false;
+  bool do_output_data = false;
+  tcp_sequence seg_seq = th->seq;
+  auto seg_ack = th->ack;
+  auto seg_len = p.len();
+  ldout(_tcp.cct, 20) << __func__ << " tcp header seq " << seg_seq.raw << " ack " << seg_ack.raw
+                      << " snd next " << _snd.next.raw << " unack " << _snd.unacknowledged.raw
+                      << " rcv next " << _rcv.next.raw << " len " << seg_len
+                      << " fin=" << bool(th->f_fin) << " syn=" << bool(th->f_syn) << dendl;
+
+  // 4.1 first check sequence number
+  if (!segment_acceptable(seg_seq, seg_len)) {
+    //<SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+    return output();
+  }
+
+  // In the following it is assumed that the segment is the idealized
+  // segment that begins at RCV.NXT and does not exceed the window.
+  if (seg_seq < _rcv.next) {
+    // ignore already acknowledged data
+    auto dup = std::min(uint32_t(_rcv.next - seg_seq), seg_len);
+    ldout(_tcp.cct, 10) << __func__ << " dup segment len " << dup << dendl;
+    p.trim_front(dup);
+    seg_len -= dup;
+    seg_seq += dup;
+  }
+  // FIXME: We should trim data outside the right edge of the receive window as well
+
+  if (seg_seq != _rcv.next) {
+    ldout(_tcp.cct, 10) << __func__ << " out of order, expect " << _rcv.next.raw
+                        << " actual " << seg_seq.raw
+                        << " out of order size " << _rcv.out_of_order.map.size()
+                        << dendl;
+    insert_out_of_order(seg_seq, std::move(p));
+    // A TCP receiver SHOULD send an immediate duplicate ACK
+    // when an out-of-order segment arrives.
+    return output();
+  }
+
+  // 4.2 second check the RST bit
+  if (th->f_rst) {
+    if (in_state(SYN_RECEIVED)) {
+      // If this connection was initiated with a passive OPEN (i.e.,
+      // came from the LISTEN state), then return this connection to
+      // LISTEN state and return. The user need not be informed. If
+      // this connection was initiated with an active OPEN (i.e., came
+      // from SYN_SENT state) then the connection was refused, signal
+      // the user "connection refused". In either case, all segments
+      // on the retransmission queue should be removed. And in the
+      // active OPEN case, enter the CLOSED state and delete the TCB,
+      // and return.
+      // (report the error through the tcb's _errno, not the global errno)
+      _errno = -ECONNREFUSED;
+      return do_reset();
+    }
+    if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2 | CLOSE_WAIT)) {
+      // If the RST bit is set then, any outstanding RECEIVEs and SEND
+      // should receive "reset" responses. All segment queues should be
+      // flushed. Users should also receive an unsolicited general
+      // "connection reset" signal. Enter the CLOSED state, delete the
+      // TCB, and return.
+      return do_reset();
+    }
+    if (in_state(CLOSING | LAST_ACK | TIME_WAIT)) {
+      // If the RST bit is set then, enter the CLOSED state, delete the
+      // TCB, and return.
+      return do_closed();
+    }
+  }
+
+  // 4.3 third check security and precedence
+  // NOTE: Ignored for now
+
+  // 4.4 fourth, check the SYN bit
+  if (th->f_syn) {
+    // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2
+    // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT
+
+    // If the SYN is in the window it is an error, send a reset, any
+    // outstanding RECEIVEs and SEND should receive "reset" responses,
+    // all segment queues should be flushed, the user should also
+    // receive an unsolicited general "connection reset" signal, enter
+    // the CLOSED state, delete the TCB, and return.
+    respond_with_reset(th);
+    return do_reset();
+
+    // If the SYN is not in the window this step would not be reached
+    // and an ack would have been sent in the first step (sequence
+    // number check).
+  }
+
+  // 4.5 fifth check the ACK field
+  if (!th->f_ack) {
+    // if the ACK bit is off drop the segment and return
+    return;
+  } else {
+    // SYN_RECEIVED STATE
+    if (in_state(SYN_RECEIVED)) {
+      // If SND.UNA =< SEG.ACK =< SND.NXT then enter ESTABLISHED state
+      // and continue processing.
+      if (_snd.unacknowledged <= seg_ack && seg_ack <= _snd.next) {
+        ldout(_tcp.cct, 20) << __func__ << " SYN_RECEIVED -> ESTABLISHED" << dendl;
+        do_established();
+        if (_tcp.push_listen_queue(_local_port, this)) {
+          ldout(_tcp.cct, 20) << __func__ << " successfully accepting socket" << dendl;
+        } else {
+          ldout(_tcp.cct, 5) << __func__ << " no listener exists or the accept queue is full, reset" << dendl;
+          return respond_with_reset(th);
+        }
+      } else {
+        // <SEQ=SEG.ACK><CTL=RST>
+        return respond_with_reset(th);
+      }
+    }
+    auto update_window = [this, th, seg_seq, seg_ack] {
+      ldout(_tcp.cct, 20) << __func__ << " window update seg_seq=" << seg_seq
+                          << " seg_ack=" << seg_ack << " window=" << th->window
+                          << " window_scale=" << int(_snd.window_scale) << dendl;
+      _snd.window = th->window << _snd.window_scale;
+      _snd.wl1 = seg_seq;
+      _snd.wl2 = seg_ack;
+      if (_snd.window == 0) {
+        _persist_time_out = _rto;
+        start_persist_timer();
+      } else {
+        stop_persist_timer();
+      }
+    };
+    // ESTABLISHED STATE or
+    // CLOSE_WAIT STATE: Do the same processing as for the ESTABLISHED state.
+    if (in_state(ESTABLISHED | CLOSE_WAIT)) {
+      // If SND.UNA < SEG.ACK =< SND.NXT then, set SND.UNA <- SEG.ACK.
+      if (_snd.unacknowledged < seg_ack && seg_ack <= _snd.next) {
+        // Remote ACKed data we sent
+        auto acked_bytes = data_segment_acked(seg_ack);
+
+        // If SND.UNA < SEG.ACK =< SND.NXT, the send window should be updated.
+        if (_snd.wl1 < seg_seq || (_snd.wl1 == seg_seq && _snd.wl2 <= seg_ack)) {
+          update_window();
+        }
+
+        // some data is acked, try to send more data
+        do_output_data = true;
+
+        auto set_retransmit_timer = [this] {
+          if (_snd.data.empty()) {
+            // All outstanding segments are acked, turn off the timer.
+            stop_retransmit_timer();
+            // Signal the waiter of this event
+            signal_all_data_acked();
+          } else {
+            // Restart the timer because new data is acked.
+            start_retransmit_timer();
+          }
+        };
+
+        if (_snd.dupacks >= 3) {
+          // We are in fast retransmit / fast recovery phase
+          uint32_t smss = _snd.mss;
+          if (seg_ack > _snd.recover) {
+            ldout(_tcp.cct, 20) << __func__ << " ack: full_ack" << dendl;
+            // Set cwnd to min (ssthresh, max(FlightSize, SMSS) + SMSS)
+            _snd.cwnd = std::min(_snd.ssthresh, std::max(flight_size(), smss) + smss);
+            // Exit the fast recovery procedure
+            exit_fast_recovery();
+            set_retransmit_timer();
+          } else {
+            ldout(_tcp.cct, 20) << __func__ << " ack: partial_ack" << dendl;
+            // Retransmit the first unacknowledged segment
+            fast_retransmit();
+            // Deflate the congestion window by the amount of new data
+            // acknowledged by the Cumulative Acknowledgment field
+            _snd.cwnd -= acked_bytes;
+            // If the partial ACK acknowledges at least one SMSS of new
+            // data, then add back SMSS bytes to the congestion window
+            if (acked_bytes >= smss) {
+              _snd.cwnd += smss;
+            }
+            // Send a new segment if permitted by the new value of
+            // cwnd. Do not exit the fast recovery procedure. For
+            // the first partial ACK that arrives during fast
+            // recovery, also reset the retransmit timer.
+            if (++_snd.partial_ack == 1) {
+              start_retransmit_timer();
+            }
+          }
+        } else {
+          // RFC5681: The fast retransmit algorithm uses the arrival
+          // of 3 duplicate ACKs (as defined in section 2, without
+          // any intervening ACKs which move SND.UNA) as an
+          // indication that a segment has been lost.
+          //
+          // So, here we reset dupacks to zero because this ACK moves
+          // SND.UNA.
+          exit_fast_recovery();
+          set_retransmit_timer();
+        }
+      } else if (!_snd.data.empty() && seg_len == 0 &&
+                 th->f_fin == 0 && th->f_syn == 0 &&
+                 th->ack == _snd.unacknowledged &&
+                 uint32_t(th->window << _snd.window_scale) == _snd.window) {
+        // Note:
+        // RFC793 states:
+        // If the ACK is a duplicate (SEG.ACK < SND.UNA), it can be ignored
+        // RFC5681 states:
+        // The TCP sender SHOULD use the "fast retransmit" algorithm to detect
+        // and repair loss, based on incoming duplicate ACKs.
+        // Here, we follow RFC5681.
+        _snd.dupacks++;
+        uint32_t smss = _snd.mss;
+        // 3 duplicated ACKs trigger a fast retransmit
+        if (_snd.dupacks == 1 || _snd.dupacks == 2) {
+          // RFC5681 Step 3.1
+          // Send cwnd + 2 * smss per RFC3042
+          do_output_data = true;
+        } else if (_snd.dupacks == 3) {
+          // RFC6582 Step 3.2
+          if (seg_ack - 1 > _snd.recover) {
+            _snd.recover = _snd.next - 1;
+            // RFC5681 Step 3.2
+            _snd.ssthresh = std::max((flight_size() - _snd.limited_transfer) / 2, 2 * smss);
+            fast_retransmit();
+          } else {
+            // Do not enter fast retransmit and do not reset ssthresh
+          }
+          // RFC5681 Step 3.3
+          _snd.cwnd = _snd.ssthresh + 3 * smss;
+        } else if (_snd.dupacks > 3) {
+          // RFC5681 Step 3.4
+          _snd.cwnd += smss;
+          // RFC5681 Step 3.5
+          do_output_data = true;
+        }
+      } else if (seg_ack > _snd.next) {
+        // If the ACK acks something not yet sent (SEG.ACK > SND.NXT)
+        // then send an ACK, drop the segment, and return
+        return output();
+      } else if (_snd.window == 0 && th->window > 0) {
+        update_window();
+        do_output_data = true;
+      }
+    }
+    // FIN_WAIT_1 STATE
+    if (in_state(FIN_WAIT_1)) {
+      // In addition to the processing for the ESTABLISHED state, if
+      // our FIN is now acknowledged then enter FIN-WAIT-2 and continue
+      // processing in that state.
+      if (seg_ack == _snd.next + 1) {
+        ldout(_tcp.cct, 20) << __func__ << " ack: FIN_WAIT_1 -> FIN_WAIT_2" << dendl;
+        _state = FIN_WAIT_2;
+        do_local_fin_acked();
+      }
+    }
+    // FIN_WAIT_2 STATE
+    if (in_state(FIN_WAIT_2)) {
+      // In addition to the processing for the ESTABLISHED state, if
+      // the retransmission queue is empty, the user's CLOSE can be
+      // acknowledged ("ok") but do not delete the TCB.
+      // TODO
+    }
+    // CLOSING STATE
+    if (in_state(CLOSING)) {
+      if (seg_ack == _snd.next + 1) {
+        ldout(_tcp.cct, 20) << __func__ << " ack: CLOSING -> TIME_WAIT" << dendl;
+        do_local_fin_acked();
+        return do_time_wait();
+      } else {
+        return;
+      }
+    }
+    // LAST_ACK STATE
+    if (in_state(LAST_ACK)) {
+      if (seg_ack == _snd.next + 1) {
+        ldout(_tcp.cct, 20) << __func__ << " ack: LAST_ACK -> CLOSED" << dendl;
+        do_local_fin_acked();
+        return do_closed();
+      }
+    }
+    // TIME_WAIT STATE
+    if (in_state(TIME_WAIT)) {
+      // The only thing that can arrive in this state is a
+      // retransmission of the remote FIN. Acknowledge it, and restart
+      // the 2 MSL timeout.
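+      //
+      // A restart of the 2 MSL timer here could reuse the EventCenter
+      // time-event pattern this class already uses for the retransmit and
+      // persist timers. Sketch only; `time_wait_fd`, `time_wait_event` and
+      // `MSL_US` are hypothetical, nothing in this file defines them:
+      //
+      //   if (time_wait_fd)
+      //     center->delete_time_event(*time_wait_fd);
+      //   time_wait_fd.construct(
+      //     center->create_time_event(2 * MSL_US, time_wait_event));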
+      // TODO
+    }
+  }
+
+  // 4.6 sixth, check the URG bit
+  if (th->f_urg) {
+    // TODO
+  }
+
+  // 4.7 seventh, process the segment text
+  if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2)) {
+    if (p.len()) {
+      // Once the TCP takes responsibility for the data it advances
+      // RCV.NXT over the data accepted, and adjusts RCV.WND as
+      // appropriate to the current buffer availability. The total of
+      // RCV.NXT and RCV.WND should not be reduced.
+      _rcv.data.push_back(std::move(p));
+      _rcv.next += seg_len;
+      auto merged = merge_out_of_order();
+      signal_data_received();
+      // Send an acknowledgment of the form:
+      //   <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+      // This acknowledgment should be piggybacked on a segment being
+      // transmitted if possible without incurring undue delay.
+      if (merged) {
+        // TCP receiver SHOULD send an immediate ACK when the
+        // incoming segment fills in all or part of a gap in the
+        // sequence space.
+        do_output = true;
+      } else {
+        do_output = should_send_ack(seg_len);
+      }
+      ldout(_tcp.cct, 20) << __func__ << " merged=" << merged << " do_output=" << do_output << dendl;
+    }
+  } else if (in_state(CLOSE_WAIT | CLOSING | LAST_ACK | TIME_WAIT)) {
+    // This should not occur, since a FIN has been received from the
+    // remote side. Ignore the segment text.
+    return;
+  }
+
+  // 4.8 eighth, check the FIN bit
+  if (th->f_fin) {
+    if (in_state(CLOSED | LISTEN | SYN_SENT)) {
+      // Do not process the FIN if the state is CLOSED, LISTEN or SYN-SENT
+      // since the SEG.SEQ cannot be validated; drop the segment and return.
+      return;
+    }
+    auto fin_seq = seg_seq + seg_len;
+    if (fin_seq == _rcv.next) {
+      _rcv.next = fin_seq + 1;
+
+      // If this <FIN> packet contains data as well, we can ACK both data
+      // and <FIN> in a single packet, so cancel the previous ACK.
+      clear_delayed_ack();
+      do_output = false;
+      // Send ACK for the FIN!
+      output();
+      signal_data_received();
+      _errno = 0;
+
+      if (in_state(SYN_RECEIVED | ESTABLISHED)) {
+        ldout(_tcp.cct, 20) << __func__ << " fin: SYN_RECEIVED or ESTABLISHED -> CLOSE_WAIT" << dendl;
+        _state = CLOSE_WAIT;
+        // EOF
+      }
+      if (in_state(FIN_WAIT_1)) {
+        // If our FIN has been ACKed (perhaps in this segment), then
+        // enter TIME-WAIT, start the time-wait timer, turn off the other
+        // timers; otherwise enter the CLOSING state.
+        // Note: If our FIN has been ACKed, we should be in FIN_WAIT_2
+        // not FIN_WAIT_1 if we reach here.
+        ldout(_tcp.cct, 20) << __func__ << " fin: FIN_WAIT_1 -> CLOSING" << dendl;
+        _state = CLOSING;
+      }
+      if (in_state(FIN_WAIT_2)) {
+        ldout(_tcp.cct, 20) << __func__ << " fin: FIN_WAIT_2 -> TIME_WAIT" << dendl;
+        return do_time_wait();
+      }
+    }
+  }
+  if (do_output || (do_output_data && can_send())) {
+    // Since we will do output, we can cancel the scheduled delayed ACK.
+    clear_delayed_ack();
+    output();
+  }
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::connect()
+{
+  ldout(_tcp.cct, 20) << __func__ << dendl;
+  // An initial send sequence number (ISS) is selected. A SYN segment of the
+  // form <SEQ=ISS><CTL=SYN> is sent. Set SND.UNA to ISS, SND.NXT to ISS+1,
+  // enter SYN-SENT state, and return.
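+  //
+  // Sketch of an RFC 6528 style generator, consistent with the isn_secret
+  // member declared in TCP.h (illustrative pseudocode in a comment, not the
+  // actual get_isn() implementation; `H` and `clock_component` are
+  // hypothetical):
+  //
+  //   hash = H(isn_secret.key, local_ip, local_port, foreign_ip, foreign_port);
+  //   iss  = make_seq(hash + clock_component);  // monotonically advancing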
+ do_setup_isn(); + + // Local receive window scale factor + _rcv.window_scale = _option._local_win_scale = 7; + // Maximum segment size local can receive + _rcv.mss = _option._local_mss = local_mss(); + // Linux's default window size + _rcv.window = 29200 << _rcv.window_scale; + + do_syn_sent(); +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::close_final_cleanup() +{ + if (_snd._all_data_acked_fd >= 0) { + center->delete_file_event(_snd._all_data_acked_fd, EVENT_READABLE); + _tcp.manager.close(_snd._all_data_acked_fd); + _snd._all_data_acked_fd = -1; + } + + _snd.closed = true; + signal_data_received(); + ldout(_tcp.cct, 20) << __func__ << " unsent_len=" << _snd.unsent_len << dendl; + if (in_state(CLOSE_WAIT)) { + ldout(_tcp.cct, 20) << __func__ << " CLOSE_WAIT -> LAST_ACK" << dendl; + _state = LAST_ACK; + } else if (in_state(ESTABLISHED)) { + ldout(_tcp.cct, 20) << __func__ << " ESTABLISHED -> FIN_WAIT_1" << dendl; + _state = FIN_WAIT_1; + } + // Send <FIN> to remote + // Note: we call output_one to make sure a packet with FIN actually + // sent out. If we only call output() and _packetq is not empty, + // tcp::tcb::get_packet(), packet with FIN will not be generated. + output_one(); + output(); + center->delete_file_event(fd, EVENT_READABLE|EVENT_WRITABLE); +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::retransmit() +{ + auto output_update_rto = [this] { + output(); + // According to RFC6298, Update RTO <- RTO * 2 to perform binary exponential back-off + this->_rto = std::min(this->_rto * 2, this->_rto_max); + start_retransmit_timer(); + }; + + // Retransmit SYN + if (syn_needs_on()) { + if (_snd.syn_retransmit++ < _max_nr_retransmit) { + output_update_rto(); + } else { + _errno = -ECONNABORTED; + ldout(_tcp.cct, 5) << __func__ << " syn retransmit exceed max " + << _max_nr_retransmit << dendl; + _errno = -ETIMEDOUT; + cleanup(); + return; + } + } + + // Retransmit FIN + if (fin_needs_on()) { + if (_snd.fin_retransmit++ < _max_nr_retransmit) { + output_update_rto(); + } else { + ldout(_tcp.cct, 5) << __func__ << " fin retransmit exceed max " + << _max_nr_retransmit << dendl; + _errno = -ETIMEDOUT; + cleanup(); + return; + } + } + + // Retransmit Data + if (_snd.data.empty()) { + return; + } + + // If there are unacked data, retransmit the earliest segment + auto& unacked_seg = _snd.data.front(); + + // According to RFC5681 + // Update ssthresh only for the first retransmit + uint32_t smss = _snd.mss; + if (unacked_seg.nr_transmits == 0) { + _snd.ssthresh = std::max(flight_size() / 2, 2 * smss); + } + // RFC6582 Step 4 + _snd.recover = _snd.next - 1; + // Start the slow start process + _snd.cwnd = smss; + // End fast recovery + exit_fast_recovery(); + + ldout(_tcp.cct, 20) << __func__ << " unack data size " << _snd.data.size() + << " nr=" << unacked_seg.nr_transmits << dendl; + if (unacked_seg.nr_transmits < _max_nr_retransmit) { + unacked_seg.nr_transmits++; + } else { + // Delete connection when max num of retransmission is reached + ldout(_tcp.cct, 5) << __func__ << " seg retransmit exceed max " + << _max_nr_retransmit << dendl; + _errno = -ETIMEDOUT; + cleanup(); + return; + } + retransmit_one(); + + output_update_rto(); +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::persist() { + ldout(_tcp.cct, 20) << __func__ << " persist timer fired" << dendl; + // Send 1 byte packet to probe peer's window size + _snd.window_probe = true; + output_one(); + _snd.window_probe = false; + + output(); + // Perform binary exponential back-off per 
RFC1122 + _persist_time_out = std::min(_persist_time_out * 2, _rto_max); + start_persist_timer(); +} diff --git a/src/msg/async/dpdk/TCP.h b/src/msg/async/dpdk/TCP.h new file mode 100644 index 00000000..b7bd7132 --- /dev/null +++ b/src/msg/async/dpdk/TCP.h @@ -0,0 +1,1503 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_DPDK_TCP_H_ +#define CEPH_DPDK_TCP_H_ + +#include <unordered_map> +#include <map> +#include <queue> +#include <functional> +#include <deque> +#include <chrono> +#include <stdexcept> +#include <system_error> + +#include "msg/async/dpdk/EventDPDK.h" + +#include "include/utime.h" +#include "common/Throttle.h" +#include "common/ceph_time.h" +#include "common/ceph_crypto.h" +#include "msg/async/Event.h" +#include "IPChecksum.h" +#include "IP.h" +#include "const.h" +#include "byteorder.h" +#include "shared_ptr.h" +#include "PacketUtil.h" + +#include "include/random.h" + +struct tcp_hdr; + +enum class tcp_state : uint16_t { + CLOSED = (1 << 0), + LISTEN = (1 << 1), + SYN_SENT = (1 << 2), + SYN_RECEIVED = (1 << 3), + ESTABLISHED = (1 << 4), + FIN_WAIT_1 = (1 << 5), + FIN_WAIT_2 = (1 << 6), + CLOSE_WAIT = (1 << 7), + CLOSING = (1 << 8), + LAST_ACK = (1 << 9), + TIME_WAIT = (1 << 10) +}; + +inline tcp_state operator|(tcp_state s1, tcp_state s2) { + return tcp_state(uint16_t(s1) | uint16_t(s2)); +} + +inline std::ostream & operator<<(std::ostream & str, const tcp_state& s) { + switch (s) { + case tcp_state::CLOSED: return str << "CLOSED"; + case tcp_state::LISTEN: return str << "LISTEN"; + case tcp_state::SYN_SENT: return str << "SYN_SENT"; + case tcp_state::SYN_RECEIVED: return str << "SYN_RECEIVED"; + case tcp_state::ESTABLISHED: return str << "ESTABLISHED"; + case tcp_state::FIN_WAIT_1: return str << "FIN_WAIT_1"; + case tcp_state::FIN_WAIT_2: return str << "FIN_WAIT_2"; + case tcp_state::CLOSE_WAIT: return str << "CLOSE_WAIT"; + case tcp_state::CLOSING: return str << "CLOSING"; + case tcp_state::LAST_ACK: return str << "LAST_ACK"; + case tcp_state::TIME_WAIT: return str << "TIME_WAIT"; + default: return str << "UNKNOWN"; + } +} + +struct tcp_option { + // The kind and len field are fixed and defined in TCP protocol + enum class option_kind: uint8_t { mss = 2, win_scale = 3, sack = 4, timestamps = 8, nop = 1, eol = 0 }; + enum class option_len: uint8_t { mss = 4, win_scale = 3, sack = 2, timestamps = 10, nop = 1, eol = 1 }; + struct mss { + option_kind kind = option_kind::mss; + option_len len = option_len::mss; + uint16_t mss; + struct mss hton() { + struct mss m = *this; + m.mss = ::hton(m.mss); + return m; + } + } __attribute__((packed)); + struct win_scale { + option_kind kind = option_kind::win_scale; + option_len len = option_len::win_scale; + 
uint8_t shift;
+  } __attribute__((packed));
+  struct sack {
+    option_kind kind = option_kind::sack;
+    option_len len = option_len::sack;
+  } __attribute__((packed));
+  struct timestamps {
+    option_kind kind = option_kind::timestamps;
+    option_len len = option_len::timestamps;
+    uint32_t t1;
+    uint32_t t2;
+  } __attribute__((packed));
+  struct nop {
+    option_kind kind = option_kind::nop;
+  } __attribute__((packed));
+  struct eol {
+    option_kind kind = option_kind::eol;
+  } __attribute__((packed));
+  static const uint8_t align = 4;
+
+  void parse(uint8_t* beg, uint8_t* end);
+  uint8_t fill(tcp_hdr* th, uint8_t option_size);
+  uint8_t get_size(bool syn_on, bool ack_on);
+
+  // For option negotiation
+  bool _mss_received = false;
+  bool _win_scale_received = false;
+  bool _timestamps_received = false;
+  bool _sack_received = false;
+
+  // Option data
+  uint16_t _remote_mss = 536;
+  uint16_t _local_mss;
+  uint8_t _remote_win_scale = 0;
+  uint8_t _local_win_scale = 0;
+};
+inline uint8_t*& operator+=(uint8_t*& x, tcp_option::option_len len) { x += uint8_t(len); return x; }
+inline uint8_t& operator+=(uint8_t& x, tcp_option::option_len len) { x += uint8_t(len); return x; }
+
+struct tcp_sequence {
+  uint32_t raw;
+};
+
+// Defined in a header, so these must be inline to avoid multiple-definition
+// errors when TCP.h is included from more than one translation unit.
+inline tcp_sequence ntoh(tcp_sequence ts) {
+  return tcp_sequence { ::ntoh(ts.raw) };
+}
+
+inline tcp_sequence hton(tcp_sequence ts) {
+  return tcp_sequence { ::hton(ts.raw) };
+}
+
+inline std::ostream& operator<<(std::ostream& os, const tcp_sequence& s) {
+  return os << s.raw;
+}
+
+inline tcp_sequence make_seq(uint32_t raw) { return tcp_sequence{raw}; }
+inline tcp_sequence& operator+=(tcp_sequence& s, int32_t n) { s.raw += n; return s; }
+inline tcp_sequence& operator-=(tcp_sequence& s, int32_t n) { s.raw -= n; return s; }
+inline tcp_sequence operator+(tcp_sequence s, int32_t n) { return s += n; }
+inline tcp_sequence operator-(tcp_sequence s, int32_t n) { return s -= n; }
+inline int32_t operator-(tcp_sequence s, tcp_sequence q) { return s.raw - q.raw; }
+inline bool operator==(tcp_sequence s, tcp_sequence q) { return s.raw == q.raw; }
+inline bool operator!=(tcp_sequence s, tcp_sequence q) { return !(s == q); }
+inline bool operator<(tcp_sequence s, tcp_sequence q) { return s - q < 0; }
+inline bool operator>(tcp_sequence s, tcp_sequence q) { return q < s; }
+inline bool operator<=(tcp_sequence s, tcp_sequence q) { return !(s > q); }
+inline bool operator>=(tcp_sequence s, tcp_sequence q) { return !(s < q); }
+
+struct tcp_hdr {
+  uint16_t src_port;
+  uint16_t dst_port;
+  tcp_sequence seq;
+  tcp_sequence ack;
+  uint8_t rsvd1 : 4;
+  uint8_t data_offset : 4;
+  uint8_t f_fin : 1;
+  uint8_t f_syn : 1;
+  uint8_t f_rst : 1;
+  uint8_t f_psh : 1;
+  uint8_t f_ack : 1;
+  uint8_t f_urg : 1;
+  uint8_t rsvd2 : 2;
+  uint16_t window;
+  uint16_t checksum;
+  uint16_t urgent;
+
+  tcp_hdr hton() {
+    tcp_hdr hdr = *this;
+    hdr.src_port = ::hton(src_port);
+    hdr.dst_port = ::hton(dst_port);
+    hdr.seq = ::hton(seq);
+    hdr.ack = ::hton(ack);
+    hdr.window = ::hton(window);
+    hdr.checksum = ::hton(checksum);
+    hdr.urgent = ::hton(urgent);
+    return hdr;
+  }
+
+  tcp_hdr ntoh() {
+    tcp_hdr hdr = *this;
+    hdr.src_port = ::ntoh(src_port);
+    hdr.dst_port = ::ntoh(dst_port);
+    hdr.seq = ::ntoh(seq);
+    hdr.ack = ::ntoh(ack);
+    hdr.window = ::ntoh(window);
+    hdr.checksum = ::ntoh(checksum);
+    hdr.urgent = ::ntoh(urgent);
+    return hdr;
+  }
+} __attribute__((packed));
+
+struct tcp_tag {};
+using tcp_packet_merger = packet_merger<tcp_sequence, tcp_tag>;
+
+template <typename InetTraits>
+class tcp {
+ public:
+  using
+template <typename InetTraits>
+class tcp {
+ public:
+  using ipaddr = typename InetTraits::address_type;
+  using inet_type = typename InetTraits::inet_type;
+  using connid = l4connid<InetTraits>;
+  using connid_hash = typename connid::connid_hash;
+  class connection;
+  class listener;
+ private:
+  class tcb;
+
+  class C_handle_delayed_ack : public EventCallback {
+    tcb *tc;
+
+   public:
+    C_handle_delayed_ack(tcb *t): tc(t) { }
+    void do_request(uint64_t r) {
+      tc->_nr_full_seg_received = 0;
+      tc->output();
+    }
+  };
+
+  class C_handle_retransmit : public EventCallback {
+    tcb *tc;
+
+   public:
+    C_handle_retransmit(tcb *t): tc(t) { }
+    void do_request(uint64_t r) {
+      tc->retransmit();
+    }
+  };
+
+  class C_handle_persist : public EventCallback {
+    tcb *tc;
+
+   public:
+    C_handle_persist(tcb *t): tc(t) { }
+    void do_request(uint64_t r) {
+      tc->persist();
+    }
+  };
+
+  class C_all_data_acked : public EventCallback {
+    tcb *tc;
+
+   public:
+    C_all_data_acked(tcb *t): tc(t) {}
+    void do_request(uint64_t fd_or_id) {
+      tc->close_final_cleanup();
+    }
+  };
+
+  class C_actual_remove_tcb : public EventCallback {
+    lw_shared_ptr<tcb> tc;
+   public:
+    C_actual_remove_tcb(tcb *t): tc(t->shared_from_this()) {}
+    void do_request(uint64_t r) {
+      delete this;
+    }
+  };
+
+  class tcb : public enable_lw_shared_from_this<tcb> {
+    using clock_type = ceph::coarse_real_clock;
+    static constexpr tcp_state CLOSED        = tcp_state::CLOSED;
+    static constexpr tcp_state LISTEN        = tcp_state::LISTEN;
+    static constexpr tcp_state SYN_SENT      = tcp_state::SYN_SENT;
+    static constexpr tcp_state SYN_RECEIVED  = tcp_state::SYN_RECEIVED;
+    static constexpr tcp_state ESTABLISHED   = tcp_state::ESTABLISHED;
+    static constexpr tcp_state FIN_WAIT_1    = tcp_state::FIN_WAIT_1;
+    static constexpr tcp_state FIN_WAIT_2    = tcp_state::FIN_WAIT_2;
+    static constexpr tcp_state CLOSE_WAIT    = tcp_state::CLOSE_WAIT;
+    static constexpr tcp_state CLOSING       = tcp_state::CLOSING;
+    static constexpr tcp_state LAST_ACK      = tcp_state::LAST_ACK;
+    static constexpr tcp_state TIME_WAIT     = tcp_state::TIME_WAIT;
+    tcp_state _state = CLOSED;
+    tcp& _tcp;
+    UserspaceEventManager &manager;
+    connection* _conn = nullptr;
+    bool _connect_done = false;
+    ipaddr _local_ip;
+    ipaddr _foreign_ip;
+    uint16_t _local_port;
+    uint16_t _foreign_port;
+    struct unacked_segment {
+      Packet p;
+      uint16_t data_len;
+      unsigned nr_transmits;
+      clock_type::time_point tx_time;
+    };
+    struct send {
+      tcp_sequence unacknowledged;
+      tcp_sequence next;
+      uint32_t window;
+      uint8_t window_scale;
+      uint16_t mss;
+      tcp_sequence urgent;
+      tcp_sequence wl1;
+      tcp_sequence wl2;
+      tcp_sequence initial;
+      std::deque<unacked_segment> data;
+      std::deque<Packet> unsent;
+      uint32_t unsent_len = 0;
+      uint32_t queued_len = 0;
+      bool closed = false;
+      // Wait for all data to be acked
+      int _all_data_acked_fd = -1;
+      // Limit the amount of data queued into the send queue
+      Throttle user_queue_space;
+      // Round-trip time variation
+      std::chrono::microseconds rttvar;
+      // Smoothed round-trip time
+      std::chrono::microseconds srtt;
+      bool first_rto_sample = true;
+      clock_type::time_point syn_tx_time;
+      // Congestion window
+      uint32_t cwnd;
+      // Slow start threshold
+      uint32_t ssthresh;
+      // Duplicated ACKs
+      uint16_t dupacks = 0;
+      unsigned syn_retransmit = 0;
+      unsigned fin_retransmit = 0;
+      uint32_t limited_transfer = 0;
+      uint32_t partial_ack = 0;
+      tcp_sequence recover;
+      bool window_probe = false;
+      send(CephContext *c): user_queue_space(c, "DPDK::tcp::tcb::user_queue_space", 81920) {}
+    } _snd;
+    struct receive {
+      tcp_sequence next;
+      uint32_t window;
+      uint8_t window_scale;
+      uint16_t mss;
+      tcp_sequence urgent;
+      tcp_sequence initial;
+      std::deque<Packet> data;
+      tcp_packet_merger out_of_order;
+    } _rcv;
+    EventCenter *center;
+    int fd;
+    // positive means no errno, 0 means eof, negative means error
+    int16_t _errno = 1;
+    tcp_option _option;
+    EventCallbackRef delayed_ack_event;
+    Tub<uint64_t> _delayed_ack_fd;
+    // Retransmission timeout
+    std::chrono::microseconds _rto{1000*1000};
+    std::chrono::microseconds _persist_time_out{1000*1000};
+    static constexpr std::chrono::microseconds _rto_min{1000*1000};
+    static constexpr std::chrono::microseconds _rto_max{60000*1000};
+    // Clock granularity
+    static constexpr std::chrono::microseconds _rto_clk_granularity{1000};
+    static constexpr uint16_t _max_nr_retransmit{5};
+    EventCallbackRef retransmit_event;
+    Tub<uint64_t> retransmit_fd;
+    EventCallbackRef persist_event;
+    EventCallbackRef all_data_ack_event;
+    Tub<uint64_t> persist_fd;
+    uint16_t _nr_full_seg_received = 0;
+    struct isn_secret {
+      // 512-bit secret key for ISN generation
+      uint32_t key[16];
+      isn_secret () {
+        for (auto& k : key) {
+          k = ceph::util::generate_random_number<uint32_t>(0, std::numeric_limits<uint32_t>::max());
+        }
+      }
+    };
+    static isn_secret _isn_secret;
+    tcp_sequence get_isn();
+    circular_buffer<typename InetTraits::l4packet> _packetq;
+    bool _poll_active = false;
+   public:
+    // callback
+    void close_final_cleanup();
+    ostream& _prefix(std::ostream *_dout);
+
+   public:
+    tcb(tcp& t, connid id);
+    ~tcb();
+    void input_handle_listen_state(tcp_hdr* th, Packet p);
+    void input_handle_syn_sent_state(tcp_hdr* th, Packet p);
+    void input_handle_other_state(tcp_hdr* th, Packet p);
+    void output_one(bool data_retransmit = false);
+    bool is_all_data_acked();
+    int send(Packet p);
+    void connect();
+    Tub<Packet> read();
+    void close();
+    void remove_from_tcbs() {
+      auto id = connid{_local_ip, _foreign_ip, _local_port, _foreign_port};
+      _tcp._tcbs.erase(id);
+    }
+    Tub<typename InetTraits::l4packet> get_packet();
+    void output() {
+      if (!_poll_active) {
+        _poll_active = true;
+
+        auto tcb = this->shared_from_this();
+        _tcp._inet.wait_l2_dst_address(_foreign_ip, Packet(), [tcb] (const ethernet_address &dst, Packet p, int r) {
+          if (r == 0) {
+            tcb->_tcp.poll_tcb(dst, std::move(tcb));
+          } else if (r == -ETIMEDOUT) {
+            // in other states connection should time out
+            if (tcb->in_state(SYN_SENT)) {
+              tcb->_errno = -ETIMEDOUT;
+              tcb->cleanup();
+            }
+          } else if (r == -EBUSY) {
+            // retry later
+            tcb->_poll_active = false;
+            tcb->start_retransmit_timer();
+          }
+        });
+      }
+    }
+
+    int16_t get_errno() const {
+      return _errno;
+    }
+
+    tcp_state& state() {
+      return _state;
+    }
+
+    uint64_t peek_sent_available() {
+      if (!in_state(ESTABLISHED))
+        return 0;
+      uint64_t left = _snd.user_queue_space.get_max() - _snd.user_queue_space.get_current();
+      return left;
+    }
+
+    int is_connected() const {
+      if (_errno <= 0)
+        return _errno;
+      return _connect_done;
+    }
+
+   private:
+    void respond_with_reset(tcp_hdr* th);
+    bool merge_out_of_order();
+    void insert_out_of_order(tcp_sequence seq, Packet p);
+    void trim_receive_data_after_window();
+    bool should_send_ack(uint16_t seg_len);
+    void clear_delayed_ack();
+    Packet get_transmit_packet();
+    void retransmit_one() {
+      bool data_retransmit = true;
+      output_one(data_retransmit);
+    }
+    void start_retransmit_timer() {
+      if (retransmit_fd)
+        center->delete_time_event(*retransmit_fd);
+      retransmit_fd.construct(center->create_time_event(_rto.count(), retransmit_event));
+    };
+    void stop_retransmit_timer() {
+      if (retransmit_fd) {
+        center->delete_time_event(*retransmit_fd);
+        retransmit_fd.destroy();
+      }
+    };
+    void start_persist_timer() {
+      if (persist_fd)
+        center->delete_time_event(*persist_fd);
+      persist_fd.construct(center->create_time_event(_persist_time_out.count(), persist_event));
+    };
+    void stop_persist_timer() {
+      if (persist_fd) {
+        center->delete_time_event(*persist_fd);
+        persist_fd.destroy();
+      }
+    };
+    void persist();
+    void retransmit();
+    void fast_retransmit();
+    void update_rto(clock_type::time_point tx_time);
+    void update_cwnd(uint32_t acked_bytes);
+    void cleanup();
+    uint32_t can_send() {
+      if (_snd.window_probe) {
+        return 1;
+      }
+      // Cannot send more than the advertised window allows
+      auto x = std::min(uint32_t(_snd.unacknowledged + _snd.window - _snd.next), _snd.unsent_len);
+      // Cannot send more than the congestion window allows
+      x = std::min(_snd.cwnd, x);
+      if (_snd.dupacks == 1 || _snd.dupacks == 2) {
+        // RFC5681 Step 3.1
+        // Send cwnd + 2 * smss per RFC3042
+        auto flight = flight_size();
+        auto max = _snd.cwnd + 2 * _snd.mss;
+        x = flight <= max ? std::min(x, max - flight) : 0;
+        _snd.limited_transfer += x;
+      } else if (_snd.dupacks >= 3) {
+        // RFC5681 Step 3.5
+        // Send at most 1 full-sized segment
+        x = std::min(uint32_t(_snd.mss), x);
+      }
+      return x;
+    }
+    uint32_t flight_size() {
+      uint32_t size = 0;
+      std::for_each(_snd.data.begin(), _snd.data.end(),
+                    [&] (unacked_segment& seg) { size += seg.p.len(); });
+      return size;
+    }
+    uint16_t local_mss() {
+      return _tcp.get_hw_features().mtu - tcp_hdr_len_min - InetTraits::ip_hdr_len_min;
+    }
+    void queue_packet(Packet p) {
+      _packetq.emplace_back(
+          typename InetTraits::l4packet{_foreign_ip, std::move(p)});
+    }
+    void signal_data_received() {
+      manager.notify(fd, EVENT_READABLE);
+    }
+    void signal_all_data_acked() {
+      if (_snd._all_data_acked_fd >= 0 && _snd.unsent_len == 0 && _snd.queued_len == 0)
+        manager.notify(_snd._all_data_acked_fd, EVENT_READABLE);
+    }
+    void do_syn_sent() {
+      _state = SYN_SENT;
+      _snd.syn_tx_time = clock_type::now();
+      // Send <SYN> to remote
+      output();
+    }
+    void do_syn_received() {
+      _state = SYN_RECEIVED;
+      _snd.syn_tx_time = clock_type::now();
+      // Send <SYN,ACK> to remote
+      output();
+    }
+    void do_established() {
+      _state = ESTABLISHED;
+      update_rto(_snd.syn_tx_time);
+      _connect_done = true;
+      manager.notify(fd, EVENT_READABLE|EVENT_WRITABLE);
+    }
+    void do_reset() {
+      _state = CLOSED;
+      // Free packets to be sent which are waiting for user_queue_space
+      _snd.user_queue_space.reset();
+      cleanup();
+      _errno = -ECONNRESET;
+      manager.notify(fd, EVENT_READABLE);
+
+      if (_snd._all_data_acked_fd >= 0)
+        manager.notify(_snd._all_data_acked_fd, EVENT_READABLE);
+    }
+    void do_time_wait() {
+      // FIXME: Implement TIME_WAIT state timer
+      _state = TIME_WAIT;
+      cleanup();
+    }
+    void do_closed() {
+      _state = CLOSED;
+      cleanup();
+    }
+    void do_setup_isn() {
+      _snd.initial = get_isn();
+      _snd.unacknowledged = _snd.initial;
+      _snd.next = _snd.initial + 1;
+      _snd.recover = _snd.initial;
+    }
+    void do_local_fin_acked() {
+      _snd.unacknowledged += 1;
+      _snd.next += 1;
+    }
+    bool syn_needs_on() {
+      return in_state(SYN_SENT | SYN_RECEIVED);
+    }
+    bool fin_needs_on() {
+      return in_state(FIN_WAIT_1 | CLOSING | LAST_ACK) && _snd.closed &&
+             _snd.unsent_len == 0 && _snd.queued_len == 0;
+    }
+    bool ack_needs_on() {
+      return !in_state(CLOSED | LISTEN | SYN_SENT);
+    }
+    bool foreign_will_not_send() {
+      return in_state(CLOSING | TIME_WAIT | CLOSE_WAIT | LAST_ACK | CLOSED);
+    }
+    bool in_state(tcp_state state) {
+      return uint16_t(_state) & uint16_t(state);
+    }
+    void exit_fast_recovery() {
+      _snd.dupacks = 0;
+      _snd.limited_transfer = 0;
+      _snd.partial_ack = 0;
+    }
+    uint32_t data_segment_acked(tcp_sequence seg_ack);
+    bool segment_acceptable(tcp_sequence seg_seq, unsigned seg_len);
+    void init_from_options(tcp_hdr* th, uint8_t* opt_start, uint8_t* opt_end);
+    friend class connection;
+
+    friend class C_handle_delayed_ack;
+    friend class C_handle_retransmit;
+    friend class C_handle_persist;
+    friend class C_all_data_acked;
+  };
+
+  CephContext *cct;
+  // ipv4_l4<ip_protocol_num::tcp>
+  inet_type& _inet;
+  EventCenter *center;
+  UserspaceEventManager &manager;
+  std::unordered_map<connid, lw_shared_ptr<tcb>, connid_hash> _tcbs;
+  std::unordered_map<uint16_t, listener*> _listening;
+  std::random_device _rd;
+  std::default_random_engine _e;
+  std::uniform_int_distribution<uint16_t> _port_dist{41952, 65535};
+  circular_buffer<std::pair<lw_shared_ptr<tcb>, ethernet_address>> _poll_tcbs;
+  // queue for packets that do not belong to any tcb
+  circular_buffer<ipv4_traits::l4packet> _packetq;
+  // Limit the amount of data queued into the send queue
+  Throttle _queue_space;
+ public:
+  class connection {
+    lw_shared_ptr<tcb> _tcb;
+   public:
+    explicit connection(lw_shared_ptr<tcb> tcbp) : _tcb(std::move(tcbp)) { _tcb->_conn = this; }
+    connection(const connection&) = delete;
+    connection(connection&& x) noexcept : _tcb(std::move(x._tcb)) {
+      _tcb->_conn = this;
+    }
+    ~connection();
+    void operator=(const connection&) = delete;
+    connection& operator=(connection&& x) {
+      if (this != &x) {
+        this->~connection();
+        new (this) connection(std::move(x));
+      }
+      return *this;
+    }
+    int fd() const {
+      return _tcb->fd;
+    }
+    int send(Packet p) {
+      return _tcb->send(std::move(p));
+    }
+    Tub<Packet> read() {
+      return _tcb->read();
+    }
+    int16_t get_errno() const {
+      return _tcb->get_errno();
+    }
+    void close_read();
+    void close_write();
+    entity_addr_t remote_addr() const {
+      entity_addr_t addr;
+      auto net_ip = _tcb->_foreign_ip.hton();
+      memcpy((void*)&addr.in4_addr().sin_addr.s_addr,
+             &net_ip, sizeof(addr.in4_addr().sin_addr.s_addr));
+      addr.set_family(AF_INET);
+      return addr;
+    }
+    uint64_t peek_sent_available() {
+      return _tcb->peek_sent_available();
+    }
+    int is_connected() const { return _tcb->is_connected(); }
+  };
+  class listener {
+    tcp& _tcp;
+    uint16_t _port;
+    int _fd = -1;
+    int16_t _errno;
+    queue<connection> _q;
+    size_t _q_max_length;
+
+   private:
+    listener(tcp& t, uint16_t port, size_t queue_length)
+        : _tcp(t), _port(port), _errno(0), _q(), _q_max_length(queue_length) {
+    }
+   public:
+    listener(const listener&) = delete;
+    void operator=(const listener&) = delete;
+    listener(listener&& x)
+        : _tcp(x._tcp), _port(x._port), _fd(x._fd), _errno(x._errno),
+          _q(std::move(x._q)), _q_max_length(x._q_max_length) {
+      // take the fd and the port registration away from the moved-from
+      // listener, so its destructor does not close or erase them again
+      x._fd = -1;
+      if (_fd >= 0)
+        _tcp._listening[_port] = this;
+    }
+    ~listener() {
+      abort_accept();
+    }
+    int listen() {
+      if (_tcp._listening.find(_port) != _tcp._listening.end())
+        return -EADDRINUSE;
+      _tcp._listening.emplace(_port, this);
+      _fd = _tcp.manager.get_eventfd();
+      return 0;
+    }
+    Tub<connection> accept() {
+      Tub<connection> c;
+      if (!_q.empty()) {
+        c = std::move(_q.front());
+        _q.pop();
+      }
+      return c;
+    }
+    void abort_accept() {
+      while (!_q.empty())
+        _q.pop();
+      if (_fd >= 0) {
+        _tcp._listening.erase(_port);
+        _tcp.manager.close(_fd);
+        _fd = -1;
+      }
+    }
+    int16_t get_errno() const {
+      return _errno;
+    }
+    bool full() const {
+      return _q.size() == _q_max_length;
+    }
+    int fd() const {
+      return _fd;
+    }
+    friend class tcp;
+  };
+ public:
explicit tcp(CephContext *c, inet_type& inet, EventCenter *cen); + void received(Packet p, ipaddr from, ipaddr to); + bool forward(forward_hash& out_hash_data, Packet& p, size_t off); + listener listen(uint16_t port, size_t queue_length = 100); + connection connect(const entity_addr_t &addr); + const hw_features& get_hw_features() const { return _inet._inet.get_hw_features(); } + void poll_tcb(const ethernet_address &dst, lw_shared_ptr<tcb> tcb) { + _poll_tcbs.emplace_back(std::move(tcb), dst); + } + bool push_listen_queue(uint16_t port, tcb *t) { + auto listener = _listening.find(port); + if (listener == _listening.end() || listener->second->full()) { + return false; + } + listener->second->_q.push(connection(t->shared_from_this())); + manager.notify(listener->second->_fd, EVENT_READABLE); + return true; + } + + private: + void send_packet_without_tcb(ipaddr from, ipaddr to, Packet p); + void respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip); + friend class listener; +}; + +template <typename InetTraits> +tcp<InetTraits>::tcp(CephContext *c, inet_type& inet, EventCenter *cen) + : cct(c), _inet(inet), center(cen), + manager(static_cast<DPDKDriver*>(cen->get_driver())->manager), + _e(_rd()), _queue_space(cct, "DPDK::tcp::queue_space", 81920) { + int tcb_polled = 0u; + _inet.register_packet_provider([this, tcb_polled] () mutable { + Tub<typename InetTraits::l4packet> l4p; + auto c = _poll_tcbs.size(); + if (!_packetq.empty() && (!(tcb_polled % 128) || c == 0)) { + l4p = std::move(_packetq.front()); + _packetq.pop_front(); + _queue_space.put(l4p->p.len()); + } else { + while (c--) { + tcb_polled++; + lw_shared_ptr<tcb> tcb; + ethernet_address dst; + std::tie(tcb, dst) = std::move(_poll_tcbs.front()); + _poll_tcbs.pop_front(); + l4p = std::move(tcb->get_packet()); + if (l4p) { + l4p->e_dst = dst; + break; + } + } + } + return l4p; + }); +} + +template <typename InetTraits> +auto tcp<InetTraits>::listen(uint16_t port, size_t queue_length) -> listener { + return listener(*this, port, queue_length); +} + +template <typename InetTraits> +typename tcp<InetTraits>::connection tcp<InetTraits>::connect(const entity_addr_t &addr) { + uint16_t src_port; + connid id; + auto src_ip = _inet._inet.host_address(); + auto dst_ip = ipv4_address(addr); + auto dst_port = addr.get_port(); + + do { + src_port = _port_dist(_e); + id = connid{src_ip, dst_ip, src_port, (uint16_t)dst_port}; + if (_tcbs.find(id) == _tcbs.end()) { + if (_inet._inet.netif()->hw_queues_count() == 1 || + _inet._inet.netif()->hash2cpu( + id.hash(_inet._inet.netif()->rss_key())) == center->get_id()) + break; + } + } while (true); + + auto tcbp = make_lw_shared<tcb>(*this, id); + _tcbs.insert({id, tcbp}); + tcbp->connect(); + return connection(tcbp); +} + +template <typename InetTraits> +bool tcp<InetTraits>::forward(forward_hash& out_hash_data, Packet& p, size_t off) { + auto th = p.get_header<tcp_hdr>(off); + if (th) { + out_hash_data.push_back(th->src_port); + out_hash_data.push_back(th->dst_port); + } + return true; +} + +template <typename InetTraits> +void tcp<InetTraits>::received(Packet p, ipaddr from, ipaddr to) { + auto th = p.get_header<tcp_hdr>(0); + if (!th) { + return; + } + // th->data_offset is correct even before ntoh() + if (unsigned(th->data_offset * 4) < sizeof(*th)) { + return; + } + + if (!get_hw_features().rx_csum_offload) { + checksummer csum; + InetTraits::tcp_pseudo_header_checksum(csum, from, to, p.len()); + csum.sum(p); + if (csum.get() != 0) { + return; + } + } + auto h = th->ntoh(); + auto id = 
connid{to, from, h.dst_port, h.src_port};
+  auto tcbi = _tcbs.find(id);
+  lw_shared_ptr<tcb> tcbp;
+  if (tcbi == _tcbs.end()) {
+    auto listener = _listening.find(id.local_port);
+    if (listener == _listening.end() || listener->second->full()) {
+      // 1) In CLOSED state
+      // 1.1 all data in the incoming segment is discarded. An incoming
+      // segment containing a RST is discarded. An incoming segment not
+      // containing a RST causes a RST to be sent in response.
+      // FIXME:
+      //      if ACK off: <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK>
+      //      if ACK on:  <SEQ=SEG.ACK><CTL=RST>
+      return respond_with_reset(&h, id.local_ip, id.foreign_ip);
+    } else {
+      // 2) In LISTEN state
+      // 2.1 first check for an RST
+      if (h.f_rst) {
+        // An incoming RST should be ignored
+        return;
+      }
+      // 2.2 second check for an ACK
+      if (h.f_ack) {
+        // Any acknowledgment is bad if it arrives on a connection
+        // still in the LISTEN state.
+        // <SEQ=SEG.ACK><CTL=RST>
+        return respond_with_reset(&h, id.local_ip, id.foreign_ip);
+      }
+      // 2.3 third check for a SYN
+      if (h.f_syn) {
+        // check the security
+        // NOTE: Ignored for now
+        tcbp = make_lw_shared<tcb>(*this, id);
+        _tcbs.insert({id, tcbp});
+        return tcbp->input_handle_listen_state(&h, std::move(p));
+      }
+      // 2.4 fourth other text or control
+      // So you are unlikely to get here, but if you do, drop the
+      // segment, and return.
+      return;
+    }
+  } else {
+    tcbp = tcbi->second;
+    if (tcbp->state() == tcp_state::SYN_SENT) {
+      // 3) In SYN_SENT State
+      return tcbp->input_handle_syn_sent_state(&h, std::move(p));
+    } else {
+      // 4) In other state, can be one of the following:
+      // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2
+      // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT
+      return tcbp->input_handle_other_state(&h, std::move(p));
+    }
+  }
+}
+
+// Send a packet that does not belong to any tcb
+template <typename InetTraits>
+void tcp<InetTraits>::send_packet_without_tcb(ipaddr from, ipaddr to, Packet p) {
+  if (_queue_space.get_or_fail(p.len())) { // drop packets that do not fit the queue
+    _inet.wait_l2_dst_address(to, std::move(p), [this, to] (const ethernet_address &e_dst, Packet p, int r) mutable {
+      if (r == 0)
+        _packetq.emplace_back(ipv4_traits::l4packet{to, std::move(p), e_dst, ip_protocol_num::tcp});
+    });
+  }
+}
+
+template <typename InetTraits>
+tcp<InetTraits>::connection::~connection() {
+  if (_tcb) {
+    _tcb->_conn = nullptr;
+    close_read();
+    close_write();
+  }
+}
+
+template <typename InetTraits>
+tcp<InetTraits>::tcb::tcb(tcp& t, connid id)
+    : _tcp(t), manager(t.manager), _local_ip(id.local_ip), _foreign_ip(id.foreign_ip),
+      _local_port(id.local_port), _foreign_port(id.foreign_port),
+      _snd(_tcp.cct),
+      center(t.center),
+      fd(t.manager.get_eventfd()),
+      delayed_ack_event(new tcp<InetTraits>::C_handle_delayed_ack(this)),
+      retransmit_event(new tcp<InetTraits>::C_handle_retransmit(this)),
+      persist_event(new tcp<InetTraits>::C_handle_persist(this)),
+      all_data_ack_event(new tcp<InetTraits>::C_all_data_acked(this)) {}
+
+template <typename InetTraits>
+tcp<InetTraits>::tcb::~tcb()
+{
+  if (_delayed_ack_fd)
+    center->delete_time_event(*_delayed_ack_fd);
+  if (retransmit_fd)
+    center->delete_time_event(*retransmit_fd);
+  if (persist_fd)
+    center->delete_time_event(*persist_fd);
+  delete delayed_ack_event;
+  delete retransmit_event;
+  delete persist_event;
+  delete all_data_ack_event;
+  manager.close(fd);
+  fd = -1;
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::respond_with_reset(tcp_hdr* rth)
+{
+  _tcp.respond_with_reset(rth, _local_ip, _foreign_ip);
+}
+
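+// A minimal usage sketch of the listener/connection API above; editor's
+// illustration only (it assumes the stack's EventCenter is already being
+// polled, and elides error handling). Kept generic over InetTraits, like
+// the surrounding code; the function name is hypothetical.
+template <typename InetTraits>
+void tcp_echo_once_example(tcp<InetTraits>& stack) {
+  auto srv = stack.listen(6800);      // queue_length defaults to 100
+  if (srv.listen() != 0)              // registers the port; -EADDRINUSE if taken
+    return;
+  if (auto conn = srv.accept()) {     // Tub<connection>, empty until a SYN completes
+    if (auto pkt = conn->read())      // Tub<Packet>, empty when nothing is buffered
+      conn->send(std::move(*pkt));    // echo the payload back
+  }
+}
+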
+template <typename InetTraits>
+uint32_t tcp<InetTraits>::tcb::data_segment_acked(tcp_sequence seg_ack) {
+  uint32_t total_acked_bytes = 0;
+  // Full ACK of segment
+  while (!_snd.data.empty()
+         && (_snd.unacknowledged + _snd.data.front().p.len() <= seg_ack)) {
+    auto acked_bytes = _snd.data.front().p.len();
+    _snd.unacknowledged += acked_bytes;
+    // Ignore retransmitted segments when setting the RTO
+    if (_snd.data.front().nr_transmits == 0) {
+      update_rto(_snd.data.front().tx_time);
+    }
+    update_cwnd(acked_bytes);
+    total_acked_bytes += acked_bytes;
+    _snd.user_queue_space.put(_snd.data.front().data_len);
+    manager.notify(fd, EVENT_WRITABLE);
+    _snd.data.pop_front();
+  }
+  // Partial ACK of segment
+  if (_snd.unacknowledged < seg_ack) {
+    auto acked_bytes = seg_ack - _snd.unacknowledged;
+    if (!_snd.data.empty()) {
+      auto& unacked_seg = _snd.data.front();
+      unacked_seg.p.trim_front(acked_bytes);
+    }
+    _snd.unacknowledged = seg_ack;
+    update_cwnd(acked_bytes);
+    total_acked_bytes += acked_bytes;
+  }
+  return total_acked_bytes;
+}
+
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::segment_acceptable(tcp_sequence seg_seq, unsigned seg_len) {
+  if (seg_len == 0 && _rcv.window == 0) {
+    // SEG.SEQ = RCV.NXT
+    return seg_seq == _rcv.next;
+  } else if (seg_len == 0 && _rcv.window > 0) {
+    // RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
+    return (_rcv.next <= seg_seq) && (seg_seq < _rcv.next + _rcv.window);
+  } else if (seg_len > 0 && _rcv.window > 0) {
+    // RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
+    //    or
+    // RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND
+    bool x = (_rcv.next <= seg_seq) && seg_seq < (_rcv.next + _rcv.window);
+    bool y = (_rcv.next <= seg_seq + seg_len - 1) && (seg_seq + seg_len - 1 < _rcv.next + _rcv.window);
+    return x || y;
+  } else {
+    // SEG.LEN > 0 RCV.WND = 0, not acceptable
+    return false;
+  }
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::init_from_options(tcp_hdr* th, uint8_t* opt_start, uint8_t* opt_end) {
+  // Handle tcp options
+  _option.parse(opt_start, opt_end);
+
+  // Remote receive window scale factor
+  _snd.window_scale = _option._remote_win_scale;
+  // Local receive window scale factor
+  _rcv.window_scale = _option._local_win_scale;
+
+  // Maximum segment size remote can receive
+  _snd.mss = _option._remote_mss;
+  // Maximum segment size local can receive
+  _rcv.mss = _option._local_mss = local_mss();
+
+  // Linux's default window size
+  _rcv.window = 29200 << _rcv.window_scale;
+  _snd.window = th->window << _snd.window_scale;
+
+  // Segment sequence number used for last window update
+  _snd.wl1 = th->seq;
+  // Segment acknowledgment number used for last window update
+  _snd.wl2 = th->ack;
+
+  // Setup initial congestion window
+  if (2190 < _snd.mss) {
+    _snd.cwnd = 2 * _snd.mss;
+  } else if (1095 < _snd.mss && _snd.mss <= 2190) {
+    _snd.cwnd = 3 * _snd.mss;
+  } else {
+    _snd.cwnd = 4 * _snd.mss;
+  }
+
+  // Setup initial slow start threshold
+  _snd.ssthresh = th->window << _snd.window_scale;
+}
+
+template <typename InetTraits>
+Packet tcp<InetTraits>::tcb::get_transmit_packet() {
+  // easy case: empty queue
+  if (_snd.unsent.empty()) {
+    return Packet();
+  }
+  auto can_send = this->can_send();
+  // Max number of TCP payload bytes we can pass to the NIC
+  uint32_t len;
+  if (_tcp.get_hw_features().tx_tso) {
+    // FIXME: Inform the tap device of the size of the split packet
+    len = _tcp.get_hw_features().max_packet_len - tcp_hdr_len_min - InetTraits::ip_hdr_len_min;
+  } else {
+    len = std::min(uint16_t(_tcp.get_hw_features().mtu - tcp_hdr_len_min - InetTraits::ip_hdr_len_min), _snd.mss);
+  }
+  can_send = std::min(can_send, len);
+  // easy case: one small packet
+  if (_snd.unsent.front().len() <= can_send) {
+    auto p = std::move(_snd.unsent.front());
+    _snd.unsent.pop_front();
+    _snd.unsent_len -= p.len();
+    return p;
+  }
+  // moderate case: need to split one packet
+  if (_snd.unsent.front().len() > can_send) {
+    auto p = _snd.unsent.front().share(0, can_send);
+    _snd.unsent.front().trim_front(can_send);
+    _snd.unsent_len -= p.len();
+    return p;
+  }
+  // hard case: merge some packets, possibly split last
+  auto p = std::move(_snd.unsent.front());
+  _snd.unsent.pop_front();
+  can_send -= p.len();
+  while (!_snd.unsent.empty()
+         && _snd.unsent.front().len() <= can_send) {
+    can_send -= _snd.unsent.front().len();
+    p.append(std::move(_snd.unsent.front()));
+    _snd.unsent.pop_front();
+  }
+  // FIXME: this would invoke the packet's "deleter", freeing managed
+  // objects that will still be used later
+  // if (!_snd.unsent.empty() && can_send) {
+  //   auto& q = _snd.unsent.front();
+  //   p.append(q.share(0, can_send));
+  //   q.trim_front(can_send);
+  // }
+  _snd.unsent_len -= p.len();
+  return p;
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::output_one(bool data_retransmit) {
+  if (in_state(CLOSED)) {
+    return;
+  }
+
+  Packet p = data_retransmit ? _snd.data.front().p.share() : get_transmit_packet();
+  Packet clone = p.share();  // early clone to prevent share() from calling packet::unuse_internal_data() on header.
+  uint16_t len = p.len();
+  bool syn_on = syn_needs_on();
+  bool ack_on = ack_needs_on();
+
+  auto options_size = _option.get_size(syn_on, ack_on);
+  auto th = p.prepend_header<tcp_hdr>(options_size);
+
+  th->src_port = _local_port;
+  th->dst_port = _foreign_port;
+
+  th->f_syn = syn_on;
+  th->f_ack = ack_on;
+  if (ack_on) {
+    clear_delayed_ack();
+  }
+  th->f_urg = false;
+  th->f_psh = false;
+
+  tcp_sequence seq;
+  if (data_retransmit) {
+    seq = _snd.unacknowledged;
+  } else {
+    seq = syn_on ? _snd.initial : _snd.next;
+    _snd.next += len;
+  }
+  th->seq = seq;
+  th->ack = _rcv.next;
+  th->data_offset = (sizeof(*th) + options_size) / 4;
+  th->window = _rcv.window >> _rcv.window_scale;
+  th->checksum = 0;
+
+  // FIXME: does the FIN have to fit in the window?
+  bool fin_on = fin_needs_on();
+  th->f_fin = fin_on;
+
+  // Add tcp options
+  _option.fill(th, options_size);
+  *th = th->hton();
+
+  offload_info oi;
+  checksummer csum;
+  uint16_t pseudo_hdr_seg_len = 0;
+
+  oi.tcp_hdr_len = sizeof(tcp_hdr) + options_size;
+
+  if (_tcp.get_hw_features().tx_csum_l4_offload) {
+    oi.needs_csum = true;
+
+    //
+    // tx checksum offloading: both virtio-net's VIRTIO_NET_F_CSUM and dpdk's
+    // PKT_TX_TCP_CKSUM require th->checksum to be initialized to the ones'
+    // complement sum of the pseudo header.
+    //
+    // For TSO the csum should be calculated for a pseudo header with
+    // segment length set to 0. All the rest is the same as for a TCP Tx
+    // CSUM offload case.
+ // + if (_tcp.get_hw_features().tx_tso && len > _snd.mss) { + oi.tso_seg_size = _snd.mss; + } else { + pseudo_hdr_seg_len = sizeof(*th) + options_size + len; + } + } else { + pseudo_hdr_seg_len = sizeof(*th) + options_size + len; + oi.needs_csum = false; + } + + InetTraits::tcp_pseudo_header_checksum(csum, _local_ip, _foreign_ip, + pseudo_hdr_seg_len); + + if (_tcp.get_hw_features().tx_csum_l4_offload) { + th->checksum = ~csum.get(); + } else { + csum.sum(p); + th->checksum = csum.get(); + } + + oi.protocol = ip_protocol_num::tcp; + + p.set_offload_info(oi); + + if (!data_retransmit && (len || syn_on || fin_on)) { + auto now = clock_type::now(); + if (len) { + unsigned nr_transmits = 0; + _snd.data.emplace_back(unacked_segment{std::move(clone), + len, nr_transmits, now}); + } + if (!retransmit_fd) { + start_retransmit_timer(); + } + } + + queue_packet(std::move(p)); +} + +template <typename InetTraits> +bool tcp<InetTraits>::tcb::is_all_data_acked() { + if (_snd.data.empty() && _snd.unsent_len == 0 && _snd.queued_len == 0) { + return true; + } + return false; +} + +template <typename InetTraits> +Tub<Packet> tcp<InetTraits>::tcb::read() { + Tub<Packet> p; + if (_rcv.data.empty()) + return p; + + p.construct(); + for (auto&& q : _rcv.data) { + p->append(std::move(q)); + } + _rcv.data.clear(); + return p; +} + +template <typename InetTraits> +int tcp<InetTraits>::tcb::send(Packet p) { + // We can not send after the connection is closed + ceph_assert(!_snd.closed); + + if (in_state(CLOSED)) + return -ECONNRESET; + + auto len = p.len(); + if (!_snd.user_queue_space.get_or_fail(len)) { + // note: caller must ensure enough queue space to send + ceph_abort(); + } + // TODO: Handle p.len() > max user_queue_space case + _snd.queued_len += len; + _snd.unsent_len += len; + _snd.queued_len -= len; + _snd.unsent.push_back(std::move(p)); + if (can_send() > 0) { + output(); + } + return len; +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::close() { + if (in_state(CLOSED) || _snd.closed) { + return ; + } + // TODO: We should make this asynchronous + + _errno = -EPIPE; + center->delete_file_event(fd, EVENT_READABLE|EVENT_WRITABLE); + bool acked = is_all_data_acked(); + if (!acked) { + _snd._all_data_acked_fd = manager.get_eventfd(); + center->create_file_event(_snd._all_data_acked_fd, EVENT_READABLE, all_data_ack_event); + } else { + close_final_cleanup(); + } +} + +template <typename InetTraits> +bool tcp<InetTraits>::tcb::should_send_ack(uint16_t seg_len) { + // We've received a TSO packet, do ack immediately + if (seg_len > _rcv.mss) { + _nr_full_seg_received = 0; + if (_delayed_ack_fd) { + center->delete_time_event(*_delayed_ack_fd); + _delayed_ack_fd.destroy(); + } + return true; + } + + // We've received a full sized segment, ack for every second full sized segment + if (seg_len == _rcv.mss) { + if (_nr_full_seg_received++ >= 1) { + _nr_full_seg_received = 0; + if (_delayed_ack_fd) { + center->delete_time_event(*_delayed_ack_fd); + _delayed_ack_fd.destroy(); + } + return true; + } + } + + // If the timer is armed and its callback hasn't been run. + if (_delayed_ack_fd) { + return false; + } + + // If the timer is not armed, schedule a delayed ACK. + // The maximum delayed ack timer allowed by RFC1122 is 500ms, most + // implementations use 200ms. 
+  _delayed_ack_fd.construct(center->create_time_event(200*1000, delayed_ack_event));
+  return false;
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::clear_delayed_ack() {
+  if (_delayed_ack_fd) {
+    center->delete_time_event(*_delayed_ack_fd);
+    _delayed_ack_fd.destroy();
+  }
+}
+
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::merge_out_of_order() {
+  bool merged = false;
+  if (_rcv.out_of_order.map.empty()) {
+    return merged;
+  }
+  for (auto it = _rcv.out_of_order.map.begin(); it != _rcv.out_of_order.map.end();) {
+    auto& p = it->second;
+    auto seg_beg = it->first;
+    auto seg_len = p.len();
+    auto seg_end = seg_beg + seg_len;
+    if (seg_beg <= _rcv.next && seg_end > _rcv.next) {
+      // This segment has been received out of order and its previous
+      // segment has been received now
+      auto trim = _rcv.next - seg_beg;
+      if (trim) {
+        p.trim_front(trim);
+        seg_len -= trim;
+      }
+      _rcv.next += seg_len;
+      _rcv.data.push_back(std::move(p));
+      // Since C++11, map::erase() returns the iterator following the
+      // erased element
+      it = _rcv.out_of_order.map.erase(it);
+      merged = true;
+    } else if (_rcv.next >= seg_end) {
+      // This segment has been received already, drop it
+      it = _rcv.out_of_order.map.erase(it);
+    } else {
+      // seg_beg > _rcv.next, cannot merge. Note, seg_beg can grow only,
+      // so we can stop looking here.
+      it++;
+      break;
+    }
+  }
+  return merged;
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::insert_out_of_order(tcp_sequence seg, Packet p) {
+  _rcv.out_of_order.merge(seg, std::move(p));
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::trim_receive_data_after_window() {
+  abort();
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::fast_retransmit() {
+  if (!_snd.data.empty()) {
+    auto& unacked_seg = _snd.data.front();
+    unacked_seg.nr_transmits++;
+    retransmit_one();
+    output();
+  }
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::update_rto(clock_type::time_point tx_time) {
+  // Update RTO according to RFC6298
+  auto R = std::chrono::duration_cast<std::chrono::microseconds>(clock_type::now() - tx_time);
+  if (_snd.first_rto_sample) {
+    _snd.first_rto_sample = false;
+    // RTTVAR <- R/2
+    // SRTT <- R
+    _snd.rttvar = R / 2;
+    _snd.srtt = R;
+  } else {
+    // RTTVAR <- (1 - beta) * RTTVAR + beta * |SRTT - R'|
+    // SRTT <- (1 - alpha) * SRTT + alpha * R'
+    // where alpha = 1/8 and beta = 1/4
+    auto delta = _snd.srtt > R ? (_snd.srtt - R) : (R - _snd.srtt);
+    _snd.rttvar = _snd.rttvar * 3 / 4 + delta / 4;
+    _snd.srtt = _snd.srtt * 7 / 8 + R / 8;
+  }
+  // RTO <- SRTT + max(G, K * RTTVAR)
+  _rto = _snd.srtt + std::max(_rto_clk_granularity, 4 * _snd.rttvar);
+
+  // Clamp _rto into [1 sec, 60 sec]
+  _rto = std::max(_rto, _rto_min);
+  _rto = std::min(_rto, _rto_max);
+}
+
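+// Worked example of the smoothing above (illustrative numbers only): a first
+// sample R = 100ms seeds SRTT = 100ms and RTTVAR = 50ms, so
+// RTO = 100ms + 4*50ms = 300ms, which the [1s, 60s] clamp at the end of
+// update_rto() raises to the 1s _rto_min. A second sample R' = 200ms then
+// gives RTTVAR = (3/4)*50 + (1/4)*|100 - 200| = 62.5ms and
+// SRTT = (7/8)*100 + (1/8)*200 = 112.5ms, hence
+// RTO = 112.5 + 4*62.5 = 362.5ms, again clamped up to 1s.
+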
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::update_cwnd(uint32_t acked_bytes) {
+  uint32_t smss = _snd.mss;
+  if (_snd.cwnd < _snd.ssthresh) {
+    // In slow start phase
+    _snd.cwnd += std::min(acked_bytes, smss);
+  } else {
+    // In congestion avoidance phase
+    uint32_t round_up = 1;
+    _snd.cwnd += std::max(round_up, smss * smss / _snd.cwnd);
+  }
+}
+
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::cleanup() {
+  manager.notify(fd, EVENT_READABLE);
+  _snd.closed = true;
+  _snd.unsent.clear();
+  _snd.data.clear();
+  _rcv.out_of_order.map.clear();
+  _rcv.data.clear();
+  stop_retransmit_timer();
+  clear_delayed_ack();
+  center->dispatch_event_external(new tcp<InetTraits>::C_actual_remove_tcb(this));
+  remove_from_tcbs();
+}
+
+template <typename InetTraits>
+tcp_sequence tcp<InetTraits>::tcb::get_isn() {
+  // Per RFC6528, TCP SHOULD generate its Initial Sequence Numbers
+  // with the expression:
+  //   ISN = M + F(localip, localport, remoteip, remoteport, secretkey)
+  // M is the 4 microsecond timer
+  using namespace std::chrono;
+  uint32_t hash[4];
+  hash[0] = _local_ip.ip;
+  hash[1] = _foreign_ip.ip;
+  hash[2] = (_local_port << 16) + _foreign_port;
+  hash[3] = _isn_secret.key[15];
+  ceph::crypto::MD5 md5;
+  // hash the connection 4-tuple together with the secret key, so that F()
+  // actually depends on its inputs (md5.Final() overwrites hash[] below)
+  md5.Update((const unsigned char*)hash, sizeof(hash));
+  md5.Update((const unsigned char*)_isn_secret.key, sizeof(_isn_secret.key));
+  md5.Final((unsigned char*)hash);
+  auto seq = hash[0];
+  auto m = duration_cast<microseconds>(clock_type::now().time_since_epoch());
+  seq += m.count() / 4;
+  return make_seq(seq);
+}
+
+template <typename InetTraits>
+Tub<typename InetTraits::l4packet> tcp<InetTraits>::tcb::get_packet() {
+  _poll_active = false;
+  if (_packetq.empty()) {
+    output_one();
+  }
+
+  Tub<typename InetTraits::l4packet> p;
+  if (in_state(CLOSED)) {
+    return p;
+  }
+
+  ceph_assert(!_packetq.empty());
+
+  p = std::move(_packetq.front());
+  _packetq.pop_front();
+  if (!_packetq.empty() || (_snd.dupacks < 3 && can_send() > 0)) {
+    // If there are more packets to send in the queue, or the tcb is still
+    // allowed to send, add the tcb back to the polling set to keep sending.
+    // dupacks >= 3 indicates a lost segment, so stop sending in that case.
+    output();
+  }
+  return p;
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::connection::close_read() {
+  // do nothing
+  // _tcb->manager.notify(_tcb->fd, EVENT_READABLE);
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::connection::close_write() {
+  _tcb->close();
+}
+
+template <typename InetTraits>
+constexpr uint16_t tcp<InetTraits>::tcb::_max_nr_retransmit;
+
+template <typename InetTraits>
+constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_min;
+
+template <typename InetTraits>
+constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_max;
+
+template <typename InetTraits>
+constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_clk_granularity;
+
+template <typename InetTraits>
+typename tcp<InetTraits>::tcb::isn_secret tcp<InetTraits>::tcb::_isn_secret;
+
+
+#endif /* CEPH_DPDK_TCP_H_ */
diff --git a/src/msg/async/dpdk/UserspaceEvent.cc b/src/msg/async/dpdk/UserspaceEvent.cc
new file mode 100644
index 00000000..282dcef1
--- /dev/null
+++ b/src/msg/async/dpdk/UserspaceEvent.cc
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "UserspaceEvent.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdk "
+
+int UserspaceEventManager::get_eventfd()
+{
+  int fd;
+  if (!unused_fds.empty()) {
+    fd = unused_fds.front();
+    unused_fds.pop_front();
+  } else {
+    fd = ++max_fd;
+    fds.resize(fd + 1);
+  }
+
+  Tub<UserspaceFDImpl> &impl = fds[fd];
+  ceph_assert(!impl);
+  impl.construct();
+  ldout(cct, 20) << __func__ << " fd=" << fd << dendl;
+  return fd;
+}
+
+int UserspaceEventManager::notify(int fd, int mask)
+{
+  ldout(cct, 20) << __func__ << " fd=" << fd << " mask=" << mask << dendl;
+  if ((size_t)fd >= fds.size())
+    return -ENOENT;
+
+  Tub<UserspaceFDImpl> &impl = fds[fd];
+  if (!impl)
+    return -ENOENT;
+
+  ldout(cct, 20) << __func__ << " activating=" << int(impl->activating_mask)
+                 << " listening=" << int(impl->listening_mask)
+                 << " waiting_idx=" << int(impl->waiting_idx) << dendl;
+
+  impl->activating_mask |= mask;
+  if (impl->waiting_idx)
+    return 0;
+
+  if (impl->listening_mask & mask) {
+    if (waiting_fds.size() <= max_wait_idx)
+      waiting_fds.resize(waiting_fds.size()*2);
+    impl->waiting_idx = ++max_wait_idx;
+    waiting_fds[max_wait_idx] = fd;
+  }
+
+  ldout(cct, 20) << __func__ << " activating=" << int(impl->activating_mask)
+                 << " listening=" << int(impl->listening_mask)
+                 << " waiting_idx=" << int(impl->waiting_idx) << " done " << dendl;
+  return 0;
+}
+
+void UserspaceEventManager::close(int fd)
+{
+  ldout(cct, 20) << __func__ << " fd=" << fd << dendl;
+  if ((size_t)fd >= fds.size())
+    return;
+
+  Tub<UserspaceFDImpl> &impl = fds[fd];
+  if (!impl)
+    return;
+
+  if (fd == max_fd)
+    --max_fd;
+  else
+    unused_fds.push_back(fd);
+
+  if (impl->activating_mask) {
+    if (waiting_fds[max_wait_idx] == fd) {
+      ceph_assert(impl->waiting_idx == max_wait_idx);
+      --max_wait_idx;
+    }
+    waiting_fds[impl->waiting_idx] = -1;
+  }
+  impl.destroy();
+}
+
+int UserspaceEventManager::poll(int *events, int *masks, int num_events, struct timeval *tp)
+{
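+  // Fills events[] and masks[] pairwise with the fds that are both listened
+  // to and activated, clearing the consumed activating bits as it goes, and
+  // returns the number of entries filled. Unlike epoll_wait(), tp is ignored:
+  // userspace events never block.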
+ int fd; + uint32_t i = 0; + int count = 0; + ceph_assert(num_events); + // leave zero slot for waiting_fds + while (i < max_wait_idx) { + fd = waiting_fds[++i]; + if (fd == -1) + continue; + + events[count] = fd; + Tub<UserspaceFDImpl> &impl = fds[fd]; + ceph_assert(impl); + masks[count] = impl->listening_mask & impl->activating_mask; + ceph_assert(masks[count]); + ldout(cct, 20) << __func__ << " fd=" << fd << " mask=" << masks[count] << dendl; + impl->activating_mask &= (~masks[count]); + impl->waiting_idx = 0; + if (++count >= num_events) + break; + } + if (i < max_wait_idx) { + memmove(&waiting_fds[1], &waiting_fds[i+1], sizeof(int)*(max_wait_idx-i)); + } + max_wait_idx -= i; + return count; +} diff --git a/src/msg/async/dpdk/UserspaceEvent.h b/src/msg/async/dpdk/UserspaceEvent.h new file mode 100644 index 00000000..7e89517d --- /dev/null +++ b/src/msg/async/dpdk/UserspaceEvent.h @@ -0,0 +1,106 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_USERSPACEEVENT_H +#define CEPH_USERSPACEEVENT_H + +#include <cstddef> +#include <errno.h> +#include <string.h> + +#include <vector> +#include <list> + +#include "include/ceph_assert.h" +#include "include/int_types.h" +#include "common/Tub.h" + +class CephContext; + +class UserspaceEventManager { + struct UserspaceFDImpl { + uint32_t waiting_idx = 0; + int16_t read_errno = 0; + int16_t write_errno = 0; + int8_t listening_mask = 0; + int8_t activating_mask = 0; + uint32_t magic = 4921; + }; + CephContext *cct; + int max_fd = 0; + uint32_t max_wait_idx = 0; + std::vector<Tub<UserspaceFDImpl> > fds; + std::vector<int> waiting_fds; + std::list<uint32_t> unused_fds; + + public: + explicit UserspaceEventManager(CephContext *c): cct(c) { + waiting_fds.resize(1024); + } + + int get_eventfd(); + + int listen(int fd, int mask) { + if ((size_t)fd >= fds.size()) + return -ENOENT; + + Tub<UserspaceFDImpl> &impl = fds[fd]; + if (!impl) + return -ENOENT; + + impl->listening_mask |= mask; + if (impl->activating_mask & impl->listening_mask && !impl->waiting_idx) { + if (waiting_fds.size() <= max_wait_idx) + waiting_fds.resize(waiting_fds.size()*2); + impl->waiting_idx = ++max_wait_idx; + waiting_fds[max_wait_idx] = fd; + } + return 0; + } + + int unlisten(int fd, int mask) { + if ((size_t)fd >= fds.size()) + return -ENOENT; + + Tub<UserspaceFDImpl> &impl = fds[fd]; + if (!impl) + return -ENOENT; + + impl->listening_mask &= (~mask); + if (!(impl->activating_mask & impl->listening_mask) && impl->waiting_idx) { + if (waiting_fds[max_wait_idx] == fd) { + ceph_assert(impl->waiting_idx == max_wait_idx); + --max_wait_idx; + } + waiting_fds[impl->waiting_idx] = -1; + impl->waiting_idx = 0; + } + return 0; + } + + int notify(int fd, int mask); + void close(int fd); + int poll(int *events, int *masks, int num_events, struct timeval *tp); + + bool check() { + for (auto &&m : fds) { + if (m && m->magic != 4921) + return false; + } + return true; + } +}; + +#endif //CEPH_USERSPACEEVENT_H diff --git a/src/msg/async/dpdk/align.h b/src/msg/async/dpdk/align.h new file mode 100644 index 00000000..3b48f789 --- /dev/null +++ b/src/msg/async/dpdk/align.h @@ -0,0 +1,50 @@ +/* 
+ * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_DPDK_ALIGN_HH_ +#define CEPH_MSG_DPDK_ALIGN_HH_ + +#include <cstdint> +#include <cstdlib> + +template <typename T> +inline constexpr T align_up(T v, T align) { + return (v + align - 1) & ~(align - 1); +} + +template <typename T> +inline constexpr T* align_up(T* v, size_t align) { + static_assert(sizeof(T) == 1, "align byte pointers only"); + return reinterpret_cast<T*>(align_up(reinterpret_cast<uintptr_t>(v), align)); +} + +template <typename T> +inline constexpr T align_down(T v, T align) { + return v & ~(align - 1); +} + +template <typename T> +inline constexpr T* align_down(T* v, size_t align) { + static_assert(sizeof(T) == 1, "align byte pointers only"); + return reinterpret_cast<T*>(align_down(reinterpret_cast<uintptr_t>(v), align)); +} + +#endif /* CEPH_MSG_DPDK_ALIGN_HH_ */ diff --git a/src/msg/async/dpdk/array_map.h b/src/msg/async/dpdk/array_map.h new file mode 100644 index 00000000..40f7728d --- /dev/null +++ b/src/msg/async/dpdk/array_map.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
+ */ + +#ifndef CEPH_ARRAY_MAP_HH_ +#define CEPH_ARRAY_MAP_HH_ + +#include <array> + +// unordered_map implemented as a simple array + +template <typename Value, size_t Max> +class array_map { + std::array<Value, Max> _a {}; + public: + array_map(std::initializer_list<std::pair<size_t, Value>> i) { + for (auto kv : i) { + _a[kv.first] = kv.second; + } + } + Value& operator[](size_t key) { return _a[key]; } + const Value& operator[](size_t key) const { return _a[key]; } + + Value& at(size_t key) { + if (key >= Max) { + throw std::out_of_range(std::to_string(key) + " >= " + std::to_string(Max)); + } + return _a[key]; + } +}; + +#endif /* ARRAY_MAP_HH_ */ diff --git a/src/msg/async/dpdk/byteorder.h b/src/msg/async/dpdk/byteorder.h new file mode 100644 index 00000000..a996ec07 --- /dev/null +++ b/src/msg/async/dpdk/byteorder.h @@ -0,0 +1,58 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_BYTEORDER_H_ +#define CEPH_MSG_BYTEORDER_H_ + +#include <arpa/inet.h> // for ntohs() and friends +#include <iosfwd> +#include <utility> + +inline uint64_t ntohq(uint64_t v) { + return __builtin_bswap64(v); +} +inline uint64_t htonq(uint64_t v) { + return __builtin_bswap64(v); +} + +inline void ntoh() {} +inline void hton() {} + +inline uint8_t ntoh(uint8_t x) { return x; } +inline uint8_t hton(uint8_t x) { return x; } +inline uint16_t ntoh(uint16_t x) { return ntohs(x); } +inline uint16_t hton(uint16_t x) { return htons(x); } +inline uint32_t ntoh(uint32_t x) { return ntohl(x); } +inline uint32_t hton(uint32_t x) { return htonl(x); } +inline uint64_t ntoh(uint64_t x) { return ntohq(x); } +inline uint64_t hton(uint64_t x) { return htonq(x); } + +inline int8_t ntoh(int8_t x) { return x; } +inline int8_t hton(int8_t x) { return x; } +inline int16_t ntoh(int16_t x) { return ntohs(x); } +inline int16_t hton(int16_t x) { return htons(x); } +inline int32_t ntoh(int32_t x) { return ntohl(x); } +inline int32_t hton(int32_t x) { return htonl(x); } +inline int64_t ntoh(int64_t x) { return ntohq(x); } +inline int64_t hton(int64_t x) { return htonq(x); } + +#endif /* CEPH_MSG_BYTEORDER_H_ */ diff --git a/src/msg/async/dpdk/capture.h b/src/msg/async/dpdk/capture.h new file mode 100644 index 00000000..1ace8eeb --- /dev/null +++ b/src/msg/async/dpdk/capture.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. 
See file COPYING. + * + */ + +#ifndef CEPH_MSG_DPDK_CAPTURE_H +#define CEPH_MSG_DPDK_CAPTURE_H + +#include <utility> + +template <typename T, typename F> +class capture_impl { + T x; + F f; + public: + capture_impl(capture_impl &) = delete; + capture_impl( T && x, F && f ) + : x{std::forward<T>(x)}, f{std::forward<F>(f)} + {} + + template <typename ...Ts> auto operator()( Ts&&...args ) + -> decltype(f( x, std::forward<Ts>(args)... )) + { + return f( x, std::forward<Ts>(args)... ); + } + + template <typename ...Ts> auto operator()( Ts&&...args ) const + -> decltype(f( x, std::forward<Ts>(args)... )) + { + return f( x, std::forward<Ts>(args)... ); + } +}; + +template <typename T, typename F> +capture_impl<T,F> capture( T && x, F && f ) { + return capture_impl<T,F>( + std::forward<T>(x), std::forward<F>(f) ); +} + +#endif //CEPH_MSG_DPDK_CAPTURE_H diff --git a/src/msg/async/dpdk/circular_buffer.h b/src/msg/async/dpdk/circular_buffer.h new file mode 100644 index 00000000..2c92c120 --- /dev/null +++ b/src/msg/async/dpdk/circular_buffer.h @@ -0,0 +1,347 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_CIRCULAR_BUFFER_HH_ +#define CEPH_CIRCULAR_BUFFER_HH_ + +// A growable double-ended queue container that can be efficiently +// extended (and shrunk) from both ends. Implementation is a single +// storage vector. +// +// Similar to libstdc++'s std::deque, except that it uses a single level +// store, and so is more efficient for simple stored items. +// Similar to boost::circular_buffer_space_optimized, except it uses +// uninitialized storage for unoccupied elements (and thus move/copy +// constructors instead of move/copy assignments, which are less efficient). + +#include <memory> +#include <algorithm> + +#include "transfer.h" + +template <typename T, typename Alloc = std::allocator<T>> +class circular_buffer { + struct impl : Alloc { + T* storage = nullptr; + // begin, end interpreted (mod capacity) + size_t begin = 0; + size_t end = 0; + size_t capacity = 0; + }; + impl _impl; + public: + using value_type = T; + using size_type = size_t; + using reference = T&; + using pointer = T*; + using const_reference = const T&; + using const_pointer = const T*; + public: + circular_buffer() = default; + circular_buffer(circular_buffer&& X); + circular_buffer(const circular_buffer& X) = delete; + ~circular_buffer(); + circular_buffer& operator=(const circular_buffer&) = delete; + circular_buffer& operator=(circular_buffer&&) = delete; + void push_front(const T& data); + void push_front(T&& data); + template <typename... A> + void emplace_front(A&&... args); + void push_back(const T& data); + void push_back(T&& data); + template <typename... A> + void emplace_back(A&&... 
args); + T& front(); + T& back(); + void pop_front(); + void pop_back(); + bool empty() const; + size_t size() const; + size_t capacity() const; + T& operator[](size_t idx); + template <typename Func> + void for_each(Func func); + // access an element, may return wrong or destroyed element + // only useful if you do not rely on data accuracy (e.g. prefetch) + T& access_element_unsafe(size_t idx); + private: + void expand(); + void maybe_expand(size_t nr = 1); + size_t mask(size_t idx) const; + + template<typename CB, typename ValueType> + struct cbiterator : std::iterator<std::random_access_iterator_tag, ValueType> { + typedef std::iterator<std::random_access_iterator_tag, ValueType> super_t; + + ValueType& operator*() const { return cb->_impl.storage[cb->mask(idx)]; } + ValueType* operator->() const { return &cb->_impl.storage[cb->mask(idx)]; } + // prefix + cbiterator<CB, ValueType>& operator++() { + idx++; + return *this; + } + // postfix + cbiterator<CB, ValueType> operator++(int unused) { + auto v = *this; + idx++; + return v; + } + // prefix + cbiterator<CB, ValueType>& operator--() { + idx--; + return *this; + } + // postfix + cbiterator<CB, ValueType> operator--(int unused) { + auto v = *this; + idx--; + return v; + } + cbiterator<CB, ValueType> operator+(typename super_t::difference_type n) const { + return cbiterator<CB, ValueType>(cb, idx + n); + } + cbiterator<CB, ValueType> operator-(typename super_t::difference_type n) const { + return cbiterator<CB, ValueType>(cb, idx - n); + } + cbiterator<CB, ValueType>& operator+=(typename super_t::difference_type n) { + idx += n; + return *this; + } + cbiterator<CB, ValueType>& operator-=(typename super_t::difference_type n) { + idx -= n; + return *this; + } + bool operator==(const cbiterator<CB, ValueType>& rhs) const { + return idx == rhs.idx; + } + bool operator!=(const cbiterator<CB, ValueType>& rhs) const { + return idx != rhs.idx; + } + bool operator<(const cbiterator<CB, ValueType>& rhs) const { + return idx < rhs.idx; + } + bool operator>(const cbiterator<CB, ValueType>& rhs) const { + return idx > rhs.idx; + } + bool operator>=(const cbiterator<CB, ValueType>& rhs) const { + return idx >= rhs.idx; + } + bool operator<=(const cbiterator<CB, ValueType>& rhs) const { + return idx <= rhs.idx; + } + typename super_t::difference_type operator-(const cbiterator<CB, ValueType>& rhs) const { + return idx - rhs.idx; + } + private: + CB* cb; + size_t idx; + cbiterator<CB, ValueType>(CB* b, size_t i) : cb(b), idx(i) {} + friend class circular_buffer; + }; + friend class iterator; + + public: + typedef cbiterator<circular_buffer, T> iterator; + typedef cbiterator<const circular_buffer, const T> const_iterator; + + iterator begin() { + return iterator(this, _impl.begin); + } + const_iterator begin() const { + return const_iterator(this, _impl.begin); + } + iterator end() { + return iterator(this, _impl.end); + } + const_iterator end() const { + return const_iterator(this, _impl.end); + } + const_iterator cbegin() const { + return const_iterator(this, _impl.begin); + } + const_iterator cend() const { + return const_iterator(this, _impl.end); + } +}; + +template <typename T, typename Alloc> +inline size_t circular_buffer<T, Alloc>::mask(size_t idx) const { + return idx & (_impl.capacity - 1); +} + +template <typename T, typename Alloc> +inline bool circular_buffer<T, Alloc>::empty() const { + return _impl.begin == _impl.end; +} + +template <typename T, typename Alloc> +inline size_t circular_buffer<T, Alloc>::size() const { + return _impl.end - 
_impl.begin; +} + +template <typename T, typename Alloc> +inline size_t circular_buffer<T, Alloc>::capacity() const { + return _impl.capacity; +} + +template <typename T, typename Alloc> +inline circular_buffer<T, Alloc>::circular_buffer(circular_buffer&& x) + : _impl(std::move(x._impl)) { + x._impl = {}; +} + +template <typename T, typename Alloc> +template <typename Func> +inline void circular_buffer<T, Alloc>::for_each(Func func) { + auto s = _impl.storage; + auto m = _impl.capacity - 1; + for (auto i = _impl.begin; i != _impl.end; ++i) { + func(s[i & m]); + } +} + +template <typename T, typename Alloc> +inline circular_buffer<T, Alloc>::~circular_buffer() { + for_each([this] (T& obj) { + _impl.destroy(&obj); + }); + _impl.deallocate(_impl.storage, _impl.capacity); +} + +template <typename T, typename Alloc> +void circular_buffer<T, Alloc>::expand() { + auto new_cap = std::max<size_t>(_impl.capacity * 2, 1); + auto new_storage = _impl.allocate(new_cap); + auto p = new_storage; + try { + for_each([this, &p] (T& obj) { + transfer_pass1(_impl, &obj, p); + p++; + }); + } catch (...) { + while (p != new_storage) { + _impl.destroy(--p); + } + _impl.deallocate(new_storage, new_cap); + throw; + } + p = new_storage; + for_each([this, &p] (T& obj) { + transfer_pass2(_impl, &obj, p++); + }); + std::swap(_impl.storage, new_storage); + std::swap(_impl.capacity, new_cap); + _impl.begin = 0; + _impl.end = p - _impl.storage; + _impl.deallocate(new_storage, new_cap); +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::maybe_expand(size_t nr) { + if (_impl.end - _impl.begin + nr > _impl.capacity) { + expand(); + } +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::push_front(const T& data) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.begin - 1)]; + _impl.construct(p, data); + --_impl.begin; +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::push_front(T&& data) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.begin - 1)]; + _impl.construct(p, std::move(data)); + --_impl.begin; +} + +template <typename T, typename Alloc> +template <typename... Args> +inline void circular_buffer<T, Alloc>::emplace_front(Args&&... args) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.begin - 1)]; + _impl.construct(p, std::forward<Args>(args)...); + --_impl.begin; +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::push_back(const T& data) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.end)]; + _impl.construct(p, data); + ++_impl.end; +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::push_back(T&& data) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.end)]; + _impl.construct(p, std::move(data)); + ++_impl.end; +} + +template <typename T, typename Alloc> +template <typename... Args> +inline void circular_buffer<T, Alloc>::emplace_back(Args&&... 
args) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.end)]; + _impl.construct(p, std::forward<Args>(args)...); + ++_impl.end; +} + +template <typename T, typename Alloc> +inline T& circular_buffer<T, Alloc>::front() { + return _impl.storage[mask(_impl.begin)]; +} + +template <typename T, typename Alloc> +inline T& circular_buffer<T, Alloc>::back() { + return _impl.storage[mask(_impl.end - 1)]; +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::pop_front() { + _impl.destroy(&front()); + ++_impl.begin; +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::pop_back() { + _impl.destroy(&back()); + --_impl.end; +} + +template <typename T, typename Alloc> +inline T& circular_buffer<T, Alloc>::operator[](size_t idx) { + return _impl.storage[mask(_impl.begin + idx)]; +} + +template <typename T, typename Alloc> +inline T& circular_buffer<T, Alloc>::access_element_unsafe(size_t idx) { + return _impl.storage[mask(_impl.begin + idx)]; +} + +#endif /* CEPH_CIRCULAR_BUFFER_HH_ */ diff --git a/src/msg/async/dpdk/const.h b/src/msg/async/dpdk/const.h new file mode 100644 index 00000000..ea5dc49e --- /dev/null +++ b/src/msg/async/dpdk/const.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_CONST_H_ +#define CEPH_MSG_CONST_H_ + +#include <stdint.h> + +enum class ip_protocol_num : uint8_t { + icmp = 1, tcp = 6, unused = 255 +}; + +enum class eth_protocol_num : uint16_t { + ipv4 = 0x0800, arp = 0x0806, ipv6 = 0x86dd +}; + +const uint8_t eth_hdr_len = 14; +const uint8_t tcp_hdr_len_min = 20; +const uint8_t ipv4_hdr_len_min = 20; +const uint8_t ipv6_hdr_len_min = 40; +const uint16_t ip_packet_len_max = 65535; + +#endif diff --git a/src/msg/async/dpdk/dpdk_rte.cc b/src/msg/async/dpdk/dpdk_rte.cc new file mode 100644 index 00000000..9f9d343b --- /dev/null +++ b/src/msg/async/dpdk/dpdk_rte.cc @@ -0,0 +1,154 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+#include <bitset>
+
+#include <rte_config.h>
+#include <rte_common.h>
+#include <rte_ethdev.h>
+#include <rte_version.h>
+
+#include "DPDK.h"
+#include "dpdk_rte.h"
+
+namespace dpdk {
+
+  static inline std::vector<char> string2vector(std::string str) {
+    auto v = std::vector<char>(str.begin(), str.end());
+    v.push_back('\0');
+    return v;
+  }
+
+  bool eal::initialized = false;
+  std::thread eal::t;
+  std::mutex eal::lock;
+  std::condition_variable eal::cond;
+  std::list<std::function<void()>> eal::funcs;
+
+  static int bitcount(unsigned long long n)
+  {
+    return std::bitset<CHAR_BIT * sizeof(n)>{n}.count();
+  }
+
+  int eal::init(CephContext *c)
+  {
+    if (initialized) {
+      return 1;
+    }
+
+    bool done = false;
+    auto num = std::stoull(c->_conf.get_val<std::string>("ms_dpdk_coremask"),
+                           nullptr, 16);
+    unsigned int coremaskbit = bitcount(num);
+
+    ceph_assert(coremaskbit > c->_conf->ms_async_op_threads);
+
+    t = std::thread([&]() {
+      // TODO: Inherit these from the app parameters - "opts"
+      std::vector<std::vector<char>> args {
+          string2vector(string("ceph")),
+          string2vector("-c"), string2vector(c->_conf.get_val<std::string>("ms_dpdk_coremask")),
+          string2vector("-n"), string2vector(c->_conf->ms_dpdk_memory_channel),
+      };
+
+      Tub<std::string> hugepages_path;
+      if (!c->_conf->ms_dpdk_hugepages.empty()) {
+        hugepages_path.construct(c->_conf->ms_dpdk_hugepages);
+      }
+
+      // If a hugepages path was provided, pass it to the EAL together with a
+      // memory size; otherwise, when DPDK PMD drivers mode is requested, run
+      // without hugepages ("--no-huge").
+      if (hugepages_path) {
+        args.push_back(string2vector("--huge-dir"));
+        args.push_back(string2vector(*hugepages_path));
+
+        //
+        // We don't know what is going to be our networking configuration so we
+        // assume there is going to be a queue per-CPU. Plus we'll give a DPDK
+        // 64MB for "other stuff".
+        //
+        unsigned int x;
+        std::stringstream ss;
+        ss << std::hex << "fffefffe";
+        ss >> x;
+        size_t size_MB = mem_size(bitcount(x)) >> 20;
+        std::stringstream size_MB_str;
+        size_MB_str << size_MB;
+
+        args.push_back(string2vector("-m"));
+        args.push_back(string2vector(size_MB_str.str()));
+      } else if (!c->_conf->ms_dpdk_pmd.empty()) {
+        args.push_back(string2vector("--no-huge"));
+      }
+
+      std::string rte_file_prefix;
+      rte_file_prefix = "rte_";
+      rte_file_prefix += c->_conf->name.to_str();
+      args.push_back(string2vector("--file-prefix"));
+      args.push_back(string2vector(rte_file_prefix));
+
+      std::vector<char*> cargs;
+
+      for (auto&& a: args) {
+        cargs.push_back(a.data());
+      }
+      /* initialise the EAL for all */
+      int ret = rte_eal_init(cargs.size(), cargs.data());
+      if (ret < 0)
+        return ret;
+
+      std::unique_lock<std::mutex> l(lock);
+      initialized = true;
+      done = true;
+      cond.notify_all();
+      while (true) {
+        if (!funcs.empty()) {
+          auto f = std::move(funcs.front());
+          funcs.pop_front();
+          f();
+          cond.notify_all();
+        } else {
+          cond.wait(l);
+        }
+      }
+    });
+    t.detach();
+    std::unique_lock<std::mutex> l(lock);
+    while (!done)
+      cond.wait(l);
+    return 0;
+  }
+
+  size_t eal::mem_size(int num_cpus)
+  {
+    size_t memsize = 0;
+    //
+    // PMD mempool memory:
+    //
+    // We don't know what is going to be our networking configuration so we
+    // assume there is going to be a queue per-CPU.
+    //
+    memsize += num_cpus * qp_mempool_obj_size();
+
+    // Plus we'll give a DPDK 64MB for "other stuff".
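+    // That is: memsize = num_cpus * qp_mempool_obj_size() + 64 MiB,
+    // since (64UL << 20) is 64 MiB.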
+    memsize += (64UL << 20);
+
+    return memsize;
+  }
+
+} // namespace dpdk
diff --git a/src/msg/async/dpdk/dpdk_rte.h b/src/msg/async/dpdk/dpdk_rte.h
new file mode 100644
index 00000000..4aa83899
--- /dev/null
+++ b/src/msg/async/dpdk/dpdk_rte.h
@@ -0,0 +1,74 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef CEPH_DPDK_RTE_H_
+#define CEPH_DPDK_RTE_H_
+
+
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+
+#include <bitset>
+#include <rte_config.h>
+#include <rte_version.h>
+#include <boost/program_options.hpp>
+
+/*********************** Compat section ***************************************/
+// We currently support only versions 2.0 and above.
+#if (RTE_VERSION < RTE_VERSION_NUM(2,0,0,0))
+#error "DPDK version 2.0.0 or above is required"
+#endif
+
+#if defined(RTE_MBUF_REFCNT_ATOMIC)
+#warning "CONFIG_RTE_MBUF_REFCNT_ATOMIC should be disabled in DPDK's " \
+         "config/common_linuxapp"
+#endif
+/******************************************************************************/
+
+namespace dpdk {
+
+// DPDK Environment Abstraction Layer
+class eal {
+ public:
+  using cpuset = std::bitset<RTE_MAX_LCORE>;
+
+  static std::mutex lock;
+  static std::condition_variable cond;
+  static std::list<std::function<void()>> funcs;
+  static int init(CephContext *c);
+  static void execute_on_master(std::function<void()> &&f) {
+    bool done = false;
+    std::unique_lock<std::mutex> l(lock);
+    funcs.emplace_back([&]() { f(); done = true; });
+    cond.notify_all();
+    while (!done)
+      cond.wait(l);
+  }
+  /**
+   * Returns the amount of memory needed for DPDK
+   * @param num_cpus Number of CPUs the application is going to use
+   *
+   * @return the required size, in bytes
+   */
+  static size_t mem_size(int num_cpus);
+  static bool initialized;
+  static std::thread t;
+};
+
+} // namespace dpdk
+#endif // CEPH_DPDK_RTE_H_
diff --git a/src/msg/async/dpdk/ethernet.cc b/src/msg/async/dpdk/ethernet.cc
new file mode 100644
index 00000000..9aca5078
--- /dev/null
+++ b/src/msg/async/dpdk/ethernet.cc
@@ -0,0 +1,18 @@
+#include <iomanip>
+
+#include "ethernet.h"
+
+std::ostream& operator<<(std::ostream& os, const ethernet_address& ea) {
+  auto& m = ea.mac;
+  using u = uint32_t;
+  // std::setw() only applies to the next insertion, so it must be repeated
+  // for every byte; std::setfill('0') keeps single-digit bytes zero-padded.
+  os << std::hex << std::setfill('0')
+     << std::setw(2) << u(m[0]) << ":"
+     << std::setw(2) << u(m[1]) << ":"
+     << std::setw(2) << u(m[2]) << ":"
+     << std::setw(2) << u(m[3]) << ":"
+     << std::setw(2) << u(m[4]) << ":"
+     << std::setw(2) << u(m[5]);
+  return os;
+}
diff --git a/src/msg/async/dpdk/ethernet.h b/src/msg/async/dpdk/ethernet.h
new file mode 100644
index 00000000..b007425f
--- /dev/null
+++ b/src/msg/async/dpdk/ethernet.h
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. 
You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_ETHERNET_H_ +#define CEPH_MSG_ETHERNET_H_ + +#include <array> +#include <sstream> + +#include "include/ceph_assert.h" +#include "byteorder.h" + +struct ethernet_address { + ethernet_address() {} + + ethernet_address(const uint8_t *eaddr) { + std::copy(eaddr, eaddr + 6, mac.begin()); + } + + ethernet_address(std::initializer_list<uint8_t> eaddr) { + ceph_assert(eaddr.size() == mac.size()); + std::copy(eaddr.begin(), eaddr.end(), mac.begin()); + } + + ethernet_address ntoh() { + return *this; + } + ethernet_address hton() { + return *this; + } + std::array<uint8_t, 6> mac; +} __attribute__((packed)); + +inline bool operator==(const ethernet_address& a, const ethernet_address& b) { + return a.mac == b.mac; +} +std::ostream& operator<<(std::ostream& os, const ethernet_address& ea); + +struct ethernet { + using address = ethernet_address; + static address broadcast_address() { + return {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + } + static constexpr uint16_t arp_hardware_type() { return 1; } +}; + +struct eth_hdr { + ethernet_address dst_mac; + ethernet_address src_mac; + uint16_t eth_proto; + eth_hdr hton() { + eth_hdr hdr = *this; + hdr.eth_proto = ::hton(eth_proto); + return hdr; + } + eth_hdr ntoh() { + eth_hdr hdr = *this; + hdr.eth_proto = ::ntoh(eth_proto); + return hdr; + } +} __attribute__((packed)); + +ethernet_address parse_ethernet_address(std::string addr); + +#endif /* CEPH_MSG_ETHERNET_H_ */ diff --git a/src/msg/async/dpdk/ip_types.h b/src/msg/async/dpdk/ip_types.h new file mode 100644 index 00000000..356d8fd6 --- /dev/null +++ b/src/msg/async/dpdk/ip_types.h @@ -0,0 +1,109 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + * + */ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_IP_TYPES_H_H +#define CEPH_IP_TYPES_H_H + +#include <boost/asio/ip/address_v4.hpp> +#include <string> + +class Packet; +class ethernet_address; +using resolution_cb = std::function<void (const ethernet_address&, Packet, int)>; + +struct ipv4_addr { + uint32_t ip; + uint16_t port; + + ipv4_addr() : ip(0), port(0) {} + ipv4_addr(uint32_t ip, uint16_t port) : ip(ip), port(port) {} + ipv4_addr(uint16_t port) : ip(0), port(port) {} + ipv4_addr(const std::string &addr); + ipv4_addr(const std::string &addr, uint16_t port); + + ipv4_addr(const entity_addr_t &ad) { + ip = ntoh(ad.in4_addr().sin_addr.s_addr); + port = ad.get_port(); + } + + ipv4_addr(entity_addr_t &&addr) : ipv4_addr(addr) {} +}; + +struct ipv4_address { + ipv4_address() : ip(0) {} + explicit ipv4_address(uint32_t ip) : ip(ip) {} + explicit ipv4_address(const std::string& addr) { + ip = static_cast<uint32_t>(boost::asio::ip::address_v4::from_string(addr).to_ulong()); + } + ipv4_address(ipv4_addr addr) { + ip = addr.ip; + } + + uint32_t ip; + + ipv4_address hton() { + ipv4_address addr; + addr.ip = ::hton(ip); + return addr; + } + ipv4_address ntoh() { + ipv4_address addr; + addr.ip = ::ntoh(ip); + return addr; + } + + friend bool operator==(ipv4_address x, ipv4_address y) { + return x.ip == y.ip; + } + friend bool operator!=(ipv4_address x, ipv4_address y) { + return x.ip != y.ip; + } +} __attribute__((packed)); + +static inline bool is_unspecified(ipv4_address addr) { return addr.ip == 0; } + +std::ostream& operator<<(std::ostream& os, const ipv4_address& a); + +namespace std { + + template <> + struct hash<ipv4_address> { + size_t operator()(ipv4_address a) const { return a.ip; } + }; + +} + +#endif //CEPH_IP_TYPES_H_H diff --git a/src/msg/async/dpdk/net.cc b/src/msg/async/dpdk/net.cc new file mode 100644 index 00000000..6e361f18 --- /dev/null +++ b/src/msg/async/dpdk/net.cc @@ -0,0 +1,205 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ */
+
+#include "net.h"
+#include "DPDK.h"
+#include "DPDKStack.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "net "
+
+interface::interface(CephContext *cct, std::shared_ptr<DPDKDevice> dev, EventCenter *center)
+    : cct(cct), _dev(dev),
+      _rx(_dev->receive(
+          center->get_id(),
+          [center, this] (Packet p) {
+            return dispatch_packet(center, std::move(p));
+          }
+      )),
+      _hw_address(_dev->hw_address()),
+      _hw_features(_dev->get_hw_features()) {
+  auto idx = 0u;
+  unsigned qid = center->get_id();
+  dev->queue_for_cpu(center->get_id()).register_packet_provider([this, idx, qid] () mutable {
+    Tub<Packet> p;
+    for (size_t i = 0; i < _pkt_providers.size(); i++) {
+      auto l3p = _pkt_providers[idx++]();
+      if (idx == _pkt_providers.size())
+        idx = 0;
+      if (l3p) {
+        auto l3pv = std::move(*l3p);
+        auto eh = l3pv.p.prepend_header<eth_hdr>();
+        eh->dst_mac = l3pv.to;
+        eh->src_mac = _hw_address;
+        eh->eth_proto = uint16_t(l3pv.proto_num);
+        *eh = eh->hton();
+        ldout(this->cct, 10) << "=== tx === proto " << std::hex << uint16_t(l3pv.proto_num)
+                             << " " << _hw_address << " -> " << l3pv.to
+                             << " length " << std::dec << l3pv.p.len() << dendl;
+        p = std::move(l3pv.p);
+        return p;
+      }
+    }
+    return p;
+  });
+}
+
+subscription<Packet, ethernet_address> interface::register_l3(
+  eth_protocol_num proto_num,
+  std::function<int (Packet p, ethernet_address from)> next,
+  std::function<bool (forward_hash&, Packet& p, size_t)> forward)
+{
+  auto i = _proto_map.emplace(std::piecewise_construct, std::make_tuple(uint16_t(proto_num)), std::forward_as_tuple(std::move(forward)));
+  ceph_assert(i.second);
+  l3_rx_stream& l3_rx = i.first->second;
+  return l3_rx.packet_stream.listen(std::move(next));
+}
+
+unsigned interface::hash2cpu(uint32_t hash) {
+  return _dev->hash2cpu(hash);
+}
+
+const rss_key_type& interface::rss_key() const {
+  return _dev->rss_key();
+}
+
+uint16_t interface::hw_queues_count() const {
+  return _dev->hw_queues_count();
+}
+
+class C_handle_l2forward : public EventCallback {
+  std::shared_ptr<DPDKDevice> sdev;
+  unsigned &queue_depth;
+  Packet p;
+  unsigned dst;
+
+ public:
+  C_handle_l2forward(std::shared_ptr<DPDKDevice> &p, unsigned &qd, Packet pkt, unsigned target)
+      : sdev(p), queue_depth(qd), p(std::move(pkt)), dst(target) {}
+  void do_request(uint64_t fd) {
+    sdev->l2receive(dst, std::move(p));
+    queue_depth--;
+    delete this;
+  }
+};
+
+void interface::forward(EventCenter *source, unsigned target, Packet p) {
+  static __thread unsigned queue_depth;
+
+  if (queue_depth < 1000) {
+    queue_depth++;
+    // FIXME: need to ensure this event is not invoked after the EventCenter
+    // is destructed
+    _dev->workers[target]->center.dispatch_event_external(
+        new C_handle_l2forward(_dev, queue_depth, std::move(p.free_on_cpu(source)), target));
+  }
+}
+
+int interface::dispatch_packet(EventCenter *center, Packet p) {
+  auto eh = p.get_header<eth_hdr>();
+  if (eh) {
+    auto i = _proto_map.find(ntoh(eh->eth_proto));
+    auto hwrss = p.rss_hash();
+    if (hwrss) {
+      ldout(cct, 10) << __func__ << " === rx === proto " << std::hex << ::ntoh(eh->eth_proto)
+                     << " "<< eh->src_mac.ntoh() << " -> " << eh->dst_mac.ntoh()
+                     << " length " << std::dec << p.len() << " rss_hash " << *p.rss_hash() << dendl;
+    } else {
+      ldout(cct, 10) << __func__ << " === rx === proto " << std::hex << 
::ntoh(eh->eth_proto) + << " "<< eh->src_mac.ntoh() << " -> " << eh->dst_mac.ntoh() + << " length " << std::dec << p.len() << dendl; + } + if (i != _proto_map.end()) { + l3_rx_stream& l3 = i->second; + auto fw = _dev->forward_dst(center->get_id(), [&p, &l3, this] () { + auto hwrss = p.rss_hash(); + if (hwrss) { + return *hwrss; + } else { + forward_hash data; + if (l3.forward(data, p, sizeof(eth_hdr))) { + return toeplitz_hash(rss_key(), data); + } + return 0u; + } + }); + if (fw != center->get_id()) { + ldout(cct, 1) << __func__ << " forward to " << fw << dendl; + forward(center, fw, std::move(p)); + } else { + auto h = eh->ntoh(); + auto from = h.src_mac; + p.trim_front(sizeof(*eh)); + // avoid chaining, since queue length is unlimited + // drop instead. + if (l3.ready()) { + return l3.packet_stream.produce(std::move(p), from); + } + } + } + } + return 0; +} + +class C_arp_learn : public EventCallback { + DPDKWorker *worker; + ethernet_address l2_addr; + ipv4_address l3_addr; + + public: + C_arp_learn(DPDKWorker *w, ethernet_address l2, ipv4_address l3) + : worker(w), l2_addr(l2), l3_addr(l3) {} + void do_request(uint64_t id) { + worker->arp_learn(l2_addr, l3_addr); + delete this; + } +}; + +void interface::arp_learn(ethernet_address l2, ipv4_address l3) +{ + for (auto &&w : _dev->workers) { + w->center.dispatch_event_external( + new C_arp_learn(w, l2, l3)); + } +} + +l3_protocol::l3_protocol(interface* netif, eth_protocol_num proto_num, packet_provider_type func) + : _netif(netif), _proto_num(proto_num) { + _netif->register_packet_provider(std::move(func)); +} + +subscription<Packet, ethernet_address> l3_protocol::receive( + std::function<int (Packet, ethernet_address)> rx_fn, + std::function<bool (forward_hash &h, Packet &p, size_t s)> forward) { + return _netif->register_l3(_proto_num, std::move(rx_fn), std::move(forward)); +}; diff --git a/src/msg/async/dpdk/net.h b/src/msg/async/dpdk/net.h new file mode 100644 index 00000000..63f0422b --- /dev/null +++ b/src/msg/async/dpdk/net.h @@ -0,0 +1,138 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
+ */
+
+#ifndef CEPH_MSG_DPDK_NET_H
+#define CEPH_MSG_DPDK_NET_H
+
+#include "const.h"
+#include "ethernet.h"
+#include "Packet.h"
+#include "stream.h"
+#include "toeplitz.h"
+
+struct hw_features {
+  // Enable tx ip header checksum offload
+  bool tx_csum_ip_offload = false;
+  // Enable tx l4 (TCP or UDP) checksum offload
+  bool tx_csum_l4_offload = false;
+  // Enable rx checksum offload
+  bool rx_csum_offload = false;
+  // LRO is enabled
+  bool rx_lro = false;
+  // Enable tx TCP segment offload
+  bool tx_tso = false;
+  // Enable tx UDP fragmentation offload
+  bool tx_ufo = false;
+  // Maximum Transmission Unit
+  uint16_t mtu = 1500;
+  // Maximum packet len when TCP/UDP offload is enabled
+  uint16_t max_packet_len = ip_packet_len_max - eth_hdr_len;
+};
+
+class forward_hash {
+  uint8_t data[64];
+  size_t end_idx = 0;
+ public:
+  size_t size() const {
+    return end_idx;
+  }
+  void push_back(uint8_t b) {
+    ceph_assert(end_idx < sizeof(data));
+    data[end_idx++] = b;
+  }
+  void push_back(uint16_t b) {
+    push_back(uint8_t(b));
+    push_back(uint8_t(b >> 8));
+  }
+  void push_back(uint32_t b) {
+    push_back(uint16_t(b));
+    push_back(uint16_t(b >> 16));
+  }
+  const uint8_t& operator[](size_t idx) const {
+    return data[idx];
+  }
+};
+
+class interface;
+
+class l3_protocol {
+ public:
+  struct l3packet {
+    eth_protocol_num proto_num;
+    ethernet_address to;
+    Packet p;
+  };
+  using packet_provider_type = std::function<Tub<l3packet> ()>;
+
+ private:
+  interface* _netif;
+  eth_protocol_num _proto_num;
+
+ public:
+  explicit l3_protocol(interface* netif, eth_protocol_num proto_num, packet_provider_type func);
+  subscription<Packet, ethernet_address> receive(
+      std::function<int (Packet, ethernet_address)> rx_fn,
+      std::function<bool (forward_hash &h, Packet &p, size_t s)> forward);
+
+ private:
+  friend class interface;
+};
+
+class DPDKDevice;
+struct ipv4_address;
+
+class interface {
+  CephContext *cct;
+  struct l3_rx_stream {
+    stream<Packet, ethernet_address> packet_stream;
+    std::function<bool (forward_hash&, Packet&, size_t)> forward;
+    bool ready() { return packet_stream.started(); }
+    explicit l3_rx_stream(std::function<bool (forward_hash&, Packet&, size_t)>&& fw) : forward(fw) {}
+  };
+  std::unordered_map<uint16_t, l3_rx_stream> _proto_map;
+  std::shared_ptr<DPDKDevice> _dev;
+  subscription<Packet> _rx;
+  ethernet_address _hw_address;
+  struct hw_features _hw_features;
+  std::vector<l3_protocol::packet_provider_type> _pkt_providers;
+
+ private:
+  int dispatch_packet(EventCenter *c, Packet p);
+ public:
+  explicit interface(CephContext *cct, std::shared_ptr<DPDKDevice> dev, EventCenter *center);
+  ethernet_address hw_address() { return _hw_address; }
+  const struct hw_features& get_hw_features() const { return _hw_features; }
+  subscription<Packet, ethernet_address> register_l3(
+      eth_protocol_num proto_num,
+      std::function<int (Packet, ethernet_address)> next,
+      std::function<bool (forward_hash&, Packet&, size_t)> forward);
+  void forward(EventCenter *source, unsigned target, Packet p);
+  unsigned hash2cpu(uint32_t hash);
+  void register_packet_provider(l3_protocol::packet_provider_type func) {
+    _pkt_providers.push_back(std::move(func));
+  }
+  const rss_key_type& rss_key() const;
+  uint16_t hw_queues_count() const;
+  void arp_learn(ethernet_address l2, ipv4_address l3);
+  friend class l3_protocol;
+};
+
+#endif //CEPH_MSG_DPDK_NET_H
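For orientation, a hypothetical sketch of how an L3 protocol plugs into this interface: constructing an l3_protocol registers a tx packet provider, and receive() subscribes the rx handler. The class name arp_like and its no-op callbacks are illustrative only and not part of this change; Packet, Tub and subscription are assumed to behave as declared above.

class arp_like {
  l3_protocol _proto;
  subscription<Packet, ethernet_address> _rx;
 public:
  explicit arp_like(interface* netif)
    : _proto(netif, eth_protocol_num::arp,
             [] { return Tub<l3_protocol::l3packet>(); }),  // tx provider: nothing queued
      _rx(_proto.receive(
          [] (Packet p, ethernet_address from) { return 0; },           // rx handler
          [] (forward_hash& h, Packet& p, size_t off) { return false; })) {} // no sw hashing
};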
diff --git a/src/msg/async/dpdk/queue.h b/src/msg/async/dpdk/queue.h
new file mode 100644
index 00000000..984ddca1
--- /dev/null
+++ b/src/msg/async/dpdk/queue.h
@@ -0,0 +1,95 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_DPDK_QUEUE_H_
+#define CEPH_MSG_DPDK_QUEUE_H_
+
+#include <queue>
+
+#include "circular_buffer.h"
+
+template <typename T>
+class queue {
+  std::queue<T, circular_buffer<T>> _q;
+  size_t _max;
+
+ public:
+  explicit queue(size_t size): _max(size) {}
+
+  // Push an item.
+  //
+  // Returns false if the queue was full and the item was not pushed.
+  bool push(T&& a);
+
+  // Pops an item.
+  T pop();
+
+  // Consumes items from the queue, passing them to @func, until @func
+  // returns false or the queue is empty.
+  //
+  // Returns false if func returned false.
+  template <typename Func>
+  bool consume(Func&& func);
+
+  // Returns true when the queue is empty.
+  bool empty() const;
+
+  // Returns true when the queue is full.
+  bool full() const;
+
+  size_t size() const { return _q.size(); }
+
+  // Destroy any items in the queue
+  void clear() {
+    while (!_q.empty()) {
+      _q.pop();
+    }
+  }
+};
+
+template <typename T>
+inline bool queue<T>::push(T&& data) {
+  if (_q.size() < _max) {
+    _q.push(std::move(data));
+    return true;
+  } else {
+    return false;
+  }
+}
+
+template <typename T>
+inline T queue<T>::pop() {
+  T data = std::move(_q.front());
+  _q.pop();
+  return data;
+}
+
+template <typename T>
+inline bool queue<T>::empty() const {
+  return _q.empty();
+}
+
+template <typename T>
+inline bool queue<T>::full() const {
+  return _q.size() == _max;
+}
+
+#endif /* CEPH_MSG_DPDK_QUEUE_H_ */
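For illustration only (not part of this change): the bounded queue applies back-pressure through push()'s return value instead of growing, so a producer can react when the consumer falls behind. Values below are arbitrary.

queue<int> q(4);          // bounded to 4 items
while (q.push(1)) {}      // fills the queue; push() returns false once full
ceph_assert(q.full() && !q.empty());
int v = q.pop();          // FIFO: oldest item out first
ceph_assert(!q.full());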
diff --git a/src/msg/async/dpdk/shared_ptr.h b/src/msg/async/dpdk/shared_ptr.h
new file mode 100644
index 00000000..d078063b
--- /dev/null
+++ b/src/msg/async/dpdk/shared_ptr.h
@@ -0,0 +1,391 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:4; indent-tabs-mode:nil -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_LW_SHARED_PTR_H_
+#define CEPH_LW_SHARED_PTR_H_
+
+#include <utility>
+#include <type_traits>
+#include <functional>
+#include <iostream>
+
+// This header defines a shared pointer facility, lw_shared_ptr<>,
+// modeled after std::shared_ptr<>.
+//
+// Unlike std::shared_ptr<>, this implementation is not thread
+// safe: two pointers sharing the same object must not be used in
+// different threads.
+//
+// lw_shared_ptr<> is lightweight, with a lw_shared_ptr<>
+// occupying just one machine word, and adding just one word to the shared
+// object. However, it does not support polymorphism.
+//
+// It supports shared_from_this() via enable_lw_shared_from_this<>.
+//
+
+template <typename T>
+class lw_shared_ptr;
+
+template <typename T>
+class enable_lw_shared_from_this;
+
+template <typename T>
+class enable_shared_from_this;
+
+template <typename T, typename... A>
+lw_shared_ptr<T> make_lw_shared(A&&... a);
+
+template <typename T>
+lw_shared_ptr<T> make_lw_shared(T&& a);
+
+template <typename T>
+lw_shared_ptr<T> make_lw_shared(T& a);
+
+struct lw_shared_ptr_counter_base {
+    long _count = 0;
+};
+
+
+namespace internal {
+
+template <class T, class U>
+struct lw_shared_ptr_accessors;
+
+template <class T>
+struct lw_shared_ptr_accessors_esft;
+
+template <class T>
+struct lw_shared_ptr_accessors_no_esft;
+
+}
+
+
+// We want to support two use cases for shared_ptr<T>:
+//
+// 1. T is any type (primitive or class type)
+//
+// 2. T is a class type that inherits from enable_shared_from_this<T>.
+//
+// In the first case, we must wrap T in an object containing the counter,
+// since T may be a primitive type and cannot be a base class.
+//
+// In the second case, we want T to reach the counter through its
+// enable_shared_from_this<> base class, so that we can implement
+// shared_from_this().
+//
+// To implement those two conflicting requirements (T alongside its counter;
+// T inherits from an object containing the counter) we use std::conditional<>
+// and some accessor functions to select between two implementations.
+
+
+// CRTP from this to enable shared_from_this:
+template <typename T>
+class enable_lw_shared_from_this : private lw_shared_ptr_counter_base {
+    using ctor = T;
+protected:
+    enable_lw_shared_from_this() noexcept {}
+    enable_lw_shared_from_this(enable_lw_shared_from_this&&) noexcept {}
+    enable_lw_shared_from_this(const enable_lw_shared_from_this&) noexcept {}
+    enable_lw_shared_from_this& operator=(const enable_lw_shared_from_this&) noexcept { return *this; }
+    enable_lw_shared_from_this& operator=(enable_lw_shared_from_this&&) noexcept { return *this; }
+public:
+    lw_shared_ptr<T> shared_from_this();
+    lw_shared_ptr<const T> shared_from_this() const;
+
+    template <typename X>
+    friend class lw_shared_ptr;
+    template <typename X>
+    friend class ::internal::lw_shared_ptr_accessors_esft;
+    template <typename X, class Y>
+    friend class ::internal::lw_shared_ptr_accessors;
+};
+
+template <typename T>
+struct shared_ptr_no_esft : private lw_shared_ptr_counter_base {
+    T _value;
+
+    shared_ptr_no_esft() = default;
+    shared_ptr_no_esft(const T& x) : _value(x) {}
+    shared_ptr_no_esft(T&& x) : _value(std::move(x)) {}
+    template <typename... A>
+    shared_ptr_no_esft(A&&... a) : _value(std::forward<A>(a)...) 
{} + + template <typename X> + friend class lw_shared_ptr; + template <typename X> + friend class ::internal::lw_shared_ptr_accessors_no_esft; + template <typename X, class Y> + friend class ::internal::lw_shared_ptr_accessors; +}; + + +/// Extension point: the user may override this to change how \ref lw_shared_ptr objects are destroyed, +/// primarily so that incomplete classes can be used. +/// +/// Customizing the deleter requires that \c T be derived from \c enable_lw_shared_from_this<T>. +/// The specialization must be visible for all uses of \c lw_shared_ptr<T>. +/// +/// To customize, the template must have a `static void dispose(T*)` operator that disposes of +/// the object. +template <typename T> +struct lw_shared_ptr_deleter; // No generic implementation + +namespace internal { + +template <typename T> +struct lw_shared_ptr_accessors_esft { + using concrete_type = std::remove_const_t<T>; + static T* to_value(lw_shared_ptr_counter_base* counter) { + return static_cast<T*>(counter); + } + static void dispose(lw_shared_ptr_counter_base* counter) { + delete static_cast<T*>(counter); + } + static void instantiate_to_value(lw_shared_ptr_counter_base* p) { + // since to_value() is defined above, we don't need to do anything special + // to force-instantiate it + } +}; + +template <typename T> +struct lw_shared_ptr_accessors_no_esft { + using concrete_type = shared_ptr_no_esft<T>; + static T* to_value(lw_shared_ptr_counter_base* counter) { + return &static_cast<concrete_type*>(counter)->_value; + } + static void dispose(lw_shared_ptr_counter_base* counter) { + delete static_cast<concrete_type*>(counter); + } + static void instantiate_to_value(lw_shared_ptr_counter_base* p) { + // since to_value() is defined above, we don't need to do anything special + // to force-instantiate it + } +}; + +// Generic case: lw_shared_ptr_deleter<T> is not specialized, select +// implementation based on whether T inherits from enable_lw_shared_from_this<T>. +template <typename T, typename U = void> +struct lw_shared_ptr_accessors : std::conditional_t< + std::is_base_of<enable_lw_shared_from_this<T>, T>::value, + lw_shared_ptr_accessors_esft<T>, + lw_shared_ptr_accessors_no_esft<T>> { +}; + +// Overload when lw_shared_ptr_deleter<T> specialized +template <typename T> +struct lw_shared_ptr_accessors<T, std::void_t<decltype(lw_shared_ptr_deleter<T>{})>> { + using concrete_type = T; + static T* to_value(lw_shared_ptr_counter_base* counter); + static void dispose(lw_shared_ptr_counter_base* counter) { + lw_shared_ptr_deleter<T>::dispose(to_value(counter)); + } + static void instantiate_to_value(lw_shared_ptr_counter_base* p) { + // instantiate to_value(); must be defined by shared_ptr_incomplete.hh + to_value(p); + } +}; + +} + +template <typename T> +class lw_shared_ptr { + using accessors = ::internal::lw_shared_ptr_accessors<std::remove_const_t<T>>; + using concrete_type = typename accessors::concrete_type; + mutable lw_shared_ptr_counter_base* _p = nullptr; +private: + lw_shared_ptr(lw_shared_ptr_counter_base* p) noexcept : _p(p) { + if (_p) { + ++_p->_count; + } + } + template <typename... A> + static lw_shared_ptr make(A&&... 
a) { + auto p = new concrete_type(std::forward<A>(a)...); + accessors::instantiate_to_value(p); + return lw_shared_ptr(p); + } +public: + using element_type = T; + + lw_shared_ptr() noexcept = default; + lw_shared_ptr(std::nullptr_t) noexcept : lw_shared_ptr() {} + lw_shared_ptr(const lw_shared_ptr& x) noexcept : _p(x._p) { + if (_p) { + ++_p->_count; + } + } + lw_shared_ptr(lw_shared_ptr&& x) noexcept : _p(x._p) { + x._p = nullptr; + } + [[gnu::always_inline]] + ~lw_shared_ptr() { + if (_p && !--_p->_count) { + accessors::dispose(_p); + } + } + lw_shared_ptr& operator=(const lw_shared_ptr& x) noexcept { + if (_p != x._p) { + this->~lw_shared_ptr(); + new (this) lw_shared_ptr(x); + } + return *this; + } + lw_shared_ptr& operator=(lw_shared_ptr&& x) noexcept { + if (_p != x._p) { + this->~lw_shared_ptr(); + new (this) lw_shared_ptr(std::move(x)); + } + return *this; + } + lw_shared_ptr& operator=(std::nullptr_t) noexcept { + return *this = lw_shared_ptr(); + } + lw_shared_ptr& operator=(T&& x) noexcept { + this->~lw_shared_ptr(); + new (this) lw_shared_ptr(make_lw_shared<T>(std::move(x))); + return *this; + } + + T& operator*() const noexcept { return *accessors::to_value(_p); } + T* operator->() const noexcept { return accessors::to_value(_p); } + T* get() const noexcept { + if (_p) { + return accessors::to_value(_p); + } else { + return nullptr; + } + } + + long int use_count() const noexcept { + if (_p) { + return _p->_count; + } else { + return 0; + } + } + + operator lw_shared_ptr<const T>() const noexcept { + return lw_shared_ptr<const T>(_p); + } + + explicit operator bool() const noexcept { + return _p; + } + + bool owned() const noexcept { + return _p->_count == 1; + } + + bool operator==(const lw_shared_ptr<const T>& x) const { + return _p == x._p; + } + + bool operator!=(const lw_shared_ptr<const T>& x) const { + return !operator==(x); + } + + bool operator==(const lw_shared_ptr<std::remove_const_t<T>>& x) const { + return _p == x._p; + } + + bool operator!=(const lw_shared_ptr<std::remove_const_t<T>>& x) const { + return !operator==(x); + } + + bool operator<(const lw_shared_ptr<const T>& x) const { + return _p < x._p; + } + + bool operator<(const lw_shared_ptr<std::remove_const_t<T>>& x) const { + return _p < x._p; + } + + template <typename U> + friend class lw_shared_ptr; + + template <typename X, typename... A> + friend lw_shared_ptr<X> make_lw_shared(A&&...); + + template <typename U> + friend lw_shared_ptr<U> make_lw_shared(U&&); + + template <typename U> + friend lw_shared_ptr<U> make_lw_shared(U&); + + template <typename U> + friend class enable_lw_shared_from_this; +}; + +template <typename T, typename... A> +inline +lw_shared_ptr<T> make_lw_shared(A&&... 
a) {
+    return lw_shared_ptr<T>::make(std::forward<A>(a)...);
+}
+
+template <typename T>
+inline
+lw_shared_ptr<T> make_lw_shared(T&& a) {
+    return lw_shared_ptr<T>::make(std::move(a));
+}
+
+template <typename T>
+inline
+lw_shared_ptr<T> make_lw_shared(T& a) {
+    return lw_shared_ptr<T>::make(a);
+}
+
+template <typename T>
+inline
+lw_shared_ptr<T>
+enable_lw_shared_from_this<T>::shared_from_this() {
+    return lw_shared_ptr<T>(this);
+}
+
+template <typename T>
+inline
+lw_shared_ptr<const T>
+enable_lw_shared_from_this<T>::shared_from_this() const {
+    return lw_shared_ptr<const T>(const_cast<enable_lw_shared_from_this*>(this));
+}
+
+template <typename T>
+static inline
+std::ostream& operator<<(std::ostream& out, const lw_shared_ptr<T>& p) {
+    if (!p) {
+        return out << "null";
+    }
+    return out << *p;
+}
+
+namespace std {
+
+    template <typename T>
+    struct hash<lw_shared_ptr<T>> : private hash<T*> {
+        size_t operator()(const lw_shared_ptr<T>& p) const {
+            return hash<T*>::operator()(p.get());
+        }
+    };
+
+}
+
+#endif /* CEPH_LW_SHARED_PTR_H_ */
diff --git a/src/msg/async/dpdk/stream.h b/src/msg/async/dpdk/stream.h
new file mode 100644
index 00000000..1898e8f8
--- /dev/null
+++ b/src/msg/async/dpdk/stream.h
@@ -0,0 +1,155 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_STREAM_H_
+#define CEPH_MSG_STREAM_H_
+
+#include <exception>
+#include <cassert>
+
+// A stream<> is the producer side. It may call produce() as long
+// as the value returned from the previous invocation indicated
+// that the consumer accepted the data.
+// To signify no more data is available, call close().
+//
+// A subscription<> is the consumer side. It is created by a call
+// to stream::listen(). Calling subscription::start(),
+// which registers the data processing callback, starts processing
+// events. End-of-stream and error conditions can be observed
+// through done(), which reports errors as nonzero codes (this
+// port uses plain ints instead of futures).
+//
+// The consumer can pause generation of new data by returning a
+// positive integer; when it becomes ready, the producer
+// will resume processing.
+
+template <typename... T>
+class subscription;
+
+template <typename... T>
+class stream {
+  subscription<T...>* _sub = nullptr;
+  int done = 0;
+  bool ready = false;
+ public:
+  using next_fn = std::function<int (T...)>;
+  stream() = default;
+  stream(const stream&) = delete;
+  stream(stream&&) = delete;
+  ~stream() {
+    if (_sub) {
+      _sub->_stream = nullptr;
+    }
+  }
+
+  void operator=(const stream&) = delete;
+  void operator=(stream&&) = delete;
+
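+  // A minimal usage sketch (illustrative only):
+  //
+  //   stream<int> s;
+  //   auto sub = s.listen([] (int v) {
+  //     return 0;   // accept the value
+  //   });
+  //   if (s.started()) {
+  //     s.produce(42);
+  //   }
+  //   s.close();
+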
+  // Returns a subscription that reads values from this
+  // stream.
+  subscription<T...> listen() {
+    return subscription<T...>(this);
+  }
+
+  // Returns a subscription that reads values from this
+  // stream, and also sets up the listen function.
+  subscription<T...> listen(next_fn next) {
+    auto sub = subscription<T...>(this);
+    sub.start(std::move(next));
+    return sub;
+  }
+
+  // Returns true once the listener is ready to accept values.
+  // Check this before beginning to produce values; it remains
+  // true for the lifetime of the subscription.
+  bool started() {
+    return ready;
+  }
+
+  // Produce a value. Call only after started() returns true,
+  // and after a previous produce() has been accepted.
+  int produce(T... data) {
+    return _sub->_next(std::move(data)...);
+  }
+
+  // End the stream. Call only after started() returns true, and
+  // after a previous produce() has been accepted. No functions may
+  // be called after this.
+  void close() {
+    done = 1;
+  }
+
+  // Signal an error. Call only after started() returns true, and
+  // after a previous produce() has been accepted. No functions may
+  // be called after this.
+  void set_exception(int error) {
+    done = error;
+  }
+ private:
+  void start();
+  friend class subscription<T...>;
+};
+
+template <typename... T>
+class subscription {
+ public:
+  using next_fn = typename stream<T...>::next_fn;
+ private:
+  stream<T...>* _stream;
+  next_fn _next;
+ private:
+  explicit subscription(stream<T...>* s): _stream(s) {
+    ceph_assert(!_stream->_sub);
+    _stream->_sub = this;
+  }
+
+ public:
+  subscription(subscription&& x)
+    : _stream(x._stream), _next(std::move(x._next)) {
+    x._stream = nullptr;
+    if (_stream) {
+      _stream->_sub = this;
+    }
+  }
+  ~subscription() {
+    if (_stream) {
+      _stream->_sub = nullptr;
+    }
+  }
+
+  /// \brief Start receiving events from the stream.
+  ///
+  /// \param next Callback to call for each event
+  void start(std::function<int (T...)> next) {
+    _next = std::move(next);
+    _stream->ready = true;
+  }
+
+  // Returns nonzero once the stream has been closed, or when an
+  // error happens (in that case, the error code is held).
+  int done() {
+    return _stream->done;
+  }
+
+  friend class stream<T...>;
+};
+
+#endif /* CEPH_MSG_STREAM_H_ */
diff --git a/src/msg/async/dpdk/toeplitz.h b/src/msg/async/dpdk/toeplitz.h
new file mode 100644
index 00000000..3ca38808
--- /dev/null
+++ b/src/msg/async/dpdk/toeplitz.h
@@ -0,0 +1,92 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*-
+ * Copyright (c) 2010 David Malone <dwmalone@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef CEPH_MSG_TOEPLITZ_H_ +#define CEPH_MSG_TOEPLITZ_H_ + +#include <vector> + +using rss_key_type = std::vector<uint8_t>; + +// Mellanox Linux's driver key +static const rss_key_type default_rsskey_40bytes = { + 0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b, + 0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb, + 0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c, + 0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9, + 0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc +}; + +// Intel's i40e PMD default RSS key +static const rss_key_type default_rsskey_52bytes = { + 0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23, + 0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30, + 0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02, + 0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c, + 0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55, + 0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e, + 0x81, 0x15, 0x03, 0x66 +}; + +template<typename T> +static inline uint32_t toeplitz_hash(const rss_key_type& key, const T& data) +{ + uint32_t hash = 0, v; + u_int i, b; + + /* XXXRW: Perhaps an assertion about key length vs. data length? */ + + v = (key[0]<<24) + (key[1]<<16) + (key[2] <<8) + key[3]; + for (i = 0; i < data.size(); i++) { + for (b = 0; b < 8; b++) { + if (data[i] & (1<<(7-b))) + hash ^= v; + v <<= 1; + if ((i + 4) < key.size() && + (key[i+4] & (1<<(7-b)))) + v |= 1; + } + } + return (hash); +} +#endif diff --git a/src/msg/async/dpdk/transfer.h b/src/msg/async/dpdk/transfer.h new file mode 100644 index 00000000..599db5bd --- /dev/null +++ b/src/msg/async/dpdk/transfer.h @@ -0,0 +1,64 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
+ */ + +#ifndef CEPH_TRANSFER_H_ +#define CEPH_TRANSFER_H_ + +// Helper functions for copying or moving multiple objects in an exception +// safe manner, then destroying the sources. +// +// To transfer, call transfer_pass1(allocator, &from, &to) on all object pairs, +// (this copies the object from @from to @to). If no exceptions are encountered, +// call transfer_pass2(allocator, &from, &to). This destroys the object at the +// origin. If exceptions were encountered, simply destroy all copied objects. +// +// As an optimization, if the objects are moveable without throwing (noexcept) +// transfer_pass1() simply moves the objects and destroys the source, and +// transfer_pass2() does nothing. + +#include <type_traits> +#include <utility> + +template <typename T, typename Alloc> +inline void transfer_pass1(Alloc& a, T* from, T* to, + typename std::enable_if<std::is_nothrow_move_constructible<T>::value>::type* = nullptr) { + a.construct(to, std::move(*from)); + a.destroy(from); +} + +template <typename T, typename Alloc> +inline void transfer_pass2(Alloc& a, T* from, T* to, + typename std::enable_if<std::is_nothrow_move_constructible<T>::value>::type* = nullptr) { +} + +template <typename T, typename Alloc> +inline void transfer_pass1(Alloc& a, T* from, T* to, + typename std::enable_if<!std::is_nothrow_move_constructible<T>::value>::type* = nullptr) { + a.construct(to, *from); +} + +template <typename T, typename Alloc> +inline void transfer_pass2(Alloc& a, T* from, T* to, + typename std::enable_if<!std::is_nothrow_move_constructible<T>::value>::type* = nullptr) { + a.destroy(from); +} + +#endif /* CEPH_TRANSFER_H_ */ diff --git a/src/msg/async/frames_v2.cc b/src/msg/async/frames_v2.cc new file mode 100644 index 00000000..f047eb18 --- /dev/null +++ b/src/msg/async/frames_v2.cc @@ -0,0 +1,480 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2020 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "frames_v2.h" + +#include <ostream> + +#undef FMT_HEADER_ONLY +#define FMT_HEADER_ONLY 1 +#include "seastar/fmt/include/fmt/format.h" + +namespace ceph::msgr::v2 { + +// Unpads bufferlist to unpadded_len. +static void unpad_zero(bufferlist& bl, uint32_t unpadded_len) { + ceph_assert(bl.length() >= unpadded_len); + if (bl.length() > unpadded_len) { + bl.splice(unpadded_len, bl.length() - unpadded_len); + } +} + +// Discards trailing empty segments, unless there is just one segment. +// A frame always has at least one (possibly empty) segment. +static size_t calc_num_segments(const bufferlist segment_bls[], + size_t segment_count) { + ceph_assert(segment_count > 0 && segment_count <= MAX_NUM_SEGMENTS); + for (size_t i = segment_count; i-- > 0; ) { + if (segment_bls[i].length() > 0) { + return i + 1; + } + } + return 1; +} + +static void check_segment_crc(const bufferlist& segment_bl, + uint32_t expected_crc) { + uint32_t crc = segment_bl.crc32c(-1); + if (crc != expected_crc) { + throw FrameError(fmt::format( + "bad segment crc calculated={} expected={}", crc, expected_crc)); + } +} + +// Returns true if the frame is ready for dispatching, or false if +// it was aborted by the sender and must be dropped. 
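+// Any other combination of the aborted bits indicates a corrupt epilogue
+// and is rejected with a FrameError.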
+static bool check_epilogue_late_status(__u8 late_status) { + __u8 aborted = late_status & FRAME_LATE_STATUS_ABORTED_MASK; + if (aborted != FRAME_LATE_STATUS_ABORTED && + aborted != FRAME_LATE_STATUS_COMPLETE) { + throw FrameError(fmt::format("bad late_status")); + } + return aborted == FRAME_LATE_STATUS_COMPLETE; +} + +void FrameAssembler::fill_preamble(Tag tag, + preamble_block_t& preamble) const { + // FIPS zeroization audit 20191115: this memset is not security related. + ::memset(&preamble, 0, sizeof(preamble)); + + preamble.tag = static_cast<__u8>(tag); + for (size_t i = 0; i < m_descs.size(); i++) { + preamble.segments[i].length = m_descs[i].logical_len; + preamble.segments[i].alignment = m_descs[i].align; + } + preamble.num_segments = m_descs.size(); + preamble.crc = ceph_crc32c( + 0, reinterpret_cast<const unsigned char*>(&preamble), + sizeof(preamble) - sizeof(preamble.crc)); +} + +uint64_t FrameAssembler::get_frame_logical_len() const { + ceph_assert(!m_descs.empty()); + uint64_t logical_len = 0; + for (size_t i = 0; i < m_descs.size(); i++) { + logical_len += m_descs[i].logical_len; + } + return logical_len; +} + +uint64_t FrameAssembler::get_frame_onwire_len() const { + ceph_assert(!m_descs.empty()); + uint64_t onwire_len = get_preamble_onwire_len(); + for (size_t i = 0; i < m_descs.size(); i++) { + onwire_len += get_segment_onwire_len(i); + } + onwire_len += get_epilogue_onwire_len(); + return onwire_len; +} + +bufferlist FrameAssembler::asm_crc_rev0(const preamble_block_t& preamble, + bufferlist segment_bls[]) const { + epilogue_crc_rev0_block_t epilogue; + // FIPS zeroization audit 20191115: this memset is not security related. + ::memset(&epilogue, 0, sizeof(epilogue)); + + bufferlist frame_bl(sizeof(preamble) + sizeof(epilogue)); + frame_bl.append(reinterpret_cast<const char*>(&preamble), sizeof(preamble)); + for (size_t i = 0; i < m_descs.size(); i++) { + ceph_assert(segment_bls[i].length() == m_descs[i].logical_len); + epilogue.crc_values[i] = segment_bls[i].crc32c(-1); + if (segment_bls[i].length() > 0) { + frame_bl.claim_append(segment_bls[i]); + } + } + frame_bl.append(reinterpret_cast<const char*>(&epilogue), sizeof(epilogue)); + return frame_bl; +} + +bufferlist FrameAssembler::asm_secure_rev0(const preamble_block_t& preamble, + bufferlist segment_bls[]) const { + bufferlist preamble_bl(sizeof(preamble)); + preamble_bl.append(reinterpret_cast<const char*>(&preamble), + sizeof(preamble)); + + epilogue_secure_rev0_block_t epilogue; + // FIPS zeroization audit 20191115: this memset is not security related. 
+ ::memset(&epilogue, 0, sizeof(epilogue)); + bufferlist epilogue_bl(sizeof(epilogue)); + epilogue_bl.append(reinterpret_cast<const char*>(&epilogue), + sizeof(epilogue)); + + // preamble + MAX_NUM_SEGMENTS + epilogue + uint32_t onwire_lens[MAX_NUM_SEGMENTS + 2]; + onwire_lens[0] = preamble_bl.length(); + for (size_t i = 0; i < m_descs.size(); i++) { + onwire_lens[i + 1] = segment_bls[i].length(); // already padded + } + onwire_lens[m_descs.size() + 1] = epilogue_bl.length(); + m_crypto->tx->reset_tx_handler(onwire_lens, + onwire_lens + m_descs.size() + 2); + m_crypto->tx->authenticated_encrypt_update(preamble_bl); + for (size_t i = 0; i < m_descs.size(); i++) { + if (segment_bls[i].length() > 0) { + m_crypto->tx->authenticated_encrypt_update(segment_bls[i]); + } + } + m_crypto->tx->authenticated_encrypt_update(epilogue_bl); + return m_crypto->tx->authenticated_encrypt_final(); +} + +bufferlist FrameAssembler::asm_crc_rev1(const preamble_block_t& preamble, + bufferlist segment_bls[]) const { + epilogue_crc_rev1_block_t epilogue; + // FIPS zeroization audit 20191115: this memset is not security related. + ::memset(&epilogue, 0, sizeof(epilogue)); + epilogue.late_status |= FRAME_LATE_STATUS_COMPLETE; + + bufferlist frame_bl(sizeof(preamble) + FRAME_CRC_SIZE + sizeof(epilogue)); + frame_bl.append(reinterpret_cast<const char*>(&preamble), sizeof(preamble)); + + ceph_assert(segment_bls[0].length() == m_descs[0].logical_len); + if (segment_bls[0].length() > 0) { + uint32_t crc = segment_bls[0].crc32c(-1); + frame_bl.claim_append(segment_bls[0]); + encode(crc, frame_bl); + } + if (m_descs.size() == 1) { + return frame_bl; // no epilogue if only one segment + } + + for (size_t i = 1; i < m_descs.size(); i++) { + ceph_assert(segment_bls[i].length() == m_descs[i].logical_len); + epilogue.crc_values[i - 1] = segment_bls[i].crc32c(-1); + if (segment_bls[i].length() > 0) { + frame_bl.claim_append(segment_bls[i]); + } + } + frame_bl.append(reinterpret_cast<const char*>(&epilogue), sizeof(epilogue)); + return frame_bl; +} + +bufferlist FrameAssembler::asm_secure_rev1(const preamble_block_t& preamble, + bufferlist segment_bls[]) const { + bufferlist preamble_bl; + if (segment_bls[0].length() > FRAME_PREAMBLE_INLINE_SIZE) { + // first segment is partially inlined, inline buffer is full + preamble_bl.reserve(sizeof(preamble)); + preamble_bl.append(reinterpret_cast<const char*>(&preamble), + sizeof(preamble)); + segment_bls[0].splice(0, FRAME_PREAMBLE_INLINE_SIZE, &preamble_bl); + } else { + // first segment is fully inlined, inline buffer may need padding + uint32_t pad_len = FRAME_PREAMBLE_INLINE_SIZE - segment_bls[0].length(); + preamble_bl.reserve(sizeof(preamble) + pad_len); + preamble_bl.append(reinterpret_cast<const char*>(&preamble), + sizeof(preamble)); + preamble_bl.claim_append(segment_bls[0]); + if (pad_len > 0) { + preamble_bl.append_zero(pad_len); + } + } + + m_crypto->tx->reset_tx_handler({preamble_bl.length()}); + m_crypto->tx->authenticated_encrypt_update(preamble_bl); + auto frame_bl = m_crypto->tx->authenticated_encrypt_final(); + + if (segment_bls[0].length() > 0) { + m_crypto->tx->reset_tx_handler({segment_bls[0].length()}); + m_crypto->tx->authenticated_encrypt_update(segment_bls[0]); + auto tmp = m_crypto->tx->authenticated_encrypt_final(); + frame_bl.claim_append(tmp); + } + if (m_descs.size() == 1) { + return frame_bl; // no epilogue if only one segment + } + + epilogue_secure_rev1_block_t epilogue; + // FIPS zeroization audit 20191115: this memset is not security related. 
+ ::memset(&epilogue, 0, sizeof(epilogue)); + epilogue.late_status |= FRAME_LATE_STATUS_COMPLETE; + bufferlist epilogue_bl(sizeof(epilogue)); + epilogue_bl.append(reinterpret_cast<const char*>(&epilogue), + sizeof(epilogue)); + + // MAX_NUM_SEGMENTS - 1 + epilogue + uint32_t onwire_lens[MAX_NUM_SEGMENTS]; + for (size_t i = 1; i < m_descs.size(); i++) { + onwire_lens[i - 1] = segment_bls[i].length(); // already padded + } + onwire_lens[m_descs.size() - 1] = epilogue_bl.length(); + m_crypto->tx->reset_tx_handler(onwire_lens, onwire_lens + m_descs.size()); + for (size_t i = 1; i < m_descs.size(); i++) { + if (segment_bls[i].length() > 0) { + m_crypto->tx->authenticated_encrypt_update(segment_bls[i]); + } + } + m_crypto->tx->authenticated_encrypt_update(epilogue_bl); + auto tmp = m_crypto->tx->authenticated_encrypt_final(); + frame_bl.claim_append(tmp); + return frame_bl; +} + +bufferlist FrameAssembler::assemble_frame(Tag tag, bufferlist segment_bls[], + const uint16_t segment_aligns[], + size_t segment_count) { + m_descs.resize(calc_num_segments(segment_bls, segment_count)); + for (size_t i = 0; i < m_descs.size(); i++) { + m_descs[i].logical_len = segment_bls[i].length(); + m_descs[i].align = segment_aligns[i]; + } + + preamble_block_t preamble; + fill_preamble(tag, preamble); + + if (m_crypto->rx) { + for (size_t i = 0; i < m_descs.size(); i++) { + ceph_assert(segment_bls[i].length() == m_descs[i].logical_len); + // We're padding segments to biggest cipher's block size. Although + // AES-GCM can live without that as it's a stream cipher, we don't + // want to be fixed to stream ciphers only. + uint32_t padded_len = get_segment_padded_len(i); + if (padded_len > segment_bls[i].length()) { + uint32_t pad_len = padded_len - segment_bls[i].length(); + segment_bls[i].reserve(pad_len); + segment_bls[i].append_zero(pad_len); + } + } + if (m_is_rev1) { + return asm_secure_rev1(preamble, segment_bls); + } + return asm_secure_rev0(preamble, segment_bls); + } + if (m_is_rev1) { + return asm_crc_rev1(preamble, segment_bls); + } + return asm_crc_rev0(preamble, segment_bls); +} + +Tag FrameAssembler::disassemble_preamble(bufferlist& preamble_bl) { + if (m_crypto->rx) { + m_crypto->rx->reset_rx_handler(); + if (m_is_rev1) { + ceph_assert(preamble_bl.length() == FRAME_PREAMBLE_WITH_INLINE_SIZE + + get_auth_tag_len()); + m_crypto->rx->authenticated_decrypt_update_final(preamble_bl); + } else { + ceph_assert(preamble_bl.length() == sizeof(preamble_block_t)); + m_crypto->rx->authenticated_decrypt_update(preamble_bl); + } + } else { + ceph_assert(preamble_bl.length() == sizeof(preamble_block_t)); + } + + // I expect ceph_le32 will make the endian conversion for me. Passing + // everything through ::Decode is unnecessary. 
+ auto preamble = reinterpret_cast<const preamble_block_t*>( + preamble_bl.c_str()); + // check preamble crc before any further processing + uint32_t crc = ceph_crc32c( + 0, reinterpret_cast<const unsigned char*>(preamble), + sizeof(*preamble) - sizeof(preamble->crc)); + if (crc != preamble->crc) { + throw FrameError(fmt::format( + "bad preamble crc calculated={} expected={}", crc, preamble->crc)); + } + + // see calc_num_segments() + if (preamble->num_segments < 1 || + preamble->num_segments > MAX_NUM_SEGMENTS) { + throw FrameError(fmt::format( + "bad number of segments num_segments={}", preamble->num_segments)); + } + if (preamble->num_segments > 1 && + preamble->segments[preamble->num_segments - 1].length == 0) { + throw FrameError("last segment empty"); + } + + m_descs.resize(preamble->num_segments); + for (size_t i = 0; i < m_descs.size(); i++) { + m_descs[i].logical_len = preamble->segments[i].length; + m_descs[i].align = preamble->segments[i].alignment; + } + return static_cast<Tag>(preamble->tag); +} + +bool FrameAssembler::disasm_all_crc_rev0(bufferlist segment_bls[], + bufferlist& epilogue_bl) const { + ceph_assert(epilogue_bl.length() == sizeof(epilogue_crc_rev0_block_t)); + auto epilogue = reinterpret_cast<const epilogue_crc_rev0_block_t*>( + epilogue_bl.c_str()); + + for (size_t i = 0; i < m_descs.size(); i++) { + ceph_assert(segment_bls[i].length() == m_descs[i].logical_len); + check_segment_crc(segment_bls[i], epilogue->crc_values[i]); + } + return !(epilogue->late_flags & FRAME_LATE_FLAG_ABORTED); +} + +bool FrameAssembler::disasm_all_secure_rev0(bufferlist segment_bls[], + bufferlist& epilogue_bl) const { + for (size_t i = 0; i < m_descs.size(); i++) { + ceph_assert(segment_bls[i].length() == get_segment_padded_len(i)); + if (segment_bls[i].length() > 0) { + m_crypto->rx->authenticated_decrypt_update(segment_bls[i]); + unpad_zero(segment_bls[i], m_descs[i].logical_len); + } + } + + ceph_assert(epilogue_bl.length() == sizeof(epilogue_secure_rev0_block_t) + + get_auth_tag_len()); + m_crypto->rx->authenticated_decrypt_update_final(epilogue_bl); + auto epilogue = reinterpret_cast<const epilogue_secure_rev0_block_t*>( + epilogue_bl.c_str()); + return !(epilogue->late_flags & FRAME_LATE_FLAG_ABORTED); +} + +void FrameAssembler::disasm_first_crc_rev1(bufferlist& preamble_bl, + bufferlist& segment_bl) const { + ceph_assert(preamble_bl.length() == sizeof(preamble_block_t)); + if (m_descs[0].logical_len > 0) { + ceph_assert(segment_bl.length() == m_descs[0].logical_len + + FRAME_CRC_SIZE); + bufferlist::const_iterator it(&segment_bl, m_descs[0].logical_len); + uint32_t expected_crc; + decode(expected_crc, it); + segment_bl.splice(m_descs[0].logical_len, FRAME_CRC_SIZE); + check_segment_crc(segment_bl, expected_crc); + } else { + ceph_assert(segment_bl.length() == 0); + } +} + +bool FrameAssembler::disasm_remaining_crc_rev1(bufferlist segment_bls[], + bufferlist& epilogue_bl) const { + ceph_assert(epilogue_bl.length() == sizeof(epilogue_crc_rev1_block_t)); + auto epilogue = reinterpret_cast<const epilogue_crc_rev1_block_t*>( + epilogue_bl.c_str()); + + for (size_t i = 1; i < m_descs.size(); i++) { + ceph_assert(segment_bls[i].length() == m_descs[i].logical_len); + check_segment_crc(segment_bls[i], epilogue->crc_values[i - 1]); + } + return check_epilogue_late_status(epilogue->late_status); +} + +void FrameAssembler::disasm_first_secure_rev1(bufferlist& preamble_bl, + bufferlist& segment_bl) const { + ceph_assert(preamble_bl.length() == FRAME_PREAMBLE_WITH_INLINE_SIZE); + uint32_t 
padded_len = get_segment_padded_len(0); + if (padded_len > FRAME_PREAMBLE_INLINE_SIZE) { + ceph_assert(segment_bl.length() == padded_len + get_auth_tag_len() - + FRAME_PREAMBLE_INLINE_SIZE); + m_crypto->rx->reset_rx_handler(); + m_crypto->rx->authenticated_decrypt_update_final(segment_bl); + // prepend the inline buffer (already decrypted) to segment_bl + bufferlist tmp; + segment_bl.swap(tmp); + preamble_bl.splice(sizeof(preamble_block_t), FRAME_PREAMBLE_INLINE_SIZE, + &segment_bl); + segment_bl.claim_append(tmp); + } else { + ceph_assert(segment_bl.length() == 0); + preamble_bl.splice(sizeof(preamble_block_t), FRAME_PREAMBLE_INLINE_SIZE, + &segment_bl); + } + unpad_zero(segment_bl, m_descs[0].logical_len); + ceph_assert(segment_bl.length() == m_descs[0].logical_len); +} + +bool FrameAssembler::disasm_remaining_secure_rev1( + bufferlist segment_bls[], bufferlist& epilogue_bl) const { + m_crypto->rx->reset_rx_handler(); + for (size_t i = 1; i < m_descs.size(); i++) { + ceph_assert(segment_bls[i].length() == get_segment_padded_len(i)); + if (segment_bls[i].length() > 0) { + m_crypto->rx->authenticated_decrypt_update(segment_bls[i]); + unpad_zero(segment_bls[i], m_descs[i].logical_len); + } + } + + ceph_assert(epilogue_bl.length() == sizeof(epilogue_secure_rev1_block_t) + + get_auth_tag_len()); + m_crypto->rx->authenticated_decrypt_update_final(epilogue_bl); + auto epilogue = reinterpret_cast<const epilogue_secure_rev1_block_t*>( + epilogue_bl.c_str()); + return check_epilogue_late_status(epilogue->late_status); +} + +void FrameAssembler::disassemble_first_segment(bufferlist& preamble_bl, + bufferlist& segment_bl) const { + ceph_assert(!m_descs.empty()); + if (m_is_rev1) { + if (m_crypto->rx) { + disasm_first_secure_rev1(preamble_bl, segment_bl); + } else { + disasm_first_crc_rev1(preamble_bl, segment_bl); + } + } else { + // noop, everything is handled in disassemble_remaining_segments() + } +} + +bool FrameAssembler::disassemble_remaining_segments( + bufferlist segment_bls[], bufferlist& epilogue_bl) const { + ceph_assert(!m_descs.empty()); + if (m_is_rev1) { + if (m_descs.size() == 1) { + // no epilogue if only one segment + ceph_assert(epilogue_bl.length() == 0); + return true; + } + if (m_crypto->rx) { + return disasm_remaining_secure_rev1(segment_bls, epilogue_bl); + } + return disasm_remaining_crc_rev1(segment_bls, epilogue_bl); + } + if (m_crypto->rx) { + return disasm_all_secure_rev0(segment_bls, epilogue_bl); + } + return disasm_all_crc_rev0(segment_bls, epilogue_bl); +} + +std::ostream& operator<<(std::ostream& os, const FrameAssembler& frame_asm) { + if (!frame_asm.m_descs.empty()) { + os << frame_asm.get_preamble_onwire_len(); + for (size_t i = 0; i < frame_asm.m_descs.size(); i++) { + os << " + " << frame_asm.get_segment_onwire_len(i) + << " (logical " << frame_asm.m_descs[i].logical_len + << "/" << frame_asm.m_descs[i].align << ")"; + } + os << " + " << frame_asm.get_epilogue_onwire_len() << " "; + } + os << "rev1=" << frame_asm.m_is_rev1 + << " rx=" << frame_asm.m_crypto->rx.get() + << " tx=" << frame_asm.m_crypto->tx.get(); + return os; +} + +} // namespace ceph::msgr::v2 diff --git a/src/msg/async/frames_v2.h b/src/msg/async/frames_v2.h new file mode 100644 index 00000000..88fa4e1b --- /dev/null +++ b/src/msg/async/frames_v2.h @@ -0,0 +1,842 @@ +#ifndef _MSG_ASYNC_FRAMES_V2_ +#define _MSG_ASYNC_FRAMES_V2_ + +#include "include/types.h" +#include "common/Clock.h" +#include "crypto_onwire.h" +#include <array> +#include <iosfwd> +#include <utility> + +#include 
<boost/container/static_vector.hpp>
+
+/**
+ * Protocol V2 Frame Structures
+ *
+ * Documentation in: doc/dev/msgr2.rst
+ **/
+
+namespace ceph::msgr::v2 {
+
+// We require these features from any peer, period, in order to encode
+// an entity_addrvec_t.
+const uint64_t msgr2_required = CEPH_FEATUREMASK_MSG_ADDR2;
+
+// We additionally assume the peer has the below features *purely for
+// the purpose of encoding the frames themselves*.  The only complex
+// types in the frames are entity_addr_t and entity_addrvec_t, and we
+// specifically want the peer to understand the (new in nautilus)
+// TYPE_ANY.  We narrow this assumption to frames because we
+// expect there may be future clients (the kernel) that understand
+// msgr v2 and understand this encoding but don't necessarily have
+// everything else that SERVER_NAUTILUS implies.  Yes, a fresh feature
+// bit would be a cleaner approach, but those are scarce these days.
+const uint64_t msgr2_frame_assumed =
+		   msgr2_required |
+		   CEPH_FEATUREMASK_SERVER_NAUTILUS;
+
+enum class Tag : __u8 {
+  HELLO = 1,
+  AUTH_REQUEST,
+  AUTH_BAD_METHOD,
+  AUTH_REPLY_MORE,
+  AUTH_REQUEST_MORE,
+  AUTH_DONE,
+  AUTH_SIGNATURE,
+  CLIENT_IDENT,
+  SERVER_IDENT,
+  IDENT_MISSING_FEATURES,
+  SESSION_RECONNECT,
+  SESSION_RESET,
+  SESSION_RETRY,
+  SESSION_RETRY_GLOBAL,
+  SESSION_RECONNECT_OK,
+  WAIT,
+  MESSAGE,
+  KEEPALIVE2,
+  KEEPALIVE2_ACK,
+  ACK
+};
+
+struct segment_t {
+  // TODO: this will be dropped with support for `allocation policies`.
+  // We need them because of the rx_buffers zero-copy optimization.
+  static constexpr __le16 PAGE_SIZE_ALIGNMENT{4096};
+
+  static constexpr __le16 DEFAULT_ALIGNMENT = sizeof(void *);
+
+  ceph_le32 length;
+  ceph_le16 alignment;
+} __attribute__((packed));
+
+struct SegmentIndex {
+  struct Msg {
+    static constexpr std::size_t HEADER = 0;
+    static constexpr std::size_t FRONT = 1;
+    static constexpr std::size_t MIDDLE = 2;
+    static constexpr std::size_t DATA = 3;
+  };
+
+  struct Control {
+    static constexpr std::size_t PAYLOAD = 0;
+  };
+};
+
+static constexpr uint8_t CRYPTO_BLOCK_SIZE { 16 };
+
+static constexpr std::size_t MAX_NUM_SEGMENTS = 4;
+
+// V2 preamble consists of one or more preamble blocks depending on
+// the number of segments a particular frame needs.  Each block holds
+// up to MAX_NUM_SEGMENTS segments and has its own CRC.
+//
+// XXX: currently the multi-segment facility is NOT implemented.
+struct preamble_block_t {
+  // Tag.  For multi-segmented frames the value is the same
+  // between subsequent preamble blocks.
+  __u8 tag;
+
+  // Number of segments to go in the entire frame.  The first preamble
+  // block sets this to #segments, the second to
+  // #segments - MAX_NUM_SEGMENTS, the third to
+  // #segments - 2*MAX_NUM_SEGMENTS, and so on.
+  __u8 num_segments;
+
+  std::array<segment_t, MAX_NUM_SEGMENTS> segments;
+  __u8 _reserved[2];
+
+  // CRC32 for this single preamble block.
+ ceph_le32 crc; +} __attribute__((packed)); +static_assert(sizeof(preamble_block_t) % CRYPTO_BLOCK_SIZE == 0); +static_assert(std::is_standard_layout<preamble_block_t>::value); + +struct epilogue_crc_rev0_block_t { + __u8 late_flags; // FRAME_LATE_FLAG_ABORTED + std::array<ceph_le32, MAX_NUM_SEGMENTS> crc_values; +} __attribute__((packed)); +static_assert(std::is_standard_layout_v<epilogue_crc_rev0_block_t>); + +struct epilogue_crc_rev1_block_t { + __u8 late_status; // FRAME_LATE_STATUS_* + ceph_le32 crc_values[MAX_NUM_SEGMENTS - 1]; +} __attribute__((packed)); +static_assert(std::is_standard_layout_v<epilogue_crc_rev1_block_t>); + +struct epilogue_secure_rev0_block_t { + __u8 late_flags; // FRAME_LATE_FLAG_ABORTED + __u8 padding[CRYPTO_BLOCK_SIZE - sizeof(late_flags)]; +} __attribute__((packed)); +static_assert(sizeof(epilogue_secure_rev0_block_t) % CRYPTO_BLOCK_SIZE == 0); +static_assert(std::is_standard_layout_v<epilogue_secure_rev0_block_t>); + +// epilogue_secure_rev0_block_t with late_flags changed to late_status +struct epilogue_secure_rev1_block_t { + __u8 late_status; // FRAME_LATE_STATUS_* + __u8 padding[CRYPTO_BLOCK_SIZE - sizeof(late_status)]; +} __attribute__((packed)); +static_assert(sizeof(epilogue_secure_rev1_block_t) % CRYPTO_BLOCK_SIZE == 0); +static_assert(std::is_standard_layout_v<epilogue_secure_rev1_block_t>); + +static constexpr uint32_t FRAME_CRC_SIZE = 4; +static constexpr uint32_t FRAME_PREAMBLE_INLINE_SIZE = 48; +static_assert(FRAME_PREAMBLE_INLINE_SIZE % CRYPTO_BLOCK_SIZE == 0); +// just for performance, nothing should break otherwise +static_assert(sizeof(ceph_msg_header2) <= FRAME_PREAMBLE_INLINE_SIZE); +static constexpr uint32_t FRAME_PREAMBLE_WITH_INLINE_SIZE = + sizeof(preamble_block_t) + FRAME_PREAMBLE_INLINE_SIZE; + +// A frame can be aborted by the sender after transmitting the +// preamble and the first segment. The remainder of the frame +// is filled with zeros, up until the epilogue. +// +// This flag is for msgr2.0. Note that in crc mode, late_flags +// is not covered by any crc -- a single bit flip can result in +// a completed frame being dropped or in an aborted frame with +// garbage segment payloads being dispatched. +#define FRAME_LATE_FLAG_ABORTED (1<<0) + +// For msgr2.1, FRAME_LATE_STATUS_ABORTED has the same meaning +// as FRAME_LATE_FLAG_ABORTED and late_status replaces late_flags. +// Bit error detection in crc mode is achieved by using a 4-bit +// nibble per flag with two code words that are far apart in terms +// of Hamming Distance (HD=4, same as provided by CRC32-C for +// input lengths over ~5K). 
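+//
+// For intuition: ABORTED (0x1 = 0b0001) and COMPLETE (0xe = 0b1110)
+// differ in all four bits of the low nibble, so flipping up to three
+// bits can never turn one valid code word into the other -- any such
+// corruption yields a value that check_epilogue_late_status() in
+// frames_v2.cc rejects with a FrameError instead of silently
+// misinterpreting the frame.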
+#define FRAME_LATE_STATUS_ABORTED         0x1
+#define FRAME_LATE_STATUS_COMPLETE        0xe
+#define FRAME_LATE_STATUS_ABORTED_MASK    0xf
+
+#define FRAME_LATE_STATUS_RESERVED_TRUE   0x10
+#define FRAME_LATE_STATUS_RESERVED_FALSE  0xe0
+#define FRAME_LATE_STATUS_RESERVED_MASK   0xf0
+
+struct FrameError : std::runtime_error {
+  using runtime_error::runtime_error;
+};
+
+class FrameAssembler {
+public:
+  // crypto must be non-null
+  FrameAssembler(const ceph::crypto::onwire::rxtx_t* crypto, bool is_rev1)
+      : m_crypto(crypto), m_is_rev1(is_rev1) {}
+
+  void set_is_rev1(bool is_rev1) {
+    m_descs.clear();
+    m_is_rev1 = is_rev1;
+  }
+
+  bool get_is_rev1() {
+    return m_is_rev1;
+  }
+
+  size_t get_num_segments() const {
+    ceph_assert(!m_descs.empty());
+    return m_descs.size();
+  }
+
+  uint32_t get_segment_logical_len(size_t seg_idx) const {
+    ceph_assert(seg_idx < m_descs.size());
+    return m_descs[seg_idx].logical_len;
+  }
+
+  uint16_t get_segment_align(size_t seg_idx) const {
+    ceph_assert(seg_idx < m_descs.size());
+    return m_descs[seg_idx].align;
+  }
+
+  // Preamble:
+  //
+  //   preamble_block_t
+  //   [preamble inline buffer + auth tag -- only in msgr2.1 secure mode]
+  //
+  // The preamble is generated unconditionally.
+  //
+  // In msgr2.1 secure mode, the first segment is inlined into the
+  // preamble inline buffer, either fully or partially.
+  uint32_t get_preamble_onwire_len() const {
+    if (m_is_rev1 && m_crypto->rx) {
+      return FRAME_PREAMBLE_WITH_INLINE_SIZE + get_auth_tag_len();
+    }
+    return sizeof(preamble_block_t);
+  }
+
+  // Segment:
+  //
+  //   segment payload
+  //   [zero padding -- only in secure mode]
+  //   [crc or auth tag -- only in msgr2.1, only for the first segment]
+  //
+  // For an empty segment, nothing is generated.  In msgr2.1 secure
+  // mode, if the first segment gets fully inlined into the preamble
+  // inline buffer, it is considered empty.
+  uint32_t get_segment_onwire_len(size_t seg_idx) const {
+    ceph_assert(seg_idx < m_descs.size());
+    if (m_crypto->rx) {
+      uint32_t padded_len = get_segment_padded_len(seg_idx);
+      if (m_is_rev1 && seg_idx == 0) {
+        if (padded_len > FRAME_PREAMBLE_INLINE_SIZE) {
+          return padded_len + get_auth_tag_len() - FRAME_PREAMBLE_INLINE_SIZE;
+        }
+        return 0;
+      }
+      return padded_len;
+    }
+    if (m_is_rev1 && seg_idx == 0 && m_descs[0].logical_len > 0) {
+      return m_descs[0].logical_len + FRAME_CRC_SIZE;
+    }
+    return m_descs[seg_idx].logical_len;
+  }
+
+  // Epilogue:
+  //
+  //   epilogue_*_block_t
+  //   [auth tag -- only in secure mode]
+  //
+  // For msgr2.0, the epilogue is generated unconditionally.  In
+  // crc mode, it stores crcs for all segments; the preamble is
+  // covered by its own crc.  In secure mode, the epilogue auth tag
+  // covers the whole frame.
+  //
+  // For msgr2.1, the epilogue is generated only if the frame has
+  // more than one segment (i.e. at least one of second to fourth
+  // segments is not empty).  In crc mode, it stores crcs for
+  // second to fourth segments; the preamble and the first segment
+  // are covered by their own crcs.  In secure mode, the epilogue
+  // auth tag covers second to fourth segments; the preamble and the
+  // first segment (if not fully inlined into the preamble inline
+  // buffer) are covered by their own auth tags.
+  //
+  // Note that the auth tag format is an implementation detail of a
+  // particular cipher.  FrameAssembler is concerned only with where
+  // the auth tag is placed (at the end of the ciphertext) and how
+  // long it is (RxHandler::get_extra_size_at_final()).
This is to
+  // provide room for other encryption algorithms: currently we use
+  // AES-128-GCM with 16-byte tags, but it is possible to switch to
+  // e.g. AES-128-CBC + HMAC-SHA512 without affecting the protocol
+  // (except for the cipher negotiation, of course).
+  //
+  // Additionally, each variant of the epilogue contains either a
+  // late_flags or a late_status field that directs handling of frames
+  // with more than one segment.
+  uint32_t get_epilogue_onwire_len() const {
+    ceph_assert(!m_descs.empty());
+    if (m_is_rev1 && m_descs.size() == 1) {
+      return 0;
+    }
+    if (m_crypto->rx) {
+      return (m_is_rev1 ? sizeof(epilogue_secure_rev1_block_t) :
+                  sizeof(epilogue_secure_rev0_block_t)) + get_auth_tag_len();
+    }
+    return m_is_rev1 ? sizeof(epilogue_crc_rev1_block_t) :
+               sizeof(epilogue_crc_rev0_block_t);
+  }
+
+  uint64_t get_frame_logical_len() const;
+  uint64_t get_frame_onwire_len() const;
+
+  bufferlist assemble_frame(Tag tag, bufferlist segment_bls[],
+                            const uint16_t segment_aligns[],
+                            size_t segment_count);
+
+  Tag disassemble_preamble(bufferlist& preamble_bl);
+
+  // Like msgr1, and unlike msgr2.0, msgr2.1 allows interpreting the
+  // first segment before reading in the rest of the frame.
+  //
+  // For msgr2.1 (set_is_rev1(true)), you may:
+  //
+  // - read in the first segment
+  // - call disassemble_first_segment()
+  // - use the contents of the first segment, for example to
+  //   look up user-provided buffers based on ceph_msg_header2::tid
+  // - read in the remaining segments, possibly directly into
+  //   user-provided buffers
+  // - read in epilogue
+  // - call disassemble_remaining_segments()
+  //
+  // For msgr2.0 (set_is_rev1(false)), disassemble_first_segment() is
+  // a noop.  To accommodate, disassemble_remaining_segments() always
+  // takes all segments and skips over the first segment in the msgr2.1
+  // case.  You must:
+  //
+  // - read in all segments
+  // - read in epilogue
+  // - call disassemble_remaining_segments()
+  //
+  // disassemble_remaining_segments() returns true if the frame is
+  // ready for dispatching, or false if it was aborted by the sender
+  // and must be dropped.
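+  //
+  // Illustrative sketch of the msgr2.1 sequence above.  fa is a
+  // FrameAssembler configured via set_is_rev1(true); read_exactly()
+  // is a hypothetical blocking read helper, not part of this API;
+  // error handling is omitted:
+  //
+  //   bufferlist preamble_bl = read_exactly(fa.get_preamble_onwire_len());
+  //   Tag tag = fa.disassemble_preamble(preamble_bl);
+  //   bufferlist segment_bls[MAX_NUM_SEGMENTS];
+  //   segment_bls[0] = read_exactly(fa.get_segment_onwire_len(0));
+  //   fa.disassemble_first_segment(preamble_bl, segment_bls[0]);
+  //   for (size_t i = 1; i < fa.get_num_segments(); i++) {
+  //     segment_bls[i] = read_exactly(fa.get_segment_onwire_len(i));
+  //   }
+  //   bufferlist epilogue_bl = read_exactly(fa.get_epilogue_onwire_len());
+  //   if (!fa.disassemble_remaining_segments(segment_bls, epilogue_bl)) {
+  //     return;  // aborted by the sender -- drop the frame
+  //   }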
+  void disassemble_first_segment(bufferlist& preamble_bl,
+                                 bufferlist& segment_bl) const;
+  bool disassemble_remaining_segments(bufferlist segment_bls[],
+                                      bufferlist& epilogue_bl) const;
+
+private:
+  struct segment_desc_t {
+    uint32_t logical_len;
+    uint16_t align;
+  };
+
+  uint32_t get_segment_padded_len(size_t seg_idx) const {
+    return p2roundup<uint32_t>(m_descs[seg_idx].logical_len,
+                               CRYPTO_BLOCK_SIZE);
+  }
+
+  uint32_t get_auth_tag_len() const {
+    return m_crypto->rx->get_extra_size_at_final();
+  }
+
+  bufferlist asm_crc_rev0(const preamble_block_t& preamble,
+                          bufferlist segment_bls[]) const;
+  bufferlist asm_secure_rev0(const preamble_block_t& preamble,
+                             bufferlist segment_bls[]) const;
+  bufferlist asm_crc_rev1(const preamble_block_t& preamble,
+                          bufferlist segment_bls[]) const;
+  bufferlist asm_secure_rev1(const preamble_block_t& preamble,
+                             bufferlist segment_bls[]) const;
+
+  bool disasm_all_crc_rev0(bufferlist segment_bls[],
+                           bufferlist& epilogue_bl) const;
+  bool disasm_all_secure_rev0(bufferlist segment_bls[],
+                              bufferlist& epilogue_bl) const;
+  void disasm_first_crc_rev1(bufferlist& preamble_bl,
+                             bufferlist& segment_bl) const;
+  bool disasm_remaining_crc_rev1(bufferlist segment_bls[],
+                                 bufferlist& epilogue_bl) const;
+  void disasm_first_secure_rev1(bufferlist& preamble_bl,
+                                bufferlist& segment_bl) const;
+  bool disasm_remaining_secure_rev1(bufferlist segment_bls[],
+                                    bufferlist& epilogue_bl) const;
+
+  void fill_preamble(Tag tag, preamble_block_t& preamble) const;
+  friend std::ostream& operator<<(std::ostream& os,
+                                  const FrameAssembler& frame_asm);
+
+  boost::container::static_vector<segment_desc_t, MAX_NUM_SEGMENTS> m_descs;
+  const ceph::crypto::onwire::rxtx_t* m_crypto;
+  bool m_is_rev1;  // msgr2.1?
+};
+
+template <class T, uint16_t... SegmentAlignmentVs>
+struct Frame {
+  static constexpr size_t SegmentsNumV = sizeof...(SegmentAlignmentVs);
+  static_assert(SegmentsNumV > 0 && SegmentsNumV <= MAX_NUM_SEGMENTS);
+protected:
+  std::array<ceph::bufferlist, SegmentsNumV> segments;
+
+private:
+  static constexpr std::array<uint16_t, SegmentsNumV> alignments {
+    SegmentAlignmentVs...
+  };
+
+public:
+  ceph::bufferlist get_buffer(FrameAssembler& tx_frame_asm) {
+    auto bl = tx_frame_asm.assemble_frame(T::tag, segments.data(),
+                                          alignments.data(), SegmentsNumV);
+    ceph_assert(bl.length() == tx_frame_asm.get_frame_onwire_len());
+    return bl;
+  }
+};
+
+// ControlFrames are used to manage transceiver state (like connections) and
+// orchestrate transfers of MessageFrames.  They use only a single segment
+// with marshalling facilities -- derived classes specify the frame structure
+// through the Args pack while ControlFrame provides the common encode/decode
+// machinery.
+template <class C, typename... Args>
+class ControlFrame : public Frame<C, segment_t::DEFAULT_ALIGNMENT /* single segment */> {
+protected:
+  ceph::bufferlist &get_payload_segment() {
+    return this->segments[SegmentIndex::Control::PAYLOAD];
+  }
+
+  // this tuple is only used when decoding values from a payload segment
+  std::tuple<Args...> _values;
+
+  // FIXME: for now, we assume specific features for the purposes of encoding
+  // the frames themselves (*not* messages in message frames!).
+ uint64_t features = msgr2_frame_assumed; + + template <typename T> + inline void _encode_payload_each(T &t) { + if constexpr (std::is_same<T, std::vector<uint32_t> const>()) { + encode((uint32_t)t.size(), this->get_payload_segment(), features); + for (const auto &elem : t) { + encode(elem, this->get_payload_segment(), features); + } + } else { + encode(t, this->get_payload_segment(), features); + } + } + + template <typename T> + inline void _decode_payload_each(T &t, bufferlist::const_iterator &ti) const { + if constexpr (std::is_same<T, std::vector<uint32_t>>()) { + uint32_t size; + decode(size, ti); + t.resize(size); + for (uint32_t i = 0; i < size; ++i) { + decode(t[i], ti); + } + } else { + decode(t, ti); + } + } + + template <std::size_t... Is> + inline void _decode_payload(bufferlist::const_iterator &ti, + std::index_sequence<Is...>) const { + (_decode_payload_each((Args &)std::get<Is>(_values), ti), ...); + } + + template <std::size_t N> + inline decltype(auto) get_val() { + return std::get<N>(_values); + } + + ControlFrame() + : Frame<C, segment_t::DEFAULT_ALIGNMENT /* single segment */>() { + } + + void _encode(const Args &... args) { + (_encode_payload_each(args), ...); + } + + void _decode(const ceph::bufferlist &bl) { + auto ti = bl.cbegin(); + _decode_payload(ti, std::index_sequence_for<Args...>()); + } + +public: + static C Encode(const Args &... args) { + C c; + c._encode(args...); + return c; + } + + static C Decode(const ceph::bufferlist &payload) { + C c; + c._decode(payload); + return c; + } +}; + +struct HelloFrame : public ControlFrame<HelloFrame, + uint8_t, // entity type + entity_addr_t> { // peer address + static const Tag tag = Tag::HELLO; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline uint8_t &entity_type() { return get_val<0>(); } + inline entity_addr_t &peer_addr() { return get_val<1>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct AuthRequestFrame : public ControlFrame<AuthRequestFrame, + uint32_t, // auth method + vector<uint32_t>, // preferred modes + bufferlist> { // auth payload + static const Tag tag = Tag::AUTH_REQUEST; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline uint32_t &method() { return get_val<0>(); } + inline vector<uint32_t> &preferred_modes() { return get_val<1>(); } + inline bufferlist &auth_payload() { return get_val<2>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct AuthBadMethodFrame : public ControlFrame<AuthBadMethodFrame, + uint32_t, // method + int32_t, // result + std::vector<uint32_t>, // allowed methods + std::vector<uint32_t>> { // allowed modes + static const Tag tag = Tag::AUTH_BAD_METHOD; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline uint32_t &method() { return get_val<0>(); } + inline int32_t &result() { return get_val<1>(); } + inline std::vector<uint32_t> &allowed_methods() { return get_val<2>(); } + inline std::vector<uint32_t> &allowed_modes() { return get_val<3>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct AuthReplyMoreFrame : public ControlFrame<AuthReplyMoreFrame, + bufferlist> { // auth payload + static const Tag tag = Tag::AUTH_REPLY_MORE; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline bufferlist &auth_payload() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct AuthRequestMoreFrame : public ControlFrame<AuthRequestMoreFrame, + bufferlist> { // auth payload + static const Tag tag = Tag::AUTH_REQUEST_MORE; + using 
ControlFrame::Encode; + using ControlFrame::Decode; + + inline bufferlist &auth_payload() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct AuthDoneFrame : public ControlFrame<AuthDoneFrame, + uint64_t, // global id + uint32_t, // connection mode + bufferlist> { // auth method payload + static const Tag tag = Tag::AUTH_DONE; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline uint64_t &global_id() { return get_val<0>(); } + inline uint32_t &con_mode() { return get_val<1>(); } + inline bufferlist &auth_payload() { return get_val<2>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct AuthSignatureFrame + : public ControlFrame<AuthSignatureFrame, + sha256_digest_t> { + static const Tag tag = Tag::AUTH_SIGNATURE; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline sha256_digest_t &signature() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct ClientIdentFrame + : public ControlFrame<ClientIdentFrame, + entity_addrvec_t, // my addresses + entity_addr_t, // target address + int64_t, // global_id + uint64_t, // global seq + uint64_t, // supported features + uint64_t, // required features + uint64_t, // flags + uint64_t> { // client cookie + static const Tag tag = Tag::CLIENT_IDENT; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline entity_addrvec_t &addrs() { return get_val<0>(); } + inline entity_addr_t &target_addr() { return get_val<1>(); } + inline int64_t &gid() { return get_val<2>(); } + inline uint64_t &global_seq() { return get_val<3>(); } + inline uint64_t &supported_features() { return get_val<4>(); } + inline uint64_t &required_features() { return get_val<5>(); } + inline uint64_t &flags() { return get_val<6>(); } + inline uint64_t &cookie() { return get_val<7>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct ServerIdentFrame + : public ControlFrame<ServerIdentFrame, + entity_addrvec_t, // my addresses + int64_t, // global_id + uint64_t, // global seq + uint64_t, // supported features + uint64_t, // required features + uint64_t, // flags + uint64_t> { // server cookie + static const Tag tag = Tag::SERVER_IDENT; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline entity_addrvec_t &addrs() { return get_val<0>(); } + inline int64_t &gid() { return get_val<1>(); } + inline uint64_t &global_seq() { return get_val<2>(); } + inline uint64_t &supported_features() { return get_val<3>(); } + inline uint64_t &required_features() { return get_val<4>(); } + inline uint64_t &flags() { return get_val<5>(); } + inline uint64_t &cookie() { return get_val<6>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct ReconnectFrame + : public ControlFrame<ReconnectFrame, + entity_addrvec_t, // my addresses + uint64_t, // client cookie + uint64_t, // server cookie + uint64_t, // global sequence + uint64_t, // connect sequence + uint64_t> { // message sequence + static const Tag tag = Tag::SESSION_RECONNECT; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline entity_addrvec_t &addrs() { return get_val<0>(); } + inline uint64_t &client_cookie() { return get_val<1>(); } + inline uint64_t &server_cookie() { return get_val<2>(); } + inline uint64_t &global_seq() { return get_val<3>(); } + inline uint64_t &connect_seq() { return get_val<4>(); } + inline uint64_t &msg_seq() { return get_val<5>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct ResetFrame : public 
ControlFrame<ResetFrame, + bool> { // full reset + static const Tag tag = Tag::SESSION_RESET; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline bool &full() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct RetryFrame : public ControlFrame<RetryFrame, + uint64_t> { // connection seq + static const Tag tag = Tag::SESSION_RETRY; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline uint64_t &connect_seq() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct RetryGlobalFrame : public ControlFrame<RetryGlobalFrame, + uint64_t> { // global seq + static const Tag tag = Tag::SESSION_RETRY_GLOBAL; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline uint64_t &global_seq() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct WaitFrame : public ControlFrame<WaitFrame> { + static const Tag tag = Tag::WAIT; + using ControlFrame::Encode; + using ControlFrame::Decode; + +protected: + using ControlFrame::ControlFrame; +}; + +struct ReconnectOkFrame : public ControlFrame<ReconnectOkFrame, + uint64_t> { // message seq + static const Tag tag = Tag::SESSION_RECONNECT_OK; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline uint64_t &msg_seq() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct IdentMissingFeaturesFrame + : public ControlFrame<IdentMissingFeaturesFrame, + uint64_t> { // missing features mask + static const Tag tag = Tag::IDENT_MISSING_FEATURES; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline uint64_t &features() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct KeepAliveFrame : public ControlFrame<KeepAliveFrame, + utime_t> { // timestamp + static const Tag tag = Tag::KEEPALIVE2; + using ControlFrame::Encode; + using ControlFrame::Decode; + + static KeepAliveFrame Encode() { + return KeepAliveFrame::Encode(ceph_clock_now()); + } + + inline utime_t ×tamp() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct KeepAliveFrameAck : public ControlFrame<KeepAliveFrameAck, + utime_t> { // ack timestamp + static const Tag tag = Tag::KEEPALIVE2_ACK; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline utime_t ×tamp() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct AckFrame : public ControlFrame<AckFrame, + uint64_t> { // message sequence + static const Tag tag = Tag::ACK; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline uint64_t &seq() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +using segment_bls_t = + boost::container::static_vector<bufferlist, MAX_NUM_SEGMENTS>; + +// This class is used for encoding/decoding header of the message frame. +// Body is processed almost independently with the sole junction point +// being the `extra_payload_len` passed to get_buffer(). 
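+//
+// For illustration, a TX-side sketch: tx_frame_asm is an already
+// negotiated FrameAssembler, and header2/front/middle/data come from
+// the Message being sent (names here are illustrative only):
+//
+//   MessageFrame frame = MessageFrame::Encode(header2, front, middle, data);
+//   bufferlist onwire_bl = frame.get_buffer(tx_frame_asm);
+//   // onwire_bl now carries preamble + segments + epilogue, CRC'ed or
+//   // encrypted according to the negotiated mode, ready for the wire.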
+struct MessageFrame : public Frame<MessageFrame,
+                                   /* four segments */
+                                   segment_t::DEFAULT_ALIGNMENT,
+                                   segment_t::DEFAULT_ALIGNMENT,
+                                   segment_t::DEFAULT_ALIGNMENT,
+                                   segment_t::PAGE_SIZE_ALIGNMENT> {
+  static const Tag tag = Tag::MESSAGE;
+
+  static MessageFrame Encode(const ceph_msg_header2 &msg_header,
+                             const ceph::bufferlist &front,
+                             const ceph::bufferlist &middle,
+                             const ceph::bufferlist &data) {
+    MessageFrame f;
+    f.segments[SegmentIndex::Msg::HEADER].append(
+        reinterpret_cast<const char*>(&msg_header), sizeof(msg_header));
+
+    f.segments[SegmentIndex::Msg::FRONT] = front;
+    f.segments[SegmentIndex::Msg::MIDDLE] = middle;
+    f.segments[SegmentIndex::Msg::DATA] = data;
+
+    return f;
+  }
+
+  static MessageFrame Decode(segment_bls_t& recv_segments) {
+    MessageFrame f;
+    // transfer the segments' bufferlists.  If a MessageFrame contains
+    // fewer than SegmentsNumV segments, the missing ones will be seen
+    // as zeroed.
+    for (__u8 idx = 0; idx < std::size(recv_segments); idx++) {
+      f.segments[idx] = std::move(recv_segments[idx]);
+    }
+    return f;
+  }
+
+  inline const ceph_msg_header2 &header() {
+    auto& hdrbl = segments[SegmentIndex::Msg::HEADER];
+    return reinterpret_cast<const ceph_msg_header2&>(*hdrbl.c_str());
+  }
+
+  ceph::bufferlist &front() {
+    return segments[SegmentIndex::Msg::FRONT];
+  }
+
+  ceph::bufferlist &middle() {
+    return segments[SegmentIndex::Msg::MIDDLE];
+  }
+
+  ceph::bufferlist &data() {
+    return segments[SegmentIndex::Msg::DATA];
+  }
+
+  uint32_t front_len() const {
+    return segments[SegmentIndex::Msg::FRONT].length();
+  }
+
+  uint32_t middle_len() const {
+    return segments[SegmentIndex::Msg::MIDDLE].length();
+  }
+
+  uint32_t data_len() const {
+    return segments[SegmentIndex::Msg::DATA].length();
+  }
+
+protected:
+  using Frame::Frame;
+};
+
+} // namespace ceph::msgr::v2
+
+#endif // _MSG_ASYNC_FRAMES_V2_
diff --git a/src/msg/async/net_handler.cc b/src/msg/async/net_handler.cc
new file mode 100644
index 00000000..2b4e646d
--- /dev/null
+++ b/src/msg/async/net_handler.cc
@@ -0,0 +1,233 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+
+#include "net_handler.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "include/compat.h"
+#include "include/sock_compat.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "NetHandler "
+
+namespace ceph{
+
+int NetHandler::create_socket(int domain, bool reuse_addr)
+{
+  int s;
+  int r = 0;
+
+  if ((s = socket_cloexec(domain, SOCK_STREAM, 0)) == -1) {
+    r = errno;
+    lderr(cct) << __func__ << " couldn't create socket " << cpp_strerror(r) << dendl;
+    return -r;
+  }
+
+#if !defined(__FreeBSD__)
+  /* Make sure connection-intensive things like the benchmark
+   * will be able to close/open sockets a zillion times */
+  if (reuse_addr) {
+    int on = 1;
+    if (::setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) {
+      r = errno;
+      lderr(cct) << __func__ << " setsockopt SO_REUSEADDR failed: "
+                 << strerror(r) << dendl;
+      close(s);
+      return -r;
+    }
+  }
+#endif
+
+  return s;
+}
+
+int NetHandler::set_nonblock(int sd)
+{
+  int flags;
+  int r = 0;
+
+  /* Set the socket nonblocking.
+   * Note that fcntl(2) for F_GETFL and F_SETFL can't be
+   * interrupted by a signal. */
+  if ((flags = fcntl(sd, F_GETFL)) < 0 ) {
+    r = errno;
+    lderr(cct) << __func__ << " fcntl(F_GETFL) failed: " << cpp_strerror(r) << dendl;
+    return -r;
+  }
+  if (fcntl(sd, F_SETFL, flags | O_NONBLOCK) < 0) {
+    r = errno;
+    lderr(cct) << __func__ << " fcntl(F_SETFL,O_NONBLOCK): " << cpp_strerror(r) << dendl;
+    return -r;
+  }
+
+  return 0;
+}
+
+int NetHandler::set_socket_options(int sd, bool nodelay, int size)
+{
+  int r = 0;
+  // disable Nagle algorithm?
+  if (nodelay) {
+    int flag = 1;
+    r = ::setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(flag));
+    if (r < 0) {
+      r = errno;
+      ldout(cct, 0) << "couldn't set TCP_NODELAY: " << cpp_strerror(r) << dendl;
+    }
+  }
+  if (size) {
+    r = ::setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (void*)&size, sizeof(size));
+    if (r < 0) {
+      r = errno;
+      ldout(cct, 0) << "couldn't set SO_RCVBUF to " << size << ": " << cpp_strerror(r) << dendl;
+    }
+  }
+
+  // block SIGPIPE
+#ifdef CEPH_USE_SO_NOSIGPIPE
+  int val = 1;
+  r = ::setsockopt(sd, SOL_SOCKET, SO_NOSIGPIPE, (void*)&val, sizeof(val));
+  if (r) {
+    r = errno;
+    ldout(cct,0) << "couldn't set SO_NOSIGPIPE: " << cpp_strerror(r) << dendl;
+  }
+#endif
+  return -r;
+}
+
+void NetHandler::set_priority(int sd, int prio, int domain)
+{
+#ifdef SO_PRIORITY
+  if (prio < 0) {
+    return;
+  }
+  int r = -1;
+#ifdef IPTOS_CLASS_CS6
+  int iptos = IPTOS_CLASS_CS6;
+  switch (domain) {
+  case AF_INET:
+    r = ::setsockopt(sd, IPPROTO_IP, IP_TOS, &iptos, sizeof(iptos));
+    break;
+  case AF_INET6:
+    r = ::setsockopt(sd, IPPROTO_IPV6, IPV6_TCLASS, &iptos, sizeof(iptos));
+    break;
+  default:
+    lderr(cct) << "couldn't set ToS of unknown family (" << domain << ")"
+               << " to " << iptos << dendl;
+    return;
+  }
+  if (r < 0) {
+    r = errno;
+    ldout(cct,0) << "couldn't set TOS to " << iptos
+                 << ": " << cpp_strerror(r) << dendl;
+  }
+
+#endif // IPTOS_CLASS_CS6
+  // setsockopt(IPTOS_CLASS_CS6) sets the priority of the socket as 0.
+  // See http://goo.gl/QWhvsD and http://goo.gl/laTbjT
+  // We need to call setsockopt(SO_PRIORITY) after it.
+ r = ::setsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio)); + if (r < 0) { + r = errno; + ldout(cct, 0) << __func__ << " couldn't set SO_PRIORITY to " << prio + << ": " << cpp_strerror(r) << dendl; + } +#else + return; +#endif // SO_PRIORITY +} + +int NetHandler::generic_connect(const entity_addr_t& addr, const entity_addr_t &bind_addr, bool nonblock) +{ + int ret; + int s = create_socket(addr.get_family()); + if (s < 0) + return s; + + if (nonblock) { + ret = set_nonblock(s); + if (ret < 0) { + close(s); + return ret; + } + } + + set_socket_options(s, cct->_conf->ms_tcp_nodelay, cct->_conf->ms_tcp_rcvbuf); + + { + entity_addr_t addr = bind_addr; + if (cct->_conf->ms_bind_before_connect && (!addr.is_blank_ip())) { + addr.set_port(0); + ret = ::bind(s, addr.get_sockaddr(), addr.get_sockaddr_len()); + if (ret < 0) { + ret = errno; + ldout(cct, 2) << __func__ << " client bind error " << ", " << cpp_strerror(ret) << dendl; + close(s); + return -ret; + } + } + } + + ret = ::connect(s, addr.get_sockaddr(), addr.get_sockaddr_len()); + if (ret < 0) { + ret = errno; + if (errno == EINPROGRESS && nonblock) + return s; + + ldout(cct, 10) << __func__ << " connect: " << cpp_strerror(ret) << dendl; + close(s); + return -ret; + } + + return s; +} + +int NetHandler::reconnect(const entity_addr_t &addr, int sd) +{ + int r = 0; + int ret = ::connect(sd, addr.get_sockaddr(), addr.get_sockaddr_len()); + + if (ret < 0 && errno != EISCONN) { + r = errno; + ldout(cct, 10) << __func__ << " reconnect: " << strerror(r) << dendl; + if (r == EINPROGRESS || r == EALREADY) + return 1; + return -r; + } + + return 0; +} + +int NetHandler::connect(const entity_addr_t &addr, const entity_addr_t& bind_addr) +{ + return generic_connect(addr, bind_addr, false); +} + +int NetHandler::nonblock_connect(const entity_addr_t &addr, const entity_addr_t& bind_addr) +{ + return generic_connect(addr, bind_addr, true); +} + + +} diff --git a/src/msg/async/net_handler.h b/src/msg/async/net_handler.h new file mode 100644 index 00000000..19042377 --- /dev/null +++ b/src/msg/async/net_handler.h @@ -0,0 +1,46 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_COMMON_NET_UTILS_H +#define CEPH_COMMON_NET_UTILS_H +#include "common/config.h" + +namespace ceph { + class NetHandler { + int generic_connect(const entity_addr_t& addr, const entity_addr_t& bind_addr, bool nonblock); + + CephContext *cct; + public: + int create_socket(int domain, bool reuse_addr=false); + explicit NetHandler(CephContext *c): cct(c) {} + int set_nonblock(int sd); + int set_socket_options(int sd, bool nodelay, int size); + int connect(const entity_addr_t &addr, const entity_addr_t& bind_addr); + + /** + * Try to reconnect the socket. 
+ * + * @return 0 success + * > 0 just break, and wait for event + * < 0 need to goto fail + */ + int reconnect(const entity_addr_t &addr, int sd); + int nonblock_connect(const entity_addr_t &addr, const entity_addr_t& bind_addr); + void set_priority(int sd, int priority, int domain); + }; +} + +#endif diff --git a/src/msg/async/rdma/Infiniband.cc b/src/msg/async/rdma/Infiniband.cc new file mode 100644 index 00000000..34299975 --- /dev/null +++ b/src/msg/async/rdma/Infiniband.cc @@ -0,0 +1,1234 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 XSKY <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "Infiniband.h" +#include "common/errno.h" +#include "common/debug.h" +#include "RDMAStack.h" +#include <sys/time.h> +#include <sys/resource.h> + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix *_dout << "Infiniband " + +static const uint32_t MAX_SHARED_RX_SGE_COUNT = 1; +static const uint32_t MAX_INLINE_DATA = 0; +static const uint32_t TCP_MSG_LEN = sizeof("0000:00000000:00000000:00000000:00000000000000000000000000000000"); +static const uint32_t CQ_DEPTH = 30000; + +Port::Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn): ctxt(ictxt), port_num(ipn), port_attr(new ibv_port_attr), gid_idx(0) +{ +#ifdef HAVE_IBV_EXP + union ibv_gid cgid; + struct ibv_exp_gid_attr gid_attr; + bool malformed = false; + + ldout(cct,1) << __func__ << " using experimental verbs for gid" << dendl; + int r = ibv_query_port(ctxt, port_num, port_attr); + if (r == -1) { + lderr(cct) << __func__ << " query port failed " << cpp_strerror(errno) << dendl; + ceph_abort(); + } + + lid = port_attr->lid; + + // search for requested GID in GIDs table + ldout(cct, 1) << __func__ << " looking for local GID " << (cct->_conf->ms_async_rdma_local_gid) + << " of type " << (cct->_conf->ms_async_rdma_roce_ver) << dendl; + r = sscanf(cct->_conf->ms_async_rdma_local_gid.c_str(), + "%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx" + ":%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx", + &cgid.raw[ 0], &cgid.raw[ 1], + &cgid.raw[ 2], &cgid.raw[ 3], + &cgid.raw[ 4], &cgid.raw[ 5], + &cgid.raw[ 6], &cgid.raw[ 7], + &cgid.raw[ 8], &cgid.raw[ 9], + &cgid.raw[10], &cgid.raw[11], + &cgid.raw[12], &cgid.raw[13], + &cgid.raw[14], &cgid.raw[15]); + + if (r != 16) { + ldout(cct, 1) << __func__ << " malformed or no GID supplied, using GID index 0" << dendl; + malformed = true; + } + + gid_attr.comp_mask = IBV_EXP_QUERY_GID_ATTR_TYPE; + + for (gid_idx = 0; gid_idx < port_attr->gid_tbl_len; gid_idx++) { + r = ibv_query_gid(ctxt, port_num, gid_idx, &gid); + if (r) { + lderr(cct) << __func__ << " query gid of port " << port_num << " index " << gid_idx << " failed " << cpp_strerror(errno) << dendl; + ceph_abort(); + } + r = ibv_exp_query_gid_attr(ctxt, port_num, gid_idx, &gid_attr); + if (r) { + lderr(cct) << __func__ << " query gid attributes of port " << port_num << " index " << gid_idx << " failed " << cpp_strerror(errno) << dendl; + ceph_abort(); + } + + if (malformed) break; // stay with gid_idx=0 + if ( (gid_attr.type == cct->_conf->ms_async_rdma_roce_ver) && + (memcmp(&gid, &cgid, 16) == 0) ) { + ldout(cct, 1) << __func__ << " 
found at index " << gid_idx << dendl; + break; + } + } + + if (gid_idx == port_attr->gid_tbl_len) { + lderr(cct) << __func__ << " Requested local GID was not found in GID table" << dendl; + ceph_abort(); + } +#else + int r = ibv_query_port(ctxt, port_num, port_attr); + if (r == -1) { + lderr(cct) << __func__ << " query port failed " << cpp_strerror(errno) << dendl; + ceph_abort(); + } + + lid = port_attr->lid; + r = ibv_query_gid(ctxt, port_num, 0, &gid); + if (r) { + lderr(cct) << __func__ << " query gid failed " << cpp_strerror(errno) << dendl; + ceph_abort(); + } +#endif +} + + +Device::Device(CephContext *cct, ibv_device* d, struct ibv_context *dc) + : device(d), device_attr(new ibv_device_attr), active_port(nullptr) +{ + if (device == NULL) { + lderr(cct) << __func__ << " device == NULL" << cpp_strerror(errno) << dendl; + ceph_abort(); + } + name = ibv_get_device_name(device); + if (cct->_conf->ms_async_rdma_cm) { + ctxt = dc; + } else { + ctxt = ibv_open_device(device); + } + if (ctxt == NULL) { + lderr(cct) << __func__ << " open rdma device failed. " << cpp_strerror(errno) << dendl; + ceph_abort(); + } + int r = ibv_query_device(ctxt, device_attr); + if (r == -1) { + lderr(cct) << __func__ << " failed to query rdma device. " << cpp_strerror(errno) << dendl; + ceph_abort(); + } +} + +void Device::binding_port(CephContext *cct, int port_num) { + port_cnt = device_attr->phys_port_cnt; + for (uint8_t i = 0; i < port_cnt; ++i) { + Port *port = new Port(cct, ctxt, i+1); + if (i + 1 == port_num && port->get_port_attr()->state == IBV_PORT_ACTIVE) { + active_port = port; + ldout(cct, 1) << __func__ << " found active port " << i+1 << dendl; + break; + } else { + ldout(cct, 10) << __func__ << " port " << i+1 << " is not what we want. state: " << port->get_port_attr()->state << ")"<< dendl; + } + delete port; + } + if (nullptr == active_port) { + lderr(cct) << __func__ << " port not found" << dendl; + ceph_assert(active_port); + } +} + + +Infiniband::QueuePair::QueuePair( + CephContext *c, Infiniband& infiniband, ibv_qp_type type, + int port, ibv_srq *srq, + Infiniband::CompletionQueue* txcq, Infiniband::CompletionQueue* rxcq, + uint32_t tx_queue_len, uint32_t rx_queue_len, struct rdma_cm_id *cid, uint32_t q_key) +: cct(c), infiniband(infiniband), + type(type), + ctxt(infiniband.device->ctxt), + ib_physical_port(port), + pd(infiniband.pd->pd), + srq(srq), + qp(NULL), + cm_id(cid), + txcq(txcq), + rxcq(rxcq), + initial_psn(0), + max_send_wr(tx_queue_len), + max_recv_wr(rx_queue_len), + q_key(q_key), + dead(false) +{ + initial_psn = lrand48() & 0xffffff; + if (type != IBV_QPT_RC && type != IBV_QPT_UD && type != IBV_QPT_RAW_PACKET) { + lderr(cct) << __func__ << " invalid queue pair type" << cpp_strerror(errno) << dendl; + ceph_abort(); + } + pd = infiniband.pd->pd; +} + +int Infiniband::QueuePair::init() +{ + ldout(cct, 20) << __func__ << " started." << dendl; + ibv_qp_init_attr qpia; + // FIPS zeroization audit 20191115: this memset is not security related. 
+  memset(&qpia, 0, sizeof(qpia));
+  qpia.send_cq = txcq->get_cq();
+  qpia.recv_cq = rxcq->get_cq();
+  if (srq) {
+    qpia.srq = srq;                      // use the same shared receive queue
+  } else {
+    qpia.cap.max_recv_wr = max_recv_wr;
+    qpia.cap.max_recv_sge = 1;
+  }
+  qpia.cap.max_send_wr = max_send_wr;    // max outstanding send requests
+  qpia.cap.max_send_sge = 1;             // max send scatter-gather elements
+  qpia.cap.max_inline_data = MAX_INLINE_DATA;  // max bytes of immediate data on send q
+  qpia.qp_type = type;                   // RC, UC, UD, or XRC
+  qpia.sq_sig_all = 0;                   // only generate CQEs on requested WQEs
+
+  if (!cct->_conf->ms_async_rdma_cm) {
+    qp = ibv_create_qp(pd, &qpia);
+    if (qp == NULL) {
+      lderr(cct) << __func__ << " failed to create queue pair" << cpp_strerror(errno) << dendl;
+      if (errno == ENOMEM) {
+        lderr(cct) << __func__ << " try reducing ms_async_rdma_receive_queue_length, "
+                      " ms_async_rdma_send_buffers or"
+                      " ms_async_rdma_buffer_size" << dendl;
+      }
+      return -1;
+    }
+  } else {
+    ceph_assert(cm_id->verbs == pd->context);
+    if (rdma_create_qp(cm_id, pd, &qpia)) {
+      lderr(cct) << __func__ << " failed to create queue pair with rdmacm library"
+                 << cpp_strerror(errno) << dendl;
+      return -1;
+    }
+    qp = cm_id->qp;
+  }
+  ldout(cct, 20) << __func__ << " successfully created queue pair: "
+                 << "qp=" << qp << dendl;
+
+  if (cct->_conf->ms_async_rdma_cm)
+    return 0;
+
+  // move from RESET to INIT state
+  ibv_qp_attr qpa;
+  memset(&qpa, 0, sizeof(qpa));
+  qpa.qp_state = IBV_QPS_INIT;
+  qpa.pkey_index = 0;
+  qpa.port_num = (uint8_t)(ib_physical_port);
+  qpa.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE;
+  qpa.qkey = q_key;
+
+  int mask = IBV_QP_STATE | IBV_QP_PORT;
+  switch (type) {
+    case IBV_QPT_RC:
+      mask |= IBV_QP_ACCESS_FLAGS;
+      mask |= IBV_QP_PKEY_INDEX;
+      break;
+    case IBV_QPT_UD:
+      mask |= IBV_QP_QKEY;
+      mask |= IBV_QP_PKEY_INDEX;
+      break;
+    case IBV_QPT_RAW_PACKET:
+      break;
+    default:
+      ceph_abort();
+  }
+
+  int ret = ibv_modify_qp(qp, &qpa, mask);
+  if (ret) {
+    ibv_destroy_qp(qp);
+    lderr(cct) << __func__ << " failed to transition to INIT state: "
+               << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+  ldout(cct, 20) << __func__ << " successfully changed queue pair to INIT:"
+                 << " qp=" << qp << dendl;
+  return 0;
+}
+
+/**
+ * Switch an RC QueuePair into the ERROR state.  It is necessary to
+ * move the Queue Pair into the ERROR state and poll all of the
+ * relevant Work Completions prior to destroying it, since destroying
+ * a Queue Pair does not guarantee that its Work Completions have been
+ * removed from the CQ.  Even if the Work Completions are already in
+ * the CQ, it might not be possible to retrieve them.  If the Queue
+ * Pair is associated with an SRQ, it is recommended to wait for the
+ * affiliated event IBV_EVENT_QP_LAST_WQE_REACHED.
+ *
+ * \return
+ *   -errno if the QueuePair can't switch to ERROR
+ *   0 for success.
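+ *
+ * Sketch of the intended teardown order (illustrative only; wcs/n are
+ * caller-side polling buffers and destruction happens elsewhere):
+ *
+ *   qp->to_dead();                     // QP transitions to IBV_QPS_ERR
+ *   while (txcq->poll_cq(n, wcs) > 0)  // drain the flushed completions
+ *     ;
+ *   ... only then destroy the queue pair ...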
+ */
+int Infiniband::QueuePair::to_dead()
+{
+  if (dead)
+    return 0;
+  ibv_qp_attr qpa;
+  memset(&qpa, 0, sizeof(qpa));
+  qpa.qp_state = IBV_QPS_ERR;
+
+  int mask = IBV_QP_STATE;
+  int ret = ibv_modify_qp(qp, &qpa, mask);
+  if (ret) {
+    lderr(cct) << __func__ << " failed to transition to ERROR state: "
+               << cpp_strerror(errno) << dendl;
+    return -errno;
+  }
+  dead = true;
+  return ret;
+}
+
+int Infiniband::QueuePair::get_remote_qp_number(uint32_t *rqp) const
+{
+  ibv_qp_attr qpa;
+  ibv_qp_init_attr qpia;
+
+  int r = ibv_query_qp(qp, &qpa, IBV_QP_DEST_QPN, &qpia);
+  if (r) {
+    lderr(cct) << __func__ << " failed to query qp: "
+               << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+
+  if (rqp)
+    *rqp = qpa.dest_qp_num;
+  return 0;
+}
+
+/**
+ * Get the remote infiniband address for this QueuePair, as set in #plumb().
+ * LIDs are "local IDs" in infiniband terminology.  They are short, locally
+ * routable addresses.
+ */
+int Infiniband::QueuePair::get_remote_lid(uint16_t *lid) const
+{
+  ibv_qp_attr qpa;
+  ibv_qp_init_attr qpia;
+
+  int r = ibv_query_qp(qp, &qpa, IBV_QP_AV, &qpia);
+  if (r) {
+    lderr(cct) << __func__ << " failed to query qp: "
+               << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+
+  if (lid)
+    *lid = qpa.ah_attr.dlid;
+  return 0;
+}
+
+/**
+ * Get the state of a QueuePair.
+ */
+int Infiniband::QueuePair::get_state() const
+{
+  ibv_qp_attr qpa;
+  ibv_qp_init_attr qpia;
+
+  int r = ibv_query_qp(qp, &qpa, IBV_QP_STATE, &qpia);
+  if (r) {
+    lderr(cct) << __func__ << " failed to get state: "
+               << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+  return qpa.qp_state;
+}
+
+/**
+ * Return true if the queue pair is in an error state, false otherwise.
+ */
+bool Infiniband::QueuePair::is_error() const
+{
+  ibv_qp_attr qpa;
+  ibv_qp_init_attr qpia;
+
+  int r = ibv_query_qp(qp, &qpa, -1, &qpia);
+  if (r) {
+    lderr(cct) << __func__ << " failed to get state: "
+               << cpp_strerror(errno) << dendl;
+    return true;
+  }
+  return qpa.cur_qp_state == IBV_QPS_ERR;
+}
+
+
+Infiniband::CompletionChannel::CompletionChannel(CephContext *c, Infiniband &ib)
+  : cct(c), infiniband(ib), channel(NULL), cq(NULL), cq_events_that_need_ack(0)
+{
+}
+
+Infiniband::CompletionChannel::~CompletionChannel()
+{
+  if (channel) {
+    int r = ibv_destroy_comp_channel(channel);
+    if (r < 0)
+      lderr(cct) << __func__ << " failed to destroy cc: " << cpp_strerror(errno) << dendl;
+    ceph_assert(r == 0);
+  }
+}
+
+int Infiniband::CompletionChannel::init()
+{
+  ldout(cct, 20) << __func__ << " started." << dendl;
+  channel = ibv_create_comp_channel(infiniband.device->ctxt);
+  if (!channel) {
+    lderr(cct) << __func__ << " failed to create receive completion channel: "
+               << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+  int rc = NetHandler(cct).set_nonblock(channel->fd);
+  if (rc < 0) {
+    ibv_destroy_comp_channel(channel);
+    return -1;
+  }
+  return 0;
+}
+
+void Infiniband::CompletionChannel::ack_events()
+{
+  ibv_ack_cq_events(cq, cq_events_that_need_ack);
+  cq_events_that_need_ack = 0;
+}
+
+bool Infiniband::CompletionChannel::get_cq_event()
+{
+  ibv_cq *cq = NULL;
+  void *ev_ctx;
+  if (ibv_get_cq_event(channel, &cq, &ev_ctx)) {
+    if (errno != EAGAIN && errno != EINTR)
+      lderr(cct) << __func__ << " failed to retrieve CQ event: "
+                 << cpp_strerror(errno) << dendl;
+    return false;
+  }
+
+  /* accumulate the number of cq events that need to
+   * be acked, and periodically ack them */
+  if (++cq_events_that_need_ack == MAX_ACK_EVENT) {
+    ldout(cct, 20) << __func__ << " ack cq events."
<< dendl;
+    ibv_ack_cq_events(cq, MAX_ACK_EVENT);
+    cq_events_that_need_ack = 0;
+  }
+
+  return true;
+}
+
+
+Infiniband::CompletionQueue::~CompletionQueue()
+{
+  if (cq) {
+    int r = ibv_destroy_cq(cq);
+    if (r < 0)
+      lderr(cct) << __func__ << " failed to destroy cq: " << cpp_strerror(errno) << dendl;
+    ceph_assert(r == 0);
+  }
+}
+
+int Infiniband::CompletionQueue::init()
+{
+  cq = ibv_create_cq(infiniband.device->ctxt, queue_depth, this, channel->get_channel(), 0);
+  if (!cq) {
+    lderr(cct) << __func__ << " failed to create receive completion queue: "
+               << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+
+  if (ibv_req_notify_cq(cq, 0)) {
+    lderr(cct) << __func__ << " ibv_req_notify_cq failed: " << cpp_strerror(errno) << dendl;
+    ibv_destroy_cq(cq);
+    cq = nullptr;
+    return -1;
+  }
+
+  channel->bind_cq(cq);
+  ldout(cct, 20) << __func__ << " successfully created cq=" << cq << dendl;
+  return 0;
+}
+
+int Infiniband::CompletionQueue::rearm_notify(bool solicite_only)
+{
+  ldout(cct, 20) << __func__ << " started." << dendl;
+  int r = ibv_req_notify_cq(cq, 0);
+  if (r < 0)
+    lderr(cct) << __func__ << " failed to notify cq: " << cpp_strerror(errno) << dendl;
+  return r;
+}
+
+int Infiniband::CompletionQueue::poll_cq(int num_entries, ibv_wc *ret_wc_array) {
+  int r = ibv_poll_cq(cq, num_entries, ret_wc_array);
+  if (r < 0) {
+    lderr(cct) << __func__ << " poll_completion_queue encountered an error: "
+               << cpp_strerror(errno) << dendl;
+    return -1;
+  }
+  return r;
+}
+
+
+Infiniband::ProtectionDomain::ProtectionDomain(CephContext *cct, Device *device)
+  : pd(ibv_alloc_pd(device->ctxt))
+{
+  if (pd == NULL) {
+    lderr(cct) << __func__ << " failed to allocate infiniband protection domain: " << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+}
+
+Infiniband::ProtectionDomain::~ProtectionDomain()
+{
+  ibv_dealloc_pd(pd);
+}
+
+
+Infiniband::MemoryManager::Chunk::Chunk(ibv_mr* m, uint32_t len, char* b)
+  : mr(m), bytes(len), offset(0), buffer(b)
+{
+}
+
+Infiniband::MemoryManager::Chunk::~Chunk()
+{
+}
+
+void Infiniband::MemoryManager::Chunk::set_offset(uint32_t o)
+{
+  offset = o;
+}
+
+uint32_t Infiniband::MemoryManager::Chunk::get_offset()
+{
+  return offset;
+}
+
+void Infiniband::MemoryManager::Chunk::set_bound(uint32_t b)
+{
+  bound = b;
+}
+
+void Infiniband::MemoryManager::Chunk::prepare_read(uint32_t b)
+{
+  offset = 0;
+  bound = b;
+}
+
+uint32_t Infiniband::MemoryManager::Chunk::get_bound()
+{
+  return bound;
+}
+
+uint32_t Infiniband::MemoryManager::Chunk::read(char* buf, uint32_t len)
+{
+  uint32_t left = bound - offset;
+  if (left >= len) {
+    memcpy(buf, buffer+offset, len);
+    offset += len;
+    return len;
+  } else {
+    memcpy(buf, buffer+offset, left);
+    offset = 0;
+    bound = 0;
+    return left;
+  }
+}
+
+uint32_t Infiniband::MemoryManager::Chunk::write(char* buf, uint32_t len)
+{
+  uint32_t left = bytes - offset;
+  if (left >= len) {
+    memcpy(buffer+offset, buf, len);
+    offset += len;
+    return len;
+  } else {
+    memcpy(buffer+offset, buf, left);
+    offset = bytes;
+    return left;
+  }
+}
+
+bool Infiniband::MemoryManager::Chunk::full()
+{
+  return offset == bytes;
+}
+
+bool Infiniband::MemoryManager::Chunk::over()
+{
+  return Infiniband::MemoryManager::Chunk::offset == bound;
+}
+
+void Infiniband::MemoryManager::Chunk::clear()
+{
+  offset = 0;
+  bound = 0;
+}
+
+Infiniband::MemoryManager::Cluster::Cluster(MemoryManager& m, uint32_t s)
+  : manager(m), buffer_size(s), lock("cluster_lock")
+{
+}
+
+Infiniband::MemoryManager::Cluster::~Cluster()
+{
+  int r = ibv_dereg_mr(chunk_base->mr);
+ 
ceph_assert(r == 0); + const auto chunk_end = chunk_base + num_chunk; + for (auto chunk = chunk_base; chunk != chunk_end; chunk++) { + chunk->~Chunk(); + } + + ::free(chunk_base); + manager.free(base); +} + +int Infiniband::MemoryManager::Cluster::fill(uint32_t num) +{ + ceph_assert(!base); + num_chunk = num; + uint32_t bytes = buffer_size * num; + + base = (char*)manager.malloc(bytes); + end = base + bytes; + ceph_assert(base); + chunk_base = static_cast<Chunk*>(::malloc(sizeof(Chunk) * num)); + // FIPS zeroization audit 20191115: this memset is not security related. + memset(static_cast<void*>(chunk_base), 0, sizeof(Chunk) * num); + free_chunks.reserve(num); + ibv_mr* m = ibv_reg_mr(manager.pd->pd, base, bytes, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); + ceph_assert(m); + Chunk* chunk = chunk_base; + for (uint32_t offset = 0; offset < bytes; offset += buffer_size){ + new(chunk) Chunk(m, buffer_size, base+offset); + free_chunks.push_back(chunk); + chunk++; + } + return 0; +} + +void Infiniband::MemoryManager::Cluster::take_back(std::vector<Chunk*> &ck) +{ + Mutex::Locker l(lock); + for (auto c : ck) { + c->clear(); + free_chunks.push_back(c); + } +} + +int Infiniband::MemoryManager::Cluster::get_buffers(std::vector<Chunk*> &chunks, size_t bytes) +{ + uint32_t num = bytes / buffer_size + 1; + if (bytes % buffer_size == 0) + --num; + int r = num; + Mutex::Locker l(lock); + if (free_chunks.empty()) + return 0; + if (!bytes) { + r = free_chunks.size(); + for (auto c : free_chunks) + chunks.push_back(c); + free_chunks.clear(); + return r; + } + if (free_chunks.size() < num) { + num = free_chunks.size(); + r = num; + } + for (uint32_t i = 0; i < num; ++i) { + chunks.push_back(free_chunks.back()); + free_chunks.pop_back(); + } + return r; +} + +bool Infiniband::MemoryManager::MemPoolContext::can_alloc(unsigned nbufs) +{ + /* unlimited */ + if (manager->cct->_conf->ms_async_rdma_receive_buffers <= 0) + return true; + + if (n_bufs_allocated + nbufs > (unsigned)manager->cct->_conf->ms_async_rdma_receive_buffers) { + lderr(manager->cct) << __func__ << " WARNING: OUT OF RX BUFFERS: allocated: " << + n_bufs_allocated << " requested: " << nbufs << + " limit: " << manager->cct->_conf->ms_async_rdma_receive_buffers << dendl; + return false; + } + + return true; +} + +void Infiniband::MemoryManager::MemPoolContext::set_stat_logger(PerfCounters *logger) { + perf_logger = logger; + if (perf_logger != nullptr) + perf_logger->set(l_msgr_rdma_rx_bufs_total, n_bufs_allocated); +} + +void Infiniband::MemoryManager::MemPoolContext::update_stats(int nbufs) +{ + n_bufs_allocated += nbufs; + + if (!perf_logger) + return; + + if (nbufs > 0) { + perf_logger->inc(l_msgr_rdma_rx_bufs_total, nbufs); + } else { + perf_logger->dec(l_msgr_rdma_rx_bufs_total, -nbufs); + } +} + +void *Infiniband::MemoryManager::mem_pool::slow_malloc() +{ + void *p; + + Mutex::Locker l(PoolAllocator::lock); + PoolAllocator::g_ctx = ctx; + // this will trigger pool expansion via PoolAllocator::malloc() + p = boost::pool<PoolAllocator>::malloc(); + PoolAllocator::g_ctx = nullptr; + return p; +} + +Infiniband::MemoryManager::MemPoolContext *Infiniband::MemoryManager::PoolAllocator::g_ctx = nullptr; +Mutex Infiniband::MemoryManager::PoolAllocator::lock("pool-alloc-lock"); + +// lock is taken by mem_pool::slow_malloc() +char *Infiniband::MemoryManager::PoolAllocator::malloc(const size_type bytes) +{ + mem_info *m; + Chunk *ch; + size_t rx_buf_size; + unsigned nbufs; + MemoryManager *manager; + CephContext *cct; + + ceph_assert(g_ctx); + 
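/* [Editorial sketch, not part of the upstream commit]
 * Cluster::fill() above registers one large region and placement-news a
 * Chunk header per buffer_size slice, so every TX chunk shares a single
 * lkey. The core of that idea, with hypothetical names and error
 * handling elided:
 *
 *   #include <infiniband/verbs.h>
 *   #include <cstdlib>
 *   #include <new>
 *
 *   struct Slice { ibv_mr *mr; char *buf; uint32_t len; };
 *
 *   // Register `count` slices of `size` bytes backed by one ibv_reg_mr call.
 *   Slice *make_slices(ibv_pd *pd, uint32_t size, uint32_t count) {
 *     char *base = static_cast<char*>(std::malloc(size_t(size) * count));
 *     ibv_mr *mr = ibv_reg_mr(pd, base, size_t(size) * count,
 *                             IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
 *     Slice *s = static_cast<Slice*>(std::malloc(sizeof(Slice) * count));
 *     for (uint32_t i = 0; i < count; i++)
 *       new (&s[i]) Slice{mr, base + size_t(i) * size, size};
 *     return s;
 *   }
 */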
manager = g_ctx->manager; + cct = manager->cct; + rx_buf_size = sizeof(Chunk) + cct->_conf->ms_async_rdma_buffer_size; + nbufs = bytes/rx_buf_size; + + if (!g_ctx->can_alloc(nbufs)) + return NULL; + + m = static_cast<mem_info *>(manager->malloc(bytes + sizeof(*m))); + if (!m) { + lderr(cct) << __func__ << " failed to allocate " << + bytes << " + " << sizeof(*m) << " bytes of memory for " << nbufs << dendl; + return NULL; + } + + m->mr = ibv_reg_mr(manager->pd->pd, m->chunks, bytes, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); + if (m->mr == NULL) { + lderr(cct) << __func__ << " failed to register " << + bytes << " + " << sizeof(*m) << " bytes of memory for " << nbufs << dendl; + manager->free(m); + return NULL; + } + + m->nbufs = nbufs; + // save this chunk context + m->ctx = g_ctx; + + // note that the memory can be allocated before perf logger is set + g_ctx->update_stats(nbufs); + + /* initialize chunks */ + ch = m->chunks; + for (unsigned i = 0; i < nbufs; i++) { + ch->lkey = m->mr->lkey; + ch->bytes = cct->_conf->ms_async_rdma_buffer_size; + ch->offset = 0; + ch->buffer = ch->data; // TODO: refactor tx and remove buffer + ch = reinterpret_cast<Chunk *>(reinterpret_cast<char *>(ch) + rx_buf_size); + } + + return reinterpret_cast<char *>(m->chunks); +} + + +void Infiniband::MemoryManager::PoolAllocator::free(char * const block) +{ + mem_info *m; + Mutex::Locker l(lock); + + m = reinterpret_cast<mem_info *>(block) - 1; + m->ctx->update_stats(-m->nbufs); + ibv_dereg_mr(m->mr); + m->ctx->manager->free(m); +} + +Infiniband::MemoryManager::MemoryManager(CephContext *c, Device *d, ProtectionDomain *p) + : cct(c), device(d), pd(p), + rxbuf_pool_ctx(this), + rxbuf_pool(&rxbuf_pool_ctx, sizeof(Chunk) + c->_conf->ms_async_rdma_buffer_size, + c->_conf->ms_async_rdma_receive_buffers > 0 ? + // if possible make initial pool size 2 * receive_queue_len + // that way there will be no pool expansion upon receive of the + // first packet. + (c->_conf->ms_async_rdma_receive_buffers < 2 * c->_conf->ms_async_rdma_receive_queue_len ? 
+    c->_conf->ms_async_rdma_receive_buffers : 2 * c->_conf->ms_async_rdma_receive_queue_len) :
+    // rx pool is infinite, we can set any initial size that we want
+    2 * c->_conf->ms_async_rdma_receive_queue_len)
+{
+}
+
+Infiniband::MemoryManager::~MemoryManager()
+{
+  if (send)
+    delete send;
+}
+
+void* Infiniband::MemoryManager::huge_pages_malloc(size_t size)
+{
+  size_t real_size = ALIGN_TO_PAGE_SIZE(size + HUGE_PAGE_SIZE);
+  char *ptr = (char *)mmap(NULL, real_size, PROT_READ | PROT_WRITE,
+                           MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE | MAP_HUGETLB, -1, 0);
+  if (ptr == MAP_FAILED) {
+    ptr = (char *)std::malloc(real_size);
+    if (ptr == NULL) return NULL;
+    real_size = 0;
+  }
+  *((size_t *)ptr) = real_size;
+  return ptr + HUGE_PAGE_SIZE;
+}
+
+void Infiniband::MemoryManager::huge_pages_free(void *ptr)
+{
+  if (ptr == NULL) return;
+  void *real_ptr = (char *)ptr - HUGE_PAGE_SIZE;
+  size_t real_size = *((size_t *)real_ptr);
+  ceph_assert(real_size % HUGE_PAGE_SIZE == 0);
+  if (real_size != 0)
+    munmap(real_ptr, real_size);
+  else
+    std::free(real_ptr);
+}
+
+
+void* Infiniband::MemoryManager::malloc(size_t size)
+{
+  if (cct->_conf->ms_async_rdma_enable_hugepage)
+    return huge_pages_malloc(size);
+  else
+    return std::malloc(size);
+}
+
+void Infiniband::MemoryManager::free(void *ptr)
+{
+  if (cct->_conf->ms_async_rdma_enable_hugepage)
+    huge_pages_free(ptr);
+  else
+    std::free(ptr);
+}
+
+void Infiniband::MemoryManager::create_tx_pool(uint32_t size, uint32_t tx_num)
+{
+  ceph_assert(device);
+  ceph_assert(pd);
+
+  send = new Cluster(*this, size);
+  send->fill(tx_num);
+}
+
+void Infiniband::MemoryManager::return_tx(std::vector<Chunk*> &chunks)
+{
+  send->take_back(chunks);
+}
+
+int Infiniband::MemoryManager::get_send_buffers(std::vector<Chunk*> &c, size_t bytes)
+{
+  return send->get_buffers(c, bytes);
+}
+
+static std::atomic<bool> init_prereq = {false};
+
+void Infiniband::verify_prereq(CephContext *cct) {
+
+  // ibv_fork_init() MUST be called before any fork when using RDMA
+  int rc = ibv_fork_init();
+  if (rc) {
+    lderr(cct) << __func__ << " ibv_fork_init() failed. It must be called before fork when using RDMA. Application aborts." << dendl;
+    ceph_abort();
+  }
+
+  ldout(cct, 20) << __func__ << " ms_async_rdma_enable_hugepage value is: " << cct->_conf->ms_async_rdma_enable_hugepage << dendl;
+  if (cct->_conf->ms_async_rdma_enable_hugepage) {
+    rc = setenv("RDMAV_HUGEPAGES_SAFE", "1", 1);
+    ldout(cct, 0) << __func__ << " RDMAV_HUGEPAGES_SAFE is set as: " << getenv("RDMAV_HUGEPAGES_SAFE") << dendl;
+    if (rc) {
+      lderr(cct) << __func__ << " failed to export RDMAV_HUGEPAGES_SAFE. It must be exported before huge pages are used with RDMA. Application aborts." << dendl;
+      ceph_abort();
+    }
+  }
+
+  // check the memlock ulimit
+  struct rlimit limit;
+  getrlimit(RLIMIT_MEMLOCK, &limit);
+  if (limit.rlim_cur != RLIM_INFINITY || limit.rlim_max != RLIM_INFINITY) {
+    lderr(cct) << __func__ << " !!! WARNING !!! For RDMA to work properly, the user memlock limit (ulimit -l) must be big enough to allow large amounts of registered memory."
+                  " We recommend setting this parameter to infinity." << dendl;
+  }
+  init_prereq = true;
+}
+
+Infiniband::Infiniband(CephContext *cct)
+  : cct(cct), lock("IB lock"),
+    device_name(cct->_conf->ms_async_rdma_device_name),
+    port_num(cct->_conf->ms_async_rdma_port_num)
+{
+  if (!init_prereq)
+    verify_prereq(cct);
+  ldout(cct, 20) << __func__ << " constructing Infiniband..."
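/* [Editorial sketch, not part of the upstream commit]
 * huge_pages_malloc()/huge_pages_free() above over-allocate by one huge
 * page, stash the mapping length at the base of the mapping, and hand
 * out base + 2 MiB; real_size == 0 marks the std::malloc() fallback.
 * Stripped to its bones:
 *
 *   #include <sys/mman.h>
 *   #include <cstdlib>
 *
 *   static const size_t HP = 2u * 1024 * 1024;   // x86-64 default huge page
 *
 *   void *hp_alloc(size_t n) {
 *     size_t total = (n + 2 * HP - 1) / HP * HP; // header page + payload
 *     char *p = (char *)mmap(nullptr, total, PROT_READ | PROT_WRITE,
 *                            MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
 *     if (p == MAP_FAILED) {
 *       p = (char *)std::malloc(total);
 *       if (!p) return nullptr;
 *       total = 0;                               // 0 == "came from malloc"
 *     }
 *     *(size_t *)p = total;                      // size lives in the header
 *     return p + HP;
 *   }
 *
 *   void hp_free(void *ptr) {
 *     if (!ptr) return;
 *     char *real = (char *)ptr - HP;
 *     size_t total = *(size_t *)real;
 *     total ? (void)munmap(real, total) : std::free(real);
 *   }
 */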
<< dendl; +} + +void Infiniband::init() +{ + Mutex::Locker l(lock); + + if (initialized) + return; + + device_list = new DeviceList(cct); + initialized = true; + + device = device_list->get_device(device_name.c_str()); + ceph_assert(device); + device->binding_port(cct, port_num); + ib_physical_port = device->active_port->get_port_num(); + pd = new ProtectionDomain(cct, device); + ceph_assert(NetHandler(cct).set_nonblock(device->ctxt->async_fd) == 0); + + support_srq = cct->_conf->ms_async_rdma_support_srq; + if (support_srq) + rx_queue_len = device->device_attr->max_srq_wr; + else + rx_queue_len = device->device_attr->max_qp_wr; + if (rx_queue_len > cct->_conf->ms_async_rdma_receive_queue_len) { + rx_queue_len = cct->_conf->ms_async_rdma_receive_queue_len; + ldout(cct, 1) << __func__ << " receive queue length is " << rx_queue_len << " receive buffers" << dendl; + } else { + ldout(cct, 0) << __func__ << " requested receive queue length " << + cct->_conf->ms_async_rdma_receive_queue_len << + " is too big. Setting " << rx_queue_len << dendl; + } + + // check for the misconfiguration + if (cct->_conf->ms_async_rdma_receive_buffers > 0 && + rx_queue_len > (unsigned)cct->_conf->ms_async_rdma_receive_buffers) { + lderr(cct) << __func__ << " rdma_receive_queue_len (" << + rx_queue_len << ") > ms_async_rdma_receive_buffers(" << + cct->_conf->ms_async_rdma_receive_buffers << ")." << dendl; + ceph_abort(); + } + + tx_queue_len = device->device_attr->max_qp_wr; + if (tx_queue_len > cct->_conf->ms_async_rdma_send_buffers) { + tx_queue_len = cct->_conf->ms_async_rdma_send_buffers; + ldout(cct, 1) << __func__ << " assigning: " << tx_queue_len << " send buffers" << dendl; + } else { + ldout(cct, 0) << __func__ << " using the max allowed send buffers: " << tx_queue_len << dendl; + } + + ldout(cct, 1) << __func__ << " device allow " << device->device_attr->max_cqe + << " completion entries" << dendl; + + memory_manager = new MemoryManager(cct, device, pd); + memory_manager->create_tx_pool(cct->_conf->ms_async_rdma_buffer_size, tx_queue_len); + + if (support_srq) { + srq = create_shared_receive_queue(rx_queue_len, MAX_SHARED_RX_SGE_COUNT); + post_chunks_to_rq(rx_queue_len, NULL); //add to srq + } +} + +Infiniband::~Infiniband() +{ + if (!initialized) + return; + if (support_srq) + ibv_destroy_srq(srq); + delete memory_manager; + delete pd; +} + +/** + * Create a shared receive queue. This basically wraps the verbs call. + * + * \param[in] max_wr + * The max number of outstanding work requests in the SRQ. + * \param[in] max_sge + * The max number of scatter elements per WR. + * \return + * A valid ibv_srq pointer, or NULL on error. + */ +ibv_srq* Infiniband::create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge) +{ + ibv_srq_init_attr sia; + // FIPS zeroization audit 20191115: this memset is not security related. + memset(&sia, 0, sizeof(sia)); + sia.srq_context = device->ctxt; + sia.attr.max_wr = max_wr; + sia.attr.max_sge = max_sge; + return ibv_create_srq(pd->pd, &sia); +} + +int Infiniband::get_tx_buffers(std::vector<Chunk*> &c, size_t bytes) +{ + return memory_manager->get_send_buffers(c, bytes); +} + +/** + * Create a new QueuePair. This factory should be used in preference to + * the QueuePair constructor directly, since this lets derivatives of + * Infiniband, e.g. MockInfiniband (if it existed), + * return mocked out QueuePair derivatives. + * + * \return + * QueuePair on success or NULL if init fails + * See QueuePair::QueuePair for parameter documentation. 
+ */ +Infiniband::QueuePair* Infiniband::create_queue_pair(CephContext *cct, CompletionQueue *tx, + CompletionQueue* rx, ibv_qp_type type, struct rdma_cm_id *cm_id) +{ + Infiniband::QueuePair *qp = new QueuePair( + cct, *this, type, ib_physical_port, srq, tx, rx, tx_queue_len, rx_queue_len, cm_id); + if (qp->init()) { + delete qp; + return NULL; + } + return qp; +} + +int Infiniband::post_chunks_to_rq(int num, ibv_qp *qp) +{ + int ret, i = 0; + ibv_sge isge[num]; + Chunk *chunk; + ibv_recv_wr rx_work_request[num]; + + while (i < num) { + chunk = get_memory_manager()->get_rx_buffer(); + if (chunk == NULL) { + lderr(cct) << __func__ << " WARNING: out of memory. Requested " << num << + " rx buffers. Got " << i << dendl; + if (i == 0) + return 0; + // if we got some buffers post them and hope for the best + rx_work_request[i-1].next = 0; + break; + } + + isge[i].addr = reinterpret_cast<uint64_t>(chunk->data); + isge[i].length = chunk->bytes; + isge[i].lkey = chunk->lkey; + + memset(&rx_work_request[i], 0, sizeof(rx_work_request[i])); + rx_work_request[i].wr_id = reinterpret_cast<uint64_t>(chunk);// stash descriptor ptr + if (i == num - 1) { + rx_work_request[i].next = 0; + } else { + rx_work_request[i].next = &rx_work_request[i+1]; + } + rx_work_request[i].sg_list = &isge[i]; + rx_work_request[i].num_sge = 1; + i++; + } + ibv_recv_wr *badworkrequest; + if (support_srq) { + ret = ibv_post_srq_recv(srq, &rx_work_request[0], &badworkrequest); + ceph_assert(ret == 0); + } else { + ceph_assert(qp); + ret = ibv_post_recv(qp, &rx_work_request[0], &badworkrequest); + ceph_assert(ret == 0); + } + return i; +} + +Infiniband::CompletionChannel* Infiniband::create_comp_channel(CephContext *c) +{ + Infiniband::CompletionChannel *cc = new Infiniband::CompletionChannel(c, *this); + if (cc->init()) { + delete cc; + return NULL; + } + return cc; +} + +Infiniband::CompletionQueue* Infiniband::create_comp_queue( + CephContext *cct, CompletionChannel *cc) +{ + Infiniband::CompletionQueue *cq = new Infiniband::CompletionQueue( + cct, *this, CQ_DEPTH, cc); + if (cq->init()) { + delete cq; + return NULL; + } + return cq; +} + +// 1 means no valid buffer read, 0 means got enough buffer +// else return < 0 means error +int Infiniband::recv_msg(CephContext *cct, int sd, IBSYNMsg& im) +{ + char msg[TCP_MSG_LEN]; + char gid[33]; + ssize_t r = ::read(sd, &msg, sizeof(msg)); + // Drop incoming qpt + if (cct->_conf->ms_inject_socket_failures && sd >= 0) { + if (rand() % cct->_conf->ms_inject_socket_failures == 0) { + ldout(cct, 0) << __func__ << " injecting socket failure" << dendl; + return -EINVAL; + } + } + if (r < 0) { + r = -errno; + lderr(cct) << __func__ << " got error " << r << ": " + << cpp_strerror(r) << dendl; + } else if (r == 0) { // valid disconnect message of length 0 + ldout(cct, 10) << __func__ << " got disconnect message " << dendl; + } else if ((size_t)r != sizeof(msg)) { // invalid message + ldout(cct, 1) << __func__ << " got bad length (" << r << ") " << dendl; + r = -EINVAL; + } else { // valid message + sscanf(msg, "%hx:%x:%x:%x:%s", &(im.lid), &(im.qpn), &(im.psn), &(im.peer_qpn),gid); + wire_gid_to_gid(gid, &(im.gid)); + ldout(cct, 5) << __func__ << " recevd: " << im.lid << ", " << im.qpn << ", " << im.psn << ", " << im.peer_qpn << ", " << gid << dendl; + } + return r; +} + +int Infiniband::send_msg(CephContext *cct, int sd, IBSYNMsg& im) +{ + int retry = 0; + ssize_t r; + + char msg[TCP_MSG_LEN]; + char gid[33]; +retry: + gid_to_wire_gid(&(im.gid), gid); + sprintf(msg, "%04x:%08x:%08x:%08x:%s", 
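/* [Editorial sketch, not part of the upstream commit]
 * send_msg()/recv_msg() exchange the QP parameters over a plain TCP
 * socket as fixed-width hex, "lid:qpn:psn:peer_qpn:gid", with the GID
 * spelled as 32 hex digits (see wire_gid_to_gid/gid_to_wire_gid below).
 * Encoding one side in isolation, assuming the IBSYNMsg field layout:
 *
 *   #include <cstdio>
 *
 *   // 4+1+8+1+8+1+8+1+32 chars + NUL = 65; TCP_MSG_LEN must cover this.
 *   void encode_syn(char out[65], uint16_t lid, uint32_t qpn,
 *                   uint32_t psn, uint32_t peer_qpn, const char gid32[33]) {
 *     std::snprintf(out, 65, "%04x:%08x:%08x:%08x:%s",
 *                   lid, qpn, psn, peer_qpn, gid32);
 *   }
 *
 * Fixed-width fields keep the message a constant size, so the reader can
 * issue exactly one read() of sizeof(msg) and treat a short read as a
 * protocol error, which is what recv_msg() above does.
 */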
im.lid, im.qpn, im.psn, im.peer_qpn, gid); + ldout(cct, 10) << __func__ << " sending: " << im.lid << ", " << im.qpn << ", " << im.psn + << ", " << im.peer_qpn << ", " << gid << dendl; + r = ::write(sd, msg, sizeof(msg)); + // Drop incoming qpt + if (cct->_conf->ms_inject_socket_failures && sd >= 0) { + if (rand() % cct->_conf->ms_inject_socket_failures == 0) { + ldout(cct, 0) << __func__ << " injecting socket failure" << dendl; + return -EINVAL; + } + } + + if ((size_t)r != sizeof(msg)) { + // FIXME need to handle EAGAIN instead of retry + if (r < 0 && (errno == EINTR || errno == EAGAIN) && retry < 3) { + retry++; + goto retry; + } + if (r < 0) + lderr(cct) << __func__ << " send returned error " << errno << ": " + << cpp_strerror(errno) << dendl; + else + lderr(cct) << __func__ << " send got bad length (" << r << ") " << cpp_strerror(errno) << dendl; + return -errno; + } + return 0; +} + +void Infiniband::wire_gid_to_gid(const char *wgid, union ibv_gid *gid) +{ + char tmp[9]; + uint32_t v32; + int i; + + for (tmp[8] = 0, i = 0; i < 4; ++i) { + memcpy(tmp, wgid + i * 8, 8); + sscanf(tmp, "%x", &v32); + *(uint32_t *)(&gid->raw[i * 4]) = ntohl(v32); + } +} + +void Infiniband::gid_to_wire_gid(const union ibv_gid *gid, char wgid[]) +{ + for (int i = 0; i < 4; ++i) + sprintf(&wgid[i * 8], "%08x", htonl(*(uint32_t *)(gid->raw + i * 4))); +} + +Infiniband::QueuePair::~QueuePair() +{ + if (qp) { + ldout(cct, 20) << __func__ << " destroy qp=" << qp << dendl; + ceph_assert(!ibv_destroy_qp(qp)); + } +} + +/** + * Given a string representation of the `status' field from Verbs + * struct `ibv_wc'. + * + * \param[in] status + * The integer status obtained in ibv_wc.status. + * \return + * A string corresponding to the given status. + */ +const char* Infiniband::wc_status_to_string(int status) +{ + static const char *lookup[] = { + "SUCCESS", + "LOC_LEN_ERR", + "LOC_QP_OP_ERR", + "LOC_EEC_OP_ERR", + "LOC_PROT_ERR", + "WR_FLUSH_ERR", + "MW_BIND_ERR", + "BAD_RESP_ERR", + "LOC_ACCESS_ERR", + "REM_INV_REQ_ERR", + "REM_ACCESS_ERR", + "REM_OP_ERR", + "RETRY_EXC_ERR", + "RNR_RETRY_EXC_ERR", + "LOC_RDD_VIOL_ERR", + "REM_INV_RD_REQ_ERR", + "REM_ABORT_ERR", + "INV_EECN_ERR", + "INV_EEC_STATE_ERR", + "FATAL_ERR", + "RESP_TIMEOUT_ERR", + "GENERAL_ERR" + }; + + if (status < IBV_WC_SUCCESS || status > IBV_WC_GENERAL_ERR) + return "<status out of range!>"; + return lookup[status]; +} + +const char* Infiniband::qp_state_string(int status) { + switch(status) { + case IBV_QPS_RESET : return "IBV_QPS_RESET"; + case IBV_QPS_INIT : return "IBV_QPS_INIT"; + case IBV_QPS_RTR : return "IBV_QPS_RTR"; + case IBV_QPS_RTS : return "IBV_QPS_RTS"; + case IBV_QPS_SQD : return "IBV_QPS_SQD"; + case IBV_QPS_SQE : return "IBV_QPS_SQE"; + case IBV_QPS_ERR : return "IBV_QPS_ERR"; + default: return " out of range."; + } +} diff --git a/src/msg/async/rdma/Infiniband.h b/src/msg/async/rdma/Infiniband.h new file mode 100644 index 00000000..2889cdfc --- /dev/null +++ b/src/msg/async/rdma/Infiniband.h @@ -0,0 +1,529 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 XSKY <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_INFINIBAND_H +#define CEPH_INFINIBAND_H + +#include <boost/pool/pool.hpp> +// need this because boost messes with ceph log/assert definitions +#include "include/ceph_assert.h" + +#include <infiniband/verbs.h> +#include <rdma/rdma_cma.h> + +#include <atomic> +#include <string> +#include <vector> + +#include "include/int_types.h" +#include "include/page.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/Mutex.h" +#include "common/perf_counters.h" +#include "msg/msg_types.h" +#include "msg/async/net_handler.h" + +#define HUGE_PAGE_SIZE (2 * 1024 * 1024) +#define ALIGN_TO_PAGE_SIZE(x) \ + (((x) + HUGE_PAGE_SIZE -1) / HUGE_PAGE_SIZE * HUGE_PAGE_SIZE) + +struct IBSYNMsg { + uint16_t lid; + uint32_t qpn; + uint32_t psn; + uint32_t peer_qpn; + union ibv_gid gid; +} __attribute__((packed)); + +class RDMAStack; +class CephContext; + +class Port { + struct ibv_context* ctxt; + int port_num; + struct ibv_port_attr* port_attr; + uint16_t lid; + int gid_idx = 0; + union ibv_gid gid; + + public: + explicit Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn); + uint16_t get_lid() { return lid; } + ibv_gid get_gid() { return gid; } + int get_port_num() { return port_num; } + ibv_port_attr* get_port_attr() { return port_attr; } + int get_gid_idx() { return gid_idx; } +}; + + +class Device { + ibv_device *device; + const char* name; + uint8_t port_cnt = 0; + public: + explicit Device(CephContext *c, ibv_device* d, struct ibv_context *dc); + ~Device() { + if (active_port) { + delete active_port; + ceph_assert(ibv_close_device(ctxt) == 0); + } + } + const char* get_name() { return name;} + uint16_t get_lid() { return active_port->get_lid(); } + ibv_gid get_gid() { return active_port->get_gid(); } + int get_gid_idx() { return active_port->get_gid_idx(); } + void binding_port(CephContext *c, int port_num); + struct ibv_context *ctxt; + ibv_device_attr *device_attr; + Port* active_port; +}; + + +class DeviceList { + struct ibv_device ** device_list; + struct ibv_context ** device_context_list; + int num; + Device** devices; + public: + explicit DeviceList(CephContext *cct): device_list(ibv_get_device_list(&num)), + device_context_list(rdma_get_devices(&num)) { + if (device_list == NULL || num == 0) { + lderr(cct) << __func__ << " failed to get rdma device list. 
" << cpp_strerror(errno) << dendl; + ceph_abort(); + } + devices = new Device*[num]; + + for (int i = 0;i < num; ++i) { + devices[i] = new Device(cct, device_list[i], device_context_list[i]); + } + } + ~DeviceList() { + for (int i=0; i < num; ++i) { + delete devices[i]; + } + delete []devices; + ibv_free_device_list(device_list); + } + + Device* get_device(const char* device_name) { + ceph_assert(devices); + for (int i = 0; i < num; ++i) { + if (!strlen(device_name) || !strcmp(device_name, devices[i]->get_name())) { + return devices[i]; + } + } + return NULL; + } +}; + +// stat counters +enum { + l_msgr_rdma_dispatcher_first = 94000, + + l_msgr_rdma_polling, + l_msgr_rdma_inflight_tx_chunks, + l_msgr_rdma_rx_bufs_in_use, + l_msgr_rdma_rx_bufs_total, + + l_msgr_rdma_tx_total_wc, + l_msgr_rdma_tx_total_wc_errors, + l_msgr_rdma_tx_wc_retry_errors, + l_msgr_rdma_tx_wc_wr_flush_errors, + + l_msgr_rdma_rx_total_wc, + l_msgr_rdma_rx_total_wc_errors, + l_msgr_rdma_rx_fin, + + l_msgr_rdma_handshake_errors, + + l_msgr_rdma_total_async_events, + l_msgr_rdma_async_last_wqe_events, + + l_msgr_rdma_created_queue_pair, + l_msgr_rdma_active_queue_pair, + + l_msgr_rdma_dispatcher_last, +}; + +enum { + l_msgr_rdma_first = 95000, + + l_msgr_rdma_tx_no_mem, + l_msgr_rdma_tx_parital_mem, + l_msgr_rdma_tx_failed, + + l_msgr_rdma_tx_chunks, + l_msgr_rdma_tx_bytes, + l_msgr_rdma_rx_chunks, + l_msgr_rdma_rx_bytes, + l_msgr_rdma_pending_sent_conns, + + l_msgr_rdma_last, +}; + +class RDMADispatcher; + +class Infiniband { + public: + class ProtectionDomain { + public: + explicit ProtectionDomain(CephContext *cct, Device *device); + ~ProtectionDomain(); + + ibv_pd* const pd; + }; + + + class MemoryManager { + public: + class Chunk { + public: + Chunk(ibv_mr* m, uint32_t len, char* b); + ~Chunk(); + + void set_offset(uint32_t o); + uint32_t get_offset(); + void set_bound(uint32_t b); + void prepare_read(uint32_t b); + uint32_t get_bound(); + uint32_t read(char* buf, uint32_t len); + uint32_t write(char* buf, uint32_t len); + bool full(); + bool over(); + void clear(); + + public: + ibv_mr* mr; + uint32_t lkey = 0; + uint32_t bytes; + uint32_t bound = 0; + uint32_t offset; + char* buffer; // TODO: remove buffer/refactor TX + char data[0]; + }; + + class Cluster { + public: + Cluster(MemoryManager& m, uint32_t s); + ~Cluster(); + + int fill(uint32_t num); + void take_back(std::vector<Chunk*> &ck); + int get_buffers(std::vector<Chunk*> &chunks, size_t bytes); + Chunk *get_chunk_by_buffer(const char *c) { + uint32_t idx = (c - base) / buffer_size; + Chunk *chunk = chunk_base + idx; + return chunk; + } + bool is_my_buffer(const char *c) const { + return c >= base && c < end; + } + + MemoryManager& manager; + uint32_t buffer_size; + uint32_t num_chunk = 0; + Mutex lock; + std::vector<Chunk*> free_chunks; + char *base = nullptr; + char *end = nullptr; + Chunk* chunk_base = nullptr; + }; + + class MemPoolContext { + PerfCounters *perf_logger; + + public: + MemoryManager *manager; + unsigned n_bufs_allocated; + // true if it is possible to alloc + // more memory for the pool + explicit MemPoolContext(MemoryManager *m) : + perf_logger(nullptr), + manager(m), + n_bufs_allocated(0) {} + bool can_alloc(unsigned nbufs); + void update_stats(int val); + void set_stat_logger(PerfCounters *logger); + }; + + class PoolAllocator { + struct mem_info { + ibv_mr *mr; + MemPoolContext *ctx; + unsigned nbufs; + Chunk chunks[0]; + }; + public: + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + static char * 
malloc(const size_type bytes); + static void free(char * const block); + + static MemPoolContext *g_ctx; + static Mutex lock; + }; + + /** + * modify boost pool so that it is possible to + * have a thread safe 'context' when allocating/freeing + * the memory. It is needed to allow a different pool + * configurations and bookkeeping per CephContext and + * also to be able to use same allocator to deal with + * RX and TX pool. + * TODO: use boost pool to allocate TX chunks too + */ + class mem_pool : public boost::pool<PoolAllocator> { + private: + MemPoolContext *ctx; + void *slow_malloc(); + + public: + explicit mem_pool(MemPoolContext *ctx, const size_type nrequested_size, + const size_type nnext_size = 32, + const size_type nmax_size = 0) : + pool(nrequested_size, nnext_size, nmax_size), + ctx(ctx) { } + + void *malloc() { + if (!store().empty()) + return (store().malloc)(); + // need to alloc more memory... + // slow path code + return slow_malloc(); + } + }; + + MemoryManager(CephContext *c, Device *d, ProtectionDomain *p); + ~MemoryManager(); + + void* malloc(size_t size); + void free(void *ptr); + + void create_tx_pool(uint32_t size, uint32_t tx_num); + void return_tx(std::vector<Chunk*> &chunks); + int get_send_buffers(std::vector<Chunk*> &c, size_t bytes); + bool is_tx_buffer(const char* c) { return send->is_my_buffer(c); } + Chunk *get_tx_chunk_by_buffer(const char *c) { + return send->get_chunk_by_buffer(c); + } + uint32_t get_tx_buffer_size() const { + return send->buffer_size; + } + + Chunk *get_rx_buffer() { + return reinterpret_cast<Chunk *>(rxbuf_pool.malloc()); + } + + void release_rx_buffer(Chunk *chunk) { + rxbuf_pool.free(chunk); + } + + void set_rx_stat_logger(PerfCounters *logger) { + rxbuf_pool_ctx.set_stat_logger(logger); + } + + CephContext *cct; + private: + // TODO: Cluster -> TxPool txbuf_pool + // chunk layout fix + // + Cluster* send = nullptr;// SEND + Device *device; + ProtectionDomain *pd; + MemPoolContext rxbuf_pool_ctx; + mem_pool rxbuf_pool; + + + void* huge_pages_malloc(size_t size); + void huge_pages_free(void *ptr); + }; + + private: + uint32_t tx_queue_len = 0; + uint32_t rx_queue_len = 0; + uint32_t max_sge = 0; + uint8_t ib_physical_port = 0; + MemoryManager* memory_manager = nullptr; + ibv_srq* srq = nullptr; // shared receive work queue + Device *device = NULL; + ProtectionDomain *pd = NULL; + DeviceList *device_list = nullptr; + void wire_gid_to_gid(const char *wgid, union ibv_gid *gid); + void gid_to_wire_gid(const union ibv_gid *gid, char wgid[]); + CephContext *cct; + Mutex lock; + bool initialized = false; + const std::string &device_name; + uint8_t port_num; + bool support_srq = false; + + public: + explicit Infiniband(CephContext *c); + ~Infiniband(); + void init(); + static void verify_prereq(CephContext *cct); + + class CompletionChannel { + static const uint32_t MAX_ACK_EVENT = 5000; + CephContext *cct; + Infiniband& infiniband; + ibv_comp_channel *channel; + ibv_cq *cq; + uint32_t cq_events_that_need_ack; + + public: + CompletionChannel(CephContext *c, Infiniband &ib); + ~CompletionChannel(); + int init(); + bool get_cq_event(); + int get_fd() { return channel->fd; } + ibv_comp_channel* get_channel() { return channel; } + void bind_cq(ibv_cq *c) { cq = c; } + void ack_events(); + }; + + // this class encapsulates the creation, use, and destruction of an RC + // completion queue. 
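/* [Editorial sketch, not part of the upstream commit]
 * For context on the mem_pool/PoolAllocator pair above: boost::pool<T>
 * delegates raw block allocation to its UserAllocator template
 * parameter, which only needs two typedefs and two static functions;
 * PoolAllocator plugs RDMA-registered memory in through exactly that
 * interface. A do-nothing allocator with the same shape, for
 * illustration:
 *
 *   #include <boost/pool/pool.hpp>
 *   #include <cstdlib>
 *
 *   struct plain_alloc {
 *     typedef std::size_t size_type;
 *     typedef std::ptrdiff_t difference_type;
 *     static char *malloc(const size_type n) {
 *       return static_cast<char *>(std::malloc(n));
 *     }
 *     static void free(char *const p) { std::free(p); }
 *   };
 *
 *   boost::pool<plain_alloc> p(128);  // pool of 128-byte elements
 *   void *one = p.malloc();           // grows via plain_alloc::malloc()
 */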
+ // + // You need to call init and it will create a cq and associate to comp channel + class CompletionQueue { + public: + CompletionQueue(CephContext *c, Infiniband &ib, + const uint32_t qd, CompletionChannel *cc) + : cct(c), infiniband(ib), channel(cc), cq(NULL), queue_depth(qd) {} + ~CompletionQueue(); + int init(); + int poll_cq(int num_entries, ibv_wc *ret_wc_array); + + ibv_cq* get_cq() const { return cq; } + int rearm_notify(bool solicited_only=true); + CompletionChannel* get_cc() const { return channel; } + private: + CephContext *cct; + Infiniband& infiniband; // Infiniband to which this QP belongs + CompletionChannel *channel; + ibv_cq *cq; + uint32_t queue_depth; + }; + + // this class encapsulates the creation, use, and destruction of an RC + // queue pair. + // + // you need call init and it will create a qp and bring it to the INIT state. + // after obtaining the lid, qpn, and psn of a remote queue pair, one + // must call plumb() to bring the queue pair to the RTS state. + class QueuePair { + public: + QueuePair(CephContext *c, Infiniband& infiniband, ibv_qp_type type, + int ib_physical_port, ibv_srq *srq, + Infiniband::CompletionQueue* txcq, + Infiniband::CompletionQueue* rxcq, + uint32_t tx_queue_len, uint32_t max_recv_wr, struct rdma_cm_id *cid, uint32_t q_key = 0); + ~QueuePair(); + + int init(); + + /** + * Get the initial packet sequence number for this QueuePair. + * This is randomly generated on creation. It should not be confused + * with the remote side's PSN, which is set in #plumb(). + */ + uint32_t get_initial_psn() const { return initial_psn; }; + /** + * Get the local queue pair number for this QueuePair. + * QPNs are analogous to UDP/TCP port numbers. + */ + uint32_t get_local_qp_number() const { return qp->qp_num; }; + /** + * Get the remote queue pair number for this QueuePair, as set in #plumb(). + * QPNs are analogous to UDP/TCP port numbers. + */ + int get_remote_qp_number(uint32_t *rqp) const; + /** + * Get the remote infiniband address for this QueuePair, as set in #plumb(). + * LIDs are "local IDs" in infiniband terminology. They are short, locally + * routable addresses. + */ + int get_remote_lid(uint16_t *lid) const; + /** + * Get the state of a QueuePair. + */ + int get_state() const; + /** + * Return true if the queue pair is in an error state, false otherwise. + */ + bool is_error() const; + void add_tx_wr(uint32_t amt) { tx_wr_inflight += amt; } + void dec_tx_wr(uint32_t amt) { tx_wr_inflight -= amt; } + uint32_t get_tx_wr() const { return tx_wr_inflight; } + ibv_qp* get_qp() const { return qp; } + Infiniband::CompletionQueue* get_tx_cq() const { return txcq; } + Infiniband::CompletionQueue* get_rx_cq() const { return rxcq; } + int to_dead(); + bool is_dead() const { return dead; } + + private: + CephContext *cct; + Infiniband& infiniband; // Infiniband to which this QP belongs + ibv_qp_type type; // QP type (IBV_QPT_RC, etc.) 
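  /* [Editorial note, not part of the upstream commit]
   * Lifecycle of this class, pieced together from the .cc files in this
   * commit: Infiniband::create_queue_pair() news the object and calls
   * init(), which brings the verbs QP to INIT; the peer's qpn/psn/lid/gid
   * then travel over a TCP side channel (Infiniband::send_msg/recv_msg),
   * and RDMAConnectedSocketImpl::activate() drives the two remaining
   * ibv_modify_qp() transitions:
   *
   *   RESET --init()--> INIT --(needs peer qpn/psn/gid)--> RTR
   *         --(needs own psn, timeouts, retry counts)----> RTS
   *
   * to_dead() later forces ERR to flush any outstanding work requests.
   */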
+ ibv_context* ctxt; // device context of the HCA to use + int ib_physical_port; + ibv_pd* pd; // protection domain + ibv_srq* srq; // shared receive queue + ibv_qp* qp; // infiniband verbs QP handle + struct rdma_cm_id *cm_id; + Infiniband::CompletionQueue* txcq; + Infiniband::CompletionQueue* rxcq; + uint32_t initial_psn; // initial packet sequence number + uint32_t max_send_wr; + uint32_t max_recv_wr; + uint32_t q_key; + bool dead; + std::atomic<uint32_t> tx_wr_inflight = {0}; // counter for inflight Tx WQEs + }; + + public: + typedef MemoryManager::Cluster Cluster; + typedef MemoryManager::Chunk Chunk; + QueuePair* create_queue_pair(CephContext *c, CompletionQueue*, CompletionQueue*, + ibv_qp_type type, struct rdma_cm_id *cm_id); + ibv_srq* create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge); + // post rx buffers to srq, return number of buffers actually posted + int post_chunks_to_rq(int num, ibv_qp *qp=NULL); + void post_chunk_to_pool(Chunk* chunk) { + get_memory_manager()->release_rx_buffer(chunk); + } + int get_tx_buffers(std::vector<Chunk*> &c, size_t bytes); + CompletionChannel *create_comp_channel(CephContext *c); + CompletionQueue *create_comp_queue(CephContext *c, CompletionChannel *cc=NULL); + uint8_t get_ib_physical_port() { return ib_physical_port; } + int send_msg(CephContext *cct, int sd, IBSYNMsg& msg); + int recv_msg(CephContext *cct, int sd, IBSYNMsg& msg); + uint16_t get_lid() { return device->get_lid(); } + ibv_gid get_gid() { return device->get_gid(); } + MemoryManager* get_memory_manager() { return memory_manager; } + Device* get_device() { return device; } + int get_async_fd() { return device->ctxt->async_fd; } + bool is_tx_buffer(const char* c) { return memory_manager->is_tx_buffer(c);} + Chunk *get_tx_chunk_by_buffer(const char *c) { return memory_manager->get_tx_chunk_by_buffer(c); } + static const char* wc_status_to_string(int status); + static const char* qp_state_string(int status); + uint32_t get_rx_queue_len() const { return rx_queue_len; } +}; + +#endif diff --git a/src/msg/async/rdma/RDMAConnectedSocketImpl.cc b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc new file mode 100644 index 00000000..89be7428 --- /dev/null +++ b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc @@ -0,0 +1,743 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 XSKY <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ +#include "RDMAStack.h" + +class C_handle_connection_established : public EventCallback { + RDMAConnectedSocketImpl *csi; + bool active = true; + public: + C_handle_connection_established(RDMAConnectedSocketImpl *w) : csi(w) {} + void do_request(uint64_t fd) final { + if (active) + csi->handle_connection_established(); + } + void close() { + active = false; + } +}; + +class C_handle_connection_read : public EventCallback { + RDMAConnectedSocketImpl *csi; + bool active = true; + public: + explicit C_handle_connection_read(RDMAConnectedSocketImpl *w): csi(w) {} + void do_request(uint64_t fd) final { + if (active) + csi->handle_connection(); + } + void close() { + active = false; + } +}; + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix *_dout << " RDMAConnectedSocketImpl " + +RDMAConnectedSocketImpl::RDMAConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s, + RDMAWorker *w) + : cct(cct), connected(0), error(0), infiniband(ib), + dispatcher(s), worker(w), lock("RDMAConnectedSocketImpl::lock"), + is_server(false), read_handler(new C_handle_connection_read(this)), + established_handler(new C_handle_connection_established(this)), + active(false), pending(false) +{ + if (!cct->_conf->ms_async_rdma_cm) { + qp = infiniband->create_queue_pair(cct, s->get_tx_cq(), s->get_rx_cq(), IBV_QPT_RC, NULL); + my_msg.qpn = qp->get_local_qp_number(); + my_msg.psn = qp->get_initial_psn(); + my_msg.lid = infiniband->get_lid(); + my_msg.peer_qpn = 0; + my_msg.gid = infiniband->get_gid(); + notify_fd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK); + dispatcher->register_qp(qp, this); + dispatcher->perf_logger->inc(l_msgr_rdma_created_queue_pair); + dispatcher->perf_logger->inc(l_msgr_rdma_active_queue_pair); + } +} + +RDMAConnectedSocketImpl::~RDMAConnectedSocketImpl() +{ + ldout(cct, 20) << __func__ << " destruct." 
<< dendl; + cleanup(); + worker->remove_pending_conn(this); + dispatcher->erase_qpn(my_msg.qpn); + + for (unsigned i=0; i < wc.size(); ++i) { + dispatcher->post_chunk_to_pool(reinterpret_cast<Chunk*>(wc[i].wr_id)); + } + for (unsigned i=0; i < buffers.size(); ++i) { + dispatcher->post_chunk_to_pool(buffers[i]); + } + + Mutex::Locker l(lock); + if (notify_fd >= 0) + ::close(notify_fd); + if (tcp_fd >= 0) + ::close(tcp_fd); + error = ECONNRESET; +} + +void RDMAConnectedSocketImpl::pass_wc(std::vector<ibv_wc> &&v) +{ + Mutex::Locker l(lock); + if (wc.empty()) + wc = std::move(v); + else + wc.insert(wc.end(), v.begin(), v.end()); + notify(); +} + +void RDMAConnectedSocketImpl::get_wc(std::vector<ibv_wc> &w) +{ + Mutex::Locker l(lock); + if (wc.empty()) + return ; + w.swap(wc); +} + +int RDMAConnectedSocketImpl::activate() +{ + ibv_qp_attr qpa; + int r; + + // now connect up the qps and switch to RTR + memset(&qpa, 0, sizeof(qpa)); + qpa.qp_state = IBV_QPS_RTR; + qpa.path_mtu = IBV_MTU_1024; + qpa.dest_qp_num = peer_msg.qpn; + qpa.rq_psn = peer_msg.psn; + qpa.max_dest_rd_atomic = 1; + qpa.min_rnr_timer = 12; + //qpa.ah_attr.is_global = 0; + qpa.ah_attr.is_global = 1; + qpa.ah_attr.grh.hop_limit = 6; + qpa.ah_attr.grh.dgid = peer_msg.gid; + + qpa.ah_attr.grh.sgid_index = infiniband->get_device()->get_gid_idx(); + + qpa.ah_attr.dlid = peer_msg.lid; + qpa.ah_attr.sl = cct->_conf->ms_async_rdma_sl; + qpa.ah_attr.grh.traffic_class = cct->_conf->ms_async_rdma_dscp; + qpa.ah_attr.src_path_bits = 0; + qpa.ah_attr.port_num = (uint8_t)(infiniband->get_ib_physical_port()); + + ldout(cct, 20) << __func__ << " Choosing gid_index " << (int)qpa.ah_attr.grh.sgid_index << ", sl " << (int)qpa.ah_attr.sl << dendl; + + r = ibv_modify_qp(qp->get_qp(), &qpa, IBV_QP_STATE | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | + IBV_QP_MIN_RNR_TIMER | + IBV_QP_MAX_DEST_RD_ATOMIC); + if (r) { + lderr(cct) << __func__ << " failed to transition to RTR state: " + << cpp_strerror(errno) << dendl; + return -1; + } + + ldout(cct, 20) << __func__ << " transition to RTR state successfully." << dendl; + + // now move to RTS + qpa.qp_state = IBV_QPS_RTS; + + // How long to wait before retrying if packet lost or server dead. + // Supposedly the timeout is 4.096us*2^timeout. However, the actual + // timeout appears to be 4.096us*2^(timeout+1), so the setting + // below creates a 135ms timeout. + qpa.timeout = 14; + + // How many times to retry after timeouts before giving up. + qpa.retry_cnt = 7; + + // How many times to retry after RNR (receiver not ready) condition + // before giving up. Occurs when the remote side has not yet posted + // a receive request. + qpa.rnr_retry = 7; // 7 is infinite retry. + qpa.sq_psn = my_msg.psn; + qpa.max_rd_atomic = 1; + + r = ibv_modify_qp(qp->get_qp(), &qpa, IBV_QP_STATE | + IBV_QP_TIMEOUT | + IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_SQ_PSN | + IBV_QP_MAX_QP_RD_ATOMIC); + if (r) { + lderr(cct) << __func__ << " failed to transition to RTS state: " + << cpp_strerror(errno) << dendl; + return -1; + } + + // the queue pair should be ready to use once the client has finished + // setting up their end. + ldout(cct, 20) << __func__ << " transition to RTS state successfully." << dendl; + ldout(cct, 20) << __func__ << " QueuePair: " << qp << " with qp:" << qp->get_qp() << dendl; + + if (!is_server) { + connected = 1; //indicate successfully + ldout(cct, 20) << __func__ << " handle fake send, wake it up. 
QP: " << my_msg.qpn << dendl; + submit(false); + } + active = true; + + return 0; +} + +int RDMAConnectedSocketImpl::try_connect(const entity_addr_t& peer_addr, const SocketOptions &opts) { + ldout(cct, 20) << __func__ << " nonblock:" << opts.nonblock << ", nodelay:" + << opts.nodelay << ", rbuf_size: " << opts.rcbuf_size << dendl; + NetHandler net(cct); + + // we construct a socket to transport ib sync message + // but we shouldn't block in tcp connecting + if (opts.nonblock) { + tcp_fd = net.nonblock_connect(peer_addr, opts.connect_bind_addr); + } else { + tcp_fd = net.connect(peer_addr, opts.connect_bind_addr); + } + + if (tcp_fd < 0) { + return -errno; + } + + int r = net.set_socket_options(tcp_fd, opts.nodelay, opts.rcbuf_size); + if (r < 0) { + ::close(tcp_fd); + tcp_fd = -1; + return -errno; + } + + ldout(cct, 20) << __func__ << " tcp_fd: " << tcp_fd << dendl; + net.set_priority(tcp_fd, opts.priority, peer_addr.get_family()); + r = 0; + if (opts.nonblock) { + worker->center.create_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE , established_handler); + } else { + r = handle_connection_established(false); + } + return r; +} + +int RDMAConnectedSocketImpl::handle_connection_established(bool need_set_fault) { + ldout(cct, 20) << __func__ << " start " << dendl; + // delete read event + worker->center.delete_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE); + if (1 == connected) { + ldout(cct, 1) << __func__ << " warnning: logic failed " << dendl; + if (need_set_fault) { + fault(); + } + return -1; + } + // send handshake msg to server + my_msg.peer_qpn = 0; + int r = infiniband->send_msg(cct, tcp_fd, my_msg); + if (r < 0) { + ldout(cct, 1) << __func__ << " send handshake msg failed." << r << dendl; + if (need_set_fault) { + fault(); + } + return r; + } + worker->center.create_file_event(tcp_fd, EVENT_READABLE, read_handler); + ldout(cct, 20) << __func__ << " finish " << dendl; + return 0; +} + +void RDMAConnectedSocketImpl::handle_connection() { + ldout(cct, 20) << __func__ << " QP: " << my_msg.qpn << " tcp_fd: " << tcp_fd << " notify_fd: " << notify_fd << dendl; + int r = infiniband->recv_msg(cct, tcp_fd, peer_msg); + if (r <= 0) { + if (r != -EAGAIN) { + dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors); + ldout(cct, 1) << __func__ << " recv handshake msg failed." << dendl; + fault(); + } + return; + } + + if (1 == connected) { + ldout(cct, 1) << __func__ << " warnning: logic failed: read len: " << r << dendl; + fault(); + return; + } + + if (!is_server) {// syn + ack from server + my_msg.peer_qpn = peer_msg.qpn; + ldout(cct, 20) << __func__ << " peer msg : < " << peer_msg.qpn << ", " << peer_msg.psn + << ", " << peer_msg.lid << ", " << peer_msg.peer_qpn << "> " << dendl; + if (!connected) { + r = activate(); + ceph_assert(!r); + } + notify(); + r = infiniband->send_msg(cct, tcp_fd, my_msg); + if (r < 0) { + ldout(cct, 1) << __func__ << " send client ack failed." << dendl; + dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors); + fault(); + } + } else { + if (peer_msg.peer_qpn == 0) {// syn from client + if (active) { + ldout(cct, 10) << __func__ << " server is already active." << dendl; + return ; + } + r = activate(); + ceph_assert(!r); + r = infiniband->send_msg(cct, tcp_fd, my_msg); + if (r < 0) { + ldout(cct, 1) << __func__ << " server ack failed." << dendl; + dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors); + fault(); + return ; + } + } else { // ack from client + connected = 1; + ldout(cct, 10) << __func__ << " handshake of rdma is done. 
server connected: " << connected << dendl; + //cleanup(); + submit(false); + notify(); + } + } +} + +ssize_t RDMAConnectedSocketImpl::read(char* buf, size_t len) +{ + uint64_t i = 0; + int r = ::read(notify_fd, &i, sizeof(i)); + ldout(cct, 20) << __func__ << " notify_fd : " << i << " in " << my_msg.qpn << " r = " << r << dendl; + + if (!active) { + ldout(cct, 1) << __func__ << " when ib not active. len: " << len << dendl; + return -EAGAIN; + } + + if (0 == connected) { + ldout(cct, 1) << __func__ << " when ib not connected. len: " << len <<dendl; + return -EAGAIN; + } + ssize_t read = 0; + if (!buffers.empty()) + read = read_buffers(buf,len); + + std::vector<ibv_wc> cqe; + get_wc(cqe); + if (cqe.empty()) { + if (!buffers.empty()) { + notify(); + } + if (read > 0) { + return read; + } + if (error) { + return -error; + } else { + return -EAGAIN; + } + } + + ldout(cct, 20) << __func__ << " poll queue got " << cqe.size() << " responses. QP: " << my_msg.qpn << dendl; + for (size_t i = 0; i < cqe.size(); ++i) { + ibv_wc* response = &cqe[i]; + ceph_assert(response->status == IBV_WC_SUCCESS); + Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id); + ldout(cct, 25) << __func__ << " chunk length: " << response->byte_len << " bytes." << chunk << dendl; + chunk->prepare_read(response->byte_len); + worker->perf_logger->inc(l_msgr_rdma_rx_bytes, response->byte_len); + if (response->byte_len == 0) { + dispatcher->perf_logger->inc(l_msgr_rdma_rx_fin); + if (connected) { + error = ECONNRESET; + ldout(cct, 20) << __func__ << " got remote close msg..." << dendl; + } + dispatcher->post_chunk_to_pool(chunk); + } else { + if (read == (ssize_t)len) { + buffers.push_back(chunk); + ldout(cct, 25) << __func__ << " buffers add a chunk: " << response->byte_len << dendl; + } else if (read + response->byte_len > (ssize_t)len) { + read += chunk->read(buf+read, (ssize_t)len-read); + buffers.push_back(chunk); + ldout(cct, 25) << __func__ << " buffers add a chunk: " << chunk->get_offset() << ":" << chunk->get_bound() << dendl; + } else { + read += chunk->read(buf+read, response->byte_len); + dispatcher->post_chunk_to_pool(chunk); + update_post_backlog(); + } + } + } + + worker->perf_logger->inc(l_msgr_rdma_rx_chunks, cqe.size()); + if (is_server && connected == 0) { + ldout(cct, 20) << __func__ << " we do not need last handshake, QP: " << my_msg.qpn << " peer QP: " << peer_msg.qpn << dendl; + connected = 1; //if so, we don't need the last handshake + cleanup(); + submit(false); + } + + if (!buffers.empty()) { + notify(); + } + + if (read == 0 && error) + return -error; + return read == 0 ? -EAGAIN : read; +} + +ssize_t RDMAConnectedSocketImpl::read_buffers(char* buf, size_t len) +{ + size_t read = 0, tmp = 0; + auto c = buffers.begin(); + for (; c != buffers.end() ; ++c) { + tmp = (*c)->read(buf+read, len-read); + read += tmp; + ldout(cct, 25) << __func__ << " this iter read: " << tmp << " bytes." << " offset: " << (*c)->get_offset() << " ,bound: " << (*c)->get_bound() << ". Chunk:" << *c << dendl; + if ((*c)->over()) { + dispatcher->post_chunk_to_pool(*c); + update_post_backlog(); + ldout(cct, 25) << __func__ << " one chunk over." 
<< dendl; + } + if (read == len) { + break; + } + } + + if (c != buffers.end() && (*c)->over()) + ++c; + buffers.erase(buffers.begin(), c); + ldout(cct, 25) << __func__ << " got " << read << " bytes, buffers size: " << buffers.size() << dendl; + return read; +} + +ssize_t RDMAConnectedSocketImpl::zero_copy_read(bufferptr &data) +{ + if (error) + return -error; + static const int MAX_COMPLETIONS = 16; + ibv_wc wc[MAX_COMPLETIONS]; + ssize_t size = 0; + + ibv_wc* response; + Chunk* chunk; + bool loaded = false; + auto iter = buffers.begin(); + if (iter != buffers.end()) { + chunk = *iter; + // FIXME need to handle release + // auto del = std::bind(&Chunk::post_srq, std::move(chunk), infiniband); + buffers.erase(iter); + loaded = true; + size = chunk->bound; + } + + std::vector<ibv_wc> cqe; + get_wc(cqe); + if (cqe.empty()) + return size == 0 ? -EAGAIN : size; + + ldout(cct, 20) << __func__ << " pool completion queue got " << cqe.size() << " responses."<< dendl; + + for (size_t i = 0; i < cqe.size(); ++i) { + response = &wc[i]; + chunk = reinterpret_cast<Chunk*>(response->wr_id); + chunk->prepare_read(response->byte_len); + if (!loaded && i == 0) { + // FIXME need to handle release + // auto del = std::bind(&Chunk::post_srq, std::move(chunk), infiniband); + size = chunk->bound; + continue; + } + buffers.push_back(chunk); + iter++; + } + + if (size == 0) + return -EAGAIN; + return size; +} + +ssize_t RDMAConnectedSocketImpl::send(bufferlist &bl, bool more) +{ + if (error) { + if (!active) + return -EPIPE; + return -error; + } + size_t bytes = bl.length(); + if (!bytes) + return 0; + { + Mutex::Locker l(lock); + pending_bl.claim_append(bl); + if (!connected) { + ldout(cct, 20) << __func__ << " fake send to upper, QP: " << my_msg.qpn << dendl; + return bytes; + } + } + ldout(cct, 20) << __func__ << " QP: " << my_msg.qpn << dendl; + ssize_t r = submit(more); + if (r < 0 && r != -EAGAIN) + return r; + return bytes; +} + +ssize_t RDMAConnectedSocketImpl::submit(bool more) +{ + if (error) + return -error; + Mutex::Locker l(lock); + size_t bytes = pending_bl.length(); + ldout(cct, 20) << __func__ << " we need " << bytes << " bytes. 
iov size: " + << pending_bl.buffers().size() << dendl; + if (!bytes) + return 0; + + auto fill_tx_via_copy = [this](std::vector<Chunk*> &tx_buffers, + unsigned bytes, + auto& start, + const auto& end) -> unsigned { + ceph_assert(start != end); + auto chunk_idx = tx_buffers.size(); + int ret = worker->get_reged_mem(this, tx_buffers, bytes); + if (ret == 0) { + ldout(cct, 1) << __func__ << " no enough buffers in worker " << worker << dendl; + worker->perf_logger->inc(l_msgr_rdma_tx_no_mem); + return 0; + } + + unsigned total_copied = 0; + Chunk *current_chunk = tx_buffers[chunk_idx]; + while (start != end) { + const uintptr_t addr = reinterpret_cast<uintptr_t>(start->c_str()); + unsigned copied = 0; + while (copied < start->length()) { + uint32_t r = current_chunk->write((char*)addr+copied, start->length() - copied); + copied += r; + total_copied += r; + bytes -= r; + if (current_chunk->full()){ + if (++chunk_idx == tx_buffers.size()) + return total_copied; + current_chunk = tx_buffers[chunk_idx]; + } + } + ++start; + } + ceph_assert(bytes == 0); + return total_copied; + }; + + std::vector<Chunk*> tx_buffers; + auto it = std::cbegin(pending_bl.buffers()); + auto copy_it = it; + unsigned total = 0; + unsigned need_reserve_bytes = 0; + while (it != pending_bl.buffers().end()) { + if (infiniband->is_tx_buffer(it->raw_c_str())) { + if (need_reserve_bytes) { + unsigned copied = fill_tx_via_copy(tx_buffers, need_reserve_bytes, copy_it, it); + total += copied; + if (copied < need_reserve_bytes) + goto sending; + need_reserve_bytes = 0; + } + ceph_assert(copy_it == it); + tx_buffers.push_back(infiniband->get_tx_chunk_by_buffer(it->raw_c_str())); + total += it->length(); + ++copy_it; + } else { + need_reserve_bytes += it->length(); + } + ++it; + } + if (need_reserve_bytes) + total += fill_tx_via_copy(tx_buffers, need_reserve_bytes, copy_it, it); + + sending: + if (total == 0) + return -EAGAIN; + ceph_assert(total <= pending_bl.length()); + bufferlist swapped; + if (total < pending_bl.length()) { + worker->perf_logger->inc(l_msgr_rdma_tx_parital_mem); + pending_bl.splice(total, pending_bl.length()-total, &swapped); + pending_bl.swap(swapped); + } else { + pending_bl.clear(); + } + + ldout(cct, 20) << __func__ << " left bytes: " << pending_bl.length() << " in buffers " + << pending_bl.buffers().size() << " tx chunks " << tx_buffers.size() << dendl; + + int r = post_work_request(tx_buffers); + if (r < 0) + return r; + + ldout(cct, 20) << __func__ << " finished sending " << bytes << " bytes." << dendl; + return pending_bl.length() ? -EAGAIN : 0; +} + +int RDMAConnectedSocketImpl::post_work_request(std::vector<Chunk*> &tx_buffers) +{ + ldout(cct, 20) << __func__ << " QP: " << my_msg.qpn << " " << tx_buffers[0] << dendl; + vector<Chunk*>::iterator current_buffer = tx_buffers.begin(); + ibv_sge isge[tx_buffers.size()]; + uint32_t current_sge = 0; + ibv_send_wr iswr[tx_buffers.size()]; + uint32_t current_swr = 0; + ibv_send_wr* pre_wr = NULL; + uint32_t num = 0; + + // FIPS zeroization audit 20191115: these memsets are not security related. 
+ memset(iswr, 0, sizeof(iswr)); + memset(isge, 0, sizeof(isge)); + + while (current_buffer != tx_buffers.end()) { + isge[current_sge].addr = reinterpret_cast<uint64_t>((*current_buffer)->buffer); + isge[current_sge].length = (*current_buffer)->get_offset(); + isge[current_sge].lkey = (*current_buffer)->mr->lkey; + ldout(cct, 25) << __func__ << " sending buffer: " << *current_buffer << " length: " << isge[current_sge].length << dendl; + + iswr[current_swr].wr_id = reinterpret_cast<uint64_t>(*current_buffer); + iswr[current_swr].next = NULL; + iswr[current_swr].sg_list = &isge[current_sge]; + iswr[current_swr].num_sge = 1; + iswr[current_swr].opcode = IBV_WR_SEND; + iswr[current_swr].send_flags = IBV_SEND_SIGNALED; + /*if (isge[current_sge].length < infiniband->max_inline_data) { + iswr[current_swr].send_flags = IBV_SEND_INLINE; + ldout(cct, 20) << __func__ << " send_inline." << dendl; + }*/ + + num++; + worker->perf_logger->inc(l_msgr_rdma_tx_bytes, isge[current_sge].length); + if (pre_wr) + pre_wr->next = &iswr[current_swr]; + pre_wr = &iswr[current_swr]; + ++current_sge; + ++current_swr; + ++current_buffer; + } + + ibv_send_wr *bad_tx_work_request; + if (ibv_post_send(qp->get_qp(), iswr, &bad_tx_work_request)) { + ldout(cct, 1) << __func__ << " failed to send data" + << " (most probably should be peer not ready): " + << cpp_strerror(errno) << dendl; + worker->perf_logger->inc(l_msgr_rdma_tx_failed); + return -errno; + } + qp->add_tx_wr(num); + worker->perf_logger->inc(l_msgr_rdma_tx_chunks, tx_buffers.size()); + ldout(cct, 20) << __func__ << " qp state is " << Infiniband::qp_state_string(qp->get_state()) << dendl; + return 0; +} + +void RDMAConnectedSocketImpl::fin() { + ibv_send_wr wr; + // FIPS zeroization audit 20191115: this memset is not security related. 
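/* [Editorial note, not part of the upstream commit]
 * fin() posts a zero-length, zero-SGE IBV_WR_SEND. The peer still gets a
 * receive completion for it, just with byte_len == 0, which read() above
 * treats as the remote-close marker (it sets error = ECONNRESET), so the
 * empty send plays the role of TCP's FIN on this transport:
 *
 *   // receiver side, after ibv_poll_cq() (cf. read() above):
 *   //   if (wc.byte_len == 0)  ->  peer closed; mark ECONNRESET
 *
 * Note that wr_id is set to the QP pointer rather than a Chunk pointer,
 * presumably so the TX completion path can tell this control message
 * apart from ordinary data chunks.
 */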
+ memset(&wr, 0, sizeof(wr)); + + wr.wr_id = reinterpret_cast<uint64_t>(qp); + wr.num_sge = 0; + wr.opcode = IBV_WR_SEND; + wr.send_flags = IBV_SEND_SIGNALED; + ibv_send_wr* bad_tx_work_request; + if (ibv_post_send(qp->get_qp(), &wr, &bad_tx_work_request)) { + ldout(cct, 1) << __func__ << " failed to send message=" + << " ibv_post_send failed(most probably should be peer not ready): " + << cpp_strerror(errno) << dendl; + worker->perf_logger->inc(l_msgr_rdma_tx_failed); + return ; + } + qp->add_tx_wr(1); +} + +void RDMAConnectedSocketImpl::cleanup() { + if (read_handler && tcp_fd >= 0) { + (static_cast<C_handle_connection_read*>(read_handler))->close(); + worker->center.submit_to(worker->center.get_id(), [this]() { + worker->center.delete_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE); + }, false); + delete read_handler; + read_handler = nullptr; + } + if (established_handler) { + (static_cast<C_handle_connection_established*>(established_handler))->close(); + delete established_handler; + established_handler = nullptr; + } +} + +void RDMAConnectedSocketImpl::notify() +{ + // note: notify_fd is an event fd (man eventfd) + // write argument must be a 64bit integer + uint64_t i = 1; + + ceph_assert(sizeof(i) == write(notify_fd, &i, sizeof(i))); +} + +void RDMAConnectedSocketImpl::shutdown() +{ + if (!error) + fin(); + error = ECONNRESET; + active = false; +} + +void RDMAConnectedSocketImpl::close() +{ + if (!error) + fin(); + error = ECONNRESET; + active = false; +} + +void RDMAConnectedSocketImpl::fault() +{ + ldout(cct, 1) << __func__ << " tcp fd " << tcp_fd << dendl; + /*if (qp) { + qp->to_dead(); + qp = NULL; + }*/ + error = ECONNRESET; + connected = 1; + notify(); +} + +void RDMAConnectedSocketImpl::set_accept_fd(int sd) +{ + tcp_fd = sd; + is_server = true; + worker->center.submit_to(worker->center.get_id(), [this]() { + worker->center.create_file_event(tcp_fd, EVENT_READABLE, read_handler); + }, true); +} + +void RDMAConnectedSocketImpl::post_chunks_to_rq(int num) +{ + post_backlog += num - infiniband->post_chunks_to_rq(num, qp->get_qp()); +} + +void RDMAConnectedSocketImpl::update_post_backlog() +{ + if (post_backlog) + post_backlog -= post_backlog - dispatcher->post_chunks_to_rq(post_backlog, qp->get_qp()); +} diff --git a/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc b/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc new file mode 100644 index 00000000..432c2d2b --- /dev/null +++ b/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc @@ -0,0 +1,183 @@ +#include "RDMAStack.h" + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix *_dout << " RDMAIWARPConnectedSocketImpl " + +#define TIMEOUT_MS 3000 +#define RETRY_COUNT 7 + +RDMAIWARPConnectedSocketImpl::RDMAIWARPConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s, + RDMAWorker *w, RDMACMInfo *info) + : RDMAConnectedSocketImpl(cct, ib, s, w), cm_con_handler(new C_handle_cm_connection(this)) +{ + status = IDLE; + notify_fd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK); + if (info) { + is_server = true; + cm_id = info->cm_id; + cm_channel = info->cm_channel; + status = RDMA_ID_CREATED; + remote_qpn = info->qp_num; + if (alloc_resource()) { + close_notify(); + return; + } + worker->center.submit_to(worker->center.get_id(), [this]() { + worker->center.create_file_event(cm_channel->fd, EVENT_READABLE, cm_con_handler); + status = CHANNEL_FD_CREATED; + }, false); + status = RESOURCE_ALLOCATED; + local_qpn = qp->get_local_qp_number(); + my_msg.qpn = local_qpn; + } else { + is_server = false; + 
+    cm_channel = rdma_create_event_channel();
+    rdma_create_id(cm_channel, &cm_id, NULL, RDMA_PS_TCP);
+    status = RDMA_ID_CREATED;
+    ldout(cct, 20) << __func__ << " successfully created cm id: " << cm_id << dendl;
+  }
+}
+
+RDMAIWARPConnectedSocketImpl::~RDMAIWARPConnectedSocketImpl() {
+  ldout(cct, 20) << __func__ << " destruct." << dendl;
+  std::unique_lock l(close_mtx);
+  close_condition.wait(l, [&] { return closed; });
+  if (status >= RDMA_ID_CREATED) {
+    rdma_destroy_id(cm_id);
+    rdma_destroy_event_channel(cm_channel);
+  }
+}
+
+int RDMAIWARPConnectedSocketImpl::try_connect(const entity_addr_t& peer_addr, const SocketOptions &opts) {
+  worker->center.create_file_event(cm_channel->fd, EVENT_READABLE, cm_con_handler);
+  status = CHANNEL_FD_CREATED;
+  if (rdma_resolve_addr(cm_id, NULL, const_cast<struct sockaddr*>(peer_addr.get_sockaddr()), TIMEOUT_MS)) {
+    lderr(cct) << __func__ << " failed to resolve addr" << dendl;
+    return -1;
+  }
+  return 0;
+}
+
+void RDMAIWARPConnectedSocketImpl::close() {
+  error = ECONNRESET;
+  active = false;
+  if (status >= CONNECTED) {
+    rdma_disconnect(cm_id);
+  }
+  close_notify();
+}
+
+void RDMAIWARPConnectedSocketImpl::shutdown() {
+  error = ECONNRESET;
+  active = false;
+}
+
+void RDMAIWARPConnectedSocketImpl::handle_cm_connection() {
+  struct rdma_cm_event *event;
+  rdma_get_cm_event(cm_channel, &event);
+  ldout(cct, 20) << __func__ << " event name: " << rdma_event_str(event->event)
+                 << " (cm id: " << cm_id << ")" << dendl;
+  struct rdma_conn_param cm_params;
+  switch (event->event) {
+    case RDMA_CM_EVENT_ADDR_RESOLVED:
+      status = ADDR_RESOLVED;
+      if (rdma_resolve_route(cm_id, TIMEOUT_MS)) {
+        lderr(cct) << __func__ << " failed to resolve rdma route" << dendl;
+        notify();
+      }
+      break;
+
+    case RDMA_CM_EVENT_ROUTE_RESOLVED:
+      status = ROUTE_RESOLVED;
+      if (alloc_resource()) {
+        lderr(cct) << __func__ << " failed to alloc resource while resolving the route" << dendl;
+        connected = -ECONNREFUSED;
+        notify();
+        break;
+      }
+      local_qpn = qp->get_local_qp_number();
+      my_msg.qpn = local_qpn;
+
+      // FIPS zeroization audit 20191115: this memset is not security related.
+      memset(&cm_params, 0, sizeof(cm_params));
+      cm_params.retry_count = RETRY_COUNT;
+      cm_params.qp_num = local_qpn;
+      if (rdma_connect(cm_id, &cm_params)) {
+        lderr(cct) << __func__ << " failed to connect remote rdma port" << dendl;
+        connected = -ECONNREFUSED;
+        notify();
+      }
+      break;
+
+    case RDMA_CM_EVENT_ESTABLISHED:
+      ldout(cct, 20) << __func__ << " qp_num=" << cm_id->qp->qp_num << dendl;
+      status = CONNECTED;
+      if (!is_server) {
+        remote_qpn = event->param.conn.qp_num;
+        activate();
+        notify();
+      }
+      break;
+
+    case RDMA_CM_EVENT_ADDR_ERROR:
+    case RDMA_CM_EVENT_ROUTE_ERROR:
+    case RDMA_CM_EVENT_CONNECT_ERROR:
+    case RDMA_CM_EVENT_UNREACHABLE:
+    case RDMA_CM_EVENT_REJECTED:
+      lderr(cct) << __func__ << " rdma connection rejected" << dendl;
+      connected = -ECONNREFUSED;
+      notify();
+      break;
+
+    case RDMA_CM_EVENT_DISCONNECTED:
+      status = DISCONNECTED;
+      close_notify();
+      if (!error) {
+        error = ECONNRESET;
+        notify();
+      }
+      break;
+
+    case RDMA_CM_EVENT_DEVICE_REMOVAL:
+      break;
+
+    default:
+      ceph_abort_msg("unhandled event");
+      break;
+  }
+  rdma_ack_cm_event(event);
+}
+
+void RDMAIWARPConnectedSocketImpl::activate() {
+  ldout(cct, 30) << __func__ << dendl;
+  active = true;
+  connected = 1;
+}
+
+int RDMAIWARPConnectedSocketImpl::alloc_resource() {
+  ldout(cct, 30) << __func__ << dendl;
+  qp = infiniband->create_queue_pair(cct, dispatcher->get_tx_cq(),
+      dispatcher->get_rx_cq(), IBV_QPT_RC, cm_id);
+  if (!qp) {
+    return -1;
+  }
+  if (!cct->_conf->ms_async_rdma_support_srq)
+    dispatcher->post_chunks_to_rq(infiniband->get_rx_queue_len(), qp->get_qp());
+  dispatcher->register_qp(qp, this);
+  dispatcher->perf_logger->inc(l_msgr_rdma_created_queue_pair);
+  dispatcher->perf_logger->inc(l_msgr_rdma_active_queue_pair);
+  return 0;
+}
+
+void RDMAIWARPConnectedSocketImpl::close_notify() {
+  ldout(cct, 30) << __func__ << dendl;
+  if (status >= CHANNEL_FD_CREATED) {
+    worker->center.delete_file_event(cm_channel->fd, EVENT_READABLE);
+  }
+  std::unique_lock l(close_mtx);
+  if (!closed) {
+    closed = true;
+    close_condition.notify_all();
+  }
+}
diff --git a/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc b/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc
new file mode 100644
index 00000000..210eaf00
--- /dev/null
+++ b/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc
@@ -0,0 +1,107 @@
+#include <poll.h>
+
+#include "msg/async/net_handler.h"
+#include "RDMAStack.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << " RDMAIWARPServerSocketImpl "
+
+RDMAIWARPServerSocketImpl::RDMAIWARPServerSocketImpl(
+  CephContext *cct, Infiniband* i,
+  RDMADispatcher *s, RDMAWorker *w, entity_addr_t& a, unsigned addr_slot)
+  : RDMAServerSocketImpl(cct, i, s, w, a, addr_slot)
+{
+}
+
+int RDMAIWARPServerSocketImpl::listen(entity_addr_t &sa,
+                                      const SocketOptions &opt)
+{
+  ldout(cct, 20) << __func__ << " bind to rdma endpoint" << dendl;
+  cm_channel = rdma_create_event_channel();
+  rdma_create_id(cm_channel, &cm_id, NULL, RDMA_PS_TCP);
+  ldout(cct, 20) << __func__ << " successfully created cm id: " << cm_id << dendl;
+  int rc = rdma_bind_addr(cm_id, const_cast<struct sockaddr*>(sa.get_sockaddr()));
+  if (rc < 0) {
+    rc = -errno;
+    ldout(cct, 10) << __func__ << " unable to bind to " << sa.get_sockaddr()
+                   << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl;
+    goto err;
+  }
+  rc = rdma_listen(cm_id, 128);
+  if (rc < 0) {
+    rc = -errno;
+    ldout(cct, 10) << __func__ << " unable to listen on " << sa.get_sockaddr()
+                   << " port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl;
+    goto err;
+  }
+  server_setup_socket = cm_channel->fd;
+  ldout(cct, 20) << __func__ << " fd of cm_channel is " << server_setup_socket << dendl;
+  return 0;
+
+err:
+  server_setup_socket = -1;
+  rdma_destroy_id(cm_id);
+  rdma_destroy_event_channel(cm_channel);
+  return rc;
+}
+
+int RDMAIWARPServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt,
+                                      entity_addr_t *out, Worker *w)
+{
+  ldout(cct, 15) << __func__ << dendl;
+
+  ceph_assert(sock);
+  struct pollfd pfd = {
+    .fd = cm_channel->fd,
+    .events = POLLIN,
+  };
+  int ret = poll(&pfd, 1, 0);
+  ceph_assert(ret >= 0);
+  if (!ret)
+    return -EAGAIN;
+
+  struct rdma_cm_event *cm_event;
+  rdma_get_cm_event(cm_channel, &cm_event);
+  ldout(cct, 20) << __func__ << " event name: " << rdma_event_str(cm_event->event) << dendl;
+
+  struct rdma_cm_id *event_cm_id = cm_event->id;
+  struct rdma_event_channel *event_channel = rdma_create_event_channel();
+
+  rdma_migrate_id(event_cm_id, event_channel);
+
+  struct rdma_cm_id *new_cm_id = event_cm_id;
+  struct rdma_conn_param *remote_conn_param = &cm_event->param.conn;
+  struct rdma_conn_param local_conn_param;
+
+  RDMACMInfo info(new_cm_id, event_channel, remote_conn_param->qp_num);
+  RDMAIWARPConnectedSocketImpl* server =
+    new RDMAIWARPConnectedSocketImpl(cct, infiniband, dispatcher, dynamic_cast<RDMAWorker*>(w), &info);
+
+  // FIPS zeroization audit 20191115: this memset is not security related.
+  memset(&local_conn_param, 0, sizeof(local_conn_param));
+  local_conn_param.qp_num = server->get_local_qpn();
+
+  if (rdma_accept(new_cm_id, &local_conn_param)) {
+    return -EAGAIN;
+  }
+  server->activate();
+  ldout(cct, 20) << __func__ << " accepted a new QP" << dendl;
+
+  rdma_ack_cm_event(cm_event);
+
+  std::unique_ptr<RDMAConnectedSocketImpl> csi(server);
+  *sock = ConnectedSocket(std::move(csi));
+  struct sockaddr *addr = &new_cm_id->route.addr.dst_addr;
+  out->set_sockaddr(addr);
+
+  return 0;
+}
+
+void RDMAIWARPServerSocketImpl::abort_accept()
+{
+  if (server_setup_socket >= 0) {
+    rdma_destroy_id(cm_id);
+    rdma_destroy_event_channel(cm_channel);
+  }
+}
diff --git a/src/msg/async/rdma/RDMAServerSocketImpl.cc b/src/msg/async/rdma/RDMAServerSocketImpl.cc
new file mode 100644
index 00000000..98402cfd
--- /dev/null
+++ b/src/msg/async/rdma/RDMAServerSocketImpl.cc
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * + */ + +#include "msg/async/net_handler.h" +#include "RDMAStack.h" + +#include "include/compat.h" +#include "include/sock_compat.h" + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix *_dout << " RDMAServerSocketImpl " + +RDMAServerSocketImpl::RDMAServerSocketImpl( + CephContext *cct, Infiniband* i, RDMADispatcher *s, RDMAWorker *w, + entity_addr_t& a, unsigned slot) + : ServerSocketImpl(a.get_type(), slot), + cct(cct), net(cct), server_setup_socket(-1), infiniband(i), + dispatcher(s), worker(w), sa(a) +{ +} + +int RDMAServerSocketImpl::listen(entity_addr_t &sa, const SocketOptions &opt) +{ + int rc = 0; + server_setup_socket = net.create_socket(sa.get_family(), true); + if (server_setup_socket < 0) { + rc = -errno; + lderr(cct) << __func__ << " failed to create server socket: " + << cpp_strerror(errno) << dendl; + return rc; + } + + rc = net.set_nonblock(server_setup_socket); + if (rc < 0) { + goto err; + } + + rc = net.set_socket_options(server_setup_socket, opt.nodelay, opt.rcbuf_size); + if (rc < 0) { + goto err; + } + + rc = ::bind(server_setup_socket, sa.get_sockaddr(), sa.get_sockaddr_len()); + if (rc < 0) { + rc = -errno; + ldout(cct, 10) << __func__ << " unable to bind to " << sa.get_sockaddr() + << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl; + goto err; + } + + rc = ::listen(server_setup_socket, cct->_conf->ms_tcp_listen_backlog); + if (rc < 0) { + rc = -errno; + lderr(cct) << __func__ << " unable to listen on " << sa << ": " << cpp_strerror(errno) << dendl; + goto err; + } + + ldout(cct, 20) << __func__ << " bind to " << sa.get_sockaddr() << " on port " << sa.get_port() << dendl; + return 0; + +err: + ::close(server_setup_socket); + server_setup_socket = -1; + return rc; +} + +int RDMAServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w) +{ + ldout(cct, 15) << __func__ << dendl; + + ceph_assert(sock); + + sockaddr_storage ss; + socklen_t slen = sizeof(ss); + int sd = accept_cloexec(server_setup_socket, (sockaddr*)&ss, &slen); + if (sd < 0) { + return -errno; + } + + int r = net.set_nonblock(sd); + if (r < 0) { + ::close(sd); + return -errno; + } + + r = net.set_socket_options(sd, opt.nodelay, opt.rcbuf_size); + if (r < 0) { + ::close(sd); + return -errno; + } + + ceph_assert(NULL != out); //out should not be NULL in accept connection + + out->set_type(addr_type); + out->set_sockaddr((sockaddr*)&ss); + net.set_priority(sd, opt.priority, out->get_family()); + + RDMAConnectedSocketImpl* server; + //Worker* w = dispatcher->get_stack()->get_worker(); + server = new RDMAConnectedSocketImpl(cct, infiniband, dispatcher, dynamic_cast<RDMAWorker*>(w)); + server->set_accept_fd(sd); + ldout(cct, 20) << __func__ << " accepted a new QP, tcp_fd: " << sd << dendl; + std::unique_ptr<RDMAConnectedSocketImpl> csi(server); + *sock = ConnectedSocket(std::move(csi)); + + return 0; +} + +void RDMAServerSocketImpl::abort_accept() +{ + if (server_setup_socket >= 0) + ::close(server_setup_socket); +} diff --git a/src/msg/async/rdma/RDMAStack.cc b/src/msg/async/rdma/RDMAStack.cc new file mode 100644 index 00000000..f63a8e7d --- /dev/null +++ b/src/msg/async/rdma/RDMAStack.cc @@ -0,0 +1,610 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 XSKY <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + 
* modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <poll.h>
+#include <errno.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "include/str_list.h"
+#include "include/compat.h"
+#include "common/Cycles.h"
+#include "common/deleter.h"
+#include "common/Tub.h"
+#include "RDMAStack.h"
+
+#define dout_subsys ceph_subsys_ms
+#undef dout_prefix
+#define dout_prefix *_dout << "RDMAStack "
+
+RDMADispatcher::~RDMADispatcher()
+{
+  ldout(cct, 20) << __func__ << " destructing rdma dispatcher" << dendl;
+  polling_stop();
+
+  ceph_assert(qp_conns.empty());
+  ceph_assert(num_qp_conn == 0);
+  ceph_assert(dead_queue_pairs.empty());
+  ceph_assert(num_dead_queue_pair == 0);
+
+  delete async_handler;
+}
+
+RDMADispatcher::RDMADispatcher(CephContext* c, RDMAStack* s)
+  : cct(c), async_handler(new C_handle_cq_async(this)), lock("RDMADispatcher::lock"),
+    w_lock("RDMADispatcher::for worker pending list"), stack(s)
+{
+  PerfCountersBuilder plb(cct, "AsyncMessenger::RDMADispatcher", l_msgr_rdma_dispatcher_first, l_msgr_rdma_dispatcher_last);
+
+  plb.add_u64_counter(l_msgr_rdma_polling, "polling", "Whether dispatcher thread is polling");
+  plb.add_u64_counter(l_msgr_rdma_inflight_tx_chunks, "inflight_tx_chunks", "The number of inflight tx chunks");
+  plb.add_u64_counter(l_msgr_rdma_rx_bufs_in_use, "rx_bufs_in_use", "The number of rx buffers that are holding data and being processed");
+  plb.add_u64_counter(l_msgr_rdma_rx_bufs_total, "rx_bufs_total", "The total number of rx buffers");
+
+  plb.add_u64_counter(l_msgr_rdma_tx_total_wc, "tx_total_wc", "The number of tx work completions");
+  plb.add_u64_counter(l_msgr_rdma_tx_total_wc_errors, "tx_total_wc_errors", "The number of tx errors");
+  plb.add_u64_counter(l_msgr_rdma_tx_wc_retry_errors, "tx_retry_errors", "The number of tx retry errors");
+  plb.add_u64_counter(l_msgr_rdma_tx_wc_wr_flush_errors, "tx_wr_flush_errors", "The number of tx work request flush errors");
+
+  plb.add_u64_counter(l_msgr_rdma_rx_total_wc, "rx_total_wc", "The number of rx work completions");
+  plb.add_u64_counter(l_msgr_rdma_rx_total_wc_errors, "rx_total_wc_errors", "The number of rx error work completions");
+  plb.add_u64_counter(l_msgr_rdma_rx_fin, "rx_fin", "The number of rx finish work requests");
+
+  plb.add_u64_counter(l_msgr_rdma_total_async_events, "total_async_events", "The number of async events");
+  plb.add_u64_counter(l_msgr_rdma_async_last_wqe_events, "async_last_wqe_events", "The number of last wqe events");
+
+  plb.add_u64_counter(l_msgr_rdma_handshake_errors, "handshake_errors", "The number of handshake errors");
+
+  plb.add_u64_counter(l_msgr_rdma_created_queue_pair, "created_queue_pair", "The number of queue pairs created");
+  plb.add_u64_counter(l_msgr_rdma_active_queue_pair, "active_queue_pair", "The number of active queue pairs");
+
+  perf_logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perf_logger);
+  Cycles::init();
+}
+
+void RDMADispatcher::polling_start()
+{
+  // take lock because listen/connect can happen from different worker threads
+  Mutex::Locker l(lock);
+
+  if (t.joinable())
+    return; // dispatcher thread already running
+
+  get_stack()->get_infiniband().get_memory_manager()->set_rx_stat_logger(perf_logger);
+
+  tx_cc = get_stack()->get_infiniband().create_comp_channel(cct);
+  ceph_assert(tx_cc);
+  rx_cc = get_stack()->get_infiniband().create_comp_channel(cct);
+  ceph_assert(rx_cc);
+  tx_cq = get_stack()->get_infiniband().create_comp_queue(cct, tx_cc);
+  ceph_assert(tx_cq);
+  rx_cq = get_stack()->get_infiniband().create_comp_queue(cct, rx_cc);
+  ceph_assert(rx_cq);
+
+  t = std::thread(&RDMADispatcher::polling, this);
+  ceph_pthread_setname(t.native_handle(), "rdma-polling");
+}
+
+void RDMADispatcher::polling_stop()
+{
+  {
+    Mutex::Locker l(lock);
+    done = true;
+  }
+
+  if (!t.joinable())
+    return;
+
+  t.join();
+
+  tx_cc->ack_events();
+  rx_cc->ack_events();
+  delete tx_cq;
+  delete rx_cq;
+  delete tx_cc;
+  delete rx_cc;
+}
+
+void RDMADispatcher::handle_async_event()
+{
+  ldout(cct, 30) << __func__ << dendl;
+  while (1) {
+    ibv_async_event async_event;
+    if (ibv_get_async_event(get_stack()->get_infiniband().get_device()->ctxt, &async_event)) {
+      if (errno != EAGAIN)
+        lderr(cct) << __func__ << " ibv_get_async_event failed. (errno=" << errno
+                   << " " << cpp_strerror(errno) << ")" << dendl;
+      return;
+    }
+    perf_logger->inc(l_msgr_rdma_total_async_events);
+    // FIXME: Currently we must ensure no other factor moves a QP into the
+    // ERROR state; otherwise that qp can't be deleted in the current cleanup flow.
+    if (async_event.event_type == IBV_EVENT_QP_LAST_WQE_REACHED) {
+      perf_logger->inc(l_msgr_rdma_async_last_wqe_events);
+      uint64_t qpn = async_event.element.qp->qp_num;
+      ldout(cct, 10) << __func__ << " event associated qp=" << async_event.element.qp
+                     << " evt: " << ibv_event_type_str(async_event.event_type) << dendl;
+      Mutex::Locker l(lock);
+      RDMAConnectedSocketImpl *conn = get_conn_lockless(qpn);
+      if (!conn) {
+        ldout(cct, 1) << __func__ << " missing qp_num=" << qpn << " discard event" << dendl;
+      } else {
+        ldout(cct, 1) << __func__ << " qp was not stopped by us; faulting conn=" << conn << dendl;
+        conn->fault();
+        if (!cct->_conf->ms_async_rdma_cm)
+          erase_qpn_lockless(qpn);
+      }
+    } else {
+      ldout(cct, 1) << __func__ << " ibv_get_async_event: dev=" << get_stack()->get_infiniband().get_device()->ctxt
+                    << " evt: " << ibv_event_type_str(async_event.event_type)
+                    << dendl;
+    }
+    ibv_ack_async_event(&async_event);
+  }
+}
+
+void RDMADispatcher::post_chunk_to_pool(Chunk* chunk)
+{
+  Mutex::Locker l(lock);
+  get_stack()->get_infiniband().post_chunk_to_pool(chunk);
+  perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+}
+
+int RDMADispatcher::post_chunks_to_rq(int num, ibv_qp *qp)
+{
+  Mutex::Locker l(lock);
+  return get_stack()->get_infiniband().post_chunks_to_rq(num, qp);
+}
+
+void RDMADispatcher::polling()
+{
+  static const int MAX_COMPLETIONS = 32;
+  ibv_wc wc[MAX_COMPLETIONS];
+
+  std::map<RDMAConnectedSocketImpl*, std::vector<ibv_wc> > polled;
+  std::vector<ibv_wc> tx_cqe;
+  ldout(cct, 20) << __func__ << " going to poll tx cq: " << tx_cq << " rx cq: " << rx_cq << dendl;
+  RDMAConnectedSocketImpl *conn = nullptr;
+  uint64_t last_inactive = Cycles::rdtsc();
+  bool rearmed = false;
+  int r = 0;
+
+  while (true) {
+    int tx_ret = tx_cq->poll_cq(MAX_COMPLETIONS, wc);
+    if (tx_ret > 0) {
+      ldout(cct, 20) << __func__ << " tx completion queue got " << tx_ret
+                     << " responses." << dendl;
+      handle_tx_event(wc, tx_ret);
+    }
+
+    int rx_ret = rx_cq->poll_cq(MAX_COMPLETIONS, wc);
+    if (rx_ret > 0) {
+      ldout(cct, 20) << __func__ << " rx completion queue got " << rx_ret
+                     << " responses." << dendl;
+      perf_logger->inc(l_msgr_rdma_rx_total_wc, rx_ret);
+      perf_logger->inc(l_msgr_rdma_rx_bufs_in_use, rx_ret);
+
+      Mutex::Locker l(lock); // make sure the connected socket stays alive while we pass wc
+
+      for (int i = 0; i < rx_ret; ++i) {
+        ibv_wc* response = &wc[i];
+        Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id);
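+        // Each rx completion's wr_id is the Chunk* that was posted to the
+        // receive queue; route it to the owning socket by qp_num, and
+        // re-post one chunk to the RQ for every buffer consumed.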
+        if (response->status == IBV_WC_SUCCESS) {
+          ceph_assert(wc[i].opcode == IBV_WC_RECV);
+          conn = get_conn_lockless(response->qp_num);
+          if (!conn) {
+            ldout(cct, 1) << __func__ << " csi with qpn " << response->qp_num
+                          << " may be dead; returning chunk " << chunk << " to the pool" << dendl;
+            get_stack()->get_infiniband().post_chunk_to_pool(chunk);
+            perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+          } else {
+            conn->post_chunks_to_rq(1);
+            polled[conn].push_back(*response);
+          }
+        } else {
+          perf_logger->inc(l_msgr_rdma_rx_total_wc_errors);
+          ldout(cct, 1) << __func__ << " work request returned error for buffer(" << chunk
+                        << ") status(" << response->status << ":"
+                        << get_stack()->get_infiniband().wc_status_to_string(response->status) << ")" << dendl;
+          if (response->status != IBV_WC_WR_FLUSH_ERR) {
+            conn = get_conn_lockless(response->qp_num);
+            if (conn && conn->is_connected())
+              conn->fault();
+          }
+          get_stack()->get_infiniband().post_chunk_to_pool(chunk);
+          perf_logger->dec(l_msgr_rdma_rx_bufs_in_use);
+        }
+      }
+      for (auto &&i : polled)
+        i.first->pass_wc(std::move(i.second));
+      polled.clear();
+    }
+
+    if (!tx_ret && !rx_ret) {
+      // NOTE: only reap dead queue pairs once TX has gone idle. It is
+      // now safe to delete queue pairs (see comment by the declaration
+      // of dead_queue_pairs).
+      // Additionally, don't delete a qp while its outstanding_buffers isn't
+      // empty, because we need to check the qp's state before sending.
+      perf_logger->set(l_msgr_rdma_inflight_tx_chunks, inflight);
+      if (num_dead_queue_pair) {
+        Mutex::Locker l(lock); // FIXME reuse dead qp because creating one qp costs 1 ms
+        auto it = dead_queue_pairs.begin();
+        while (it != dead_queue_pairs.end()) {
+          auto i = *it;
+          // Bypass QPs that have not collected all their Tx completions yet.
+          if (i->get_tx_wr()) {
+            ldout(cct, 20) << __func__ << " bypass qp=" << i << " tx_wr=" << i->get_tx_wr() << dendl;
+            ++it;
+          } else {
+            ldout(cct, 10) << __func__ << " finally delete qp=" << i << dendl;
+            delete i;
+            it = dead_queue_pairs.erase(it);
+            perf_logger->dec(l_msgr_rdma_active_queue_pair);
+            --num_dead_queue_pair;
+          }
+        }
+      }
+      if (!num_qp_conn && done && dead_queue_pairs.empty())
+        break;
+
+      uint64_t now = Cycles::rdtsc();
+      if (Cycles::to_microseconds(now - last_inactive) > cct->_conf->ms_async_rdma_polling_us) {
+        handle_async_event();
+        if (!rearmed) {
+          // Rearm the notification and poll once more, so that no event
+          // arriving between the last poll and the rearm can be missed.
+          tx_cq->rearm_notify();
+          rx_cq->rearm_notify();
+          rearmed = true;
+          continue;
+        }
+
+        struct pollfd channel_poll[2];
+        channel_poll[0].fd = tx_cc->get_fd();
+        channel_poll[0].events = POLLIN | POLLERR | POLLNVAL | POLLHUP;
+        channel_poll[0].revents = 0;
+        channel_poll[1].fd = rx_cc->get_fd();
+        channel_poll[1].events = POLLIN | POLLERR | POLLNVAL | POLLHUP;
+        channel_poll[1].revents = 0;
+        r = 0;
+        perf_logger->set(l_msgr_rdma_polling, 0);
+        while (!done && r == 0) {
+          r = TEMP_FAILURE_RETRY(poll(channel_poll, 2, 100));
+          if (r < 0) {
+            r = -errno;
+            lderr(cct) << __func__ << " poll failed " << r << dendl;
+            ceph_abort();
+          }
+        }
+        if (r > 0 && tx_cc->get_cq_event())
+          ldout(cct, 20) << __func__ << " got tx cq event." << dendl;
+        if (r > 0 && rx_cc->get_cq_event())
+          ldout(cct, 20) << __func__ << " got rx cq event." << dendl;
+        last_inactive = Cycles::rdtsc();
+        perf_logger->set(l_msgr_rdma_polling, 1);
+        rearmed = false;
+      }
+    }
+  }
+}
+
+void RDMADispatcher::notify_pending_workers() {
+  if (num_pending_workers) {
+    RDMAWorker *w = nullptr;
+    {
+      Mutex::Locker l(w_lock);
+      if (!pending_workers.empty()) {
+        w = pending_workers.front();
+        pending_workers.pop_front();
+        --num_pending_workers;
+      }
+    }
+    if (w)
+      w->notify_worker();
+  }
+}
+
+void RDMADispatcher::register_qp(QueuePair *qp, RDMAConnectedSocketImpl* csi)
+{
+  Mutex::Locker l(lock);
+  ceph_assert(!qp_conns.count(qp->get_local_qp_number()));
+  qp_conns[qp->get_local_qp_number()] = std::make_pair(qp, csi);
+  ++num_qp_conn;
+}
+
+RDMAConnectedSocketImpl* RDMADispatcher::get_conn_lockless(uint32_t qp)
+{
+  auto it = qp_conns.find(qp);
+  if (it == qp_conns.end())
+    return nullptr;
+  if (it->second.first->is_dead())
+    return nullptr;
+  return it->second.second;
+}
+
+Infiniband::QueuePair* RDMADispatcher::get_qp(uint32_t qp)
+{
+  Mutex::Locker l(lock);
+  // Try to find the QP in qp_conns first.
+  auto it = qp_conns.find(qp);
+  if (it != qp_conns.end())
+    return it->second.first;
+
+  // Try again in dead_queue_pairs.
+  for (auto &i: dead_queue_pairs)
+    if (i->get_local_qp_number() == qp)
+      return i;
+
+  return nullptr;
+}
+
+void RDMADispatcher::erase_qpn_lockless(uint32_t qpn)
+{
+  auto it = qp_conns.find(qpn);
+  if (it == qp_conns.end())
+    return;
+  ++num_dead_queue_pair;
+  dead_queue_pairs.push_back(it->second.first);
+  qp_conns.erase(it);
+  --num_qp_conn;
+}
+
+void RDMADispatcher::erase_qpn(uint32_t qpn)
+{
+  Mutex::Locker l(lock);
+  erase_qpn_lockless(qpn);
+}
+
+void RDMADispatcher::handle_tx_event(ibv_wc *cqe, int n)
+{
+  std::vector<Chunk*> tx_chunks;
+
+  for (int i = 0; i < n; ++i) {
+    ibv_wc* response = &cqe[i];
+    Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id);
+    ldout(cct, 25) << __func__ << " QP: " << response->qp_num
+                   << " len: " << response->byte_len << ", addr: " << chunk
+                   << " " << get_stack()->get_infiniband().wc_status_to_string(response->status) << dendl;
+
+    QueuePair *qp = get_qp(response->qp_num);
+    if (qp)
+      qp->dec_tx_wr(1);
+
+    if (response->status != IBV_WC_SUCCESS) {
+      perf_logger->inc(l_msgr_rdma_tx_total_wc_errors);
+      if (response->status == IBV_WC_RETRY_EXC_ERR) {
+        ldout(cct, 1) << __func__ << " connection between server and client is broken; disconnecting now" << dendl;
+        perf_logger->inc(l_msgr_rdma_tx_wc_retry_errors);
+      } else if (response->status == IBV_WC_WR_FLUSH_ERR) {
+        ldout(cct, 1) << __func__ << " Work Request Flushed Error: this connection's qp="
+                      << response->qp_num << " is being torn down while this WR=" << response->wr_id
+                      << " is still in flight." << dendl;
+        perf_logger->inc(l_msgr_rdma_tx_wc_wr_flush_errors);
+      } else {
+        ldout(cct, 1) << __func__ << " send work request returned error for buffer("
+                      << response->wr_id << ") status(" << response->status << "): "
+                      << get_stack()->get_infiniband().wc_status_to_string(response->status) << dendl;
+        Mutex::Locker l(lock); // make sure the connected socket stays alive while we handle the wc
+        RDMAConnectedSocketImpl *conn = get_conn_lockless(response->qp_num);
+
+        if (conn && conn->is_connected()) {
+          ldout(cct, 25) << __func__ << " qp state is : " << conn->get_qp_state() << dendl;
+          conn->fault();
+        } else {
+          ldout(cct, 1) << __func__ << " missing qp_num=" << response->qp_num << " discard event" << dendl;
+        }
+      }
+    }
+
+    // A TX completion may come either from a regular send message or from a
+    // 'fin' message. In the case of 'fin', wr_id points to the QueuePair.
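+    // (The discrimination below relies on wr_id: a registered tx buffer is
+    // recognized by the memory manager, while a 'fin' is verified by the
+    // QueuePair's local qp number matching the completion's qp_num.)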
+    if (get_stack()->get_infiniband().get_memory_manager()->is_tx_buffer(chunk->buffer)) {
+      tx_chunks.push_back(chunk);
+    } else if (reinterpret_cast<QueuePair*>(response->wr_id)->get_local_qp_number() == response->qp_num) {
+      ldout(cct, 1) << __func__ << " sending of the disconnect msg completed" << dendl;
+    } else {
+      ldout(cct, 1) << __func__ << " not tx buffer, chunk " << chunk << dendl;
+      ceph_abort();
+    }
+  }
+
+  perf_logger->inc(l_msgr_rdma_tx_total_wc, n);
+  post_tx_buffer(tx_chunks);
+}
+
+/**
+ * Return the given Chunks to the tx free pool.
+ *
+ * \param[in] chunks
+ *      The Chunks to enqueue.
+ */
+void RDMADispatcher::post_tx_buffer(std::vector<Chunk*> &chunks)
+{
+  if (chunks.empty())
+    return;
+
+  inflight -= chunks.size();
+  get_stack()->get_infiniband().get_memory_manager()->return_tx(chunks);
+  ldout(cct, 30) << __func__ << " release " << chunks.size()
+                 << " chunks, inflight " << inflight << dendl;
+  notify_pending_workers();
+}
+
+
+RDMAWorker::RDMAWorker(CephContext *c, unsigned i)
+  : Worker(c, i), stack(nullptr),
+    tx_handler(new C_handle_cq_tx(this)), lock("RDMAWorker::lock")
+{
+  // initialize perf_logger
+  char name[128];
+  snprintf(name, sizeof(name), "AsyncMessenger::RDMAWorker-%u", id);
+  PerfCountersBuilder plb(cct, name, l_msgr_rdma_first, l_msgr_rdma_last);
+
+  plb.add_u64_counter(l_msgr_rdma_tx_no_mem, "tx_no_mem", "The number of times no tx buffer was available");
+  plb.add_u64_counter(l_msgr_rdma_tx_parital_mem, "tx_parital_mem", "The number of times only part of the needed tx buffers was available");
+  plb.add_u64_counter(l_msgr_rdma_tx_failed, "tx_failed_post", "The number of failed tx posts");
+
+  plb.add_u64_counter(l_msgr_rdma_tx_chunks, "tx_chunks", "The number of tx chunks transmitted");
+  plb.add_u64_counter(l_msgr_rdma_tx_bytes, "tx_bytes", "The bytes of tx chunks transmitted", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_msgr_rdma_rx_chunks, "rx_chunks", "The number of rx chunks received");
+  plb.add_u64_counter(l_msgr_rdma_rx_bytes, "rx_bytes", "The bytes of rx chunks received", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_msgr_rdma_pending_sent_conns, "pending_sent_conns", "The count of pending sent conns");
+
+  perf_logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perf_logger);
+}
+
+RDMAWorker::~RDMAWorker()
+{
+  delete tx_handler;
+}
+
+void RDMAWorker::initialize()
+{
+  if (!dispatcher) {
+    dispatcher = &stack->get_dispatcher();
+  }
+}
+
+int RDMAWorker::listen(entity_addr_t &sa, unsigned addr_slot,
+                       const SocketOptions &opt, ServerSocket *sock)
+{
+  get_stack()->get_infiniband().init();
+  dispatcher->polling_start();
+  RDMAServerSocketImpl *p;
+  if (cct->_conf->ms_async_rdma_type == "iwarp") {
+    p = new RDMAIWARPServerSocketImpl(
+      cct, &get_stack()->get_infiniband(), &get_stack()->get_dispatcher(), this,
+      sa, addr_slot);
+  } else {
+    p = new RDMAServerSocketImpl(cct, &get_stack()->get_infiniband(),
+                                 &get_stack()->get_dispatcher(), this, sa,
+                                 addr_slot);
+  }
+  int r = p->listen(sa, opt);
+  if (r < 0) {
+    delete p;
+    return r;
+  }
+
+  *sock = ServerSocket(std::unique_ptr<ServerSocketImpl>(p));
+  return 0;
+}
+
+int RDMAWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket)
+{
+  get_stack()->get_infiniband().init();
+  dispatcher->polling_start();
+
+  RDMAConnectedSocketImpl* p;
+  if (cct->_conf->ms_async_rdma_type == "iwarp") {
+    p = new RDMAIWARPConnectedSocketImpl(cct, &get_stack()->get_infiniband(), &get_stack()->get_dispatcher(), this);
+  } else {
+    p = new RDMAConnectedSocketImpl(cct, &get_stack()->get_infiniband(), &get_stack()->get_dispatcher(), this);
+  }
+  int r = p->try_connect(addr, opts);
+
+  if (r < 0) {
+    ldout(cct, 1) << __func__ << " failed to connect." << dendl;
+    delete p;
+    return r;
+  }
+  std::unique_ptr<RDMAConnectedSocketImpl> csi(p);
+  *socket = ConnectedSocket(std::move(csi));
+  return 0;
+}
+
+int RDMAWorker::get_reged_mem(RDMAConnectedSocketImpl *o, std::vector<Chunk*> &c, size_t bytes)
+{
+  ceph_assert(center.in_thread());
+  int r = get_stack()->get_infiniband().get_tx_buffers(c, bytes);
+  ceph_assert(r >= 0);
+  size_t got = get_stack()->get_infiniband().get_memory_manager()->get_tx_buffer_size() * r;
+  ldout(cct, 30) << __func__ << " need " << bytes << " bytes, reserve " << got << " registered bytes, inflight " << dispatcher->inflight << dendl;
+  stack->get_dispatcher().inflight += r;
+  if (got >= bytes)
+    return r;
+
+  if (o) {
+    if (!o->is_pending()) {
+      pending_sent_conns.push_back(o);
+      perf_logger->inc(l_msgr_rdma_pending_sent_conns, 1);
+      o->set_pending(1);
+    }
+    dispatcher->make_pending_worker(this);
+  }
+  return r;
+}
+
+
+void RDMAWorker::handle_pending_message()
+{
+  ldout(cct, 20) << __func__ << " pending conns " << pending_sent_conns.size() << dendl;
+  while (!pending_sent_conns.empty()) {
+    RDMAConnectedSocketImpl *o = pending_sent_conns.front();
+    pending_sent_conns.pop_front();
+    ssize_t r = o->submit(false);
+    ldout(cct, 20) << __func__ << " sent pending bl socket=" << o << " r=" << r << dendl;
+    if (r < 0) {
+      if (r == -EAGAIN) {
+        pending_sent_conns.push_back(o);
+        dispatcher->make_pending_worker(this);
+        return;
+      }
+      o->fault();
+    }
+    o->set_pending(0);
+    perf_logger->dec(l_msgr_rdma_pending_sent_conns, 1);
+  }
+  dispatcher->notify_pending_workers();
+}
+
+RDMAStack::RDMAStack(CephContext *cct, const string &t)
+  : NetworkStack(cct, t), ib(cct), dispatcher(cct, this)
+{
+  ldout(cct, 20) << __func__ << " constructing RDMAStack..." << dendl;
+
+  unsigned num = get_num_worker();
+  for (unsigned i = 0; i < num; ++i) {
+    RDMAWorker* w = dynamic_cast<RDMAWorker*>(get_worker(i));
+    w->set_stack(this);
+  }
+  ldout(cct, 20) << " creating RDMAStack:" << this << " with dispatcher:" << &dispatcher << dendl;
+}
+
+RDMAStack::~RDMAStack()
+{
+  if (cct->_conf->ms_async_rdma_enable_hugepage) {
+    unsetenv("RDMAV_HUGEPAGES_SAFE"); // remove env variable on destruction
+  }
+}
+
+void RDMAStack::spawn_worker(unsigned i, std::function<void ()> &&func)
+{
+  threads.resize(i+1);
+  threads[i] = std::thread(func);
+}
+
+void RDMAStack::join_worker(unsigned i)
+{
+  ceph_assert(threads.size() > i && threads[i].joinable());
+  threads[i].join();
+}
diff --git a/src/msg/async/rdma/RDMAStack.h b/src/msg/async/rdma/RDMAStack.h
new file mode 100644
index 00000000..e4d34ee0
--- /dev/null
+++ b/src/msg/async/rdma/RDMAStack.h
@@ -0,0 +1,348 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSKY <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_RDMASTACK_H
+#define CEPH_MSG_RDMASTACK_H
+
+#include <sys/eventfd.h>
+
+#include <list>
+#include <vector>
+#include <thread>
+
+#include "common/ceph_context.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "msg/async/Stack.h"
+#include "Infiniband.h"
+
+class RDMAConnectedSocketImpl;
+class RDMAServerSocketImpl;
+class RDMAStack;
+class RDMAWorker;
+
+class RDMADispatcher {
+  typedef Infiniband::MemoryManager::Chunk Chunk;
+  typedef Infiniband::QueuePair QueuePair;
+
+  std::thread t;
+  CephContext *cct;
+  Infiniband::CompletionQueue* tx_cq = nullptr;
+  Infiniband::CompletionQueue* rx_cq = nullptr;
+  Infiniband::CompletionChannel *tx_cc = nullptr, *rx_cc = nullptr;
+  EventCallbackRef async_handler;
+  bool done = false;
+  std::atomic<uint64_t> num_dead_queue_pair = {0};
+  std::atomic<uint64_t> num_qp_conn = {0};
+  Mutex lock; // protect `qp_conns`, `dead_queue_pairs`
+  // qp_num -> InfRcConnection
+  // The main usage of `qp_conns` is looking up a connection by qp_num,
+  // so the lifecycle of an element in `qp_conns` is the lifecycle of its qp.
+  //
+  // How a qp is moved into the dead state:
+  //  1. The Connection calls mark_down.
+  //  2. The Queue Pair is moved into the Error state (QueuePair::to_dead).
+  //  3. Wait for the affiliated event IBV_EVENT_QP_LAST_WQE_REACHED (handle_async_event).
+  //  4. Wait for the CQ to be empty (handle_tx_event).
+  //  5. Destroy the QP by calling ibv_destroy_qp() (handle_tx_event).
+  ceph::unordered_map<uint32_t, std::pair<QueuePair*, RDMAConnectedSocketImpl*> > qp_conns;
+
+  /// if a queue pair is closed when transmit buffers are active
+  /// on it, the transmit buffers never get returned via tx_cq. To
+  /// work around this problem, don't delete queue pairs immediately. Instead,
+  /// save them in this vector and delete them at a safe time, when there are
+  /// no outstanding transmit buffers to be lost.
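+  /// (QPs are parked here by erase_qpn_lockless() and reaped in polling()
+  /// once their outstanding tx work requests, get_tx_wr(), drain to zero.)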
+ std::vector<QueuePair*> dead_queue_pairs; + + std::atomic<uint64_t> num_pending_workers = {0}; + Mutex w_lock; // protect pending workers + // fixme: lockfree + std::list<RDMAWorker*> pending_workers; + RDMAStack* stack; + + class C_handle_cq_async : public EventCallback { + RDMADispatcher *dispatcher; + public: + explicit C_handle_cq_async(RDMADispatcher *w): dispatcher(w) {} + void do_request(uint64_t fd) { + // worker->handle_tx_event(); + dispatcher->handle_async_event(); + } + }; + + public: + PerfCounters *perf_logger; + + explicit RDMADispatcher(CephContext* c, RDMAStack* s); + virtual ~RDMADispatcher(); + void handle_async_event(); + + void polling_start(); + void polling_stop(); + void polling(); + void register_qp(QueuePair *qp, RDMAConnectedSocketImpl* csi); + void make_pending_worker(RDMAWorker* w) { + Mutex::Locker l(w_lock); + auto it = std::find(pending_workers.begin(), pending_workers.end(), w); + if (it != pending_workers.end()) + return; + pending_workers.push_back(w); + ++num_pending_workers; + } + RDMAStack* get_stack() { return stack; } + RDMAConnectedSocketImpl* get_conn_lockless(uint32_t qp); + QueuePair* get_qp(uint32_t qp); + void erase_qpn_lockless(uint32_t qpn); + void erase_qpn(uint32_t qpn); + Infiniband::CompletionQueue* get_tx_cq() const { return tx_cq; } + Infiniband::CompletionQueue* get_rx_cq() const { return rx_cq; } + void notify_pending_workers(); + void handle_tx_event(ibv_wc *cqe, int n); + void post_tx_buffer(std::vector<Chunk*> &chunks); + + std::atomic<uint64_t> inflight = {0}; + + void post_chunk_to_pool(Chunk* chunk); + int post_chunks_to_rq(int num, ibv_qp *qp=NULL); +}; + +class RDMAWorker : public Worker { + typedef Infiniband::CompletionQueue CompletionQueue; + typedef Infiniband::CompletionChannel CompletionChannel; + typedef Infiniband::MemoryManager::Chunk Chunk; + typedef Infiniband::MemoryManager MemoryManager; + typedef std::vector<Chunk*>::iterator ChunkIter; + RDMAStack *stack; + EventCallbackRef tx_handler; + std::list<RDMAConnectedSocketImpl*> pending_sent_conns; + RDMADispatcher* dispatcher = nullptr; + Mutex lock; + + class C_handle_cq_tx : public EventCallback { + RDMAWorker *worker; + public: + explicit C_handle_cq_tx(RDMAWorker *w): worker(w) {} + void do_request(uint64_t fd) { + worker->handle_pending_message(); + } + }; + + public: + PerfCounters *perf_logger; + explicit RDMAWorker(CephContext *c, unsigned i); + virtual ~RDMAWorker(); + virtual int listen(entity_addr_t &addr, + unsigned addr_slot, + const SocketOptions &opts, ServerSocket *) override; + virtual int connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) override; + virtual void initialize() override; + RDMAStack *get_stack() { return stack; } + int get_reged_mem(RDMAConnectedSocketImpl *o, std::vector<Chunk*> &c, size_t bytes); + void remove_pending_conn(RDMAConnectedSocketImpl *o) { + ceph_assert(center.in_thread()); + pending_sent_conns.remove(o); + } + void handle_pending_message(); + void set_stack(RDMAStack *s) { stack = s; } + void notify_worker() { + center.dispatch_event_external(tx_handler); + } +}; + +struct RDMACMInfo { + RDMACMInfo(rdma_cm_id *cid, rdma_event_channel *cm_channel_, uint32_t qp_num_) + : cm_id(cid), cm_channel(cm_channel_), qp_num(qp_num_) {} + rdma_cm_id *cm_id; + rdma_event_channel *cm_channel; + uint32_t qp_num; +}; + +class RDMAConnectedSocketImpl : public ConnectedSocketImpl { + public: + typedef Infiniband::MemoryManager::Chunk Chunk; + typedef Infiniband::CompletionChannel CompletionChannel; + 
typedef Infiniband::CompletionQueue CompletionQueue; + + protected: + CephContext *cct; + Infiniband::QueuePair *qp; + IBSYNMsg peer_msg; + IBSYNMsg my_msg; + int connected; + int error; + Infiniband* infiniband; + RDMADispatcher* dispatcher; + RDMAWorker* worker; + std::vector<Chunk*> buffers; + int notify_fd = -1; + bufferlist pending_bl; + + Mutex lock; + std::vector<ibv_wc> wc; + bool is_server; + EventCallbackRef read_handler; + EventCallbackRef established_handler; + int tcp_fd = -1; + bool active;// qp is active ? + bool pending; + int post_backlog = 0; + + void notify(); + ssize_t read_buffers(char* buf, size_t len); + int post_work_request(std::vector<Chunk*>&); + + public: + RDMAConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s, + RDMAWorker *w); + virtual ~RDMAConnectedSocketImpl(); + + void pass_wc(std::vector<ibv_wc> &&v); + void get_wc(std::vector<ibv_wc> &w); + virtual int is_connected() override { return connected; } + + virtual ssize_t read(char* buf, size_t len) override; + virtual ssize_t zero_copy_read(bufferptr &data) override; + virtual ssize_t send(bufferlist &bl, bool more) override; + virtual void shutdown() override; + virtual void close() override; + virtual int fd() const override { return notify_fd; } + virtual int socket_fd() const override { return tcp_fd; } + void fault(); + const char* get_qp_state() { return Infiniband::qp_state_string(qp->get_state()); } + ssize_t submit(bool more); + int activate(); + void fin(); + void handle_connection(); + int handle_connection_established(bool need_set_fault = true); + void cleanup(); + void set_accept_fd(int sd); + virtual int try_connect(const entity_addr_t&, const SocketOptions &opt); + bool is_pending() {return pending;} + void set_pending(bool val) {pending = val;} + void post_chunks_to_rq(int num); + void update_post_backlog(); +}; + +enum RDMA_CM_STATUS { + IDLE = 1, + RDMA_ID_CREATED, + CHANNEL_FD_CREATED, + RESOURCE_ALLOCATED, + ADDR_RESOLVED, + ROUTE_RESOLVED, + CONNECTED, + DISCONNECTED, + ERROR +}; + +class RDMAIWARPConnectedSocketImpl : public RDMAConnectedSocketImpl { + public: + RDMAIWARPConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s, + RDMAWorker *w, RDMACMInfo *info = nullptr); + ~RDMAIWARPConnectedSocketImpl(); + virtual int try_connect(const entity_addr_t&, const SocketOptions &opt) override; + virtual void close() override; + virtual void shutdown() override; + virtual void handle_cm_connection(); + uint32_t get_local_qpn() const { return local_qpn; } + void activate(); + int alloc_resource(); + void close_notify(); + + private: + rdma_cm_id *cm_id; + rdma_event_channel *cm_channel; + uint32_t local_qpn; + uint32_t remote_qpn; + EventCallbackRef cm_con_handler; + bool is_server; + std::mutex close_mtx; + std::condition_variable close_condition; + bool closed; + RDMA_CM_STATUS status; + + + class C_handle_cm_connection : public EventCallback { + RDMAIWARPConnectedSocketImpl *csi; + public: + C_handle_cm_connection(RDMAIWARPConnectedSocketImpl *w): csi(w) {} + void do_request(uint64_t fd) { + csi->handle_cm_connection(); + } + }; +}; + +class RDMAServerSocketImpl : public ServerSocketImpl { + protected: + CephContext *cct; + NetHandler net; + int server_setup_socket; + Infiniband* infiniband; + RDMADispatcher *dispatcher; + RDMAWorker *worker; + entity_addr_t sa; + + public: + RDMAServerSocketImpl(CephContext *cct, Infiniband* i, RDMADispatcher *s, + RDMAWorker *w, entity_addr_t& a, unsigned slot); + + virtual int listen(entity_addr_t &sa, const 
SocketOptions &opt); + virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override; + virtual void abort_accept() override; + virtual int fd() const override { return server_setup_socket; } + int get_fd() { return server_setup_socket; } +}; + +class RDMAIWARPServerSocketImpl : public RDMAServerSocketImpl { + public: + RDMAIWARPServerSocketImpl( + CephContext *cct, Infiniband *i, RDMADispatcher *s, RDMAWorker *w, + entity_addr_t& addr, unsigned addr_slot); + virtual int listen(entity_addr_t &sa, const SocketOptions &opt) override; + virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override; + virtual void abort_accept() override; + private: + rdma_cm_id *cm_id; + rdma_event_channel *cm_channel; +}; + +class RDMAStack : public NetworkStack { + vector<std::thread> threads; + PerfCounters *perf_counter; + Infiniband ib; + RDMADispatcher dispatcher; + + std::atomic<bool> fork_finished = {false}; + + public: + explicit RDMAStack(CephContext *cct, const string &t); + virtual ~RDMAStack(); + virtual bool support_zero_copy_read() const override { return false; } + virtual bool nonblock_connect_need_writable_event() const override { return false; } + + virtual void spawn_worker(unsigned i, std::function<void ()> &&func) override; + virtual void join_worker(unsigned i) override; + RDMADispatcher &get_dispatcher() { return dispatcher; } + Infiniband &get_infiniband() { return ib; } + virtual bool is_ready() override { return fork_finished.load(); }; + virtual void ready() override { fork_finished = true; }; +}; + + +#endif diff --git a/src/msg/msg_types.cc b/src/msg/msg_types.cc new file mode 100644 index 00000000..76b9585b --- /dev/null +++ b/src/msg/msg_types.cc @@ -0,0 +1,383 @@ + +#include "msg_types.h" + +#include <arpa/inet.h> +#include <stdlib.h> +#include <string.h> +#include <netdb.h> + +#include "common/Formatter.h" + +void entity_name_t::dump(Formatter *f) const +{ + f->dump_string("type", type_str()); + f->dump_unsigned("num", num()); +} + +void entity_addr_t::dump(Formatter *f) const +{ + f->dump_string("type", get_type_name(type)); + f->dump_stream("addr") << get_sockaddr(); + f->dump_unsigned("nonce", nonce); +} + +void entity_inst_t::dump(Formatter *f) const +{ + f->dump_object("name", name); + f->dump_object("addr", addr); +} + +void entity_name_t::generate_test_instances(list<entity_name_t*>& o) +{ + o.push_back(new entity_name_t(entity_name_t::MON())); + o.push_back(new entity_name_t(entity_name_t::MON(1))); + o.push_back(new entity_name_t(entity_name_t::OSD(1))); + o.push_back(new entity_name_t(entity_name_t::CLIENT(1))); +} + +void entity_addr_t::generate_test_instances(list<entity_addr_t*>& o) +{ + o.push_back(new entity_addr_t()); + entity_addr_t *a = new entity_addr_t(); + a->set_nonce(1); + o.push_back(a); + entity_addr_t *b = new entity_addr_t(); + b->set_type(entity_addr_t::TYPE_LEGACY); + b->set_nonce(5); + b->set_family(AF_INET); + b->set_in4_quad(0, 127); + b->set_in4_quad(1, 0); + b->set_in4_quad(2, 1); + b->set_in4_quad(3, 2); + b->set_port(2); + o.push_back(b); +} + +void entity_inst_t::generate_test_instances(list<entity_inst_t*>& o) +{ + o.push_back(new entity_inst_t()); + entity_name_t name; + entity_addr_t addr; + entity_inst_t *a = new entity_inst_t(name, addr); + o.push_back(a); +} + +bool entity_addr_t::parse(const char *s, const char **end, int default_type) +{ + *this = entity_addr_t(); + + const char *start = s; + if (end) { + *end = s; + } + + int newtype; + if 
(strncmp("v1:", s, 3) == 0) { + start += 3; + newtype = TYPE_LEGACY; + } else if (strncmp("v2:", s, 3) == 0) { + start += 3; + newtype = TYPE_MSGR2; + } else if (strncmp("any:", s, 4) == 0) { + start += 4; + newtype = TYPE_ANY; + } else if (*s == '-') { + newtype = TYPE_NONE; + if (end) { + *end = s + 1; + } + return true; + } else { + newtype = default_type ? default_type : TYPE_DEFAULT; + } + + bool brackets = false; + if (*start == '[') { + start++; + brackets = true; + } + + // inet_pton() requires a null terminated input, so let's fill two + // buffers, one with ipv4 allowed characters, and one with ipv6, and + // then see which parses. + char buf4[39]; + char *o = buf4; + const char *p = start; + while (o < buf4 + sizeof(buf4) && + *p && ((*p == '.') || + (*p >= '0' && *p <= '9'))) { + *o++ = *p++; + } + *o = 0; + + char buf6[64]; // actually 39 + null is sufficient. + o = buf6; + p = start; + while (o < buf6 + sizeof(buf6) && + *p && ((*p == ':') || + (*p >= '0' && *p <= '9') || + (*p >= 'a' && *p <= 'f') || + (*p >= 'A' && *p <= 'F'))) { + *o++ = *p++; + } + *o = 0; + //cout << "buf4 is '" << buf4 << "', buf6 is '" << buf6 << "'" << std::endl; + + // ipv4? + struct in_addr a4; + struct in6_addr a6; + if (inet_pton(AF_INET, buf4, &a4)) { + u.sin.sin_addr.s_addr = a4.s_addr; + u.sa.sa_family = AF_INET; + p = start + strlen(buf4); + } else if (inet_pton(AF_INET6, buf6, &a6)) { + u.sa.sa_family = AF_INET6; + memcpy(&u.sin6.sin6_addr, &a6, sizeof(a6)); + p = start + strlen(buf6); + } else { + return false; + } + + if (brackets) { + if (*p != ']') + return false; + p++; + } + + //cout << "p is " << *p << std::endl; + if (*p == ':') { + // parse a port, too! + p++; + int port = atoi(p); + if (port > MAX_PORT_NUMBER) { + return false; + } + set_port(port); + while (*p && *p >= '0' && *p <= '9') + p++; + } + + if (*p == '/') { + // parse nonce, too + p++; + int non = atoi(p); + set_nonce(non); + while (*p && *p >= '0' && *p <= '9') + p++; + } + + if (end) + *end = p; + + type = newtype; + + //cout << *this << std::endl; + return true; +} + +ostream& operator<<(ostream& out, const entity_addr_t &addr) +{ + if (addr.type == entity_addr_t::TYPE_NONE) { + return out << "-"; + } + if (addr.type != entity_addr_t::TYPE_ANY) { + out << entity_addr_t::get_type_name(addr.type) << ":"; + } + out << addr.get_sockaddr() << '/' << addr.nonce; + return out; +} + +ostream& operator<<(ostream& out, const sockaddr_storage &ss) +{ + char buf[NI_MAXHOST] = { 0 }; + char serv[NI_MAXSERV] = { 0 }; + size_t hostlen; + + if (ss.ss_family == AF_INET) + hostlen = sizeof(struct sockaddr_in); + else if (ss.ss_family == AF_INET6) + hostlen = sizeof(struct sockaddr_in6); + else + hostlen = sizeof(struct sockaddr_storage); + getnameinfo((struct sockaddr *)&ss, hostlen, buf, sizeof(buf), + serv, sizeof(serv), + NI_NUMERICHOST | NI_NUMERICSERV); + if (ss.ss_family == AF_INET6) + return out << '[' << buf << "]:" << serv; + return out << buf << ':' << serv; +} + +ostream& operator<<(ostream& out, const sockaddr *sa) +{ + char buf[NI_MAXHOST] = { 0 }; + char serv[NI_MAXSERV] = { 0 }; + size_t hostlen; + + if (sa->sa_family == AF_INET) + hostlen = sizeof(struct sockaddr_in); + else if (sa->sa_family == AF_INET6) + hostlen = sizeof(struct sockaddr_in6); + else + hostlen = sizeof(struct sockaddr_storage); + getnameinfo(sa, hostlen, buf, sizeof(buf), + serv, sizeof(serv), + NI_NUMERICHOST | NI_NUMERICSERV); + if (sa->sa_family == AF_INET6) + return out << '[' << buf << "]:" << serv; + return out << buf << ':' << serv; +} + +// 
entity_addrvec_t + +bool entity_addrvec_t::parse(const char *s, const char **end) +{ + const char *orig_s = s; + const char *static_end; + if (!end) { + end = &static_end; + } else { + *end = s; + } + v.clear(); + bool brackets = false; + if (*s == '[') { + // weirdness: make sure this isn't an IPV6 addr! + entity_addr_t a; + const char *p; + if (!a.parse(s, &p) || !a.is_ipv6()) { + // it's not + brackets = true; + ++s; + } + } + while (*s) { + entity_addr_t a; + bool r = a.parse(s, end); + if (!r) { + if (brackets) { + v.clear(); + *end = orig_s; + return false; + } + break; + } + v.push_back(a); + s = *end; + if (!brackets) { + break; + } + if (*s != ',') { + break; + } + ++s; + } + if (brackets) { + if (*s == ']') { + ++s; + *end = s; + } else { + *end = orig_s; + v.clear(); + return false; + } + } + return !v.empty(); +} + +void entity_addrvec_t::encode(bufferlist& bl, uint64_t features) const +{ + using ceph::encode; + if ((features & CEPH_FEATURE_MSG_ADDR2) == 0) { + // encode a single legacy entity_addr_t for unfeatured peers + encode(legacy_addr(), bl, 0); + return; + } + encode((__u8)2, bl); + encode(v, bl, features); +} + +void entity_addrvec_t::decode(bufferlist::const_iterator& bl) +{ + using ceph::decode; + __u8 marker; + decode(marker, bl); + if (marker == 0) { + // legacy! + entity_addr_t addr; + addr.decode_legacy_addr_after_marker(bl); + v.clear(); + v.push_back(addr); + return; + } + if (marker == 1) { + entity_addr_t addr; + DECODE_START(1, bl); + decode(addr.type, bl); + decode(addr.nonce, bl); + __u32 elen; + decode(elen, bl); + if (elen) { + struct sockaddr *sa = (struct sockaddr *)addr.get_sockaddr(); +#if defined(__FreeBSD__) || defined(__APPLE__) + sa->sa_len = 0; +#endif + uint16_t ss_family; + if (elen < sizeof(ss_family)) { + throw ceph::buffer::malformed_input("elen smaller than family len"); + } + decode(ss_family, bl); + sa->sa_family = ss_family; + elen -= sizeof(ss_family); + if (elen > addr.get_sockaddr_len() - sizeof(sa->sa_family)) { + throw ceph::buffer::malformed_input("elen exceeds sockaddr len"); + } + bl.copy(elen, sa->sa_data); + } + DECODE_FINISH(bl); + v.clear(); + v.push_back(addr); + return; + } + if (marker > 2) + throw buffer::malformed_input("entity_addrvec_marker > 2"); + decode(v, bl); +} + +void entity_addrvec_t::dump(Formatter *f) const +{ + f->open_array_section("addrvec"); + for (vector<entity_addr_t>::const_iterator p = v.begin(); + p != v.end(); ++p) { + f->dump_object("addr", *p); + } + f->close_section(); +} + +void entity_addrvec_t::generate_test_instances(list<entity_addrvec_t*>& ls) +{ + ls.push_back(new entity_addrvec_t()); + ls.push_back(new entity_addrvec_t()); + ls.back()->v.push_back(entity_addr_t()); + ls.push_back(new entity_addrvec_t()); + ls.back()->v.push_back(entity_addr_t()); + ls.back()->v.push_back(entity_addr_t()); +} + +std::string entity_addr_t::ip_only_to_str() const +{ + const char *host_ip = NULL; + char addr_buf[INET6_ADDRSTRLEN]; + switch (get_family()) { + case AF_INET: + host_ip = inet_ntop(AF_INET, &in4_addr().sin_addr, + addr_buf, INET_ADDRSTRLEN); + break; + case AF_INET6: + host_ip = inet_ntop(AF_INET6, &in6_addr().sin6_addr, + addr_buf, INET6_ADDRSTRLEN); + break; + default: + break; + } + return host_ip ? 
host_ip : ""; +} diff --git a/src/msg/msg_types.h b/src/msg/msg_types.h new file mode 100644 index 00000000..74d5ee30 --- /dev/null +++ b/src/msg/msg_types.h @@ -0,0 +1,803 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MSG_TYPES_H +#define CEPH_MSG_TYPES_H + +#include <sstream> + +#include <netinet/in.h> + +#include "include/ceph_features.h" +#include "include/types.h" +#include "include/blobhash.h" +#include "include/encoding.h" + +#define MAX_PORT_NUMBER 65535 + +namespace ceph { + class Formatter; +} + +extern ostream& operator<<(ostream& out, const sockaddr_storage &ss); +extern ostream& operator<<(ostream& out, const sockaddr *sa); + +typedef uint8_t entity_type_t; + +class entity_name_t { +public: + entity_type_t _type; + int64_t _num; + +public: + static const int TYPE_MON = CEPH_ENTITY_TYPE_MON; + static const int TYPE_MDS = CEPH_ENTITY_TYPE_MDS; + static const int TYPE_OSD = CEPH_ENTITY_TYPE_OSD; + static const int TYPE_CLIENT = CEPH_ENTITY_TYPE_CLIENT; + static const int TYPE_MGR = CEPH_ENTITY_TYPE_MGR; + + static const int64_t NEW = -1; + + // cons + entity_name_t() : _type(0), _num(0) { } + entity_name_t(int t, int64_t n) : _type(t), _num(n) { } + explicit entity_name_t(const ceph_entity_name &n) : + _type(n.type), _num(n.num) { } + + // static cons + static entity_name_t MON(int64_t i=NEW) { return entity_name_t(TYPE_MON, i); } + static entity_name_t MDS(int64_t i=NEW) { return entity_name_t(TYPE_MDS, i); } + static entity_name_t OSD(int64_t i=NEW) { return entity_name_t(TYPE_OSD, i); } + static entity_name_t CLIENT(int64_t i=NEW) { return entity_name_t(TYPE_CLIENT, i); } + static entity_name_t MGR(int64_t i=NEW) { return entity_name_t(TYPE_MGR, i); } + + int64_t num() const { return _num; } + int type() const { return _type; } + const char *type_str() const { + return ceph_entity_type_name(type()); + } + + bool is_new() const { return num() < 0; } + + bool is_client() const { return type() == TYPE_CLIENT; } + bool is_mds() const { return type() == TYPE_MDS; } + bool is_osd() const { return type() == TYPE_OSD; } + bool is_mon() const { return type() == TYPE_MON; } + bool is_mgr() const { return type() == TYPE_MGR; } + + operator ceph_entity_name() const { + ceph_entity_name n = { _type, init_le64(_num) }; + return n; + } + + bool parse(const string& s) { + const char *start = s.c_str(); + char *end; + bool got = parse(start, &end); + return got && end == start + s.length(); + } + bool parse(const char *start, char **end) { + if (strstr(start, "mon.") == start) { + _type = TYPE_MON; + start += 4; + } else if (strstr(start, "osd.") == start) { + _type = TYPE_OSD; + start += 4; + } else if (strstr(start, "mds.") == start) { + _type = TYPE_MDS; + start += 4; + } else if (strstr(start, "client.") == start) { + _type = TYPE_CLIENT; + start += 7; + } else if (strstr(start, "mgr.") == start) { + _type = TYPE_MGR; + start += 4; + } else { + return false; + } + if (isspace(*start)) + return false; + _num = strtoll(start, end, 10); + if (*end == NULL || *end == start) + return false; + return true; + } + + DENC(entity_name_t, v, p) { + denc(v._type, p); + denc(v._num, p); + } + void 
dump(Formatter *f) const;
+
+  static void generate_test_instances(list<entity_name_t*>& o);
+};
+WRITE_CLASS_DENC(entity_name_t)
+
+inline bool operator== (const entity_name_t& l, const entity_name_t& r) {
+  return (l.type() == r.type()) && (l.num() == r.num()); }
+inline bool operator!= (const entity_name_t& l, const entity_name_t& r) {
+  return (l.type() != r.type()) || (l.num() != r.num()); }
+inline bool operator< (const entity_name_t& l, const entity_name_t& r) {
+  return (l.type() < r.type()) || (l.type() == r.type() && l.num() < r.num()); }
+
+inline std::ostream& operator<<(std::ostream& out, const entity_name_t& addr) {
+  //if (addr.is_namer()) return out << "namer";
+  if (addr.is_new() || addr.num() < 0)
+    return out << addr.type_str() << ".?";
+  else
+    return out << addr.type_str() << '.' << addr.num();
+}
+inline std::ostream& operator<<(std::ostream& out, const ceph_entity_name& addr) {
+  return out << entity_name_t{addr.type, static_cast<int64_t>(addr.num)};
+}
+
+namespace std {
+  template<> struct hash< entity_name_t >
+  {
+    size_t operator()( const entity_name_t &m ) const
+    {
+      return rjhash32(m.type() ^ m.num());
+    }
+  };
+} // namespace std
+
+// define a wire format for sockaddr that matches Linux's.
+struct ceph_sockaddr_storage {
+  ceph_le16 ss_family;
+  __u8 __ss_padding[128 - sizeof(ceph_le16)];
+
+  void encode(bufferlist& bl) const {
+    struct ceph_sockaddr_storage ss = *this;
+    ss.ss_family = htons(ss.ss_family);
+    ::encode_raw(ss, bl);
+  }
+
+  void decode(bufferlist::const_iterator& bl) {
+    struct ceph_sockaddr_storage ss;
+    ::decode_raw(ss, bl);
+    ss.ss_family = ntohs(ss.ss_family);
+    *this = ss;
+  }
+} __attribute__ ((__packed__));
+WRITE_CLASS_ENCODER(ceph_sockaddr_storage)
+
+/*
+ * encode sockaddr.ss_family as network byte order
+ */
+static inline void encode(const sockaddr_storage& a, bufferlist& bl) {
+#if defined(__linux__)
+  struct sockaddr_storage ss = a;
+  ss.ss_family = htons(ss.ss_family);
+  ::encode_raw(ss, bl);
+#elif defined(__FreeBSD__) || defined(__APPLE__)
+  ceph_sockaddr_storage ss{};
+  auto src = (unsigned char const *)&a;
+  auto dst = (unsigned char *)&ss;
+  src += sizeof(a.ss_len);
+  ss.ss_family = a.ss_family;
+  src += sizeof(a.ss_family);
+  dst += sizeof(ss.ss_family);
+  const auto copy_size = std::min((unsigned char*)(&a + 1) - src,
+                                  (unsigned char*)(&ss + 1) - dst);
+  ::memcpy(dst, src, copy_size);
+  encode(ss, bl);
+#else
+  ceph_sockaddr_storage ss{};
+  ::memset(&ss, '\0', sizeof(ss));
+  ::memcpy(&ss, &a, std::min(sizeof(ss), sizeof(a)));
+  encode(ss, bl);
+#endif
+}
+static inline void decode(sockaddr_storage& a, bufferlist::const_iterator& bl) {
+#if defined(__linux__)
+  ::decode_raw(a, bl);
+  a.ss_family = ntohs(a.ss_family);
+#elif defined(__FreeBSD__) || defined(__APPLE__)
+  ceph_sockaddr_storage ss{};
+  decode(ss, bl);
+  auto src = (unsigned char const *)&ss;
+  auto dst = (unsigned char *)&a;
+  a.ss_len = 0;
+  dst += sizeof(a.ss_len);
+  a.ss_family = ss.ss_family;
+  src += sizeof(ss.ss_family);
+  dst += sizeof(a.ss_family);
+  auto const copy_size = std::min((unsigned char*)(&ss + 1) - src,
+                                  (unsigned char*)(&a + 1) - dst);
+  ::memcpy(dst, src, copy_size);
+#else
+  ceph_sockaddr_storage ss{};
+  decode(ss, bl);
+  ::memcpy(&a, &ss, std::min(sizeof(ss), sizeof(a)));
+#endif
+}
+
+/*
+ * an entity's network address.
+ * includes a random value that prevents it from being reused.
+ * thus identifies a particular process instance.
+ * ipv4 for now.
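+ *
+ * examples of the string forms accepted by entity_addr_t::parse():
+ *   "v1:1.2.3.4:6789/0", "v2:[::1]:6800/12345", "any:10.0.0.1:0/0"
+ * (the trailing "/N" is the nonce; "-" parses as TYPE_NONE).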
+ */ +struct entity_addr_t { + typedef enum { + TYPE_NONE = 0, + TYPE_LEGACY = 1, ///< legacy msgr1 protocol (ceph jewel and older) + TYPE_MSGR2 = 2, ///< msgr2 protocol (new in ceph kraken) + TYPE_ANY = 3, ///< ambiguous + } type_t; + static const type_t TYPE_DEFAULT = TYPE_MSGR2; + static std::string_view get_type_name(int t) { + switch (t) { + case TYPE_NONE: return "none"; + case TYPE_LEGACY: return "v1"; + case TYPE_MSGR2: return "v2"; + case TYPE_ANY: return "any"; + default: return "???"; + } + }; + + __u32 type; + __u32 nonce; + union { + sockaddr sa; + sockaddr_in sin; + sockaddr_in6 sin6; + } u; + + entity_addr_t() : type(0), nonce(0) { + memset(&u, 0, sizeof(u)); + } + entity_addr_t(__u32 _type, __u32 _nonce) : type(_type), nonce(_nonce) { + memset(&u, 0, sizeof(u)); + } + explicit entity_addr_t(const ceph_entity_addr &o) { + type = o.type; + nonce = o.nonce; + memcpy(&u, &o.in_addr, sizeof(u)); +#if !defined(__FreeBSD__) + u.sa.sa_family = ntohs(u.sa.sa_family); +#endif + } + + uint32_t get_type() const { return type; } + void set_type(uint32_t t) { type = t; } + bool is_legacy() const { return type == TYPE_LEGACY; } + bool is_msgr2() const { return type == TYPE_MSGR2; } + bool is_any() const { return type == TYPE_ANY; } + + __u32 get_nonce() const { return nonce; } + void set_nonce(__u32 n) { nonce = n; } + + int get_family() const { + return u.sa.sa_family; + } + void set_family(int f) { + u.sa.sa_family = f; + } + + bool is_ipv4() const { + return u.sa.sa_family == AF_INET; + } + bool is_ipv6() const { + return u.sa.sa_family == AF_INET6; + } + + sockaddr_in &in4_addr() { + return u.sin; + } + const sockaddr_in &in4_addr() const{ + return u.sin; + } + sockaddr_in6 &in6_addr(){ + return u.sin6; + } + const sockaddr_in6 &in6_addr() const{ + return u.sin6; + } + const sockaddr *get_sockaddr() const { + return &u.sa; + } + size_t get_sockaddr_len() const { + switch (u.sa.sa_family) { + case AF_INET: + return sizeof(u.sin); + case AF_INET6: + return sizeof(u.sin6); + } + return sizeof(u); + } + bool set_sockaddr(const struct sockaddr *sa) + { + switch (sa->sa_family) { + case AF_INET: + // pre-zero, since we're only copying a portion of the source + memset(&u, 0, sizeof(u)); + memcpy(&u.sin, sa, sizeof(u.sin)); + break; + case AF_INET6: + // pre-zero, since we're only copying a portion of the source + memset(&u, 0, sizeof(u)); + memcpy(&u.sin6, sa, sizeof(u.sin6)); + break; + case AF_UNSPEC: + memset(&u, 0, sizeof(u)); + break; + default: + return false; + } + return true; + } + + sockaddr_storage get_sockaddr_storage() const { + sockaddr_storage ss; + memcpy(&ss, &u, sizeof(u)); + memset((char*)&ss + sizeof(u), 0, sizeof(ss) - sizeof(u)); + return ss; + } + + void set_in4_quad(int pos, int val) { + u.sin.sin_family = AF_INET; + unsigned char *ipq = (unsigned char*)&u.sin.sin_addr.s_addr; + ipq[pos] = val; + } + void set_port(int port) { + switch (u.sa.sa_family) { + case AF_INET: + u.sin.sin_port = htons(port); + break; + case AF_INET6: + u.sin6.sin6_port = htons(port); + break; + default: + ceph_abort(); + } + } + int get_port() const { + switch (u.sa.sa_family) { + case AF_INET: + return ntohs(u.sin.sin_port); + break; + case AF_INET6: + return ntohs(u.sin6.sin6_port); + break; + } + return 0; + } + + operator ceph_entity_addr() const { + ceph_entity_addr a; + a.type = 0; + a.nonce = nonce; + a.in_addr = get_sockaddr_storage(); +#if !defined(__FreeBSD__) + a.in_addr.ss_family = htons(a.in_addr.ss_family); +#endif + return a; + } + + bool probably_equals(const entity_addr_t &o) 
const { + if (get_port() != o.get_port()) + return false; + if (get_nonce() != o.get_nonce()) + return false; + if (is_blank_ip() || o.is_blank_ip()) + return true; + if (memcmp(&u, &o.u, sizeof(u)) == 0) + return true; + return false; + } + + bool is_same_host(const entity_addr_t &o) const { + if (u.sa.sa_family != o.u.sa.sa_family) + return false; + if (u.sa.sa_family == AF_INET) + return u.sin.sin_addr.s_addr == o.u.sin.sin_addr.s_addr; + if (u.sa.sa_family == AF_INET6) + return memcmp(u.sin6.sin6_addr.s6_addr, + o.u.sin6.sin6_addr.s6_addr, + sizeof(u.sin6.sin6_addr.s6_addr)) == 0; + return false; + } + + bool is_blank_ip() const { + switch (u.sa.sa_family) { + case AF_INET: + return u.sin.sin_addr.s_addr == INADDR_ANY; + case AF_INET6: + return memcmp(&u.sin6.sin6_addr, &in6addr_any, sizeof(in6addr_any)) == 0; + default: + return true; + } + } + + bool is_ip() const { + switch (u.sa.sa_family) { + case AF_INET: + case AF_INET6: + return true; + default: + return false; + } + } + + std::string ip_only_to_str() const; + + std::string get_legacy_str() const { + ostringstream ss; + ss << get_sockaddr() << "/" << get_nonce(); + return ss.str(); + } + + bool parse(const char *s, const char **end = 0, int type=0); + + void decode_legacy_addr_after_marker(bufferlist::const_iterator& bl) + { + using ceph::decode; + __u8 marker; + __u16 rest; + decode(marker, bl); + decode(rest, bl); + decode(nonce, bl); + sockaddr_storage ss; + decode(ss, bl); + set_sockaddr((sockaddr*)&ss); + if (get_family() == AF_UNSPEC) { + type = TYPE_NONE; + } else { + type = TYPE_LEGACY; + } + } + + // Right now, these only deal with sockaddr_storage that have only family and content. + // Apparently on BSD there is also an ss_len that we need to handle; this requires + // broader study + + void encode(bufferlist& bl, uint64_t features) const { + using ceph::encode; + if ((features & CEPH_FEATURE_MSG_ADDR2) == 0) { + encode((__u32)0, bl); + encode(nonce, bl); + sockaddr_storage ss = get_sockaddr_storage(); + encode(ss, bl); + return; + } + encode((__u8)1, bl); + ENCODE_START(1, 1, bl); + if (HAVE_FEATURE(features, SERVER_NAUTILUS)) { + encode(type, bl); + } else { + // map any -> legacy for old clients. this is primary for the benefit + // of OSDMap's blacklist, but is reasonable in general since any: is + // meaningless for pre-nautilus clients or daemons. 
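+      // A rough sketch of the effect for a peer that speaks MSG_ADDR2 but + // not SERVER_NAUTILUS (hedged example; values are illustrative, the + // types and feature bit are the ones used in this header): + //   entity_addr_t a; + //   a.set_type(entity_addr_t::TYPE_ANY); + //   bufferlist bl; + //   a.encode(bl, CEPH_FEATURE_MSG_ADDR2);  // peer decodes TYPE_LEGACY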
+ auto t = type; + if (t == TYPE_ANY) { + t = TYPE_LEGACY; + } + encode(t, bl); + } + encode(nonce, bl); + __u32 elen = get_sockaddr_len(); +#if (__FreeBSD__) || defined(__APPLE__) + elen -= sizeof(u.sa.sa_len); +#endif + encode(elen, bl); + if (elen) { + uint16_t ss_family = u.sa.sa_family; + + encode(ss_family, bl); + elen -= sizeof(u.sa.sa_family); + bl.append(u.sa.sa_data, elen); + } + ENCODE_FINISH(bl); + } + void decode(bufferlist::const_iterator& bl) { + using ceph::decode; + __u8 marker; + decode(marker, bl); + if (marker == 0) { + decode_legacy_addr_after_marker(bl); + return; + } + if (marker != 1) + throw buffer::malformed_input("entity_addr_t marker != 1"); + DECODE_START(1, bl); + decode(type, bl); + decode(nonce, bl); + __u32 elen; + decode(elen, bl); + if (elen) { +#if defined(__FreeBSD__) || defined(__APPLE__) + u.sa.sa_len = 0; +#endif + uint16_t ss_family; + if (elen < sizeof(ss_family)) { + throw buffer::malformed_input("elen smaller than family len"); + } + decode(ss_family, bl); + u.sa.sa_family = ss_family; + elen -= sizeof(ss_family); + if (elen > get_sockaddr_len() - sizeof(u.sa.sa_family)) { + throw buffer::malformed_input("elen exceeds sockaddr len"); + } + bl.copy(elen, u.sa.sa_data); + } + DECODE_FINISH(bl); + } + + void dump(Formatter *f) const; + + static void generate_test_instances(list<entity_addr_t*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(entity_addr_t) + +ostream& operator<<(ostream& out, const entity_addr_t &addr); + +inline bool operator==(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; } +inline bool operator!=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) != 0; } +inline bool operator<(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) < 0; } +inline bool operator<=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; } +inline bool operator>(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; } +inline bool operator>=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; } + +namespace std { + template<> struct hash< entity_addr_t > + { + size_t operator()( const entity_addr_t& x ) const + { + static blobhash H; + return H((const char*)&x, sizeof(x)); + } + }; +} // namespace std + +struct entity_addrvec_t { + vector<entity_addr_t> v; + + entity_addrvec_t() {} + explicit entity_addrvec_t(const entity_addr_t& a) : v({ a }) {} + + unsigned size() const { return v.size(); } + bool empty() const { return v.empty(); } + + entity_addr_t legacy_addr() const { + for (auto& a : v) { + if (a.type == entity_addr_t::TYPE_LEGACY) { + return a; + } + } + return entity_addr_t(); + } + entity_addr_t as_legacy_addr() const { + for (auto& a : v) { + if (a.is_legacy()) { + return a; + } + if (a.is_any()) { + auto b = a; + b.set_type(entity_addr_t::TYPE_LEGACY); + return b; + } + } + // hrm... lie! 
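+    // Sketch with an illustrative address: given v = { v2:10.0.0.1:3300/0 }, + // the fallback below returns v1:10.0.0.1:3300/0 -- the same ip:port/nonce + // relabelled TYPE_LEGACY, even though nothing necessarily speaks msgr1 + // there. Callers that need a genuine legacy address should prefer + // legacy_addr(), which returns a blank entity_addr_t when none exists.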
+ auto a = front(); + a.set_type(entity_addr_t::TYPE_LEGACY); + return a; + } + entity_addr_t front() const { + if (!v.empty()) { + return v.front(); + } + return entity_addr_t(); + } + entity_addr_t legacy_or_front_addr() const { + for (auto& a : v) { + if (a.type == entity_addr_t::TYPE_LEGACY) { + return a; + } + } + if (!v.empty()) { + return v.front(); + } + return entity_addr_t(); + } + string get_legacy_str() const { + return legacy_or_front_addr().get_legacy_str(); + } + + entity_addr_t msgr2_addr() const { + for (auto &a : v) { + if (a.type == entity_addr_t::TYPE_MSGR2) { + return a; + } + } + return entity_addr_t(); + } + bool has_msgr2() const { + for (auto& a : v) { + if (a.is_msgr2()) { + return true; + } + } + return false; + } + + bool parse(const char *s, const char **end = 0); + + void get_ports(set<int> *ports) const { + for (auto& a : v) { + ports->insert(a.get_port()); + } + } + set<int> get_ports() const { + set<int> r; + get_ports(&r); + return r; + } + + void encode(bufferlist& bl, uint64_t features) const; + void decode(bufferlist::const_iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<entity_addrvec_t*>& ls); + + bool legacy_equals(const entity_addrvec_t& o) const { + if (v == o.v) { + return true; + } + if (v.size() == 1 && + front().is_legacy() && + front() == o.legacy_addr()) { + return true; + } + if (o.v.size() == 1 && + o.front().is_legacy() && + o.front() == legacy_addr()) { + return true; + } + return false; + } + + bool probably_equals(const entity_addrvec_t& o) const { + for (unsigned i = 0; i < v.size(); ++i) { + if (!v[i].probably_equals(o.v[i])) { + return false; + } + } + return true; + } + bool contains(const entity_addr_t& a) const { + for (auto& i : v) { + if (a == i) { + return true; + } + } + return false; + } + bool is_same_host(const entity_addr_t& a) const { + for (auto& i : v) { + if (i.is_same_host(a)) { + return true; + } + } + return false; + } + + friend ostream& operator<<(ostream& out, const entity_addrvec_t& av) { + if (av.v.empty()) { + return out; + } else if (av.v.size() == 1) { + return out << av.v[0]; + } else { + return out << av.v; + } + } + + friend bool operator==(const entity_addrvec_t& l, const entity_addrvec_t& r) { + return l.v == r.v; + } + friend bool operator!=(const entity_addrvec_t& l, const entity_addrvec_t& r) { + return l.v != r.v; + } + friend bool operator<(const entity_addrvec_t& l, const entity_addrvec_t& r) { + return l.v < r.v; // see lexicographical_compare() + } +}; +WRITE_CLASS_ENCODER_FEATURES(entity_addrvec_t); + +namespace std { + template<> struct hash< entity_addrvec_t > + { + size_t operator()( const entity_addrvec_t& x ) const + { + static blobhash H; + size_t r = 0; + for (auto& i : x.v) { + r += H((const char*)&i, sizeof(i)); + } + return r; + } + }; +} // namespace std + +/* + * a particular entity instance + */ +struct entity_inst_t { + entity_name_t name; + entity_addr_t addr; + entity_inst_t() {} + entity_inst_t(entity_name_t n, const entity_addr_t& a) : name(n), addr(a) {} + // cppcheck-suppress noExplicitConstructor + entity_inst_t(const ceph_entity_inst& i) : name(i.name), addr(i.addr) { } + entity_inst_t(const ceph_entity_name& n, const ceph_entity_addr &a) : name(n), addr(a) {} + operator ceph_entity_inst() { + ceph_entity_inst i = {name, addr}; + return i; + } + + void encode(bufferlist& bl, uint64_t features) const { + using ceph::encode; + encode(name, bl); + encode(addr, bl, features); + } + void decode(bufferlist::const_iterator& bl) { + using 
ceph::decode; + decode(name, bl); + decode(addr, bl); + } + + void dump(Formatter *f) const; + static void generate_test_instances(list<entity_inst_t*>& o); +}; +WRITE_CLASS_ENCODER_FEATURES(entity_inst_t) + + +inline bool operator==(const entity_inst_t& a, const entity_inst_t& b) { + return a.name == b.name && a.addr == b.addr; +} +inline bool operator!=(const entity_inst_t& a, const entity_inst_t& b) { + return a.name != b.name || a.addr != b.addr; +} +inline bool operator<(const entity_inst_t& a, const entity_inst_t& b) { + return a.name < b.name || (a.name == b.name && a.addr < b.addr); +} +inline bool operator<=(const entity_inst_t& a, const entity_inst_t& b) { + return a.name < b.name || (a.name == b.name && a.addr <= b.addr); +} +inline bool operator>(const entity_inst_t& a, const entity_inst_t& b) { return b < a; } +inline bool operator>=(const entity_inst_t& a, const entity_inst_t& b) { return b <= a; } + +namespace std { + template<> struct hash< entity_inst_t > + { + size_t operator()( const entity_inst_t& x ) const + { + static hash< entity_name_t > H; + static hash< entity_addr_t > I; + return H(x.name) ^ I(x.addr); + } + }; +} // namespace std + + +inline ostream& operator<<(ostream& out, const entity_inst_t &i) +{ + return out << i.name << " " << i.addr; +} +inline ostream& operator<<(ostream& out, const ceph_entity_inst &i) +{ + entity_inst_t n = i; + return out << n; +} + +#endif diff --git a/src/msg/simple/Accepter.cc b/src/msg/simple/Accepter.cc new file mode 100644 index 00000000..52f3df2b --- /dev/null +++ b/src/msg/simple/Accepter.cc @@ -0,0 +1,402 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/compat.h" +#include "include/sock_compat.h" +#include <iterator> +#include <sys/socket.h> +#include <netinet/tcp.h> +#include <sys/uio.h> +#include <limits.h> +#include <poll.h> + +#include "msg/msg_types.h" +#include "msg/Message.h" + +#include "Accepter.h" +#include "Pipe.h" +#include "SimpleMessenger.h" + +#include "common/debug.h" +#include "common/errno.h" +#include "common/safe_io.h" + +#define dout_subsys ceph_subsys_ms + +#undef dout_prefix +#define dout_prefix *_dout << "accepter." + + +/******************************************** + * Accepter + */ + +int Accepter::create_selfpipe(int *pipe_rd, int *pipe_wr) { + int selfpipe[2]; + if (pipe_cloexec(selfpipe) < 0) { + int e = errno; + lderr(msgr->cct) << __func__ << " unable to create the selfpipe: " + << cpp_strerror(e) << dendl; + return -e; + } + for (size_t i = 0; i < std::size(selfpipe); i++) { + int rc = fcntl(selfpipe[i], F_GETFL); + ceph_assert(rc != -1); + rc = fcntl(selfpipe[i], F_SETFL, rc | O_NONBLOCK); + ceph_assert(rc != -1); + } + *pipe_rd = selfpipe[0]; + *pipe_wr = selfpipe[1]; + return 0; +} + +int Accepter::bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports) +{ + const auto& conf = msgr->cct->_conf; + // bind to a socket + ldout(msgr->cct,10) << __func__ << dendl; + + int family; + switch (bind_addr.get_family()) { + case AF_INET: + case AF_INET6: + family = bind_addr.get_family(); + break; + + default: + // bind_addr is empty + family = conf->ms_bind_ipv6 ? 
AF_INET6 : AF_INET; + } + + /* socket creation */ + listen_sd = socket_cloexec(family, SOCK_STREAM, 0); + if (listen_sd < 0) { + int e = errno; + lderr(msgr->cct) << __func__ << " unable to create socket: " + << cpp_strerror(e) << dendl; + return -e; + } + ldout(msgr->cct,10) << __func__ << " socket sd: " << listen_sd << dendl; + + // use whatever user specified (if anything) + entity_addr_t listen_addr = bind_addr; + if (listen_addr.get_type() == entity_addr_t::TYPE_NONE) { + listen_addr.set_type(entity_addr_t::TYPE_LEGACY); + } + listen_addr.set_family(family); + + /* bind to port */ + int rc = -1; + int r = -1; + + for (int i = 0; i < conf->ms_bind_retry_count; i++) { + + if (i > 0) { + lderr(msgr->cct) << __func__ << " was unable to bind. Trying again in " + << conf->ms_bind_retry_delay << " seconds " << dendl; + sleep(conf->ms_bind_retry_delay); + } + + if (listen_addr.get_port()) { + // specific port + + // reuse addr+port when possible + int on = 1; + rc = ::setsockopt(listen_sd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); + if (rc < 0) { + lderr(msgr->cct) << __func__ << " unable to setsockopt: " + << cpp_strerror(errno) << dendl; + r = -errno; + continue; + } + + rc = ::bind(listen_sd, listen_addr.get_sockaddr(), + listen_addr.get_sockaddr_len()); + if (rc < 0) { + lderr(msgr->cct) << __func__ << " unable to bind to " << listen_addr + << ": " << cpp_strerror(errno) << dendl; + r = -errno; + continue; + } + } else { + // try a range of ports + for (int port = msgr->cct->_conf->ms_bind_port_min; + port <= msgr->cct->_conf->ms_bind_port_max; port++) { + if (avoid_ports.count(port)) + continue; + + listen_addr.set_port(port); + rc = ::bind(listen_sd, listen_addr.get_sockaddr(), + listen_addr.get_sockaddr_len()); + if (rc == 0) + break; + } + if (rc < 0) { + lderr(msgr->cct) << __func__ << " unable to bind to " << listen_addr + << " on any port in range " << msgr->cct->_conf->ms_bind_port_min + << "-" << msgr->cct->_conf->ms_bind_port_max + << ": " << cpp_strerror(errno) + << dendl; + r = -errno; + // Clear port before retry, otherwise we shall fail again. + listen_addr.set_port(0); + continue; + } + ldout(msgr->cct,10) << __func__ << " bound on random port " + << listen_addr << dendl; + } + + if (rc == 0) + break; + } + + // It seems that binding completely failed, return with that exit status + if (rc < 0) { + lderr(msgr->cct) << __func__ << " was unable to bind after " + << conf->ms_bind_retry_count << " attempts: " + << cpp_strerror(errno) << dendl; + ::close(listen_sd); + listen_sd = -1; + return r; + } + + // what port did we get? + sockaddr_storage ss; + socklen_t llen = sizeof(ss); + rc = getsockname(listen_sd, (sockaddr*)&ss, &llen); + if (rc < 0) { + rc = -errno; + lderr(msgr->cct) << __func__ << " failed getsockname: " + << cpp_strerror(rc) << dendl; + ::close(listen_sd); + listen_sd = -1; + return rc; + } + listen_addr.set_sockaddr((sockaddr*)&ss); + + if (msgr->cct->_conf->ms_tcp_rcvbuf) { + int size = msgr->cct->_conf->ms_tcp_rcvbuf; + rc = ::setsockopt(listen_sd, SOL_SOCKET, SO_RCVBUF, + (void*)&size, sizeof(size)); + if (rc < 0) { + rc = -errno; + lderr(msgr->cct) << __func__ << " failed to set SO_RCVBUF to " + << size << ": " << cpp_strerror(rc) << dendl; + ::close(listen_sd); + listen_sd = -1; + return rc; + } + } + + ldout(msgr->cct,10) << __func__ << " bound to " << listen_addr << dendl; + + // listen! 
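+  // For reference, the sequence so far has been roughly + //   socket() -> setsockopt(SO_REUSEADDR) -> bind() (with retries) + //   -> getsockname() -> setsockopt(SO_RCVBUF), + // and listen() below uses a backlog of ms_tcp_listen_backlog, which the + // kernel may silently clamp (e.g. to net.core.somaxconn on Linux).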
+ rc = ::listen(listen_sd, msgr->cct->_conf->ms_tcp_listen_backlog); + if (rc < 0) { + rc = -errno; + lderr(msgr->cct) << __func__ << " unable to listen on " << listen_addr + << ": " << cpp_strerror(rc) << dendl; + ::close(listen_sd); + listen_sd = -1; + return rc; + } + + msgr->set_myaddrs(entity_addrvec_t(bind_addr)); + if (bind_addr != entity_addr_t() && + !bind_addr.is_blank_ip()) + msgr->learned_addr(bind_addr); + else + ceph_assert(msgr->get_need_addr()); // should still be true. + + if (msgr->get_myaddr_legacy().get_port() == 0) { + msgr->set_myaddrs(entity_addrvec_t(listen_addr)); + } + entity_addr_t addr = msgr->get_myaddr_legacy(); + addr.nonce = nonce; + msgr->set_myaddrs(entity_addrvec_t(addr)); + + msgr->init_local_connection(); + + rc = create_selfpipe(&shutdown_rd_fd, &shutdown_wr_fd); + if (rc < 0) { + lderr(msgr->cct) << __func__ << " unable to create signalling pipe " << listen_addr + << ": " << cpp_strerror(rc) << dendl; + return rc; + } + + ldout(msgr->cct,1) << __func__ << " my_addrs " << *msgr->my_addrs + << " my_addr " << msgr->my_addr + << " need_addr=" << msgr->get_need_addr() << dendl; + return 0; +} + +int Accepter::rebind(const set<int>& avoid_ports) +{ + ldout(msgr->cct,1) << __func__ << " avoid " << avoid_ports << dendl; + + entity_addr_t addr = msgr->get_myaddr_legacy(); + set<int> new_avoid = avoid_ports; + new_avoid.insert(addr.get_port()); + addr.set_port(0); + + // adjust the nonce; we want our entity_addr_t to be truly unique. + nonce += 1000000; + entity_addrvec_t newaddrs = *msgr->my_addrs; + newaddrs.v[0].nonce = nonce; + msgr->set_myaddrs(newaddrs); + ldout(msgr->cct,10) << __func__ << " new nonce " << nonce << " and addr " + << msgr->my_addr << dendl; + + ldout(msgr->cct,10) << " will try " << addr << " and avoid ports " << new_avoid << dendl; + int r = bind(addr, new_avoid); + if (r == 0) + start(); + return r; +} + +int Accepter::start() +{ + ldout(msgr->cct,1) << __func__ << dendl; + + // start thread + create("ms_accepter"); + + return 0; +} + +void *Accepter::entry() +{ + ldout(msgr->cct,1) << __func__ << " start" << dendl; + + int errors = 0; + + struct pollfd pfd[2]; + memset(pfd, 0, sizeof(pfd)); + + pfd[0].fd = listen_sd; + pfd[0].events = POLLIN | POLLERR | POLLNVAL | POLLHUP; + pfd[1].fd = shutdown_rd_fd; + pfd[1].events = POLLIN | POLLERR | POLLNVAL | POLLHUP; + while (!done) { + ldout(msgr->cct,20) << __func__ << " calling poll for sd:" << listen_sd << dendl; + int r = poll(pfd, 2, -1); + if (r < 0) { + if (errno == EINTR) { + continue; + } + ldout(msgr->cct,1) << __func__ << " poll got error" + << " errno " << errno << " " << cpp_strerror(errno) << dendl; + ceph_abort(); + } + ldout(msgr->cct,10) << __func__ << " poll returned ok: " << r << dendl; + ldout(msgr->cct,20) << __func__ << " pfd.revents[0]=" << pfd[0].revents << dendl; + ldout(msgr->cct,20) << __func__ << " pfd.revents[1]=" << pfd[1].revents << dendl; + + if (pfd[0].revents & (POLLERR | POLLNVAL | POLLHUP)) { + ldout(msgr->cct,1) << __func__ << " poll got errors in revents " + << pfd[0].revents << dendl; + ceph_abort(); + } + if (pfd[1].revents & (POLLIN | POLLERR | POLLNVAL | POLLHUP)) { + // We got "signaled" to exit the poll + // clean the selfpipe + char ch; + if (::read(shutdown_rd_fd, &ch, sizeof(ch)) == -1) { + if (errno != EAGAIN) + ldout(msgr->cct,1) << __func__ << " Cannot read selfpipe: " + << " errno " << errno << " " << cpp_strerror(errno) << dendl; + } + break; + } + if (done) break; + + // accept + sockaddr_storage ss; + socklen_t slen = sizeof(ss); + int sd 
= accept_cloexec(listen_sd, (sockaddr*)&ss, &slen); + if (sd >= 0) { + errors = 0; + ldout(msgr->cct,10) << __func__ << " incoming on sd " << sd << dendl; + + msgr->add_accept_pipe(sd); + } else { + int e = errno; + ldout(msgr->cct,0) << __func__ << " no incoming connection? sd = " << sd + << " errno " << e << " " << cpp_strerror(e) << dendl; + if (++errors > msgr->cct->_conf->ms_max_accept_failures) { + lderr(msgr->cct) << "accepter has encountered enough errors, just do ceph_abort()." << dendl; + ceph_abort(); + } + } + } + + ldout(msgr->cct,20) << __func__ << " closing" << dendl; + // socket is closed right after the thread has joined. + // closing it here might race + if (shutdown_rd_fd >= 0) { + ::close(shutdown_rd_fd); + shutdown_rd_fd = -1; + } + + ldout(msgr->cct,10) << __func__ << " stopping" << dendl; + return 0; +} + +void Accepter::stop() +{ + done = true; + ldout(msgr->cct,10) << __func__ << " accept listening on: " << listen_sd << dendl; + + if (shutdown_wr_fd < 0) + return; + + // Send a byte to the shutdown pipe that the thread is listening to + char ch = 0x0; + int ret = safe_write(shutdown_wr_fd, &ch, sizeof(ch)); + if (ret < 0) { + ldout(msgr->cct,1) << __func__ << " write to shutdown pipe failed: " + << " errno " << errno << " " << cpp_strerror(errno) << dendl; + } else { + ldout(msgr->cct,15) << __func__ << " signaled poll" << dendl; + } + VOID_TEMP_FAILURE_RETRY(close(shutdown_wr_fd)); + shutdown_wr_fd = -1; + + // wait for thread to stop before closing the socket, to avoid + // racing against fd re-use. + if (is_started()) { + ldout(msgr->cct,5) << __func__ << " wait for thread to join." << dendl; + join(); + } + + if (listen_sd >= 0) { + if (::close(listen_sd) < 0) { + ldout(msgr->cct,1) << __func__ << " close listen_sd failed: " + << " errno " << errno << " " << cpp_strerror(errno) << dendl; + } + listen_sd = -1; + } + if (shutdown_rd_fd >= 0) { + if (::close(shutdown_rd_fd) < 0) { + ldout(msgr->cct,1) << __func__ << " close shutdown_rd_fd failed: " + << " errno " << errno << " " << cpp_strerror(errno) << dendl; + } + shutdown_rd_fd = -1; + } + done = false; +} + + + + diff --git a/src/msg/simple/Accepter.h b/src/msg/simple/Accepter.h new file mode 100644 index 00000000..7824c3a1 --- /dev/null +++ b/src/msg/simple/Accepter.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MSG_ACCEPTER_H +#define CEPH_MSG_ACCEPTER_H + +#include "common/Thread.h" + +class SimpleMessenger; +struct entity_addr_t; + +/** + * If the SimpleMessenger binds to a specific address, the Accepter runs + * and listens for incoming connections. 
+ */ +class Accepter : public Thread { + SimpleMessenger *msgr; + bool done; + int listen_sd; + uint64_t nonce; + int shutdown_rd_fd; + int shutdown_wr_fd; + int create_selfpipe(int *pipe_rd, int *pipe_wr); + +public: + Accepter(SimpleMessenger *r, uint64_t n) + : msgr(r), done(false), listen_sd(-1), nonce(n), + shutdown_rd_fd(-1), shutdown_wr_fd(-1) + {} + + void *entry() override; + void stop(); + int bind(const entity_addr_t &bind_addr, const set<int>& avoid_ports); + int rebind(const set<int>& avoid_port); + int start(); +}; + + +#endif diff --git a/src/msg/simple/Pipe.cc b/src/msg/simple/Pipe.cc new file mode 100644 index 00000000..fd44dc4e --- /dev/null +++ b/src/msg/simple/Pipe.cc @@ -0,0 +1,2712 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> +#include <sys/uio.h> +#include <limits.h> +#include <poll.h> + +#include "msg/Message.h" +#include "Pipe.h" +#include "SimpleMessenger.h" + +#include "common/debug.h" +#include "common/errno.h" +#include "common/valgrind.h" + +// Below included to get encode_encrypt(); That probably should be in Crypto.h, instead + +#include "auth/cephx/CephxProtocol.h" +#include "auth/AuthSessionHandler.h" + +#include "include/compat.h" +#include "include/sock_compat.h" +#include "include/random.h" + +// Constant to limit starting sequence number to 2^31. Nothing special about it, just a big number. PLR +#define SEQ_MASK 0x7fffffff +#define dout_subsys ceph_subsys_ms + +#undef dout_prefix +#define dout_prefix *_dout << *this +ostream& Pipe::_pipe_prefix(std::ostream &out) const { + return out << "-- " << msgr->get_myaddr_legacy() << " >> " << peer_addr + << " pipe(" << this + << " sd=" << sd << " :" << port + << " s=" << state + << " pgs=" << peer_global_seq + << " cs=" << connect_seq + << " l=" << policy.lossy + << " c=" << connection_state + << ")."; +} + +ostream& operator<<(ostream &out, const Pipe &pipe) { + return pipe._pipe_prefix(out); +} + +/** + * The DelayedDelivery is for injecting delays into Message delivery off + * the socket. It is only enabled if delays are requested, and if they + * are then it pulls Messages off the DelayQueue and puts them into the + * in_q (SimpleMessenger::dispatch_queue). + * Please note that this probably has issues with Pipe shutdown and + * replacement semantics. I've tried, but no guarantees. 
+ */ +class Pipe::DelayedDelivery: public Thread { + Pipe *pipe; + std::deque< pair<utime_t,Message*> > delay_queue; + Mutex delay_lock; + Cond delay_cond; + int flush_count; + bool active_flush; + bool stop_delayed_delivery; + bool delay_dispatching; // we are in fast dispatch now + bool stop_fast_dispatching_flag; // we need to stop fast dispatching + +public: + explicit DelayedDelivery(Pipe *p) + : pipe(p), + delay_lock("Pipe::DelayedDelivery::delay_lock"), flush_count(0), + active_flush(false), + stop_delayed_delivery(false), + delay_dispatching(false), + stop_fast_dispatching_flag(false) { } + ~DelayedDelivery() override { + discard(); + } + void *entry() override; + void queue(utime_t release, Message *m) { + Mutex::Locker l(delay_lock); + delay_queue.push_back(make_pair(release, m)); + delay_cond.Signal(); + } + void discard(); + void flush(); + bool is_flushing() { + Mutex::Locker l(delay_lock); + return flush_count > 0 || active_flush; + } + void wait_for_flush() { + Mutex::Locker l(delay_lock); + while (flush_count > 0 || active_flush) + delay_cond.Wait(delay_lock); + } + void stop() { + delay_lock.Lock(); + stop_delayed_delivery = true; + delay_cond.Signal(); + delay_lock.Unlock(); + } + void steal_for_pipe(Pipe *new_owner) { + Mutex::Locker l(delay_lock); + pipe = new_owner; + } + /** + * We need to stop fast dispatching before we need to stop putting + * normal messages into the DispatchQueue. + */ + void stop_fast_dispatching(); +}; + +/************************************** + * Pipe + */ + +Pipe::Pipe(SimpleMessenger *r, int st, PipeConnection *con) + : RefCountedObject(r->cct), + reader_thread(this), + writer_thread(this), + delay_thread(NULL), + msgr(r), + conn_id(r->dispatch_queue.get_id()), + recv_ofs(0), + recv_len(0), + sd(-1), port(0), + peer_type(-1), + pipe_lock("SimpleMessenger::Pipe::pipe_lock"), + state(st), + connection_state(NULL), + reader_running(false), reader_needs_join(false), + reader_dispatching(false), notify_on_dispatch_done(false), + writer_running(false), + in_q(&(r->dispatch_queue)), + send_keepalive(false), + send_keepalive_ack(false), + connect_seq(0), peer_global_seq(0), + out_seq(0), in_seq(0), in_seq_acked(0) { + ANNOTATE_BENIGN_RACE_SIZED(&sd, sizeof(sd), "Pipe socket"); + ANNOTATE_BENIGN_RACE_SIZED(&state, sizeof(state), "Pipe state"); + ANNOTATE_BENIGN_RACE_SIZED(&recv_len, sizeof(recv_len), "Pipe recv_len"); + ANNOTATE_BENIGN_RACE_SIZED(&recv_ofs, sizeof(recv_ofs), "Pipe recv_ofs"); + if (con) { + connection_state = con; + connection_state->reset_pipe(this); + } else { + connection_state = new PipeConnection(msgr->cct, msgr); + connection_state->pipe = get(); + } + + randomize_out_seq(); + + msgr->timeout = msgr->cct->_conf->ms_connection_idle_timeout * 1000; //convert to ms + if (msgr->timeout == 0) + msgr->timeout = -1; + + recv_max_prefetch = msgr->cct->_conf->ms_tcp_prefetch_max_size; + recv_buf = new char[recv_max_prefetch]; +} + +Pipe::~Pipe() +{ + ceph_assert(out_q.empty()); + ceph_assert(sent.empty()); + delete delay_thread; + delete[] recv_buf; +} + +void Pipe::handle_ack(uint64_t seq) +{ + lsubdout(msgr->cct, ms, 15) << "reader got ack seq " << seq << dendl; + // trim sent list + while (!sent.empty() && + sent.front()->get_seq() <= seq) { + Message *m = sent.front(); + sent.pop_front(); + lsubdout(msgr->cct, ms, 10) << "reader got ack seq " + << seq << " >= " << m->get_seq() << " on " << m << " " << *m << dendl; + m->put(); + } +} + +void Pipe::start_reader() +{ + ceph_assert(pipe_lock.is_locked()); + ceph_assert(!reader_running); 
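+  // reader_needs_join is left set when a previous reader thread exited on + // its own; that thread must be joined before reader_thread.create() below + // can safely reuse the same Thread object.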
+ if (reader_needs_join) { + reader_thread.join(); + reader_needs_join = false; + } + reader_running = true; + reader_thread.create("ms_pipe_read", msgr->cct->_conf->ms_rwthread_stack_bytes); +} + +void Pipe::maybe_start_delay_thread() +{ + if (!delay_thread) { + auto pos = msgr->cct->_conf.get_val<std::string>("ms_inject_delay_type").find(ceph_entity_type_name(connection_state->peer_type)); + if (pos != string::npos) { + lsubdout(msgr->cct, ms, 1) << "setting up a delay queue on Pipe " << this << dendl; + delay_thread = new DelayedDelivery(this); + delay_thread->create("ms_pipe_delay"); + } + } +} + +void Pipe::start_writer() +{ + ceph_assert(pipe_lock.is_locked()); + ceph_assert(!writer_running); + writer_running = true; + writer_thread.create("ms_pipe_write", msgr->cct->_conf->ms_rwthread_stack_bytes); +} + +void Pipe::join_reader() +{ + if (!reader_running) + return; + cond.Signal(); + pipe_lock.Unlock(); + reader_thread.join(); + pipe_lock.Lock(); + reader_needs_join = false; +} + +void Pipe::DelayedDelivery::discard() +{ + lgeneric_subdout(pipe->msgr->cct, ms, 20) << *pipe << "DelayedDelivery::discard" << dendl; + Mutex::Locker l(delay_lock); + while (!delay_queue.empty()) { + Message *m = delay_queue.front().second; + pipe->in_q->dispatch_throttle_release(m->get_dispatch_throttle_size()); + m->put(); + delay_queue.pop_front(); + } +} + +void Pipe::DelayedDelivery::flush() +{ + lgeneric_subdout(pipe->msgr->cct, ms, 20) << *pipe << "DelayedDelivery::flush" << dendl; + Mutex::Locker l(delay_lock); + flush_count = delay_queue.size(); + delay_cond.Signal(); +} + +void *Pipe::DelayedDelivery::entry() +{ + Mutex::Locker locker(delay_lock); + lgeneric_subdout(pipe->msgr->cct, ms, 20) << *pipe << "DelayedDelivery::entry start" << dendl; + + while (!stop_delayed_delivery) { + if (delay_queue.empty()) { + lgeneric_subdout(pipe->msgr->cct, ms, 30) << *pipe << "DelayedDelivery::entry sleeping on delay_cond because delay queue is empty" << dendl; + delay_cond.Wait(delay_lock); + continue; + } + utime_t release = delay_queue.front().first; + Message *m = delay_queue.front().second; + string delay_msg_type = pipe->msgr->cct->_conf->ms_inject_delay_msg_type; + if (!flush_count && + (release > ceph_clock_now() && + (delay_msg_type.empty() || m->get_type_name() == delay_msg_type))) { + lgeneric_subdout(pipe->msgr->cct, ms, 10) << *pipe << "DelayedDelivery::entry sleeping on delay_cond until " << release << dendl; + delay_cond.WaitUntil(delay_lock, release); + continue; + } + lgeneric_subdout(pipe->msgr->cct, ms, 10) << *pipe << "DelayedDelivery::entry dequeuing message " << m << " for delivery, past " << release << dendl; + delay_queue.pop_front(); + if (flush_count > 0) { + --flush_count; + active_flush = true; + } + if (pipe->in_q->can_fast_dispatch(m)) { + if (!stop_fast_dispatching_flag) { + delay_dispatching = true; + delay_lock.Unlock(); + pipe->in_q->fast_dispatch(m); + delay_lock.Lock(); + delay_dispatching = false; + if (stop_fast_dispatching_flag) { + // we need to let the stopping thread proceed + delay_cond.Signal(); + delay_lock.Unlock(); + delay_lock.Lock(); + } + } + } else { + pipe->in_q->enqueue(m, m->get_priority(), pipe->conn_id); + } + active_flush = false; + } + lgeneric_subdout(pipe->msgr->cct, ms, 20) << *pipe << "DelayedDelivery::entry stop" << dendl; + return NULL; +} + +void Pipe::DelayedDelivery::stop_fast_dispatching() { + Mutex::Locker l(delay_lock); + stop_fast_dispatching_flag = true; + while (delay_dispatching) + delay_cond.Wait(delay_lock); +} + + +int Pipe::accept() 
+{ + ldout(msgr->cct,10) << "accept" << dendl; + ceph_assert(pipe_lock.is_locked()); + ceph_assert(state == STATE_ACCEPTING); + + pipe_lock.Unlock(); + + // vars + bufferlist addrs; + entity_addr_t socket_addr; + socklen_t len; + int r; + char banner[strlen(CEPH_BANNER)+1]; + bufferlist addrbl; + ceph_msg_connect connect; + ceph_msg_connect_reply reply; + Pipe *existing = 0; + bufferptr bp; + bufferlist authorizer, authorizer_reply; + bool authorizer_valid; + uint64_t feat_missing; + bool replaced = false; + // this variable denotes if the connection attempt from peer is a hard + // reset or not, it is true if there is an existing connection and the + // connection sequence from peer is equal to zero + bool is_reset_from_peer = false; + CryptoKey session_key; + int removed; // single-use down below + + // this should roughly mirror pseudocode at + // http://ceph.com/wiki/Messaging_protocol + int reply_tag = 0; + uint64_t existing_seq = -1; + + // used for reading in the remote acked seq on connect + uint64_t newly_acked_seq = 0; + + bool need_challenge = false; + bool had_challenge = false; + std::unique_ptr<AuthAuthorizerChallenge> authorizer_challenge; + + recv_reset(); + + set_socket_options(); + + // announce myself. + r = tcp_write(CEPH_BANNER, strlen(CEPH_BANNER)); + if (r < 0) { + ldout(msgr->cct,10) << "accept couldn't write banner" << dendl; + goto fail_unlocked; + } + + // and my addr + encode(msgr->my_addr, addrs, 0); // legacy + + port = msgr->my_addr.get_port(); + + // and peer's socket addr (they might not know their ip) + sockaddr_storage ss; + len = sizeof(ss); + r = ::getpeername(sd, (sockaddr*)&ss, &len); + if (r < 0) { + ldout(msgr->cct,0) << "accept failed to getpeername " << cpp_strerror(errno) << dendl; + goto fail_unlocked; + } + socket_addr.set_sockaddr((sockaddr*)&ss); + encode(socket_addr, addrs, 0); // legacy + + r = tcp_write(addrs.c_str(), addrs.length()); + if (r < 0) { + ldout(msgr->cct,10) << "accept couldn't write my+peer addr" << dendl; + goto fail_unlocked; + } + + ldout(msgr->cct,1) << "accept sd=" << sd << " " << socket_addr << dendl; + + // identify peer + if (tcp_read(banner, strlen(CEPH_BANNER)) < 0) { + ldout(msgr->cct,10) << "accept couldn't read banner" << dendl; + goto fail_unlocked; + } + if (memcmp(banner, CEPH_BANNER, strlen(CEPH_BANNER))) { + banner[strlen(CEPH_BANNER)] = 0; + ldout(msgr->cct,1) << "accept peer sent bad banner '" << banner << "' (should be '" << CEPH_BANNER << "')" << dendl; + goto fail_unlocked; + } + { + bufferptr tp(sizeof(ceph_entity_addr)); + addrbl.push_back(std::move(tp)); + } + if (tcp_read(addrbl.c_str(), addrbl.length()) < 0) { + ldout(msgr->cct,10) << "accept couldn't read peer_addr" << dendl; + goto fail_unlocked; + } + try { + auto ti = addrbl.cbegin(); + decode(peer_addr, ti); + } catch (const buffer::error& e) { + ldout(msgr->cct,2) << __func__ << " decode peer_addr failed: " << e.what() + << dendl; + goto fail_unlocked; + } + + ldout(msgr->cct,10) << "accept peer addr is " << peer_addr << dendl; + if (peer_addr.is_blank_ip()) { + // peer apparently doesn't know what ip they have; figure it out for them. 
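+  // Sketch with illustrative addresses: a peer bound to 0.0.0.0:6801 + // advertises a blank IP, so we splice in the source address we actually + // accepted from while keeping the advertised port and nonce: + //   peer_addr 0.0.0.0:6801/123 + socket_addr 10.1.1.5:47822 + //   -> peer_addr 10.1.1.5:6801/123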
+ int port = peer_addr.get_port(); + peer_addr.u = socket_addr.u; + peer_addr.set_port(port); + ldout(msgr->cct,0) << "accept peer addr is really " << peer_addr + << " (socket is " << socket_addr << ")" << dendl; + } + set_peer_addr(peer_addr); // so that connection_state gets set up + + while (1) { + if (tcp_read((char*)&connect, sizeof(connect)) < 0) { + ldout(msgr->cct,10) << "accept couldn't read connect" << dendl; + goto fail_unlocked; + } + + authorizer.clear(); + if (connect.authorizer_len) { + bp = buffer::create(connect.authorizer_len); + if (tcp_read(bp.c_str(), connect.authorizer_len) < 0) { + ldout(msgr->cct,10) << "accept couldn't read connect authorizer" << dendl; + goto fail_unlocked; + } + authorizer.push_back(std::move(bp)); + authorizer_reply.clear(); + } + + ldout(msgr->cct,20) << "accept got peer connect_seq " << connect.connect_seq + << " global_seq " << connect.global_seq + << dendl; + + msgr->lock.Lock(); // FIXME + pipe_lock.Lock(); + if (msgr->dispatch_queue.stop) + goto shutting_down; + if (state != STATE_ACCEPTING) { + goto shutting_down; + } + + // note peer's type, flags + set_peer_type(connect.host_type); + policy = msgr->get_policy(connect.host_type); + ldout(msgr->cct,10) << "accept of host_type " << connect.host_type + << ", policy.lossy=" << policy.lossy + << " policy.server=" << policy.server + << " policy.standby=" << policy.standby + << " policy.resetcheck=" << policy.resetcheck + << dendl; + + memset(&reply, 0, sizeof(reply)); + reply.protocol_version = msgr->get_proto_version(peer_type, false); + msgr->lock.Unlock(); + + // mismatch? + ldout(msgr->cct,10) << "accept my proto " << reply.protocol_version + << ", their proto " << connect.protocol_version << dendl; + if (connect.protocol_version != reply.protocol_version) { + reply.tag = CEPH_MSGR_TAG_BADPROTOVER; + goto reply; + } + + // require signatures for cephx? + if (connect.authorizer_protocol == CEPH_AUTH_CEPHX) { + if (peer_type == CEPH_ENTITY_TYPE_OSD || + peer_type == CEPH_ENTITY_TYPE_MDS || + peer_type == CEPH_ENTITY_TYPE_MGR) { + if (msgr->cct->_conf->cephx_require_signatures || + msgr->cct->_conf->cephx_cluster_require_signatures) { + ldout(msgr->cct,10) << "using cephx, requiring MSG_AUTH feature bit for cluster" << dendl; + policy.features_required |= CEPH_FEATURE_MSG_AUTH; + } + if (msgr->cct->_conf->cephx_require_version >= 2 || + msgr->cct->_conf->cephx_cluster_require_version >= 2) { + ldout(msgr->cct,10) << "using cephx, requiring cephx v2 feature bit for cluster" << dendl; + policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2; + } + } else { + if (msgr->cct->_conf->cephx_require_signatures || + msgr->cct->_conf->cephx_service_require_signatures) { + ldout(msgr->cct,10) << "using cephx, requiring MSG_AUTH feature bit for service" << dendl; + policy.features_required |= CEPH_FEATURE_MSG_AUTH; + } + if (msgr->cct->_conf->cephx_require_version >= 2 || + msgr->cct->_conf->cephx_service_require_version >= 2) { + ldout(msgr->cct,10) << "using cephx, requiring cephx v2 feature bit for cluster" << dendl; + policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2; + } + } + } + + feat_missing = policy.features_required & ~(uint64_t)connect.features; + if (feat_missing) { + ldout(msgr->cct,1) << "peer missing required features " << std::hex << feat_missing << std::dec << dendl; + reply.tag = CEPH_MSGR_TAG_FEATURES; + goto reply; + } + + // Check the authorizer. If not good, bail out. 
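+    // Flow sketch for cephx v2 capable peers: the first authorizer usually + // fails verification but yields authorizer_challenge; we reply with + // CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER, the peer folds the challenge into a + // fresh authorizer and re-sends its connect message through this loop, + // and the second verification is the one that counts.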
+ + pipe_lock.Unlock(); + + need_challenge = HAVE_FEATURE(connect.features, CEPHX_V2); + had_challenge = (bool)authorizer_challenge; + authorizer_reply.clear(); + if (!msgr->ms_deliver_verify_authorizer( + connection_state.get(), peer_type, connect.authorizer_protocol, + authorizer, + authorizer_reply, authorizer_valid, session_key, + nullptr /* connection_secret */, + need_challenge ? &authorizer_challenge : nullptr) || + !authorizer_valid) { + pipe_lock.Lock(); + if (state != STATE_ACCEPTING) + goto shutting_down_msgr_unlocked; + if (!had_challenge && need_challenge && authorizer_challenge) { + ldout(msgr->cct,10) << "accept: challenging authorizer " + << authorizer_reply.length() + << " bytes" << dendl; + ceph_assert(authorizer_reply.length()); + reply.tag = CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER; + } else { + ldout(msgr->cct,0) << "accept: got bad authorizer" << dendl; + reply.tag = CEPH_MSGR_TAG_BADAUTHORIZER; + } + session_security.reset(); + goto reply; + } + + // We've verified the authorizer for this pipe, so set up the session security structure. PLR + + ldout(msgr->cct,10) << "accept: setting up session_security." << dendl; + + retry_existing_lookup: + msgr->lock.Lock(); + pipe_lock.Lock(); + if (msgr->dispatch_queue.stop) + goto shutting_down; + if (state != STATE_ACCEPTING) + goto shutting_down; + + // existing? + existing = msgr->_lookup_pipe(peer_addr); + if (existing) { + existing->pipe_lock.Lock(true); // skip lockdep check (we are locking a second Pipe here) + if (existing->reader_dispatching) { + /** we need to wait, or we can deadlock if downstream + * fast_dispatchers are (naughtily!) waiting on resources + * held by somebody trying to make use of the SimpleMessenger lock. + * So drop locks, wait, and retry. It just looks like a slow network + * to everybody else. + * + * We take a ref to existing here since it might get reaped before we + * wake up (see bug #15870). We can be confident that it lived until + * locked it since we held the msgr lock from _lookup_pipe through to + * locking existing->lock and checking reader_dispatching. + */ + existing->get(); + pipe_lock.Unlock(); + msgr->lock.Unlock(); + existing->notify_on_dispatch_done = true; + while (existing->reader_dispatching) + existing->cond.Wait(existing->pipe_lock); + existing->pipe_lock.Unlock(); + existing->put(); + existing = nullptr; + goto retry_existing_lookup; + } + + if (connect.global_seq < existing->peer_global_seq) { + ldout(msgr->cct,10) << "accept existing " << existing << ".gseq " << existing->peer_global_seq + << " > " << connect.global_seq << ", RETRY_GLOBAL" << dendl; + reply.tag = CEPH_MSGR_TAG_RETRY_GLOBAL; + reply.global_seq = existing->peer_global_seq; // so we can send it below.. 
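+        // RETRY_GLOBAL tells the peer its global_seq is stale; it is + // expected to bump its global_seq past the value reported here and + // connect again.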
+ existing->pipe_lock.Unlock(); + msgr->lock.Unlock(); + goto reply; + } else { + ldout(msgr->cct,10) << "accept existing " << existing << ".gseq " << existing->peer_global_seq + << " <= " << connect.global_seq << ", looks ok" << dendl; + } + + if (existing->policy.lossy) { + ldout(msgr->cct,0) << "accept replacing existing (lossy) channel (new one lossy=" + << policy.lossy << ")" << dendl; + existing->was_session_reset(); + goto replace; + } + + ldout(msgr->cct,0) << "accept connect_seq " << connect.connect_seq + << " vs existing " << existing->connect_seq + << " state " << existing->get_state_name() << dendl; + + if (connect.connect_seq == 0 && existing->connect_seq > 0) { + ldout(msgr->cct,0) << "accept peer reset, then tried to connect to us, replacing" << dendl; + // this is a hard reset from peer + is_reset_from_peer = true; + if (policy.resetcheck) + existing->was_session_reset(); // this resets out_queue, msg_ and connect_seq #'s + goto replace; + } + + if (connect.connect_seq < existing->connect_seq) { + // old attempt, or we sent READY but they didn't get it. + ldout(msgr->cct,10) << "accept existing " << existing << ".cseq " << existing->connect_seq + << " > " << connect.connect_seq << ", RETRY_SESSION" << dendl; + goto retry_session; + } + + if (connect.connect_seq == existing->connect_seq) { + // if the existing connection successfully opened, and/or + // subsequently went to standby, then the peer should bump + // their connect_seq and retry: this is not a connection race + // we need to resolve here. + if (existing->state == STATE_OPEN || + existing->state == STATE_STANDBY) { + ldout(msgr->cct,10) << "accept connection race, existing " << existing + << ".cseq " << existing->connect_seq + << " == " << connect.connect_seq + << ", OPEN|STANDBY, RETRY_SESSION" << dendl; + goto retry_session; + } + + // connection race? 
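+      // Tie-break sketch: both sides connected simultaneously with equal + // connect_seq. The attempt initiated by the lower-addressed side (or + // aimed at the policy.server side) wins; the losing side is sent + // CEPH_MSGR_TAG_WAIT and rides the winning connection instead.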
+ if (peer_addr < msgr->my_addr || + existing->policy.server) { + // incoming wins + ldout(msgr->cct,10) << "accept connection race, existing " << existing << ".cseq " << existing->connect_seq + << " == " << connect.connect_seq << ", or we are server, replacing my attempt" << dendl; + if (!(existing->state == STATE_CONNECTING || + existing->state == STATE_WAIT)) + lderr(msgr->cct) << "accept race bad state, would replace, existing=" + << existing->get_state_name() + << " " << existing << ".cseq=" << existing->connect_seq + << " == " << connect.connect_seq + << dendl; + ceph_assert(existing->state == STATE_CONNECTING || + existing->state == STATE_WAIT); + goto replace; + } else { + // our existing outgoing wins + ldout(msgr->cct,10) << "accept connection race, existing " << existing << ".cseq " << existing->connect_seq + << " == " << connect.connect_seq << ", sending WAIT" << dendl; + ceph_assert(peer_addr > msgr->my_addr); + if (!(existing->state == STATE_CONNECTING)) + lderr(msgr->cct) << "accept race bad state, would send wait, existing=" + << existing->get_state_name() + << " " << existing << ".cseq=" << existing->connect_seq + << " == " << connect.connect_seq + << dendl; + ceph_assert(existing->state == STATE_CONNECTING); + // make sure our outgoing connection will follow through + existing->_send_keepalive(); + reply.tag = CEPH_MSGR_TAG_WAIT; + existing->pipe_lock.Unlock(); + msgr->lock.Unlock(); + goto reply; + } + } + + ceph_assert(connect.connect_seq > existing->connect_seq); + ceph_assert(connect.global_seq >= existing->peer_global_seq); + if (policy.resetcheck && // RESETSESSION only used by servers; peers do not reset each other + existing->connect_seq == 0) { + ldout(msgr->cct,0) << "accept we reset (peer sent cseq " << connect.connect_seq + << ", " << existing << ".cseq = " << existing->connect_seq + << "), sending RESETSESSION" << dendl; + reply.tag = CEPH_MSGR_TAG_RESETSESSION; + msgr->lock.Unlock(); + existing->pipe_lock.Unlock(); + goto reply; + } + + // reconnect + ldout(msgr->cct,10) << "accept peer sent cseq " << connect.connect_seq + << " > " << existing->connect_seq << dendl; + goto replace; + } // existing + else if (connect.connect_seq > 0) { + // we reset, and they are opening a new session + ldout(msgr->cct,0) << "accept we reset (peer sent cseq " << connect.connect_seq << "), sending RESETSESSION" << dendl; + msgr->lock.Unlock(); + reply.tag = CEPH_MSGR_TAG_RESETSESSION; + goto reply; + } else { + // new session + ldout(msgr->cct,10) << "accept new session" << dendl; + existing = NULL; + goto open; + } + ceph_abort(); + + retry_session: + ceph_assert(existing->pipe_lock.is_locked()); + ceph_assert(pipe_lock.is_locked()); + reply.tag = CEPH_MSGR_TAG_RETRY_SESSION; + reply.connect_seq = existing->connect_seq + 1; + existing->pipe_lock.Unlock(); + msgr->lock.Unlock(); + goto reply; + + reply: + ceph_assert(pipe_lock.is_locked()); + reply.features = ((uint64_t)connect.features & policy.features_supported) | policy.features_required; + reply.authorizer_len = authorizer_reply.length(); + pipe_lock.Unlock(); + r = tcp_write((char*)&reply, sizeof(reply)); + if (r < 0) + goto fail_unlocked; + if (reply.authorizer_len) { + r = tcp_write(authorizer_reply.c_str(), authorizer_reply.length()); + if (r < 0) + goto fail_unlocked; + } + } + + replace: + ceph_assert(existing->pipe_lock.is_locked()); + ceph_assert(pipe_lock.is_locked()); + // if it is a hard reset from peer, we don't need a round-trip to negotiate in/out sequence + if ((connect.features & 
CEPH_FEATURE_RECONNECT_SEQ) && !is_reset_from_peer) { + reply_tag = CEPH_MSGR_TAG_SEQ; + existing_seq = existing->in_seq; + } + ldout(msgr->cct,10) << "accept replacing " << existing << dendl; + existing->stop(); + existing->unregister_pipe(); + replaced = true; + + if (existing->policy.lossy) { + // disconnect from the Connection + ceph_assert(existing->connection_state); + if (existing->connection_state->clear_pipe(existing)) + msgr->dispatch_queue.queue_reset(existing->connection_state.get()); + } else { + // queue a reset on the new connection, which we're dumping for the old + msgr->dispatch_queue.queue_reset(connection_state.get()); + + // drop my Connection, and take a ref to the existing one. do not + // clear existing->connection_state, since read_message and + // write_message both dereference it without pipe_lock. + connection_state = existing->connection_state; + + // make existing Connection reference us + connection_state->reset_pipe(this); + + if (existing->delay_thread) { + existing->delay_thread->steal_for_pipe(this); + delay_thread = existing->delay_thread; + existing->delay_thread = NULL; + delay_thread->flush(); + } + + // steal incoming queue + uint64_t replaced_conn_id = conn_id; + conn_id = existing->conn_id; + existing->conn_id = replaced_conn_id; + + // reset the in_seq if this is a hard reset from peer, + // otherwise we respect our original connection's value + in_seq = is_reset_from_peer ? 0 : existing->in_seq; + in_seq_acked = in_seq; + + // steal outgoing queue and out_seq + existing->requeue_sent(); + out_seq = existing->out_seq; + ldout(msgr->cct,10) << "accept re-queuing on out_seq " << out_seq << " in_seq " << in_seq << dendl; + for (map<int, list<Message*> >::iterator p = existing->out_q.begin(); + p != existing->out_q.end(); + ++p) + out_q[p->first].splice(out_q[p->first].begin(), p->second); + } + existing->stop_and_wait(); + existing->pipe_lock.Unlock(); + + open: + // open + ceph_assert(pipe_lock.is_locked()); + connect_seq = connect.connect_seq + 1; + peer_global_seq = connect.global_seq; + ceph_assert(state == STATE_ACCEPTING); + state = STATE_OPEN; + ldout(msgr->cct,10) << "accept success, connect_seq = " << connect_seq << ", sending READY" << dendl; + + // send READY reply + reply.tag = (reply_tag ? reply_tag : CEPH_MSGR_TAG_READY); + reply.features = policy.features_supported; + reply.global_seq = msgr->get_global_seq(); + reply.connect_seq = connect_seq; + reply.flags = 0; + reply.authorizer_len = authorizer_reply.length(); + if (policy.lossy) + reply.flags = reply.flags | CEPH_MSG_CONNECT_LOSSY; + + connection_state->set_features((uint64_t)reply.features & (uint64_t)connect.features); + ldout(msgr->cct,10) << "accept features " << connection_state->get_features() << dendl; + + session_security.reset( + get_auth_session_handler(msgr->cct, + connect.authorizer_protocol, + session_key, + connection_state->get_features())); + + // notify + msgr->dispatch_queue.queue_accept(connection_state.get()); + msgr->ms_deliver_handle_fast_accept(connection_state.get()); + + // ok! 
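+  // Remaining sequence, as a sketch: register the pipe, write the READY + // (or SEQ) reply, and for CEPH_MSGR_TAG_SEQ exchange sequence numbers: + // we send existing_seq (the last in_seq we saw from this peer), the peer + // sends newly_acked_seq (the last out_seq of ours it received), and + // requeued messages up to newly_acked_seq are then discarded.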
+ if (msgr->dispatch_queue.stop) + goto shutting_down; + removed = msgr->accepting_pipes.erase(this); + ceph_assert(removed == 1); + register_pipe(); + msgr->lock.Unlock(); + pipe_lock.Unlock(); + + r = tcp_write((char*)&reply, sizeof(reply)); + if (r < 0) { + goto fail_registered; + } + + if (reply.authorizer_len) { + r = tcp_write(authorizer_reply.c_str(), authorizer_reply.length()); + if (r < 0) { + goto fail_registered; + } + } + + if (reply_tag == CEPH_MSGR_TAG_SEQ) { + if (tcp_write((char*)&existing_seq, sizeof(existing_seq)) < 0) { + ldout(msgr->cct,2) << "accept write error on in_seq" << dendl; + goto fail_registered; + } + if (tcp_read((char*)&newly_acked_seq, sizeof(newly_acked_seq)) < 0) { + ldout(msgr->cct,2) << "accept read error on newly_acked_seq" << dendl; + goto fail_registered; + } + } + + pipe_lock.Lock(); + discard_requeued_up_to(newly_acked_seq); + if (state != STATE_CLOSED) { + ldout(msgr->cct,10) << "accept starting writer, state " << get_state_name() << dendl; + start_writer(); + } + ldout(msgr->cct,20) << "accept done" << dendl; + + maybe_start_delay_thread(); + + return 0; // success. + + fail_registered: + ldout(msgr->cct, 10) << "accept fault after register" << dendl; + + if (msgr->cct->_conf->ms_inject_internal_delays) { + ldout(msgr->cct, 10) << " sleep for " << msgr->cct->_conf->ms_inject_internal_delays << dendl; + utime_t t; + t.set_from_double(msgr->cct->_conf->ms_inject_internal_delays); + t.sleep(); + } + + fail_unlocked: + pipe_lock.Lock(); + if (state != STATE_CLOSED) { + bool queued = is_queued(); + ldout(msgr->cct, 10) << " queued = " << (int)queued << dendl; + if (queued) { + state = policy.server ? STATE_STANDBY : STATE_CONNECTING; + } else if (replaced) { + state = STATE_STANDBY; + } else { + state = STATE_CLOSED; + state_closed = true; + } + fault(); + if (queued || replaced) + start_writer(); + } + return -1; + + shutting_down: + msgr->lock.Unlock(); + shutting_down_msgr_unlocked: + ceph_assert(pipe_lock.is_locked()); + + if (msgr->cct->_conf->ms_inject_internal_delays) { + ldout(msgr->cct, 10) << " sleep for " << msgr->cct->_conf->ms_inject_internal_delays << dendl; + utime_t t; + t.set_from_double(msgr->cct->_conf->ms_inject_internal_delays); + t.sleep(); + } + + state = STATE_CLOSED; + state_closed = true; + fault(); + return -1; +} + +void Pipe::set_socket_options() +{ + // disable Nagle algorithm? 
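+  // ms_tcp_nodelay trades a little bandwidth for latency: with TCP_NODELAY + // set, small writes such as acks and message headers go out immediately + // instead of waiting to be coalesced by Nagle.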
+ if (msgr->cct->_conf->ms_tcp_nodelay) { + int flag = 1; + int r = ::setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(flag)); + if (r < 0) { + r = -errno; + ldout(msgr->cct,0) << "couldn't set TCP_NODELAY: " + << cpp_strerror(r) << dendl; + } + } + if (msgr->cct->_conf->ms_tcp_rcvbuf) { + int size = msgr->cct->_conf->ms_tcp_rcvbuf; + int r = ::setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (void*)&size, sizeof(size)); + if (r < 0) { + r = -errno; + ldout(msgr->cct,0) << "couldn't set SO_RCVBUF to " << size + << ": " << cpp_strerror(r) << dendl; + } + } + + // block ESIGPIPE +#ifdef CEPH_USE_SO_NOSIGPIPE + int val = 1; + int r = ::setsockopt(sd, SOL_SOCKET, SO_NOSIGPIPE, (void*)&val, sizeof(val)); + if (r) { + r = -errno; + ldout(msgr->cct,0) << "couldn't set SO_NOSIGPIPE: " + << cpp_strerror(r) << dendl; + } +#endif + +#ifdef SO_PRIORITY + int prio = msgr->get_socket_priority(); + if (prio >= 0) { + int r = -1; +#ifdef IPTOS_CLASS_CS6 + int iptos = IPTOS_CLASS_CS6; + int addr_family = 0; + if (!peer_addr.is_blank_ip()) { + addr_family = peer_addr.get_family(); + } else { + addr_family = msgr->get_myaddr_legacy().get_family(); + } + switch (addr_family) { + case AF_INET: + r = ::setsockopt(sd, IPPROTO_IP, IP_TOS, &iptos, sizeof(iptos)); + break; + case AF_INET6: + r = ::setsockopt(sd, IPPROTO_IPV6, IPV6_TCLASS, &iptos, sizeof(iptos)); + break; + default: + lderr(msgr->cct) << "couldn't set ToS of unknown family (" + << addr_family << ")" + << " to " << iptos << dendl; + return; + } + if (r < 0) { + r = -errno; + ldout(msgr->cct,0) << "couldn't set TOS to " << iptos + << ": " << cpp_strerror(r) << dendl; + } +#endif + // setsockopt(IPTOS_CLASS_CS6) sets the priority of the socket as 0. + // See http://goo.gl/QWhvsD and http://goo.gl/laTbjT + // We need to call setsockopt(SO_PRIORITY) after it. + r = ::setsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio)); + if (r < 0) { + r = -errno; + ldout(msgr->cct,0) << "couldn't set SO_PRIORITY to " << prio + << ": " << cpp_strerror(r) << dendl; + } + } +#endif +} + +int Pipe::connect() +{ + ldout(msgr->cct,10) << "connect " << connect_seq << dendl; + ceph_assert(pipe_lock.is_locked()); + + __u32 cseq = connect_seq; + __u32 gseq = msgr->get_global_seq(); + + // stop reader thread + join_reader(); + + pipe_lock.Unlock(); + + char tag = -1; + int rc = -1; + struct msghdr msg; + struct iovec msgvec[2]; + int msglen; + char banner[strlen(CEPH_BANNER) + 1]; // extra byte makes coverity happy + entity_addr_t paddr; + entity_addr_t peer_addr_for_me, socket_addr; + AuthAuthorizer *authorizer = NULL; + bufferlist addrbl, myaddrbl; + const auto& conf = msgr->cct->_conf; + + // close old socket. this is safe because we stopped the reader thread above. + if (sd >= 0) + ::close(sd); + + // create socket? + sd = socket_cloexec(peer_addr.get_family(), SOCK_STREAM, 0); + if (sd < 0) { + int e = errno; + lderr(msgr->cct) << "connect couldn't create socket " << cpp_strerror(e) << dendl; + rc = -e; + goto fail; + } + + recv_reset(); + + set_socket_options(); + + { + entity_addr_t addr2bind = msgr->get_myaddr_legacy(); + if (msgr->cct->_conf->ms_bind_before_connect && (!addr2bind.is_blank_ip())) { + addr2bind.set_port(0); + int r = ::bind(sd , addr2bind.get_sockaddr(), addr2bind.get_sockaddr_len()); + if (r < 0) { + ldout(msgr->cct,2) << "client bind error " << ", " << cpp_strerror(errno) << dendl; + goto fail; + } + } + } + + // connect! 
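+  // Client-side handshake sketch from here: ::connect(), read the server's + // banner, send ours, read the server's address pair (its addr plus our + // addr as the server sees it), send our address, then loop exchanging + // ceph_msg_connect / ceph_msg_connect_reply until READY or an error tag.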
+ ldout(msgr->cct,10) << "connecting to " << peer_addr << dendl; + rc = ::connect(sd, peer_addr.get_sockaddr(), peer_addr.get_sockaddr_len()); + if (rc < 0) { + int stored_errno = errno; + ldout(msgr->cct,2) << "connect error " << peer_addr + << ", " << cpp_strerror(stored_errno) << dendl; + if (stored_errno == ECONNREFUSED) { + ldout(msgr->cct, 2) << "connection refused!" << dendl; + msgr->dispatch_queue.queue_refused(connection_state.get()); + } + goto fail; + } + + // verify banner + // FIXME: this should be non-blocking, or in some other way verify the banner as we get it. + rc = tcp_read((char*)&banner, strlen(CEPH_BANNER)); + if (rc < 0) { + ldout(msgr->cct,2) << "connect couldn't read banner, " << cpp_strerror(rc) << dendl; + goto fail; + } + if (memcmp(banner, CEPH_BANNER, strlen(CEPH_BANNER))) { + ldout(msgr->cct,0) << "connect protocol error (bad banner) on peer " << peer_addr << dendl; + goto fail; + } + + memset(&msg, 0, sizeof(msg)); + msgvec[0].iov_base = banner; + msgvec[0].iov_len = strlen(CEPH_BANNER); + msg.msg_iov = msgvec; + msg.msg_iovlen = 1; + msglen = msgvec[0].iov_len; + rc = do_sendmsg(&msg, msglen); + if (rc < 0) { + ldout(msgr->cct,2) << "connect couldn't write my banner, " << cpp_strerror(rc) << dendl; + goto fail; + } + + // identify peer + { +#if defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) + bufferptr p(sizeof(ceph_entity_addr) * 2); +#else + int wirelen = sizeof(__u32) * 2 + sizeof(ceph_sockaddr_storage); + bufferptr p(wirelen * 2); +#endif + addrbl.push_back(std::move(p)); + } + rc = tcp_read(addrbl.c_str(), addrbl.length()); + if (rc < 0) { + ldout(msgr->cct,2) << "connect couldn't read peer addrs, " << cpp_strerror(rc) << dendl; + goto fail; + } + try { + auto p = addrbl.cbegin(); + decode(paddr, p); + decode(peer_addr_for_me, p); + } + catch (buffer::error& e) { + ldout(msgr->cct,2) << "connect couldn't decode peer addrs: " << e.what() + << dendl; + goto fail; + } + port = peer_addr_for_me.get_port(); + + ldout(msgr->cct,20) << "connect read peer addr " << paddr << " on socket " << sd << dendl; + if (peer_addr != paddr) { + if (paddr.is_blank_ip() && + peer_addr.get_port() == paddr.get_port() && + peer_addr.get_nonce() == paddr.get_nonce()) { + ldout(msgr->cct,0) << "connect claims to be " + << paddr << " not " << peer_addr << " - presumably this is the same node!" << dendl; + } else { + ldout(msgr->cct,10) << "connect claims to be " + << paddr << " not " << peer_addr << dendl; + goto fail; + } + } + + ldout(msgr->cct,20) << "connect peer addr for me is " << peer_addr_for_me << dendl; + + msgr->learned_addr(peer_addr_for_me); + + encode(msgr->my_addr, myaddrbl, 0); // legacy + + memset(&msg, 0, sizeof(msg)); + msgvec[0].iov_base = myaddrbl.c_str(); + msgvec[0].iov_len = myaddrbl.length(); + msg.msg_iov = msgvec; + msg.msg_iovlen = 1; + msglen = msgvec[0].iov_len; + rc = do_sendmsg(&msg, msglen); + if (rc < 0) { + ldout(msgr->cct,2) << "connect couldn't write my addr, " << cpp_strerror(rc) << dendl; + goto fail; + } + ldout(msgr->cct,10) << "connect sent my addr " << msgr->my_addr << dendl; + + + while (1) { + if (!authorizer) { + authorizer = msgr->ms_deliver_get_authorizer(peer_type); + } + bufferlist authorizer_reply; + + ceph_msg_connect connect; + connect.features = policy.features_supported; + connect.host_type = msgr->get_myname().type(); + connect.global_seq = gseq; + connect.connect_seq = cseq; + connect.protocol_version = msgr->get_proto_version(peer_type, true); + connect.authorizer_protocol = authorizer ? 
authorizer->protocol : 0; + connect.authorizer_len = authorizer ? authorizer->bl.length() : 0; + if (authorizer) + ldout(msgr->cct,10) << "connect.authorizer_len=" << connect.authorizer_len + << " protocol=" << connect.authorizer_protocol << dendl; + connect.flags = 0; + if (policy.lossy) + connect.flags |= CEPH_MSG_CONNECT_LOSSY; // this is fyi, actually, server decides! + memset(&msg, 0, sizeof(msg)); + msgvec[0].iov_base = (char*)&connect; + msgvec[0].iov_len = sizeof(connect); + msg.msg_iov = msgvec; + msg.msg_iovlen = 1; + msglen = msgvec[0].iov_len; + if (authorizer) { + msgvec[1].iov_base = authorizer->bl.c_str(); + msgvec[1].iov_len = authorizer->bl.length(); + msg.msg_iovlen++; + msglen += msgvec[1].iov_len; + } + + ldout(msgr->cct,10) << "connect sending gseq=" << gseq << " cseq=" << cseq + << " proto=" << connect.protocol_version << dendl; + rc = do_sendmsg(&msg, msglen); + if (rc < 0) { + ldout(msgr->cct,2) << "connect couldn't write gseq, cseq, " << cpp_strerror(rc) << dendl; + goto fail; + } + + ldout(msgr->cct,20) << "connect wrote (self +) cseq, waiting for reply" << dendl; + ceph_msg_connect_reply reply; + rc = tcp_read((char*)&reply, sizeof(reply)); + if (rc < 0) { + ldout(msgr->cct,2) << "connect read reply " << cpp_strerror(rc) << dendl; + goto fail; + } + + ldout(msgr->cct,20) << "connect got reply tag " << (int)reply.tag + << " connect_seq " << reply.connect_seq + << " global_seq " << reply.global_seq + << " proto " << reply.protocol_version + << " flags " << (int)reply.flags + << " features " << reply.features + << dendl; + + authorizer_reply.clear(); + + if (reply.authorizer_len) { + ldout(msgr->cct,10) << "reply.authorizer_len=" << reply.authorizer_len << dendl; + bufferptr bp = buffer::create(reply.authorizer_len); + rc = tcp_read(bp.c_str(), reply.authorizer_len); + if (rc < 0) { + ldout(msgr->cct,10) << "connect couldn't read connect authorizer_reply" << cpp_strerror(rc) << dendl; + goto fail; + } + authorizer_reply.push_back(bp); + } + + if (reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { + authorizer->add_challenge(msgr->cct, authorizer_reply); + ldout(msgr->cct,10) << " got authorizer challenge, " << authorizer_reply.length() + << " bytes" << dendl; + continue; + } + + if (authorizer) { + auto iter = authorizer_reply.cbegin(); + if (!authorizer->verify_reply(iter, nullptr /* connection_secret */)) { + ldout(msgr->cct,0) << "failed verifying authorize reply" << dendl; + goto fail; + } + } + + if (conf->ms_inject_internal_delays) { + ldout(msgr->cct, 10) << " sleep for " << msgr->cct->_conf->ms_inject_internal_delays << dendl; + utime_t t; + t.set_from_double(msgr->cct->_conf->ms_inject_internal_delays); + t.sleep(); + } + + pipe_lock.Lock(); + if (state != STATE_CONNECTING) { + ldout(msgr->cct,0) << "connect got RESETSESSION but no longer connecting" << dendl; + goto stop_locked; + } + + if (reply.tag == CEPH_MSGR_TAG_FEATURES) { + ldout(msgr->cct,0) << "connect protocol feature mismatch, my " << std::hex + << connect.features << " < peer " << reply.features + << " missing " << (reply.features & ~policy.features_supported) + << std::dec << dendl; + goto fail_locked; + } + + if (reply.tag == CEPH_MSGR_TAG_BADPROTOVER) { + ldout(msgr->cct,0) << "connect protocol version mismatch, my " << connect.protocol_version + << " != " << reply.protocol_version << dendl; + goto fail_locked; + } + + if (reply.tag == CEPH_MSGR_TAG_BADAUTHORIZER) { + ldout(msgr->cct,0) << "connect got BADAUTHORIZER" << dendl; + goto fail_locked; + } + if (reply.tag == 
CEPH_MSGR_TAG_RESETSESSION) { + ldout(msgr->cct,0) << "connect got RESETSESSION" << dendl; + was_session_reset(); + cseq = 0; + pipe_lock.Unlock(); + continue; + } + if (reply.tag == CEPH_MSGR_TAG_RETRY_GLOBAL) { + gseq = msgr->get_global_seq(reply.global_seq); + ldout(msgr->cct,10) << "connect got RETRY_GLOBAL " << reply.global_seq + << " chose new " << gseq << dendl; + pipe_lock.Unlock(); + continue; + } + if (reply.tag == CEPH_MSGR_TAG_RETRY_SESSION) { + ceph_assert(reply.connect_seq > connect_seq); + ldout(msgr->cct,10) << "connect got RETRY_SESSION " << connect_seq + << " -> " << reply.connect_seq << dendl; + cseq = connect_seq = reply.connect_seq; + pipe_lock.Unlock(); + continue; + } + + if (reply.tag == CEPH_MSGR_TAG_WAIT) { + ldout(msgr->cct,3) << "connect got WAIT (connection race)" << dendl; + state = STATE_WAIT; + goto stop_locked; + } + + if (reply.tag == CEPH_MSGR_TAG_READY || + reply.tag == CEPH_MSGR_TAG_SEQ) { + uint64_t feat_missing = policy.features_required & ~(uint64_t)reply.features; + if (feat_missing) { + ldout(msgr->cct,1) << "missing required features " << std::hex << feat_missing << std::dec << dendl; + goto fail_locked; + } + + if (reply.tag == CEPH_MSGR_TAG_SEQ) { + ldout(msgr->cct,10) << "got CEPH_MSGR_TAG_SEQ, reading acked_seq and writing in_seq" << dendl; + uint64_t newly_acked_seq = 0; + rc = tcp_read((char*)&newly_acked_seq, sizeof(newly_acked_seq)); + if (rc < 0) { + ldout(msgr->cct,2) << "connect read error on newly_acked_seq" << cpp_strerror(rc) << dendl; + goto fail_locked; + } + ldout(msgr->cct,2) << " got newly_acked_seq " << newly_acked_seq + << " vs out_seq " << out_seq << dendl; + while (newly_acked_seq > out_seq) { + Message *m = _get_next_outgoing(); + ceph_assert(m); + ldout(msgr->cct,2) << " discarding previously sent " << m->get_seq() + << " " << *m << dendl; + ceph_assert(m->get_seq() <= newly_acked_seq); + m->put(); + ++out_seq; + } + if (tcp_write((char*)&in_seq, sizeof(in_seq)) < 0) { + ldout(msgr->cct,2) << "connect write error on in_seq" << dendl; + goto fail_locked; + } + } + + // hooray! + peer_global_seq = reply.global_seq; + policy.lossy = reply.flags & CEPH_MSG_CONNECT_LOSSY; + state = STATE_OPEN; + connect_seq = cseq + 1; + ceph_assert(connect_seq == reply.connect_seq); + backoff = utime_t(); + connection_state->set_features((uint64_t)reply.features & (uint64_t)connect.features); + ldout(msgr->cct,10) << "connect success " << connect_seq << ", lossy = " << policy.lossy + << ", features " << connection_state->get_features() << dendl; + + + // If we have an authorizer, get a new AuthSessionHandler to deal with ongoing security of the + // connection. PLR + + if (authorizer != NULL) { + session_security.reset( + get_auth_session_handler( + msgr->cct, + authorizer->protocol, + authorizer->session_key, + connection_state->get_features())); + } else { + // We have no authorizer, so we shouldn't be applying security to messages in this pipe. 
PLR + session_security.reset(); + } + + msgr->dispatch_queue.queue_connect(connection_state.get()); + msgr->ms_deliver_handle_fast_connect(connection_state.get()); + + if (!reader_running) { + ldout(msgr->cct,20) << "connect starting reader" << dendl; + start_reader(); + } + maybe_start_delay_thread(); + delete authorizer; + return 0; + } + + // protocol error + ldout(msgr->cct,0) << "connect got bad tag " << (int)tag << dendl; + goto fail_locked; + } + + fail: + if (conf->ms_inject_internal_delays) { + ldout(msgr->cct, 10) << " sleep for " << msgr->cct->_conf->ms_inject_internal_delays << dendl; + utime_t t; + t.set_from_double(msgr->cct->_conf->ms_inject_internal_delays); + t.sleep(); + } + + pipe_lock.Lock(); + fail_locked: + if (state == STATE_CONNECTING) + fault(); + else + ldout(msgr->cct,3) << "connect fault, but state = " << get_state_name() + << " != connecting, stopping" << dendl; + + stop_locked: + delete authorizer; + return rc; +} + +void Pipe::register_pipe() +{ + ldout(msgr->cct,10) << "register_pipe" << dendl; + ceph_assert(msgr->lock.is_locked()); + Pipe *existing = msgr->_lookup_pipe(peer_addr); + ceph_assert(existing == NULL); + msgr->rank_pipe[peer_addr] = this; +} + +void Pipe::unregister_pipe() +{ + ceph_assert(msgr->lock.is_locked()); + ceph::unordered_map<entity_addr_t,Pipe*>::iterator p = msgr->rank_pipe.find(peer_addr); + if (p != msgr->rank_pipe.end() && p->second == this) { + ldout(msgr->cct,10) << "unregister_pipe" << dendl; + msgr->rank_pipe.erase(p); + } else { + ldout(msgr->cct,10) << "unregister_pipe - not registered" << dendl; + msgr->accepting_pipes.erase(this); // somewhat overkill, but safe. + } +} + +void Pipe::join() +{ + ldout(msgr->cct, 20) << "join" << dendl; + if (writer_thread.is_started()) + writer_thread.join(); + if (reader_thread.is_started()) + reader_thread.join(); + if (delay_thread) { + ldout(msgr->cct, 20) << "joining delay_thread" << dendl; + delay_thread->stop(); + delay_thread->join(); + } +} + +void Pipe::requeue_sent() +{ + if (sent.empty()) + return; + + list<Message*>& rq = out_q[CEPH_MSG_PRIO_HIGHEST]; + while (!sent.empty()) { + Message *m = sent.back(); + sent.pop_back(); + ldout(msgr->cct,10) << "requeue_sent " << *m << " for resend seq " << out_seq + << " (" << m->get_seq() << ")" << dendl; + rq.push_front(m); + out_seq--; + } +} + +void Pipe::discard_requeued_up_to(uint64_t seq) +{ + ldout(msgr->cct, 10) << "discard_requeued_up_to " << seq << dendl; + if (out_q.count(CEPH_MSG_PRIO_HIGHEST) == 0) { + out_seq = seq; + return; + } + list<Message*>& rq = out_q[CEPH_MSG_PRIO_HIGHEST]; + while (!rq.empty()) { + Message *m = rq.front(); + if (m->get_seq() == 0 || m->get_seq() > seq) + break; + ldout(msgr->cct,10) << "discard_requeued_up_to " << *m << " for resend seq " << out_seq + << " <= " << seq << ", discarding" << dendl; + m->put(); + rq.pop_front(); + out_seq++; + } + if (rq.empty()) + out_q.erase(CEPH_MSG_PRIO_HIGHEST); +} + +/* + * Tears down the Pipe's message queues, and removes them from the DispatchQueue + * Must hold pipe_lock prior to calling. 
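+ * Each discarded Message reference is released with put().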
+ */ +void Pipe::discard_out_queue() +{ + ldout(msgr->cct,10) << "discard_queue" << dendl; + + for (list<Message*>::iterator p = sent.begin(); p != sent.end(); ++p) { + ldout(msgr->cct,20) << " discard " << *p << dendl; + (*p)->put(); + } + sent.clear(); + for (map<int,list<Message*> >::iterator p = out_q.begin(); p != out_q.end(); ++p) + for (list<Message*>::iterator r = p->second.begin(); r != p->second.end(); ++r) { + ldout(msgr->cct,20) << " discard " << *r << dendl; + (*r)->put(); + } + out_q.clear(); +} + +void Pipe::fault(bool onread) +{ + const auto& conf = msgr->cct->_conf; + ceph_assert(pipe_lock.is_locked()); + cond.Signal(); + + if (onread && state == STATE_CONNECTING) { + ldout(msgr->cct,10) << "fault already connecting, reader shutting down" << dendl; + return; + } + + ldout(msgr->cct,2) << "fault " << cpp_strerror(errno) << dendl; + + if (state == STATE_CLOSED || + state == STATE_CLOSING) { + ldout(msgr->cct,10) << "fault already closed|closing" << dendl; + if (connection_state->clear_pipe(this)) + msgr->dispatch_queue.queue_reset(connection_state.get()); + return; + } + + shutdown_socket(); + + // lossy channel? + if (policy.lossy && state != STATE_CONNECTING) { + ldout(msgr->cct,10) << "fault on lossy channel, failing" << dendl; + + // disconnect from Connection, and mark it failed. future messages + // will be dropped. + ceph_assert(connection_state); + stop(); + bool cleared = connection_state->clear_pipe(this); + + // crib locks, blech. note that Pipe is now STATE_CLOSED and the + // rank_pipe entry is ignored by others. + pipe_lock.Unlock(); + + if (conf->ms_inject_internal_delays) { + ldout(msgr->cct, 10) << " sleep for " << msgr->cct->_conf->ms_inject_internal_delays << dendl; + utime_t t; + t.set_from_double(msgr->cct->_conf->ms_inject_internal_delays); + t.sleep(); + } + + msgr->lock.Lock(); + pipe_lock.Lock(); + unregister_pipe(); + msgr->lock.Unlock(); + + if (delay_thread) + delay_thread->discard(); + in_q->discard_queue(conn_id); + discard_out_queue(); + if (cleared) + msgr->dispatch_queue.queue_reset(connection_state.get()); + return; + } + + // queue delayed items immediately + if (delay_thread) + delay_thread->flush(); + + // requeue sent items + requeue_sent(); + + if (policy.standby && !is_queued()) { + ldout(msgr->cct,0) << "fault with nothing to send, going to standby" << dendl; + state = STATE_STANDBY; + return; + } + + if (state != STATE_CONNECTING) { + if (policy.server) { + ldout(msgr->cct,0) << "fault, server, going to standby" << dendl; + state = STATE_STANDBY; + } else { + ldout(msgr->cct,0) << "fault, initiating reconnect" << dendl; + connect_seq++; + state = STATE_CONNECTING; + } + backoff = utime_t(); + } else if (backoff == utime_t()) { + ldout(msgr->cct,0) << "fault" << dendl; + backoff.set_from_double(conf->ms_initial_backoff); + } else { + ldout(msgr->cct,10) << "fault waiting " << backoff << dendl; + cond.WaitInterval(pipe_lock, backoff); + backoff += backoff; + if (backoff > conf->ms_max_backoff) + backoff.set_from_double(conf->ms_max_backoff); + ldout(msgr->cct,10) << "fault done waiting or woke up" << dendl; + } +} + +void Pipe::randomize_out_seq() +{ + if (connection_state->get_features() & CEPH_FEATURE_MSG_AUTH) { + // Set out_seq to a random value, so CRC won't be predictable. + out_seq = ceph::util::generate_random_number<uint64_t>(0, SEQ_MASK); + lsubdout(msgr->cct, ms, 10) << "randomize_out_seq " << out_seq << dendl; + } else { + // previously, seq #'s always started at 0. 
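+ // keep that legacy behaviour when the peer lacks CEPH_FEATURE_MSG_AUTH.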
+ out_seq = 0; + } +} + +void Pipe::was_session_reset() +{ + ceph_assert(pipe_lock.is_locked()); + + ldout(msgr->cct,10) << "was_session_reset" << dendl; + in_q->discard_queue(conn_id); + if (delay_thread) + delay_thread->discard(); + discard_out_queue(); + + msgr->dispatch_queue.queue_remote_reset(connection_state.get()); + + randomize_out_seq(); + + in_seq = 0; + in_seq_acked = 0; + connect_seq = 0; +} + +void Pipe::stop() +{ + ldout(msgr->cct,10) << "stop" << dendl; + ceph_assert(pipe_lock.is_locked()); + state = STATE_CLOSED; + state_closed = true; + cond.Signal(); + shutdown_socket(); +} + +void Pipe::stop_and_wait() +{ + ceph_assert(pipe_lock.is_locked_by_me()); + if (state != STATE_CLOSED) + stop(); + + if (msgr->cct->_conf->ms_inject_internal_delays) { + ldout(msgr->cct, 10) << __func__ << " sleep for " + << msgr->cct->_conf->ms_inject_internal_delays + << dendl; + utime_t t; + t.set_from_double(msgr->cct->_conf->ms_inject_internal_delays); + t.sleep(); + } + + if (delay_thread) { + pipe_lock.Unlock(); + delay_thread->stop_fast_dispatching(); + pipe_lock.Lock(); + } + while (reader_running && + reader_dispatching) + cond.Wait(pipe_lock); +} + +/* read msgs from socket. + * also, server. + */ +void Pipe::reader() +{ + pipe_lock.Lock(); + + if (state == STATE_ACCEPTING) { + accept(); + ceph_assert(pipe_lock.is_locked()); + } + + // loop. + while (state != STATE_CLOSED && + state != STATE_CONNECTING) { + ceph_assert(pipe_lock.is_locked()); + + // sleep if (re)connecting + if (state == STATE_STANDBY) { + ldout(msgr->cct,20) << "reader sleeping during reconnect|standby" << dendl; + cond.Wait(pipe_lock); + continue; + } + + // get a reference to the AuthSessionHandler while we have the pipe_lock + std::shared_ptr<AuthSessionHandler> auth_handler = session_security; + + pipe_lock.Unlock(); + + char tag = -1; + ldout(msgr->cct,20) << "reader reading tag..." << dendl; + if (tcp_read((char*)&tag, 1) < 0) { + pipe_lock.Lock(); + ldout(msgr->cct,2) << "reader couldn't read tag, " << cpp_strerror(errno) << dendl; + fault(true); + continue; + } + + if (tag == CEPH_MSGR_TAG_KEEPALIVE) { + ldout(msgr->cct,2) << "reader got KEEPALIVE" << dendl; + pipe_lock.Lock(); + connection_state->set_last_keepalive(ceph_clock_now()); + continue; + } + if (tag == CEPH_MSGR_TAG_KEEPALIVE2) { + ldout(msgr->cct,30) << "reader got KEEPALIVE2 tag ..." << dendl; + ceph_timespec t; + int rc = tcp_read((char*)&t, sizeof(t)); + pipe_lock.Lock(); + if (rc < 0) { + ldout(msgr->cct,2) << "reader couldn't read KEEPALIVE2 stamp " + << cpp_strerror(errno) << dendl; + fault(true); + } else { + send_keepalive_ack = true; + keepalive_ack_stamp = utime_t(t); + ldout(msgr->cct,2) << "reader got KEEPALIVE2 " << keepalive_ack_stamp + << dendl; + connection_state->set_last_keepalive(ceph_clock_now()); + cond.Signal(); + } + continue; + } + if (tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { + ldout(msgr->cct,2) << "reader got KEEPALIVE_ACK" << dendl; + struct ceph_timespec t; + int rc = tcp_read((char*)&t, sizeof(t)); + pipe_lock.Lock(); + if (rc < 0) { + ldout(msgr->cct,2) << "reader couldn't read KEEPALIVE2 stamp " << cpp_strerror(errno) << dendl; + fault(true); + } else { + connection_state->set_last_keepalive_ack(utime_t(t)); + } + continue; + } + + // open ... 
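+ // the tags below (ACK, MSG, CLOSE) are only expected on an established session.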
+ if (tag == CEPH_MSGR_TAG_ACK) { + ldout(msgr->cct,20) << "reader got ACK" << dendl; + ceph_le64 seq; + int rc = tcp_read((char*)&seq, sizeof(seq)); + pipe_lock.Lock(); + if (rc < 0) { + ldout(msgr->cct,2) << "reader couldn't read ack seq, " << cpp_strerror(errno) << dendl; + fault(true); + } else if (state != STATE_CLOSED) { + handle_ack(seq); + } + continue; + } + + else if (tag == CEPH_MSGR_TAG_MSG) { + ldout(msgr->cct,20) << "reader got MSG" << dendl; + Message *m = 0; + int r = read_message(&m, auth_handler.get()); + + pipe_lock.Lock(); + + if (!m) { + if (r < 0) + fault(true); + continue; + } + + m->trace.event("pipe read message"); + + if (state == STATE_CLOSED || + state == STATE_CONNECTING) { + in_q->dispatch_throttle_release(m->get_dispatch_throttle_size()); + m->put(); + continue; + } + + // check received seq#. if it is old, drop the message. + // note that incoming messages may skip ahead. this is convenient for the client + // side queueing because messages can't be renumbered, but the (kernel) client will + // occasionally pull a message out of the sent queue to send elsewhere. in that case + // it doesn't matter if we "got" it or not. + if (m->get_seq() <= in_seq) { + ldout(msgr->cct,0) << "reader got old message " + << m->get_seq() << " <= " << in_seq << " " << m << " " << *m + << ", discarding" << dendl; + in_q->dispatch_throttle_release(m->get_dispatch_throttle_size()); + m->put(); + if (connection_state->has_feature(CEPH_FEATURE_RECONNECT_SEQ) && + msgr->cct->_conf->ms_die_on_old_message) + ceph_abort_msg("old msgs despite reconnect_seq feature"); + continue; + } + if (m->get_seq() > in_seq + 1) { + ldout(msgr->cct,0) << "reader missed message? skipped from seq " + << in_seq << " to " << m->get_seq() << dendl; + if (msgr->cct->_conf->ms_die_on_skipped_message) + ceph_abort_msg("skipped incoming seq"); + } + + m->set_connection(connection_state.get()); + + // note last received message. + in_seq = m->get_seq(); + + cond.Signal(); // wake up writer, to ack this + + ldout(msgr->cct,10) << "reader got message " + << m->get_seq() << " " << m << " " << *m + << dendl; + in_q->fast_preprocess(m); + + if (delay_thread) { + utime_t release; + if (rand() % 10000 < msgr->cct->_conf->ms_inject_delay_probability * 10000.0) { + release = m->get_recv_stamp(); + release += msgr->cct->_conf->ms_inject_delay_max * (double)(rand() % 10000) / 10000.0; + lsubdout(msgr->cct, ms, 1) << "queue_received will delay until " << release << " on " << m << " " << *m << dendl; + } + delay_thread->queue(release, m); + } else { + if (in_q->can_fast_dispatch(m)) { + reader_dispatching = true; + pipe_lock.Unlock(); + in_q->fast_dispatch(m); + pipe_lock.Lock(); + reader_dispatching = false; + if (state == STATE_CLOSED || + notify_on_dispatch_done) { // there might be somebody waiting + notify_on_dispatch_done = false; + cond.Signal(); + } + } else { + in_q->enqueue(m, m->get_priority(), conn_id); + } + } + } + + else if (tag == CEPH_MSGR_TAG_CLOSE) { + ldout(msgr->cct,20) << "reader got CLOSE" << dendl; + pipe_lock.Lock(); + if (state == STATE_CLOSING) { + state = STATE_CLOSED; + state_closed = true; + } else { + state = STATE_CLOSING; + } + cond.Signal(); + break; + } + else { + ldout(msgr->cct,0) << "reader bad tag " << (int)tag << dendl; + pipe_lock.Lock(); + fault(true); + } + } + + + // reap? + reader_running = false; + reader_needs_join = true; + unlock_maybe_reap(); + ldout(msgr->cct,10) << "reader done" << dendl; +} + +/* write msgs to socket. + * also, client. 
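+ * (the writer also runs the client side of the handshake via connect(),
+ * and emits acks and keepalives on the reader's behalf)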
+ */ +void Pipe::writer() +{ + pipe_lock.Lock(); + while (state != STATE_CLOSED) {// && state != STATE_WAIT) { + ldout(msgr->cct,10) << "writer: state = " << get_state_name() + << " policy.server=" << policy.server << dendl; + + // standby? + if (is_queued() && state == STATE_STANDBY && !policy.server) + state = STATE_CONNECTING; + + // connect? + if (state == STATE_CONNECTING) { + ceph_assert(!policy.server); + connect(); + continue; + } + + if (state == STATE_CLOSING) { + // write close tag + ldout(msgr->cct,20) << "writer writing CLOSE tag" << dendl; + char tag = CEPH_MSGR_TAG_CLOSE; + state = STATE_CLOSED; + state_closed = true; + pipe_lock.Unlock(); + if (sd >= 0) { + // we can ignore return value, actually; we don't care if this succeeds. + int r = ::write(sd, &tag, 1); + (void)r; + } + pipe_lock.Lock(); + continue; + } + + if (state != STATE_CONNECTING && state != STATE_WAIT && state != STATE_STANDBY && + (is_queued() || in_seq > in_seq_acked)) { + + // keepalive? + if (send_keepalive) { + int rc; + if (connection_state->has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) { + pipe_lock.Unlock(); + rc = write_keepalive2(CEPH_MSGR_TAG_KEEPALIVE2, + ceph_clock_now()); + } else { + pipe_lock.Unlock(); + rc = write_keepalive(); + } + pipe_lock.Lock(); + if (rc < 0) { + ldout(msgr->cct,2) << "writer couldn't write keepalive[2], " + << cpp_strerror(errno) << dendl; + fault(); + continue; + } + send_keepalive = false; + } + if (send_keepalive_ack) { + utime_t t = keepalive_ack_stamp; + pipe_lock.Unlock(); + int rc = write_keepalive2(CEPH_MSGR_TAG_KEEPALIVE2_ACK, t); + pipe_lock.Lock(); + if (rc < 0) { + ldout(msgr->cct,2) << "writer couldn't write keepalive_ack, " << cpp_strerror(errno) << dendl; + fault(); + continue; + } + send_keepalive_ack = false; + } + + // send ack? + if (in_seq > in_seq_acked) { + uint64_t send_seq = in_seq; + pipe_lock.Unlock(); + int rc = write_ack(send_seq); + pipe_lock.Lock(); + if (rc < 0) { + ldout(msgr->cct,2) << "writer couldn't write ack, " << cpp_strerror(errno) << dendl; + fault(); + continue; + } + in_seq_acked = send_seq; + } + + // grab outgoing message + Message *m = _get_next_outgoing(); + if (m) { + m->set_seq(++out_seq); + if (!policy.lossy) { + // put on sent list + sent.push_back(m); + m->get(); + } + + // associate message with Connection (for benefit of encode_payload) + m->set_connection(connection_state.get()); + + uint64_t features = connection_state->get_features(); + + if (m->empty_payload()) + ldout(msgr->cct,20) << "writer encoding " << m->get_seq() << " features " << features + << " " << m << " " << *m << dendl; + else + ldout(msgr->cct,20) << "writer half-reencoding " << m->get_seq() << " features " << features + << " " << m << " " << *m << dendl; + + // encode and copy out of *m + m->encode(features, msgr->crcflags); + + // prepare everything + const ceph_msg_header& header = m->get_header(); + const ceph_msg_footer& footer = m->get_footer(); + + // Now that we have all the crcs calculated, handle the + // digital signature for the message, if the pipe has session + // security set up. Some session security options do not + // actually calculate and check the signature, but they should + // handle the calls to sign_message and check_signature. 
PLR
+ if (session_security.get() == NULL) {
+ ldout(msgr->cct, 20) << "writer no session security" << dendl;
+ } else {
+ if (session_security->sign_message(m)) {
+ ldout(msgr->cct, 20) << "writer failed to sign seq # " << header.seq
+ << ": sig = " << footer.sig << dendl;
+ } else {
+ ldout(msgr->cct, 20) << "writer signed seq # " << header.seq
+ << ": sig = " << footer.sig << dendl;
+ }
+ }
+
+ bufferlist blist = m->get_payload();
+ blist.append(m->get_middle());
+ blist.append(m->get_data());
+
+ pipe_lock.Unlock();
+
+ m->trace.event("pipe writing message");
+
+ ldout(msgr->cct,20) << "writer sending " << m->get_seq() << " " << m << dendl;
+ int rc = write_message(header, footer, blist);
+
+ pipe_lock.Lock();
+ if (rc < 0) {
+ ldout(msgr->cct,1) << "writer error sending " << m << ", "
+ << cpp_strerror(errno) << dendl;
+ fault();
+ }
+ m->put();
+ }
+ continue;
+ }
+
+ // wait
+ ldout(msgr->cct,20) << "writer sleeping" << dendl;
+ cond.Wait(pipe_lock);
+ }
+
+ ldout(msgr->cct,20) << "writer finishing" << dendl;
+
+ // reap?
+ writer_running = false;
+ unlock_maybe_reap();
+ ldout(msgr->cct,10) << "writer done" << dendl;
+}
+
+void Pipe::unlock_maybe_reap()
+{
+ if (!reader_running && !writer_running) {
+ shutdown_socket();
+ pipe_lock.Unlock();
+ if (delay_thread && delay_thread->is_flushing()) {
+ delay_thread->wait_for_flush();
+ }
+ msgr->queue_reap(this);
+ } else {
+ pipe_lock.Unlock();
+ }
+}
+
+static void alloc_aligned_buffer(bufferlist& data, unsigned len, unsigned off)
+{
+ // create a buffer to read into that matches the data alignment
+ unsigned left = len;
+ if (off & ~CEPH_PAGE_MASK) {
+ // head
+ unsigned head = 0;
+ head = std::min<uint64_t>(CEPH_PAGE_SIZE - (off & ~CEPH_PAGE_MASK), left);
+ data.push_back(buffer::create(head));
+ left -= head;
+ }
+ unsigned middle = left & CEPH_PAGE_MASK;
+ if (middle > 0) {
+ data.push_back(buffer::create_small_page_aligned(middle));
+ left -= middle;
+ }
+ if (left) {
+ data.push_back(buffer::create(left));
+ }
+}
+
+int Pipe::read_message(Message **pm, AuthSessionHandler* auth_handler)
+{
+ int ret = -1;
+ // envelope
+ //ldout(msgr->cct,10) << "receiver.read_message from sd " << sd << dendl;
+
+ ceph_msg_header header;
+ ceph_msg_footer footer;
+ __u32 header_crc = 0;
+
+ if (tcp_read((char*)&header, sizeof(header)) < 0)
+ return -1;
+ if (msgr->crcflags & MSG_CRC_HEADER) {
+ header_crc = ceph_crc32c(0, (unsigned char *)&header, sizeof(header) - sizeof(header.crc));
+ }
+
+ ldout(msgr->cct,20) << "reader got envelope type=" << header.type
+ << " src " << entity_name_t(header.src)
+ << " front=" << header.front_len
+ << " data=" << header.data_len
+ << " off " << header.data_off
+ << dendl;
+
+ // verify header crc
+ if ((msgr->crcflags & MSG_CRC_HEADER) && header_crc != header.crc) {
+ ldout(msgr->cct,0) << "reader got bad header crc " << header_crc << " != " << header.crc << dendl;
+ return -1;
+ }
+
+ bufferlist front, middle, data;
+ int front_len, middle_len;
+ unsigned data_len, data_off;
+ int aborted;
+ Message *message;
+ utime_t recv_stamp = ceph_clock_now();
+
+ if (policy.throttler_messages) {
+ ldout(msgr->cct,10) << "reader wants " << 1 << " message from policy throttler "
+ << policy.throttler_messages->get_current() << "/"
+ << policy.throttler_messages->get_max() << dendl;
+ policy.throttler_messages->get();
+ }
+
+ uint64_t message_size = header.front_len + header.middle_len + header.data_len;
+ if (message_size) {
+ if (policy.throttler_bytes) {
+ ldout(msgr->cct,10) << "reader wants " << message_size << "
bytes from policy throttler " + << policy.throttler_bytes->get_current() << "/" + << policy.throttler_bytes->get_max() << dendl; + policy.throttler_bytes->get(message_size); + } + + // throttle total bytes waiting for dispatch. do this _after_ the + // policy throttle, as this one does not deadlock (unless dispatch + // blocks indefinitely, which it shouldn't). in contrast, the + // policy throttle carries for the lifetime of the message. + ldout(msgr->cct,10) << "reader wants " << message_size << " from dispatch throttler " + << in_q->dispatch_throttler.get_current() << "/" + << in_q->dispatch_throttler.get_max() << dendl; + in_q->dispatch_throttler.get(message_size); + } + + utime_t throttle_stamp = ceph_clock_now(); + + // read front + front_len = header.front_len; + if (front_len) { + bufferptr bp = buffer::create(front_len); + if (tcp_read(bp.c_str(), front_len) < 0) + goto out_dethrottle; + front.push_back(std::move(bp)); + ldout(msgr->cct,20) << "reader got front " << front.length() << dendl; + } + + // read middle + middle_len = header.middle_len; + if (middle_len) { + bufferptr bp = buffer::create(middle_len); + if (tcp_read(bp.c_str(), middle_len) < 0) + goto out_dethrottle; + middle.push_back(std::move(bp)); + ldout(msgr->cct,20) << "reader got middle " << middle.length() << dendl; + } + + + // read data + data_len = le32_to_cpu(header.data_len); + data_off = le32_to_cpu(header.data_off); + if (data_len) { + unsigned offset = 0; + unsigned left = data_len; + + bufferlist newbuf, rxbuf; + bufferlist::iterator blp; +// int rxbuf_version = 0; + + while (left > 0) { + // wait for data + if (tcp_read_wait() < 0) + goto out_dethrottle; + + // get a buffer +#if 0 + // The rx_buffers implementation is buggy: + // - see http://tracker.ceph.com/issues/22480 + // + // - From inspection, I think that we have problems if we read *part* + // of the message into an rx_buffer, then drop the lock, someone revokes, + // and then later try to read the rest. In that case our final bufferlist + // will have part of the original static_buffer from the first chunk and + // partly a piece that we allocated. I think that to make this correct, + // we need to keep the bufferlist we are reading into in Connection under + // the lock, and on revoke, if the data is partly read, rebuild() to copy + // into fresh buffers so that all references to our static buffer are + // cleared up. + // + // - Also... what happens if we fully read into the static + // buffer, then revoke? We still have some bufferlist out there + // in the process of getting dispatched back to objecter or + // librados that references the static buffer. 
+ connection_state->lock.Lock();
+ map<ceph_tid_t,pair<bufferlist,int> >::iterator p = connection_state->rx_buffers.find(header.tid);
+ if (p != connection_state->rx_buffers.end()) {
+ if (rxbuf.length() == 0 || p->second.second != rxbuf_version) {
+ ldout(msgr->cct,10) << "reader selecting rx buffer v " << p->second.second
+ << " at offset " << offset
+ << " len " << p->second.first.length() << dendl;
+ rxbuf = p->second.first;
+ rxbuf_version = p->second.second;
+ // make sure it's big enough
+ if (rxbuf.length() < data_len)
+ rxbuf.push_back(buffer::create(data_len - rxbuf.length()));
+ blp = p->second.first.begin();
+ blp.advance(offset);
+ }
+ } else {
+ if (!newbuf.length()) {
+ ldout(msgr->cct,20) << "reader allocating new rx buffer at offset " << offset << dendl;
+ alloc_aligned_buffer(newbuf, data_len, data_off);
+ blp = newbuf.begin();
+ blp.advance(offset);
+ }
+ }
+ bufferptr bp = blp.get_current_ptr();
+ int read = std::min(bp.length(), left);
+ ldout(msgr->cct,20) << "reader reading nonblocking into " << (void*)bp.c_str() << " len " << bp.length() << dendl;
+ ssize_t got = tcp_read_nonblocking(bp.c_str(), read);
+ ldout(msgr->cct,30) << "reader read " << got << " of " << read << dendl;
+ connection_state->lock.Unlock();
+#else
+ // rx_buffer-less implementation
+ if (!newbuf.length()) {
+ ldout(msgr->cct,20) << "reader allocating new rx buffer at offset "
+ << offset << dendl;
+ alloc_aligned_buffer(newbuf, data_len, data_off);
+ blp = newbuf.begin();
+ blp.advance(offset);
+ }
+ bufferptr bp = blp.get_current_ptr();
+ int read = std::min(bp.length(), left);
+ ldout(msgr->cct,20) << "reader reading nonblocking into "
+ << (void*)bp.c_str() << " len " << bp.length()
+ << dendl;
+ ssize_t got = tcp_read_nonblocking(bp.c_str(), read);
+ ldout(msgr->cct,30) << "reader read " << got << " of " << read << dendl;
+#endif
+ if (got < 0)
+ goto out_dethrottle;
+ if (got > 0) {
+ blp.advance(static_cast<size_t>(got));
+ data.append(bp, 0, got);
+ offset += got;
+ left -= got;
+ } // else we got a signal or something; just loop.
+ }
+ }
+
+ // footer
+ if (connection_state->has_feature(CEPH_FEATURE_MSG_AUTH)) {
+ if (tcp_read((char*)&footer, sizeof(footer)) < 0)
+ goto out_dethrottle;
+ } else {
+ ceph_msg_footer_old old_footer;
+ if (tcp_read((char*)&old_footer, sizeof(old_footer)) < 0)
+ goto out_dethrottle;
+ footer.front_crc = old_footer.front_crc;
+ footer.middle_crc = old_footer.middle_crc;
+ footer.data_crc = old_footer.data_crc;
+ footer.sig = 0;
+ footer.flags = old_footer.flags;
+ }
+
+ aborted = (footer.flags & CEPH_MSG_FOOTER_COMPLETE) == 0;
+ ldout(msgr->cct,10) << "aborted = " << aborted << dendl;
+ if (aborted) {
+ ldout(msgr->cct,0) << "reader got " << front.length() << " + " << middle.length() << " + " << data.length()
+ << " byte message.. ABORTED" << dendl;
+ ret = 0;
+ goto out_dethrottle;
+ }
+
+ ldout(msgr->cct,20) << "reader got " << front.length() << " + " << middle.length() << " + " << data.length()
+ << " byte message" << dendl;
+ message = decode_message(msgr->cct, msgr->crcflags, header, footer,
+ front, middle, data, connection_state.get());
+ if (!message) {
+ ret = -EINVAL;
+ goto out_dethrottle;
+ }
+
+ //
+ // Check the signature if one should be present. A zero return indicates success.
PLR
+ //
+
+ if (auth_handler == NULL) {
+ ldout(msgr->cct, 10) << "No session security set" << dendl;
+ } else {
+ if (auth_handler->check_message_signature(message)) {
+ ldout(msgr->cct, 0) << "Signature check failed" << dendl;
+ message->put();
+ ret = -EINVAL;
+ goto out_dethrottle;
+ }
+ }
+
+ message->set_byte_throttler(policy.throttler_bytes);
+ message->set_message_throttler(policy.throttler_messages);
+
+ // store reservation size in message, so we don't get confused
+ // by messages entering the dispatch queue through other paths.
+ message->set_dispatch_throttle_size(message_size);
+
+ message->set_recv_stamp(recv_stamp);
+ message->set_throttle_stamp(throttle_stamp);
+ message->set_recv_complete_stamp(ceph_clock_now());
+
+ *pm = message;
+ return 0;
+
+ out_dethrottle:
+ // release bytes reserved from the throttlers on failure
+ if (policy.throttler_messages) {
+ ldout(msgr->cct,10) << "reader releasing " << 1 << " message to policy throttler "
+ << policy.throttler_messages->get_current() << "/"
+ << policy.throttler_messages->get_max() << dendl;
+ policy.throttler_messages->put();
+ }
+ if (message_size) {
+ if (policy.throttler_bytes) {
+ ldout(msgr->cct,10) << "reader releasing " << message_size << " bytes to policy throttler "
+ << policy.throttler_bytes->get_current() << "/"
+ << policy.throttler_bytes->get_max() << dendl;
+ policy.throttler_bytes->put(message_size);
+ }
+
+ in_q->dispatch_throttle_release(message_size);
+ }
+ return ret;
+}
+
+int Pipe::do_sendmsg(struct msghdr *msg, unsigned len, bool more)
+{
+ MSGR_SIGPIPE_STOPPER;
+ while (len > 0) {
+ int r;
+ r = ::sendmsg(sd, msg, MSG_NOSIGNAL | (more ? MSG_MORE : 0));
+ if (r == 0)
+ ldout(msgr->cct,10) << "do_sendmsg hmm, got r==0!" << dendl;
+ if (r < 0) {
+ r = -errno;
+ ldout(msgr->cct,1) << "do_sendmsg error " << cpp_strerror(r) << dendl;
+ return r;
+ }
+ if (state == STATE_CLOSED) {
+ ldout(msgr->cct,10) << "do_sendmsg oh look, state == CLOSED, giving up" << dendl;
+ return -EINTR; // close enough
+ }
+
+ len -= r;
+ if (len == 0) break;
+
+ // hrmph. trim r bytes off the front of our message.
+ ldout(msgr->cct,20) << "do_sendmsg short write did " << r << ", still have " << len << dendl;
+ while (r > 0) {
+ if (msg->msg_iov[0].iov_len <= (size_t)r) {
+ // lose this whole item
+ //ldout(msgr->cct,30) << "skipping " << msg->msg_iov[0].iov_len << ", " << (msg->msg_iovlen-1) << " v, " << r << " left" << dendl;
+ r -= msg->msg_iov[0].iov_len;
+ msg->msg_iov++;
+ msg->msg_iovlen--;
+ } else {
+ // partial!
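+ // only part of this iovec was written; advance iov_base and shrink
+ // iov_len, then retry the remainder on the next sendmsg().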
+ //ldout(msgr->cct,30) << "adjusting " << msg->msg_iov[0].iov_len << ", " << msg->msg_iovlen << " v, " << r << " left" << dendl; + msg->msg_iov[0].iov_base = (char *)msg->msg_iov[0].iov_base + r; + msg->msg_iov[0].iov_len -= r; + break; + } + } + } + return 0; +} + + +int Pipe::write_ack(uint64_t seq) +{ + ldout(msgr->cct,10) << "write_ack " << seq << dendl; + + char c = CEPH_MSGR_TAG_ACK; + ceph_le64 s; + s = seq; + + struct msghdr msg; + memset(&msg, 0, sizeof(msg)); + struct iovec msgvec[2]; + msgvec[0].iov_base = &c; + msgvec[0].iov_len = 1; + msgvec[1].iov_base = &s; + msgvec[1].iov_len = sizeof(s); + msg.msg_iov = msgvec; + msg.msg_iovlen = 2; + + if (do_sendmsg(&msg, 1 + sizeof(s), true) < 0) + return -1; + return 0; +} + +int Pipe::write_keepalive() +{ + ldout(msgr->cct,10) << "write_keepalive" << dendl; + + char c = CEPH_MSGR_TAG_KEEPALIVE; + + struct msghdr msg; + memset(&msg, 0, sizeof(msg)); + struct iovec msgvec[2]; + msgvec[0].iov_base = &c; + msgvec[0].iov_len = 1; + msg.msg_iov = msgvec; + msg.msg_iovlen = 1; + + if (do_sendmsg(&msg, 1) < 0) + return -1; + return 0; +} + +int Pipe::write_keepalive2(char tag, const utime_t& t) +{ + ldout(msgr->cct,10) << "write_keepalive2 " << (int)tag << " " << t << dendl; + struct ceph_timespec ts; + t.encode_timeval(&ts); + struct msghdr msg; + memset(&msg, 0, sizeof(msg)); + struct iovec msgvec[2]; + msgvec[0].iov_base = &tag; + msgvec[0].iov_len = 1; + msgvec[1].iov_base = &ts; + msgvec[1].iov_len = sizeof(ts); + msg.msg_iov = msgvec; + msg.msg_iovlen = 2; + + if (do_sendmsg(&msg, 1 + sizeof(ts)) < 0) + return -1; + return 0; +} + + +int Pipe::write_message(const ceph_msg_header& header, const ceph_msg_footer& footer, bufferlist& blist) +{ + int ret; + + // set up msghdr and iovecs + struct msghdr msg; + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = msgvec; + int msglen = 0; + + // send tag + char tag = CEPH_MSGR_TAG_MSG; + msgvec[msg.msg_iovlen].iov_base = &tag; + msgvec[msg.msg_iovlen].iov_len = 1; + msglen++; + msg.msg_iovlen++; + + // send envelope + msgvec[msg.msg_iovlen].iov_base = (char*)&header; + msgvec[msg.msg_iovlen].iov_len = sizeof(header); + msglen += sizeof(header); + msg.msg_iovlen++; + + // payload (front+data) + auto pb = std::cbegin(blist.buffers()); + unsigned b_off = 0; // carry-over buffer offset, if any + unsigned bl_pos = 0; // blist pos + unsigned left = blist.length(); + + while (left > 0) { + unsigned donow = std::min(left, pb->length()-b_off); + if (donow == 0) { + ldout(msgr->cct,0) << "donow = " << donow << " left " << left << " pb->length " << pb->length() + << " b_off " << b_off << dendl; + } + ceph_assert(donow > 0); + ldout(msgr->cct,30) << " bl_pos " << bl_pos << " b_off " << b_off + << " leftinchunk " << left + << " buffer len " << pb->length() + << " writing " << donow + << dendl; + + if (msg.msg_iovlen >= SM_IOV_MAX-2) { + if (do_sendmsg(&msg, msglen, true)) + goto fail; + + // and restart the iov + msg.msg_iov = msgvec; + msg.msg_iovlen = 0; + msglen = 0; + } + + msgvec[msg.msg_iovlen].iov_base = (void*)(pb->c_str()+b_off); + msgvec[msg.msg_iovlen].iov_len = donow; + msglen += donow; + msg.msg_iovlen++; + + ceph_assert(left >= donow); + left -= donow; + b_off += donow; + bl_pos += donow; + if (left == 0) + break; + while (b_off == pb->length()) { + ++pb; + b_off = 0; + } + } + ceph_assert(left == 0); + + // send footer; if receiver doesn't support signatures, use the old footer format + + ceph_msg_footer_old old_footer; + if (connection_state->has_feature(CEPH_FEATURE_MSG_AUTH)) { + 
msgvec[msg.msg_iovlen].iov_base = (void*)&footer; + msgvec[msg.msg_iovlen].iov_len = sizeof(footer); + msglen += sizeof(footer); + msg.msg_iovlen++; + } else { + if (msgr->crcflags & MSG_CRC_HEADER) { + old_footer.front_crc = footer.front_crc; + old_footer.middle_crc = footer.middle_crc; + } else { + old_footer.front_crc = old_footer.middle_crc = 0; + } + old_footer.data_crc = msgr->crcflags & MSG_CRC_DATA ? footer.data_crc : 0; + old_footer.flags = footer.flags; + msgvec[msg.msg_iovlen].iov_base = (char*)&old_footer; + msgvec[msg.msg_iovlen].iov_len = sizeof(old_footer); + msglen += sizeof(old_footer); + msg.msg_iovlen++; + } + + // send + if (do_sendmsg(&msg, msglen)) + goto fail; + + ret = 0; + + out: + return ret; + + fail: + ret = -1; + goto out; +} + + +int Pipe::tcp_read(char *buf, unsigned len) +{ + if (sd < 0) + return -EINVAL; + + while (len > 0) { + + if (msgr->cct->_conf->ms_inject_socket_failures && sd >= 0) { + if (rand() % msgr->cct->_conf->ms_inject_socket_failures == 0) { + ldout(msgr->cct, 0) << "injecting socket failure" << dendl; + ::shutdown(sd, SHUT_RDWR); + } + } + + if (tcp_read_wait() < 0) + return -1; + + ssize_t got = tcp_read_nonblocking(buf, len); + + if (got < 0) + return -1; + + len -= got; + buf += got; + //lgeneric_dout(cct, DBL) << "tcp_read got " << got << ", " << len << " left" << dendl; + } + return 0; +} + +int Pipe::tcp_read_wait() +{ + if (sd < 0) + return -EINVAL; + struct pollfd pfd; + short evmask; + pfd.fd = sd; + pfd.events = POLLIN; +#if defined(__linux__) + pfd.events |= POLLRDHUP; +#endif + + if (has_pending_data()) + return 0; + + int r = poll(&pfd, 1, msgr->timeout); + if (r < 0) + return -errno; + if (r == 0) + return -EAGAIN; + + evmask = POLLERR | POLLHUP | POLLNVAL; +#if defined(__linux__) + evmask |= POLLRDHUP; +#endif + if (pfd.revents & evmask) + return -1; + + if (!(pfd.revents & POLLIN)) + return -1; + + return 0; +} + +ssize_t Pipe::do_recv(char *buf, size_t len, int flags) +{ +again: + ssize_t got = ::recv( sd, buf, len, flags ); + if (got < 0) { + if (errno == EINTR) { + goto again; + } + ldout(msgr->cct, 10) << __func__ << " socket " << sd << " returned " + << got << " " << cpp_strerror(errno) << dendl; + return -1; + } + if (got == 0) { + return -1; + } + return got; +} + +ssize_t Pipe::buffered_recv(char *buf, size_t len, int flags) +{ + size_t left = len; + ssize_t total_recv = 0; + if (recv_len > recv_ofs) { + int to_read = std::min(recv_len - recv_ofs, left); + memcpy(buf, &recv_buf[recv_ofs], to_read); + recv_ofs += to_read; + left -= to_read; + if (left == 0) { + return to_read; + } + buf += to_read; + total_recv += to_read; + } + + /* nothing left in the prefetch buffer */ + + if (left > recv_max_prefetch) { + /* this was a large read, we don't prefetch for these */ + ssize_t ret = do_recv(buf, left, flags ); + if (ret < 0) { + if (total_recv > 0) + return total_recv; + return ret; + } + total_recv += ret; + return total_recv; + } + + + ssize_t got = do_recv(recv_buf, recv_max_prefetch, flags); + if (got < 0) { + if (total_recv > 0) + return total_recv; + + return got; + } + + recv_len = (size_t)got; + got = std::min(left, (size_t)got); + memcpy(buf, recv_buf, got); + recv_ofs = got; + total_recv += got; + return total_recv; +} + +ssize_t Pipe::tcp_read_nonblocking(char *buf, unsigned len) +{ + ssize_t got = buffered_recv(buf, len, MSG_DONTWAIT ); + if (got < 0) { + ldout(msgr->cct, 10) << __func__ << " socket " << sd << " returned " + << got << " " << cpp_strerror(errno) << dendl; + return -1; + } + if (got == 0) { + 
/* poll() said there was data, but we didn't read any - peer + * sent a FIN. Maybe POLLRDHUP signals this, but this is + * standard socket behavior as documented by Stevens. + */ + return -1; + } + return got; +} + +int Pipe::tcp_write(const char *buf, unsigned len) +{ + if (sd < 0) + return -1; + struct pollfd pfd; + pfd.fd = sd; + pfd.events = POLLOUT | POLLHUP | POLLNVAL | POLLERR; +#if defined(__linux__) + pfd.events |= POLLRDHUP; +#endif + + if (msgr->cct->_conf->ms_inject_socket_failures && sd >= 0) { + if (rand() % msgr->cct->_conf->ms_inject_socket_failures == 0) { + ldout(msgr->cct, 0) << "injecting socket failure" << dendl; + ::shutdown(sd, SHUT_RDWR); + } + } + + if (poll(&pfd, 1, -1) < 0) + return -1; + + if (!(pfd.revents & POLLOUT)) + return -1; + + //lgeneric_dout(cct, DBL) << "tcp_write writing " << len << dendl; + ceph_assert(len > 0); + while (len > 0) { + MSGR_SIGPIPE_STOPPER; + int did = ::send( sd, buf, len, MSG_NOSIGNAL ); + if (did < 0) { + //lgeneric_dout(cct, 1) << "tcp_write error did = " << did << " " << cpp_strerror(errno) << dendl; + //lgeneric_derr(cct, 1) << "tcp_write error did = " << did << " " << cpp_strerror(errno) << dendl; + return did; + } + len -= did; + buf += did; + //lgeneric_dout(cct, DBL) << "tcp_write did " << did << ", " << len << " left" << dendl; + } + return 0; +} diff --git a/src/msg/simple/Pipe.h b/src/msg/simple/Pipe.h new file mode 100644 index 00000000..81245198 --- /dev/null +++ b/src/msg/simple/Pipe.h @@ -0,0 +1,315 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MSGR_PIPE_H +#define CEPH_MSGR_PIPE_H + +#include "auth/AuthSessionHandler.h" + +#include "msg/msg_types.h" +#include "msg/Messenger.h" +#include "PipeConnection.h" + + +class SimpleMessenger; +class DispatchQueue; + +static const int SM_IOV_MAX = (IOV_MAX >= 1024 ? IOV_MAX / 4 : IOV_MAX); + + /** + * The Pipe is the most complex SimpleMessenger component. It gets + * two threads, one each for reading and writing on a socket it's handed + * at creation time, and is responsible for everything that happens on + * that socket. Besides message transmission, it's responsible for + * propagating socket errors to the SimpleMessenger and then sticking + * around in a state where it can provide enough data for the SimpleMessenger + * to provide reliable Message delivery when it manages to reconnect. + */ + class Pipe : public RefCountedObject { + /** + * The Reader thread handles all reads off the socket -- not just + * Messages, but also acks and other protocol bits (excepting startup, + * when the Writer does a couple of reads). + * All the work is implemented in Pipe itself, of course. + */ + class Reader : public Thread { + Pipe *pipe; + public: + explicit Reader(Pipe *p) : pipe(p) {} + void *entry() override { pipe->reader(); return 0; } + } reader_thread; + + /** + * The Writer thread handles all writes to the socket (after startup). + * All the work is implemented in Pipe itself, of course. 
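+ * It sleeps on cond and is signalled whenever a message, ack, or
+ * keepalive is queued for sending.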
+ */ + class Writer : public Thread { + Pipe *pipe; + public: + explicit Writer(Pipe *p) : pipe(p) {} + void *entry() override { pipe->writer(); return 0; } + } writer_thread; + + class DelayedDelivery; + DelayedDelivery *delay_thread; + public: + Pipe(SimpleMessenger *r, int st, PipeConnection *con); + ~Pipe() override; + + SimpleMessenger *msgr; + uint64_t conn_id; + ostream& _pipe_prefix(std::ostream &out) const; + + Pipe* get() { + return static_cast<Pipe*>(RefCountedObject::get()); + } + + bool is_connected() { + Mutex::Locker l(pipe_lock); + return state == STATE_OPEN; + } + + char *recv_buf; + size_t recv_max_prefetch; + size_t recv_ofs; + size_t recv_len; + + enum { + STATE_ACCEPTING, + STATE_CONNECTING, + STATE_OPEN, + STATE_STANDBY, + STATE_CLOSED, + STATE_CLOSING, + STATE_WAIT // just wait for racing connection + }; + + static const char *get_state_name(int s) { + switch (s) { + case STATE_ACCEPTING: return "accepting"; + case STATE_CONNECTING: return "connecting"; + case STATE_OPEN: return "open"; + case STATE_STANDBY: return "standby"; + case STATE_CLOSED: return "closed"; + case STATE_CLOSING: return "closing"; + case STATE_WAIT: return "wait"; + default: return "UNKNOWN"; + } + } + const char *get_state_name() { + return get_state_name(state); + } + + private: + int sd; + struct iovec msgvec[SM_IOV_MAX]; + + public: + int port; + int peer_type; + entity_addr_t peer_addr; + Messenger::Policy policy; + + Mutex pipe_lock; + int state; + std::atomic<bool> state_closed = { false }; // true iff state = STATE_CLOSED + + // session_security handles any signatures or encryptions required for this pipe's msgs. PLR + + std::shared_ptr<AuthSessionHandler> session_security; + + protected: + friend class SimpleMessenger; + PipeConnectionRef connection_state; + + utime_t backoff; // backoff time + + bool reader_running, reader_needs_join; + bool reader_dispatching; /// reader thread is dispatching without pipe_lock + bool notify_on_dispatch_done; /// something wants a signal when dispatch done + bool writer_running; + + map<int, list<Message*> > out_q; // priority queue for outbound msgs + DispatchQueue *in_q; + list<Message*> sent; + Cond cond; + bool send_keepalive; + bool send_keepalive_ack; + utime_t keepalive_ack_stamp; + bool halt_delivery; //if a pipe's queue is destroyed, stop adding to it + + __u32 connect_seq, peer_global_seq; + uint64_t out_seq; + uint64_t in_seq, in_seq_acked; + + void set_socket_options(); + + int accept(); // server handshake + int connect(); // client handshake + void reader(); + void writer(); + void unlock_maybe_reap(); + + void randomize_out_seq(); + + int read_message(Message **pm, + AuthSessionHandler *session_security_copy); + int write_message(const ceph_msg_header& h, const ceph_msg_footer& f, bufferlist& body); + /** + * Write the given data (of length len) to the Pipe's socket. This function + * will loop until all passed data has been written out. + * If more is set, the function will optimize socket writes + * for additional data (by passing the MSG_MORE flag, aka TCP_CORK). + * + * @param msg The msghdr to write out + * @param len The length of the data in msg + * @param more Should be set true if this is one part of a larger message + * @return 0, or -1 on failure (unrecoverable -- close the socket). 
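+ * On a short write the iovecs in msg are adjusted in place before retrying.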
+ */
+ int do_sendmsg(struct msghdr *msg, unsigned len, bool more=false);
+ int write_ack(uint64_t s);
+ int write_keepalive();
+ int write_keepalive2(char tag, const utime_t &t);
+
+ void fault(bool reader=false);
+
+ void was_session_reset();
+
+ /* Clean up sent list */
+ void handle_ack(uint64_t seq);
+
+ public:
+ Pipe(const Pipe& other);
+ const Pipe& operator=(const Pipe& other);
+
+ void start_reader();
+ void start_writer();
+ void maybe_start_delay_thread();
+ void join_reader();
+
+ // public constructors
+ static const Pipe& Server(int s);
+ static const Pipe& Client(const entity_addr_t& pi);
+
+ uint64_t get_out_seq() { return out_seq; }
+
+ bool is_queued() { return !out_q.empty() || send_keepalive || send_keepalive_ack; }
+
+ entity_addr_t& get_peer_addr() { return peer_addr; }
+
+ void set_peer_addr(const entity_addr_t& a) {
+ if (&peer_addr != &a) // shut up valgrind
+ peer_addr = a;
+ connection_state->set_peer_addr(a);
+ }
+ void set_peer_type(int t) {
+ peer_type = t;
+ connection_state->set_peer_type(t);
+ }
+
+ void register_pipe();
+ void unregister_pipe();
+ void join();
+ /// stop a Pipe by closing its socket and setting it to STATE_CLOSED
+ void stop();
+ /// stop() a Pipe if not already done, and wait for it to finish any
+ /// fast_dispatch in progress.
+ void stop_and_wait();
+
+ void _send(Message *m) {
+ ceph_assert(pipe_lock.is_locked());
+ out_q[m->get_priority()].push_back(m);
+ cond.Signal();
+ }
+ void _send_keepalive() {
+ ceph_assert(pipe_lock.is_locked());
+ send_keepalive = true;
+ cond.Signal();
+ }
+ Message *_get_next_outgoing() {
+ ceph_assert(pipe_lock.is_locked());
+ Message *m = 0;
+ while (!m && !out_q.empty()) {
+ map<int, list<Message*> >::reverse_iterator p = out_q.rbegin();
+ if (!p->second.empty()) {
+ m = p->second.front();
+ p->second.pop_front();
+ }
+ if (p->second.empty())
+ out_q.erase(p->first);
+ }
+ return m;
+ }
+
+ /// move all messages in the sent list back into the queue at the highest priority.
+ void requeue_sent();
+ /// discard messages requeued by requeue_sent() up to a given seq
+ void discard_requeued_up_to(uint64_t seq);
+ void discard_out_queue();
+
+ void shutdown_socket() {
+ recv_reset();
+ if (sd >= 0)
+ ::shutdown(sd, SHUT_RDWR);
+ }
+
+ void recv_reset() {
+ recv_len = 0;
+ recv_ofs = 0;
+ }
+ ssize_t do_recv(char *buf, size_t len, int flags);
+ ssize_t buffered_recv(char *buf, size_t len, int flags);
+ bool has_pending_data() { return recv_len > recv_ofs; }
+
+ /**
+ * do a blocking read of len bytes from socket
+ *
+ * @param buf buffer to read into
+ * @param len exact number of bytes to read
+ * @return 0 for success, or -1 on error
+ */
+ int tcp_read(char *buf, unsigned len);
+
+ /**
+ * wait for bytes to become available on the socket
+ *
+ * @return 0 for success, or -1 on error
+ */
+ int tcp_read_wait();
+
+ /**
+ * non-blocking read of available bytes on socket
+ *
+ * This is expected to be used after tcp_read_wait(), and will return
+ * an error if there is no data on the socket to consume.
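+ * A zero-byte read (the peer sent a FIN) is also reported as -1.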
+ * + * @param buf buffer to read into + * @param len maximum number of bytes to read + * @return bytes read, or -1 on error or when there is no data + */ + ssize_t tcp_read_nonblocking(char *buf, unsigned len); + + /** + * blocking write of bytes to socket + * + * @param buf buffer + * @param len number of bytes to write + * @return 0 for success, or -1 on error + */ + int tcp_write(const char *buf, unsigned len); + + }; + + +#endif diff --git a/src/msg/simple/PipeConnection.cc b/src/msg/simple/PipeConnection.cc new file mode 100644 index 00000000..faa1ea9e --- /dev/null +++ b/src/msg/simple/PipeConnection.cc @@ -0,0 +1,96 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "msg/Message.h" +#include "Pipe.h" +#include "SimpleMessenger.h" +#include "PipeConnection.h" + +PipeConnection::~PipeConnection() +{ + if (pipe) { + pipe->put(); + pipe = NULL; + } +} + +Pipe* PipeConnection::get_pipe() +{ + Mutex::Locker l(lock); + if (pipe) + return pipe->get(); + return NULL; +} + +bool PipeConnection::try_get_pipe(Pipe **p) +{ + Mutex::Locker l(lock); + if (failed) { + *p = NULL; + } else { + if (pipe) + *p = pipe->get(); + else + *p = NULL; + } + return !failed; +} + +bool PipeConnection::clear_pipe(Pipe *old_p) +{ + Mutex::Locker l(lock); + if (old_p == pipe) { + pipe->put(); + pipe = NULL; + failed = true; + return true; + } + return false; +} + +void PipeConnection::reset_pipe(Pipe *p) +{ + Mutex::Locker l(lock); + if (pipe) + pipe->put(); + pipe = p->get(); +} + +bool PipeConnection::is_connected() +{ + return static_cast<SimpleMessenger*>(msgr)->is_connected(this); +} + +int PipeConnection::send_message(Message *m) +{ + ceph_assert(msgr); + return static_cast<SimpleMessenger*>(msgr)->send_message(m, this); +} + +void PipeConnection::send_keepalive() +{ + static_cast<SimpleMessenger*>(msgr)->send_keepalive(this); +} + +void PipeConnection::mark_down() +{ + if (msgr) + static_cast<SimpleMessenger*>(msgr)->mark_down(this); +} + +void PipeConnection::mark_disposable() +{ + if (msgr) + static_cast<SimpleMessenger*>(msgr)->mark_disposable(this); +} diff --git a/src/msg/simple/PipeConnection.h b/src/msg/simple/PipeConnection.h new file mode 100644 index 00000000..e5460440 --- /dev/null +++ b/src/msg/simple/PipeConnection.h @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef CEPH_MSG_PIPECONNECTION_H +#define CEPH_MSG_PIPECONNECTION_H + +#include "msg/Connection.h" + +class Pipe; + +class PipeConnection : public Connection { + Pipe* pipe; + + friend class boost::intrusive_ptr<PipeConnection>; + friend class Pipe; + +public: + + PipeConnection(CephContext *cct, Messenger *m) + : Connection(cct, m), + pipe(NULL) { } + + ~PipeConnection() override; + + Pipe* get_pipe(); + + bool try_get_pipe(Pipe** p); + + bool clear_pipe(Pipe* old_p); + + void reset_pipe(Pipe* p); + + bool is_connected() override; + + int send_message(Message *m) override; + void send_keepalive() override; + void mark_down() override; + void mark_disposable() override; + + entity_addr_t get_peer_socket_addr() const override { + return peer_addrs->front(); + } + +}; /* PipeConnection */ + +typedef boost::intrusive_ptr<PipeConnection> PipeConnectionRef; + +#endif diff --git a/src/msg/simple/SimpleMessenger.cc b/src/msg/simple/SimpleMessenger.cc new file mode 100644 index 00000000..09d1ab7b --- /dev/null +++ b/src/msg/simple/SimpleMessenger.cc @@ -0,0 +1,769 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <errno.h> +#include <iostream> +#include <fstream> + + +#include "SimpleMessenger.h" + +#include "common/config.h" +#include "common/Timer.h" +#include "common/errno.h" +#include "common/valgrind.h" +#include "auth/Crypto.h" +#include "include/spinlock.h" + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) +static ostream& _prefix(std::ostream *_dout, SimpleMessenger *msgr) { + return *_dout << "-- " << msgr->get_myaddr_legacy() << " "; +} + + +/******************* + * SimpleMessenger + */ + +SimpleMessenger::SimpleMessenger(CephContext *cct, entity_name_t name, + string mname, uint64_t _nonce) + : SimplePolicyMessenger(cct, name,mname, _nonce), + accepter(this, _nonce), + dispatch_queue(cct, this, mname), + reaper_thread(this), + nonce(_nonce), + lock("SimpleMessenger::lock"), need_addr(true), did_bind(false), + global_seq(0), + cluster_protocol(0), + reaper_started(false), reaper_stop(false), + timeout(0), + local_connection(new PipeConnection(cct, this)) +{ + ANNOTATE_BENIGN_RACE_SIZED(&timeout, sizeof(timeout), + "SimpleMessenger read timeout"); + init_local_connection(); +} + +/** + * Destroy the SimpleMessenger. Pretty simple since all the work is done + * elsewhere. + */ +SimpleMessenger::~SimpleMessenger() +{ + ceph_assert(!did_bind); // either we didn't bind or we shut down the Accepter + ceph_assert(rank_pipe.empty()); // we don't have any running Pipes. 
+ ceph_assert(!reaper_started); // the reaper thread is stopped +} + +void SimpleMessenger::ready() +{ + ldout(cct,10) << "ready " << get_myaddr_legacy() << dendl; + dispatch_queue.start(); + + lock.Lock(); + if (did_bind) + accepter.start(); + lock.Unlock(); +} + + +int SimpleMessenger::shutdown() +{ + ldout(cct,10) << "shutdown " << get_myaddr_legacy() << dendl; + mark_down_all(); + + // break ref cycles on the loopback connection + local_connection->set_priv(NULL); + + lock.Lock(); + stop_cond.Signal(); + stopped = true; + lock.Unlock(); + + return 0; +} + +int SimpleMessenger::_send_message(Message *m, const entity_inst_t& dest) +{ + // set envelope + m->get_header().src = get_myname(); + m->set_cct(cct); + + if (!m->get_priority()) m->set_priority(get_default_send_priority()); + + ldout(cct,1) <<"--> " << dest.name << " " + << dest.addr << " -- " << *m + << " -- ?+" << m->get_data().length() + << " " << m + << dendl; + + if (dest.addr == entity_addr_t()) { + ldout(cct,0) << "send_message message " << *m + << " with empty dest " << dest.addr << dendl; + m->put(); + return -EINVAL; + } + + lock.Lock(); + Pipe *pipe = _lookup_pipe(dest.addr); + submit_message(m, (pipe ? pipe->connection_state.get() : NULL), + dest.addr, dest.name.type(), true); + lock.Unlock(); + return 0; +} + +int SimpleMessenger::_send_message(Message *m, Connection *con) +{ + //set envelope + m->get_header().src = get_myname(); + + if (!m->get_priority()) m->set_priority(get_default_send_priority()); + + ldout(cct,1) << "--> " << con->get_peer_addr() + << " -- " << *m + << " -- ?+" << m->get_data().length() + << " " << m << " con " << con + << dendl; + + submit_message(m, static_cast<PipeConnection*>(con), + con->get_peer_addr(), con->get_peer_type(), false); + return 0; +} + +/** + * If my_inst.addr doesn't have an IP set, this function + * will fill it in from the passed addr. Otherwise it does nothing and returns. 
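+ *
+ * Minimal usage sketch (hypothetical; 'addr_seen_by_peer' stands in for
+ * whatever address a peer reported observing for us):
+ *
+ *   entity_addrvec_t av;
+ *   av.v.push_back(addr_seen_by_peer);
+ *   msgr->set_addr_unknowns(av);  // no-op unless our IP is still blank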
+ */ +bool SimpleMessenger::set_addr_unknowns(const entity_addrvec_t &addrs) +{ + bool ret = false; + auto addr = addrs.front(); + ceph_assert(my_addr == my_addrs->front()); + if (my_addr.is_blank_ip()) { + ldout(cct,1) << __func__ << " " << addr << dendl; + entity_addr_t t = my_addr; + int port = t.get_port(); + t.u = addr.u; + t.set_port(port); + set_addrs(entity_addrvec_t(t)); + init_local_connection(); + ret = true; + } else { + ldout(cct,1) << __func__ << " " << addr << " no-op" << dendl; + } + ceph_assert(my_addr == my_addrs->front()); + return ret; +} + +void SimpleMessenger::set_myaddrs(const entity_addrvec_t &av) +{ + my_addr = av.front(); + Messenger::set_myaddrs(av); +} + +void SimpleMessenger::set_addrs(const entity_addrvec_t &av) +{ + auto t = av; + for (auto& a : t.v) { + a.set_nonce(nonce); + } + set_myaddrs(t); + init_local_connection(); +} + +int SimpleMessenger::get_proto_version(int peer_type, bool connect) +{ + int my_type = my_name.type(); + + // set reply protocol version + if (peer_type == my_type) { + // internal + return cluster_protocol; + } else { + // public + if (connect) { + switch (peer_type) { + case CEPH_ENTITY_TYPE_OSD: return CEPH_OSDC_PROTOCOL; + case CEPH_ENTITY_TYPE_MDS: return CEPH_MDSC_PROTOCOL; + case CEPH_ENTITY_TYPE_MON: return CEPH_MONC_PROTOCOL; + } + } else { + switch (my_type) { + case CEPH_ENTITY_TYPE_OSD: return CEPH_OSDC_PROTOCOL; + case CEPH_ENTITY_TYPE_MDS: return CEPH_MDSC_PROTOCOL; + case CEPH_ENTITY_TYPE_MON: return CEPH_MONC_PROTOCOL; + } + } + } + return 0; +} + + + + + + + +/******************************************** + * SimpleMessenger + */ +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) + +void SimpleMessenger::reaper_entry() +{ + ldout(cct,10) << "reaper_entry start" << dendl; + lock.Lock(); + while (!reaper_stop) { + reaper(); // may drop and retake the lock + if (reaper_stop) + break; + reaper_cond.Wait(lock); + } + lock.Unlock(); + ldout(cct,10) << "reaper_entry done" << dendl; +} + +/* + * note: assumes lock is held + */ +void SimpleMessenger::reaper() +{ + ldout(cct,10) << "reaper" << dendl; + ceph_assert(lock.is_locked()); + + while (!pipe_reap_queue.empty()) { + Pipe *p = pipe_reap_queue.front(); + pipe_reap_queue.pop_front(); + ldout(cct,10) << "reaper reaping pipe " << p << " " << + p->get_peer_addr() << dendl; + p->pipe_lock.Lock(); + p->discard_out_queue(); + if (p->connection_state) { + // mark_down, mark_down_all, or fault() should have done this, + // or accept() may have switch the Connection to a different + // Pipe... but make sure! + bool cleared = p->connection_state->clear_pipe(p); + ceph_assert(!cleared); + } + p->pipe_lock.Unlock(); + p->unregister_pipe(); + ceph_assert(pipes.count(p)); + pipes.erase(p); + + // drop msgr lock while joining thread; the delay through could be + // trying to fast dispatch, preventing it from joining without + // blocking and deadlocking. 
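+    //
+    // i.e. the pattern is unlock -> join -> relock, so the Pipe's
+    // threads can finish any fast dispatch in flight without ever
+    // needing to wait on SimpleMessenger::lock: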
+ lock.Unlock(); + p->join(); + lock.Lock(); + + if (p->sd >= 0) + ::close(p->sd); + ldout(cct,10) << "reaper reaped pipe " << p << " " << p->get_peer_addr() << dendl; + p->put(); + ldout(cct,10) << "reaper deleted pipe " << p << dendl; + } + ldout(cct,10) << "reaper done" << dendl; +} + +void SimpleMessenger::queue_reap(Pipe *pipe) +{ + ldout(cct,10) << "queue_reap " << pipe << dendl; + lock.Lock(); + pipe_reap_queue.push_back(pipe); + reaper_cond.Signal(); + lock.Unlock(); +} + +bool SimpleMessenger::is_connected(Connection *con) +{ + bool r = false; + if (con) { + Pipe *p = static_cast<Pipe *>(static_cast<PipeConnection*>(con)->get_pipe()); + if (p) { + ceph_assert(p->msgr == this); + r = p->is_connected(); + p->put(); + } + } + return r; +} + +int SimpleMessenger::bind(const entity_addr_t &bind_addr) +{ + lock.Lock(); + if (started) { + ldout(cct,10) << "rank.bind already started" << dendl; + lock.Unlock(); + return -1; + } + ldout(cct,10) << "rank.bind " << bind_addr << dendl; + lock.Unlock(); + + // bind to a socket + set<int> avoid_ports; + int r = accepter.bind(bind_addr, avoid_ports); + if (r >= 0) + did_bind = true; + return r; +} + +int SimpleMessenger::rebind(const set<int>& avoid_ports) +{ + ldout(cct,1) << "rebind avoid " << avoid_ports << dendl; + ceph_assert(did_bind); + accepter.stop(); + mark_down_all(); + return accepter.rebind(avoid_ports); +} + + +int SimpleMessenger::client_bind(const entity_addr_t &bind_addr) +{ + if (!cct->_conf->ms_bind_before_connect) + return 0; + Mutex::Locker l(lock); + if (did_bind) { + ceph_assert(*my_addrs == entity_addrvec_t(bind_addr)); + return 0; + } + if (started) { + ldout(cct,10) << "rank.bind already started" << dendl; + return -1; + } + ldout(cct,10) << "rank.bind " << bind_addr << dendl; + + set_myaddrs(entity_addrvec_t(bind_addr)); + return 0; +} + + +int SimpleMessenger::start() +{ + lock.Lock(); + ldout(cct,1) << "messenger.start" << dendl; + + // register at least one entity, first! + ceph_assert(my_name.type() >= 0); + + ceph_assert(!started); + started = true; + stopped = false; + + if (!did_bind) { + my_addr.nonce = nonce; + init_local_connection(); + } + + lock.Unlock(); + + reaper_started = true; + reaper_thread.create("ms_reaper"); + return 0; +} + +Pipe *SimpleMessenger::add_accept_pipe(int sd) +{ + lock.Lock(); + Pipe *p = new Pipe(this, Pipe::STATE_ACCEPTING, NULL); + p->sd = sd; + p->pipe_lock.Lock(); + p->start_reader(); + p->pipe_lock.Unlock(); + pipes.insert(p); + accepting_pipes.insert(p); + lock.Unlock(); + return p; +} + +/* connect_rank + * NOTE: assumes messenger.lock held. 
+ */
+Pipe *SimpleMessenger::connect_rank(const entity_addr_t& addr,
+                                    int type,
+                                    PipeConnection *con,
+                                    Message *first)
+{
+  ceph_assert(lock.is_locked());
+  ceph_assert(addr != my_addr);
+
+  ldout(cct,10) << "connect_rank to " << addr << ", creating pipe and registering" << dendl;
+
+  // create pipe
+  Pipe *pipe = new Pipe(this, Pipe::STATE_CONNECTING,
+                        static_cast<PipeConnection*>(con));
+  pipe->pipe_lock.Lock();
+  pipe->set_peer_type(type);
+  pipe->set_peer_addr(addr);
+  pipe->policy = get_policy(type);
+  pipe->start_writer();
+  if (first)
+    pipe->_send(first);
+  pipe->pipe_lock.Unlock();
+  pipe->register_pipe();
+  pipes.insert(pipe);
+
+  return pipe;
+}
+
+
+
+
+
+
+ConnectionRef SimpleMessenger::connect_to(int type,
+                                          const entity_addrvec_t& addrs)
+{
+  Mutex::Locker l(lock);
+  if (my_addr == addrs.front()) {
+    // local
+    return local_connection;
+  }
+
+  // remote
+  while (true) {
+    Pipe *pipe = _lookup_pipe(addrs.legacy_addr());
+    if (pipe) {
+      ldout(cct, 10) << "get_connection " << addrs << " existing " << pipe << dendl;
+    } else {
+      pipe = connect_rank(addrs.legacy_addr(), type, NULL, NULL);
+      ldout(cct, 10) << "get_connection " << addrs << " new " << pipe << dendl;
+    }
+    Mutex::Locker l(pipe->pipe_lock);
+    if (pipe->connection_state)
+      return pipe->connection_state;
+    // we failed too quickly! retry. FIXME.
+  }
+}
+
+ConnectionRef SimpleMessenger::get_loopback_connection()
+{
+  return local_connection;
+}
+
+void SimpleMessenger::submit_message(Message *m, PipeConnection *con,
+                                     const entity_addr_t& dest_addr, int dest_type,
+                                     bool already_locked)
+{
+  m->trace.event("simple submitting message");
+  if (cct->_conf->ms_dump_on_send) {
+    m->encode(-1, true);
+    ldout(cct, 0) << "submit_message " << *m << "\n";
+    m->get_payload().hexdump(*_dout);
+    if (m->get_data().length() > 0) {
+      *_dout << " data:\n";
+      m->get_data().hexdump(*_dout);
+    }
+    *_dout << dendl;
+    m->clear_payload();
+  }
+
+  // existing connection?
+  if (con) {
+    Pipe *pipe = NULL;
+    bool ok = static_cast<PipeConnection*>(con)->try_get_pipe(&pipe);
+    if (!ok) {
+      ldout(cct,0) << "submit_message " << *m << " remote, " << dest_addr
+                   << ", failed lossy con, dropping message " << m << dendl;
+      m->put();
+      return;
+    }
+    while (pipe && ok) {
+      // we loop in case of a racing reconnect, either from us or them
+      pipe->pipe_lock.Lock(); // can't use a Locker because of the Pipe ref
+      if (pipe->state != Pipe::STATE_CLOSED) {
+        ldout(cct,20) << "submit_message " << *m << " remote, " << dest_addr << ", have pipe." << dendl;
+        pipe->_send(m);
+        pipe->pipe_lock.Unlock();
+        pipe->put();
+        return;
+      }
+      Pipe *current_pipe;
+      ok = con->try_get_pipe(&current_pipe);
+      pipe->pipe_lock.Unlock();
+      if (current_pipe == pipe) {
+        ldout(cct,20) << "submit_message " << *m << " remote, " << dest_addr
+                      << ", had pipe " << pipe << ", but it closed." << dendl;
+        pipe->put();
+        current_pipe->put();
+        m->put();
+        return;
+      } else {
+        pipe->put();
+        pipe = current_pipe;
+      }
+    }
+  }
+
+  // local?
+  if (my_addr == dest_addr) {
+    // local
+    ldout(cct,20) << "submit_message " << *m << " local" << dendl;
+    m->set_connection(local_connection.get());
+    dispatch_queue.local_delivery(m, m->get_priority());
+    return;
+  }
+
+  // remote, no existing pipe.
+  const Policy& policy = get_policy(dest_type);
+  if (policy.server) {
+    ldout(cct,20) << "submit_message " << *m << " remote, " << dest_addr << ", lossy server for target type "
+                  << ceph_entity_type_name(dest_type) << ", no session, dropping."
<< dendl; + m->put(); + } else { + ldout(cct,20) << "submit_message " << *m << " remote, " << dest_addr << ", new pipe." << dendl; + if (!already_locked) { + /** We couldn't handle the Message without reference to global data, so + * grab the lock and do it again. If we got here, we know it's a non-lossy + * Connection, so we can use our existing pointer without doing another lookup. */ + Mutex::Locker l(lock); + submit_message(m, con, dest_addr, dest_type, true); + } else { + connect_rank(dest_addr, dest_type, static_cast<PipeConnection*>(con), m); + } + } +} + +int SimpleMessenger::send_keepalive(Connection *con) +{ + int ret = 0; + Pipe *pipe = static_cast<Pipe *>( + static_cast<PipeConnection*>(con)->get_pipe()); + if (pipe) { + ldout(cct,20) << "send_keepalive con " << con << ", have pipe." << dendl; + ceph_assert(pipe->msgr == this); + pipe->pipe_lock.Lock(); + pipe->_send_keepalive(); + pipe->pipe_lock.Unlock(); + pipe->put(); + } else { + ldout(cct,0) << "send_keepalive con " << con << ", no pipe." << dendl; + ret = -EPIPE; + } + return ret; +} + + + +void SimpleMessenger::wait() +{ + lock.Lock(); + if (!started) { + lock.Unlock(); + return; + } + if (!stopped) + stop_cond.Wait(lock); + + lock.Unlock(); + + // done! clean up. + if (did_bind) { + ldout(cct,20) << "wait: stopping accepter thread" << dendl; + accepter.stop(); + did_bind = false; + ldout(cct,20) << "wait: stopped accepter thread" << dendl; + } + + dispatch_queue.shutdown(); + if (dispatch_queue.is_started()) { + ldout(cct,10) << "wait: waiting for dispatch queue" << dendl; + dispatch_queue.wait(); + dispatch_queue.discard_local(); + ldout(cct,10) << "wait: dispatch queue is stopped" << dendl; + } + + if (reaper_started) { + ldout(cct,20) << "wait: stopping reaper thread" << dendl; + lock.Lock(); + reaper_cond.Signal(); + reaper_stop = true; + lock.Unlock(); + reaper_thread.join(); + reaper_started = false; + ldout(cct,20) << "wait: stopped reaper thread" << dendl; + } + + // close+reap all pipes + lock.Lock(); + { + ldout(cct,10) << "wait: closing pipes" << dendl; + + while (!rank_pipe.empty()) { + Pipe *p = rank_pipe.begin()->second; + p->unregister_pipe(); + p->pipe_lock.Lock(); + p->stop_and_wait(); + // don't generate an event here; we're shutting down anyway. + PipeConnectionRef con = p->connection_state; + if (con) + con->clear_pipe(p); + p->pipe_lock.Unlock(); + } + + reaper(); + ldout(cct,10) << "wait: waiting for pipes " << pipes << " to close" << dendl; + while (!pipes.empty()) { + reaper_cond.Wait(lock); + reaper(); + } + } + lock.Unlock(); + + ldout(cct,10) << "wait: done." << dendl; + ldout(cct,1) << "shutdown complete." 
<< dendl; + started = false; +} + + +void SimpleMessenger::mark_down_all() +{ + ldout(cct,1) << "mark_down_all" << dendl; + lock.Lock(); + for (set<Pipe*>::iterator q = accepting_pipes.begin(); q != accepting_pipes.end(); ++q) { + Pipe *p = *q; + ldout(cct,5) << "mark_down_all accepting_pipe " << p << dendl; + p->pipe_lock.Lock(); + p->stop(); + PipeConnectionRef con = p->connection_state; + if (con && con->clear_pipe(p)) + dispatch_queue.queue_reset(con.get()); + p->pipe_lock.Unlock(); + } + accepting_pipes.clear(); + + while (!rank_pipe.empty()) { + ceph::unordered_map<entity_addr_t,Pipe*>::iterator it = rank_pipe.begin(); + Pipe *p = it->second; + ldout(cct,5) << "mark_down_all " << it->first << " " << p << dendl; + rank_pipe.erase(it); + p->unregister_pipe(); + p->pipe_lock.Lock(); + p->stop(); + PipeConnectionRef con = p->connection_state; + if (con && con->clear_pipe(p)) + dispatch_queue.queue_reset(con.get()); + p->pipe_lock.Unlock(); + } + lock.Unlock(); +} + +void SimpleMessenger::mark_down(const entity_addr_t& addr) +{ + lock.Lock(); + Pipe *p = _lookup_pipe(addr); + if (p) { + ldout(cct,1) << "mark_down " << addr << " -- " << p << dendl; + p->unregister_pipe(); + p->pipe_lock.Lock(); + p->stop(); + if (p->connection_state) { + // generate a reset event for the caller in this case, even + // though they asked for it, since this is the addr-based (and + // not Connection* based) interface + PipeConnectionRef con = p->connection_state; + if (con && con->clear_pipe(p)) + dispatch_queue.queue_reset(con.get()); + } + p->pipe_lock.Unlock(); + } else { + ldout(cct,1) << "mark_down " << addr << " -- pipe dne" << dendl; + } + lock.Unlock(); +} + +void SimpleMessenger::mark_down(Connection *con) +{ + if (con == NULL) + return; + lock.Lock(); + Pipe *p = static_cast<Pipe *>(static_cast<PipeConnection*>(con)->get_pipe()); + if (p) { + ldout(cct,1) << "mark_down " << con << " -- " << p << dendl; + ceph_assert(p->msgr == this); + p->unregister_pipe(); + p->pipe_lock.Lock(); + p->stop(); + if (p->connection_state) { + // do not generate a reset event for the caller in this case, + // since they asked for it. + p->connection_state->clear_pipe(p); + } + p->pipe_lock.Unlock(); + p->put(); + } else { + ldout(cct,1) << "mark_down " << con << " -- pipe dne" << dendl; + } + lock.Unlock(); +} + +void SimpleMessenger::mark_disposable(Connection *con) +{ + lock.Lock(); + Pipe *p = static_cast<Pipe *>(static_cast<PipeConnection*>(con)->get_pipe()); + if (p) { + ldout(cct,1) << "mark_disposable " << con << " -- " << p << dendl; + ceph_assert(p->msgr == this); + p->pipe_lock.Lock(); + p->policy.lossy = true; + p->pipe_lock.Unlock(); + p->put(); + } else { + ldout(cct,1) << "mark_disposable " << con << " -- pipe dne" << dendl; + } + lock.Unlock(); +} + +void SimpleMessenger::learned_addr(const entity_addr_t &peer_addr_for_me) +{ + // be careful here: multiple threads may block here, and readers of + // my_addr do NOT hold any lock. + + // this always goes from true -> false under the protection of the + // mutex. if it is already false, we need not retake the mutex at + // all. 
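+  //
+  // i.e. a double-checked pattern: the unlocked test below is safe
+  // because need_addr only ever goes true -> false, and it is
+  // re-checked under the lock before my_addr is written.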
+ if (!need_addr) + return; + + lock.Lock(); + if (need_addr && my_addr.is_blank_ip()) { + entity_addr_t t = peer_addr_for_me; + if (!did_bind) { + t.set_type(entity_addr_t::TYPE_ANY); + t.set_port(0); + } else { + t.set_type(entity_addr_t::TYPE_LEGACY); + t.set_port(my_addr.get_port()); + } + t.set_nonce(my_addr.get_nonce()); + ANNOTATE_BENIGN_RACE_SIZED(&my_addr, sizeof(my_addr), + "SimpleMessenger learned addr"); + set_myaddrs(entity_addrvec_t(t)); + ldout(cct,1) << "learned my addr " << my_addr << dendl; + need_addr = false; + init_local_connection(); + } + lock.Unlock(); +} + +void SimpleMessenger::init_local_connection() +{ + local_connection->peer_addrs = *my_addrs; + local_connection->peer_type = my_name.type(); + local_connection->set_features(CEPH_FEATURES_ALL); + ms_deliver_handle_fast_connect(local_connection.get()); +} diff --git a/src/msg/simple/SimpleMessenger.h b/src/msg/simple/SimpleMessenger.h new file mode 100644 index 00000000..b1aad539 --- /dev/null +++ b/src/msg/simple/SimpleMessenger.h @@ -0,0 +1,414 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_SIMPLEMESSENGER_H +#define CEPH_SIMPLEMESSENGER_H + +#include <list> +#include <map> + +#include "include/types.h" +#include "include/xlist.h" + +#include "include/unordered_map.h" +#include "include/unordered_set.h" + +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/Thread.h" +#include "common/Throttle.h" + +#include "include/spinlock.h" + +#include "msg/SimplePolicyMessenger.h" +#include "msg/Message.h" +#include "include/ceph_assert.h" + +#include "msg/DispatchQueue.h" +#include "Pipe.h" +#include "Accepter.h" + +/* + * This class handles transmission and reception of messages. Generally + * speaking, there are several major components: + * + * - Connection + * Each logical session is associated with a Connection. + * - Pipe + * Each network connection is handled through a pipe, which handles + * the input and output of each message. There is normally a 1:1 + * relationship between Pipe and Connection, but logical sessions may + * get handed off between Pipes when sockets reconnect or during + * connection races. + * - IncomingQueue + * Incoming messages are associated with an IncomingQueue, and there + * is one such queue associated with each Pipe. + * - DispatchQueue + * IncomingQueues get queued in the DIspatchQueue, which is responsible + * for doing a round-robin sweep and processing them via a worker thread. + * - SimpleMessenger + * It's the exterior class passed to the external message handler and + * most of the API details. + * + * Lock ordering: + * + * SimpleMessenger::lock + * Pipe::pipe_lock + * DispatchQueue::lock + * IncomingQueue::lock + */ + +class SimpleMessenger : public SimplePolicyMessenger { + // First we have the public Messenger interface implementation... +public: + /** + * Initialize the SimpleMessenger! + * + * @param cct The CephContext to use + * @param name The name to assign ourselves + * _nonce A unique ID to use for this SimpleMessenger. It should not + * be a value that will be repeated if the daemon restarts. 
+ * features The local features bits for the local_connection + */ + SimpleMessenger(CephContext *cct, entity_name_t name, + string mname, uint64_t _nonce); + + /** + * Destroy the SimpleMessenger. Pretty simple since all the work is done + * elsewhere. + */ + ~SimpleMessenger() override; + + /** @defgroup Accessors + * @{ + */ + bool set_addr_unknowns(const entity_addrvec_t& addr) override; + void set_addrs(const entity_addrvec_t &addr) override; + void set_myaddrs(const entity_addrvec_t& a) override; + + int get_dispatch_queue_len() override { + return dispatch_queue.get_queue_len(); + } + + double get_dispatch_queue_max_age(utime_t now) override { + return dispatch_queue.get_max_age(now); + } + /** @} Accessors */ + + /** + * @defgroup Configuration functions + * @{ + */ + void set_cluster_protocol(int p) override { + ceph_assert(!started && !did_bind); + cluster_protocol = p; + } + + int bind(const entity_addr_t& bind_addr) override; + int rebind(const set<int>& avoid_ports) override; + int client_bind(const entity_addr_t& bind_addr) override; + + /** @} Configuration functions */ + + /** + * @defgroup Startup/Shutdown + * @{ + */ + int start() override; + void wait() override; + int shutdown() override; + + /** @} // Startup/Shutdown */ + + /** + * @defgroup Messaging + * @{ + */ + int send_to( + Message *m, + int type, + const entity_addrvec_t& addr) override { + // temporary + return _send_message(m, entity_inst_t(entity_name_t(type, -1), + addr.legacy_addr())); + } + + int send_message(Message *m, Connection *con) { + return _send_message(m, con); + } + + /** @} // Messaging */ + + /** + * @defgroup Connection Management + * @{ + */ + ConnectionRef connect_to(int type, const entity_addrvec_t& addrs) override; + ConnectionRef get_loopback_connection() override; + int send_keepalive(Connection *con); + void mark_down(const entity_addr_t& addr) override; + void mark_down(Connection *con); + void mark_disposable(Connection *con); + void mark_down_all() override; + /** @} // Connection Management */ +protected: + /** + * @defgroup Messenger Interfaces + * @{ + */ + /** + * Start up the DispatchQueue thread once we have somebody to dispatch to. + */ + void ready() override; + /** @} // Messenger Interfaces */ +private: + /** + * @defgroup Inner classes + * @{ + */ + +public: + Accepter accepter; + DispatchQueue dispatch_queue; + + friend class Accepter; + + /** + * Register a new pipe for accept + * + * @param sd socket + */ + Pipe *add_accept_pipe(int sd); + +private: + + /** + * A thread used to tear down Pipes when they're complete. + */ + class ReaperThread : public Thread { + SimpleMessenger *msgr; + public: + explicit ReaperThread(SimpleMessenger *m) : msgr(m) {} + void *entry() override { + msgr->reaper_entry(); + return 0; + } + } reaper_thread; + + /** + * @} // Inner classes + */ + + /** + * @defgroup Utility functions + * @{ + */ + + /** + * Create a Pipe associated with the given entity (of the given type). + * Initiate the connection. (This function returning does not guarantee + * connection success.) + * + * @param addr The address of the entity to connect to. + * @param type The peer type of the entity at the address. + * @param con An existing Connection to associate with the new Pipe. If + * NULL, it creates a new Connection. + * @param first an initial message to queue on the new pipe + * + * @return a pointer to the newly-created Pipe. Caller does not own a + * reference; take one if you need it. 
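+   *
+   * Sketch of the calling convention (the OSD peer type is just an
+   * illustrative choice):
+   *
+   *   lock.Lock();               // SimpleMessenger::lock must be held
+   *   Pipe *p = connect_rank(addr, CEPH_ENTITY_TYPE_OSD, NULL, NULL);
+   *   if (p) p->get();           // take a ref if it outlives the lock
+   *   lock.Unlock();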
+ */ + Pipe *connect_rank(const entity_addr_t& addr, int type, PipeConnection *con, + Message *first); + /** + * Send a message, lazily or not. + * This just glues send_message together and passes + * the input on to submit_message. + */ + int _send_message(Message *m, const entity_inst_t& dest); + /** + * Same as above, but for the Connection-based variants. + */ + int _send_message(Message *m, Connection *con); + /** + * Queue up a Message for delivery to the entity specified + * by addr and dest_type. + * submit_message() is responsible for creating + * new Pipes (and closing old ones) as necessary. + * + * @param m The Message to queue up. This function eats a reference. + * @param con The existing Connection to use, or NULL if you don't know of one. + * @param addr The address to send the Message to. + * @param dest_type The peer type of the address we're sending to + * just drop silently under failure. + * @param already_locked If false, submit_message() will acquire the + * SimpleMessenger lock before accessing shared data structures; otherwise + * it will assume the lock is held. NOTE: if you are making a request + * without locking, you MUST have filled in the con with a valid pointer. + */ + void submit_message(Message *m, PipeConnection *con, + const entity_addr_t& addr, int dest_type, + bool already_locked); + /** + * Look through the pipes in the pipe_reap_queue and tear them down. + */ + void reaper(); + /** + * @} // Utility functions + */ + + // SimpleMessenger stuff + /// approximately unique ID set by the Constructor for use in entity_addr_t + uint64_t nonce; + /// overall lock used for SimpleMessenger data structures + Mutex lock; + /// true, specifying we haven't learned our addr; set false when we find it. + // maybe this should be protected by the lock? + bool need_addr; + +public: + bool get_need_addr() const { return need_addr; } + +private: + /** + * false; set to true if the SimpleMessenger bound to a specific address; + * and set false again by Accepter::stop(). This isn't lock-protected + * since you shouldn't be able to race the only writers. + */ + bool did_bind; + /// counter for the global seq our connection protocol uses + __u32 global_seq; + /// lock to protect the global_seq + ceph::spinlock global_seq_lock; + + entity_addr_t my_addr; + + /** + * hash map of addresses to Pipes + * + * NOTE: a Pipe* with state CLOSED may still be in the map but is considered + * invalid and can be replaced by anyone holding the msgr lock + */ + ceph::unordered_map<entity_addr_t, Pipe*> rank_pipe; + /** + * list of pipes are in the process of accepting + * + * These are not yet in the rank_pipe map. + */ + set<Pipe*> accepting_pipes; + /// a set of all the Pipes we have which are somehow active + set<Pipe*> pipes; + /// a list of Pipes we want to tear down + list<Pipe*> pipe_reap_queue; + + /// internal cluster protocol version, if any, for talking to entities of the same type. 
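+  /// (set via set_cluster_protocol(); e.g. an OSD passes CEPH_OSD_PROTOCOL)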
+ int cluster_protocol; + + Cond stop_cond; + bool stopped = true; + + bool reaper_started, reaper_stop; + Cond reaper_cond; + + /// This Cond is slept on by wait() and signaled by dispatch_entry() + Cond wait_cond; + + friend class Pipe; + + Pipe *_lookup_pipe(const entity_addr_t& k) { + ceph::unordered_map<entity_addr_t, Pipe*>::iterator p = rank_pipe.find(k); + if (p == rank_pipe.end()) + return NULL; + // see lock cribbing in Pipe::fault() + if (p->second->state_closed) + return NULL; + return p->second; + } + +public: + + int timeout; + + /// con used for sending messages to ourselves + ConnectionRef local_connection; + + /** + * @defgroup SimpleMessenger internals + * @{ + */ + + /** + * Increment the global sequence for this SimpleMessenger and return it. + * This is for the connect protocol, although it doesn't hurt if somebody + * else calls it. + * + * @return a global sequence ID that nobody else has seen. + */ + __u32 get_global_seq(__u32 old=0) { + std::lock_guard<decltype(global_seq_lock)> lg(global_seq_lock); + + if (old > global_seq) + global_seq = old; + __u32 ret = ++global_seq; + + return ret; + } + /** + * Get the protocol version we support for the given peer type: either + * a peer protocol (if it matches our own), the protocol version for the + * peer (if we're connecting), or our protocol version (if we're accepting). + */ + int get_proto_version(int peer_type, bool connect); + + /** + * Fill in the features, address and peer type for the local connection, which + * is used for delivering messages back to ourself. + */ + void init_local_connection(); + /** + * Tell the SimpleMessenger its full IP address. + * + * This is used by Pipes when connecting to other endpoints, and + * probably shouldn't be called by anybody else. + */ + void learned_addr(const entity_addr_t& peer_addr_for_me); + + /** + * This function is used by the reaper thread. As long as nobody + * has set reaper_stop, it calls the reaper function, then + * waits to be signaled when it needs to reap again (or when it needs + * to stop). + */ + void reaper_entry(); + /** + * Add a pipe to the pipe_reap_queue, to be torn down on + * the next call to reaper(). + * It should really only be the Pipe calling this, in our current + * implementation. + * + * @param pipe A Pipe which has stopped its threads and is + * ready to be torn down. + */ + void queue_reap(Pipe *pipe); + + /** + * Used to get whether this connection ready to send + */ + bool is_connected(Connection *con); + /** + * @} // SimpleMessenger Internals + */ +} ; + +#endif /* CEPH_SIMPLEMESSENGER_H */ diff --git a/src/msg/xio/XioConnection.cc b/src/msg/xio/XioConnection.cc new file mode 100644 index 00000000..4bfab39b --- /dev/null +++ b/src/msg/xio/XioConnection.cc @@ -0,0 +1,858 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Portions Copyright (C) 2013 CohortFS, LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "XioMsg.h" +#include "XioConnection.h" +#include "XioMessenger.h" +#include "messages/MDataPing.h" +#include "msg/msg_types.h" +#include "auth/none/AuthNoneProtocol.h" // XXX + +#include "include/ceph_assert.h" +#include "common/dout.h" + +extern struct xio_mempool *xio_msgr_mpool; +extern struct xio_mempool *xio_msgr_noreg_mpool; + +#define dout_subsys ceph_subsys_xio + +void print_xio_msg_hdr(CephContext *cct, const char *tag, + const XioMsgHdr &hdr, const struct xio_msg *msg) +{ + if (msg) { + ldout(cct,4) << tag << + " xio msg:" << + " sn: " << msg->sn << + " timestamp: " << msg->timestamp << + dendl; + } + + ldout(cct,4) << tag << + " ceph header: " << + " front_len: " << hdr.hdr->front_len << + " seq: " << hdr.hdr->seq << + " tid: " << hdr.hdr->tid << + " type: " << hdr.hdr->type << + " prio: " << hdr.hdr->priority << + " name type: " << (int) hdr.hdr->src.type << + " name num: " << (int) hdr.hdr->src.num << + " version: " << hdr.hdr->version << + " compat_version: " << hdr.hdr->compat_version << + " front_len: " << hdr.hdr->front_len << + " middle_len: " << hdr.hdr->middle_len << + " data_len: " << hdr.hdr->data_len << + " xio header: " << + " msg_cnt: " << hdr.msg_cnt << + dendl; + + ldout(cct,4) << tag << + " ceph footer: " << + " front_crc: " << hdr.ftr->front_crc << + " middle_crc: " << hdr.ftr->middle_crc << + " data_crc: " << hdr.ftr->data_crc << + " sig: " << hdr.ftr->sig << + " flags: " << (uint32_t) hdr.ftr->flags << + dendl; +} + +void print_ceph_msg(CephContext *cct, const char *tag, Message *m) +{ + if (m->get_magic() & (MSG_MAGIC_XIO & MSG_MAGIC_TRACE_DTOR)) { + ceph_msg_header& header = m->get_header(); + ldout(cct,4) << tag << " header version " << header.version << + " compat version " << header.compat_version << + dendl; + } +} + +#undef dout_prefix +#define dout_prefix conn_prefix(_dout) +ostream& XioConnection::conn_prefix(std::ostream *_dout) { + return *_dout << "-- " << get_messenger()->get_myinst().addr << " >> " << peer_addr + << " peer=" << peer.name.type_str() + << " conn=" << conn << " sess=" << session << " "; +} + +XioConnection::XioConnection(XioMessenger *m, XioConnection::type _type, + const entity_inst_t& _peer) : + Connection(m->cct, m), + xio_conn_type(_type), + portal(m->get_portal()), + connected(false), + peer(_peer), + session(NULL), + conn(NULL), + magic(m->get_magic()), + scount(0), + send_ctr(0), + in_seq(), + cstate(this) +{ + set_peer_type(peer.name.type()); + set_peer_addr(peer.addr); + + Messenger::Policy policy; + int64_t max_msgs = 0, max_bytes = 0, bytes_opt = 0; + int xopt; + + policy = m->get_policy(peer_type); + + if (policy.throttler_messages) { + max_msgs = policy.throttler_messages->get_max(); + ldout(m->cct,4) << "XioMessenger throttle_msgs: " << max_msgs << dendl; + } + + xopt = m->cct->_conf->xio_queue_depth; + if (max_msgs > xopt) + xopt = max_msgs; + + /* set high mark for send, reserved 20% for credits */ + q_high_mark = xopt * 4 / 5; + q_low_mark = q_high_mark/2; + + /* set send & receive msgs queue depth */ + xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_SND_QUEUE_DEPTH_MSGS, + &xopt, sizeof(xopt)); + xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_RCV_QUEUE_DEPTH_MSGS, + &xopt, sizeof(xopt)); + + if (policy.throttler_bytes) { + max_bytes = policy.throttler_bytes->get_max(); + ldout(m->cct,4) << "XioMessenger throttle_bytes: " << max_bytes << dendl; + } + + bytes_opt = (2 << 28); /* default: 512 MB */ + if (max_bytes > bytes_opt) + bytes_opt = max_bytes; + + /* set send & receive total 
bytes throttle */
+  xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_SND_QUEUE_DEPTH_BYTES,
+              &bytes_opt, sizeof(bytes_opt));
+  xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_RCV_QUEUE_DEPTH_BYTES,
+              &bytes_opt, sizeof(bytes_opt));
+
+  ldout(m->cct,4) << "throttle_msgs: " << xopt << " throttle_bytes: " << bytes_opt << dendl;
+
+  /* XXXX fake features, aieee! */
+  set_features(XIO_ALL_FEATURES);
+}
+
+int XioConnection::send_message(Message *m)
+{
+  XioMessenger *ms = static_cast<XioMessenger*>(get_messenger());
+  return ms->_send_message(m, this);
+}
+
+void XioConnection::send_keepalive_or_ack(bool ack, const utime_t *tp)
+{
+  /* If con is not in the UP state, we need to queue the request */
+  if (cstate.session_state.read() != XioConnection::UP) {
+    std::lock_guard<ceph::util::spinlock> lg(sp);
+    if (cstate.session_state.read() != XioConnection::UP) {
+      if (ack) {
+        outgoing.ack = true;
+        outgoing.ack_time = *tp;
+      }
+      else {
+        outgoing.keepalive = true;
+      }
+      return;
+    }
+  }
+
+  send_keepalive_or_ack_internal(ack, tp);
+}
+
+void XioConnection::send_keepalive_or_ack_internal(bool ack, const utime_t *tp)
+{
+  XioCommand *xcmd = pool_alloc_xio_command(this);
+  if (! xcmd) {
+    /* could happen if Accelio has been shutdown */
+    return;
+  }
+
+  struct ceph_timespec ts;
+  if (ack) {
+    ceph_assert(tp);
+    tp->encode_timeval(&ts);
+    xcmd->get_bl_ref().append(CEPH_MSGR_TAG_KEEPALIVE2_ACK);
+    xcmd->get_bl_ref().append((char*)&ts, sizeof(ts));
+  } else if (has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) {
+    utime_t t = ceph_clock_now();
+    t.encode_timeval(&ts);
+    xcmd->get_bl_ref().append(CEPH_MSGR_TAG_KEEPALIVE2);
+    xcmd->get_bl_ref().append((char*)&ts, sizeof(ts));
+  } else {
+    xcmd->get_bl_ref().append(CEPH_MSGR_TAG_KEEPALIVE);
+  }
+
+  const std::list<buffer::ptr>& header = xcmd->get_bl_ref().buffers();
+  ceph_assert(header.size() == 1); /* accelio header must be without scatter gather */
+  list<bufferptr>::const_iterator pb = header.begin();
+  ceph_assert(pb->length() < XioMsgHdr::get_max_encoded_length());
+  struct xio_msg * msg = xcmd->get_xio_msg();
+  msg->out.header.iov_base = (char*) pb->c_str();
+  msg->out.header.iov_len = pb->length();
+
+  ldout(msgr->cct,8) << __func__ << " sending command with tag " << (int)(*(char*)msg->out.header.iov_base)
+                     << " len " << msg->out.header.iov_len << dendl;
+
+  portal->enqueue(this, xcmd);
+}
+
+
+int XioConnection::passive_setup()
+{
+  /* XXX passive setup is a placeholder for (potentially active-side
+     initiated) feature and auth* negotiation */
+  static bufferlist authorizer_reply; /* static because fake */
+  static CryptoKey session_key; /* ditto */
+  bool authorizer_valid;
+
+  XioMessenger *msgr = static_cast<XioMessenger*>(get_messenger());
+
+  // fake an auth buffer
+  EntityName name;
+  name.set_type(peer.name.type());
+
+  AuthNoneAuthorizer auth;
+  auth.build_authorizer(name, peer.name.num());
+
+  /* XXX fake authorizer!
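+   * (AuthNoneAuthorizer::build_authorizer() only encodes the entity
+   * name and global id and carries no key, so the verify step below
+   * is expected to always succeed; see auth/none/AuthNoneProtocol.h.)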
*/ + msgr->ms_deliver_verify_authorizer( + this, peer_type, CEPH_AUTH_NONE, + auth.bl, + 0, + authorizer_reply, + authorizer_valid, + session_key); + + /* notify hook */ + msgr->ms_deliver_handle_accept(this); + msgr->ms_deliver_handle_fast_accept(this); + + /* try to insert in conns_entity_map */ + msgr->try_insert(this); + return (0); +} + +static inline XioDispatchHook* pool_alloc_xio_dispatch_hook( + XioConnection *xcon, Message *m, XioInSeq& msg_seq) +{ + struct xio_reg_mem mp_mem; + int e = xpool_alloc(xio_msgr_noreg_mpool, + sizeof(XioDispatchHook), &mp_mem); + if (!!e) + return NULL; + XioDispatchHook *xhook = static_cast<XioDispatchHook*>(mp_mem.addr); + new (xhook) XioDispatchHook(xcon, m, msg_seq, mp_mem); + return xhook; +} + +int XioConnection::handle_data_msg(struct xio_session *session, + struct xio_msg *msg, + int more_in_batch, + void *cb_user_context) +{ + struct xio_msg *tmsg = msg; + + /* XXX Accelio guarantees message ordering at + * xio_session */ + + if (! in_seq.p()) { + if (!tmsg->in.header.iov_len) { + ldout(msgr->cct,0) << __func__ << " empty header: packet out of sequence?" << dendl; + xio_release_msg(msg); + return 0; + } + const size_t sizeof_tag = 1; + XioMsgCnt msg_cnt( + buffer::create_static(tmsg->in.header.iov_len-sizeof_tag, + ((char*) tmsg->in.header.iov_base)+sizeof_tag)); + ldout(msgr->cct,10) << __func__ << " receive msg " << "tmsg " << tmsg + << " msg_cnt " << msg_cnt.msg_cnt + << " iov_base " << tmsg->in.header.iov_base + << " iov_len " << (int) tmsg->in.header.iov_len + << " nents " << tmsg->in.pdata_iov.nents + << " sn " << tmsg->sn << dendl; + ceph_assert(session == this->session); + in_seq.set_count(msg_cnt.msg_cnt); + } else { + /* XXX major sequence error */ + ceph_assert(! tmsg->in.header.iov_len); + } + + in_seq.append(msg); + if (in_seq.count() > 0) { + return 0; + } + + XioMessenger *msgr = static_cast<XioMessenger*>(get_messenger()); + XioDispatchHook *m_hook = + pool_alloc_xio_dispatch_hook(this, NULL /* msg */, in_seq); + XioInSeq& msg_seq = m_hook->msg_seq; + in_seq.clear(); + + ceph_msg_header header; + ceph_msg_footer footer; + buffer::list payload, middle, data; + + const utime_t recv_stamp = ceph_clock_now(); + + ldout(msgr->cct,4) << __func__ << " " << "msg_seq.size()=" << msg_seq.size() << + dendl; + + struct xio_msg* msg_iter = msg_seq.begin(); + tmsg = msg_iter; + XioMsgHdr hdr(header, footer, + buffer::create_static(tmsg->in.header.iov_len, + (char*) tmsg->in.header.iov_base)); + + if (magic & (MSG_MAGIC_TRACE_XCON)) { + if (hdr.hdr->type == 43) { + print_xio_msg_hdr(msgr->cct, "on_msg", hdr, NULL); + } + } + + unsigned int ix, blen, iov_len; + struct xio_iovec_ex *msg_iov, *iovs; + uint32_t take_len, left_len = 0; + char *left_base = NULL; + + ix = 0; + blen = header.front_len; + + while (blen && (msg_iter != msg_seq.end())) { + tmsg = msg_iter; + iov_len = vmsg_sglist_nents(&tmsg->in); + iovs = vmsg_sglist(&tmsg->in); + for (; blen && (ix < iov_len); ++ix) { + msg_iov = &iovs[ix]; + + /* XXX need to detect any buffer which needs to be + * split due to coalescing of a segment (front, middle, + * data) boundary */ + + take_len = std::min(blen, msg_iov->iov_len); + payload.append( + buffer::create_msg( + take_len, (char*) msg_iov->iov_base, m_hook)); + blen -= take_len; + if (! 
blen) { + left_len = msg_iov->iov_len - take_len; + if (left_len) { + left_base = ((char*) msg_iov->iov_base) + take_len; + } + } + } + /* XXX as above, if a buffer is split, then we needed to track + * the new start (carry) and not advance */ + if (ix == iov_len) { + msg_seq.next(&msg_iter); + ix = 0; + } + } + + if (magic & (MSG_MAGIC_TRACE_XCON)) { + if (hdr.hdr->type == 43) { + ldout(msgr->cct,4) << "front (payload) dump:"; + payload.hexdump( *_dout ); + *_dout << dendl; + } + } + + blen = header.middle_len; + + if (blen && left_len) { + middle.append( + buffer::create_msg(left_len, left_base, m_hook)); + left_len = 0; + } + + while (blen && (msg_iter != msg_seq.end())) { + tmsg = msg_iter; + iov_len = vmsg_sglist_nents(&tmsg->in); + iovs = vmsg_sglist(&tmsg->in); + for (; blen && (ix < iov_len); ++ix) { + msg_iov = &iovs[ix]; + take_len = std::min(blen, msg_iov->iov_len); + middle.append( + buffer::create_msg( + take_len, (char*) msg_iov->iov_base, m_hook)); + blen -= take_len; + if (! blen) { + left_len = msg_iov->iov_len - take_len; + if (left_len) { + left_base = ((char*) msg_iov->iov_base) + take_len; + } + } + } + if (ix == iov_len) { + msg_seq.next(&msg_iter); + ix = 0; + } + } + + blen = header.data_len; + + if (blen && left_len) { + data.append( + buffer::create_msg(left_len, left_base, m_hook)); + left_len = 0; + } + + while (blen && (msg_iter != msg_seq.end())) { + tmsg = msg_iter; + iov_len = vmsg_sglist_nents(&tmsg->in); + iovs = vmsg_sglist(&tmsg->in); + for (; blen && (ix < iov_len); ++ix) { + msg_iov = &iovs[ix]; + data.append( + buffer::create_msg( + msg_iov->iov_len, (char*) msg_iov->iov_base, m_hook)); + blen -= msg_iov->iov_len; + } + if (ix == iov_len) { + msg_seq.next(&msg_iter); + ix = 0; + } + } + + /* update connection timestamp */ + recv = tmsg->timestamp; + + Message *m = decode_message(msgr->cct, msgr->crcflags, header, footer, + payload, middle, data, this); + + if (m) { + /* completion */ + m->set_connection(this); + + /* reply hook */ + m_hook->set_message(m); + m->set_completion_hook(m_hook); + + /* trace flag */ + m->set_magic(magic); + + /* update timestamps */ + m->set_recv_stamp(recv_stamp); + m->set_recv_complete_stamp(ceph_clock_now()); + m->set_seq(header.seq); + + /* MP-SAFE */ + state.set_in_seq(header.seq); + + /* XXXX validate peer type */ + if (peer_type != (int) hdr.peer_type) { /* XXX isn't peer_type -1? 
*/ + peer_type = hdr.peer_type; + peer_addr = hdr.addr; + peer.addr = peer_addr; + peer.name = entity_name_t(hdr.hdr->src); + if (xio_conn_type == XioConnection::PASSIVE) { + /* XXX kick off feature/authn/authz negotiation + * nb: very possibly the active side should initiate this, but + * for now, call a passive hook so OSD and friends can create + * sessions without actually negotiating + */ + passive_setup(); + } + } + + if (magic & (MSG_MAGIC_TRACE_XCON)) { + ldout(msgr->cct,4) << "decode m is " << m->get_type() << dendl; + } + + /* dispatch it */ + msgr->ds_dispatch(m); + } else { + /* responds for undecoded messages and frees hook */ + ldout(msgr->cct,4) << "decode m failed" << dendl; + m_hook->on_err_finalize(this); + } + + return 0; +} + +int XioConnection::on_msg(struct xio_session *session, + struct xio_msg *msg, + int more_in_batch, + void *cb_user_context) +{ + char tag = CEPH_MSGR_TAG_MSG; + if (msg->in.header.iov_len) + tag = *(char*)msg->in.header.iov_base; + + ldout(msgr->cct,8) << __func__ << " receive msg with iov_len " + << (int) msg->in.header.iov_len << " tag " << (int)tag << dendl; + + //header_len_without_tag is only meaningful in case we have tag + size_t header_len_without_tag = msg->in.header.iov_len - sizeof(tag); + + switch(tag) { + case CEPH_MSGR_TAG_MSG: + ldout(msgr->cct, 20) << __func__ << " got data message" << dendl; + return handle_data_msg(session, msg, more_in_batch, cb_user_context); + + case CEPH_MSGR_TAG_KEEPALIVE: + ldout(msgr->cct, 20) << __func__ << " got KEEPALIVE" << dendl; + set_last_keepalive(ceph_clock_now()); + break; + + case CEPH_MSGR_TAG_KEEPALIVE2: + if (header_len_without_tag < sizeof(ceph_timespec)) { + lderr(msgr->cct) << __func__ << " too few data for KEEPALIVE2: got " << header_len_without_tag << + " bytes instead of " << sizeof(ceph_timespec) << " bytes" << dendl; + } + else { + ceph_timespec *t = (ceph_timespec *) ((char*)msg->in.header.iov_base + sizeof(tag)); + utime_t kp_t = utime_t(*t); + ldout(msgr->cct, 20) << __func__ << " got KEEPALIVE2 with timestamp" << kp_t << dendl; + send_keepalive_or_ack(true, &kp_t); + set_last_keepalive(ceph_clock_now()); + } + + break; + + case CEPH_MSGR_TAG_KEEPALIVE2_ACK: + if (header_len_without_tag < sizeof(ceph_timespec)) { + lderr(msgr->cct) << __func__ << " too few data for KEEPALIVE2_ACK: got " << header_len_without_tag << + " bytes instead of " << sizeof(ceph_timespec) << " bytes" << dendl; + } + else { + ceph_timespec *t = (ceph_timespec *) ((char*)msg->in.header.iov_base + sizeof(tag)); + utime_t kp_t(*t); + ldout(msgr->cct, 20) << __func__ << " got KEEPALIVE2_ACK with timestamp" << kp_t << dendl; + set_last_keepalive_ack(kp_t); + } + break; + + default: + lderr(msgr->cct) << __func__ << " unsupported message tag " << (int) tag << dendl; + ceph_assert(! 
"unsupported message tag"); + } + + xio_release_msg(msg); + return 0; +} + + +int XioConnection::on_ow_msg_send_complete(struct xio_session *session, + struct xio_msg *req, + void *conn_user_context) +{ + /* requester send complete (one-way) */ + uint64_t rc = ++scount; + + XioSend* xsend = static_cast<XioSend*>(req->user_context); + if (unlikely(magic & MSG_MAGIC_TRACE_CTR)) { + if (unlikely((rc % 1000000) == 0)) { + std::cout << "xio finished " << rc << " " << time(0) << std::endl; + } + } /* trace ctr */ + + ldout(msgr->cct,11) << "on_msg_delivered xcon: " << xsend->xcon << + " msg: " << req << " sn: " << req->sn << dendl; + + XioMsg *xmsg = dynamic_cast<XioMsg*>(xsend); + if (xmsg) { + ldout(msgr->cct,11) << "on_msg_delivered xcon: " << + " type: " << xmsg->m->get_type() << " tid: " << xmsg->m->get_tid() << + " seq: " << xmsg->m->get_seq() << dendl; + } + + --send_ctr; /* atomic, because portal thread */ + + /* unblock flow-controlled connections, avoid oscillation */ + if (unlikely(cstate.session_state.read() == + XioConnection::FLOW_CONTROLLED)) { + if ((send_ctr <= uint32_t(xio_qdepth_low_mark())) && + (1 /* XXX memory <= memory low-water mark */)) { + cstate.state_up_ready(XioConnection::CState::OP_FLAG_NONE); + ldout(msgr->cct,2) << "on_msg_delivered xcon: " << xsend->xcon + << " up_ready from flow_controlled" << dendl; + } + } + + xsend->put(); + + return 0; +} /* on_msg_delivered */ + +void XioConnection::msg_send_fail(XioSend *xsend, int code) +{ + ldout(msgr->cct,2) << "xio_send_msg FAILED xcon: " << this << + " msg: " << xsend->get_xio_msg() << " code=" << code << + " (" << xio_strerror(code) << ")" << dendl; + /* return refs taken for each xio_msg */ + xsend->put_msg_refs(); +} /* msg_send_fail */ + +void XioConnection::msg_release_fail(struct xio_msg *msg, int code) +{ + ldout(msgr->cct,2) << "xio_release_msg FAILED xcon: " << this << + " msg: " << msg << "code=" << code << + " (" << xio_strerror(code) << ")" << dendl; +} /* msg_release_fail */ + +int XioConnection::flush_out_queues(uint32_t flags) { + XioMessenger* msgr = static_cast<XioMessenger*>(get_messenger()); + if (! (flags & CState::OP_FLAG_LOCKED)) + sp.lock(); + + if (outgoing.keepalive) { + outgoing.keepalive = false; + send_keepalive_or_ack_internal(); + } + + if (outgoing.ack) { + outgoing.ack = false; + send_keepalive_or_ack_internal(true, &outgoing.ack_time); + } + + // send deferred 1 (direct backpresssure) + if (outgoing.requeue.size() > 0) + portal->requeue(this, outgoing.requeue); + + // send deferred 2 (sent while deferred) + int ix, q_size = outgoing.mqueue.size(); + for (ix = 0; ix < q_size; ++ix) { + Message::Queue::iterator q_iter = outgoing.mqueue.begin(); + Message* m = &(*q_iter); + outgoing.mqueue.erase(q_iter); + msgr->_send_message_impl(m, this); + } + if (! (flags & CState::OP_FLAG_LOCKED)) + sp.unlock(); + return 0; +} + +int XioConnection::discard_out_queues(uint32_t flags) +{ + Message::Queue disc_q; + XioSubmit::Queue deferred_q; + + if (! (flags & CState::OP_FLAG_LOCKED)) + sp.lock(); + + /* the two send queues contain different objects: + * - anything on the mqueue is a Message + * - anything on the requeue is an XioSend + */ + Message::Queue::const_iterator i1 = disc_q.end(); + disc_q.splice(i1, outgoing.mqueue); + + XioSubmit::Queue::const_iterator i2 = deferred_q.end(); + deferred_q.splice(i2, outgoing.requeue); + + outgoing.keepalive = outgoing.ack = false; + + if (! 
(flags & CState::OP_FLAG_LOCKED)) + sp.unlock(); + + // mqueue + while (!disc_q.empty()) { + Message::Queue::iterator q_iter = disc_q.begin(); + Message* m = &(*q_iter); + disc_q.erase(q_iter); + m->put(); + } + + // requeue + while (!deferred_q.empty()) { + XioSubmit::Queue::iterator q_iter = deferred_q.begin(); + XioSubmit* xs = &(*q_iter); + XioSend* xsend; + switch (xs->type) { + case XioSubmit::OUTGOING_MSG: + xsend = static_cast<XioSend*>(xs); + deferred_q.erase(q_iter); + // release once for each chained xio_msg + xsend->put(xsend->get_msg_count()); + break; + case XioSubmit::INCOMING_MSG_RELEASE: + deferred_q.erase(q_iter); + portal->release_xio_msg(static_cast<XioCompletion*>(xs)); + break; + default: + ldout(msgr->cct,0) << __func__ << ": Unknown Msg type " << xs->type << dendl; + break; + } + } + + return 0; +} + +int XioConnection::adjust_clru(uint32_t flags) +{ + if (flags & CState::OP_FLAG_LOCKED) + sp.unlock(); + + XioMessenger* msgr = static_cast<XioMessenger*>(get_messenger()); + msgr->conns_sp.lock(); + sp.lock(); + + if (cstate.flags & CState::FLAG_MAPPED) { + XioConnection::ConnList::iterator citer = + XioConnection::ConnList::s_iterator_to(*this); + msgr->conns_list.erase(citer); + msgr->conns_list.push_front(*this); // LRU + } + + msgr->conns_sp.unlock(); + + if (! (flags & CState::OP_FLAG_LOCKED)) + sp.unlock(); + + return 0; +} + +int XioConnection::on_msg_error(struct xio_session *session, + enum xio_status error, + struct xio_msg *msg, + void *conn_user_context) +{ + XioSend *xsend = static_cast<XioSend*>(msg->user_context); + if (xsend) + xsend->put(); + + --send_ctr; /* atomic, because portal thread */ + return 0; +} /* on_msg_error */ + +void XioConnection::mark_down() +{ + _mark_down(XioConnection::CState::OP_FLAG_NONE); +} + +int XioConnection::_mark_down(uint32_t flags) +{ + if (! (flags & CState::OP_FLAG_LOCKED)) + sp.lock(); + + // per interface comment, we only stage a remote reset if the + // current policy required it + if (cstate.policy.resetcheck) + cstate.flags |= CState::FLAG_RESET; + + disconnect(); + + /* XXX this will almost certainly be called again from + * on_disconnect_event() */ + discard_out_queues(flags|CState::OP_FLAG_LOCKED); + + if (! (flags & CState::OP_FLAG_LOCKED)) + sp.unlock(); + + return 0; +} + +void XioConnection::mark_disposable() +{ + _mark_disposable(XioConnection::CState::OP_FLAG_NONE); +} + +int XioConnection::_mark_disposable(uint32_t flags) +{ + if (! (flags & CState::OP_FLAG_LOCKED)) + sp.lock(); + + cstate.policy.lossy = true; + + if (! (flags & CState::OP_FLAG_LOCKED)) + sp.unlock(); + + return 0; +} + +int XioConnection::CState::state_up_ready(uint32_t flags) +{ + if (! (flags & CState::OP_FLAG_LOCKED)) + xcon->sp.lock(); + + xcon->flush_out_queues(flags|CState::OP_FLAG_LOCKED); + + session_state = session_states::UP; + startup_state = session_startup_states::READY; + + if (! (flags & CState::OP_FLAG_LOCKED)) + xcon->sp.unlock(); + + return (0); +} + +int XioConnection::CState::state_discon() +{ + session_state = session_states::DISCONNECTED; + startup_state = session_startup_states::IDLE; + + return 0; +} + +int XioConnection::CState::state_flow_controlled(uint32_t flags) +{ + if (! (flags & OP_FLAG_LOCKED)) + xcon->sp.lock(); + + session_state = session_states::FLOW_CONTROLLED; + + if (! (flags & OP_FLAG_LOCKED)) + xcon->sp.unlock(); + + return (0); +} + +int XioConnection::CState::state_fail(Message* m, uint32_t flags) +{ + if (! 
(flags & OP_FLAG_LOCKED)) + xcon->sp.lock(); + + // advance to state FAIL, drop queued, msgs, adjust LRU + session_state = session_states::DISCONNECTED; + startup_state = session_startup_states::FAIL; + + xcon->discard_out_queues(flags|OP_FLAG_LOCKED); + xcon->adjust_clru(flags|OP_FLAG_LOCKED|OP_FLAG_LRU); + + xcon->disconnect(); + + if (! (flags & OP_FLAG_LOCKED)) + xcon->sp.unlock(); + + // notify ULP + XioMessenger* msgr = static_cast<XioMessenger*>(xcon->get_messenger()); + msgr->ms_deliver_handle_reset(xcon); + m->put(); + + return 0; +} + + +int XioLoopbackConnection::send_message(Message *m) +{ + XioMessenger *ms = static_cast<XioMessenger*>(get_messenger()); + m->set_connection(this); + m->set_seq(next_seq()); + m->set_src(ms->get_myinst().name); + ms->ds_dispatch(m); + return 0; +} + +void XioLoopbackConnection::send_keepalive() +{ + utime_t t = ceph_clock_now(); + set_last_keepalive(t); + set_last_keepalive_ack(t); +} diff --git a/src/msg/xio/XioConnection.h b/src/msg/xio/XioConnection.h new file mode 100644 index 00000000..00024ef3 --- /dev/null +++ b/src/msg/xio/XioConnection.h @@ -0,0 +1,380 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Portions Copyright (C) 2013 CohortFS, LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef XIO_CONNECTION_H +#define XIO_CONNECTION_H + +#include <atomic> + +#include <boost/intrusive/avl_set.hpp> +#include <boost/intrusive/list.hpp> + +extern "C" { +#include "libxio.h" +} + +#include "XioInSeq.h" +#include "XioSubmit.h" +#include "msg/Connection.h" +#include "msg/Messenger.h" +#include "auth/AuthSessionHandler.h" + +#define XIO_ALL_FEATURES (CEPH_FEATURES_ALL) + + +#define XIO_NOP_TAG_MARKDOWN 0x0001 + +namespace bi = boost::intrusive; + +class XioPortal; +class XioMessenger; +class XioSend; + +class XioConnection : public Connection +{ +public: + enum type { ACTIVE, PASSIVE }; + + enum class session_states : unsigned { + INIT = 0, + START, + UP, + FLOW_CONTROLLED, + DISCONNECTED, + DELETED, + BARRIER + }; + + enum class session_startup_states : unsigned { + IDLE = 0, + CONNECTING, + ACCEPTING, + READY, + FAIL + }; + +private: + XioConnection::type xio_conn_type; + XioPortal *portal; + std::atomic<bool> connected = { false }; + entity_inst_t peer; + struct xio_session *session; + struct xio_connection *conn; + ceph::util::spinlock sp; + std::atomic<int64_t> send = { 0 }; + std::atomic<int64_t> recv = { 0 }; + uint32_t n_reqs; // Accelio-initiated reqs in progress (!counting partials) + uint32_t magic; + uint32_t special_handling; + uint64_t scount; + uint32_t send_ctr; + int q_high_mark; + int q_low_mark; + + struct lifecycle { + // different from Pipe states? 
+    enum lf_state {
+      INIT,
+      LOCAL_DISCON,
+      REMOTE_DISCON,
+      RECONNECTING,
+      UP,
+      DEAD } state;
+
+    /* XXX */
+    uint32_t reconnects;
+    uint32_t connect_seq, peer_global_seq;
+    uint64_t in_seq, out_seq_acked; // atomic<uint64_t>, got receipt
+    std::atomic<uint64_t> out_seq = { 0 };
+
+    lifecycle() : state(lifecycle::INIT), reconnects(0), connect_seq(0),
+                  peer_global_seq(0), in_seq(0), out_seq_acked(0)
+    {}
+
+    void set_in_seq(uint64_t seq) {
+      in_seq = seq;
+    }
+
+    uint64_t next_out_seq() {
+      return ++out_seq;
+    }
+
+  } state;
+
+  /* batching */
+  XioInSeq in_seq;
+
+  class CState
+  {
+  public:
+    static const int FLAG_NONE = 0x0000;
+    static const int FLAG_BAD_AUTH = 0x0001;
+    static const int FLAG_MAPPED = 0x0002;
+    static const int FLAG_RESET = 0x0004;
+
+    static const int OP_FLAG_NONE = 0x0000;
+    static const int OP_FLAG_LOCKED = 0x0001;
+    static const int OP_FLAG_LRU = 0x0002;
+
+    uint64_t features;
+    Messenger::Policy policy;
+
+    CryptoKey session_key;
+    std::shared_ptr<AuthSessionHandler> session_security;
+    AuthAuthorizer *authorizer;
+    XioConnection *xcon;
+    uint32_t protocol_version;
+
+    std::atomic<session_states> session_state = { session_states::INIT };
+    std::atomic<session_startup_states> startup_state = {
+      session_startup_states::IDLE };
+
+    uint32_t reconnects;
+    uint32_t connect_seq, global_seq, peer_global_seq;
+    uint64_t in_seq, out_seq_acked; // atomic<uint64_t>, got receipt
+    std::atomic<uint64_t> out_seq = { 0 };
+
+    uint32_t flags;
+
+    explicit CState(XioConnection* _xcon)
+      : features(0),
+        authorizer(NULL),
+        xcon(_xcon),
+        protocol_version(0),
+        session_state(session_states::INIT),
+        startup_state(session_startup_states::IDLE),
+        reconnects(0),
+        connect_seq(0),
+        global_seq(0),
+        peer_global_seq(0),
+        in_seq(0),
+        out_seq_acked(0),
+        flags(FLAG_NONE) {}
+
+    session_states get_session_state() {
+      return session_state;
+    }
+
+    session_startup_states get_startup_state() {
+      return startup_state;
+    }
+
+    void set_in_seq(uint64_t seq) {
+      in_seq = seq;
+    }
+
+    uint64_t next_out_seq() {
+      return ++out_seq;
+    }
+
+    // state machine
+    int init_state();
+    int next_state(Message* m);
+#if 0 // future (session startup)
+    int msg_connect(MConnect *m);
+    int msg_connect_reply(MConnectReply *m);
+    int msg_connect_reply(MConnectAuthReply *m);
+    int msg_connect_auth(MConnectAuth *m);
+    int msg_connect_auth_reply(MConnectAuthReply *m);
+#endif
+    int state_up_ready(uint32_t flags);
+    int state_flow_controlled(uint32_t flags);
+    int state_discon();
+    int state_fail(Message* m, uint32_t flags);
+
+  } cstate; /* CState */
+
+  // message submission queue
+  struct SendQ {
+    bool keepalive;
+    bool ack;
+    utime_t ack_time;
+    Message::Queue mqueue; // deferred
+    XioSubmit::Queue requeue;
+
+    SendQ() : keepalive(false), ack(false) {}
+  } outgoing;
+
+  // conns_entity_map comparison functor
+  struct EntityComp
+  {
+    // for internal ordering
+    bool operator()(const XioConnection &lhs, const XioConnection &rhs) const
+      { return lhs.get_peer() < rhs.get_peer(); }
+
+    // for external search by entity_inst_t(peer)
+    bool operator()(const entity_inst_t &peer, const XioConnection &c) const
+      { return peer < c.get_peer(); }
+
+    bool operator()(const XioConnection &c, const entity_inst_t &peer) const
+      { return c.get_peer() < peer; }
+  };
+
+  bi::list_member_hook<> conns_hook;
+  bi::avl_set_member_hook<> conns_entity_map_hook;
+
+  typedef bi::list< XioConnection,
+                    bi::member_hook<XioConnection, bi::list_member_hook<>,
+                                    &XioConnection::conns_hook > > ConnList;
+
+  typedef bi::member_hook<XioConnection, bi::avl_set_member_hook<>,
+                          &XioConnection::conns_entity_map_hook> EntityHook;
+
+  typedef bi::avl_set<
XioConnection, EntityHook, + bi::compare<EntityComp> > EntitySet; + + friend class XioPortal; + friend class XioMessenger; + friend class XioDispatchHook; + friend class XioMarkDownHook; + friend class XioSend; + + int on_disconnect_event() { + std::lock_guard<ceph::spinlock> lg(sp); + + connected = false; + discard_out_queues(CState::OP_FLAG_LOCKED); + + return 0; + } + + int on_teardown_event() { + + { + std::lock_guard<ceph::spinlock> lg(sp); + + if (conn) + xio_connection_destroy(conn); + conn = NULL; + } + + this->put(); + return 0; + } + + int xio_qdepth_high_mark() { + return q_high_mark; + } + + int xio_qdepth_low_mark() { + return q_low_mark; + } + +public: + XioConnection(XioMessenger *m, XioConnection::type _type, + const entity_inst_t& peer); + + ~XioConnection() { + if (conn) + xio_connection_destroy(conn); + } + ostream& conn_prefix(std::ostream *_dout); + + bool is_connected() override { return connected; } + + int send_message(Message *m) override; + void send_keepalive() override {send_keepalive_or_ack();} + void send_keepalive_or_ack(bool ack = false, const utime_t *tp = nullptr); + void mark_down() override; + int _mark_down(uint32_t flags); + void mark_disposable() override; + int _mark_disposable(uint32_t flags); + + const entity_inst_t& get_peer() const { return peer; } + + XioConnection* get() { +#if 0 + cout << "XioConnection::get " << this << " " << nref.load() << std::endl; +#endif + RefCountedObject::get(); + return this; + } + + void put() { + RefCountedObject::put(); +#if 0 + cout << "XioConnection::put " << this << " " << nref.load() << std::endl; +#endif + } + + void disconnect() { + if (is_connected()) { + connected = false; + xio_disconnect(conn); // normal teardown will clean up conn + } + } + + uint32_t get_magic() { return magic; } + void set_magic(int _magic) { magic = _magic; } + uint32_t get_special_handling() { return special_handling; } + void set_special_handling(int n) { special_handling = n; } + uint64_t get_scount() { return scount; } + + int passive_setup(); /* XXX */ + + int handle_data_msg(struct xio_session *session, struct xio_msg *msg, + int more_in_batch, void *cb_user_context); + int on_msg(struct xio_session *session, struct xio_msg *msg, + int more_in_batch, void *cb_user_context); + int on_ow_msg_send_complete(struct xio_session *session, struct xio_msg *msg, + void *conn_user_context); + int on_msg_error(struct xio_session *session, enum xio_status error, + struct xio_msg *msg, void *conn_user_context); + void msg_send_fail(XioSend *xsend, int code); + void msg_release_fail(struct xio_msg *msg, int code); +private: + void send_keepalive_or_ack_internal(bool ack = false, const utime_t *tp = nullptr); + int flush_out_queues(uint32_t flags); + int discard_out_queues(uint32_t flags); + int adjust_clru(uint32_t flags); +}; + +typedef boost::intrusive_ptr<XioConnection> XioConnectionRef; + +class XioLoopbackConnection : public Connection +{ +private: + std::atomic<uint64_t> seq = { 0 }; +public: + explicit XioLoopbackConnection(Messenger *m) : Connection(m->cct, m) + { + const entity_inst_t& m_inst = m->get_myinst(); + peer_addr = m_inst.addr; + peer_type = m_inst.name.type(); + set_features(XIO_ALL_FEATURES); /* XXXX set to ours */ + } + + XioLoopbackConnection* get() { + return static_cast<XioLoopbackConnection*>(RefCountedObject::get()); + } + + bool is_connected() override { return true; } + + int send_message(Message *m) override; + void send_keepalive() override; + void mark_down() override {} + void mark_disposable() override {} + + 
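+  /* XXX mark_down()/mark_disposable() are no-ops here: presumably a
+   * loopback connection has no transport state to tear down and must
+   * stay usable for local delivery for the messenger's lifetime. */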
uint64_t get_seq() { + return seq; + } + + uint64_t next_seq() { + return ++seq; + } +}; + +typedef boost::intrusive_ptr<XioLoopbackConnection> XioLoopbackConnectionRef; + +#endif /* XIO_CONNECTION_H */ diff --git a/src/msg/xio/XioInSeq.h b/src/msg/xio/XioInSeq.h new file mode 100644 index 00000000..7863a8f6 --- /dev/null +++ b/src/msg/xio/XioInSeq.h @@ -0,0 +1,84 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Portions Copyright (C) 2013 CohortFS, LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef XIO_IN_SEQ_H +#define XIO_IN_SEQ_H + +#include <boost/intrusive/list.hpp> +#include "msg/SimplePolicyMessenger.h" +extern "C" { +#include "libxio.h" +} + +/* For inbound messages (Accelio-owned) ONLY, use the message's + * user_context as an SLIST */ +class XioInSeq { +private: + int cnt; + int sz; + struct xio_msg* head; + struct xio_msg* tail; + +public: + XioInSeq() : cnt(0), sz(0), head(NULL), tail(NULL) {} + XioInSeq(const XioInSeq& seq) { + cnt = seq.cnt; + sz = seq.sz; + head = seq.head; + tail = seq.tail; + } + + int count() { return cnt; } + + int size() { return sz; } + + bool p() { return !!head; } + + void set_count(int _cnt) { cnt = _cnt; } + + void append(struct xio_msg* msg) { + msg->user_context = NULL; + if (!head) { + head = tail = msg; + } else { + tail->user_context = msg; + tail = msg; + } + ++sz; + --cnt; + } + + struct xio_msg* begin() { return head; } + + struct xio_msg* end() { return NULL; } + + void next(struct xio_msg** msg) { + *msg = static_cast<struct xio_msg *>((*msg)->user_context); + } + + struct xio_msg* dequeue() { + struct xio_msg* msgs = head; + clear(); + return msgs; + } + + void clear() { + head = tail = NULL; + cnt = 0; + sz = 0; + } +}; + +#endif /* XIO_IN_SEQ_H */ diff --git a/src/msg/xio/XioMessenger.cc b/src/msg/xio/XioMessenger.cc new file mode 100644 index 00000000..dec7d0c7 --- /dev/null +++ b/src/msg/xio/XioMessenger.cc @@ -0,0 +1,1136 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Portions Copyright (C) 2013 CohortFS, LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <arpa/inet.h> +#include <boost/lexical_cast.hpp> +#include <set> +#include <stdlib.h> +#include <memory> + +#include "XioMsg.h" +#include "XioMessenger.h" +#include "common/address_helper.h" +#include "common/code_environment.h" +#include "messages/MNop.h" + +#define dout_subsys ceph_subsys_xio +#undef dout_prefix +#define dout_prefix *_dout << "xio." 
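+// XXX illustrative sketch (an assumption for exposition, not part of the
+// upstream file): typical construction of this backend, given a
+// CephContext* cct, a nonce, and a bound entity_addr_t addr:
+//
+//   XioMessenger* msgr = new XioMessenger(cct, entity_name_t::CLIENT(-1),
+//                                         "xio client", nonce,
+//                                         0 /* cflags */,
+//                                         new QueueStrategy(2));
+//   msgr->bind(addr);   // derives an rdma:// (or tcp://) URI internally
+//   msgr->start();
+//
+// package_init() below runs Accelio setup once per process; XioMessenger
+// inherits XioInit, which invokes it from the constructor.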
+
+Mutex mtx("XioMessenger Package Lock");
+std::atomic<bool> initialized = { false };
+
+std::atomic<uint64_t> XioMessenger::nInstances = { 0 };
+
+struct xio_mempool *xio_msgr_noreg_mpool;
+
+static struct xio_session_ops xio_msgr_ops;
+
+/* Accelio API callouts */
+
+namespace xio_log
+{
+typedef pair<const char*, int> level_pair;
+static const level_pair LEVELS[] = {
+  make_pair("fatal", 0),
+  make_pair("error", 0),
+  make_pair("warn", 1),
+  make_pair("info", 1),
+  make_pair("debug", 2),
+  make_pair("trace", 20)
+};
+
+static CephContext *context;
+
+int get_level()
+{
+  int level = 0;
+  for (size_t i = 0; i < sizeof(LEVELS)/sizeof(LEVELS[0]); i++) {
+    if (!ldlog_p1(context, dout_subsys, LEVELS[i].second))
+      break;
+    level++;
+  }
+  return level;
+}
+
+void log_dout(const char *file, unsigned line,
+              const char *function, unsigned level,
+              const char *fmt, ...)
+{
+  char buffer[2048];
+  va_list args;
+  va_start(args, fmt);
+  int n = vsnprintf(buffer, sizeof(buffer), fmt, args);
+  va_end(args);
+
+  if (n > 0) {
+    const char *short_file = strrchr(file, '/');
+    short_file = (short_file == NULL) ? file : short_file + 1;
+
+    const level_pair &lvl = LEVELS[level];
+    ldout(context, lvl.second) << '[' << lvl.first << "] "
+      << short_file << ':' << line << ' '
+      << function << " - " << buffer << dendl;
+  }
+}
+}
+
+static int on_session_event(struct xio_session *session,
+                            struct xio_session_event_data *event_data,
+                            void *cb_user_context)
+{
+  XioMessenger *msgr = static_cast<XioMessenger*>(cb_user_context);
+  CephContext *cct = msgr->cct;
+
+  ldout(cct,4) << "session event: " << xio_session_event_str(event_data->event)
+    << ". reason: " << xio_strerror(event_data->reason) << dendl;
+
+  return msgr->session_event(session, event_data, cb_user_context);
}
+
+static int on_new_session(struct xio_session *session,
+                          struct xio_new_session_req *req,
+                          void *cb_user_context)
+{
+  XioMessenger *msgr = static_cast<XioMessenger*>(cb_user_context);
+  CephContext *cct = msgr->cct;
+
+  ldout(cct,4) << "new session " << session
+    << " user_context " << cb_user_context << dendl;
+
+  return (msgr->new_session(session, req, cb_user_context));
+}
+
+static int on_msg(struct xio_session *session,
+                  struct xio_msg *req,
+                  int more_in_batch,
+                  void *cb_user_context)
+{
+  XioConnection* xcon __attribute__((unused)) =
+    static_cast<XioConnection*>(cb_user_context);
+  CephContext *cct = xcon->get_messenger()->cct;
+
+  ldout(cct,25) << "on_msg session " << session << " xcon " << xcon << dendl;
+
+  if (unlikely(XioPool::trace_mempool)) {
+    static uint32_t nreqs;
+    if (unlikely((++nreqs % 65536) == 0)) {
+      xp_stats.dump(__func__, nreqs);
+    }
+  }
+
+  return xcon->on_msg(session, req, more_in_batch,
+                      cb_user_context);
+}
+
+static int on_ow_msg_send_complete(struct xio_session *session,
+                                   struct xio_msg *msg,
+                                   void *conn_user_context)
+{
+  XioConnection *xcon =
+    static_cast<XioConnection*>(conn_user_context);
+  CephContext *cct = xcon->get_messenger()->cct;
+
+  ldout(cct,25) << "msg delivered session: " << session
+    << " msg: " << msg << " conn_user_context "
+    << conn_user_context << dendl;
+
+  return xcon->on_ow_msg_send_complete(session, msg, conn_user_context);
+}
+
+static int on_msg_error(struct xio_session *session,
+                        enum xio_status error,
+                        enum xio_msg_direction dir,
+                        struct xio_msg *msg,
+                        void *conn_user_context)
+{
+  /* XIO promises to flush back undelivered messages */
+  XioConnection *xcon =
+    static_cast<XioConnection*>(conn_user_context);
+  CephContext *cct = xcon->get_messenger()->cct;
+
+  ldout(cct,4) << "msg error 
session: " << session + << " error: " << xio_strerror(error) << " msg: " << msg + << " conn_user_context " << conn_user_context << dendl; + + return xcon->on_msg_error(session, error, msg, conn_user_context); +} + +static int on_cancel(struct xio_session *session, + struct xio_msg *msg, + enum xio_status result, + void *conn_user_context) +{ + XioConnection* xcon __attribute__((unused)) = + static_cast<XioConnection*>(conn_user_context); + CephContext *cct = xcon->get_messenger()->cct; + + ldout(cct,25) << "on cancel: session: " << session << " msg: " << msg + << " conn_user_context " << conn_user_context << dendl; + + return 0; +} + +static int on_cancel_request(struct xio_session *session, + struct xio_msg *msg, + void *conn_user_context) +{ + XioConnection* xcon __attribute__((unused)) = + static_cast<XioConnection*>(conn_user_context); + CephContext *cct = xcon->get_messenger()->cct; + + ldout(cct,25) << "on cancel request: session: " << session << " msg: " << msg + << " conn_user_context " << conn_user_context << dendl; + + return 0; +} + +/* free functions */ +static string xio_uri_from_entity(const string &type, + const entity_addr_t& addr, bool want_port) +{ + const char *host = NULL; + char addr_buf[129]; + string xio_uri; + + switch(addr.get_family()) { + case AF_INET: + host = inet_ntop(AF_INET, &addr.in4_addr().sin_addr, addr_buf, + INET_ADDRSTRLEN); + break; + case AF_INET6: + host = inet_ntop(AF_INET6, &addr.in6_addr().sin6_addr, addr_buf, + INET6_ADDRSTRLEN); + break; + default: + abort(); + break; + }; + + if (type == "rdma" || type == "tcp") + xio_uri = type + "://"; + else + xio_uri = "rdma://"; + + /* The following can only succeed if the host is rdma-capable */ + xio_uri += host; + if (want_port) { + xio_uri += ":"; + xio_uri += boost::lexical_cast<std::string>(addr.get_port()); + } + + return xio_uri; +} /* xio_uri_from_entity */ + +void XioInit::package_init(CephContext *cct) { + if (! initialized) { + + mtx.Lock(); + if (! 
initialized) {
+
+      xio_init();
+
+      // claim a reference to the first context we see
+      xio_log::context = cct->get();
+
+      int xopt;
+      xopt = xio_log::get_level();
+      xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_LOG_LEVEL,
+                  &xopt, sizeof(xopt));
+      xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_LOG_FN,
+                  (const void*)xio_log::log_dout, sizeof(xio_log_fn));
+
+      xopt = 1;
+      xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_DISABLE_HUGETBL,
+                  &xopt, sizeof(xopt));
+
+      if (g_code_env == CODE_ENVIRONMENT_DAEMON) {
+        xopt = 1;
+        xio_set_opt(NULL, XIO_OPTLEVEL_RDMA, XIO_OPTNAME_ENABLE_FORK_INIT,
+                    &xopt, sizeof(xopt));
+      }
+
+      xopt = XIO_MSGR_IOVLEN;
+      xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_MAX_IN_IOVLEN,
+                  &xopt, sizeof(xopt));
+      xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_MAX_OUT_IOVLEN,
+                  &xopt, sizeof(xopt));
+
+      /* enable flow-control */
+      xopt = 1;
+      xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_ENABLE_FLOW_CONTROL,
+                  &xopt, sizeof(xopt));
+
+      /* and set threshold for buffer callouts */
+      xopt = max(cct->_conf->xio_max_send_inline, 512);
+      xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_MAX_INLINE_XIO_DATA,
+                  &xopt, sizeof(xopt));
+
+      xopt = XioMsgHdr::get_max_encoded_length();
+      ldout(cct,2) << "setting accelio max header size " << xopt << dendl;
+      xio_set_opt(NULL, XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_MAX_INLINE_XIO_HEADER,
+                  &xopt, sizeof(xopt));
+
+      size_t queue_depth = cct->_conf->xio_queue_depth;
+      struct xio_mempool_config mempool_config = {
+        6,
+        {
+          {1024,  0,  queue_depth,  262144},
+          {4096,  0,  queue_depth,  262144},
+          {16384, 0,  queue_depth,  262144},
+          {65536, 0,  128,  65536},
+          {262144, 0,  32,  16384},
+          {1048576, 0, 8,  8192}
+        }
+      };
+      xio_set_opt(NULL,
+                  XIO_OPTLEVEL_ACCELIO, XIO_OPTNAME_CONFIG_MEMPOOL,
+                  &mempool_config, sizeof(mempool_config));
+
+      /* and an unregistered one */
+  #define XMSG_MEMPOOL_QUANTUM 4096
+
+      xio_msgr_noreg_mpool =
+        xio_mempool_create(-1 /* nodeid */,
+                           XIO_MEMPOOL_FLAG_REGULAR_PAGES_ALLOC);
+
+      (void) xio_mempool_add_slab(xio_msgr_noreg_mpool, 64,
+                                  cct->_conf->xio_mp_min,
+                                  cct->_conf->xio_mp_max_64,
+                                  XMSG_MEMPOOL_QUANTUM, 0);
+      (void) xio_mempool_add_slab(xio_msgr_noreg_mpool, 256,
+                                  cct->_conf->xio_mp_min,
+                                  cct->_conf->xio_mp_max_256,
+                                  XMSG_MEMPOOL_QUANTUM, 0);
+      (void) xio_mempool_add_slab(xio_msgr_noreg_mpool, 1024,
+                                  cct->_conf->xio_mp_min,
+                                  cct->_conf->xio_mp_max_1k,
+                                  XMSG_MEMPOOL_QUANTUM, 0);
+      (void) xio_mempool_add_slab(xio_msgr_noreg_mpool, getpagesize(),
+                                  cct->_conf->xio_mp_min,
+                                  cct->_conf->xio_mp_max_page,
+                                  XMSG_MEMPOOL_QUANTUM, 0);
+
+      /* initialize ops singleton */
+      xio_msgr_ops.on_session_event = on_session_event;
+      xio_msgr_ops.on_new_session = on_new_session;
+      xio_msgr_ops.on_session_established = NULL;
+      xio_msgr_ops.on_msg = on_msg;
+      xio_msgr_ops.on_ow_msg_send_complete = on_ow_msg_send_complete;
+      xio_msgr_ops.on_msg_error = on_msg_error;
+      xio_msgr_ops.on_cancel = on_cancel;
+      xio_msgr_ops.on_cancel_request = on_cancel_request;
+
+      /* mark initialized */
+      initialized = true;
+    }
+    mtx.Unlock();
+  }
+}
+
+/* XioMessenger */
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, this)
+static ostream& _prefix(std::ostream *_dout, XioMessenger *msgr) {
+  return *_dout << "-- " << msgr->get_myaddr_legacy() << " ";
+}
+
+XioMessenger::XioMessenger(CephContext *cct, entity_name_t name,
+                           string mname, uint64_t _nonce,
+                           uint64_t cflags, DispatchStrategy *ds)
+  : SimplePolicyMessenger(cct, name, mname, _nonce),
+    XioInit(cct),
+    portals(this, get_nportals(cflags), 
get_nconns_per_portal(cflags)), + dispatch_strategy(ds), + loop_con(new XioLoopbackConnection(this)), + special_handling(0), + sh_mtx("XioMessenger session mutex"), + sh_cond(), + need_addr(true), + did_bind(false), + nonce(_nonce) +{ + + if (cct->_conf->xio_trace_xcon) + magic |= MSG_MAGIC_TRACE_XCON; + + XioPool::trace_mempool = (cct->_conf->xio_trace_mempool); + XioPool::trace_msgcnt = (cct->_conf->xio_trace_msgcnt); + + dispatch_strategy->set_messenger(this); + + /* update class instance count */ + nInstances++; + + loop_con->set_features(CEPH_FEATURES_ALL); + + ldout(cct,2) << "Create msgr: " << this << " instance: " + << nInstances << " type: " << name.type_str() + << " subtype: " << mname << " nportals: " << get_nportals(cflags) + << " nconns_per_portal: " << get_nconns_per_portal(cflags) + << dendl; + +} /* ctor */ + +int XioMessenger::pool_hint(uint32_t dsize) { + if (dsize > 1024*1024) + return 0; + + /* if dsize is already present, returns -EEXIST */ + return xio_mempool_add_slab(xio_msgr_noreg_mpool, dsize, 0, + cct->_conf->xio_mp_max_hint, + XMSG_MEMPOOL_QUANTUM, 0); +} + +int XioMessenger::get_nconns_per_portal(uint64_t cflags) +{ + const int XIO_DEFAULT_NUM_CONNS_PER_PORTAL = 8; + int nconns = XIO_DEFAULT_NUM_CONNS_PER_PORTAL; + + if (cflags & Messenger::HAS_MANY_CONNECTIONS) + nconns = max(cct->_conf->xio_max_conns_per_portal, XIO_DEFAULT_NUM_CONNS_PER_PORTAL); + else if (cflags & Messenger::HEARTBEAT) + nconns = max(cct->_conf->osd_heartbeat_min_peers * 4, XIO_DEFAULT_NUM_CONNS_PER_PORTAL); + + return nconns; +} + +int XioMessenger::get_nportals(uint64_t cflags) +{ + int nportals = 1; + + if (cflags & Messenger::HAS_HEAVY_TRAFFIC) + nportals = max(cct->_conf->xio_portal_threads, 1); + + return nportals; +} + +void XioMessenger::learned_addr(const entity_addr_t &peer_addr_for_me) +{ + // be careful here: multiple threads may block here, and readers of + // my_inst.addr do NOT hold any lock. + + // this always goes from true -> false under the protection of the + // mutex. if it is already false, we need not retake the mutex at + // all. + if (!need_addr) + return; + + sh_mtx.Lock(); + if (need_addr) { + entity_addr_t t = peer_addr_for_me; + t.set_port(my_inst.addr.get_port()); + my_inst.addr.set_sockaddr(t.get_sockaddr()); + ldout(cct,2) << "learned my addr " << my_inst.addr << dendl; + need_addr = false; + // init_local_connection(); + } + sh_mtx.Unlock(); + +} + +int XioMessenger::new_session(struct xio_session *session, + struct xio_new_session_req *req, + void *cb_user_context) +{ + if (shutdown_called) { + return xio_reject( + session, XIO_E_SESSION_REFUSED, NULL /* udata */, 0 /* udata len */); + } + int code = portals.accept(session, req, cb_user_context); + if (! 
code)
+    nsessions++;
+  return code;
+} /* new_session */
+
+int XioMessenger::session_event(struct xio_session *session,
+                                struct xio_session_event_data *event_data,
+                                void *cb_user_context)
+{
+  XioConnection *xcon;
+
+  switch (event_data->event) {
+  case XIO_SESSION_CONNECTION_ESTABLISHED_EVENT:
+  {
+    struct xio_connection *conn = event_data->conn;
+    struct xio_connection_attr xcona;
+    entity_addr_t peer_addr_for_me, paddr;
+
+    xcon = static_cast<XioConnection*>(event_data->conn_user_context);
+
+    ldout(cct,2) << "connection established " << event_data->conn
+      << " session " << session << " xcon " << xcon << dendl;
+
+    (void) xio_query_connection(conn, &xcona,
+                                XIO_CONNECTION_ATTR_LOCAL_ADDR|
+                                XIO_CONNECTION_ATTR_PEER_ADDR);
+    peer_addr_for_me.set_sockaddr((struct sockaddr *)&xcona.local_addr);
+    paddr.set_sockaddr((struct sockaddr *)&xcona.peer_addr);
+    //set_myaddr(peer_addr_for_me);
+    learned_addr(peer_addr_for_me);
+    ldout(cct,2) << "Client: connected from " << peer_addr_for_me << " to " << paddr << dendl;
+
+    /* notify hook */
+    this->ms_deliver_handle_connect(xcon);
+    this->ms_deliver_handle_fast_connect(xcon);
+  }
+  break;
+
+  case XIO_SESSION_NEW_CONNECTION_EVENT:
+  {
+    struct xio_connection *conn = event_data->conn;
+    struct xio_connection_attr xcona;
+    entity_inst_t s_inst;
+    entity_addr_t peer_addr_for_me;
+
+    (void) xio_query_connection(conn, &xcona,
+                                XIO_CONNECTION_ATTR_CTX|
+                                XIO_CONNECTION_ATTR_PEER_ADDR|
+                                XIO_CONNECTION_ATTR_LOCAL_ADDR);
+    /* XXX assumes RDMA */
+    s_inst.addr.set_sockaddr((struct sockaddr *)&xcona.peer_addr);
+    peer_addr_for_me.set_sockaddr((struct sockaddr *)&xcona.local_addr);
+
+    xcon = new XioConnection(this, XioConnection::PASSIVE, s_inst);
+    xcon->session = session;
+
+    struct xio_context_attr xctxa;
+    (void) xio_query_context(xcona.ctx, &xctxa, XIO_CONTEXT_ATTR_USER_CTX);
+
+    xcon->conn = conn;
+    xcon->portal = static_cast<XioPortal*>(xctxa.user_context);
+    ceph_assert(xcon->portal);
+
+    xcona.user_context = xcon;
+    (void) xio_modify_connection(conn, &xcona, XIO_CONNECTION_ATTR_USER_CTX);
+
+    xcon->connected = true;
+
+    /* sentinel ref */
+    xcon->get(); /* xcon->nref == 1 */
+    conns_sp.lock();
+    conns_list.push_back(*xcon);
+    /* XXX we can't put xcon in conns_entity_map because we don't yet know
+     * its peer address */
+    conns_sp.unlock();
+
+    /* XXXX pre-merge of session startup negotiation ONLY! */
+    xcon->cstate.state_up_ready(XioConnection::CState::OP_FLAG_NONE);
+
+    ldout(cct,2) << "New connection session " << session
+      << " xcon " << xcon << " on msgr: " << this << " portal: " << xcon->portal << dendl;
+    ldout(cct,2) << "Server: connected from " << s_inst.addr << " to " << peer_addr_for_me << dendl;
+  }
+  break;
+  case XIO_SESSION_CONNECTION_ERROR_EVENT:
+  case XIO_SESSION_CONNECTION_CLOSED_EVENT: /* orderly discon */
+  case XIO_SESSION_CONNECTION_DISCONNECTED_EVENT: /* unexpected discon */
+  case XIO_SESSION_CONNECTION_REFUSED_EVENT:
+    xcon = static_cast<XioConnection*>(event_data->conn_user_context);
+    ldout(cct,2) << xio_session_event_str(event_data->event)
+      << " xcon " << xcon << " session " << session << dendl;
+    if (likely(!!xcon)) {
+      unregister_xcon(xcon);
+      xcon->on_disconnect_event();
+    }
+    break;
+  case XIO_SESSION_CONNECTION_TEARDOWN_EVENT:
+    xcon = static_cast<XioConnection*>(event_data->conn_user_context);
+    ldout(cct,2) << xio_session_event_str(event_data->event)
+      << " xcon " << xcon << " session " << session << dendl;
+    /*
+     * There are flows where Accelio sends a teardown event without going
+     * through a disconnect event, 
so we make sure we cleaned the connection. + */ + unregister_xcon(xcon); + xcon->on_teardown_event(); + break; + case XIO_SESSION_TEARDOWN_EVENT: + ldout(cct,2) << xio_session_event_str(event_data->event) + << " session " << session << dendl; + if (unlikely(XioPool::trace_mempool)) { + xp_stats.dump("xio session dtor", reinterpret_cast<uint64_t>(session)); + } + xio_session_destroy(session); + if (--nsessions == 0) { + Mutex::Locker lck(sh_mtx); + if (nsessions == 0) + sh_cond.Signal(); + } + break; + default: + break; + }; + + return 0; +} + +enum bl_type +{ + BUFFER_PAYLOAD, + BUFFER_MIDDLE, + BUFFER_DATA +}; + +#define MAX_XIO_BUF_SIZE 1044480 + +static inline int +xio_count_buffers(const buffer::list& bl, int& req_size, int& msg_off, int& req_off) +{ + + const std::list<buffer::ptr>& buffers = bl.buffers(); + list<bufferptr>::const_iterator pb; + size_t size, off; + int result; + int first = 1; + + off = size = 0; + result = 0; + for (;;) { + if (off >= size) { + if (first) pb = buffers.begin(); else ++pb; + if (pb == buffers.end()) { + break; + } + off = 0; + size = pb->length(); + first = 0; + } + size_t count = size - off; + if (!count) continue; + if (req_size + count > MAX_XIO_BUF_SIZE) { + count = MAX_XIO_BUF_SIZE - req_size; + } + + ++result; + + /* advance iov and perhaps request */ + + off += count; + req_size += count; + ++msg_off; + if (unlikely(msg_off >= XIO_MSGR_IOVLEN || req_size >= MAX_XIO_BUF_SIZE)) { + ++req_off; + msg_off = 0; + req_size = 0; + } + } + + return result; +} + +static inline void +xio_place_buffers(const buffer::list& bl, XioMsg *xmsg, struct xio_msg*& req, + struct xio_iovec_ex*& msg_iov, int& req_size, + int ex_cnt, int& msg_off, int& req_off, bl_type type) +{ + + const std::list<buffer::ptr>& buffers = bl.buffers(); + list<bufferptr>::const_iterator pb; + struct xio_iovec_ex* iov; + size_t size, off; + const char *data = NULL; + int first = 1; + + off = size = 0; + for (;;) { + if (off >= size) { + if (first) pb = buffers.begin(); else ++pb; + if (pb == buffers.end()) { + break; + } + off = 0; + size = pb->length(); + data = pb->c_str(); // is c_str() efficient? + first = 0; + } + size_t count = size - off; + if (!count) continue; + if (req_size + count > MAX_XIO_BUF_SIZE) { + count = MAX_XIO_BUF_SIZE - req_size; + } + + /* assign buffer */ + iov = &msg_iov[msg_off]; + iov->iov_base = (void *) (&data[off]); + iov->iov_len = count; + + switch (type) { + case BUFFER_DATA: + //break; + default: + { + struct xio_reg_mem *mp = get_xio_mp(*pb); + iov->mr = (mp) ? mp->mr : NULL; + } + break; + } + + /* advance iov(s) */ + + off += count; + req_size += count; + ++msg_off; + + /* next request if necessary */ + + if (unlikely(msg_off >= XIO_MSGR_IOVLEN || req_size >= MAX_XIO_BUF_SIZE)) { + /* finish this request */ + req->out.pdata_iov.nents = msg_off; + /* advance to next, and write in it if it's not the last one. */ + if (++req_off >= ex_cnt) { + req = 0; /* poison. trap if we try to use it. */ + msg_iov = NULL; + } else { + req = &xmsg->req_arr[req_off].msg; + msg_iov = req->out.pdata_iov.sglist; + } + msg_off = 0; + req_size = 0; + } + } +} + +int XioMessenger::bind(const entity_addr_t& addr) +{ + if (addr.is_blank_ip()) { + lderr(cct) << "ERROR: need rdma ip for remote use! " << dendl; + cout << "Error: xio bind failed. 
public/cluster ip not specified" << std::endl;
+    return -1;
+  }
+
+  entity_addr_t shift_addr = addr;
+  string base_uri = xio_uri_from_entity(cct->_conf->xio_transport_type,
+                                        shift_addr, false /* want_port */);
+  ldout(cct,4) << "XioMessenger " << this << " bind: xio_uri "
+    << base_uri << ':' << shift_addr.get_port() << dendl;
+
+  uint16_t port0;
+  int r = portals.bind(&xio_msgr_ops, base_uri, shift_addr.get_port(), &port0);
+  if (r == 0) {
+    shift_addr.set_port(port0);
+    shift_addr.nonce = nonce;
+    set_myaddr(shift_addr);
+    need_addr = false;
+    did_bind = true;
+  }
+  return r;
+} /* bind */
+
+int XioMessenger::rebind(const set<int>& avoid_ports)
+{
+  ldout(cct,4) << "XioMessenger " << this << " rebind attempt" << dendl;
+  return 0;
+} /* rebind */
+
+int XioMessenger::start()
+{
+  portals.start();
+  dispatch_strategy->start();
+  if (!did_bind) {
+    my_inst.addr.nonce = nonce;
+  }
+  started = true;
+  return 0;
+}
+
+void XioMessenger::wait()
+{
+  portals.join();
+  dispatch_strategy->wait();
+} /* wait */
+
+int XioMessenger::_send_message(Message *m, const entity_inst_t& dest)
+{
+  ConnectionRef conn = get_connection(dest);
+  if (conn)
+    return _send_message(m, &(*conn));
+  else
+    return EINVAL;
+} /* send_message(Message *, const entity_inst_t&) */
+
+static inline XioMsg* pool_alloc_xio_msg(Message *m, XioConnection *xcon,
+                                         int ex_cnt)
+{
+  struct xio_reg_mem mp_mem;
+  int e = xpool_alloc(xio_msgr_noreg_mpool, sizeof(XioMsg), &mp_mem);
+  if (!!e)
+    return NULL;
+  XioMsg *xmsg = reinterpret_cast<XioMsg*>(mp_mem.addr);
+  ceph_assert(!!xmsg);
+  new (xmsg) XioMsg(m, xcon, mp_mem, ex_cnt, CEPH_FEATURES_ALL);
+  return xmsg;
+}
+
+XioCommand* pool_alloc_xio_command(XioConnection *xcon)
+{
+  struct xio_reg_mem mp_mem;
+  int e = xpool_alloc(xio_msgr_noreg_mpool, sizeof(XioCommand), &mp_mem);
+  if (!!e)
+    return NULL;
+  XioCommand *xcmd = reinterpret_cast<XioCommand*>(mp_mem.addr);
+  ceph_assert(!!xcmd);
+  new (xcmd) XioCommand(xcon, mp_mem);
+  return xcmd;
+}
+
+int XioMessenger::_send_message(Message *m, Connection *con)
+{
+  if (con == loop_con.get() /* intrusive_ptr get() */) {
+    m->set_connection(con);
+    m->set_src(get_myinst().name);
+    m->set_seq(loop_con->next_seq());
+    ds_dispatch(m);
+    return 0;
+  }
+
+  XioConnection *xcon = static_cast<XioConnection*>(con);
+
+  /* If con is not in READY state, we have to enforce policy */
+  if (xcon->cstate.session_state.load() != XioConnection::session_states::UP) {
+    std::lock_guard<decltype(xcon->sp)> lg(xcon->sp);
+
+    if (xcon->cstate.session_state.load() !=
+        XioConnection::session_states::UP) {
+      xcon->outgoing.mqueue.push_back(*m);
+      return 0;
+    }
+  }
+
+  return _send_message_impl(m, xcon);
+} /* send_message(Message* m, Connection *con) */
+
+int XioMessenger::_send_message_impl(Message* m, XioConnection* xcon)
+{
+  int code = 0;
+
+  Mutex::Locker l(xcon->lock);
+  if (unlikely(XioPool::trace_mempool)) {
+    static uint32_t nreqs;
+    if (unlikely((++nreqs % 65536) == 0)) {
+      xp_stats.dump(__func__, nreqs);
+    }
+  }
+
+  m->set_seq(xcon->state.next_out_seq());
+  m->set_magic(magic); // trace flags and special handling
+
+  m->encode(xcon->get_features(), this->crcflags);
+
+  buffer::list &payload = m->get_payload();
+  buffer::list &middle = m->get_middle();
+  buffer::list &data = m->get_data();
+
+  int msg_off = 0;
+  int req_off = 0;
+  int req_size = 0;
+  int nbuffers =
+    xio_count_buffers(payload, req_size, msg_off, req_off) +
+    xio_count_buffers(middle, req_size, msg_off, req_off) +
+    xio_count_buffers(data, req_size, msg_off, req_off);
+
+  int ex_cnt = req_off;
+  if (msg_off == 0 && 
ex_cnt > 0) { + // no buffers for last msg + ldout(cct,10) << "msg_off 0, ex_cnt " << ex_cnt << " -> " << ex_cnt-1 << dendl; + ex_cnt--; + } + + /* get an XioMsg frame */ + XioMsg *xmsg = pool_alloc_xio_msg(m, xcon, ex_cnt); + if (! xmsg) { + /* could happen if Accelio has been shutdown */ + return ENOMEM; + } + + ldout(cct,4) << __func__ << " " << m << " new XioMsg " << xmsg + << " tag " << (int)xmsg->hdr.tag + << " req_0 " << xmsg->get_xio_msg() << " msg type " << m->get_type() + << " features: " << xcon->get_features() + << " conn " << xcon->conn << " sess " << xcon->session << dendl; + + if (magic & (MSG_MAGIC_XIO)) { + + /* XXXX verify */ + switch (m->get_type()) { + case 43: + // case 15: + ldout(cct,4) << __func__ << " stop 43 " << m->get_type() << " " << *m << dendl; + buffer::list &payload = m->get_payload(); + ldout(cct,4) << __func__ << " payload dump:" << dendl; + payload.hexdump(cout); + } + } + + struct xio_msg *req = xmsg->get_xio_msg(); + struct xio_iovec_ex *msg_iov = req->out.pdata_iov.sglist; + + if (magic & (MSG_MAGIC_XIO)) { + ldout(cct,4) << "payload: " << payload.buffers().size() << + " middle: " << middle.buffers().size() << + " data: " << data.buffers().size() << + dendl; + } + + if (unlikely(ex_cnt > 0)) { + ldout(cct,4) << __func__ << " buffer cnt > XIO_MSGR_IOVLEN (" << + ((XIO_MSGR_IOVLEN-1) + nbuffers) << ")" << dendl; + } + + /* do the invariant part */ + msg_off = 0; + req_off = -1; /* most often, not used */ + req_size = 0; + + xio_place_buffers(payload, xmsg, req, msg_iov, req_size, ex_cnt, msg_off, + req_off, BUFFER_PAYLOAD); + + xio_place_buffers(middle, xmsg, req, msg_iov, req_size, ex_cnt, msg_off, + req_off, BUFFER_MIDDLE); + + xio_place_buffers(data, xmsg, req, msg_iov, req_size, ex_cnt, msg_off, + req_off, BUFFER_DATA); + ldout(cct,10) << "ex_cnt " << ex_cnt << ", req_off " << req_off + << ", msg_cnt " << xmsg->get_msg_count() << dendl; + + /* finalize request */ + if (msg_off) + req->out.pdata_iov.nents = msg_off; + + /* fixup first msg */ + req = xmsg->get_xio_msg(); + + const std::list<buffer::ptr>& header = xmsg->hdr.get_bl().buffers(); + ceph_assert(header.size() == 1); /* XXX */ + list<bufferptr>::const_iterator pb = header.begin(); + req->out.header.iov_base = (char*) pb->c_str(); + req->out.header.iov_len = pb->length(); + + /* deliver via xio, preserve ordering */ + if (xmsg->get_msg_count() > 1) { + struct xio_msg *head = xmsg->get_xio_msg(); + struct xio_msg *tail = head; + for (req_off = 0; ((unsigned) req_off) < xmsg->get_msg_count()-1; ++req_off) { + req = &xmsg->req_arr[req_off].msg; +assert(!req->in.pdata_iov.nents); +assert(req->out.pdata_iov.nents || !nbuffers); + tail->next = req; + tail = req; + } + tail->next = NULL; + } + xmsg->trace = m->trace; + m->trace.event("xio portal enqueue for send"); + m->trace.keyval("xio message segments", xmsg->hdr.msg_cnt); + xcon->portal->enqueue_for_send(xcon, xmsg); + + return code; +} /* send_message(Message *, Connection *) */ + +int XioMessenger::shutdown() +{ + shutdown_called = true; + conns_sp.lock(); + XioConnection::ConnList::iterator iter; + iter = conns_list.begin(); + for (iter = conns_list.begin(); iter != conns_list.end(); ++iter) { + (void) iter->disconnect(); // XXX mark down? 
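+    /* XXX note: disconnect() only initiates an orderly Accelio teardown;
+     * the session/teardown events it provokes are what decrement
+     * nsessions, which the wait loop below watches before stopping the
+     * portals and the dispatch strategy. */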
+ } + conns_sp.unlock(); + while(nsessions > 0) { + Mutex::Locker lck(sh_mtx); + if (nsessions > 0) + sh_cond.Wait(sh_mtx); + } + portals.shutdown(); + dispatch_strategy->shutdown(); + did_bind = false; + started = false; + return 0; +} /* shutdown */ + +ConnectionRef XioMessenger::get_connection(const entity_inst_t& dest) +{ + if (shutdown_called) + return NULL; + + const entity_inst_t& self_inst = get_myinst(); + if ((&dest == &self_inst) || + (dest == self_inst)) { + return get_loopback_connection(); + } + + conns_sp.lock(); + XioConnection::EntitySet::iterator conn_iter = + conns_entity_map.find(dest, XioConnection::EntityComp()); + if (conn_iter != conns_entity_map.end()) { + ConnectionRef cref = &(*conn_iter); + conns_sp.unlock(); + return cref; + } + else { + conns_sp.unlock(); + string xio_uri = xio_uri_from_entity(cct->_conf->xio_transport_type, + dest.addr, true /* want_port */); + + ldout(cct,4) << "XioMessenger " << this << " get_connection: xio_uri " + << xio_uri << dendl; + + /* XXX client session creation parameters */ + struct xio_session_params params = {}; + params.type = XIO_SESSION_CLIENT; + params.ses_ops = &xio_msgr_ops; + params.user_context = this; + params.uri = xio_uri.c_str(); + + XioConnection *xcon = new XioConnection(this, XioConnection::ACTIVE, + dest); + + xcon->session = xio_session_create(¶ms); + if (! xcon->session) { + delete xcon; + return NULL; + } + + /* this should cause callbacks with user context of conn, but + * we can always set it explicitly */ + struct xio_connection_params xcp = {}; + xcp.session = xcon->session; + xcp.ctx = xcon->portal->ctx; + xcp.conn_user_context = xcon; + + xcon->conn = xio_connect(&xcp); + if (!xcon->conn) { + xio_session_destroy(xcon->session); + delete xcon; + return NULL; + } + + nsessions++; + xcon->connected = true; + + /* sentinel ref */ + xcon->get(); /* xcon->nref == 1 */ + conns_sp.lock(); + conns_list.push_back(*xcon); + conns_entity_map.insert(*xcon); + conns_sp.unlock(); + + /* XXXX pre-merge of session startup negotiation ONLY! 
*/
+    xcon->cstate.state_up_ready(XioConnection::CState::OP_FLAG_NONE);
+
+    ldout(cct,2) << "New connection xcon: " << xcon <<
+      " up_ready on session " << xcon->session <<
+      " on msgr: " << this << " portal: " << xcon->portal << dendl;
+
+    return xcon->get(); /* nref +1 */
+  }
+} /* get_connection */
+
+ConnectionRef XioMessenger::get_loopback_connection()
+{
+  return (loop_con.get());
+} /* get_loopback_connection */
+
+void XioMessenger::unregister_xcon(XioConnection *xcon)
+{
+  std::lock_guard<decltype(conns_sp)> lckr(conns_sp);
+
+  XioConnection::EntitySet::iterator conn_iter =
+    conns_entity_map.find(xcon->peer, XioConnection::EntityComp());
+  if (conn_iter != conns_entity_map.end()) {
+    XioConnection *xcon2 = &(*conn_iter);
+    if (xcon == xcon2) {
+      conns_entity_map.erase(conn_iter);
+    }
+  }
+
+  /* check if citer on conn_list */
+  if (xcon->conns_hook.is_linked()) {
+    /* now find xcon on conns_list and erase */
+    XioConnection::ConnList::iterator citer =
+      XioConnection::ConnList::s_iterator_to(*xcon);
+    conns_list.erase(citer);
+  }
+}
+
+void XioMessenger::mark_down(const entity_addr_t& addr)
+{
+  entity_inst_t inst(entity_name_t(), addr);
+  std::lock_guard<decltype(conns_sp)> lckr(conns_sp);
+  XioConnection::EntitySet::iterator conn_iter =
+    conns_entity_map.find(inst, XioConnection::EntityComp());
+  if (conn_iter != conns_entity_map.end()) {
+    (*conn_iter)._mark_down(XioConnection::CState::OP_FLAG_NONE);
+  }
+} /* mark_down(const entity_addr_t&) */
+
+void XioMessenger::mark_down(Connection* con)
+{
+  XioConnection *xcon = static_cast<XioConnection*>(con);
+  xcon->_mark_down(XioConnection::CState::OP_FLAG_NONE);
+} /* mark_down(Connection*) */
+
+void XioMessenger::mark_down_all()
+{
+  std::lock_guard<decltype(conns_sp)> lckr(conns_sp);
+  XioConnection::EntitySet::iterator conn_iter;
+  for (conn_iter = conns_entity_map.begin(); conn_iter !=
+         conns_entity_map.end(); ++conn_iter) {
+    (*conn_iter)._mark_down(XioConnection::CState::OP_FLAG_NONE);
+  }
+} /* mark_down_all */
+
+static inline XioMarkDownHook* pool_alloc_markdown_hook(
+  XioConnection *xcon, Message *m)
+{
+  struct xio_reg_mem mp_mem;
+  int e = xio_mempool_alloc(xio_msgr_noreg_mpool,
+                            sizeof(XioMarkDownHook), &mp_mem);
+  if (!!e)
+    return NULL;
+  XioMarkDownHook *hook = static_cast<XioMarkDownHook*>(mp_mem.addr);
+  new (hook) XioMarkDownHook(xcon, m, mp_mem);
+  return hook;
+}
+
+void XioMessenger::mark_down_on_empty(Connection* con)
+{
+  XioConnection *xcon = static_cast<XioConnection*>(con);
+  MNop* m = new MNop();
+  m->tag = XIO_NOP_TAG_MARKDOWN;
+  m->set_completion_hook(pool_alloc_markdown_hook(xcon, m));
+  // stall new messages
+  xcon->cstate.session_state = XioConnection::session_states::BARRIER;
+  (void) _send_message_impl(m, xcon);
+}
+
+void XioMessenger::mark_disposable(Connection *con)
+{
+  XioConnection *xcon = static_cast<XioConnection*>(con);
+  xcon->_mark_disposable(XioConnection::CState::OP_FLAG_NONE);
+}
+
+void XioMessenger::try_insert(XioConnection *xcon)
+{
+  std::lock_guard<decltype(conns_sp)> lckr(conns_sp);
+  /* already resident in conns_list */
+  conns_entity_map.insert(*xcon);
+}
+
+XioMessenger::~XioMessenger()
+{
+  delete dispatch_strategy;
+  nInstances--;
+} /* dtor */
diff --git a/src/msg/xio/XioMessenger.h b/src/msg/xio/XioMessenger.h
new file mode 100644
index 00000000..6f8a67ba
--- /dev/null
+++ b/src/msg/xio/XioMessenger.h
@@ -0,0 +1,176 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Portions Copyright (C) 2013 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef XIO_MESSENGER_H
+#define XIO_MESSENGER_H
+
+#include "msg/SimplePolicyMessenger.h"
+
+#include <atomic>
+
+extern "C" {
+#include "libxio.h"
+}
+
+#include "XioConnection.h"
+#include "XioPortal.h"
+#include "QueueStrategy.h"
+#include "common/Thread.h"
+#include "common/Mutex.h"
+#include "include/spinlock.h"
+
+class XioInit {
+  /* safe to be called multiple times */
+  void package_init(CephContext *cct);
+
+protected:
+  explicit XioInit(CephContext *cct) {
+    this->package_init(cct);
+  }
+};
+
+class XioMessenger : public SimplePolicyMessenger, XioInit
+{
+private:
+  static std::atomic<uint64_t> nInstances; // defined in XioMessenger.cc
+  std::atomic<uint64_t> nsessions = { 0 };
+  std::atomic<bool> shutdown_called = { false };
+  ceph::spinlock conns_sp;
+  XioConnection::ConnList conns_list;
+  XioConnection::EntitySet conns_entity_map;
+  XioPortals portals;
+  DispatchStrategy* dispatch_strategy;
+  XioLoopbackConnectionRef loop_con;
+  uint32_t special_handling;
+  Mutex sh_mtx;
+  Cond sh_cond;
+  bool need_addr;
+  bool did_bind;
+
+  /// approximately unique ID set by the constructor for use in entity_addr_t
+  uint64_t nonce;
+
+  friend class XioConnection;
+
+public:
+  XioMessenger(CephContext *cct, entity_name_t name,
+               string mname, uint64_t nonce,
+               uint64_t cflags = 0,
+               DispatchStrategy* ds = new QueueStrategy(1));
+
+  virtual ~XioMessenger();
+
+  XioPortal* get_portal() { return portals.get_next_portal(); }
+
+  virtual void set_myaddr(const entity_addr_t& a) {
+    Messenger::set_myaddr(a);
+    loop_con->set_peer_addr(a);
+  }
+
+  int _send_message(Message *m, const entity_inst_t &dest);
+  int _send_message(Message *m, Connection *con);
+  int _send_message_impl(Message *m, XioConnection *xcon);
+
+  uint32_t get_special_handling() { return special_handling; }
+  void set_special_handling(int n) { special_handling = n; }
+  int pool_hint(uint32_t size);
+  void try_insert(XioConnection *xcon);
+
+  /* xio hooks */
+  int new_session(struct xio_session *session,
+                  struct xio_new_session_req *req,
+                  void *cb_user_context);
+
+  int session_event(struct xio_session *session,
+                    struct xio_session_event_data *event_data,
+                    void *cb_user_context);
+
+  /* Messenger interface */
+  virtual bool set_addr_unknowns(const entity_addrvec_t &addr) override
+  { return false; } /* XXX applicable? */
+  virtual void set_addr(const entity_addr_t &addr) override
+  { } /* XXX applicable? */
+
+  virtual int get_dispatch_queue_len()
+  { return 0; } /* XXX bogus? */
+
+  virtual double get_dispatch_queue_max_age(utime_t now)
+  { return 0; } /* XXX bogus? 
*/ + + virtual void set_cluster_protocol(int p) + { } + + virtual int bind(const entity_addr_t& addr); + + virtual int rebind(const set<int>& avoid_ports); + + virtual int start(); + + virtual void wait(); + + virtual int shutdown(); + + virtual int send_message(Message *m, const entity_inst_t &dest) { + return _send_message(m, dest); + } + + virtual int lazy_send_message(Message *m, const entity_inst_t& dest) + { return EINVAL; } + + virtual int lazy_send_message(Message *m, Connection *con) + { return EINVAL; } + + virtual ConnectionRef get_connection(const entity_inst_t& dest); + + // compat hack + ConnectionRef connect_to( + int type, const entity_addrvec_t& dest) override { + return get_connection(entity_inst_t(entity_name_t(type, -1), + dest.legacy_addr())); + } + + virtual ConnectionRef get_loopback_connection(); + + void unregister_xcon(XioConnection *xcon); + virtual void mark_down(const entity_addr_t& a); + virtual void mark_down(Connection *con); + virtual void mark_down_all(); + virtual void mark_down_on_empty(Connection *con); + virtual void mark_disposable(Connection *con); + + void ds_dispatch(Message *m) + { dispatch_strategy->ds_dispatch(m); } + + /** + * Tell the XioMessenger its full IP address. + * + * This is used by clients when connecting to other endpoints, and + * probably shouldn't be called by anybody else. + */ + void learned_addr(const entity_addr_t& peer_addr_for_me); + +private: + int get_nconns_per_portal(uint64_t cflags); + int get_nportals(uint64_t cflags); + +protected: + virtual void ready() + { } +}; + +XioCommand* pool_alloc_xio_command(XioConnection *xcon); + + +#endif /* XIO_MESSENGER_H */ diff --git a/src/msg/xio/XioMsg.cc b/src/msg/xio/XioMsg.cc new file mode 100644 index 00000000..4b6a5d68 --- /dev/null +++ b/src/msg/xio/XioMsg.cc @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Portions Copyright (C) 2013 CohortFS, LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "XioMessenger.h" +#include "XioConnection.h" +#include "XioMsg.h" + + +int XioDispatchHook::release_msgs() +{ + XioCompletion *xcmp; + int r = msg_seq.size(); + cl_flag = true; + + /* queue for release */ + xcmp = static_cast<XioCompletion *>(rsp_pool.alloc(sizeof(XioCompletion))); + new (xcmp) XioCompletion(xcon, this); + xcmp->trace = m->trace; + + /* merge with portal traffic */ + xcon->portal->enqueue(xcon, xcmp); + + ceph_assert(r); + return r; +} + +/*static*/ size_t XioMsgHdr::get_max_encoded_length() { + ceph_msg_header _ceph_msg_header; + ceph_msg_footer _ceph_msg_footer; + XioMsgHdr hdr (_ceph_msg_header, _ceph_msg_footer, 0 /* features */); + const std::list<buffer::ptr>& hdr_buffers = hdr.get_bl().buffers(); + ceph_assert(hdr_buffers.size() == 1); /* accelio header is small without scatter gather */ + return hdr_buffers.begin()->length(); +} + +void XioMsg::print_debug(CephContext *cct, const char *tag) const { + print_xio_msg_hdr(cct, tag, hdr, get_xio_msg()); + print_ceph_msg(cct, tag, m); +} diff --git a/src/msg/xio/XioMsg.h b/src/msg/xio/XioMsg.h new file mode 100644 index 00000000..2f0c8490 --- /dev/null +++ b/src/msg/xio/XioMsg.h @@ -0,0 +1,446 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> + * Portions Copyright (C) 2013 CohortFS, LLC + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef XIO_MSG_H +#define XIO_MSG_H + +#include <boost/intrusive/list.hpp> +#include "msg/SimplePolicyMessenger.h" +extern "C" { +#include "libxio.h" +} +#include "XioConnection.h" +#include "XioSubmit.h" +#include "msg/msg_types.h" +#include "XioPool.h" + +namespace bi = boost::intrusive; + +class XioMessenger; + +class XioMsgCnt +{ +public: + ceph_le32 msg_cnt; + buffer::list bl; +public: + explicit XioMsgCnt(buffer::ptr p) + { + bl.append(p); + buffer::list::iterator bl_iter = bl.begin(); + decode(msg_cnt, bl_iter); + } +}; + +class XioMsgHdr +{ +public: + char tag; + ceph_le32 msg_cnt; + ceph_le32 peer_type; + entity_addr_t addr; /* XXX hack! 
*/ + ceph_msg_header* hdr; + ceph_msg_footer* ftr; + uint64_t features; + buffer::list bl; +public: + XioMsgHdr(ceph_msg_header& _hdr, ceph_msg_footer& _ftr, uint64_t _features) + : tag(CEPH_MSGR_TAG_MSG), msg_cnt(init_le32(0)), hdr(&_hdr), ftr(&_ftr), + features(_features) + { } + + XioMsgHdr(ceph_msg_header& _hdr, ceph_msg_footer &_ftr, buffer::ptr p) + : hdr(&_hdr), ftr(&_ftr) + { + bl.append(p); + buffer::list::iterator bl_iter = bl.begin(); + decode(bl_iter); + } + + static size_t get_max_encoded_length(); + + const buffer::list& get_bl() { encode(bl); return bl; }; + + inline void encode_hdr(ceph::buffer::list& bl) const { + using ceph::encode; + encode(tag, bl); + encode(msg_cnt, bl); + encode(peer_type, bl); + encode(addr, bl, features); + encode(hdr->seq, bl); + encode(hdr->tid, bl); + encode(hdr->type, bl); + encode(hdr->priority, bl); + encode(hdr->version, bl); + encode(hdr->front_len, bl); + encode(hdr->middle_len, bl); + encode(hdr->data_len, bl); + encode(hdr->data_off, bl); + encode(hdr->src.type, bl); + encode(hdr->src.num, bl); + encode(hdr->compat_version, bl); + encode(hdr->crc, bl); + } + + inline void encode_ftr(buffer::list& bl) const { + using ceph::encode; + encode(ftr->front_crc, bl); + encode(ftr->middle_crc, bl); + encode(ftr->data_crc, bl); + encode(ftr->sig, bl); + encode(ftr->flags, bl); + } + + inline void encode(buffer::list& bl) const { + encode_hdr(bl); + encode_ftr(bl); + } + + inline void decode_hdr(buffer::list::iterator& bl) { + using ceph::decode; + decode(tag, bl); + decode(msg_cnt, bl); + decode(peer_type, bl); + decode(addr, bl); + decode(hdr->seq, bl); + decode(hdr->tid, bl); + decode(hdr->type, bl); + decode(hdr->priority, bl); + decode(hdr->version, bl); + decode(hdr->front_len, bl); + decode(hdr->middle_len, bl); + decode(hdr->data_len, bl); + decode(hdr->data_off, bl); + decode(hdr->src.type, bl); + decode(hdr->src.num, bl); + decode(hdr->compat_version, bl); + decode(hdr->crc, bl); + } + + inline void decode_ftr(buffer::list::iterator& bl) { + using ceph::decode; + decode(ftr->front_crc, bl); + decode(ftr->middle_crc, bl); + decode(ftr->data_crc, bl); + decode(ftr->sig, bl); + decode(ftr->flags, bl); + } + + inline void decode(buffer::list::iterator& bl) { + decode_hdr(bl); + decode_ftr(bl); + } + + virtual ~XioMsgHdr() + {} +}; + +WRITE_CLASS_ENCODER(XioMsgHdr); + +extern struct xio_mempool *xio_msgr_noreg_mpool; + +#define XIO_MSGR_IOVLEN 16 + +struct xio_msg_ex +{ + struct xio_msg msg; + struct xio_iovec_ex iovs[XIO_MSGR_IOVLEN]; + + explicit xio_msg_ex(void* user_context) { + // go in structure order + msg.in.header.iov_len = 0; + msg.in.header.iov_base = NULL; /* XXX Accelio requires this currently */ + + msg.in.sgl_type = XIO_SGL_TYPE_IOV_PTR; + msg.in.pdata_iov.max_nents = XIO_MSGR_IOVLEN; + msg.in.pdata_iov.nents = 0; + msg.in.pdata_iov.sglist = iovs; + + // minimal zero "out" side + msg.out.header.iov_len = 0; + msg.out.header.iov_base = NULL; /* XXX Accelio requires this currently, + * against spec */ + // out (some members adjusted later) + msg.out.sgl_type = XIO_SGL_TYPE_IOV_PTR; + msg.out.pdata_iov.max_nents = XIO_MSGR_IOVLEN; + msg.out.pdata_iov.nents = 0; + msg.out.pdata_iov.sglist = iovs; + + // minimal initialize an "out" msg + msg.request = NULL; + msg.type = XIO_MSG_TYPE_ONE_WAY; + // for now, we DO NEED receipts for every msg + msg.flags = 0; + msg.user_context = user_context; + msg.next = NULL; + // minimal zero "in" side + } +}; + +class XioSend : public XioSubmit +{ +public: + virtual void print_debug(CephContext *cct, 
const char *tag) const {}; + const struct xio_msg * get_xio_msg() const {return &req_0.msg;} + struct xio_msg * get_xio_msg() {return &req_0.msg;} + virtual size_t get_msg_count() const {return 1;} + + XioSend(XioConnection *_xcon, struct xio_reg_mem& _mp, int _ex_cnt=0) : + XioSubmit(XioSubmit::OUTGOING_MSG, _xcon), + req_0(this), mp_this(_mp), nrefs(_ex_cnt+1) + { + xpool_inc_msgcnt(); + xcon->get(); + } + + XioSend* get() { nrefs++; return this; }; + + void put(int n) { + int refs = nrefs -= n; + if (refs == 0) { + struct xio_reg_mem *mp = &this->mp_this; + this->~XioSend(); + xpool_free(sizeof(XioSend), mp); + } + } + + void put() { + put(1); + } + + void put_msg_refs() { + put(get_msg_count()); + } + + virtual ~XioSend() { + xpool_dec_msgcnt(); + xcon->put(); + } + +private: + xio_msg_ex req_0; + struct xio_reg_mem mp_this; + std::atomic<unsigned> nrefs = { 0 }; +}; + +class XioCommand : public XioSend +{ +public: + XioCommand(XioConnection *_xcon, struct xio_reg_mem& _mp):XioSend(_xcon, _mp) { + } + + buffer::list& get_bl_ref() { return bl; }; + +private: + buffer::list bl; +}; + +struct XioMsg : public XioSend +{ +public: + Message* m; + XioMsgHdr hdr; + xio_msg_ex* req_arr; + +public: + XioMsg(Message *_m, XioConnection *_xcon, struct xio_reg_mem& _mp, + int _ex_cnt, uint64_t _features) : + XioSend(_xcon, _mp, _ex_cnt), + m(_m), hdr(m->get_header(), m->get_footer(), _features), + req_arr(NULL) + { + const entity_inst_t &inst = xcon->get_messenger()->get_myinst(); + hdr.peer_type = inst.name.type(); + hdr.addr = xcon->get_messenger()->get_myaddr_legacy(); + hdr.hdr->src.type = inst.name.type(); + hdr.hdr->src.num = inst.name.num(); + hdr.msg_cnt = _ex_cnt+1; + + if (unlikely(_ex_cnt > 0)) { + alloc_trailers(_ex_cnt); + } + } + + void print_debug(CephContext *cct, const char *tag) const override; + size_t get_msg_count() const override { + return hdr.msg_cnt; + } + + void alloc_trailers(int cnt) { + req_arr = static_cast<xio_msg_ex*>(malloc(cnt * sizeof(xio_msg_ex))); + for (int ix = 0; ix < cnt; ++ix) { + xio_msg_ex* xreq = &(req_arr[ix]); + new (xreq) xio_msg_ex(this); + } + } + + Message *get_message() { return m; } + + ~XioMsg() + { + if (unlikely(!!req_arr)) { + for (unsigned int ix = 0; ix < get_msg_count()-1; ++ix) { + xio_msg_ex* xreq = &(req_arr[ix]); + xreq->~xio_msg_ex(); + } + free(req_arr); + } + + /* testing only! 
server's ready, resubmit request (not reached on + * PASSIVE/server side) */ + if (unlikely(m->get_magic() & MSG_MAGIC_REDUPE)) { + if (likely(xcon->is_connected())) { + xcon->send_message(m); + } else { + /* dispose it */ + m->put(); + } + } else { + /* the normal case: done with message */ + m->put(); + } + } +}; + +class XioDispatchHook : public Message::CompletionHook +{ +private: + XioConnection *xcon; + XioInSeq msg_seq; + XioPool rsp_pool; + std::atomic<unsigned> nrefs { 1 }; + bool cl_flag; + friend class XioConnection; + friend class XioMessenger; +public: + struct xio_reg_mem mp_this; + + XioDispatchHook(XioConnection *_xcon, Message *_m, XioInSeq& _msg_seq, + struct xio_reg_mem& _mp) : + CompletionHook(_m), + xcon(_xcon->get()), + msg_seq(_msg_seq), + rsp_pool(xio_msgr_noreg_mpool), + cl_flag(false), + mp_this(_mp) + { + ++xcon->n_reqs; // atomicity by portal thread + xpool_inc_hookcnt(); + } + + virtual void finish(int r) { + this->put(); + } + + virtual void complete(int r) { + finish(r); + } + + int release_msgs(); + + XioDispatchHook* get() { + nrefs++; return this; + } + + void put(int n = 1) { + int refs = nrefs -= n; + if (refs == 0) { + /* in Marcus' new system, refs reaches 0 twice: once in + * Message lifecycle, and again after xio_release_msg. + */ + if (!cl_flag && release_msgs()) + return; + struct xio_reg_mem *mp = &this->mp_this; + this->~XioDispatchHook(); + xpool_free(sizeof(XioDispatchHook), mp); + } + } + + XioInSeq& get_seq() { return msg_seq; } + + XioPool& get_pool() { return rsp_pool; } + + void on_err_finalize(XioConnection *xcon) { + /* can't decode message; even with one-way must free + * xio_msg structures, and then xiopool + */ + this->finish(-1); + } + + ~XioDispatchHook() { + --xcon->n_reqs; // atomicity by portal thread + xpool_dec_hookcnt(); + xcon->put(); + } +}; + +/* A sender-side CompletionHook that relies on the on_msg_delivered + * to complete a pending mark down. */ +class XioMarkDownHook : public Message::CompletionHook +{ +private: + XioConnection* xcon; + +public: + struct xio_reg_mem mp_this; + + XioMarkDownHook( + XioConnection* _xcon, Message *_m, struct xio_reg_mem& _mp) : + CompletionHook(_m), xcon(_xcon->get()), mp_this(_mp) + { } + + virtual void claim(int r) {} + + virtual void finish(int r) { + xcon->put(); + struct xio_reg_mem *mp = &this->mp_this; + this->~XioMarkDownHook(); + xio_mempool_free(mp); + } + + virtual void complete(int r) { + xcon->_mark_down(XioConnection::CState::OP_FLAG_NONE); + finish(r); + } +}; + +struct XioCompletion : public XioSubmit +{ + XioDispatchHook *xhook; +public: + XioCompletion(XioConnection *_xcon, XioDispatchHook *_xhook) + : XioSubmit(XioSubmit::INCOMING_MSG_RELEASE, _xcon /* not xcon! 
+      xhook(_xhook->get()) {
+      // submit queue ref
+      xcon->get();
+    };
+
+  struct xio_msg* dequeue() {
+    return xhook->get_seq().dequeue();
+  }
+
+  XioDispatchHook* get_xhook() { return xhook; }
+
+  void finalize() {
+    xcon->put();
+    xhook->put();
+  }
+};
+
+void print_xio_msg_hdr(CephContext *cct, const char *tag,
+                       const XioMsgHdr &hdr, const struct xio_msg *msg);
+void print_ceph_msg(CephContext *cct, const char *tag, Message *m);
+
+#endif /* XIO_MSG_H */
diff --git a/src/msg/xio/XioPool.cc b/src/msg/xio/XioPool.cc
new file mode 100644
index 00000000..5f0d77a2
--- /dev/null
+++ b/src/msg/xio/XioPool.cc
@@ -0,0 +1,41 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <iostream>
+#include "XioPool.h"
+
+XioPoolStats xp_stats;
+
+bool XioPool::trace_mempool = false;
+bool XioPool::trace_msgcnt = false;
+
+void XioPoolStats::dump(const char* tag, uint64_t serial)
+{
+  std::cout
+    << tag << " #" << serial << ": "
+    << "pool objs: "
+    << "64: " << ctr_set[SLAB_64].load() << " "
+    << "256: " << ctr_set[SLAB_256].load() << " "
+    << "1024: " << ctr_set[SLAB_1024].load() << " "
+    << "page: " << ctr_set[SLAB_PAGE].load() << " "
+    << "max: " << ctr_set[SLAB_MAX].load() << " "
+    << "overflow: " << ctr_set[SLAB_OVERFLOW].load() << " "
+    << std::endl;
+  std::cout
+    << tag << " #" << serial << ": "
+    << " msg objs: "
+    << "in: " << hook_cnt.load() << " "
+    << "out: " << msg_cnt.load() << " "
+    << std::endl;
+}
diff --git a/src/msg/xio/XioPool.h b/src/msg/xio/XioPool.h
new file mode 100644
index 00000000..07fa7311
--- /dev/null
+++ b/src/msg/xio/XioPool.h
@@ -0,0 +1,218 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef XIO_POOL_H
+#define XIO_POOL_H
+
+#include <atomic>
+#include <vector>
+#include <cstdlib>
+#include <cstring>
+#include <cstdint>
+
+extern "C" {
+#include "libxio.h"
+}
+
+#include "common/likely.h"
+
+static inline int xpool_alloc(struct xio_mempool *pool, uint64_t size,
+                              struct xio_reg_mem* mp);
+static inline void xpool_free(uint64_t size, struct xio_reg_mem* mp);
+
+class XioPool
+{
+private:
+  struct xio_mempool *handle;
+
+public:
+  static bool trace_mempool;
+  static bool trace_msgcnt;
+  static const int MB = 8; // size of the inline payload pad, in bytes
+
+  struct xio_piece {
+    struct xio_reg_mem mp[1];
+    struct xio_piece *next;
+    int s;
+    char payload[MB];
+  } *first;
+
+  explicit XioPool(struct xio_mempool *_handle) :
+    handle(_handle), first(0)
+  {
+  }
+  ~XioPool()
+  {
+    struct xio_piece *p;
+    while ((p = first)) {
+      first = p->next;
+      if (unlikely(trace_mempool)) {
+        memset(p->payload, 0xcf, p->s); // guard bytes
+      }
+      xpool_free(sizeof(struct xio_piece)+(p->s)-MB, p->mp);
+    }
+  }
+  void *alloc(size_t _s)
+  {
+    void *r;
+    struct xio_reg_mem mp[1];
+    struct xio_piece *x;
+    int e = xpool_alloc(handle, (sizeof(struct xio_piece)-MB) + _s, mp);
+    if (e) {
+      r = 0;
+    } else {
+      x = reinterpret_cast<struct xio_piece *>(mp->addr);
+      *x->mp = *mp;
+      x->next = first;
+      x->s = _s;
+      first = x;
+      r = x->payload;
+    }
+    return r;
+  }
+};
+
+class XioPoolStats {
+private:
+  enum pool_sizes {
+    SLAB_64 = 0,
+    SLAB_256,
+    SLAB_1024,
+    SLAB_PAGE,
+    SLAB_MAX,
+    SLAB_OVERFLOW,
+    NUM_SLABS,
+  };
+
+  std::atomic<unsigned> ctr_set[NUM_SLABS] = {};
+  std::atomic<unsigned> msg_cnt = { 0 };  // send msgs
+  std::atomic<unsigned> hook_cnt = { 0 }; // recv msgs
+
+public:
+  void dump(const char* tag, uint64_t serial);
+
+  void inc(uint64_t size) {
+    if (size <= 64) {
+      (ctr_set[SLAB_64])++;
+      return;
+    }
+    if (size <= 256) {
+      (ctr_set[SLAB_256])++;
+      return;
+    }
+    if (size <= 1024) {
+      (ctr_set[SLAB_1024])++;
+      return;
+    }
+    if (size <= 8192) {
+      (ctr_set[SLAB_PAGE])++;
+      return;
+    }
+    (ctr_set[SLAB_MAX])++;
+  }
+
+  void dec(uint64_t size) {
+    if (size <= 64) {
+      (ctr_set[SLAB_64])--;
+      return;
+    }
+    if (size <= 256) {
+      (ctr_set[SLAB_256])--;
+      return;
+    }
+    if (size <= 1024) {
+      (ctr_set[SLAB_1024])--;
+      return;
+    }
+    if (size <= 8192) {
+      (ctr_set[SLAB_PAGE])--;
+      return;
+    }
+    (ctr_set[SLAB_MAX])--;
+  }
+
+  void inc_overflow() { ctr_set[SLAB_OVERFLOW]++; }
+  void dec_overflow() { ctr_set[SLAB_OVERFLOW]--; }
+
+  void inc_msgcnt() {
+    if (unlikely(XioPool::trace_msgcnt)) {
+      msg_cnt++;
+    }
+  }
+
+  void dec_msgcnt() {
+    if (unlikely(XioPool::trace_msgcnt)) {
+      msg_cnt--;
+    }
+  }
+
+  void inc_hookcnt() {
+    if (unlikely(XioPool::trace_msgcnt)) {
+      hook_cnt++;
+    }
+  }
+
+  void dec_hookcnt() {
+    if (unlikely(XioPool::trace_msgcnt)) {
+      hook_cnt--;
+    }
+  }
+};
+
+extern XioPoolStats xp_stats;
+
+static inline int xpool_alloc(struct xio_mempool *pool, uint64_t size,
+                              struct xio_reg_mem* mp)
+{
+  // try to allocate from the xio pool
+  int r = xio_mempool_alloc(pool, size, mp);
+  if (r == 0) {
+    if (unlikely(XioPool::trace_mempool))
+      xp_stats.inc(size); // slab accounting via the declared inc()/dec() API
+    return 0;
+  }
+  // fall back to malloc on errors
+  mp->addr = malloc(size);
+  ceph_assert(mp->addr);
+  mp->length = 0;
+  if (unlikely(XioPool::trace_mempool))
+    xp_stats.inc_overflow();
+  return 0;
+}
+
+static inline void xpool_free(uint64_t size, struct xio_reg_mem* mp)
+{
+  if (mp->length) {
+    if (unlikely(XioPool::trace_mempool))
+      xp_stats.dec(size);
+    xio_mempool_free(mp);
+  } else { // from malloc
+    if (unlikely(XioPool::trace_mempool))
+      xp_stats.dec_overflow();
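+    // a zero mp->length is the marker (set in xpool_alloc's fallback path
+    // above) that this buffer came from malloc rather than the xio mempool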
+    free(mp->addr);
+  }
+}
+
+#define xpool_inc_msgcnt() \
+  do { xp_stats.inc_msgcnt(); } while (0)
+
+#define xpool_dec_msgcnt() \
+  do { xp_stats.dec_msgcnt(); } while (0)
+
+#define xpool_inc_hookcnt() \
+  do { xp_stats.inc_hookcnt(); } while (0)
+
+#define xpool_dec_hookcnt() \
+  do { xp_stats.dec_hookcnt(); } while (0)
+
+#endif /* XIO_POOL_H */
diff --git a/src/msg/xio/XioPortal.cc b/src/msg/xio/XioPortal.cc
new file mode 100644
index 00000000..e2379fb3
--- /dev/null
+++ b/src/msg/xio/XioPortal.cc
@@ -0,0 +1,98 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "XioPortal.h"
+#include <stdio.h>
+
+#define dout_subsys ceph_subsys_xio
+
+int XioPortal::bind(struct xio_session_ops *ops, const string &base_uri,
+                    uint16_t port, uint16_t *assigned_port)
+{
+  // format uri
+  char buf[40];
+  xio_uri = base_uri;
+  xio_uri += ":";
+  snprintf(buf, sizeof(buf), "%d", port);
+  xio_uri += buf;
+
+  uint16_t assigned;
+  server = xio_bind(ctx, ops, xio_uri.c_str(), &assigned, 0, msgr);
+  if (server == NULL)
+    return xio_errno();
+
+  // update uri if port changed
+  if (port != assigned) {
+    xio_uri = base_uri;
+    xio_uri += ":";
+    snprintf(buf, sizeof(buf), "%d", assigned);
+    xio_uri += buf;
+  }
+
+  portal_id = const_cast<char*>(xio_uri.c_str());
+  if (assigned_port)
+    *assigned_port = assigned;
+  ldout(msgr->cct,20) << "xio_bind: portal " << xio_uri
+    << " returned server " << server << dendl;
+  return 0;
+}
+
+int XioPortals::bind(struct xio_session_ops *ops, const string& base_uri,
+                     uint16_t port, uint16_t *port0)
+{
+  /* a server needs at least 1 portal */
+  if (n < 1)
+    return -EINVAL; // negative errno, matching the error returns below
+  Messenger *msgr = portals[0]->msgr;
+  portals.resize(n);
+
+  uint16_t port_min = msgr->cct->_conf->ms_bind_port_min;
+  const uint16_t port_max = msgr->cct->_conf->ms_bind_port_max;
+
+  /* bind the portals */
+  for (size_t i = 0; i < portals.size(); i++) {
+    uint16_t result_port;
+    if (port != 0) {
+      // bind directly to the given port
+      int r = portals[i]->bind(ops, base_uri, port, &result_port);
+      if (r != 0)
+        return -r;
+    } else {
+      int r = EADDRINUSE;
+      // try ports within the configured range
+      for (; port_min <= port_max; port_min++) {
+        r = portals[i]->bind(ops, base_uri, port_min, &result_port);
+        if (r == 0) {
+          port_min++;
+          break;
+        }
+      }
+      if (r != 0) {
+        lderr(msgr->cct) << "portal.bind unable to bind to " << base_uri
+            << " on any port in range " << msgr->cct->_conf->ms_bind_port_min
+            << "-" << port_max << ": " << xio_strerror(r) << dendl;
+        return -r;
+      }
+    }
+
+    ldout(msgr->cct,5) << "xp::bind: portal " << i << " bind OK: "
+      << portals[i]->xio_uri << dendl;
+
+    if (i == 0 && port0 != NULL)
+      *port0 = result_port;
+    port = 0; // use port 0 for all subsequent portals
+  }
+
+  return 0;
+}
diff --git a/src/msg/xio/XioPortal.h b/src/msg/xio/XioPortal.h
new file mode 100644
index 00000000..7a0afee4
--- /dev/null
+++ b/src/msg/xio/XioPortal.h
@@ -0,0 +1,458 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Portions Copyright (C) 2013 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef XIO_PORTAL_H
+#define XIO_PORTAL_H
+
+#include <string>
+
+extern "C" {
+#include "libxio.h"
+}
+#include "XioInSeq.h"
+#include <boost/lexical_cast.hpp>
+#include "msg/SimplePolicyMessenger.h"
+#include "XioConnection.h"
+#include "XioMsg.h"
+
+#include "include/spinlock.h"
+
+#include "include/ceph_assert.h"
+#include "common/dout.h"
+
+#ifndef CACHE_LINE_SIZE
+#define CACHE_LINE_SIZE 64 /* XXX arch-specific define */
+#endif
+#define CACHE_PAD(_n) char __pad ## _n [CACHE_LINE_SIZE]
+
+class XioPortal : public Thread
+{
+private:
+
+  struct SubmitQueue
+  {
+    const static int nlanes = 7;
+
+    struct Lane
+    {
+      uint32_t size;
+      XioSubmit::Queue q;
+      ceph::spinlock sp;
+      CACHE_PAD(0);
+    };
+
+    Lane qlane[nlanes];
+
+    int ix; /* atomicity by portal thread */
+
+    SubmitQueue() : ix(0)
+    {
+      int ix;
+      Lane* lane;
+
+      for (ix = 0; ix < nlanes; ++ix) {
+        lane = &qlane[ix];
+        lane->size = 0;
+      }
+    }
+
+    inline Lane* get_lane(XioConnection *xcon)
+    {
+      return &qlane[(((uint64_t) xcon) / 16) % nlanes];
+    }
+
+    void enq(XioConnection *xcon, XioSubmit* xs)
+    {
+      Lane* lane = get_lane(xcon);
+      std::lock_guard<decltype(lane->sp)> lg(lane->sp);
+      lane->q.push_back(*xs);
+      ++(lane->size);
+    }
+
+    void enq(XioConnection *xcon, XioSubmit::Queue& requeue_q)
+    {
+      int size = requeue_q.size();
+      Lane* lane = get_lane(xcon);
+      std::lock_guard<decltype(lane->sp)> lg(lane->sp);
+      XioSubmit::Queue::const_iterator i1 = lane->q.end();
+      lane->q.splice(i1, requeue_q);
+      lane->size += size;
+    }
+
+    void deq(XioSubmit::Queue& send_q)
+    {
+      Lane* lane;
+      int cnt;
+
+      for (cnt = 0; cnt < nlanes; ++cnt, ++ix, ix = ix % nlanes) {
+        // resolve the lane for this slot before taking its lock
+        lane = &qlane[ix];
+        std::lock_guard<decltype(lane->sp)> lg(lane->sp);
+        if (lane->size > 0) {
+          XioSubmit::Queue::const_iterator i1 = send_q.end();
+          send_q.splice(i1, lane->q);
+          lane->size = 0;
+          ++ix, ix = ix % nlanes;
+          break;
+        }
+      }
+    }
+
+  }; /* SubmitQueue */
+
+  Messenger *msgr;
+  struct xio_context *ctx;
+  struct xio_server *server;
+  SubmitQueue submit_q;
+  ceph::spinlock sp;
+  void *ev_loop;
+  string xio_uri;
+  char *portal_id;
+  bool _shutdown;
+  bool drained;
+  uint32_t magic;
+  uint32_t special_handling;
+
+  friend class XioPortals;
+  friend class XioMessenger;
+
+public:
+  explicit XioPortal(Messenger *_msgr, int max_conns) :
+    msgr(_msgr), ctx(NULL), server(NULL), submit_q(), xio_uri(""),
+    portal_id(NULL), _shutdown(false), drained(false),
+    magic(0),
+    special_handling(0)
+  {
+    struct xio_context_params ctx_params;
+    memset(&ctx_params, 0, sizeof(ctx_params));
+    ctx_params.user_context = this;
+    /*
+     * hint to Accelio the total number of connections that will share
+     * this context's resources: internal primary task pool...
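+     * and related per-connection resources.  It is a sizing hint, not
+     * (to our knowledge of the Accelio API) a hard cap on the number of
+     * connections per context.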
+     */
+    ctx_params.max_conns_per_ctx = max_conns;
+
+    /* a portal is an xio_context and event loop */
+    ctx = xio_context_create(&ctx_params, 0 /* poll timeout */, -1 /* cpu hint */);
+    ceph_assert(ctx && "Whoops, failed to create portal/ctx");
+  }
+
+  int bind(struct xio_session_ops *ops, const string &base_uri,
+           uint16_t port, uint16_t *assigned_port);
+
+  inline void release_xio_msg(XioCompletion* xcmp) {
+    struct xio_msg *msg = xcmp->dequeue();
+    struct xio_msg *next_msg = NULL;
+    int code;
+    if (unlikely(!xcmp->xcon->conn)) {
+      // NOTE: msg is not safe to dereference if the connection was torn down
+      xcmp->xcon->msg_release_fail(msg, ENOTCONN);
+    }
+    else while (msg) {
+      next_msg = static_cast<struct xio_msg *>(msg->user_context);
+      code = xio_release_msg(msg);
+      if (unlikely(code)) /* very unlikely, so log it */
+        xcmp->xcon->msg_release_fail(msg, code);
+      msg = next_msg;
+    }
+    xcmp->trace.event("xio_release_msg");
+    xcmp->finalize(); /* unconditional finalize */
+  }
+
+  void enqueue(XioConnection *xcon, XioSubmit *xs)
+  {
+    if (!_shutdown) {
+      submit_q.enq(xcon, xs);
+      xio_context_stop_loop(ctx);
+      return;
+    }
+
+    /* dispose of xs */
+    switch(xs->type) {
+    case XioSubmit::OUTGOING_MSG: /* it was an outgoing 1-way */
+    {
+      XioSend* xsend = static_cast<XioSend*>(xs);
+      xs->xcon->msg_send_fail(xsend, -EINVAL);
+    }
+      break;
+    default:
+      /* INCOMING_MSG_RELEASE */
+      release_xio_msg(static_cast<XioCompletion*>(xs));
+      break;
+    };
+  }
+
+  void requeue(XioConnection* xcon, XioSubmit::Queue& send_q) {
+    submit_q.enq(xcon, send_q);
+  }
+
+  void requeue_all_xcon(XioConnection* xcon,
+                        XioSubmit::Queue::iterator& q_iter,
+                        XioSubmit::Queue& send_q) {
+    // XXX gather all already-dequeued outgoing messages for xcon
+    // and push them in FIFO order to the front of the input queue,
+    // then mark the connection as flow-controlled
+    XioSubmit::Queue requeue_q;
+
+    while (q_iter != send_q.end()) {
+      XioSubmit *xs = &(*q_iter);
+      // skip anything belonging to other connections
+      if (xs->xcon != xcon) {
+        q_iter++;
+        continue;
+      }
+      q_iter = send_q.erase(q_iter);
+      requeue_q.push_back(*xs);
+    }
+    std::lock_guard<decltype(xcon->sp)> lg(xcon->sp);
+    XioSubmit::Queue::const_iterator i1 = xcon->outgoing.requeue.begin();
+    xcon->outgoing.requeue.splice(i1, requeue_q);
+    xcon->cstate.state_flow_controlled(XioConnection::CState::OP_FLAG_LOCKED);
+  }
+
+  void *entry()
+  {
+    int size, code = 0;
+    uint32_t xio_qdepth_high;
+    XioSubmit::Queue send_q;
+    XioSubmit::Queue::iterator q_iter;
+    struct xio_msg *msg = NULL;
+    XioConnection *xcon;
+    XioSubmit *xs;
+    XioSend *xsend;
+
+    do {
+      submit_q.deq(send_q);
+
+      /* shutdown() barrier */
+      std::lock_guard<decltype(sp)> lg(sp);
+
+    restart:
+      size = send_q.size();
+
+      if (_shutdown) {
+        // XXX XioSend queues for flow-controlled connections may require
+        // cleanup
+        drained = true;
+      }
+
+      if (size > 0) {
+        q_iter = send_q.begin();
+        while (q_iter != send_q.end()) {
+          xs = &(*q_iter);
+          xcon = xs->xcon;
+
+          switch (xs->type) {
+          case XioSubmit::OUTGOING_MSG: /* it was an outgoing 1-way */
+            xsend = static_cast<XioSend*>(xs);
+            if (unlikely(!xcon->conn || !xcon->is_connected()))
+              code = ENOTCONN;
+            else {
+              /* XXX guard the Accelio send queue (it should be safe to
+               * rely on Accelio's own check below, but this ensures that
+               * all chained xio_msg are accounted for) */
+              xio_qdepth_high = xcon->xio_qdepth_high_mark();
+              if (unlikely((xcon->send_ctr + xsend->get_msg_count()) >
+                           xio_qdepth_high)) {
+                requeue_all_xcon(xcon, q_iter, send_q);
+                goto restart;
+              }
+
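+              // under the qdepth high-water mark: hand the (possibly
+              // chained) xio_msg to Accelio; send_ctr is credited below
+              // only when xio_send_msg accepts (or internally queues) it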
+              xs->trace.event("xio_send_msg");
+              msg = xsend->get_xio_msg();
+              code = xio_send_msg(xcon->conn, msg);
+              /* header trace moved here to capture the xio serial# */
+              if (ldlog_p1(msgr->cct, ceph_subsys_xio, 11)) {
+                xsend->print_debug(msgr->cct, "xio_send_msg");
+              }
+              /* get the right Accelio errno code */
+              if (unlikely(code)) {
+                if ((code == -1) && (xio_errno() == -1)) {
+                  /* In case XIO does not have any credits to send,
+                   * it would still queue up the message(s) for transmission,
+                   * but would return -1 and errno would also be set to -1.
+                   * This needs to be treated as a success.
+                   */
+                  code = 0;
+                }
+                else {
+                  code = xio_errno();
+                }
+              }
+            } /* !ENOTCONN */
+            if (unlikely(code)) {
+              switch (code) {
+              case XIO_E_TX_QUEUE_OVERFLOW:
+              {
+                requeue_all_xcon(xcon, q_iter, send_q);
+                goto restart;
+              }
+                break;
+              default:
+                q_iter = send_q.erase(q_iter);
+                xcon->msg_send_fail(xsend, code);
+                continue;
+              };
+            } else {
+              xcon->send.set(msg->timestamp); // need atomic?
+              xcon->send_ctr += xsend->get_msg_count(); // only inc if cb promised
+            }
+            break;
+          default:
+            /* INCOMING_MSG_RELEASE */
+            q_iter = send_q.erase(q_iter);
+            release_xio_msg(static_cast<XioCompletion*>(xs));
+            continue;
+          } /* switch (xs->type) */
+          q_iter = send_q.erase(q_iter);
+        } /* while */
+      } /* size > 0 */
+
+      xio_context_run_loop(ctx, 300);
+
+    } while ((!_shutdown) || (!drained));
+
+    /* shutting down */
+    if (server) {
+      xio_unbind(server);
+    }
+    xio_context_destroy(ctx);
+    return NULL;
+  }
+
+  void shutdown()
+  {
+    std::lock_guard<decltype(sp)> lg(sp);
+    _shutdown = true;
+  }
+};
+
+class XioPortals
+{
+private:
+  vector<XioPortal*> portals;
+  char **p_vec;
+  int n;
+  int last_unused;
+
+public:
+  XioPortals(Messenger *msgr, int _n, int nconns) : p_vec(NULL), last_unused(0)
+  {
+    n = std::max(_n, 1);
+
+    portals.resize(n);
+    for (int i = 0; i < n; i++) {
+      if (!portals[i]) {
+        portals[i] = new XioPortal(msgr, nconns);
+        ceph_assert(portals[i] != nullptr);
+      }
+    }
+  }
+
+  vector<XioPortal*>& get() { return portals; }
+
+  const char **get_vec()
+  {
+    return (const char **) p_vec;
+  }
+
+  int get_portals_len()
+  {
+    return n;
+  }
+
+  int get_last_unused()
+  {
+    int pix = last_unused;
+    if (++last_unused >= get_portals_len())
+      last_unused = 0;
+    return pix;
+  }
+
+  XioPortal* get_next_portal()
+  {
+    int pix = get_last_unused();
+    return portals[pix];
+  }
+
+  int bind(struct xio_session_ops *ops, const string& base_uri,
+           uint16_t port, uint16_t *port0);
+
+  int accept(struct xio_session *session,
+             struct xio_new_session_req *req,
+             void *cb_user_context)
+  {
+    const char **portals_vec = get_vec();
+    int pix = get_last_unused();
+
+    if (pix == 0) {
+      return xio_accept(session, NULL, 0, NULL, 0);
+    } else {
+      return xio_accept(session,
+                        (const char **)&(portals_vec[pix]),
+                        1, NULL, 0);
+    }
+  }
+
+  void start()
+  {
+    XioPortal *portal;
+    int p_ix, nportals = portals.size();
+
+    p_vec = new char*[nportals];
+    for (p_ix = 0; p_ix < nportals; ++p_ix) {
+      portal = portals[p_ix];
+      p_vec[p_ix] = (char*) /* portal->xio_uri.c_str() */
+        portal->portal_id;
+    }
+
+    for (p_ix = 0; p_ix < nportals; ++p_ix) {
+      string thread_name = "ms_xio_";
+      thread_name.append(std::to_string(p_ix));
+      portal = portals[p_ix];
+      portal->create(thread_name.c_str());
+    }
+  }
+
+  void shutdown()
+  {
+    int nportals = portals.size();
+    for (int p_ix = 0; p_ix < nportals; ++p_ix) {
+      XioPortal *portal = portals[p_ix];
+      portal->shutdown();
+    }
+  }
+
+  void join()
+  {
+    int nportals = portals.size();
+    for (int p_ix = 0; p_ix < nportals; ++p_ix) {
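+      // blocks until the portal thread's entry() loop has returned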
+      XioPortal *portal = portals[p_ix];
+      portal->join();
+    }
+  }
+
+  ~XioPortals()
+  {
+    int nportals = portals.size();
+    for (int ix = 0; ix < nportals; ++ix)
+      delete(portals[ix]);
+    portals.clear();
+    if (p_vec)
+      delete[] p_vec;
+  }
+};
+
+#endif /* XIO_PORTAL_H */
diff --git a/src/msg/xio/XioSubmit.h b/src/msg/xio/XioSubmit.h
new file mode 100644
index 00000000..9840ad4a
--- /dev/null
+++ b/src/msg/xio/XioSubmit.h
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ * Portions Copyright (C) 2013 CohortFS, LLC
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef XIO_SUBMIT_H
+#define XIO_SUBMIT_H
+
+#include <boost/intrusive/list.hpp>
+#include "msg/SimplePolicyMessenger.h"
+extern "C" {
+#include "libxio.h"
+}
+#include "XioConnection.h"
+#include "msg/msg_types.h"
+#include "XioPool.h"
+
+namespace bi = boost::intrusive;
+
+class XioConnection;
+
+struct XioSubmit
+{
+public:
+  enum submit_type
+  {
+    OUTGOING_MSG,
+    INCOMING_MSG_RELEASE
+  };
+  enum submit_type type;
+  bi::list_member_hook<> submit_list;
+  XioConnection *xcon;
+  ZTracer::Trace trace;
+
+  XioSubmit(enum submit_type _type, XioConnection *_xcon) :
+    type(_type), xcon(_xcon)
+  {}
+
+  typedef bi::list< XioSubmit,
+                    bi::member_hook< XioSubmit,
+                                     bi::list_member_hook<>,
+                                     &XioSubmit::submit_list >
+                    > Queue;
+  virtual ~XioSubmit(){
+  }
+};
+
+#endif /* XIO_SUBMIT_H */
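The XioSubmit::Queue typedef above is the load-bearing pattern of this backend: a boost::intrusive list threaded through the embedded submit_list member hook, so queueing a submission links the object in place and never allocates a list node (the XioSubmit objects themselves live in xio mempool slabs). For readers unfamiliar with the idiom, a minimal self-contained sketch follows; the Item and ItemQueue names are illustrative only, not part of this tree:

#include <boost/intrusive/list.hpp>
#include <iostream>

namespace bi = boost::intrusive;

struct Item {
  int id;
  bi::list_member_hook<> submit_list;  // embedded link; no per-node allocation
  explicit Item(int i) : id(i) {}
};

// the same shape as XioSubmit::Queue: a list threaded through the member hook
typedef bi::list<Item,
                 bi::member_hook<Item,
                                 bi::list_member_hook<>,
                                 &Item::submit_list> > ItemQueue;

int main() {
  Item a(1), b(2);   // the caller owns the elements; the queue only links them
  ItemQueue q;
  q.push_back(a);
  q.push_back(b);
  for (Item& it : q)
    std::cout << it.id << std::endl;
  q.clear();         // unlink everything before the Items are destroyed
  return 0;
}

The default hooks are safe-mode, so destroying a still-linked element trips an assertion in debug builds, which is consistent with how XioPortal::entry() drains, requeues, or disposes of every pending XioSubmit before the portal shuts down.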