diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 18:24:20 +0000 |
commit | 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch) | |
tree | e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/msg/async | |
parent | Initial commit. (diff) | |
download | ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip |
Adding upstream version 14.2.21.upstream/14.2.21upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
74 files changed, 26668 insertions, 0 deletions
diff --git a/src/msg/async/AsyncConnection.cc b/src/msg/async/AsyncConnection.cc new file mode 100644 index 00000000..b78d84a3 --- /dev/null +++ b/src/msg/async/AsyncConnection.cc @@ -0,0 +1,771 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <unistd.h> + +#include "include/Context.h" +#include "include/random.h" +#include "common/errno.h" +#include "AsyncMessenger.h" +#include "AsyncConnection.h" + +#include "ProtocolV1.h" +#include "ProtocolV2.h" + +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "common/EventTrace.h" + +// Constant to limit starting sequence number to 2^31. Nothing special about it, just a big number. PLR +#define SEQ_MASK 0x7fffffff + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix _conn_prefix(_dout) +ostream& AsyncConnection::_conn_prefix(std::ostream *_dout) { + return *_dout << "-- " << async_msgr->get_myaddrs() << " >> " + << *peer_addrs << " conn(" << this + << (msgr2 ? " msgr2=" : " legacy=") + << protocol.get() + << " " << ceph_con_mode_name(protocol->auth_meta->con_mode) + << " :" << port + << " s=" << get_state_name(state) + << " l=" << policy.lossy + << ")."; +} + +// Notes: +// 1. Don't dispatch any event when closed! It may cause AsyncConnection alive even if AsyncMessenger dead + +const uint32_t AsyncConnection::TCP_PREFETCH_MIN_SIZE = 512; + +class C_time_wakeup : public EventCallback { + AsyncConnectionRef conn; + + public: + explicit C_time_wakeup(AsyncConnectionRef c): conn(c) {} + void do_request(uint64_t fd_or_id) override { + conn->wakeup_from(fd_or_id); + } +}; + +class C_handle_read : public EventCallback { + AsyncConnectionRef conn; + + public: + explicit C_handle_read(AsyncConnectionRef c): conn(c) {} + void do_request(uint64_t fd_or_id) override { + conn->process(); + } +}; + +class C_handle_write : public EventCallback { + AsyncConnectionRef conn; + + public: + explicit C_handle_write(AsyncConnectionRef c): conn(c) {} + void do_request(uint64_t fd) override { + conn->handle_write(); + } +}; + +class C_handle_write_callback : public EventCallback { + AsyncConnectionRef conn; + +public: + explicit C_handle_write_callback(AsyncConnectionRef c) : conn(c) {} + void do_request(uint64_t fd) override { conn->handle_write_callback(); } +}; + +class C_clean_handler : public EventCallback { + AsyncConnectionRef conn; + public: + explicit C_clean_handler(AsyncConnectionRef c): conn(c) {} + void do_request(uint64_t id) override { + conn->cleanup(); + delete this; + } +}; + +class C_tick_wakeup : public EventCallback { + AsyncConnectionRef conn; + + public: + explicit C_tick_wakeup(AsyncConnectionRef c): conn(c) {} + void do_request(uint64_t fd_or_id) override { + conn->tick(fd_or_id); + } +}; + + +AsyncConnection::AsyncConnection(CephContext *cct, AsyncMessenger *m, DispatchQueue *q, + Worker *w, bool m2, bool local) + : Connection(cct, m), delay_state(NULL), async_msgr(m), conn_id(q->get_id()), + logger(w->get_perf_counter()), + state(STATE_NONE), port(-1), + dispatch_queue(q), recv_buf(NULL), + recv_max_prefetch(std::max<int64_t>(msgr->cct->_conf->ms_tcp_prefetch_max_size, TCP_PREFETCH_MIN_SIZE)), + recv_start(0), recv_end(0), + last_active(ceph::coarse_mono_clock::now()), + connect_timeout_us(cct->_conf->ms_connection_ready_timeout*1000*1000), + inactive_timeout_us(cct->_conf->ms_connection_idle_timeout*1000*1000), + msgr2(m2), state_offset(0), + worker(w), center(&w->center),read_buffer(nullptr) +{ +#ifdef UNIT_TESTS_BUILT + this->interceptor = m->interceptor; +#endif + read_handler = new C_handle_read(this); + write_handler = new C_handle_write(this); + write_callback_handler = new C_handle_write_callback(this); + wakeup_handler = new C_time_wakeup(this); + tick_handler = new C_tick_wakeup(this); + // double recv_max_prefetch see "read_until" + recv_buf = new char[2*recv_max_prefetch]; + if (local) { + protocol = std::unique_ptr<Protocol>(new LoopbackProtocolV1(this)); + } else if (m2) { + protocol = std::unique_ptr<Protocol>(new ProtocolV2(this)); + } else { + protocol = std::unique_ptr<Protocol>(new ProtocolV1(this)); + } + logger->inc(l_msgr_created_connections); +} + +AsyncConnection::~AsyncConnection() +{ + if (recv_buf) + delete[] recv_buf; + ceph_assert(!delay_state); +} + +int AsyncConnection::get_con_mode() const { + return protocol->get_con_mode(); +} + +void AsyncConnection::maybe_start_delay_thread() +{ + if (!delay_state) { + async_msgr->cct->_conf.with_val<std::string>( + "ms_inject_delay_type", + [this](const string& s) { + if (s.find(ceph_entity_type_name(peer_type)) != string::npos) { + ldout(msgr->cct, 1) << __func__ << " setting up a delay queue" + << dendl; + delay_state = new DelayedDelivery(async_msgr, center, dispatch_queue, + conn_id); + } + }); + } +} + + +ssize_t AsyncConnection::read(unsigned len, char *buffer, + std::function<void(char *, ssize_t)> callback) { + ldout(async_msgr->cct, 20) << __func__ + << (pendingReadLen ? " continue" : " start") + << " len=" << len << dendl; + ssize_t r = read_until(len, buffer); + if (r > 0) { + readCallback = callback; + pendingReadLen = len; + read_buffer = buffer; + } + return r; +} + +// Because this func will be called multi times to populate +// the needed buffer, so the passed in bufferptr must be the same. +// Normally, only "read_message" will pass existing bufferptr in +// +// And it will uses readahead method to reduce small read overhead, +// "recv_buf" is used to store read buffer +// +// return the remaining bytes, 0 means this buffer is finished +// else return < 0 means error +ssize_t AsyncConnection::read_until(unsigned len, char *p) +{ + ldout(async_msgr->cct, 25) << __func__ << " len is " << len << " state_offset is " + << state_offset << dendl; + + if (async_msgr->cct->_conf->ms_inject_socket_failures && cs) { + if (rand() % async_msgr->cct->_conf->ms_inject_socket_failures == 0) { + ldout(async_msgr->cct, 0) << __func__ << " injecting socket failure" << dendl; + cs.shutdown(); + } + } + + ssize_t r = 0; + uint64_t left = len - state_offset; + if (recv_end > recv_start) { + uint64_t to_read = std::min<uint64_t>(recv_end - recv_start, left); + memcpy(p, recv_buf+recv_start, to_read); + recv_start += to_read; + left -= to_read; + ldout(async_msgr->cct, 25) << __func__ << " got " << to_read << " in buffer " + << " left is " << left << " buffer still has " + << recv_end - recv_start << dendl; + if (left == 0) { + return 0; + } + state_offset += to_read; + } + + recv_end = recv_start = 0; + /* nothing left in the prefetch buffer */ + if (left > (uint64_t)recv_max_prefetch) { + /* this was a large read, we don't prefetch for these */ + do { + r = read_bulk(p+state_offset, left); + ldout(async_msgr->cct, 25) << __func__ << " read_bulk left is " << left << " got " << r << dendl; + if (r < 0) { + ldout(async_msgr->cct, 1) << __func__ << " read failed" << dendl; + return -1; + } else if (r == static_cast<int>(left)) { + state_offset = 0; + return 0; + } + state_offset += r; + left -= r; + } while (r > 0); + } else { + do { + r = read_bulk(recv_buf+recv_end, recv_max_prefetch); + ldout(async_msgr->cct, 25) << __func__ << " read_bulk recv_end is " << recv_end + << " left is " << left << " got " << r << dendl; + if (r < 0) { + ldout(async_msgr->cct, 1) << __func__ << " read failed" << dendl; + return -1; + } + recv_end += r; + if (r >= static_cast<int>(left)) { + recv_start = len - state_offset; + memcpy(p+state_offset, recv_buf, recv_start); + state_offset = 0; + return 0; + } + left -= r; + } while (r > 0); + memcpy(p+state_offset, recv_buf, recv_end-recv_start); + state_offset += (recv_end - recv_start); + recv_end = recv_start = 0; + } + ldout(async_msgr->cct, 25) << __func__ << " need len " << len << " remaining " + << len - state_offset << " bytes" << dendl; + return len - state_offset; +} + +/* return -1 means `fd` occurs error or closed, it should be closed + * return 0 means EAGAIN or EINTR */ +ssize_t AsyncConnection::read_bulk(char *buf, unsigned len) +{ + ssize_t nread; + again: + nread = cs.read(buf, len); + if (nread < 0) { + if (nread == -EAGAIN) { + nread = 0; + } else if (nread == -EINTR) { + goto again; + } else { + ldout(async_msgr->cct, 1) << __func__ << " reading from fd=" << cs.fd() + << " : "<< strerror(nread) << dendl; + return -1; + } + } else if (nread == 0) { + ldout(async_msgr->cct, 1) << __func__ << " peer close file descriptor " + << cs.fd() << dendl; + return -1; + } + return nread; +} + +ssize_t AsyncConnection::write(bufferlist &bl, + std::function<void(ssize_t)> callback, + bool more) { + + std::unique_lock<std::mutex> l(write_lock); + outgoing_bl.claim_append(bl); + ssize_t r = _try_send(more); + if (r > 0) { + writeCallback = callback; + } + return r; +} + +// return the remaining bytes, it may larger than the length of ptr +// else return < 0 means error +ssize_t AsyncConnection::_try_send(bool more) +{ + if (async_msgr->cct->_conf->ms_inject_socket_failures && cs) { + if (rand() % async_msgr->cct->_conf->ms_inject_socket_failures == 0) { + ldout(async_msgr->cct, 0) << __func__ << " injecting socket failure" << dendl; + cs.shutdown(); + } + } + + ceph_assert(center->in_thread()); + ldout(async_msgr->cct, 25) << __func__ << " cs.send " << outgoing_bl.length() + << " bytes" << dendl; + ssize_t r = cs.send(outgoing_bl, more); + if (r < 0) { + ldout(async_msgr->cct, 1) << __func__ << " send error: " << cpp_strerror(r) << dendl; + return r; + } + + ldout(async_msgr->cct, 10) << __func__ << " sent bytes " << r + << " remaining bytes " << outgoing_bl.length() << dendl; + + if (!open_write && is_queued()) { + center->create_file_event(cs.fd(), EVENT_WRITABLE, write_handler); + open_write = true; + } + + if (open_write && !is_queued()) { + center->delete_file_event(cs.fd(), EVENT_WRITABLE); + open_write = false; + if (writeCallback) { + center->dispatch_event_external(write_callback_handler); + } + } + + return outgoing_bl.length(); +} + +void AsyncConnection::inject_delay() { + if (async_msgr->cct->_conf->ms_inject_internal_delays) { + ldout(async_msgr->cct, 10) << __func__ << " sleep for " << + async_msgr->cct->_conf->ms_inject_internal_delays << dendl; + utime_t t; + t.set_from_double(async_msgr->cct->_conf->ms_inject_internal_delays); + t.sleep(); + } +} + +void AsyncConnection::process() { + std::lock_guard<std::mutex> l(lock); + last_active = ceph::coarse_mono_clock::now(); + recv_start_time = ceph::mono_clock::now(); + + ldout(async_msgr->cct, 20) << __func__ << dendl; + + switch (state) { + case STATE_NONE: { + ldout(async_msgr->cct, 20) << __func__ << " enter none state" << dendl; + return; + } + case STATE_CLOSED: { + ldout(async_msgr->cct, 20) << __func__ << " socket closed" << dendl; + return; + } + case STATE_CONNECTING: { + ceph_assert(!policy.server); + + // clear timer (if any) since we are connecting/re-connecting + if (last_tick_id) { + center->delete_time_event(last_tick_id); + last_tick_id = 0; + } + + if (cs) { + center->delete_file_event(cs.fd(), EVENT_READABLE | EVENT_WRITABLE); + cs.close(); + } + + SocketOptions opts; + opts.priority = async_msgr->get_socket_priority(); + opts.connect_bind_addr = msgr->get_myaddrs().front(); + ssize_t r = worker->connect(target_addr, opts, &cs); + if (r < 0) { + protocol->fault(); + return; + } + + center->create_file_event(cs.fd(), EVENT_READABLE, read_handler); + state = STATE_CONNECTING_RE; + } + case STATE_CONNECTING_RE: { + ssize_t r = cs.is_connected(); + if (r < 0) { + ldout(async_msgr->cct, 1) << __func__ << " reconnect failed to " + << target_addr << dendl; + if (r == -ECONNREFUSED) { + ldout(async_msgr->cct, 2) + << __func__ << " connection refused!" << dendl; + dispatch_queue->queue_refused(this); + } + protocol->fault(); + return; + } else if (r == 0) { + ldout(async_msgr->cct, 10) + << __func__ << " nonblock connect inprogress" << dendl; + if (async_msgr->get_stack()->nonblock_connect_need_writable_event()) { + center->create_file_event(cs.fd(), EVENT_WRITABLE, + read_handler); + } + logger->tinc(l_msgr_running_recv_time, + ceph::mono_clock::now() - recv_start_time); + return; + } + + center->delete_file_event(cs.fd(), EVENT_WRITABLE); + ldout(async_msgr->cct, 10) + << __func__ << " connect successfully, ready to send banner" << dendl; + state = STATE_CONNECTION_ESTABLISHED; + ceph_assert(last_tick_id == 0); + // exclude TCP nonblock connect time + last_connect_started = ceph::coarse_mono_clock::now(); + last_tick_id = center->create_time_event( + connect_timeout_us, tick_handler); + break; + } + + case STATE_ACCEPTING: { + center->create_file_event(cs.fd(), EVENT_READABLE, read_handler); + state = STATE_CONNECTION_ESTABLISHED; + + break; + } + + case STATE_CONNECTION_ESTABLISHED: { + if (pendingReadLen) { + ssize_t r = read(*pendingReadLen, read_buffer, readCallback); + if (r <= 0) { // read all bytes, or an error occured + pendingReadLen.reset(); + char *buf_tmp = read_buffer; + read_buffer = nullptr; + readCallback(buf_tmp, r); + } + return; + } + break; + } + } + + protocol->read_event(); + + logger->tinc(l_msgr_running_recv_time, + ceph::mono_clock::now() - recv_start_time); +} + +bool AsyncConnection::is_connected() { + return protocol->is_connected(); +} + +void AsyncConnection::connect(const entity_addrvec_t &addrs, int type, + entity_addr_t &target) { + + std::lock_guard<std::mutex> l(lock); + set_peer_type(type); + set_peer_addrs(addrs); + policy = msgr->get_policy(type); + target_addr = target; + _connect(); +} + +void AsyncConnection::_connect() +{ + ldout(async_msgr->cct, 10) << __func__ << dendl; + + state = STATE_CONNECTING; + protocol->connect(); + // rescheduler connection in order to avoid lock dep + // may called by external thread(send_message) + center->dispatch_event_external(read_handler); +} + +void AsyncConnection::accept(ConnectedSocket socket, + const entity_addr_t &listen_addr, + const entity_addr_t &peer_addr) +{ + ldout(async_msgr->cct, 10) << __func__ << " sd=" << socket.fd() + << " listen_addr " << listen_addr + << " peer_addr " << peer_addr << dendl; + ceph_assert(socket.fd() >= 0); + + std::lock_guard<std::mutex> l(lock); + cs = std::move(socket); + socket_addr = listen_addr; + target_addr = peer_addr; // until we know better + state = STATE_ACCEPTING; + protocol->accept(); + // rescheduler connection in order to avoid lock dep + center->dispatch_event_external(read_handler); +} + +int AsyncConnection::send_message(Message *m) +{ + FUNCTRACE(async_msgr->cct); + lgeneric_subdout(async_msgr->cct, ms, + 1) << "-- " << async_msgr->get_myaddrs() << " --> " + << get_peer_addrs() << " -- " + << *m << " -- " << m << " con " + << this + << dendl; + + // optimistic think it's ok to encode(actually may broken now) + if (!m->get_priority()) + m->set_priority(async_msgr->get_default_send_priority()); + + m->get_header().src = async_msgr->get_myname(); + m->set_connection(this); + + if (m->get_type() == CEPH_MSG_OSD_OP) + OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OP_BEGIN", true); + else if (m->get_type() == CEPH_MSG_OSD_OPREPLY) + OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OPREPLY_BEGIN", true); + + if (async_msgr->get_myaddrs() == get_peer_addrs()) { //loopback connection + ldout(async_msgr->cct, 20) << __func__ << " " << *m << " local" << dendl; + std::lock_guard<std::mutex> l(write_lock); + if (protocol->is_connected()) { + dispatch_queue->local_delivery(m, m->get_priority()); + } else { + ldout(async_msgr->cct, 10) << __func__ << " loopback connection closed." + << " Drop message " << m << dendl; + m->put(); + } + return 0; + } + + // we don't want to consider local message here, it's too lightweight which + // may disturb users + logger->inc(l_msgr_send_messages); + + protocol->send_message(m); + return 0; +} + +entity_addr_t AsyncConnection::_infer_target_addr(const entity_addrvec_t& av) +{ + // pick the first addr of the same address family as socket_addr. it could be + // an any: or v2: addr, we don't care. it should not be a v1 addr. + for (auto& i : av.v) { + if (i.is_legacy()) { + continue; + } + if (i.get_family() == socket_addr.get_family()) { + ldout(async_msgr->cct,10) << __func__ << " " << av << " -> " << i << dendl; + return i; + } + } + ldout(async_msgr->cct,10) << __func__ << " " << av << " -> nothing to match " + << socket_addr << dendl; + return {}; +} + +void AsyncConnection::fault() +{ + shutdown_socket(); + open_write = false; + + // queue delayed items immediately + if (delay_state) + delay_state->flush(); + + recv_start = recv_end = 0; + state_offset = 0; + outgoing_bl.clear(); +} + +void AsyncConnection::_stop() { + writeCallback.reset(); + dispatch_queue->discard_queue(conn_id); + async_msgr->unregister_conn(this); + worker->release_worker(); + + state = STATE_CLOSED; + open_write = false; + + state_offset = 0; + // Make sure in-queue events will been processed + center->dispatch_event_external(EventCallbackRef(new C_clean_handler(this))); +} + +bool AsyncConnection::is_queued() const { + return outgoing_bl.length(); +} + +void AsyncConnection::shutdown_socket() { + for (auto &&t : register_time_events) center->delete_time_event(t); + register_time_events.clear(); + if (last_tick_id) { + center->delete_time_event(last_tick_id); + last_tick_id = 0; + } + if (cs) { + center->delete_file_event(cs.fd(), EVENT_READABLE | EVENT_WRITABLE); + cs.shutdown(); + cs.close(); + } +} + +void AsyncConnection::DelayedDelivery::do_request(uint64_t id) +{ + Message *m = nullptr; + { + std::lock_guard<std::mutex> l(delay_lock); + register_time_events.erase(id); + if (stop_dispatch) + return ; + if (delay_queue.empty()) + return ; + m = delay_queue.front(); + delay_queue.pop_front(); + } + if (msgr->ms_can_fast_dispatch(m)) { + dispatch_queue->fast_dispatch(m); + } else { + dispatch_queue->enqueue(m, m->get_priority(), conn_id); + } +} + +void AsyncConnection::DelayedDelivery::discard() { + stop_dispatch = true; + center->submit_to(center->get_id(), + [this]() mutable { + std::lock_guard<std::mutex> l(delay_lock); + while (!delay_queue.empty()) { + Message *m = delay_queue.front(); + dispatch_queue->dispatch_throttle_release( + m->get_dispatch_throttle_size()); + m->put(); + delay_queue.pop_front(); + } + for (auto i : register_time_events) + center->delete_time_event(i); + register_time_events.clear(); + stop_dispatch = false; + }, + true); +} + +void AsyncConnection::DelayedDelivery::flush() { + stop_dispatch = true; + center->submit_to( + center->get_id(), [this] () mutable { + std::lock_guard<std::mutex> l(delay_lock); + while (!delay_queue.empty()) { + Message *m = delay_queue.front(); + if (msgr->ms_can_fast_dispatch(m)) { + dispatch_queue->fast_dispatch(m); + } else { + dispatch_queue->enqueue(m, m->get_priority(), conn_id); + } + delay_queue.pop_front(); + } + for (auto i : register_time_events) + center->delete_time_event(i); + register_time_events.clear(); + stop_dispatch = false; + }, true); +} + +void AsyncConnection::send_keepalive() +{ + protocol->send_keepalive(); +} + +void AsyncConnection::mark_down() +{ + ldout(async_msgr->cct, 1) << __func__ << dendl; + std::lock_guard<std::mutex> l(lock); + protocol->stop(); +} + +void AsyncConnection::handle_write() +{ + ldout(async_msgr->cct, 10) << __func__ << dendl; + protocol->write_event(); +} + +void AsyncConnection::handle_write_callback() { + std::lock_guard<std::mutex> l(lock); + last_active = ceph::coarse_mono_clock::now(); + recv_start_time = ceph::mono_clock::now(); + write_lock.lock(); + if (writeCallback) { + auto callback = *writeCallback; + writeCallback.reset(); + write_lock.unlock(); + callback(0); + return; + } + write_lock.unlock(); +} + +void AsyncConnection::stop(bool queue_reset) { + lock.lock(); + bool need_queue_reset = (state != STATE_CLOSED) && queue_reset; + protocol->stop(); + lock.unlock(); + if (need_queue_reset) dispatch_queue->queue_reset(this); +} + +void AsyncConnection::cleanup() { + shutdown_socket(); + delete read_handler; + delete write_handler; + delete write_callback_handler; + delete wakeup_handler; + delete tick_handler; + if (delay_state) { + delete delay_state; + delay_state = NULL; + } +} + +void AsyncConnection::wakeup_from(uint64_t id) +{ + lock.lock(); + register_time_events.erase(id); + lock.unlock(); + process(); +} + +void AsyncConnection::tick(uint64_t id) +{ + auto now = ceph::coarse_mono_clock::now(); + ldout(async_msgr->cct, 20) << __func__ << " last_id=" << last_tick_id + << " last_active=" << last_active << dendl; + std::lock_guard<std::mutex> l(lock); + last_tick_id = 0; + if (!is_connected()) { + if (connect_timeout_us <= + (uint64_t)std::chrono::duration_cast<std::chrono::microseconds> + (now - last_connect_started).count()) { + ldout(async_msgr->cct, 1) << __func__ << " see no progress in more than " + << connect_timeout_us + << " us during connecting, fault." + << dendl; + protocol->fault(); + } else { + last_tick_id = center->create_time_event(connect_timeout_us, tick_handler); + } + } else { + auto idle_period = std::chrono::duration_cast<std::chrono::microseconds> + (now - last_active).count(); + if (inactive_timeout_us < (uint64_t)idle_period) { + ldout(async_msgr->cct, 1) << __func__ << " idle (" << idle_period + << ") for more than " << inactive_timeout_us + << " us, fault." + << dendl; + protocol->fault(); + } else { + last_tick_id = center->create_time_event(inactive_timeout_us, tick_handler); + } + } +} diff --git a/src/msg/async/AsyncConnection.h b/src/msg/async/AsyncConnection.h new file mode 100644 index 00000000..0c2512c8 --- /dev/null +++ b/src/msg/async/AsyncConnection.h @@ -0,0 +1,238 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MSG_ASYNCCONNECTION_H +#define CEPH_MSG_ASYNCCONNECTION_H + +#include <atomic> +#include <pthread.h> +#include <climits> +#include <list> +#include <mutex> +#include <map> +#include <functional> +#include <optional> + +#include "auth/AuthSessionHandler.h" +#include "common/ceph_time.h" +#include "common/perf_counters.h" +#include "include/buffer.h" +#include "msg/Connection.h" +#include "msg/Messenger.h" + +#include "Event.h" +#include "Stack.h" + +class AsyncMessenger; +class DispatchQueue; +class Worker; +class Protocol; + +static const int ASYNC_IOV_MAX = (IOV_MAX >= 1024 ? IOV_MAX / 4 : IOV_MAX); + +/* + * AsyncConnection maintains a logic session between two endpoints. In other + * word, a pair of addresses can find the only AsyncConnection. AsyncConnection + * will handle with network fault or read/write transactions. If one file + * descriptor broken, AsyncConnection will maintain the message queue and + * sequence, try to reconnect peer endpoint. + */ +class AsyncConnection : public Connection { + + ssize_t read(unsigned len, char *buffer, + std::function<void(char *, ssize_t)> callback); + ssize_t read_until(unsigned needed, char *p); + ssize_t read_bulk(char *buf, unsigned len); + + ssize_t write(bufferlist &bl, std::function<void(ssize_t)> callback, + bool more=false); + ssize_t _try_send(bool more=false); + + void _connect(); + void _stop(); + void fault(); + void inject_delay(); + + bool is_queued() const; + void shutdown_socket(); + + /** + * The DelayedDelivery is for injecting delays into Message delivery off + * the socket. It is only enabled if delays are requested, and if they + * are then it pulls Messages off the DelayQueue and puts them into the + * AsyncMessenger event queue. + */ + class DelayedDelivery : public EventCallback { + std::set<uint64_t> register_time_events; // need to delete it if stop + std::deque<Message*> delay_queue; + std::mutex delay_lock; + AsyncMessenger *msgr; + EventCenter *center; + DispatchQueue *dispatch_queue; + uint64_t conn_id; + std::atomic_bool stop_dispatch; + + public: + explicit DelayedDelivery(AsyncMessenger *omsgr, EventCenter *c, + DispatchQueue *q, uint64_t cid) + : msgr(omsgr), center(c), dispatch_queue(q), conn_id(cid), + stop_dispatch(false) { } + ~DelayedDelivery() override { + ceph_assert(register_time_events.empty()); + ceph_assert(delay_queue.empty()); + } + void set_center(EventCenter *c) { center = c; } + void do_request(uint64_t id) override; + void queue(double delay_period, Message *m) { + std::lock_guard<std::mutex> l(delay_lock); + delay_queue.push_back(m); + register_time_events.insert(center->create_time_event(delay_period*1000000, this)); + } + void discard(); + bool ready() const { return !stop_dispatch && delay_queue.empty() && register_time_events.empty(); } + void flush(); + } *delay_state; + + public: + AsyncConnection(CephContext *cct, AsyncMessenger *m, DispatchQueue *q, + Worker *w, bool is_msgr2, bool local); + ~AsyncConnection() override; + void maybe_start_delay_thread(); + + ostream& _conn_prefix(std::ostream *_dout); + + bool is_connected() override; + + // Only call when AsyncConnection first construct + void connect(const entity_addrvec_t& addrs, int type, entity_addr_t& target); + + // Only call when AsyncConnection first construct + void accept(ConnectedSocket socket, + const entity_addr_t &listen_addr, + const entity_addr_t &peer_addr); + int send_message(Message *m) override; + + void send_keepalive() override; + void mark_down() override; + void mark_disposable() override { + std::lock_guard<std::mutex> l(lock); + policy.lossy = true; + } + + entity_addr_t get_peer_socket_addr() const override { + return target_addr; + } + + int get_con_mode() const override; + + private: + enum { + STATE_NONE, + STATE_CONNECTING, + STATE_CONNECTING_RE, + STATE_ACCEPTING, + STATE_CONNECTION_ESTABLISHED, + STATE_CLOSED + }; + + static const uint32_t TCP_PREFETCH_MIN_SIZE; + static const char *get_state_name(int state) { + const char* const statenames[] = {"STATE_NONE", + "STATE_CONNECTING", + "STATE_CONNECTING_RE", + "STATE_ACCEPTING", + "STATE_CONNECTION_ESTABLISHED", + "STATE_CLOSED"}; + return statenames[state]; + } + + AsyncMessenger *async_msgr; + uint64_t conn_id; + PerfCounters *logger; + int state; + ConnectedSocket cs; + int port; + Messenger::Policy policy; + + DispatchQueue *dispatch_queue; + + // lockfree, only used in own thread + bufferlist outgoing_bl; + bool open_write = false; + + std::mutex write_lock; + + std::mutex lock; + EventCallbackRef read_handler; + EventCallbackRef write_handler; + EventCallbackRef write_callback_handler; + EventCallbackRef wakeup_handler; + EventCallbackRef tick_handler; + char *recv_buf; + uint32_t recv_max_prefetch; + uint32_t recv_start; + uint32_t recv_end; + set<uint64_t> register_time_events; // need to delete it if stop + ceph::coarse_mono_clock::time_point last_connect_started; + ceph::coarse_mono_clock::time_point last_active; + ceph::mono_clock::time_point recv_start_time; + uint64_t last_tick_id = 0; + const uint64_t connect_timeout_us; + const uint64_t inactive_timeout_us; + + // Tis section are temp variables used by state transition + + // Accepting state + bool msgr2 = false; + entity_addr_t socket_addr; ///< local socket addr + entity_addr_t target_addr; ///< which of the peer_addrs we're connecting to (as clienet) or should reconnect to (as peer) + + entity_addr_t _infer_target_addr(const entity_addrvec_t& av); + + // used only by "read_until" + uint64_t state_offset; + Worker *worker; + EventCenter *center; + + std::unique_ptr<Protocol> protocol; + + std::optional<std::function<void(ssize_t)>> writeCallback; + std::function<void(char *, ssize_t)> readCallback; + std::optional<unsigned> pendingReadLen; + char *read_buffer; + + public: + // used by eventcallback + void handle_write(); + void handle_write_callback(); + void process(); + void wakeup_from(uint64_t id); + void tick(uint64_t id); + void local_deliver(); + void stop(bool queue_reset); + void cleanup(); + PerfCounters *get_perf_counter() { + return logger; + } + + friend class Protocol; + friend class ProtocolV1; + friend class ProtocolV2; +}; /* AsyncConnection */ + +typedef boost::intrusive_ptr<AsyncConnection> AsyncConnectionRef; + +#endif diff --git a/src/msg/async/AsyncMessenger.cc b/src/msg/async/AsyncMessenger.cc new file mode 100644 index 00000000..2b1488c4 --- /dev/null +++ b/src/msg/async/AsyncMessenger.cc @@ -0,0 +1,949 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "acconfig.h" + +#include <iostream> +#include <fstream> + +#include "AsyncMessenger.h" + +#include "common/config.h" +#include "common/Timer.h" +#include "common/errno.h" + +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "common/EventTrace.h" + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix _prefix(_dout, this) +static ostream& _prefix(std::ostream *_dout, AsyncMessenger *m) { + return *_dout << "-- " << m->get_myaddrs() << " "; +} + +static ostream& _prefix(std::ostream *_dout, Processor *p) { + return *_dout << " Processor -- "; +} + + +/******************* + * Processor + */ + +class Processor::C_processor_accept : public EventCallback { + Processor *pro; + + public: + explicit C_processor_accept(Processor *p): pro(p) {} + void do_request(uint64_t id) override { + pro->accept(); + } +}; + +Processor::Processor(AsyncMessenger *r, Worker *w, CephContext *c) + : msgr(r), net(c), worker(w), + listen_handler(new C_processor_accept(this)) {} + +int Processor::bind(const entity_addrvec_t &bind_addrs, + const set<int>& avoid_ports, + entity_addrvec_t* bound_addrs) +{ + const auto& conf = msgr->cct->_conf; + // bind to socket(s) + ldout(msgr->cct, 10) << __func__ << " " << bind_addrs << dendl; + + SocketOptions opts; + opts.nodelay = msgr->cct->_conf->ms_tcp_nodelay; + opts.rcbuf_size = msgr->cct->_conf->ms_tcp_rcvbuf; + + listen_sockets.resize(bind_addrs.v.size()); + *bound_addrs = bind_addrs; + + for (unsigned k = 0; k < bind_addrs.v.size(); ++k) { + auto& listen_addr = bound_addrs->v[k]; + + /* bind to port */ + int r = -1; + + for (int i = 0; i < conf->ms_bind_retry_count; i++) { + if (i > 0) { + lderr(msgr->cct) << __func__ << " was unable to bind. Trying again in " + << conf->ms_bind_retry_delay << " seconds " << dendl; + sleep(conf->ms_bind_retry_delay); + } + + if (listen_addr.get_port()) { + worker->center.submit_to( + worker->center.get_id(), + [this, k, &listen_addr, &opts, &r]() { + r = worker->listen(listen_addr, k, opts, &listen_sockets[k]); + }, false); + if (r < 0) { + lderr(msgr->cct) << __func__ << " unable to bind to " << listen_addr + << ": " << cpp_strerror(r) << dendl; + continue; + } + } else { + // try a range of ports + for (int port = msgr->cct->_conf->ms_bind_port_min; + port <= msgr->cct->_conf->ms_bind_port_max; + port++) { + if (avoid_ports.count(port)) + continue; + + listen_addr.set_port(port); + worker->center.submit_to( + worker->center.get_id(), + [this, k, &listen_addr, &opts, &r]() { + r = worker->listen(listen_addr, k, opts, &listen_sockets[k]); + }, false); + if (r == 0) + break; + } + if (r < 0) { + lderr(msgr->cct) << __func__ << " unable to bind to " << listen_addr + << " on any port in range " + << msgr->cct->_conf->ms_bind_port_min + << "-" << msgr->cct->_conf->ms_bind_port_max << ": " + << cpp_strerror(r) << dendl; + listen_addr.set_port(0); // Clear port before retry, otherwise we shall fail again. + continue; + } + ldout(msgr->cct, 10) << __func__ << " bound on random port " + << listen_addr << dendl; + } + if (r == 0) { + break; + } + } + + // It seems that binding completely failed, return with that exit status + if (r < 0) { + lderr(msgr->cct) << __func__ << " was unable to bind after " + << conf->ms_bind_retry_count + << " attempts: " << cpp_strerror(r) << dendl; + for (unsigned j = 0; j < k; ++j) { + // clean up previous bind + listen_sockets[j].abort_accept(); + } + return r; + } + } + + ldout(msgr->cct, 10) << __func__ << " bound to " << *bound_addrs << dendl; + return 0; +} + +void Processor::start() +{ + ldout(msgr->cct, 1) << __func__ << dendl; + + // start thread + worker->center.submit_to(worker->center.get_id(), [this]() { + for (auto& l : listen_sockets) { + if (l) { + worker->center.create_file_event(l.fd(), EVENT_READABLE, + listen_handler); } + } + }, false); +} + +void Processor::accept() +{ + SocketOptions opts; + opts.nodelay = msgr->cct->_conf->ms_tcp_nodelay; + opts.rcbuf_size = msgr->cct->_conf->ms_tcp_rcvbuf; + opts.priority = msgr->get_socket_priority(); + + for (auto& listen_socket : listen_sockets) { + ldout(msgr->cct, 10) << __func__ << " listen_fd=" << listen_socket.fd() + << dendl; + unsigned accept_error_num = 0; + + while (true) { + entity_addr_t addr; + ConnectedSocket cli_socket; + Worker *w = worker; + if (!msgr->get_stack()->support_local_listen_table()) + w = msgr->get_stack()->get_worker(); + else + ++w->references; + int r = listen_socket.accept(&cli_socket, opts, &addr, w); + if (r == 0) { + ldout(msgr->cct, 10) << __func__ << " accepted incoming on sd " + << cli_socket.fd() << dendl; + + msgr->add_accept( + w, std::move(cli_socket), + msgr->get_myaddrs().v[listen_socket.get_addr_slot()], + addr); + accept_error_num = 0; + continue; + } else { + --w->references; + if (r == -EINTR) { + continue; + } else if (r == -EAGAIN) { + break; + } else if (r == -EMFILE || r == -ENFILE) { + lderr(msgr->cct) << __func__ << " open file descriptions limit reached sd = " << listen_socket.fd() + << " errno " << r << " " << cpp_strerror(r) << dendl; + if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) { + lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl; + ceph_abort(); + } + continue; + } else if (r == -ECONNABORTED) { + ldout(msgr->cct, 0) << __func__ << " it was closed because of rst arrived sd = " << listen_socket.fd() + << " errno " << r << " " << cpp_strerror(r) << dendl; + continue; + } else { + lderr(msgr->cct) << __func__ << " no incoming connection?" + << " errno " << r << " " << cpp_strerror(r) << dendl; + if (++accept_error_num > msgr->cct->_conf->ms_max_accept_failures) { + lderr(msgr->cct) << "Proccessor accept has encountered enough error numbers, just do ceph_abort()." << dendl; + ceph_abort(); + } + continue; + } + } + } + } +} + +void Processor::stop() +{ + ldout(msgr->cct,10) << __func__ << dendl; + + worker->center.submit_to(worker->center.get_id(), [this]() { + for (auto& listen_socket : listen_sockets) { + if (listen_socket) { + worker->center.delete_file_event(listen_socket.fd(), EVENT_READABLE); + listen_socket.abort_accept(); + } + } + }, false); +} + + +struct StackSingleton { + CephContext *cct; + std::shared_ptr<NetworkStack> stack; + + explicit StackSingleton(CephContext *c): cct(c) {} + void ready(std::string &type) { + if (!stack) + stack = NetworkStack::create(cct, type); + } + ~StackSingleton() { + stack->stop(); + } +}; + + +class C_handle_reap : public EventCallback { + AsyncMessenger *msgr; + + public: + explicit C_handle_reap(AsyncMessenger *m): msgr(m) {} + void do_request(uint64_t id) override { + // judge whether is a time event + msgr->reap_dead(); + } +}; + +/******************* + * AsyncMessenger + */ + +AsyncMessenger::AsyncMessenger(CephContext *cct, entity_name_t name, + const std::string &type, string mname, uint64_t _nonce) + : SimplePolicyMessenger(cct, name,mname, _nonce), + dispatch_queue(cct, this, mname), + lock("AsyncMessenger::lock"), + nonce(_nonce), need_addr(true), did_bind(false), + global_seq(0), deleted_lock("AsyncMessenger::deleted_lock"), + cluster_protocol(0), stopped(true) +{ + std::string transport_type = "posix"; + if (type.find("rdma") != std::string::npos) + transport_type = "rdma"; + else if (type.find("dpdk") != std::string::npos) + transport_type = "dpdk"; + + auto single = &cct->lookup_or_create_singleton_object<StackSingleton>( + "AsyncMessenger::NetworkStack::" + transport_type, true, cct); + single->ready(transport_type); + stack = single->stack.get(); + stack->start(); + local_worker = stack->get_worker(); + local_connection = new AsyncConnection(cct, this, &dispatch_queue, + local_worker, true, true); + init_local_connection(); + reap_handler = new C_handle_reap(this); + unsigned processor_num = 1; + if (stack->support_local_listen_table()) + processor_num = stack->get_num_worker(); + for (unsigned i = 0; i < processor_num; ++i) + processors.push_back(new Processor(this, stack->get_worker(i), cct)); +} + +/** + * Destroy the AsyncMessenger. Pretty simple since all the work is done + * elsewhere. + */ +AsyncMessenger::~AsyncMessenger() +{ + delete reap_handler; + ceph_assert(!did_bind); // either we didn't bind or we shut down the Processor + for (auto &&p : processors) + delete p; +} + +void AsyncMessenger::ready() +{ + ldout(cct,10) << __func__ << " " << get_myaddrs() << dendl; + + stack->ready(); + if (pending_bind) { + int err = bindv(pending_bind_addrs); + if (err) { + lderr(cct) << __func__ << " postponed bind failed" << dendl; + ceph_abort(); + } + } + + Mutex::Locker l(lock); + for (auto &&p : processors) + p->start(); + dispatch_queue.start(); +} + +int AsyncMessenger::shutdown() +{ + ldout(cct,10) << __func__ << " " << get_myaddrs() << dendl; + + // done! clean up. + for (auto &&p : processors) + p->stop(); + mark_down_all(); + // break ref cycles on the loopback connection + local_connection->set_priv(NULL); + local_connection->mark_down(); + did_bind = false; + lock.Lock(); + stop_cond.Signal(); + stopped = true; + lock.Unlock(); + stack->drain(); + return 0; +} + +int AsyncMessenger::bind(const entity_addr_t &bind_addr) +{ + ldout(cct,10) << __func__ << " " << bind_addr << dendl; + // old bind() can take entity_addr_t(). new bindv() can take a + // 0.0.0.0-like address but needs type and family to be set. + auto a = bind_addr; + if (a == entity_addr_t()) { + a.set_type(entity_addr_t::TYPE_LEGACY); + if (cct->_conf->ms_bind_ipv6) { + a.set_family(AF_INET6); + } else { + a.set_family(AF_INET); + } + } + return bindv(entity_addrvec_t(a)); +} + +int AsyncMessenger::bindv(const entity_addrvec_t &bind_addrs) +{ + lock.Lock(); + + if (!pending_bind && started) { + ldout(cct,10) << __func__ << " already started" << dendl; + lock.Unlock(); + return -1; + } + + ldout(cct,10) << __func__ << " " << bind_addrs << dendl; + + if (!stack->is_ready()) { + ldout(cct, 10) << __func__ << " Network Stack is not ready for bind yet - postponed" << dendl; + pending_bind_addrs = bind_addrs; + pending_bind = true; + lock.Unlock(); + return 0; + } + + lock.Unlock(); + + // bind to a socket + set<int> avoid_ports; + entity_addrvec_t bound_addrs; + unsigned i = 0; + for (auto &&p : processors) { + int r = p->bind(bind_addrs, avoid_ports, &bound_addrs); + if (r) { + // Note: this is related to local tcp listen table problem. + // Posix(default kernel implementation) backend shares listen table + // in the kernel, so all threads can use the same listen table naturally + // and only one thread need to bind. But other backends(like dpdk) uses local + // listen table, we need to bind/listen tcp port for each worker. So if the + // first worker failed to bind, it could be think the normal error then handle + // it, like port is used case. But if the first worker successfully to bind + // but the second worker failed, it's not expected and we need to assert + // here + ceph_assert(i == 0); + return r; + } + ++i; + } + _finish_bind(bind_addrs, bound_addrs); + return 0; +} + +int AsyncMessenger::rebind(const set<int>& avoid_ports) +{ + ldout(cct,1) << __func__ << " rebind avoid " << avoid_ports << dendl; + ceph_assert(did_bind); + + for (auto &&p : processors) + p->stop(); + mark_down_all(); + + // adjust the nonce; we want our entity_addr_t to be truly unique. + nonce += 1000000; + ldout(cct, 10) << __func__ << " new nonce " << nonce + << " and addr " << get_myaddrs() << dendl; + + entity_addrvec_t bound_addrs; + entity_addrvec_t bind_addrs = get_myaddrs(); + set<int> new_avoid(avoid_ports); + for (auto& a : bind_addrs.v) { + new_avoid.insert(a.get_port()); + a.set_port(0); + } + ldout(cct, 10) << __func__ << " will try " << bind_addrs + << " and avoid ports " << new_avoid << dendl; + unsigned i = 0; + for (auto &&p : processors) { + int r = p->bind(bind_addrs, avoid_ports, &bound_addrs); + if (r) { + ceph_assert(i == 0); + return r; + } + ++i; + } + _finish_bind(bind_addrs, bound_addrs); + for (auto &&p : processors) { + p->start(); + } + return 0; +} + +int AsyncMessenger::client_bind(const entity_addr_t &bind_addr) +{ + if (!cct->_conf->ms_bind_before_connect) + return 0; + Mutex::Locker l(lock); + if (did_bind) { + return 0; + } + if (started) { + ldout(cct, 10) << __func__ << " already started" << dendl; + return -1; + } + ldout(cct, 10) << __func__ << " " << bind_addr << dendl; + + set_myaddrs(entity_addrvec_t(bind_addr)); + return 0; +} + +void AsyncMessenger::_finish_bind(const entity_addrvec_t& bind_addrs, + const entity_addrvec_t& listen_addrs) +{ + set_myaddrs(bind_addrs); + for (auto& a : bind_addrs.v) { + if (!a.is_blank_ip()) { + learned_addr(a); + } + } + + if (get_myaddrs().front().get_port() == 0) { + set_myaddrs(listen_addrs); + } + entity_addrvec_t newaddrs = *my_addrs; + for (auto& a : newaddrs.v) { + a.set_nonce(nonce); + } + set_myaddrs(newaddrs); + + init_local_connection(); + + ldout(cct,1) << __func__ << " bind my_addrs is " << get_myaddrs() << dendl; + did_bind = true; +} + +int AsyncMessenger::start() +{ + lock.Lock(); + ldout(cct,1) << __func__ << " start" << dendl; + + // register at least one entity, first! + ceph_assert(my_name.type() >= 0); + + ceph_assert(!started); + started = true; + stopped = false; + + if (!did_bind) { + entity_addrvec_t newaddrs = *my_addrs; + for (auto& a : newaddrs.v) { + a.nonce = nonce; + } + set_myaddrs(newaddrs); + _init_local_connection(); + } + + lock.Unlock(); + return 0; +} + +void AsyncMessenger::wait() +{ + lock.Lock(); + if (!started) { + lock.Unlock(); + return; + } + if (!stopped) + stop_cond.Wait(lock); + + lock.Unlock(); + + dispatch_queue.shutdown(); + if (dispatch_queue.is_started()) { + ldout(cct, 10) << __func__ << ": waiting for dispatch queue" << dendl; + dispatch_queue.wait(); + dispatch_queue.discard_local(); + ldout(cct, 10) << __func__ << ": dispatch queue is stopped" << dendl; + } + + // close all connections + shutdown_connections(false); + stack->drain(); + + ldout(cct, 10) << __func__ << ": done." << dendl; + ldout(cct, 1) << __func__ << " complete." << dendl; + started = false; +} + +void AsyncMessenger::add_accept(Worker *w, ConnectedSocket cli_socket, + const entity_addr_t &listen_addr, + const entity_addr_t &peer_addr) +{ + lock.Lock(); + AsyncConnectionRef conn = new AsyncConnection(cct, this, &dispatch_queue, w, + listen_addr.is_msgr2(), false); + conn->accept(std::move(cli_socket), listen_addr, peer_addr); + accepting_conns.insert(conn); + lock.Unlock(); +} + +AsyncConnectionRef AsyncMessenger::create_connect( + const entity_addrvec_t& addrs, int type) +{ + ceph_assert(lock.is_locked()); + + ldout(cct, 10) << __func__ << " " << addrs + << ", creating connection and registering" << dendl; + + // here is where we decide which of the addrs to connect to. always prefer + // the first one, if we support it. + entity_addr_t target; + for (auto& a : addrs.v) { + if (!a.is_msgr2() && !a.is_legacy()) { + continue; + } + // FIXME: for ipv4 vs ipv6, check whether local host can handle ipv6 before + // trying it? for now, just pick whichever is listed first. + target = a; + break; + } + + // create connection + Worker *w = stack->get_worker(); + AsyncConnectionRef conn = new AsyncConnection(cct, this, &dispatch_queue, w, + target.is_msgr2(), false); + conn->connect(addrs, type, target); + ceph_assert(!conns.count(addrs)); + ldout(cct, 10) << __func__ << " " << conn << " " << addrs << " " + << *conn->peer_addrs << dendl; + conns[addrs] = conn; + w->get_perf_counter()->inc(l_msgr_active_connections); + + return conn; +} + + +ConnectionRef AsyncMessenger::get_loopback_connection() +{ + return local_connection; +} + +bool AsyncMessenger::should_use_msgr2() +{ + // if we are bound to v1 only, and we are connecting to a v2 peer, + // we cannot use the peer's v2 address. otherwise the connection + // is assymetrical, because they would have to use v1 to connect + // to us, and we would use v2, and connection race detection etc + // would totally break down (among other things). or, the other + // end will be confused that we advertise ourselve with a v1 + // address only (that we bound to) but connected with protocol v2. + return !did_bind || get_myaddrs().has_msgr2(); +} + +entity_addrvec_t AsyncMessenger::_filter_addrs(int type, + const entity_addrvec_t& addrs) +{ + if (!should_use_msgr2()) { + ldout(cct, 10) << __func__ << " " << addrs << " type " << type + << " limiting to v1 ()" << dendl; + entity_addrvec_t r; + for (auto& i : addrs.v) { + if (i.is_msgr2()) { + continue; + } + r.v.push_back(i); + } + return r; + } else { + return addrs; + } +} + +int AsyncMessenger::send_to(Message *m, int type, const entity_addrvec_t& addrs) +{ + Mutex::Locker l(lock); + + FUNCTRACE(cct); + ceph_assert(m); + + if (m->get_type() == CEPH_MSG_OSD_OP) + OID_EVENT_TRACE(((MOSDOp *)m)->get_oid().name.c_str(), "SEND_MSG_OSD_OP"); + else if (m->get_type() == CEPH_MSG_OSD_OPREPLY) + OID_EVENT_TRACE(((MOSDOpReply *)m)->get_oid().name.c_str(), "SEND_MSG_OSD_OP_REPLY"); + + ldout(cct, 1) << __func__ << "--> " << ceph_entity_type_name(type) << " " + << addrs << " -- " << *m << " -- ?+" + << m->get_data().length() << " " << m << dendl; + + if (addrs.empty()) { + ldout(cct,0) << __func__ << " message " << *m + << " with empty dest " << addrs << dendl; + m->put(); + return -EINVAL; + } + + auto av = _filter_addrs(type, addrs); + AsyncConnectionRef conn = _lookup_conn(av); + submit_message(m, conn, av, type); + return 0; +} + +ConnectionRef AsyncMessenger::connect_to(int type, const entity_addrvec_t& addrs) +{ + Mutex::Locker l(lock); + if (*my_addrs == addrs || + (addrs.v.size() == 1 && + my_addrs->contains(addrs.front()))) { + // local + return local_connection; + } + + auto av = _filter_addrs(type, addrs); + + AsyncConnectionRef conn = _lookup_conn(av); + if (conn) { + ldout(cct, 10) << __func__ << " " << av << " existing " << conn << dendl; + } else { + conn = create_connect(av, type); + ldout(cct, 10) << __func__ << " " << av << " new " << conn << dendl; + } + + return conn; +} + +void AsyncMessenger::submit_message(Message *m, AsyncConnectionRef con, + const entity_addrvec_t& dest_addrs, + int dest_type) +{ + if (cct->_conf->ms_dump_on_send) { + m->encode(-1, MSG_CRC_ALL); + ldout(cct, 0) << __func__ << " submit_message " << *m << "\n"; + m->get_payload().hexdump(*_dout); + if (m->get_data().length() > 0) { + *_dout << " data:\n"; + m->get_data().hexdump(*_dout); + } + *_dout << dendl; + m->clear_payload(); + } + + // existing connection? + if (con) { + con->send_message(m); + return ; + } + + // local? + if (*my_addrs == dest_addrs || + (dest_addrs.v.size() == 1 && + my_addrs->contains(dest_addrs.front()))) { + // local + local_connection->send_message(m); + return ; + } + + // remote, no existing connection. + const Policy& policy = get_policy(dest_type); + if (policy.server) { + ldout(cct, 20) << __func__ << " " << *m << " remote, " << dest_addrs + << ", lossy server for target type " + << ceph_entity_type_name(dest_type) << ", no session, dropping." << dendl; + m->put(); + } else { + ldout(cct,20) << __func__ << " " << *m << " remote, " << dest_addrs + << ", new connection." << dendl; + con = create_connect(dest_addrs, dest_type); + con->send_message(m); + } +} + +/** + * If my_addr doesn't have an IP set, this function + * will fill it in from the passed addr. Otherwise it does nothing and returns. + */ +bool AsyncMessenger::set_addr_unknowns(const entity_addrvec_t &addrs) +{ + ldout(cct,1) << __func__ << " " << addrs << dendl; + bool ret = false; + Mutex::Locker l(lock); + + entity_addrvec_t newaddrs = *my_addrs; + for (auto& a : newaddrs.v) { + if (a.is_blank_ip()) { + int type = a.get_type(); + int port = a.get_port(); + uint32_t nonce = a.get_nonce(); + for (auto& b : addrs.v) { + if (a.get_family() == b.get_family()) { + ldout(cct,1) << __func__ << " assuming my addr " << a + << " matches provided addr " << b << dendl; + a = b; + a.set_nonce(nonce); + a.set_type(type); + a.set_port(port); + ret = true; + break; + } + } + } + } + set_myaddrs(newaddrs); + if (ret) { + _init_local_connection(); + } + ldout(cct,1) << __func__ << " now " << *my_addrs << dendl; + return ret; +} + +void AsyncMessenger::set_addrs(const entity_addrvec_t &addrs) +{ + Mutex::Locker l(lock); + auto t = addrs; + for (auto& a : t.v) { + a.set_nonce(nonce); + } + set_myaddrs(t); + _init_local_connection(); +} + +void AsyncMessenger::shutdown_connections(bool queue_reset) +{ + ldout(cct,1) << __func__ << " " << dendl; + lock.Lock(); + for (set<AsyncConnectionRef>::iterator q = accepting_conns.begin(); + q != accepting_conns.end(); ++q) { + AsyncConnectionRef p = *q; + ldout(cct, 5) << __func__ << " accepting_conn " << p.get() << dendl; + p->stop(queue_reset); + } + accepting_conns.clear(); + + while (!conns.empty()) { + auto it = conns.begin(); + AsyncConnectionRef p = it->second; + ldout(cct, 5) << __func__ << " mark down " << it->first << " " << p << dendl; + conns.erase(it); + p->get_perf_counter()->dec(l_msgr_active_connections); + p->stop(queue_reset); + } + + { + Mutex::Locker l(deleted_lock); + while (!deleted_conns.empty()) { + set<AsyncConnectionRef>::iterator it = deleted_conns.begin(); + AsyncConnectionRef p = *it; + ldout(cct, 5) << __func__ << " delete " << p << dendl; + deleted_conns.erase(it); + } + } + lock.Unlock(); +} + +void AsyncMessenger::mark_down_addrs(const entity_addrvec_t& addrs) +{ + lock.Lock(); + AsyncConnectionRef p = _lookup_conn(addrs); + if (p) { + ldout(cct, 1) << __func__ << " " << addrs << " -- " << p << dendl; + p->stop(true); + } else { + ldout(cct, 1) << __func__ << " " << addrs << " -- connection dne" << dendl; + } + lock.Unlock(); +} + +int AsyncMessenger::get_proto_version(int peer_type, bool connect) const +{ + int my_type = my_name.type(); + + // set reply protocol version + if (peer_type == my_type) { + // internal + return cluster_protocol; + } else { + // public + switch (connect ? peer_type : my_type) { + case CEPH_ENTITY_TYPE_OSD: return CEPH_OSDC_PROTOCOL; + case CEPH_ENTITY_TYPE_MDS: return CEPH_MDSC_PROTOCOL; + case CEPH_ENTITY_TYPE_MON: return CEPH_MONC_PROTOCOL; + } + } + return 0; +} + +int AsyncMessenger::accept_conn(AsyncConnectionRef conn) +{ + Mutex::Locker l(lock); + auto it = conns.find(*conn->peer_addrs); + if (it != conns.end()) { + AsyncConnectionRef existing = it->second; + + // lazy delete, see "deleted_conns" + // If conn already in, we will return 0 + Mutex::Locker l(deleted_lock); + if (deleted_conns.erase(existing)) { + conns.erase(it); + } else if (conn != existing) { + return -1; + } + } + ldout(cct, 10) << __func__ << " " << conn << " " << *conn->peer_addrs << dendl; + conns[*conn->peer_addrs] = conn; + conn->get_perf_counter()->inc(l_msgr_active_connections); + accepting_conns.erase(conn); + return 0; +} + + +bool AsyncMessenger::learned_addr(const entity_addr_t &peer_addr_for_me) +{ + // be careful here: multiple threads may block here, and readers of + // my_addr do NOT hold any lock. + + // this always goes from true -> false under the protection of the + // mutex. if it is already false, we need not retake the mutex at + // all. + if (!need_addr) + return false; + std::lock_guard l(lock); + if (need_addr) { + if (my_addrs->empty()) { + auto a = peer_addr_for_me; + a.set_type(entity_addr_t::TYPE_ANY); + a.set_nonce(nonce); + if (!did_bind) { + a.set_port(0); + } + set_myaddrs(entity_addrvec_t(a)); + ldout(cct,10) << __func__ << " had no addrs" << dendl; + } else { + // fix all addrs of the same family, regardless of type (msgr2 vs legacy) + entity_addrvec_t newaddrs = *my_addrs; + for (auto& a : newaddrs.v) { + if (a.is_blank_ip() && + a.get_family() == peer_addr_for_me.get_family()) { + entity_addr_t t = peer_addr_for_me; + if (!did_bind) { + t.set_type(entity_addr_t::TYPE_ANY); + t.set_port(0); + } else { + t.set_type(a.get_type()); + t.set_port(a.get_port()); + } + t.set_nonce(a.get_nonce()); + ldout(cct,10) << __func__ << " " << a << " -> " << t << dendl; + a = t; + } + } + set_myaddrs(newaddrs); + } + ldout(cct, 1) << __func__ << " learned my addr " << *my_addrs + << " (peer_addr_for_me " << peer_addr_for_me << ")" << dendl; + _init_local_connection(); + need_addr = false; + return true; + } + return false; +} + +int AsyncMessenger::reap_dead() +{ + ldout(cct, 1) << __func__ << " start" << dendl; + int num = 0; + + Mutex::Locker l1(lock); + Mutex::Locker l2(deleted_lock); + + while (!deleted_conns.empty()) { + auto it = deleted_conns.begin(); + AsyncConnectionRef p = *it; + ldout(cct, 5) << __func__ << " delete " << p << dendl; + auto conns_it = conns.find(*p->peer_addrs); + if (conns_it != conns.end() && conns_it->second == p) + conns.erase(conns_it); + accepting_conns.erase(p); + deleted_conns.erase(it); + ++num; + } + + return num; +} diff --git a/src/msg/async/AsyncMessenger.h b/src/msg/async/AsyncMessenger.h new file mode 100644 index 00000000..98bf9d52 --- /dev/null +++ b/src/msg/async/AsyncMessenger.h @@ -0,0 +1,426 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_ASYNCMESSENGER_H +#define CEPH_ASYNCMESSENGER_H + +#include <map> +#include <mutex> + +#include "include/types.h" +#include "include/xlist.h" +#include "include/spinlock.h" +#include "include/unordered_map.h" +#include "include/unordered_set.h" + +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/Thread.h" + +#include "msg/SimplePolicyMessenger.h" +#include "msg/DispatchQueue.h" +#include "AsyncConnection.h" +#include "Event.h" + +#include "include/ceph_assert.h" + +class AsyncMessenger; + +/** + * If the Messenger binds to a specific address, the Processor runs + * and listens for incoming connections. + */ +class Processor { + AsyncMessenger *msgr; + NetHandler net; + Worker *worker; + vector<ServerSocket> listen_sockets; + EventCallbackRef listen_handler; + + class C_processor_accept; + + public: + Processor(AsyncMessenger *r, Worker *w, CephContext *c); + ~Processor() { delete listen_handler; }; + + void stop(); + int bind(const entity_addrvec_t &bind_addrs, + const set<int>& avoid_ports, + entity_addrvec_t* bound_addrs); + void start(); + void accept(); +}; + +/* + * AsyncMessenger is represented for maintaining a set of asynchronous connections, + * it may own a bind address and the accepted connections will be managed by + * AsyncMessenger. + * + */ + +class AsyncMessenger : public SimplePolicyMessenger { + // First we have the public Messenger interface implementation... +public: + /** + * Initialize the AsyncMessenger! + * + * @param cct The CephContext to use + * @param name The name to assign ourselves + * _nonce A unique ID to use for this AsyncMessenger. It should not + * be a value that will be repeated if the daemon restarts. + */ + AsyncMessenger(CephContext *cct, entity_name_t name, const std::string &type, + string mname, uint64_t _nonce); + + /** + * Destroy the AsyncMessenger. Pretty simple since all the work is done + * elsewhere. + */ + ~AsyncMessenger() override; + + /** @defgroup Accessors + * @{ + */ + bool set_addr_unknowns(const entity_addrvec_t &addr) override; + void set_addrs(const entity_addrvec_t &addrs) override; + + int get_dispatch_queue_len() override { + return dispatch_queue.get_queue_len(); + } + + double get_dispatch_queue_max_age(utime_t now) override { + return dispatch_queue.get_max_age(now); + } + /** @} Accessors */ + + /** + * @defgroup Configuration functions + * @{ + */ + void set_cluster_protocol(int p) override { + ceph_assert(!started && !did_bind); + cluster_protocol = p; + } + + int bind(const entity_addr_t& bind_addr) override; + int rebind(const set<int>& avoid_ports) override; + int client_bind(const entity_addr_t& bind_addr) override; + + int bindv(const entity_addrvec_t& bind_addrs) override; + + bool should_use_msgr2() override; + + /** @} Configuration functions */ + + /** + * @defgroup Startup/Shutdown + * @{ + */ + int start() override; + void wait() override; + int shutdown() override; + + /** @} // Startup/Shutdown */ + + /** + * @defgroup Messaging + * @{ + */ + int send_to(Message *m, int type, const entity_addrvec_t& addrs) override; + + /** @} // Messaging */ + + /** + * @defgroup Connection Management + * @{ + */ + ConnectionRef connect_to(int type, + const entity_addrvec_t& addrs) override; + ConnectionRef get_loopback_connection() override; + void mark_down(const entity_addr_t& addr) override { + mark_down_addrs(entity_addrvec_t(addr)); + } + void mark_down_addrs(const entity_addrvec_t& addrs) override; + void mark_down_all() override { + shutdown_connections(true); + } + /** @} // Connection Management */ + + /** + * @defgroup Inner classes + * @{ + */ + + /** + * @} // Inner classes + */ + +protected: + /** + * @defgroup Messenger Interfaces + * @{ + */ + /** + * Start up the DispatchQueue thread once we have somebody to dispatch to. + */ + void ready() override; + /** @} // Messenger Interfaces */ + +private: + + /** + * @defgroup Utility functions + * @{ + */ + + /** + * Create a connection associated with the given entity (of the given type). + * Initiate the connection. (This function returning does not guarantee + * connection success.) + * + * @param addrs The address(es) of the entity to connect to. + * @param type The peer type of the entity at the address. + * + * @return a pointer to the newly-created connection. Caller does not own a + * reference; take one if you need it. + */ + AsyncConnectionRef create_connect(const entity_addrvec_t& addrs, int type); + + /** + * Queue up a Message for delivery to the entity specified + * by addr and dest_type. + * submit_message() is responsible for creating + * new AsyncConnection (and closing old ones) as necessary. + * + * @param m The Message to queue up. This function eats a reference. + * @param con The existing Connection to use, or NULL if you don't know of one. + * @param dest_addr The address to send the Message to. + * @param dest_type The peer type of the address we're sending to + * just drop silently under failure. + */ + void submit_message(Message *m, AsyncConnectionRef con, + const entity_addrvec_t& dest_addrs, int dest_type); + + void _finish_bind(const entity_addrvec_t& bind_addrs, + const entity_addrvec_t& listen_addrs); + + entity_addrvec_t _filter_addrs(int type, + const entity_addrvec_t& addrs); + + private: + static const uint64_t ReapDeadConnectionThreshold = 5; + + NetworkStack *stack; + std::vector<Processor*> processors; + friend class Processor; + DispatchQueue dispatch_queue; + + // the worker run messenger's cron jobs + Worker *local_worker; + + std::string ms_type; + + /// overall lock used for AsyncMessenger data structures + Mutex lock; + // AsyncMessenger stuff + /// approximately unique ID set by the Constructor for use in entity_addr_t + uint64_t nonce; + + /// true, specifying we haven't learned our addr; set false when we find it. + // maybe this should be protected by the lock? + bool need_addr; + + /** + * set to bind addresses if bind was called before NetworkStack was ready to + * bind + */ + entity_addrvec_t pending_bind_addrs; + + /** + * false; set to true if a pending bind exists + */ + bool pending_bind = false; + + /** + * The following aren't lock-protected since you shouldn't be able to race + * the only writers. + */ + + /** + * false; set to true if the AsyncMessenger bound to a specific address; + * and set false again by Accepter::stop(). + */ + bool did_bind; + /// counter for the global seq our connection protocol uses + __u32 global_seq; + /// lock to protect the global_seq + ceph::spinlock global_seq_lock; + + /** + * hash map of addresses to Asyncconnection + * + * NOTE: a Asyncconnection* with state CLOSED may still be in the map but is considered + * invalid and can be replaced by anyone holding the msgr lock + */ + ceph::unordered_map<entity_addrvec_t, AsyncConnectionRef> conns; + + /** + * list of connection are in the process of accepting + * + * These are not yet in the conns map. + */ + set<AsyncConnectionRef> accepting_conns; + + /** + * list of connection are closed which need to be clean up + * + * Because AsyncMessenger and AsyncConnection follow a lock rule that + * we can lock AsyncMesenger::lock firstly then lock AsyncConnection::lock + * but can't reversed. This rule is aimed to avoid dead lock. + * So if AsyncConnection want to unregister itself from AsyncMessenger, + * we pick up this idea that just queue itself to this set and do lazy + * deleted for AsyncConnection. "_lookup_conn" must ensure not return a + * AsyncConnection in this set. + */ + Mutex deleted_lock; + set<AsyncConnectionRef> deleted_conns; + + EventCallbackRef reap_handler; + + /// internal cluster protocol version, if any, for talking to entities of the same type. + int cluster_protocol; + + Cond stop_cond; + bool stopped; + + AsyncConnectionRef _lookup_conn(const entity_addrvec_t& k) { + ceph_assert(lock.is_locked()); + auto p = conns.find(k); + if (p == conns.end()) + return NULL; + + // lazy delete, see "deleted_conns" + Mutex::Locker l(deleted_lock); + if (deleted_conns.erase(p->second)) { + conns.erase(p); + return NULL; + } + + return p->second; + } + + void _init_local_connection() { + ceph_assert(lock.is_locked()); + local_connection->peer_addrs = *my_addrs; + local_connection->peer_type = my_name.type(); + local_connection->set_features(CEPH_FEATURES_ALL); + ms_deliver_handle_fast_connect(local_connection.get()); + } + + void shutdown_connections(bool queue_reset); + +public: + + /// con used for sending messages to ourselves + AsyncConnectionRef local_connection; + + /** + * @defgroup AsyncMessenger internals + * @{ + */ + /** + * This wraps _lookup_conn. + */ + AsyncConnectionRef lookup_conn(const entity_addrvec_t& k) { + Mutex::Locker l(lock); + return _lookup_conn(k); + } + + int accept_conn(AsyncConnectionRef conn); + bool learned_addr(const entity_addr_t &peer_addr_for_me); + void add_accept(Worker *w, ConnectedSocket cli_socket, + const entity_addr_t &listen_addr, + const entity_addr_t &peer_addr); + NetworkStack *get_stack() { + return stack; + } + + uint64_t get_nonce() const { + return nonce; + } + + /** + * Increment the global sequence for this AsyncMessenger and return it. + * This is for the connect protocol, although it doesn't hurt if somebody + * else calls it. + * + * @return a global sequence ID that nobody else has seen. + */ + __u32 get_global_seq(__u32 old=0) { + std::lock_guard<ceph::spinlock> lg(global_seq_lock); + + if (old > global_seq) + global_seq = old; + __u32 ret = ++global_seq; + + return ret; + } + /** + * Get the protocol version we support for the given peer type: either + * a peer protocol (if it matches our own), the protocol version for the + * peer (if we're connecting), or our protocol version (if we're accepting). + */ + int get_proto_version(int peer_type, bool connect) const; + + /** + * Fill in the address and peer type for the local connection, which + * is used for delivering messages back to ourself. + */ + void init_local_connection() { + Mutex::Locker l(lock); + _init_local_connection(); + } + + /** + * Unregister connection from `conns` + * + * See "deleted_conns" + */ + void unregister_conn(AsyncConnectionRef conn) { + Mutex::Locker l(deleted_lock); + conn->get_perf_counter()->dec(l_msgr_active_connections); + deleted_conns.emplace(std::move(conn)); + + if (deleted_conns.size() >= ReapDeadConnectionThreshold) { + local_worker->center.dispatch_event_external(reap_handler); + } + } + + /** + * Reap dead connection from `deleted_conns` + * + * @return the number of dead connections + * + * See "deleted_conns" + */ + int reap_dead(); + + /** + * @} // AsyncMessenger Internals + */ +} ; + +#endif /* CEPH_ASYNCMESSENGER_H */ diff --git a/src/msg/async/Event.cc b/src/msg/async/Event.cc new file mode 100644 index 00000000..6b5e4c7c --- /dev/null +++ b/src/msg/async/Event.cc @@ -0,0 +1,471 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "include/compat.h" +#include "common/errno.h" +#include "Event.h" + +#ifdef HAVE_DPDK +#include "dpdk/EventDPDK.h" +#endif + +#ifdef HAVE_EPOLL +#include "EventEpoll.h" +#else +#ifdef HAVE_KQUEUE +#include "EventKqueue.h" +#else +#include "EventSelect.h" +#endif +#endif + +#define dout_subsys ceph_subsys_ms + +#undef dout_prefix +#define dout_prefix *_dout << "EventCallback " +class C_handle_notify : public EventCallback { + EventCenter *center; + CephContext *cct; + + public: + C_handle_notify(EventCenter *c, CephContext *cc): center(c), cct(cc) {} + void do_request(uint64_t fd_or_id) override { + char c[256]; + int r = 0; + do { + r = read(fd_or_id, c, sizeof(c)); + if (r < 0) { + if (errno != EAGAIN) + ldout(cct, 1) << __func__ << " read notify pipe failed: " << cpp_strerror(errno) << dendl; + } + } while (r > 0); + } +}; + +#undef dout_prefix +#define dout_prefix _event_prefix(_dout) + +/** + * Construct a Poller. + * + * \param center + * EventCenter object through which the poller will be invoked (defaults + * to the global #RAMCloud::center object). + * \param pollerName + * Human readable name that can be printed out in debugging messages + * about the poller. The name of the superclass is probably sufficient + * for most cases. + */ +EventCenter::Poller::Poller(EventCenter* center, const string& name) + : owner(center), poller_name(name), slot(owner->pollers.size()) +{ + owner->pollers.push_back(this); +} + +/** + * Destroy a Poller. + */ +EventCenter::Poller::~Poller() +{ + // Erase this Poller from the vector by overwriting it with the + // poller that used to be the last one in the vector. + // + // Note: this approach is reentrant (it is safe to delete a + // poller from a poller callback, which means that the poll + // method is in the middle of scanning the list of all pollers; + // the worst that will happen is that the poller that got moved + // may not be invoked in the current scan). + owner->pollers[slot] = owner->pollers.back(); + owner->pollers[slot]->slot = slot; + owner->pollers.pop_back(); + slot = -1; +} + +ostream& EventCenter::_event_prefix(std::ostream *_dout) +{ + return *_dout << "Event(" << this << " nevent=" << nevent + << " time_id=" << time_event_next_id << ")."; +} + +int EventCenter::init(int n, unsigned i, const std::string &t) +{ + // can't init multi times + ceph_assert(nevent == 0); + + type = t; + idx = i; + + if (t == "dpdk") { +#ifdef HAVE_DPDK + driver = new DPDKDriver(cct); +#endif + } else { +#ifdef HAVE_EPOLL + driver = new EpollDriver(cct); +#else +#ifdef HAVE_KQUEUE + driver = new KqueueDriver(cct); +#else + driver = new SelectDriver(cct); +#endif +#endif + } + + if (!driver) { + lderr(cct) << __func__ << " failed to create event driver " << dendl; + return -1; + } + + int r = driver->init(this, n); + if (r < 0) { + lderr(cct) << __func__ << " failed to init event driver." << dendl; + return r; + } + + file_events.resize(n); + nevent = n; + + if (!driver->need_wakeup()) + return 0; + + int fds[2]; + if (pipe_cloexec(fds) < 0) { + int e = errno; + lderr(cct) << __func__ << " can't create notify pipe: " << cpp_strerror(e) << dendl; + return -e; + } + + notify_receive_fd = fds[0]; + notify_send_fd = fds[1]; + r = net.set_nonblock(notify_receive_fd); + if (r < 0) { + return r; + } + r = net.set_nonblock(notify_send_fd); + if (r < 0) { + return r; + } + + return r; +} + +EventCenter::~EventCenter() +{ + { + std::lock_guard<std::mutex> l(external_lock); + while (!external_events.empty()) { + EventCallbackRef e = external_events.front(); + if (e) + e->do_request(0); + external_events.pop_front(); + } + } + time_events.clear(); + //assert(time_events.empty()); + + if (notify_receive_fd >= 0) + ::close(notify_receive_fd); + if (notify_send_fd >= 0) + ::close(notify_send_fd); + + delete driver; + if (notify_handler) + delete notify_handler; +} + + +void EventCenter::set_owner() +{ + owner = pthread_self(); + ldout(cct, 2) << __func__ << " idx=" << idx << " owner=" << owner << dendl; + if (!global_centers) { + global_centers = &cct->lookup_or_create_singleton_object< + EventCenter::AssociatedCenters>( + "AsyncMessenger::EventCenter::global_center::" + type, true); + ceph_assert(global_centers); + global_centers->centers[idx] = this; + if (driver->need_wakeup()) { + notify_handler = new C_handle_notify(this, cct); + int r = create_file_event(notify_receive_fd, EVENT_READABLE, notify_handler); + ceph_assert(r == 0); + } + } +} + +int EventCenter::create_file_event(int fd, int mask, EventCallbackRef ctxt) +{ + ceph_assert(in_thread()); + int r = 0; + if (fd >= nevent) { + int new_size = nevent << 2; + while (fd >= new_size) + new_size <<= 2; + ldout(cct, 20) << __func__ << " event count exceed " << nevent << ", expand to " << new_size << dendl; + r = driver->resize_events(new_size); + if (r < 0) { + lderr(cct) << __func__ << " event count is exceed." << dendl; + return -ERANGE; + } + file_events.resize(new_size); + nevent = new_size; + } + + EventCenter::FileEvent *event = _get_file_event(fd); + ldout(cct, 20) << __func__ << " create event started fd=" << fd << " mask=" << mask + << " original mask is " << event->mask << dendl; + if (event->mask == mask) + return 0; + + r = driver->add_event(fd, event->mask, mask); + if (r < 0) { + // Actually we don't allow any failed error code, caller doesn't prepare to + // handle error status. So now we need to assert failure here. In practice, + // add_event shouldn't report error, otherwise it must be a innermost bug! + lderr(cct) << __func__ << " add event failed, ret=" << r << " fd=" << fd + << " mask=" << mask << " original mask is " << event->mask << dendl; + ceph_abort_msg("BUG!"); + return r; + } + + event->mask |= mask; + if (mask & EVENT_READABLE) { + event->read_cb = ctxt; + } + if (mask & EVENT_WRITABLE) { + event->write_cb = ctxt; + } + ldout(cct, 20) << __func__ << " create event end fd=" << fd << " mask=" << mask + << " original mask is " << event->mask << dendl; + return 0; +} + +void EventCenter::delete_file_event(int fd, int mask) +{ + ceph_assert(in_thread() && fd >= 0); + if (fd >= nevent) { + ldout(cct, 1) << __func__ << " delete event fd=" << fd << " is equal or greater than nevent=" << nevent + << "mask=" << mask << dendl; + return ; + } + EventCenter::FileEvent *event = _get_file_event(fd); + ldout(cct, 30) << __func__ << " delete event started fd=" << fd << " mask=" << mask + << " original mask is " << event->mask << dendl; + if (!event->mask) + return ; + + int r = driver->del_event(fd, event->mask, mask); + if (r < 0) { + // see create_file_event + ceph_abort_msg("BUG!"); + } + + if (mask & EVENT_READABLE && event->read_cb) { + event->read_cb = nullptr; + } + if (mask & EVENT_WRITABLE && event->write_cb) { + event->write_cb = nullptr; + } + + event->mask = event->mask & (~mask); + ldout(cct, 30) << __func__ << " delete event end fd=" << fd << " mask=" << mask + << " original mask is " << event->mask << dendl; +} + +uint64_t EventCenter::create_time_event(uint64_t microseconds, EventCallbackRef ctxt) +{ + ceph_assert(in_thread()); + uint64_t id = time_event_next_id++; + + ldout(cct, 30) << __func__ << " id=" << id << " trigger after " << microseconds << "us"<< dendl; + EventCenter::TimeEvent event; + clock_type::time_point expire = clock_type::now() + std::chrono::microseconds(microseconds); + event.id = id; + event.time_cb = ctxt; + std::multimap<clock_type::time_point, TimeEvent>::value_type s_val(expire, event); + auto it = time_events.insert(std::move(s_val)); + event_map[id] = it; + + return id; +} + +void EventCenter::delete_time_event(uint64_t id) +{ + ceph_assert(in_thread()); + ldout(cct, 30) << __func__ << " id=" << id << dendl; + if (id >= time_event_next_id || id == 0) + return ; + + auto it = event_map.find(id); + if (it == event_map.end()) { + ldout(cct, 10) << __func__ << " id=" << id << " not found" << dendl; + return ; + } + + time_events.erase(it->second); + event_map.erase(it); +} + +void EventCenter::wakeup() +{ + // No need to wake up since we never sleep + if (!pollers.empty() || !driver->need_wakeup()) + return ; + + ldout(cct, 20) << __func__ << dendl; + char buf = 'c'; + // wake up "event_wait" + int n = write(notify_send_fd, &buf, sizeof(buf)); + if (n < 0) { + if (errno != EAGAIN) { + ldout(cct, 1) << __func__ << " write notify pipe failed: " << cpp_strerror(errno) << dendl; + ceph_abort(); + } + } +} + +int EventCenter::process_time_events() +{ + int processed = 0; + clock_type::time_point now = clock_type::now(); + ldout(cct, 30) << __func__ << " cur time is " << now << dendl; + + while (!time_events.empty()) { + auto it = time_events.begin(); + if (now >= it->first) { + TimeEvent &e = it->second; + EventCallbackRef cb = e.time_cb; + uint64_t id = e.id; + time_events.erase(it); + event_map.erase(id); + ldout(cct, 30) << __func__ << " process time event: id=" << id << dendl; + processed++; + cb->do_request(id); + } else { + break; + } + } + + return processed; +} + +int EventCenter::process_events(unsigned timeout_microseconds, ceph::timespan *working_dur) +{ + struct timeval tv; + int numevents; + bool trigger_time = false; + auto now = clock_type::now(); + + auto it = time_events.begin(); + bool blocking = pollers.empty() && !external_num_events.load(); + // If exists external events or poller, don't block + if (!blocking) { + if (it != time_events.end() && now >= it->first) + trigger_time = true; + tv.tv_sec = 0; + tv.tv_usec = 0; + } else { + clock_type::time_point shortest; + shortest = now + std::chrono::microseconds(timeout_microseconds); + + if (it != time_events.end() && shortest >= it->first) { + ldout(cct, 30) << __func__ << " shortest is " << shortest << " it->first is " << it->first << dendl; + shortest = it->first; + trigger_time = true; + if (shortest > now) { + timeout_microseconds = std::chrono::duration_cast<std::chrono::microseconds>( + shortest - now).count(); + } else { + shortest = now; + timeout_microseconds = 0; + } + } + tv.tv_sec = timeout_microseconds / 1000000; + tv.tv_usec = timeout_microseconds % 1000000; + } + + ldout(cct, 30) << __func__ << " wait second " << tv.tv_sec << " usec " << tv.tv_usec << dendl; + vector<FiredFileEvent> fired_events; + numevents = driver->event_wait(fired_events, &tv); + auto working_start = ceph::mono_clock::now(); + for (int j = 0; j < numevents; j++) { + int rfired = 0; + FileEvent *event; + EventCallbackRef cb; + event = _get_file_event(fired_events[j].fd); + + /* note the event->mask & mask & ... code: maybe an already processed + * event removed an element that fired and we still didn't + * processed, so we check if the event is still valid. */ + if (event->mask & fired_events[j].mask & EVENT_READABLE) { + rfired = 1; + cb = event->read_cb; + cb->do_request(fired_events[j].fd); + } + + if (event->mask & fired_events[j].mask & EVENT_WRITABLE) { + if (!rfired || event->read_cb != event->write_cb) { + cb = event->write_cb; + cb->do_request(fired_events[j].fd); + } + } + + ldout(cct, 30) << __func__ << " event_wq process is " << fired_events[j].fd << " mask is " << fired_events[j].mask << dendl; + } + + if (trigger_time) + numevents += process_time_events(); + + if (external_num_events.load()) { + external_lock.lock(); + deque<EventCallbackRef> cur_process; + cur_process.swap(external_events); + external_num_events.store(0); + external_lock.unlock(); + numevents += cur_process.size(); + while (!cur_process.empty()) { + EventCallbackRef e = cur_process.front(); + ldout(cct, 30) << __func__ << " do " << e << dendl; + e->do_request(0); + cur_process.pop_front(); + } + } + + if (!numevents && !blocking) { + for (uint32_t i = 0; i < pollers.size(); i++) + numevents += pollers[i]->poll(); + } + + if (working_dur) + *working_dur = ceph::mono_clock::now() - working_start; + return numevents; +} + +void EventCenter::dispatch_event_external(EventCallbackRef e) +{ + uint64_t num = 0; + { + std::lock_guard lock{external_lock}; + if (external_num_events > 0 && *external_events.rbegin() == e) { + return; + } + external_events.push_back(e); + num = ++external_num_events; + } + if (num == 1 && !in_thread()) + wakeup(); + + ldout(cct, 30) << __func__ << " " << e << " pending " << num << dendl; +} diff --git a/src/msg/async/Event.h b/src/msg/async/Event.h new file mode 100644 index 00000000..6736060e --- /dev/null +++ b/src/msg/async/Event.h @@ -0,0 +1,266 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MSG_EVENT_H +#define CEPH_MSG_EVENT_H + +#ifdef __APPLE__ +#include <AvailabilityMacros.h> +#endif + +// We use epoll, kqueue, evport, select in descending order by performance. +#if defined(__linux__) +#define HAVE_EPOLL 1 +#endif + +#if (defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined (__NetBSD__) +#define HAVE_KQUEUE 1 +#endif + +#ifdef __sun +#include <sys/feature_tests.h> +#ifdef _DTRACE_VERSION +#define HAVE_EVPORT 1 +#endif +#endif + +#include <atomic> +#include <mutex> +#include <condition_variable> + +#include "common/ceph_time.h" +#include "common/dout.h" +#include "net_handler.h" + +#define EVENT_NONE 0 +#define EVENT_READABLE 1 +#define EVENT_WRITABLE 2 + +class EventCenter; + +class EventCallback { + + public: + virtual void do_request(uint64_t fd_or_id) = 0; + virtual ~EventCallback() {} // we want a virtual destructor!!! +}; + +typedef EventCallback* EventCallbackRef; + +struct FiredFileEvent { + int fd; + int mask; +}; + +/* + * EventDriver is a wrap of event mechanisms depends on different OS. + * For example, Linux will use epoll(2), BSD will use kqueue(2) and select will + * be used for worst condition. + */ +class EventDriver { + public: + virtual ~EventDriver() {} // we want a virtual destructor!!! + virtual int init(EventCenter *center, int nevent) = 0; + virtual int add_event(int fd, int cur_mask, int mask) = 0; + virtual int del_event(int fd, int cur_mask, int del_mask) = 0; + virtual int event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tp) = 0; + virtual int resize_events(int newsize) = 0; + virtual bool need_wakeup() { return true; } +}; + +/* + * EventCenter maintain a set of file descriptor and handle registered events. + */ +class EventCenter { + public: + // should be enough; + static const int MAX_EVENTCENTER = 24; + + private: + using clock_type = ceph::coarse_mono_clock; + + struct AssociatedCenters { + EventCenter *centers[MAX_EVENTCENTER]; + AssociatedCenters() { + // FIPS zeroization audit 20191115: this memset is not security related. + memset(centers, 0, MAX_EVENTCENTER * sizeof(EventCenter*)); + } + }; + + struct FileEvent { + int mask; + EventCallbackRef read_cb; + EventCallbackRef write_cb; + FileEvent(): mask(0), read_cb(NULL), write_cb(NULL) {} + }; + + struct TimeEvent { + uint64_t id; + EventCallbackRef time_cb; + + TimeEvent(): id(0), time_cb(NULL) {} + }; + + public: + /** + * A Poller object is invoked once each time through the dispatcher's + * inner polling loop. + */ + class Poller { + public: + explicit Poller(EventCenter* center, const string& pollerName); + virtual ~Poller(); + + /** + * This method is defined by a subclass and invoked once by the + * center during each pass through its inner polling loop. + * + * \return + * 1 means that this poller did useful work during this call. + * 0 means that the poller found no work to do. + */ + virtual int poll() = 0; + + private: + /// The EventCenter object that owns this Poller. NULL means the + /// EventCenter has been deleted. + EventCenter* owner; + + /// Human-readable string name given to the poller to make it + /// easy to identify for debugging. For most pollers just passing + /// in the subclass name probably makes sense. + string poller_name; + + /// Index of this Poller in EventCenter::pollers. Allows deletion + /// without having to scan all the entries in pollers. -1 means + /// this poller isn't currently in EventCenter::pollers (happens + /// after EventCenter::reset). + int slot; + }; + + private: + CephContext *cct; + std::string type; + int nevent; + // Used only to external event + pthread_t owner = 0; + std::mutex external_lock; + std::atomic_ulong external_num_events; + deque<EventCallbackRef> external_events; + vector<FileEvent> file_events; + EventDriver *driver; + std::multimap<clock_type::time_point, TimeEvent> time_events; + // Keeps track of all of the pollers currently defined. We don't + // use an intrusive list here because it isn't reentrant: we need + // to add/remove elements while the center is traversing the list. + std::vector<Poller*> pollers; + std::map<uint64_t, std::multimap<clock_type::time_point, TimeEvent>::iterator> event_map; + uint64_t time_event_next_id; + int notify_receive_fd; + int notify_send_fd; + NetHandler net; + EventCallbackRef notify_handler; + unsigned idx; + AssociatedCenters *global_centers = nullptr; + + int process_time_events(); + FileEvent *_get_file_event(int fd) { + ceph_assert(fd < nevent); + return &file_events[fd]; + } + + public: + explicit EventCenter(CephContext *c): + cct(c), nevent(0), + external_num_events(0), + driver(NULL), time_event_next_id(1), + notify_receive_fd(-1), notify_send_fd(-1), net(c), + notify_handler(NULL), idx(0) { } + ~EventCenter(); + ostream& _event_prefix(std::ostream *_dout); + + int init(int nevent, unsigned idx, const std::string &t); + void set_owner(); + pthread_t get_owner() const { return owner; } + unsigned get_id() const { return idx; } + + EventDriver *get_driver() { return driver; } + + // Used by internal thread + int create_file_event(int fd, int mask, EventCallbackRef ctxt); + uint64_t create_time_event(uint64_t milliseconds, EventCallbackRef ctxt); + void delete_file_event(int fd, int mask); + void delete_time_event(uint64_t id); + int process_events(unsigned timeout_microseconds, ceph::timespan *working_dur = nullptr); + void wakeup(); + + // Used by external thread + void dispatch_event_external(EventCallbackRef e); + inline bool in_thread() const { + return pthread_equal(pthread_self(), owner); + } + + private: + template <typename func> + class C_submit_event : public EventCallback { + std::mutex lock; + std::condition_variable cond; + bool done = false; + func f; + bool nonwait; + public: + C_submit_event(func &&_f, bool nw) + : f(std::move(_f)), nonwait(nw) {} + void do_request(uint64_t id) override { + f(); + lock.lock(); + cond.notify_all(); + done = true; + bool del = nonwait; + lock.unlock(); + if (del) + delete this; + } + void wait() { + ceph_assert(!nonwait); + std::unique_lock<std::mutex> l(lock); + while (!done) + cond.wait(l); + } + }; + + public: + template <typename func> + void submit_to(int i, func &&f, bool nowait = false) { + ceph_assert(i < MAX_EVENTCENTER && global_centers); + EventCenter *c = global_centers->centers[i]; + ceph_assert(c); + if (!nowait && c->in_thread()) { + f(); + return ; + } + if (nowait) { + C_submit_event<func> *event = new C_submit_event<func>(std::move(f), true); + c->dispatch_event_external(event); + } else { + C_submit_event<func> event(std::move(f), false); + c->dispatch_event_external(&event); + event.wait(); + } + }; +}; + +#endif diff --git a/src/msg/async/EventEpoll.cc b/src/msg/async/EventEpoll.cc new file mode 100644 index 00000000..37b46973 --- /dev/null +++ b/src/msg/async/EventEpoll.cc @@ -0,0 +1,142 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/errno.h" +#include <fcntl.h> +#include "EventEpoll.h" + +#define dout_subsys ceph_subsys_ms + +#undef dout_prefix +#define dout_prefix *_dout << "EpollDriver." + +int EpollDriver::init(EventCenter *c, int nevent) +{ + events = (struct epoll_event*)malloc(sizeof(struct epoll_event)*nevent); + if (!events) { + lderr(cct) << __func__ << " unable to malloc memory. " << dendl; + return -ENOMEM; + } + memset(events, 0, sizeof(struct epoll_event)*nevent); + + epfd = epoll_create(1024); /* 1024 is just an hint for the kernel */ + if (epfd == -1) { + lderr(cct) << __func__ << " unable to do epoll_create: " + << cpp_strerror(errno) << dendl; + return -errno; + } + if (::fcntl(epfd, F_SETFD, FD_CLOEXEC) == -1) { + int e = errno; + ::close(epfd); + lderr(cct) << __func__ << " unable to set cloexec: " + << cpp_strerror(e) << dendl; + + return -e; + } + + size = nevent; + + return 0; +} + +int EpollDriver::add_event(int fd, int cur_mask, int add_mask) +{ + ldout(cct, 20) << __func__ << " add event fd=" << fd << " cur_mask=" << cur_mask + << " add_mask=" << add_mask << " to " << epfd << dendl; + struct epoll_event ee; + /* If the fd was already monitored for some event, we need a MOD + * operation. Otherwise we need an ADD operation. */ + int op; + op = cur_mask == EVENT_NONE ? EPOLL_CTL_ADD: EPOLL_CTL_MOD; + + ee.events = EPOLLET; + add_mask |= cur_mask; /* Merge old events */ + if (add_mask & EVENT_READABLE) + ee.events |= EPOLLIN; + if (add_mask & EVENT_WRITABLE) + ee.events |= EPOLLOUT; + ee.data.u64 = 0; /* avoid valgrind warning */ + ee.data.fd = fd; + if (epoll_ctl(epfd, op, fd, &ee) == -1) { + lderr(cct) << __func__ << " epoll_ctl: add fd=" << fd << " failed. " + << cpp_strerror(errno) << dendl; + return -errno; + } + + return 0; +} + +int EpollDriver::del_event(int fd, int cur_mask, int delmask) +{ + ldout(cct, 20) << __func__ << " del event fd=" << fd << " cur_mask=" << cur_mask + << " delmask=" << delmask << " to " << epfd << dendl; + struct epoll_event ee; + int mask = cur_mask & (~delmask); + int r = 0; + + ee.events = 0; + if (mask & EVENT_READABLE) ee.events |= EPOLLIN; + if (mask & EVENT_WRITABLE) ee.events |= EPOLLOUT; + ee.data.u64 = 0; /* avoid valgrind warning */ + ee.data.fd = fd; + if (mask != EVENT_NONE) { + if ((r = epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ee)) < 0) { + lderr(cct) << __func__ << " epoll_ctl: modify fd=" << fd << " mask=" << mask + << " failed." << cpp_strerror(errno) << dendl; + return -errno; + } + } else { + /* Note, Kernel < 2.6.9 requires a non null event pointer even for + * EPOLL_CTL_DEL. */ + if ((r = epoll_ctl(epfd, EPOLL_CTL_DEL, fd, &ee)) < 0) { + lderr(cct) << __func__ << " epoll_ctl: delete fd=" << fd + << " failed." << cpp_strerror(errno) << dendl; + return -errno; + } + } + return 0; +} + +int EpollDriver::resize_events(int newsize) +{ + return 0; +} + +int EpollDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp) +{ + int retval, numevents = 0; + + retval = epoll_wait(epfd, events, size, + tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1); + if (retval > 0) { + int j; + + numevents = retval; + fired_events.resize(numevents); + for (j = 0; j < numevents; j++) { + int mask = 0; + struct epoll_event *e = events + j; + + if (e->events & EPOLLIN) mask |= EVENT_READABLE; + if (e->events & EPOLLOUT) mask |= EVENT_WRITABLE; + if (e->events & EPOLLERR) mask |= EVENT_READABLE|EVENT_WRITABLE; + if (e->events & EPOLLHUP) mask |= EVENT_READABLE|EVENT_WRITABLE; + fired_events[j].fd = e->data.fd; + fired_events[j].mask = mask; + } + } + return numevents; +} diff --git a/src/msg/async/EventEpoll.h b/src/msg/async/EventEpoll.h new file mode 100644 index 00000000..abc4b8bb --- /dev/null +++ b/src/msg/async/EventEpoll.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MSG_EVENTEPOLL_H +#define CEPH_MSG_EVENTEPOLL_H + +#include <unistd.h> +#include <sys/epoll.h> + +#include "Event.h" + +class EpollDriver : public EventDriver { + int epfd; + struct epoll_event *events; + CephContext *cct; + int size; + + public: + explicit EpollDriver(CephContext *c): epfd(-1), events(NULL), cct(c), size(0) {} + ~EpollDriver() override { + if (epfd != -1) + close(epfd); + + if (events) + free(events); + } + + int init(EventCenter *c, int nevent) override; + int add_event(int fd, int cur_mask, int add_mask) override; + int del_event(int fd, int cur_mask, int del_mask) override; + int resize_events(int newsize) override; + int event_wait(vector<FiredFileEvent> &fired_events, + struct timeval *tp) override; +}; + +#endif diff --git a/src/msg/async/EventKqueue.cc b/src/msg/async/EventKqueue.cc new file mode 100644 index 00000000..d6ba4a3d --- /dev/null +++ b/src/msg/async/EventKqueue.cc @@ -0,0 +1,267 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/errno.h" +#include "EventKqueue.h" + +#define dout_subsys ceph_subsys_ms + +#undef dout_prefix +#define dout_prefix *_dout << "KqueueDriver." + +#define KEVENT_NOWAIT 0 + +int KqueueDriver::test_kqfd() { + struct kevent ke[1]; + if (kevent(kqfd, ke, 0, NULL, 0, KEVENT_NOWAIT) == -1) { + ldout(cct,0) << __func__ << " invalid kqfd = " << kqfd + << cpp_strerror(errno) << dendl; + return -errno; + } + return kqfd; +} + +int KqueueDriver::restore_events() { + struct kevent ke[2]; + int i; + + ldout(cct,30) << __func__ << " on kqfd = " << kqfd << dendl; + for(i=0;i<size;i++) { + int num = 0; + if (sav_events[i].mask == 0 ) + continue; + ldout(cct,30) << __func__ << " restore kqfd = " << kqfd + << " fd = " << i << " mask " << sav_events[i].mask << dendl; + if (sav_events[i].mask & EVENT_READABLE) + EV_SET(&ke[num++], i, EVFILT_READ, EV_ADD, 0, 0, NULL); + if (sav_events[i].mask & EVENT_WRITABLE) + EV_SET(&ke[num++], i, EVFILT_WRITE, EV_ADD, 0, 0, NULL); + if (num) { + if (kevent(kqfd, ke, num, NULL, 0, KEVENT_NOWAIT) == -1) { + ldout(cct,0) << __func__ << " unable to add event: " + << cpp_strerror(errno) << dendl; + return -errno; + } + } + } + return 0; +} + +int KqueueDriver::test_thread_change(const char* funcname) { + // check to see if we changed thread, because that invalidates + // the kqfd and we need to restore that + int oldkqfd = kqfd; + + if (!pthread_equal(mythread, pthread_self())) { + ldout(cct,20) << funcname << " We changed thread from " << mythread + << " to " << pthread_self() << dendl; + mythread = pthread_self(); + kqfd = -1; + } else if ((kqfd != -1) && (test_kqfd() < 0)) { + // should this ever happen? + // It would be strange to change kqfd with thread change. + // Might nee to change this into an ceph_assert() in the future. + ldout(cct,0) << funcname << " Warning: Recreating old kqfd. " + << "This should not happen!!!" << dendl; + kqfd = -1; + } + if (kqfd == -1) { + kqfd = kqueue(); + ldout(cct,30) << funcname << " kqueue: new kqfd = " << kqfd + << " (was: " << oldkqfd << ")" + << dendl; + if (kqfd < 0) { + lderr(cct) << funcname << " unable to do kqueue: " + << cpp_strerror(errno) << dendl; + return -errno; + } + if (restore_events()< 0) { + lderr(cct) << funcname << " unable restore all events " + << cpp_strerror(errno) << dendl; + return -errno; + } + } + return 0; +} + +int KqueueDriver::init(EventCenter *c, int nevent) +{ + // keep track of possible changes of our thread + // because change of thread kills the kqfd + mythread = pthread_self(); + + // Reserve the space to accept the kevent return events. + res_events = (struct kevent*)malloc(sizeof(struct kevent)*nevent); + if (!res_events) { + lderr(cct) << __func__ << " unable to malloc memory: " + << cpp_strerror(errno) << dendl; + return -ENOMEM; + } + memset(res_events, 0, sizeof(struct kevent)*nevent); + size = nevent; + + // Reserve the space to keep all of the events set, so it can be redone + // when we change trhread ID. + sav_events = (struct SaveEvent*)malloc(sizeof(struct SaveEvent)*nevent); + if (!sav_events) { + lderr(cct) << __func__ << " unable to malloc memory: " + << cpp_strerror(errno) << dendl; + return -ENOMEM; + } + memset(sav_events, 0, sizeof(struct SaveEvent)*nevent); + sav_max = nevent; + + // Delay assigning a descriptor until it is really needed. + // kqfd = kqueue(); + kqfd = -1; + return 0; +} + +int KqueueDriver::add_event(int fd, int cur_mask, int add_mask) +{ + struct kevent ke[2]; + int num = 0; + + ldout(cct,30) << __func__ << " add event kqfd = " << kqfd << " fd = " << fd + << " cur_mask = " << cur_mask << " add_mask = " << add_mask + << dendl; + + int r = test_thread_change(__func__); + if ( r < 0 ) + return r; + + if (add_mask & EVENT_READABLE) + EV_SET(&ke[num++], fd, EVFILT_READ, EV_ADD|EV_CLEAR, 0, 0, NULL); + if (add_mask & EVENT_WRITABLE) + EV_SET(&ke[num++], fd, EVFILT_WRITE, EV_ADD|EV_CLEAR, 0, 0, NULL); + + if (num) { + if (kevent(kqfd, ke, num, NULL, 0, KEVENT_NOWAIT) == -1) { + lderr(cct) << __func__ << " unable to add event: " + << cpp_strerror(errno) << dendl; + return -errno; + } + } + // keep what we set + if (fd >= sav_max) + resize_events(sav_max+5000); + sav_events[fd].mask = cur_mask | add_mask; + return 0; +} + +int KqueueDriver::del_event(int fd, int cur_mask, int del_mask) +{ + struct kevent ke[2]; + int num = 0; + int mask = cur_mask & del_mask; + + ldout(cct,30) << __func__ << " delete event kqfd = " << kqfd + << " fd = " << fd << " cur_mask = " << cur_mask + << " del_mask = " << del_mask << dendl; + + int r = test_thread_change(__func__); + if ( r < 0 ) + return r; + + if (mask & EVENT_READABLE) + EV_SET(&ke[num++], fd, EVFILT_READ, EV_DELETE, 0, 0, NULL); + if (mask & EVENT_WRITABLE) + EV_SET(&ke[num++], fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL); + + if (num) { + int r = 0; + if ((r = kevent(kqfd, ke, num, NULL, 0, KEVENT_NOWAIT)) < 0) { + lderr(cct) << __func__ << " kevent: delete fd=" << fd << " mask=" << mask + << " failed." << cpp_strerror(errno) << dendl; + return -errno; + } + } + // keep the administration + sav_events[fd].mask = cur_mask & ~del_mask; + return 0; +} + +int KqueueDriver::resize_events(int newsize) +{ + ldout(cct,30) << __func__ << " kqfd = " << kqfd << "newsize = " << newsize + << dendl; + if (newsize > sav_max) { + sav_events = (struct SaveEvent*)realloc(sav_events, sizeof(struct SaveEvent)*newsize); + if (!sav_events) { + lderr(cct) << __func__ << " unable to realloc memory: " + << cpp_strerror(errno) << dendl; + ceph_assert(sav_events); + return -ENOMEM; + } + memset(&sav_events[size], 0, sizeof(struct SaveEvent)*(newsize-sav_max)); + sav_max = newsize; + } + return 0; +} + +int KqueueDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp) +{ + int retval, numevents = 0; + struct timespec timeout; + + ldout(cct,10) << __func__ << " kqfd = " << kqfd << dendl; + + int r = test_thread_change(__func__); + if ( r < 0 ) + return r; + + if (tvp != NULL) { + timeout.tv_sec = tvp->tv_sec; + timeout.tv_nsec = tvp->tv_usec * 1000; + ldout(cct,20) << __func__ << " " + << timeout.tv_sec << " sec " + << timeout.tv_nsec << " nsec" + << dendl; + retval = kevent(kqfd, NULL, 0, res_events, size, &timeout); + } else { + ldout(cct,30) << __func__ << " event_wait: " << " NULL" << dendl; + retval = kevent(kqfd, NULL, 0, res_events, size, KEVENT_NOWAIT); + } + + ldout(cct,25) << __func__ << " kevent retval: " << retval << dendl; + if (retval < 0) { + lderr(cct) << __func__ << " kqueue error: " + << cpp_strerror(errno) << dendl; + return -errno; + } else if (retval == 0) { + ldout(cct,5) << __func__ << " Hit timeout(" + << timeout.tv_sec << " sec " + << timeout.tv_nsec << " nsec" + << ")." << dendl; + } else { + int j; + + numevents = retval; + fired_events.resize(numevents); + for (j = 0; j < numevents; j++) { + int mask = 0; + struct kevent *e = res_events + j; + + if (e->filter == EVFILT_READ) mask |= EVENT_READABLE; + if (e->filter == EVFILT_WRITE) mask |= EVENT_WRITABLE; + if (e->flags & EV_ERROR) mask |= EVENT_READABLE|EVENT_WRITABLE; + fired_events[j].fd = (int)e->ident; + fired_events[j].mask = mask; + + } + } + return numevents; +} diff --git a/src/msg/async/EventKqueue.h b/src/msg/async/EventKqueue.h new file mode 100644 index 00000000..24863a93 --- /dev/null +++ b/src/msg/async/EventKqueue.h @@ -0,0 +1,67 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MSG_EVENTKQUEUE_H +#define CEPH_MSG_EVENTKQUEUE_H + +#include <sys/types.h> +#include <sys/event.h> +#include <unistd.h> + +#include "Event.h" + +class KqueueDriver : public EventDriver { + int kqfd; + pthread_t mythread; + struct kevent *res_events; + CephContext *cct; + int size; + + // Keep what we set on the kqfd + struct SaveEvent{ + int fd; + int mask; + }; + struct SaveEvent *sav_events; + int sav_max; + int restore_events(); + int test_kqfd(); + int test_thread_change(const char* funcname); + + public: + explicit KqueueDriver(CephContext *c): kqfd(-1), res_events(NULL), cct(c), + size(0), sav_max(0) {} + virtual ~KqueueDriver() { + if (kqfd != -1) + close(kqfd); + + if (res_events) + free(res_events); + size = 0; + if (sav_events) + free(sav_events); + sav_max = 0; + } + + int init(EventCenter *c, int nevent) override; + int add_event(int fd, int cur_mask, int add_mask) override; + int del_event(int fd, int cur_mask, int del_mask) override; + int resize_events(int newsize) override; + int event_wait(vector<FiredFileEvent> &fired_events, + struct timeval *tp) override; +}; + +#endif diff --git a/src/msg/async/EventSelect.cc b/src/msg/async/EventSelect.cc new file mode 100644 index 00000000..fdee6ebc --- /dev/null +++ b/src/msg/async/EventSelect.cc @@ -0,0 +1,95 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/errno.h" +#include "EventSelect.h" + +#include <unistd.h> +#include <sys/select.h> +#define dout_subsys ceph_subsys_ms + +#undef dout_prefix +#define dout_prefix *_dout << "SelectDriver." + +int SelectDriver::init(EventCenter *c, int nevent) +{ + ldout(cct, 0) << "Select isn't suitable for production env, just avoid " + << "compiling error or special purpose" << dendl; + FD_ZERO(&rfds); + FD_ZERO(&wfds); + max_fd = 0; + return 0; +} + +int SelectDriver::add_event(int fd, int cur_mask, int add_mask) +{ + ldout(cct, 10) << __func__ << " add event to fd=" << fd << " mask=" << add_mask + << dendl; + + int mask = cur_mask | add_mask; + if (mask & EVENT_READABLE) + FD_SET(fd, &rfds); + if (mask & EVENT_WRITABLE) + FD_SET(fd, &wfds); + if (fd > max_fd) + max_fd = fd; + + return 0; +} + +int SelectDriver::del_event(int fd, int cur_mask, int delmask) +{ + ldout(cct, 10) << __func__ << " del event fd=" << fd << " cur mask=" << cur_mask + << dendl; + + if (delmask & EVENT_READABLE) + FD_CLR(fd, &rfds); + if (delmask & EVENT_WRITABLE) + FD_CLR(fd, &wfds); + return 0; +} + +int SelectDriver::resize_events(int newsize) +{ + return 0; +} + +int SelectDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp) +{ + int retval, numevents = 0; + + memcpy(&_rfds, &rfds, sizeof(fd_set)); + memcpy(&_wfds, &wfds, sizeof(fd_set)); + + retval = select(max_fd+1, &_rfds, &_wfds, NULL, tvp); + if (retval > 0) { + for (int j = 0; j <= max_fd; j++) { + int mask = 0; + struct FiredFileEvent fe; + if (FD_ISSET(j, &_rfds)) + mask |= EVENT_READABLE; + if (FD_ISSET(j, &_wfds)) + mask |= EVENT_WRITABLE; + if (mask) { + fe.fd = j; + fe.mask = mask; + fired_events.push_back(fe); + numevents++; + } + } + } + return numevents; +} diff --git a/src/msg/async/EventSelect.h b/src/msg/async/EventSelect.h new file mode 100644 index 00000000..1b75da0b --- /dev/null +++ b/src/msg/async/EventSelect.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MSG_EVENTSELECT_H +#define CEPH_MSG_EVENTSELECT_H + +#include "Event.h" + +class SelectDriver : public EventDriver { + fd_set rfds, wfds; + /* We need to have a copy of the fd sets as it's not safe to reuse + * FD sets after select(). */ + fd_set _rfds, _wfds; + int max_fd; + CephContext *cct; + + public: + explicit SelectDriver(CephContext *c): max_fd(0), cct(c) {} + ~SelectDriver() override {} + + int init(EventCenter *c, int nevent) override; + int add_event(int fd, int cur_mask, int add_mask) override; + int del_event(int fd, int cur_mask, int del_mask) override; + int resize_events(int newsize) override; + int event_wait(vector<FiredFileEvent> &fired_events, + struct timeval *tp) override; +}; + +#endif diff --git a/src/msg/async/PosixStack.cc b/src/msg/async/PosixStack.cc new file mode 100644 index 00000000..e9c8d404 --- /dev/null +++ b/src/msg/async/PosixStack.cc @@ -0,0 +1,293 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 XSKY <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <sys/socket.h> +#include <netinet/tcp.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <errno.h> + +#include <algorithm> + +#include "PosixStack.h" + +#include "include/buffer.h" +#include "include/str_list.h" +#include "common/errno.h" +#include "common/strtol.h" +#include "common/dout.h" +#include "msg/Messenger.h" +#include "include/compat.h" +#include "include/sock_compat.h" + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix *_dout << "PosixStack " + +class PosixConnectedSocketImpl final : public ConnectedSocketImpl { + NetHandler &handler; + int _fd; + entity_addr_t sa; + bool connected; + + public: + explicit PosixConnectedSocketImpl(NetHandler &h, const entity_addr_t &sa, int f, bool connected) + : handler(h), _fd(f), sa(sa), connected(connected) {} + + int is_connected() override { + if (connected) + return 1; + + int r = handler.reconnect(sa, _fd); + if (r == 0) { + connected = true; + return 1; + } else if (r < 0) { + return r; + } else { + return 0; + } + } + + ssize_t zero_copy_read(bufferptr&) override { + return -EOPNOTSUPP; + } + + ssize_t read(char *buf, size_t len) override { + ssize_t r = ::read(_fd, buf, len); + if (r < 0) + r = -errno; + return r; + } + + // return the sent length + // < 0 means error occurred + static ssize_t do_sendmsg(int fd, struct msghdr &msg, unsigned len, bool more) + { + size_t sent = 0; + while (1) { + MSGR_SIGPIPE_STOPPER; + ssize_t r; + r = ::sendmsg(fd, &msg, MSG_NOSIGNAL | (more ? MSG_MORE : 0)); + if (r < 0) { + if (errno == EINTR) { + continue; + } else if (errno == EAGAIN) { + break; + } + return -errno; + } + + sent += r; + if (len == sent) break; + + while (r > 0) { + if (msg.msg_iov[0].iov_len <= (size_t)r) { + // drain this whole item + r -= msg.msg_iov[0].iov_len; + msg.msg_iov++; + msg.msg_iovlen--; + } else { + msg.msg_iov[0].iov_base = (char *)msg.msg_iov[0].iov_base + r; + msg.msg_iov[0].iov_len -= r; + break; + } + } + } + return (ssize_t)sent; + } + + ssize_t send(bufferlist &bl, bool more) override { + size_t sent_bytes = 0; + auto pb = std::cbegin(bl.buffers()); + uint64_t left_pbrs = std::size(bl.buffers()); + while (left_pbrs) { + struct msghdr msg; + struct iovec msgvec[IOV_MAX]; + uint64_t size = std::min<uint64_t>(left_pbrs, IOV_MAX); + left_pbrs -= size; + // FIPS zeroization audit 20191115: this memset is not security related. + memset(&msg, 0, sizeof(msg)); + msg.msg_iovlen = size; + msg.msg_iov = msgvec; + unsigned msglen = 0; + for (auto iov = msgvec; iov != msgvec + size; iov++) { + iov->iov_base = (void*)(pb->c_str()); + iov->iov_len = pb->length(); + msglen += pb->length(); + ++pb; + } + ssize_t r = do_sendmsg(_fd, msg, msglen, left_pbrs || more); + if (r < 0) + return r; + + // "r" is the remaining length + sent_bytes += r; + if (static_cast<unsigned>(r) < msglen) + break; + // only "r" == 0 continue + } + + if (sent_bytes) { + bufferlist swapped; + if (sent_bytes < bl.length()) { + bl.splice(sent_bytes, bl.length()-sent_bytes, &swapped); + bl.swap(swapped); + } else { + bl.clear(); + } + } + + return static_cast<ssize_t>(sent_bytes); + } + void shutdown() override { + ::shutdown(_fd, SHUT_RDWR); + } + void close() override { + ::close(_fd); + } + int fd() const override { + return _fd; + } + int socket_fd() const override { + return _fd; + } + friend class PosixServerSocketImpl; + friend class PosixNetworkStack; +}; + +class PosixServerSocketImpl : public ServerSocketImpl { + NetHandler &handler; + int _fd; + + public: + explicit PosixServerSocketImpl(NetHandler &h, int f, + const entity_addr_t& listen_addr, unsigned slot) + : ServerSocketImpl(listen_addr.get_type(), slot), + handler(h), _fd(f) {} + int accept(ConnectedSocket *sock, const SocketOptions &opts, entity_addr_t *out, Worker *w) override; + void abort_accept() override { + ::close(_fd); + } + int fd() const override { + return _fd; + } +}; + +int PosixServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w) { + ceph_assert(sock); + sockaddr_storage ss; + socklen_t slen = sizeof(ss); + int sd = accept_cloexec(_fd, (sockaddr*)&ss, &slen); + if (sd < 0) { + return -errno; + } + + int r = handler.set_nonblock(sd); + if (r < 0) { + ::close(sd); + return -errno; + } + + r = handler.set_socket_options(sd, opt.nodelay, opt.rcbuf_size); + if (r < 0) { + ::close(sd); + return -errno; + } + + ceph_assert(NULL != out); //out should not be NULL in accept connection + + out->set_type(addr_type); + out->set_sockaddr((sockaddr*)&ss); + handler.set_priority(sd, opt.priority, out->get_family()); + + std::unique_ptr<PosixConnectedSocketImpl> csi(new PosixConnectedSocketImpl(handler, *out, sd, true)); + *sock = ConnectedSocket(std::move(csi)); + return 0; +} + +void PosixWorker::initialize() +{ +} + +int PosixWorker::listen(entity_addr_t &sa, + unsigned addr_slot, + const SocketOptions &opt, + ServerSocket *sock) +{ + int listen_sd = net.create_socket(sa.get_family(), true); + if (listen_sd < 0) { + return -errno; + } + + int r = net.set_nonblock(listen_sd); + if (r < 0) { + ::close(listen_sd); + return -errno; + } + + r = net.set_socket_options(listen_sd, opt.nodelay, opt.rcbuf_size); + if (r < 0) { + ::close(listen_sd); + return -errno; + } + + r = ::bind(listen_sd, sa.get_sockaddr(), sa.get_sockaddr_len()); + if (r < 0) { + r = -errno; + ldout(cct, 10) << __func__ << " unable to bind to " << sa.get_sockaddr() + << ": " << cpp_strerror(r) << dendl; + ::close(listen_sd); + return r; + } + + r = ::listen(listen_sd, cct->_conf->ms_tcp_listen_backlog); + if (r < 0) { + r = -errno; + lderr(cct) << __func__ << " unable to listen on " << sa << ": " << cpp_strerror(r) << dendl; + ::close(listen_sd); + return r; + } + + *sock = ServerSocket( + std::unique_ptr<PosixServerSocketImpl>( + new PosixServerSocketImpl(net, listen_sd, sa, addr_slot))); + return 0; +} + +int PosixWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) { + int sd; + + if (opts.nonblock) { + sd = net.nonblock_connect(addr, opts.connect_bind_addr); + } else { + sd = net.connect(addr, opts.connect_bind_addr); + } + + if (sd < 0) { + return -errno; + } + + net.set_priority(sd, opts.priority, addr.get_family()); + *socket = ConnectedSocket( + std::unique_ptr<PosixConnectedSocketImpl>(new PosixConnectedSocketImpl(net, addr, sd, !opts.nonblock))); + return 0; +} + +PosixNetworkStack::PosixNetworkStack(CephContext *c, const string &t) + : NetworkStack(c, t) +{ +} diff --git a/src/msg/async/PosixStack.h b/src/msg/async/PosixStack.h new file mode 100644 index 00000000..f1aaccd4 --- /dev/null +++ b/src/msg/async/PosixStack.h @@ -0,0 +1,56 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 XSKY <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MSG_ASYNC_POSIXSTACK_H +#define CEPH_MSG_ASYNC_POSIXSTACK_H + +#include <thread> + +#include "msg/msg_types.h" +#include "msg/async/net_handler.h" + +#include "Stack.h" + +class PosixWorker : public Worker { + NetHandler net; + void initialize() override; + public: + PosixWorker(CephContext *c, unsigned i) + : Worker(c, i), net(c) {} + int listen(entity_addr_t &sa, + unsigned addr_slot, + const SocketOptions &opt, + ServerSocket *socks) override; + int connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) override; +}; + +class PosixNetworkStack : public NetworkStack { + vector<std::thread> threads; + + public: + explicit PosixNetworkStack(CephContext *c, const string &t); + + void spawn_worker(unsigned i, std::function<void ()> &&func) override { + threads.resize(i+1); + threads[i] = std::thread(func); + } + void join_worker(unsigned i) override { + ceph_assert(threads.size() > i && threads[i].joinable()); + threads[i].join(); + } +}; + +#endif //CEPH_MSG_ASYNC_POSIXSTACK_H diff --git a/src/msg/async/Protocol.cc b/src/msg/async/Protocol.cc new file mode 100644 index 00000000..4bdc065e --- /dev/null +++ b/src/msg/async/Protocol.cc @@ -0,0 +1,14 @@ +#include "Protocol.h" + +#include "AsyncConnection.h" +#include "AsyncMessenger.h" + +Protocol::Protocol(int type, AsyncConnection *connection) + : proto_type(type), + connection(connection), + messenger(connection->async_msgr), + cct(connection->async_msgr->cct) { + auth_meta.reset(new AuthConnectionMeta()); +} + +Protocol::~Protocol() {} diff --git a/src/msg/async/Protocol.h b/src/msg/async/Protocol.h new file mode 100644 index 00000000..cccba183 --- /dev/null +++ b/src/msg/async/Protocol.h @@ -0,0 +1,140 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef _MSG_ASYNC_PROTOCOL_ +#define _MSG_ASYNC_PROTOCOL_ + +#include <list> +#include <map> + +#include "AsyncConnection.h" +#include "include/buffer.h" +#include "include/msgr.h" + +/* + * Continuation Helper Classes + */ + +#include <memory> +#include <tuple> + +template <class C> +class Ct { +public: + virtual ~Ct() {} + virtual Ct<C> *call(C *foo) const = 0; +}; + +template <class C, typename... Args> +class CtFun : public Ct<C> { +private: + using fn_t = Ct<C> *(C::*)(Args...); + fn_t _f; + std::tuple<Args...> _params; + + template <std::size_t... Is> + inline Ct<C> *_call(C *foo, std::index_sequence<Is...>) const { + return (foo->*_f)(std::get<Is>(_params)...); + } + +public: + CtFun(fn_t f) : _f(f) {} + + inline void setParams(Args... args) { _params = std::make_tuple(args...); } + inline Ct<C> *call(C *foo) const override { + return _call(foo, std::index_sequence_for<Args...>()); + } +}; + +using rx_buffer_t = + std::unique_ptr<buffer::ptr_node, buffer::ptr_node::disposer>; + +template <class C> +class CtRxNode : public Ct<C> { + using fn_t = Ct<C> *(C::*)(rx_buffer_t&&, int r); + fn_t _f; + +public: + mutable rx_buffer_t node; + int r; + + CtRxNode(fn_t f) : _f(f) {} + void setParams(rx_buffer_t &&node, int r) { + this->node = std::move(node); + this->r = r; + } + inline Ct<C> *call(C *foo) const override { + return (foo->*_f)(std::move(node), r); + } +}; + +template <class C> using CONTINUATION_TYPE = CtFun<C>; +template <class C> using CONTINUATION_TX_TYPE = CtFun<C, int>; +template <class C> using CONTINUATION_RX_TYPE = CtFun<C, char*, int>; +template <class C> using CONTINUATION_RXBPTR_TYPE = CtRxNode<C>; + +#define CONTINUATION_DECL(C, F, ...) \ + CtFun<C, ##__VA_ARGS__> F##_cont { (&C::F) }; + +#define CONTINUATION(F) F##_cont +#define CONTINUE(F, ...) (F##_cont.setParams(__VA_ARGS__), &F##_cont) + +#define CONTINUATION_RUN(CT) \ + { \ + Ct<std::remove_reference<decltype(*this)>::type> *_cont = &CT;\ + do { \ + _cont = _cont->call(this); \ + } while (_cont); \ + } + +#define READ_HANDLER_CONTINUATION_DECL(C, F) \ + CONTINUATION_DECL(C, F, char *, int) + +#define READ_BPTR_HANDLER_CONTINUATION_DECL(C, F) \ + CtRxNode<C> F##_cont { (&C::F) }; + +#define WRITE_HANDLER_CONTINUATION_DECL(C, F) CONTINUATION_DECL(C, F, int) + +////////////////////////////////////////////////////////////////////// + +class AsyncMessenger; + +class Protocol { +public: + const int proto_type; +protected: + AsyncConnection *connection; + AsyncMessenger *messenger; + CephContext *cct; +public: + std::shared_ptr<AuthConnectionMeta> auth_meta; + +public: + Protocol(int type, AsyncConnection *connection); + virtual ~Protocol(); + + // prepare protocol for connecting to peer + virtual void connect() = 0; + // prepare protocol for accepting peer connections + virtual void accept() = 0; + // true -> protocol is ready for sending messages + virtual bool is_connected() = 0; + // stop connection + virtual void stop() = 0; + // signal and handle connection failure + virtual void fault() = 0; + // send message + virtual void send_message(Message *m) = 0; + // send keepalive + virtual void send_keepalive() = 0; + + virtual void read_event() = 0; + virtual void write_event() = 0; + virtual bool is_queued() = 0; + + int get_con_mode() const { + return auth_meta->con_mode; + } +}; + +#endif /* _MSG_ASYNC_PROTOCOL_ */ diff --git a/src/msg/async/ProtocolV1.cc b/src/msg/async/ProtocolV1.cc new file mode 100644 index 00000000..9a7ab9d4 --- /dev/null +++ b/src/msg/async/ProtocolV1.cc @@ -0,0 +1,2547 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include "ProtocolV1.h" + +#include "common/errno.h" + +#include "AsyncConnection.h" +#include "AsyncMessenger.h" +#include "common/EventTrace.h" +#include "include/random.h" + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix _conn_prefix(_dout) +ostream &ProtocolV1::_conn_prefix(std::ostream *_dout) { + return *_dout << "--1- " << messenger->get_myaddrs() << " >> " + << *connection->peer_addrs + << " conn(" + << connection << " " << this + << " :" << connection->port << " s=" << get_state_name(state) + << " pgs=" << peer_global_seq << " cs=" << connect_seq + << " l=" << connection->policy.lossy << ")."; +} + +#define WRITE(B, C) write(CONTINUATION(C), B) + +#define READ(L, C) read(CONTINUATION(C), L) + +#define READB(L, B, C) read(CONTINUATION(C), L, B) + +// Constant to limit starting sequence number to 2^31. Nothing special about +// it, just a big number. PLR +#define SEQ_MASK 0x7fffffff + +const int ASYNC_COALESCE_THRESHOLD = 256; + +using namespace std; + +static void alloc_aligned_buffer(bufferlist &data, unsigned len, unsigned off) { + // create a buffer to read into that matches the data alignment + unsigned alloc_len = 0; + unsigned left = len; + unsigned head = 0; + if (off & ~CEPH_PAGE_MASK) { + // head + alloc_len += CEPH_PAGE_SIZE; + head = std::min<uint64_t>(CEPH_PAGE_SIZE - (off & ~CEPH_PAGE_MASK), left); + left -= head; + } + alloc_len += left; + bufferptr ptr(buffer::create_small_page_aligned(alloc_len)); + if (head) ptr.set_offset(CEPH_PAGE_SIZE - head); + data.push_back(std::move(ptr)); +} + +/** + * Protocol V1 + **/ + +ProtocolV1::ProtocolV1(AsyncConnection *connection) + : Protocol(1, connection), + temp_buffer(nullptr), + can_write(WriteStatus::NOWRITE), + keepalive(false), + connect_seq(0), + peer_global_seq(0), + msg_left(0), + cur_msg_size(0), + replacing(false), + is_reset_from_peer(false), + once_ready(false), + state(NONE), + global_seq(0), + authorizer(nullptr), + wait_for_seq(false) { + temp_buffer = new char[4096]; +} + +ProtocolV1::~ProtocolV1() { + ceph_assert(out_q.empty()); + ceph_assert(sent.empty()); + + delete[] temp_buffer; + + if (authorizer) { + delete authorizer; + } +} + +void ProtocolV1::connect() { + this->state = START_CONNECT; + + // reset connect state variables + if (authorizer) { + delete authorizer; + authorizer = nullptr; + } + authorizer_buf.clear(); + // FIPS zeroization audit 20191115: these memsets are not security related. + memset(&connect_msg, 0, sizeof(connect_msg)); + memset(&connect_reply, 0, sizeof(connect_reply)); + + global_seq = messenger->get_global_seq(); +} + +void ProtocolV1::accept() { this->state = START_ACCEPT; } + +bool ProtocolV1::is_connected() { + return can_write.load() == WriteStatus::CANWRITE; +} + +void ProtocolV1::stop() { + ldout(cct, 20) << __func__ << dendl; + if (state == CLOSED) { + return; + } + + if (connection->delay_state) connection->delay_state->flush(); + + ldout(cct, 2) << __func__ << dendl; + std::lock_guard<std::mutex> l(connection->write_lock); + + reset_recv_state(); + discard_out_queue(); + + connection->_stop(); + + can_write = WriteStatus::CLOSED; + state = CLOSED; +} + +void ProtocolV1::fault() { + ldout(cct, 20) << __func__ << dendl; + + if (state == CLOSED || state == NONE) { + ldout(cct, 10) << __func__ << " connection is already closed" << dendl; + return; + } + + if (connection->policy.lossy && state != START_CONNECT && + state != CONNECTING) { + ldout(cct, 1) << __func__ << " on lossy channel, failing" << dendl; + stop(); + connection->dispatch_queue->queue_reset(connection); + return; + } + + connection->write_lock.lock(); + can_write = WriteStatus::NOWRITE; + is_reset_from_peer = false; + + // requeue sent items + requeue_sent(); + + if (!once_ready && out_q.empty() && state >= START_ACCEPT && + state <= ACCEPTING_WAIT_CONNECT_MSG_AUTH && !replacing) { + ldout(cct, 10) << __func__ << " with nothing to send and in the half " + << " accept state just closed" << dendl; + connection->write_lock.unlock(); + stop(); + connection->dispatch_queue->queue_reset(connection); + return; + } + replacing = false; + + connection->fault(); + + reset_recv_state(); + + if (connection->policy.standby && out_q.empty() && !keepalive && + state != WAIT) { + ldout(cct, 10) << __func__ << " with nothing to send, going to standby" + << dendl; + state = STANDBY; + connection->write_lock.unlock(); + return; + } + + connection->write_lock.unlock(); + + if ((state >= START_CONNECT && state <= CONNECTING_SEND_CONNECT_MSG) || + state == WAIT) { + // backoff! + if (state == WAIT) { + backoff.set_from_double(cct->_conf->ms_max_backoff); + } else if (backoff == utime_t()) { + backoff.set_from_double(cct->_conf->ms_initial_backoff); + } else { + backoff += backoff; + if (backoff > cct->_conf->ms_max_backoff) + backoff.set_from_double(cct->_conf->ms_max_backoff); + } + + global_seq = messenger->get_global_seq(); + state = START_CONNECT; + connection->state = AsyncConnection::STATE_CONNECTING; + ldout(cct, 10) << __func__ << " waiting " << backoff << dendl; + // woke up again; + connection->register_time_events.insert( + connection->center->create_time_event(backoff.to_nsec() / 1000, + connection->wakeup_handler)); + } else { + // policy maybe empty when state is in accept + if (connection->policy.server) { + ldout(cct, 0) << __func__ << " server, going to standby" << dendl; + state = STANDBY; + } else { + ldout(cct, 0) << __func__ << " initiating reconnect" << dendl; + connect_seq++; + global_seq = messenger->get_global_seq(); + state = START_CONNECT; + connection->state = AsyncConnection::STATE_CONNECTING; + } + backoff = utime_t(); + connection->center->dispatch_event_external(connection->read_handler); + } +} + +void ProtocolV1::send_message(Message *m) { + bufferlist bl; + uint64_t f = connection->get_features(); + + // TODO: Currently not all messages supports reencode like MOSDMap, so here + // only let fast dispatch support messages prepare message + bool can_fast_prepare = messenger->ms_can_fast_dispatch(m); + if (can_fast_prepare) { + prepare_send_message(f, m, bl); + } + + std::lock_guard<std::mutex> l(connection->write_lock); + // "features" changes will change the payload encoding + if (can_fast_prepare && + (can_write == WriteStatus::NOWRITE || connection->get_features() != f)) { + // ensure the correctness of message encoding + bl.clear(); + m->clear_payload(); + ldout(cct, 5) << __func__ << " clear encoded buffer previous " << f + << " != " << connection->get_features() << dendl; + } + if (can_write == WriteStatus::CLOSED) { + ldout(cct, 10) << __func__ << " connection closed." + << " Drop message " << m << dendl; + m->put(); + } else { + m->trace.event("async enqueueing message"); + out_q[m->get_priority()].emplace_back(std::move(bl), m); + ldout(cct, 15) << __func__ << " inline write is denied, reschedule m=" << m + << dendl; + if (can_write != WriteStatus::REPLACING && !write_in_progress) { + write_in_progress = true; + connection->center->dispatch_event_external(connection->write_handler); + } + } +} + +void ProtocolV1::prepare_send_message(uint64_t features, Message *m, + bufferlist &bl) { + ldout(cct, 20) << __func__ << " m " << *m << dendl; + + // associate message with Connection (for benefit of encode_payload) + if (m->empty_payload()) { + ldout(cct, 20) << __func__ << " encoding features " << features << " " << m + << " " << *m << dendl; + } else { + ldout(cct, 20) << __func__ << " half-reencoding features " << features + << " " << m << " " << *m << dendl; + } + + // encode and copy out of *m + m->encode(features, messenger->crcflags); + + bl.append(m->get_payload()); + bl.append(m->get_middle()); + bl.append(m->get_data()); +} + +void ProtocolV1::send_keepalive() { + ldout(cct, 10) << __func__ << dendl; + std::lock_guard<std::mutex> l(connection->write_lock); + if (can_write != WriteStatus::CLOSED) { + keepalive = true; + connection->center->dispatch_event_external(connection->write_handler); + } +} + +void ProtocolV1::read_event() { + ldout(cct, 20) << __func__ << dendl; + switch (state) { + case START_CONNECT: + CONTINUATION_RUN(CONTINUATION(send_client_banner)); + break; + case START_ACCEPT: + CONTINUATION_RUN(CONTINUATION(send_server_banner)); + break; + case OPENED: + CONTINUATION_RUN(CONTINUATION(wait_message)); + break; + case THROTTLE_MESSAGE: + CONTINUATION_RUN(CONTINUATION(throttle_message)); + break; + case THROTTLE_BYTES: + CONTINUATION_RUN(CONTINUATION(throttle_bytes)); + break; + case THROTTLE_DISPATCH_QUEUE: + CONTINUATION_RUN(CONTINUATION(throttle_dispatch_queue)); + break; + default: + break; + } +} + +void ProtocolV1::write_event() { + ldout(cct, 10) << __func__ << dendl; + ssize_t r = 0; + + connection->write_lock.lock(); + if (can_write == WriteStatus::CANWRITE) { + if (keepalive) { + append_keepalive_or_ack(); + keepalive = false; + } + + auto start = ceph::mono_clock::now(); + bool more; + do { + bufferlist data; + Message *m = _get_next_outgoing(&data); + if (!m) { + break; + } + + if (!connection->policy.lossy) { + // put on sent list + sent.push_back(m); + m->get(); + } + more = !out_q.empty(); + connection->write_lock.unlock(); + + // send_message or requeue messages may not encode message + if (!data.length()) { + prepare_send_message(connection->get_features(), m, data); + } + + r = write_message(m, data, more); + + connection->write_lock.lock(); + if (r == 0) { + ; + } else if (r < 0) { + ldout(cct, 1) << __func__ << " send msg failed" << dendl; + break; + } else if (r > 0) + break; + } while (can_write == WriteStatus::CANWRITE); + write_in_progress = false; + connection->write_lock.unlock(); + + // if r > 0 mean data still lefted, so no need _try_send. + if (r == 0) { + uint64_t left = ack_left; + if (left) { + ceph_le64 s; + s = in_seq; + connection->outgoing_bl.append(CEPH_MSGR_TAG_ACK); + connection->outgoing_bl.append((char *)&s, sizeof(s)); + ldout(cct, 10) << __func__ << " try send msg ack, acked " << left + << " messages" << dendl; + ack_left -= left; + left = ack_left; + r = connection->_try_send(left); + } else if (is_queued()) { + r = connection->_try_send(); + } + } + + connection->logger->tinc(l_msgr_running_send_time, + ceph::mono_clock::now() - start); + if (r < 0) { + ldout(cct, 1) << __func__ << " send msg failed" << dendl; + connection->lock.lock(); + fault(); + connection->lock.unlock(); + return; + } + } else { + write_in_progress = false; + connection->write_lock.unlock(); + connection->lock.lock(); + connection->write_lock.lock(); + if (state == STANDBY && !connection->policy.server && is_queued()) { + ldout(cct, 10) << __func__ << " policy.server is false" << dendl; + connection->_connect(); + } else if (connection->cs && state != NONE && state != CLOSED && + state != START_CONNECT) { + r = connection->_try_send(); + if (r < 0) { + ldout(cct, 1) << __func__ << " send outcoming bl failed" << dendl; + connection->write_lock.unlock(); + fault(); + connection->lock.unlock(); + return; + } + } + connection->write_lock.unlock(); + connection->lock.unlock(); + } +} + +bool ProtocolV1::is_queued() { + return !out_q.empty() || connection->is_queued(); +} + +void ProtocolV1::run_continuation(CtPtr pcontinuation) { + if (pcontinuation) { + CONTINUATION_RUN(*pcontinuation); + } +} + +CtPtr ProtocolV1::read(CONTINUATION_RX_TYPE<ProtocolV1> &next, + int len, char *buffer) { + if (!buffer) { + buffer = temp_buffer; + } + ssize_t r = connection->read(len, buffer, + [&next, this](char *buffer, int r) { + next.setParams(buffer, r); + CONTINUATION_RUN(next); + }); + if (r <= 0) { + next.setParams(buffer, r); + return &next; + } + + return nullptr; +} + +CtPtr ProtocolV1::write(CONTINUATION_TX_TYPE<ProtocolV1> &next, + bufferlist &buffer) { + ssize_t r = connection->write(buffer, [&next, this](int r) { + next.setParams(r); + CONTINUATION_RUN(next); + }); + if (r <= 0) { + next.setParams(r); + return &next; + } + + return nullptr; +} + +CtPtr ProtocolV1::ready() { + ldout(cct, 25) << __func__ << dendl; + + // make sure no pending tick timer + if (connection->last_tick_id) { + connection->center->delete_time_event(connection->last_tick_id); + } + connection->last_tick_id = connection->center->create_time_event( + connection->inactive_timeout_us, connection->tick_handler); + + connection->write_lock.lock(); + can_write = WriteStatus::CANWRITE; + if (is_queued()) { + connection->center->dispatch_event_external(connection->write_handler); + } + connection->write_lock.unlock(); + connection->maybe_start_delay_thread(); + + state = OPENED; + return wait_message(); +} + +CtPtr ProtocolV1::wait_message() { + if (state != OPENED) { // must have changed due to a replace + return nullptr; + } + + ldout(cct, 20) << __func__ << dendl; + + return READ(sizeof(char), handle_message); +} + +CtPtr ProtocolV1::handle_message(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read tag failed" << dendl; + return _fault(); + } + + char tag = buffer[0]; + ldout(cct, 20) << __func__ << " process tag " << (int)tag << dendl; + + if (tag == CEPH_MSGR_TAG_KEEPALIVE) { + ldout(cct, 20) << __func__ << " got KEEPALIVE" << dendl; + connection->set_last_keepalive(ceph_clock_now()); + } else if (tag == CEPH_MSGR_TAG_KEEPALIVE2) { + return READ(sizeof(ceph_timespec), handle_keepalive2); + } else if (tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) { + return READ(sizeof(ceph_timespec), handle_keepalive2_ack); + } else if (tag == CEPH_MSGR_TAG_ACK) { + return READ(sizeof(ceph_le64), handle_tag_ack); + } else if (tag == CEPH_MSGR_TAG_MSG) { +#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE) + ltt_recv_stamp = ceph_clock_now(); +#endif + recv_stamp = ceph_clock_now(); + ldout(cct, 20) << __func__ << " begin MSG" << dendl; + return READ(sizeof(ceph_msg_header), handle_message_header); + } else if (tag == CEPH_MSGR_TAG_CLOSE) { + ldout(cct, 20) << __func__ << " got CLOSE" << dendl; + stop(); + } else { + ldout(cct, 0) << __func__ << " bad tag " << (int)tag << dendl; + return _fault(); + } + return nullptr; +} + +CtPtr ProtocolV1::handle_keepalive2(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read keeplive timespec failed" << dendl; + return _fault(); + } + + ldout(cct, 30) << __func__ << " got KEEPALIVE2 tag ..." << dendl; + + ceph_timespec *t; + t = (ceph_timespec *)buffer; + utime_t kp_t = utime_t(*t); + connection->write_lock.lock(); + append_keepalive_or_ack(true, &kp_t); + connection->write_lock.unlock(); + + ldout(cct, 20) << __func__ << " got KEEPALIVE2 " << kp_t << dendl; + connection->set_last_keepalive(ceph_clock_now()); + + if (is_connected()) { + connection->center->dispatch_event_external(connection->write_handler); + } + + return CONTINUE(wait_message); +} + +void ProtocolV1::append_keepalive_or_ack(bool ack, utime_t *tp) { + ldout(cct, 10) << __func__ << dendl; + if (ack) { + ceph_assert(tp); + struct ceph_timespec ts; + tp->encode_timeval(&ts); + connection->outgoing_bl.append(CEPH_MSGR_TAG_KEEPALIVE2_ACK); + connection->outgoing_bl.append((char *)&ts, sizeof(ts)); + } else if (connection->has_feature(CEPH_FEATURE_MSGR_KEEPALIVE2)) { + struct ceph_timespec ts; + utime_t t = ceph_clock_now(); + t.encode_timeval(&ts); + connection->outgoing_bl.append(CEPH_MSGR_TAG_KEEPALIVE2); + connection->outgoing_bl.append((char *)&ts, sizeof(ts)); + } else { + connection->outgoing_bl.append(CEPH_MSGR_TAG_KEEPALIVE); + } +} + +CtPtr ProtocolV1::handle_keepalive2_ack(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read keeplive timespec failed" << dendl; + return _fault(); + } + + ceph_timespec *t; + t = (ceph_timespec *)buffer; + connection->set_last_keepalive_ack(utime_t(*t)); + ldout(cct, 20) << __func__ << " got KEEPALIVE_ACK" << dendl; + + return CONTINUE(wait_message); +} + +CtPtr ProtocolV1::handle_tag_ack(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read ack seq failed" << dendl; + return _fault(); + } + + ceph_le64 seq; + seq = *(ceph_le64 *)buffer; + ldout(cct, 20) << __func__ << " got ACK" << dendl; + + ldout(cct, 15) << __func__ << " got ack seq " << seq << dendl; + // trim sent list + static const int max_pending = 128; + int i = 0; + Message *pending[max_pending]; + connection->write_lock.lock(); + while (!sent.empty() && sent.front()->get_seq() <= seq && i < max_pending) { + Message *m = sent.front(); + sent.pop_front(); + pending[i++] = m; + ldout(cct, 10) << __func__ << " got ack seq " << seq + << " >= " << m->get_seq() << " on " << m << " " << *m + << dendl; + } + connection->write_lock.unlock(); + for (int k = 0; k < i; k++) { + pending[k]->put(); + } + + return CONTINUE(wait_message); +} + +CtPtr ProtocolV1::handle_message_header(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read message header failed" << dendl; + return _fault(); + } + + ldout(cct, 20) << __func__ << " got MSG header" << dendl; + + current_header = *((ceph_msg_header *)buffer); + + ldout(cct, 20) << __func__ << " got envelope type=" << current_header.type << " src " + << entity_name_t(current_header.src) << " front=" << current_header.front_len + << " data=" << current_header.data_len << " off " << current_header.data_off + << dendl; + + if (messenger->crcflags & MSG_CRC_HEADER) { + __u32 header_crc = 0; + header_crc = ceph_crc32c(0, (unsigned char *)¤t_header, + sizeof(current_header) - sizeof(current_header.crc)); + // verify header crc + if (header_crc != current_header.crc) { + ldout(cct, 0) << __func__ << " got bad header crc " << header_crc + << " != " << current_header.crc << dendl; + return _fault(); + } + } + + // Reset state + data_buf.clear(); + front.clear(); + middle.clear(); + data.clear(); + + state = THROTTLE_MESSAGE; + return CONTINUE(throttle_message); +} + +CtPtr ProtocolV1::throttle_message() { + ldout(cct, 20) << __func__ << dendl; + + if (connection->policy.throttler_messages) { + ldout(cct, 10) << __func__ << " wants " << 1 + << " message from policy throttler " + << connection->policy.throttler_messages->get_current() + << "/" << connection->policy.throttler_messages->get_max() + << dendl; + if (!connection->policy.throttler_messages->get_or_fail()) { + ldout(cct, 10) << __func__ << " wants 1 message from policy throttle " + << connection->policy.throttler_messages->get_current() + << "/" << connection->policy.throttler_messages->get_max() + << " failed, just wait." << dendl; + // following thread pool deal with th full message queue isn't a + // short time, so we can wait a ms. + if (connection->register_time_events.empty()) { + connection->register_time_events.insert( + connection->center->create_time_event(1000, + connection->wakeup_handler)); + } + return nullptr; + } + } + + state = THROTTLE_BYTES; + return CONTINUE(throttle_bytes); +} + +CtPtr ProtocolV1::throttle_bytes() { + ldout(cct, 20) << __func__ << dendl; + + cur_msg_size = current_header.front_len + current_header.middle_len + + current_header.data_len; + if (cur_msg_size) { + if (connection->policy.throttler_bytes) { + ldout(cct, 10) << __func__ << " wants " << cur_msg_size + << " bytes from policy throttler " + << connection->policy.throttler_bytes->get_current() << "/" + << connection->policy.throttler_bytes->get_max() << dendl; + if (!connection->policy.throttler_bytes->get_or_fail(cur_msg_size)) { + ldout(cct, 10) << __func__ << " wants " << cur_msg_size + << " bytes from policy throttler " + << connection->policy.throttler_bytes->get_current() + << "/" << connection->policy.throttler_bytes->get_max() + << " failed, just wait." << dendl; + // following thread pool deal with th full message queue isn't a + // short time, so we can wait a ms. + if (connection->register_time_events.empty()) { + connection->register_time_events.insert( + connection->center->create_time_event( + 1000, connection->wakeup_handler)); + } + return nullptr; + } + } + } + + state = THROTTLE_DISPATCH_QUEUE; + return CONTINUE(throttle_dispatch_queue); +} + +CtPtr ProtocolV1::throttle_dispatch_queue() { + ldout(cct, 20) << __func__ << dendl; + + if (cur_msg_size) { + if (!connection->dispatch_queue->dispatch_throttler.get_or_fail( + cur_msg_size)) { + ldout(cct, 10) + << __func__ << " wants " << cur_msg_size + << " bytes from dispatch throttle " + << connection->dispatch_queue->dispatch_throttler.get_current() << "/" + << connection->dispatch_queue->dispatch_throttler.get_max() + << " failed, just wait." << dendl; + // following thread pool deal with th full message queue isn't a + // short time, so we can wait a ms. + if (connection->register_time_events.empty()) { + connection->register_time_events.insert( + connection->center->create_time_event(1000, + connection->wakeup_handler)); + } + return nullptr; + } + } + + throttle_stamp = ceph_clock_now(); + + state = READ_MESSAGE_FRONT; + return read_message_front(); +} + +CtPtr ProtocolV1::read_message_front() { + ldout(cct, 20) << __func__ << dendl; + + unsigned front_len = current_header.front_len; + if (front_len) { + if (!front.length()) { + front.push_back(buffer::create(front_len)); + } + return READB(front_len, front.c_str(), handle_message_front); + } + return read_message_middle(); +} + +CtPtr ProtocolV1::handle_message_front(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read message front failed" << dendl; + return _fault(); + } + + ldout(cct, 20) << __func__ << " got front " << front.length() << dendl; + + return read_message_middle(); +} + +CtPtr ProtocolV1::read_message_middle() { + ldout(cct, 20) << __func__ << dendl; + + if (current_header.middle_len) { + if (!middle.length()) { + middle.push_back(buffer::create(current_header.middle_len)); + } + return READB(current_header.middle_len, middle.c_str(), + handle_message_middle); + } + + return read_message_data_prepare(); +} + +CtPtr ProtocolV1::handle_message_middle(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read message middle failed" << dendl; + return _fault(); + } + + ldout(cct, 20) << __func__ << " got middle " << middle.length() << dendl; + + return read_message_data_prepare(); +} + +CtPtr ProtocolV1::read_message_data_prepare() { + ldout(cct, 20) << __func__ << dendl; + + unsigned data_len = le32_to_cpu(current_header.data_len); + unsigned data_off = le32_to_cpu(current_header.data_off); + + if (data_len) { + // get a buffer +#if 0 + // rx_buffers is broken by design... see + // http://tracker.ceph.com/issues/22480 + map<ceph_tid_t, pair<bufferlist, int> >::iterator p = + connection->rx_buffers.find(current_header.tid); + if (p != connection->rx_buffers.end()) { + ldout(cct, 10) << __func__ << " seleting rx buffer v " << p->second.second + << " at offset " << data_off << " len " + << p->second.first.length() << dendl; + data_buf = p->second.first; + // make sure it's big enough + if (data_buf.length() < data_len) + data_buf.push_back(buffer::create(data_len - data_buf.length())); + data_blp = data_buf.begin(); + } else { + ldout(cct, 20) << __func__ << " allocating new rx buffer at offset " + << data_off << dendl; + alloc_aligned_buffer(data_buf, data_len, data_off); + data_blp = data_buf.begin(); + } +#else + ldout(cct, 20) << __func__ << " allocating new rx buffer at offset " + << data_off << dendl; + alloc_aligned_buffer(data_buf, data_len, data_off); + data_blp = data_buf.begin(); +#endif + } + + msg_left = data_len; + + return CONTINUE(read_message_data); +} + +CtPtr ProtocolV1::read_message_data() { + ldout(cct, 20) << __func__ << " msg_left=" << msg_left << dendl; + + if (msg_left > 0) { + bufferptr bp = data_blp.get_current_ptr(); + unsigned read_len = std::min(bp.length(), msg_left); + + return READB(read_len, bp.c_str(), handle_message_data); + } + + return read_message_footer(); +} + +CtPtr ProtocolV1::handle_message_data(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read data error " << dendl; + return _fault(); + } + + bufferptr bp = data_blp.get_current_ptr(); + unsigned read_len = std::min(bp.length(), msg_left); + ceph_assert(read_len < std::numeric_limits<int>::max()); + data_blp.advance(read_len); + data.append(bp, 0, read_len); + msg_left -= read_len; + + return CONTINUE(read_message_data); +} + +CtPtr ProtocolV1::read_message_footer() { + ldout(cct, 20) << __func__ << dendl; + + state = READ_FOOTER_AND_DISPATCH; + + unsigned len; + if (connection->has_feature(CEPH_FEATURE_MSG_AUTH)) { + len = sizeof(ceph_msg_footer); + } else { + len = sizeof(ceph_msg_footer_old); + } + + return READ(len, handle_message_footer); +} + +CtPtr ProtocolV1::handle_message_footer(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read footer data error " << dendl; + return _fault(); + } + + ceph_msg_footer footer; + ceph_msg_footer_old old_footer; + + if (connection->has_feature(CEPH_FEATURE_MSG_AUTH)) { + footer = *((ceph_msg_footer *)buffer); + } else { + old_footer = *((ceph_msg_footer_old *)buffer); + footer.front_crc = old_footer.front_crc; + footer.middle_crc = old_footer.middle_crc; + footer.data_crc = old_footer.data_crc; + footer.sig = 0; + footer.flags = old_footer.flags; + } + + int aborted = (footer.flags & CEPH_MSG_FOOTER_COMPLETE) == 0; + ldout(cct, 10) << __func__ << " aborted = " << aborted << dendl; + if (aborted) { + ldout(cct, 0) << __func__ << " got " << front.length() << " + " + << middle.length() << " + " << data.length() + << " byte message.. ABORTED" << dendl; + return _fault(); + } + + ldout(cct, 20) << __func__ << " got " << front.length() << " + " + << middle.length() << " + " << data.length() << " byte message" + << dendl; + Message *message = decode_message(cct, messenger->crcflags, current_header, + footer, front, middle, data, connection); + if (!message) { + ldout(cct, 1) << __func__ << " decode message failed " << dendl; + return _fault(); + } + + // + // Check the signature if one should be present. A zero return indicates + // success. PLR + // + + if (session_security.get() == NULL) { + ldout(cct, 10) << __func__ << " no session security set" << dendl; + } else { + if (session_security->check_message_signature(message)) { + ldout(cct, 0) << __func__ << " Signature check failed" << dendl; + message->put(); + return _fault(); + } + } + message->set_byte_throttler(connection->policy.throttler_bytes); + message->set_message_throttler(connection->policy.throttler_messages); + + // store reservation size in message, so we don't get confused + // by messages entering the dispatch queue through other paths. + message->set_dispatch_throttle_size(cur_msg_size); + + message->set_recv_stamp(recv_stamp); + message->set_throttle_stamp(throttle_stamp); + message->set_recv_complete_stamp(ceph_clock_now()); + + // check received seq#. if it is old, drop the message. + // note that incoming messages may skip ahead. this is convenient for the + // client side queueing because messages can't be renumbered, but the (kernel) + // client will occasionally pull a message out of the sent queue to send + // elsewhere. in that case it doesn't matter if we "got" it or not. + uint64_t cur_seq = in_seq; + if (message->get_seq() <= cur_seq) { + ldout(cct, 0) << __func__ << " got old message " << message->get_seq() + << " <= " << cur_seq << " " << message << " " << *message + << ", discarding" << dendl; + message->put(); + if (connection->has_feature(CEPH_FEATURE_RECONNECT_SEQ) && + cct->_conf->ms_die_on_old_message) { + ceph_assert(0 == "old msgs despite reconnect_seq feature"); + } + return nullptr; + } + if (message->get_seq() > cur_seq + 1) { + ldout(cct, 0) << __func__ << " missed message? skipped from seq " + << cur_seq << " to " << message->get_seq() << dendl; + if (cct->_conf->ms_die_on_skipped_message) { + ceph_assert(0 == "skipped incoming seq"); + } + } + +#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE) + if (message->get_type() == CEPH_MSG_OSD_OP || + message->get_type() == CEPH_MSG_OSD_OPREPLY) { + utime_t ltt_processed_stamp = ceph_clock_now(); + double usecs_elapsed = + (ltt_processed_stamp.to_nsec() - ltt_recv_stamp.to_nsec()) / 1000; + ostringstream buf; + if (message->get_type() == CEPH_MSG_OSD_OP) + OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OP", + false); + else + OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OPREPLY", + false); + } +#endif + + // note last received message. + in_seq = message->get_seq(); + ldout(cct, 5) << " rx " << message->get_source() << " seq " + << message->get_seq() << " " << message << " " << *message + << dendl; + + bool need_dispatch_writer = false; + if (!connection->policy.lossy) { + ack_left++; + need_dispatch_writer = true; + } + + state = OPENED; + + connection->logger->inc(l_msgr_recv_messages); + connection->logger->inc( + l_msgr_recv_bytes, + cur_msg_size + sizeof(ceph_msg_header) + sizeof(ceph_msg_footer)); + + messenger->ms_fast_preprocess(message); + auto fast_dispatch_time = ceph::mono_clock::now(); + connection->logger->tinc(l_msgr_running_recv_time, + fast_dispatch_time - connection->recv_start_time); + if (connection->delay_state) { + double delay_period = 0; + if (rand() % 10000 < cct->_conf->ms_inject_delay_probability * 10000.0) { + delay_period = + cct->_conf->ms_inject_delay_max * (double)(rand() % 10000) / 10000.0; + ldout(cct, 1) << "queue_received will delay after " + << (ceph_clock_now() + delay_period) << " on " << message + << " " << *message << dendl; + } + connection->delay_state->queue(delay_period, message); + } else if (messenger->ms_can_fast_dispatch(message)) { + connection->lock.unlock(); + connection->dispatch_queue->fast_dispatch(message); + connection->recv_start_time = ceph::mono_clock::now(); + connection->logger->tinc(l_msgr_running_fast_dispatch_time, + connection->recv_start_time - fast_dispatch_time); + connection->lock.lock(); + } else { + connection->dispatch_queue->enqueue(message, message->get_priority(), + connection->conn_id); + } + + // clean up local buffer references + data_buf.clear(); + front.clear(); + middle.clear(); + data.clear(); + + if (need_dispatch_writer && connection->is_connected()) { + connection->center->dispatch_event_external(connection->write_handler); + } + + return CONTINUE(wait_message); +} + +void ProtocolV1::session_reset() { + ldout(cct, 10) << __func__ << " started" << dendl; + + std::lock_guard<std::mutex> l(connection->write_lock); + if (connection->delay_state) { + connection->delay_state->discard(); + } + + connection->dispatch_queue->discard_queue(connection->conn_id); + discard_out_queue(); + // note: we need to clear outgoing_bl here, but session_reset may be + // called by other thread, so let caller clear this itself! + // outgoing_bl.clear(); + + connection->dispatch_queue->queue_remote_reset(connection); + + randomize_out_seq(); + + in_seq = 0; + connect_seq = 0; + // it's safe to directly set 0, double locked + ack_left = 0; + once_ready = false; + can_write = WriteStatus::NOWRITE; +} + +void ProtocolV1::randomize_out_seq() { + if (connection->get_features() & CEPH_FEATURE_MSG_AUTH) { + // Set out_seq to a random value, so CRC won't be predictable. + auto rand_seq = ceph::util::generate_random_number<uint64_t>(0, SEQ_MASK); + ldout(cct, 10) << __func__ << " randomize_out_seq " << rand_seq << dendl; + out_seq = rand_seq; + } else { + // previously, seq #'s always started at 0. + out_seq = 0; + } +} + +ssize_t ProtocolV1::write_message(Message *m, bufferlist &bl, bool more) { + FUNCTRACE(cct); + ceph_assert(connection->center->in_thread()); + m->set_seq(++out_seq); + + if (messenger->crcflags & MSG_CRC_HEADER) { + m->calc_header_crc(); + } + + ceph_msg_header &header = m->get_header(); + ceph_msg_footer &footer = m->get_footer(); + + // TODO: let sign_message could be reentry? + // Now that we have all the crcs calculated, handle the + // digital signature for the message, if the AsyncConnection has session + // security set up. Some session security options do not + // actually calculate and check the signature, but they should + // handle the calls to sign_message and check_signature. PLR + if (session_security.get() == NULL) { + ldout(cct, 20) << __func__ << " no session security" << dendl; + } else { + if (session_security->sign_message(m)) { + ldout(cct, 20) << __func__ << " failed to sign m=" << m + << "): sig = " << footer.sig << dendl; + } else { + ldout(cct, 20) << __func__ << " signed m=" << m + << "): sig = " << footer.sig << dendl; + } + } + + connection->outgoing_bl.append(CEPH_MSGR_TAG_MSG); + connection->outgoing_bl.append((char *)&header, sizeof(header)); + + ldout(cct, 20) << __func__ << " sending message type=" << header.type + << " src " << entity_name_t(header.src) + << " front=" << header.front_len << " data=" << header.data_len + << " off " << header.data_off << dendl; + + if ((bl.length() <= ASYNC_COALESCE_THRESHOLD) && (bl.buffers().size() > 1)) { + for (const auto &pb : bl.buffers()) { + connection->outgoing_bl.append((char *)pb.c_str(), pb.length()); + } + } else { + connection->outgoing_bl.claim_append(bl); + } + + // send footer; if receiver doesn't support signatures, use the old footer + // format + ceph_msg_footer_old old_footer; + if (connection->has_feature(CEPH_FEATURE_MSG_AUTH)) { + connection->outgoing_bl.append((char *)&footer, sizeof(footer)); + } else { + if (messenger->crcflags & MSG_CRC_HEADER) { + old_footer.front_crc = footer.front_crc; + old_footer.middle_crc = footer.middle_crc; + old_footer.data_crc = footer.data_crc; + } else { + old_footer.front_crc = old_footer.middle_crc = 0; + } + old_footer.data_crc = + messenger->crcflags & MSG_CRC_DATA ? footer.data_crc : 0; + old_footer.flags = footer.flags; + connection->outgoing_bl.append((char *)&old_footer, sizeof(old_footer)); + } + + m->trace.event("async writing message"); + ldout(cct, 20) << __func__ << " sending " << m->get_seq() << " " << m + << dendl; + ssize_t total_send_size = connection->outgoing_bl.length(); + ssize_t rc = connection->_try_send(more); + if (rc < 0) { + ldout(cct, 1) << __func__ << " error sending " << m << ", " + << cpp_strerror(rc) << dendl; + } else { + connection->logger->inc( + l_msgr_send_bytes, total_send_size - connection->outgoing_bl.length()); + ldout(cct, 10) << __func__ << " sending " << m + << (rc ? " continuely." : " done.") << dendl; + } + if (m->get_type() == CEPH_MSG_OSD_OP) + OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OP_END", false); + else if (m->get_type() == CEPH_MSG_OSD_OPREPLY) + OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OPREPLY_END", false); + m->put(); + + return rc; +} + +void ProtocolV1::requeue_sent() { + write_in_progress = false; + if (sent.empty()) { + return; + } + + list<pair<bufferlist, Message *> > &rq = out_q[CEPH_MSG_PRIO_HIGHEST]; + out_seq -= sent.size(); + while (!sent.empty()) { + Message *m = sent.back(); + sent.pop_back(); + ldout(cct, 10) << __func__ << " " << *m << " for resend " + << " (" << m->get_seq() << ")" << dendl; + rq.push_front(make_pair(bufferlist(), m)); + } +} + +uint64_t ProtocolV1::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) { + ldout(cct, 10) << __func__ << " " << seq << dendl; + std::lock_guard<std::mutex> l(connection->write_lock); + if (out_q.count(CEPH_MSG_PRIO_HIGHEST) == 0) { + return seq; + } + list<pair<bufferlist, Message *> > &rq = out_q[CEPH_MSG_PRIO_HIGHEST]; + uint64_t count = out_seq; + while (!rq.empty()) { + pair<bufferlist, Message *> p = rq.front(); + if (p.second->get_seq() == 0 || p.second->get_seq() > seq) break; + ldout(cct, 10) << __func__ << " " << *(p.second) << " for resend seq " + << p.second->get_seq() << " <= " << seq << ", discarding" + << dendl; + p.second->put(); + rq.pop_front(); + count++; + } + if (rq.empty()) out_q.erase(CEPH_MSG_PRIO_HIGHEST); + return count; +} + +/* + * Tears down the message queues, and removes them from the + * DispatchQueue Must hold write_lock prior to calling. + */ +void ProtocolV1::discard_out_queue() { + ldout(cct, 10) << __func__ << " started" << dendl; + + for (list<Message *>::iterator p = sent.begin(); p != sent.end(); ++p) { + ldout(cct, 20) << __func__ << " discard " << *p << dendl; + (*p)->put(); + } + sent.clear(); + for (map<int, list<pair<bufferlist, Message *> > >::iterator p = + out_q.begin(); + p != out_q.end(); ++p) { + for (list<pair<bufferlist, Message *> >::iterator r = p->second.begin(); + r != p->second.end(); ++r) { + ldout(cct, 20) << __func__ << " discard " << r->second << dendl; + r->second->put(); + } + } + out_q.clear(); + write_in_progress = false; +} + +void ProtocolV1::reset_security() +{ + ldout(cct, 5) << __func__ << dendl; + + // clean up state internal variables and states + if (state == CONNECTING_SEND_CONNECT_MSG) { + if (authorizer) { + delete authorizer; + } + authorizer = nullptr; + } +} + +void ProtocolV1::reset_recv_state() { + ldout(cct, 5) << __func__ << dendl; + + // execute in the same thread that uses the `session_security`. + // We need to do the warp because holding `write_lock` is not + // enough as `write_event()` releases it just before calling + // `write_message()`. `submit_to()` here is NOT blocking. + if (!connection->center->in_thread()) { + connection->center->submit_to(connection->center->get_id(), [this] { + ldout(cct, 5) << "reset_recv_state (warped) reseting security handlers" + << dendl; + // Possibly unnecessary. See the comment in `deactivate_existing`. + std::lock_guard<std::mutex> l(connection->lock); + std::lock_guard<std::mutex> wl(connection->write_lock); + reset_security(); + }, /* nowait = */true); + } else { + reset_security(); + } + + // clean read and write callbacks + connection->pendingReadLen.reset(); + connection->writeCallback.reset(); + + if (state > THROTTLE_MESSAGE && state <= READ_FOOTER_AND_DISPATCH && + connection->policy.throttler_messages) { + ldout(cct, 10) << __func__ << " releasing " << 1 + << " message to policy throttler " + << connection->policy.throttler_messages->get_current() + << "/" << connection->policy.throttler_messages->get_max() + << dendl; + connection->policy.throttler_messages->put(); + } + if (state > THROTTLE_BYTES && state <= READ_FOOTER_AND_DISPATCH) { + if (connection->policy.throttler_bytes) { + ldout(cct, 10) << __func__ << " releasing " << cur_msg_size + << " bytes to policy throttler " + << connection->policy.throttler_bytes->get_current() << "/" + << connection->policy.throttler_bytes->get_max() << dendl; + connection->policy.throttler_bytes->put(cur_msg_size); + } + } + if (state > THROTTLE_DISPATCH_QUEUE && state <= READ_FOOTER_AND_DISPATCH) { + ldout(cct, 10) + << __func__ << " releasing " << cur_msg_size + << " bytes to dispatch_queue throttler " + << connection->dispatch_queue->dispatch_throttler.get_current() << "/" + << connection->dispatch_queue->dispatch_throttler.get_max() << dendl; + connection->dispatch_queue->dispatch_throttle_release(cur_msg_size); + } +} + +Message *ProtocolV1::_get_next_outgoing(bufferlist *bl) { + Message *m = 0; + if (!out_q.empty()) { + map<int, list<pair<bufferlist, Message *> > >::reverse_iterator it = + out_q.rbegin(); + ceph_assert(!it->second.empty()); + list<pair<bufferlist, Message *> >::iterator p = it->second.begin(); + m = p->second; + if (bl) bl->swap(p->first); + it->second.erase(p); + if (it->second.empty()) out_q.erase(it->first); + } + return m; +} + +/** + * Client Protocol V1 + **/ + +CtPtr ProtocolV1::send_client_banner() { + ldout(cct, 20) << __func__ << dendl; + state = CONNECTING; + + bufferlist bl; + bl.append(CEPH_BANNER, strlen(CEPH_BANNER)); + return WRITE(bl, handle_client_banner_write); +} + +CtPtr ProtocolV1::handle_client_banner_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " write client banner failed" << dendl; + return _fault(); + } + ldout(cct, 10) << __func__ << " connect write banner done: " + << connection->get_peer_addr() << dendl; + + return wait_server_banner(); +} + +CtPtr ProtocolV1::wait_server_banner() { + state = CONNECTING_WAIT_BANNER_AND_IDENTIFY; + + ldout(cct, 20) << __func__ << dendl; + + bufferlist myaddrbl; + unsigned banner_len = strlen(CEPH_BANNER); + unsigned need_len = banner_len + sizeof(ceph_entity_addr) * 2; + return READ(need_len, handle_server_banner_and_identify); +} + +CtPtr ProtocolV1::handle_server_banner_and_identify(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read banner and identify addresses failed" + << dendl; + return _fault(); + } + + unsigned banner_len = strlen(CEPH_BANNER); + if (memcmp(buffer, CEPH_BANNER, banner_len)) { + ldout(cct, 0) << __func__ << " connect protocol error (bad banner) on peer " + << connection->get_peer_addr() << dendl; + return _fault(); + } + + bufferlist bl; + entity_addr_t paddr, peer_addr_for_me; + + bl.append(buffer + banner_len, sizeof(ceph_entity_addr) * 2); + auto p = bl.cbegin(); + try { + decode(paddr, p); + decode(peer_addr_for_me, p); + } catch (const buffer::error &e) { + lderr(cct) << __func__ << " decode peer addr failed " << dendl; + return _fault(); + } + ldout(cct, 20) << __func__ << " connect read peer addr " << paddr + << " on socket " << connection->cs.fd() << dendl; + + entity_addr_t peer_addr = connection->peer_addrs->legacy_addr(); + if (peer_addr != paddr) { + if (paddr.is_blank_ip() && peer_addr.get_port() == paddr.get_port() && + peer_addr.get_nonce() == paddr.get_nonce()) { + ldout(cct, 0) << __func__ << " connect claims to be " << paddr << " not " + << peer_addr << " - presumably this is the same node!" + << dendl; + } else { + ldout(cct, 10) << __func__ << " connect claims to be " << paddr << " not " + << peer_addr << dendl; + return _fault(); + } + } + + ldout(cct, 20) << __func__ << " connect peer addr for me is " + << peer_addr_for_me << dendl; + if (messenger->get_myaddrs().empty() || + messenger->get_myaddrs().front().is_blank_ip()) { + sockaddr_storage ss; + socklen_t len = sizeof(ss); + getsockname(connection->cs.fd(), (sockaddr *)&ss, &len); + entity_addr_t a; + if (cct->_conf->ms_learn_addr_from_peer) { + ldout(cct, 1) << __func__ << " peer " << connection->target_addr + << " says I am " << peer_addr_for_me << " (socket says " + << (sockaddr*)&ss << ")" << dendl; + a = peer_addr_for_me; + } else { + ldout(cct, 1) << __func__ << " socket to " << connection->target_addr + << " says I am " << (sockaddr*)&ss + << " (peer says " << peer_addr_for_me << ")" << dendl; + a.set_sockaddr((sockaddr *)&ss); + } + a.set_type(entity_addr_t::TYPE_LEGACY); // anything but NONE; learned_addr ignores this + a.set_port(0); + connection->lock.unlock(); + messenger->learned_addr(a); + if (cct->_conf->ms_inject_internal_delays && + cct->_conf->ms_inject_socket_failures) { + if (rand() % cct->_conf->ms_inject_socket_failures == 0) { + ldout(cct, 10) << __func__ << " sleep for " + << cct->_conf->ms_inject_internal_delays << dendl; + utime_t t; + t.set_from_double(cct->_conf->ms_inject_internal_delays); + t.sleep(); + } + } + connection->lock.lock(); + if (state != CONNECTING_WAIT_BANNER_AND_IDENTIFY) { + ldout(cct, 1) << __func__ + << " state changed while learned_addr, mark_down or " + << " replacing must be happened just now" << dendl; + return nullptr; + } + } + + bufferlist myaddrbl; + encode(messenger->get_myaddr_legacy(), myaddrbl, 0); // legacy + return WRITE(myaddrbl, handle_my_addr_write); +} + +CtPtr ProtocolV1::handle_my_addr_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 2) << __func__ << " connect couldn't write my addr, " + << cpp_strerror(r) << dendl; + return _fault(); + } + ldout(cct, 10) << __func__ << " connect sent my addr " + << messenger->get_myaddr_legacy() << dendl; + + return CONTINUE(send_connect_message); +} + +CtPtr ProtocolV1::send_connect_message() { + state = CONNECTING_SEND_CONNECT_MSG; + + ldout(cct, 20) << __func__ << dendl; + + if (!authorizer) { + authorizer = messenger->ms_deliver_get_authorizer(connection->peer_type); + } + + ceph_msg_connect connect; + connect.features = connection->policy.features_supported; + connect.host_type = messenger->get_myname().type(); + connect.global_seq = global_seq; + connect.connect_seq = connect_seq; + connect.protocol_version = + messenger->get_proto_version(connection->peer_type, true); + connect.authorizer_protocol = authorizer ? authorizer->protocol : 0; + connect.authorizer_len = authorizer ? authorizer->bl.length() : 0; + + if (authorizer) { + ldout(cct, 10) << __func__ + << " connect_msg.authorizer_len=" << connect.authorizer_len + << " protocol=" << connect.authorizer_protocol << dendl; + } + + connect.flags = 0; + if (connection->policy.lossy) { + connect.flags |= + CEPH_MSG_CONNECT_LOSSY; // this is fyi, actually, server decides! + } + + bufferlist bl; + bl.append((char *)&connect, sizeof(connect)); + if (authorizer) { + bl.append(authorizer->bl.c_str(), authorizer->bl.length()); + } + + ldout(cct, 10) << __func__ << " connect sending gseq=" << global_seq + << " cseq=" << connect_seq + << " proto=" << connect.protocol_version << dendl; + + return WRITE(bl, handle_connect_message_write); +} + +CtPtr ProtocolV1::handle_connect_message_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 2) << __func__ << " connect couldn't send reply " + << cpp_strerror(r) << dendl; + return _fault(); + } + + ldout(cct, 20) << __func__ + << " connect wrote (self +) cseq, waiting for reply" << dendl; + + return wait_connect_reply(); +} + +CtPtr ProtocolV1::wait_connect_reply() { + ldout(cct, 20) << __func__ << dendl; + + // FIPS zeroization audit 20191115: this memset is not security related. + memset(&connect_reply, 0, sizeof(connect_reply)); + return READ(sizeof(connect_reply), handle_connect_reply_1); +} + +CtPtr ProtocolV1::handle_connect_reply_1(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read connect reply failed" << dendl; + return _fault(); + } + + connect_reply = *((ceph_msg_connect_reply *)buffer); + + ldout(cct, 20) << __func__ << " connect got reply tag " + << (int)connect_reply.tag << " connect_seq " + << connect_reply.connect_seq << " global_seq " + << connect_reply.global_seq << " proto " + << connect_reply.protocol_version << " flags " + << (int)connect_reply.flags << " features " + << connect_reply.features << dendl; + + if (connect_reply.authorizer_len) { + return wait_connect_reply_auth(); + } + + return handle_connect_reply_2(); +} + +CtPtr ProtocolV1::wait_connect_reply_auth() { + ldout(cct, 20) << __func__ << dendl; + + ldout(cct, 10) << __func__ + << " reply.authorizer_len=" << connect_reply.authorizer_len + << dendl; + + ceph_assert(connect_reply.authorizer_len < 4096); + + return READ(connect_reply.authorizer_len, handle_connect_reply_auth); +} + +CtPtr ProtocolV1::handle_connect_reply_auth(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read connect reply authorizer failed" + << dendl; + return _fault(); + } + + bufferlist authorizer_reply; + authorizer_reply.append(buffer, connect_reply.authorizer_len); + + if (connect_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { + ldout(cct, 10) << __func__ << " connect got auth challenge" << dendl; + authorizer->add_challenge(cct, authorizer_reply); + return CONTINUE(send_connect_message); + } + + auto iter = authorizer_reply.cbegin(); + if (authorizer && !authorizer->verify_reply(iter, + nullptr /* connection_secret */)) { + ldout(cct, 0) << __func__ << " failed verifying authorize reply" << dendl; + return _fault(); + } + + return handle_connect_reply_2(); +} + +CtPtr ProtocolV1::handle_connect_reply_2() { + ldout(cct, 20) << __func__ << dendl; + + if (connect_reply.tag == CEPH_MSGR_TAG_FEATURES) { + ldout(cct, 0) << __func__ << " connect protocol feature mismatch, my " + << std::hex << connection->policy.features_supported + << " < peer " << connect_reply.features << " missing " + << (connect_reply.features & + ~connection->policy.features_supported) + << std::dec << dendl; + return _fault(); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_BADPROTOVER) { + ldout(cct, 0) << __func__ << " connect protocol version mismatch, my " + << messenger->get_proto_version(connection->peer_type, true) + << " != " << connect_reply.protocol_version << dendl; + return _fault(); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_BADAUTHORIZER) { + ldout(cct, 0) << __func__ << " connect got BADAUTHORIZER" << dendl; + return _fault(); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_RESETSESSION) { + ldout(cct, 0) << __func__ << " connect got RESETSESSION" << dendl; + session_reset(); + connect_seq = 0; + + // see session_reset + connection->outgoing_bl.clear(); + + return CONTINUE(send_connect_message); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_RETRY_GLOBAL) { + global_seq = messenger->get_global_seq(connect_reply.global_seq); + ldout(cct, 5) << __func__ << " connect got RETRY_GLOBAL " + << connect_reply.global_seq << " chose new " << global_seq + << dendl; + return CONTINUE(send_connect_message); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_RETRY_SESSION) { + ceph_assert(connect_reply.connect_seq > connect_seq); + ldout(cct, 5) << __func__ << " connect got RETRY_SESSION " << connect_seq + << " -> " << connect_reply.connect_seq << dendl; + connect_seq = connect_reply.connect_seq; + return CONTINUE(send_connect_message); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_WAIT) { + ldout(cct, 1) << __func__ << " connect got WAIT (connection race)" << dendl; + state = WAIT; + return _fault(); + } + + uint64_t feat_missing; + feat_missing = + connection->policy.features_required & ~(uint64_t)connect_reply.features; + if (feat_missing) { + ldout(cct, 1) << __func__ << " missing required features " << std::hex + << feat_missing << std::dec << dendl; + return _fault(); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_SEQ) { + ldout(cct, 10) + << __func__ + << " got CEPH_MSGR_TAG_SEQ, reading acked_seq and writing in_seq" + << dendl; + + return wait_ack_seq(); + } + + if (connect_reply.tag == CEPH_MSGR_TAG_READY) { + ldout(cct, 10) << __func__ << " got CEPH_MSGR_TAG_READY " << dendl; + } + + return client_ready(); +} + +CtPtr ProtocolV1::wait_ack_seq() { + ldout(cct, 20) << __func__ << dendl; + + return READ(sizeof(uint64_t), handle_ack_seq); +} + +CtPtr ProtocolV1::handle_ack_seq(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read connect ack seq failed" << dendl; + return _fault(); + } + + uint64_t newly_acked_seq = 0; + + newly_acked_seq = *((uint64_t *)buffer); + ldout(cct, 2) << __func__ << " got newly_acked_seq " << newly_acked_seq + << " vs out_seq " << out_seq << dendl; + out_seq = discard_requeued_up_to(out_seq, newly_acked_seq); + + bufferlist bl; + uint64_t s = in_seq; + bl.append((char *)&s, sizeof(s)); + + return WRITE(bl, handle_in_seq_write); +} + +CtPtr ProtocolV1::handle_in_seq_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 10) << __func__ << " failed to send in_seq " << dendl; + return _fault(); + } + + ldout(cct, 10) << __func__ << " send in_seq done " << dendl; + + return client_ready(); +} + +CtPtr ProtocolV1::client_ready() { + ldout(cct, 20) << __func__ << dendl; + + // hooray! + peer_global_seq = connect_reply.global_seq; + connection->policy.lossy = connect_reply.flags & CEPH_MSG_CONNECT_LOSSY; + + once_ready = true; + connect_seq += 1; + ceph_assert(connect_seq == connect_reply.connect_seq); + backoff = utime_t(); + connection->set_features((uint64_t)connect_reply.features & + (uint64_t)connection->policy.features_supported); + ldout(cct, 10) << __func__ << " connect success " << connect_seq + << ", lossy = " << connection->policy.lossy << ", features " + << connection->get_features() << dendl; + + // If we have an authorizer, get a new AuthSessionHandler to deal with + // ongoing security of the connection. PLR + if (authorizer != NULL) { + ldout(cct, 10) << __func__ << " setting up session_security with auth " + << authorizer << dendl; + session_security.reset(get_auth_session_handler( + cct, authorizer->protocol, + authorizer->session_key, + connection->get_features())); + } else { + // We have no authorizer, so we shouldn't be applying security to messages + // in this AsyncConnection. PLR + ldout(cct, 10) << __func__ << " no authorizer, clearing session_security" + << dendl; + session_security.reset(); + } + + if (connection->delay_state) { + ceph_assert(connection->delay_state->ready()); + } + connection->dispatch_queue->queue_connect(connection); + messenger->ms_deliver_handle_fast_connect(connection); + + return ready(); +} + +/** + * Server Protocol V1 + **/ + +CtPtr ProtocolV1::send_server_banner() { + ldout(cct, 20) << __func__ << dendl; + state = ACCEPTING; + + bufferlist bl; + + bl.append(CEPH_BANNER, strlen(CEPH_BANNER)); + + // as a server, we should have a legacy addr if we accepted this connection. + auto legacy = messenger->get_myaddrs().legacy_addr(); + encode(legacy, bl, 0); // legacy + connection->port = legacy.get_port(); + encode(connection->target_addr, bl, 0); // legacy + + ldout(cct, 1) << __func__ << " sd=" << connection->cs.fd() + << " legacy " << legacy + << " socket_addr " << connection->socket_addr + << " target_addr " << connection->target_addr + << dendl; + + return WRITE(bl, handle_server_banner_write); +} + +CtPtr ProtocolV1::handle_server_banner_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << " write server banner failed" << dendl; + return _fault(); + } + ldout(cct, 10) << __func__ << " write banner and addr done: " + << connection->get_peer_addr() << dendl; + + return wait_client_banner(); +} + +CtPtr ProtocolV1::wait_client_banner() { + ldout(cct, 20) << __func__ << dendl; + + return READ(strlen(CEPH_BANNER) + sizeof(ceph_entity_addr), + handle_client_banner); +} + +CtPtr ProtocolV1::handle_client_banner(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read peer banner and addr failed" << dendl; + return _fault(); + } + + if (memcmp(buffer, CEPH_BANNER, strlen(CEPH_BANNER))) { + ldout(cct, 1) << __func__ << " accept peer sent bad banner '" << buffer + << "' (should be '" << CEPH_BANNER << "')" << dendl; + return _fault(); + } + + bufferlist addr_bl; + entity_addr_t peer_addr; + + addr_bl.append(buffer + strlen(CEPH_BANNER), sizeof(ceph_entity_addr)); + try { + auto ti = addr_bl.cbegin(); + decode(peer_addr, ti); + } catch (const buffer::error &e) { + lderr(cct) << __func__ << " decode peer_addr failed " << dendl; + return _fault(); + } + + ldout(cct, 10) << __func__ << " accept peer addr is " << peer_addr << dendl; + if (peer_addr.is_blank_ip()) { + // peer apparently doesn't know what ip they have; figure it out for them. + int port = peer_addr.get_port(); + peer_addr.set_sockaddr(connection->target_addr.get_sockaddr()); + peer_addr.set_port(port); + + ldout(cct, 0) << __func__ << " accept peer addr is really " << peer_addr + << " (socket is " << connection->target_addr << ")" << dendl; + } + connection->set_peer_addr(peer_addr); // so that connection_state gets set up + connection->target_addr = peer_addr; + + return CONTINUE(wait_connect_message); +} + +CtPtr ProtocolV1::wait_connect_message() { + ldout(cct, 20) << __func__ << dendl; + + // FIPS zeroization audit 20191115: this memset is not security related. + memset(&connect_msg, 0, sizeof(connect_msg)); + return READ(sizeof(connect_msg), handle_connect_message_1); +} + +CtPtr ProtocolV1::handle_connect_message_1(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read connect msg failed" << dendl; + return _fault(); + } + + connect_msg = *((ceph_msg_connect *)buffer); + + state = ACCEPTING_WAIT_CONNECT_MSG_AUTH; + + if (connect_msg.authorizer_len) { + return wait_connect_message_auth(); + } + + return handle_connect_message_2(); +} + +CtPtr ProtocolV1::wait_connect_message_auth() { + ldout(cct, 20) << __func__ << dendl; + authorizer_buf.clear(); + authorizer_buf.push_back(buffer::create(connect_msg.authorizer_len)); + return READB(connect_msg.authorizer_len, authorizer_buf.c_str(), + handle_connect_message_auth); +} + +CtPtr ProtocolV1::handle_connect_message_auth(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read connect authorizer failed" << dendl; + return _fault(); + } + + return handle_connect_message_2(); +} + +CtPtr ProtocolV1::handle_connect_message_2() { + ldout(cct, 20) << __func__ << dendl; + + ldout(cct, 20) << __func__ << " accept got peer connect_seq " + << connect_msg.connect_seq << " global_seq " + << connect_msg.global_seq << dendl; + + connection->set_peer_type(connect_msg.host_type); + connection->policy = messenger->get_policy(connect_msg.host_type); + + ldout(cct, 10) << __func__ << " accept of host_type " << connect_msg.host_type + << ", policy.lossy=" << connection->policy.lossy + << " policy.server=" << connection->policy.server + << " policy.standby=" << connection->policy.standby + << " policy.resetcheck=" << connection->policy.resetcheck + << " features 0x" << std::hex << (uint64_t)connect_msg.features + << std::dec + << dendl; + + ceph_msg_connect_reply reply; + bufferlist authorizer_reply; + + // FIPS zeroization audit 20191115: this memset is not security related. + memset(&reply, 0, sizeof(reply)); + reply.protocol_version = + messenger->get_proto_version(connection->peer_type, false); + + // mismatch? + ldout(cct, 10) << __func__ << " accept my proto " << reply.protocol_version + << ", their proto " << connect_msg.protocol_version << dendl; + + if (connect_msg.protocol_version != reply.protocol_version) { + return send_connect_message_reply(CEPH_MSGR_TAG_BADPROTOVER, reply, + authorizer_reply); + } + + // require signatures for cephx? + if (connect_msg.authorizer_protocol == CEPH_AUTH_CEPHX) { + if (connection->peer_type == CEPH_ENTITY_TYPE_OSD || + connection->peer_type == CEPH_ENTITY_TYPE_MDS || + connection->peer_type == CEPH_ENTITY_TYPE_MGR) { + if (cct->_conf->cephx_require_signatures || + cct->_conf->cephx_cluster_require_signatures) { + ldout(cct, 10) + << __func__ + << " using cephx, requiring MSG_AUTH feature bit for cluster" + << dendl; + connection->policy.features_required |= CEPH_FEATURE_MSG_AUTH; + } + if (cct->_conf->cephx_require_version >= 2 || + cct->_conf->cephx_cluster_require_version >= 2) { + ldout(cct, 10) + << __func__ + << " using cephx, requiring cephx v2 feature bit for cluster" + << dendl; + connection->policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2; + } + } else { + if (cct->_conf->cephx_require_signatures || + cct->_conf->cephx_service_require_signatures) { + ldout(cct, 10) + << __func__ + << " using cephx, requiring MSG_AUTH feature bit for service" + << dendl; + connection->policy.features_required |= CEPH_FEATURE_MSG_AUTH; + } + if (cct->_conf->cephx_require_version >= 2 || + cct->_conf->cephx_service_require_version >= 2) { + ldout(cct, 10) + << __func__ + << " using cephx, requiring cephx v2 feature bit for service" + << dendl; + connection->policy.features_required |= CEPH_FEATUREMASK_CEPHX_V2; + } + } + } + + uint64_t feat_missing = + connection->policy.features_required & ~(uint64_t)connect_msg.features; + if (feat_missing) { + ldout(cct, 1) << __func__ << " peer missing required features " << std::hex + << feat_missing << std::dec << dendl; + return send_connect_message_reply(CEPH_MSGR_TAG_FEATURES, reply, + authorizer_reply); + } + + bufferlist auth_bl_copy = authorizer_buf; + connection->lock.unlock(); + ldout(cct,10) << __func__ << " authorizor_protocol " + << connect_msg.authorizer_protocol + << " len " << auth_bl_copy.length() + << dendl; + bool authorizer_valid; + bool need_challenge = HAVE_FEATURE(connect_msg.features, CEPHX_V2); + bool had_challenge = (bool)authorizer_challenge; + if (!messenger->ms_deliver_verify_authorizer( + connection, connection->peer_type, connect_msg.authorizer_protocol, + auth_bl_copy, authorizer_reply, authorizer_valid, session_key, + nullptr /* connection_secret */, + need_challenge ? &authorizer_challenge : nullptr) || + !authorizer_valid) { + connection->lock.lock(); + if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) { + ldout(cct, 1) << __func__ + << " state changed while accept, it must be mark_down" + << dendl; + ceph_assert(state == CLOSED); + return _fault(); + } + + if (need_challenge && !had_challenge && authorizer_challenge) { + ldout(cct, 10) << __func__ << ": challenging authorizer" << dendl; + ceph_assert(authorizer_reply.length()); + return send_connect_message_reply(CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER, + reply, authorizer_reply); + } else { + ldout(cct, 0) << __func__ << ": got bad authorizer, auth_reply_len=" + << authorizer_reply.length() << dendl; + session_security.reset(); + return send_connect_message_reply(CEPH_MSGR_TAG_BADAUTHORIZER, reply, + authorizer_reply); + } + } + + // We've verified the authorizer for this AsyncConnection, so set up the + // session security structure. PLR + ldout(cct, 10) << __func__ << " accept setting up session_security." << dendl; + + // existing? + AsyncConnectionRef existing = messenger->lookup_conn(*connection->peer_addrs); + + connection->inject_delay(); + + connection->lock.lock(); + if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) { + ldout(cct, 1) << __func__ + << " state changed while accept, it must be mark_down" + << dendl; + ceph_assert(state == CLOSED); + return _fault(); + } + + if (existing == connection) { + existing = nullptr; + } + if (existing && existing->protocol->proto_type != 1) { + ldout(cct,1) << __func__ << " existing " << existing << " proto " + << existing->protocol.get() << " version is " + << existing->protocol->proto_type << ", marking down" << dendl; + existing->mark_down(); + existing = nullptr; + } + + if (existing) { + // There is no possible that existing connection will acquire this + // connection's lock + existing->lock.lock(); // skip lockdep check (we are locking a second + // AsyncConnection here) + + ldout(cct,10) << __func__ << " existing=" << existing << " exproto=" + << existing->protocol.get() << dendl; + ProtocolV1 *exproto = dynamic_cast<ProtocolV1 *>(existing->protocol.get()); + ceph_assert(exproto); + ceph_assert(exproto->proto_type == 1); + + if (exproto->state == CLOSED) { + ldout(cct, 1) << __func__ << " existing " << existing + << " already closed." << dendl; + existing->lock.unlock(); + existing = nullptr; + + return open(reply, authorizer_reply); + } + + if (exproto->replacing) { + ldout(cct, 1) << __func__ + << " existing racing replace happened while replacing." + << " existing_state=" + << connection->get_state_name(existing->state) << dendl; + reply.global_seq = exproto->peer_global_seq; + existing->lock.unlock(); + return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_GLOBAL, reply, + authorizer_reply); + } + + if (connect_msg.global_seq < exproto->peer_global_seq) { + ldout(cct, 10) << __func__ << " accept existing " << existing << ".gseq " + << exproto->peer_global_seq << " > " + << connect_msg.global_seq << ", RETRY_GLOBAL" << dendl; + reply.global_seq = exproto->peer_global_seq; // so we can send it below.. + existing->lock.unlock(); + return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_GLOBAL, reply, + authorizer_reply); + } else { + ldout(cct, 10) << __func__ << " accept existing " << existing << ".gseq " + << exproto->peer_global_seq + << " <= " << connect_msg.global_seq << ", looks ok" + << dendl; + } + + if (existing->policy.lossy) { + ldout(cct, 0) + << __func__ + << " accept replacing existing (lossy) channel (new one lossy=" + << connection->policy.lossy << ")" << dendl; + exproto->session_reset(); + return replace(existing, reply, authorizer_reply); + } + + ldout(cct, 1) << __func__ << " accept connect_seq " + << connect_msg.connect_seq + << " vs existing csq=" << exproto->connect_seq + << " existing_state=" + << connection->get_state_name(existing->state) << dendl; + + if (connect_msg.connect_seq == 0 && exproto->connect_seq > 0) { + ldout(cct, 0) + << __func__ + << " accept peer reset, then tried to connect to us, replacing" + << dendl; + // this is a hard reset from peer + is_reset_from_peer = true; + if (connection->policy.resetcheck) { + exproto->session_reset(); // this resets out_queue, msg_ and + // connect_seq #'s + } + return replace(existing, reply, authorizer_reply); + } + + if (connect_msg.connect_seq < exproto->connect_seq) { + // old attempt, or we sent READY but they didn't get it. + ldout(cct, 10) << __func__ << " accept existing " << existing << ".cseq " + << exproto->connect_seq << " > " << connect_msg.connect_seq + << ", RETRY_SESSION" << dendl; + reply.connect_seq = exproto->connect_seq + 1; + existing->lock.unlock(); + return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_SESSION, reply, + authorizer_reply); + } + + if (connect_msg.connect_seq == exproto->connect_seq) { + // if the existing connection successfully opened, and/or + // subsequently went to standby, then the peer should bump + // their connect_seq and retry: this is not a connection race + // we need to resolve here. + if (exproto->state == OPENED || exproto->state == STANDBY) { + ldout(cct, 10) << __func__ << " accept connection race, existing " + << existing << ".cseq " << exproto->connect_seq + << " == " << connect_msg.connect_seq + << ", OPEN|STANDBY, RETRY_SESSION " << dendl; + // if connect_seq both zero, dont stuck into dead lock. it's ok to + // replace + if (connection->policy.resetcheck && exproto->connect_seq == 0) { + return replace(existing, reply, authorizer_reply); + } + + reply.connect_seq = exproto->connect_seq + 1; + existing->lock.unlock(); + return send_connect_message_reply(CEPH_MSGR_TAG_RETRY_SESSION, reply, + authorizer_reply); + } + + // connection race? + if (connection->peer_addrs->legacy_addr() < messenger->get_myaddr_legacy() || + existing->policy.server) { + // incoming wins + ldout(cct, 10) << __func__ << " accept connection race, existing " + << existing << ".cseq " << exproto->connect_seq + << " == " << connect_msg.connect_seq + << ", or we are server, replacing my attempt" << dendl; + return replace(existing, reply, authorizer_reply); + } else { + // our existing outgoing wins + ldout(messenger->cct, 10) + << __func__ << " accept connection race, existing " << existing + << ".cseq " << exproto->connect_seq + << " == " << connect_msg.connect_seq << ", sending WAIT" << dendl; + ceph_assert(connection->peer_addrs->legacy_addr() > + messenger->get_myaddr_legacy()); + existing->lock.unlock(); + // make sure we follow through with opening the existing + // connection (if it isn't yet open) since we know the peer + // has something to send to us. + existing->send_keepalive(); + return send_connect_message_reply(CEPH_MSGR_TAG_WAIT, reply, + authorizer_reply); + } + } + + ceph_assert(connect_msg.connect_seq > exproto->connect_seq); + ceph_assert(connect_msg.global_seq >= exproto->peer_global_seq); + if (connection->policy.resetcheck && // RESETSESSION only used by servers; + // peers do not reset each other + exproto->connect_seq == 0) { + ldout(cct, 0) << __func__ << " accept we reset (peer sent cseq " + << connect_msg.connect_seq << ", " << existing + << ".cseq = " << exproto->connect_seq + << "), sending RESETSESSION " << dendl; + existing->lock.unlock(); + return send_connect_message_reply(CEPH_MSGR_TAG_RESETSESSION, reply, + authorizer_reply); + } + + // reconnect + ldout(cct, 10) << __func__ << " accept peer sent cseq " + << connect_msg.connect_seq << " > " << exproto->connect_seq + << dendl; + return replace(existing, reply, authorizer_reply); + } // existing + else if (!replacing && connect_msg.connect_seq > 0) { + // we reset, and they are opening a new session + ldout(cct, 0) << __func__ << " accept we reset (peer sent cseq " + << connect_msg.connect_seq << "), sending RESETSESSION" + << dendl; + return send_connect_message_reply(CEPH_MSGR_TAG_RESETSESSION, reply, + authorizer_reply); + } else { + // new session + ldout(cct, 10) << __func__ << " accept new session" << dendl; + existing = nullptr; + return open(reply, authorizer_reply); + } +} + +CtPtr ProtocolV1::send_connect_message_reply(char tag, + ceph_msg_connect_reply &reply, + bufferlist &authorizer_reply) { + ldout(cct, 20) << __func__ << dendl; + bufferlist reply_bl; + reply.tag = tag; + reply.features = + ((uint64_t)connect_msg.features & connection->policy.features_supported) | + connection->policy.features_required; + reply.authorizer_len = authorizer_reply.length(); + reply_bl.append((char *)&reply, sizeof(reply)); + + ldout(cct, 10) << __func__ << " reply features 0x" << std::hex + << reply.features << " = (policy sup 0x" + << connection->policy.features_supported + << " & connect 0x" << (uint64_t)connect_msg.features + << ") | policy req 0x" + << connection->policy.features_required + << dendl; + + if (reply.authorizer_len) { + reply_bl.append(authorizer_reply.c_str(), authorizer_reply.length()); + authorizer_reply.clear(); + } + + return WRITE(reply_bl, handle_connect_message_reply_write); +} + +CtPtr ProtocolV1::handle_connect_message_reply_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << " write connect message reply failed" << dendl; + connection->inject_delay(); + return _fault(); + } + + return CONTINUE(wait_connect_message); +} + +CtPtr ProtocolV1::replace(AsyncConnectionRef existing, + ceph_msg_connect_reply &reply, + bufferlist &authorizer_reply) { + ldout(cct, 10) << __func__ << " accept replacing " << existing << dendl; + + connection->inject_delay(); + if (existing->policy.lossy) { + // disconnect from the Connection + ldout(cct, 1) << __func__ << " replacing on lossy channel, failing existing" + << dendl; + existing->protocol->stop(); + existing->dispatch_queue->queue_reset(existing.get()); + } else { + ceph_assert(can_write == WriteStatus::NOWRITE); + existing->write_lock.lock(); + + ProtocolV1 *exproto = dynamic_cast<ProtocolV1 *>(existing->protocol.get()); + + // reset the in_seq if this is a hard reset from peer, + // otherwise we respect our original connection's value + if (is_reset_from_peer) { + exproto->is_reset_from_peer = true; + } + + connection->center->delete_file_event(connection->cs.fd(), + EVENT_READABLE | EVENT_WRITABLE); + + if (existing->delay_state) { + existing->delay_state->flush(); + ceph_assert(!connection->delay_state); + } + exproto->reset_recv_state(); + + exproto->connect_msg.features = connect_msg.features; + + auto temp_cs = std::move(connection->cs); + EventCenter *new_center = connection->center; + Worker *new_worker = connection->worker; + // avoid _stop shutdown replacing socket + // queue a reset on the new connection, which we're dumping for the old + stop(); + + connection->dispatch_queue->queue_reset(connection); + ldout(messenger->cct, 1) + << __func__ << " stop myself to swap existing" << dendl; + exproto->can_write = WriteStatus::REPLACING; + exproto->replacing = true; + exproto->write_in_progress = false; + existing->state_offset = 0; + // avoid previous thread modify event + exproto->state = NONE; + existing->state = AsyncConnection::STATE_NONE; + // Discard existing prefetch buffer in `recv_buf` + existing->recv_start = existing->recv_end = 0; + // there shouldn't exist any buffer + ceph_assert(connection->recv_start == connection->recv_end); + + exproto->authorizer_challenge.reset(); + + auto deactivate_existing = std::bind( + [existing, new_worker, new_center, exproto, reply, + authorizer_reply](ConnectedSocket &cs) mutable { + // we need to delete time event in original thread + { + std::lock_guard<std::mutex> l(existing->lock); + existing->write_lock.lock(); + exproto->requeue_sent(); + existing->outgoing_bl.clear(); + existing->open_write = false; + existing->write_lock.unlock(); + if (exproto->state == NONE) { + existing->shutdown_socket(); + existing->cs = std::move(cs); + existing->worker->references--; + new_worker->references++; + existing->logger = new_worker->get_perf_counter(); + existing->worker = new_worker; + existing->center = new_center; + if (existing->delay_state) + existing->delay_state->set_center(new_center); + } else if (exproto->state == CLOSED) { + auto back_to_close = + std::bind([](ConnectedSocket &cs) mutable { cs.close(); }, + std::move(cs)); + new_center->submit_to(new_center->get_id(), + std::move(back_to_close), true); + return; + } else { + ceph_abort(); + } + } + + // Before changing existing->center, it may already exists some + // events in existing->center's queue. Then if we mark down + // `existing`, it will execute in another thread and clean up + // connection. Previous event will result in segment fault + auto transfer_existing = [existing, exproto, reply, + authorizer_reply]() mutable { + std::lock_guard<std::mutex> l(existing->lock); + if (exproto->state == CLOSED) return; + ceph_assert(exproto->state == NONE); + + // we have called shutdown_socket above + ceph_assert(existing->last_tick_id == 0); + // restart timer since we are going to re-build connection + existing->last_connect_started = ceph::coarse_mono_clock::now(); + existing->last_tick_id = existing->center->create_time_event( + existing->connect_timeout_us, existing->tick_handler); + existing->state = AsyncConnection::STATE_CONNECTION_ESTABLISHED; + exproto->state = ACCEPTING; + + existing->center->create_file_event( + existing->cs.fd(), EVENT_READABLE, existing->read_handler); + reply.global_seq = exproto->peer_global_seq; + exproto->run_continuation(exproto->send_connect_message_reply( + CEPH_MSGR_TAG_RETRY_GLOBAL, reply, authorizer_reply)); + }; + if (existing->center->in_thread()) + transfer_existing(); + else + existing->center->submit_to(existing->center->get_id(), + std::move(transfer_existing), true); + }, + std::move(temp_cs)); + + existing->center->submit_to(existing->center->get_id(), + std::move(deactivate_existing), true); + existing->write_lock.unlock(); + existing->lock.unlock(); + return nullptr; + } + existing->lock.unlock(); + + return open(reply, authorizer_reply); +} + +CtPtr ProtocolV1::open(ceph_msg_connect_reply &reply, + bufferlist &authorizer_reply) { + ldout(cct, 20) << __func__ << dendl; + + connect_seq = connect_msg.connect_seq + 1; + peer_global_seq = connect_msg.global_seq; + ldout(cct, 10) << __func__ << " accept success, connect_seq = " << connect_seq + << " in_seq=" << in_seq << ", sending READY" << dendl; + + // if it is a hard reset from peer, we don't need a round-trip to negotiate + // in/out sequence + if ((connect_msg.features & CEPH_FEATURE_RECONNECT_SEQ) && + !is_reset_from_peer) { + reply.tag = CEPH_MSGR_TAG_SEQ; + wait_for_seq = true; + } else { + reply.tag = CEPH_MSGR_TAG_READY; + wait_for_seq = false; + out_seq = discard_requeued_up_to(out_seq, 0); + is_reset_from_peer = false; + in_seq = 0; + } + + // send READY reply + reply.features = connection->policy.features_supported; + reply.global_seq = messenger->get_global_seq(); + reply.connect_seq = connect_seq; + reply.flags = 0; + reply.authorizer_len = authorizer_reply.length(); + if (connection->policy.lossy) { + reply.flags = reply.flags | CEPH_MSG_CONNECT_LOSSY; + } + + connection->set_features((uint64_t)reply.features & + (uint64_t)connect_msg.features); + ldout(cct, 10) << __func__ << " accept features " + << connection->get_features() + << " authorizer_protocol " + << connect_msg.authorizer_protocol << dendl; + + session_security.reset( + get_auth_session_handler(cct, connect_msg.authorizer_protocol, + session_key, + connection->get_features())); + + bufferlist reply_bl; + reply_bl.append((char *)&reply, sizeof(reply)); + + if (reply.authorizer_len) { + reply_bl.append(authorizer_reply.c_str(), authorizer_reply.length()); + } + + if (reply.tag == CEPH_MSGR_TAG_SEQ) { + uint64_t s = in_seq; + reply_bl.append((char *)&s, sizeof(s)); + } + + connection->lock.unlock(); + // Because "replacing" will prevent other connections preempt this addr, + // it's safe that here we don't acquire Connection's lock + ssize_t r = messenger->accept_conn(connection); + + connection->inject_delay(); + + connection->lock.lock(); + replacing = false; + if (r < 0) { + ldout(cct, 1) << __func__ << " existing race replacing process for addr = " + << connection->peer_addrs->legacy_addr() + << " just fail later one(this)" << dendl; + ldout(cct, 10) << "accept fault after register" << dendl; + connection->inject_delay(); + return _fault(); + } + if (state != ACCEPTING_WAIT_CONNECT_MSG_AUTH) { + ldout(cct, 1) << __func__ + << " state changed while accept_conn, it must be mark_down" + << dendl; + ceph_assert(state == CLOSED || state == NONE); + ldout(cct, 10) << "accept fault after register" << dendl; + messenger->unregister_conn(connection); + connection->inject_delay(); + return _fault(); + } + + return WRITE(reply_bl, handle_ready_connect_message_reply_write); +} + +CtPtr ProtocolV1::handle_ready_connect_message_reply_write(int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " write ready connect message reply failed" + << dendl; + return _fault(); + } + + // notify + connection->dispatch_queue->queue_accept(connection); + messenger->ms_deliver_handle_fast_accept(connection); + once_ready = true; + + state = ACCEPTING_HANDLED_CONNECT_MSG; + + if (wait_for_seq) { + return wait_seq(); + } + + return server_ready(); +} + +CtPtr ProtocolV1::wait_seq() { + ldout(cct, 20) << __func__ << dendl; + + return READ(sizeof(uint64_t), handle_seq); +} + +CtPtr ProtocolV1::handle_seq(char *buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read ack seq failed" << dendl; + return _fault(); + } + + uint64_t newly_acked_seq = *(uint64_t *)buffer; + ldout(cct, 2) << __func__ << " accept get newly_acked_seq " << newly_acked_seq + << dendl; + out_seq = discard_requeued_up_to(out_seq, newly_acked_seq); + + return server_ready(); +} + +CtPtr ProtocolV1::server_ready() { + ldout(cct, 20) << __func__ << " session_security is " + << session_security + << dendl; + + ldout(cct, 20) << __func__ << " accept done" << dendl; + // FIPS zeroization audit 20191115: this memset is not security related. + memset(&connect_msg, 0, sizeof(connect_msg)); + + if (connection->delay_state) { + ceph_assert(connection->delay_state->ready()); + } + + return ready(); +} diff --git a/src/msg/async/ProtocolV1.h b/src/msg/async/ProtocolV1.h new file mode 100644 index 00000000..070ce73f --- /dev/null +++ b/src/msg/async/ProtocolV1.h @@ -0,0 +1,305 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef _MSG_ASYNC_PROTOCOL_V1_ +#define _MSG_ASYNC_PROTOCOL_V1_ + +#include "Protocol.h" + +class ProtocolV1; +using CtPtr = Ct<ProtocolV1>*; + +class ProtocolV1 : public Protocol { +/* + * ProtocolV1 State Machine + * + + send_server_banner send_client_banner + | | + v v + wait_client_banner wait_server_banner + | | + | v + v handle_server_banner_and_identify + wait_connect_message <---------\ | + | | | v + | wait_connect_message_auth | send_connect_message <----------\ + | | | | | + v v | | | +handle_connect_message_2 | v | + | | | wait_connect_reply | + v v | | | | + replace -> send_connect_message_reply | V | + | | wait_connect_reply_auth | + | | | | + v v v | + open ---\ handle_connect_reply_2 --------/ + | | | + | v v + | wait_seq wait_ack_seq + | | | + v v v + server_ready client_ready + | | + \------------------> wait_message <------------/ + | ^ | ^ + /------------------------/ | | | + | | | \----------------- ------------\ + v /----------/ v | +handle_keepalive2 | handle_message_header read_message_footer +handle_keepalive2_ack | | ^ +handle_tag_ack | v | + | | throttle_message read_message_data + \----------------/ | ^ + v | + read_message_front --> read_message_middle --/ +*/ + +protected: + + enum State { + NONE = 0, + START_CONNECT, + CONNECTING, + CONNECTING_WAIT_BANNER_AND_IDENTIFY, + CONNECTING_SEND_CONNECT_MSG, + START_ACCEPT, + ACCEPTING, + ACCEPTING_WAIT_CONNECT_MSG_AUTH, + ACCEPTING_HANDLED_CONNECT_MSG, + OPENED, + THROTTLE_MESSAGE, + THROTTLE_BYTES, + THROTTLE_DISPATCH_QUEUE, + READ_MESSAGE_FRONT, + READ_FOOTER_AND_DISPATCH, + CLOSED, + WAIT, + STANDBY + }; + + static const char *get_state_name(int state) { + const char *const statenames[] = {"NONE", + "START_CONNECT", + "CONNECTING", + "CONNECTING_WAIT_BANNER_AND_IDENTIFY", + "CONNECTING_SEND_CONNECT_MSG", + "START_ACCEPT", + "ACCEPTING", + "ACCEPTING_WAIT_CONNECT_MSG_AUTH", + "ACCEPTING_HANDLED_CONNECT_MSG", + "OPENED", + "THROTTLE_MESSAGE", + "THROTTLE_BYTES", + "THROTTLE_DISPATCH_QUEUE", + "READ_MESSAGE_FRONT", + "READ_FOOTER_AND_DISPATCH", + "CLOSED", + "WAIT", + "STANDBY"}; + return statenames[state]; + } + + char *temp_buffer; + + enum class WriteStatus { NOWRITE, REPLACING, CANWRITE, CLOSED }; + std::atomic<WriteStatus> can_write; + std::list<Message *> sent; // the first bufferlist need to inject seq + // priority queue for outbound msgs + std::map<int, std::list<std::pair<bufferlist, Message *>>> out_q; + bool keepalive; + bool write_in_progress = false; + + __u32 connect_seq, peer_global_seq; + std::atomic<uint64_t> in_seq{0}; + std::atomic<uint64_t> out_seq{0}; + std::atomic<uint64_t> ack_left{0}; + + CryptoKey session_key; + std::shared_ptr<AuthSessionHandler> session_security; + std::unique_ptr<AuthAuthorizerChallenge> authorizer_challenge; // accept side + + // Open state + ceph_msg_connect connect_msg; + ceph_msg_connect_reply connect_reply; + bufferlist authorizer_buf; + + utime_t backoff; // backoff time + utime_t recv_stamp; + utime_t throttle_stamp; + unsigned msg_left; + uint64_t cur_msg_size; + ceph_msg_header current_header; + bufferlist data_buf; + bufferlist::iterator data_blp; + bufferlist front, middle, data; + + bool replacing; // when replacing process happened, we will reply connect + // side with RETRY tag and accept side will clear replaced + // connection. So when connect side reissue connect_msg, + // there won't exists conflicting connection so we use + // "replacing" to skip RESETSESSION to avoid detect wrong + // presentation + bool is_reset_from_peer; + bool once_ready; + + State state; + + void run_continuation(CtPtr pcontinuation); + CtPtr read(CONTINUATION_RX_TYPE<ProtocolV1> &next, int len, + char *buffer = nullptr); + CtPtr write(CONTINUATION_TX_TYPE<ProtocolV1> &next,bufferlist &bl); + inline CtPtr _fault() { // helper fault method that stops continuation + fault(); + return nullptr; + } + + CONTINUATION_DECL(ProtocolV1, wait_message); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_keepalive2); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_keepalive2_ack); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_tag_ack); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_header); + CONTINUATION_DECL(ProtocolV1, throttle_message); + CONTINUATION_DECL(ProtocolV1, throttle_bytes); + CONTINUATION_DECL(ProtocolV1, throttle_dispatch_queue); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_front); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_middle); + CONTINUATION_DECL(ProtocolV1, read_message_data); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_data); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_message_footer); + + CtPtr ready(); + CtPtr wait_message(); + CtPtr handle_message(char *buffer, int r); + + CtPtr handle_keepalive2(char *buffer, int r); + void append_keepalive_or_ack(bool ack = false, utime_t *t = nullptr); + CtPtr handle_keepalive2_ack(char *buffer, int r); + CtPtr handle_tag_ack(char *buffer, int r); + + CtPtr handle_message_header(char *buffer, int r); + CtPtr throttle_message(); + CtPtr throttle_bytes(); + CtPtr throttle_dispatch_queue(); + CtPtr read_message_front(); + CtPtr handle_message_front(char *buffer, int r); + CtPtr read_message_middle(); + CtPtr handle_message_middle(char *buffer, int r); + CtPtr read_message_data_prepare(); + CtPtr read_message_data(); + CtPtr handle_message_data(char *buffer, int r); + CtPtr read_message_footer(); + CtPtr handle_message_footer(char *buffer, int r); + + void session_reset(); + void randomize_out_seq(); + + Message *_get_next_outgoing(bufferlist *bl); + + void prepare_send_message(uint64_t features, Message *m, bufferlist &bl); + ssize_t write_message(Message *m, bufferlist &bl, bool more); + + void requeue_sent(); + uint64_t discard_requeued_up_to(uint64_t out_seq, uint64_t seq); + void discard_out_queue(); + + void reset_recv_state(); + void reset_security(); + + ostream &_conn_prefix(std::ostream *_dout); + +public: + ProtocolV1(AsyncConnection *connection); + virtual ~ProtocolV1(); + + virtual void connect() override; + virtual void accept() override; + virtual bool is_connected() override; + virtual void stop() override; + virtual void fault() override; + virtual void send_message(Message *m) override; + virtual void send_keepalive() override; + + virtual void read_event() override; + virtual void write_event() override; + virtual bool is_queued() override; + + // Client Protocol +private: + int global_seq; + AuthAuthorizer *authorizer; + + CONTINUATION_DECL(ProtocolV1, send_client_banner); + WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_client_banner_write); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_server_banner_and_identify); + WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_my_addr_write); + CONTINUATION_DECL(ProtocolV1, send_connect_message); + WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_message_write); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_reply_1); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_reply_auth); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_ack_seq); + WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_in_seq_write); + + CtPtr send_client_banner(); + CtPtr handle_client_banner_write(int r); + CtPtr wait_server_banner(); + CtPtr handle_server_banner_and_identify(char *buffer, int r); + CtPtr handle_my_addr_write(int r); + CtPtr send_connect_message(); + CtPtr handle_connect_message_write(int r); + CtPtr wait_connect_reply(); + CtPtr handle_connect_reply_1(char *buffer, int r); + CtPtr wait_connect_reply_auth(); + CtPtr handle_connect_reply_auth(char *buffer, int r); + CtPtr handle_connect_reply_2(); + CtPtr wait_ack_seq(); + CtPtr handle_ack_seq(char *buffer, int r); + CtPtr handle_in_seq_write(int r); + CtPtr client_ready(); + + // Server Protocol +protected: + bool wait_for_seq; + + CONTINUATION_DECL(ProtocolV1, send_server_banner); + WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_server_banner_write); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_client_banner); + CONTINUATION_DECL(ProtocolV1, wait_connect_message); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_message_1); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_connect_message_auth); + WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, + handle_connect_message_reply_write); + WRITE_HANDLER_CONTINUATION_DECL(ProtocolV1, + handle_ready_connect_message_reply_write); + READ_HANDLER_CONTINUATION_DECL(ProtocolV1, handle_seq); + + CtPtr send_server_banner(); + CtPtr handle_server_banner_write(int r); + CtPtr wait_client_banner(); + CtPtr handle_client_banner(char *buffer, int r); + CtPtr wait_connect_message(); + CtPtr handle_connect_message_1(char *buffer, int r); + CtPtr wait_connect_message_auth(); + CtPtr handle_connect_message_auth(char *buffer, int r); + CtPtr handle_connect_message_2(); + CtPtr send_connect_message_reply(char tag, ceph_msg_connect_reply &reply, + bufferlist &authorizer_reply); + CtPtr handle_connect_message_reply_write(int r); + CtPtr replace(AsyncConnectionRef existing, ceph_msg_connect_reply &reply, + bufferlist &authorizer_reply); + CtPtr open(ceph_msg_connect_reply &reply, bufferlist &authorizer_reply); + CtPtr handle_ready_connect_message_reply_write(int r); + CtPtr wait_seq(); + CtPtr handle_seq(char *buffer, int r); + CtPtr server_ready(); +}; + +class LoopbackProtocolV1 : public ProtocolV1 { +public: + LoopbackProtocolV1(AsyncConnection *connection) : ProtocolV1(connection) { + this->can_write = WriteStatus::CANWRITE; + } +}; + +#endif /* _MSG_ASYNC_PROTOCOL_V1_ */ diff --git a/src/msg/async/ProtocolV2.cc b/src/msg/async/ProtocolV2.cc new file mode 100644 index 00000000..381d42c3 --- /dev/null +++ b/src/msg/async/ProtocolV2.cc @@ -0,0 +1,2870 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <type_traits> + +#include "ProtocolV2.h" +#include "AsyncMessenger.h" + +#include "common/EventTrace.h" +#include "common/ceph_crypto.h" +#include "common/errno.h" +#include "include/random.h" +#include "auth/AuthClient.h" +#include "auth/AuthServer.h" + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix _conn_prefix(_dout) +ostream &ProtocolV2::_conn_prefix(std::ostream *_dout) { + return *_dout << "--2- " << messenger->get_myaddrs() << " >> " + << *connection->peer_addrs << " conn(" << connection << " " + << this + << " " << ceph_con_mode_name(auth_meta->con_mode) + << " :" << connection->port + << " s=" << get_state_name(state) << " pgs=" << peer_global_seq + << " cs=" << connect_seq << " l=" << connection->policy.lossy + << " rev1=" << HAVE_MSGR2_FEATURE(peer_supported_features, + REVISION_1) + << " rx=" << session_stream_handlers.rx.get() + << " tx=" << session_stream_handlers.tx.get() + << ")."; +} + +using namespace ceph::msgr::v2; + +using CtPtr = Ct<ProtocolV2> *; +using CtRef = Ct<ProtocolV2> &; + +void ProtocolV2::run_continuation(CtPtr pcontinuation) { + if (pcontinuation) { + run_continuation(*pcontinuation); + } +} + +void ProtocolV2::run_continuation(CtRef continuation) { + try { + CONTINUATION_RUN(continuation) + } catch (const buffer::error &e) { + lderr(cct) << __func__ << " failed decoding of frame header: " << e + << dendl; + _fault(); + } catch (const ceph::crypto::onwire::MsgAuthError &e) { + lderr(cct) << __func__ << " " << e.what() << dendl; + _fault(); + } catch (const DecryptionError &) { + lderr(cct) << __func__ << " failed to decrypt frame payload" << dendl; + } +} + +#define WRITE(B, D, C) write(D, CONTINUATION(C), B) + +#define READ(L, C) read(CONTINUATION(C), buffer::ptr_node::create(buffer::create(L))) + +#define READ_RXBUF(B, C) read(CONTINUATION(C), B) + +#ifdef UNIT_TESTS_BUILT + +#define INTERCEPT(S) { \ +if(connection->interceptor) { \ + auto a = connection->interceptor->intercept(connection, (S)); \ + if (a == Interceptor::ACTION::FAIL) { \ + return _fault(); \ + } else if (a == Interceptor::ACTION::STOP) { \ + stop(); \ + connection->dispatch_queue->queue_reset(connection); \ + return nullptr; \ + }}} + +#else +#define INTERCEPT(S) +#endif + +ProtocolV2::ProtocolV2(AsyncConnection *connection) + : Protocol(2, connection), + state(NONE), + peer_supported_features(0), + client_cookie(0), + server_cookie(0), + global_seq(0), + connect_seq(0), + peer_global_seq(0), + message_seq(0), + reconnecting(false), + replacing(false), + can_write(false), + bannerExchangeCallback(nullptr), + tx_frame_asm(&session_stream_handlers, false), + rx_frame_asm(&session_stream_handlers, false), + next_tag(static_cast<Tag>(0)), + keepalive(false) { +} + +ProtocolV2::~ProtocolV2() { +} + +void ProtocolV2::connect() { + ldout(cct, 1) << __func__ << dendl; + state = START_CONNECT; + pre_auth.enabled = true; +} + +void ProtocolV2::accept() { + ldout(cct, 1) << __func__ << dendl; + state = START_ACCEPT; +} + +bool ProtocolV2::is_connected() { return can_write; } + +/* + * Tears down the message queues, and removes them from the + * DispatchQueue Must hold write_lock prior to calling. + */ +void ProtocolV2::discard_out_queue() { + ldout(cct, 10) << __func__ << " started" << dendl; + + for (list<Message *>::iterator p = sent.begin(); p != sent.end(); ++p) { + ldout(cct, 20) << __func__ << " discard " << *p << dendl; + (*p)->put(); + } + sent.clear(); + for (auto& [ prio, entries ] : out_queue) { + static_cast<void>(prio); + for (auto& entry : entries) { + ldout(cct, 20) << __func__ << " discard " << *entry.m << dendl; + entry.m->put(); + } + } + out_queue.clear(); + write_in_progress = false; +} + +void ProtocolV2::reset_session() { + ldout(cct, 1) << __func__ << dendl; + + std::lock_guard<std::mutex> l(connection->write_lock); + if (connection->delay_state) { + connection->delay_state->discard(); + } + + connection->dispatch_queue->discard_queue(connection->conn_id); + discard_out_queue(); + connection->outgoing_bl.clear(); + + connection->dispatch_queue->queue_remote_reset(connection); + + out_seq = 0; + in_seq = 0; + client_cookie = 0; + server_cookie = 0; + connect_seq = 0; + peer_global_seq = 0; + message_seq = 0; + ack_left = 0; + can_write = false; +} + +void ProtocolV2::stop() { + ldout(cct, 1) << __func__ << dendl; + if (state == CLOSED) { + return; + } + + if (connection->delay_state) connection->delay_state->flush(); + + std::lock_guard<std::mutex> l(connection->write_lock); + + reset_recv_state(); + discard_out_queue(); + + connection->_stop(); + + can_write = false; + state = CLOSED; +} + +void ProtocolV2::fault() { _fault(); } + +void ProtocolV2::requeue_sent() { + write_in_progress = false; + if (sent.empty()) { + return; + } + + auto& rq = out_queue[CEPH_MSG_PRIO_HIGHEST]; + out_seq -= sent.size(); + while (!sent.empty()) { + Message *m = sent.back(); + sent.pop_back(); + ldout(cct, 5) << __func__ << " requeueing message m=" << m + << " seq=" << m->get_seq() << " type=" << m->get_type() << " " + << *m << dendl; + rq.emplace_front(out_queue_entry_t{false, m}); + } +} + +uint64_t ProtocolV2::discard_requeued_up_to(uint64_t out_seq, uint64_t seq) { + ldout(cct, 10) << __func__ << " " << seq << dendl; + std::lock_guard<std::mutex> l(connection->write_lock); + if (out_queue.count(CEPH_MSG_PRIO_HIGHEST) == 0) { + return seq; + } + auto& rq = out_queue[CEPH_MSG_PRIO_HIGHEST]; + uint64_t count = out_seq; + while (!rq.empty()) { + Message* const m = rq.front().m; + if (m->get_seq() == 0 || m->get_seq() > seq) break; + ldout(cct, 5) << __func__ << " discarding message m=" << m + << " seq=" << m->get_seq() << " ack_seq=" << seq << " " + << *m << dendl; + m->put(); + rq.pop_front(); + count++; + } + if (rq.empty()) out_queue.erase(CEPH_MSG_PRIO_HIGHEST); + return count; +} + +void ProtocolV2::reset_security() { + ldout(cct, 5) << __func__ << dendl; + + auth_meta.reset(new AuthConnectionMeta); + session_stream_handlers.rx.reset(nullptr); + session_stream_handlers.tx.reset(nullptr); + pre_auth.rxbuf.clear(); + pre_auth.txbuf.clear(); +} + +// it's expected the `write_lock` is held while calling this method. +void ProtocolV2::reset_recv_state() { + ldout(cct, 5) << __func__ << dendl; + + if (!connection->center->in_thread()) { + // execute in the same thread that uses the rx/tx handlers. We need + // to do the warp because holding `write_lock` is not enough as + // `write_event()` unlocks it just before calling `write_message()`. + // `submit_to()` here is NOT blocking. + connection->center->submit_to(connection->center->get_id(), [this] { + ldout(cct, 5) << "reset_recv_state (warped) reseting crypto handlers" + << dendl; + // Possibly unnecessary. See the comment in `deactivate_existing`. + std::lock_guard<std::mutex> l(connection->lock); + std::lock_guard<std::mutex> wl(connection->write_lock); + reset_security(); + }, /* nowait = */true); + } else { + reset_security(); + } + + // clean read and write callbacks + connection->pendingReadLen.reset(); + connection->writeCallback.reset(); + + next_tag = static_cast<Tag>(0); + + reset_throttle(); +} + +size_t ProtocolV2::get_current_msg_size() const { + ceph_assert(rx_frame_asm.get_num_segments() > 0); + size_t sum = 0; + // we don't include SegmentIndex::Msg::HEADER. + for (size_t i = 1; i < rx_frame_asm.get_num_segments(); i++) { + sum += rx_frame_asm.get_segment_logical_len(i); + } + return sum; +} + +void ProtocolV2::reset_throttle() { + if (state > THROTTLE_MESSAGE && state <= THROTTLE_DONE && + connection->policy.throttler_messages) { + ldout(cct, 10) << __func__ << " releasing " << 1 + << " message to policy throttler " + << connection->policy.throttler_messages->get_current() + << "/" << connection->policy.throttler_messages->get_max() + << dendl; + connection->policy.throttler_messages->put(); + } + if (state > THROTTLE_BYTES && state <= THROTTLE_DONE) { + if (connection->policy.throttler_bytes) { + const size_t cur_msg_size = get_current_msg_size(); + ldout(cct, 10) << __func__ << " releasing " << cur_msg_size + << " bytes to policy throttler " + << connection->policy.throttler_bytes->get_current() << "/" + << connection->policy.throttler_bytes->get_max() << dendl; + connection->policy.throttler_bytes->put(cur_msg_size); + } + } + if (state > THROTTLE_DISPATCH_QUEUE && state <= THROTTLE_DONE) { + const size_t cur_msg_size = get_current_msg_size(); + ldout(cct, 10) + << __func__ << " releasing " << cur_msg_size + << " bytes to dispatch_queue throttler " + << connection->dispatch_queue->dispatch_throttler.get_current() << "/" + << connection->dispatch_queue->dispatch_throttler.get_max() << dendl; + connection->dispatch_queue->dispatch_throttle_release(cur_msg_size); + } +} + +CtPtr ProtocolV2::_fault() { + ldout(cct, 10) << __func__ << dendl; + + if (state == CLOSED || state == NONE) { + ldout(cct, 10) << __func__ << " connection is already closed" << dendl; + return nullptr; + } + + if (connection->policy.lossy && + !(state >= START_CONNECT && state <= SESSION_RECONNECTING)) { + ldout(cct, 2) << __func__ << " on lossy channel, failing" << dendl; + stop(); + connection->dispatch_queue->queue_reset(connection); + return nullptr; + } + + connection->write_lock.lock(); + + can_write = false; + // requeue sent items + requeue_sent(); + + if (out_queue.empty() && state >= START_ACCEPT && + state <= SESSION_ACCEPTING && !replacing) { + ldout(cct, 2) << __func__ << " with nothing to send and in the half " + << " accept state just closed" << dendl; + connection->write_lock.unlock(); + stop(); + connection->dispatch_queue->queue_reset(connection); + return nullptr; + } + + replacing = false; + connection->fault(); + reset_recv_state(); + + reconnecting = false; + + if (connection->policy.standby && out_queue.empty() && !keepalive && + state != WAIT) { + ldout(cct, 1) << __func__ << " with nothing to send, going to standby" + << dendl; + state = STANDBY; + connection->write_lock.unlock(); + return nullptr; + } + if (connection->policy.server) { + ldout(cct, 1) << __func__ << " server, going to standby, even though i have stuff queued" << dendl; + state = STANDBY; + connection->write_lock.unlock(); + return nullptr; + } + + connection->write_lock.unlock(); + + if (!(state >= START_CONNECT && state <= SESSION_RECONNECTING) && + state != WAIT && + state != SESSION_ACCEPTING /* due to connection race */) { + // policy maybe empty when state is in accept + if (connection->policy.server) { + ldout(cct, 1) << __func__ << " server, going to standby" << dendl; + state = STANDBY; + } else { + ldout(cct, 1) << __func__ << " initiating reconnect" << dendl; + connect_seq++; + global_seq = messenger->get_global_seq(); + state = START_CONNECT; + pre_auth.enabled = true; + connection->state = AsyncConnection::STATE_CONNECTING; + } + backoff = utime_t(); + connection->center->dispatch_event_external(connection->read_handler); + } else { + if (state == WAIT) { + backoff.set_from_double(cct->_conf->ms_max_backoff); + } else if (backoff == utime_t()) { + backoff.set_from_double(cct->_conf->ms_initial_backoff); + } else { + backoff += backoff; + if (backoff > cct->_conf->ms_max_backoff) + backoff.set_from_double(cct->_conf->ms_max_backoff); + } + + if (server_cookie) { + connect_seq++; + } + + global_seq = messenger->get_global_seq(); + state = START_CONNECT; + pre_auth.enabled = true; + connection->state = AsyncConnection::STATE_CONNECTING; + ldout(cct, 1) << __func__ << " waiting " << backoff << dendl; + // woke up again; + connection->register_time_events.insert( + connection->center->create_time_event(backoff.to_nsec() / 1000, + connection->wakeup_handler)); + } + return nullptr; +} + +void ProtocolV2::prepare_send_message(uint64_t features, + Message *m) { + ldout(cct, 20) << __func__ << " m=" << *m << dendl; + + // associate message with Connection (for benefit of encode_payload) + if (m->empty_payload()) { + ldout(cct, 20) << __func__ << " encoding features " << features << " " << m + << " " << *m << dendl; + } else { + ldout(cct, 20) << __func__ << " half-reencoding features " << features + << " " << m << " " << *m << dendl; + } + + // encode and copy out of *m + m->encode(features, 0); +} + +void ProtocolV2::send_message(Message *m) { + uint64_t f = connection->get_features(); + + // TODO: Currently not all messages supports reencode like MOSDMap, so here + // only let fast dispatch support messages prepare message + const bool can_fast_prepare = messenger->ms_can_fast_dispatch(m); + if (can_fast_prepare) { + prepare_send_message(f, m); + } + + std::lock_guard<std::mutex> l(connection->write_lock); + bool is_prepared = can_fast_prepare; + // "features" changes will change the payload encoding + if (can_fast_prepare && (!can_write || connection->get_features() != f)) { + // ensure the correctness of message encoding + m->clear_payload(); + is_prepared = false; + ldout(cct, 10) << __func__ << " clear encoded buffer previous " << f + << " != " << connection->get_features() << dendl; + } + if (state == CLOSED) { + ldout(cct, 10) << __func__ << " connection closed." + << " Drop message " << m << dendl; + m->put(); + } else { + ldout(cct, 5) << __func__ << " enqueueing message m=" << m + << " type=" << m->get_type() << " " << *m << dendl; + m->trace.event("async enqueueing message"); + out_queue[m->get_priority()].emplace_back( + out_queue_entry_t{is_prepared, m}); + ldout(cct, 15) << __func__ << " inline write is denied, reschedule m=" << m + << dendl; + if (((!replacing && can_write) || state == STANDBY) && !write_in_progress) { + write_in_progress = true; + connection->center->dispatch_event_external(connection->write_handler); + } + } +} + +void ProtocolV2::send_keepalive() { + ldout(cct, 10) << __func__ << dendl; + std::lock_guard<std::mutex> l(connection->write_lock); + if (state != CLOSED) { + keepalive = true; + connection->center->dispatch_event_external(connection->write_handler); + } +} + +void ProtocolV2::read_event() { + ldout(cct, 20) << __func__ << dendl; + + switch (state) { + case START_CONNECT: + run_continuation(CONTINUATION(start_client_banner_exchange)); + break; + case START_ACCEPT: + run_continuation(CONTINUATION(start_server_banner_exchange)); + break; + case READY: + run_continuation(CONTINUATION(read_frame)); + break; + case THROTTLE_MESSAGE: + run_continuation(CONTINUATION(throttle_message)); + break; + case THROTTLE_BYTES: + run_continuation(CONTINUATION(throttle_bytes)); + break; + case THROTTLE_DISPATCH_QUEUE: + run_continuation(CONTINUATION(throttle_dispatch_queue)); + break; + default: + break; + } +} + +ProtocolV2::out_queue_entry_t ProtocolV2::_get_next_outgoing() { + out_queue_entry_t out_entry; + + if (!out_queue.empty()) { + auto it = out_queue.rbegin(); + auto& entries = it->second; + ceph_assert(!entries.empty()); + out_entry = entries.front(); + entries.pop_front(); + if (entries.empty()) { + out_queue.erase(it->first); + } + } + return out_entry; +} + +ssize_t ProtocolV2::write_message(Message *m, bool more) { + FUNCTRACE(cct); + ceph_assert(connection->center->in_thread()); + m->set_seq(++out_seq); + + connection->lock.lock(); + uint64_t ack_seq = in_seq; + ack_left = 0; + connection->lock.unlock(); + + ceph_msg_header &header = m->get_header(); + ceph_msg_footer &footer = m->get_footer(); + + ceph_msg_header2 header2{header.seq, header.tid, + header.type, header.priority, + header.version, + init_le32(0), header.data_off, + init_le64(ack_seq), + footer.flags, header.compat_version, + header.reserved}; + + auto message = MessageFrame::Encode( + header2, + m->get_payload(), + m->get_middle(), + m->get_data()); + if (!append_frame(message)) { + m->put(); + return -EILSEQ; + } + + ldout(cct, 5) << __func__ << " sending message m=" << m + << " seq=" << m->get_seq() << " " << *m << dendl; + + m->trace.event("async writing message"); + ldout(cct, 20) << __func__ << " sending m=" << m << " seq=" << m->get_seq() + << " src=" << entity_name_t(messenger->get_myname()) + << " off=" << header2.data_off + << dendl; + ssize_t total_send_size = connection->outgoing_bl.length(); + ssize_t rc = connection->_try_send(more); + if (rc < 0) { + ldout(cct, 1) << __func__ << " error sending " << m << ", " + << cpp_strerror(rc) << dendl; + } else { + connection->logger->inc( + l_msgr_send_bytes, total_send_size - connection->outgoing_bl.length()); + ldout(cct, 10) << __func__ << " sending " << m + << (rc ? " continuely." : " done.") << dendl; + } + if (m->get_type() == CEPH_MSG_OSD_OP) + OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OP_END", false); + else if (m->get_type() == CEPH_MSG_OSD_OPREPLY) + OID_EVENT_TRACE_WITH_MSG(m, "SEND_MSG_OSD_OPREPLY_END", false); + m->put(); + + return rc; +} + +template <class F> +bool ProtocolV2::append_frame(F& frame) { + ceph::bufferlist bl; + try { + bl = frame.get_buffer(tx_frame_asm); + } catch (ceph::crypto::onwire::TxHandlerError &e) { + ldout(cct, 1) << __func__ << " " << e.what() << dendl; + return false; + } + + ldout(cct, 25) << __func__ << " assembled frame " << bl.length() + << " bytes " << tx_frame_asm << dendl; + connection->outgoing_bl.append(bl); + return true; +} + +void ProtocolV2::handle_message_ack(uint64_t seq) { + if (connection->policy.lossy) { // lossy connections don't keep sent messages + return; + } + + ldout(cct, 15) << __func__ << " seq=" << seq << dendl; + + // trim sent list + static const int max_pending = 128; + int i = 0; + Message *pending[max_pending]; + connection->write_lock.lock(); + while (!sent.empty() && sent.front()->get_seq() <= seq && i < max_pending) { + Message *m = sent.front(); + sent.pop_front(); + pending[i++] = m; + ldout(cct, 10) << __func__ << " got ack seq " << seq + << " >= " << m->get_seq() << " on " << m << " " << *m + << dendl; + } + connection->write_lock.unlock(); + for (int k = 0; k < i; k++) { + pending[k]->put(); + } +} + +void ProtocolV2::write_event() { + ldout(cct, 10) << __func__ << dendl; + ssize_t r = 0; + + connection->write_lock.lock(); + if (can_write) { + if (keepalive) { + ldout(cct, 10) << __func__ << " appending keepalive" << dendl; + auto keepalive_frame = KeepAliveFrame::Encode(); + if (!append_frame(keepalive_frame)) { + connection->write_lock.unlock(); + connection->lock.lock(); + fault(); + connection->lock.unlock(); + return; + } + keepalive = false; + } + + auto start = ceph::mono_clock::now(); + bool more; + do { + const auto out_entry = _get_next_outgoing(); + if (!out_entry.m) { + break; + } + + if (!connection->policy.lossy) { + // put on sent list + sent.push_back(out_entry.m); + out_entry.m->get(); + } + more = !out_queue.empty(); + connection->write_lock.unlock(); + + // send_message or requeue messages may not encode message + if (!out_entry.is_prepared) { + prepare_send_message(connection->get_features(), out_entry.m); + } + + r = write_message(out_entry.m, more); + + connection->write_lock.lock(); + if (r == 0) { + ; + } else if (r < 0) { + ldout(cct, 1) << __func__ << " send msg failed" << dendl; + break; + } else if (r > 0) + break; + } while (can_write); + write_in_progress = false; + + // if r > 0 mean data still lefted, so no need _try_send. + if (r == 0) { + uint64_t left = ack_left; + if (left) { + ceph_le64 s; + s = in_seq; + ldout(cct, 10) << __func__ << " try send msg ack, acked " << left + << " messages" << dendl; + auto ack_frame = AckFrame::Encode(in_seq); + if (append_frame(ack_frame)) { + ack_left -= left; + left = ack_left; + r = connection->_try_send(left); + } else { + r = -EILSEQ; + } + } else if (is_queued()) { + r = connection->_try_send(); + } + } + connection->write_lock.unlock(); + + connection->logger->tinc(l_msgr_running_send_time, + ceph::mono_clock::now() - start); + if (r < 0) { + ldout(cct, 1) << __func__ << " send msg failed" << dendl; + connection->lock.lock(); + fault(); + connection->lock.unlock(); + return; + } + } else { + write_in_progress = false; + connection->write_lock.unlock(); + connection->lock.lock(); + connection->write_lock.lock(); + if (state == STANDBY && !connection->policy.server && is_queued()) { + ldout(cct, 10) << __func__ << " policy.server is false" << dendl; + if (server_cookie) { // only increment connect_seq if there is a session + connect_seq++; + } + connection->_connect(); + } else if (connection->cs && state != NONE && state != CLOSED && + state != START_CONNECT) { + r = connection->_try_send(); + if (r < 0) { + ldout(cct, 1) << __func__ << " send outcoming bl failed" << dendl; + connection->write_lock.unlock(); + fault(); + connection->lock.unlock(); + return; + } + } + connection->write_lock.unlock(); + connection->lock.unlock(); + } +} + +bool ProtocolV2::is_queued() { + return !out_queue.empty() || connection->is_queued(); +} + +CtPtr ProtocolV2::read(CONTINUATION_RXBPTR_TYPE<ProtocolV2> &next, + rx_buffer_t &&buffer) { + const auto len = buffer->length(); + const auto buf = buffer->c_str(); + next.node = std::move(buffer); + ssize_t r = connection->read(len, buf, + [&next, this](char *buffer, int r) { + if (unlikely(pre_auth.enabled) && r >= 0) { + pre_auth.rxbuf.append(*next.node); + ceph_assert(!cct->_conf->ms_die_on_bug || + pre_auth.rxbuf.length() < 20000000); + } + next.r = r; + run_continuation(next); + }); + if (r <= 0) { + // error or done synchronously + if (unlikely(pre_auth.enabled) && r >= 0) { + pre_auth.rxbuf.append(*next.node); + ceph_assert(!cct->_conf->ms_die_on_bug || + pre_auth.rxbuf.length() < 20000000); + } + next.r = r; + return &next; + } + + return nullptr; +} + +template <class F> +CtPtr ProtocolV2::write(const std::string &desc, + CONTINUATION_TYPE<ProtocolV2> &next, + F &frame) { + ceph::bufferlist bl; + try { + bl = frame.get_buffer(tx_frame_asm); + } catch (ceph::crypto::onwire::TxHandlerError &e) { + ldout(cct, 1) << __func__ << " " << e.what() << dendl; + return _fault(); + } + + ldout(cct, 25) << __func__ << " assembled frame " << bl.length() + << " bytes " << tx_frame_asm << dendl; + return write(desc, next, bl); +} + +CtPtr ProtocolV2::write(const std::string &desc, + CONTINUATION_TYPE<ProtocolV2> &next, + bufferlist &buffer) { + if (unlikely(pre_auth.enabled)) { + pre_auth.txbuf.append(buffer); + ceph_assert(!cct->_conf->ms_die_on_bug || + pre_auth.txbuf.length() < 20000000); + } + + ssize_t r = + connection->write(buffer, [&next, desc, this](int r) { + if (r < 0) { + ldout(cct, 1) << __func__ << " " << desc << " write failed r=" << r + << " (" << cpp_strerror(r) << ")" << dendl; + connection->inject_delay(); + _fault(); + } + run_continuation(next); + }); + + if (r < 0) { + ldout(cct, 1) << __func__ << " " << desc << " write failed r=" << r + << " (" << cpp_strerror(r) << ")" << dendl; + return _fault(); + } else if (r == 0) { + next.setParams(); + return &next; + } + + return nullptr; +} + +CtPtr ProtocolV2::_banner_exchange(CtRef callback) { + ldout(cct, 20) << __func__ << dendl; + bannerExchangeCallback = &callback; + + bufferlist banner_payload; + encode((uint64_t)CEPH_MSGR2_SUPPORTED_FEATURES, banner_payload, 0); + encode((uint64_t)CEPH_MSGR2_REQUIRED_FEATURES, banner_payload, 0); + + bufferlist bl; + bl.append(CEPH_BANNER_V2_PREFIX, strlen(CEPH_BANNER_V2_PREFIX)); + encode((uint16_t)banner_payload.length(), bl, 0); + bl.claim_append(banner_payload); + + INTERCEPT(state == BANNER_CONNECTING ? 3 : 4); + + return WRITE(bl, "banner", _wait_for_peer_banner); +} + +CtPtr ProtocolV2::_wait_for_peer_banner() { + unsigned banner_len = strlen(CEPH_BANNER_V2_PREFIX) + sizeof(__le16); + return READ(banner_len, _handle_peer_banner); +} + +CtPtr ProtocolV2::_handle_peer_banner(rx_buffer_t &&buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read peer banner failed r=" << r << " (" + << cpp_strerror(r) << ")" << dendl; + return _fault(); + } + + unsigned banner_prefix_len = strlen(CEPH_BANNER_V2_PREFIX); + + if (memcmp(buffer->c_str(), CEPH_BANNER_V2_PREFIX, banner_prefix_len)) { + if (memcmp(buffer->c_str(), CEPH_BANNER, strlen(CEPH_BANNER)) == 0) { + lderr(cct) << __func__ << " peer " << *connection->peer_addrs + << " is using msgr V1 protocol" << dendl; + return _fault(); + } + ldout(cct, 1) << __func__ << " accept peer sent bad banner" << dendl; + return _fault(); + } + + uint16_t payload_len; + bufferlist bl; + buffer->set_offset(banner_prefix_len); + buffer->set_length(sizeof(__le16)); + bl.push_back(std::move(buffer)); + auto ti = bl.cbegin(); + try { + decode(payload_len, ti); + } catch (const buffer::error &e) { + lderr(cct) << __func__ << " decode banner payload len failed " << dendl; + return _fault(); + } + + INTERCEPT(state == BANNER_CONNECTING ? 5 : 6); + + return READ(payload_len, _handle_peer_banner_payload); +} + +CtPtr ProtocolV2::_handle_peer_banner_payload(rx_buffer_t &&buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read peer banner payload failed r=" << r + << " (" << cpp_strerror(r) << ")" << dendl; + return _fault(); + } + + uint64_t peer_supported_features; + uint64_t peer_required_features; + + bufferlist bl; + bl.push_back(std::move(buffer)); + auto ti = bl.cbegin(); + try { + decode(peer_supported_features, ti); + decode(peer_required_features, ti); + } catch (const buffer::error &e) { + lderr(cct) << __func__ << " decode banner payload failed " << dendl; + return _fault(); + } + + ldout(cct, 1) << __func__ << " supported=" << std::hex + << peer_supported_features << " required=" << std::hex + << peer_required_features << std::dec << dendl; + + // Check feature bit compatibility + + uint64_t supported_features = CEPH_MSGR2_SUPPORTED_FEATURES; + uint64_t required_features = CEPH_MSGR2_REQUIRED_FEATURES; + + if ((required_features & peer_supported_features) != required_features) { + ldout(cct, 1) << __func__ << " peer does not support all required features" + << " required=" << std::hex << required_features + << " supported=" << std::hex << peer_supported_features + << std::dec << dendl; + stop(); + connection->dispatch_queue->queue_reset(connection); + return nullptr; + } + if ((supported_features & peer_required_features) != peer_required_features) { + ldout(cct, 1) << __func__ << " we do not support all peer required features" + << " required=" << std::hex << peer_required_features + << " supported=" << supported_features << std::dec << dendl; + stop(); + connection->dispatch_queue->queue_reset(connection); + return nullptr; + } + + this->peer_supported_features = peer_supported_features; + if (peer_required_features == 0) { + this->connection_features = msgr2_required; + } + + // if the peer supports msgr2.1, switch to it + bool is_rev1 = HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1); + tx_frame_asm.set_is_rev1(is_rev1); + rx_frame_asm.set_is_rev1(is_rev1); + + if (state == BANNER_CONNECTING) { + state = HELLO_CONNECTING; + } + else { + ceph_assert(state == BANNER_ACCEPTING); + state = HELLO_ACCEPTING; + } + + auto hello = HelloFrame::Encode(messenger->get_mytype(), + connection->target_addr); + + INTERCEPT(state == HELLO_CONNECTING ? 7 : 8); + + return WRITE(hello, "hello frame", read_frame); +} + +CtPtr ProtocolV2::handle_hello(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != HELLO_CONNECTING && state != HELLO_ACCEPTING) { + lderr(cct) << __func__ << " not in hello exchange state!" << dendl; + return _fault(); + } + + auto hello = HelloFrame::Decode(payload); + + ldout(cct, 5) << __func__ << " received hello:" + << " peer_type=" << (int)hello.entity_type() + << " peer_addr_for_me=" << hello.peer_addr() << dendl; + + sockaddr_storage ss; + socklen_t len = sizeof(ss); + getsockname(connection->cs.fd(), (sockaddr *)&ss, &len); + ldout(cct, 5) << __func__ << " getsockname says I am " << (sockaddr *)&ss + << " when talking to " << connection->target_addr << dendl; + + if (connection->get_peer_type() == -1) { + connection->set_peer_type(hello.entity_type()); + + ceph_assert(state == HELLO_ACCEPTING); + connection->policy = messenger->get_policy(hello.entity_type()); + ldout(cct, 10) << __func__ << " accept of host_type " + << (int)hello.entity_type() + << ", policy.lossy=" << connection->policy.lossy + << " policy.server=" << connection->policy.server + << " policy.standby=" << connection->policy.standby + << " policy.resetcheck=" << connection->policy.resetcheck + << dendl; + } else { + ceph_assert(state == HELLO_CONNECTING); + if (connection->get_peer_type() != hello.entity_type()) { + ldout(cct, 1) << __func__ << " connection peer type does not match what" + << " peer advertises " << connection->get_peer_type() + << " != " << (int)hello.entity_type() << dendl; + stop(); + connection->dispatch_queue->queue_reset(connection); + return nullptr; + } + } + + if (messenger->get_myaddrs().empty() || + messenger->get_myaddrs().front().is_blank_ip()) { + entity_addr_t a; + if (cct->_conf->ms_learn_addr_from_peer) { + ldout(cct, 1) << __func__ << " peer " << connection->target_addr + << " says I am " << hello.peer_addr() << " (socket says " + << (sockaddr*)&ss << ")" << dendl; + a = hello.peer_addr(); + } else { + ldout(cct, 1) << __func__ << " socket to " << connection->target_addr + << " says I am " << (sockaddr*)&ss + << " (peer says " << hello.peer_addr() << ")" << dendl; + a.set_sockaddr((sockaddr *)&ss); + } + a.set_type(entity_addr_t::TYPE_MSGR2); // anything but NONE; learned_addr ignores this + a.set_port(0); + connection->lock.unlock(); + messenger->learned_addr(a); + if (cct->_conf->ms_inject_internal_delays && + cct->_conf->ms_inject_socket_failures) { + if (rand() % cct->_conf->ms_inject_socket_failures == 0) { + ldout(cct, 10) << __func__ << " sleep for " + << cct->_conf->ms_inject_internal_delays << dendl; + utime_t t; + t.set_from_double(cct->_conf->ms_inject_internal_delays); + t.sleep(); + } + } + connection->lock.lock(); + if (state != HELLO_CONNECTING) { + ldout(cct, 1) << __func__ + << " state changed while learned_addr, mark_down or " + << " replacing must be happened just now" << dendl; + return nullptr; + } + } + + + + CtPtr callback; + callback = bannerExchangeCallback; + bannerExchangeCallback = nullptr; + ceph_assert(callback); + return callback; +} + +CtPtr ProtocolV2::read_frame() { + if (state == CLOSED) { + return nullptr; + } + + ldout(cct, 20) << __func__ << dendl; + rx_preamble.clear(); + rx_epilogue.clear(); + rx_segments_data.clear(); + + return READ(rx_frame_asm.get_preamble_onwire_len(), + handle_read_frame_preamble_main); +} + +CtPtr ProtocolV2::handle_read_frame_preamble_main(rx_buffer_t &&buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read frame preamble failed r=" << r + << " (" << cpp_strerror(r) << ")" << dendl; + return _fault(); + } + + rx_preamble.push_back(std::move(buffer)); + + ldout(cct, 30) << __func__ << " preamble\n"; + rx_preamble.hexdump(*_dout); + *_dout << dendl; + + try { + next_tag = rx_frame_asm.disassemble_preamble(rx_preamble); + } catch (FrameError& e) { + ldout(cct, 1) << __func__ << " " << e.what() << dendl; + return _fault(); + } catch (ceph::crypto::onwire::MsgAuthError&) { + ldout(cct, 1) << __func__ << "bad auth tag" << dendl; + return _fault(); + } + + ldout(cct, 25) << __func__ << " disassembled preamble " << rx_frame_asm + << dendl; + + if (session_stream_handlers.rx) { + ldout(cct, 30) << __func__ << " preamble after decrypt\n"; + rx_preamble.hexdump(*_dout); + *_dout << dendl; + } + + // does it need throttle? + if (next_tag == Tag::MESSAGE) { + if (state != READY) { + lderr(cct) << __func__ << " not in ready state!" << dendl; + return _fault(); + } + state = THROTTLE_MESSAGE; + return CONTINUE(throttle_message); + } else { + return read_frame_segment(); + } +} + +CtPtr ProtocolV2::handle_read_frame_dispatch() { + ldout(cct, 10) << __func__ + << " tag=" << static_cast<uint32_t>(next_tag) << dendl; + + switch (next_tag) { + case Tag::HELLO: + case Tag::AUTH_REQUEST: + case Tag::AUTH_BAD_METHOD: + case Tag::AUTH_REPLY_MORE: + case Tag::AUTH_REQUEST_MORE: + case Tag::AUTH_DONE: + case Tag::AUTH_SIGNATURE: + case Tag::CLIENT_IDENT: + case Tag::SERVER_IDENT: + case Tag::IDENT_MISSING_FEATURES: + case Tag::SESSION_RECONNECT: + case Tag::SESSION_RESET: + case Tag::SESSION_RETRY: + case Tag::SESSION_RETRY_GLOBAL: + case Tag::SESSION_RECONNECT_OK: + case Tag::KEEPALIVE2: + case Tag::KEEPALIVE2_ACK: + case Tag::ACK: + case Tag::WAIT: + return handle_frame_payload(); + case Tag::MESSAGE: + return handle_message(); + default: { + lderr(cct) << __func__ + << " received unknown tag=" << static_cast<uint32_t>(next_tag) + << dendl; + return _fault(); + } + } + + return nullptr; +} + +CtPtr ProtocolV2::read_frame_segment() { + size_t seg_idx = rx_segments_data.size(); + ldout(cct, 20) << __func__ << " seg_idx=" << seg_idx << dendl; + rx_segments_data.emplace_back(); + + uint32_t onwire_len = rx_frame_asm.get_segment_onwire_len(seg_idx); + if (onwire_len == 0) { + return _handle_read_frame_segment(); + } + + rx_buffer_t rx_buffer; + uint16_t align = rx_frame_asm.get_segment_align(seg_idx); + try { + rx_buffer = buffer::ptr_node::create(buffer::create_aligned( + onwire_len, align)); + } catch (std::bad_alloc&) { + // Catching because of potential issues with satisfying alignment. + ldout(cct, 1) << __func__ << " can't allocate aligned rx_buffer" + << " len=" << onwire_len + << " align=" << align + << dendl; + return _fault(); + } + + return READ_RXBUF(std::move(rx_buffer), handle_read_frame_segment); +} + +CtPtr ProtocolV2::handle_read_frame_segment(rx_buffer_t &&rx_buffer, int r) { + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read frame segment failed r=" << r << " (" + << cpp_strerror(r) << ")" << dendl; + return _fault(); + } + + rx_segments_data.back().push_back(std::move(rx_buffer)); + return _handle_read_frame_segment(); +} + +CtPtr ProtocolV2::_handle_read_frame_segment() { + if (rx_segments_data.size() == rx_frame_asm.get_num_segments()) { + // OK, all segments planned to read are read. Can go with epilogue. + uint32_t epilogue_onwire_len = rx_frame_asm.get_epilogue_onwire_len(); + if (epilogue_onwire_len == 0) { + return _handle_read_frame_epilogue_main(); + } + return READ(epilogue_onwire_len, handle_read_frame_epilogue_main); + } + // TODO: for makeshift only. This will be more generic and throttled + return read_frame_segment(); +} + +CtPtr ProtocolV2::handle_frame_payload() { + ceph_assert(!rx_segments_data.empty()); + auto& payload = rx_segments_data.back(); + + ldout(cct, 30) << __func__ << "\n"; + payload.hexdump(*_dout); + *_dout << dendl; + + switch (next_tag) { + case Tag::HELLO: + return handle_hello(payload); + case Tag::AUTH_REQUEST: + return handle_auth_request(payload); + case Tag::AUTH_BAD_METHOD: + return handle_auth_bad_method(payload); + case Tag::AUTH_REPLY_MORE: + return handle_auth_reply_more(payload); + case Tag::AUTH_REQUEST_MORE: + return handle_auth_request_more(payload); + case Tag::AUTH_DONE: + return handle_auth_done(payload); + case Tag::AUTH_SIGNATURE: + return handle_auth_signature(payload); + case Tag::CLIENT_IDENT: + return handle_client_ident(payload); + case Tag::SERVER_IDENT: + return handle_server_ident(payload); + case Tag::IDENT_MISSING_FEATURES: + return handle_ident_missing_features(payload); + case Tag::SESSION_RECONNECT: + return handle_reconnect(payload); + case Tag::SESSION_RESET: + return handle_session_reset(payload); + case Tag::SESSION_RETRY: + return handle_session_retry(payload); + case Tag::SESSION_RETRY_GLOBAL: + return handle_session_retry_global(payload); + case Tag::SESSION_RECONNECT_OK: + return handle_reconnect_ok(payload); + case Tag::KEEPALIVE2: + return handle_keepalive2(payload); + case Tag::KEEPALIVE2_ACK: + return handle_keepalive2_ack(payload); + case Tag::ACK: + return handle_message_ack(payload); + case Tag::WAIT: + return handle_wait(payload); + default: + ceph_abort(); + } + return nullptr; +} + +CtPtr ProtocolV2::ready() { + ldout(cct, 25) << __func__ << dendl; + + reconnecting = false; + replacing = false; + + // make sure no pending tick timer + if (connection->last_tick_id) { + connection->center->delete_time_event(connection->last_tick_id); + } + connection->last_tick_id = connection->center->create_time_event( + connection->inactive_timeout_us, connection->tick_handler); + + { + std::lock_guard<std::mutex> l(connection->write_lock); + can_write = true; + if (!out_queue.empty()) { + connection->center->dispatch_event_external(connection->write_handler); + } + } + + connection->maybe_start_delay_thread(); + + state = READY; + ldout(cct, 1) << __func__ << " entity=" << peer_name << " client_cookie=" + << std::hex << client_cookie << " server_cookie=" + << server_cookie << std::dec << " in_seq=" << in_seq + << " out_seq=" << out_seq << dendl; + + INTERCEPT(15); + + return CONTINUE(read_frame); +} + +CtPtr ProtocolV2::handle_read_frame_epilogue_main(rx_buffer_t &&buffer, int r) +{ + ldout(cct, 20) << __func__ << " r=" << r << dendl; + + if (r < 0) { + ldout(cct, 1) << __func__ << " read frame epilogue failed r=" << r + << " (" << cpp_strerror(r) << ")" << dendl; + return _fault(); + } + + rx_epilogue.push_back(std::move(buffer)); + return _handle_read_frame_epilogue_main(); +} + +CtPtr ProtocolV2::_handle_read_frame_epilogue_main() { + bool aborted; + try { + rx_frame_asm.disassemble_first_segment(rx_preamble, rx_segments_data[0]); + aborted = !rx_frame_asm.disassemble_remaining_segments( + rx_segments_data.data(), rx_epilogue); + } catch (FrameError& e) { + ldout(cct, 1) << __func__ << " " << e.what() << dendl; + return _fault(); + } catch (ceph::crypto::onwire::MsgAuthError&) { + ldout(cct, 1) << __func__ << "bad auth tag" << dendl; + return _fault(); + } + + // we do have a mechanism that allows transmitter to start sending message + // and abort after putting entire data field on wire. This will be used by + // the kernel client to avoid unnecessary buffering. + if (aborted) { + reset_throttle(); + state = READY; + return CONTINUE(read_frame); + } + return handle_read_frame_dispatch(); +} + +CtPtr ProtocolV2::handle_message() { + ldout(cct, 20) << __func__ << dendl; + ceph_assert(state == THROTTLE_DONE); + +#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE) + ltt_recv_stamp = ceph_clock_now(); +#endif + recv_stamp = ceph_clock_now(); + + const size_t cur_msg_size = get_current_msg_size(); + auto msg_frame = MessageFrame::Decode(rx_segments_data); + + // XXX: paranoid copy just to avoid oops + ceph_msg_header2 current_header = msg_frame.header(); + + ldout(cct, 5) << __func__ + << " got " << msg_frame.front_len() + << " + " << msg_frame.middle_len() + << " + " << msg_frame.data_len() + << " byte message." + << " envelope type=" << current_header.type + << " src " << peer_name + << " off " << current_header.data_off + << dendl; + + INTERCEPT(16); + ceph_msg_header header{current_header.seq, + current_header.tid, + current_header.type, + current_header.priority, + current_header.version, + init_le32(msg_frame.front_len()), + init_le32(msg_frame.middle_len()), + init_le32(msg_frame.data_len()), + current_header.data_off, + peer_name, + current_header.compat_version, + current_header.reserved, + init_le32(0)}; + ceph_msg_footer footer{init_le32(0), init_le32(0), + init_le32(0), init_le64(0), current_header.flags}; + + Message *message = decode_message(cct, 0, header, footer, + msg_frame.front(), + msg_frame.middle(), + msg_frame.data(), + connection); + if (!message) { + ldout(cct, 1) << __func__ << " decode message failed " << dendl; + return _fault(); + } else { + state = READ_MESSAGE_COMPLETE; + } + + INTERCEPT(17); + + message->set_byte_throttler(connection->policy.throttler_bytes); + message->set_message_throttler(connection->policy.throttler_messages); + + // store reservation size in message, so we don't get confused + // by messages entering the dispatch queue through other paths. + message->set_dispatch_throttle_size(cur_msg_size); + + message->set_recv_stamp(recv_stamp); + message->set_throttle_stamp(throttle_stamp); + message->set_recv_complete_stamp(ceph_clock_now()); + + // check received seq#. if it is old, drop the message. + // note that incoming messages may skip ahead. this is convenient for the + // client side queueing because messages can't be renumbered, but the (kernel) + // client will occasionally pull a message out of the sent queue to send + // elsewhere. in that case it doesn't matter if we "got" it or not. + uint64_t cur_seq = in_seq; + if (message->get_seq() <= cur_seq) { + ldout(cct, 0) << __func__ << " got old message " << message->get_seq() + << " <= " << cur_seq << " " << message << " " << *message + << ", discarding" << dendl; + message->put(); + if (connection->has_feature(CEPH_FEATURE_RECONNECT_SEQ) && + cct->_conf->ms_die_on_old_message) { + ceph_assert(0 == "old msgs despite reconnect_seq feature"); + } + return nullptr; + } + if (message->get_seq() > cur_seq + 1) { + ldout(cct, 0) << __func__ << " missed message? skipped from seq " + << cur_seq << " to " << message->get_seq() << dendl; + if (cct->_conf->ms_die_on_skipped_message) { + ceph_assert(0 == "skipped incoming seq"); + } + } + +#if defined(WITH_LTTNG) && defined(WITH_EVENTTRACE) + if (message->get_type() == CEPH_MSG_OSD_OP || + message->get_type() == CEPH_MSG_OSD_OPREPLY) { + utime_t ltt_processed_stamp = ceph_clock_now(); + double usecs_elapsed = + (ltt_processed_stamp.to_nsec() - ltt_recv_stamp.to_nsec()) / 1000; + ostringstream buf; + if (message->get_type() == CEPH_MSG_OSD_OP) + OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OP", + false); + else + OID_ELAPSED_WITH_MSG(message, usecs_elapsed, "TIME_TO_DECODE_OSD_OPREPLY", + false); + } +#endif + + // note last received message. + in_seq = message->get_seq(); + ldout(cct, 5) << __func__ << " received message m=" << message + << " seq=" << message->get_seq() + << " from=" << message->get_source() << " type=" << header.type + << " " << *message << dendl; + + bool need_dispatch_writer = false; + if (!connection->policy.lossy) { + ack_left++; + need_dispatch_writer = true; + } + + state = READY; + + connection->logger->inc(l_msgr_recv_messages); + connection->logger->inc(l_msgr_recv_bytes, + rx_frame_asm.get_frame_onwire_len()); + + messenger->ms_fast_preprocess(message); + auto fast_dispatch_time = ceph::mono_clock::now(); + connection->logger->tinc(l_msgr_running_recv_time, + fast_dispatch_time - connection->recv_start_time); + if (connection->delay_state) { + double delay_period = 0; + if (rand() % 10000 < cct->_conf->ms_inject_delay_probability * 10000.0) { + delay_period = + cct->_conf->ms_inject_delay_max * (double)(rand() % 10000) / 10000.0; + ldout(cct, 1) << "queue_received will delay after " + << (ceph_clock_now() + delay_period) << " on " << message + << " " << *message << dendl; + } + connection->delay_state->queue(delay_period, message); + } else if (messenger->ms_can_fast_dispatch(message)) { + connection->lock.unlock(); + connection->dispatch_queue->fast_dispatch(message); + connection->recv_start_time = ceph::mono_clock::now(); + connection->logger->tinc(l_msgr_running_fast_dispatch_time, + connection->recv_start_time - fast_dispatch_time); + connection->lock.lock(); + } else { + connection->dispatch_queue->enqueue(message, message->get_priority(), + connection->conn_id); + } + + handle_message_ack(current_header.ack_seq); + + // we might have been reused by another connection + // let's check if that is the case + if (state != READY) { + // yes, that was the case, let's do nothing + return nullptr; + } + + if (need_dispatch_writer && connection->is_connected()) { + connection->center->dispatch_event_external(connection->write_handler); + } + + return CONTINUE(read_frame); +} + + +CtPtr ProtocolV2::throttle_message() { + ldout(cct, 20) << __func__ << dendl; + + if (connection->policy.throttler_messages) { + ldout(cct, 10) << __func__ << " wants " << 1 + << " message from policy throttler " + << connection->policy.throttler_messages->get_current() + << "/" << connection->policy.throttler_messages->get_max() + << dendl; + if (!connection->policy.throttler_messages->get_or_fail()) { + ldout(cct, 10) << __func__ << " wants 1 message from policy throttle " + << connection->policy.throttler_messages->get_current() + << "/" << connection->policy.throttler_messages->get_max() + << " failed, just wait." << dendl; + // following thread pool deal with th full message queue isn't a + // short time, so we can wait a ms. + if (connection->register_time_events.empty()) { + connection->register_time_events.insert( + connection->center->create_time_event(1000, + connection->wakeup_handler)); + } + return nullptr; + } + } + + state = THROTTLE_BYTES; + return CONTINUE(throttle_bytes); +} + +CtPtr ProtocolV2::throttle_bytes() { + ldout(cct, 20) << __func__ << dendl; + + const size_t cur_msg_size = get_current_msg_size(); + if (cur_msg_size) { + if (connection->policy.throttler_bytes) { + ldout(cct, 10) << __func__ << " wants " << cur_msg_size + << " bytes from policy throttler " + << connection->policy.throttler_bytes->get_current() << "/" + << connection->policy.throttler_bytes->get_max() << dendl; + if (!connection->policy.throttler_bytes->get_or_fail(cur_msg_size)) { + ldout(cct, 10) << __func__ << " wants " << cur_msg_size + << " bytes from policy throttler " + << connection->policy.throttler_bytes->get_current() + << "/" << connection->policy.throttler_bytes->get_max() + << " failed, just wait." << dendl; + // following thread pool deal with th full message queue isn't a + // short time, so we can wait a ms. + if (connection->register_time_events.empty()) { + connection->register_time_events.insert( + connection->center->create_time_event( + 1000, connection->wakeup_handler)); + } + return nullptr; + } + } + } + + state = THROTTLE_DISPATCH_QUEUE; + return CONTINUE(throttle_dispatch_queue); +} + +CtPtr ProtocolV2::throttle_dispatch_queue() { + ldout(cct, 20) << __func__ << dendl; + + const size_t cur_msg_size = get_current_msg_size(); + if (cur_msg_size) { + if (!connection->dispatch_queue->dispatch_throttler.get_or_fail( + cur_msg_size)) { + ldout(cct, 10) + << __func__ << " wants " << cur_msg_size + << " bytes from dispatch throttle " + << connection->dispatch_queue->dispatch_throttler.get_current() << "/" + << connection->dispatch_queue->dispatch_throttler.get_max() + << " failed, just wait." << dendl; + // following thread pool deal with th full message queue isn't a + // short time, so we can wait a ms. + if (connection->register_time_events.empty()) { + connection->register_time_events.insert( + connection->center->create_time_event(1000, + connection->wakeup_handler)); + } + return nullptr; + } + } + + throttle_stamp = ceph_clock_now(); + state = THROTTLE_DONE; + + return read_frame_segment(); +} + +CtPtr ProtocolV2::handle_keepalive2(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != READY) { + lderr(cct) << __func__ << " not in ready state!" << dendl; + return _fault(); + } + + auto keepalive_frame = KeepAliveFrame::Decode(payload); + + ldout(cct, 30) << __func__ << " got KEEPALIVE2 tag ..." << dendl; + + connection->write_lock.lock(); + auto keepalive_ack_frame = KeepAliveFrameAck::Encode(keepalive_frame.timestamp()); + if (!append_frame(keepalive_ack_frame)) { + connection->write_lock.unlock(); + return _fault(); + } + connection->write_lock.unlock(); + + ldout(cct, 20) << __func__ << " got KEEPALIVE2 " + << keepalive_frame.timestamp() << dendl; + connection->set_last_keepalive(ceph_clock_now()); + + if (is_connected()) { + connection->center->dispatch_event_external(connection->write_handler); + } + + return CONTINUE(read_frame); +} + +CtPtr ProtocolV2::handle_keepalive2_ack(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != READY) { + lderr(cct) << __func__ << " not in ready state!" << dendl; + return _fault(); + } + + auto keepalive_ack_frame = KeepAliveFrameAck::Decode(payload); + connection->set_last_keepalive_ack(keepalive_ack_frame.timestamp()); + ldout(cct, 20) << __func__ << " got KEEPALIVE_ACK" << dendl; + + return CONTINUE(read_frame); +} + +CtPtr ProtocolV2::handle_message_ack(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != READY) { + lderr(cct) << __func__ << " not in ready state!" << dendl; + return _fault(); + } + + auto ack = AckFrame::Decode(payload); + handle_message_ack(ack.seq()); + return CONTINUE(read_frame); +} + +/* Client Protocol Methods */ + +CtPtr ProtocolV2::start_client_banner_exchange() { + ldout(cct, 20) << __func__ << dendl; + + INTERCEPT(1); + + state = BANNER_CONNECTING; + + global_seq = messenger->get_global_seq(); + + return _banner_exchange(CONTINUATION(post_client_banner_exchange)); +} + +CtPtr ProtocolV2::post_client_banner_exchange() { + ldout(cct, 20) << __func__ << dendl; + + state = AUTH_CONNECTING; + + return send_auth_request(); +} + +CtPtr ProtocolV2::send_auth_request(std::vector<uint32_t> &allowed_methods) { + ldout(cct, 20) << __func__ << " peer_type " << (int)connection->peer_type + << " auth_client " << messenger->auth_client << dendl; + ceph_assert(messenger->auth_client); + + bufferlist bl; + vector<uint32_t> preferred_modes; + auto am = auth_meta; + connection->lock.unlock(); + int r = messenger->auth_client->get_auth_request( + connection, am.get(), + &am->auth_method, &preferred_modes, &bl); + connection->lock.lock(); + if (state != AUTH_CONNECTING) { + ldout(cct, 1) << __func__ << " state changed!" << dendl; + return _fault(); + } + if (r < 0) { + ldout(cct, 0) << __func__ << " get_initial_auth_request returned " << r + << dendl; + stop(); + connection->dispatch_queue->queue_reset(connection); + return nullptr; + } + + INTERCEPT(9); + + auto frame = AuthRequestFrame::Encode(auth_meta->auth_method, preferred_modes, + bl); + return WRITE(frame, "auth request", read_frame); +} + +CtPtr ProtocolV2::handle_auth_bad_method(ceph::bufferlist &payload) { + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != AUTH_CONNECTING) { + lderr(cct) << __func__ << " not in auth connect state!" << dendl; + return _fault(); + } + + auto bad_method = AuthBadMethodFrame::Decode(payload); + ldout(cct, 1) << __func__ << " method=" << bad_method.method() + << " result " << cpp_strerror(bad_method.result()) + << ", allowed methods=" << bad_method.allowed_methods() + << ", allowed modes=" << bad_method.allowed_modes() + << dendl; + ceph_assert(messenger->auth_client); + auto am = auth_meta; + connection->lock.unlock(); + int r = messenger->auth_client->handle_auth_bad_method( + connection, + am.get(), + bad_method.method(), bad_method.result(), + bad_method.allowed_methods(), + bad_method.allowed_modes()); + connection->lock.lock(); + if (state != AUTH_CONNECTING || r < 0) { + return _fault(); + } + return send_auth_request(bad_method.allowed_methods()); +} + +CtPtr ProtocolV2::handle_auth_reply_more(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != AUTH_CONNECTING) { + lderr(cct) << __func__ << " not in auth connect state!" << dendl; + return _fault(); + } + + auto auth_more = AuthReplyMoreFrame::Decode(payload); + ldout(cct, 5) << __func__ + << " auth reply more len=" << auth_more.auth_payload().length() + << dendl; + ceph_assert(messenger->auth_client); + ceph::bufferlist reply; + auto am = auth_meta; + connection->lock.unlock(); + int r = messenger->auth_client->handle_auth_reply_more( + connection, am.get(), auth_more.auth_payload(), &reply); + connection->lock.lock(); + if (state != AUTH_CONNECTING) { + ldout(cct, 1) << __func__ << " state changed!" << dendl; + return _fault(); + } + if (r < 0) { + lderr(cct) << __func__ << " auth_client handle_auth_reply_more returned " + << r << dendl; + return _fault(); + } + auto more_reply = AuthRequestMoreFrame::Encode(reply); + return WRITE(more_reply, "auth request more", read_frame); +} + +CtPtr ProtocolV2::handle_auth_done(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != AUTH_CONNECTING) { + lderr(cct) << __func__ << " not in auth connect state!" << dendl; + return _fault(); + } + + auto auth_done = AuthDoneFrame::Decode(payload); + + ceph_assert(messenger->auth_client); + auto am = auth_meta; + connection->lock.unlock(); + int r = messenger->auth_client->handle_auth_done( + connection, + am.get(), + auth_done.global_id(), + auth_done.con_mode(), + auth_done.auth_payload(), + &am->session_key, + &am->connection_secret); + connection->lock.lock(); + if (state != AUTH_CONNECTING) { + ldout(cct, 1) << __func__ << " state changed!" << dendl; + return _fault(); + } + if (r < 0) { + return _fault(); + } + auth_meta->con_mode = auth_done.con_mode(); + bool is_rev1 = HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1); + session_stream_handlers = ceph::crypto::onwire::rxtx_t::create_handler_pair( + cct, *auth_meta, /*new_nonce_format=*/is_rev1, /*crossed=*/false); + + state = AUTH_CONNECTING_SIGN; + + const auto sig = auth_meta->session_key.empty() ? sha256_digest_t() : + auth_meta->session_key.hmac_sha256(cct, pre_auth.rxbuf); + auto sig_frame = AuthSignatureFrame::Encode(sig); + pre_auth.enabled = false; + pre_auth.rxbuf.clear(); + return WRITE(sig_frame, "auth signature", read_frame); +} + +CtPtr ProtocolV2::finish_client_auth() { + if (!server_cookie) { + ceph_assert(connect_seq == 0); + state = SESSION_CONNECTING; + return send_client_ident(); + } else { // reconnecting to previous session + state = SESSION_RECONNECTING; + ceph_assert(connect_seq > 0); + return send_reconnect(); + } +} + +CtPtr ProtocolV2::send_client_ident() { + ldout(cct, 20) << __func__ << dendl; + + if (!connection->policy.lossy && !client_cookie) { + client_cookie = ceph::util::generate_random_number<uint64_t>(1, -1ll); + } + + uint64_t flags = 0; + if (connection->policy.lossy) { + flags |= CEPH_MSG_CONNECT_LOSSY; + } + + auto client_ident = ClientIdentFrame::Encode( + messenger->get_myaddrs(), + connection->target_addr, + messenger->get_myname().num(), + global_seq, + connection->policy.features_supported, + connection->policy.features_required | msgr2_required, + flags, + client_cookie); + + ldout(cct, 5) << __func__ << " sending identification: " + << "addrs=" << messenger->get_myaddrs() + << " target=" << connection->target_addr + << " gid=" << messenger->get_myname().num() + << " global_seq=" << global_seq + << " features_supported=" << std::hex + << connection->policy.features_supported + << " features_required=" + << (connection->policy.features_required | msgr2_required) + << " flags=" << flags + << " cookie=" << client_cookie << std::dec << dendl; + + INTERCEPT(11); + + return WRITE(client_ident, "client ident", read_frame); +} + +CtPtr ProtocolV2::send_reconnect() { + ldout(cct, 20) << __func__ << dendl; + + auto reconnect = ReconnectFrame::Encode(messenger->get_myaddrs(), + client_cookie, + server_cookie, + global_seq, + connect_seq, + in_seq); + + ldout(cct, 5) << __func__ << " reconnect to session: client_cookie=" + << std::hex << client_cookie << " server_cookie=" + << server_cookie << std::dec + << " gs=" << global_seq << " cs=" << connect_seq + << " ms=" << in_seq << dendl; + + INTERCEPT(13); + + return WRITE(reconnect, "reconnect", read_frame); +} + +CtPtr ProtocolV2::handle_ident_missing_features(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != SESSION_CONNECTING) { + lderr(cct) << __func__ << " not in session connect state!" << dendl; + return _fault(); + } + + auto ident_missing = + IdentMissingFeaturesFrame::Decode(payload); + lderr(cct) << __func__ + << " client does not support all server features: " << std::hex + << ident_missing.features() << std::dec << dendl; + + return _fault(); +} + +CtPtr ProtocolV2::handle_session_reset(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != SESSION_RECONNECTING) { + lderr(cct) << __func__ << " not in session reconnect state!" << dendl; + return _fault(); + } + + auto reset = ResetFrame::Decode(payload); + + ldout(cct, 1) << __func__ << " received session reset full=" << reset.full() + << dendl; + if (reset.full()) { + reset_session(); + } else { + server_cookie = 0; + connect_seq = 0; + in_seq = 0; + } + + state = SESSION_CONNECTING; + return send_client_ident(); +} + +CtPtr ProtocolV2::handle_session_retry(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != SESSION_RECONNECTING) { + lderr(cct) << __func__ << " not in session reconnect state!" << dendl; + return _fault(); + } + + auto retry = RetryFrame::Decode(payload); + connect_seq = retry.connect_seq() + 1; + + ldout(cct, 1) << __func__ + << " received session retry connect_seq=" << retry.connect_seq() + << ", inc to cs=" << connect_seq << dendl; + + return send_reconnect(); +} + +CtPtr ProtocolV2::handle_session_retry_global(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != SESSION_RECONNECTING) { + lderr(cct) << __func__ << " not in session reconnect state!" << dendl; + return _fault(); + } + + auto retry = RetryGlobalFrame::Decode(payload); + global_seq = messenger->get_global_seq(retry.global_seq()); + + ldout(cct, 1) << __func__ << " received session retry global global_seq=" + << retry.global_seq() << ", choose new gs=" << global_seq + << dendl; + + return send_reconnect(); +} + +CtPtr ProtocolV2::handle_wait(ceph::bufferlist &payload) { + ldout(cct, 20) << __func__ + << " received WAIT (connection race)" + << " payload.length()=" << payload.length() + << dendl; + + if (state != SESSION_CONNECTING && state != SESSION_RECONNECTING) { + lderr(cct) << __func__ << " not in session (re)connect state!" << dendl; + return _fault(); + } + + state = WAIT; + WaitFrame::Decode(payload); + return _fault(); +} + +CtPtr ProtocolV2::handle_reconnect_ok(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != SESSION_RECONNECTING) { + lderr(cct) << __func__ << " not in session reconnect state!" << dendl; + return _fault(); + } + + auto reconnect_ok = ReconnectOkFrame::Decode(payload); + ldout(cct, 5) << __func__ + << " reconnect accepted: sms=" << reconnect_ok.msg_seq() + << dendl; + + out_seq = discard_requeued_up_to(out_seq, reconnect_ok.msg_seq()); + + backoff = utime_t(); + ldout(cct, 10) << __func__ << " reconnect success " << connect_seq + << ", lossy = " << connection->policy.lossy << ", features " + << connection->get_features() << dendl; + + if (connection->delay_state) { + ceph_assert(connection->delay_state->ready()); + } + + connection->dispatch_queue->queue_connect(connection); + messenger->ms_deliver_handle_fast_connect(connection); + + return ready(); +} + +CtPtr ProtocolV2::handle_server_ident(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != SESSION_CONNECTING) { + lderr(cct) << __func__ << " not in session connect state!" << dendl; + return _fault(); + } + + auto server_ident = ServerIdentFrame::Decode(payload); + ldout(cct, 5) << __func__ << " received server identification:" + << " addrs=" << server_ident.addrs() + << " gid=" << server_ident.gid() + << " global_seq=" << server_ident.global_seq() + << " features_supported=" << std::hex + << server_ident.supported_features() + << " features_required=" << server_ident.required_features() + << " flags=" << server_ident.flags() + << " cookie=" << server_ident.cookie() << std::dec << dendl; + + // is this who we intended to talk to? + // be a bit forgiving here, since we may be connecting based on addresses parsed out + // of mon_host or something. + if (!server_ident.addrs().contains(connection->target_addr)) { + ldout(cct,1) << __func__ << " peer identifies as " << server_ident.addrs() + << ", does not include " << connection->target_addr << dendl; + return _fault(); + } + + server_cookie = server_ident.cookie(); + + connection->set_peer_addrs(server_ident.addrs()); + peer_name = entity_name_t(connection->get_peer_type(), server_ident.gid()); + connection->set_features(server_ident.supported_features() & + connection->policy.features_supported); + peer_global_seq = server_ident.global_seq(); + + connection->policy.lossy = server_ident.flags() & CEPH_MSG_CONNECT_LOSSY; + + backoff = utime_t(); + ldout(cct, 10) << __func__ << " connect success " << connect_seq + << ", lossy = " << connection->policy.lossy << ", features " + << connection->get_features() << dendl; + + if (connection->delay_state) { + ceph_assert(connection->delay_state->ready()); + } + + connection->dispatch_queue->queue_connect(connection); + messenger->ms_deliver_handle_fast_connect(connection); + + return ready(); +} + +/* Server Protocol Methods */ + +CtPtr ProtocolV2::start_server_banner_exchange() { + ldout(cct, 20) << __func__ << dendl; + + INTERCEPT(2); + + state = BANNER_ACCEPTING; + + return _banner_exchange(CONTINUATION(post_server_banner_exchange)); +} + +CtPtr ProtocolV2::post_server_banner_exchange() { + ldout(cct, 20) << __func__ << dendl; + + state = AUTH_ACCEPTING; + + return CONTINUE(read_frame); +} + +CtPtr ProtocolV2::handle_auth_request(ceph::bufferlist &payload) { + ldout(cct, 20) << __func__ << " payload.length()=" << payload.length() + << dendl; + + if (state != AUTH_ACCEPTING) { + lderr(cct) << __func__ << " not in auth accept state!" << dendl; + return _fault(); + } + + auto request = AuthRequestFrame::Decode(payload); + ldout(cct, 10) << __func__ << " AuthRequest(method=" << request.method() + << ", preferred_modes=" << request.preferred_modes() + << ", payload_len=" << request.auth_payload().length() << ")" + << dendl; + auth_meta->auth_method = request.method(); + auth_meta->con_mode = messenger->auth_server->pick_con_mode( + connection->get_peer_type(), auth_meta->auth_method, + request.preferred_modes()); + if (auth_meta->con_mode == CEPH_CON_MODE_UNKNOWN) { + return _auth_bad_method(-EOPNOTSUPP); + } + return _handle_auth_request(request.auth_payload(), false); +} + +CtPtr ProtocolV2::_auth_bad_method(int r) +{ + ceph_assert(r < 0); + std::vector<uint32_t> allowed_methods; + std::vector<uint32_t> allowed_modes; + messenger->auth_server->get_supported_auth_methods( + connection->get_peer_type(), &allowed_methods, &allowed_modes); + ldout(cct, 1) << __func__ << " auth_method " << auth_meta->auth_method + << " r " << cpp_strerror(r) + << ", allowed_methods " << allowed_methods + << ", allowed_modes " << allowed_modes + << dendl; + auto bad_method = AuthBadMethodFrame::Encode(auth_meta->auth_method, r, + allowed_methods, allowed_modes); + return WRITE(bad_method, "bad auth method", read_frame); +} + +CtPtr ProtocolV2::_handle_auth_request(bufferlist& auth_payload, bool more) +{ + if (!messenger->auth_server) { + return _fault(); + } + bufferlist reply; + auto am = auth_meta; + connection->lock.unlock(); + int r = messenger->auth_server->handle_auth_request( + connection, am.get(), + more, am->auth_method, auth_payload, + &reply); + connection->lock.lock(); + if (state != AUTH_ACCEPTING && state != AUTH_ACCEPTING_MORE) { + ldout(cct, 1) << __func__ + << " state changed while accept, it must be mark_down" + << dendl; + ceph_assert(state == CLOSED); + return _fault(); + } + if (r == 1) { + INTERCEPT(10); + state = AUTH_ACCEPTING_SIGN; + + auto auth_done = AuthDoneFrame::Encode(connection->peer_global_id, + auth_meta->con_mode, + reply); + return WRITE(auth_done, "auth done", finish_auth); + } else if (r == 0) { + state = AUTH_ACCEPTING_MORE; + + auto more = AuthReplyMoreFrame::Encode(reply); + return WRITE(more, "auth reply more", read_frame); + } else if (r == -EBUSY) { + // kick the client and maybe they'll come back later + return _fault(); + } else { + return _auth_bad_method(r); + } +} + +CtPtr ProtocolV2::finish_auth() +{ + ceph_assert(auth_meta); + // TODO: having a possibility to check whether we're server or client could + // allow reusing finish_auth(). + bool is_rev1 = HAVE_MSGR2_FEATURE(peer_supported_features, REVISION_1); + session_stream_handlers = ceph::crypto::onwire::rxtx_t::create_handler_pair( + cct, *auth_meta, /*new_nonce_format=*/is_rev1, /*crossed=*/true); + + const auto sig = auth_meta->session_key.empty() ? sha256_digest_t() : + auth_meta->session_key.hmac_sha256(cct, pre_auth.rxbuf); + auto sig_frame = AuthSignatureFrame::Encode(sig); + pre_auth.enabled = false; + pre_auth.rxbuf.clear(); + return WRITE(sig_frame, "auth signature", read_frame); +} + +CtPtr ProtocolV2::handle_auth_request_more(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != AUTH_ACCEPTING_MORE) { + lderr(cct) << __func__ << " not in auth accept more state!" << dendl; + return _fault(); + } + + auto auth_more = AuthRequestMoreFrame::Decode(payload); + return _handle_auth_request(auth_more.auth_payload(), true); +} + +CtPtr ProtocolV2::handle_auth_signature(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != AUTH_ACCEPTING_SIGN && state != AUTH_CONNECTING_SIGN) { + lderr(cct) << __func__ + << " pre-auth verification signature seen in wrong state!" + << dendl; + return _fault(); + } + + auto sig_frame = AuthSignatureFrame::Decode(payload); + + const auto actual_tx_sig = auth_meta->session_key.empty() ? + sha256_digest_t() : auth_meta->session_key.hmac_sha256(cct, pre_auth.txbuf); + if (sig_frame.signature() != actual_tx_sig) { + ldout(cct, 2) << __func__ << " pre-auth signature mismatch" + << " actual_tx_sig=" << actual_tx_sig + << " sig_frame.signature()=" << sig_frame.signature() + << dendl; + return _fault(); + } else { + ldout(cct, 20) << __func__ << " pre-auth signature success" + << " sig_frame.signature()=" << sig_frame.signature() + << dendl; + pre_auth.txbuf.clear(); + } + + if (state == AUTH_ACCEPTING_SIGN) { + // server had sent AuthDone and client responded with correct pre-auth + // signature. we can start accepting new sessions/reconnects. + state = SESSION_ACCEPTING; + return CONTINUE(read_frame); + } else if (state == AUTH_CONNECTING_SIGN) { + // this happened at client side + return finish_client_auth(); + } else { + ceph_assert_always("state corruption" == nullptr); + } +} + +CtPtr ProtocolV2::handle_client_ident(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != SESSION_ACCEPTING) { + lderr(cct) << __func__ << " not in session accept state!" << dendl; + return _fault(); + } + + auto client_ident = ClientIdentFrame::Decode(payload); + + ldout(cct, 5) << __func__ << " received client identification:" + << " addrs=" << client_ident.addrs() + << " target=" << client_ident.target_addr() + << " gid=" << client_ident.gid() + << " global_seq=" << client_ident.global_seq() + << " features_supported=" << std::hex + << client_ident.supported_features() + << " features_required=" << client_ident.required_features() + << " flags=" << client_ident.flags() + << " cookie=" << client_ident.cookie() << std::dec << dendl; + + if (client_ident.addrs().empty() || + client_ident.addrs().front() == entity_addr_t()) { + ldout(cct,5) << __func__ << " oops, client_ident.addrs() is empty" << dendl; + return _fault(); // a v2 peer should never do this + } + if (!messenger->get_myaddrs().contains(client_ident.target_addr())) { + ldout(cct,5) << __func__ << " peer is trying to reach " + << client_ident.target_addr() + << " which is not us (" << messenger->get_myaddrs() << ")" + << dendl; + return _fault(); + } + + connection->set_peer_addrs(client_ident.addrs()); + connection->target_addr = connection->_infer_target_addr(client_ident.addrs()); + + peer_name = entity_name_t(connection->get_peer_type(), client_ident.gid()); + connection->set_peer_id(client_ident.gid()); + + client_cookie = client_ident.cookie(); + + uint64_t feat_missing = + (connection->policy.features_required | msgr2_required) & + ~(uint64_t)client_ident.supported_features(); + if (feat_missing) { + ldout(cct, 1) << __func__ << " peer missing required features " << std::hex + << feat_missing << std::dec << dendl; + auto ident_missing_features = + IdentMissingFeaturesFrame::Encode(feat_missing); + + return WRITE(ident_missing_features, "ident missing features", read_frame); + } + + connection_features = + client_ident.supported_features() & connection->policy.features_supported; + + peer_global_seq = client_ident.global_seq(); + + // Looks good so far, let's check if there is already an existing connection + // to this peer. + + connection->lock.unlock(); + AsyncConnectionRef existing = messenger->lookup_conn(*connection->peer_addrs); + + if (existing && + existing->protocol->proto_type != 2) { + ldout(cct,1) << __func__ << " existing " << existing << " proto " + << existing->protocol.get() << " version is " + << existing->protocol->proto_type << ", marking down" << dendl; + existing->mark_down(); + existing = nullptr; + } + + connection->inject_delay(); + + connection->lock.lock(); + if (state != SESSION_ACCEPTING) { + ldout(cct, 1) << __func__ + << " state changed while accept, it must be mark_down" + << dendl; + ceph_assert(state == CLOSED); + return _fault(); + } + + if (existing) { + return handle_existing_connection(existing); + } + + // if everything is OK reply with server identification + return send_server_ident(); +} + +CtPtr ProtocolV2::handle_reconnect(ceph::bufferlist &payload) +{ + ldout(cct, 20) << __func__ + << " payload.length()=" << payload.length() << dendl; + + if (state != SESSION_ACCEPTING) { + lderr(cct) << __func__ << " not in session accept state!" << dendl; + return _fault(); + } + + auto reconnect = ReconnectFrame::Decode(payload); + + ldout(cct, 5) << __func__ + << " received reconnect:" + << " client_cookie=" << std::hex << reconnect.client_cookie() + << " server_cookie=" << reconnect.server_cookie() << std::dec + << " gs=" << reconnect.global_seq() + << " cs=" << reconnect.connect_seq() + << " ms=" << reconnect.msg_seq() + << dendl; + + // Should we check if one of the ident.addrs match connection->target_addr + // as we do in ProtocolV1? + connection->set_peer_addrs(reconnect.addrs()); + connection->target_addr = connection->_infer_target_addr(reconnect.addrs()); + peer_global_seq = reconnect.global_seq(); + + connection->lock.unlock(); + AsyncConnectionRef existing = messenger->lookup_conn(*connection->peer_addrs); + + if (existing && + existing->protocol->proto_type != 2) { + ldout(cct,1) << __func__ << " existing " << existing << " proto " + << existing->protocol.get() << " version is " + << existing->protocol->proto_type << ", marking down" << dendl; + existing->mark_down(); + existing = nullptr; + } + + connection->inject_delay(); + + connection->lock.lock(); + if (state != SESSION_ACCEPTING) { + ldout(cct, 1) << __func__ + << " state changed while accept, it must be mark_down" + << dendl; + ceph_assert(state == CLOSED); + return _fault(); + } + + if (!existing) { + // there is no existing connection therefore cannot reconnect to previous + // session + ldout(cct, 0) << __func__ + << " no existing connection exists, reseting client" << dendl; + auto reset = ResetFrame::Encode(true); + return WRITE(reset, "session reset", read_frame); + } + + std::lock_guard<std::mutex> l(existing->lock); + + ProtocolV2 *exproto = dynamic_cast<ProtocolV2 *>(existing->protocol.get()); + if (!exproto) { + ldout(cct, 1) << __func__ << " existing=" << existing << dendl; + ceph_assert(false); + } + + if (exproto->state == CLOSED) { + ldout(cct, 5) << __func__ << " existing " << existing + << " already closed. Reseting client" << dendl; + auto reset = ResetFrame::Encode(true); + return WRITE(reset, "session reset", read_frame); + } + + if (exproto->replacing) { + ldout(cct, 1) << __func__ + << " existing racing replace happened while replacing." + << " existing=" << existing << dendl; + auto retry = RetryGlobalFrame::Encode(exproto->peer_global_seq); + return WRITE(retry, "session retry", read_frame); + } + + if (exproto->client_cookie != reconnect.client_cookie()) { + ldout(cct, 1) << __func__ << " existing=" << existing + << " client cookie mismatch, I must have reseted:" + << " cc=" << std::hex << exproto->client_cookie + << " rcc=" << reconnect.client_cookie() + << ", reseting client." << std::dec + << dendl; + auto reset = ResetFrame::Encode(connection->policy.resetcheck); + return WRITE(reset, "session reset", read_frame); + } else if (exproto->server_cookie == 0) { + // this happens when: + // - a connects to b + // - a sends client_ident + // - b gets client_ident, sends server_ident and sets cookie X + // - connection fault + // - b reconnects to a with cookie X, connect_seq=1 + // - a has cookie==0 + ldout(cct, 1) << __func__ << " I was a client and didn't received the" + << " server_ident. Asking peer to resume session" + << " establishment" << dendl; + auto reset = ResetFrame::Encode(false); + return WRITE(reset, "session reset", read_frame); + } + + if (exproto->peer_global_seq > reconnect.global_seq()) { + ldout(cct, 5) << __func__ + << " stale global_seq: sgs=" << exproto->peer_global_seq + << " cgs=" << reconnect.global_seq() + << ", ask client to retry global" << dendl; + auto retry = RetryGlobalFrame::Encode(exproto->peer_global_seq); + + INTERCEPT(18); + + return WRITE(retry, "session retry", read_frame); + } + + if (exproto->connect_seq > reconnect.connect_seq()) { + ldout(cct, 5) << __func__ + << " stale connect_seq scs=" << exproto->connect_seq + << " ccs=" << reconnect.connect_seq() + << " , ask client to retry" << dendl; + auto retry = RetryFrame::Encode(exproto->connect_seq); + return WRITE(retry, "session retry", read_frame); + } + + if (exproto->connect_seq == reconnect.connect_seq()) { + // reconnect race: both peers are sending reconnect messages + if (existing->peer_addrs->msgr2_addr() > + messenger->get_myaddrs().msgr2_addr() && + !existing->policy.server) { + // the existing connection wins + ldout(cct, 1) + << __func__ + << " reconnect race detected, this connection loses to existing=" + << existing << dendl; + + auto wait = WaitFrame::Encode(); + return WRITE(wait, "wait", read_frame); + } else { + // this connection wins + ldout(cct, 1) << __func__ + << " reconnect race detected, replacing existing=" + << existing << " socket by this connection's socket" + << dendl; + } + } + + ldout(cct, 1) << __func__ << " reconnect to existing=" << existing << dendl; + + reconnecting = true; + + // everything looks good + exproto->connect_seq = reconnect.connect_seq(); + exproto->message_seq = reconnect.msg_seq(); + + return reuse_connection(existing, exproto); +} + +CtPtr ProtocolV2::handle_existing_connection(AsyncConnectionRef existing) { + ldout(cct, 20) << __func__ << " existing=" << existing << dendl; + + std::lock_guard<std::mutex> l(existing->lock); + + ProtocolV2 *exproto = dynamic_cast<ProtocolV2 *>(existing->protocol.get()); + if (!exproto) { + ldout(cct, 1) << __func__ << " existing=" << existing << dendl; + ceph_assert(false); + } + + if (exproto->state == CLOSED) { + ldout(cct, 1) << __func__ << " existing " << existing << " already closed." + << dendl; + return send_server_ident(); + } + + if (exproto->replacing) { + ldout(cct, 1) << __func__ + << " existing racing replace happened while replacing." + << " existing=" << existing << dendl; + auto wait = WaitFrame::Encode(); + return WRITE(wait, "wait", read_frame); + } + + if (exproto->peer_global_seq > peer_global_seq) { + ldout(cct, 1) << __func__ << " this is a stale connection, peer_global_seq=" + << peer_global_seq + << " existing->peer_global_seq=" << exproto->peer_global_seq + << ", stopping this connection." << dendl; + stop(); + connection->dispatch_queue->queue_reset(connection); + return nullptr; + } + + if (existing->policy.lossy) { + // existing connection can be thrown out in favor of this one + ldout(cct, 1) + << __func__ << " existing=" << existing + << " is a lossy channel. Stopping existing in favor of this connection" + << dendl; + existing->protocol->stop(); + existing->dispatch_queue->queue_reset(existing.get()); + return send_server_ident(); + } + + if (exproto->server_cookie && exproto->client_cookie && + exproto->client_cookie != client_cookie) { + // Found previous session + // peer has reseted and we're going to reuse the existing connection + // by replacing the communication socket + ldout(cct, 1) << __func__ << " found previous session existing=" << existing + << ", peer must have reseted." << dendl; + if (connection->policy.resetcheck) { + exproto->reset_session(); + } + return reuse_connection(existing, exproto); + } + + if (exproto->client_cookie == client_cookie) { + // session establishment interrupted between client_ident and server_ident, + // continuing... + ldout(cct, 1) << __func__ << " found previous session existing=" << existing + << ", continuing session establishment." << dendl; + return reuse_connection(existing, exproto); + } + + if (exproto->state == READY || exproto->state == STANDBY) { + ldout(cct, 1) << __func__ << " existing=" << existing + << " is READY/STANDBY, lets reuse it" << dendl; + return reuse_connection(existing, exproto); + } + + // Looks like a connection race: server and client are both connecting to + // each other at the same time. + if (connection->peer_addrs->msgr2_addr() < + messenger->get_myaddrs().msgr2_addr() || + existing->policy.server) { + // this connection wins + ldout(cct, 1) << __func__ + << " connection race detected, replacing existing=" + << existing << " socket by this connection's socket" << dendl; + return reuse_connection(existing, exproto); + } else { + // the existing connection wins + ldout(cct, 1) + << __func__ + << " connection race detected, this connection loses to existing=" + << existing << dendl; + ceph_assert(connection->peer_addrs->msgr2_addr() > + messenger->get_myaddrs().msgr2_addr()); + + // make sure we follow through with opening the existing + // connection (if it isn't yet open) since we know the peer + // has something to send to us. + existing->send_keepalive(); + auto wait = WaitFrame::Encode(); + return WRITE(wait, "wait", read_frame); + } +} + +CtPtr ProtocolV2::reuse_connection(AsyncConnectionRef existing, + ProtocolV2 *exproto) { + ldout(cct, 20) << __func__ << " existing=" << existing + << " reconnect=" << reconnecting << dendl; + + connection->inject_delay(); + + std::lock_guard<std::mutex> l(existing->write_lock); + + connection->center->delete_file_event(connection->cs.fd(), + EVENT_READABLE | EVENT_WRITABLE); + + if (existing->delay_state) { + existing->delay_state->flush(); + ceph_assert(!connection->delay_state); + } + exproto->reset_recv_state(); + exproto->pre_auth.enabled = false; + + if (!reconnecting) { + exproto->peer_supported_features = peer_supported_features; + exproto->tx_frame_asm.set_is_rev1(tx_frame_asm.get_is_rev1()); + exproto->rx_frame_asm.set_is_rev1(rx_frame_asm.get_is_rev1()); + + exproto->client_cookie = client_cookie; + exproto->peer_name = peer_name; + exproto->connection_features = connection_features; + existing->set_features(connection_features); + } + exproto->peer_global_seq = peer_global_seq; + + ceph_assert(connection->center->in_thread()); + auto temp_cs = std::move(connection->cs); + EventCenter *new_center = connection->center; + Worker *new_worker = connection->worker; + // we can steal the session_stream_handlers under the assumption + // this happens in the event center's thread as there should be + // no user outside its boundaries (simlarly to e.g. outgoing_bl). + auto temp_stream_handlers = std::move(session_stream_handlers); + exproto->auth_meta = auth_meta; + + ldout(messenger->cct, 5) << __func__ << " stop myself to swap existing" + << dendl; + + // avoid _stop shutdown replacing socket + // queue a reset on the new connection, which we're dumping for the old + stop(); + + connection->dispatch_queue->queue_reset(connection); + + exproto->can_write = false; + exproto->write_in_progress = false; + exproto->reconnecting = reconnecting; + exproto->replacing = true; + existing->state_offset = 0; + // avoid previous thread modify event + exproto->state = NONE; + existing->state = AsyncConnection::STATE_NONE; + // Discard existing prefetch buffer in `recv_buf` + existing->recv_start = existing->recv_end = 0; + // there shouldn't exist any buffer + ceph_assert(connection->recv_start == connection->recv_end); + + auto deactivate_existing = std::bind( + [ existing, + new_worker, + new_center, + exproto, + temp_stream_handlers=std::move(temp_stream_handlers) + ](ConnectedSocket &cs) mutable { + // we need to delete time event in original thread + { + std::lock_guard<std::mutex> l(existing->lock); + existing->write_lock.lock(); + exproto->requeue_sent(); + // XXX: do we really need the locking for `outgoing_bl`? There is + // a comment just above its definition saying "lockfree, only used + // in own thread". I'm following lockfull schema just in the case. + // From performance point of view it should be fine – this happens + // far away from hot paths. + existing->outgoing_bl.clear(); + existing->open_write = false; + exproto->session_stream_handlers = std::move(temp_stream_handlers); + existing->write_lock.unlock(); + if (exproto->state == NONE) { + existing->shutdown_socket(); + existing->cs = std::move(cs); + existing->worker->references--; + new_worker->references++; + existing->logger = new_worker->get_perf_counter(); + existing->worker = new_worker; + existing->center = new_center; + if (existing->delay_state) + existing->delay_state->set_center(new_center); + } else if (exproto->state == CLOSED) { + auto back_to_close = std::bind( + [](ConnectedSocket &cs) mutable { cs.close(); }, std::move(cs)); + new_center->submit_to(new_center->get_id(), + std::move(back_to_close), true); + return; + } else { + ceph_abort(); + } + } + + // Before changing existing->center, it may already exists some + // events in existing->center's queue. Then if we mark down + // `existing`, it will execute in another thread and clean up + // connection. Previous event will result in segment fault + auto transfer_existing = [existing, exproto]() mutable { + std::lock_guard<std::mutex> l(existing->lock); + if (exproto->state == CLOSED) return; + ceph_assert(exproto->state == NONE); + + exproto->state = SESSION_ACCEPTING; + // we have called shutdown_socket above + ceph_assert(existing->last_tick_id == 0); + // restart timer since we are going to re-build connection + existing->last_connect_started = ceph::coarse_mono_clock::now(); + existing->last_tick_id = existing->center->create_time_event( + existing->connect_timeout_us, existing->tick_handler); + existing->state = AsyncConnection::STATE_CONNECTION_ESTABLISHED; + existing->center->create_file_event(existing->cs.fd(), EVENT_READABLE, + existing->read_handler); + if (!exproto->reconnecting) { + exproto->run_continuation(exproto->send_server_ident()); + } else { + exproto->run_continuation(exproto->send_reconnect_ok()); + } + }; + if (existing->center->in_thread()) + transfer_existing(); + else + existing->center->submit_to(existing->center->get_id(), + std::move(transfer_existing), true); + }, + std::move(temp_cs)); + + existing->center->submit_to(existing->center->get_id(), + std::move(deactivate_existing), true); + return nullptr; +} + +CtPtr ProtocolV2::send_server_ident() { + ldout(cct, 20) << __func__ << dendl; + + // this is required for the case when this connection is being replaced + out_seq = discard_requeued_up_to(out_seq, 0); + in_seq = 0; + + if (!connection->policy.lossy) { + server_cookie = ceph::util::generate_random_number<uint64_t>(1, -1ll); + } + + uint64_t flags = 0; + if (connection->policy.lossy) { + flags = flags | CEPH_MSG_CONNECT_LOSSY; + } + + uint64_t gs = messenger->get_global_seq(); + auto server_ident = ServerIdentFrame::Encode( + messenger->get_myaddrs(), + messenger->get_myname().num(), + gs, + connection->policy.features_supported, + connection->policy.features_required | msgr2_required, + flags, + server_cookie); + + ldout(cct, 5) << __func__ << " sending identification:" + << " addrs=" << messenger->get_myaddrs() + << " gid=" << messenger->get_myname().num() + << " global_seq=" << gs << " features_supported=" << std::hex + << connection->policy.features_supported + << " features_required=" + << (connection->policy.features_required | msgr2_required) + << " flags=" << flags + << " cookie=" << server_cookie << std::dec << dendl; + + connection->lock.unlock(); + // Because "replacing" will prevent other connections preempt this addr, + // it's safe that here we don't acquire Connection's lock + ssize_t r = messenger->accept_conn(connection); + + connection->inject_delay(); + + connection->lock.lock(); + + if (r < 0) { + ldout(cct, 1) << __func__ << " existing race replacing process for addr = " + << connection->peer_addrs->msgr2_addr() + << " just fail later one(this)" << dendl; + connection->inject_delay(); + return _fault(); + } + if (state != SESSION_ACCEPTING) { + ldout(cct, 1) << __func__ + << " state changed while accept_conn, it must be mark_down" + << dendl; + ceph_assert(state == CLOSED || state == NONE); + messenger->unregister_conn(connection); + connection->inject_delay(); + return _fault(); + } + + connection->set_features(connection_features); + + // notify + connection->dispatch_queue->queue_accept(connection); + messenger->ms_deliver_handle_fast_accept(connection); + + INTERCEPT(12); + + return WRITE(server_ident, "server ident", server_ready); +} + +CtPtr ProtocolV2::server_ready() { + ldout(cct, 20) << __func__ << dendl; + + if (connection->delay_state) { + ceph_assert(connection->delay_state->ready()); + } + + return ready(); +} + +CtPtr ProtocolV2::send_reconnect_ok() { + ldout(cct, 20) << __func__ << dendl; + + out_seq = discard_requeued_up_to(out_seq, message_seq); + + uint64_t ms = in_seq; + auto reconnect_ok = ReconnectOkFrame::Encode(ms); + + ldout(cct, 5) << __func__ << " sending reconnect_ok: msg_seq=" << ms << dendl; + + connection->lock.unlock(); + // Because "replacing" will prevent other connections preempt this addr, + // it's safe that here we don't acquire Connection's lock + ssize_t r = messenger->accept_conn(connection); + + connection->inject_delay(); + + connection->lock.lock(); + + if (r < 0) { + ldout(cct, 1) << __func__ << " existing race replacing process for addr = " + << connection->peer_addrs->msgr2_addr() + << " just fail later one(this)" << dendl; + connection->inject_delay(); + return _fault(); + } + if (state != SESSION_ACCEPTING) { + ldout(cct, 1) << __func__ + << " state changed while accept_conn, it must be mark_down" + << dendl; + ceph_assert(state == CLOSED || state == NONE); + messenger->unregister_conn(connection); + connection->inject_delay(); + return _fault(); + } + + // notify + connection->dispatch_queue->queue_accept(connection); + messenger->ms_deliver_handle_fast_accept(connection); + + INTERCEPT(14); + + return WRITE(reconnect_ok, "reconnect ok", server_ready); +} diff --git a/src/msg/async/ProtocolV2.h b/src/msg/async/ProtocolV2.h new file mode 100644 index 00000000..4941cea5 --- /dev/null +++ b/src/msg/async/ProtocolV2.h @@ -0,0 +1,259 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#ifndef _MSG_ASYNC_PROTOCOL_V2_ +#define _MSG_ASYNC_PROTOCOL_V2_ + +#include "Protocol.h" +#include "crypto_onwire.h" +#include "frames_v2.h" + +class ProtocolV2 : public Protocol { +private: + enum State { + NONE, + START_CONNECT, + BANNER_CONNECTING, + HELLO_CONNECTING, + AUTH_CONNECTING, + AUTH_CONNECTING_SIGN, + SESSION_CONNECTING, + SESSION_RECONNECTING, + START_ACCEPT, + BANNER_ACCEPTING, + HELLO_ACCEPTING, + AUTH_ACCEPTING, + AUTH_ACCEPTING_MORE, + AUTH_ACCEPTING_SIGN, + SESSION_ACCEPTING, + READY, + THROTTLE_MESSAGE, + THROTTLE_BYTES, + THROTTLE_DISPATCH_QUEUE, + THROTTLE_DONE, + READ_MESSAGE_COMPLETE, + STANDBY, + WAIT, + CLOSED + }; + + static const char *get_state_name(int state) { + const char *const statenames[] = {"NONE", + "START_CONNECT", + "BANNER_CONNECTING", + "HELLO_CONNECTING", + "AUTH_CONNECTING", + "AUTH_CONNECTING_SIGN", + "SESSION_CONNECTING", + "SESSION_RECONNECTING", + "START_ACCEPT", + "BANNER_ACCEPTING", + "HELLO_ACCEPTING", + "AUTH_ACCEPTING", + "AUTH_ACCEPTING_MORE", + "AUTH_ACCEPTING_SIGN", + "SESSION_ACCEPTING", + "READY", + "THROTTLE_MESSAGE", + "THROTTLE_BYTES", + "THROTTLE_DISPATCH_QUEUE", + "THROTTLE_DONE", + "READ_MESSAGE_COMPLETE", + "STANDBY", + "WAIT", + "CLOSED"}; + return statenames[state]; + } + + // TODO: move into auth_meta? + ceph::crypto::onwire::rxtx_t session_stream_handlers; + + entity_name_t peer_name; + State state; + uint64_t peer_supported_features; // CEPH_MSGR2_FEATURE_* + + uint64_t client_cookie; + uint64_t server_cookie; + uint64_t global_seq; + uint64_t connect_seq; + uint64_t peer_global_seq; + uint64_t message_seq; + bool reconnecting; + bool replacing; + bool can_write; + struct out_queue_entry_t { + bool is_prepared {false}; + Message* m {nullptr}; + }; + std::map<int, std::list<out_queue_entry_t>> out_queue; + std::list<Message *> sent; + std::atomic<uint64_t> out_seq{0}; + std::atomic<uint64_t> in_seq{0}; + std::atomic<uint64_t> ack_left{0}; + + using ProtFuncPtr = void (ProtocolV2::*)(); + Ct<ProtocolV2> *bannerExchangeCallback; + + ceph::msgr::v2::FrameAssembler tx_frame_asm; + ceph::msgr::v2::FrameAssembler rx_frame_asm; + + ceph::bufferlist rx_preamble; + ceph::bufferlist rx_epilogue; + ceph::msgr::v2::segment_bls_t rx_segments_data; + ceph::msgr::v2::Tag next_tag; + utime_t backoff; // backoff time + utime_t recv_stamp; + utime_t throttle_stamp; + + struct { + ceph::bufferlist rxbuf; + ceph::bufferlist txbuf; + bool enabled {true}; + } pre_auth; + + bool keepalive; + bool write_in_progress = false; + + ostream &_conn_prefix(std::ostream *_dout); + void run_continuation(Ct<ProtocolV2> *pcontinuation); + void run_continuation(Ct<ProtocolV2> &continuation); + + Ct<ProtocolV2> *read(CONTINUATION_RXBPTR_TYPE<ProtocolV2> &next, + rx_buffer_t&& buffer); + template <class F> + Ct<ProtocolV2> *write(const std::string &desc, + CONTINUATION_TYPE<ProtocolV2> &next, + F &frame); + Ct<ProtocolV2> *write(const std::string &desc, + CONTINUATION_TYPE<ProtocolV2> &next, + bufferlist &buffer); + + template <class F> + bool append_frame(F& frame); + + void requeue_sent(); + uint64_t discard_requeued_up_to(uint64_t out_seq, uint64_t seq); + void reset_recv_state(); + void reset_security(); + void reset_throttle(); + Ct<ProtocolV2> *_fault(); + void discard_out_queue(); + void reset_session(); + void prepare_send_message(uint64_t features, Message *m); + out_queue_entry_t _get_next_outgoing(); + ssize_t write_message(Message *m, bool more); + void handle_message_ack(uint64_t seq); + + CONTINUATION_DECL(ProtocolV2, _wait_for_peer_banner); + READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, _handle_peer_banner); + READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, _handle_peer_banner_payload); + + Ct<ProtocolV2> *_banner_exchange(Ct<ProtocolV2> &callback); + Ct<ProtocolV2> *_wait_for_peer_banner(); + Ct<ProtocolV2> *_handle_peer_banner(rx_buffer_t &&buffer, int r); + Ct<ProtocolV2> *_handle_peer_banner_payload(rx_buffer_t &&buffer, int r); + Ct<ProtocolV2> *handle_hello(ceph::bufferlist &payload); + + CONTINUATION_DECL(ProtocolV2, read_frame); + CONTINUATION_DECL(ProtocolV2, finish_auth); + READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, handle_read_frame_preamble_main); + READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, handle_read_frame_segment); + READ_BPTR_HANDLER_CONTINUATION_DECL(ProtocolV2, handle_read_frame_epilogue_main); + CONTINUATION_DECL(ProtocolV2, throttle_message); + CONTINUATION_DECL(ProtocolV2, throttle_bytes); + CONTINUATION_DECL(ProtocolV2, throttle_dispatch_queue); + + Ct<ProtocolV2> *read_frame(); + Ct<ProtocolV2> *finish_auth(); + Ct<ProtocolV2> *finish_client_auth(); + Ct<ProtocolV2> *handle_read_frame_preamble_main(rx_buffer_t &&buffer, int r); + Ct<ProtocolV2> *read_frame_segment(); + Ct<ProtocolV2> *handle_read_frame_segment(rx_buffer_t &&rx_buffer, int r); + Ct<ProtocolV2> *_handle_read_frame_segment(); + Ct<ProtocolV2> *handle_read_frame_epilogue_main(rx_buffer_t &&buffer, int r); + Ct<ProtocolV2> *_handle_read_frame_epilogue_main(); + Ct<ProtocolV2> *handle_read_frame_dispatch(); + Ct<ProtocolV2> *handle_frame_payload(); + + Ct<ProtocolV2> *ready(); + + Ct<ProtocolV2> *handle_message(); + Ct<ProtocolV2> *throttle_message(); + Ct<ProtocolV2> *throttle_bytes(); + Ct<ProtocolV2> *throttle_dispatch_queue(); + Ct<ProtocolV2> *read_message_data_prepare(); + + Ct<ProtocolV2> *handle_keepalive2(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_keepalive2_ack(ceph::bufferlist &payload); + + Ct<ProtocolV2> *handle_message_ack(ceph::bufferlist &payload); + +public: + uint64_t connection_features; + + ProtocolV2(AsyncConnection *connection); + virtual ~ProtocolV2(); + + virtual void connect() override; + virtual void accept() override; + virtual bool is_connected() override; + virtual void stop() override; + virtual void fault() override; + virtual void send_message(Message *m) override; + virtual void send_keepalive() override; + + virtual void read_event() override; + virtual void write_event() override; + virtual bool is_queued() override; + +private: + // Client Protocol + CONTINUATION_DECL(ProtocolV2, start_client_banner_exchange); + CONTINUATION_DECL(ProtocolV2, post_client_banner_exchange); + + Ct<ProtocolV2> *start_client_banner_exchange(); + Ct<ProtocolV2> *post_client_banner_exchange(); + inline Ct<ProtocolV2> *send_auth_request() { + std::vector<uint32_t> empty; + return send_auth_request(empty); + } + Ct<ProtocolV2> *send_auth_request(std::vector<uint32_t> &allowed_methods); + Ct<ProtocolV2> *handle_auth_bad_method(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_auth_reply_more(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_auth_done(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_auth_signature(ceph::bufferlist &payload); + Ct<ProtocolV2> *send_client_ident(); + Ct<ProtocolV2> *send_reconnect(); + Ct<ProtocolV2> *handle_ident_missing_features(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_session_reset(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_session_retry(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_session_retry_global(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_wait(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_reconnect_ok(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_server_ident(ceph::bufferlist &payload); + + // Server Protocol + CONTINUATION_DECL(ProtocolV2, start_server_banner_exchange); + CONTINUATION_DECL(ProtocolV2, post_server_banner_exchange); + CONTINUATION_DECL(ProtocolV2, server_ready); + + Ct<ProtocolV2> *start_server_banner_exchange(); + Ct<ProtocolV2> *post_server_banner_exchange(); + Ct<ProtocolV2> *handle_auth_request(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_auth_request_more(ceph::bufferlist &payload); + Ct<ProtocolV2> *_handle_auth_request(bufferlist& auth_payload, bool more); + Ct<ProtocolV2> *_auth_bad_method(int r); + Ct<ProtocolV2> *handle_client_ident(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_ident_missing_features_write(int r); + Ct<ProtocolV2> *handle_reconnect(ceph::bufferlist &payload); + Ct<ProtocolV2> *handle_existing_connection(AsyncConnectionRef existing); + Ct<ProtocolV2> *reuse_connection(AsyncConnectionRef existing, + ProtocolV2 *exproto); + Ct<ProtocolV2> *send_server_ident(); + Ct<ProtocolV2> *send_reconnect_ok(); + Ct<ProtocolV2> *server_ready(); + + size_t get_current_msg_size() const; +}; + +#endif /* _MSG_ASYNC_PROTOCOL_V2_ */ diff --git a/src/msg/async/Stack.cc b/src/msg/async/Stack.cc new file mode 100644 index 00000000..8976c3cc --- /dev/null +++ b/src/msg/async/Stack.cc @@ -0,0 +1,217 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <mutex> + +#include "include/compat.h" +#include "common/Cond.h" +#include "common/errno.h" +#include "PosixStack.h" +#ifdef HAVE_RDMA +#include "rdma/RDMAStack.h" +#endif +#ifdef HAVE_DPDK +#include "dpdk/DPDKStack.h" +#endif + +#include "common/dout.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix *_dout << "stack " + +std::function<void ()> NetworkStack::add_thread(unsigned i) +{ + Worker *w = workers[i]; + return [this, w]() { + char tp_name[16]; + sprintf(tp_name, "msgr-worker-%u", w->id); + ceph_pthread_setname(pthread_self(), tp_name); + const unsigned EventMaxWaitUs = 30000000; + w->center.set_owner(); + ldout(cct, 10) << __func__ << " starting" << dendl; + w->initialize(); + w->init_done(); + while (!w->done) { + ldout(cct, 30) << __func__ << " calling event process" << dendl; + + ceph::timespan dur; + int r = w->center.process_events(EventMaxWaitUs, &dur); + if (r < 0) { + ldout(cct, 20) << __func__ << " process events failed: " + << cpp_strerror(errno) << dendl; + // TODO do something? + } + w->perf_logger->tinc(l_msgr_running_total_time, dur); + } + w->reset(); + w->destroy(); + }; +} + +std::shared_ptr<NetworkStack> NetworkStack::create(CephContext *c, const string &t) +{ + if (t == "posix") + return std::make_shared<PosixNetworkStack>(c, t); +#ifdef HAVE_RDMA + else if (t == "rdma") + return std::make_shared<RDMAStack>(c, t); +#endif +#ifdef HAVE_DPDK + else if (t == "dpdk") + return std::make_shared<DPDKStack>(c, t); +#endif + + lderr(c) << __func__ << " ms_async_transport_type " << t << + " is not supported! " << dendl; + ceph_abort(); + return nullptr; +} + +Worker* NetworkStack::create_worker(CephContext *c, const string &type, unsigned i) +{ + if (type == "posix") + return new PosixWorker(c, i); +#ifdef HAVE_RDMA + else if (type == "rdma") + return new RDMAWorker(c, i); +#endif +#ifdef HAVE_DPDK + else if (type == "dpdk") + return new DPDKWorker(c, i); +#endif + + lderr(c) << __func__ << " ms_async_transport_type " << type << + " is not supported! " << dendl; + ceph_abort(); + return nullptr; +} + +NetworkStack::NetworkStack(CephContext *c, const string &t): type(t), started(false), cct(c) +{ + ceph_assert(cct->_conf->ms_async_op_threads > 0); + + const int InitEventNumber = 5000; + num_workers = cct->_conf->ms_async_op_threads; + if (num_workers >= EventCenter::MAX_EVENTCENTER) { + ldout(cct, 0) << __func__ << " max thread limit is " + << EventCenter::MAX_EVENTCENTER << ", switching to this now. " + << "Higher thread values are unnecessary and currently unsupported." + << dendl; + num_workers = EventCenter::MAX_EVENTCENTER; + } + + for (unsigned i = 0; i < num_workers; ++i) { + Worker *w = create_worker(cct, type, i); + w->center.init(InitEventNumber, i, type); + workers.push_back(w); + } +} + +void NetworkStack::start() +{ + std::unique_lock<decltype(pool_spin)> lk(pool_spin); + + if (started) { + return ; + } + + for (unsigned i = 0; i < num_workers; ++i) { + if (workers[i]->is_init()) + continue; + std::function<void ()> thread = add_thread(i); + spawn_worker(i, std::move(thread)); + } + started = true; + lk.unlock(); + + for (unsigned i = 0; i < num_workers; ++i) + workers[i]->wait_for_init(); +} + +Worker* NetworkStack::get_worker() +{ + ldout(cct, 30) << __func__ << dendl; + + // start with some reasonably large number + unsigned min_load = std::numeric_limits<int>::max(); + Worker* current_best = nullptr; + + pool_spin.lock(); + // find worker with least references + // tempting case is returning on references == 0, but in reality + // this will happen so rarely that there's no need for special case. + for (unsigned i = 0; i < num_workers; ++i) { + unsigned worker_load = workers[i]->references.load(); + if (worker_load < min_load) { + current_best = workers[i]; + min_load = worker_load; + } + } + + pool_spin.unlock(); + ceph_assert(current_best); + ++current_best->references; + return current_best; +} + +void NetworkStack::stop() +{ + std::lock_guard<decltype(pool_spin)> lk(pool_spin); + for (unsigned i = 0; i < num_workers; ++i) { + workers[i]->done = true; + workers[i]->center.wakeup(); + join_worker(i); + } + started = false; +} + +class C_drain : public EventCallback { + Mutex drain_lock; + Cond drain_cond; + unsigned drain_count; + + public: + explicit C_drain(size_t c) + : drain_lock("C_drain::drain_lock"), + drain_count(c) {} + void do_request(uint64_t id) override { + Mutex::Locker l(drain_lock); + drain_count--; + if (drain_count == 0) drain_cond.Signal(); + } + void wait() { + Mutex::Locker l(drain_lock); + while (drain_count) + drain_cond.Wait(drain_lock); + } +}; + +void NetworkStack::drain() +{ + ldout(cct, 30) << __func__ << " started." << dendl; + pthread_t cur = pthread_self(); + pool_spin.lock(); + C_drain drain(num_workers); + for (unsigned i = 0; i < num_workers; ++i) { + ceph_assert(cur != workers[i]->center.get_owner()); + workers[i]->center.dispatch_event_external(EventCallbackRef(&drain)); + } + pool_spin.unlock(); + drain.wait(); + ldout(cct, 30) << __func__ << " end." << dendl; +} diff --git a/src/msg/async/Stack.h b/src/msg/async/Stack.h new file mode 100644 index 00000000..a093dadb --- /dev/null +++ b/src/msg/async/Stack.h @@ -0,0 +1,356 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 XSKY <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MSG_ASYNC_STACK_H +#define CEPH_MSG_ASYNC_STACK_H + +#include "include/spinlock.h" +#include "common/perf_counters.h" +#include "msg/msg_types.h" +#include "msg/async/Event.h" + +class Worker; +class ConnectedSocketImpl { + public: + virtual ~ConnectedSocketImpl() {} + virtual int is_connected() = 0; + virtual ssize_t read(char*, size_t) = 0; + virtual ssize_t zero_copy_read(bufferptr&) = 0; + virtual ssize_t send(bufferlist &bl, bool more) = 0; + virtual void shutdown() = 0; + virtual void close() = 0; + virtual int fd() const = 0; + virtual int socket_fd() const = 0; +}; + +class ConnectedSocket; +struct SocketOptions { + bool nonblock = true; + bool nodelay = true; + int rcbuf_size = 0; + int priority = -1; + entity_addr_t connect_bind_addr; +}; + +/// \cond internal +class ServerSocketImpl { + public: + unsigned addr_type; ///< entity_addr_t::TYPE_* + unsigned addr_slot; ///< position of our addr in myaddrs().v + ServerSocketImpl(unsigned type, unsigned slot) + : addr_type(type), addr_slot(slot) {} + virtual ~ServerSocketImpl() {} + virtual int accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w) = 0; + virtual void abort_accept() = 0; + /// Get file descriptor + virtual int fd() const = 0; +}; +/// \endcond + +/// \addtogroup networking-module +/// @{ + +/// A TCP (or other stream-based protocol) connection. +/// +/// A \c ConnectedSocket represents a full-duplex stream between +/// two endpoints, a local endpoint and a remote endpoint. +class ConnectedSocket { + std::unique_ptr<ConnectedSocketImpl> _csi; + + public: + /// Constructs a \c ConnectedSocket not corresponding to a connection + ConnectedSocket() {}; + /// \cond internal + explicit ConnectedSocket(std::unique_ptr<ConnectedSocketImpl> csi) + : _csi(std::move(csi)) {} + /// \endcond + ~ConnectedSocket() { + if (_csi) + _csi->close(); + } + /// Moves a \c ConnectedSocket object. + ConnectedSocket(ConnectedSocket&& cs) = default; + /// Move-assigns a \c ConnectedSocket object. + ConnectedSocket& operator=(ConnectedSocket&& cs) = default; + + int is_connected() { + return _csi->is_connected(); + } + /// Read the input stream with copy. + /// + /// Copy an object returning data sent from the remote endpoint. + ssize_t read(char* buf, size_t len) { + return _csi->read(buf, len); + } + /// Gets the input stream. + /// + /// Gets an object returning data sent from the remote endpoint. + ssize_t zero_copy_read(bufferptr &data) { + return _csi->zero_copy_read(data); + } + /// Gets the output stream. + /// + /// Gets an object that sends data to the remote endpoint. + ssize_t send(bufferlist &bl, bool more) { + return _csi->send(bl, more); + } + /// Disables output to the socket. + /// + /// Current or future writes that have not been successfully flushed + /// will immediately fail with an error. This is useful to abort + /// operations on a socket that is not making progress due to a + /// peer failure. + void shutdown() { + return _csi->shutdown(); + } + /// Disables input from the socket. + /// + /// Current or future reads will immediately fail with an error. + /// This is useful to abort operations on a socket that is not making + /// progress due to a peer failure. + void close() { + _csi->close(); + _csi.reset(); + } + + /// Get file descriptor + int fd() const { + return _csi->fd(); + } + int socket_fd() const { + return _csi->socket_fd(); + } + + explicit operator bool() const { + return _csi.get(); + } +}; +/// @} + +/// \addtogroup networking-module +/// @{ + +/// A listening socket, waiting to accept incoming network connections. +class ServerSocket { + std::unique_ptr<ServerSocketImpl> _ssi; + public: + /// Constructs a \c ServerSocket not corresponding to a connection + ServerSocket() {} + /// \cond internal + explicit ServerSocket(std::unique_ptr<ServerSocketImpl> ssi) + : _ssi(std::move(ssi)) {} + ~ServerSocket() { + if (_ssi) + _ssi->abort_accept(); + } + /// \endcond + /// Moves a \c ServerSocket object. + ServerSocket(ServerSocket&& ss) = default; + /// Move-assigns a \c ServerSocket object. + ServerSocket& operator=(ServerSocket&& cs) = default; + + /// Accepts the next connection to successfully connect to this socket. + /// + /// \Accepts a \ref ConnectedSocket representing the connection, and + /// a \ref entity_addr_t describing the remote endpoint. + int accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w) { + return _ssi->accept(sock, opt, out, w); + } + + /// Stops any \ref accept() in progress. + /// + /// Current and future \ref accept() calls will terminate immediately + /// with an error. + void abort_accept() { + _ssi->abort_accept(); + _ssi.reset(); + } + + /// Get file descriptor + int fd() const { + return _ssi->fd(); + } + + /// get listen/bind addr + unsigned get_addr_slot() { + return _ssi->addr_slot; + } + + explicit operator bool() const { + return _ssi.get(); + } +}; +/// @} + +class NetworkStack; + +enum { + l_msgr_first = 94000, + l_msgr_recv_messages, + l_msgr_send_messages, + l_msgr_recv_bytes, + l_msgr_send_bytes, + l_msgr_created_connections, + l_msgr_active_connections, + + l_msgr_running_total_time, + l_msgr_running_send_time, + l_msgr_running_recv_time, + l_msgr_running_fast_dispatch_time, + + l_msgr_last, +}; + +class Worker { + std::mutex init_lock; + std::condition_variable init_cond; + bool init = false; + + public: + bool done = false; + + CephContext *cct; + PerfCounters *perf_logger; + unsigned id; + + std::atomic_uint references; + EventCenter center; + + Worker(const Worker&) = delete; + Worker& operator=(const Worker&) = delete; + + Worker(CephContext *c, unsigned i) + : cct(c), perf_logger(NULL), id(i), references(0), center(c) { + char name[128]; + sprintf(name, "AsyncMessenger::Worker-%u", id); + // initialize perf_logger + PerfCountersBuilder plb(cct, name, l_msgr_first, l_msgr_last); + + plb.add_u64_counter(l_msgr_recv_messages, "msgr_recv_messages", "Network received messages"); + plb.add_u64_counter(l_msgr_send_messages, "msgr_send_messages", "Network sent messages"); + plb.add_u64_counter(l_msgr_recv_bytes, "msgr_recv_bytes", "Network received bytes", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_u64_counter(l_msgr_send_bytes, "msgr_send_bytes", "Network sent bytes", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_u64_counter(l_msgr_active_connections, "msgr_active_connections", "Active connection number"); + plb.add_u64_counter(l_msgr_created_connections, "msgr_created_connections", "Created connection number"); + + plb.add_time(l_msgr_running_total_time, "msgr_running_total_time", "The total time of thread running"); + plb.add_time(l_msgr_running_send_time, "msgr_running_send_time", "The total time of message sending"); + plb.add_time(l_msgr_running_recv_time, "msgr_running_recv_time", "The total time of message receiving"); + plb.add_time(l_msgr_running_fast_dispatch_time, "msgr_running_fast_dispatch_time", "The total time of fast dispatch"); + + perf_logger = plb.create_perf_counters(); + cct->get_perfcounters_collection()->add(perf_logger); + } + virtual ~Worker() { + if (perf_logger) { + cct->get_perfcounters_collection()->remove(perf_logger); + delete perf_logger; + } + } + + virtual int listen(entity_addr_t &addr, unsigned addr_slot, + const SocketOptions &opts, ServerSocket *) = 0; + virtual int connect(const entity_addr_t &addr, + const SocketOptions &opts, ConnectedSocket *socket) = 0; + virtual void destroy() {} + + virtual void initialize() {} + PerfCounters *get_perf_counter() { return perf_logger; } + void release_worker() { + int oldref = references.fetch_sub(1); + ceph_assert(oldref > 0); + } + void init_done() { + init_lock.lock(); + init = true; + init_cond.notify_all(); + init_lock.unlock(); + } + bool is_init() { + std::lock_guard<std::mutex> l(init_lock); + return init; + } + void wait_for_init() { + std::unique_lock<std::mutex> l(init_lock); + while (!init) + init_cond.wait(l); + } + void reset() { + init_lock.lock(); + init = false; + init_cond.notify_all(); + init_lock.unlock(); + done = false; + } +}; + +class NetworkStack { + std::string type; + unsigned num_workers = 0; + ceph::spinlock pool_spin; + bool started = false; + + std::function<void ()> add_thread(unsigned i); + + protected: + CephContext *cct; + vector<Worker*> workers; + + explicit NetworkStack(CephContext *c, const string &t); + public: + NetworkStack(const NetworkStack &) = delete; + NetworkStack& operator=(const NetworkStack &) = delete; + virtual ~NetworkStack() { + for (auto &&w : workers) + delete w; + } + + static std::shared_ptr<NetworkStack> create( + CephContext *c, const string &type); + + static Worker* create_worker( + CephContext *c, const string &t, unsigned i); + // backend need to override this method if supports zero copy read + virtual bool support_zero_copy_read() const { return false; } + // backend need to override this method if backend doesn't support shared + // listen table. + // For example, posix backend has in kernel global listen table. If one + // thread bind a port, other threads also aware this. + // But for dpdk backend, we maintain listen table in each thread. So we + // need to let each thread do binding port. + virtual bool support_local_listen_table() const { return false; } + virtual bool nonblock_connect_need_writable_event() const { return true; } + + void start(); + void stop(); + virtual Worker *get_worker(); + Worker *get_worker(unsigned i) { + return workers[i]; + } + void drain(); + unsigned get_num_worker() const { + return num_workers; + } + + // direct is used in tests only + virtual void spawn_worker(unsigned i, std::function<void ()> &&) = 0; + virtual void join_worker(unsigned i) = 0; + + virtual bool is_ready() { return true; }; + virtual void ready() { }; +}; + +#endif //CEPH_MSG_ASYNC_STACK_H diff --git a/src/msg/async/crypto_onwire.cc b/src/msg/async/crypto_onwire.cc new file mode 100644 index 00000000..4e423406 --- /dev/null +++ b/src/msg/async/crypto_onwire.cc @@ -0,0 +1,311 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab + +#include <array> +#include <openssl/evp.h> + +#include "crypto_onwire.h" + +#include "common/debug.h" +#include "common/ceph_crypto.h" +#include "include/types.h" + +#define dout_subsys ceph_subsys_ms + +namespace ceph::crypto::onwire { + +static constexpr const std::size_t AESGCM_KEY_LEN{16}; +static constexpr const std::size_t AESGCM_IV_LEN{12}; +static constexpr const std::size_t AESGCM_TAG_LEN{16}; +static constexpr const std::size_t AESGCM_BLOCK_LEN{16}; + +struct nonce_t { + ceph_le32 fixed; + ceph_le64 counter; + + bool operator==(const nonce_t& rhs) const { + return !memcmp(this, &rhs, sizeof(*this)); + } +} __attribute__((packed)); +static_assert(sizeof(nonce_t) == AESGCM_IV_LEN); + +using key_t = std::array<std::uint8_t, AESGCM_KEY_LEN>; + +// http://www.mindspring.com/~dmcgrew/gcm-nist-6.pdf +// https://www.openssl.org/docs/man1.0.2/crypto/EVP_aes_128_gcm.html#GCM-mode +// https://wiki.openssl.org/index.php/EVP_Authenticated_Encryption_and_Decryption +// https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38d.pdf +class AES128GCM_OnWireTxHandler : public ceph::crypto::onwire::TxHandler { + CephContext* const cct; + std::unique_ptr<EVP_CIPHER_CTX, decltype(&::EVP_CIPHER_CTX_free)> ectx; + ceph::bufferlist buffer; + nonce_t nonce, initial_nonce; + bool used_initial_nonce; + bool new_nonce_format; // 64-bit counter? + static_assert(sizeof(nonce) == AESGCM_IV_LEN); + +public: + AES128GCM_OnWireTxHandler(CephContext* const cct, + const key_t& key, + const nonce_t& nonce, + bool new_nonce_format) + : cct(cct), + ectx(EVP_CIPHER_CTX_new(), EVP_CIPHER_CTX_free), + nonce(nonce), initial_nonce(nonce), used_initial_nonce(false), + new_nonce_format(new_nonce_format) { + ceph_assert_always(ectx); + ceph_assert_always(key.size() * CHAR_BIT == 128); + + if (1 != EVP_EncryptInit_ex(ectx.get(), EVP_aes_128_gcm(), + nullptr, nullptr, nullptr)) { + throw std::runtime_error("EVP_EncryptInit_ex failed"); + } + + if(1 != EVP_EncryptInit_ex(ectx.get(), nullptr, nullptr, + key.data(), nullptr)) { + throw std::runtime_error("EVP_EncryptInit_ex failed"); + } + } + + ~AES128GCM_OnWireTxHandler() override { + ::ceph::crypto::zeroize_for_security(&nonce, sizeof(nonce)); + ::ceph::crypto::zeroize_for_security(&initial_nonce, sizeof(initial_nonce)); + } + + void reset_tx_handler(const uint32_t* first, const uint32_t* last) override; + + void authenticated_encrypt_update(const ceph::bufferlist& plaintext) override; + ceph::bufferlist authenticated_encrypt_final() override; +}; + +void AES128GCM_OnWireTxHandler::reset_tx_handler(const uint32_t* first, + const uint32_t* last) +{ + if (nonce == initial_nonce) { + if (used_initial_nonce) { + throw ceph::crypto::onwire::TxHandlerError("out of nonces"); + } + used_initial_nonce = true; + } + + if(1 != EVP_EncryptInit_ex(ectx.get(), nullptr, nullptr, nullptr, + reinterpret_cast<const unsigned char*>(&nonce))) { + throw std::runtime_error("EVP_EncryptInit_ex failed"); + } + + ceph_assert(buffer.get_append_buffer_unused_tail_length() == 0); + buffer.reserve(std::accumulate(first, last, AESGCM_TAG_LEN)); + + if (!new_nonce_format) { + // msgr2.0: 32-bit counter followed by 64-bit fixed field, + // susceptible to overflow! + nonce.fixed = nonce.fixed + 1; + } else { + nonce.counter = nonce.counter + 1; + } +} + +void AES128GCM_OnWireTxHandler::authenticated_encrypt_update( + const ceph::bufferlist& plaintext) +{ + ceph_assert(buffer.get_append_buffer_unused_tail_length() >= + plaintext.length()); + auto filler = buffer.append_hole(plaintext.length()); + + for (const auto& plainbuf : plaintext.buffers()) { + int update_len = 0; + + if(1 != EVP_EncryptUpdate(ectx.get(), + reinterpret_cast<unsigned char*>(filler.c_str()), + &update_len, + reinterpret_cast<const unsigned char*>(plainbuf.c_str()), + plainbuf.length())) { + throw std::runtime_error("EVP_EncryptUpdate failed"); + } + ceph_assert_always(update_len >= 0); + ceph_assert(static_cast<unsigned>(update_len) == plainbuf.length()); + filler.advance(update_len); + } + + ldout(cct, 15) << __func__ + << " plaintext.length()=" << plaintext.length() + << " buffer.length()=" << buffer.length() + << dendl; +} + +ceph::bufferlist AES128GCM_OnWireTxHandler::authenticated_encrypt_final() +{ + int final_len = 0; + ceph_assert(buffer.get_append_buffer_unused_tail_length() == + AESGCM_BLOCK_LEN); + auto filler = buffer.append_hole(AESGCM_BLOCK_LEN); + if(1 != EVP_EncryptFinal_ex(ectx.get(), + reinterpret_cast<unsigned char*>(filler.c_str()), + &final_len)) { + throw std::runtime_error("EVP_EncryptFinal_ex failed"); + } + ceph_assert_always(final_len == 0); + + static_assert(AESGCM_BLOCK_LEN == AESGCM_TAG_LEN); + if(1 != EVP_CIPHER_CTX_ctrl(ectx.get(), + EVP_CTRL_GCM_GET_TAG, AESGCM_TAG_LEN, + filler.c_str())) { + throw std::runtime_error("EVP_CIPHER_CTX_ctrl failed"); + } + + ldout(cct, 15) << __func__ + << " buffer.length()=" << buffer.length() + << " final_len=" << final_len + << dendl; + return std::move(buffer); +} + +// RX PART +class AES128GCM_OnWireRxHandler : public ceph::crypto::onwire::RxHandler { + CephContext* const cct; + std::unique_ptr<EVP_CIPHER_CTX, decltype(&::EVP_CIPHER_CTX_free)> ectx; + nonce_t nonce; + bool new_nonce_format; // 64-bit counter? + static_assert(sizeof(nonce) == AESGCM_IV_LEN); + +public: + AES128GCM_OnWireRxHandler(CephContext* const cct, + const key_t& key, + const nonce_t& nonce, + bool new_nonce_format) + : cct(cct), + ectx(EVP_CIPHER_CTX_new(), EVP_CIPHER_CTX_free), + nonce(nonce), new_nonce_format(new_nonce_format) { + ceph_assert_always(ectx); + ceph_assert_always(key.size() * CHAR_BIT == 128); + + if (1 != EVP_DecryptInit_ex(ectx.get(), EVP_aes_128_gcm(), + nullptr, nullptr, nullptr)) { + throw std::runtime_error("EVP_DecryptInit_ex failed"); + } + + if(1 != EVP_DecryptInit_ex(ectx.get(), nullptr, nullptr, + key.data(), nullptr)) { + throw std::runtime_error("EVP_DecryptInit_ex failed"); + } + } + + ~AES128GCM_OnWireRxHandler() override { + ::ceph::crypto::zeroize_for_security(&nonce, sizeof(nonce)); + } + + std::uint32_t get_extra_size_at_final() override { + return AESGCM_TAG_LEN; + } + void reset_rx_handler() override; + void authenticated_decrypt_update(ceph::bufferlist& bl) override; + void authenticated_decrypt_update_final(ceph::bufferlist& bl) override; +}; + +void AES128GCM_OnWireRxHandler::reset_rx_handler() +{ + if(1 != EVP_DecryptInit_ex(ectx.get(), nullptr, nullptr, nullptr, + reinterpret_cast<const unsigned char*>(&nonce))) { + throw std::runtime_error("EVP_DecryptInit_ex failed"); + } + + if (!new_nonce_format) { + // msgr2.0: 32-bit counter followed by 64-bit fixed field, + // susceptible to overflow! + nonce.fixed = nonce.fixed + 1; + } else { + nonce.counter = nonce.counter + 1; + } +} + +void AES128GCM_OnWireRxHandler::authenticated_decrypt_update( + ceph::bufferlist& bl) +{ + // discard cached crcs as we will be writing through c_str() + bl.invalidate_crc(); + for (auto& buf : bl.buffers()) { + auto p = reinterpret_cast<unsigned char*>(const_cast<char*>(buf.c_str())); + int update_len = 0; + + if (1 != EVP_DecryptUpdate(ectx.get(), p, &update_len, p, buf.length())) { + throw std::runtime_error("EVP_DecryptUpdate failed"); + } + ceph_assert_always(update_len >= 0); + ceph_assert(static_cast<unsigned>(update_len) == buf.length()); + } +} + +void AES128GCM_OnWireRxHandler::authenticated_decrypt_update_final( + ceph::bufferlist& bl) +{ + unsigned orig_len = bl.length(); + ceph_assert(orig_len >= AESGCM_TAG_LEN); + + // decrypt optional data. Caller is obliged to provide only signature but it + // may supply ciphertext as well. Combining the update + final is reflected + // combined together. + ceph::bufferlist auth_tag; + bl.splice(orig_len - AESGCM_TAG_LEN, AESGCM_TAG_LEN, &auth_tag); + if (bl.length() > 0) { + authenticated_decrypt_update(bl); + } + + // we need to ensure the tag is stored in continuous memory. + if (1 != EVP_CIPHER_CTX_ctrl(ectx.get(), EVP_CTRL_GCM_SET_TAG, + AESGCM_TAG_LEN, auth_tag.c_str())) { + throw std::runtime_error("EVP_CIPHER_CTX_ctrl failed"); + } + + // I expect that 0 bytes will be appended. The call is supposed solely to + // authenticate the message. + { + int final_len = 0; + if (0 >= EVP_DecryptFinal_ex(ectx.get(), nullptr, &final_len)) { + throw MsgAuthError(); + } + ceph_assert_always(final_len == 0); + ceph_assert(bl.length() + AESGCM_TAG_LEN == orig_len); + } +} + +ceph::crypto::onwire::rxtx_t ceph::crypto::onwire::rxtx_t::create_handler_pair( + CephContext* cct, + const AuthConnectionMeta& auth_meta, + bool new_nonce_format, + bool crossed) +{ + if (auth_meta.is_mode_secure()) { + ceph_assert_always(auth_meta.connection_secret.length() >= \ + sizeof(key_t) + 2 * sizeof(nonce_t)); + const char* secbuf = auth_meta.connection_secret.c_str(); + + key_t key; + { + ::memcpy(key.data(), secbuf, sizeof(key)); + secbuf += sizeof(key); + } + + nonce_t rx_nonce; + { + ::memcpy(&rx_nonce, secbuf, sizeof(rx_nonce)); + secbuf += sizeof(rx_nonce); + } + + nonce_t tx_nonce; + { + ::memcpy(&tx_nonce, secbuf, sizeof(tx_nonce)); + secbuf += sizeof(tx_nonce); + } + + return { + std::make_unique<AES128GCM_OnWireRxHandler>( + cct, key, crossed ? tx_nonce : rx_nonce, new_nonce_format), + std::make_unique<AES128GCM_OnWireTxHandler>( + cct, key, crossed ? rx_nonce : tx_nonce, new_nonce_format) + }; + } else { + return { nullptr, nullptr }; + } +} + +} // namespace ceph::crypto::onwire diff --git a/src/msg/async/crypto_onwire.h b/src/msg/async/crypto_onwire.h new file mode 100644 index 00000000..55f75508 --- /dev/null +++ b/src/msg/async/crypto_onwire.h @@ -0,0 +1,130 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2009 Sage Weil <sage@newdream.net> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_CRYPTO_ONWIRE_H +#define CEPH_CRYPTO_ONWIRE_H + +#include <cstdint> +#include <memory> + +#include "auth/Auth.h" +#include "include/buffer.h" + +namespace ceph::math { + +// TODO +template <typename T> +class always_aligned_t { + T val; + + template <class... Args> + always_aligned_t(Args&&... args) + : val(std::forward<Args>(args)...) { + } +}; + +} // namespace ceph::math + +namespace ceph::crypto::onwire { + +struct MsgAuthError : public std::runtime_error { + MsgAuthError() + : runtime_error("message signature mismatch") { + } +}; + +struct TxHandlerError : public std::runtime_error { + TxHandlerError(const char* what) + : std::runtime_error(std::string("tx handler error: ") + what) {} +}; + +struct TxHandler { + virtual ~TxHandler() = default; + + // Instance of TxHandler must be reset before doing any encrypt-update + // step. This applies also to situation when encrypt-final was already + // called and another round of update-...-update-final will take place. + // + // The input parameter informs implementation how the -update sequence + // is fragmented and allows to make concious decision about allocation + // or reusage of provided memory. One implementation could do in-place + // encryption while other might prefer one huge output buffer. + // + // It's undefined what will happen if client doesn't follow the order. + // + // TODO: switch to always_aligned_t + virtual void reset_tx_handler(const uint32_t* first, + const uint32_t* last) = 0; + + void reset_tx_handler(std::initializer_list<uint32_t> update_size_sequence) { + if (update_size_sequence.size() > 0) { + const uint32_t* first = &*update_size_sequence.begin(); + reset_tx_handler(first, first + update_size_sequence.size()); + } else { + reset_tx_handler(nullptr, nullptr); + } + } + + // Perform encryption. Client gives full ownership right to provided + // bufferlist. The method MUST NOT be called after _final() if there + // was no call to _reset(). + virtual void authenticated_encrypt_update( + const ceph::bufferlist& plaintext) = 0; + + // Generates authentication signature and returns bufferlist crafted + // basing on plaintext from preceding call to _update(). + virtual ceph::bufferlist authenticated_encrypt_final() = 0; +}; + +class RxHandler { +public: + virtual ~RxHandler() = default; + + // Transmitter can append extra bytes of ciphertext at the -final step. + // This method return how much was added, and thus let client translate + // plaintext size into ciphertext size to grab from wire. + virtual std::uint32_t get_extra_size_at_final() = 0; + + // Instance of RxHandler must be reset before doing any decrypt-update + // step. This applies also to situation when decrypt-final was already + // called and another round of update-...-update-final will take place. + virtual void reset_rx_handler() = 0; + + // Perform decryption ciphertext must be ALWAYS aligned to 16 bytes. + virtual void authenticated_decrypt_update(ceph::bufferlist& bl) = 0; + + // Perform decryption of last cipertext's portion and verify signature + // for overall decryption sequence. + // Throws on integrity/authenticity checks + virtual void authenticated_decrypt_update_final(ceph::bufferlist& bl) = 0; +}; + +struct rxtx_t { + //rxtx_t(rxtx_t&& r) : rx(std::move(rx)), tx(std::move(tx)) {} + // Each peer can use different handlers. + // Hmm, isn't that too much flexbility? + std::unique_ptr<RxHandler> rx; + std::unique_ptr<TxHandler> tx; + + static rxtx_t create_handler_pair( + CephContext* ctx, + const class AuthConnectionMeta& auth_meta, + bool new_nonce_format, + bool crossed); +}; + +} // namespace ceph::crypto::onwire + +#endif // CEPH_CRYPTO_ONWIRE_H diff --git a/src/msg/async/dpdk/ARP.cc b/src/msg/async/dpdk/ARP.cc new file mode 100644 index 00000000..dedc9e3c --- /dev/null +++ b/src/msg/async/dpdk/ARP.cc @@ -0,0 +1,89 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#include "ARP.h" + +arp_for_protocol::arp_for_protocol(arp& a, uint16_t proto_num) + : _arp(a), _proto_num(proto_num) +{ + _arp.add(proto_num, this); +} + +arp_for_protocol::~arp_for_protocol() +{ + _arp.del(_proto_num); +} + +arp::arp(interface* netif): + _netif(netif), + _proto(netif, eth_protocol_num::arp, [this] { return get_packet(); }), + _rx_packets( + _proto.receive( + [this] (Packet p, ethernet_address ea) { + return process_packet(std::move(p), ea); + }, + [this](forward_hash& out_hash_data, Packet& p, size_t off) { + return forward(out_hash_data, p, off); + } + ) + ) +{} + +Tub<l3_protocol::l3packet> arp::get_packet() +{ + Tub<l3_protocol::l3packet> p; + if (!_packetq.empty()) { + p = std::move(_packetq.front()); + _packetq.pop_front(); + } + return p; +} + +bool arp::forward(forward_hash& out_hash_data, Packet& p, size_t off) +{ + auto ah = p.get_header<arp_hdr>(off); + auto i = _arp_for_protocol.find(ntoh(ah->ptype)); + if (i != _arp_for_protocol.end()) { + return i->second->forward(out_hash_data, p, off); + } + return false; +} + +void arp::add(uint16_t proto_num, arp_for_protocol* afp) +{ + _arp_for_protocol[proto_num] = afp; +} + +void arp::del(uint16_t proto_num) +{ + _arp_for_protocol.erase(proto_num); +} + +int arp::process_packet(Packet p, ethernet_address from) +{ + auto ah = p.get_header<arp_hdr>()->ntoh(); + auto i = _arp_for_protocol.find(ah.ptype); + if (i != _arp_for_protocol.end()) { + i->second->received(std::move(p)); + } + return 0; +} diff --git a/src/msg/async/dpdk/ARP.h b/src/msg/async/dpdk/ARP.h new file mode 100644 index 00000000..54569564 --- /dev/null +++ b/src/msg/async/dpdk/ARP.h @@ -0,0 +1,301 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + * + */ + +#ifndef CEPH_MSG_ARP_H_ +#define CEPH_MSG_ARP_H_ + +#include <errno.h> + +#include <unordered_map> +#include <functional> + +#include "msg/async/Event.h" + +#include "ethernet.h" +#include "circular_buffer.h" +#include "ip_types.h" +#include "net.h" +#include "Packet.h" + +class arp; +template <typename L3> +class arp_for; + +class arp_for_protocol { + protected: + arp& _arp; + uint16_t _proto_num; + public: + arp_for_protocol(arp& a, uint16_t proto_num); + virtual ~arp_for_protocol(); + virtual int received(Packet p) = 0; + virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) { return false; } +}; + +class interface; + +class arp { + interface* _netif; + l3_protocol _proto; + subscription<Packet, ethernet_address> _rx_packets; + std::unordered_map<uint16_t, arp_for_protocol*> _arp_for_protocol; + circular_buffer<l3_protocol::l3packet> _packetq; + private: + struct arp_hdr { + uint16_t htype; + uint16_t ptype; + arp_hdr ntoh() { + arp_hdr hdr = *this; + hdr.htype = ::ntoh(htype); + hdr.ptype = ::ntoh(ptype); + return hdr; + } + arp_hdr hton() { + arp_hdr hdr = *this; + hdr.htype = ::hton(htype); + hdr.ptype = ::hton(ptype); + return hdr; + } + }; + public: + explicit arp(interface* netif); + void add(uint16_t proto_num, arp_for_protocol* afp); + void del(uint16_t proto_num); + private: + ethernet_address l2self() { return _netif->hw_address(); } + int process_packet(Packet p, ethernet_address from); + bool forward(forward_hash& out_hash_data, Packet& p, size_t off); + Tub<l3_protocol::l3packet> get_packet(); + template <class l3_proto> + friend class arp_for; +}; + +template <typename L3> +class arp_for : public arp_for_protocol { + public: + using l2addr = ethernet_address; + using l3addr = typename L3::address_type; + private: + static constexpr auto max_waiters = 512; + enum oper { + op_request = 1, + op_reply = 2, + }; + struct arp_hdr { + uint16_t htype; + uint16_t ptype; + uint8_t hlen; + uint8_t plen; + uint16_t oper; + l2addr sender_hwaddr; + l3addr sender_paddr; + l2addr target_hwaddr; + l3addr target_paddr; + + arp_hdr ntoh() { + arp_hdr hdr = *this; + hdr.htype = ::ntoh(htype); + hdr.ptype = ::ntoh(ptype); + hdr.oper = ::ntoh(oper); + hdr.sender_hwaddr = sender_hwaddr.ntoh(); + hdr.sender_paddr = sender_paddr.ntoh(); + hdr.target_hwaddr = target_hwaddr.ntoh(); + hdr.target_paddr = target_paddr.ntoh(); + return hdr; + } + + arp_hdr hton() { + arp_hdr hdr = *this; + hdr.htype = ::hton(htype); + hdr.ptype = ::hton(ptype); + hdr.oper = ::hton(oper); + hdr.sender_hwaddr = sender_hwaddr.hton(); + hdr.sender_paddr = sender_paddr.hton(); + hdr.target_hwaddr = target_hwaddr.hton(); + hdr.target_paddr = target_paddr.hton(); + return hdr; + } + }; + struct resolution { + std::vector<std::pair<resolution_cb, Packet>> _waiters; + uint64_t timeout_fd; + }; + class C_handle_arp_timeout : public EventCallback { + arp_for *arp; + l3addr paddr; + bool first_request; + + public: + C_handle_arp_timeout(arp_for *a, l3addr addr, bool first): + arp(a), paddr(addr), first_request(first) {} + void do_request(uint64_t r) { + arp->send_query(paddr); + auto &res = arp->_in_progress[paddr]; + + for (auto& p : res._waiters) { + p.first(ethernet_address(), std::move(p.second), -ETIMEDOUT); + } + res._waiters.clear(); + res.timeout_fd = arp->center->create_time_event( + 1*1000*1000, this); + } + }; + friend class C_handle_arp_timeout; + + private: + CephContext *cct; + EventCenter *center; + l3addr _l3self = L3::broadcast_address(); + std::unordered_map<l3addr, l2addr> _table; + std::unordered_map<l3addr, resolution> _in_progress; + private: + Packet make_query_packet(l3addr paddr); + virtual int received(Packet p) override; + int handle_request(arp_hdr* ah); + l2addr l2self() { return _arp.l2self(); } + void send(l2addr to, Packet &&p); + public: + void send_query(const l3addr& paddr); + explicit arp_for(CephContext *c, arp& a, EventCenter *cen) + : arp_for_protocol(a, L3::arp_protocol_type()), cct(c), center(cen) { + _table[L3::broadcast_address()] = ethernet::broadcast_address(); + } + ~arp_for() { + for (auto && p : _in_progress) + center->delete_time_event(p.second.timeout_fd); + } + void wait(const l3addr& addr, Packet p, resolution_cb cb); + void learn(l2addr l2, l3addr l3); + void run(); + void set_self_addr(l3addr addr) { + _table.erase(_l3self); + _table[addr] = l2self(); + _l3self = addr; + } + friend class arp; +}; + +template <typename L3> +void arp_for<L3>::send(l2addr to, Packet &&p) { + _arp._packetq.push_back(l3_protocol::l3packet{eth_protocol_num::arp, to, std::move(p)}); +} + +template <typename L3> +Packet arp_for<L3>::make_query_packet(l3addr paddr) { + arp_hdr hdr; + hdr.htype = ethernet::arp_hardware_type(); + hdr.ptype = L3::arp_protocol_type(); + hdr.hlen = sizeof(l2addr); + hdr.plen = sizeof(l3addr); + hdr.oper = op_request; + hdr.sender_hwaddr = l2self(); + hdr.sender_paddr = _l3self; + hdr.target_hwaddr = ethernet::broadcast_address(); + hdr.target_paddr = paddr; + hdr = hdr.hton(); + return Packet(reinterpret_cast<char*>(&hdr), sizeof(hdr)); +} + +template <typename L3> +void arp_for<L3>::send_query(const l3addr& paddr) { + send(ethernet::broadcast_address(), make_query_packet(paddr)); +} + +template <typename L3> +void arp_for<L3>::learn(l2addr hwaddr, l3addr paddr) { + _table[paddr] = hwaddr; + auto i = _in_progress.find(paddr); + if (i != _in_progress.end()) { + auto& res = i->second; + center->delete_time_event(res.timeout_fd); + for (auto &&p : res._waiters) { + p.first(hwaddr, std::move(p.second), 0); + } + _in_progress.erase(i); + } +} + +template <typename L3> +void arp_for<L3>::wait(const l3addr& paddr, Packet p, resolution_cb cb) { + auto i = _table.find(paddr); + if (i != _table.end()) { + cb(i->second, std::move(p), 0); + return ; + } + + auto j = _in_progress.find(paddr); + auto first_request = j == _in_progress.end(); + auto& res = first_request ? _in_progress[paddr] : j->second; + + if (first_request) { + res.timeout_fd = center->create_time_event( + 1*1000*1000, new C_handle_arp_timeout(this, paddr, first_request)); + send_query(paddr); + } + + if (res._waiters.size() >= max_waiters) { + cb(ethernet_address(), std::move(p), -EBUSY); + return ; + } + + res._waiters.emplace_back(cb, std::move(p)); + return ; +} + +template <typename L3> +int arp_for<L3>::received(Packet p) { + auto ah = p.get_header<arp_hdr>(); + if (!ah) { + return 0; + } + auto h = ah->ntoh(); + if (h.hlen != sizeof(l2addr) || h.plen != sizeof(l3addr)) { + return 0; + } + switch (h.oper) { + case op_request: + return handle_request(&h); + case op_reply: + _arp._netif->arp_learn(h.sender_hwaddr, h.sender_paddr); + return 0; + default: + return 0; + } +} + +template <typename L3> +int arp_for<L3>::handle_request(arp_hdr* ah) { + if (ah->target_paddr == _l3self + && _l3self != L3::broadcast_address()) { + ah->oper = op_reply; + ah->target_hwaddr = ah->sender_hwaddr; + ah->target_paddr = ah->sender_paddr; + ah->sender_hwaddr = l2self(); + ah->sender_paddr = _l3self; + *ah = ah->hton(); + send(ah->target_hwaddr, Packet(reinterpret_cast<char*>(ah), sizeof(*ah))); + } + return 0; +} + +#endif /* CEPH_MSG_ARP_H_ */ diff --git a/src/msg/async/dpdk/DPDK.cc b/src/msg/async/dpdk/DPDK.cc new file mode 100644 index 00000000..278efe9e --- /dev/null +++ b/src/msg/async/dpdk/DPDK.cc @@ -0,0 +1,1267 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#include <atomic> +#include <vector> +#include <queue> + +#include <rte_config.h> +#include <rte_common.h> +#include <rte_eal.h> +#include <rte_pci.h> +#include <rte_ethdev.h> +#include <rte_cycles.h> +#include <rte_memzone.h> + +#include "include/page.h" +#include "align.h" +#include "IP.h" +#include "const.h" +#include "dpdk_rte.h" +#include "DPDK.h" +#include "toeplitz.h" + +#include "common/Cycles.h" +#include "common/dout.h" +#include "common/errno.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_dpdk +#undef dout_prefix +#define dout_prefix *_dout << "dpdk " + + +void* as_cookie(struct rte_pktmbuf_pool_private& p) { + return &p; +}; + +#ifndef MARKER +typedef void *MARKER[0]; /**< generic marker for a point in a structure */ +#endif + +/******************* Net device related constatns *****************************/ +static constexpr uint16_t default_ring_size = 512; + +// +// We need 2 times the ring size of buffers because of the way PMDs +// refill the ring. +// +static constexpr uint16_t mbufs_per_queue_rx = 2 * default_ring_size; +static constexpr uint16_t rx_gc_thresh = 64; + +// +// No need to keep more descriptors in the air than can be sent in a single +// rte_eth_tx_burst() call. +// +static constexpr uint16_t mbufs_per_queue_tx = 2 * default_ring_size; + +static constexpr uint16_t mbuf_cache_size = 512; +// +// Size of the data buffer in the non-inline case. +// +// We may want to change (increase) this value in future, while the +// inline_mbuf_data_size value will unlikely change due to reasons described +// above. +// +static constexpr size_t mbuf_data_size = 4096; + +static constexpr uint16_t mbuf_overhead = + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM; +// +// We'll allocate 2K data buffers for an inline case because this would require +// a single page per mbuf. If we used 4K data buffers here it would require 2 +// pages for a single buffer (due to "mbuf_overhead") and this is a much more +// demanding memory constraint. +// +static constexpr size_t inline_mbuf_data_size = 2048; + + +// (INLINE_MBUF_DATA_SIZE(2K)*32 = 64K = Max TSO/LRO size) + 1 mbuf for headers +static constexpr uint8_t max_frags = 32 + 1; + +// +// Intel's 40G NIC HW limit for a number of fragments in an xmit segment. +// +// See Chapter 8.4.1 "Transmit Packet in System Memory" of the xl710 devices +// spec. for more details. +// +static constexpr uint8_t i40e_max_xmit_segment_frags = 8; + +// +// VMWare's virtual NIC limit for a number of fragments in an xmit segment. +// +// see drivers/net/vmxnet3/base/vmxnet3_defs.h VMXNET3_MAX_TXD_PER_PKT +// +static constexpr uint8_t vmxnet3_max_xmit_segment_frags = 16; + +static constexpr uint16_t inline_mbuf_size = inline_mbuf_data_size + mbuf_overhead; + +static size_t huge_page_size = 512 * CEPH_PAGE_SIZE; + +uint32_t qp_mempool_obj_size() +{ + uint32_t mp_size = 0; + struct rte_mempool_objsz mp_obj_sz = {}; + + // + // We will align each size to huge page size because DPDK allocates + // physically contiguous memory region for each pool object. + // + + // Rx + mp_size += align_up(rte_mempool_calc_obj_size(mbuf_overhead, 0, &mp_obj_sz)+ + sizeof(struct rte_pktmbuf_pool_private), + huge_page_size); + + //Tx + std::memset(&mp_obj_sz, 0, sizeof(mp_obj_sz)); + mp_size += align_up(rte_mempool_calc_obj_size(inline_mbuf_size, 0, + &mp_obj_sz)+ + sizeof(struct rte_pktmbuf_pool_private), + huge_page_size); + return mp_size; +} + +static constexpr const char* pktmbuf_pool_name = "dpdk_net_pktmbuf_pool"; + +/* + * When doing reads from the NIC queues, use this batch size + */ +static constexpr uint8_t packet_read_size = 32; +/******************************************************************************/ + +int DPDKDevice::init_port_start() +{ + ceph_assert(_port_idx < rte_eth_dev_count()); + + rte_eth_dev_info_get(_port_idx, &_dev_info); + + // + // This is a workaround for a missing handling of a HW limitation in the + // DPDK i40e driver. This and all related to _is_i40e_device code should be + // removed once this handling is added. + // + if (std::string("rte_i40evf_pmd") == _dev_info.driver_name || + std::string("rte_i40e_pmd") == _dev_info.driver_name) { + ldout(cct, 1) << __func__ << " Device is an Intel's 40G NIC. Enabling 8 fragments hack!" << dendl; + _is_i40e_device = true; + } + + if (std::string("rte_vmxnet3_pmd") == _dev_info.driver_name) { + ldout(cct, 1) << __func__ << " Device is a VMWare Virtual NIC. Enabling 16 fragments hack!" << dendl; + _is_vmxnet3_device = true; + } + + // + // Another workaround: this time for a lack of number of RSS bits. + // ixgbe PF NICs support up to 16 RSS queues. + // ixgbe VF NICs support up to 4 RSS queues. + // i40e PF NICs support up to 64 RSS queues. + // i40e VF NICs support up to 16 RSS queues. + // + if (std::string("rte_ixgbe_pmd") == _dev_info.driver_name) { + _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)16); + } else if (std::string("rte_ixgbevf_pmd") == _dev_info.driver_name) { + _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)4); + } else if (std::string("rte_i40e_pmd") == _dev_info.driver_name) { + _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)64); + } else if (std::string("rte_i40evf_pmd") == _dev_info.driver_name) { + _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)16); + } + + // Clear txq_flags - we want to support all available offload features + // except for multi-mempool and refcnt'ing which we don't need + _dev_info.default_txconf.txq_flags = + ETH_TXQ_FLAGS_NOMULTMEMP | ETH_TXQ_FLAGS_NOREFCOUNT; + + // + // Disable features that are not supported by port's HW + // + if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) { + _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP; + } + + if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) { + _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP; + } + + if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) { + _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP; + } + + if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) { + _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL; + } + + if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) { + _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL; + } + + if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO)) { + _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS; + } + + /* for port configuration all features are off by default */ + rte_eth_conf port_conf = { 0 }; + + ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": max_rx_queues " + << _dev_info.max_rx_queues << " max_tx_queues " + << _dev_info.max_tx_queues << dendl; + + _num_queues = std::min({_num_queues, _dev_info.max_rx_queues, _dev_info.max_tx_queues}); + + ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": using " + << _num_queues << " queues" << dendl;; + + // Set RSS mode: enable RSS if seastar is configured with more than 1 CPU. + // Even if port has a single queue we still want the RSS feature to be + // available in order to make HW calculate RSS hash for us. + if (_num_queues > 1) { + if (_dev_info.hash_key_size == 40) { + _rss_key = default_rsskey_40bytes; + } else if (_dev_info.hash_key_size == 52) { + _rss_key = default_rsskey_52bytes; + } else if (_dev_info.hash_key_size != 0) { + // WTF?!! + rte_exit(EXIT_FAILURE, + "Port %d: We support only 40 or 52 bytes RSS hash keys, %d bytes key requested", + _port_idx, _dev_info.hash_key_size); + } else { + _rss_key = default_rsskey_40bytes; + _dev_info.hash_key_size = 40; + } + + port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS; + port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK; + if (_dev_info.hash_key_size) { + port_conf.rx_adv_conf.rss_conf.rss_key = const_cast<uint8_t *>(_rss_key.data()); + port_conf.rx_adv_conf.rss_conf.rss_key_len = _dev_info.hash_key_size; + } + } else { + port_conf.rxmode.mq_mode = ETH_MQ_RX_NONE; + } + + if (_num_queues > 1) { + if (_dev_info.reta_size) { + // RETA size should be a power of 2 + ceph_assert((_dev_info.reta_size & (_dev_info.reta_size - 1)) == 0); + + // Set the RSS table to the correct size + _redir_table.resize(_dev_info.reta_size); + _rss_table_bits = std::lround(std::log2(_dev_info.reta_size)); + ldout(cct, 5) << __func__ << " Port " << int(_port_idx) + << ": RSS table size is " << _dev_info.reta_size << dendl; + } else { + // FIXME: same with sw_reta + _redir_table.resize(128); + _rss_table_bits = std::lround(std::log2(128)); + } + } else { + _redir_table.push_back(0); + } + + // Set Rx VLAN stripping + if (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) { + port_conf.rxmode.hw_vlan_strip = 1; + } + + // Enable HW CRC stripping + port_conf.rxmode.hw_strip_crc = 1; + +#ifdef RTE_ETHDEV_HAS_LRO_SUPPORT + // Enable LRO + if (_use_lro && (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO)) { + ldout(cct, 1) << __func__ << " LRO is on" << dendl; + port_conf.rxmode.enable_lro = 1; + _hw_features.rx_lro = true; + } else +#endif + ldout(cct, 1) << __func__ << " LRO is off" << dendl; + + // Check that all CSUM features are either all set all together or not set + // all together. If this assumption breaks we need to rework the below logic + // by splitting the csum offload feature bit into separate bits for IPv4, + // TCP. + ceph_assert(((_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) && + (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) || + (!(_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) && + !(_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM))); + + // Set Rx checksum checking + if ((_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) && + (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) { + ldout(cct, 1) << __func__ << " RX checksum offload supported" << dendl; + port_conf.rxmode.hw_ip_checksum = 1; + _hw_features.rx_csum_offload = 1; + } + + if ((_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) { + ldout(cct, 1) << __func__ << " TX ip checksum offload supported" << dendl; + _hw_features.tx_csum_ip_offload = 1; + } + + // TSO is supported starting from DPDK v1.8 + if (_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) { + ldout(cct, 1) << __func__ << " TSO is supported" << dendl; + _hw_features.tx_tso = 1; + } + + // Check that Tx TCP CSUM features are either all set all together + // or not set all together. If this assumption breaks we need to rework the + // below logic by splitting the csum offload feature bit into separate bits + // for TCP. + ceph_assert((_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) || + !(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)); + + if (_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) { + ldout(cct, 1) << __func__ << " TX TCP checksum offload supported" << dendl; + _hw_features.tx_csum_l4_offload = 1; + } + + int retval; + + ldout(cct, 1) << __func__ << " Port " << int(_port_idx) << " init ... " << dendl; + + /* + * Standard DPDK port initialisation - config port, then set up + * rx and tx rings. + */ + if ((retval = rte_eth_dev_configure(_port_idx, _num_queues, _num_queues, + &port_conf)) != 0) { + lderr(cct) << __func__ << " failed to configure port " << (int)_port_idx + << " rx/tx queues " << _num_queues << " error " << cpp_strerror(retval) << dendl; + return retval; + } + + //rte_eth_promiscuous_enable(port_num); + ldout(cct, 1) << __func__ << " done." << dendl; + + return 0; +} + +void DPDKDevice::set_hw_flow_control() +{ + // Read the port's current/default flow control settings + struct rte_eth_fc_conf fc_conf; + auto ret = rte_eth_dev_flow_ctrl_get(_port_idx, &fc_conf); + + if (ret == -ENOTSUP) { + ldout(cct, 1) << __func__ << " port " << int(_port_idx) + << ": not support to get hardware flow control settings: " << ret << dendl; + goto not_supported; + } + + if (ret < 0) { + lderr(cct) << __func__ << " port " << int(_port_idx) + << ": failed to get hardware flow control settings: " << ret << dendl; + ceph_abort(); + } + + if (_enable_fc) { + fc_conf.mode = RTE_FC_FULL; + } else { + fc_conf.mode = RTE_FC_NONE; + } + + ret = rte_eth_dev_flow_ctrl_set(_port_idx, &fc_conf); + if (ret == -ENOTSUP) { + ldout(cct, 1) << __func__ << " port " << int(_port_idx) + << ": not support to set hardware flow control settings: " << ret << dendl; + goto not_supported; + } + + if (ret < 0) { + lderr(cct) << __func__ << " port " << int(_port_idx) + << ": failed to set hardware flow control settings: " << ret << dendl; + ceph_abort(); + } + + ldout(cct, 1) << __func__ << " port " << int(_port_idx) << ": HW FC " << _enable_fc << dendl; + return; + +not_supported: + ldout(cct, 1) << __func__ << " port " << int(_port_idx) << ": changing HW FC settings is not supported" << dendl; +} + +int DPDKDevice::init_port_fini() +{ + // Changing FC requires HW reset, so set it before the port is initialized. + set_hw_flow_control(); + + if (rte_eth_dev_start(_port_idx) != 0) { + lderr(cct) << __func__ << " can't start port " << _port_idx << dendl; + return -1; + } + + if (_num_queues > 1) { + if (!rte_eth_dev_filter_supported(_port_idx, RTE_ETH_FILTER_HASH)) { + ldout(cct, 5) << __func__ << " Port " << _port_idx << ": HASH FILTER configuration is supported" << dendl; + + // Setup HW touse the TOEPLITZ hash function as an RSS hash function + struct rte_eth_hash_filter_info info = {}; + + info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG; + info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ; + + if (rte_eth_dev_filter_ctrl(_port_idx, RTE_ETH_FILTER_HASH, + RTE_ETH_FILTER_SET, &info) < 0) { + lderr(cct) << __func__ << " cannot set hash function on a port " << _port_idx << dendl; + return -1; + } + } + + set_rss_table(); + } + + // Wait for a link + if (check_port_link_status() < 0) { + lderr(cct) << __func__ << " port link up failed " << _port_idx << dendl; + return -1; + } + + ldout(cct, 5) << __func__ << " created DPDK device" << dendl; + return 0; +} + +void DPDKQueuePair::configure_proxies(const std::map<unsigned, float>& cpu_weights) { + ceph_assert(!cpu_weights.empty()); + if (cpu_weights.size() == 1 && cpu_weights.begin()->first == _qid) { + // special case queue sending to self only, to avoid requiring a hash value + return; + } + register_packet_provider([this] { + Tub<Packet> p; + if (!_proxy_packetq.empty()) { + p = std::move(_proxy_packetq.front()); + _proxy_packetq.pop_front(); + } + return p; + }); + build_sw_reta(cpu_weights); +} + +void DPDKQueuePair::build_sw_reta(const std::map<unsigned, float>& cpu_weights) { + float total_weight = 0; + for (auto&& x : cpu_weights) { + total_weight += x.second; + } + float accum = 0; + unsigned idx = 0; + std::array<uint8_t, 128> reta; + for (auto&& entry : cpu_weights) { + auto cpu = entry.first; + auto weight = entry.second; + accum += weight; + while (idx < (accum / total_weight * reta.size() - 0.5)) { + reta[idx++] = cpu; + } + } + _sw_reta = reta; +} + + +bool DPDKQueuePair::init_rx_mbuf_pool() +{ + std::string name = std::string(pktmbuf_pool_name) + std::to_string(_qid) + "_rx"; + + // reserve the memory for Rx buffers containers + _rx_free_pkts.reserve(mbufs_per_queue_rx); + _rx_free_bufs.reserve(mbufs_per_queue_rx); + + _pktmbuf_pool_rx = rte_mempool_lookup(name.c_str()); + if (!_pktmbuf_pool_rx) { + ldout(cct, 1) << __func__ << " Creating Rx mbuf pool '" << name.c_str() + << "' [" << mbufs_per_queue_rx << " mbufs] ..."<< dendl; + + // + // Don't pass single-producer/single-consumer flags to mbuf create as it + // seems faster to use a cache instead. + // + struct rte_pktmbuf_pool_private roomsz = {}; + roomsz.mbuf_data_room_size = mbuf_data_size + RTE_PKTMBUF_HEADROOM; + _pktmbuf_pool_rx = rte_mempool_create( + name.c_str(), + mbufs_per_queue_rx, mbuf_overhead + mbuf_data_size, + mbuf_cache_size, + sizeof(struct rte_pktmbuf_pool_private), + rte_pktmbuf_pool_init, as_cookie(roomsz), + rte_pktmbuf_init, nullptr, + rte_socket_id(), 0); + if (!_pktmbuf_pool_rx) { + lderr(cct) << __func__ << " Failed to create mempool for rx" << dendl; + return false; + } + + // + // allocate more data buffer + int bufs_count = cct->_conf->ms_dpdk_rx_buffer_count_per_core - mbufs_per_queue_rx; + int mz_flags = RTE_MEMZONE_1GB|RTE_MEMZONE_SIZE_HINT_ONLY; + std::string mz_name = "rx_buffer_data" + std::to_string(_qid); + const struct rte_memzone *mz = rte_memzone_reserve_aligned(mz_name.c_str(), + mbuf_data_size*bufs_count, _pktmbuf_pool_rx->socket_id, mz_flags, mbuf_data_size); + ceph_assert(mz); + void* m = mz->addr; + for (int i = 0; i < bufs_count; i++) { + ceph_assert(m); + _alloc_bufs.push_back(m); + m += mbuf_data_size; + } + + if (rte_eth_rx_queue_setup(_dev_port_idx, _qid, default_ring_size, + rte_eth_dev_socket_id(_dev_port_idx), + _dev->def_rx_conf(), _pktmbuf_pool_rx) < 0) { + lderr(cct) << __func__ << " cannot initialize rx queue" << dendl; + return false; + } + } + + return _pktmbuf_pool_rx != nullptr; +} + +int DPDKDevice::check_port_link_status() +{ + int count = 0; + + ldout(cct, 20) << __func__ << dendl; + const int sleep_time = 100 * 1000; + const int max_check_time = 90; /* 9s (90 * 100ms) in total */ + while (true) { + struct rte_eth_link link; + memset(&link, 0, sizeof(link)); + rte_eth_link_get_nowait(_port_idx, &link); + + if (true) { + if (link.link_status) { + ldout(cct, 5) << __func__ << " done port " + << static_cast<unsigned>(_port_idx) + << " link Up - speed " << link.link_speed + << " Mbps - " + << ((link.link_duplex == ETH_LINK_FULL_DUPLEX) ? ("full-duplex") : ("half-duplex\n")) + << dendl; + break; + } else if (count++ < max_check_time) { + ldout(cct, 20) << __func__ << " not ready, continue to wait." << dendl; + usleep(sleep_time); + } else { + lderr(cct) << __func__ << " done port " << _port_idx << " link down" << dendl; + return -1; + } + } + } + return 0; +} + +class C_handle_dev_stats : public EventCallback { + DPDKQueuePair *_qp; + public: + C_handle_dev_stats(DPDKQueuePair *qp): _qp(qp) { } + void do_request(uint64_t id) { + _qp->handle_stats(); + } +}; + +DPDKQueuePair::DPDKQueuePair(CephContext *c, EventCenter *cen, DPDKDevice* dev, uint8_t qid) + : cct(c), _dev(dev), _dev_port_idx(dev->port_idx()), center(cen), _qid(qid), + _tx_poller(this), _rx_gc_poller(this), _tx_buf_factory(c, dev, qid), + _tx_gc_poller(this) +{ + if (!init_rx_mbuf_pool()) { + lderr(cct) << __func__ << " cannot initialize mbuf pools" << dendl; + ceph_abort(); + } + + static_assert(offsetof(tx_buf, private_end) - + offsetof(tx_buf, private_start) <= RTE_PKTMBUF_HEADROOM, + "RTE_PKTMBUF_HEADROOM is less than DPDKQueuePair::tx_buf size! " + "Increase the headroom size in the DPDK configuration"); + static_assert(offsetof(tx_buf, _mbuf) == 0, + "There is a pad at the beginning of the tx_buf before _mbuf " + "field!"); + static_assert((inline_mbuf_data_size & (inline_mbuf_data_size - 1)) == 0, + "inline_mbuf_data_size has to be a power of two!"); + + std::string name(std::string("queue") + std::to_string(qid)); + PerfCountersBuilder plb(cct, name, l_dpdk_qp_first, l_dpdk_qp_last); + + plb.add_u64_counter(l_dpdk_qp_rx_packets, "dpdk_receive_packets", "DPDK received packets"); + plb.add_u64_counter(l_dpdk_qp_tx_packets, "dpdk_send_packets", "DPDK sendd packets"); + plb.add_u64_counter(l_dpdk_qp_rx_bad_checksum_errors, "dpdk_receive_bad_checksum_errors", "DPDK received bad checksum packets"); + plb.add_u64_counter(l_dpdk_qp_rx_no_memory_errors, "dpdk_receive_no_memory_errors", "DPDK received no memory packets"); + plb.add_u64_counter(l_dpdk_qp_rx_bytes, "dpdk_receive_bytes", "DPDK received bytes", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_u64_counter(l_dpdk_qp_tx_bytes, "dpdk_send_bytes", "DPDK sendd bytes", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_u64_counter(l_dpdk_qp_rx_last_bunch, "dpdk_receive_last_bunch", "DPDK last received bunch"); + plb.add_u64_counter(l_dpdk_qp_tx_last_bunch, "dpdk_send_last_bunch", "DPDK last send bunch"); + plb.add_u64_counter(l_dpdk_qp_rx_fragments, "dpdk_receive_fragments", "DPDK received total fragments"); + plb.add_u64_counter(l_dpdk_qp_tx_fragments, "dpdk_send_fragments", "DPDK sendd total fragments"); + plb.add_u64_counter(l_dpdk_qp_rx_copy_ops, "dpdk_receive_copy_ops", "DPDK received copy operations"); + plb.add_u64_counter(l_dpdk_qp_tx_copy_ops, "dpdk_send_copy_ops", "DPDK sendd copy operations"); + plb.add_u64_counter(l_dpdk_qp_rx_copy_bytes, "dpdk_receive_copy_bytes", "DPDK received copy bytes", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_u64_counter(l_dpdk_qp_tx_copy_bytes, "dpdk_send_copy_bytes", "DPDK send copy bytes", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_u64_counter(l_dpdk_qp_rx_linearize_ops, "dpdk_receive_linearize_ops", "DPDK received linearize operations"); + plb.add_u64_counter(l_dpdk_qp_tx_linearize_ops, "dpdk_send_linearize_ops", "DPDK send linearize operations"); + plb.add_u64_counter(l_dpdk_qp_tx_queue_length, "dpdk_send_queue_length", "DPDK send queue length"); + + perf_logger = plb.create_perf_counters(); + cct->get_perfcounters_collection()->add(perf_logger); + + if (!_qid) + device_stat_time_fd = center->create_time_event(1000*1000, new C_handle_dev_stats(this)); +} + +void DPDKQueuePair::handle_stats() +{ + ldout(cct, 20) << __func__ << " started." << dendl; + rte_eth_stats rte_stats = {}; + int rc = rte_eth_stats_get(_dev_port_idx, &rte_stats); + + if (rc) { + ldout(cct, 0) << __func__ << " failed to get port statistics: " << cpp_strerror(rc) << dendl; + return ; + } + +#if RTE_VERSION < RTE_VERSION_NUM(16,7,0,0) + _dev->perf_logger->set(l_dpdk_dev_rx_mcast, rte_stats.imcasts); + _dev->perf_logger->set(l_dpdk_dev_rx_badcrc_errors, rte_stats.ibadcrc); +#endif + _dev->perf_logger->set(l_dpdk_dev_rx_dropped_errors, rte_stats.imissed); + _dev->perf_logger->set(l_dpdk_dev_rx_nombuf_errors, rte_stats.rx_nombuf); + + _dev->perf_logger->set(l_dpdk_dev_rx_total_errors, rte_stats.ierrors); + _dev->perf_logger->set(l_dpdk_dev_tx_total_errors, rte_stats.oerrors); + device_stat_time_fd = center->create_time_event(1000*1000, new C_handle_dev_stats(this)); +} + +bool DPDKQueuePair::poll_tx() { + bool nonloopback = !cct->_conf->ms_dpdk_debug_allow_loopback; +#ifdef CEPH_PERF_DEV + uint64_t start = Cycles::rdtsc(); +#endif + uint32_t total_work = 0; + if (_tx_packetq.size() < 16) { + // refill send queue from upper layers + uint32_t work; + do { + work = 0; + for (auto&& pr : _pkt_providers) { + auto p = pr(); + if (p) { + work++; + if (likely(nonloopback)) { + // ldout(cct, 0) << __func__ << " len: " << p->len() << " frags: " << p->nr_frags() << dendl; + _tx_packetq.push_back(std::move(*p)); + } else { + auto th = p->get_header<eth_hdr>(0); + if (th->dst_mac == th->src_mac) { + _dev->l2receive(_qid, std::move(*p)); + } else { + _tx_packetq.push_back(std::move(*p)); + } + } + if (_tx_packetq.size() == 128) { + break; + } + } + } + total_work += work; + } while (work && total_work < 256 && _tx_packetq.size() < 128); + } + if (!_tx_packetq.empty()) { + uint64_t c = send(_tx_packetq); + perf_logger->inc(l_dpdk_qp_tx_packets, c); + perf_logger->set(l_dpdk_qp_tx_last_bunch, c); +#ifdef CEPH_PERF_DEV + tx_count += total_work; + tx_cycles += Cycles::rdtsc() - start; +#endif + return true; + } + + return false; +} + +inline Tub<Packet> DPDKQueuePair::from_mbuf_lro(rte_mbuf* m) +{ + _frags.clear(); + _bufs.clear(); + + for (; m != nullptr; m = m->next) { + char* data = rte_pktmbuf_mtod(m, char*); + + _frags.emplace_back(fragment{data, rte_pktmbuf_data_len(m)}); + _bufs.push_back(data); + } + + auto del = std::bind( + [this](std::vector<char*> &bufs) { + for (auto&& b : bufs) { _alloc_bufs.push_back(b); } + }, std::move(_bufs)); + return Packet( + _frags.begin(), _frags.end(), make_deleter(std::move(del))); +} + +inline Tub<Packet> DPDKQueuePair::from_mbuf(rte_mbuf* m) +{ + _rx_free_pkts.push_back(m); + _num_rx_free_segs += m->nb_segs; + + if (!_dev->hw_features_ref().rx_lro || rte_pktmbuf_is_contiguous(m)) { + char* data = rte_pktmbuf_mtod(m, char*); + + return Packet(fragment{data, rte_pktmbuf_data_len(m)}, + make_deleter([this, data] { _alloc_bufs.push_back(data); })); + } else { + return from_mbuf_lro(m); + } +} + +inline bool DPDKQueuePair::refill_one_cluster(rte_mbuf* head) +{ + for (; head != nullptr; head = head->next) { + if (!refill_rx_mbuf(head, mbuf_data_size, _alloc_bufs)) { + // + // If we failed to allocate a new buffer - push the rest of the + // cluster back to the free_packets list for a later retry. + // + _rx_free_pkts.push_back(head); + return false; + } + _rx_free_bufs.push_back(head); + } + + return true; +} + +bool DPDKQueuePair::rx_gc(bool force) +{ + if (_num_rx_free_segs >= rx_gc_thresh || force) { + ldout(cct, 10) << __func__ << " free segs " << _num_rx_free_segs + << " thresh " << rx_gc_thresh + << " free pkts " << _rx_free_pkts.size() + << dendl; + + while (!_rx_free_pkts.empty()) { + // + // Use back() + pop_back() semantics to avoid an extra + // _rx_free_pkts.clear() at the end of the function - clear() has a + // linear complexity. + // + auto m = _rx_free_pkts.back(); + _rx_free_pkts.pop_back(); + + if (!refill_one_cluster(m)) { + ldout(cct, 1) << __func__ << " get new mbuf failed " << dendl; + break; + } + } + for (auto&& m : _rx_free_bufs) { + rte_pktmbuf_prefree_seg(m); + } + + if (_rx_free_bufs.size()) { + rte_mempool_put_bulk(_pktmbuf_pool_rx, + (void **)_rx_free_bufs.data(), + _rx_free_bufs.size()); + + // TODO: ceph_assert() in a fast path! Remove me ASAP! + ceph_assert(_num_rx_free_segs >= _rx_free_bufs.size()); + + _num_rx_free_segs -= _rx_free_bufs.size(); + _rx_free_bufs.clear(); + + // TODO: ceph_assert() in a fast path! Remove me ASAP! + ceph_assert((_rx_free_pkts.empty() && !_num_rx_free_segs) || + (!_rx_free_pkts.empty() && _num_rx_free_segs)); + } + } + + return _num_rx_free_segs >= rx_gc_thresh; +} + + +void DPDKQueuePair::process_packets( + struct rte_mbuf **bufs, uint16_t count) +{ + uint64_t nr_frags = 0, bytes = 0; + + for (uint16_t i = 0; i < count; i++) { + struct rte_mbuf *m = bufs[i]; + offload_info oi; + + Tub<Packet> p = from_mbuf(m); + + // Drop the packet if translation above has failed + if (!p) { + perf_logger->inc(l_dpdk_qp_rx_no_memory_errors); + continue; + } + // ldout(cct, 0) << __func__ << " len " << p->len() << " " << dendl; + + nr_frags += m->nb_segs; + bytes += m->pkt_len; + + // Set stipped VLAN value if available + if ((_dev->_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) && + (m->ol_flags & PKT_RX_VLAN_STRIPPED)) { + oi.vlan_tci = m->vlan_tci; + } + + if (_dev->get_hw_features().rx_csum_offload) { + if (m->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) { + // Packet with bad checksum, just drop it. + perf_logger->inc(l_dpdk_qp_rx_bad_checksum_errors); + continue; + } + // Note that when _hw_features.rx_csum_offload is on, the receive + // code for ip, tcp and udp will assume they don't need to check + // the checksum again, because we did this here. + } + + p->set_offload_info(oi); + if (m->ol_flags & PKT_RX_RSS_HASH) { + p->set_rss_hash(m->hash.rss); + } + + _dev->l2receive(_qid, std::move(*p)); + } + + perf_logger->inc(l_dpdk_qp_rx_packets, count); + perf_logger->set(l_dpdk_qp_rx_last_bunch, count); + perf_logger->inc(l_dpdk_qp_rx_fragments, nr_frags); + perf_logger->inc(l_dpdk_qp_rx_bytes, bytes); +} + +bool DPDKQueuePair::poll_rx_once() +{ + struct rte_mbuf *buf[packet_read_size]; + + /* read a port */ +#ifdef CEPH_PERF_DEV + uint64_t start = Cycles::rdtsc(); +#endif + uint16_t count = rte_eth_rx_burst(_dev_port_idx, _qid, + buf, packet_read_size); + + /* Now process the NIC packets read */ + if (likely(count > 0)) { + process_packets(buf, count); +#ifdef CEPH_PERF_DEV + rx_cycles = Cycles::rdtsc() - start; + rx_count += count; +#endif + } +#ifdef CEPH_PERF_DEV + else { + if (rx_count > 10000 && tx_count) { + ldout(cct, 0) << __func__ << " rx count=" << rx_count << " avg rx=" << Cycles::to_nanoseconds(rx_cycles)/rx_count << "ns " + << " tx count=" << tx_count << " avg tx=" << Cycles::to_nanoseconds(tx_cycles)/tx_count << "ns" + << dendl; + rx_count = rx_cycles = tx_count = tx_cycles = 0; + } + } +#endif + + return count; +} + +DPDKQueuePair::tx_buf_factory::tx_buf_factory(CephContext *c, + DPDKDevice *dev, uint8_t qid): cct(c) +{ + std::string name = std::string(pktmbuf_pool_name) + std::to_string(qid) + "_tx"; + + _pool = rte_mempool_lookup(name.c_str()); + if (!_pool) { + ldout(cct, 0) << __func__ << " Creating Tx mbuf pool '" << name.c_str() + << "' [" << mbufs_per_queue_tx << " mbufs] ..." << dendl; + // + // We are going to push the buffers from the mempool into + // the circular_buffer and then poll them from there anyway, so + // we prefer to make a mempool non-atomic in this case. + // + _pool = rte_mempool_create(name.c_str(), + mbufs_per_queue_tx, inline_mbuf_size, + mbuf_cache_size, + sizeof(struct rte_pktmbuf_pool_private), + rte_pktmbuf_pool_init, nullptr, + rte_pktmbuf_init, nullptr, + rte_socket_id(), 0); + + if (!_pool) { + lderr(cct) << __func__ << " Failed to create mempool for Tx" << dendl; + ceph_abort(); + } + if (rte_eth_tx_queue_setup(dev->port_idx(), qid, default_ring_size, + rte_eth_dev_socket_id(dev->port_idx()), + dev->def_tx_conf()) < 0) { + lderr(cct) << __func__ << " cannot initialize tx queue" << dendl; + ceph_abort(); + } + } + + // + // Fill the factory with the buffers from the mempool allocated + // above. + // + init_factory(); +} + +bool DPDKQueuePair::tx_buf::i40e_should_linearize(rte_mbuf *head) +{ + bool is_tso = head->ol_flags & PKT_TX_TCP_SEG; + + // For a non-TSO case: number of fragments should not exceed 8 + if (!is_tso){ + return head->nb_segs > i40e_max_xmit_segment_frags; + } + + // + // For a TSO case each MSS window should not include more than 8 + // fragments including headers. + // + + // Calculate the number of frags containing headers. + // + // Note: we support neither VLAN nor tunneling thus headers size + // accounting is super simple. + // + size_t headers_size = head->l2_len + head->l3_len + head->l4_len; + unsigned hdr_frags = 0; + size_t cur_payload_len = 0; + rte_mbuf *cur_seg = head; + + while (cur_seg && cur_payload_len < headers_size) { + cur_payload_len += cur_seg->data_len; + cur_seg = cur_seg->next; + hdr_frags++; + } + + // + // Header fragments will be used for each TSO segment, thus the + // maximum number of data segments will be 8 minus the number of + // header fragments. + // + // It's unclear from the spec how the first TSO segment is treated + // if the last fragment with headers contains some data bytes: + // whether this fragment will be accounted as a single fragment or + // as two separate fragments. We prefer to play it safe and assume + // that this fragment will be accounted as two separate fragments. + // + size_t max_win_size = i40e_max_xmit_segment_frags - hdr_frags; + + if (head->nb_segs <= max_win_size) { + return false; + } + + // Get the data (without headers) part of the first data fragment + size_t prev_frag_data = cur_payload_len - headers_size; + auto mss = head->tso_segsz; + + while (cur_seg) { + unsigned frags_in_seg = 0; + size_t cur_seg_size = 0; + + if (prev_frag_data) { + cur_seg_size = prev_frag_data; + frags_in_seg++; + prev_frag_data = 0; + } + + while (cur_seg_size < mss && cur_seg) { + cur_seg_size += cur_seg->data_len; + cur_seg = cur_seg->next; + frags_in_seg++; + + if (frags_in_seg > max_win_size) { + return true; + } + } + + if (cur_seg_size > mss) { + prev_frag_data = cur_seg_size - mss; + } + } + + return false; +} + +void DPDKQueuePair::tx_buf::set_cluster_offload_info(const Packet& p, const DPDKQueuePair& qp, rte_mbuf* head) +{ + // Handle TCP checksum offload + auto oi = p.offload_info(); + if (oi.needs_ip_csum) { + head->ol_flags |= PKT_TX_IP_CKSUM; + // TODO: Take a VLAN header into an account here + head->l2_len = sizeof(struct ether_hdr); + head->l3_len = oi.ip_hdr_len; + } + if (qp.port().get_hw_features().tx_csum_l4_offload) { + if (oi.protocol == ip_protocol_num::tcp) { + head->ol_flags |= PKT_TX_TCP_CKSUM; + // TODO: Take a VLAN header into an account here + head->l2_len = sizeof(struct ether_hdr); + head->l3_len = oi.ip_hdr_len; + + if (oi.tso_seg_size) { + ceph_assert(oi.needs_ip_csum); + head->ol_flags |= PKT_TX_TCP_SEG; + head->l4_len = oi.tcp_hdr_len; + head->tso_segsz = oi.tso_seg_size; + } + } + } +} + +DPDKQueuePair::tx_buf* DPDKQueuePair::tx_buf::from_packet_zc( + CephContext *cct, Packet&& p, DPDKQueuePair& qp) +{ + // Too fragmented - linearize + if (p.nr_frags() > max_frags) { + p.linearize(); + qp.perf_logger->inc(l_dpdk_qp_tx_linearize_ops); + } + + build_mbuf_cluster: + rte_mbuf *head = nullptr, *last_seg = nullptr; + unsigned nsegs = 0; + + // + // Create a HEAD of the fragmented packet: check if frag0 has to be + // copied and if yes - send it in a copy way + // + if (!check_frag0(p)) { + if (!copy_one_frag(qp, p.frag(0), head, last_seg, nsegs)) { + ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(0).size << dendl; + return nullptr; + } + } else if (!translate_one_frag(qp, p.frag(0), head, last_seg, nsegs)) { + ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(0).size << dendl; + return nullptr; + } + + unsigned total_nsegs = nsegs; + + for (unsigned i = 1; i < p.nr_frags(); i++) { + rte_mbuf *h = nullptr, *new_last_seg = nullptr; + if (!translate_one_frag(qp, p.frag(i), h, new_last_seg, nsegs)) { + ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(i).size << dendl; + me(head)->recycle(); + return nullptr; + } + + total_nsegs += nsegs; + + // Attach a new buffers' chain to the packet chain + last_seg->next = h; + last_seg = new_last_seg; + } + + // Update the HEAD buffer with the packet info + head->pkt_len = p.len(); + head->nb_segs = total_nsegs; + + set_cluster_offload_info(p, qp, head); + + // + // If a packet hasn't been linearized already and the resulting + // cluster requires the linearisation due to HW limitation: + // + // - Recycle the cluster. + // - Linearize the packet. + // - Build the cluster once again + // + if (head->nb_segs > max_frags || + (p.nr_frags() > 1 && qp.port().is_i40e_device() && i40e_should_linearize(head)) || + (p.nr_frags() > vmxnet3_max_xmit_segment_frags && qp.port().is_vmxnet3_device())) { + me(head)->recycle(); + p.linearize(); + qp.perf_logger->inc(l_dpdk_qp_tx_linearize_ops); + + goto build_mbuf_cluster; + } + + me(last_seg)->set_packet(std::move(p)); + + return me(head); +} + +void DPDKQueuePair::tx_buf::copy_packet_to_cluster(const Packet& p, rte_mbuf* head) +{ + rte_mbuf* cur_seg = head; + size_t cur_seg_offset = 0; + unsigned cur_frag_idx = 0; + size_t cur_frag_offset = 0; + + while (true) { + size_t to_copy = std::min(p.frag(cur_frag_idx).size - cur_frag_offset, + inline_mbuf_data_size - cur_seg_offset); + + memcpy(rte_pktmbuf_mtod_offset(cur_seg, void*, cur_seg_offset), + p.frag(cur_frag_idx).base + cur_frag_offset, to_copy); + + cur_frag_offset += to_copy; + cur_seg_offset += to_copy; + + if (cur_frag_offset >= p.frag(cur_frag_idx).size) { + ++cur_frag_idx; + if (cur_frag_idx >= p.nr_frags()) { + // + // We are done - set the data size of the last segment + // of the cluster. + // + cur_seg->data_len = cur_seg_offset; + break; + } + + cur_frag_offset = 0; + } + + if (cur_seg_offset >= inline_mbuf_data_size) { + cur_seg->data_len = inline_mbuf_data_size; + cur_seg = cur_seg->next; + cur_seg_offset = 0; + + // FIXME: assert in a fast-path - remove!!! + ceph_assert(cur_seg); + } + } +} + +DPDKQueuePair::tx_buf* DPDKQueuePair::tx_buf::from_packet_copy(Packet&& p, DPDKQueuePair& qp) +{ + // sanity + if (!p.len()) { + return nullptr; + } + + /* + * Here we are going to use the fact that the inline data size is a + * power of two. + * + * We will first try to allocate the cluster and only if we are + * successful - we will go and copy the data. + */ + auto aligned_len = align_up((size_t)p.len(), inline_mbuf_data_size); + unsigned nsegs = aligned_len / inline_mbuf_data_size; + rte_mbuf *head = nullptr, *last_seg = nullptr; + + tx_buf* buf = qp.get_tx_buf(); + if (!buf) { + return nullptr; + } + + head = buf->rte_mbuf_p(); + last_seg = head; + for (unsigned i = 1; i < nsegs; i++) { + buf = qp.get_tx_buf(); + if (!buf) { + me(head)->recycle(); + return nullptr; + } + + last_seg->next = buf->rte_mbuf_p(); + last_seg = last_seg->next; + } + + // + // If we've got here means that we have succeeded already! + // We only need to copy the data and set the head buffer with the + // relevant info. + // + head->pkt_len = p.len(); + head->nb_segs = nsegs; + + copy_packet_to_cluster(p, head); + set_cluster_offload_info(p, qp, head); + + return me(head); +} + +size_t DPDKQueuePair::tx_buf::copy_one_data_buf( + DPDKQueuePair& qp, rte_mbuf*& m, char* data, size_t buf_len) +{ + tx_buf* buf = qp.get_tx_buf(); + if (!buf) { + return 0; + } + + size_t len = std::min(buf_len, inline_mbuf_data_size); + + m = buf->rte_mbuf_p(); + + // mbuf_put() + m->data_len = len; + m->pkt_len = len; + + qp.perf_logger->inc(l_dpdk_qp_tx_copy_ops); + qp.perf_logger->inc(l_dpdk_qp_tx_copy_bytes, len); + + memcpy(rte_pktmbuf_mtod(m, void*), data, len); + + return len; +} + +void DPDKDevice::set_rss_table() +{ + // always fill our local indirection table. + unsigned i = 0; + for (auto& r : _redir_table) { + r = i++ % _num_queues; + } + + if (_dev_info.reta_size == 0) + return; + + int reta_conf_size = std::max(1, _dev_info.reta_size / RTE_RETA_GROUP_SIZE); + rte_eth_rss_reta_entry64 reta_conf[reta_conf_size]; + + // Configure the HW indirection table + i = 0; + for (auto& x : reta_conf) { + x.mask = ~0ULL; + for (auto& r: x.reta) { + r = i++ % _num_queues; + } + } + + if (rte_eth_dev_rss_reta_update(_port_idx, reta_conf, _dev_info.reta_size)) { + rte_exit(EXIT_FAILURE, "Port %d: Failed to update an RSS indirection table", _port_idx); + } +} + +/******************************** Interface functions *************************/ + +std::unique_ptr<DPDKDevice> create_dpdk_net_device( + CephContext *cct, + unsigned cores, + uint8_t port_idx, + bool use_lro, + bool enable_fc) +{ + // Check that we have at least one DPDK-able port + if (rte_eth_dev_count() == 0) { + rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n"); + } else { + ldout(cct, 10) << __func__ << " ports number: " << int(rte_eth_dev_count()) << dendl; + } + + return std::unique_ptr<DPDKDevice>( + new DPDKDevice(cct, port_idx, cores, use_lro, enable_fc)); +} diff --git a/src/msg/async/dpdk/DPDK.h b/src/msg/async/dpdk/DPDK.h new file mode 100644 index 00000000..fa12af6b --- /dev/null +++ b/src/msg/async/dpdk/DPDK.h @@ -0,0 +1,918 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_DPDK_DEV_H +#define CEPH_DPDK_DEV_H + +#include <memory> +#include <functional> +#include <rte_config.h> +#include <rte_common.h> +#include <rte_ethdev.h> +#include <rte_malloc.h> +#include <rte_version.h> + +#include "include/page.h" +#include "common/Tub.h" +#include "common/perf_counters.h" +#include "msg/async/Event.h" +#include "const.h" +#include "circular_buffer.h" +#include "ethernet.h" +#include "Packet.h" +#include "stream.h" +#include "net.h" +#include "toeplitz.h" + + +struct free_deleter { + void operator()(void* p) { ::free(p); } +}; + + +enum { + l_dpdk_dev_first = 58800, + l_dpdk_dev_rx_mcast, + l_dpdk_dev_rx_total_errors, + l_dpdk_dev_tx_total_errors, + l_dpdk_dev_rx_badcrc_errors, + l_dpdk_dev_rx_dropped_errors, + l_dpdk_dev_rx_nombuf_errors, + l_dpdk_dev_last +}; + +enum { + l_dpdk_qp_first = 58900, + l_dpdk_qp_rx_packets, + l_dpdk_qp_tx_packets, + l_dpdk_qp_rx_bad_checksum_errors, + l_dpdk_qp_rx_no_memory_errors, + l_dpdk_qp_rx_bytes, + l_dpdk_qp_tx_bytes, + l_dpdk_qp_rx_last_bunch, + l_dpdk_qp_tx_last_bunch, + l_dpdk_qp_rx_fragments, + l_dpdk_qp_tx_fragments, + l_dpdk_qp_rx_copy_ops, + l_dpdk_qp_tx_copy_ops, + l_dpdk_qp_rx_copy_bytes, + l_dpdk_qp_tx_copy_bytes, + l_dpdk_qp_rx_linearize_ops, + l_dpdk_qp_tx_linearize_ops, + l_dpdk_qp_tx_queue_length, + l_dpdk_qp_last +}; + +class DPDKDevice; +class DPDKWorker; + +class DPDKQueuePair { + using packet_provider_type = std::function<Tub<Packet> ()>; + public: + void configure_proxies(const std::map<unsigned, float>& cpu_weights); + // build REdirection TAble for cpu_weights map: target cpu -> weight + void build_sw_reta(const std::map<unsigned, float>& cpu_weights); + void proxy_send(Packet p) { + _proxy_packetq.push_back(std::move(p)); + } + void register_packet_provider(packet_provider_type func) { + _pkt_providers.push_back(std::move(func)); + } + bool poll_tx(); + friend class DPDKDevice; + + class tx_buf_factory; + + class tx_buf { + friend class DPDKQueuePair; + public: + static tx_buf* me(rte_mbuf* mbuf) { + return reinterpret_cast<tx_buf*>(mbuf); + } + + private: + /** + * Checks if the original packet of a given cluster should be linearized + * due to HW limitations. + * + * @param head head of a cluster to check + * + * @return TRUE if a packet should be linearized. + */ + static bool i40e_should_linearize(rte_mbuf *head); + + /** + * Sets the offload info in the head buffer of an rte_mbufs cluster. + * + * @param p an original packet the cluster is built for + * @param qp QP handle + * @param head a head of an rte_mbufs cluster + */ + static void set_cluster_offload_info(const Packet& p, const DPDKQueuePair& qp, rte_mbuf* head); + + /** + * Creates a tx_buf cluster representing a given packet in a "zero-copy" + * way. + * + * @param p packet to translate + * @param qp DPDKQueuePair handle + * + * @return the HEAD tx_buf of the cluster or nullptr in case of a + * failure + */ + static tx_buf* from_packet_zc( + CephContext *cct, Packet&& p, DPDKQueuePair& qp); + + /** + * Copy the contents of the "packet" into the given cluster of + * rte_mbuf's. + * + * @note Size of the cluster has to be big enough to accommodate all the + * contents of the given packet. + * + * @param p packet to copy + * @param head head of the rte_mbuf's cluster + */ + static void copy_packet_to_cluster(const Packet& p, rte_mbuf* head); + + /** + * Creates a tx_buf cluster representing a given packet in a "copy" way. + * + * @param p packet to translate + * @param qp DPDKQueuePair handle + * + * @return the HEAD tx_buf of the cluster or nullptr in case of a + * failure + */ + static tx_buf* from_packet_copy(Packet&& p, DPDKQueuePair& qp); + + /** + * Zero-copy handling of a single fragment. + * + * @param do_one_buf Functor responsible for a single rte_mbuf + * handling + * @param qp DPDKQueuePair handle (in) + * @param frag Fragment to copy (in) + * @param head Head of the cluster (out) + * @param last_seg Last segment of the cluster (out) + * @param nsegs Number of segments in the cluster (out) + * + * @return TRUE in case of success + */ + template <class DoOneBufFunc> + static bool do_one_frag(DoOneBufFunc do_one_buf, DPDKQueuePair& qp, + fragment& frag, rte_mbuf*& head, + rte_mbuf*& last_seg, unsigned& nsegs) { + size_t len, left_to_set = frag.size; + char* base = frag.base; + + rte_mbuf* m; + + // TODO: ceph_assert() in a fast path! Remove me ASAP! + ceph_assert(frag.size); + + // Create a HEAD of mbufs' cluster and set the first bytes into it + len = do_one_buf(qp, head, base, left_to_set); + if (!len) { + return false; + } + + left_to_set -= len; + base += len; + nsegs = 1; + + // + // Set the rest of the data into the new mbufs and chain them to + // the cluster. + // + rte_mbuf* prev_seg = head; + while (left_to_set) { + len = do_one_buf(qp, m, base, left_to_set); + if (!len) { + me(head)->recycle(); + return false; + } + + left_to_set -= len; + base += len; + nsegs++; + + prev_seg->next = m; + prev_seg = m; + } + + // Return the last mbuf in the cluster + last_seg = prev_seg; + + return true; + } + + /** + * Zero-copy handling of a single fragment. + * + * @param qp DPDKQueuePair handle (in) + * @param frag Fragment to copy (in) + * @param head Head of the cluster (out) + * @param last_seg Last segment of the cluster (out) + * @param nsegs Number of segments in the cluster (out) + * + * @return TRUE in case of success + */ + static bool translate_one_frag(DPDKQueuePair& qp, fragment& frag, + rte_mbuf*& head, rte_mbuf*& last_seg, + unsigned& nsegs) { + return do_one_frag(set_one_data_buf, qp, frag, head, + last_seg, nsegs); + } + + /** + * Copies one fragment into the cluster of rte_mbuf's. + * + * @param qp DPDKQueuePair handle (in) + * @param frag Fragment to copy (in) + * @param head Head of the cluster (out) + * @param last_seg Last segment of the cluster (out) + * @param nsegs Number of segments in the cluster (out) + * + * We return the "last_seg" to avoid traversing the cluster in order to get + * it. + * + * @return TRUE in case of success + */ + static bool copy_one_frag(DPDKQueuePair& qp, fragment& frag, + rte_mbuf*& head, rte_mbuf*& last_seg, + unsigned& nsegs) { + return do_one_frag(copy_one_data_buf, qp, frag, head, + last_seg, nsegs); + } + + /** + * Allocates a single rte_mbuf and sets it to point to a given data + * buffer. + * + * @param qp DPDKQueuePair handle (in) + * @param m New allocated rte_mbuf (out) + * @param va virtual address of a data buffer (in) + * @param buf_len length of the data to copy (in) + * + * @return The actual number of bytes that has been set in the mbuf + */ + static size_t set_one_data_buf( + DPDKQueuePair& qp, rte_mbuf*& m, char* va, size_t buf_len) { + static constexpr size_t max_frag_len = 15 * 1024; // 15K + + // FIXME: current all tx buf is allocated without rte_malloc + return copy_one_data_buf(qp, m, va, buf_len); + // + // Currently we break a buffer on a 15K boundary because 82599 + // devices have a 15.5K limitation on a maximum single fragment + // size. + // + rte_iova_t pa = rte_malloc_virt2iova(va); + if (!pa) + return copy_one_data_buf(qp, m, va, buf_len); + + ceph_assert(buf_len); + tx_buf* buf = qp.get_tx_buf(); + if (!buf) { + return 0; + } + + size_t len = std::min(buf_len, max_frag_len); + + buf->set_zc_info(va, pa, len); + m = buf->rte_mbuf_p(); + + return len; + } + + /** + * Allocates a single rte_mbuf and copies a given data into it. + * + * @param qp DPDKQueuePair handle (in) + * @param m New allocated rte_mbuf (out) + * @param data Data to copy from (in) + * @param buf_len length of the data to copy (in) + * + * @return The actual number of bytes that has been copied + */ + static size_t copy_one_data_buf( + DPDKQueuePair& qp, rte_mbuf*& m, char* data, size_t buf_len); + + /** + * Checks if the first fragment of the given packet satisfies the + * zero-copy flow requirement: its first 128 bytes should not cross the + * 4K page boundary. This is required in order to avoid splitting packet + * headers. + * + * @param p packet to check + * + * @return TRUE if packet is ok and FALSE otherwise. + */ + static bool check_frag0(Packet& p) + { + // + // First frag is special - it has headers that should not be split. + // If the addressing is such that the first fragment has to be + // split, then send this packet in a (non-zero) copy flow. We'll + // check if the first 128 bytes of the first fragment reside in the + // physically contiguous area. If that's the case - we are good to + // go. + // + if (p.frag(0).size < 128) + return false; + + return true; + } + + public: + tx_buf(tx_buf_factory& fc) : _fc(fc) { + + _buf_physaddr = _mbuf.buf_physaddr; + _data_off = _mbuf.data_off; + } + + rte_mbuf* rte_mbuf_p() { return &_mbuf; } + + void set_zc_info(void* va, phys_addr_t pa, size_t len) { + // mbuf_put() + _mbuf.data_len = len; + _mbuf.pkt_len = len; + + // Set the mbuf to point to our data + _mbuf.buf_addr = va; + _mbuf.buf_physaddr = pa; + _mbuf.data_off = 0; + _is_zc = true; + } + + void reset_zc() { + + // + // If this mbuf was the last in a cluster and contains an + // original packet object then call the destructor of the + // original packet object. + // + if (_p) { + // + // Reset the std::optional. This in particular is going + // to call the "packet"'s destructor and reset the + // "optional" state to "nonengaged". + // + _p.destroy(); + + } else if (!_is_zc) { + return; + } + + // Restore the rte_mbuf fields we trashed in set_zc_info() + _mbuf.buf_physaddr = _buf_physaddr; + _mbuf.buf_addr = rte_mbuf_to_baddr(&_mbuf); + _mbuf.data_off = _data_off; + + _is_zc = false; + } + + void recycle() { + struct rte_mbuf *m = &_mbuf, *m_next; + + while (m != nullptr) { + m_next = m->next; + rte_pktmbuf_reset(m); + _fc.put(me(m)); + m = m_next; + } + } + + void set_packet(Packet&& p) { + _p = std::move(p); + } + + private: + struct rte_mbuf _mbuf; + MARKER private_start; + Tub<Packet> _p; + phys_addr_t _buf_physaddr; + uint16_t _data_off; + // TRUE if underlying mbuf has been used in the zero-copy flow + bool _is_zc = false; + // buffers' factory the buffer came from + tx_buf_factory& _fc; + MARKER private_end; + }; + + class tx_buf_factory { + // + // Number of buffers to free in each GC iteration: + // We want the buffers to be allocated from the mempool as many as + // possible. + // + // On the other hand if there is no Tx for some time we want the + // completions to be eventually handled. Thus we choose the smallest + // possible packets count number here. + // + static constexpr int gc_count = 1; + public: + tx_buf_factory(CephContext *c, DPDKDevice *dev, uint8_t qid); + ~tx_buf_factory() { + // put all mbuf back into mempool in order to make the next factory work + while (gc()); + rte_mempool_put_bulk(_pool, (void**)_ring.data(), + _ring.size()); + } + + + /** + * @note Should not be called if there are no free tx_buf's + * + * @return a free tx_buf object + */ + tx_buf* get() { + // Take completed from the HW first + tx_buf *pkt = get_one_completed(); + if (pkt) { + pkt->reset_zc(); + return pkt; + } + + // + // If there are no completed at the moment - take from the + // factory's cache. + // + if (_ring.empty()) { + return nullptr; + } + + pkt = _ring.back(); + _ring.pop_back(); + + return pkt; + } + + void put(tx_buf* buf) { + buf->reset_zc(); + _ring.push_back(buf); + } + + bool gc() { + for (int cnt = 0; cnt < gc_count; ++cnt) { + auto tx_buf_p = get_one_completed(); + if (!tx_buf_p) { + return false; + } + + put(tx_buf_p); + } + + return true; + } + private: + /** + * Fill the mbufs circular buffer: after this the _pool will become + * empty. We will use it to catch the completed buffers: + * + * - Underlying PMD drivers will "free" the mbufs once they are + * completed. + * - We will poll the _pktmbuf_pool_tx till it's empty and release + * all the buffers from the freed mbufs. + */ + void init_factory() { + while (rte_mbuf* mbuf = rte_pktmbuf_alloc(_pool)) { + _ring.push_back(new(tx_buf::me(mbuf)) tx_buf{*this}); + } + } + + /** + * PMD puts the completed buffers back into the mempool they have + * originally come from. + * + * @note rte_pktmbuf_alloc() resets the mbuf so there is no need to call + * rte_pktmbuf_reset() here again. + * + * @return a single tx_buf that has been completed by HW. + */ + tx_buf* get_one_completed() { + return tx_buf::me(rte_pktmbuf_alloc(_pool)); + } + + private: + CephContext *cct; + std::vector<tx_buf*> _ring; + rte_mempool* _pool = nullptr; + }; + + public: + explicit DPDKQueuePair(CephContext *c, EventCenter *cen, DPDKDevice* dev, uint8_t qid); + ~DPDKQueuePair() { + if (device_stat_time_fd) { + center->delete_time_event(device_stat_time_fd); + } + rx_gc(true); + } + + void rx_start() { + _rx_poller.construct(this); + } + + uint32_t send(circular_buffer<Packet>& pb) { + // Zero-copy send + return _send(pb, [&] (Packet&& p) { + return tx_buf::from_packet_zc(cct, std::move(p), *this); + }); + } + + DPDKDevice& port() const { return *_dev; } + tx_buf* get_tx_buf() { return _tx_buf_factory.get(); } + + void handle_stats(); + + private: + template <class Func> + uint32_t _send(circular_buffer<Packet>& pb, Func &&packet_to_tx_buf_p) { + if (_tx_burst.size() == 0) { + for (auto&& p : pb) { + // TODO: ceph_assert() in a fast path! Remove me ASAP! + ceph_assert(p.len()); + + tx_buf* buf = packet_to_tx_buf_p(std::move(p)); + if (!buf) { + break; + } + + _tx_burst.push_back(buf->rte_mbuf_p()); + } + } + + uint16_t sent = rte_eth_tx_burst(_dev_port_idx, _qid, + _tx_burst.data() + _tx_burst_idx, + _tx_burst.size() - _tx_burst_idx); + + uint64_t nr_frags = 0, bytes = 0; + + for (int i = 0; i < sent; i++) { + rte_mbuf* m = _tx_burst[_tx_burst_idx + i]; + bytes += m->pkt_len; + nr_frags += m->nb_segs; + pb.pop_front(); + } + + perf_logger->inc(l_dpdk_qp_tx_fragments, nr_frags); + perf_logger->inc(l_dpdk_qp_tx_bytes, bytes); + + _tx_burst_idx += sent; + + if (_tx_burst_idx == _tx_burst.size()) { + _tx_burst_idx = 0; + _tx_burst.clear(); + } + + return sent; + } + + /** + * Allocate a new data buffer and set the mbuf to point to it. + * + * Do some DPDK hacks to work on PMD: it assumes that the buf_addr + * points to the private data of RTE_PKTMBUF_HEADROOM before the actual + * data buffer. + * + * @param m mbuf to update + */ + static bool refill_rx_mbuf(rte_mbuf* m, size_t size, + std::vector<void*> &datas) { + if (datas.empty()) + return false; + void *data = datas.back(); + datas.pop_back(); + + // + // Set the mbuf to point to our data. + // + // Do some DPDK hacks to work on PMD: it assumes that the buf_addr + // points to the private data of RTE_PKTMBUF_HEADROOM before the + // actual data buffer. + // + m->buf_addr = (char*)data - RTE_PKTMBUF_HEADROOM; + m->buf_physaddr = rte_mem_virt2phy(data) - RTE_PKTMBUF_HEADROOM; + return true; + } + + bool init_rx_mbuf_pool(); + bool rx_gc(bool force=false); + bool refill_one_cluster(rte_mbuf* head); + + /** + * Polls for a burst of incoming packets. This function will not block and + * will immediately return after processing all available packets. + * + */ + bool poll_rx_once(); + + /** + * Translates an rte_mbuf's into packet and feeds them to _rx_stream. + * + * @param bufs An array of received rte_mbuf's + * @param count Number of buffers in the bufs[] + */ + void process_packets(struct rte_mbuf **bufs, uint16_t count); + + /** + * Translate rte_mbuf into the "packet". + * @param m mbuf to translate + * + * @return a "optional" object representing the newly received data if in an + * "engaged" state or an error if in a "disengaged" state. + */ + Tub<Packet> from_mbuf(rte_mbuf* m); + + /** + * Transform an LRO rte_mbuf cluster into the "packet" object. + * @param m HEAD of the mbufs' cluster to transform + * + * @return a "optional" object representing the newly received LRO packet if + * in an "engaged" state or an error if in a "disengaged" state. + */ + Tub<Packet> from_mbuf_lro(rte_mbuf* m); + + private: + CephContext *cct; + std::vector<packet_provider_type> _pkt_providers; + Tub<std::array<uint8_t, 128>> _sw_reta; + circular_buffer<Packet> _proxy_packetq; + stream<Packet> _rx_stream; + circular_buffer<Packet> _tx_packetq; + std::vector<void*> _alloc_bufs; + + PerfCounters *perf_logger; + DPDKDevice* _dev; + uint8_t _dev_port_idx; + EventCenter *center; + uint8_t _qid; + rte_mempool *_pktmbuf_pool_rx; + std::vector<rte_mbuf*> _rx_free_pkts; + std::vector<rte_mbuf*> _rx_free_bufs; + std::vector<fragment> _frags; + std::vector<char*> _bufs; + size_t _num_rx_free_segs = 0; + uint64_t device_stat_time_fd = 0; + +#ifdef CEPH_PERF_DEV + uint64_t rx_cycles = 0; + uint64_t rx_count = 0; + uint64_t tx_cycles = 0; + uint64_t tx_count = 0; +#endif + + class DPDKTXPoller : public EventCenter::Poller { + DPDKQueuePair *qp; + + public: + explicit DPDKTXPoller(DPDKQueuePair *qp) + : EventCenter::Poller(qp->center, "DPDK::DPDKTXPoller"), qp(qp) {} + + virtual int poll() { + return qp->poll_tx(); + } + } _tx_poller; + + class DPDKRXGCPoller : public EventCenter::Poller { + DPDKQueuePair *qp; + + public: + explicit DPDKRXGCPoller(DPDKQueuePair *qp) + : EventCenter::Poller(qp->center, "DPDK::DPDKRXGCPoller"), qp(qp) {} + + virtual int poll() { + return qp->rx_gc(); + } + } _rx_gc_poller; + tx_buf_factory _tx_buf_factory; + class DPDKRXPoller : public EventCenter::Poller { + DPDKQueuePair *qp; + + public: + explicit DPDKRXPoller(DPDKQueuePair *qp) + : EventCenter::Poller(qp->center, "DPDK::DPDKRXPoller"), qp(qp) {} + + virtual int poll() { + return qp->poll_rx_once(); + } + }; + Tub<DPDKRXPoller> _rx_poller; + class DPDKTXGCPoller : public EventCenter::Poller { + DPDKQueuePair *qp; + + public: + explicit DPDKTXGCPoller(DPDKQueuePair *qp) + : EventCenter::Poller(qp->center, "DPDK::DPDKTXGCPoller"), qp(qp) {} + + virtual int poll() { + return qp->_tx_buf_factory.gc(); + } + } _tx_gc_poller; + std::vector<rte_mbuf*> _tx_burst; + uint16_t _tx_burst_idx = 0; +}; + +class DPDKDevice { + public: + CephContext *cct; + PerfCounters *perf_logger; + std::vector<std::unique_ptr<DPDKQueuePair>> _queues; + std::vector<DPDKWorker*> workers; + size_t _rss_table_bits = 0; + uint8_t _port_idx; + uint16_t _num_queues; + unsigned cores; + hw_features _hw_features; + uint8_t _queues_ready = 0; + unsigned _home_cpu; + bool _use_lro; + bool _enable_fc; + std::vector<uint8_t> _redir_table; + rss_key_type _rss_key; + bool _is_i40e_device = false; + bool _is_vmxnet3_device = false; + + public: + rte_eth_dev_info _dev_info = {}; + + /** + * The final stage of a port initialization. + * @note Must be called *after* all queues from stage (2) have been + * initialized. + */ + int init_port_fini(); + + private: + /** + * Port initialization consists of 3 main stages: + * 1) General port initialization which ends with a call to + * rte_eth_dev_configure() where we request the needed number of Rx and + * Tx queues. + * 2) Individual queues initialization. This is done in the constructor of + * DPDKQueuePair class. In particular the memory pools for queues are allocated + * in this stage. + * 3) The final stage of the initialization which starts with the call of + * rte_eth_dev_start() after which the port becomes fully functional. We + * will also wait for a link to get up in this stage. + */ + + + /** + * First stage of the port initialization. + * + * @return 0 in case of success and an appropriate error code in case of an + * error. + */ + int init_port_start(); + + /** + * Check the link status of out port in up to 9s, and print them finally. + */ + int check_port_link_status(); + + /** + * Configures the HW Flow Control + */ + void set_hw_flow_control(); + + public: + DPDKDevice(CephContext *c, uint8_t port_idx, uint16_t num_queues, bool use_lro, bool enable_fc): + cct(c), _port_idx(port_idx), _num_queues(num_queues), + _home_cpu(0), _use_lro(use_lro), + _enable_fc(enable_fc) { + _queues = std::vector<std::unique_ptr<DPDKQueuePair>>(_num_queues); + /* now initialise the port we will use */ + int ret = init_port_start(); + if (ret != 0) { + rte_exit(EXIT_FAILURE, "Cannot initialise port %u\n", _port_idx); + } + string name(std::string("port") + std::to_string(port_idx)); + PerfCountersBuilder plb(cct, name, l_dpdk_dev_first, l_dpdk_dev_last); + + plb.add_u64_counter(l_dpdk_dev_rx_mcast, "dpdk_device_receive_multicast_packets", "DPDK received multicast packets"); + plb.add_u64_counter(l_dpdk_dev_rx_badcrc_errors, "dpdk_device_receive_badcrc_errors", "DPDK received bad crc errors"); + + plb.add_u64_counter(l_dpdk_dev_rx_total_errors, "dpdk_device_receive_total_errors", "DPDK received total_errors"); + plb.add_u64_counter(l_dpdk_dev_tx_total_errors, "dpdk_device_send_total_errors", "DPDK sendd total_errors"); + plb.add_u64_counter(l_dpdk_dev_rx_dropped_errors, "dpdk_device_receive_dropped_errors", "DPDK received dropped errors"); + plb.add_u64_counter(l_dpdk_dev_rx_nombuf_errors, "dpdk_device_receive_nombuf_errors", "DPDK received RX mbuf allocation errors"); + + perf_logger = plb.create_perf_counters(); + cct->get_perfcounters_collection()->add(perf_logger); + } + + ~DPDKDevice() { + rte_eth_dev_stop(_port_idx); + } + + DPDKQueuePair& queue_for_cpu(unsigned cpu) { return *_queues[cpu]; } + void l2receive(int qid, Packet p) { + _queues[qid]->_rx_stream.produce(std::move(p)); + } + subscription<Packet> receive(unsigned cpuid, std::function<int (Packet)> next_packet) { + auto sub = _queues[cpuid]->_rx_stream.listen(std::move(next_packet)); + _queues[cpuid]->rx_start(); + return std::move(sub); + } + ethernet_address hw_address() { + struct ether_addr mac; + rte_eth_macaddr_get(_port_idx, &mac); + + return mac.addr_bytes; + } + hw_features get_hw_features() { + return _hw_features; + } + const rss_key_type& rss_key() const { return _rss_key; } + uint16_t hw_queues_count() { return _num_queues; } + std::unique_ptr<DPDKQueuePair> init_local_queue(CephContext *c, EventCenter *center, string hugepages, uint16_t qid) { + std::unique_ptr<DPDKQueuePair> qp; + qp = std::unique_ptr<DPDKQueuePair>(new DPDKQueuePair(c, center, this, qid)); + return std::move(qp); + } + unsigned hash2qid(uint32_t hash) { + // return hash % hw_queues_count(); + return _redir_table[hash & (_redir_table.size() - 1)]; + } + void set_local_queue(unsigned i, std::unique_ptr<DPDKQueuePair> qp) { + ceph_assert(!_queues[i]); + _queues[i] = std::move(qp); + } + void unset_local_queue(unsigned i) { + ceph_assert(_queues[i]); + _queues[i].reset(); + } + template <typename Func> + unsigned forward_dst(unsigned src_cpuid, Func&& hashfn) { + auto& qp = queue_for_cpu(src_cpuid); + if (!qp._sw_reta) + return src_cpuid; + + ceph_assert(!qp._sw_reta); + auto hash = hashfn() >> _rss_table_bits; + auto& reta = *qp._sw_reta; + return reta[hash % reta.size()]; + } + unsigned hash2cpu(uint32_t hash) { + // there is an assumption here that qid == get_id() which will + // not necessary be true in the future + return forward_dst(hash2qid(hash), [hash] { return hash; }); + } + + hw_features& hw_features_ref() { return _hw_features; } + + const rte_eth_rxconf* def_rx_conf() const { + return &_dev_info.default_rxconf; + } + + const rte_eth_txconf* def_tx_conf() const { + return &_dev_info.default_txconf; + } + + /** + * Set the RSS table in the device and store it in the internal vector. + */ + void set_rss_table(); + + uint8_t port_idx() { return _port_idx; } + bool is_i40e_device() const { + return _is_i40e_device; + } + bool is_vmxnet3_device() const { + return _is_vmxnet3_device; + } +}; + + +std::unique_ptr<DPDKDevice> create_dpdk_net_device( + CephContext *c, unsigned cores, uint8_t port_idx = 0, + bool use_lro = true, bool enable_fc = true); + + +/** + * @return Number of bytes needed for mempool objects of each QP. + */ +uint32_t qp_mempool_obj_size(); + +#endif // CEPH_DPDK_DEV_H diff --git a/src/msg/async/dpdk/DPDKStack.cc b/src/msg/async/dpdk/DPDKStack.cc new file mode 100644 index 00000000..3101ae57 --- /dev/null +++ b/src/msg/async/dpdk/DPDKStack.cc @@ -0,0 +1,281 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <memory> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +#include <tuple> + +#include "common/ceph_argparse.h" +#include "dpdk_rte.h" +#include "DPDKStack.h" +#include "DPDK.h" +#include "IP.h" +#include "TCP-Stack.h" + +#include "common/dout.h" +#include "include/ceph_assert.h" +#include "common/Cond.h" + +#define dout_subsys ceph_subsys_dpdk +#undef dout_prefix +#define dout_prefix *_dout << "dpdkstack " + +static int dpdk_thread_adaptor(void* f) +{ + (*static_cast<std::function<void ()>*>(f))(); + return 0; +} + +void DPDKWorker::initialize() +{ + static enum { + WAIT_DEVICE_STAGE, + WAIT_PORT_FIN_STAGE, + DONE + } create_stage = WAIT_DEVICE_STAGE; + static Mutex lock("DPDKStack::lock"); + static Cond cond; + static unsigned queue_init_done = 0; + static unsigned cores = 0; + static std::shared_ptr<DPDKDevice> sdev; + + unsigned i = center.get_id(); + if (i == 0) { + // Hardcoded port index 0. + // TODO: Inherit it from the opts + cores = cct->_conf->ms_async_op_threads; + std::unique_ptr<DPDKDevice> dev = create_dpdk_net_device( + cct, cores, cct->_conf->ms_dpdk_port_id, + cct->_conf->ms_dpdk_lro, + cct->_conf->ms_dpdk_hw_flow_control); + sdev = std::shared_ptr<DPDKDevice>(dev.release()); + sdev->workers.resize(cores); + ldout(cct, 1) << __func__ << " using " << cores << " cores " << dendl; + + Mutex::Locker l(lock); + create_stage = WAIT_PORT_FIN_STAGE; + cond.Signal(); + } else { + Mutex::Locker l(lock); + while (create_stage <= WAIT_DEVICE_STAGE) + cond.Wait(lock); + } + ceph_assert(sdev); + if (i < sdev->hw_queues_count()) { + auto qp = sdev->init_local_queue(cct, ¢er, cct->_conf->ms_dpdk_hugepages, i); + std::map<unsigned, float> cpu_weights; + for (unsigned j = sdev->hw_queues_count() + i % sdev->hw_queues_count(); + j < cores; j+= sdev->hw_queues_count()) + cpu_weights[i] = 1; + cpu_weights[i] = cct->_conf->ms_dpdk_hw_queue_weight; + qp->configure_proxies(cpu_weights); + sdev->set_local_queue(i, std::move(qp)); + Mutex::Locker l(lock); + ++queue_init_done; + cond.Signal(); + } else { + // auto master = qid % sdev->hw_queues_count(); + // sdev->set_local_queue(create_proxy_net_device(master, sdev.get())); + ceph_abort(); + } + if (i == 0) { + { + Mutex::Locker l(lock); + while (queue_init_done < cores) + cond.Wait(lock); + } + + if (sdev->init_port_fini() < 0) { + lderr(cct) << __func__ << " init_port_fini failed " << dendl; + ceph_abort(); + } + Mutex::Locker l(lock); + create_stage = DONE; + cond.Signal(); + } else { + Mutex::Locker l(lock); + while (create_stage <= WAIT_PORT_FIN_STAGE) + cond.Wait(lock); + } + + sdev->workers[i] = this; + _impl = std::unique_ptr<DPDKWorker::Impl>( + new DPDKWorker::Impl(cct, i, ¢er, sdev)); + { + Mutex::Locker l(lock); + if (!--queue_init_done) { + create_stage = WAIT_DEVICE_STAGE; + sdev.reset(); + } + } +} + +using AvailableIPAddress = std::tuple<string, string, string>; +static bool parse_available_address( + const string &ips, const string &gates, const string &masks, vector<AvailableIPAddress> &res) +{ + vector<string> ip_vec, gate_vec, mask_vec; + string_to_vec(ip_vec, ips); + string_to_vec(gate_vec, gates); + string_to_vec(mask_vec, masks); + if (ip_vec.empty() || ip_vec.size() != gate_vec.size() || ip_vec.size() != mask_vec.size()) + return false; + + for (size_t i = 0; i < ip_vec.size(); ++i) { + res.push_back(AvailableIPAddress{ip_vec[i], gate_vec[i], mask_vec[i]}); + } + return true; +} + +static bool match_available_address(const vector<AvailableIPAddress> &avails, + const entity_addr_t &ip, int &res) +{ + for (size_t i = 0; i < avails.size(); ++i) { + entity_addr_t addr; + auto a = std::get<0>(avails[i]).c_str(); + if (!addr.parse(a)) + continue; + if (addr.is_same_host(ip)) { + res = i; + return true; + } + } + return false; +} + +DPDKWorker::Impl::Impl(CephContext *cct, unsigned i, EventCenter *c, std::shared_ptr<DPDKDevice> dev) + : id(i), _netif(cct, dev, c), _dev(dev), _inet(cct, c, &_netif) +{ + vector<AvailableIPAddress> tuples; + bool parsed = parse_available_address(cct->_conf.get_val<std::string>("ms_dpdk_host_ipv4_addr"), + cct->_conf.get_val<std::string>("ms_dpdk_gateway_ipv4_addr"), + cct->_conf.get_val<std::string>("ms_dpdk_netmask_ipv4_addr"), tuples); + if (!parsed) { + lderr(cct) << __func__ << " no available address " + << cct->_conf.get_val<std::string>("ms_dpdk_host_ipv4_addr") << ", " + << cct->_conf.get_val<std::string>("ms_dpdk_gateway_ipv4_addr") << ", " + << cct->_conf.get_val<std::string>("ms_dpdk_netmask_ipv4_addr") << ", " + << dendl; + ceph_abort(); + } + _inet.set_host_address(ipv4_address(std::get<0>(tuples[0]))); + _inet.set_gw_address(ipv4_address(std::get<1>(tuples[0]))); + _inet.set_netmask_address(ipv4_address(std::get<2>(tuples[0]))); +} + +DPDKWorker::Impl::~Impl() +{ + _dev->unset_local_queue(id); +} + +int DPDKWorker::listen(entity_addr_t &sa, const SocketOptions &opt, + ServerSocket *sock) +{ + ceph_assert(sa.get_family() == AF_INET); + ceph_assert(sock); + + ldout(cct, 10) << __func__ << " addr " << sa << dendl; + // vector<AvailableIPAddress> tuples; + // bool parsed = parse_available_address(cct->_conf->ms_dpdk_host_ipv4_addr, + // cct->_conf->ms_dpdk_gateway_ipv4_addr, + // cct->_conf->ms_dpdk_netmask_ipv4_addr, tuples); + // if (!parsed) { + // lderr(cct) << __func__ << " no available address " + // << cct->_conf->ms_dpdk_host_ipv4_addr << ", " + // << cct->_conf->ms_dpdk_gateway_ipv4_addr << ", " + // << cct->_conf->ms_dpdk_netmask_ipv4_addr << ", " + // << dendl; + // return -EINVAL; + // } + // int idx; + // parsed = match_available_address(tuples, sa, idx); + // if (!parsed) { + // lderr(cct) << __func__ << " no matched address for " << sa << dendl; + // return -EINVAL; + // } + // _inet.set_host_address(ipv4_address(std::get<0>(tuples[idx]))); + // _inet.set_gw_address(ipv4_address(std::get<1>(tuples[idx]))); + // _inet.set_netmask_address(ipv4_address(std::get<2>(tuples[idx]))); + return tcpv4_listen(_impl->_inet.get_tcp(), sa.get_port(), opt, sa.get_type(), + sock); +} + +int DPDKWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) +{ + // ceph_assert(addr.get_family() == AF_INET); + int r = tcpv4_connect(_impl->_inet.get_tcp(), addr, socket); + ldout(cct, 10) << __func__ << " addr " << addr << dendl; + return r; +} + +void DPDKStack::spawn_worker(unsigned i, std::function<void ()> &&func) +{ + // create a extra master thread + // + funcs[i] = std::move(func); + int r = 0; + r = dpdk::eal::init(cct); + if (r < 0) { + lderr(cct) << __func__ << " init dpdk rte failed, r=" << r << dendl; + ceph_abort(); + } + // if dpdk::eal::init already called by NVMEDevice, we will select 1..n + // cores + ceph_assert(rte_lcore_count() >= i + 1); + unsigned core_id; + int j = i; + RTE_LCORE_FOREACH_SLAVE(core_id) { + if (i-- == 0) { + break; + } + } + dpdk::eal::execute_on_master([&]() { + r = rte_eal_remote_launch(dpdk_thread_adaptor, static_cast<void*>(&funcs[j]), core_id); + if (r < 0) { + lderr(cct) << __func__ << " remote launch failed, r=" << r << dendl; + ceph_abort(); + } + }); +} + +void DPDKStack::join_worker(unsigned i) +{ + dpdk::eal::execute_on_master([&]() { + rte_eal_wait_lcore(i+1); + }); +} diff --git a/src/msg/async/dpdk/DPDKStack.h b/src/msg/async/dpdk/DPDKStack.h new file mode 100644 index 00000000..a44ae383 --- /dev/null +++ b/src/msg/async/dpdk/DPDKStack.h @@ -0,0 +1,257 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#ifndef CEPH_MSG_DPDKSTACK_H +#define CEPH_MSG_DPDKSTACK_H + +#include <functional> + +#include "common/ceph_context.h" +#include "common/Tub.h" + +#include "msg/async/Stack.h" +#include "net.h" +#include "const.h" +#include "IP.h" +#include "Packet.h" + +class interface; + +template <typename Protocol> +class NativeConnectedSocketImpl; + +// DPDKServerSocketImpl +template <typename Protocol> +class DPDKServerSocketImpl : public ServerSocketImpl { + typename Protocol::listener _listener; + public: + DPDKServerSocketImpl(Protocol& proto, uint16_t port, const SocketOptions &opt, + int type); + int listen() { + return _listener.listen(); + } + virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override; + virtual void abort_accept() override; + virtual int fd() const override { + return _listener.fd(); + } +}; + +// NativeConnectedSocketImpl +template <typename Protocol> +class NativeConnectedSocketImpl : public ConnectedSocketImpl { + typename Protocol::connection _conn; + uint32_t _cur_frag = 0; + uint32_t _cur_off = 0; + Tub<Packet> _buf; + Tub<bufferptr> _cache_ptr; + + public: + explicit NativeConnectedSocketImpl(typename Protocol::connection conn) + : _conn(std::move(conn)) {} + NativeConnectedSocketImpl(NativeConnectedSocketImpl &&rhs) + : _conn(std::move(rhs._conn)), _buf(std::move(rhs.buf)) {} + virtual int is_connected() override { + return _conn.is_connected(); + } + + virtual ssize_t read(char *buf, size_t len) override { + size_t left = len; + ssize_t r = 0; + size_t off = 0; + while (left > 0) { + if (!_cache_ptr) { + _cache_ptr.construct(); + r = zero_copy_read(*_cache_ptr); + if (r <= 0) { + _cache_ptr.destroy(); + if (r == -EAGAIN) + break; + return r; + } + } + if (_cache_ptr->length() <= left) { + _cache_ptr->copy_out(0, _cache_ptr->length(), buf+off); + left -= _cache_ptr->length(); + off += _cache_ptr->length(); + _cache_ptr.destroy(); + } else { + _cache_ptr->copy_out(0, left, buf+off); + _cache_ptr->set_offset(_cache_ptr->offset() + left); + _cache_ptr->set_length(_cache_ptr->length() - left); + left = 0; + break; + } + } + return len - left ? len - left : -EAGAIN; + } + + virtual ssize_t zero_copy_read(bufferptr &data) override { + auto err = _conn.get_errno(); + if (err <= 0) + return err; + + if (!_buf) { + _buf = std::move(_conn.read()); + if (!_buf) + return -EAGAIN; + } + + fragment &f = _buf->frag(_cur_frag); + Packet p = _buf->share(_cur_off, f.size); + auto del = std::bind( + [](Packet &p) {}, std::move(p)); + data = buffer::claim_buffer( + f.size, f.base, make_deleter(std::move(del))); + if (++_cur_frag == _buf->nr_frags()) { + _cur_frag = 0; + _cur_off = 0; + _buf.destroy(); + } else { + _cur_off += f.size; + } + ceph_assert(data.length()); + return data.length(); + } + virtual ssize_t send(bufferlist &bl, bool more) override { + auto err = _conn.get_errno(); + if (err < 0) + return (ssize_t)err; + + size_t available = _conn.peek_sent_available(); + if (available == 0) { + return 0; + } + + std::vector<fragment> frags; + std::list<bufferptr>::const_iterator pb = bl.buffers().begin(); + uint64_t left_pbrs = bl.buffers().size(); + uint64_t len = 0; + uint64_t seglen = 0; + while (len < available && left_pbrs--) { + seglen = pb->length(); + if (len + seglen > available) { + // don't continue if we enough at least 1 fragment since no available + // space for next ptr. + if (len > 0) + break; + seglen = std::min(seglen, available); + } + len += seglen; + frags.push_back(fragment{(char*)pb->c_str(), seglen}); + ++pb; + } + + if (len != bl.length()) { + bufferlist swapped; + bl.splice(0, len, &swapped); + auto del = std::bind( + [](bufferlist &bl) {}, std::move(swapped)); + return _conn.send(Packet(std::move(frags), make_deleter(std::move(del)))); + } else { + auto del = std::bind( + [](bufferlist &bl) {}, std::move(bl)); + + return _conn.send(Packet(std::move(frags), make_deleter(std::move(del)))); + } + } + virtual void shutdown() override { + _conn.close_write(); + } + // FIXME need to impl close + virtual void close() override { + _conn.close_write(); + } + virtual int fd() const override { + return _conn.fd(); + } + virtual int socket_fd() const override { + return _conn.fd(); + } + +}; + +template <typename Protocol> +DPDKServerSocketImpl<Protocol>::DPDKServerSocketImpl( + Protocol& proto, uint16_t port, const SocketOptions &opt, int type) + : ServerSocketImpl(type), _listener(proto.listen(port)) {} + +template <typename Protocol> +int DPDKServerSocketImpl<Protocol>::accept(ConnectedSocket *s, const SocketOptions &options, entity_addr_t *out, Worker *w) { + if (_listener.get_errno() < 0) + return _listener.get_errno(); + auto c = _listener.accept(); + if (!c) + return -EAGAIN; + + if (out) { + *out = c->remote_addr(); + out->set_type(addr_type); + } + std::unique_ptr<NativeConnectedSocketImpl<Protocol>> csi( + new NativeConnectedSocketImpl<Protocol>(std::move(*c))); + *s = ConnectedSocket(std::move(csi)); + return 0; +} + +template <typename Protocol> +void DPDKServerSocketImpl<Protocol>::abort_accept() { + _listener.abort_accept(); +} + +class DPDKWorker : public Worker { + struct Impl { + unsigned id; + interface _netif; + std::shared_ptr<DPDKDevice> _dev; + ipv4 _inet; + Impl(CephContext *cct, unsigned i, EventCenter *c, std::shared_ptr<DPDKDevice> dev); + ~Impl(); + }; + std::unique_ptr<Impl> _impl; + + virtual void initialize() override; + void set_ipv4_packet_filter(ip_packet_filter* filter) { + _impl->_inet.set_packet_filter(filter); + } + using tcp4 = tcp<ipv4_traits>; + + public: + explicit DPDKWorker(CephContext *c, unsigned i): Worker(c, i) {} + virtual int listen(entity_addr_t &addr, const SocketOptions &opts, ServerSocket *) override; + virtual int connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) override; + void arp_learn(ethernet_address l2, ipv4_address l3) { + _impl->_inet.learn(l2, l3); + } + virtual void destroy() override { + _impl.reset(); + } + + friend class DPDKServerSocketImpl<tcp4>; +}; + +class DPDKStack : public NetworkStack { + vector<std::function<void()> > funcs; + public: + explicit DPDKStack(CephContext *cct, const string &t): NetworkStack(cct, t) { + funcs.resize(cct->_conf->ms_async_max_op_threads); + } + virtual bool support_zero_copy_read() const override { return true; } + virtual bool support_local_listen_table() const override { return true; } + + virtual void spawn_worker(unsigned i, std::function<void ()> &&func) override; + virtual void join_worker(unsigned i) override; +}; + +#endif diff --git a/src/msg/async/dpdk/EventDPDK.cc b/src/msg/async/dpdk/EventDPDK.cc new file mode 100644 index 00000000..5d291716 --- /dev/null +++ b/src/msg/async/dpdk/EventDPDK.cc @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/errno.h" +#include "DPDKStack.h" +#include "EventDPDK.h" + +#include "common/dout.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_ms + +#undef dout_prefix +#define dout_prefix *_dout << "DPDKDriver." + +int DPDKDriver::init(EventCenter *c, int nevent) +{ + return 0; +} + +int DPDKDriver::add_event(int fd, int cur_mask, int add_mask) +{ + ldout(cct, 20) << __func__ << " add event fd=" << fd << " cur_mask=" << cur_mask + << " add_mask=" << add_mask << dendl; + + int r = manager.listen(fd, add_mask); + if (r < 0) { + lderr(cct) << __func__ << " add fd=" << fd << " failed. " + << cpp_strerror(-r) << dendl; + return -errno; + } + + return 0; +} + +int DPDKDriver::del_event(int fd, int cur_mask, int delmask) +{ + ldout(cct, 20) << __func__ << " del event fd=" << fd << " cur_mask=" << cur_mask + << " delmask=" << delmask << dendl; + int r = 0; + + if (delmask != EVENT_NONE) { + if ((r = manager.unlisten(fd, delmask)) < 0) { + lderr(cct) << __func__ << " delete fd=" << fd << " delmask=" << delmask + << " failed." << cpp_strerror(-r) << dendl; + return r; + } + } + return 0; +} + +int DPDKDriver::resize_events(int newsize) +{ + return 0; +} + +int DPDKDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp) +{ + int num_events = 512; + int events[num_events]; + int masks[num_events]; + + int retval = manager.poll(events, masks, num_events, tvp); + if (retval > 0) { + fired_events.resize(retval); + for (int i = 0; i < retval; i++) { + fired_events[i].fd = events[i]; + fired_events[i].mask = masks[i]; + } + } + return retval; +} diff --git a/src/msg/async/dpdk/EventDPDK.h b/src/msg/async/dpdk/EventDPDK.h new file mode 100644 index 00000000..541c2210 --- /dev/null +++ b/src/msg/async/dpdk/EventDPDK.h @@ -0,0 +1,40 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_EVENTDPDK_H +#define CEPH_EVENTDPDK_H + +#include "msg/async/Event.h" +#include "msg/async/Stack.h" +#include "UserspaceEvent.h" + +class DPDKDriver : public EventDriver { + CephContext *cct; + + public: + UserspaceEventManager manager; + + explicit DPDKDriver(CephContext *c): cct(c), manager(c) {} + virtual ~DPDKDriver() { } + + int init(EventCenter *c, int nevent) override; + int add_event(int fd, int cur_mask, int add_mask) override; + int del_event(int fd, int cur_mask, int del_mask) override; + int resize_events(int newsize) override; + int event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tp) override; + bool need_wakeup() override { return false; } +}; + +#endif //CEPH_EVENTDPDK_H diff --git a/src/msg/async/dpdk/IP.cc b/src/msg/async/dpdk/IP.cc new file mode 100644 index 00000000..f730cded --- /dev/null +++ b/src/msg/async/dpdk/IP.cc @@ -0,0 +1,470 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + * + */ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/perf_counters.h" + +#include "capture.h" +#include "IP.h" +#include "toeplitz.h" + +#include "common/dout.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_dpdk +#undef dout_prefix +#define dout_prefix *_dout << "dpdk " + +std::ostream& operator<<(std::ostream& os, const ipv4_address& a) { + auto ip = a.ip; + return os << ((ip >> 24) & 0xff) << "." << ((ip >> 16) & 0xff) + << "." << ((ip >> 8) & 0xff) << "." << ((ip >> 0) & 0xff); +} + +utime_t ipv4::_frag_timeout = utime_t(30, 0); +constexpr uint32_t ipv4::_frag_low_thresh; +constexpr uint32_t ipv4::_frag_high_thresh; + +class C_handle_frag_timeout : public EventCallback { + ipv4 *_ipv4; + + public: + C_handle_frag_timeout(ipv4 *i): _ipv4(i) {} + void do_request(uint64_t fd_or_id) { + _ipv4->frag_timeout(); + } +}; + +enum { + l_dpdk_qp_first = 99000, + l_dpdk_total_linearize_operations, + l_dpdk_qp_last +}; + +ipv4::ipv4(CephContext *c, EventCenter *cen, interface* netif) + : cct(c), center(cen), _netif(netif), _global_arp(netif), + _arp(c, _global_arp, cen), + _host_address(0), _gw_address(0), _netmask(0), + _l3(netif, eth_protocol_num::ipv4, [this] { return get_packet(); }), + _rx_packets( + _l3.receive( + [this] (Packet p, ethernet_address ea) { + return handle_received_packet(std::move(p), ea); + }, + [this] (forward_hash& out_hash_data, Packet& p, size_t off) { + return forward(out_hash_data, p, off); + } + ) + ), + _tcp(*this, cen), _icmp(c, *this), + _l4({{ uint8_t(ip_protocol_num::tcp), &_tcp }, + { uint8_t(ip_protocol_num::icmp), &_icmp }}), + _packet_filter(nullptr) +{ + PerfCountersBuilder plb(cct, "ipv4", l_dpdk_qp_first, l_dpdk_qp_last); + plb.add_u64_counter(l_dpdk_total_linearize_operations, "dpdk_ip_linearize_operations", "DPDK IP Packet linearization operations"); + perf_logger = plb.create_perf_counters(); + cct->get_perfcounters_collection()->add(perf_logger); + frag_handler = new C_handle_frag_timeout(this); +} + +bool ipv4::forward(forward_hash& out_hash_data, Packet& p, size_t off) +{ + auto iph = p.get_header<ip_hdr>(off); + + out_hash_data.push_back(iph->src_ip.ip); + out_hash_data.push_back(iph->dst_ip.ip); + + auto h = iph->ntoh(); + auto l4 = _l4[h.ip_proto]; + if (l4) { + if (h.mf() == false && h.offset() == 0) { + // This IP datagram is atomic, forward according to tcp connection hash + l4->forward(out_hash_data, p, off + sizeof(ip_hdr)); + } + // else forward according to ip fields only + } + return true; +} + +int ipv4::handle_received_packet(Packet p, ethernet_address from) +{ + auto iph = p.get_header<ip_hdr>(0); + if (!iph) { + return 0; + } + + // Skip checking csum of reassembled IP datagram + if (!get_hw_features().rx_csum_offload && !p.offload_info_ref().reassembled) { + checksummer csum; + csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph)); + if (csum.get() != 0) { + return 0; + } + } + + auto h = iph->ntoh(); + unsigned ip_len = h.len; + unsigned ip_hdr_len = h.ihl * 4; + unsigned pkt_len = p.len(); + auto offset = h.offset(); + + ldout(cct, 10) << __func__ << " get " << std::hex << int(h.ip_proto) + << std::dec << " packet from " + << h.src_ip << " -> " << h.dst_ip << " id=" << h.id + << " ip_len=" << ip_len << " ip_hdr_len=" << ip_hdr_len + << " pkt_len=" << pkt_len << " offset=" << offset << dendl; + + if (pkt_len > ip_len) { + // Trim extra data in the packet beyond IP total length + p.trim_back(pkt_len - ip_len); + } else if (pkt_len < ip_len) { + // Drop if it contains less than IP total length + return 0; + } + // Drop if the reassembled datagram will be larger than maximum IP size + if (offset + p.len() > ip_packet_len_max) { + return 0; + } + + // FIXME: process options + if (in_my_netmask(h.src_ip) && h.src_ip != _host_address) { + ldout(cct, 20) << __func__ << " learn mac " << from << " with " << h.src_ip << dendl; + _arp.learn(from, h.src_ip); + } + + if (_packet_filter) { + bool handled = false; + _packet_filter->handle(p, &h, from, handled); + if (handled) { + return 0; + } + } + + if (h.dst_ip != _host_address) { + // FIXME: forward + return 0; + } + + // Does this IP datagram need reassembly + auto mf = h.mf(); + if (mf == true || offset != 0) { + frag_limit_mem(); + auto frag_id = ipv4_frag_id{h.src_ip, h.dst_ip, h.id, h.ip_proto}; + auto& frag = _frags[frag_id]; + if (mf == false) { + frag.last_frag_received = true; + } + // This is a newly created frag_id + if (frag.mem_size == 0) { + _frags_age.push_back(frag_id); + frag.rx_time = ceph_clock_now(); + } + auto added_size = frag.merge(h, offset, std::move(p)); + _frag_mem += added_size; + if (frag.is_complete()) { + // All the fragments are received + auto dropped_size = frag.mem_size; + auto& ip_data = frag.data.map.begin()->second; + // Choose a cpu to forward this packet + auto cpu_id = center->get_id(); + auto l4 = _l4[h.ip_proto]; + if (l4) { + size_t l4_offset = 0; + forward_hash hash_data; + hash_data.push_back(hton(h.src_ip.ip)); + hash_data.push_back(hton(h.dst_ip.ip)); + l4->forward(hash_data, ip_data, l4_offset); + cpu_id = _netif->hash2cpu(toeplitz_hash(_netif->rss_key(), hash_data)); + } + + // No need to forward if the dst cpu is the current cpu + if (cpu_id == center->get_id()) { + l4->received(std::move(ip_data), h.src_ip, h.dst_ip); + } else { + auto to = _netif->hw_address(); + auto pkt = frag.get_assembled_packet(from, to); + _netif->forward(center, cpu_id, std::move(pkt)); + } + + // Delete this frag from _frags and _frags_age + frag_drop(frag_id, dropped_size); + _frags_age.remove(frag_id); + perf_logger->set(l_dpdk_total_linearize_operations, + ipv4_packet_merger::linearizations()); + } else { + // Some of the fragments are missing + if (frag_timefd) { + frag_arm(); + } + } + return 0; + } + + auto l4 = _l4[h.ip_proto]; + if (l4) { + // Trim IP header and pass to upper layer + p.trim_front(ip_hdr_len); + l4->received(std::move(p), h.src_ip, h.dst_ip); + } + return 0; +} + +void ipv4::wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb) { + // Figure out where to send the packet to. If it is a directly connected + // host, send to it directly, otherwise send to the default gateway. + ipv4_address dst; + if (in_my_netmask(to)) { + dst = to; + } else { + dst = _gw_address; + } + + _arp.wait(std::move(dst), std::move(p), std::move(cb)); +} + +const hw_features& ipv4::get_hw_features() const +{ + return _netif->get_hw_features(); +} + +void ipv4::send(ipv4_address to, ip_protocol_num proto_num, + Packet p, ethernet_address e_dst) { + auto needs_frag = this->needs_frag(p, proto_num, get_hw_features()); + + auto send_pkt = [this, to, proto_num, needs_frag, e_dst] (Packet& pkt, uint16_t remaining, uint16_t offset) mutable { + static uint16_t id = 0; + auto iph = pkt.prepend_header<ip_hdr>(); + iph->ihl = sizeof(*iph) / 4; + iph->ver = 4; + iph->dscp = 0; + iph->ecn = 0; + iph->len = pkt.len(); + // FIXME: a proper id + iph->id = id++; + if (needs_frag) { + uint16_t mf = remaining > 0; + // The fragment offset is measured in units of 8 octets (64 bits) + auto off = offset / 8; + iph->frag = (mf << uint8_t(ip_hdr::frag_bits::mf)) | off; + } else { + iph->frag = 0; + } + iph->ttl = 64; + iph->ip_proto = (uint8_t)proto_num; + iph->csum = 0; + iph->src_ip = _host_address; + iph->dst_ip = to; + ldout(cct, 20) << " ipv4::send " << " id=" << iph->id << " " << _host_address << " -> " << to + << " len " << pkt.len() << dendl; + *iph = iph->hton(); + + if (get_hw_features().tx_csum_ip_offload) { + iph->csum = 0; + pkt.offload_info_ref().needs_ip_csum = true; + } else { + checksummer csum; + csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph)); + iph->csum = csum.get(); + } + + _packetq.push_back( + l3_protocol::l3packet{eth_protocol_num::ipv4, e_dst, std::move(pkt)}); + }; + + if (needs_frag) { + uint16_t offset = 0; + uint16_t remaining = p.len(); + auto mtu = get_hw_features().mtu; + + while (remaining) { + auto can_send = std::min(uint16_t(mtu - ipv4_hdr_len_min), remaining); + remaining -= can_send; + auto pkt = p.share(offset, can_send); + send_pkt(pkt, remaining, offset); + offset += can_send; + } + } else { + // The whole packet can be send in one shot + send_pkt(p, 0, 0); + } +} + +Tub<l3_protocol::l3packet> ipv4::get_packet() { + // _packetq will be mostly empty here unless it hold remnants of previously + // fragmented packet + if (_packetq.empty()) { + for (size_t i = 0; i < _pkt_providers.size(); i++) { + auto l4p = _pkt_providers[_pkt_provider_idx++](); + if (_pkt_provider_idx == _pkt_providers.size()) { + _pkt_provider_idx = 0; + } + if (l4p) { + ldout(cct, 20) << " ipv4::get_packet len " << l4p->p.len() << dendl; + send(l4p->to, l4p->proto_num, std::move(l4p->p), l4p->e_dst); + break; + } + } + } + + Tub<l3_protocol::l3packet> p; + if (!_packetq.empty()) { + p = std::move(_packetq.front()); + _packetq.pop_front(); + } + return p; +} + +void ipv4::frag_limit_mem() { + if (_frag_mem <= _frag_high_thresh) { + return; + } + auto drop = _frag_mem - _frag_low_thresh; + while (drop) { + if (_frags_age.empty()) { + return; + } + // Drop the oldest frag (first element) from _frags_age + auto frag_id = _frags_age.front(); + _frags_age.pop_front(); + + // Drop from _frags as well + auto& frag = _frags[frag_id]; + auto dropped_size = frag.mem_size; + frag_drop(frag_id, dropped_size); + + drop -= std::min(drop, dropped_size); + } +} + +void ipv4::frag_timeout() { + if (_frags.empty()) { + return; + } + auto now = ceph_clock_now(); + for (auto it = _frags_age.begin(); it != _frags_age.end();) { + auto frag_id = *it; + auto& frag = _frags[frag_id]; + if (now > frag.rx_time + _frag_timeout) { + auto dropped_size = frag.mem_size; + // Drop from _frags + frag_drop(frag_id, dropped_size); + // Drop from _frags_age + it = _frags_age.erase(it); + } else { + // The further items can only be younger + break; + } + } + if (_frags.size() != 0) { + frag_arm(now); + } else { + _frag_mem = 0; + } +} + +int32_t ipv4::frag::merge(ip_hdr &h, uint16_t offset, Packet p) { + uint32_t old = mem_size; + unsigned ip_hdr_len = h.ihl * 4; + // Store IP header + if (offset == 0) { + header = p.share(0, ip_hdr_len); + } + // Sotre IP payload + p.trim_front(ip_hdr_len); + data.merge(offset, std::move(p)); + // Update mem size + mem_size = header.memory(); + for (const auto& x : data.map) { + mem_size += x.second.memory(); + } + auto added_size = mem_size - old; + return added_size; +} + +bool ipv4::frag::is_complete() { + // If all the fragments are received, ipv4::frag::merge() should merge all + // the fragments into a single packet + auto offset = data.map.begin()->first; + auto nr_packet = data.map.size(); + return last_frag_received && nr_packet == 1 && offset == 0; +} + +Packet ipv4::frag::get_assembled_packet(ethernet_address from, ethernet_address to) { + auto& ip_header = header; + auto& ip_data = data.map.begin()->second; + // Append a ethernet header, needed for forwarding + auto eh = ip_header.prepend_header<eth_hdr>(); + eh->src_mac = from; + eh->dst_mac = to; + eh->eth_proto = uint16_t(eth_protocol_num::ipv4); + *eh = eh->hton(); + // Prepare a packet contains both ethernet header, ip header and ip data + ip_header.append(std::move(ip_data)); + auto pkt = std::move(ip_header); + auto iph = pkt.get_header<ip_hdr>(sizeof(eth_hdr)); + // len is the sum of each fragment + iph->len = hton(uint16_t(pkt.len() - sizeof(eth_hdr))); + // No fragmentation for the assembled datagram + iph->frag = 0; + // Since each fragment's csum is checked, no need to csum + // again for the assembled datagram + offload_info oi; + oi.reassembled = true; + pkt.set_offload_info(oi); + return pkt; +} + +void icmp::received(Packet p, ipaddr from, ipaddr to) { + auto hdr = p.get_header<icmp_hdr>(0); + if (!hdr || hdr->type != icmp_hdr::msg_type::echo_request) { + return; + } + hdr->type = icmp_hdr::msg_type::echo_reply; + hdr->code = 0; + hdr->csum = 0; + checksummer csum; + csum.sum(reinterpret_cast<char*>(hdr), p.len()); + hdr->csum = csum.get(); + + if (_queue_space.get_or_fail(p.len())) { // drop packets that do not fit the queue + auto cb = [this, from] (const ethernet_address e_dst, Packet p, int r) mutable { + if (r == 0) { + _packetq.emplace_back(ipv4_traits::l4packet{from, std::move(p), e_dst, ip_protocol_num::icmp}); + } + }; + _inet.wait_l2_dst_address(from, std::move(p), cb); + } +} diff --git a/src/msg/async/dpdk/IP.h b/src/msg/async/dpdk/IP.h new file mode 100644 index 00000000..480b4b95 --- /dev/null +++ b/src/msg/async/dpdk/IP.h @@ -0,0 +1,414 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + * + */ + +#ifndef CEPH_MSG_IP_H_ +#define CEPH_MSG_IP_H_ + +#include <arpa/inet.h> +#include <unordered_map> +#include <cstdint> +#include <array> +#include <map> +#include <list> +#include <chrono> + +#include "msg/async/Event.h" +#include "common/Throttle.h" + +#include "array_map.h" +#include "ARP.h" +#include "IPChecksum.h" +#include "ip_types.h" +#include "const.h" +#include "net.h" +#include "PacketUtil.h" +#include "toeplitz.h" + +class ipv4; +template <ip_protocol_num ProtoNum> +class ipv4_l4; + +template <typename InetTraits> +class tcp; + +struct ipv4_traits { + using address_type = ipv4_address; + using inet_type = ipv4_l4<ip_protocol_num::tcp>; + struct l4packet { + ipv4_address to; + Packet p; + ethernet_address e_dst; + ip_protocol_num proto_num; + }; + using packet_provider_type = std::function<Tub<l4packet> ()>; + static void tcp_pseudo_header_checksum(checksummer& csum, ipv4_address src, ipv4_address dst, uint16_t len) { + csum.sum_many(src.ip, dst.ip, uint8_t(0), uint8_t(ip_protocol_num::tcp), len); + } + static constexpr uint8_t ip_hdr_len_min = ipv4_hdr_len_min; +}; + +template <ip_protocol_num ProtoNum> +class ipv4_l4 { + public: + ipv4& _inet; + public: + ipv4_l4(ipv4& inet) : _inet(inet) {} + void register_packet_provider(ipv4_traits::packet_provider_type func); + void wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb); +}; + +class ip_protocol { + public: + virtual ~ip_protocol() {} + virtual void received(Packet p, ipv4_address from, ipv4_address to) = 0; + virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) { return true; } +}; + +template <typename InetTraits> +struct l4connid { + using ipaddr = typename InetTraits::address_type; + using inet_type = typename InetTraits::inet_type; + struct connid_hash; + + ipaddr local_ip; + ipaddr foreign_ip; + uint16_t local_port; + uint16_t foreign_port; + + bool operator==(const l4connid& x) const { + return local_ip == x.local_ip + && foreign_ip == x.foreign_ip + && local_port == x.local_port + && foreign_port == x.foreign_port; + } + + uint32_t hash(const rss_key_type& rss_key) { + forward_hash hash_data; + hash_data.push_back(hton(foreign_ip.ip)); + hash_data.push_back(hton(local_ip.ip)); + hash_data.push_back(hton(foreign_port)); + hash_data.push_back(hton(local_port)); + return toeplitz_hash(rss_key, hash_data); + } +}; + +class ipv4_tcp final : public ip_protocol { + ipv4_l4<ip_protocol_num::tcp> _inet_l4; + std::unique_ptr<tcp<ipv4_traits>> _tcp; + public: + ipv4_tcp(ipv4& inet, EventCenter *c); + ~ipv4_tcp(); + virtual void received(Packet p, ipv4_address from, ipv4_address to) override; + virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) override; + friend class ipv4; +}; + +struct icmp_hdr { + enum class msg_type : uint8_t { + echo_reply = 0, + echo_request = 8, + }; + msg_type type; + uint8_t code; + uint16_t csum; + uint32_t rest; +} __attribute__((packed)); + + +class icmp { + public: + using ipaddr = ipv4_address; + using inet_type = ipv4_l4<ip_protocol_num::icmp>; + explicit icmp(CephContext *c, inet_type& inet) + : cct(c), _inet(inet), _queue_space(c, "DPDK::icmp::_queue_space", 212992) { + _inet.register_packet_provider([this] { + Tub<ipv4_traits::l4packet> l4p; + if (!_packetq.empty()) { + l4p = std::move(_packetq.front()); + _packetq.pop_front(); + _queue_space.put(l4p->p.len()); + } + return l4p; + }); + } + void received(Packet p, ipaddr from, ipaddr to); + + private: + CephContext *cct; + // ipv4_l4<ip_protocol_num::icmp> + inet_type& _inet; + circular_buffer<ipv4_traits::l4packet> _packetq; + Throttle _queue_space; +}; + +class ipv4_icmp final : public ip_protocol { + CephContext *cct; + ipv4_l4<ip_protocol_num::icmp> _inet_l4; + icmp _icmp; + public: + ipv4_icmp(CephContext *c, ipv4& inet) : cct(c), _inet_l4(inet), _icmp(c, _inet_l4) {} + virtual void received(Packet p, ipv4_address from, ipv4_address to) override { + _icmp.received(std::move(p), from, to); + } + friend class ipv4; +}; + +struct ip_hdr; + +struct ip_packet_filter { + virtual ~ip_packet_filter() {}; + virtual void handle(Packet& p, ip_hdr* iph, ethernet_address from, bool & handled) = 0; +}; + +struct ipv4_frag_id { + struct hash; + ipv4_address src_ip; + ipv4_address dst_ip; + uint16_t identification; + uint8_t protocol; + bool operator==(const ipv4_frag_id& x) const { + return src_ip == x.src_ip && + dst_ip == x.dst_ip && + identification == x.identification && + protocol == x.protocol; + } +}; + +struct ipv4_frag_id::hash : private std::hash<ipv4_address>, + private std::hash<uint16_t>, private std::hash<uint8_t> { + size_t operator()(const ipv4_frag_id& id) const noexcept { + using h1 = std::hash<ipv4_address>; + using h2 = std::hash<uint16_t>; + using h3 = std::hash<uint8_t>; + return h1::operator()(id.src_ip) ^ + h1::operator()(id.dst_ip) ^ + h2::operator()(id.identification) ^ + h3::operator()(id.protocol); + } +}; + +struct ipv4_tag {}; +using ipv4_packet_merger = packet_merger<uint32_t, ipv4_tag>; + +class interface; + +class ipv4 { + public: + using address_type = ipv4_address; + using proto_type = uint16_t; + static address_type broadcast_address() { return ipv4_address(0xffffffff); } + static proto_type arp_protocol_type() { return proto_type(eth_protocol_num::ipv4); } + CephContext *cct; + EventCenter *center; + + private: + interface* _netif; + std::vector<ipv4_traits::packet_provider_type> _pkt_providers; + Tub<uint64_t> frag_timefd; + EventCallbackRef frag_handler; + arp _global_arp; + arp_for<ipv4> _arp; + ipv4_address _host_address; + ipv4_address _gw_address; + ipv4_address _netmask; + l3_protocol _l3; + subscription<Packet, ethernet_address> _rx_packets; + ipv4_tcp _tcp; + ipv4_icmp _icmp; + array_map<ip_protocol*, 256> _l4; + ip_packet_filter *_packet_filter; + struct frag { + Packet header; + ipv4_packet_merger data; + utime_t rx_time; + uint32_t mem_size = 0; + // fragment with MF == 0 inidates it is the last fragment + bool last_frag_received = false; + + Packet get_assembled_packet(ethernet_address from, ethernet_address to); + int32_t merge(ip_hdr &h, uint16_t offset, Packet p); + bool is_complete(); + }; + std::unordered_map<ipv4_frag_id, frag, ipv4_frag_id::hash> _frags; + std::list<ipv4_frag_id> _frags_age; + static utime_t _frag_timeout; + static constexpr uint32_t _frag_low_thresh{3 * 1024 * 1024}; + static constexpr uint32_t _frag_high_thresh{4 * 1024 * 1024}; + uint32_t _frag_mem = 0; + circular_buffer<l3_protocol::l3packet> _packetq; + unsigned _pkt_provider_idx = 0; + PerfCounters *perf_logger; + + private: + int handle_received_packet(Packet p, ethernet_address from); + bool forward(forward_hash& out_hash_data, Packet& p, size_t off); + Tub<l3_protocol::l3packet> get_packet(); + bool in_my_netmask(ipv4_address a) const { + return !((a.ip ^ _host_address.ip) & _netmask.ip); + } + void frag_limit_mem(); + void frag_drop(ipv4_frag_id frag_id, uint32_t dropped_size) { + _frags.erase(frag_id); + _frag_mem -= dropped_size; + } + void frag_arm(utime_t now) { + auto tp = now + _frag_timeout; + frag_timefd.construct(center->create_time_event(tp.to_nsec() / 1000, frag_handler)); + } + void frag_arm() { + auto now = ceph_clock_now(); + frag_timefd.construct(center->create_time_event(now.to_nsec() / 1000, frag_handler)); + } + + public: + void frag_timeout(); + + public: + explicit ipv4(CephContext *c, EventCenter *cen, interface* netif); + ~ipv4() { + delete frag_handler; + } + void set_host_address(ipv4_address ip) { + _host_address = ip; + _arp.set_self_addr(ip); + } + ipv4_address host_address() { + return _host_address; + } + void set_gw_address(ipv4_address ip) { + _gw_address = ip; + } + ipv4_address gw_address() const { + return _gw_address; + } + void set_netmask_address(ipv4_address ip) { + _netmask = ip; + } + ipv4_address netmask_address() const { + return _netmask; + } + interface *netif() const { + return _netif; + } + // TODO or something. Should perhaps truly be a list + // of filters. With ordering. And blackjack. Etc. + // But for now, a simple single raw pointer suffices + void set_packet_filter(ip_packet_filter *f) { + _packet_filter = f; + } + ip_packet_filter * packet_filter() const { + return _packet_filter; + } + void send(ipv4_address to, ip_protocol_num proto_num, Packet p, ethernet_address e_dst); + tcp<ipv4_traits>& get_tcp() { return *_tcp._tcp; } + void register_l4(proto_type id, ip_protocol* handler); + const hw_features& get_hw_features() const; + static bool needs_frag(Packet& p, ip_protocol_num proto_num, hw_features hw_features) { + if (p.len() + ipv4_hdr_len_min <= hw_features.mtu) + return false; + + if ((proto_num == ip_protocol_num::tcp && hw_features.tx_tso)) + return false; + + return true; + } + void learn(ethernet_address l2, ipv4_address l3) { + _arp.learn(l2, l3); + } + void register_packet_provider(ipv4_traits::packet_provider_type&& func) { + _pkt_providers.push_back(std::move(func)); + } + void wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb); +}; + +template <ip_protocol_num ProtoNum> +inline void ipv4_l4<ProtoNum>::register_packet_provider( + ipv4_traits::packet_provider_type func) { + _inet.register_packet_provider([func] { + auto l4p = func(); + if (l4p) { + (*l4p).proto_num = ProtoNum; + } + return l4p; + }); +} + +template <ip_protocol_num ProtoNum> +inline void ipv4_l4<ProtoNum>::wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb) { + _inet.wait_l2_dst_address(to, std::move(p), std::move(cb)); +} + +struct ip_hdr { + uint8_t ihl : 4; + uint8_t ver : 4; + uint8_t dscp : 6; + uint8_t ecn : 2; + uint16_t len; + uint16_t id; + uint16_t frag; + enum class frag_bits : uint8_t { mf = 13, df = 14, reserved = 15, offset_shift = 3 }; + uint8_t ttl; + uint8_t ip_proto; + uint16_t csum; + ipv4_address src_ip; + ipv4_address dst_ip; + uint8_t options[0]; + ip_hdr hton() { + ip_hdr hdr = *this; + hdr.len = ::hton(len); + hdr.id = ::hton(id); + hdr.frag = ::hton(frag); + hdr.csum = ::hton(csum); + hdr.src_ip.ip = ::hton(src_ip.ip); + hdr.dst_ip.ip = ::hton(dst_ip.ip); + return hdr; + } + ip_hdr ntoh() { + ip_hdr hdr = *this; + hdr.len = ::ntoh(len); + hdr.id = ::ntoh(id); + hdr.frag = ::ntoh(frag); + hdr.csum = ::ntoh(csum); + hdr.src_ip = src_ip.ntoh(); + hdr.dst_ip = dst_ip.ntoh(); + return hdr; + } + + bool mf() { return frag & (1 << uint8_t(frag_bits::mf)); } + bool df() { return frag & (1 << uint8_t(frag_bits::df)); } + uint16_t offset() { return frag << uint8_t(frag_bits::offset_shift); } +} __attribute__((packed)); + +template <typename InetTraits> +struct l4connid<InetTraits>::connid_hash : private std::hash<ipaddr>, private std::hash<uint16_t> { + size_t operator()(const l4connid<InetTraits>& id) const noexcept { + using h1 = std::hash<ipaddr>; + using h2 = std::hash<uint16_t>; + return h1::operator()(id.local_ip) + ^ h1::operator()(id.foreign_ip) + ^ h2::operator()(id.local_port) + ^ h2::operator()(id.foreign_port); + } +}; + +#endif /* CEPH_MSG_IP_H */ diff --git a/src/msg/async/dpdk/IPChecksum.cc b/src/msg/async/dpdk/IPChecksum.cc new file mode 100644 index 00000000..7a3253c1 --- /dev/null +++ b/src/msg/async/dpdk/IPChecksum.cc @@ -0,0 +1,70 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#include <arpa/inet.h> +#include "net.h" +#include "IPChecksum.h" + +void checksummer::sum(const char* data, size_t len) { + auto orig_len = len; + if (odd) { + csum += uint8_t(*data++); + --len; + } + auto p64 = reinterpret_cast<const uint64_t*>(data); + while (len >= 8) { + csum += ntohq(*p64++); + len -= 8; + } + auto p16 = reinterpret_cast<const uint16_t*>(p64); + while (len >= 2) { + csum += ntohs(*p16++); + len -= 2; + } + auto p8 = reinterpret_cast<const uint8_t*>(p16); + if (len) { + csum += *p8++ << 8; + len -= 1; + } + odd ^= orig_len & 1; +} + +uint16_t checksummer::get() const { + __int128 csum1 = (csum & 0xffffffffffffffff) + (csum >> 64); + uint64_t csum = (csum1 & 0xffffffffffffffff) + (csum1 >> 64); + csum = (csum & 0xffff) + ((csum >> 16) & 0xffff) + ((csum >> 32) & 0xffff) + (csum >> 48); + csum = (csum & 0xffff) + (csum >> 16); + csum = (csum & 0xffff) + (csum >> 16); + return htons(~csum); +} + +void checksummer::sum(const Packet& p) { + for (auto&& f : p.fragments()) { + sum(f.base, f.size); + } +} + +uint16_t ip_checksum(const void* data, size_t len) { + checksummer cksum; + cksum.sum(reinterpret_cast<const char*>(data), len); + return cksum.get(); +} diff --git a/src/msg/async/dpdk/IPChecksum.h b/src/msg/async/dpdk/IPChecksum.h new file mode 100644 index 00000000..9af4a86b --- /dev/null +++ b/src/msg/async/dpdk/IPChecksum.h @@ -0,0 +1,72 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_CHECKSUM_H_ +#define CEPH_MSG_CHECKSUM_H_ + +#include <cstdint> +#include <cstddef> +#include <arpa/inet.h> + +#include "Packet.h" + +uint16_t ip_checksum(const void* data, size_t len); + +struct checksummer { + __int128 csum = 0; + bool odd = false; + void sum(const char* data, size_t len); + void sum(const Packet& p); + void sum(uint8_t data) { + if (!odd) { + csum += data << 8; + } else { + csum += data; + } + odd = !odd; + } + void sum(uint16_t data) { + if (odd) { + sum(uint8_t(data >> 8)); + sum(uint8_t(data)); + } else { + csum += data; + } + } + void sum(uint32_t data) { + if (odd) { + sum(uint16_t(data)); + sum(uint16_t(data >> 16)); + } else { + csum += data; + } + } + void sum_many() {} + template <typename T0, typename... T> + void sum_many(T0 data, T... rest) { + sum(data); + sum_many(rest...); + } + uint16_t get() const; +}; + +#endif /* CEPH_MSG_CHECKSUM_H_ */ diff --git a/src/msg/async/dpdk/Packet.cc b/src/msg/async/dpdk/Packet.cc new file mode 100644 index 00000000..6c2320a0 --- /dev/null +++ b/src/msg/async/dpdk/Packet.cc @@ -0,0 +1,146 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <iostream> +#include <algorithm> +#include <cctype> + +#include "capture.h" +#include "Packet.h" + +constexpr size_t Packet::internal_data_size; +constexpr size_t Packet::default_nr_frags; + +void Packet::linearize(size_t at_frag, size_t desired_size) { + _impl->unuse_internal_data(); + size_t nr_frags = 0; + size_t accum_size = 0; + while (accum_size < desired_size) { + accum_size += _impl->frags[at_frag + nr_frags].size; + ++nr_frags; + } + char *new_frag = new char[accum_size]; + auto p = new_frag; + for (size_t i = 0; i < nr_frags; ++i) { + auto& f = _impl->frags[at_frag + i]; + p = std::copy(f.base, f.base + f.size, p); + } + // collapse nr_frags into one fragment + std::copy(_impl->frags + at_frag + nr_frags, _impl->frags + _impl->_nr_frags, + _impl->frags + at_frag + 1); + _impl->_nr_frags -= nr_frags - 1; + _impl->frags[at_frag] = fragment{new_frag, accum_size}; + if (at_frag == 0 && desired_size == len()) { + // We can drop the old buffer safely + auto x = std::move(_impl->_deleter); + _impl->_deleter = make_deleter([new_frag] { delete []new_frag; }); + } else { + auto del = std::bind( + [new_frag](deleter &d) { delete []new_frag; }, std::move(_impl->_deleter)); + _impl->_deleter = make_deleter(std::move(del)); + } +} + +class C_free_on_cpu : public EventCallback { + deleter del; + std::function<void()> cb; + public: + C_free_on_cpu(deleter &&d, std::function<void()> &&c): + del(std::move(d)), cb(std::move(c)) {} + void do_request(uint64_t fd) { + // deleter needs to be moved from lambda capture to be destroyed here + // otherwise deleter destructor will be called on a cpu that called + // create_external_event when work_item is destroyed. + deleter xxx(std::move(del)); + cb(); + delete this; + } +}; + +Packet Packet::free_on_cpu(EventCenter *center, std::function<void()> cb) +{ + auto del = std::bind( + [center, cb] (deleter &del) mutable { + center->dispatch_event_external(new C_free_on_cpu(std::move(del), std::move(cb))); + }, std::move(_impl->_deleter)); + // make new deleter that runs old deleter on an origin cpu + _impl->_deleter = make_deleter(deleter(), std::move(del)); + + return Packet(impl::copy(_impl.get())); +} + +std::ostream& operator<<(std::ostream& os, const Packet& p) { + os << "Packet{"; + bool first = true; + for (auto&& frag : p.fragments()) { + if (!first) { + os << ", "; + } + first = false; + if (std::all_of(frag.base, frag.base + frag.size, [] (int c) { return c >= 9 && c <= 0x7f; })) { + os << '"'; + for (auto p = frag.base; p != frag.base + frag.size; ++p) { + auto c = *p; + if (isprint(c)) { + os << c; + } else if (c == '\r') { + os << "\\r"; + } else if (c == '\n') { + os << "\\n"; + } else if (c == '\t') { + os << "\\t"; + } else { + uint8_t b = c; + os << "\\x" << (b / 16) << (b % 16); + } + } + os << '"'; + } else { + os << "{"; + bool nfirst = true; + for (auto p = frag.base; p != frag.base + frag.size; ++p) { + if (!nfirst) { + os << " "; + } + nfirst = false; + uint8_t b = *p; + os << b; + } + os << "}"; + } + } + os << "}"; + return os; +} diff --git a/src/msg/async/dpdk/Packet.h b/src/msg/async/dpdk/Packet.h new file mode 100644 index 00000000..db9cd2a7 --- /dev/null +++ b/src/msg/async/dpdk/Packet.h @@ -0,0 +1,550 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_PACKET_H_ +#define CEPH_MSG_PACKET_H_ + +#include <vector> +#include <algorithm> +#include <iosfwd> + +#include "include/types.h" +#include "common/Tub.h" +#include "common/deleter.h" +#include "msg/async/Event.h" + +#include "const.h" + +struct fragment { + char* base; + size_t size; +}; + +struct offload_info { + ip_protocol_num protocol = ip_protocol_num::unused; + bool needs_csum = false; + uint8_t ip_hdr_len = 20; + uint8_t tcp_hdr_len = 20; + uint8_t udp_hdr_len = 8; + bool needs_ip_csum = false; + bool reassembled = false; + uint16_t tso_seg_size = 0; + // HW stripped VLAN header (CPU order) + Tub<uint16_t> vlan_tci; +}; + +// Zero-copy friendly packet class +// +// For implementing zero-copy, we need a flexible destructor that can +// destroy packet data in different ways: decrementing a reference count, +// or calling a free()-like function. +// +// Moreover, we need different destructors for each set of fragments within +// a single fragment. For example, a header and trailer might need delete[] +// to be called, while the internal data needs a reference count to be +// released. Matters are complicated in that fragments can be split +// (due to virtual/physical translation). +// +// To implement this, we associate each packet with a single destructor, +// but allow composing a packet from another packet plus a fragment to +// be added, with its own destructor, causing the destructors to be chained. +// +// The downside is that the data needed for the destructor is duplicated, +// if it is already available in the fragment itself. +// +// As an optimization, when we allocate small fragments, we allocate some +// extra space, so prepending to the packet does not require extra +// allocations. This is useful when adding headers. +// +class Packet { + // enough for lots of headers, not quite two cache lines: + static constexpr size_t internal_data_size = 128 - 16; + static constexpr size_t default_nr_frags = 4; + + struct pseudo_vector { + fragment* _start; + fragment* _finish; + pseudo_vector(fragment* start, size_t nr) + : _start(start), _finish(_start + nr) {} + fragment* begin() { return _start; } + fragment* end() { return _finish; } + fragment& operator[](size_t idx) { return _start[idx]; } + }; + + struct impl { + // when destroyed, virtual destructor will reclaim resources + deleter _deleter; + unsigned _len = 0; + uint16_t _nr_frags = 0; + uint16_t _allocated_frags; + offload_info _offload_info; + Tub<uint32_t> rss_hash; + char data[internal_data_size]; // only frags[0] may use + unsigned headroom = internal_data_size; // in data + // FIXME: share data/frags space + + fragment frags[]; + + explicit impl(size_t nr_frags = default_nr_frags); + impl(const impl&) = delete; + impl(fragment frag, size_t nr_frags = default_nr_frags); + + pseudo_vector fragments() { return { frags, _nr_frags }; } + + static std::unique_ptr<impl> allocate(size_t nr_frags) { + nr_frags = std::max(nr_frags, default_nr_frags); + return std::unique_ptr<impl>(new (nr_frags) impl(nr_frags)); + } + + static std::unique_ptr<impl> copy(impl* old, size_t nr) { + auto n = allocate(nr); + n->_deleter = std::move(old->_deleter); + n->_len = old->_len; + n->_nr_frags = old->_nr_frags; + n->headroom = old->headroom; + n->_offload_info = old->_offload_info; + n->rss_hash.construct(old->rss_hash); + std::copy(old->frags, old->frags + old->_nr_frags, n->frags); + old->copy_internal_fragment_to(n.get()); + return std::move(n); + } + + static std::unique_ptr<impl> copy(impl* old) { + return copy(old, old->_nr_frags); + } + + static std::unique_ptr<impl> allocate_if_needed(std::unique_ptr<impl> old, size_t extra_frags) { + if (old->_allocated_frags >= old->_nr_frags + extra_frags) { + return std::move(old); + } + return copy(old.get(), std::max<size_t>(old->_nr_frags + extra_frags, 2 * old->_nr_frags)); + } + void* operator new(size_t size, size_t nr_frags = default_nr_frags) { + ceph_assert(nr_frags == uint16_t(nr_frags)); + return ::operator new(size + nr_frags * sizeof(fragment)); + } + // Matching the operator new above + void operator delete(void* ptr, size_t nr_frags) { + return ::operator delete(ptr); + } + // Since the above "placement delete" hides the global one, expose it + void operator delete(void* ptr) { + return ::operator delete(ptr); + } + + bool using_internal_data() const { + return _nr_frags + && frags[0].base >= data + && frags[0].base < data + internal_data_size; + } + + void unuse_internal_data() { + if (!using_internal_data()) { + return; + } + auto buf = static_cast<char*>(::malloc(frags[0].size)); + if (!buf) { + throw std::bad_alloc(); + } + deleter d = make_free_deleter(buf); + std::copy(frags[0].base, frags[0].base + frags[0].size, buf); + frags[0].base = buf; + _deleter.append(std::move(d)); + headroom = internal_data_size; + } + void copy_internal_fragment_to(impl* to) { + if (!using_internal_data()) { + return; + } + to->frags[0].base = to->data + headroom; + std::copy(frags[0].base, frags[0].base + frags[0].size, + to->frags[0].base); + } + }; + explicit Packet(std::unique_ptr<impl>&& impl) : _impl(std::move(impl)) {} + std::unique_ptr<impl> _impl; +public: + static Packet from_static_data(const char* data, size_t len) { + return {fragment{const_cast<char*>(data), len}, deleter()}; + } + + // build empty Packet + Packet(); + // build empty Packet with nr_frags allocated + explicit Packet(size_t nr_frags); + // move existing Packet + Packet(Packet&& x) noexcept; + // copy data into Packet + Packet(const char* data, size_t len); + // copy data into Packet + explicit Packet(fragment frag); + // zero-copy single fragment + Packet(fragment frag, deleter del); + // zero-copy multiple fragments + Packet(std::vector<fragment> frag, deleter del); + // build Packet with iterator + template <typename Iterator> + Packet(Iterator begin, Iterator end, deleter del); + // append fragment (copying new fragment) + Packet(Packet&& x, fragment frag); + // prepend fragment (copying new fragment, with header optimization) + Packet(fragment frag, Packet&& x); + // prepend fragment (zero-copy) + Packet(fragment frag, deleter del, Packet&& x); + // append fragment (zero-copy) + Packet(Packet&& x, fragment frag, deleter d); + // append deleter + Packet(Packet&& x, deleter d); + + Packet& operator=(Packet&& x) { + if (this != &x) { + this->~Packet(); + new (this) Packet(std::move(x)); + } + return *this; + } + + unsigned len() const { return _impl->_len; } + unsigned memory() const { return len() + sizeof(Packet::impl); } + + fragment frag(unsigned idx) const { return _impl->frags[idx]; } + fragment& frag(unsigned idx) { return _impl->frags[idx]; } + + unsigned nr_frags() const { return _impl->_nr_frags; } + pseudo_vector fragments() const { return { _impl->frags, _impl->_nr_frags }; } + fragment* fragment_array() const { return _impl->frags; } + + // share Packet data (reference counted, non COW) + Packet share(); + Packet share(size_t offset, size_t len); + + void append(Packet&& p); + + void trim_front(size_t how_much); + void trim_back(size_t how_much); + + // get a header pointer, linearizing if necessary + template <typename Header> + Header* get_header(size_t offset = 0); + + // get a header pointer, linearizing if necessary + char* get_header(size_t offset, size_t size); + + // prepend a header (default-initializing it) + template <typename Header> + Header* prepend_header(size_t extra_size = 0); + + // prepend a header (uninitialized!) + char* prepend_uninitialized_header(size_t size); + + Packet free_on_cpu(EventCenter *c, std::function<void()> cb = []{}); + + void linearize() { return linearize(0, len()); } + + void reset() { _impl.reset(); } + + void reserve(int n_frags) { + if (n_frags > _impl->_nr_frags) { + auto extra = n_frags - _impl->_nr_frags; + _impl = impl::allocate_if_needed(std::move(_impl), extra); + } + } + Tub<uint32_t> rss_hash() { + return _impl->rss_hash; + } + void set_rss_hash(uint32_t hash) { + _impl->rss_hash.construct(hash); + } +private: + void linearize(size_t at_frag, size_t desired_size); + bool allocate_headroom(size_t size); +public: + class offload_info offload_info() const { return _impl->_offload_info; } + class offload_info& offload_info_ref() { return _impl->_offload_info; } + void set_offload_info(class offload_info oi) { _impl->_offload_info = oi; } +}; + +std::ostream& operator<<(std::ostream& os, const Packet& p); + +inline Packet::Packet(Packet&& x) noexcept + : _impl(std::move(x._impl)) { +} + +inline Packet::impl::impl(size_t nr_frags) + : _len(0), _allocated_frags(nr_frags) { +} + +inline Packet::impl::impl(fragment frag, size_t nr_frags) + : _len(frag.size), _allocated_frags(nr_frags) { + ceph_assert(_allocated_frags > _nr_frags); + if (frag.size <= internal_data_size) { + headroom -= frag.size; + frags[0] = { data + headroom, frag.size }; + } else { + auto buf = static_cast<char*>(::malloc(frag.size)); + if (!buf) { + throw std::bad_alloc(); + } + deleter d = make_free_deleter(buf); + frags[0] = { buf, frag.size }; + _deleter.append(std::move(d)); + } + std::copy(frag.base, frag.base + frag.size, frags[0].base); + ++_nr_frags; +} + +inline Packet::Packet(): _impl(impl::allocate(1)) { +} + +inline Packet::Packet(size_t nr_frags): _impl(impl::allocate(nr_frags)) { +} + +inline Packet::Packet(fragment frag): _impl(new impl(frag)) { +} + +inline Packet::Packet(const char* data, size_t size): + Packet(fragment{const_cast<char*>(data), size}) { +} + +inline Packet::Packet(fragment frag, deleter d) + : _impl(impl::allocate(1)) { + _impl->_deleter = std::move(d); + _impl->frags[_impl->_nr_frags++] = frag; + _impl->_len = frag.size; +} + +inline Packet::Packet(std::vector<fragment> frag, deleter d) + : _impl(impl::allocate(frag.size())) { + _impl->_deleter = std::move(d); + std::copy(frag.begin(), frag.end(), _impl->frags); + _impl->_nr_frags = frag.size(); + _impl->_len = 0; + for (auto&& f : _impl->fragments()) { + _impl->_len += f.size; + } +} + +template <typename Iterator> +inline Packet::Packet(Iterator begin, Iterator end, deleter del) { + unsigned nr_frags = 0, len = 0; + nr_frags = std::distance(begin, end); + std::for_each(begin, end, [&] (fragment& frag) { len += frag.size; }); + _impl = impl::allocate(nr_frags); + _impl->_deleter = std::move(del); + _impl->_len = len; + _impl->_nr_frags = nr_frags; + std::copy(begin, end, _impl->frags); +} + +inline Packet::Packet(Packet&& x, fragment frag) + : _impl(impl::allocate_if_needed(std::move(x._impl), 1)) { + _impl->_len += frag.size; + char* buf = new char[frag.size]; + std::copy(frag.base, frag.base + frag.size, buf); + _impl->frags[_impl->_nr_frags++] = {buf, frag.size}; + _impl->_deleter = make_deleter(std::move(_impl->_deleter), [buf] { + delete[] buf; + }); +} + +inline bool Packet::allocate_headroom(size_t size) { + if (_impl->headroom >= size) { + _impl->_len += size; + if (!_impl->using_internal_data()) { + _impl = impl::allocate_if_needed(std::move(_impl), 1); + std::copy_backward(_impl->frags, _impl->frags + _impl->_nr_frags, + _impl->frags + _impl->_nr_frags + 1); + _impl->frags[0] = { _impl->data + internal_data_size, 0 }; + ++_impl->_nr_frags; + } + _impl->headroom -= size; + _impl->frags[0].base -= size; + _impl->frags[0].size += size; + return true; + } else { + return false; + } +} + + +inline Packet::Packet(fragment frag, Packet&& x) + : _impl(std::move(x._impl)) { + // try to prepend into existing internal fragment + if (allocate_headroom(frag.size)) { + std::copy(frag.base, frag.base + frag.size, _impl->frags[0].base); + return; + } else { + // didn't work out, allocate and copy + _impl->unuse_internal_data(); + _impl = impl::allocate_if_needed(std::move(_impl), 1); + _impl->_len += frag.size; + char *buf = new char[frag.size]; + std::copy(frag.base, frag.base + frag.size, buf); + std::copy_backward(_impl->frags, _impl->frags + _impl->_nr_frags, + _impl->frags + _impl->_nr_frags + 1); + ++_impl->_nr_frags; + _impl->frags[0] = {buf, frag.size}; + _impl->_deleter = make_deleter( + std::move(_impl->_deleter), [buf] { delete []buf; }); + } +} + +inline Packet::Packet(Packet&& x, fragment frag, deleter d) + : _impl(impl::allocate_if_needed(std::move(x._impl), 1)) { + _impl->_len += frag.size; + _impl->frags[_impl->_nr_frags++] = frag; + d.append(std::move(_impl->_deleter)); + _impl->_deleter = std::move(d); +} + +inline Packet::Packet(Packet&& x, deleter d): _impl(std::move(x._impl)) { + _impl->_deleter.append(std::move(d)); +} + +inline void Packet::append(Packet&& p) { + if (!_impl->_len) { + *this = std::move(p); + return; + } + _impl = impl::allocate_if_needed(std::move(_impl), p._impl->_nr_frags); + _impl->_len += p._impl->_len; + p._impl->unuse_internal_data(); + std::copy(p._impl->frags, p._impl->frags + p._impl->_nr_frags, + _impl->frags + _impl->_nr_frags); + _impl->_nr_frags += p._impl->_nr_frags; + p._impl->_deleter.append(std::move(_impl->_deleter)); + _impl->_deleter = std::move(p._impl->_deleter); +} + +inline char* Packet::get_header(size_t offset, size_t size) { + if (offset + size > _impl->_len) { + return nullptr; + } + size_t i = 0; + while (i != _impl->_nr_frags && offset >= _impl->frags[i].size) { + offset -= _impl->frags[i++].size; + } + if (i == _impl->_nr_frags) { + return nullptr; + } + if (offset + size > _impl->frags[i].size) { + linearize(i, offset + size); + } + return _impl->frags[i].base + offset; +} + +template <typename Header> +inline Header* Packet::get_header(size_t offset) { + return reinterpret_cast<Header*>(get_header(offset, sizeof(Header))); +} + +inline void Packet::trim_front(size_t how_much) { + ceph_assert(how_much <= _impl->_len); + _impl->_len -= how_much; + size_t i = 0; + while (how_much && how_much >= _impl->frags[i].size) { + how_much -= _impl->frags[i++].size; + } + std::copy(_impl->frags + i, _impl->frags + _impl->_nr_frags, _impl->frags); + _impl->_nr_frags -= i; + if (!_impl->using_internal_data()) { + _impl->headroom = internal_data_size; + } + if (how_much) { + if (_impl->using_internal_data()) { + _impl->headroom += how_much; + } + _impl->frags[0].base += how_much; + _impl->frags[0].size -= how_much; + } +} + +inline void Packet::trim_back(size_t how_much) { + ceph_assert(how_much <= _impl->_len); + _impl->_len -= how_much; + size_t i = _impl->_nr_frags - 1; + while (how_much && how_much >= _impl->frags[i].size) { + how_much -= _impl->frags[i--].size; + } + _impl->_nr_frags = i + 1; + if (how_much) { + _impl->frags[i].size -= how_much; + if (i == 0 && _impl->using_internal_data()) { + _impl->headroom += how_much; + } + } +} + +template <typename Header> +Header* Packet::prepend_header(size_t extra_size) { + auto h = prepend_uninitialized_header(sizeof(Header) + extra_size); + return new (h) Header{}; +} + +// prepend a header (uninitialized!) +inline char* Packet::prepend_uninitialized_header(size_t size) { + if (!allocate_headroom(size)) { + // didn't work out, allocate and copy + _impl->unuse_internal_data(); + // try again, after unuse_internal_data we may have space after all + if (!allocate_headroom(size)) { + // failed + _impl->_len += size; + _impl = impl::allocate_if_needed(std::move(_impl), 1); + char *buf = new char[size]; + std::copy_backward(_impl->frags, _impl->frags + _impl->_nr_frags, + _impl->frags + _impl->_nr_frags + 1); + ++_impl->_nr_frags; + _impl->frags[0] = {buf, size}; + _impl->_deleter = make_deleter(std::move(_impl->_deleter), + [buf] { delete []buf; }); + } + } + return _impl->frags[0].base; +} + +inline Packet Packet::share() { + return share(0, _impl->_len); +} + +inline Packet Packet::share(size_t offset, size_t len) { + _impl->unuse_internal_data(); // FIXME: eliminate? + Packet n; + n._impl = impl::allocate_if_needed(std::move(n._impl), _impl->_nr_frags); + size_t idx = 0; + while (offset > 0 && offset >= _impl->frags[idx].size) { + offset -= _impl->frags[idx++].size; + } + while (n._impl->_len < len) { + auto& f = _impl->frags[idx++]; + auto fsize = std::min(len - n._impl->_len, f.size - offset); + n._impl->frags[n._impl->_nr_frags++] = { f.base + offset, fsize }; + n._impl->_len += fsize; + offset = 0; + } + n._impl->_offload_info = _impl->_offload_info; + ceph_assert(!n._impl->_deleter); + n._impl->_deleter = _impl->_deleter.share(); + return n; +} + +#endif /* CEPH_MSG_PACKET_H_ */ diff --git a/src/msg/async/dpdk/PacketUtil.h b/src/msg/async/dpdk/PacketUtil.h new file mode 100644 index 00000000..118218e6 --- /dev/null +++ b/src/msg/async/dpdk/PacketUtil.h @@ -0,0 +1,154 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_PACKET_UTIL_H_ +#define CEPH_MSG_PACKET_UTIL_H_ + +#include <map> +#include <iostream> + +#include "Packet.h" + +template <typename Offset, typename Tag> +class packet_merger { + private: + static uint64_t& linearizations_ref() { + static thread_local uint64_t linearization_count; + return linearization_count; + } + public: + std::map<Offset, Packet> map; + + static uint64_t linearizations() { + return linearizations_ref(); + } + + void merge(Offset offset, Packet p) { + bool insert = true; + auto beg = offset; + auto end = beg + p.len(); + // First, try to merge the packet with existing segment + for (auto it = map.begin(); it != map.end();) { + auto& seg_pkt = it->second; + auto seg_beg = it->first; + auto seg_end = seg_beg + seg_pkt.len(); + // There are 6 cases: + if (seg_beg <= beg && end <= seg_end) { + // 1) seg_beg beg end seg_end + // We already have data in this packet + return; + } else if (beg <= seg_beg && seg_end <= end) { + // 2) beg seg_beg seg_end end + // The new segment contains more data than this old segment + // Delete the old one, insert the new one + it = map.erase(it); + insert = true; + break; + } else if (beg < seg_beg && seg_beg <= end && end <= seg_end) { + // 3) beg seg_beg end seg_end + // Merge two segments, trim front of old segment + auto trim = end - seg_beg; + seg_pkt.trim_front(trim); + p.append(std::move(seg_pkt)); + // Delete the old one, insert the new one + it = map.erase(it); + insert = true; + break; + } else if (seg_beg <= beg && beg <= seg_end && seg_end < end) { + // 4) seg_beg beg seg_end end + // Merge two segments, trim front of new segment + auto trim = seg_end - beg; + p.trim_front(trim); + // Append new data to the old segment, keep the old segment + seg_pkt.append(std::move(p)); + seg_pkt.linearize(); + ++linearizations_ref(); + insert = false; + break; + } else { + // 5) beg end < seg_beg seg_end + // or + // 6) seg_beg seg_end < beg end + // Can not merge with this segment, keep looking + it++; + insert = true; + } + } + + if (insert) { + p.linearize(); + ++linearizations_ref(); + map.emplace(beg, std::move(p)); + } + + // Second, merge adjacent segments after this packet has been merged, + // because this packet might fill a "whole" and make two adjacent + // segments mergable + for (auto it = map.begin(); it != map.end();) { + // The first segment + auto& seg_pkt = it->second; + auto seg_beg = it->first; + auto seg_end = seg_beg + seg_pkt.len(); + + // The second segment + auto it_next = it; + it_next++; + if (it_next == map.end()) { + break; + } + auto& p = it_next->second; + auto beg = it_next->first; + auto end = beg + p.len(); + + // Merge the the second segment into first segment if possible + if (seg_beg <= beg && beg <= seg_end && seg_end < end) { + // Merge two segments, trim front of second segment + auto trim = seg_end - beg; + p.trim_front(trim); + // Append new data to the first segment, keep the first segment + seg_pkt.append(std::move(p)); + + // Delete the second segment + map.erase(it_next); + + // Keep merging this first segment with its new next packet + // So we do not update the iterator: it + continue; + } else if (end <= seg_end) { + // The first segment has all the data in the second segment + // Delete the second segment + map.erase(it_next); + continue; + } else if (seg_end < beg) { + // Can not merge first segment with second segment + it = it_next; + continue; + } else { + // If we reach here, we have a bug with merge. + std::cout << "packet_merger: merge error\n"; + abort(); + } + } + } +}; + +#endif diff --git a/src/msg/async/dpdk/TCP-Stack.h b/src/msg/async/dpdk/TCP-Stack.h new file mode 100644 index 00000000..996ae93c --- /dev/null +++ b/src/msg/async/dpdk/TCP-Stack.h @@ -0,0 +1,40 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +// tcp/network-stack integration + +#ifndef CEPH_MSG_DPDK_TCP_STACK_H +#define CEPH_MSG_DPDK_TCP_STACK_H + +class ServerSocket; +class ConnectedSocket; + +class ipv4_traits; +template <typename InetTraits> +class tcp; + +int tcpv4_listen(tcp<ipv4_traits>& tcpv4, uint16_t port, const SocketOptions &opts, + int type, ServerSocket *sa); + +int tcpv4_connect(tcp<ipv4_traits>& tcpv4, const entity_addr_t &addr, + ConnectedSocket *sa); + +#endif diff --git a/src/msg/async/dpdk/TCP.cc b/src/msg/async/dpdk/TCP.cc new file mode 100644 index 00000000..c6397709 --- /dev/null +++ b/src/msg/async/dpdk/TCP.cc @@ -0,0 +1,840 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#include "align.h" +#include "TCP.h" +#include "IP.h" +#include "DPDKStack.h" + +#include "common/dout.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_dpdk +#undef dout_prefix +#define dout_prefix *_dout << "tcp " + +void tcp_option::parse(uint8_t* beg, uint8_t* end) +{ + while (beg < end) { + auto kind = option_kind(*beg); + if (kind != option_kind::nop && kind != option_kind::eol) { + // Make sure there is enough room for this option + auto len = *(beg + 1); + if (beg + len > end) { + return; + } + } + switch (kind) { + case option_kind::mss: + _mss_received = true; + _remote_mss = ntoh(reinterpret_cast<mss*>(beg)->mss); + beg += option_len::mss; + break; + case option_kind::win_scale: + _win_scale_received = true; + _remote_win_scale = reinterpret_cast<win_scale*>(beg)->shift; + // We can turn on win_scale option, 7 is Linux's default win scale size + _local_win_scale = 7; + beg += option_len::win_scale; + break; + case option_kind::sack: + _sack_received = true; + beg += option_len::sack; + break; + case option_kind::nop: + beg += option_len::nop; + break; + case option_kind::eol: + return; + default: + // Ignore options we do not understand + auto len = *(beg + 1); + beg += len; + // Prevent infinite loop + if (len == 0) { + return; + } + break; + } + } +} + +uint8_t tcp_option::fill(tcp_hdr* th, uint8_t options_size) +{ + auto hdr = reinterpret_cast<uint8_t*>(th); + auto off = hdr + sizeof(tcp_hdr); + uint8_t size = 0; + bool syn_on = th->f_syn; + bool ack_on = th->f_ack; + + if (syn_on) { + if (_mss_received || !ack_on) { + auto mss = new (off) tcp_option::mss; + mss->mss = _local_mss; + off += mss->len; + size += mss->len; + *mss = mss->hton(); + } + if (_win_scale_received || !ack_on) { + auto win_scale = new (off) tcp_option::win_scale; + win_scale->shift = _local_win_scale; + off += win_scale->len; + size += win_scale->len; + } + } + if (size > 0) { + // Insert NOP option + auto size_max = align_up(uint8_t(size + 1), tcp_option::align); + while (size < size_max - uint8_t(option_len::eol)) { + new (off) tcp_option::nop; + off += option_len::nop; + size += option_len::nop; + } + new (off) tcp_option::eol; + size += option_len::eol; + } + ceph_assert(size == options_size); + + return size; +} + +uint8_t tcp_option::get_size(bool syn_on, bool ack_on) +{ + uint8_t size = 0; + if (syn_on) { + if (_mss_received || !ack_on) { + size += option_len::mss; + } + if (_win_scale_received || !ack_on) { + size += option_len::win_scale; + } + } + if (size > 0) { + size += option_len::eol; + // Insert NOP option to align on 32-bit + size = align_up(size, tcp_option::align); + } + return size; +} + +ipv4_tcp::ipv4_tcp(ipv4& inet, EventCenter *c) + : _inet_l4(inet), _tcp(std::unique_ptr<tcp<ipv4_traits>>(new tcp<ipv4_traits>(inet.cct, _inet_l4, c))) +{ } + +ipv4_tcp::~ipv4_tcp() { } + +void ipv4_tcp::received(Packet p, ipv4_address from, ipv4_address to) +{ + _tcp->received(std::move(p), from, to); +} + +bool ipv4_tcp::forward(forward_hash& out_hash_data, Packet& p, size_t off) +{ + return _tcp->forward(out_hash_data, p, off); +} + +int tcpv4_listen(tcp<ipv4_traits>& tcpv4, uint16_t port, const SocketOptions &opts, + int type, ServerSocket *sock) +{ + auto p = new DPDKServerSocketImpl<tcp<ipv4_traits>>(tcpv4, port, opts, type); + int r = p->listen(); + if (r < 0) { + delete p; + return r; + } + *sock = ServerSocket(std::unique_ptr<ServerSocketImpl>(p)); + return 0; +} + +int tcpv4_connect(tcp<ipv4_traits>& tcpv4, const entity_addr_t &addr, + ConnectedSocket *sock) +{ + auto conn = tcpv4.connect(addr); + *sock = ConnectedSocket(std::unique_ptr<ConnectedSocketImpl>( + new NativeConnectedSocketImpl<tcp<ipv4_traits>>(std::move(conn)))); + return 0; +} + +template <typename InetTraits> +void tcp<InetTraits>::respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip) +{ + ldout(cct, 20) << __func__ << " tcp header rst=" << bool(rth->f_rst) << " fin=" << bool(rth->f_fin) + << " syn=" << bool(rth->f_syn) << dendl; + if (rth->f_rst) { + return; + } + Packet p; + auto th = p.prepend_header<tcp_hdr>(); + th->src_port = rth->dst_port; + th->dst_port = rth->src_port; + if (rth->f_ack) { + th->seq = rth->ack; + } + // If this RST packet is in response to a SYN packet. We ACK the ISN. + if (rth->f_syn) { + th->ack = rth->seq + 1; + th->f_ack = true; + } + th->f_rst = true; + th->data_offset = sizeof(*th) / 4; + th->checksum = 0; + *th = th->hton(); + + checksummer csum; + offload_info oi; + InetTraits::tcp_pseudo_header_checksum(csum, local_ip, foreign_ip, sizeof(*th)); + if (get_hw_features().tx_csum_l4_offload) { + th->checksum = ~csum.get(); + oi.needs_csum = true; + } else { + csum.sum(p); + th->checksum = csum.get(); + oi.needs_csum = false; + } + + oi.protocol = ip_protocol_num::tcp; + oi.tcp_hdr_len = sizeof(tcp_hdr); + p.set_offload_info(oi); + + send_packet_without_tcb(local_ip, foreign_ip, std::move(p)); +} + +#undef dout_prefix +#define dout_prefix _prefix(_dout) +template<typename InetTraits> +ostream& tcp<InetTraits>::tcb::_prefix(std::ostream *_dout) { + return *_dout << "tcp " << _local_ip << ":" << _local_port << " -> " << _foreign_ip << ":" << _foreign_port + << " tcb(" << this << " fd=" << fd << " s=" << _state << ")."; +} + +template<typename InetTraits> +void tcp<InetTraits>::tcb::input_handle_listen_state(tcp_hdr* th, Packet p) +{ + auto opt_len = th->data_offset * 4 - sizeof(tcp_hdr); + auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, th->data_offset * 4)) + sizeof(tcp_hdr); + auto opt_end = opt_start + opt_len; + p.trim_front(th->data_offset * 4); + tcp_sequence seg_seq = th->seq; + + // Set RCV.NXT to SEG.SEQ+1, IRS is set to SEG.SEQ + _rcv.next = seg_seq + 1; + _rcv.initial = seg_seq; + + // ISS should be selected and a SYN segment sent of the form: + // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK> + // SND.NXT is set to ISS+1 and SND.UNA to ISS + // NOTE: In previous code, _snd.next is set to ISS + 1 only when SYN is + // ACKed. Now, we set _snd.next to ISS + 1 here, so in output_one(): we + // have + // th->seq = syn_on ? _snd.initial : _snd.next + // to make sure retransmitted SYN has correct SEQ number. + do_setup_isn(); + + _rcv.urgent = _rcv.next; + + ldout(_tcp.cct, 10) << __func__ << " listen: LISTEN -> SYN_RECEIVED" << dendl; + init_from_options(th, opt_start, opt_end); + do_syn_received(); +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::input_handle_syn_sent_state(tcp_hdr* th, Packet p) +{ + auto opt_len = th->data_offset * 4 - sizeof(tcp_hdr); + auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, th->data_offset * 4)) + sizeof(tcp_hdr); + auto opt_end = opt_start + opt_len; + p.trim_front(th->data_offset * 4); + tcp_sequence seg_seq = th->seq; + auto seg_ack = th->ack; + + ldout(_tcp.cct, 20) << __func__ << " tcp header seq " << seg_seq.raw << " ack " << seg_ack.raw + << " fin=" << bool(th->f_fin) << " syn=" << bool(th->f_syn) << dendl; + + bool acceptable = false; + // 3.1 first check the ACK bit + if (th->f_ack) { + // If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless the + // RST bit is set, if so drop the segment and return) + if (seg_ack <= _snd.initial || seg_ack > _snd.next) { + return respond_with_reset(th); + } + + // If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable. + acceptable = _snd.unacknowledged <= seg_ack && seg_ack <= _snd.next; + } + + // 3.2 second check the RST bit + if (th->f_rst) { + // If the ACK was acceptable then signal the user "error: connection + // reset", drop the segment, enter CLOSED state, delete TCB, and + // return. Otherwise (no ACK) drop the segment and return. + if (acceptable) { + return do_reset(); + } else { + return; + } + } + + // 3.3 third check the security and precedence + // NOTE: Ignored for now + + // 3.4 fourth check the SYN bit + if (th->f_syn) { + // RCV.NXT is set to SEG.SEQ+1, IRS is set to SEG.SEQ. SND.UNA should + // be advanced to equal SEG.ACK (if there is an ACK), and any segments + // on the retransmission queue which are thereby acknowledged should be + // removed. + _rcv.next = seg_seq + 1; + _rcv.initial = seg_seq; + if (th->f_ack) { + // TODO: clean retransmission queue + _snd.unacknowledged = seg_ack; + } + if (_snd.unacknowledged > _snd.initial) { + // If SND.UNA > ISS (our SYN has been ACKed), change the connection + // state to ESTABLISHED, form an ACK segment + // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK> + ldout(_tcp.cct, 20) << __func__ << " syn: SYN_SENT -> ESTABLISHED" << dendl; + init_from_options(th, opt_start, opt_end); + do_established(); + output(); + } else { + // Otherwise enter SYN_RECEIVED, form a SYN,ACK segment + // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK> + ldout(_tcp.cct, 20) << __func__ << " syn: SYN_SENT -> SYN_RECEIVED" << dendl; + do_syn_received(); + } + } + + // 3.5 fifth, if neither of the SYN or RST bits is set then drop the + // segment and return. + return; +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::input_handle_other_state(tcp_hdr* th, Packet p) +{ + p.trim_front(th->data_offset * 4); + bool do_output = false; + bool do_output_data = false; + tcp_sequence seg_seq = th->seq; + auto seg_ack = th->ack; + auto seg_len = p.len(); + ldout(_tcp.cct, 20) << __func__ << " tcp header seq " << seg_seq.raw << " ack " << seg_ack.raw + << " snd next " << _snd.next.raw << " unack " << _snd.unacknowledged.raw + << " rcv next " << _rcv.next.raw << " len " << seg_len + << " fin=" << bool(th->f_fin) << " syn=" << bool(th->f_syn) << dendl; + + // 4.1 first check sequence number + if (!segment_acceptable(seg_seq, seg_len)) { + //<SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK> + return output(); + } + + // In the following it is assumed that the segment is the idealized + // segment that begins at RCV.NXT and does not exceed the window. + if (seg_seq < _rcv.next) { + // ignore already acknowledged data + auto dup = std::min(uint32_t(_rcv.next - seg_seq), seg_len); + ldout(_tcp.cct, 10) << __func__ << " dup segment len " << dup << dendl; + p.trim_front(dup); + seg_len -= dup; + seg_seq += dup; + } + // FIXME: We should trim data outside the right edge of the receive window as well + + if (seg_seq != _rcv.next) { + ldout(_tcp.cct, 10) << __func__ << " out of order, expect " << _rcv.next.raw + << " actual " << seg_seq.raw + << " out of order size " << _rcv.out_of_order.map.size() + << dendl; + insert_out_of_order(seg_seq, std::move(p)); + // A TCP receiver SHOULD send an immediate duplicate ACK + // when an out-of-order segment arrives. + return output(); + } + + // 4.2 second check the RST bit + if (th->f_rst) { + if (in_state(SYN_RECEIVED)) { + // If this connection was initiated with a passive OPEN (i.e., + // came from the LISTEN state), then return this connection to + // LISTEN state and return. The user need not be informed. If + // this connection was initiated with an active OPEN (i.e., came + // from SYN_SENT state) then the connection was refused, signal + // the user "connection refused". In either case, all segments + // on the retransmission queue should be removed. And in the + // active OPEN case, enter the CLOSED state and delete the TCB, + // and return. + errno = -ECONNREFUSED; + return do_reset(); + } + if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2 | CLOSE_WAIT)) { + // If the RST bit is set then, any outstanding RECEIVEs and SEND + // should receive "reset" responses. All segment queues should be + // flushed. Users should also receive an unsolicited general + // "connection reset" signal. Enter the CLOSED state, delete the + // TCB, and return. + return do_reset(); + } + if (in_state(CLOSING | LAST_ACK | TIME_WAIT)) { + // If the RST bit is set then, enter the CLOSED state, delete the + // TCB, and return. + return do_closed(); + } + } + + // 4.3 third check security and precedence + // NOTE: Ignored for now + + // 4.4 fourth, check the SYN bit + if (th->f_syn) { + // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2 + // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT + + // If the SYN is in the window it is an error, send a reset, any + // outstanding RECEIVEs and SEND should receive "reset" responses, + // all segment queues should be flushed, the user should also + // receive an unsolicited general "connection reset" signal, enter + // the CLOSED state, delete the TCB, and return. + respond_with_reset(th); + return do_reset(); + + // If the SYN is not in the window this step would not be reached + // and an ack would have been sent in the first step (sequence + // number check). + } + + // 4.5 fifth check the ACK field + if (!th->f_ack) { + // if the ACK bit is off drop the segment and return + return; + } else { + // SYN_RECEIVED STATE + if (in_state(SYN_RECEIVED)) { + // If SND.UNA =< SEG.ACK =< SND.NXT then enter ESTABLISHED state + // and continue processing. + if (_snd.unacknowledged <= seg_ack && seg_ack <= _snd.next) { + ldout(_tcp.cct, 20) << __func__ << " SYN_RECEIVED -> ESTABLISHED" << dendl; + do_established(); + if (_tcp.push_listen_queue(_local_port, this)) { + ldout(_tcp.cct, 20) << __func__ << " successfully accepting socket" << dendl; + } else { + ldout(_tcp.cct, 5) << __func__ << " not exist listener or full queue, reset" << dendl; + return respond_with_reset(th); + } + } else { + // <SEQ=SEG.ACK><CTL=RST> + return respond_with_reset(th); + } + } + auto update_window = [this, th, seg_seq, seg_ack] { + ldout(_tcp.cct, 20) << __func__ << " window update seg_seq=" << seg_seq + << " seg_ack=" << seg_ack << " old window=" << th->window + << " new window=" << int(_snd.window_scale) << dendl; + _snd.window = th->window << _snd.window_scale; + _snd.wl1 = seg_seq; + _snd.wl2 = seg_ack; + if (_snd.window == 0) { + _persist_time_out = _rto; + start_persist_timer(); + } else { + stop_persist_timer(); + } + }; + // ESTABLISHED STATE or + // CLOSE_WAIT STATE: Do the same processing as for the ESTABLISHED state. + if (in_state(ESTABLISHED | CLOSE_WAIT)) { + // If SND.UNA < SEG.ACK =< SND.NXT then, set SND.UNA <- SEG.ACK. + if (_snd.unacknowledged < seg_ack && seg_ack <= _snd.next) { + // Remote ACKed data we sent + auto acked_bytes = data_segment_acked(seg_ack); + + // If SND.UNA < SEG.ACK =< SND.NXT, the send window should be updated. + if (_snd.wl1 < seg_seq || (_snd.wl1 == seg_seq && _snd.wl2 <= seg_ack)) { + update_window(); + } + + // some data is acked, try send more data + do_output_data = true; + + auto set_retransmit_timer = [this] { + if (_snd.data.empty()) { + // All outstanding segments are acked, turn off the timer. + stop_retransmit_timer(); + // Signal the waiter of this event + signal_all_data_acked(); + } else { + // Restart the timer becasue new data is acked. + start_retransmit_timer(); + } + }; + + if (_snd.dupacks >= 3) { + // We are in fast retransmit / fast recovery phase + uint32_t smss = _snd.mss; + if (seg_ack > _snd.recover) { + ldout(_tcp.cct, 20) << __func__ << " ack: full_ack" << dendl; + // Set cwnd to min (ssthresh, max(FlightSize, SMSS) + SMSS) + _snd.cwnd = std::min(_snd.ssthresh, std::max(flight_size(), smss) + smss); + // Exit the fast recovery procedure + exit_fast_recovery(); + set_retransmit_timer(); + } else { + ldout(_tcp.cct, 20) << __func__ << " ack: partial_ack" << dendl; + // Retransmit the first unacknowledged segment + fast_retransmit(); + // Deflate the congestion window by the amount of new data + // acknowledged by the Cumulative Acknowledgment field + _snd.cwnd -= acked_bytes; + // If the partial ACK acknowledges at least one SMSS of new + // data, then add back SMSS bytes to the congestion window + if (acked_bytes >= smss) { + _snd.cwnd += smss; + } + // Send a new segment if permitted by the new value of + // cwnd. Do not exit the fast recovery procedure For + // the first partial ACK that arrives during fast + // recovery, also reset the retransmit timer. + if (++_snd.partial_ack == 1) { + start_retransmit_timer(); + } + } + } else { + // RFC5681: The fast retransmit algorithm uses the arrival + // of 3 duplicate ACKs (as defined in section 2, without + // any intervening ACKs which move SND.UNA) as an + // indication that a segment has been lost. + // + // So, here we reset dupacks to zero becasue this ACK moves + // SND.UNA. + exit_fast_recovery(); + set_retransmit_timer(); + } + } else if (!_snd.data.empty() && seg_len == 0 && + th->f_fin == 0 && th->f_syn == 0 && + th->ack == _snd.unacknowledged && + uint32_t(th->window << _snd.window_scale) == _snd.window) { + // Note: + // RFC793 states: + // If the ACK is a duplicate (SEG.ACK < SND.UNA), it can be ignored + // RFC5681 states: + // The TCP sender SHOULD use the "fast retransmit" algorithm to detect + // and repair loss, based on incoming duplicate ACKs. + // Here, We follow RFC5681. + _snd.dupacks++; + uint32_t smss = _snd.mss; + // 3 duplicated ACKs trigger a fast retransmit + if (_snd.dupacks == 1 || _snd.dupacks == 2) { + // RFC5681 Step 3.1 + // Send cwnd + 2 * smss per RFC3042 + do_output_data = true; + } else if (_snd.dupacks == 3) { + // RFC6582 Step 3.2 + if (seg_ack - 1 > _snd.recover) { + _snd.recover = _snd.next - 1; + // RFC5681 Step 3.2 + _snd.ssthresh = std::max((flight_size() - _snd.limited_transfer) / 2, 2 * smss); + fast_retransmit(); + } else { + // Do not enter fast retransmit and do not reset ssthresh + } + // RFC5681 Step 3.3 + _snd.cwnd = _snd.ssthresh + 3 * smss; + } else if (_snd.dupacks > 3) { + // RFC5681 Step 3.4 + _snd.cwnd += smss; + // RFC5681 Step 3.5 + do_output_data = true; + } + } else if (seg_ack > _snd.next) { + // If the ACK acks something not yet sent (SEG.ACK > SND.NXT) + // then send an ACK, drop the segment, and return + return output(); + } else if (_snd.window == 0 && th->window > 0) { + update_window(); + do_output_data = true; + } + } + // FIN_WAIT_1 STATE + if (in_state(FIN_WAIT_1)) { + // In addition to the processing for the ESTABLISHED state, if + // our FIN is now acknowledged then enter FIN-WAIT-2 and continue + // processing in that state. + if (seg_ack == _snd.next + 1) { + ldout(_tcp.cct, 20) << __func__ << " ack: FIN_WAIT_1 -> FIN_WAIT_2" << dendl; + _state = FIN_WAIT_2; + do_local_fin_acked(); + } + } + // FIN_WAIT_2 STATE + if (in_state(FIN_WAIT_2)) { + // In addition to the processing for the ESTABLISHED state, if + // the retransmission queue is empty, the user’s CLOSE can be + // acknowledged ("ok") but do not delete the TCB. + // TODO + } + // CLOSING STATE + if (in_state(CLOSING)) { + if (seg_ack == _snd.next + 1) { + ldout(_tcp.cct, 20) << __func__ << " ack: CLOSING -> TIME_WAIT" << dendl; + do_local_fin_acked(); + return do_time_wait(); + } else { + return; + } + } + // LAST_ACK STATE + if (in_state(LAST_ACK)) { + if (seg_ack == _snd.next + 1) { + ldout(_tcp.cct, 20) << __func__ << " ack: LAST_ACK -> CLOSED" << dendl; + do_local_fin_acked(); + return do_closed(); + } + } + // TIME_WAIT STATE + if (in_state(TIME_WAIT)) { + // The only thing that can arrive in this state is a + // retransmission of the remote FIN. Acknowledge it, and restart + // the 2 MSL timeout. + // TODO + } + } + + // 4.6 sixth, check the URG bit + if (th->f_urg) { + // TODO + } + + // 4.7 seventh, process the segment text + if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2)) { + if (p.len()) { + // Once the TCP takes responsibility for the data it advances + // RCV.NXT over the data accepted, and adjusts RCV.WND as + // apporopriate to the current buffer availability. The total of + // RCV.NXT and RCV.WND should not be reduced. + _rcv.data.push_back(std::move(p)); + _rcv.next += seg_len; + auto merged = merge_out_of_order(); + signal_data_received(); + // Send an acknowledgment of the form: + // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK> + // This acknowledgment should be piggybacked on a segment being + // transmitted if possible without incurring undue delay. + if (merged) { + // TCP receiver SHOULD send an immediate ACK when the + // incoming segment fills in all or part of a gap in the + // sequence space. + do_output = true; + } else { + do_output = should_send_ack(seg_len); + } + ldout(_tcp.cct, 20) << __func__ << " merged=" << merged << " do_output=" << do_output << dendl; + } + } else if (in_state(CLOSE_WAIT | CLOSING | LAST_ACK | TIME_WAIT)) { + // This should not occur, since a FIN has been received from the + // remote side. Ignore the segment text. + return; + } + + // 4.8 eighth, check the FIN bit + if (th->f_fin) { + if (in_state(CLOSED | LISTEN | SYN_SENT)) { + // Do not process the FIN if the state is CLOSED, LISTEN or SYN-SENT + // since the SEG.SEQ cannot be validated; drop the segment and return. + return; + } + auto fin_seq = seg_seq + seg_len; + if (fin_seq == _rcv.next) { + _rcv.next = fin_seq + 1; + + // If this <FIN> packet contains data as well, we can ACK both data + // and <FIN> in a single packet, so canncel the previous ACK. + clear_delayed_ack(); + do_output = false; + // Send ACK for the FIN! + output(); + signal_data_received(); + _errno = 0; + + if (in_state(SYN_RECEIVED | ESTABLISHED)) { + ldout(_tcp.cct, 20) << __func__ << " fin: SYN_RECEIVED or ESTABLISHED -> CLOSE_WAIT" << dendl; + _state = CLOSE_WAIT; + // EOF + } + if (in_state(FIN_WAIT_1)) { + // If our FIN has been ACKed (perhaps in this segment), then + // enter TIME-WAIT, start the time-wait timer, turn off the other + // timers; otherwise enter the CLOSING state. + // Note: If our FIN has been ACKed, we should be in FIN_WAIT_2 + // not FIN_WAIT_1 if we reach here. + ldout(_tcp.cct, 20) << __func__ << " fin: FIN_WAIT_1 -> CLOSING" << dendl; + _state = CLOSING; + } + if (in_state(FIN_WAIT_2)) { + ldout(_tcp.cct, 20) << __func__ << " fin: FIN_WAIT_2 -> TIME_WAIT" << dendl; + return do_time_wait(); + } + } + } + if (do_output || (do_output_data && can_send())) { + // Since we will do output, we can canncel scheduled delayed ACK. + clear_delayed_ack(); + output(); + } +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::connect() +{ + ldout(_tcp.cct, 20) << __func__ << dendl; + // An initial send sequence number (ISS) is selected. A SYN segment of the + // form <SEQ=ISS><CTL=SYN> is sent. Set SND.UNA to ISS, SND.NXT to ISS+1, + // enter SYN-SENT state, and return. + do_setup_isn(); + + // Local receive window scale factor + _rcv.window_scale = _option._local_win_scale = 7; + // Maximum segment size local can receive + _rcv.mss = _option._local_mss = local_mss(); + // Linux's default window size + _rcv.window = 29200 << _rcv.window_scale; + + do_syn_sent(); +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::close_final_cleanup() +{ + if (_snd._all_data_acked_fd >= 0) { + center->delete_file_event(_snd._all_data_acked_fd, EVENT_READABLE); + _tcp.manager.close(_snd._all_data_acked_fd); + _snd._all_data_acked_fd = -1; + } + + _snd.closed = true; + signal_data_received(); + ldout(_tcp.cct, 20) << __func__ << " unsent_len=" << _snd.unsent_len << dendl; + if (in_state(CLOSE_WAIT)) { + ldout(_tcp.cct, 20) << __func__ << " CLOSE_WAIT -> LAST_ACK" << dendl; + _state = LAST_ACK; + } else if (in_state(ESTABLISHED)) { + ldout(_tcp.cct, 20) << __func__ << " ESTABLISHED -> FIN_WAIT_1" << dendl; + _state = FIN_WAIT_1; + } + // Send <FIN> to remote + // Note: we call output_one to make sure a packet with FIN actually + // sent out. If we only call output() and _packetq is not empty, + // tcp::tcb::get_packet(), packet with FIN will not be generated. + output_one(); + output(); + center->delete_file_event(fd, EVENT_READABLE|EVENT_WRITABLE); +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::retransmit() +{ + auto output_update_rto = [this] { + output(); + // According to RFC6298, Update RTO <- RTO * 2 to perform binary exponential back-off + this->_rto = std::min(this->_rto * 2, this->_rto_max); + start_retransmit_timer(); + }; + + // Retransmit SYN + if (syn_needs_on()) { + if (_snd.syn_retransmit++ < _max_nr_retransmit) { + output_update_rto(); + } else { + _errno = -ECONNABORTED; + ldout(_tcp.cct, 5) << __func__ << " syn retransmit exceed max " + << _max_nr_retransmit << dendl; + _errno = -ETIMEDOUT; + cleanup(); + return; + } + } + + // Retransmit FIN + if (fin_needs_on()) { + if (_snd.fin_retransmit++ < _max_nr_retransmit) { + output_update_rto(); + } else { + ldout(_tcp.cct, 5) << __func__ << " fin retransmit exceed max " + << _max_nr_retransmit << dendl; + _errno = -ETIMEDOUT; + cleanup(); + return; + } + } + + // Retransmit Data + if (_snd.data.empty()) { + return; + } + + // If there are unacked data, retransmit the earliest segment + auto& unacked_seg = _snd.data.front(); + + // According to RFC5681 + // Update ssthresh only for the first retransmit + uint32_t smss = _snd.mss; + if (unacked_seg.nr_transmits == 0) { + _snd.ssthresh = std::max(flight_size() / 2, 2 * smss); + } + // RFC6582 Step 4 + _snd.recover = _snd.next - 1; + // Start the slow start process + _snd.cwnd = smss; + // End fast recovery + exit_fast_recovery(); + + ldout(_tcp.cct, 20) << __func__ << " unack data size " << _snd.data.size() + << " nr=" << unacked_seg.nr_transmits << dendl; + if (unacked_seg.nr_transmits < _max_nr_retransmit) { + unacked_seg.nr_transmits++; + } else { + // Delete connection when max num of retransmission is reached + ldout(_tcp.cct, 5) << __func__ << " seg retransmit exceed max " + << _max_nr_retransmit << dendl; + _errno = -ETIMEDOUT; + cleanup(); + return; + } + retransmit_one(); + + output_update_rto(); +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::persist() { + ldout(_tcp.cct, 20) << __func__ << " persist timer fired" << dendl; + // Send 1 byte packet to probe peer's window size + _snd.window_probe = true; + output_one(); + _snd.window_probe = false; + + output(); + // Perform binary exponential back-off per RFC1122 + _persist_time_out = std::min(_persist_time_out * 2, _rto_max); + start_persist_timer(); +} diff --git a/src/msg/async/dpdk/TCP.h b/src/msg/async/dpdk/TCP.h new file mode 100644 index 00000000..b7bd7132 --- /dev/null +++ b/src/msg/async/dpdk/TCP.h @@ -0,0 +1,1503 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_DPDK_TCP_H_ +#define CEPH_DPDK_TCP_H_ + +#include <unordered_map> +#include <map> +#include <queue> +#include <functional> +#include <deque> +#include <chrono> +#include <stdexcept> +#include <system_error> + +#include "msg/async/dpdk/EventDPDK.h" + +#include "include/utime.h" +#include "common/Throttle.h" +#include "common/ceph_time.h" +#include "common/ceph_crypto.h" +#include "msg/async/Event.h" +#include "IPChecksum.h" +#include "IP.h" +#include "const.h" +#include "byteorder.h" +#include "shared_ptr.h" +#include "PacketUtil.h" + +#include "include/random.h" + +struct tcp_hdr; + +enum class tcp_state : uint16_t { + CLOSED = (1 << 0), + LISTEN = (1 << 1), + SYN_SENT = (1 << 2), + SYN_RECEIVED = (1 << 3), + ESTABLISHED = (1 << 4), + FIN_WAIT_1 = (1 << 5), + FIN_WAIT_2 = (1 << 6), + CLOSE_WAIT = (1 << 7), + CLOSING = (1 << 8), + LAST_ACK = (1 << 9), + TIME_WAIT = (1 << 10) +}; + +inline tcp_state operator|(tcp_state s1, tcp_state s2) { + return tcp_state(uint16_t(s1) | uint16_t(s2)); +} + +inline std::ostream & operator<<(std::ostream & str, const tcp_state& s) { + switch (s) { + case tcp_state::CLOSED: return str << "CLOSED"; + case tcp_state::LISTEN: return str << "LISTEN"; + case tcp_state::SYN_SENT: return str << "SYN_SENT"; + case tcp_state::SYN_RECEIVED: return str << "SYN_RECEIVED"; + case tcp_state::ESTABLISHED: return str << "ESTABLISHED"; + case tcp_state::FIN_WAIT_1: return str << "FIN_WAIT_1"; + case tcp_state::FIN_WAIT_2: return str << "FIN_WAIT_2"; + case tcp_state::CLOSE_WAIT: return str << "CLOSE_WAIT"; + case tcp_state::CLOSING: return str << "CLOSING"; + case tcp_state::LAST_ACK: return str << "LAST_ACK"; + case tcp_state::TIME_WAIT: return str << "TIME_WAIT"; + default: return str << "UNKNOWN"; + } +} + +struct tcp_option { + // The kind and len field are fixed and defined in TCP protocol + enum class option_kind: uint8_t { mss = 2, win_scale = 3, sack = 4, timestamps = 8, nop = 1, eol = 0 }; + enum class option_len: uint8_t { mss = 4, win_scale = 3, sack = 2, timestamps = 10, nop = 1, eol = 1 }; + struct mss { + option_kind kind = option_kind::mss; + option_len len = option_len::mss; + uint16_t mss; + struct mss hton() { + struct mss m = *this; + m.mss = ::hton(m.mss); + return m; + } + } __attribute__((packed)); + struct win_scale { + option_kind kind = option_kind::win_scale; + option_len len = option_len::win_scale; + uint8_t shift; + } __attribute__((packed)); + struct sack { + option_kind kind = option_kind::sack; + option_len len = option_len::sack; + } __attribute__((packed)); + struct timestamps { + option_kind kind = option_kind::timestamps; + option_len len = option_len::timestamps; + uint32_t t1; + uint32_t t2; + } __attribute__((packed)); + struct nop { + option_kind kind = option_kind::nop; + } __attribute__((packed)); + struct eol { + option_kind kind = option_kind::eol; + } __attribute__((packed)); + static const uint8_t align = 4; + + void parse(uint8_t* beg, uint8_t* end); + uint8_t fill(tcp_hdr* th, uint8_t option_size); + uint8_t get_size(bool syn_on, bool ack_on); + + // For option negotiattion + bool _mss_received = false; + bool _win_scale_received = false; + bool _timestamps_received = false; + bool _sack_received = false; + + // Option data + uint16_t _remote_mss = 536; + uint16_t _local_mss; + uint8_t _remote_win_scale = 0; + uint8_t _local_win_scale = 0; +}; +inline uint8_t*& operator+=(uint8_t*& x, tcp_option::option_len len) { x += uint8_t(len); return x; } +inline uint8_t& operator+=(uint8_t& x, tcp_option::option_len len) { x += uint8_t(len); return x; } + +struct tcp_sequence { + uint32_t raw; +}; + +tcp_sequence ntoh(tcp_sequence ts) { + return tcp_sequence { ::ntoh(ts.raw) }; +} + +tcp_sequence hton(tcp_sequence ts) { + return tcp_sequence { ::hton(ts.raw) }; +} + +inline std::ostream& operator<<(std::ostream& os, const tcp_sequence& s) { + return os << s.raw; +} + +inline tcp_sequence make_seq(uint32_t raw) { return tcp_sequence{raw}; } +inline tcp_sequence& operator+=(tcp_sequence& s, int32_t n) { s.raw += n; return s; } +inline tcp_sequence& operator-=(tcp_sequence& s, int32_t n) { s.raw -= n; return s; } +inline tcp_sequence operator+(tcp_sequence s, int32_t n) { return s += n; } +inline tcp_sequence operator-(tcp_sequence s, int32_t n) { return s -= n; } +inline int32_t operator-(tcp_sequence s, tcp_sequence q) { return s.raw - q.raw; } +inline bool operator==(tcp_sequence s, tcp_sequence q) { return s.raw == q.raw; } +inline bool operator!=(tcp_sequence s, tcp_sequence q) { return !(s == q); } +inline bool operator<(tcp_sequence s, tcp_sequence q) { return s - q < 0; } +inline bool operator>(tcp_sequence s, tcp_sequence q) { return q < s; } +inline bool operator<=(tcp_sequence s, tcp_sequence q) { return !(s > q); } +inline bool operator>=(tcp_sequence s, tcp_sequence q) { return !(s < q); } + +struct tcp_hdr { + uint16_t src_port; + uint16_t dst_port; + tcp_sequence seq; + tcp_sequence ack; + uint8_t rsvd1 : 4; + uint8_t data_offset : 4; + uint8_t f_fin : 1; + uint8_t f_syn : 1; + uint8_t f_rst : 1; + uint8_t f_psh : 1; + uint8_t f_ack : 1; + uint8_t f_urg : 1; + uint8_t rsvd2 : 2; + uint16_t window; + uint16_t checksum; + uint16_t urgent; + + tcp_hdr hton() { + tcp_hdr hdr = *this; + hdr.src_port = ::hton(src_port); + hdr.dst_port = ::hton(dst_port); + hdr.seq = ::hton(seq); + hdr.ack = ::hton(ack); + hdr.window = ::hton(window); + hdr.checksum = ::hton(checksum); + hdr.urgent = ::hton(urgent); + return hdr; + } + + tcp_hdr ntoh() { + tcp_hdr hdr = *this; + hdr.src_port = ::ntoh(src_port); + hdr.dst_port = ::ntoh(dst_port); + hdr.seq = ::ntoh(seq); + hdr.ack = ::ntoh(ack); + hdr.window = ::ntoh(window); + hdr.checksum = ::ntoh(checksum); + hdr.urgent = ::ntoh(urgent); + return hdr; + } +} __attribute__((packed)); + +struct tcp_tag {}; +using tcp_packet_merger = packet_merger<tcp_sequence, tcp_tag>; + +template <typename InetTraits> +class tcp { + public: + using ipaddr = typename InetTraits::address_type; + using inet_type = typename InetTraits::inet_type; + using connid = l4connid<InetTraits>; + using connid_hash = typename connid::connid_hash; + class connection; + class listener; + private: + class tcb; + + class C_handle_delayed_ack : public EventCallback { + tcb *tc; + + public: + C_handle_delayed_ack(tcb *t): tc(t) { } + void do_request(uint64_t r) { + tc->_nr_full_seg_received = 0; + tc->output(); + } + }; + + class C_handle_retransmit : public EventCallback { + tcb *tc; + + public: + C_handle_retransmit(tcb *t): tc(t) { } + void do_request(uint64_t r) { + tc->retransmit(); + } + }; + + class C_handle_persist : public EventCallback { + tcb *tc; + + public: + C_handle_persist(tcb *t): tc(t) { } + void do_request(uint64_t r) { + tc->persist(); + } + }; + + class C_all_data_acked : public EventCallback { + tcb *tc; + + public: + C_all_data_acked(tcb *t): tc(t) {} + void do_request(uint64_t fd_or_id) { + tc->close_final_cleanup(); + } + }; + + class C_actual_remove_tcb : public EventCallback { + lw_shared_ptr<tcb> tc; + public: + C_actual_remove_tcb(tcb *t): tc(t->shared_from_this()) {} + void do_request(uint64_t r) { + delete this; + } + }; + + class tcb : public enable_lw_shared_from_this<tcb> { + using clock_type = ceph::coarse_real_clock; + static constexpr tcp_state CLOSED = tcp_state::CLOSED; + static constexpr tcp_state LISTEN = tcp_state::LISTEN; + static constexpr tcp_state SYN_SENT = tcp_state::SYN_SENT; + static constexpr tcp_state SYN_RECEIVED = tcp_state::SYN_RECEIVED; + static constexpr tcp_state ESTABLISHED = tcp_state::ESTABLISHED; + static constexpr tcp_state FIN_WAIT_1 = tcp_state::FIN_WAIT_1; + static constexpr tcp_state FIN_WAIT_2 = tcp_state::FIN_WAIT_2; + static constexpr tcp_state CLOSE_WAIT = tcp_state::CLOSE_WAIT; + static constexpr tcp_state CLOSING = tcp_state::CLOSING; + static constexpr tcp_state LAST_ACK = tcp_state::LAST_ACK; + static constexpr tcp_state TIME_WAIT = tcp_state::TIME_WAIT; + tcp_state _state = CLOSED; + tcp& _tcp; + UserspaceEventManager &manager; + connection* _conn = nullptr; + bool _connect_done = false; + ipaddr _local_ip; + ipaddr _foreign_ip; + uint16_t _local_port; + uint16_t _foreign_port; + struct unacked_segment { + Packet p; + uint16_t data_len; + unsigned nr_transmits; + clock_type::time_point tx_time; + }; + struct send { + tcp_sequence unacknowledged; + tcp_sequence next; + uint32_t window; + uint8_t window_scale; + uint16_t mss; + tcp_sequence urgent; + tcp_sequence wl1; + tcp_sequence wl2; + tcp_sequence initial; + std::deque<unacked_segment> data; + std::deque<Packet> unsent; + uint32_t unsent_len = 0; + uint32_t queued_len = 0; + bool closed = false; + // Wait for all data are acked + int _all_data_acked_fd = -1; + // Limit number of data queued into send queue + Throttle user_queue_space; + // Round-trip time variation + std::chrono::microseconds rttvar; + // Smoothed round-trip time + std::chrono::microseconds srtt; + bool first_rto_sample = true; + clock_type::time_point syn_tx_time; + // Congestion window + uint32_t cwnd; + // Slow start threshold + uint32_t ssthresh; + // Duplicated ACKs + uint16_t dupacks = 0; + unsigned syn_retransmit = 0; + unsigned fin_retransmit = 0; + uint32_t limited_transfer = 0; + uint32_t partial_ack = 0; + tcp_sequence recover; + bool window_probe = false; + send(CephContext *c): user_queue_space(c, "DPDK::tcp::tcb::user_queue_space", 81920) {} + } _snd; + struct receive { + tcp_sequence next; + uint32_t window; + uint8_t window_scale; + uint16_t mss; + tcp_sequence urgent; + tcp_sequence initial; + std::deque<Packet> data; + tcp_packet_merger out_of_order; + } _rcv; + EventCenter *center; + int fd; + // positive means no errno, 0 means eof, nagetive means error + int16_t _errno = 1; + tcp_option _option; + EventCallbackRef delayed_ack_event; + Tub<uint64_t> _delayed_ack_fd; + // Retransmission timeout + std::chrono::microseconds _rto{1000*1000}; + std::chrono::microseconds _persist_time_out{1000*1000}; + static constexpr std::chrono::microseconds _rto_min{1000*1000}; + static constexpr std::chrono::microseconds _rto_max{60000*1000}; + // Clock granularity + static constexpr std::chrono::microseconds _rto_clk_granularity{1000}; + static constexpr uint16_t _max_nr_retransmit{5}; + EventCallbackRef retransmit_event; + Tub<uint64_t> retransmit_fd; + EventCallbackRef persist_event; + EventCallbackRef all_data_ack_event; + Tub<uint64_t> persist_fd; + uint16_t _nr_full_seg_received = 0; + struct isn_secret { + // 512 bits secretkey for ISN generating + uint32_t key[16]; + isn_secret () { + for (auto& k : key) { + k = ceph::util::generate_random_number<uint32_t>(0, std::numeric_limits<uint32_t>::max()); + } + } + }; + static isn_secret _isn_secret; + tcp_sequence get_isn(); + circular_buffer<typename InetTraits::l4packet> _packetq; + bool _poll_active = false; + public: + // callback + void close_final_cleanup(); + ostream& _prefix(std::ostream *_dout); + + public: + tcb(tcp& t, connid id); + ~tcb(); + void input_handle_listen_state(tcp_hdr* th, Packet p); + void input_handle_syn_sent_state(tcp_hdr* th, Packet p); + void input_handle_other_state(tcp_hdr* th, Packet p); + void output_one(bool data_retransmit = false); + bool is_all_data_acked(); + int send(Packet p); + void connect(); + Tub<Packet> read(); + void close(); + void remove_from_tcbs() { + auto id = connid{_local_ip, _foreign_ip, _local_port, _foreign_port}; + _tcp._tcbs.erase(id); + } + Tub<typename InetTraits::l4packet> get_packet(); + void output() { + if (!_poll_active) { + _poll_active = true; + + auto tcb = this->shared_from_this(); + _tcp._inet.wait_l2_dst_address(_foreign_ip, Packet(), [tcb] (const ethernet_address &dst, Packet p, int r) { + if (r == 0) { + tcb->_tcp.poll_tcb(dst, std::move(tcb)); + } else if (r == -ETIMEDOUT) { + // in other states connection should time out + if (tcb->in_state(SYN_SENT)) { + tcb->_errno = -ETIMEDOUT; + tcb->cleanup(); + } + } else if (r == -EBUSY) { + // retry later + tcb->_poll_active = false; + tcb->start_retransmit_timer(); + } + }); + } + } + + int16_t get_errno() const { + return _errno; + } + + tcp_state& state() { + return _state; + } + + uint64_t peek_sent_available() { + if (!in_state(ESTABLISHED)) + return 0; + uint64_t left = _snd.user_queue_space.get_max() - _snd.user_queue_space.get_current(); + return left; + } + + int is_connected() const { + if (_errno <= 0) + return _errno; + return _connect_done; + } + + private: + void respond_with_reset(tcp_hdr* th); + bool merge_out_of_order(); + void insert_out_of_order(tcp_sequence seq, Packet p); + void trim_receive_data_after_window(); + bool should_send_ack(uint16_t seg_len); + void clear_delayed_ack(); + Packet get_transmit_packet(); + void retransmit_one() { + bool data_retransmit = true; + output_one(data_retransmit); + } + void start_retransmit_timer() { + if (retransmit_fd) + center->delete_time_event(*retransmit_fd); + retransmit_fd.construct(center->create_time_event(_rto.count(), retransmit_event)); + }; + void stop_retransmit_timer() { + if (retransmit_fd) { + center->delete_time_event(*retransmit_fd); + retransmit_fd.destroy(); + } + }; + void start_persist_timer() { + if (persist_fd) + center->delete_time_event(*persist_fd); + persist_fd.construct(center->create_time_event(_persist_time_out.count(), persist_event)); + }; + void stop_persist_timer() { + if (persist_fd) { + center->delete_time_event(*persist_fd); + persist_fd.destroy(); + } + }; + void persist(); + void retransmit(); + void fast_retransmit(); + void update_rto(clock_type::time_point tx_time); + void update_cwnd(uint32_t acked_bytes); + void cleanup(); + uint32_t can_send() { + if (_snd.window_probe) { + return 1; + } + // Can not send more than advertised window allows + auto x = std::min(uint32_t(_snd.unacknowledged + _snd.window - _snd.next), _snd.unsent_len); + // Can not send more than congestion window allows + x = std::min(_snd.cwnd, x); + if (_snd.dupacks == 1 || _snd.dupacks == 2) { + // RFC5681 Step 3.1 + // Send cwnd + 2 * smss per RFC3042 + auto flight = flight_size(); + auto max = _snd.cwnd + 2 * _snd.mss; + x = flight <= max ? std::min(x, max - flight) : 0; + _snd.limited_transfer += x; + } else if (_snd.dupacks >= 3) { + // RFC5681 Step 3.5 + // Sent 1 full-sized segment at most + x = std::min(uint32_t(_snd.mss), x); + } + return x; + } + uint32_t flight_size() { + uint32_t size = 0; + std::for_each(_snd.data.begin(), _snd.data.end(), + [&] (unacked_segment& seg) { size += seg.p.len(); }); + return size; + } + uint16_t local_mss() { + return _tcp.get_hw_features().mtu - tcp_hdr_len_min - InetTraits::ip_hdr_len_min; + } + void queue_packet(Packet p) { + _packetq.emplace_back( + typename InetTraits::l4packet{_foreign_ip, std::move(p)}); + } + void signal_data_received() { + manager.notify(fd, EVENT_READABLE); + } + void signal_all_data_acked() { + if (_snd._all_data_acked_fd >= 0 && _snd.unsent_len == 0 && _snd.queued_len == 0) + manager.notify(_snd._all_data_acked_fd, EVENT_READABLE); + } + void do_syn_sent() { + _state = SYN_SENT; + _snd.syn_tx_time = clock_type::now(); + // Send <SYN> to remote + output(); + } + void do_syn_received() { + _state = SYN_RECEIVED; + _snd.syn_tx_time = clock_type::now(); + // Send <SYN,ACK> to remote + output(); + } + void do_established() { + _state = ESTABLISHED; + update_rto(_snd.syn_tx_time); + _connect_done = true; + manager.notify(fd, EVENT_READABLE|EVENT_WRITABLE); + } + void do_reset() { + _state = CLOSED; + // Free packets to be sent which are waiting for user_queue_space + _snd.user_queue_space.reset(); + cleanup(); + _errno = -ECONNRESET; + manager.notify(fd, EVENT_READABLE); + + if (_snd._all_data_acked_fd >= 0) + manager.notify(_snd._all_data_acked_fd, EVENT_READABLE); + } + void do_time_wait() { + // FIXME: Implement TIME_WAIT state timer + _state = TIME_WAIT; + cleanup(); + } + void do_closed() { + _state = CLOSED; + cleanup(); + } + void do_setup_isn() { + _snd.initial = get_isn(); + _snd.unacknowledged = _snd.initial; + _snd.next = _snd.initial + 1; + _snd.recover = _snd.initial; + } + void do_local_fin_acked() { + _snd.unacknowledged += 1; + _snd.next += 1; + } + bool syn_needs_on() { + return in_state(SYN_SENT | SYN_RECEIVED); + } + bool fin_needs_on() { + return in_state(FIN_WAIT_1 | CLOSING | LAST_ACK) && _snd.closed && + _snd.unsent_len == 0 && _snd.queued_len == 0; + } + bool ack_needs_on() { + return !in_state(CLOSED | LISTEN | SYN_SENT); + } + bool foreign_will_not_send() { + return in_state(CLOSING | TIME_WAIT | CLOSE_WAIT | LAST_ACK | CLOSED); + } + bool in_state(tcp_state state) { + return uint16_t(_state) & uint16_t(state); + } + void exit_fast_recovery() { + _snd.dupacks = 0; + _snd.limited_transfer = 0; + _snd.partial_ack = 0; + } + uint32_t data_segment_acked(tcp_sequence seg_ack); + bool segment_acceptable(tcp_sequence seg_seq, unsigned seg_len); + void init_from_options(tcp_hdr* th, uint8_t* opt_start, uint8_t* opt_end); + friend class connection; + + friend class C_handle_delayed_ack; + friend class C_handle_retransmit; + friend class C_handle_persist; + friend class C_all_data_acked; + }; + + CephContext *cct; + // ipv4_l4<ip_protocol_num::tcp> + inet_type& _inet; + EventCenter *center; + UserspaceEventManager &manager; + std::unordered_map<connid, lw_shared_ptr<tcb>, connid_hash> _tcbs; + std::unordered_map<uint16_t, listener*> _listening; + std::random_device _rd; + std::default_random_engine _e; + std::uniform_int_distribution<uint16_t> _port_dist{41952, 65535}; + circular_buffer<std::pair<lw_shared_ptr<tcb>, ethernet_address>> _poll_tcbs; + // queue for packets that do not belong to any tcb + circular_buffer<ipv4_traits::l4packet> _packetq; + Throttle _queue_space; + // Limit number of data queued into send queue + public: + class connection { + lw_shared_ptr<tcb> _tcb; + public: + explicit connection(lw_shared_ptr<tcb> tcbp) : _tcb(std::move(tcbp)) { _tcb->_conn = this; } + connection(const connection&) = delete; + connection(connection&& x) noexcept : _tcb(std::move(x._tcb)) { + _tcb->_conn = this; + } + ~connection(); + void operator=(const connection&) = delete; + connection& operator=(connection&& x) { + if (this != &x) { + this->~connection(); + new (this) connection(std::move(x)); + } + return *this; + } + int fd() const { + return _tcb->fd; + } + int send(Packet p) { + return _tcb->send(std::move(p)); + } + Tub<Packet> read() { + return _tcb->read(); + } + int16_t get_errno() const { + return _tcb->get_errno(); + } + void close_read(); + void close_write(); + entity_addr_t remote_addr() const { + entity_addr_t addr; + auto net_ip = _tcb->_foreign_ip.hton(); + memcpy((void*)&addr.in4_addr().sin_addr.s_addr, + &net_ip, sizeof(addr.in4_addr().sin_addr.s_addr)); + addr.set_family(AF_INET); + return addr; + } + uint64_t peek_sent_available() { + return _tcb->peek_sent_available(); + } + int is_connected() const { return _tcb->is_connected(); } + }; + class listener { + tcp& _tcp; + uint16_t _port; + int _fd = -1; + int16_t _errno; + queue<connection> _q; + size_t _q_max_length; + + private: + listener(tcp& t, uint16_t port, size_t queue_length) + : _tcp(t), _port(port), _errno(0), _q(), _q_max_length(queue_length) { + } + public: + listener(const listener&) = delete; + void operator=(const listener&) = delete; + listener(listener&& x) + : _tcp(x._tcp), _port(x._port), _fd(std::move(x._fd)), _errno(x._errno), + _q(std::move(x._q)) { + if (_fd >= 0) + _tcp._listening[_port] = this; + } + ~listener() { + abort_accept(); + } + int listen() { + if (_tcp._listening.find(_port) != _tcp._listening.end()) + return -EADDRINUSE; + _tcp._listening.emplace(_port, this); + _fd = _tcp.manager.get_eventfd(); + return 0; + } + Tub<connection> accept() { + Tub<connection> c; + if (!_q.empty()) { + c = std::move(_q.front()); + _q.pop(); + } + return c; + } + void abort_accept() { + while (!_q.empty()) + _q.pop(); + if (_fd >= 0) { + _tcp._listening.erase(_port); + _tcp.manager.close(_fd); + _fd = -1; + } + } + int16_t get_errno() const { + return _errno; + } + bool full() const { + return _q.size() == _q_max_length; + } + int fd() const { + return _fd; + } + friend class tcp; + }; + public: + explicit tcp(CephContext *c, inet_type& inet, EventCenter *cen); + void received(Packet p, ipaddr from, ipaddr to); + bool forward(forward_hash& out_hash_data, Packet& p, size_t off); + listener listen(uint16_t port, size_t queue_length = 100); + connection connect(const entity_addr_t &addr); + const hw_features& get_hw_features() const { return _inet._inet.get_hw_features(); } + void poll_tcb(const ethernet_address &dst, lw_shared_ptr<tcb> tcb) { + _poll_tcbs.emplace_back(std::move(tcb), dst); + } + bool push_listen_queue(uint16_t port, tcb *t) { + auto listener = _listening.find(port); + if (listener == _listening.end() || listener->second->full()) { + return false; + } + listener->second->_q.push(connection(t->shared_from_this())); + manager.notify(listener->second->_fd, EVENT_READABLE); + return true; + } + + private: + void send_packet_without_tcb(ipaddr from, ipaddr to, Packet p); + void respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip); + friend class listener; +}; + +template <typename InetTraits> +tcp<InetTraits>::tcp(CephContext *c, inet_type& inet, EventCenter *cen) + : cct(c), _inet(inet), center(cen), + manager(static_cast<DPDKDriver*>(cen->get_driver())->manager), + _e(_rd()), _queue_space(cct, "DPDK::tcp::queue_space", 81920) { + int tcb_polled = 0u; + _inet.register_packet_provider([this, tcb_polled] () mutable { + Tub<typename InetTraits::l4packet> l4p; + auto c = _poll_tcbs.size(); + if (!_packetq.empty() && (!(tcb_polled % 128) || c == 0)) { + l4p = std::move(_packetq.front()); + _packetq.pop_front(); + _queue_space.put(l4p->p.len()); + } else { + while (c--) { + tcb_polled++; + lw_shared_ptr<tcb> tcb; + ethernet_address dst; + std::tie(tcb, dst) = std::move(_poll_tcbs.front()); + _poll_tcbs.pop_front(); + l4p = std::move(tcb->get_packet()); + if (l4p) { + l4p->e_dst = dst; + break; + } + } + } + return l4p; + }); +} + +template <typename InetTraits> +auto tcp<InetTraits>::listen(uint16_t port, size_t queue_length) -> listener { + return listener(*this, port, queue_length); +} + +template <typename InetTraits> +typename tcp<InetTraits>::connection tcp<InetTraits>::connect(const entity_addr_t &addr) { + uint16_t src_port; + connid id; + auto src_ip = _inet._inet.host_address(); + auto dst_ip = ipv4_address(addr); + auto dst_port = addr.get_port(); + + do { + src_port = _port_dist(_e); + id = connid{src_ip, dst_ip, src_port, (uint16_t)dst_port}; + if (_tcbs.find(id) == _tcbs.end()) { + if (_inet._inet.netif()->hw_queues_count() == 1 || + _inet._inet.netif()->hash2cpu( + id.hash(_inet._inet.netif()->rss_key())) == center->get_id()) + break; + } + } while (true); + + auto tcbp = make_lw_shared<tcb>(*this, id); + _tcbs.insert({id, tcbp}); + tcbp->connect(); + return connection(tcbp); +} + +template <typename InetTraits> +bool tcp<InetTraits>::forward(forward_hash& out_hash_data, Packet& p, size_t off) { + auto th = p.get_header<tcp_hdr>(off); + if (th) { + out_hash_data.push_back(th->src_port); + out_hash_data.push_back(th->dst_port); + } + return true; +} + +template <typename InetTraits> +void tcp<InetTraits>::received(Packet p, ipaddr from, ipaddr to) { + auto th = p.get_header<tcp_hdr>(0); + if (!th) { + return; + } + // th->data_offset is correct even before ntoh() + if (unsigned(th->data_offset * 4) < sizeof(*th)) { + return; + } + + if (!get_hw_features().rx_csum_offload) { + checksummer csum; + InetTraits::tcp_pseudo_header_checksum(csum, from, to, p.len()); + csum.sum(p); + if (csum.get() != 0) { + return; + } + } + auto h = th->ntoh(); + auto id = connid{to, from, h.dst_port, h.src_port}; + auto tcbi = _tcbs.find(id); + lw_shared_ptr<tcb> tcbp; + if (tcbi == _tcbs.end()) { + auto listener = _listening.find(id.local_port); + if (listener == _listening.end() || listener->second->full()) { + // 1) In CLOSE state + // 1.1 all data in the incoming segment is discarded. An incoming + // segment containing a RST is discarded. An incoming segment not + // containing a RST causes a RST to be sent in response. + // FIXME: + // if ACK off: <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK> + // if ACK on: <SEQ=SEG.ACK><CTL=RST> + return respond_with_reset(&h, id.local_ip, id.foreign_ip); + } else { + // 2) In LISTEN state + // 2.1 first check for an RST + if (h.f_rst) { + // An incoming RST should be ignored + return; + } + // 2.2 second check for an ACK + if (h.f_ack) { + // Any acknowledgment is bad if it arrives on a connection + // still in the LISTEN state. + // <SEQ=SEG.ACK><CTL=RST> + return respond_with_reset(&h, id.local_ip, id.foreign_ip); + } + // 2.3 third check for a SYN + if (h.f_syn) { + // check the security + // NOTE: Ignored for now + tcbp = make_lw_shared<tcb>(*this, id); + _tcbs.insert({id, tcbp}); + return tcbp->input_handle_listen_state(&h, std::move(p)); + } + // 2.4 fourth other text or control + // So you are unlikely to get here, but if you do, drop the + // segment, and return. + return; + } + } else { + tcbp = tcbi->second; + if (tcbp->state() == tcp_state::SYN_SENT) { + // 3) In SYN_SENT State + return tcbp->input_handle_syn_sent_state(&h, std::move(p)); + } else { + // 4) In other state, can be one of the following: + // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2 + // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT + return tcbp->input_handle_other_state(&h, std::move(p)); + } + } +} + +// Send packet does not belong to any tcb +template <typename InetTraits> +void tcp<InetTraits>::send_packet_without_tcb(ipaddr from, ipaddr to, Packet p) { + if (_queue_space.get_or_fail(p.len())) { // drop packets that do not fit the queue + _inet.wait_l2_dst_address(to, std::move(p), [this, to] (const ethernet_address &e_dst, Packet p, int r) mutable { + if (r == 0) + _packetq.emplace_back(ipv4_traits::l4packet{to, std::move(p), e_dst, ip_protocol_num::tcp}); + }); + } +} + +template <typename InetTraits> +tcp<InetTraits>::connection::~connection() { + if (_tcb) { + _tcb->_conn = nullptr; + close_read(); + close_write(); + } +} + +template <typename InetTraits> +tcp<InetTraits>::tcb::tcb(tcp& t, connid id) + : _tcp(t), manager(t.manager), _local_ip(id.local_ip) , _foreign_ip(id.foreign_ip), + _local_port(id.local_port), _foreign_port(id.foreign_port), + _snd(_tcp.cct), + center(t.center), + fd(t.manager.get_eventfd()), + delayed_ack_event(new tcp<InetTraits>::C_handle_delayed_ack(this)), + retransmit_event(new tcp<InetTraits>::C_handle_retransmit(this)), + persist_event(new tcp<InetTraits>::C_handle_persist(this)), + all_data_ack_event(new tcp<InetTraits>::C_all_data_acked(this)) {} + +template <typename InetTraits> +tcp<InetTraits>::tcb::~tcb() +{ + if (_delayed_ack_fd) + center->delete_time_event(*_delayed_ack_fd); + if (retransmit_fd) + center->delete_time_event(*retransmit_fd); + if (persist_fd) + center->delete_time_event(*persist_fd); + delete delayed_ack_event; + delete retransmit_event; + delete persist_event; + delete all_data_ack_event; + manager.close(fd); + fd = -1; +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::respond_with_reset(tcp_hdr* rth) +{ + _tcp.respond_with_reset(rth, _local_ip, _foreign_ip); +} + +template <typename InetTraits> +uint32_t tcp<InetTraits>::tcb::data_segment_acked(tcp_sequence seg_ack) { + uint32_t total_acked_bytes = 0; + // Full ACK of segment + while (!_snd.data.empty() + && (_snd.unacknowledged + _snd.data.front().p.len() <= seg_ack)) { + auto acked_bytes = _snd.data.front().p.len(); + _snd.unacknowledged += acked_bytes; + // Ignore retransmitted segments when setting the RTO + if (_snd.data.front().nr_transmits == 0) { + update_rto(_snd.data.front().tx_time); + } + update_cwnd(acked_bytes); + total_acked_bytes += acked_bytes; + _snd.user_queue_space.put(_snd.data.front().data_len); + manager.notify(fd, EVENT_WRITABLE); + _snd.data.pop_front(); + } + // Partial ACK of segment + if (_snd.unacknowledged < seg_ack) { + auto acked_bytes = seg_ack - _snd.unacknowledged; + if (!_snd.data.empty()) { + auto& unacked_seg = _snd.data.front(); + unacked_seg.p.trim_front(acked_bytes); + } + _snd.unacknowledged = seg_ack; + update_cwnd(acked_bytes); + total_acked_bytes += acked_bytes; + } + return total_acked_bytes; +} + +template <typename InetTraits> +bool tcp<InetTraits>::tcb::segment_acceptable(tcp_sequence seg_seq, unsigned seg_len) { + if (seg_len == 0 && _rcv.window == 0) { + // SEG.SEQ = RCV.NXT + return seg_seq == _rcv.next; + } else if (seg_len == 0 && _rcv.window > 0) { + // RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND + return (_rcv.next <= seg_seq) && (seg_seq < _rcv.next + _rcv.window); + } else if (seg_len > 0 && _rcv.window > 0) { + // RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND + // or + // RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND + bool x = (_rcv.next <= seg_seq) && seg_seq < (_rcv.next + _rcv.window); + bool y = (_rcv.next <= seg_seq + seg_len - 1) && (seg_seq + seg_len - 1 < _rcv.next + _rcv.window); + return x || y; + } else { + // SEG.LEN > 0 RCV.WND = 0, not acceptable + return false; + } +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::init_from_options(tcp_hdr* th, uint8_t* opt_start, uint8_t* opt_end) { + // Handle tcp options + _option.parse(opt_start, opt_end); + + // Remote receive window scale factor + _snd.window_scale = _option._remote_win_scale; + // Local receive window scale factor + _rcv.window_scale = _option._local_win_scale; + + // Maximum segment size remote can receive + _snd.mss = _option._remote_mss; + // Maximum segment size local can receive + _rcv.mss = _option._local_mss = local_mss(); + + // Linux's default window size + _rcv.window = 29200 << _rcv.window_scale; + _snd.window = th->window << _snd.window_scale; + + // Segment sequence number used for last window update + _snd.wl1 = th->seq; + // Segment acknowledgment number used for last window update + _snd.wl2 = th->ack; + + // Setup initial congestion window + if (2190 < _snd.mss) { + _snd.cwnd = 2 * _snd.mss; + } else if (1095 < _snd.mss && _snd.mss <= 2190) { + _snd.cwnd = 3 * _snd.mss; + } else { + _snd.cwnd = 4 * _snd.mss; + } + + // Setup initial slow start threshold + _snd.ssthresh = th->window << _snd.window_scale; +} + +template <typename InetTraits> +Packet tcp<InetTraits>::tcb::get_transmit_packet() { + // easy case: empty queue + if (_snd.unsent.empty()) { + return Packet(); + } + auto can_send = this->can_send(); + // Max number of TCP payloads we can pass to NIC + uint32_t len; + if (_tcp.get_hw_features().tx_tso) { + // FIXME: Info tap device the size of the split packet + len = _tcp.get_hw_features().max_packet_len - tcp_hdr_len_min - InetTraits::ip_hdr_len_min; + } else { + len = std::min(uint16_t(_tcp.get_hw_features().mtu - tcp_hdr_len_min - InetTraits::ip_hdr_len_min), _snd.mss); + } + can_send = std::min(can_send, len); + // easy case: one small packet + if (_snd.unsent.front().len() <= can_send) { + auto p = std::move(_snd.unsent.front()); + _snd.unsent.pop_front(); + _snd.unsent_len -= p.len(); + return p; + } + // moderate case: need to split one packet + if (_snd.unsent.front().len() > can_send) { + auto p = _snd.unsent.front().share(0, can_send); + _snd.unsent.front().trim_front(can_send); + _snd.unsent_len -= p.len(); + return p; + } + // hard case: merge some packets, possibly split last + auto p = std::move(_snd.unsent.front()); + _snd.unsent.pop_front(); + can_send -= p.len(); + while (!_snd.unsent.empty() + && _snd.unsent.front().len() <= can_send) { + can_send -= _snd.unsent.front().len(); + p.append(std::move(_snd.unsent.front())); + _snd.unsent.pop_front(); + } + // FIXME: this will result in calling "deleter" of packet which free managed objects + // will used later + // if (!_snd.unsent.empty() && can_send) { + // auto& q = _snd.unsent.front(); + // p.append(q.share(0, can_send)); + // q.trim_front(can_send); + // } + _snd.unsent_len -= p.len(); + return p; +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::output_one(bool data_retransmit) { + if (in_state(CLOSED)) { + return; + } + + Packet p = data_retransmit ? _snd.data.front().p.share() : get_transmit_packet(); + Packet clone = p.share(); // early clone to prevent share() from calling packet::unuse_internal_data() on header. + uint16_t len = p.len(); + bool syn_on = syn_needs_on(); + bool ack_on = ack_needs_on(); + + auto options_size = _option.get_size(syn_on, ack_on); + auto th = p.prepend_header<tcp_hdr>(options_size); + + th->src_port = _local_port; + th->dst_port = _foreign_port; + + th->f_syn = syn_on; + th->f_ack = ack_on; + if (ack_on) { + clear_delayed_ack(); + } + th->f_urg = false; + th->f_psh = false; + + tcp_sequence seq; + if (data_retransmit) { + seq = _snd.unacknowledged; + } else { + seq = syn_on ? _snd.initial : _snd.next; + _snd.next += len; + } + th->seq = seq; + th->ack = _rcv.next; + th->data_offset = (sizeof(*th) + options_size) / 4; + th->window = _rcv.window >> _rcv.window_scale; + th->checksum = 0; + + // FIXME: does the FIN have to fit in the window? + bool fin_on = fin_needs_on(); + th->f_fin = fin_on; + + // Add tcp options + _option.fill(th, options_size); + *th = th->hton(); + + offload_info oi; + checksummer csum; + uint16_t pseudo_hdr_seg_len = 0; + + oi.tcp_hdr_len = sizeof(tcp_hdr) + options_size; + + if (_tcp.get_hw_features().tx_csum_l4_offload) { + oi.needs_csum = true; + + // + // tx checksum offloading: both virtio-net's VIRTIO_NET_F_CSUM dpdk's + // PKT_TX_TCP_CKSUM - requires th->checksum to be initialized to ones' + // complement sum of the pseudo header. + // + // For TSO the csum should be calculated for a pseudo header with + // segment length set to 0. All the rest is the same as for a TCP Tx + // CSUM offload case. + // + if (_tcp.get_hw_features().tx_tso && len > _snd.mss) { + oi.tso_seg_size = _snd.mss; + } else { + pseudo_hdr_seg_len = sizeof(*th) + options_size + len; + } + } else { + pseudo_hdr_seg_len = sizeof(*th) + options_size + len; + oi.needs_csum = false; + } + + InetTraits::tcp_pseudo_header_checksum(csum, _local_ip, _foreign_ip, + pseudo_hdr_seg_len); + + if (_tcp.get_hw_features().tx_csum_l4_offload) { + th->checksum = ~csum.get(); + } else { + csum.sum(p); + th->checksum = csum.get(); + } + + oi.protocol = ip_protocol_num::tcp; + + p.set_offload_info(oi); + + if (!data_retransmit && (len || syn_on || fin_on)) { + auto now = clock_type::now(); + if (len) { + unsigned nr_transmits = 0; + _snd.data.emplace_back(unacked_segment{std::move(clone), + len, nr_transmits, now}); + } + if (!retransmit_fd) { + start_retransmit_timer(); + } + } + + queue_packet(std::move(p)); +} + +template <typename InetTraits> +bool tcp<InetTraits>::tcb::is_all_data_acked() { + if (_snd.data.empty() && _snd.unsent_len == 0 && _snd.queued_len == 0) { + return true; + } + return false; +} + +template <typename InetTraits> +Tub<Packet> tcp<InetTraits>::tcb::read() { + Tub<Packet> p; + if (_rcv.data.empty()) + return p; + + p.construct(); + for (auto&& q : _rcv.data) { + p->append(std::move(q)); + } + _rcv.data.clear(); + return p; +} + +template <typename InetTraits> +int tcp<InetTraits>::tcb::send(Packet p) { + // We can not send after the connection is closed + ceph_assert(!_snd.closed); + + if (in_state(CLOSED)) + return -ECONNRESET; + + auto len = p.len(); + if (!_snd.user_queue_space.get_or_fail(len)) { + // note: caller must ensure enough queue space to send + ceph_abort(); + } + // TODO: Handle p.len() > max user_queue_space case + _snd.queued_len += len; + _snd.unsent_len += len; + _snd.queued_len -= len; + _snd.unsent.push_back(std::move(p)); + if (can_send() > 0) { + output(); + } + return len; +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::close() { + if (in_state(CLOSED) || _snd.closed) { + return ; + } + // TODO: We should make this asynchronous + + _errno = -EPIPE; + center->delete_file_event(fd, EVENT_READABLE|EVENT_WRITABLE); + bool acked = is_all_data_acked(); + if (!acked) { + _snd._all_data_acked_fd = manager.get_eventfd(); + center->create_file_event(_snd._all_data_acked_fd, EVENT_READABLE, all_data_ack_event); + } else { + close_final_cleanup(); + } +} + +template <typename InetTraits> +bool tcp<InetTraits>::tcb::should_send_ack(uint16_t seg_len) { + // We've received a TSO packet, do ack immediately + if (seg_len > _rcv.mss) { + _nr_full_seg_received = 0; + if (_delayed_ack_fd) { + center->delete_time_event(*_delayed_ack_fd); + _delayed_ack_fd.destroy(); + } + return true; + } + + // We've received a full sized segment, ack for every second full sized segment + if (seg_len == _rcv.mss) { + if (_nr_full_seg_received++ >= 1) { + _nr_full_seg_received = 0; + if (_delayed_ack_fd) { + center->delete_time_event(*_delayed_ack_fd); + _delayed_ack_fd.destroy(); + } + return true; + } + } + + // If the timer is armed and its callback hasn't been run. + if (_delayed_ack_fd) { + return false; + } + + // If the timer is not armed, schedule a delayed ACK. + // The maximum delayed ack timer allowed by RFC1122 is 500ms, most + // implementations use 200ms. + _delayed_ack_fd.construct(center->create_time_event(200*1000, delayed_ack_event)); + return false; +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::clear_delayed_ack() { + if (_delayed_ack_fd) { + center->delete_time_event(*_delayed_ack_fd); + _delayed_ack_fd.destroy(); + } +} + +template <typename InetTraits> +bool tcp<InetTraits>::tcb::merge_out_of_order() { + bool merged = false; + if (_rcv.out_of_order.map.empty()) { + return merged; + } + for (auto it = _rcv.out_of_order.map.begin(); it != _rcv.out_of_order.map.end();) { + auto& p = it->second; + auto seg_beg = it->first; + auto seg_len = p.len(); + auto seg_end = seg_beg + seg_len; + if (seg_beg <= _rcv.next && seg_end > _rcv.next) { + // This segment has been received out of order and its previous + // segment has been received now + auto trim = _rcv.next - seg_beg; + if (trim) { + p.trim_front(trim); + seg_len -= trim; + } + _rcv.next += seg_len; + _rcv.data.push_back(std::move(p)); + // Since c++11, erase() always returns the value of the following element + it = _rcv.out_of_order.map.erase(it); + merged = true; + } else if (_rcv.next >= seg_end) { + // This segment has been receive already, drop it + it = _rcv.out_of_order.map.erase(it); + } else { + // seg_beg > _rcv.need, can not merge. Note, seg_beg can grow only, + // so we can stop looking here. + it++; + break; + } + } + return merged; +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::insert_out_of_order(tcp_sequence seg, Packet p) { + _rcv.out_of_order.merge(seg, std::move(p)); +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::trim_receive_data_after_window() { + abort(); +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::fast_retransmit() { + if (!_snd.data.empty()) { + auto& unacked_seg = _snd.data.front(); + unacked_seg.nr_transmits++; + retransmit_one(); + output(); + } +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::update_rto(clock_type::time_point tx_time) { + // Update RTO according to RFC6298 + auto R = std::chrono::duration_cast<std::chrono::microseconds>(clock_type::now() - tx_time); + if (_snd.first_rto_sample) { + _snd.first_rto_sample = false; + // RTTVAR <- R/2 + // SRTT <- R + _snd.rttvar = R / 2; + _snd.srtt = R; + } else { + // RTTVAR <- (1 - beta) * RTTVAR + beta * |SRTT - R'| + // SRTT <- (1 - alpha) * SRTT + alpha * R' + // where alpha = 1/8 and beta = 1/4 + auto delta = _snd.srtt > R ? (_snd.srtt - R) : (R - _snd.srtt); + _snd.rttvar = _snd.rttvar * 3 / 4 + delta / 4; + _snd.srtt = _snd.srtt * 7 / 8 + R / 8; + } + // RTO <- SRTT + max(G, K * RTTVAR) + _rto = _snd.srtt + std::max(_rto_clk_granularity, 4 * _snd.rttvar); + + // Make sure 1 sec << _rto << 60 sec + _rto = std::max(_rto, _rto_min); + _rto = std::min(_rto, _rto_max); +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::update_cwnd(uint32_t acked_bytes) { + uint32_t smss = _snd.mss; + if (_snd.cwnd < _snd.ssthresh) { + // In slow start phase + _snd.cwnd += std::min(acked_bytes, smss); + } else { + // In congestion avoidance phase + uint32_t round_up = 1; + _snd.cwnd += std::max(round_up, smss * smss / _snd.cwnd); + } +} + + +template <typename InetTraits> +void tcp<InetTraits>::tcb::cleanup() { + manager.notify(fd, EVENT_READABLE); + _snd.closed = true; + _snd.unsent.clear(); + _snd.data.clear(); + _rcv.out_of_order.map.clear(); + _rcv.data.clear(); + stop_retransmit_timer(); + clear_delayed_ack(); + center->dispatch_event_external(new tcp<InetTraits>::C_actual_remove_tcb(this)); + remove_from_tcbs(); +} + +template <typename InetTraits> +tcp_sequence tcp<InetTraits>::tcb::get_isn() { + // Per RFC6528, TCP SHOULD generate its Initial Sequence Numbers + // with the expression: + // ISN = M + F(localip, localport, remoteip, remoteport, secretkey) + // M is the 4 microsecond timer + using namespace std::chrono; + uint32_t hash[4]; + hash[0] = _local_ip.ip; + hash[1] = _foreign_ip.ip; + hash[2] = (_local_port << 16) + _foreign_port; + hash[3] = _isn_secret.key[15]; + ceph::crypto::MD5 md5; + md5.Update((const unsigned char*)_isn_secret.key, sizeof(_isn_secret.key)); + md5.Final((unsigned char*)hash); + auto seq = hash[0]; + auto m = duration_cast<microseconds>(clock_type::now().time_since_epoch()); + seq += m.count() / 4; + return make_seq(seq); +} + +template <typename InetTraits> +Tub<typename InetTraits::l4packet> tcp<InetTraits>::tcb::get_packet() { + _poll_active = false; + if (_packetq.empty()) { + output_one(); + } + + Tub<typename InetTraits::l4packet> p; + if (in_state(CLOSED)) { + return p; + } + + ceph_assert(!_packetq.empty()); + + p = std::move(_packetq.front()); + _packetq.pop_front(); + if (!_packetq.empty() || (_snd.dupacks < 3 && can_send() > 0)) { + // If there are packets to send in the queue or tcb is allowed to send + // more add tcp back to polling set to keep sending. In addition, dupacks >= 3 + // is an indication that an segment is lost, stop sending more in this case. + output(); + } + return p; +} + +template <typename InetTraits> +void tcp<InetTraits>::connection::close_read() { + // do nothing + // _tcb->manager.notify(_tcb->fd, EVENT_READABLE); +} + +template <typename InetTraits> +void tcp<InetTraits>::connection::close_write() { + _tcb->close(); +} + +template <typename InetTraits> +constexpr uint16_t tcp<InetTraits>::tcb::_max_nr_retransmit; + +template <typename InetTraits> +constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_min; + +template <typename InetTraits> +constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_max; + +template <typename InetTraits> +constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_clk_granularity; + +template <typename InetTraits> +typename tcp<InetTraits>::tcb::isn_secret tcp<InetTraits>::tcb::_isn_secret; + + +#endif /* TCP_HH_ */ diff --git a/src/msg/async/dpdk/UserspaceEvent.cc b/src/msg/async/dpdk/UserspaceEvent.cc new file mode 100644 index 00000000..282dcef1 --- /dev/null +++ b/src/msg/async/dpdk/UserspaceEvent.cc @@ -0,0 +1,127 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "UserspaceEvent.h" + +#include "common/dout.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_dpdk +#undef dout_prefix +#define dout_prefix *_dout << "dpdk " + +int UserspaceEventManager::get_eventfd() +{ + int fd; + if (!unused_fds.empty()) { + fd = unused_fds.front(); + unused_fds.pop_front(); + } else { + fd = ++max_fd; + fds.resize(fd + 1); + } + + Tub<UserspaceFDImpl> &impl = fds[fd]; + ceph_assert(!impl); + impl.construct(); + ldout(cct, 20) << __func__ << " fd=" << fd << dendl; + return fd; +} + +int UserspaceEventManager::notify(int fd, int mask) +{ + ldout(cct, 20) << __func__ << " fd=" << fd << " mask=" << mask << dendl; + if ((size_t)fd >= fds.size()) + return -ENOENT; + + Tub<UserspaceFDImpl> &impl = fds[fd]; + if (!impl) + return -ENOENT; + + ldout(cct, 20) << __func__ << " activing=" << int(impl->activating_mask) + << " listening=" << int(impl->listening_mask) + << " waiting_idx=" << int(impl->waiting_idx) << dendl; + + impl->activating_mask |= mask; + if (impl->waiting_idx) + return 0; + + if (impl->listening_mask & mask) { + if (waiting_fds.size() <= max_wait_idx) + waiting_fds.resize(waiting_fds.size()*2); + impl->waiting_idx = ++max_wait_idx; + waiting_fds[max_wait_idx] = fd; + } + + ldout(cct, 20) << __func__ << " activing=" << int(impl->activating_mask) + << " listening=" << int(impl->listening_mask) + << " waiting_idx=" << int(impl->waiting_idx) << " done " << dendl; + return 0; +} + +void UserspaceEventManager::close(int fd) +{ + ldout(cct, 20) << __func__ << " fd=" << fd << dendl; + if ((size_t)fd >= fds.size()) + return ; + + Tub<UserspaceFDImpl> &impl = fds[fd]; + if (!impl) + return ; + + if (fd == max_fd) + --max_fd; + else + unused_fds.push_back(fd); + + if (impl->activating_mask) { + if (waiting_fds[max_wait_idx] == fd) { + ceph_assert(impl->waiting_idx == max_wait_idx); + --max_wait_idx; + } + waiting_fds[impl->waiting_idx] = -1; + } + impl.destroy(); +} + +int UserspaceEventManager::poll(int *events, int *masks, int num_events, struct timeval *tp) +{ + int fd; + uint32_t i = 0; + int count = 0; + ceph_assert(num_events); + // leave zero slot for waiting_fds + while (i < max_wait_idx) { + fd = waiting_fds[++i]; + if (fd == -1) + continue; + + events[count] = fd; + Tub<UserspaceFDImpl> &impl = fds[fd]; + ceph_assert(impl); + masks[count] = impl->listening_mask & impl->activating_mask; + ceph_assert(masks[count]); + ldout(cct, 20) << __func__ << " fd=" << fd << " mask=" << masks[count] << dendl; + impl->activating_mask &= (~masks[count]); + impl->waiting_idx = 0; + if (++count >= num_events) + break; + } + if (i < max_wait_idx) { + memmove(&waiting_fds[1], &waiting_fds[i+1], sizeof(int)*(max_wait_idx-i)); + } + max_wait_idx -= i; + return count; +} diff --git a/src/msg/async/dpdk/UserspaceEvent.h b/src/msg/async/dpdk/UserspaceEvent.h new file mode 100644 index 00000000..7e89517d --- /dev/null +++ b/src/msg/async/dpdk/UserspaceEvent.h @@ -0,0 +1,106 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_USERSPACEEVENT_H +#define CEPH_USERSPACEEVENT_H + +#include <cstddef> +#include <errno.h> +#include <string.h> + +#include <vector> +#include <list> + +#include "include/ceph_assert.h" +#include "include/int_types.h" +#include "common/Tub.h" + +class CephContext; + +class UserspaceEventManager { + struct UserspaceFDImpl { + uint32_t waiting_idx = 0; + int16_t read_errno = 0; + int16_t write_errno = 0; + int8_t listening_mask = 0; + int8_t activating_mask = 0; + uint32_t magic = 4921; + }; + CephContext *cct; + int max_fd = 0; + uint32_t max_wait_idx = 0; + std::vector<Tub<UserspaceFDImpl> > fds; + std::vector<int> waiting_fds; + std::list<uint32_t> unused_fds; + + public: + explicit UserspaceEventManager(CephContext *c): cct(c) { + waiting_fds.resize(1024); + } + + int get_eventfd(); + + int listen(int fd, int mask) { + if ((size_t)fd >= fds.size()) + return -ENOENT; + + Tub<UserspaceFDImpl> &impl = fds[fd]; + if (!impl) + return -ENOENT; + + impl->listening_mask |= mask; + if (impl->activating_mask & impl->listening_mask && !impl->waiting_idx) { + if (waiting_fds.size() <= max_wait_idx) + waiting_fds.resize(waiting_fds.size()*2); + impl->waiting_idx = ++max_wait_idx; + waiting_fds[max_wait_idx] = fd; + } + return 0; + } + + int unlisten(int fd, int mask) { + if ((size_t)fd >= fds.size()) + return -ENOENT; + + Tub<UserspaceFDImpl> &impl = fds[fd]; + if (!impl) + return -ENOENT; + + impl->listening_mask &= (~mask); + if (!(impl->activating_mask & impl->listening_mask) && impl->waiting_idx) { + if (waiting_fds[max_wait_idx] == fd) { + ceph_assert(impl->waiting_idx == max_wait_idx); + --max_wait_idx; + } + waiting_fds[impl->waiting_idx] = -1; + impl->waiting_idx = 0; + } + return 0; + } + + int notify(int fd, int mask); + void close(int fd); + int poll(int *events, int *masks, int num_events, struct timeval *tp); + + bool check() { + for (auto &&m : fds) { + if (m && m->magic != 4921) + return false; + } + return true; + } +}; + +#endif //CEPH_USERSPACEEVENT_H diff --git a/src/msg/async/dpdk/align.h b/src/msg/async/dpdk/align.h new file mode 100644 index 00000000..3b48f789 --- /dev/null +++ b/src/msg/async/dpdk/align.h @@ -0,0 +1,50 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_DPDK_ALIGN_HH_ +#define CEPH_MSG_DPDK_ALIGN_HH_ + +#include <cstdint> +#include <cstdlib> + +template <typename T> +inline constexpr T align_up(T v, T align) { + return (v + align - 1) & ~(align - 1); +} + +template <typename T> +inline constexpr T* align_up(T* v, size_t align) { + static_assert(sizeof(T) == 1, "align byte pointers only"); + return reinterpret_cast<T*>(align_up(reinterpret_cast<uintptr_t>(v), align)); +} + +template <typename T> +inline constexpr T align_down(T v, T align) { + return v & ~(align - 1); +} + +template <typename T> +inline constexpr T* align_down(T* v, size_t align) { + static_assert(sizeof(T) == 1, "align byte pointers only"); + return reinterpret_cast<T*>(align_down(reinterpret_cast<uintptr_t>(v), align)); +} + +#endif /* CEPH_MSG_DPDK_ALIGN_HH_ */ diff --git a/src/msg/async/dpdk/array_map.h b/src/msg/async/dpdk/array_map.h new file mode 100644 index 00000000..40f7728d --- /dev/null +++ b/src/msg/async/dpdk/array_map.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_ARRAY_MAP_HH_ +#define CEPH_ARRAY_MAP_HH_ + +#include <array> + +// unordered_map implemented as a simple array + +template <typename Value, size_t Max> +class array_map { + std::array<Value, Max> _a {}; + public: + array_map(std::initializer_list<std::pair<size_t, Value>> i) { + for (auto kv : i) { + _a[kv.first] = kv.second; + } + } + Value& operator[](size_t key) { return _a[key]; } + const Value& operator[](size_t key) const { return _a[key]; } + + Value& at(size_t key) { + if (key >= Max) { + throw std::out_of_range(std::to_string(key) + " >= " + std::to_string(Max)); + } + return _a[key]; + } +}; + +#endif /* ARRAY_MAP_HH_ */ diff --git a/src/msg/async/dpdk/byteorder.h b/src/msg/async/dpdk/byteorder.h new file mode 100644 index 00000000..a996ec07 --- /dev/null +++ b/src/msg/async/dpdk/byteorder.h @@ -0,0 +1,58 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_BYTEORDER_H_ +#define CEPH_MSG_BYTEORDER_H_ + +#include <arpa/inet.h> // for ntohs() and friends +#include <iosfwd> +#include <utility> + +inline uint64_t ntohq(uint64_t v) { + return __builtin_bswap64(v); +} +inline uint64_t htonq(uint64_t v) { + return __builtin_bswap64(v); +} + +inline void ntoh() {} +inline void hton() {} + +inline uint8_t ntoh(uint8_t x) { return x; } +inline uint8_t hton(uint8_t x) { return x; } +inline uint16_t ntoh(uint16_t x) { return ntohs(x); } +inline uint16_t hton(uint16_t x) { return htons(x); } +inline uint32_t ntoh(uint32_t x) { return ntohl(x); } +inline uint32_t hton(uint32_t x) { return htonl(x); } +inline uint64_t ntoh(uint64_t x) { return ntohq(x); } +inline uint64_t hton(uint64_t x) { return htonq(x); } + +inline int8_t ntoh(int8_t x) { return x; } +inline int8_t hton(int8_t x) { return x; } +inline int16_t ntoh(int16_t x) { return ntohs(x); } +inline int16_t hton(int16_t x) { return htons(x); } +inline int32_t ntoh(int32_t x) { return ntohl(x); } +inline int32_t hton(int32_t x) { return htonl(x); } +inline int64_t ntoh(int64_t x) { return ntohq(x); } +inline int64_t hton(int64_t x) { return htonq(x); } + +#endif /* CEPH_MSG_BYTEORDER_H_ */ diff --git a/src/msg/async/dpdk/capture.h b/src/msg/async/dpdk/capture.h new file mode 100644 index 00000000..1ace8eeb --- /dev/null +++ b/src/msg/async/dpdk/capture.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MSG_DPDK_CAPTURE_H +#define CEPH_MSG_DPDK_CAPTURE_H + +#include <utility> + +template <typename T, typename F> +class capture_impl { + T x; + F f; + public: + capture_impl(capture_impl &) = delete; + capture_impl( T && x, F && f ) + : x{std::forward<T>(x)}, f{std::forward<F>(f)} + {} + + template <typename ...Ts> auto operator()( Ts&&...args ) + -> decltype(f( x, std::forward<Ts>(args)... )) + { + return f( x, std::forward<Ts>(args)... ); + } + + template <typename ...Ts> auto operator()( Ts&&...args ) const + -> decltype(f( x, std::forward<Ts>(args)... )) + { + return f( x, std::forward<Ts>(args)... ); + } +}; + +template <typename T, typename F> +capture_impl<T,F> capture( T && x, F && f ) { + return capture_impl<T,F>( + std::forward<T>(x), std::forward<F>(f) ); +} + +#endif //CEPH_MSG_DPDK_CAPTURE_H diff --git a/src/msg/async/dpdk/circular_buffer.h b/src/msg/async/dpdk/circular_buffer.h new file mode 100644 index 00000000..2c92c120 --- /dev/null +++ b/src/msg/async/dpdk/circular_buffer.h @@ -0,0 +1,347 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_CIRCULAR_BUFFER_HH_ +#define CEPH_CIRCULAR_BUFFER_HH_ + +// A growable double-ended queue container that can be efficiently +// extended (and shrunk) from both ends. Implementation is a single +// storage vector. +// +// Similar to libstdc++'s std::deque, except that it uses a single level +// store, and so is more efficient for simple stored items. +// Similar to boost::circular_buffer_space_optimized, except it uses +// uninitialized storage for unoccupied elements (and thus move/copy +// constructors instead of move/copy assignments, which are less efficient). + +#include <memory> +#include <algorithm> + +#include "transfer.h" + +template <typename T, typename Alloc = std::allocator<T>> +class circular_buffer { + struct impl : Alloc { + T* storage = nullptr; + // begin, end interpreted (mod capacity) + size_t begin = 0; + size_t end = 0; + size_t capacity = 0; + }; + impl _impl; + public: + using value_type = T; + using size_type = size_t; + using reference = T&; + using pointer = T*; + using const_reference = const T&; + using const_pointer = const T*; + public: + circular_buffer() = default; + circular_buffer(circular_buffer&& X); + circular_buffer(const circular_buffer& X) = delete; + ~circular_buffer(); + circular_buffer& operator=(const circular_buffer&) = delete; + circular_buffer& operator=(circular_buffer&&) = delete; + void push_front(const T& data); + void push_front(T&& data); + template <typename... A> + void emplace_front(A&&... args); + void push_back(const T& data); + void push_back(T&& data); + template <typename... A> + void emplace_back(A&&... args); + T& front(); + T& back(); + void pop_front(); + void pop_back(); + bool empty() const; + size_t size() const; + size_t capacity() const; + T& operator[](size_t idx); + template <typename Func> + void for_each(Func func); + // access an element, may return wrong or destroyed element + // only useful if you do not rely on data accuracy (e.g. prefetch) + T& access_element_unsafe(size_t idx); + private: + void expand(); + void maybe_expand(size_t nr = 1); + size_t mask(size_t idx) const; + + template<typename CB, typename ValueType> + struct cbiterator : std::iterator<std::random_access_iterator_tag, ValueType> { + typedef std::iterator<std::random_access_iterator_tag, ValueType> super_t; + + ValueType& operator*() const { return cb->_impl.storage[cb->mask(idx)]; } + ValueType* operator->() const { return &cb->_impl.storage[cb->mask(idx)]; } + // prefix + cbiterator<CB, ValueType>& operator++() { + idx++; + return *this; + } + // postfix + cbiterator<CB, ValueType> operator++(int unused) { + auto v = *this; + idx++; + return v; + } + // prefix + cbiterator<CB, ValueType>& operator--() { + idx--; + return *this; + } + // postfix + cbiterator<CB, ValueType> operator--(int unused) { + auto v = *this; + idx--; + return v; + } + cbiterator<CB, ValueType> operator+(typename super_t::difference_type n) const { + return cbiterator<CB, ValueType>(cb, idx + n); + } + cbiterator<CB, ValueType> operator-(typename super_t::difference_type n) const { + return cbiterator<CB, ValueType>(cb, idx - n); + } + cbiterator<CB, ValueType>& operator+=(typename super_t::difference_type n) { + idx += n; + return *this; + } + cbiterator<CB, ValueType>& operator-=(typename super_t::difference_type n) { + idx -= n; + return *this; + } + bool operator==(const cbiterator<CB, ValueType>& rhs) const { + return idx == rhs.idx; + } + bool operator!=(const cbiterator<CB, ValueType>& rhs) const { + return idx != rhs.idx; + } + bool operator<(const cbiterator<CB, ValueType>& rhs) const { + return idx < rhs.idx; + } + bool operator>(const cbiterator<CB, ValueType>& rhs) const { + return idx > rhs.idx; + } + bool operator>=(const cbiterator<CB, ValueType>& rhs) const { + return idx >= rhs.idx; + } + bool operator<=(const cbiterator<CB, ValueType>& rhs) const { + return idx <= rhs.idx; + } + typename super_t::difference_type operator-(const cbiterator<CB, ValueType>& rhs) const { + return idx - rhs.idx; + } + private: + CB* cb; + size_t idx; + cbiterator<CB, ValueType>(CB* b, size_t i) : cb(b), idx(i) {} + friend class circular_buffer; + }; + friend class iterator; + + public: + typedef cbiterator<circular_buffer, T> iterator; + typedef cbiterator<const circular_buffer, const T> const_iterator; + + iterator begin() { + return iterator(this, _impl.begin); + } + const_iterator begin() const { + return const_iterator(this, _impl.begin); + } + iterator end() { + return iterator(this, _impl.end); + } + const_iterator end() const { + return const_iterator(this, _impl.end); + } + const_iterator cbegin() const { + return const_iterator(this, _impl.begin); + } + const_iterator cend() const { + return const_iterator(this, _impl.end); + } +}; + +template <typename T, typename Alloc> +inline size_t circular_buffer<T, Alloc>::mask(size_t idx) const { + return idx & (_impl.capacity - 1); +} + +template <typename T, typename Alloc> +inline bool circular_buffer<T, Alloc>::empty() const { + return _impl.begin == _impl.end; +} + +template <typename T, typename Alloc> +inline size_t circular_buffer<T, Alloc>::size() const { + return _impl.end - _impl.begin; +} + +template <typename T, typename Alloc> +inline size_t circular_buffer<T, Alloc>::capacity() const { + return _impl.capacity; +} + +template <typename T, typename Alloc> +inline circular_buffer<T, Alloc>::circular_buffer(circular_buffer&& x) + : _impl(std::move(x._impl)) { + x._impl = {}; +} + +template <typename T, typename Alloc> +template <typename Func> +inline void circular_buffer<T, Alloc>::for_each(Func func) { + auto s = _impl.storage; + auto m = _impl.capacity - 1; + for (auto i = _impl.begin; i != _impl.end; ++i) { + func(s[i & m]); + } +} + +template <typename T, typename Alloc> +inline circular_buffer<T, Alloc>::~circular_buffer() { + for_each([this] (T& obj) { + _impl.destroy(&obj); + }); + _impl.deallocate(_impl.storage, _impl.capacity); +} + +template <typename T, typename Alloc> +void circular_buffer<T, Alloc>::expand() { + auto new_cap = std::max<size_t>(_impl.capacity * 2, 1); + auto new_storage = _impl.allocate(new_cap); + auto p = new_storage; + try { + for_each([this, &p] (T& obj) { + transfer_pass1(_impl, &obj, p); + p++; + }); + } catch (...) { + while (p != new_storage) { + _impl.destroy(--p); + } + _impl.deallocate(new_storage, new_cap); + throw; + } + p = new_storage; + for_each([this, &p] (T& obj) { + transfer_pass2(_impl, &obj, p++); + }); + std::swap(_impl.storage, new_storage); + std::swap(_impl.capacity, new_cap); + _impl.begin = 0; + _impl.end = p - _impl.storage; + _impl.deallocate(new_storage, new_cap); +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::maybe_expand(size_t nr) { + if (_impl.end - _impl.begin + nr > _impl.capacity) { + expand(); + } +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::push_front(const T& data) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.begin - 1)]; + _impl.construct(p, data); + --_impl.begin; +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::push_front(T&& data) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.begin - 1)]; + _impl.construct(p, std::move(data)); + --_impl.begin; +} + +template <typename T, typename Alloc> +template <typename... Args> +inline void circular_buffer<T, Alloc>::emplace_front(Args&&... args) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.begin - 1)]; + _impl.construct(p, std::forward<Args>(args)...); + --_impl.begin; +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::push_back(const T& data) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.end)]; + _impl.construct(p, data); + ++_impl.end; +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::push_back(T&& data) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.end)]; + _impl.construct(p, std::move(data)); + ++_impl.end; +} + +template <typename T, typename Alloc> +template <typename... Args> +inline void circular_buffer<T, Alloc>::emplace_back(Args&&... args) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.end)]; + _impl.construct(p, std::forward<Args>(args)...); + ++_impl.end; +} + +template <typename T, typename Alloc> +inline T& circular_buffer<T, Alloc>::front() { + return _impl.storage[mask(_impl.begin)]; +} + +template <typename T, typename Alloc> +inline T& circular_buffer<T, Alloc>::back() { + return _impl.storage[mask(_impl.end - 1)]; +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::pop_front() { + _impl.destroy(&front()); + ++_impl.begin; +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::pop_back() { + _impl.destroy(&back()); + --_impl.end; +} + +template <typename T, typename Alloc> +inline T& circular_buffer<T, Alloc>::operator[](size_t idx) { + return _impl.storage[mask(_impl.begin + idx)]; +} + +template <typename T, typename Alloc> +inline T& circular_buffer<T, Alloc>::access_element_unsafe(size_t idx) { + return _impl.storage[mask(_impl.begin + idx)]; +} + +#endif /* CEPH_CIRCULAR_BUFFER_HH_ */ diff --git a/src/msg/async/dpdk/const.h b/src/msg/async/dpdk/const.h new file mode 100644 index 00000000..ea5dc49e --- /dev/null +++ b/src/msg/async/dpdk/const.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_CONST_H_ +#define CEPH_MSG_CONST_H_ + +#include <stdint.h> + +enum class ip_protocol_num : uint8_t { + icmp = 1, tcp = 6, unused = 255 +}; + +enum class eth_protocol_num : uint16_t { + ipv4 = 0x0800, arp = 0x0806, ipv6 = 0x86dd +}; + +const uint8_t eth_hdr_len = 14; +const uint8_t tcp_hdr_len_min = 20; +const uint8_t ipv4_hdr_len_min = 20; +const uint8_t ipv6_hdr_len_min = 40; +const uint16_t ip_packet_len_max = 65535; + +#endif diff --git a/src/msg/async/dpdk/dpdk_rte.cc b/src/msg/async/dpdk/dpdk_rte.cc new file mode 100644 index 00000000..9f9d343b --- /dev/null +++ b/src/msg/async/dpdk/dpdk_rte.cc @@ -0,0 +1,154 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <bitset> + +#include <rte_config.h> +#include <rte_common.h> +#include <rte_ethdev.h> +#include <rte_version.h> + +#include "DPDK.h" +#include "dpdk_rte.h" + +namespace dpdk { + + static inline std::vector<char> string2vector(std::string str) { + auto v = std::vector<char>(str.begin(), str.end()); + v.push_back('\0'); + return v; + } + + bool eal::initialized = false; + std::thread eal::t; + std::mutex eal::lock; + std::condition_variable eal::cond; + std::list<std::function<void()>> eal::funcs; + + static int bitcount(unsigned long long n) + { + return std::bitset<CHAR_BIT * sizeof(n)>{n}.count(); + } + + int eal::init(CephContext *c) + { + if (initialized) { + return 1; + } + + bool done = false; + auto num = std::stoull(c->_conf.get_val<std::string>("ms_dpdk_coremask"), + nullptr, 16); + unsigned int coremaskbit = bitcount(num); + + ceph_assert(coremaskbit > c->_conf->ms_async_op_threads); + + t = std::thread([&]() { + // TODO: Inherit these from the app parameters - "opts" + std::vector<std::vector<char>> args { + string2vector(string("ceph")), + string2vector("-c"), string2vector(c->_conf.get_val<std::string>("ms_dpdk_coremask")), + string2vector("-n"), string2vector(c->_conf->ms_dpdk_memory_channel), + }; + + Tub<std::string> hugepages_path; + if (!c->_conf->ms_dpdk_hugepages.empty()) { + hugepages_path.construct(c->_conf->ms_dpdk_hugepages); + } + + // If "hugepages" is not provided and DPDK PMD drivers mode is requested - + // use the default DPDK huge tables configuration. + if (hugepages_path) { + args.push_back(string2vector("--huge-dir")); + args.push_back(string2vector(*hugepages_path)); + + // + // We don't know what is going to be our networking configuration so we + // assume there is going to be a queue per-CPU. Plus we'll give a DPDK + // 64MB for "other stuff". + // + unsigned int x; + std::stringstream ss; + ss << std::hex << "fffefffe"; + ss >> x; + size_t size_MB = mem_size(bitcount(x)) >> 20; + std::stringstream size_MB_str; + size_MB_str << size_MB; + + args.push_back(string2vector("-m")); + args.push_back(string2vector(size_MB_str.str())); + } else if (!c->_conf->ms_dpdk_pmd.empty()) { + args.push_back(string2vector("--no-huge")); + } + + std::string rte_file_prefix; + rte_file_prefix = "rte_"; + rte_file_prefix += c->_conf->name.to_str(); + args.push_back(string2vector("--file-prefix")); + args.push_back(string2vector(rte_file_prefix)); + + std::vector<char*> cargs; + + for (auto&& a: args) { + cargs.push_back(a.data()); + } + /* initialise the EAL for all */ + int ret = rte_eal_init(cargs.size(), cargs.data()); + if (ret < 0) + return ret; + + std::unique_lock<std::mutex> l(lock); + initialized = true; + done = true; + cond.notify_all(); + while (true) { + if (!funcs.empty()) { + auto f = std::move(funcs.front()); + funcs.pop_front(); + f(); + cond.notify_all(); + } else { + cond.wait(l); + } + } + }); + t.detach(); + std::unique_lock<std::mutex> l(lock); + while (!done) + cond.wait(l); + return 0; + } + + size_t eal::mem_size(int num_cpus) + { + size_t memsize = 0; + // + // PMD mempool memory: + // + // We don't know what is going to be our networking configuration so we + // assume there is going to be a queue per-CPU. + // + memsize += num_cpus * qp_mempool_obj_size(); + + // Plus we'll give a DPDK 64MB for "other stuff". + memsize += (64UL << 20); + + return memsize; + } + +} // namespace dpdk diff --git a/src/msg/async/dpdk/dpdk_rte.h b/src/msg/async/dpdk/dpdk_rte.h new file mode 100644 index 00000000..4aa83899 --- /dev/null +++ b/src/msg/async/dpdk/dpdk_rte.h @@ -0,0 +1,74 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef CEPH_DPDK_RTE_H_ +#define CEPH_DPDK_RTE_H_ + + +#include <condition_variable> +#include <mutex> +#include <thread> + +#include <bitset> +#include <rte_config.h> +#include <rte_version.h> +#include <boost/program_options.hpp> + +/*********************** Compat section ***************************************/ +// We currently support only versions 2.0 and above. +#if (RTE_VERSION < RTE_VERSION_NUM(2,0,0,0)) +#error "DPDK version above 2.0.0 is required" +#endif + +#if defined(RTE_MBUF_REFCNT_ATOMIC) +#warning "CONFIG_RTE_MBUF_REFCNT_ATOMIC should be disabled in DPDK's " \ + "config/common_linuxapp" +#endif +/******************************************************************************/ + +namespace dpdk { + +// DPDK Environment Abstraction Layer +class eal { + public: + using cpuset = std::bitset<RTE_MAX_LCORE>; + + static std::mutex lock; + static std::condition_variable cond; + static std::list<std::function<void()>> funcs; + static int init(CephContext *c); + static void execute_on_master(std::function<void()> &&f) { + bool done = false; + std::unique_lock<std::mutex> l(lock); + funcs.emplace_back([&]() { f(); done = true; }); + cond.notify_all(); + while (!done) + cond.wait(l); + } + /** + * Returns the amount of memory needed for DPDK + * @param num_cpus Number of CPUs the application is going to use + * + * @return + */ + static size_t mem_size(int num_cpus); + static bool initialized; + static std::thread t; +}; + +} // namespace dpdk +#endif // CEPH_DPDK_RTE_H_ diff --git a/src/msg/async/dpdk/ethernet.cc b/src/msg/async/dpdk/ethernet.cc new file mode 100644 index 00000000..9aca5078 --- /dev/null +++ b/src/msg/async/dpdk/ethernet.cc @@ -0,0 +1,16 @@ +#include <iomanip> + +#include "ethernet.h" + +std::ostream& operator<<(std::ostream& os, const ethernet_address& ea) { + auto& m = ea.mac; + using u = uint32_t; + os << std::hex << std::setw(2) + << u(m[0]) << ":" + << u(m[1]) << ":" + << u(m[2]) << ":" + << u(m[3]) << ":" + << u(m[4]) << ":" + << u(m[5]); + return os; +} diff --git a/src/msg/async/dpdk/ethernet.h b/src/msg/async/dpdk/ethernet.h new file mode 100644 index 00000000..b007425f --- /dev/null +++ b/src/msg/async/dpdk/ethernet.h @@ -0,0 +1,84 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_ETHERNET_H_ +#define CEPH_MSG_ETHERNET_H_ + +#include <array> +#include <sstream> + +#include "include/ceph_assert.h" +#include "byteorder.h" + +struct ethernet_address { + ethernet_address() {} + + ethernet_address(const uint8_t *eaddr) { + std::copy(eaddr, eaddr + 6, mac.begin()); + } + + ethernet_address(std::initializer_list<uint8_t> eaddr) { + ceph_assert(eaddr.size() == mac.size()); + std::copy(eaddr.begin(), eaddr.end(), mac.begin()); + } + + ethernet_address ntoh() { + return *this; + } + ethernet_address hton() { + return *this; + } + std::array<uint8_t, 6> mac; +} __attribute__((packed)); + +inline bool operator==(const ethernet_address& a, const ethernet_address& b) { + return a.mac == b.mac; +} +std::ostream& operator<<(std::ostream& os, const ethernet_address& ea); + +struct ethernet { + using address = ethernet_address; + static address broadcast_address() { + return {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + } + static constexpr uint16_t arp_hardware_type() { return 1; } +}; + +struct eth_hdr { + ethernet_address dst_mac; + ethernet_address src_mac; + uint16_t eth_proto; + eth_hdr hton() { + eth_hdr hdr = *this; + hdr.eth_proto = ::hton(eth_proto); + return hdr; + } + eth_hdr ntoh() { + eth_hdr hdr = *this; + hdr.eth_proto = ::ntoh(eth_proto); + return hdr; + } +} __attribute__((packed)); + +ethernet_address parse_ethernet_address(std::string addr); + +#endif /* CEPH_MSG_ETHERNET_H_ */ diff --git a/src/msg/async/dpdk/ip_types.h b/src/msg/async/dpdk/ip_types.h new file mode 100644 index 00000000..356d8fd6 --- /dev/null +++ b/src/msg/async/dpdk/ip_types.h @@ -0,0 +1,109 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + * + */ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_IP_TYPES_H_H +#define CEPH_IP_TYPES_H_H + +#include <boost/asio/ip/address_v4.hpp> +#include <string> + +class Packet; +class ethernet_address; +using resolution_cb = std::function<void (const ethernet_address&, Packet, int)>; + +struct ipv4_addr { + uint32_t ip; + uint16_t port; + + ipv4_addr() : ip(0), port(0) {} + ipv4_addr(uint32_t ip, uint16_t port) : ip(ip), port(port) {} + ipv4_addr(uint16_t port) : ip(0), port(port) {} + ipv4_addr(const std::string &addr); + ipv4_addr(const std::string &addr, uint16_t port); + + ipv4_addr(const entity_addr_t &ad) { + ip = ntoh(ad.in4_addr().sin_addr.s_addr); + port = ad.get_port(); + } + + ipv4_addr(entity_addr_t &&addr) : ipv4_addr(addr) {} +}; + +struct ipv4_address { + ipv4_address() : ip(0) {} + explicit ipv4_address(uint32_t ip) : ip(ip) {} + explicit ipv4_address(const std::string& addr) { + ip = static_cast<uint32_t>(boost::asio::ip::address_v4::from_string(addr).to_ulong()); + } + ipv4_address(ipv4_addr addr) { + ip = addr.ip; + } + + uint32_t ip; + + ipv4_address hton() { + ipv4_address addr; + addr.ip = ::hton(ip); + return addr; + } + ipv4_address ntoh() { + ipv4_address addr; + addr.ip = ::ntoh(ip); + return addr; + } + + friend bool operator==(ipv4_address x, ipv4_address y) { + return x.ip == y.ip; + } + friend bool operator!=(ipv4_address x, ipv4_address y) { + return x.ip != y.ip; + } +} __attribute__((packed)); + +static inline bool is_unspecified(ipv4_address addr) { return addr.ip == 0; } + +std::ostream& operator<<(std::ostream& os, const ipv4_address& a); + +namespace std { + + template <> + struct hash<ipv4_address> { + size_t operator()(ipv4_address a) const { return a.ip; } + }; + +} + +#endif //CEPH_IP_TYPES_H_H diff --git a/src/msg/async/dpdk/net.cc b/src/msg/async/dpdk/net.cc new file mode 100644 index 00000000..6e361f18 --- /dev/null +++ b/src/msg/async/dpdk/net.cc @@ -0,0 +1,205 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + */ + +#include "net.h" +#include "DPDK.h" +#include "DPDKStack.h" + +#include "common/dout.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_dpdk +#undef dout_prefix +#define dout_prefix *_dout << "net " + +interface::interface(CephContext *cct, std::shared_ptr<DPDKDevice> dev, EventCenter *center) + : cct(cct), _dev(dev), + _rx(_dev->receive( + center->get_id(), + [center, this] (Packet p) { + return dispatch_packet(center, std::move(p)); + } + )), + _hw_address(_dev->hw_address()), + _hw_features(_dev->get_hw_features()) { + auto idx = 0u; + unsigned qid = center->get_id(); + dev->queue_for_cpu(center->get_id()).register_packet_provider([this, idx, qid] () mutable { + Tub<Packet> p; + for (size_t i = 0; i < _pkt_providers.size(); i++) { + auto l3p = _pkt_providers[idx++](); + if (idx == _pkt_providers.size()) + idx = 0; + if (l3p) { + auto l3pv = std::move(*l3p); + auto eh = l3pv.p.prepend_header<eth_hdr>(); + eh->dst_mac = l3pv.to; + eh->src_mac = _hw_address; + eh->eth_proto = uint16_t(l3pv.proto_num); + *eh = eh->hton(); + ldout(this->cct, 10) << "=== tx === proto " << std::hex << uint16_t(l3pv.proto_num) + << " " << _hw_address << " -> " << l3pv.to + << " length " << std::dec << l3pv.p.len() << dendl; + p = std::move(l3pv.p); + return p; + } + } + return p; + }); +} + +subscription<Packet, ethernet_address> interface::register_l3( + eth_protocol_num proto_num, + std::function<int (Packet p, ethernet_address from)> next, + std::function<bool (forward_hash&, Packet& p, size_t)> forward) +{ + auto i = _proto_map.emplace(std::piecewise_construct, std::make_tuple(uint16_t(proto_num)), std::forward_as_tuple(std::move(forward))); + ceph_assert(i.second); + l3_rx_stream& l3_rx = i.first->second; + return l3_rx.packet_stream.listen(std::move(next)); +} + +unsigned interface::hash2cpu(uint32_t hash) { + return _dev->hash2cpu(hash); +} + +const rss_key_type& interface::rss_key() const { + return _dev->rss_key(); +} + +uint16_t interface::hw_queues_count() const { + return _dev->hw_queues_count(); +} + +class C_handle_l2forward : public EventCallback { + std::shared_ptr<DPDKDevice> sdev; + unsigned &queue_depth; + Packet p; + unsigned dst; + + public: + C_handle_l2forward(std::shared_ptr<DPDKDevice> &p, unsigned &qd, Packet pkt, unsigned target) + : sdev(p), queue_depth(qd), p(std::move(pkt)), dst(target) {} + void do_request(uint64_t fd) { + sdev->l2receive(dst, std::move(p)); + queue_depth--; + delete this; + } +}; + +void interface::forward(EventCenter *source, unsigned target, Packet p) { + static __thread unsigned queue_depth; + + if (queue_depth < 1000) { + queue_depth++; + // FIXME: need ensure this event not be called after EventCenter destruct + _dev->workers[target]->center.dispatch_event_external( + new C_handle_l2forward(_dev, queue_depth, std::move(p.free_on_cpu(source)), target)); + } +} + +int interface::dispatch_packet(EventCenter *center, Packet p) { + auto eh = p.get_header<eth_hdr>(); + if (eh) { + auto i = _proto_map.find(ntoh(eh->eth_proto)); + auto hwrss = p.rss_hash(); + if (hwrss) { + ldout(cct, 10) << __func__ << " === rx === proto " << std::hex << ::ntoh(eh->eth_proto) + << " "<< eh->src_mac.ntoh() << " -> " << eh->dst_mac.ntoh() + << " length " << std::dec << p.len() << " rss_hash " << *p.rss_hash() << dendl; + } else { + ldout(cct, 10) << __func__ << " === rx === proto " << std::hex << ::ntoh(eh->eth_proto) + << " "<< eh->src_mac.ntoh() << " -> " << eh->dst_mac.ntoh() + << " length " << std::dec << p.len() << dendl; + } + if (i != _proto_map.end()) { + l3_rx_stream& l3 = i->second; + auto fw = _dev->forward_dst(center->get_id(), [&p, &l3, this] () { + auto hwrss = p.rss_hash(); + if (hwrss) { + return *hwrss; + } else { + forward_hash data; + if (l3.forward(data, p, sizeof(eth_hdr))) { + return toeplitz_hash(rss_key(), data); + } + return 0u; + } + }); + if (fw != center->get_id()) { + ldout(cct, 1) << __func__ << " forward to " << fw << dendl; + forward(center, fw, std::move(p)); + } else { + auto h = eh->ntoh(); + auto from = h.src_mac; + p.trim_front(sizeof(*eh)); + // avoid chaining, since queue length is unlimited + // drop instead. + if (l3.ready()) { + return l3.packet_stream.produce(std::move(p), from); + } + } + } + } + return 0; +} + +class C_arp_learn : public EventCallback { + DPDKWorker *worker; + ethernet_address l2_addr; + ipv4_address l3_addr; + + public: + C_arp_learn(DPDKWorker *w, ethernet_address l2, ipv4_address l3) + : worker(w), l2_addr(l2), l3_addr(l3) {} + void do_request(uint64_t id) { + worker->arp_learn(l2_addr, l3_addr); + delete this; + } +}; + +void interface::arp_learn(ethernet_address l2, ipv4_address l3) +{ + for (auto &&w : _dev->workers) { + w->center.dispatch_event_external( + new C_arp_learn(w, l2, l3)); + } +} + +l3_protocol::l3_protocol(interface* netif, eth_protocol_num proto_num, packet_provider_type func) + : _netif(netif), _proto_num(proto_num) { + _netif->register_packet_provider(std::move(func)); +} + +subscription<Packet, ethernet_address> l3_protocol::receive( + std::function<int (Packet, ethernet_address)> rx_fn, + std::function<bool (forward_hash &h, Packet &p, size_t s)> forward) { + return _netif->register_l3(_proto_num, std::move(rx_fn), std::move(forward)); +}; diff --git a/src/msg/async/dpdk/net.h b/src/msg/async/dpdk/net.h new file mode 100644 index 00000000..63f0422b --- /dev/null +++ b/src/msg/async/dpdk/net.h @@ -0,0 +1,138 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_DPDK_NET_H +#define CEPH_MSG_DPDK_NET_H + +#include "const.h" +#include "ethernet.h" +#include "Packet.h" +#include "stream.h" +#include "toeplitz.h" + +struct hw_features { + // Enable tx ip header checksum offload + bool tx_csum_ip_offload = false; + // Enable tx l4 (TCP or UDP) checksum offload + bool tx_csum_l4_offload = false; + // Enable rx checksum offload + bool rx_csum_offload = false; + // LRO is enabled + bool rx_lro = false; + // Enable tx TCP segment offload + bool tx_tso = false; + // Enable tx UDP fragmentation offload + bool tx_ufo = false; + // Maximum Transmission Unit + uint16_t mtu = 1500; + // Maximun packet len when TCP/UDP offload is enabled + uint16_t max_packet_len = ip_packet_len_max - eth_hdr_len; +}; + +class forward_hash { + uint8_t data[64]; + size_t end_idx = 0; + public: + size_t size() const { + return end_idx; + } + void push_back(uint8_t b) { + ceph_assert(end_idx < sizeof(data)); + data[end_idx++] = b; + } + void push_back(uint16_t b) { + push_back(uint8_t(b)); + push_back(uint8_t(b >> 8)); + } + void push_back(uint32_t b) { + push_back(uint16_t(b)); + push_back(uint16_t(b >> 16)); + } + const uint8_t& operator[](size_t idx) const { + return data[idx]; + } +}; + +class interface; + +class l3_protocol { + public: + struct l3packet { + eth_protocol_num proto_num; + ethernet_address to; + Packet p; + }; + using packet_provider_type = std::function<Tub<l3packet> ()>; + + private: + interface* _netif; + eth_protocol_num _proto_num; + + public: + explicit l3_protocol(interface* netif, eth_protocol_num proto_num, packet_provider_type func); + subscription<Packet, ethernet_address> receive( + std::function<int (Packet, ethernet_address)> rx_fn, + std::function<bool (forward_hash &h, Packet &p, size_t s)> forward); + + private: + friend class interface; +}; + +class DPDKDevice; +struct ipv4_address; + +class interface { + CephContext *cct; + struct l3_rx_stream { + stream<Packet, ethernet_address> packet_stream; + std::function<bool (forward_hash&, Packet&, size_t)> forward; + bool ready() { return packet_stream.started(); } + explicit l3_rx_stream(std::function<bool (forward_hash&, Packet&, size_t)>&& fw) : forward(fw) {} + }; + std::unordered_map<uint16_t, l3_rx_stream> _proto_map; + std::shared_ptr<DPDKDevice> _dev; + subscription<Packet> _rx; + ethernet_address _hw_address; + struct hw_features _hw_features; + std::vector<l3_protocol::packet_provider_type> _pkt_providers; + + private: + int dispatch_packet(EventCenter *c, Packet p); + public: + explicit interface(CephContext *cct, std::shared_ptr<DPDKDevice> dev, EventCenter *center); + ethernet_address hw_address() { return _hw_address; } + const struct hw_features& get_hw_features() const { return _hw_features; } + subscription<Packet, ethernet_address> register_l3( + eth_protocol_num proto_num, + std::function<int (Packet, ethernet_address)> next, + std::function<bool (forward_hash&, Packet&, size_t)> forward); + void forward(EventCenter *source, unsigned target, Packet p); + unsigned hash2cpu(uint32_t hash); + void register_packet_provider(l3_protocol::packet_provider_type func) { + _pkt_providers.push_back(std::move(func)); + } + const rss_key_type& rss_key() const; + uint16_t hw_queues_count() const; + void arp_learn(ethernet_address l2, ipv4_address l3); + friend class l3_protocol; +}; + +#endif //CEPH_MSG_DPDK_NET_H diff --git a/src/msg/async/dpdk/queue.h b/src/msg/async/dpdk/queue.h new file mode 100644 index 00000000..984ddca1 --- /dev/null +++ b/src/msg/async/dpdk/queue.h @@ -0,0 +1,96 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_DPDK_QUEUE_H_ +#define CEPH_MSG_DPDK_QUEUE_H_ + +#include <queue> + +#include "circular_buffer.h" + +template <typename T> +class queue { + std::queue<T, circular_buffer<T>> _q; + size_t _max; + + public: + explicit queue(size_t size): _max(size) {} + + // Push an item. + // + // Returns false if the queue was full and the item was not pushed. + bool push(T&& a); + + // pops an item. + T pop(); + + // Consumes items from the queue, passing them to @func, until @func + // returns false or the queue it empty + // + // Returns false if func returned false. + template <typename Func> + bool consume(Func&& func); + + // Returns true when the queue is empty. + bool empty() const; + + // Returns true when the queue is full. + bool full() const; + + size_t size() const { return _q.size(); } + + // Destroy any items in the queue + void clear() { + while (!_q.empty()) { + _q.pop(); + } + } +}; + +template <typename T> +inline bool queue<T>::push(T&& data) { + if (_q.size() < _max) { + _q.push(std::move(data)); + notify_not_empty(); + return true; + } else { + return false; + } +} + +template <typename T> +inline T queue<T>::pop() { + T data = std::move(_q.front()); + _q.pop(); + return data; +} + +template <typename T> +inline bool queue<T>::empty() const { + return _q.empty(); +} + +template <typename T> +inline bool queue<T>::full() const { + return _q.size() == _max; +} + +#endif /* CEPH_MSG_DPDK_QUEUE_H_ */ diff --git a/src/msg/async/dpdk/shared_ptr.h b/src/msg/async/dpdk/shared_ptr.h new file mode 100644 index 00000000..d078063b --- /dev/null +++ b/src/msg/async/dpdk/shared_ptr.h @@ -0,0 +1,391 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:4; indent-tabs-mode:nil -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_LW_SHARED_PTR_H_ +#define CEPH_LW_SHARED_PTR_H_ + +#include <utility> +#include <type_traits> +#include <functional> +#include <iostream> + +// This header defines a shared pointer facility, lw_shared_ptr<>, +// modeled after std::shared_ptr<>. +// +// Unlike std::shared_ptr<>, this implementation is thread +// safe, and two pointers sharing the same object must not be used in +// different threads. +// +// lw_shared_ptr<> is the more lightweight variant, with a lw_shared_ptr<> +// occupying just one machine word, and adding just one word to the shared +// object. However, it does not support polymorphism. +// +// It supports shared_from_this() via enable_shared_from_this<> +// and lw_enable_shared_from_this<>(). +// + +template <typename T> +class lw_shared_ptr; + +template <typename T> +class enable_lw_shared_from_this; + +template <typename T> +class enable_shared_from_this; + +template <typename T, typename... A> +lw_shared_ptr<T> make_lw_shared(A&&... a); + +template <typename T> +lw_shared_ptr<T> make_lw_shared(T&& a); + +template <typename T> +lw_shared_ptr<T> make_lw_shared(T& a); + +struct lw_shared_ptr_counter_base { + long _count = 0; +}; + + +namespace internal { + +template <class T, class U> +struct lw_shared_ptr_accessors; + +template <class T> +struct lw_shared_ptr_accessors_esft; + +template <class T> +struct lw_shared_ptr_accessors_no_esft; + +} + + +// We want to support two use cases for shared_ptr<T>: +// +// 1. T is any type (primitive or class type) +// +// 2. T is a class type that inherits from enable_shared_from_this<T>. +// +// In the first case, we must wrap T in an object containing the counter, +// since T may be a primitive type and cannot be a base class. +// +// In the second case, we want T to reach the counter through its +// enable_shared_from_this<> base class, so that we can implement +// shared_from_this(). +// +// To implement those two conflicting requirements (T alongside its counter; +// T inherits from an object containing the counter) we use std::conditional<> +// and some accessor functions to select between two implementations. + + +// CRTP from this to enable shared_from_this: +template <typename T> +class enable_lw_shared_from_this : private lw_shared_ptr_counter_base { + using ctor = T; +protected: + enable_lw_shared_from_this() noexcept {} + enable_lw_shared_from_this(enable_lw_shared_from_this&&) noexcept {} + enable_lw_shared_from_this(const enable_lw_shared_from_this&) noexcept {} + enable_lw_shared_from_this& operator=(const enable_lw_shared_from_this&) noexcept { return *this; } + enable_lw_shared_from_this& operator=(enable_lw_shared_from_this&&) noexcept { return *this; } +public: + lw_shared_ptr<T> shared_from_this(); + lw_shared_ptr<const T> shared_from_this() const; + + template <typename X> + friend class lw_shared_ptr; + template <typename X> + friend class ::internal::lw_shared_ptr_accessors_esft; + template <typename X, class Y> + friend class ::internal::lw_shared_ptr_accessors; +}; + +template <typename T> +struct shared_ptr_no_esft : private lw_shared_ptr_counter_base { + T _value; + + shared_ptr_no_esft() = default; + shared_ptr_no_esft(const T& x) : _value(x) {} + shared_ptr_no_esft(T&& x) : _value(std::move(x)) {} + template <typename... A> + shared_ptr_no_esft(A&&... a) : _value(std::forward<A>(a)...) {} + + template <typename X> + friend class lw_shared_ptr; + template <typename X> + friend class ::internal::lw_shared_ptr_accessors_no_esft; + template <typename X, class Y> + friend class ::internal::lw_shared_ptr_accessors; +}; + + +/// Extension point: the user may override this to change how \ref lw_shared_ptr objects are destroyed, +/// primarily so that incomplete classes can be used. +/// +/// Customizing the deleter requires that \c T be derived from \c enable_lw_shared_from_this<T>. +/// The specialization must be visible for all uses of \c lw_shared_ptr<T>. +/// +/// To customize, the template must have a `static void dispose(T*)` operator that disposes of +/// the object. +template <typename T> +struct lw_shared_ptr_deleter; // No generic implementation + +namespace internal { + +template <typename T> +struct lw_shared_ptr_accessors_esft { + using concrete_type = std::remove_const_t<T>; + static T* to_value(lw_shared_ptr_counter_base* counter) { + return static_cast<T*>(counter); + } + static void dispose(lw_shared_ptr_counter_base* counter) { + delete static_cast<T*>(counter); + } + static void instantiate_to_value(lw_shared_ptr_counter_base* p) { + // since to_value() is defined above, we don't need to do anything special + // to force-instantiate it + } +}; + +template <typename T> +struct lw_shared_ptr_accessors_no_esft { + using concrete_type = shared_ptr_no_esft<T>; + static T* to_value(lw_shared_ptr_counter_base* counter) { + return &static_cast<concrete_type*>(counter)->_value; + } + static void dispose(lw_shared_ptr_counter_base* counter) { + delete static_cast<concrete_type*>(counter); + } + static void instantiate_to_value(lw_shared_ptr_counter_base* p) { + // since to_value() is defined above, we don't need to do anything special + // to force-instantiate it + } +}; + +// Generic case: lw_shared_ptr_deleter<T> is not specialized, select +// implementation based on whether T inherits from enable_lw_shared_from_this<T>. +template <typename T, typename U = void> +struct lw_shared_ptr_accessors : std::conditional_t< + std::is_base_of<enable_lw_shared_from_this<T>, T>::value, + lw_shared_ptr_accessors_esft<T>, + lw_shared_ptr_accessors_no_esft<T>> { +}; + +// Overload when lw_shared_ptr_deleter<T> specialized +template <typename T> +struct lw_shared_ptr_accessors<T, std::void_t<decltype(lw_shared_ptr_deleter<T>{})>> { + using concrete_type = T; + static T* to_value(lw_shared_ptr_counter_base* counter); + static void dispose(lw_shared_ptr_counter_base* counter) { + lw_shared_ptr_deleter<T>::dispose(to_value(counter)); + } + static void instantiate_to_value(lw_shared_ptr_counter_base* p) { + // instantiate to_value(); must be defined by shared_ptr_incomplete.hh + to_value(p); + } +}; + +} + +template <typename T> +class lw_shared_ptr { + using accessors = ::internal::lw_shared_ptr_accessors<std::remove_const_t<T>>; + using concrete_type = typename accessors::concrete_type; + mutable lw_shared_ptr_counter_base* _p = nullptr; +private: + lw_shared_ptr(lw_shared_ptr_counter_base* p) noexcept : _p(p) { + if (_p) { + ++_p->_count; + } + } + template <typename... A> + static lw_shared_ptr make(A&&... a) { + auto p = new concrete_type(std::forward<A>(a)...); + accessors::instantiate_to_value(p); + return lw_shared_ptr(p); + } +public: + using element_type = T; + + lw_shared_ptr() noexcept = default; + lw_shared_ptr(std::nullptr_t) noexcept : lw_shared_ptr() {} + lw_shared_ptr(const lw_shared_ptr& x) noexcept : _p(x._p) { + if (_p) { + ++_p->_count; + } + } + lw_shared_ptr(lw_shared_ptr&& x) noexcept : _p(x._p) { + x._p = nullptr; + } + [[gnu::always_inline]] + ~lw_shared_ptr() { + if (_p && !--_p->_count) { + accessors::dispose(_p); + } + } + lw_shared_ptr& operator=(const lw_shared_ptr& x) noexcept { + if (_p != x._p) { + this->~lw_shared_ptr(); + new (this) lw_shared_ptr(x); + } + return *this; + } + lw_shared_ptr& operator=(lw_shared_ptr&& x) noexcept { + if (_p != x._p) { + this->~lw_shared_ptr(); + new (this) lw_shared_ptr(std::move(x)); + } + return *this; + } + lw_shared_ptr& operator=(std::nullptr_t) noexcept { + return *this = lw_shared_ptr(); + } + lw_shared_ptr& operator=(T&& x) noexcept { + this->~lw_shared_ptr(); + new (this) lw_shared_ptr(make_lw_shared<T>(std::move(x))); + return *this; + } + + T& operator*() const noexcept { return *accessors::to_value(_p); } + T* operator->() const noexcept { return accessors::to_value(_p); } + T* get() const noexcept { + if (_p) { + return accessors::to_value(_p); + } else { + return nullptr; + } + } + + long int use_count() const noexcept { + if (_p) { + return _p->_count; + } else { + return 0; + } + } + + operator lw_shared_ptr<const T>() const noexcept { + return lw_shared_ptr<const T>(_p); + } + + explicit operator bool() const noexcept { + return _p; + } + + bool owned() const noexcept { + return _p->_count == 1; + } + + bool operator==(const lw_shared_ptr<const T>& x) const { + return _p == x._p; + } + + bool operator!=(const lw_shared_ptr<const T>& x) const { + return !operator==(x); + } + + bool operator==(const lw_shared_ptr<std::remove_const_t<T>>& x) const { + return _p == x._p; + } + + bool operator!=(const lw_shared_ptr<std::remove_const_t<T>>& x) const { + return !operator==(x); + } + + bool operator<(const lw_shared_ptr<const T>& x) const { + return _p < x._p; + } + + bool operator<(const lw_shared_ptr<std::remove_const_t<T>>& x) const { + return _p < x._p; + } + + template <typename U> + friend class lw_shared_ptr; + + template <typename X, typename... A> + friend lw_shared_ptr<X> make_lw_shared(A&&...); + + template <typename U> + friend lw_shared_ptr<U> make_lw_shared(U&&); + + template <typename U> + friend lw_shared_ptr<U> make_lw_shared(U&); + + template <typename U> + friend class enable_lw_shared_from_this; +}; + +template <typename T, typename... A> +inline +lw_shared_ptr<T> make_lw_shared(A&&... a) { + return lw_shared_ptr<T>::make(std::forward<A>(a)...); +} + +template <typename T> +inline +lw_shared_ptr<T> make_lw_shared(T&& a) { + return lw_shared_ptr<T>::make(std::move(a)); +} + +template <typename T> +inline +lw_shared_ptr<T> make_lw_shared(T& a) { + return lw_shared_ptr<T>::make(a); +} + +template <typename T> +inline +lw_shared_ptr<T> +enable_lw_shared_from_this<T>::shared_from_this() { + return lw_shared_ptr<T>(this); +} + +template <typename T> +inline +lw_shared_ptr<const T> +enable_lw_shared_from_this<T>::shared_from_this() const { + return lw_shared_ptr<const T>(const_cast<enable_lw_shared_from_this*>(this)); +} + +template <typename T> +static inline +std::ostream& operator<<(std::ostream& out, const lw_shared_ptr<T>& p) { + if (!p) { + return out << "null"; + } + return out << *p; +} + +namespace std { + + template <typename T> + struct hash<lw_shared_ptr<T>> : private hash<T*> { + size_t operator()(const lw_shared_ptr<T>& p) const { + return hash<T*>::operator()(p.get()); + } + }; + +} + +#endif /* CEPH_LW_SHARED_PTR_H_ */ diff --git a/src/msg/async/dpdk/stream.h b/src/msg/async/dpdk/stream.h new file mode 100644 index 00000000..1898e8f8 --- /dev/null +++ b/src/msg/async/dpdk/stream.h @@ -0,0 +1,155 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_STREAM_H_ +#define CEPH_MSG_STREAM_H_ + +#include <exception> +#include <cassert> + +// A stream<> is the producer side. It may call produce() as long +// as the returned from the previous invocation is ready. +// To signify no more data is available, call close(). +// +// A subscription<> is the consumer side. It is created by a call +// to stream::listen(). Calling subscription::start(), +// which registers the data processing callback, starts processing +// events. It may register for end-of-stream notifications by +// return the when_done() future, which also delivers error +// events (as exceptions). +// +// The consumer can pause generation of new data by returning +// positive integer; when it becomes ready, the producer +// will resume processing. + +template <typename... T> +class subscription; + +template <typename... T> +class stream { + subscription<T...>* _sub = nullptr; + int done; + bool ready; + public: + using next_fn = std::function<int (T...)>; + stream() = default; + stream(const stream&) = delete; + stream(stream&&) = delete; + ~stream() { + if (_sub) { + _sub->_stream = nullptr; + } + } + + void operator=(const stream&) = delete; + void operator=(stream&&) = delete; + + // Returns a subscription that reads value from this + // stream. + subscription<T...> listen() { + return subscription<T...>(this); + } + + // Returns a subscription that reads value from this + // stream, and also sets up the listen function. + subscription<T...> listen(next_fn next) { + auto sub = subscription<T...>(this); + sub.start(std::move(next)); + return sub; + } + + // Becomes ready when the listener is ready to accept + // values. Call only once, when beginning to produce + // values. + bool started() { + return ready; + } + + // Produce a value. Call only after started(), and after + // a previous produce() is ready. + int produce(T... data) { + return _sub->_next(std::move(data)...); + } + + // End the stream. Call only after started(), and after + // a previous produce() is ready. No functions may be called + // after this. + void close() { + done = 1; + } + + // Signal an error. Call only after started(), and after + // a previous produce() is ready. No functions may be called + // after this. + void set_exception(int error) { + done = error; + } + private: + void start(); + friend class subscription<T...>; +}; + +template <typename... T> +class subscription { + public: + using next_fn = typename stream<T...>::next_fn; + private: + stream<T...>* _stream; + next_fn _next; + private: + explicit subscription(stream<T...>* s): _stream(s) { + ceph_assert(!_stream->_sub); + _stream->_sub = this; + } + + public: + subscription(subscription&& x) + : _stream(x._stream), _next(std::move(x._next)) { + x._stream = nullptr; + if (_stream) { + _stream->_sub = this; + } + } + ~subscription() { + if (_stream) { + _stream->_sub = nullptr; + } + } + + /// \brief Start receiving events from the stream. + /// + /// \param next Callback to call for each event + void start(std::function<int (T...)> next) { + _next = std::move(next); + _stream->ready = true; + } + + // Becomes ready when the stream is empty, or when an error + // happens (in that case, an exception is held). + int done() { + return _stream->done; + } + + friend class stream<T...>; +}; + +#endif /* CEPH_MSG_STREAM_H_ */ diff --git a/src/msg/async/dpdk/toeplitz.h b/src/msg/async/dpdk/toeplitz.h new file mode 100644 index 00000000..3ca38808 --- /dev/null +++ b/src/msg/async/dpdk/toeplitz.h @@ -0,0 +1,92 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/*- + * Copyright (c) 2010 David Malone <dwmalone@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef CEPH_MSG_TOEPLITZ_H_ +#define CEPH_MSG_TOEPLITZ_H_ + +#include <vector> + +using rss_key_type = std::vector<uint8_t>; + +// Mellanox Linux's driver key +static const rss_key_type default_rsskey_40bytes = { + 0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b, + 0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb, + 0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c, + 0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9, + 0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc +}; + +// Intel's i40e PMD default RSS key +static const rss_key_type default_rsskey_52bytes = { + 0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23, + 0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30, + 0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02, + 0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c, + 0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55, + 0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e, + 0x81, 0x15, 0x03, 0x66 +}; + +template<typename T> +static inline uint32_t toeplitz_hash(const rss_key_type& key, const T& data) +{ + uint32_t hash = 0, v; + u_int i, b; + + /* XXXRW: Perhaps an assertion about key length vs. data length? */ + + v = (key[0]<<24) + (key[1]<<16) + (key[2] <<8) + key[3]; + for (i = 0; i < data.size(); i++) { + for (b = 0; b < 8; b++) { + if (data[i] & (1<<(7-b))) + hash ^= v; + v <<= 1; + if ((i + 4) < key.size() && + (key[i+4] & (1<<(7-b)))) + v |= 1; + } + } + return (hash); +} +#endif diff --git a/src/msg/async/dpdk/transfer.h b/src/msg/async/dpdk/transfer.h new file mode 100644 index 00000000..599db5bd --- /dev/null +++ b/src/msg/async/dpdk/transfer.h @@ -0,0 +1,64 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_TRANSFER_H_ +#define CEPH_TRANSFER_H_ + +// Helper functions for copying or moving multiple objects in an exception +// safe manner, then destroying the sources. +// +// To transfer, call transfer_pass1(allocator, &from, &to) on all object pairs, +// (this copies the object from @from to @to). If no exceptions are encountered, +// call transfer_pass2(allocator, &from, &to). This destroys the object at the +// origin. If exceptions were encountered, simply destroy all copied objects. +// +// As an optimization, if the objects are moveable without throwing (noexcept) +// transfer_pass1() simply moves the objects and destroys the source, and +// transfer_pass2() does nothing. + +#include <type_traits> +#include <utility> + +template <typename T, typename Alloc> +inline void transfer_pass1(Alloc& a, T* from, T* to, + typename std::enable_if<std::is_nothrow_move_constructible<T>::value>::type* = nullptr) { + a.construct(to, std::move(*from)); + a.destroy(from); +} + +template <typename T, typename Alloc> +inline void transfer_pass2(Alloc& a, T* from, T* to, + typename std::enable_if<std::is_nothrow_move_constructible<T>::value>::type* = nullptr) { +} + +template <typename T, typename Alloc> +inline void transfer_pass1(Alloc& a, T* from, T* to, + typename std::enable_if<!std::is_nothrow_move_constructible<T>::value>::type* = nullptr) { + a.construct(to, *from); +} + +template <typename T, typename Alloc> +inline void transfer_pass2(Alloc& a, T* from, T* to, + typename std::enable_if<!std::is_nothrow_move_constructible<T>::value>::type* = nullptr) { + a.destroy(from); +} + +#endif /* CEPH_TRANSFER_H_ */ diff --git a/src/msg/async/frames_v2.cc b/src/msg/async/frames_v2.cc new file mode 100644 index 00000000..f047eb18 --- /dev/null +++ b/src/msg/async/frames_v2.cc @@ -0,0 +1,480 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2020 Red Hat + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "frames_v2.h" + +#include <ostream> + +#undef FMT_HEADER_ONLY +#define FMT_HEADER_ONLY 1 +#include "seastar/fmt/include/fmt/format.h" + +namespace ceph::msgr::v2 { + +// Unpads bufferlist to unpadded_len. +static void unpad_zero(bufferlist& bl, uint32_t unpadded_len) { + ceph_assert(bl.length() >= unpadded_len); + if (bl.length() > unpadded_len) { + bl.splice(unpadded_len, bl.length() - unpadded_len); + } +} + +// Discards trailing empty segments, unless there is just one segment. +// A frame always has at least one (possibly empty) segment. +static size_t calc_num_segments(const bufferlist segment_bls[], + size_t segment_count) { + ceph_assert(segment_count > 0 && segment_count <= MAX_NUM_SEGMENTS); + for (size_t i = segment_count; i-- > 0; ) { + if (segment_bls[i].length() > 0) { + return i + 1; + } + } + return 1; +} + +static void check_segment_crc(const bufferlist& segment_bl, + uint32_t expected_crc) { + uint32_t crc = segment_bl.crc32c(-1); + if (crc != expected_crc) { + throw FrameError(fmt::format( + "bad segment crc calculated={} expected={}", crc, expected_crc)); + } +} + +// Returns true if the frame is ready for dispatching, or false if +// it was aborted by the sender and must be dropped. +static bool check_epilogue_late_status(__u8 late_status) { + __u8 aborted = late_status & FRAME_LATE_STATUS_ABORTED_MASK; + if (aborted != FRAME_LATE_STATUS_ABORTED && + aborted != FRAME_LATE_STATUS_COMPLETE) { + throw FrameError(fmt::format("bad late_status")); + } + return aborted == FRAME_LATE_STATUS_COMPLETE; +} + +void FrameAssembler::fill_preamble(Tag tag, + preamble_block_t& preamble) const { + // FIPS zeroization audit 20191115: this memset is not security related. + ::memset(&preamble, 0, sizeof(preamble)); + + preamble.tag = static_cast<__u8>(tag); + for (size_t i = 0; i < m_descs.size(); i++) { + preamble.segments[i].length = m_descs[i].logical_len; + preamble.segments[i].alignment = m_descs[i].align; + } + preamble.num_segments = m_descs.size(); + preamble.crc = ceph_crc32c( + 0, reinterpret_cast<const unsigned char*>(&preamble), + sizeof(preamble) - sizeof(preamble.crc)); +} + +uint64_t FrameAssembler::get_frame_logical_len() const { + ceph_assert(!m_descs.empty()); + uint64_t logical_len = 0; + for (size_t i = 0; i < m_descs.size(); i++) { + logical_len += m_descs[i].logical_len; + } + return logical_len; +} + +uint64_t FrameAssembler::get_frame_onwire_len() const { + ceph_assert(!m_descs.empty()); + uint64_t onwire_len = get_preamble_onwire_len(); + for (size_t i = 0; i < m_descs.size(); i++) { + onwire_len += get_segment_onwire_len(i); + } + onwire_len += get_epilogue_onwire_len(); + return onwire_len; +} + +bufferlist FrameAssembler::asm_crc_rev0(const preamble_block_t& preamble, + bufferlist segment_bls[]) const { + epilogue_crc_rev0_block_t epilogue; + // FIPS zeroization audit 20191115: this memset is not security related. + ::memset(&epilogue, 0, sizeof(epilogue)); + + bufferlist frame_bl(sizeof(preamble) + sizeof(epilogue)); + frame_bl.append(reinterpret_cast<const char*>(&preamble), sizeof(preamble)); + for (size_t i = 0; i < m_descs.size(); i++) { + ceph_assert(segment_bls[i].length() == m_descs[i].logical_len); + epilogue.crc_values[i] = segment_bls[i].crc32c(-1); + if (segment_bls[i].length() > 0) { + frame_bl.claim_append(segment_bls[i]); + } + } + frame_bl.append(reinterpret_cast<const char*>(&epilogue), sizeof(epilogue)); + return frame_bl; +} + +bufferlist FrameAssembler::asm_secure_rev0(const preamble_block_t& preamble, + bufferlist segment_bls[]) const { + bufferlist preamble_bl(sizeof(preamble)); + preamble_bl.append(reinterpret_cast<const char*>(&preamble), + sizeof(preamble)); + + epilogue_secure_rev0_block_t epilogue; + // FIPS zeroization audit 20191115: this memset is not security related. + ::memset(&epilogue, 0, sizeof(epilogue)); + bufferlist epilogue_bl(sizeof(epilogue)); + epilogue_bl.append(reinterpret_cast<const char*>(&epilogue), + sizeof(epilogue)); + + // preamble + MAX_NUM_SEGMENTS + epilogue + uint32_t onwire_lens[MAX_NUM_SEGMENTS + 2]; + onwire_lens[0] = preamble_bl.length(); + for (size_t i = 0; i < m_descs.size(); i++) { + onwire_lens[i + 1] = segment_bls[i].length(); // already padded + } + onwire_lens[m_descs.size() + 1] = epilogue_bl.length(); + m_crypto->tx->reset_tx_handler(onwire_lens, + onwire_lens + m_descs.size() + 2); + m_crypto->tx->authenticated_encrypt_update(preamble_bl); + for (size_t i = 0; i < m_descs.size(); i++) { + if (segment_bls[i].length() > 0) { + m_crypto->tx->authenticated_encrypt_update(segment_bls[i]); + } + } + m_crypto->tx->authenticated_encrypt_update(epilogue_bl); + return m_crypto->tx->authenticated_encrypt_final(); +} + +bufferlist FrameAssembler::asm_crc_rev1(const preamble_block_t& preamble, + bufferlist segment_bls[]) const { + epilogue_crc_rev1_block_t epilogue; + // FIPS zeroization audit 20191115: this memset is not security related. + ::memset(&epilogue, 0, sizeof(epilogue)); + epilogue.late_status |= FRAME_LATE_STATUS_COMPLETE; + + bufferlist frame_bl(sizeof(preamble) + FRAME_CRC_SIZE + sizeof(epilogue)); + frame_bl.append(reinterpret_cast<const char*>(&preamble), sizeof(preamble)); + + ceph_assert(segment_bls[0].length() == m_descs[0].logical_len); + if (segment_bls[0].length() > 0) { + uint32_t crc = segment_bls[0].crc32c(-1); + frame_bl.claim_append(segment_bls[0]); + encode(crc, frame_bl); + } + if (m_descs.size() == 1) { + return frame_bl; // no epilogue if only one segment + } + + for (size_t i = 1; i < m_descs.size(); i++) { + ceph_assert(segment_bls[i].length() == m_descs[i].logical_len); + epilogue.crc_values[i - 1] = segment_bls[i].crc32c(-1); + if (segment_bls[i].length() > 0) { + frame_bl.claim_append(segment_bls[i]); + } + } + frame_bl.append(reinterpret_cast<const char*>(&epilogue), sizeof(epilogue)); + return frame_bl; +} + +bufferlist FrameAssembler::asm_secure_rev1(const preamble_block_t& preamble, + bufferlist segment_bls[]) const { + bufferlist preamble_bl; + if (segment_bls[0].length() > FRAME_PREAMBLE_INLINE_SIZE) { + // first segment is partially inlined, inline buffer is full + preamble_bl.reserve(sizeof(preamble)); + preamble_bl.append(reinterpret_cast<const char*>(&preamble), + sizeof(preamble)); + segment_bls[0].splice(0, FRAME_PREAMBLE_INLINE_SIZE, &preamble_bl); + } else { + // first segment is fully inlined, inline buffer may need padding + uint32_t pad_len = FRAME_PREAMBLE_INLINE_SIZE - segment_bls[0].length(); + preamble_bl.reserve(sizeof(preamble) + pad_len); + preamble_bl.append(reinterpret_cast<const char*>(&preamble), + sizeof(preamble)); + preamble_bl.claim_append(segment_bls[0]); + if (pad_len > 0) { + preamble_bl.append_zero(pad_len); + } + } + + m_crypto->tx->reset_tx_handler({preamble_bl.length()}); + m_crypto->tx->authenticated_encrypt_update(preamble_bl); + auto frame_bl = m_crypto->tx->authenticated_encrypt_final(); + + if (segment_bls[0].length() > 0) { + m_crypto->tx->reset_tx_handler({segment_bls[0].length()}); + m_crypto->tx->authenticated_encrypt_update(segment_bls[0]); + auto tmp = m_crypto->tx->authenticated_encrypt_final(); + frame_bl.claim_append(tmp); + } + if (m_descs.size() == 1) { + return frame_bl; // no epilogue if only one segment + } + + epilogue_secure_rev1_block_t epilogue; + // FIPS zeroization audit 20191115: this memset is not security related. + ::memset(&epilogue, 0, sizeof(epilogue)); + epilogue.late_status |= FRAME_LATE_STATUS_COMPLETE; + bufferlist epilogue_bl(sizeof(epilogue)); + epilogue_bl.append(reinterpret_cast<const char*>(&epilogue), + sizeof(epilogue)); + + // MAX_NUM_SEGMENTS - 1 + epilogue + uint32_t onwire_lens[MAX_NUM_SEGMENTS]; + for (size_t i = 1; i < m_descs.size(); i++) { + onwire_lens[i - 1] = segment_bls[i].length(); // already padded + } + onwire_lens[m_descs.size() - 1] = epilogue_bl.length(); + m_crypto->tx->reset_tx_handler(onwire_lens, onwire_lens + m_descs.size()); + for (size_t i = 1; i < m_descs.size(); i++) { + if (segment_bls[i].length() > 0) { + m_crypto->tx->authenticated_encrypt_update(segment_bls[i]); + } + } + m_crypto->tx->authenticated_encrypt_update(epilogue_bl); + auto tmp = m_crypto->tx->authenticated_encrypt_final(); + frame_bl.claim_append(tmp); + return frame_bl; +} + +bufferlist FrameAssembler::assemble_frame(Tag tag, bufferlist segment_bls[], + const uint16_t segment_aligns[], + size_t segment_count) { + m_descs.resize(calc_num_segments(segment_bls, segment_count)); + for (size_t i = 0; i < m_descs.size(); i++) { + m_descs[i].logical_len = segment_bls[i].length(); + m_descs[i].align = segment_aligns[i]; + } + + preamble_block_t preamble; + fill_preamble(tag, preamble); + + if (m_crypto->rx) { + for (size_t i = 0; i < m_descs.size(); i++) { + ceph_assert(segment_bls[i].length() == m_descs[i].logical_len); + // We're padding segments to biggest cipher's block size. Although + // AES-GCM can live without that as it's a stream cipher, we don't + // want to be fixed to stream ciphers only. + uint32_t padded_len = get_segment_padded_len(i); + if (padded_len > segment_bls[i].length()) { + uint32_t pad_len = padded_len - segment_bls[i].length(); + segment_bls[i].reserve(pad_len); + segment_bls[i].append_zero(pad_len); + } + } + if (m_is_rev1) { + return asm_secure_rev1(preamble, segment_bls); + } + return asm_secure_rev0(preamble, segment_bls); + } + if (m_is_rev1) { + return asm_crc_rev1(preamble, segment_bls); + } + return asm_crc_rev0(preamble, segment_bls); +} + +Tag FrameAssembler::disassemble_preamble(bufferlist& preamble_bl) { + if (m_crypto->rx) { + m_crypto->rx->reset_rx_handler(); + if (m_is_rev1) { + ceph_assert(preamble_bl.length() == FRAME_PREAMBLE_WITH_INLINE_SIZE + + get_auth_tag_len()); + m_crypto->rx->authenticated_decrypt_update_final(preamble_bl); + } else { + ceph_assert(preamble_bl.length() == sizeof(preamble_block_t)); + m_crypto->rx->authenticated_decrypt_update(preamble_bl); + } + } else { + ceph_assert(preamble_bl.length() == sizeof(preamble_block_t)); + } + + // I expect ceph_le32 will make the endian conversion for me. Passing + // everything through ::Decode is unnecessary. + auto preamble = reinterpret_cast<const preamble_block_t*>( + preamble_bl.c_str()); + // check preamble crc before any further processing + uint32_t crc = ceph_crc32c( + 0, reinterpret_cast<const unsigned char*>(preamble), + sizeof(*preamble) - sizeof(preamble->crc)); + if (crc != preamble->crc) { + throw FrameError(fmt::format( + "bad preamble crc calculated={} expected={}", crc, preamble->crc)); + } + + // see calc_num_segments() + if (preamble->num_segments < 1 || + preamble->num_segments > MAX_NUM_SEGMENTS) { + throw FrameError(fmt::format( + "bad number of segments num_segments={}", preamble->num_segments)); + } + if (preamble->num_segments > 1 && + preamble->segments[preamble->num_segments - 1].length == 0) { + throw FrameError("last segment empty"); + } + + m_descs.resize(preamble->num_segments); + for (size_t i = 0; i < m_descs.size(); i++) { + m_descs[i].logical_len = preamble->segments[i].length; + m_descs[i].align = preamble->segments[i].alignment; + } + return static_cast<Tag>(preamble->tag); +} + +bool FrameAssembler::disasm_all_crc_rev0(bufferlist segment_bls[], + bufferlist& epilogue_bl) const { + ceph_assert(epilogue_bl.length() == sizeof(epilogue_crc_rev0_block_t)); + auto epilogue = reinterpret_cast<const epilogue_crc_rev0_block_t*>( + epilogue_bl.c_str()); + + for (size_t i = 0; i < m_descs.size(); i++) { + ceph_assert(segment_bls[i].length() == m_descs[i].logical_len); + check_segment_crc(segment_bls[i], epilogue->crc_values[i]); + } + return !(epilogue->late_flags & FRAME_LATE_FLAG_ABORTED); +} + +bool FrameAssembler::disasm_all_secure_rev0(bufferlist segment_bls[], + bufferlist& epilogue_bl) const { + for (size_t i = 0; i < m_descs.size(); i++) { + ceph_assert(segment_bls[i].length() == get_segment_padded_len(i)); + if (segment_bls[i].length() > 0) { + m_crypto->rx->authenticated_decrypt_update(segment_bls[i]); + unpad_zero(segment_bls[i], m_descs[i].logical_len); + } + } + + ceph_assert(epilogue_bl.length() == sizeof(epilogue_secure_rev0_block_t) + + get_auth_tag_len()); + m_crypto->rx->authenticated_decrypt_update_final(epilogue_bl); + auto epilogue = reinterpret_cast<const epilogue_secure_rev0_block_t*>( + epilogue_bl.c_str()); + return !(epilogue->late_flags & FRAME_LATE_FLAG_ABORTED); +} + +void FrameAssembler::disasm_first_crc_rev1(bufferlist& preamble_bl, + bufferlist& segment_bl) const { + ceph_assert(preamble_bl.length() == sizeof(preamble_block_t)); + if (m_descs[0].logical_len > 0) { + ceph_assert(segment_bl.length() == m_descs[0].logical_len + + FRAME_CRC_SIZE); + bufferlist::const_iterator it(&segment_bl, m_descs[0].logical_len); + uint32_t expected_crc; + decode(expected_crc, it); + segment_bl.splice(m_descs[0].logical_len, FRAME_CRC_SIZE); + check_segment_crc(segment_bl, expected_crc); + } else { + ceph_assert(segment_bl.length() == 0); + } +} + +bool FrameAssembler::disasm_remaining_crc_rev1(bufferlist segment_bls[], + bufferlist& epilogue_bl) const { + ceph_assert(epilogue_bl.length() == sizeof(epilogue_crc_rev1_block_t)); + auto epilogue = reinterpret_cast<const epilogue_crc_rev1_block_t*>( + epilogue_bl.c_str()); + + for (size_t i = 1; i < m_descs.size(); i++) { + ceph_assert(segment_bls[i].length() == m_descs[i].logical_len); + check_segment_crc(segment_bls[i], epilogue->crc_values[i - 1]); + } + return check_epilogue_late_status(epilogue->late_status); +} + +void FrameAssembler::disasm_first_secure_rev1(bufferlist& preamble_bl, + bufferlist& segment_bl) const { + ceph_assert(preamble_bl.length() == FRAME_PREAMBLE_WITH_INLINE_SIZE); + uint32_t padded_len = get_segment_padded_len(0); + if (padded_len > FRAME_PREAMBLE_INLINE_SIZE) { + ceph_assert(segment_bl.length() == padded_len + get_auth_tag_len() - + FRAME_PREAMBLE_INLINE_SIZE); + m_crypto->rx->reset_rx_handler(); + m_crypto->rx->authenticated_decrypt_update_final(segment_bl); + // prepend the inline buffer (already decrypted) to segment_bl + bufferlist tmp; + segment_bl.swap(tmp); + preamble_bl.splice(sizeof(preamble_block_t), FRAME_PREAMBLE_INLINE_SIZE, + &segment_bl); + segment_bl.claim_append(tmp); + } else { + ceph_assert(segment_bl.length() == 0); + preamble_bl.splice(sizeof(preamble_block_t), FRAME_PREAMBLE_INLINE_SIZE, + &segment_bl); + } + unpad_zero(segment_bl, m_descs[0].logical_len); + ceph_assert(segment_bl.length() == m_descs[0].logical_len); +} + +bool FrameAssembler::disasm_remaining_secure_rev1( + bufferlist segment_bls[], bufferlist& epilogue_bl) const { + m_crypto->rx->reset_rx_handler(); + for (size_t i = 1; i < m_descs.size(); i++) { + ceph_assert(segment_bls[i].length() == get_segment_padded_len(i)); + if (segment_bls[i].length() > 0) { + m_crypto->rx->authenticated_decrypt_update(segment_bls[i]); + unpad_zero(segment_bls[i], m_descs[i].logical_len); + } + } + + ceph_assert(epilogue_bl.length() == sizeof(epilogue_secure_rev1_block_t) + + get_auth_tag_len()); + m_crypto->rx->authenticated_decrypt_update_final(epilogue_bl); + auto epilogue = reinterpret_cast<const epilogue_secure_rev1_block_t*>( + epilogue_bl.c_str()); + return check_epilogue_late_status(epilogue->late_status); +} + +void FrameAssembler::disassemble_first_segment(bufferlist& preamble_bl, + bufferlist& segment_bl) const { + ceph_assert(!m_descs.empty()); + if (m_is_rev1) { + if (m_crypto->rx) { + disasm_first_secure_rev1(preamble_bl, segment_bl); + } else { + disasm_first_crc_rev1(preamble_bl, segment_bl); + } + } else { + // noop, everything is handled in disassemble_remaining_segments() + } +} + +bool FrameAssembler::disassemble_remaining_segments( + bufferlist segment_bls[], bufferlist& epilogue_bl) const { + ceph_assert(!m_descs.empty()); + if (m_is_rev1) { + if (m_descs.size() == 1) { + // no epilogue if only one segment + ceph_assert(epilogue_bl.length() == 0); + return true; + } + if (m_crypto->rx) { + return disasm_remaining_secure_rev1(segment_bls, epilogue_bl); + } + return disasm_remaining_crc_rev1(segment_bls, epilogue_bl); + } + if (m_crypto->rx) { + return disasm_all_secure_rev0(segment_bls, epilogue_bl); + } + return disasm_all_crc_rev0(segment_bls, epilogue_bl); +} + +std::ostream& operator<<(std::ostream& os, const FrameAssembler& frame_asm) { + if (!frame_asm.m_descs.empty()) { + os << frame_asm.get_preamble_onwire_len(); + for (size_t i = 0; i < frame_asm.m_descs.size(); i++) { + os << " + " << frame_asm.get_segment_onwire_len(i) + << " (logical " << frame_asm.m_descs[i].logical_len + << "/" << frame_asm.m_descs[i].align << ")"; + } + os << " + " << frame_asm.get_epilogue_onwire_len() << " "; + } + os << "rev1=" << frame_asm.m_is_rev1 + << " rx=" << frame_asm.m_crypto->rx.get() + << " tx=" << frame_asm.m_crypto->tx.get(); + return os; +} + +} // namespace ceph::msgr::v2 diff --git a/src/msg/async/frames_v2.h b/src/msg/async/frames_v2.h new file mode 100644 index 00000000..88fa4e1b --- /dev/null +++ b/src/msg/async/frames_v2.h @@ -0,0 +1,842 @@ +#ifndef _MSG_ASYNC_FRAMES_V2_ +#define _MSG_ASYNC_FRAMES_V2_ + +#include "include/types.h" +#include "common/Clock.h" +#include "crypto_onwire.h" +#include <array> +#include <iosfwd> +#include <utility> + +#include <boost/container/static_vector.hpp> + +/** + * Protocol V2 Frame Structures + * + * Documentation in: doc/dev/msgr2.rst + **/ + +namespace ceph::msgr::v2 { + +// We require these features from any peer, period, in order to encode +// a entity_addrvec_t. +const uint64_t msgr2_required = CEPH_FEATUREMASK_MSG_ADDR2; + +// We additionally assume the peer has the below features *purely for +// the purpose of encoding the frames themselves*. The only complex +// types in the frames are entity_addr_t and entity_addrvec_t, and we +// specifically want the peer to understand the (new in nautilus) +// TYPE_ANY. We treat narrow this assumption to frames because we +// expect there may be future clients (the kernel) that understand +// msgr v2 and understand this encoding but don't necessarily have +// everything else that SERVER_NAUTILUS implies. Yes, a fresh feature +// bit would be a cleaner approach, but those are scarce these days. +const uint64_t msgr2_frame_assumed = + msgr2_required | + CEPH_FEATUREMASK_SERVER_NAUTILUS; + +enum class Tag : __u8 { + HELLO = 1, + AUTH_REQUEST, + AUTH_BAD_METHOD, + AUTH_REPLY_MORE, + AUTH_REQUEST_MORE, + AUTH_DONE, + AUTH_SIGNATURE, + CLIENT_IDENT, + SERVER_IDENT, + IDENT_MISSING_FEATURES, + SESSION_RECONNECT, + SESSION_RESET, + SESSION_RETRY, + SESSION_RETRY_GLOBAL, + SESSION_RECONNECT_OK, + WAIT, + MESSAGE, + KEEPALIVE2, + KEEPALIVE2_ACK, + ACK +}; + +struct segment_t { + // TODO: this will be dropped with support for `allocation policies`. + // We need them because of the rx_buffers zero-copy optimization. + static constexpr __le16 PAGE_SIZE_ALIGNMENT{4096}; + + static constexpr __le16 DEFAULT_ALIGNMENT = sizeof(void *); + + ceph_le32 length; + ceph_le16 alignment; +} __attribute__((packed)); + +struct SegmentIndex { + struct Msg { + static constexpr std::size_t HEADER = 0; + static constexpr std::size_t FRONT = 1; + static constexpr std::size_t MIDDLE = 2; + static constexpr std::size_t DATA = 3; + }; + + struct Control { + static constexpr std::size_t PAYLOAD = 0; + }; +}; + +static constexpr uint8_t CRYPTO_BLOCK_SIZE { 16 }; + +static constexpr std::size_t MAX_NUM_SEGMENTS = 4; + +// V2 preamble consists of one or more preamble blocks depending on +// the number of segments a particular frame needs. Each block holds +// up to MAX_NUM_SEGMENTS segments and has its own CRC. +// +// XXX: currently the multi-segment facility is NOT implemented. +struct preamble_block_t { + // Tag. For multi-segmented frames the value is the same + // between subsequent preamble blocks. + __u8 tag; + + // Number of segments to go in entire frame. First preable block has + // set this to just #segments, second #segments - MAX_NUM_SEGMENTS, + // third to #segments - MAX_NUM_SEGMENTS and so on. + __u8 num_segments; + + std::array<segment_t, MAX_NUM_SEGMENTS> segments; + __u8 _reserved[2]; + + // CRC32 for this single preamble block. + ceph_le32 crc; +} __attribute__((packed)); +static_assert(sizeof(preamble_block_t) % CRYPTO_BLOCK_SIZE == 0); +static_assert(std::is_standard_layout<preamble_block_t>::value); + +struct epilogue_crc_rev0_block_t { + __u8 late_flags; // FRAME_LATE_FLAG_ABORTED + std::array<ceph_le32, MAX_NUM_SEGMENTS> crc_values; +} __attribute__((packed)); +static_assert(std::is_standard_layout_v<epilogue_crc_rev0_block_t>); + +struct epilogue_crc_rev1_block_t { + __u8 late_status; // FRAME_LATE_STATUS_* + ceph_le32 crc_values[MAX_NUM_SEGMENTS - 1]; +} __attribute__((packed)); +static_assert(std::is_standard_layout_v<epilogue_crc_rev1_block_t>); + +struct epilogue_secure_rev0_block_t { + __u8 late_flags; // FRAME_LATE_FLAG_ABORTED + __u8 padding[CRYPTO_BLOCK_SIZE - sizeof(late_flags)]; +} __attribute__((packed)); +static_assert(sizeof(epilogue_secure_rev0_block_t) % CRYPTO_BLOCK_SIZE == 0); +static_assert(std::is_standard_layout_v<epilogue_secure_rev0_block_t>); + +// epilogue_secure_rev0_block_t with late_flags changed to late_status +struct epilogue_secure_rev1_block_t { + __u8 late_status; // FRAME_LATE_STATUS_* + __u8 padding[CRYPTO_BLOCK_SIZE - sizeof(late_status)]; +} __attribute__((packed)); +static_assert(sizeof(epilogue_secure_rev1_block_t) % CRYPTO_BLOCK_SIZE == 0); +static_assert(std::is_standard_layout_v<epilogue_secure_rev1_block_t>); + +static constexpr uint32_t FRAME_CRC_SIZE = 4; +static constexpr uint32_t FRAME_PREAMBLE_INLINE_SIZE = 48; +static_assert(FRAME_PREAMBLE_INLINE_SIZE % CRYPTO_BLOCK_SIZE == 0); +// just for performance, nothing should break otherwise +static_assert(sizeof(ceph_msg_header2) <= FRAME_PREAMBLE_INLINE_SIZE); +static constexpr uint32_t FRAME_PREAMBLE_WITH_INLINE_SIZE = + sizeof(preamble_block_t) + FRAME_PREAMBLE_INLINE_SIZE; + +// A frame can be aborted by the sender after transmitting the +// preamble and the first segment. The remainder of the frame +// is filled with zeros, up until the epilogue. +// +// This flag is for msgr2.0. Note that in crc mode, late_flags +// is not covered by any crc -- a single bit flip can result in +// a completed frame being dropped or in an aborted frame with +// garbage segment payloads being dispatched. +#define FRAME_LATE_FLAG_ABORTED (1<<0) + +// For msgr2.1, FRAME_LATE_STATUS_ABORTED has the same meaning +// as FRAME_LATE_FLAG_ABORTED and late_status replaces late_flags. +// Bit error detection in crc mode is achieved by using a 4-bit +// nibble per flag with two code words that are far apart in terms +// of Hamming Distance (HD=4, same as provided by CRC32-C for +// input lengths over ~5K). +#define FRAME_LATE_STATUS_ABORTED 0x1 +#define FRAME_LATE_STATUS_COMPLETE 0xe +#define FRAME_LATE_STATUS_ABORTED_MASK 0xf + +#define FRAME_LATE_STATUS_RESERVED_TRUE 0x10 +#define FRAME_LATE_STATUS_RESERVED_FALSE 0xe0 +#define FRAME_LATE_STATUS_RESERVED_MASK 0xf0 + +struct FrameError : std::runtime_error { + using runtime_error::runtime_error; +}; + +class FrameAssembler { +public: + // crypto must be non-null + FrameAssembler(const ceph::crypto::onwire::rxtx_t* crypto, bool is_rev1) + : m_crypto(crypto), m_is_rev1(is_rev1) {} + + void set_is_rev1(bool is_rev1) { + m_descs.clear(); + m_is_rev1 = is_rev1; + } + + bool get_is_rev1() { + return m_is_rev1; + } + + size_t get_num_segments() const { + ceph_assert(!m_descs.empty()); + return m_descs.size(); + } + + uint32_t get_segment_logical_len(size_t seg_idx) const { + ceph_assert(seg_idx < m_descs.size()); + return m_descs[seg_idx].logical_len; + } + + uint16_t get_segment_align(size_t seg_idx) const { + ceph_assert(seg_idx < m_descs.size()); + return m_descs[seg_idx].align; + } + + // Preamble: + // + // preamble_block_t + // [preamble inline buffer + auth tag -- only in msgr2.1 secure mode] + // + // The preamble is generated unconditionally. + // + // In msgr2.1 secure mode, the first segment is inlined into the + // preamble inline buffer, either fully or partially. + uint32_t get_preamble_onwire_len() const { + if (m_is_rev1 && m_crypto->rx) { + return FRAME_PREAMBLE_WITH_INLINE_SIZE + get_auth_tag_len(); + } + return sizeof(preamble_block_t); + } + + // Segment: + // + // segment payload + // [zero padding -- only in secure mode] + // [crc or auth tag -- only in msgr2.1, only for the first segment] + // + // For an empty segment, nothing is generated. In msgr2.1 secure + // mode, if the first segment gets fully inlined into the preamble + // inline buffer, it is considered empty. + uint32_t get_segment_onwire_len(size_t seg_idx) const { + ceph_assert(seg_idx < m_descs.size()); + if (m_crypto->rx) { + uint32_t padded_len = get_segment_padded_len(seg_idx); + if (m_is_rev1 && seg_idx == 0) { + if (padded_len > FRAME_PREAMBLE_INLINE_SIZE) { + return padded_len + get_auth_tag_len() - FRAME_PREAMBLE_INLINE_SIZE; + } + return 0; + } + return padded_len; + } + if (m_is_rev1 && seg_idx == 0 && m_descs[0].logical_len > 0) { + return m_descs[0].logical_len + FRAME_CRC_SIZE; + } + return m_descs[seg_idx].logical_len; + } + + // Epilogue: + // + // epilogue_*_block_t + // [auth tag -- only in secure mode] + // + // For msgr2.0, the epilogue is generated unconditionally. In + // crc mode, it stores crcs for all segments; the preamble is + // covered by its own crc. In secure mode, the epilogue auth tag + // covers the whole frame. + // + // For msgr2.1, the epilogue is generated only if the frame has + // more than one segment (i.e. at least one of second to fourth + // segments is not empty). In crc mode, it stores crcs for + // second to fourh segments; the preamble and the first segment + // are covered by their own crcs. In secure mode, the epilogue + // auth tag covers second to fourth segments; the preamble and the + // first segment (if not fully inlined into the preamble inline + // buffer) are covered by their own auth tags. + // + // Note that the auth tag format is an implementation detail of a + // particular cipher. FrameAssembler is concerned only with where + // the auth tag is placed (at the end of the ciphertext) and how + // long it is (RxHandler::get_extra_size_at_final()). This is to + // provide room for other encryption algorithms: currently we use + // AES-128-GCM with 16-byte tags, but it is possible to switch to + // e.g. AES-128-CBC + HMAC-SHA512 without affecting the protocol + // (except for the cipher negotiation, of course). + // + // Additionally, each variant of the epilogue contains either + // late_flags or late_status field that directs handling of frames + // with more than one segment. + uint32_t get_epilogue_onwire_len() const { + ceph_assert(!m_descs.empty()); + if (m_is_rev1 && m_descs.size() == 1) { + return 0; + } + if (m_crypto->rx) { + return (m_is_rev1 ? sizeof(epilogue_secure_rev1_block_t) : + sizeof(epilogue_secure_rev0_block_t)) + get_auth_tag_len(); + } + return m_is_rev1 ? sizeof(epilogue_crc_rev1_block_t) : + sizeof(epilogue_crc_rev0_block_t); + } + + uint64_t get_frame_logical_len() const; + uint64_t get_frame_onwire_len() const; + + bufferlist assemble_frame(Tag tag, bufferlist segment_bls[], + const uint16_t segment_aligns[], + size_t segment_count); + + Tag disassemble_preamble(bufferlist& preamble_bl); + + // Like msgr1, and unlike msgr2.0, msgr2.1 allows interpreting the + // first segment before reading in the rest of the frame. + // + // For msgr2.1 (set_is_rev1(true)), you may: + // + // - read in the first segment + // - call disassemble_first_segment() + // - use the contents of the first segment, for example to + // look up user-provided buffers based on ceph_msg_header2::tid + // - read in the remaining segments, possibly directly into + // user-provided buffers + // - read in epilogue + // - call disassemble_remaining_segments() + // + // For msgr2.0 (set_is_rev1(false)), disassemble_first_segment() is + // a noop. To accomodate, disassemble_remaining_segments() always + // takes all segments and skips over the first segment in msgr2.1 + // case. You must: + // + // - read in all segments + // - read in epilogue + // - call disassemble_remaining_segments() + // + // disassemble_remaining_segments() returns true if the frame is + // ready for dispatching, or false if it was aborted by the sender + // and must be dropped. + void disassemble_first_segment(bufferlist& preamble_bl, + bufferlist& segment_bl) const; + bool disassemble_remaining_segments(bufferlist segment_bls[], + bufferlist& epilogue_bl) const; + +private: + struct segment_desc_t { + uint32_t logical_len; + uint16_t align; + }; + + uint32_t get_segment_padded_len(size_t seg_idx) const { + return p2roundup<uint32_t>(m_descs[seg_idx].logical_len, + CRYPTO_BLOCK_SIZE); + } + + uint32_t get_auth_tag_len() const { + return m_crypto->rx->get_extra_size_at_final(); + } + + bufferlist asm_crc_rev0(const preamble_block_t& preamble, + bufferlist segment_bls[]) const; + bufferlist asm_secure_rev0(const preamble_block_t& preamble, + bufferlist segment_bls[]) const; + bufferlist asm_crc_rev1(const preamble_block_t& preamble, + bufferlist segment_bls[]) const; + bufferlist asm_secure_rev1(const preamble_block_t& preamble, + bufferlist segment_bls[]) const; + + bool disasm_all_crc_rev0(bufferlist segment_bls[], + bufferlist& epilogue_bl) const; + bool disasm_all_secure_rev0(bufferlist segment_bls[], + bufferlist& epilogue_bl) const; + void disasm_first_crc_rev1(bufferlist& preamble_bl, + bufferlist& segment_bl) const; + bool disasm_remaining_crc_rev1(bufferlist segment_bls[], + bufferlist& epilogue_bl) const; + void disasm_first_secure_rev1(bufferlist& preamble_bl, + bufferlist& segment_bl) const; + bool disasm_remaining_secure_rev1(bufferlist segment_bls[], + bufferlist& epilogue_bl) const; + + void fill_preamble(Tag tag, preamble_block_t& preamble) const; + friend std::ostream& operator<<(std::ostream& os, + const FrameAssembler& frame_asm); + + boost::container::static_vector<segment_desc_t, MAX_NUM_SEGMENTS> m_descs; + const ceph::crypto::onwire::rxtx_t* m_crypto; + bool m_is_rev1; // msgr2.1? +}; + +template <class T, uint16_t... SegmentAlignmentVs> +struct Frame { + static constexpr size_t SegmentsNumV = sizeof...(SegmentAlignmentVs); + static_assert(SegmentsNumV > 0 && SegmentsNumV <= MAX_NUM_SEGMENTS); +protected: + std::array<ceph::bufferlist, SegmentsNumV> segments; + +private: + static constexpr std::array<uint16_t, SegmentsNumV> alignments { + SegmentAlignmentVs... + }; + +public: + ceph::bufferlist get_buffer(FrameAssembler& tx_frame_asm) { + auto bl = tx_frame_asm.assemble_frame(T::tag, segments.data(), + alignments.data(), SegmentsNumV); + ceph_assert(bl.length() == tx_frame_asm.get_frame_onwire_len()); + return bl; + } +}; + +// ControlFrames are used to manage transceiver state (like connections) and +// orchestrate transfers of MessageFrames. They use only single segment with +// marshalling facilities -- derived classes specify frame structure through +// Args pack while ControlFrame provides common encode/decode machinery. +template <class C, typename... Args> +class ControlFrame : public Frame<C, segment_t::DEFAULT_ALIGNMENT /* single segment */> { +protected: + ceph::bufferlist &get_payload_segment() { + return this->segments[SegmentIndex::Control::PAYLOAD]; + } + + // this tuple is only used when decoding values from a payload segment + std::tuple<Args...> _values; + + // FIXME: for now, we assume specific features for the purpoess of encoding + // the frames themselves (*not* messages in message frames!). + uint64_t features = msgr2_frame_assumed; + + template <typename T> + inline void _encode_payload_each(T &t) { + if constexpr (std::is_same<T, std::vector<uint32_t> const>()) { + encode((uint32_t)t.size(), this->get_payload_segment(), features); + for (const auto &elem : t) { + encode(elem, this->get_payload_segment(), features); + } + } else { + encode(t, this->get_payload_segment(), features); + } + } + + template <typename T> + inline void _decode_payload_each(T &t, bufferlist::const_iterator &ti) const { + if constexpr (std::is_same<T, std::vector<uint32_t>>()) { + uint32_t size; + decode(size, ti); + t.resize(size); + for (uint32_t i = 0; i < size; ++i) { + decode(t[i], ti); + } + } else { + decode(t, ti); + } + } + + template <std::size_t... Is> + inline void _decode_payload(bufferlist::const_iterator &ti, + std::index_sequence<Is...>) const { + (_decode_payload_each((Args &)std::get<Is>(_values), ti), ...); + } + + template <std::size_t N> + inline decltype(auto) get_val() { + return std::get<N>(_values); + } + + ControlFrame() + : Frame<C, segment_t::DEFAULT_ALIGNMENT /* single segment */>() { + } + + void _encode(const Args &... args) { + (_encode_payload_each(args), ...); + } + + void _decode(const ceph::bufferlist &bl) { + auto ti = bl.cbegin(); + _decode_payload(ti, std::index_sequence_for<Args...>()); + } + +public: + static C Encode(const Args &... args) { + C c; + c._encode(args...); + return c; + } + + static C Decode(const ceph::bufferlist &payload) { + C c; + c._decode(payload); + return c; + } +}; + +struct HelloFrame : public ControlFrame<HelloFrame, + uint8_t, // entity type + entity_addr_t> { // peer address + static const Tag tag = Tag::HELLO; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline uint8_t &entity_type() { return get_val<0>(); } + inline entity_addr_t &peer_addr() { return get_val<1>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct AuthRequestFrame : public ControlFrame<AuthRequestFrame, + uint32_t, // auth method + vector<uint32_t>, // preferred modes + bufferlist> { // auth payload + static const Tag tag = Tag::AUTH_REQUEST; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline uint32_t &method() { return get_val<0>(); } + inline vector<uint32_t> &preferred_modes() { return get_val<1>(); } + inline bufferlist &auth_payload() { return get_val<2>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct AuthBadMethodFrame : public ControlFrame<AuthBadMethodFrame, + uint32_t, // method + int32_t, // result + std::vector<uint32_t>, // allowed methods + std::vector<uint32_t>> { // allowed modes + static const Tag tag = Tag::AUTH_BAD_METHOD; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline uint32_t &method() { return get_val<0>(); } + inline int32_t &result() { return get_val<1>(); } + inline std::vector<uint32_t> &allowed_methods() { return get_val<2>(); } + inline std::vector<uint32_t> &allowed_modes() { return get_val<3>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct AuthReplyMoreFrame : public ControlFrame<AuthReplyMoreFrame, + bufferlist> { // auth payload + static const Tag tag = Tag::AUTH_REPLY_MORE; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline bufferlist &auth_payload() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct AuthRequestMoreFrame : public ControlFrame<AuthRequestMoreFrame, + bufferlist> { // auth payload + static const Tag tag = Tag::AUTH_REQUEST_MORE; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline bufferlist &auth_payload() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct AuthDoneFrame : public ControlFrame<AuthDoneFrame, + uint64_t, // global id + uint32_t, // connection mode + bufferlist> { // auth method payload + static const Tag tag = Tag::AUTH_DONE; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline uint64_t &global_id() { return get_val<0>(); } + inline uint32_t &con_mode() { return get_val<1>(); } + inline bufferlist &auth_payload() { return get_val<2>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct AuthSignatureFrame + : public ControlFrame<AuthSignatureFrame, + sha256_digest_t> { + static const Tag tag = Tag::AUTH_SIGNATURE; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline sha256_digest_t &signature() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct ClientIdentFrame + : public ControlFrame<ClientIdentFrame, + entity_addrvec_t, // my addresses + entity_addr_t, // target address + int64_t, // global_id + uint64_t, // global seq + uint64_t, // supported features + uint64_t, // required features + uint64_t, // flags + uint64_t> { // client cookie + static const Tag tag = Tag::CLIENT_IDENT; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline entity_addrvec_t &addrs() { return get_val<0>(); } + inline entity_addr_t &target_addr() { return get_val<1>(); } + inline int64_t &gid() { return get_val<2>(); } + inline uint64_t &global_seq() { return get_val<3>(); } + inline uint64_t &supported_features() { return get_val<4>(); } + inline uint64_t &required_features() { return get_val<5>(); } + inline uint64_t &flags() { return get_val<6>(); } + inline uint64_t &cookie() { return get_val<7>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct ServerIdentFrame + : public ControlFrame<ServerIdentFrame, + entity_addrvec_t, // my addresses + int64_t, // global_id + uint64_t, // global seq + uint64_t, // supported features + uint64_t, // required features + uint64_t, // flags + uint64_t> { // server cookie + static const Tag tag = Tag::SERVER_IDENT; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline entity_addrvec_t &addrs() { return get_val<0>(); } + inline int64_t &gid() { return get_val<1>(); } + inline uint64_t &global_seq() { return get_val<2>(); } + inline uint64_t &supported_features() { return get_val<3>(); } + inline uint64_t &required_features() { return get_val<4>(); } + inline uint64_t &flags() { return get_val<5>(); } + inline uint64_t &cookie() { return get_val<6>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct ReconnectFrame + : public ControlFrame<ReconnectFrame, + entity_addrvec_t, // my addresses + uint64_t, // client cookie + uint64_t, // server cookie + uint64_t, // global sequence + uint64_t, // connect sequence + uint64_t> { // message sequence + static const Tag tag = Tag::SESSION_RECONNECT; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline entity_addrvec_t &addrs() { return get_val<0>(); } + inline uint64_t &client_cookie() { return get_val<1>(); } + inline uint64_t &server_cookie() { return get_val<2>(); } + inline uint64_t &global_seq() { return get_val<3>(); } + inline uint64_t &connect_seq() { return get_val<4>(); } + inline uint64_t &msg_seq() { return get_val<5>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct ResetFrame : public ControlFrame<ResetFrame, + bool> { // full reset + static const Tag tag = Tag::SESSION_RESET; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline bool &full() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct RetryFrame : public ControlFrame<RetryFrame, + uint64_t> { // connection seq + static const Tag tag = Tag::SESSION_RETRY; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline uint64_t &connect_seq() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct RetryGlobalFrame : public ControlFrame<RetryGlobalFrame, + uint64_t> { // global seq + static const Tag tag = Tag::SESSION_RETRY_GLOBAL; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline uint64_t &global_seq() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct WaitFrame : public ControlFrame<WaitFrame> { + static const Tag tag = Tag::WAIT; + using ControlFrame::Encode; + using ControlFrame::Decode; + +protected: + using ControlFrame::ControlFrame; +}; + +struct ReconnectOkFrame : public ControlFrame<ReconnectOkFrame, + uint64_t> { // message seq + static const Tag tag = Tag::SESSION_RECONNECT_OK; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline uint64_t &msg_seq() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct IdentMissingFeaturesFrame + : public ControlFrame<IdentMissingFeaturesFrame, + uint64_t> { // missing features mask + static const Tag tag = Tag::IDENT_MISSING_FEATURES; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline uint64_t &features() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct KeepAliveFrame : public ControlFrame<KeepAliveFrame, + utime_t> { // timestamp + static const Tag tag = Tag::KEEPALIVE2; + using ControlFrame::Encode; + using ControlFrame::Decode; + + static KeepAliveFrame Encode() { + return KeepAliveFrame::Encode(ceph_clock_now()); + } + + inline utime_t ×tamp() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct KeepAliveFrameAck : public ControlFrame<KeepAliveFrameAck, + utime_t> { // ack timestamp + static const Tag tag = Tag::KEEPALIVE2_ACK; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline utime_t ×tamp() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +struct AckFrame : public ControlFrame<AckFrame, + uint64_t> { // message sequence + static const Tag tag = Tag::ACK; + using ControlFrame::Encode; + using ControlFrame::Decode; + + inline uint64_t &seq() { return get_val<0>(); } + +protected: + using ControlFrame::ControlFrame; +}; + +using segment_bls_t = + boost::container::static_vector<bufferlist, MAX_NUM_SEGMENTS>; + +// This class is used for encoding/decoding header of the message frame. +// Body is processed almost independently with the sole junction point +// being the `extra_payload_len` passed to get_buffer(). +struct MessageFrame : public Frame<MessageFrame, + /* four segments */ + segment_t::DEFAULT_ALIGNMENT, + segment_t::DEFAULT_ALIGNMENT, + segment_t::DEFAULT_ALIGNMENT, + segment_t::PAGE_SIZE_ALIGNMENT> { + static const Tag tag = Tag::MESSAGE; + + static MessageFrame Encode(const ceph_msg_header2 &msg_header, + const ceph::bufferlist &front, + const ceph::bufferlist &middle, + const ceph::bufferlist &data) { + MessageFrame f; + f.segments[SegmentIndex::Msg::HEADER].append( + reinterpret_cast<const char*>(&msg_header), sizeof(msg_header)); + + f.segments[SegmentIndex::Msg::FRONT] = front; + f.segments[SegmentIndex::Msg::MIDDLE] = middle; + f.segments[SegmentIndex::Msg::DATA] = data; + + return f; + } + + static MessageFrame Decode(segment_bls_t& recv_segments) { + MessageFrame f; + // transfer segments' bufferlists. If a MessageFrame contains less + // SegmentsNumV segments, the missing ones will be seen as zeroed. + for (__u8 idx = 0; idx < std::size(recv_segments); idx++) { + f.segments[idx] = std::move(recv_segments[idx]); + } + return f; + } + + inline const ceph_msg_header2 &header() { + auto& hdrbl = segments[SegmentIndex::Msg::HEADER]; + return reinterpret_cast<const ceph_msg_header2&>(*hdrbl.c_str()); + } + + ceph::bufferlist &front() { + return segments[SegmentIndex::Msg::FRONT]; + } + + ceph::bufferlist &middle() { + return segments[SegmentIndex::Msg::MIDDLE]; + } + + ceph::bufferlist &data() { + return segments[SegmentIndex::Msg::DATA]; + } + + uint32_t front_len() const { + return segments[SegmentIndex::Msg::FRONT].length(); + } + + uint32_t middle_len() const { + return segments[SegmentIndex::Msg::MIDDLE].length(); + } + + uint32_t data_len() const { + return segments[SegmentIndex::Msg::DATA].length(); + } + +protected: + using Frame::Frame; +}; + +} // namespace ceph::msgr::v2 + +#endif // _MSG_ASYNC_FRAMES_V2_ diff --git a/src/msg/async/net_handler.cc b/src/msg/async/net_handler.cc new file mode 100644 index 00000000..2b4e646d --- /dev/null +++ b/src/msg/async/net_handler.cc @@ -0,0 +1,233 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> +#include <arpa/inet.h> + +#include "net_handler.h" +#include "common/debug.h" +#include "common/errno.h" +#include "include/compat.h" +#include "include/sock_compat.h" + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix *_dout << "NetHandler " + +namespace ceph{ + +int NetHandler::create_socket(int domain, bool reuse_addr) +{ + int s; + int r = 0; + + if ((s = socket_cloexec(domain, SOCK_STREAM, 0)) == -1) { + r = errno; + lderr(cct) << __func__ << " couldn't create socket " << cpp_strerror(r) << dendl; + return -r; + } + +#if !defined(__FreeBSD__) + /* Make sure connection-intensive things like the benchmark + * will be able to close/open sockets a zillion of times */ + if (reuse_addr) { + int on = 1; + if (::setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) { + r = errno; + lderr(cct) << __func__ << " setsockopt SO_REUSEADDR failed: " + << strerror(r) << dendl; + close(s); + return -r; + } + } +#endif + + return s; +} + +int NetHandler::set_nonblock(int sd) +{ + int flags; + int r = 0; + + /* Set the socket nonblocking. + * Note that fcntl(2) for F_GETFL and F_SETFL can't be + * interrupted by a signal. */ + if ((flags = fcntl(sd, F_GETFL)) < 0 ) { + r = errno; + lderr(cct) << __func__ << " fcntl(F_GETFL) failed: " << cpp_strerror(r) << dendl; + return -r; + } + if (fcntl(sd, F_SETFL, flags | O_NONBLOCK) < 0) { + r = errno; + lderr(cct) << __func__ << " fcntl(F_SETFL,O_NONBLOCK): " << cpp_strerror(r) << dendl; + return -r; + } + + return 0; +} + +int NetHandler::set_socket_options(int sd, bool nodelay, int size) +{ + int r = 0; + // disable Nagle algorithm? + if (nodelay) { + int flag = 1; + r = ::setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(flag)); + if (r < 0) { + r = errno; + ldout(cct, 0) << "couldn't set TCP_NODELAY: " << cpp_strerror(r) << dendl; + } + } + if (size) { + r = ::setsockopt(sd, SOL_SOCKET, SO_RCVBUF, (void*)&size, sizeof(size)); + if (r < 0) { + r = errno; + ldout(cct, 0) << "couldn't set SO_RCVBUF to " << size << ": " << cpp_strerror(r) << dendl; + } + } + + // block ESIGPIPE +#ifdef CEPH_USE_SO_NOSIGPIPE + int val = 1; + r = ::setsockopt(sd, SOL_SOCKET, SO_NOSIGPIPE, (void*)&val, sizeof(val)); + if (r) { + r = errno; + ldout(cct,0) << "couldn't set SO_NOSIGPIPE: " << cpp_strerror(r) << dendl; + } +#endif + return -r; +} + +void NetHandler::set_priority(int sd, int prio, int domain) +{ +#ifdef SO_PRIORITY + if (prio < 0) { + return; + } + int r = -1; +#ifdef IPTOS_CLASS_CS6 + int iptos = IPTOS_CLASS_CS6; + switch (domain) { + case AF_INET: + r = ::setsockopt(sd, IPPROTO_IP, IP_TOS, &iptos, sizeof(iptos)); + break; + case AF_INET6: + r = ::setsockopt(sd, IPPROTO_IPV6, IPV6_TCLASS, &iptos, sizeof(iptos)); + break; + default: + lderr(cct) << "couldn't set ToS of unknown family (" << domain << ")" + << " to " << iptos << dendl; + return; + } + if (r < 0) { + r = errno; + ldout(cct,0) << "couldn't set TOS to " << iptos + << ": " << cpp_strerror(r) << dendl; + } + +#endif // IPTOS_CLASS_CS6 + // setsockopt(IPTOS_CLASS_CS6) sets the priority of the socket as 0. + // See http://goo.gl/QWhvsD and http://goo.gl/laTbjT + // We need to call setsockopt(SO_PRIORITY) after it. + r = ::setsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(prio)); + if (r < 0) { + r = errno; + ldout(cct, 0) << __func__ << " couldn't set SO_PRIORITY to " << prio + << ": " << cpp_strerror(r) << dendl; + } +#else + return; +#endif // SO_PRIORITY +} + +int NetHandler::generic_connect(const entity_addr_t& addr, const entity_addr_t &bind_addr, bool nonblock) +{ + int ret; + int s = create_socket(addr.get_family()); + if (s < 0) + return s; + + if (nonblock) { + ret = set_nonblock(s); + if (ret < 0) { + close(s); + return ret; + } + } + + set_socket_options(s, cct->_conf->ms_tcp_nodelay, cct->_conf->ms_tcp_rcvbuf); + + { + entity_addr_t addr = bind_addr; + if (cct->_conf->ms_bind_before_connect && (!addr.is_blank_ip())) { + addr.set_port(0); + ret = ::bind(s, addr.get_sockaddr(), addr.get_sockaddr_len()); + if (ret < 0) { + ret = errno; + ldout(cct, 2) << __func__ << " client bind error " << ", " << cpp_strerror(ret) << dendl; + close(s); + return -ret; + } + } + } + + ret = ::connect(s, addr.get_sockaddr(), addr.get_sockaddr_len()); + if (ret < 0) { + ret = errno; + if (errno == EINPROGRESS && nonblock) + return s; + + ldout(cct, 10) << __func__ << " connect: " << cpp_strerror(ret) << dendl; + close(s); + return -ret; + } + + return s; +} + +int NetHandler::reconnect(const entity_addr_t &addr, int sd) +{ + int r = 0; + int ret = ::connect(sd, addr.get_sockaddr(), addr.get_sockaddr_len()); + + if (ret < 0 && errno != EISCONN) { + r = errno; + ldout(cct, 10) << __func__ << " reconnect: " << strerror(r) << dendl; + if (r == EINPROGRESS || r == EALREADY) + return 1; + return -r; + } + + return 0; +} + +int NetHandler::connect(const entity_addr_t &addr, const entity_addr_t& bind_addr) +{ + return generic_connect(addr, bind_addr, false); +} + +int NetHandler::nonblock_connect(const entity_addr_t &addr, const entity_addr_t& bind_addr) +{ + return generic_connect(addr, bind_addr, true); +} + + +} diff --git a/src/msg/async/net_handler.h b/src/msg/async/net_handler.h new file mode 100644 index 00000000..19042377 --- /dev/null +++ b/src/msg/async/net_handler.h @@ -0,0 +1,46 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2014 UnitedStack <haomai@unitedstack.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_COMMON_NET_UTILS_H +#define CEPH_COMMON_NET_UTILS_H +#include "common/config.h" + +namespace ceph { + class NetHandler { + int generic_connect(const entity_addr_t& addr, const entity_addr_t& bind_addr, bool nonblock); + + CephContext *cct; + public: + int create_socket(int domain, bool reuse_addr=false); + explicit NetHandler(CephContext *c): cct(c) {} + int set_nonblock(int sd); + int set_socket_options(int sd, bool nodelay, int size); + int connect(const entity_addr_t &addr, const entity_addr_t& bind_addr); + + /** + * Try to reconnect the socket. + * + * @return 0 success + * > 0 just break, and wait for event + * < 0 need to goto fail + */ + int reconnect(const entity_addr_t &addr, int sd); + int nonblock_connect(const entity_addr_t &addr, const entity_addr_t& bind_addr); + void set_priority(int sd, int priority, int domain); + }; +} + +#endif diff --git a/src/msg/async/rdma/Infiniband.cc b/src/msg/async/rdma/Infiniband.cc new file mode 100644 index 00000000..34299975 --- /dev/null +++ b/src/msg/async/rdma/Infiniband.cc @@ -0,0 +1,1234 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 XSKY <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "Infiniband.h" +#include "common/errno.h" +#include "common/debug.h" +#include "RDMAStack.h" +#include <sys/time.h> +#include <sys/resource.h> + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix *_dout << "Infiniband " + +static const uint32_t MAX_SHARED_RX_SGE_COUNT = 1; +static const uint32_t MAX_INLINE_DATA = 0; +static const uint32_t TCP_MSG_LEN = sizeof("0000:00000000:00000000:00000000:00000000000000000000000000000000"); +static const uint32_t CQ_DEPTH = 30000; + +Port::Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn): ctxt(ictxt), port_num(ipn), port_attr(new ibv_port_attr), gid_idx(0) +{ +#ifdef HAVE_IBV_EXP + union ibv_gid cgid; + struct ibv_exp_gid_attr gid_attr; + bool malformed = false; + + ldout(cct,1) << __func__ << " using experimental verbs for gid" << dendl; + int r = ibv_query_port(ctxt, port_num, port_attr); + if (r == -1) { + lderr(cct) << __func__ << " query port failed " << cpp_strerror(errno) << dendl; + ceph_abort(); + } + + lid = port_attr->lid; + + // search for requested GID in GIDs table + ldout(cct, 1) << __func__ << " looking for local GID " << (cct->_conf->ms_async_rdma_local_gid) + << " of type " << (cct->_conf->ms_async_rdma_roce_ver) << dendl; + r = sscanf(cct->_conf->ms_async_rdma_local_gid.c_str(), + "%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx" + ":%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx", + &cgid.raw[ 0], &cgid.raw[ 1], + &cgid.raw[ 2], &cgid.raw[ 3], + &cgid.raw[ 4], &cgid.raw[ 5], + &cgid.raw[ 6], &cgid.raw[ 7], + &cgid.raw[ 8], &cgid.raw[ 9], + &cgid.raw[10], &cgid.raw[11], + &cgid.raw[12], &cgid.raw[13], + &cgid.raw[14], &cgid.raw[15]); + + if (r != 16) { + ldout(cct, 1) << __func__ << " malformed or no GID supplied, using GID index 0" << dendl; + malformed = true; + } + + gid_attr.comp_mask = IBV_EXP_QUERY_GID_ATTR_TYPE; + + for (gid_idx = 0; gid_idx < port_attr->gid_tbl_len; gid_idx++) { + r = ibv_query_gid(ctxt, port_num, gid_idx, &gid); + if (r) { + lderr(cct) << __func__ << " query gid of port " << port_num << " index " << gid_idx << " failed " << cpp_strerror(errno) << dendl; + ceph_abort(); + } + r = ibv_exp_query_gid_attr(ctxt, port_num, gid_idx, &gid_attr); + if (r) { + lderr(cct) << __func__ << " query gid attributes of port " << port_num << " index " << gid_idx << " failed " << cpp_strerror(errno) << dendl; + ceph_abort(); + } + + if (malformed) break; // stay with gid_idx=0 + if ( (gid_attr.type == cct->_conf->ms_async_rdma_roce_ver) && + (memcmp(&gid, &cgid, 16) == 0) ) { + ldout(cct, 1) << __func__ << " found at index " << gid_idx << dendl; + break; + } + } + + if (gid_idx == port_attr->gid_tbl_len) { + lderr(cct) << __func__ << " Requested local GID was not found in GID table" << dendl; + ceph_abort(); + } +#else + int r = ibv_query_port(ctxt, port_num, port_attr); + if (r == -1) { + lderr(cct) << __func__ << " query port failed " << cpp_strerror(errno) << dendl; + ceph_abort(); + } + + lid = port_attr->lid; + r = ibv_query_gid(ctxt, port_num, 0, &gid); + if (r) { + lderr(cct) << __func__ << " query gid failed " << cpp_strerror(errno) << dendl; + ceph_abort(); + } +#endif +} + + +Device::Device(CephContext *cct, ibv_device* d, struct ibv_context *dc) + : device(d), device_attr(new ibv_device_attr), active_port(nullptr) +{ + if (device == NULL) { + lderr(cct) << __func__ << " device == NULL" << cpp_strerror(errno) << dendl; + ceph_abort(); + } + name = ibv_get_device_name(device); + if (cct->_conf->ms_async_rdma_cm) { + ctxt = dc; + } else { + ctxt = ibv_open_device(device); + } + if (ctxt == NULL) { + lderr(cct) << __func__ << " open rdma device failed. " << cpp_strerror(errno) << dendl; + ceph_abort(); + } + int r = ibv_query_device(ctxt, device_attr); + if (r == -1) { + lderr(cct) << __func__ << " failed to query rdma device. " << cpp_strerror(errno) << dendl; + ceph_abort(); + } +} + +void Device::binding_port(CephContext *cct, int port_num) { + port_cnt = device_attr->phys_port_cnt; + for (uint8_t i = 0; i < port_cnt; ++i) { + Port *port = new Port(cct, ctxt, i+1); + if (i + 1 == port_num && port->get_port_attr()->state == IBV_PORT_ACTIVE) { + active_port = port; + ldout(cct, 1) << __func__ << " found active port " << i+1 << dendl; + break; + } else { + ldout(cct, 10) << __func__ << " port " << i+1 << " is not what we want. state: " << port->get_port_attr()->state << ")"<< dendl; + } + delete port; + } + if (nullptr == active_port) { + lderr(cct) << __func__ << " port not found" << dendl; + ceph_assert(active_port); + } +} + + +Infiniband::QueuePair::QueuePair( + CephContext *c, Infiniband& infiniband, ibv_qp_type type, + int port, ibv_srq *srq, + Infiniband::CompletionQueue* txcq, Infiniband::CompletionQueue* rxcq, + uint32_t tx_queue_len, uint32_t rx_queue_len, struct rdma_cm_id *cid, uint32_t q_key) +: cct(c), infiniband(infiniband), + type(type), + ctxt(infiniband.device->ctxt), + ib_physical_port(port), + pd(infiniband.pd->pd), + srq(srq), + qp(NULL), + cm_id(cid), + txcq(txcq), + rxcq(rxcq), + initial_psn(0), + max_send_wr(tx_queue_len), + max_recv_wr(rx_queue_len), + q_key(q_key), + dead(false) +{ + initial_psn = lrand48() & 0xffffff; + if (type != IBV_QPT_RC && type != IBV_QPT_UD && type != IBV_QPT_RAW_PACKET) { + lderr(cct) << __func__ << " invalid queue pair type" << cpp_strerror(errno) << dendl; + ceph_abort(); + } + pd = infiniband.pd->pd; +} + +int Infiniband::QueuePair::init() +{ + ldout(cct, 20) << __func__ << " started." << dendl; + ibv_qp_init_attr qpia; + // FIPS zeroization audit 20191115: this memset is not security related. + memset(&qpia, 0, sizeof(qpia)); + qpia.send_cq = txcq->get_cq(); + qpia.recv_cq = rxcq->get_cq(); + if (srq) { + qpia.srq = srq; // use the same shared receive queue + } else { + qpia.cap.max_recv_wr = max_recv_wr; + qpia.cap.max_recv_sge = 1; + } + qpia.cap.max_send_wr = max_send_wr; // max outstanding send requests + qpia.cap.max_send_sge = 1; // max send scatter-gather elements + qpia.cap.max_inline_data = MAX_INLINE_DATA; // max bytes of immediate data on send q + qpia.qp_type = type; // RC, UC, UD, or XRC + qpia.sq_sig_all = 0; // only generate CQEs on requested WQEs + + if (!cct->_conf->ms_async_rdma_cm) { + qp = ibv_create_qp(pd, &qpia); + if (qp == NULL) { + lderr(cct) << __func__ << " failed to create queue pair" << cpp_strerror(errno) << dendl; + if (errno == ENOMEM) { + lderr(cct) << __func__ << " try reducing ms_async_rdma_receive_queue_length, " + " ms_async_rdma_send_buffers or" + " ms_async_rdma_buffer_size" << dendl; + } + return -1; + } + } else { + ceph_assert(cm_id->verbs == pd->context); + if (rdma_create_qp(cm_id, pd, &qpia)) { + lderr(cct) << __func__ << " failed to create queue pair with rdmacm library" + << cpp_strerror(errno) << dendl; + return -1; + } + qp = cm_id->qp; + } + ldout(cct, 20) << __func__ << " successfully create queue pair: " + << "qp=" << qp << dendl; + + if (cct->_conf->ms_async_rdma_cm) + return 0; + + // move from RESET to INIT state + ibv_qp_attr qpa; + memset(&qpa, 0, sizeof(qpa)); + qpa.qp_state = IBV_QPS_INIT; + qpa.pkey_index = 0; + qpa.port_num = (uint8_t)(ib_physical_port); + qpa.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE; + qpa.qkey = q_key; + + int mask = IBV_QP_STATE | IBV_QP_PORT; + switch (type) { + case IBV_QPT_RC: + mask |= IBV_QP_ACCESS_FLAGS; + mask |= IBV_QP_PKEY_INDEX; + break; + case IBV_QPT_UD: + mask |= IBV_QP_QKEY; + mask |= IBV_QP_PKEY_INDEX; + break; + case IBV_QPT_RAW_PACKET: + break; + default: + ceph_abort(); + } + + int ret = ibv_modify_qp(qp, &qpa, mask); + if (ret) { + ibv_destroy_qp(qp); + lderr(cct) << __func__ << " failed to transition to INIT state: " + << cpp_strerror(errno) << dendl; + return -1; + } + ldout(cct, 20) << __func__ << " successfully change queue pair to INIT:" + << " qp=" << qp << dendl; + return 0; +} + +/** + * Change RC QueuePair into the ERROR state. This is necessary modify + * the Queue Pair into the Error state and poll all of the relevant + * Work Completions prior to destroying a Queue Pair. + * Since destroying a Queue Pair does not guarantee that its Work + * Completions are removed from the CQ upon destruction. Even if the + * Work Completions are already in the CQ, it might not be possible to + * retrieve them. If the Queue Pair is associated with an SRQ, it is + * recommended wait for the affiliated event IBV_EVENT_QP_LAST_WQE_REACHED + * + * \return + * -errno if the QueuePair can't switch to ERROR + * 0 for success. + */ +int Infiniband::QueuePair::to_dead() +{ + if (dead) + return 0; + ibv_qp_attr qpa; + memset(&qpa, 0, sizeof(qpa)); + qpa.qp_state = IBV_QPS_ERR; + + int mask = IBV_QP_STATE; + int ret = ibv_modify_qp(qp, &qpa, mask); + if (ret) { + lderr(cct) << __func__ << " failed to transition to ERROR state: " + << cpp_strerror(errno) << dendl; + return -errno; + } + dead = true; + return ret; +} + +int Infiniband::QueuePair::get_remote_qp_number(uint32_t *rqp) const +{ + ibv_qp_attr qpa; + ibv_qp_init_attr qpia; + + int r = ibv_query_qp(qp, &qpa, IBV_QP_DEST_QPN, &qpia); + if (r) { + lderr(cct) << __func__ << " failed to query qp: " + << cpp_strerror(errno) << dendl; + return -1; + } + + if (rqp) + *rqp = qpa.dest_qp_num; + return 0; +} + +/** + * Get the remote infiniband address for this QueuePair, as set in #plumb(). + * LIDs are "local IDs" in infiniband terminology. They are short, locally + * routable addresses. + */ +int Infiniband::QueuePair::get_remote_lid(uint16_t *lid) const +{ + ibv_qp_attr qpa; + ibv_qp_init_attr qpia; + + int r = ibv_query_qp(qp, &qpa, IBV_QP_AV, &qpia); + if (r) { + lderr(cct) << __func__ << " failed to query qp: " + << cpp_strerror(errno) << dendl; + return -1; + } + + if (lid) + *lid = qpa.ah_attr.dlid; + return 0; +} + +/** + * Get the state of a QueuePair. + */ +int Infiniband::QueuePair::get_state() const +{ + ibv_qp_attr qpa; + ibv_qp_init_attr qpia; + + int r = ibv_query_qp(qp, &qpa, IBV_QP_STATE, &qpia); + if (r) { + lderr(cct) << __func__ << " failed to get state: " + << cpp_strerror(errno) << dendl; + return -1; + } + return qpa.qp_state; +} + +/** + * Return true if the queue pair is in an error state, false otherwise. + */ +bool Infiniband::QueuePair::is_error() const +{ + ibv_qp_attr qpa; + ibv_qp_init_attr qpia; + + int r = ibv_query_qp(qp, &qpa, -1, &qpia); + if (r) { + lderr(cct) << __func__ << " failed to get state: " + << cpp_strerror(errno) << dendl; + return true; + } + return qpa.cur_qp_state == IBV_QPS_ERR; +} + + +Infiniband::CompletionChannel::CompletionChannel(CephContext *c, Infiniband &ib) + : cct(c), infiniband(ib), channel(NULL), cq(NULL), cq_events_that_need_ack(0) +{ +} + +Infiniband::CompletionChannel::~CompletionChannel() +{ + if (channel) { + int r = ibv_destroy_comp_channel(channel); + if (r < 0) + lderr(cct) << __func__ << " failed to destroy cc: " << cpp_strerror(errno) << dendl; + ceph_assert(r == 0); + } +} + +int Infiniband::CompletionChannel::init() +{ + ldout(cct, 20) << __func__ << " started." << dendl; + channel = ibv_create_comp_channel(infiniband.device->ctxt); + if (!channel) { + lderr(cct) << __func__ << " failed to create receive completion channel: " + << cpp_strerror(errno) << dendl; + return -1; + } + int rc = NetHandler(cct).set_nonblock(channel->fd); + if (rc < 0) { + ibv_destroy_comp_channel(channel); + return -1; + } + return 0; +} + +void Infiniband::CompletionChannel::ack_events() +{ + ibv_ack_cq_events(cq, cq_events_that_need_ack); + cq_events_that_need_ack = 0; +} + +bool Infiniband::CompletionChannel::get_cq_event() +{ + ibv_cq *cq = NULL; + void *ev_ctx; + if (ibv_get_cq_event(channel, &cq, &ev_ctx)) { + if (errno != EAGAIN && errno != EINTR) + lderr(cct) << __func__ << " failed to retrieve CQ event: " + << cpp_strerror(errno) << dendl; + return false; + } + + /* accumulate number of cq events that need to + * * be acked, and periodically ack them + * */ + if (++cq_events_that_need_ack == MAX_ACK_EVENT) { + ldout(cct, 20) << __func__ << " ack aq events." << dendl; + ibv_ack_cq_events(cq, MAX_ACK_EVENT); + cq_events_that_need_ack = 0; + } + + return true; +} + + +Infiniband::CompletionQueue::~CompletionQueue() +{ + if (cq) { + int r = ibv_destroy_cq(cq); + if (r < 0) + lderr(cct) << __func__ << " failed to destroy cq: " << cpp_strerror(errno) << dendl; + ceph_assert(r == 0); + } +} + +int Infiniband::CompletionQueue::init() +{ + cq = ibv_create_cq(infiniband.device->ctxt, queue_depth, this, channel->get_channel(), 0); + if (!cq) { + lderr(cct) << __func__ << " failed to create receive completion queue: " + << cpp_strerror(errno) << dendl; + return -1; + } + + if (ibv_req_notify_cq(cq, 0)) { + lderr(cct) << __func__ << " ibv_req_notify_cq failed: " << cpp_strerror(errno) << dendl; + ibv_destroy_cq(cq); + cq = nullptr; + return -1; + } + + channel->bind_cq(cq); + ldout(cct, 20) << __func__ << " successfully create cq=" << cq << dendl; + return 0; +} + +int Infiniband::CompletionQueue::rearm_notify(bool solicite_only) +{ + ldout(cct, 20) << __func__ << " started." << dendl; + int r = ibv_req_notify_cq(cq, 0); + if (r < 0) + lderr(cct) << __func__ << " failed to notify cq: " << cpp_strerror(errno) << dendl; + return r; +} + +int Infiniband::CompletionQueue::poll_cq(int num_entries, ibv_wc *ret_wc_array) { + int r = ibv_poll_cq(cq, num_entries, ret_wc_array); + if (r < 0) { + lderr(cct) << __func__ << " poll_completion_queue occur met error: " + << cpp_strerror(errno) << dendl; + return -1; + } + return r; +} + + +Infiniband::ProtectionDomain::ProtectionDomain(CephContext *cct, Device *device) + : pd(ibv_alloc_pd(device->ctxt)) +{ + if (pd == NULL) { + lderr(cct) << __func__ << " failed to allocate infiniband protection domain: " << cpp_strerror(errno) << dendl; + ceph_abort(); + } +} + +Infiniband::ProtectionDomain::~ProtectionDomain() +{ + ibv_dealloc_pd(pd); +} + + +Infiniband::MemoryManager::Chunk::Chunk(ibv_mr* m, uint32_t len, char* b) + : mr(m), bytes(len), offset(0), buffer(b) +{ +} + +Infiniband::MemoryManager::Chunk::~Chunk() +{ +} + +void Infiniband::MemoryManager::Chunk::set_offset(uint32_t o) +{ + offset = o; +} + +uint32_t Infiniband::MemoryManager::Chunk::get_offset() +{ + return offset; +} + +void Infiniband::MemoryManager::Chunk::set_bound(uint32_t b) +{ + bound = b; +} + +void Infiniband::MemoryManager::Chunk::prepare_read(uint32_t b) +{ + offset = 0; + bound = b; +} + +uint32_t Infiniband::MemoryManager::Chunk::get_bound() +{ + return bound; +} + +uint32_t Infiniband::MemoryManager::Chunk::read(char* buf, uint32_t len) +{ + uint32_t left = bound - offset; + if (left >= len) { + memcpy(buf, buffer+offset, len); + offset += len; + return len; + } else { + memcpy(buf, buffer+offset, left); + offset = 0; + bound = 0; + return left; + } +} + +uint32_t Infiniband::MemoryManager::Chunk::write(char* buf, uint32_t len) +{ + uint32_t left = bytes - offset; + if (left >= len) { + memcpy(buffer+offset, buf, len); + offset += len; + return len; + } else { + memcpy(buffer+offset, buf, left); + offset = bytes; + return left; + } +} + +bool Infiniband::MemoryManager::Chunk::full() +{ + return offset == bytes; +} + +bool Infiniband::MemoryManager::Chunk::over() +{ + return Infiniband::MemoryManager::Chunk::offset == bound; +} + +void Infiniband::MemoryManager::Chunk::clear() +{ + offset = 0; + bound = 0; +} + +Infiniband::MemoryManager::Cluster::Cluster(MemoryManager& m, uint32_t s) + : manager(m), buffer_size(s), lock("cluster_lock") +{ +} + +Infiniband::MemoryManager::Cluster::~Cluster() +{ + int r = ibv_dereg_mr(chunk_base->mr); + ceph_assert(r == 0); + const auto chunk_end = chunk_base + num_chunk; + for (auto chunk = chunk_base; chunk != chunk_end; chunk++) { + chunk->~Chunk(); + } + + ::free(chunk_base); + manager.free(base); +} + +int Infiniband::MemoryManager::Cluster::fill(uint32_t num) +{ + ceph_assert(!base); + num_chunk = num; + uint32_t bytes = buffer_size * num; + + base = (char*)manager.malloc(bytes); + end = base + bytes; + ceph_assert(base); + chunk_base = static_cast<Chunk*>(::malloc(sizeof(Chunk) * num)); + // FIPS zeroization audit 20191115: this memset is not security related. + memset(static_cast<void*>(chunk_base), 0, sizeof(Chunk) * num); + free_chunks.reserve(num); + ibv_mr* m = ibv_reg_mr(manager.pd->pd, base, bytes, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); + ceph_assert(m); + Chunk* chunk = chunk_base; + for (uint32_t offset = 0; offset < bytes; offset += buffer_size){ + new(chunk) Chunk(m, buffer_size, base+offset); + free_chunks.push_back(chunk); + chunk++; + } + return 0; +} + +void Infiniband::MemoryManager::Cluster::take_back(std::vector<Chunk*> &ck) +{ + Mutex::Locker l(lock); + for (auto c : ck) { + c->clear(); + free_chunks.push_back(c); + } +} + +int Infiniband::MemoryManager::Cluster::get_buffers(std::vector<Chunk*> &chunks, size_t bytes) +{ + uint32_t num = bytes / buffer_size + 1; + if (bytes % buffer_size == 0) + --num; + int r = num; + Mutex::Locker l(lock); + if (free_chunks.empty()) + return 0; + if (!bytes) { + r = free_chunks.size(); + for (auto c : free_chunks) + chunks.push_back(c); + free_chunks.clear(); + return r; + } + if (free_chunks.size() < num) { + num = free_chunks.size(); + r = num; + } + for (uint32_t i = 0; i < num; ++i) { + chunks.push_back(free_chunks.back()); + free_chunks.pop_back(); + } + return r; +} + +bool Infiniband::MemoryManager::MemPoolContext::can_alloc(unsigned nbufs) +{ + /* unlimited */ + if (manager->cct->_conf->ms_async_rdma_receive_buffers <= 0) + return true; + + if (n_bufs_allocated + nbufs > (unsigned)manager->cct->_conf->ms_async_rdma_receive_buffers) { + lderr(manager->cct) << __func__ << " WARNING: OUT OF RX BUFFERS: allocated: " << + n_bufs_allocated << " requested: " << nbufs << + " limit: " << manager->cct->_conf->ms_async_rdma_receive_buffers << dendl; + return false; + } + + return true; +} + +void Infiniband::MemoryManager::MemPoolContext::set_stat_logger(PerfCounters *logger) { + perf_logger = logger; + if (perf_logger != nullptr) + perf_logger->set(l_msgr_rdma_rx_bufs_total, n_bufs_allocated); +} + +void Infiniband::MemoryManager::MemPoolContext::update_stats(int nbufs) +{ + n_bufs_allocated += nbufs; + + if (!perf_logger) + return; + + if (nbufs > 0) { + perf_logger->inc(l_msgr_rdma_rx_bufs_total, nbufs); + } else { + perf_logger->dec(l_msgr_rdma_rx_bufs_total, -nbufs); + } +} + +void *Infiniband::MemoryManager::mem_pool::slow_malloc() +{ + void *p; + + Mutex::Locker l(PoolAllocator::lock); + PoolAllocator::g_ctx = ctx; + // this will trigger pool expansion via PoolAllocator::malloc() + p = boost::pool<PoolAllocator>::malloc(); + PoolAllocator::g_ctx = nullptr; + return p; +} + +Infiniband::MemoryManager::MemPoolContext *Infiniband::MemoryManager::PoolAllocator::g_ctx = nullptr; +Mutex Infiniband::MemoryManager::PoolAllocator::lock("pool-alloc-lock"); + +// lock is taken by mem_pool::slow_malloc() +char *Infiniband::MemoryManager::PoolAllocator::malloc(const size_type bytes) +{ + mem_info *m; + Chunk *ch; + size_t rx_buf_size; + unsigned nbufs; + MemoryManager *manager; + CephContext *cct; + + ceph_assert(g_ctx); + manager = g_ctx->manager; + cct = manager->cct; + rx_buf_size = sizeof(Chunk) + cct->_conf->ms_async_rdma_buffer_size; + nbufs = bytes/rx_buf_size; + + if (!g_ctx->can_alloc(nbufs)) + return NULL; + + m = static_cast<mem_info *>(manager->malloc(bytes + sizeof(*m))); + if (!m) { + lderr(cct) << __func__ << " failed to allocate " << + bytes << " + " << sizeof(*m) << " bytes of memory for " << nbufs << dendl; + return NULL; + } + + m->mr = ibv_reg_mr(manager->pd->pd, m->chunks, bytes, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); + if (m->mr == NULL) { + lderr(cct) << __func__ << " failed to register " << + bytes << " + " << sizeof(*m) << " bytes of memory for " << nbufs << dendl; + manager->free(m); + return NULL; + } + + m->nbufs = nbufs; + // save this chunk context + m->ctx = g_ctx; + + // note that the memory can be allocated before perf logger is set + g_ctx->update_stats(nbufs); + + /* initialize chunks */ + ch = m->chunks; + for (unsigned i = 0; i < nbufs; i++) { + ch->lkey = m->mr->lkey; + ch->bytes = cct->_conf->ms_async_rdma_buffer_size; + ch->offset = 0; + ch->buffer = ch->data; // TODO: refactor tx and remove buffer + ch = reinterpret_cast<Chunk *>(reinterpret_cast<char *>(ch) + rx_buf_size); + } + + return reinterpret_cast<char *>(m->chunks); +} + + +void Infiniband::MemoryManager::PoolAllocator::free(char * const block) +{ + mem_info *m; + Mutex::Locker l(lock); + + m = reinterpret_cast<mem_info *>(block) - 1; + m->ctx->update_stats(-m->nbufs); + ibv_dereg_mr(m->mr); + m->ctx->manager->free(m); +} + +Infiniband::MemoryManager::MemoryManager(CephContext *c, Device *d, ProtectionDomain *p) + : cct(c), device(d), pd(p), + rxbuf_pool_ctx(this), + rxbuf_pool(&rxbuf_pool_ctx, sizeof(Chunk) + c->_conf->ms_async_rdma_buffer_size, + c->_conf->ms_async_rdma_receive_buffers > 0 ? + // if possible make initial pool size 2 * receive_queue_len + // that way there will be no pool expansion upon receive of the + // first packet. + (c->_conf->ms_async_rdma_receive_buffers < 2 * c->_conf->ms_async_rdma_receive_queue_len ? + c->_conf->ms_async_rdma_receive_buffers : 2 * c->_conf->ms_async_rdma_receive_queue_len) : + // rx pool is infinite, we can set any initial size that we want + 2 * c->_conf->ms_async_rdma_receive_queue_len) +{ +} + +Infiniband::MemoryManager::~MemoryManager() +{ + if (send) + delete send; +} + +void* Infiniband::MemoryManager::huge_pages_malloc(size_t size) +{ + size_t real_size = ALIGN_TO_PAGE_SIZE(size + HUGE_PAGE_SIZE); + char *ptr = (char *)mmap(NULL, real_size, PROT_READ | PROT_WRITE,MAP_PRIVATE | MAP_ANONYMOUS |MAP_POPULATE | MAP_HUGETLB,-1, 0); + if (ptr == MAP_FAILED) { + ptr = (char *)std::malloc(real_size); + if (ptr == NULL) return NULL; + real_size = 0; + } + *((size_t *)ptr) = real_size; + return ptr + HUGE_PAGE_SIZE; +} + +void Infiniband::MemoryManager::huge_pages_free(void *ptr) +{ + if (ptr == NULL) return; + void *real_ptr = (char *)ptr -HUGE_PAGE_SIZE; + size_t real_size = *((size_t *)real_ptr); + ceph_assert(real_size % HUGE_PAGE_SIZE == 0); + if (real_size != 0) + munmap(real_ptr, real_size); + else + std::free(real_ptr); +} + + +void* Infiniband::MemoryManager::malloc(size_t size) +{ + if (cct->_conf->ms_async_rdma_enable_hugepage) + return huge_pages_malloc(size); + else + return std::malloc(size); +} + +void Infiniband::MemoryManager::free(void *ptr) +{ + if (cct->_conf->ms_async_rdma_enable_hugepage) + huge_pages_free(ptr); + else + std::free(ptr); +} + +void Infiniband::MemoryManager::create_tx_pool(uint32_t size, uint32_t tx_num) +{ + ceph_assert(device); + ceph_assert(pd); + + send = new Cluster(*this, size); + send->fill(tx_num); +} + +void Infiniband::MemoryManager::return_tx(std::vector<Chunk*> &chunks) +{ + send->take_back(chunks); +} + +int Infiniband::MemoryManager::get_send_buffers(std::vector<Chunk*> &c, size_t bytes) +{ + return send->get_buffers(c, bytes); +} + +static std::atomic<bool> init_prereq = {false}; + +void Infiniband::verify_prereq(CephContext *cct) { + + //On RDMA MUST be called before fork + int rc = ibv_fork_init(); + if (rc) { + lderr(cct) << __func__ << " failed to call ibv_for_init(). On RDMA must be called before fork. Application aborts." << dendl; + ceph_abort(); + } + + ldout(cct, 20) << __func__ << " ms_async_rdma_enable_hugepage value is: " << cct->_conf->ms_async_rdma_enable_hugepage << dendl; + if (cct->_conf->ms_async_rdma_enable_hugepage){ + rc = setenv("RDMAV_HUGEPAGES_SAFE","1",1); + ldout(cct, 0) << __func__ << " RDMAV_HUGEPAGES_SAFE is set as: " << getenv("RDMAV_HUGEPAGES_SAFE") << dendl; + if (rc) { + lderr(cct) << __func__ << " failed to export RDMA_HUGEPAGES_SAFE. On RDMA must be exported before using huge pages. Application aborts." << dendl; + ceph_abort(); + } + } + + //Check ulimit + struct rlimit limit; + getrlimit(RLIMIT_MEMLOCK, &limit); + if (limit.rlim_cur != RLIM_INFINITY || limit.rlim_max != RLIM_INFINITY) { + lderr(cct) << __func__ << "!!! WARNING !!! For RDMA to work properly user memlock (ulimit -l) must be big enough to allow large amount of registered memory." + " We recommend setting this parameter to infinity" << dendl; + } + init_prereq = true; +} + +Infiniband::Infiniband(CephContext *cct) + : cct(cct), lock("IB lock"), + device_name(cct->_conf->ms_async_rdma_device_name), + port_num( cct->_conf->ms_async_rdma_port_num) +{ + if (!init_prereq) + verify_prereq(cct); + ldout(cct, 20) << __func__ << " constructing Infiniband..." << dendl; +} + +void Infiniband::init() +{ + Mutex::Locker l(lock); + + if (initialized) + return; + + device_list = new DeviceList(cct); + initialized = true; + + device = device_list->get_device(device_name.c_str()); + ceph_assert(device); + device->binding_port(cct, port_num); + ib_physical_port = device->active_port->get_port_num(); + pd = new ProtectionDomain(cct, device); + ceph_assert(NetHandler(cct).set_nonblock(device->ctxt->async_fd) == 0); + + support_srq = cct->_conf->ms_async_rdma_support_srq; + if (support_srq) + rx_queue_len = device->device_attr->max_srq_wr; + else + rx_queue_len = device->device_attr->max_qp_wr; + if (rx_queue_len > cct->_conf->ms_async_rdma_receive_queue_len) { + rx_queue_len = cct->_conf->ms_async_rdma_receive_queue_len; + ldout(cct, 1) << __func__ << " receive queue length is " << rx_queue_len << " receive buffers" << dendl; + } else { + ldout(cct, 0) << __func__ << " requested receive queue length " << + cct->_conf->ms_async_rdma_receive_queue_len << + " is too big. Setting " << rx_queue_len << dendl; + } + + // check for the misconfiguration + if (cct->_conf->ms_async_rdma_receive_buffers > 0 && + rx_queue_len > (unsigned)cct->_conf->ms_async_rdma_receive_buffers) { + lderr(cct) << __func__ << " rdma_receive_queue_len (" << + rx_queue_len << ") > ms_async_rdma_receive_buffers(" << + cct->_conf->ms_async_rdma_receive_buffers << ")." << dendl; + ceph_abort(); + } + + tx_queue_len = device->device_attr->max_qp_wr; + if (tx_queue_len > cct->_conf->ms_async_rdma_send_buffers) { + tx_queue_len = cct->_conf->ms_async_rdma_send_buffers; + ldout(cct, 1) << __func__ << " assigning: " << tx_queue_len << " send buffers" << dendl; + } else { + ldout(cct, 0) << __func__ << " using the max allowed send buffers: " << tx_queue_len << dendl; + } + + ldout(cct, 1) << __func__ << " device allow " << device->device_attr->max_cqe + << " completion entries" << dendl; + + memory_manager = new MemoryManager(cct, device, pd); + memory_manager->create_tx_pool(cct->_conf->ms_async_rdma_buffer_size, tx_queue_len); + + if (support_srq) { + srq = create_shared_receive_queue(rx_queue_len, MAX_SHARED_RX_SGE_COUNT); + post_chunks_to_rq(rx_queue_len, NULL); //add to srq + } +} + +Infiniband::~Infiniband() +{ + if (!initialized) + return; + if (support_srq) + ibv_destroy_srq(srq); + delete memory_manager; + delete pd; +} + +/** + * Create a shared receive queue. This basically wraps the verbs call. + * + * \param[in] max_wr + * The max number of outstanding work requests in the SRQ. + * \param[in] max_sge + * The max number of scatter elements per WR. + * \return + * A valid ibv_srq pointer, or NULL on error. + */ +ibv_srq* Infiniband::create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge) +{ + ibv_srq_init_attr sia; + // FIPS zeroization audit 20191115: this memset is not security related. + memset(&sia, 0, sizeof(sia)); + sia.srq_context = device->ctxt; + sia.attr.max_wr = max_wr; + sia.attr.max_sge = max_sge; + return ibv_create_srq(pd->pd, &sia); +} + +int Infiniband::get_tx_buffers(std::vector<Chunk*> &c, size_t bytes) +{ + return memory_manager->get_send_buffers(c, bytes); +} + +/** + * Create a new QueuePair. This factory should be used in preference to + * the QueuePair constructor directly, since this lets derivatives of + * Infiniband, e.g. MockInfiniband (if it existed), + * return mocked out QueuePair derivatives. + * + * \return + * QueuePair on success or NULL if init fails + * See QueuePair::QueuePair for parameter documentation. + */ +Infiniband::QueuePair* Infiniband::create_queue_pair(CephContext *cct, CompletionQueue *tx, + CompletionQueue* rx, ibv_qp_type type, struct rdma_cm_id *cm_id) +{ + Infiniband::QueuePair *qp = new QueuePair( + cct, *this, type, ib_physical_port, srq, tx, rx, tx_queue_len, rx_queue_len, cm_id); + if (qp->init()) { + delete qp; + return NULL; + } + return qp; +} + +int Infiniband::post_chunks_to_rq(int num, ibv_qp *qp) +{ + int ret, i = 0; + ibv_sge isge[num]; + Chunk *chunk; + ibv_recv_wr rx_work_request[num]; + + while (i < num) { + chunk = get_memory_manager()->get_rx_buffer(); + if (chunk == NULL) { + lderr(cct) << __func__ << " WARNING: out of memory. Requested " << num << + " rx buffers. Got " << i << dendl; + if (i == 0) + return 0; + // if we got some buffers post them and hope for the best + rx_work_request[i-1].next = 0; + break; + } + + isge[i].addr = reinterpret_cast<uint64_t>(chunk->data); + isge[i].length = chunk->bytes; + isge[i].lkey = chunk->lkey; + + memset(&rx_work_request[i], 0, sizeof(rx_work_request[i])); + rx_work_request[i].wr_id = reinterpret_cast<uint64_t>(chunk);// stash descriptor ptr + if (i == num - 1) { + rx_work_request[i].next = 0; + } else { + rx_work_request[i].next = &rx_work_request[i+1]; + } + rx_work_request[i].sg_list = &isge[i]; + rx_work_request[i].num_sge = 1; + i++; + } + ibv_recv_wr *badworkrequest; + if (support_srq) { + ret = ibv_post_srq_recv(srq, &rx_work_request[0], &badworkrequest); + ceph_assert(ret == 0); + } else { + ceph_assert(qp); + ret = ibv_post_recv(qp, &rx_work_request[0], &badworkrequest); + ceph_assert(ret == 0); + } + return i; +} + +Infiniband::CompletionChannel* Infiniband::create_comp_channel(CephContext *c) +{ + Infiniband::CompletionChannel *cc = new Infiniband::CompletionChannel(c, *this); + if (cc->init()) { + delete cc; + return NULL; + } + return cc; +} + +Infiniband::CompletionQueue* Infiniband::create_comp_queue( + CephContext *cct, CompletionChannel *cc) +{ + Infiniband::CompletionQueue *cq = new Infiniband::CompletionQueue( + cct, *this, CQ_DEPTH, cc); + if (cq->init()) { + delete cq; + return NULL; + } + return cq; +} + +// 1 means no valid buffer read, 0 means got enough buffer +// else return < 0 means error +int Infiniband::recv_msg(CephContext *cct, int sd, IBSYNMsg& im) +{ + char msg[TCP_MSG_LEN]; + char gid[33]; + ssize_t r = ::read(sd, &msg, sizeof(msg)); + // Drop incoming qpt + if (cct->_conf->ms_inject_socket_failures && sd >= 0) { + if (rand() % cct->_conf->ms_inject_socket_failures == 0) { + ldout(cct, 0) << __func__ << " injecting socket failure" << dendl; + return -EINVAL; + } + } + if (r < 0) { + r = -errno; + lderr(cct) << __func__ << " got error " << r << ": " + << cpp_strerror(r) << dendl; + } else if (r == 0) { // valid disconnect message of length 0 + ldout(cct, 10) << __func__ << " got disconnect message " << dendl; + } else if ((size_t)r != sizeof(msg)) { // invalid message + ldout(cct, 1) << __func__ << " got bad length (" << r << ") " << dendl; + r = -EINVAL; + } else { // valid message + sscanf(msg, "%hx:%x:%x:%x:%s", &(im.lid), &(im.qpn), &(im.psn), &(im.peer_qpn),gid); + wire_gid_to_gid(gid, &(im.gid)); + ldout(cct, 5) << __func__ << " recevd: " << im.lid << ", " << im.qpn << ", " << im.psn << ", " << im.peer_qpn << ", " << gid << dendl; + } + return r; +} + +int Infiniband::send_msg(CephContext *cct, int sd, IBSYNMsg& im) +{ + int retry = 0; + ssize_t r; + + char msg[TCP_MSG_LEN]; + char gid[33]; +retry: + gid_to_wire_gid(&(im.gid), gid); + sprintf(msg, "%04x:%08x:%08x:%08x:%s", im.lid, im.qpn, im.psn, im.peer_qpn, gid); + ldout(cct, 10) << __func__ << " sending: " << im.lid << ", " << im.qpn << ", " << im.psn + << ", " << im.peer_qpn << ", " << gid << dendl; + r = ::write(sd, msg, sizeof(msg)); + // Drop incoming qpt + if (cct->_conf->ms_inject_socket_failures && sd >= 0) { + if (rand() % cct->_conf->ms_inject_socket_failures == 0) { + ldout(cct, 0) << __func__ << " injecting socket failure" << dendl; + return -EINVAL; + } + } + + if ((size_t)r != sizeof(msg)) { + // FIXME need to handle EAGAIN instead of retry + if (r < 0 && (errno == EINTR || errno == EAGAIN) && retry < 3) { + retry++; + goto retry; + } + if (r < 0) + lderr(cct) << __func__ << " send returned error " << errno << ": " + << cpp_strerror(errno) << dendl; + else + lderr(cct) << __func__ << " send got bad length (" << r << ") " << cpp_strerror(errno) << dendl; + return -errno; + } + return 0; +} + +void Infiniband::wire_gid_to_gid(const char *wgid, union ibv_gid *gid) +{ + char tmp[9]; + uint32_t v32; + int i; + + for (tmp[8] = 0, i = 0; i < 4; ++i) { + memcpy(tmp, wgid + i * 8, 8); + sscanf(tmp, "%x", &v32); + *(uint32_t *)(&gid->raw[i * 4]) = ntohl(v32); + } +} + +void Infiniband::gid_to_wire_gid(const union ibv_gid *gid, char wgid[]) +{ + for (int i = 0; i < 4; ++i) + sprintf(&wgid[i * 8], "%08x", htonl(*(uint32_t *)(gid->raw + i * 4))); +} + +Infiniband::QueuePair::~QueuePair() +{ + if (qp) { + ldout(cct, 20) << __func__ << " destroy qp=" << qp << dendl; + ceph_assert(!ibv_destroy_qp(qp)); + } +} + +/** + * Given a string representation of the `status' field from Verbs + * struct `ibv_wc'. + * + * \param[in] status + * The integer status obtained in ibv_wc.status. + * \return + * A string corresponding to the given status. + */ +const char* Infiniband::wc_status_to_string(int status) +{ + static const char *lookup[] = { + "SUCCESS", + "LOC_LEN_ERR", + "LOC_QP_OP_ERR", + "LOC_EEC_OP_ERR", + "LOC_PROT_ERR", + "WR_FLUSH_ERR", + "MW_BIND_ERR", + "BAD_RESP_ERR", + "LOC_ACCESS_ERR", + "REM_INV_REQ_ERR", + "REM_ACCESS_ERR", + "REM_OP_ERR", + "RETRY_EXC_ERR", + "RNR_RETRY_EXC_ERR", + "LOC_RDD_VIOL_ERR", + "REM_INV_RD_REQ_ERR", + "REM_ABORT_ERR", + "INV_EECN_ERR", + "INV_EEC_STATE_ERR", + "FATAL_ERR", + "RESP_TIMEOUT_ERR", + "GENERAL_ERR" + }; + + if (status < IBV_WC_SUCCESS || status > IBV_WC_GENERAL_ERR) + return "<status out of range!>"; + return lookup[status]; +} + +const char* Infiniband::qp_state_string(int status) { + switch(status) { + case IBV_QPS_RESET : return "IBV_QPS_RESET"; + case IBV_QPS_INIT : return "IBV_QPS_INIT"; + case IBV_QPS_RTR : return "IBV_QPS_RTR"; + case IBV_QPS_RTS : return "IBV_QPS_RTS"; + case IBV_QPS_SQD : return "IBV_QPS_SQD"; + case IBV_QPS_SQE : return "IBV_QPS_SQE"; + case IBV_QPS_ERR : return "IBV_QPS_ERR"; + default: return " out of range."; + } +} diff --git a/src/msg/async/rdma/Infiniband.h b/src/msg/async/rdma/Infiniband.h new file mode 100644 index 00000000..2889cdfc --- /dev/null +++ b/src/msg/async/rdma/Infiniband.h @@ -0,0 +1,529 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 XSKY <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_INFINIBAND_H +#define CEPH_INFINIBAND_H + +#include <boost/pool/pool.hpp> +// need this because boost messes with ceph log/assert definitions +#include "include/ceph_assert.h" + +#include <infiniband/verbs.h> +#include <rdma/rdma_cma.h> + +#include <atomic> +#include <string> +#include <vector> + +#include "include/int_types.h" +#include "include/page.h" +#include "common/debug.h" +#include "common/errno.h" +#include "common/Mutex.h" +#include "common/perf_counters.h" +#include "msg/msg_types.h" +#include "msg/async/net_handler.h" + +#define HUGE_PAGE_SIZE (2 * 1024 * 1024) +#define ALIGN_TO_PAGE_SIZE(x) \ + (((x) + HUGE_PAGE_SIZE -1) / HUGE_PAGE_SIZE * HUGE_PAGE_SIZE) + +struct IBSYNMsg { + uint16_t lid; + uint32_t qpn; + uint32_t psn; + uint32_t peer_qpn; + union ibv_gid gid; +} __attribute__((packed)); + +class RDMAStack; +class CephContext; + +class Port { + struct ibv_context* ctxt; + int port_num; + struct ibv_port_attr* port_attr; + uint16_t lid; + int gid_idx = 0; + union ibv_gid gid; + + public: + explicit Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn); + uint16_t get_lid() { return lid; } + ibv_gid get_gid() { return gid; } + int get_port_num() { return port_num; } + ibv_port_attr* get_port_attr() { return port_attr; } + int get_gid_idx() { return gid_idx; } +}; + + +class Device { + ibv_device *device; + const char* name; + uint8_t port_cnt = 0; + public: + explicit Device(CephContext *c, ibv_device* d, struct ibv_context *dc); + ~Device() { + if (active_port) { + delete active_port; + ceph_assert(ibv_close_device(ctxt) == 0); + } + } + const char* get_name() { return name;} + uint16_t get_lid() { return active_port->get_lid(); } + ibv_gid get_gid() { return active_port->get_gid(); } + int get_gid_idx() { return active_port->get_gid_idx(); } + void binding_port(CephContext *c, int port_num); + struct ibv_context *ctxt; + ibv_device_attr *device_attr; + Port* active_port; +}; + + +class DeviceList { + struct ibv_device ** device_list; + struct ibv_context ** device_context_list; + int num; + Device** devices; + public: + explicit DeviceList(CephContext *cct): device_list(ibv_get_device_list(&num)), + device_context_list(rdma_get_devices(&num)) { + if (device_list == NULL || num == 0) { + lderr(cct) << __func__ << " failed to get rdma device list. " << cpp_strerror(errno) << dendl; + ceph_abort(); + } + devices = new Device*[num]; + + for (int i = 0;i < num; ++i) { + devices[i] = new Device(cct, device_list[i], device_context_list[i]); + } + } + ~DeviceList() { + for (int i=0; i < num; ++i) { + delete devices[i]; + } + delete []devices; + ibv_free_device_list(device_list); + } + + Device* get_device(const char* device_name) { + ceph_assert(devices); + for (int i = 0; i < num; ++i) { + if (!strlen(device_name) || !strcmp(device_name, devices[i]->get_name())) { + return devices[i]; + } + } + return NULL; + } +}; + +// stat counters +enum { + l_msgr_rdma_dispatcher_first = 94000, + + l_msgr_rdma_polling, + l_msgr_rdma_inflight_tx_chunks, + l_msgr_rdma_rx_bufs_in_use, + l_msgr_rdma_rx_bufs_total, + + l_msgr_rdma_tx_total_wc, + l_msgr_rdma_tx_total_wc_errors, + l_msgr_rdma_tx_wc_retry_errors, + l_msgr_rdma_tx_wc_wr_flush_errors, + + l_msgr_rdma_rx_total_wc, + l_msgr_rdma_rx_total_wc_errors, + l_msgr_rdma_rx_fin, + + l_msgr_rdma_handshake_errors, + + l_msgr_rdma_total_async_events, + l_msgr_rdma_async_last_wqe_events, + + l_msgr_rdma_created_queue_pair, + l_msgr_rdma_active_queue_pair, + + l_msgr_rdma_dispatcher_last, +}; + +enum { + l_msgr_rdma_first = 95000, + + l_msgr_rdma_tx_no_mem, + l_msgr_rdma_tx_parital_mem, + l_msgr_rdma_tx_failed, + + l_msgr_rdma_tx_chunks, + l_msgr_rdma_tx_bytes, + l_msgr_rdma_rx_chunks, + l_msgr_rdma_rx_bytes, + l_msgr_rdma_pending_sent_conns, + + l_msgr_rdma_last, +}; + +class RDMADispatcher; + +class Infiniband { + public: + class ProtectionDomain { + public: + explicit ProtectionDomain(CephContext *cct, Device *device); + ~ProtectionDomain(); + + ibv_pd* const pd; + }; + + + class MemoryManager { + public: + class Chunk { + public: + Chunk(ibv_mr* m, uint32_t len, char* b); + ~Chunk(); + + void set_offset(uint32_t o); + uint32_t get_offset(); + void set_bound(uint32_t b); + void prepare_read(uint32_t b); + uint32_t get_bound(); + uint32_t read(char* buf, uint32_t len); + uint32_t write(char* buf, uint32_t len); + bool full(); + bool over(); + void clear(); + + public: + ibv_mr* mr; + uint32_t lkey = 0; + uint32_t bytes; + uint32_t bound = 0; + uint32_t offset; + char* buffer; // TODO: remove buffer/refactor TX + char data[0]; + }; + + class Cluster { + public: + Cluster(MemoryManager& m, uint32_t s); + ~Cluster(); + + int fill(uint32_t num); + void take_back(std::vector<Chunk*> &ck); + int get_buffers(std::vector<Chunk*> &chunks, size_t bytes); + Chunk *get_chunk_by_buffer(const char *c) { + uint32_t idx = (c - base) / buffer_size; + Chunk *chunk = chunk_base + idx; + return chunk; + } + bool is_my_buffer(const char *c) const { + return c >= base && c < end; + } + + MemoryManager& manager; + uint32_t buffer_size; + uint32_t num_chunk = 0; + Mutex lock; + std::vector<Chunk*> free_chunks; + char *base = nullptr; + char *end = nullptr; + Chunk* chunk_base = nullptr; + }; + + class MemPoolContext { + PerfCounters *perf_logger; + + public: + MemoryManager *manager; + unsigned n_bufs_allocated; + // true if it is possible to alloc + // more memory for the pool + explicit MemPoolContext(MemoryManager *m) : + perf_logger(nullptr), + manager(m), + n_bufs_allocated(0) {} + bool can_alloc(unsigned nbufs); + void update_stats(int val); + void set_stat_logger(PerfCounters *logger); + }; + + class PoolAllocator { + struct mem_info { + ibv_mr *mr; + MemPoolContext *ctx; + unsigned nbufs; + Chunk chunks[0]; + }; + public: + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + static char * malloc(const size_type bytes); + static void free(char * const block); + + static MemPoolContext *g_ctx; + static Mutex lock; + }; + + /** + * modify boost pool so that it is possible to + * have a thread safe 'context' when allocating/freeing + * the memory. It is needed to allow a different pool + * configurations and bookkeeping per CephContext and + * also to be able to use same allocator to deal with + * RX and TX pool. + * TODO: use boost pool to allocate TX chunks too + */ + class mem_pool : public boost::pool<PoolAllocator> { + private: + MemPoolContext *ctx; + void *slow_malloc(); + + public: + explicit mem_pool(MemPoolContext *ctx, const size_type nrequested_size, + const size_type nnext_size = 32, + const size_type nmax_size = 0) : + pool(nrequested_size, nnext_size, nmax_size), + ctx(ctx) { } + + void *malloc() { + if (!store().empty()) + return (store().malloc)(); + // need to alloc more memory... + // slow path code + return slow_malloc(); + } + }; + + MemoryManager(CephContext *c, Device *d, ProtectionDomain *p); + ~MemoryManager(); + + void* malloc(size_t size); + void free(void *ptr); + + void create_tx_pool(uint32_t size, uint32_t tx_num); + void return_tx(std::vector<Chunk*> &chunks); + int get_send_buffers(std::vector<Chunk*> &c, size_t bytes); + bool is_tx_buffer(const char* c) { return send->is_my_buffer(c); } + Chunk *get_tx_chunk_by_buffer(const char *c) { + return send->get_chunk_by_buffer(c); + } + uint32_t get_tx_buffer_size() const { + return send->buffer_size; + } + + Chunk *get_rx_buffer() { + return reinterpret_cast<Chunk *>(rxbuf_pool.malloc()); + } + + void release_rx_buffer(Chunk *chunk) { + rxbuf_pool.free(chunk); + } + + void set_rx_stat_logger(PerfCounters *logger) { + rxbuf_pool_ctx.set_stat_logger(logger); + } + + CephContext *cct; + private: + // TODO: Cluster -> TxPool txbuf_pool + // chunk layout fix + // + Cluster* send = nullptr;// SEND + Device *device; + ProtectionDomain *pd; + MemPoolContext rxbuf_pool_ctx; + mem_pool rxbuf_pool; + + + void* huge_pages_malloc(size_t size); + void huge_pages_free(void *ptr); + }; + + private: + uint32_t tx_queue_len = 0; + uint32_t rx_queue_len = 0; + uint32_t max_sge = 0; + uint8_t ib_physical_port = 0; + MemoryManager* memory_manager = nullptr; + ibv_srq* srq = nullptr; // shared receive work queue + Device *device = NULL; + ProtectionDomain *pd = NULL; + DeviceList *device_list = nullptr; + void wire_gid_to_gid(const char *wgid, union ibv_gid *gid); + void gid_to_wire_gid(const union ibv_gid *gid, char wgid[]); + CephContext *cct; + Mutex lock; + bool initialized = false; + const std::string &device_name; + uint8_t port_num; + bool support_srq = false; + + public: + explicit Infiniband(CephContext *c); + ~Infiniband(); + void init(); + static void verify_prereq(CephContext *cct); + + class CompletionChannel { + static const uint32_t MAX_ACK_EVENT = 5000; + CephContext *cct; + Infiniband& infiniband; + ibv_comp_channel *channel; + ibv_cq *cq; + uint32_t cq_events_that_need_ack; + + public: + CompletionChannel(CephContext *c, Infiniband &ib); + ~CompletionChannel(); + int init(); + bool get_cq_event(); + int get_fd() { return channel->fd; } + ibv_comp_channel* get_channel() { return channel; } + void bind_cq(ibv_cq *c) { cq = c; } + void ack_events(); + }; + + // this class encapsulates the creation, use, and destruction of an RC + // completion queue. + // + // You need to call init and it will create a cq and associate to comp channel + class CompletionQueue { + public: + CompletionQueue(CephContext *c, Infiniband &ib, + const uint32_t qd, CompletionChannel *cc) + : cct(c), infiniband(ib), channel(cc), cq(NULL), queue_depth(qd) {} + ~CompletionQueue(); + int init(); + int poll_cq(int num_entries, ibv_wc *ret_wc_array); + + ibv_cq* get_cq() const { return cq; } + int rearm_notify(bool solicited_only=true); + CompletionChannel* get_cc() const { return channel; } + private: + CephContext *cct; + Infiniband& infiniband; // Infiniband to which this QP belongs + CompletionChannel *channel; + ibv_cq *cq; + uint32_t queue_depth; + }; + + // this class encapsulates the creation, use, and destruction of an RC + // queue pair. + // + // you need call init and it will create a qp and bring it to the INIT state. + // after obtaining the lid, qpn, and psn of a remote queue pair, one + // must call plumb() to bring the queue pair to the RTS state. + class QueuePair { + public: + QueuePair(CephContext *c, Infiniband& infiniband, ibv_qp_type type, + int ib_physical_port, ibv_srq *srq, + Infiniband::CompletionQueue* txcq, + Infiniband::CompletionQueue* rxcq, + uint32_t tx_queue_len, uint32_t max_recv_wr, struct rdma_cm_id *cid, uint32_t q_key = 0); + ~QueuePair(); + + int init(); + + /** + * Get the initial packet sequence number for this QueuePair. + * This is randomly generated on creation. It should not be confused + * with the remote side's PSN, which is set in #plumb(). + */ + uint32_t get_initial_psn() const { return initial_psn; }; + /** + * Get the local queue pair number for this QueuePair. + * QPNs are analogous to UDP/TCP port numbers. + */ + uint32_t get_local_qp_number() const { return qp->qp_num; }; + /** + * Get the remote queue pair number for this QueuePair, as set in #plumb(). + * QPNs are analogous to UDP/TCP port numbers. + */ + int get_remote_qp_number(uint32_t *rqp) const; + /** + * Get the remote infiniband address for this QueuePair, as set in #plumb(). + * LIDs are "local IDs" in infiniband terminology. They are short, locally + * routable addresses. + */ + int get_remote_lid(uint16_t *lid) const; + /** + * Get the state of a QueuePair. + */ + int get_state() const; + /** + * Return true if the queue pair is in an error state, false otherwise. + */ + bool is_error() const; + void add_tx_wr(uint32_t amt) { tx_wr_inflight += amt; } + void dec_tx_wr(uint32_t amt) { tx_wr_inflight -= amt; } + uint32_t get_tx_wr() const { return tx_wr_inflight; } + ibv_qp* get_qp() const { return qp; } + Infiniband::CompletionQueue* get_tx_cq() const { return txcq; } + Infiniband::CompletionQueue* get_rx_cq() const { return rxcq; } + int to_dead(); + bool is_dead() const { return dead; } + + private: + CephContext *cct; + Infiniband& infiniband; // Infiniband to which this QP belongs + ibv_qp_type type; // QP type (IBV_QPT_RC, etc.) + ibv_context* ctxt; // device context of the HCA to use + int ib_physical_port; + ibv_pd* pd; // protection domain + ibv_srq* srq; // shared receive queue + ibv_qp* qp; // infiniband verbs QP handle + struct rdma_cm_id *cm_id; + Infiniband::CompletionQueue* txcq; + Infiniband::CompletionQueue* rxcq; + uint32_t initial_psn; // initial packet sequence number + uint32_t max_send_wr; + uint32_t max_recv_wr; + uint32_t q_key; + bool dead; + std::atomic<uint32_t> tx_wr_inflight = {0}; // counter for inflight Tx WQEs + }; + + public: + typedef MemoryManager::Cluster Cluster; + typedef MemoryManager::Chunk Chunk; + QueuePair* create_queue_pair(CephContext *c, CompletionQueue*, CompletionQueue*, + ibv_qp_type type, struct rdma_cm_id *cm_id); + ibv_srq* create_shared_receive_queue(uint32_t max_wr, uint32_t max_sge); + // post rx buffers to srq, return number of buffers actually posted + int post_chunks_to_rq(int num, ibv_qp *qp=NULL); + void post_chunk_to_pool(Chunk* chunk) { + get_memory_manager()->release_rx_buffer(chunk); + } + int get_tx_buffers(std::vector<Chunk*> &c, size_t bytes); + CompletionChannel *create_comp_channel(CephContext *c); + CompletionQueue *create_comp_queue(CephContext *c, CompletionChannel *cc=NULL); + uint8_t get_ib_physical_port() { return ib_physical_port; } + int send_msg(CephContext *cct, int sd, IBSYNMsg& msg); + int recv_msg(CephContext *cct, int sd, IBSYNMsg& msg); + uint16_t get_lid() { return device->get_lid(); } + ibv_gid get_gid() { return device->get_gid(); } + MemoryManager* get_memory_manager() { return memory_manager; } + Device* get_device() { return device; } + int get_async_fd() { return device->ctxt->async_fd; } + bool is_tx_buffer(const char* c) { return memory_manager->is_tx_buffer(c);} + Chunk *get_tx_chunk_by_buffer(const char *c) { return memory_manager->get_tx_chunk_by_buffer(c); } + static const char* wc_status_to_string(int status); + static const char* qp_state_string(int status); + uint32_t get_rx_queue_len() const { return rx_queue_len; } +}; + +#endif diff --git a/src/msg/async/rdma/RDMAConnectedSocketImpl.cc b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc new file mode 100644 index 00000000..89be7428 --- /dev/null +++ b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc @@ -0,0 +1,743 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 XSKY <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#include "RDMAStack.h" + +class C_handle_connection_established : public EventCallback { + RDMAConnectedSocketImpl *csi; + bool active = true; + public: + C_handle_connection_established(RDMAConnectedSocketImpl *w) : csi(w) {} + void do_request(uint64_t fd) final { + if (active) + csi->handle_connection_established(); + } + void close() { + active = false; + } +}; + +class C_handle_connection_read : public EventCallback { + RDMAConnectedSocketImpl *csi; + bool active = true; + public: + explicit C_handle_connection_read(RDMAConnectedSocketImpl *w): csi(w) {} + void do_request(uint64_t fd) final { + if (active) + csi->handle_connection(); + } + void close() { + active = false; + } +}; + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix *_dout << " RDMAConnectedSocketImpl " + +RDMAConnectedSocketImpl::RDMAConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s, + RDMAWorker *w) + : cct(cct), connected(0), error(0), infiniband(ib), + dispatcher(s), worker(w), lock("RDMAConnectedSocketImpl::lock"), + is_server(false), read_handler(new C_handle_connection_read(this)), + established_handler(new C_handle_connection_established(this)), + active(false), pending(false) +{ + if (!cct->_conf->ms_async_rdma_cm) { + qp = infiniband->create_queue_pair(cct, s->get_tx_cq(), s->get_rx_cq(), IBV_QPT_RC, NULL); + my_msg.qpn = qp->get_local_qp_number(); + my_msg.psn = qp->get_initial_psn(); + my_msg.lid = infiniband->get_lid(); + my_msg.peer_qpn = 0; + my_msg.gid = infiniband->get_gid(); + notify_fd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK); + dispatcher->register_qp(qp, this); + dispatcher->perf_logger->inc(l_msgr_rdma_created_queue_pair); + dispatcher->perf_logger->inc(l_msgr_rdma_active_queue_pair); + } +} + +RDMAConnectedSocketImpl::~RDMAConnectedSocketImpl() +{ + ldout(cct, 20) << __func__ << " destruct." << dendl; + cleanup(); + worker->remove_pending_conn(this); + dispatcher->erase_qpn(my_msg.qpn); + + for (unsigned i=0; i < wc.size(); ++i) { + dispatcher->post_chunk_to_pool(reinterpret_cast<Chunk*>(wc[i].wr_id)); + } + for (unsigned i=0; i < buffers.size(); ++i) { + dispatcher->post_chunk_to_pool(buffers[i]); + } + + Mutex::Locker l(lock); + if (notify_fd >= 0) + ::close(notify_fd); + if (tcp_fd >= 0) + ::close(tcp_fd); + error = ECONNRESET; +} + +void RDMAConnectedSocketImpl::pass_wc(std::vector<ibv_wc> &&v) +{ + Mutex::Locker l(lock); + if (wc.empty()) + wc = std::move(v); + else + wc.insert(wc.end(), v.begin(), v.end()); + notify(); +} + +void RDMAConnectedSocketImpl::get_wc(std::vector<ibv_wc> &w) +{ + Mutex::Locker l(lock); + if (wc.empty()) + return ; + w.swap(wc); +} + +int RDMAConnectedSocketImpl::activate() +{ + ibv_qp_attr qpa; + int r; + + // now connect up the qps and switch to RTR + memset(&qpa, 0, sizeof(qpa)); + qpa.qp_state = IBV_QPS_RTR; + qpa.path_mtu = IBV_MTU_1024; + qpa.dest_qp_num = peer_msg.qpn; + qpa.rq_psn = peer_msg.psn; + qpa.max_dest_rd_atomic = 1; + qpa.min_rnr_timer = 12; + //qpa.ah_attr.is_global = 0; + qpa.ah_attr.is_global = 1; + qpa.ah_attr.grh.hop_limit = 6; + qpa.ah_attr.grh.dgid = peer_msg.gid; + + qpa.ah_attr.grh.sgid_index = infiniband->get_device()->get_gid_idx(); + + qpa.ah_attr.dlid = peer_msg.lid; + qpa.ah_attr.sl = cct->_conf->ms_async_rdma_sl; + qpa.ah_attr.grh.traffic_class = cct->_conf->ms_async_rdma_dscp; + qpa.ah_attr.src_path_bits = 0; + qpa.ah_attr.port_num = (uint8_t)(infiniband->get_ib_physical_port()); + + ldout(cct, 20) << __func__ << " Choosing gid_index " << (int)qpa.ah_attr.grh.sgid_index << ", sl " << (int)qpa.ah_attr.sl << dendl; + + r = ibv_modify_qp(qp->get_qp(), &qpa, IBV_QP_STATE | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | + IBV_QP_MIN_RNR_TIMER | + IBV_QP_MAX_DEST_RD_ATOMIC); + if (r) { + lderr(cct) << __func__ << " failed to transition to RTR state: " + << cpp_strerror(errno) << dendl; + return -1; + } + + ldout(cct, 20) << __func__ << " transition to RTR state successfully." << dendl; + + // now move to RTS + qpa.qp_state = IBV_QPS_RTS; + + // How long to wait before retrying if packet lost or server dead. + // Supposedly the timeout is 4.096us*2^timeout. However, the actual + // timeout appears to be 4.096us*2^(timeout+1), so the setting + // below creates a 135ms timeout. + qpa.timeout = 14; + + // How many times to retry after timeouts before giving up. + qpa.retry_cnt = 7; + + // How many times to retry after RNR (receiver not ready) condition + // before giving up. Occurs when the remote side has not yet posted + // a receive request. + qpa.rnr_retry = 7; // 7 is infinite retry. + qpa.sq_psn = my_msg.psn; + qpa.max_rd_atomic = 1; + + r = ibv_modify_qp(qp->get_qp(), &qpa, IBV_QP_STATE | + IBV_QP_TIMEOUT | + IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_SQ_PSN | + IBV_QP_MAX_QP_RD_ATOMIC); + if (r) { + lderr(cct) << __func__ << " failed to transition to RTS state: " + << cpp_strerror(errno) << dendl; + return -1; + } + + // the queue pair should be ready to use once the client has finished + // setting up their end. + ldout(cct, 20) << __func__ << " transition to RTS state successfully." << dendl; + ldout(cct, 20) << __func__ << " QueuePair: " << qp << " with qp:" << qp->get_qp() << dendl; + + if (!is_server) { + connected = 1; //indicate successfully + ldout(cct, 20) << __func__ << " handle fake send, wake it up. QP: " << my_msg.qpn << dendl; + submit(false); + } + active = true; + + return 0; +} + +int RDMAConnectedSocketImpl::try_connect(const entity_addr_t& peer_addr, const SocketOptions &opts) { + ldout(cct, 20) << __func__ << " nonblock:" << opts.nonblock << ", nodelay:" + << opts.nodelay << ", rbuf_size: " << opts.rcbuf_size << dendl; + NetHandler net(cct); + + // we construct a socket to transport ib sync message + // but we shouldn't block in tcp connecting + if (opts.nonblock) { + tcp_fd = net.nonblock_connect(peer_addr, opts.connect_bind_addr); + } else { + tcp_fd = net.connect(peer_addr, opts.connect_bind_addr); + } + + if (tcp_fd < 0) { + return -errno; + } + + int r = net.set_socket_options(tcp_fd, opts.nodelay, opts.rcbuf_size); + if (r < 0) { + ::close(tcp_fd); + tcp_fd = -1; + return -errno; + } + + ldout(cct, 20) << __func__ << " tcp_fd: " << tcp_fd << dendl; + net.set_priority(tcp_fd, opts.priority, peer_addr.get_family()); + r = 0; + if (opts.nonblock) { + worker->center.create_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE , established_handler); + } else { + r = handle_connection_established(false); + } + return r; +} + +int RDMAConnectedSocketImpl::handle_connection_established(bool need_set_fault) { + ldout(cct, 20) << __func__ << " start " << dendl; + // delete read event + worker->center.delete_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE); + if (1 == connected) { + ldout(cct, 1) << __func__ << " warnning: logic failed " << dendl; + if (need_set_fault) { + fault(); + } + return -1; + } + // send handshake msg to server + my_msg.peer_qpn = 0; + int r = infiniband->send_msg(cct, tcp_fd, my_msg); + if (r < 0) { + ldout(cct, 1) << __func__ << " send handshake msg failed." << r << dendl; + if (need_set_fault) { + fault(); + } + return r; + } + worker->center.create_file_event(tcp_fd, EVENT_READABLE, read_handler); + ldout(cct, 20) << __func__ << " finish " << dendl; + return 0; +} + +void RDMAConnectedSocketImpl::handle_connection() { + ldout(cct, 20) << __func__ << " QP: " << my_msg.qpn << " tcp_fd: " << tcp_fd << " notify_fd: " << notify_fd << dendl; + int r = infiniband->recv_msg(cct, tcp_fd, peer_msg); + if (r <= 0) { + if (r != -EAGAIN) { + dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors); + ldout(cct, 1) << __func__ << " recv handshake msg failed." << dendl; + fault(); + } + return; + } + + if (1 == connected) { + ldout(cct, 1) << __func__ << " warnning: logic failed: read len: " << r << dendl; + fault(); + return; + } + + if (!is_server) {// syn + ack from server + my_msg.peer_qpn = peer_msg.qpn; + ldout(cct, 20) << __func__ << " peer msg : < " << peer_msg.qpn << ", " << peer_msg.psn + << ", " << peer_msg.lid << ", " << peer_msg.peer_qpn << "> " << dendl; + if (!connected) { + r = activate(); + ceph_assert(!r); + } + notify(); + r = infiniband->send_msg(cct, tcp_fd, my_msg); + if (r < 0) { + ldout(cct, 1) << __func__ << " send client ack failed." << dendl; + dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors); + fault(); + } + } else { + if (peer_msg.peer_qpn == 0) {// syn from client + if (active) { + ldout(cct, 10) << __func__ << " server is already active." << dendl; + return ; + } + r = activate(); + ceph_assert(!r); + r = infiniband->send_msg(cct, tcp_fd, my_msg); + if (r < 0) { + ldout(cct, 1) << __func__ << " server ack failed." << dendl; + dispatcher->perf_logger->inc(l_msgr_rdma_handshake_errors); + fault(); + return ; + } + } else { // ack from client + connected = 1; + ldout(cct, 10) << __func__ << " handshake of rdma is done. server connected: " << connected << dendl; + //cleanup(); + submit(false); + notify(); + } + } +} + +ssize_t RDMAConnectedSocketImpl::read(char* buf, size_t len) +{ + uint64_t i = 0; + int r = ::read(notify_fd, &i, sizeof(i)); + ldout(cct, 20) << __func__ << " notify_fd : " << i << " in " << my_msg.qpn << " r = " << r << dendl; + + if (!active) { + ldout(cct, 1) << __func__ << " when ib not active. len: " << len << dendl; + return -EAGAIN; + } + + if (0 == connected) { + ldout(cct, 1) << __func__ << " when ib not connected. len: " << len <<dendl; + return -EAGAIN; + } + ssize_t read = 0; + if (!buffers.empty()) + read = read_buffers(buf,len); + + std::vector<ibv_wc> cqe; + get_wc(cqe); + if (cqe.empty()) { + if (!buffers.empty()) { + notify(); + } + if (read > 0) { + return read; + } + if (error) { + return -error; + } else { + return -EAGAIN; + } + } + + ldout(cct, 20) << __func__ << " poll queue got " << cqe.size() << " responses. QP: " << my_msg.qpn << dendl; + for (size_t i = 0; i < cqe.size(); ++i) { + ibv_wc* response = &cqe[i]; + ceph_assert(response->status == IBV_WC_SUCCESS); + Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id); + ldout(cct, 25) << __func__ << " chunk length: " << response->byte_len << " bytes." << chunk << dendl; + chunk->prepare_read(response->byte_len); + worker->perf_logger->inc(l_msgr_rdma_rx_bytes, response->byte_len); + if (response->byte_len == 0) { + dispatcher->perf_logger->inc(l_msgr_rdma_rx_fin); + if (connected) { + error = ECONNRESET; + ldout(cct, 20) << __func__ << " got remote close msg..." << dendl; + } + dispatcher->post_chunk_to_pool(chunk); + } else { + if (read == (ssize_t)len) { + buffers.push_back(chunk); + ldout(cct, 25) << __func__ << " buffers add a chunk: " << response->byte_len << dendl; + } else if (read + response->byte_len > (ssize_t)len) { + read += chunk->read(buf+read, (ssize_t)len-read); + buffers.push_back(chunk); + ldout(cct, 25) << __func__ << " buffers add a chunk: " << chunk->get_offset() << ":" << chunk->get_bound() << dendl; + } else { + read += chunk->read(buf+read, response->byte_len); + dispatcher->post_chunk_to_pool(chunk); + update_post_backlog(); + } + } + } + + worker->perf_logger->inc(l_msgr_rdma_rx_chunks, cqe.size()); + if (is_server && connected == 0) { + ldout(cct, 20) << __func__ << " we do not need last handshake, QP: " << my_msg.qpn << " peer QP: " << peer_msg.qpn << dendl; + connected = 1; //if so, we don't need the last handshake + cleanup(); + submit(false); + } + + if (!buffers.empty()) { + notify(); + } + + if (read == 0 && error) + return -error; + return read == 0 ? -EAGAIN : read; +} + +ssize_t RDMAConnectedSocketImpl::read_buffers(char* buf, size_t len) +{ + size_t read = 0, tmp = 0; + auto c = buffers.begin(); + for (; c != buffers.end() ; ++c) { + tmp = (*c)->read(buf+read, len-read); + read += tmp; + ldout(cct, 25) << __func__ << " this iter read: " << tmp << " bytes." << " offset: " << (*c)->get_offset() << " ,bound: " << (*c)->get_bound() << ". Chunk:" << *c << dendl; + if ((*c)->over()) { + dispatcher->post_chunk_to_pool(*c); + update_post_backlog(); + ldout(cct, 25) << __func__ << " one chunk over." << dendl; + } + if (read == len) { + break; + } + } + + if (c != buffers.end() && (*c)->over()) + ++c; + buffers.erase(buffers.begin(), c); + ldout(cct, 25) << __func__ << " got " << read << " bytes, buffers size: " << buffers.size() << dendl; + return read; +} + +ssize_t RDMAConnectedSocketImpl::zero_copy_read(bufferptr &data) +{ + if (error) + return -error; + static const int MAX_COMPLETIONS = 16; + ibv_wc wc[MAX_COMPLETIONS]; + ssize_t size = 0; + + ibv_wc* response; + Chunk* chunk; + bool loaded = false; + auto iter = buffers.begin(); + if (iter != buffers.end()) { + chunk = *iter; + // FIXME need to handle release + // auto del = std::bind(&Chunk::post_srq, std::move(chunk), infiniband); + buffers.erase(iter); + loaded = true; + size = chunk->bound; + } + + std::vector<ibv_wc> cqe; + get_wc(cqe); + if (cqe.empty()) + return size == 0 ? -EAGAIN : size; + + ldout(cct, 20) << __func__ << " pool completion queue got " << cqe.size() << " responses."<< dendl; + + for (size_t i = 0; i < cqe.size(); ++i) { + response = &wc[i]; + chunk = reinterpret_cast<Chunk*>(response->wr_id); + chunk->prepare_read(response->byte_len); + if (!loaded && i == 0) { + // FIXME need to handle release + // auto del = std::bind(&Chunk::post_srq, std::move(chunk), infiniband); + size = chunk->bound; + continue; + } + buffers.push_back(chunk); + iter++; + } + + if (size == 0) + return -EAGAIN; + return size; +} + +ssize_t RDMAConnectedSocketImpl::send(bufferlist &bl, bool more) +{ + if (error) { + if (!active) + return -EPIPE; + return -error; + } + size_t bytes = bl.length(); + if (!bytes) + return 0; + { + Mutex::Locker l(lock); + pending_bl.claim_append(bl); + if (!connected) { + ldout(cct, 20) << __func__ << " fake send to upper, QP: " << my_msg.qpn << dendl; + return bytes; + } + } + ldout(cct, 20) << __func__ << " QP: " << my_msg.qpn << dendl; + ssize_t r = submit(more); + if (r < 0 && r != -EAGAIN) + return r; + return bytes; +} + +ssize_t RDMAConnectedSocketImpl::submit(bool more) +{ + if (error) + return -error; + Mutex::Locker l(lock); + size_t bytes = pending_bl.length(); + ldout(cct, 20) << __func__ << " we need " << bytes << " bytes. iov size: " + << pending_bl.buffers().size() << dendl; + if (!bytes) + return 0; + + auto fill_tx_via_copy = [this](std::vector<Chunk*> &tx_buffers, + unsigned bytes, + auto& start, + const auto& end) -> unsigned { + ceph_assert(start != end); + auto chunk_idx = tx_buffers.size(); + int ret = worker->get_reged_mem(this, tx_buffers, bytes); + if (ret == 0) { + ldout(cct, 1) << __func__ << " no enough buffers in worker " << worker << dendl; + worker->perf_logger->inc(l_msgr_rdma_tx_no_mem); + return 0; + } + + unsigned total_copied = 0; + Chunk *current_chunk = tx_buffers[chunk_idx]; + while (start != end) { + const uintptr_t addr = reinterpret_cast<uintptr_t>(start->c_str()); + unsigned copied = 0; + while (copied < start->length()) { + uint32_t r = current_chunk->write((char*)addr+copied, start->length() - copied); + copied += r; + total_copied += r; + bytes -= r; + if (current_chunk->full()){ + if (++chunk_idx == tx_buffers.size()) + return total_copied; + current_chunk = tx_buffers[chunk_idx]; + } + } + ++start; + } + ceph_assert(bytes == 0); + return total_copied; + }; + + std::vector<Chunk*> tx_buffers; + auto it = std::cbegin(pending_bl.buffers()); + auto copy_it = it; + unsigned total = 0; + unsigned need_reserve_bytes = 0; + while (it != pending_bl.buffers().end()) { + if (infiniband->is_tx_buffer(it->raw_c_str())) { + if (need_reserve_bytes) { + unsigned copied = fill_tx_via_copy(tx_buffers, need_reserve_bytes, copy_it, it); + total += copied; + if (copied < need_reserve_bytes) + goto sending; + need_reserve_bytes = 0; + } + ceph_assert(copy_it == it); + tx_buffers.push_back(infiniband->get_tx_chunk_by_buffer(it->raw_c_str())); + total += it->length(); + ++copy_it; + } else { + need_reserve_bytes += it->length(); + } + ++it; + } + if (need_reserve_bytes) + total += fill_tx_via_copy(tx_buffers, need_reserve_bytes, copy_it, it); + + sending: + if (total == 0) + return -EAGAIN; + ceph_assert(total <= pending_bl.length()); + bufferlist swapped; + if (total < pending_bl.length()) { + worker->perf_logger->inc(l_msgr_rdma_tx_parital_mem); + pending_bl.splice(total, pending_bl.length()-total, &swapped); + pending_bl.swap(swapped); + } else { + pending_bl.clear(); + } + + ldout(cct, 20) << __func__ << " left bytes: " << pending_bl.length() << " in buffers " + << pending_bl.buffers().size() << " tx chunks " << tx_buffers.size() << dendl; + + int r = post_work_request(tx_buffers); + if (r < 0) + return r; + + ldout(cct, 20) << __func__ << " finished sending " << bytes << " bytes." << dendl; + return pending_bl.length() ? -EAGAIN : 0; +} + +int RDMAConnectedSocketImpl::post_work_request(std::vector<Chunk*> &tx_buffers) +{ + ldout(cct, 20) << __func__ << " QP: " << my_msg.qpn << " " << tx_buffers[0] << dendl; + vector<Chunk*>::iterator current_buffer = tx_buffers.begin(); + ibv_sge isge[tx_buffers.size()]; + uint32_t current_sge = 0; + ibv_send_wr iswr[tx_buffers.size()]; + uint32_t current_swr = 0; + ibv_send_wr* pre_wr = NULL; + uint32_t num = 0; + + // FIPS zeroization audit 20191115: these memsets are not security related. + memset(iswr, 0, sizeof(iswr)); + memset(isge, 0, sizeof(isge)); + + while (current_buffer != tx_buffers.end()) { + isge[current_sge].addr = reinterpret_cast<uint64_t>((*current_buffer)->buffer); + isge[current_sge].length = (*current_buffer)->get_offset(); + isge[current_sge].lkey = (*current_buffer)->mr->lkey; + ldout(cct, 25) << __func__ << " sending buffer: " << *current_buffer << " length: " << isge[current_sge].length << dendl; + + iswr[current_swr].wr_id = reinterpret_cast<uint64_t>(*current_buffer); + iswr[current_swr].next = NULL; + iswr[current_swr].sg_list = &isge[current_sge]; + iswr[current_swr].num_sge = 1; + iswr[current_swr].opcode = IBV_WR_SEND; + iswr[current_swr].send_flags = IBV_SEND_SIGNALED; + /*if (isge[current_sge].length < infiniband->max_inline_data) { + iswr[current_swr].send_flags = IBV_SEND_INLINE; + ldout(cct, 20) << __func__ << " send_inline." << dendl; + }*/ + + num++; + worker->perf_logger->inc(l_msgr_rdma_tx_bytes, isge[current_sge].length); + if (pre_wr) + pre_wr->next = &iswr[current_swr]; + pre_wr = &iswr[current_swr]; + ++current_sge; + ++current_swr; + ++current_buffer; + } + + ibv_send_wr *bad_tx_work_request; + if (ibv_post_send(qp->get_qp(), iswr, &bad_tx_work_request)) { + ldout(cct, 1) << __func__ << " failed to send data" + << " (most probably should be peer not ready): " + << cpp_strerror(errno) << dendl; + worker->perf_logger->inc(l_msgr_rdma_tx_failed); + return -errno; + } + qp->add_tx_wr(num); + worker->perf_logger->inc(l_msgr_rdma_tx_chunks, tx_buffers.size()); + ldout(cct, 20) << __func__ << " qp state is " << Infiniband::qp_state_string(qp->get_state()) << dendl; + return 0; +} + +void RDMAConnectedSocketImpl::fin() { + ibv_send_wr wr; + // FIPS zeroization audit 20191115: this memset is not security related. + memset(&wr, 0, sizeof(wr)); + + wr.wr_id = reinterpret_cast<uint64_t>(qp); + wr.num_sge = 0; + wr.opcode = IBV_WR_SEND; + wr.send_flags = IBV_SEND_SIGNALED; + ibv_send_wr* bad_tx_work_request; + if (ibv_post_send(qp->get_qp(), &wr, &bad_tx_work_request)) { + ldout(cct, 1) << __func__ << " failed to send message=" + << " ibv_post_send failed(most probably should be peer not ready): " + << cpp_strerror(errno) << dendl; + worker->perf_logger->inc(l_msgr_rdma_tx_failed); + return ; + } + qp->add_tx_wr(1); +} + +void RDMAConnectedSocketImpl::cleanup() { + if (read_handler && tcp_fd >= 0) { + (static_cast<C_handle_connection_read*>(read_handler))->close(); + worker->center.submit_to(worker->center.get_id(), [this]() { + worker->center.delete_file_event(tcp_fd, EVENT_READABLE | EVENT_WRITABLE); + }, false); + delete read_handler; + read_handler = nullptr; + } + if (established_handler) { + (static_cast<C_handle_connection_established*>(established_handler))->close(); + delete established_handler; + established_handler = nullptr; + } +} + +void RDMAConnectedSocketImpl::notify() +{ + // note: notify_fd is an event fd (man eventfd) + // write argument must be a 64bit integer + uint64_t i = 1; + + ceph_assert(sizeof(i) == write(notify_fd, &i, sizeof(i))); +} + +void RDMAConnectedSocketImpl::shutdown() +{ + if (!error) + fin(); + error = ECONNRESET; + active = false; +} + +void RDMAConnectedSocketImpl::close() +{ + if (!error) + fin(); + error = ECONNRESET; + active = false; +} + +void RDMAConnectedSocketImpl::fault() +{ + ldout(cct, 1) << __func__ << " tcp fd " << tcp_fd << dendl; + /*if (qp) { + qp->to_dead(); + qp = NULL; + }*/ + error = ECONNRESET; + connected = 1; + notify(); +} + +void RDMAConnectedSocketImpl::set_accept_fd(int sd) +{ + tcp_fd = sd; + is_server = true; + worker->center.submit_to(worker->center.get_id(), [this]() { + worker->center.create_file_event(tcp_fd, EVENT_READABLE, read_handler); + }, true); +} + +void RDMAConnectedSocketImpl::post_chunks_to_rq(int num) +{ + post_backlog += num - infiniband->post_chunks_to_rq(num, qp->get_qp()); +} + +void RDMAConnectedSocketImpl::update_post_backlog() +{ + if (post_backlog) + post_backlog -= post_backlog - dispatcher->post_chunks_to_rq(post_backlog, qp->get_qp()); +} diff --git a/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc b/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc new file mode 100644 index 00000000..432c2d2b --- /dev/null +++ b/src/msg/async/rdma/RDMAIWARPConnectedSocketImpl.cc @@ -0,0 +1,183 @@ +#include "RDMAStack.h" + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix *_dout << " RDMAIWARPConnectedSocketImpl " + +#define TIMEOUT_MS 3000 +#define RETRY_COUNT 7 + +RDMAIWARPConnectedSocketImpl::RDMAIWARPConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s, + RDMAWorker *w, RDMACMInfo *info) + : RDMAConnectedSocketImpl(cct, ib, s, w), cm_con_handler(new C_handle_cm_connection(this)) +{ + status = IDLE; + notify_fd = eventfd(0, EFD_CLOEXEC|EFD_NONBLOCK); + if (info) { + is_server = true; + cm_id = info->cm_id; + cm_channel = info->cm_channel; + status = RDMA_ID_CREATED; + remote_qpn = info->qp_num; + if (alloc_resource()) { + close_notify(); + return; + } + worker->center.submit_to(worker->center.get_id(), [this]() { + worker->center.create_file_event(cm_channel->fd, EVENT_READABLE, cm_con_handler); + status = CHANNEL_FD_CREATED; + }, false); + status = RESOURCE_ALLOCATED; + local_qpn = qp->get_local_qp_number(); + my_msg.qpn = local_qpn; + } else { + is_server = false; + cm_channel = rdma_create_event_channel(); + rdma_create_id(cm_channel, &cm_id, NULL, RDMA_PS_TCP); + status = RDMA_ID_CREATED; + ldout(cct, 20) << __func__ << " successfully created cm id: " << cm_id << dendl; + } +} + +RDMAIWARPConnectedSocketImpl::~RDMAIWARPConnectedSocketImpl() { + ldout(cct, 20) << __func__ << " destruct." << dendl; + std::unique_lock l(close_mtx); + close_condition.wait(l, [&] { return closed; }); + if (status >= RDMA_ID_CREATED) { + rdma_destroy_id(cm_id); + rdma_destroy_event_channel(cm_channel); + } +} + +int RDMAIWARPConnectedSocketImpl::try_connect(const entity_addr_t& peer_addr, const SocketOptions &opts) { + worker->center.create_file_event(cm_channel->fd, EVENT_READABLE, cm_con_handler); + status = CHANNEL_FD_CREATED; + if (rdma_resolve_addr(cm_id, NULL, const_cast<struct sockaddr*>(peer_addr.get_sockaddr()), TIMEOUT_MS)) { + lderr(cct) << __func__ << " failed to resolve addr" << dendl; + return -1; + } + return 0; +} + +void RDMAIWARPConnectedSocketImpl::close() { + error = ECONNRESET; + active = false; + if (status >= CONNECTED) { + rdma_disconnect(cm_id); + } + close_notify(); +} + +void RDMAIWARPConnectedSocketImpl::shutdown() { + error = ECONNRESET; + active = false; +} + +void RDMAIWARPConnectedSocketImpl::handle_cm_connection() { + struct rdma_cm_event *event; + rdma_get_cm_event(cm_channel, &event); + ldout(cct, 20) << __func__ << " event name: " << rdma_event_str(event->event) + << " (cm id: " << cm_id << ")" << dendl; + struct rdma_conn_param cm_params; + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + status = ADDR_RESOLVED; + if (rdma_resolve_route(cm_id, TIMEOUT_MS)) { + lderr(cct) << __func__ << " failed to resolve rdma addr" << dendl; + notify(); + } + break; + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + status = ROUTE_RESOLVED; + if (alloc_resource()) { + lderr(cct) << __func__ << " failed to alloc resource while resolving the route" << dendl; + connected = -ECONNREFUSED; + notify(); + break; + } + local_qpn = qp->get_local_qp_number(); + my_msg.qpn = local_qpn; + + // FIPS zeroization audit 20191115: this memset is not security related. + memset(&cm_params, 0, sizeof(cm_params)); + cm_params.retry_count = RETRY_COUNT; + cm_params.qp_num = local_qpn; + if (rdma_connect(cm_id, &cm_params)) { + lderr(cct) << __func__ << " failed to connect remote rdma port" << dendl; + connected = -ECONNREFUSED; + notify(); + } + break; + + case RDMA_CM_EVENT_ESTABLISHED: + ldout(cct, 20) << __func__ << " qp_num=" << cm_id->qp->qp_num << dendl; + status = CONNECTED; + if (!is_server) { + remote_qpn = event->param.conn.qp_num; + activate(); + notify(); + } + break; + + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + lderr(cct) << __func__ << " rdma connection rejected" << dendl; + connected = -ECONNREFUSED; + notify(); + break; + + case RDMA_CM_EVENT_DISCONNECTED: + status = DISCONNECTED; + close_notify(); + if (!error) { + error = ECONNRESET; + notify(); + } + break; + + case RDMA_CM_EVENT_DEVICE_REMOVAL: + break; + + default: + ceph_abort_msg("unhandled event"); + break; + } + rdma_ack_cm_event(event); +} + +void RDMAIWARPConnectedSocketImpl::activate() { + ldout(cct, 30) << __func__ << dendl; + active = true; + connected = 1; +} + +int RDMAIWARPConnectedSocketImpl::alloc_resource() { + ldout(cct, 30) << __func__ << dendl; + qp = infiniband->create_queue_pair(cct, dispatcher->get_tx_cq(), + dispatcher->get_rx_cq(), IBV_QPT_RC, cm_id); + if (!qp) { + return -1; + } + if (!cct->_conf->ms_async_rdma_support_srq) + dispatcher->post_chunks_to_rq(infiniband->get_rx_queue_len(), qp->get_qp()); + dispatcher->register_qp(qp, this); + dispatcher->perf_logger->inc(l_msgr_rdma_created_queue_pair); + dispatcher->perf_logger->inc(l_msgr_rdma_active_queue_pair); + return 0; +} + +void RDMAIWARPConnectedSocketImpl::close_notify() { + ldout(cct, 30) << __func__ << dendl; + if (status >= CHANNEL_FD_CREATED) { + worker->center.delete_file_event(cm_channel->fd, EVENT_READABLE); + } + std::unique_lock l(close_mtx); + if (!closed) { + closed = true; + close_condition.notify_all(); + } +} diff --git a/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc b/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc new file mode 100644 index 00000000..210eaf00 --- /dev/null +++ b/src/msg/async/rdma/RDMAIWARPServerSocketImpl.cc @@ -0,0 +1,107 @@ +#include <poll.h> + +#include "msg/async/net_handler.h" +#include "RDMAStack.h" + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix *_dout << " RDMAIWARPServerSocketImpl " + +RDMAIWARPServerSocketImpl::RDMAIWARPServerSocketImpl( + CephContext *cct, Infiniband* i, + RDMADispatcher *s, RDMAWorker *w, entity_addr_t& a, unsigned addr_slot) + : RDMAServerSocketImpl(cct, i, s, w, a, addr_slot) +{ +} + +int RDMAIWARPServerSocketImpl::listen(entity_addr_t &sa, + const SocketOptions &opt) +{ + ldout(cct, 20) << __func__ << " bind to rdma point" << dendl; + cm_channel = rdma_create_event_channel(); + rdma_create_id(cm_channel, &cm_id, NULL, RDMA_PS_TCP); + ldout(cct, 20) << __func__ << " successfully created cm id: " << cm_id << dendl; + int rc = rdma_bind_addr(cm_id, const_cast<struct sockaddr*>(sa.get_sockaddr())); + if (rc < 0) { + rc = -errno; + ldout(cct, 10) << __func__ << " unable to bind to " << sa.get_sockaddr() + << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl; + goto err; + } + rc = rdma_listen(cm_id, 128); + if (rc < 0) { + rc = -errno; + ldout(cct, 10) << __func__ << " unable to listen to " << sa.get_sockaddr() + << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl; + goto err; + } + server_setup_socket = cm_channel->fd; + ldout(cct, 20) << __func__ << " fd of cm_channel is " << server_setup_socket << dendl; + return 0; + +err: + server_setup_socket = -1; + rdma_destroy_id(cm_id); + rdma_destroy_event_channel(cm_channel); + return rc; +} + +int RDMAIWARPServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt, + entity_addr_t *out, Worker *w) +{ + ldout(cct, 15) << __func__ << dendl; + + ceph_assert(sock); + struct pollfd pfd = { + .fd = cm_channel->fd, + .events = POLLIN, + }; + int ret = poll(&pfd, 1, 0); + ceph_assert(ret >= 0); + if (!ret) + return -EAGAIN; + + struct rdma_cm_event *cm_event; + rdma_get_cm_event(cm_channel, &cm_event); + ldout(cct, 20) << __func__ << " event name: " << rdma_event_str(cm_event->event) << dendl; + + struct rdma_cm_id *event_cm_id = cm_event->id; + struct rdma_event_channel *event_channel = rdma_create_event_channel(); + + rdma_migrate_id(event_cm_id, event_channel); + + struct rdma_cm_id *new_cm_id = event_cm_id; + struct rdma_conn_param *remote_conn_param = &cm_event->param.conn; + struct rdma_conn_param local_conn_param; + + RDMACMInfo info(new_cm_id, event_channel, remote_conn_param->qp_num); + RDMAIWARPConnectedSocketImpl* server = + new RDMAIWARPConnectedSocketImpl(cct, infiniband, dispatcher, dynamic_cast<RDMAWorker*>(w), &info); + + // FIPS zeroization audit 20191115: this memset is not security related. + memset(&local_conn_param, 0, sizeof(local_conn_param)); + local_conn_param.qp_num = server->get_local_qpn(); + + if (rdma_accept(new_cm_id, &local_conn_param)) { + return -EAGAIN; + } + server->activate(); + ldout(cct, 20) << __func__ << " accepted a new QP" << dendl; + + rdma_ack_cm_event(cm_event); + + std::unique_ptr<RDMAConnectedSocketImpl> csi(server); + *sock = ConnectedSocket(std::move(csi)); + struct sockaddr *addr = &new_cm_id->route.addr.dst_addr; + out->set_sockaddr(addr); + + return 0; +} + +void RDMAIWARPServerSocketImpl::abort_accept() +{ + if (server_setup_socket >= 0) { + rdma_destroy_id(cm_id); + rdma_destroy_event_channel(cm_channel); + } +} diff --git a/src/msg/async/rdma/RDMAServerSocketImpl.cc b/src/msg/async/rdma/RDMAServerSocketImpl.cc new file mode 100644 index 00000000..98402cfd --- /dev/null +++ b/src/msg/async/rdma/RDMAServerSocketImpl.cc @@ -0,0 +1,127 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 XSKY <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "msg/async/net_handler.h" +#include "RDMAStack.h" + +#include "include/compat.h" +#include "include/sock_compat.h" + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix *_dout << " RDMAServerSocketImpl " + +RDMAServerSocketImpl::RDMAServerSocketImpl( + CephContext *cct, Infiniband* i, RDMADispatcher *s, RDMAWorker *w, + entity_addr_t& a, unsigned slot) + : ServerSocketImpl(a.get_type(), slot), + cct(cct), net(cct), server_setup_socket(-1), infiniband(i), + dispatcher(s), worker(w), sa(a) +{ +} + +int RDMAServerSocketImpl::listen(entity_addr_t &sa, const SocketOptions &opt) +{ + int rc = 0; + server_setup_socket = net.create_socket(sa.get_family(), true); + if (server_setup_socket < 0) { + rc = -errno; + lderr(cct) << __func__ << " failed to create server socket: " + << cpp_strerror(errno) << dendl; + return rc; + } + + rc = net.set_nonblock(server_setup_socket); + if (rc < 0) { + goto err; + } + + rc = net.set_socket_options(server_setup_socket, opt.nodelay, opt.rcbuf_size); + if (rc < 0) { + goto err; + } + + rc = ::bind(server_setup_socket, sa.get_sockaddr(), sa.get_sockaddr_len()); + if (rc < 0) { + rc = -errno; + ldout(cct, 10) << __func__ << " unable to bind to " << sa.get_sockaddr() + << " on port " << sa.get_port() << ": " << cpp_strerror(errno) << dendl; + goto err; + } + + rc = ::listen(server_setup_socket, cct->_conf->ms_tcp_listen_backlog); + if (rc < 0) { + rc = -errno; + lderr(cct) << __func__ << " unable to listen on " << sa << ": " << cpp_strerror(errno) << dendl; + goto err; + } + + ldout(cct, 20) << __func__ << " bind to " << sa.get_sockaddr() << " on port " << sa.get_port() << dendl; + return 0; + +err: + ::close(server_setup_socket); + server_setup_socket = -1; + return rc; +} + +int RDMAServerSocketImpl::accept(ConnectedSocket *sock, const SocketOptions &opt, entity_addr_t *out, Worker *w) +{ + ldout(cct, 15) << __func__ << dendl; + + ceph_assert(sock); + + sockaddr_storage ss; + socklen_t slen = sizeof(ss); + int sd = accept_cloexec(server_setup_socket, (sockaddr*)&ss, &slen); + if (sd < 0) { + return -errno; + } + + int r = net.set_nonblock(sd); + if (r < 0) { + ::close(sd); + return -errno; + } + + r = net.set_socket_options(sd, opt.nodelay, opt.rcbuf_size); + if (r < 0) { + ::close(sd); + return -errno; + } + + ceph_assert(NULL != out); //out should not be NULL in accept connection + + out->set_type(addr_type); + out->set_sockaddr((sockaddr*)&ss); + net.set_priority(sd, opt.priority, out->get_family()); + + RDMAConnectedSocketImpl* server; + //Worker* w = dispatcher->get_stack()->get_worker(); + server = new RDMAConnectedSocketImpl(cct, infiniband, dispatcher, dynamic_cast<RDMAWorker*>(w)); + server->set_accept_fd(sd); + ldout(cct, 20) << __func__ << " accepted a new QP, tcp_fd: " << sd << dendl; + std::unique_ptr<RDMAConnectedSocketImpl> csi(server); + *sock = ConnectedSocket(std::move(csi)); + + return 0; +} + +void RDMAServerSocketImpl::abort_accept() +{ + if (server_setup_socket >= 0) + ::close(server_setup_socket); +} diff --git a/src/msg/async/rdma/RDMAStack.cc b/src/msg/async/rdma/RDMAStack.cc new file mode 100644 index 00000000..f63a8e7d --- /dev/null +++ b/src/msg/async/rdma/RDMAStack.cc @@ -0,0 +1,610 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 XSKY <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <poll.h> +#include <errno.h> +#include <sys/time.h> +#include <sys/resource.h> + +#include "include/str_list.h" +#include "include/compat.h" +#include "common/Cycles.h" +#include "common/deleter.h" +#include "common/Tub.h" +#include "RDMAStack.h" + +#define dout_subsys ceph_subsys_ms +#undef dout_prefix +#define dout_prefix *_dout << "RDMAStack " + +RDMADispatcher::~RDMADispatcher() +{ + ldout(cct, 20) << __func__ << " destructing rdma dispatcher" << dendl; + polling_stop(); + + ceph_assert(qp_conns.empty()); + ceph_assert(num_qp_conn == 0); + ceph_assert(dead_queue_pairs.empty()); + ceph_assert(num_dead_queue_pair == 0); + + delete async_handler; +} + +RDMADispatcher::RDMADispatcher(CephContext* c, RDMAStack* s) + : cct(c), async_handler(new C_handle_cq_async(this)), lock("RDMADispatcher::lock"), + w_lock("RDMADispatcher::for worker pending list"), stack(s) +{ + PerfCountersBuilder plb(cct, "AsyncMessenger::RDMADispatcher", l_msgr_rdma_dispatcher_first, l_msgr_rdma_dispatcher_last); + + plb.add_u64_counter(l_msgr_rdma_polling, "polling", "Whether dispatcher thread is polling"); + plb.add_u64_counter(l_msgr_rdma_inflight_tx_chunks, "inflight_tx_chunks", "The number of inflight tx chunks"); + plb.add_u64_counter(l_msgr_rdma_rx_bufs_in_use, "rx_bufs_in_use", "The number of rx buffers that are holding data and being processed"); + plb.add_u64_counter(l_msgr_rdma_rx_bufs_total, "rx_bufs_total", "The total number of rx buffers"); + + plb.add_u64_counter(l_msgr_rdma_tx_total_wc, "tx_total_wc", "The number of tx work comletions"); + plb.add_u64_counter(l_msgr_rdma_tx_total_wc_errors, "tx_total_wc_errors", "The number of tx errors"); + plb.add_u64_counter(l_msgr_rdma_tx_wc_retry_errors, "tx_retry_errors", "The number of tx retry errors"); + plb.add_u64_counter(l_msgr_rdma_tx_wc_wr_flush_errors, "tx_wr_flush_errors", "The number of tx work request flush errors"); + + plb.add_u64_counter(l_msgr_rdma_rx_total_wc, "rx_total_wc", "The number of total rx work completion"); + plb.add_u64_counter(l_msgr_rdma_rx_total_wc_errors, "rx_total_wc_errors", "The number of total rx error work completion"); + plb.add_u64_counter(l_msgr_rdma_rx_fin, "rx_fin", "The number of rx finish work request"); + + plb.add_u64_counter(l_msgr_rdma_total_async_events, "total_async_events", "The number of async events"); + plb.add_u64_counter(l_msgr_rdma_async_last_wqe_events, "async_last_wqe_events", "The number of last wqe events"); + + plb.add_u64_counter(l_msgr_rdma_handshake_errors, "handshake_errors", "The number of handshake errors"); + + + plb.add_u64_counter(l_msgr_rdma_created_queue_pair, "created_queue_pair", "Active queue pair number"); + plb.add_u64_counter(l_msgr_rdma_active_queue_pair, "active_queue_pair", "Created queue pair number"); + + perf_logger = plb.create_perf_counters(); + cct->get_perfcounters_collection()->add(perf_logger); + Cycles::init(); +} + +void RDMADispatcher::polling_start() +{ + // take lock because listen/connect can happen from different worker threads + Mutex::Locker l(lock); + + if (t.joinable()) + return; // dispatcher thread already running + + get_stack()->get_infiniband().get_memory_manager()->set_rx_stat_logger(perf_logger); + + tx_cc = get_stack()->get_infiniband().create_comp_channel(cct); + ceph_assert(tx_cc); + rx_cc = get_stack()->get_infiniband().create_comp_channel(cct); + ceph_assert(rx_cc); + tx_cq = get_stack()->get_infiniband().create_comp_queue(cct, tx_cc); + ceph_assert(tx_cq); + rx_cq = get_stack()->get_infiniband().create_comp_queue(cct, rx_cc); + ceph_assert(rx_cq); + + t = std::thread(&RDMADispatcher::polling, this); + ceph_pthread_setname(t.native_handle(), "rdma-polling"); +} + +void RDMADispatcher::polling_stop() +{ + { + Mutex::Locker l(lock); + done = true; + } + + if (!t.joinable()) + return; + + t.join(); + + tx_cc->ack_events(); + rx_cc->ack_events(); + delete tx_cq; + delete rx_cq; + delete tx_cc; + delete rx_cc; +} + +void RDMADispatcher::handle_async_event() +{ + ldout(cct, 30) << __func__ << dendl; + while (1) { + ibv_async_event async_event; + if (ibv_get_async_event(get_stack()->get_infiniband().get_device()->ctxt, &async_event)) { + if (errno != EAGAIN) + lderr(cct) << __func__ << " ibv_get_async_event failed. (errno=" << errno + << " " << cpp_strerror(errno) << ")" << dendl; + return; + } + perf_logger->inc(l_msgr_rdma_total_async_events); + // FIXME: Currently we must ensure no other factor make QP in ERROR state, + // otherwise this qp can't be deleted in current cleanup flow. + if (async_event.event_type == IBV_EVENT_QP_LAST_WQE_REACHED) { + perf_logger->inc(l_msgr_rdma_async_last_wqe_events); + uint64_t qpn = async_event.element.qp->qp_num; + ldout(cct, 10) << __func__ << " event associated qp=" << async_event.element.qp + << " evt: " << ibv_event_type_str(async_event.event_type) << dendl; + Mutex::Locker l(lock); + RDMAConnectedSocketImpl *conn = get_conn_lockless(qpn); + if (!conn) { + ldout(cct, 1) << __func__ << " missing qp_num=" << qpn << " discard event" << dendl; + } else { + ldout(cct, 1) << __func__ << " it's not forwardly stopped by us, reenable=" << conn << dendl; + conn->fault(); + if (!cct->_conf->ms_async_rdma_cm) + erase_qpn_lockless(qpn); + } + } else { + ldout(cct, 1) << __func__ << " ibv_get_async_event: dev=" << get_stack()->get_infiniband().get_device()->ctxt + << " evt: " << ibv_event_type_str(async_event.event_type) + << dendl; + } + ibv_ack_async_event(&async_event); + } +} + +void RDMADispatcher::post_chunk_to_pool(Chunk* chunk) +{ + Mutex::Locker l(lock); + get_stack()->get_infiniband().post_chunk_to_pool(chunk); + perf_logger->dec(l_msgr_rdma_rx_bufs_in_use); +} + +int RDMADispatcher::post_chunks_to_rq(int num, ibv_qp *qp) +{ + Mutex::Locker l(lock); + return get_stack()->get_infiniband().post_chunks_to_rq(num, qp); +} + +void RDMADispatcher::polling() +{ + static int MAX_COMPLETIONS = 32; + ibv_wc wc[MAX_COMPLETIONS]; + + std::map<RDMAConnectedSocketImpl*, std::vector<ibv_wc> > polled; + std::vector<ibv_wc> tx_cqe; + ldout(cct, 20) << __func__ << " going to poll tx cq: " << tx_cq << " rx cq: " << rx_cq << dendl; + RDMAConnectedSocketImpl *conn = nullptr; + uint64_t last_inactive = Cycles::rdtsc(); + bool rearmed = false; + int r = 0; + + while (true) { + int tx_ret = tx_cq->poll_cq(MAX_COMPLETIONS, wc); + if (tx_ret > 0) { + ldout(cct, 20) << __func__ << " tx completion queue got " << tx_ret + << " responses."<< dendl; + handle_tx_event(wc, tx_ret); + } + + int rx_ret = rx_cq->poll_cq(MAX_COMPLETIONS, wc); + if (rx_ret > 0) { + ldout(cct, 20) << __func__ << " rx completion queue got " << rx_ret + << " responses."<< dendl; + perf_logger->inc(l_msgr_rdma_rx_total_wc, rx_ret); + perf_logger->inc(l_msgr_rdma_rx_bufs_in_use, rx_ret); + + Mutex::Locker l(lock);//make sure connected socket alive when pass wc + + for (int i = 0; i < rx_ret; ++i) { + ibv_wc* response = &wc[i]; + Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id); + + if (response->status == IBV_WC_SUCCESS) { + ceph_assert(wc[i].opcode == IBV_WC_RECV); + conn = get_conn_lockless(response->qp_num); + if (!conn) { + ldout(cct, 1) << __func__ << " csi with qpn " << response->qp_num << " may be dead. chunk " << chunk << " will be back ? " << r << dendl; + get_stack()->get_infiniband().post_chunk_to_pool(chunk); + perf_logger->dec(l_msgr_rdma_rx_bufs_in_use); + } else { + conn->post_chunks_to_rq(1); + polled[conn].push_back(*response); + } + } else { + perf_logger->inc(l_msgr_rdma_rx_total_wc_errors); + ldout(cct, 1) << __func__ << " work request returned error for buffer(" << chunk + << ") status(" << response->status << ":" + << get_stack()->get_infiniband().wc_status_to_string(response->status) << ")" << dendl; + if (response->status != IBV_WC_WR_FLUSH_ERR) { + conn = get_conn_lockless(response->qp_num); + if (conn && conn->is_connected()) + conn->fault(); + } + get_stack()->get_infiniband().post_chunk_to_pool(chunk); + perf_logger->dec(l_msgr_rdma_rx_bufs_in_use); + } + } + for (auto &&i : polled) + i.first->pass_wc(std::move(i.second)); + polled.clear(); + } + + if (!tx_ret && !rx_ret) { + // NOTE: Has TX just transitioned to idle? We should do it when idle! + // It's now safe to delete queue pairs (see comment by declaration + // for dead_queue_pairs). + // Additionally, don't delete qp while outstanding_buffers isn't empty, + // because we need to check qp's state before sending + perf_logger->set(l_msgr_rdma_inflight_tx_chunks, inflight); + if (num_dead_queue_pair) { + Mutex::Locker l(lock); // FIXME reuse dead qp because creating one qp costs 1 ms + auto it = dead_queue_pairs.begin(); + while (it != dead_queue_pairs.end()) { + auto i = *it; + // Bypass QPs that do not collect all Tx completions yet. + if (i->get_tx_wr()) { + ldout(cct, 20) << __func__ << " bypass qp=" << i << " tx_wr=" << i->get_tx_wr() << dendl; + ++it; + } else { + ldout(cct, 10) << __func__ << " finally delete qp=" << i << dendl; + delete i; + it = dead_queue_pairs.erase(it); + perf_logger->dec(l_msgr_rdma_active_queue_pair); + --num_dead_queue_pair; + } + } + } + if (!num_qp_conn && done && dead_queue_pairs.empty()) + break; + + uint64_t now = Cycles::rdtsc(); + if (Cycles::to_microseconds(now - last_inactive) > cct->_conf->ms_async_rdma_polling_us) { + handle_async_event(); + if (!rearmed) { + // Clean up cq events after rearm notify ensure no new incoming event + // arrived between polling and rearm + tx_cq->rearm_notify(); + rx_cq->rearm_notify(); + rearmed = true; + continue; + } + + struct pollfd channel_poll[2]; + channel_poll[0].fd = tx_cc->get_fd(); + channel_poll[0].events = POLLIN | POLLERR | POLLNVAL | POLLHUP; + channel_poll[0].revents = 0; + channel_poll[1].fd = rx_cc->get_fd(); + channel_poll[1].events = POLLIN | POLLERR | POLLNVAL | POLLHUP; + channel_poll[1].revents = 0; + r = 0; + perf_logger->set(l_msgr_rdma_polling, 0); + while (!done && r == 0) { + r = TEMP_FAILURE_RETRY(poll(channel_poll, 2, 100)); + if (r < 0) { + r = -errno; + lderr(cct) << __func__ << " poll failed " << r << dendl; + ceph_abort(); + } + } + if (r > 0 && tx_cc->get_cq_event()) + ldout(cct, 20) << __func__ << " got tx cq event." << dendl; + if (r > 0 && rx_cc->get_cq_event()) + ldout(cct, 20) << __func__ << " got rx cq event." << dendl; + last_inactive = Cycles::rdtsc(); + perf_logger->set(l_msgr_rdma_polling, 1); + rearmed = false; + } + } + } +} + +void RDMADispatcher::notify_pending_workers() { + if (num_pending_workers) { + RDMAWorker *w = nullptr; + { + Mutex::Locker l(w_lock); + if (!pending_workers.empty()) { + w = pending_workers.front(); + pending_workers.pop_front(); + --num_pending_workers; + } + } + if (w) + w->notify_worker(); + } +} + +void RDMADispatcher::register_qp(QueuePair *qp, RDMAConnectedSocketImpl* csi) +{ + Mutex::Locker l(lock); + ceph_assert(!qp_conns.count(qp->get_local_qp_number())); + qp_conns[qp->get_local_qp_number()] = std::make_pair(qp, csi); + ++num_qp_conn; +} + +RDMAConnectedSocketImpl* RDMADispatcher::get_conn_lockless(uint32_t qp) +{ + auto it = qp_conns.find(qp); + if (it == qp_conns.end()) + return nullptr; + if (it->second.first->is_dead()) + return nullptr; + return it->second.second; +} + +Infiniband::QueuePair* RDMADispatcher::get_qp(uint32_t qp) +{ + Mutex::Locker l(lock); + // Try to find the QP in qp_conns firstly. + auto it = qp_conns.find(qp); + if (it != qp_conns.end()) + return it->second.first; + + // Try again in dead_queue_pairs. + for (auto &i: dead_queue_pairs) + if (i->get_local_qp_number() == qp) + return i; + + return nullptr; +} + +void RDMADispatcher::erase_qpn_lockless(uint32_t qpn) +{ + auto it = qp_conns.find(qpn); + if (it == qp_conns.end()) + return ; + ++num_dead_queue_pair; + dead_queue_pairs.push_back(it->second.first); + qp_conns.erase(it); + --num_qp_conn; +} + +void RDMADispatcher::erase_qpn(uint32_t qpn) +{ + Mutex::Locker l(lock); + erase_qpn_lockless(qpn); +} + +void RDMADispatcher::handle_tx_event(ibv_wc *cqe, int n) +{ + std::vector<Chunk*> tx_chunks; + + for (int i = 0; i < n; ++i) { + ibv_wc* response = &cqe[i]; + Chunk* chunk = reinterpret_cast<Chunk *>(response->wr_id); + ldout(cct, 25) << __func__ << " QP: " << response->qp_num + << " len: " << response->byte_len << " , addr:" << chunk + << " " << get_stack()->get_infiniband().wc_status_to_string(response->status) << dendl; + + QueuePair *qp = get_qp(response->qp_num); + if (qp) + qp->dec_tx_wr(1); + + if (response->status != IBV_WC_SUCCESS) { + perf_logger->inc(l_msgr_rdma_tx_total_wc_errors); + if (response->status == IBV_WC_RETRY_EXC_ERR) { + ldout(cct, 1) << __func__ << " connection between server and client not working. Disconnect this now" << dendl; + perf_logger->inc(l_msgr_rdma_tx_wc_retry_errors); + } else if (response->status == IBV_WC_WR_FLUSH_ERR) { + ldout(cct, 1) << __func__ << " Work Request Flushed Error: this connection's qp=" + << response->qp_num << " should be down while this WR=" << response->wr_id + << " still in flight." << dendl; + perf_logger->inc(l_msgr_rdma_tx_wc_wr_flush_errors); + } else { + ldout(cct, 1) << __func__ << " send work request returned error for buffer(" + << response->wr_id << ") status(" << response->status << "): " + << get_stack()->get_infiniband().wc_status_to_string(response->status) << dendl; + Mutex::Locker l(lock);//make sure connected socket alive when pass wc + RDMAConnectedSocketImpl *conn = get_conn_lockless(response->qp_num); + + if (conn && conn->is_connected()) { + ldout(cct, 25) << __func__ << " qp state is : " << conn->get_qp_state() << dendl; + conn->fault(); + } else { + ldout(cct, 1) << __func__ << " missing qp_num=" << response->qp_num << " discard event" << dendl; + } + } + } + + //TX completion may come either from regular send message or from 'fin' message. + //In the case of 'fin' wr_id points to the QueuePair. + if (get_stack()->get_infiniband().get_memory_manager()->is_tx_buffer(chunk->buffer)) { + tx_chunks.push_back(chunk); + } else if (reinterpret_cast<QueuePair*>(response->wr_id)->get_local_qp_number() == response->qp_num ) { + ldout(cct, 1) << __func__ << " sending of the disconnect msg completed" << dendl; + } else { + ldout(cct, 1) << __func__ << " not tx buffer, chunk " << chunk << dendl; + ceph_abort(); + } + } + + perf_logger->inc(l_msgr_rdma_tx_total_wc, n); + post_tx_buffer(tx_chunks); +} + +/** + * Add the given Chunks to the given free queue. + * + * \param[in] chunks + * The Chunks to enqueue. + * \return + * 0 if success or -1 for failure + */ +void RDMADispatcher::post_tx_buffer(std::vector<Chunk*> &chunks) +{ + if (chunks.empty()) + return ; + + inflight -= chunks.size(); + get_stack()->get_infiniband().get_memory_manager()->return_tx(chunks); + ldout(cct, 30) << __func__ << " release " << chunks.size() + << " chunks, inflight " << inflight << dendl; + notify_pending_workers(); +} + + +RDMAWorker::RDMAWorker(CephContext *c, unsigned i) + : Worker(c, i), stack(nullptr), + tx_handler(new C_handle_cq_tx(this)), lock("RDMAWorker::lock") +{ + // initialize perf_logger + char name[128]; + sprintf(name, "AsyncMessenger::RDMAWorker-%u", id); + PerfCountersBuilder plb(cct, name, l_msgr_rdma_first, l_msgr_rdma_last); + + plb.add_u64_counter(l_msgr_rdma_tx_no_mem, "tx_no_mem", "The count of no tx buffer"); + plb.add_u64_counter(l_msgr_rdma_tx_parital_mem, "tx_parital_mem", "The count of parital tx buffer"); + plb.add_u64_counter(l_msgr_rdma_tx_failed, "tx_failed_post", "The number of tx failed posted"); + + plb.add_u64_counter(l_msgr_rdma_tx_chunks, "tx_chunks", "The number of tx chunks transmitted"); + plb.add_u64_counter(l_msgr_rdma_tx_bytes, "tx_bytes", "The bytes of tx chunks transmitted", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_u64_counter(l_msgr_rdma_rx_chunks, "rx_chunks", "The number of rx chunks transmitted"); + plb.add_u64_counter(l_msgr_rdma_rx_bytes, "rx_bytes", "The bytes of rx chunks transmitted", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_u64_counter(l_msgr_rdma_pending_sent_conns, "pending_sent_conns", "The count of pending sent conns"); + + perf_logger = plb.create_perf_counters(); + cct->get_perfcounters_collection()->add(perf_logger); +} + +RDMAWorker::~RDMAWorker() +{ + delete tx_handler; +} + +void RDMAWorker::initialize() +{ + if (!dispatcher) { + dispatcher = &stack->get_dispatcher(); + } +} + +int RDMAWorker::listen(entity_addr_t &sa, unsigned addr_slot, + const SocketOptions &opt,ServerSocket *sock) +{ + get_stack()->get_infiniband().init(); + dispatcher->polling_start(); + RDMAServerSocketImpl *p; + if (cct->_conf->ms_async_rdma_type == "iwarp") { + p = new RDMAIWARPServerSocketImpl( + cct, &get_stack()->get_infiniband(), &get_stack()->get_dispatcher(), this, + sa, addr_slot); + } else { + p = new RDMAServerSocketImpl(cct, &get_stack()->get_infiniband(), + &get_stack()->get_dispatcher(), this, sa, + addr_slot); + } + int r = p->listen(sa, opt); + if (r < 0) { + delete p; + return r; + } + + *sock = ServerSocket(std::unique_ptr<ServerSocketImpl>(p)); + return 0; +} + +int RDMAWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) +{ + get_stack()->get_infiniband().init(); + dispatcher->polling_start(); + + RDMAConnectedSocketImpl* p; + if (cct->_conf->ms_async_rdma_type == "iwarp") { + p = new RDMAIWARPConnectedSocketImpl(cct, &get_stack()->get_infiniband(), &get_stack()->get_dispatcher(), this); + } else { + p = new RDMAConnectedSocketImpl(cct, &get_stack()->get_infiniband(), &get_stack()->get_dispatcher(), this); + } + int r = p->try_connect(addr, opts); + + if (r < 0) { + ldout(cct, 1) << __func__ << " try connecting failed." << dendl; + delete p; + return r; + } + std::unique_ptr<RDMAConnectedSocketImpl> csi(p); + *socket = ConnectedSocket(std::move(csi)); + return 0; +} + +int RDMAWorker::get_reged_mem(RDMAConnectedSocketImpl *o, std::vector<Chunk*> &c, size_t bytes) +{ + ceph_assert(center.in_thread()); + int r = get_stack()->get_infiniband().get_tx_buffers(c, bytes); + ceph_assert(r >= 0); + size_t got = get_stack()->get_infiniband().get_memory_manager()->get_tx_buffer_size() * r; + ldout(cct, 30) << __func__ << " need " << bytes << " bytes, reserve " << got << " registered bytes, inflight " << dispatcher->inflight << dendl; + stack->get_dispatcher().inflight += r; + if (got >= bytes) + return r; + + if (o) { + if (!o->is_pending()) { + pending_sent_conns.push_back(o); + perf_logger->inc(l_msgr_rdma_pending_sent_conns, 1); + o->set_pending(1); + } + dispatcher->make_pending_worker(this); + } + return r; +} + + +void RDMAWorker::handle_pending_message() +{ + ldout(cct, 20) << __func__ << " pending conns " << pending_sent_conns.size() << dendl; + while (!pending_sent_conns.empty()) { + RDMAConnectedSocketImpl *o = pending_sent_conns.front(); + pending_sent_conns.pop_front(); + ssize_t r = o->submit(false); + ldout(cct, 20) << __func__ << " sent pending bl socket=" << o << " r=" << r << dendl; + if (r < 0) { + if (r == -EAGAIN) { + pending_sent_conns.push_back(o); + dispatcher->make_pending_worker(this); + return ; + } + o->fault(); + } + o->set_pending(0); + perf_logger->dec(l_msgr_rdma_pending_sent_conns, 1); + } + dispatcher->notify_pending_workers(); +} + +RDMAStack::RDMAStack(CephContext *cct, const string &t) + : NetworkStack(cct, t), ib(cct), dispatcher(cct, this) +{ + ldout(cct, 20) << __func__ << " constructing RDMAStack..." << dendl; + + unsigned num = get_num_worker(); + for (unsigned i = 0; i < num; ++i) { + RDMAWorker* w = dynamic_cast<RDMAWorker*>(get_worker(i)); + w->set_stack(this); + } + ldout(cct, 20) << " creating RDMAStack:" << this << " with dispatcher:" << &dispatcher << dendl; +} + +RDMAStack::~RDMAStack() +{ + if (cct->_conf->ms_async_rdma_enable_hugepage) { + unsetenv("RDMAV_HUGEPAGES_SAFE"); //remove env variable on destruction + } +} + +void RDMAStack::spawn_worker(unsigned i, std::function<void ()> &&func) +{ + threads.resize(i+1); + threads[i] = std::thread(func); +} + +void RDMAStack::join_worker(unsigned i) +{ + ceph_assert(threads.size() > i && threads[i].joinable()); + threads[i].join(); +} diff --git a/src/msg/async/rdma/RDMAStack.h b/src/msg/async/rdma/RDMAStack.h new file mode 100644 index 00000000..e4d34ee0 --- /dev/null +++ b/src/msg/async/rdma/RDMAStack.h @@ -0,0 +1,348 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2016 XSKY <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_MSG_RDMASTACK_H +#define CEPH_MSG_RDMASTACK_H + +#include <sys/eventfd.h> + +#include <list> +#include <vector> +#include <thread> + +#include "common/ceph_context.h" +#include "common/debug.h" +#include "common/errno.h" +#include "msg/async/Stack.h" +#include "Infiniband.h" + +class RDMAConnectedSocketImpl; +class RDMAServerSocketImpl; +class RDMAStack; +class RDMAWorker; + +class RDMADispatcher { + typedef Infiniband::MemoryManager::Chunk Chunk; + typedef Infiniband::QueuePair QueuePair; + + std::thread t; + CephContext *cct; + Infiniband::CompletionQueue* tx_cq = nullptr; + Infiniband::CompletionQueue* rx_cq = nullptr; + Infiniband::CompletionChannel *tx_cc = nullptr, *rx_cc = nullptr; + EventCallbackRef async_handler; + bool done = false; + std::atomic<uint64_t> num_dead_queue_pair = {0}; + std::atomic<uint64_t> num_qp_conn = {0}; + Mutex lock; // protect `qp_conns`, `dead_queue_pairs` + // qp_num -> InfRcConnection + // The main usage of `qp_conns` is looking up connection by qp_num, + // so the lifecycle of element in `qp_conns` is the lifecycle of qp. + //// make qp queue into dead state + /** + * 1. Connection call mark_down + * 2. Move the Queue Pair into the Error state(QueuePair::to_dead) + * 3. Wait for the affiliated event IBV_EVENT_QP_LAST_WQE_REACHED(handle_async_event) + * 4. Wait for CQ to be empty(handle_tx_event) + * 5. Destroy the QP by calling ibv_destroy_qp()(handle_tx_event) + * + * @param qp The qp needed to dead + */ + ceph::unordered_map<uint32_t, std::pair<QueuePair*, RDMAConnectedSocketImpl*> > qp_conns; + + /// if a queue pair is closed when transmit buffers are active + /// on it, the transmit buffers never get returned via tx_cq. To + /// work around this problem, don't delete queue pairs immediately. Instead, + /// save them in this vector and delete them at a safe time, when there are + /// no outstanding transmit buffers to be lost. + std::vector<QueuePair*> dead_queue_pairs; + + std::atomic<uint64_t> num_pending_workers = {0}; + Mutex w_lock; // protect pending workers + // fixme: lockfree + std::list<RDMAWorker*> pending_workers; + RDMAStack* stack; + + class C_handle_cq_async : public EventCallback { + RDMADispatcher *dispatcher; + public: + explicit C_handle_cq_async(RDMADispatcher *w): dispatcher(w) {} + void do_request(uint64_t fd) { + // worker->handle_tx_event(); + dispatcher->handle_async_event(); + } + }; + + public: + PerfCounters *perf_logger; + + explicit RDMADispatcher(CephContext* c, RDMAStack* s); + virtual ~RDMADispatcher(); + void handle_async_event(); + + void polling_start(); + void polling_stop(); + void polling(); + void register_qp(QueuePair *qp, RDMAConnectedSocketImpl* csi); + void make_pending_worker(RDMAWorker* w) { + Mutex::Locker l(w_lock); + auto it = std::find(pending_workers.begin(), pending_workers.end(), w); + if (it != pending_workers.end()) + return; + pending_workers.push_back(w); + ++num_pending_workers; + } + RDMAStack* get_stack() { return stack; } + RDMAConnectedSocketImpl* get_conn_lockless(uint32_t qp); + QueuePair* get_qp(uint32_t qp); + void erase_qpn_lockless(uint32_t qpn); + void erase_qpn(uint32_t qpn); + Infiniband::CompletionQueue* get_tx_cq() const { return tx_cq; } + Infiniband::CompletionQueue* get_rx_cq() const { return rx_cq; } + void notify_pending_workers(); + void handle_tx_event(ibv_wc *cqe, int n); + void post_tx_buffer(std::vector<Chunk*> &chunks); + + std::atomic<uint64_t> inflight = {0}; + + void post_chunk_to_pool(Chunk* chunk); + int post_chunks_to_rq(int num, ibv_qp *qp=NULL); +}; + +class RDMAWorker : public Worker { + typedef Infiniband::CompletionQueue CompletionQueue; + typedef Infiniband::CompletionChannel CompletionChannel; + typedef Infiniband::MemoryManager::Chunk Chunk; + typedef Infiniband::MemoryManager MemoryManager; + typedef std::vector<Chunk*>::iterator ChunkIter; + RDMAStack *stack; + EventCallbackRef tx_handler; + std::list<RDMAConnectedSocketImpl*> pending_sent_conns; + RDMADispatcher* dispatcher = nullptr; + Mutex lock; + + class C_handle_cq_tx : public EventCallback { + RDMAWorker *worker; + public: + explicit C_handle_cq_tx(RDMAWorker *w): worker(w) {} + void do_request(uint64_t fd) { + worker->handle_pending_message(); + } + }; + + public: + PerfCounters *perf_logger; + explicit RDMAWorker(CephContext *c, unsigned i); + virtual ~RDMAWorker(); + virtual int listen(entity_addr_t &addr, + unsigned addr_slot, + const SocketOptions &opts, ServerSocket *) override; + virtual int connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) override; + virtual void initialize() override; + RDMAStack *get_stack() { return stack; } + int get_reged_mem(RDMAConnectedSocketImpl *o, std::vector<Chunk*> &c, size_t bytes); + void remove_pending_conn(RDMAConnectedSocketImpl *o) { + ceph_assert(center.in_thread()); + pending_sent_conns.remove(o); + } + void handle_pending_message(); + void set_stack(RDMAStack *s) { stack = s; } + void notify_worker() { + center.dispatch_event_external(tx_handler); + } +}; + +struct RDMACMInfo { + RDMACMInfo(rdma_cm_id *cid, rdma_event_channel *cm_channel_, uint32_t qp_num_) + : cm_id(cid), cm_channel(cm_channel_), qp_num(qp_num_) {} + rdma_cm_id *cm_id; + rdma_event_channel *cm_channel; + uint32_t qp_num; +}; + +class RDMAConnectedSocketImpl : public ConnectedSocketImpl { + public: + typedef Infiniband::MemoryManager::Chunk Chunk; + typedef Infiniband::CompletionChannel CompletionChannel; + typedef Infiniband::CompletionQueue CompletionQueue; + + protected: + CephContext *cct; + Infiniband::QueuePair *qp; + IBSYNMsg peer_msg; + IBSYNMsg my_msg; + int connected; + int error; + Infiniband* infiniband; + RDMADispatcher* dispatcher; + RDMAWorker* worker; + std::vector<Chunk*> buffers; + int notify_fd = -1; + bufferlist pending_bl; + + Mutex lock; + std::vector<ibv_wc> wc; + bool is_server; + EventCallbackRef read_handler; + EventCallbackRef established_handler; + int tcp_fd = -1; + bool active;// qp is active ? + bool pending; + int post_backlog = 0; + + void notify(); + ssize_t read_buffers(char* buf, size_t len); + int post_work_request(std::vector<Chunk*>&); + + public: + RDMAConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s, + RDMAWorker *w); + virtual ~RDMAConnectedSocketImpl(); + + void pass_wc(std::vector<ibv_wc> &&v); + void get_wc(std::vector<ibv_wc> &w); + virtual int is_connected() override { return connected; } + + virtual ssize_t read(char* buf, size_t len) override; + virtual ssize_t zero_copy_read(bufferptr &data) override; + virtual ssize_t send(bufferlist &bl, bool more) override; + virtual void shutdown() override; + virtual void close() override; + virtual int fd() const override { return notify_fd; } + virtual int socket_fd() const override { return tcp_fd; } + void fault(); + const char* get_qp_state() { return Infiniband::qp_state_string(qp->get_state()); } + ssize_t submit(bool more); + int activate(); + void fin(); + void handle_connection(); + int handle_connection_established(bool need_set_fault = true); + void cleanup(); + void set_accept_fd(int sd); + virtual int try_connect(const entity_addr_t&, const SocketOptions &opt); + bool is_pending() {return pending;} + void set_pending(bool val) {pending = val;} + void post_chunks_to_rq(int num); + void update_post_backlog(); +}; + +enum RDMA_CM_STATUS { + IDLE = 1, + RDMA_ID_CREATED, + CHANNEL_FD_CREATED, + RESOURCE_ALLOCATED, + ADDR_RESOLVED, + ROUTE_RESOLVED, + CONNECTED, + DISCONNECTED, + ERROR +}; + +class RDMAIWARPConnectedSocketImpl : public RDMAConnectedSocketImpl { + public: + RDMAIWARPConnectedSocketImpl(CephContext *cct, Infiniband* ib, RDMADispatcher* s, + RDMAWorker *w, RDMACMInfo *info = nullptr); + ~RDMAIWARPConnectedSocketImpl(); + virtual int try_connect(const entity_addr_t&, const SocketOptions &opt) override; + virtual void close() override; + virtual void shutdown() override; + virtual void handle_cm_connection(); + uint32_t get_local_qpn() const { return local_qpn; } + void activate(); + int alloc_resource(); + void close_notify(); + + private: + rdma_cm_id *cm_id; + rdma_event_channel *cm_channel; + uint32_t local_qpn; + uint32_t remote_qpn; + EventCallbackRef cm_con_handler; + bool is_server; + std::mutex close_mtx; + std::condition_variable close_condition; + bool closed; + RDMA_CM_STATUS status; + + + class C_handle_cm_connection : public EventCallback { + RDMAIWARPConnectedSocketImpl *csi; + public: + C_handle_cm_connection(RDMAIWARPConnectedSocketImpl *w): csi(w) {} + void do_request(uint64_t fd) { + csi->handle_cm_connection(); + } + }; +}; + +class RDMAServerSocketImpl : public ServerSocketImpl { + protected: + CephContext *cct; + NetHandler net; + int server_setup_socket; + Infiniband* infiniband; + RDMADispatcher *dispatcher; + RDMAWorker *worker; + entity_addr_t sa; + + public: + RDMAServerSocketImpl(CephContext *cct, Infiniband* i, RDMADispatcher *s, + RDMAWorker *w, entity_addr_t& a, unsigned slot); + + virtual int listen(entity_addr_t &sa, const SocketOptions &opt); + virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override; + virtual void abort_accept() override; + virtual int fd() const override { return server_setup_socket; } + int get_fd() { return server_setup_socket; } +}; + +class RDMAIWARPServerSocketImpl : public RDMAServerSocketImpl { + public: + RDMAIWARPServerSocketImpl( + CephContext *cct, Infiniband *i, RDMADispatcher *s, RDMAWorker *w, + entity_addr_t& addr, unsigned addr_slot); + virtual int listen(entity_addr_t &sa, const SocketOptions &opt) override; + virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override; + virtual void abort_accept() override; + private: + rdma_cm_id *cm_id; + rdma_event_channel *cm_channel; +}; + +class RDMAStack : public NetworkStack { + vector<std::thread> threads; + PerfCounters *perf_counter; + Infiniband ib; + RDMADispatcher dispatcher; + + std::atomic<bool> fork_finished = {false}; + + public: + explicit RDMAStack(CephContext *cct, const string &t); + virtual ~RDMAStack(); + virtual bool support_zero_copy_read() const override { return false; } + virtual bool nonblock_connect_need_writable_event() const override { return false; } + + virtual void spawn_worker(unsigned i, std::function<void ()> &&func) override; + virtual void join_worker(unsigned i) override; + RDMADispatcher &get_dispatcher() { return dispatcher; } + Infiniband &get_infiniband() { return ib; } + virtual bool is_ready() override { return fork_finished.load(); }; + virtual void ready() override { fork_finished = true; }; +}; + + +#endif |