author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-27 18:24:20 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-27 18:24:20 +0000
commit     483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree       e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/msg/async/dpdk
parent     Initial commit. (diff)
download   ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.tar.xz, ceph-483eb2f56657e8e7f419ab1a4fab8dce9ade8609.zip
Adding upstream version 14.2.21.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/msg/async/dpdk')
38 files changed, 9905 insertions, 0 deletions
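The ARP layer added below resolves L3 addresses asynchronously: arp_for<L3>::wait() answers immediately from a cached table; otherwise it queues the packet together with a resolution callback, sends an ARP query, fails all waiters with -ETIMEDOUT when the retry timer fires, and rejects new waiters with -EBUSY once max_waiters (512) is reached. A minimal standalone sketch of that waiter-queue pattern follows; the simplified types and the Resolver class are illustrative stand-ins, not the actual Ceph/Seastar API:

#include <cerrno>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Illustrative stand-ins for ethernet_address and Packet.
using l2addr = std::string;
using l3addr = uint32_t;
struct Packet { std::string payload; };
using resolution_cb = std::function<void(const l2addr&, Packet, int)>;

class Resolver {  // models arp_for<L3>'s _table / _in_progress pair
  static constexpr size_t max_waiters = 512;
  std::unordered_map<l3addr, l2addr> table_;
  std::unordered_map<l3addr, std::vector<std::pair<resolution_cb, Packet>>> in_progress_;
 public:
  void wait(l3addr addr, Packet p, resolution_cb cb) {
    if (auto i = table_.find(addr); i != table_.end()) {
      cb(i->second, std::move(p), 0);   // cache hit: complete immediately
      return;
    }
    auto& waiters = in_progress_[addr];
    if (waiters.size() >= max_waiters) {
      cb({}, std::move(p), -EBUSY);     // per-address queue full: fail fast
      return;
    }
    if (waiters.empty())
      send_query(addr);                 // first waiter triggers the query
    waiters.emplace_back(std::move(cb), std::move(p));
  }
  void learn(l3addr addr, const l2addr& hw) {  // a reply arrived: drain waiters
    table_[addr] = hw;
    if (auto i = in_progress_.find(addr); i != in_progress_.end()) {
      for (auto& [cb, p] : i->second)
        cb(hw, std::move(p), 0);
      in_progress_.erase(i);
    }
  }
  void on_timeout(l3addr addr) {  // retry timer fired: fail waiters, re-query
    if (auto i = in_progress_.find(addr); i != in_progress_.end()) {
      for (auto& [cb, p] : i->second)
        cb({}, std::move(p), -ETIMEDOUT);
      i->second.clear();
      send_query(addr);
    }
  }
 private:
  void send_query(l3addr) { /* would enqueue an ARP request here */ }
};

int main() {
  Resolver r;
  r.wait(0x0a000001, Packet{"payload"},
         [](const l2addr& hw, Packet, int err) {
           if (err)
             std::cout << "resolution failed: " << err << "\n";
           else
             std::cout << "resolved to " << hw << "\n";
         });
  r.learn(0x0a000001, "52:54:00:12:34:56");
  return 0;
}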
diff --git a/src/msg/async/dpdk/ARP.cc b/src/msg/async/dpdk/ARP.cc
new file mode 100644
index 00000000..dedc9e3c
--- /dev/null
+++ b/src/msg/async/dpdk/ARP.cc
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include "ARP.h"
+
+arp_for_protocol::arp_for_protocol(arp& a, uint16_t proto_num)
+  : _arp(a), _proto_num(proto_num)
+{
+  _arp.add(proto_num, this);
+}
+
+arp_for_protocol::~arp_for_protocol()
+{
+  _arp.del(_proto_num);
+}
+
+arp::arp(interface* netif):
+  _netif(netif),
+  _proto(netif, eth_protocol_num::arp, [this] { return get_packet(); }),
+  _rx_packets(
+    _proto.receive(
+      [this] (Packet p, ethernet_address ea) {
+        return process_packet(std::move(p), ea);
+      },
+      [this](forward_hash& out_hash_data, Packet& p, size_t off) {
+        return forward(out_hash_data, p, off);
+      }
+    )
+  )
+{}
+
+Tub<l3_protocol::l3packet> arp::get_packet()
+{
+  Tub<l3_protocol::l3packet> p;
+  if (!_packetq.empty()) {
+    p = std::move(_packetq.front());
+    _packetq.pop_front();
+  }
+  return p;
+}
+
+bool arp::forward(forward_hash& out_hash_data, Packet& p, size_t off)
+{
+  auto ah = p.get_header<arp_hdr>(off);
+  auto i = _arp_for_protocol.find(ntoh(ah->ptype));
+  if (i != _arp_for_protocol.end()) {
+    return i->second->forward(out_hash_data, p, off);
+  }
+  return false;
+}
+
+void arp::add(uint16_t proto_num, arp_for_protocol* afp)
+{
+  _arp_for_protocol[proto_num] = afp;
+}
+
+void arp::del(uint16_t proto_num)
+{
+  _arp_for_protocol.erase(proto_num);
+}
+
+int arp::process_packet(Packet p, ethernet_address from)
+{
+  auto ah = p.get_header<arp_hdr>()->ntoh();
+  auto i = _arp_for_protocol.find(ah.ptype);
+  if (i != _arp_for_protocol.end()) {
+    i->second->received(std::move(p));
+  }
+  return 0;
+}
diff --git a/src/msg/async/dpdk/ARP.h b/src/msg/async/dpdk/ARP.h
new file mode 100644
index 00000000..54569564
--- /dev/null
+++ b/src/msg/async/dpdk/ARP.h
@@ -0,0 +1,301 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + * + */ + +#ifndef CEPH_MSG_ARP_H_ +#define CEPH_MSG_ARP_H_ + +#include <errno.h> + +#include <unordered_map> +#include <functional> + +#include "msg/async/Event.h" + +#include "ethernet.h" +#include "circular_buffer.h" +#include "ip_types.h" +#include "net.h" +#include "Packet.h" + +class arp; +template <typename L3> +class arp_for; + +class arp_for_protocol { + protected: + arp& _arp; + uint16_t _proto_num; + public: + arp_for_protocol(arp& a, uint16_t proto_num); + virtual ~arp_for_protocol(); + virtual int received(Packet p) = 0; + virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) { return false; } +}; + +class interface; + +class arp { + interface* _netif; + l3_protocol _proto; + subscription<Packet, ethernet_address> _rx_packets; + std::unordered_map<uint16_t, arp_for_protocol*> _arp_for_protocol; + circular_buffer<l3_protocol::l3packet> _packetq; + private: + struct arp_hdr { + uint16_t htype; + uint16_t ptype; + arp_hdr ntoh() { + arp_hdr hdr = *this; + hdr.htype = ::ntoh(htype); + hdr.ptype = ::ntoh(ptype); + return hdr; + } + arp_hdr hton() { + arp_hdr hdr = *this; + hdr.htype = ::hton(htype); + hdr.ptype = ::hton(ptype); + return hdr; + } + }; + public: + explicit arp(interface* netif); + void add(uint16_t proto_num, arp_for_protocol* afp); + void del(uint16_t proto_num); + private: + ethernet_address l2self() { return _netif->hw_address(); } + int process_packet(Packet p, ethernet_address from); + bool forward(forward_hash& out_hash_data, Packet& p, size_t off); + Tub<l3_protocol::l3packet> get_packet(); + template <class l3_proto> + friend class arp_for; +}; + +template <typename L3> +class arp_for : public arp_for_protocol { + public: + using l2addr = ethernet_address; + using l3addr = typename L3::address_type; + private: + static constexpr auto max_waiters = 512; + enum oper { + op_request = 1, + op_reply = 2, + }; + struct arp_hdr { + uint16_t htype; + uint16_t ptype; + uint8_t hlen; + uint8_t plen; + uint16_t oper; + l2addr sender_hwaddr; + l3addr sender_paddr; + l2addr target_hwaddr; + l3addr target_paddr; + + arp_hdr ntoh() { + arp_hdr hdr = *this; + hdr.htype = ::ntoh(htype); + hdr.ptype = ::ntoh(ptype); + hdr.oper = ::ntoh(oper); + hdr.sender_hwaddr = sender_hwaddr.ntoh(); + hdr.sender_paddr = sender_paddr.ntoh(); + hdr.target_hwaddr = target_hwaddr.ntoh(); + hdr.target_paddr = target_paddr.ntoh(); + return hdr; + } + + arp_hdr hton() { + arp_hdr hdr = *this; + hdr.htype = ::hton(htype); + hdr.ptype = ::hton(ptype); + hdr.oper = ::hton(oper); + hdr.sender_hwaddr = sender_hwaddr.hton(); + hdr.sender_paddr = sender_paddr.hton(); + hdr.target_hwaddr = target_hwaddr.hton(); + hdr.target_paddr = target_paddr.hton(); + return hdr; + } + }; + struct resolution { + std::vector<std::pair<resolution_cb, Packet>> _waiters; + uint64_t timeout_fd; + }; + class C_handle_arp_timeout : public EventCallback { + arp_for *arp; + l3addr paddr; + bool first_request; + + public: + C_handle_arp_timeout(arp_for *a, l3addr addr, bool first): + arp(a), paddr(addr), first_request(first) {} + void do_request(uint64_t r) { + arp->send_query(paddr); + auto &res = arp->_in_progress[paddr]; + + for (auto& p : res._waiters) { + p.first(ethernet_address(), std::move(p.second), -ETIMEDOUT); + } + res._waiters.clear(); + res.timeout_fd = arp->center->create_time_event( + 1*1000*1000, this); + } + }; + friend class C_handle_arp_timeout; + + private: + CephContext *cct; + EventCenter *center; + l3addr _l3self = 
L3::broadcast_address(); + std::unordered_map<l3addr, l2addr> _table; + std::unordered_map<l3addr, resolution> _in_progress; + private: + Packet make_query_packet(l3addr paddr); + virtual int received(Packet p) override; + int handle_request(arp_hdr* ah); + l2addr l2self() { return _arp.l2self(); } + void send(l2addr to, Packet &&p); + public: + void send_query(const l3addr& paddr); + explicit arp_for(CephContext *c, arp& a, EventCenter *cen) + : arp_for_protocol(a, L3::arp_protocol_type()), cct(c), center(cen) { + _table[L3::broadcast_address()] = ethernet::broadcast_address(); + } + ~arp_for() { + for (auto && p : _in_progress) + center->delete_time_event(p.second.timeout_fd); + } + void wait(const l3addr& addr, Packet p, resolution_cb cb); + void learn(l2addr l2, l3addr l3); + void run(); + void set_self_addr(l3addr addr) { + _table.erase(_l3self); + _table[addr] = l2self(); + _l3self = addr; + } + friend class arp; +}; + +template <typename L3> +void arp_for<L3>::send(l2addr to, Packet &&p) { + _arp._packetq.push_back(l3_protocol::l3packet{eth_protocol_num::arp, to, std::move(p)}); +} + +template <typename L3> +Packet arp_for<L3>::make_query_packet(l3addr paddr) { + arp_hdr hdr; + hdr.htype = ethernet::arp_hardware_type(); + hdr.ptype = L3::arp_protocol_type(); + hdr.hlen = sizeof(l2addr); + hdr.plen = sizeof(l3addr); + hdr.oper = op_request; + hdr.sender_hwaddr = l2self(); + hdr.sender_paddr = _l3self; + hdr.target_hwaddr = ethernet::broadcast_address(); + hdr.target_paddr = paddr; + hdr = hdr.hton(); + return Packet(reinterpret_cast<char*>(&hdr), sizeof(hdr)); +} + +template <typename L3> +void arp_for<L3>::send_query(const l3addr& paddr) { + send(ethernet::broadcast_address(), make_query_packet(paddr)); +} + +template <typename L3> +void arp_for<L3>::learn(l2addr hwaddr, l3addr paddr) { + _table[paddr] = hwaddr; + auto i = _in_progress.find(paddr); + if (i != _in_progress.end()) { + auto& res = i->second; + center->delete_time_event(res.timeout_fd); + for (auto &&p : res._waiters) { + p.first(hwaddr, std::move(p.second), 0); + } + _in_progress.erase(i); + } +} + +template <typename L3> +void arp_for<L3>::wait(const l3addr& paddr, Packet p, resolution_cb cb) { + auto i = _table.find(paddr); + if (i != _table.end()) { + cb(i->second, std::move(p), 0); + return ; + } + + auto j = _in_progress.find(paddr); + auto first_request = j == _in_progress.end(); + auto& res = first_request ? 
_in_progress[paddr] : j->second; + + if (first_request) { + res.timeout_fd = center->create_time_event( + 1*1000*1000, new C_handle_arp_timeout(this, paddr, first_request)); + send_query(paddr); + } + + if (res._waiters.size() >= max_waiters) { + cb(ethernet_address(), std::move(p), -EBUSY); + return ; + } + + res._waiters.emplace_back(cb, std::move(p)); + return ; +} + +template <typename L3> +int arp_for<L3>::received(Packet p) { + auto ah = p.get_header<arp_hdr>(); + if (!ah) { + return 0; + } + auto h = ah->ntoh(); + if (h.hlen != sizeof(l2addr) || h.plen != sizeof(l3addr)) { + return 0; + } + switch (h.oper) { + case op_request: + return handle_request(&h); + case op_reply: + _arp._netif->arp_learn(h.sender_hwaddr, h.sender_paddr); + return 0; + default: + return 0; + } +} + +template <typename L3> +int arp_for<L3>::handle_request(arp_hdr* ah) { + if (ah->target_paddr == _l3self + && _l3self != L3::broadcast_address()) { + ah->oper = op_reply; + ah->target_hwaddr = ah->sender_hwaddr; + ah->target_paddr = ah->sender_paddr; + ah->sender_hwaddr = l2self(); + ah->sender_paddr = _l3self; + *ah = ah->hton(); + send(ah->target_hwaddr, Packet(reinterpret_cast<char*>(ah), sizeof(*ah))); + } + return 0; +} + +#endif /* CEPH_MSG_ARP_H_ */ diff --git a/src/msg/async/dpdk/DPDK.cc b/src/msg/async/dpdk/DPDK.cc new file mode 100644 index 00000000..278efe9e --- /dev/null +++ b/src/msg/async/dpdk/DPDK.cc @@ -0,0 +1,1267 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#include <atomic> +#include <vector> +#include <queue> + +#include <rte_config.h> +#include <rte_common.h> +#include <rte_eal.h> +#include <rte_pci.h> +#include <rte_ethdev.h> +#include <rte_cycles.h> +#include <rte_memzone.h> + +#include "include/page.h" +#include "align.h" +#include "IP.h" +#include "const.h" +#include "dpdk_rte.h" +#include "DPDK.h" +#include "toeplitz.h" + +#include "common/Cycles.h" +#include "common/dout.h" +#include "common/errno.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_dpdk +#undef dout_prefix +#define dout_prefix *_dout << "dpdk " + + +void* as_cookie(struct rte_pktmbuf_pool_private& p) { + return &p; +}; + +#ifndef MARKER +typedef void *MARKER[0]; /**< generic marker for a point in a structure */ +#endif + +/******************* Net device related constatns *****************************/ +static constexpr uint16_t default_ring_size = 512; + +// +// We need 2 times the ring size of buffers because of the way PMDs +// refill the ring. +// +static constexpr uint16_t mbufs_per_queue_rx = 2 * default_ring_size; +static constexpr uint16_t rx_gc_thresh = 64; + +// +// No need to keep more descriptors in the air than can be sent in a single +// rte_eth_tx_burst() call. 
+// +static constexpr uint16_t mbufs_per_queue_tx = 2 * default_ring_size; + +static constexpr uint16_t mbuf_cache_size = 512; +// +// Size of the data buffer in the non-inline case. +// +// We may want to change (increase) this value in future, while the +// inline_mbuf_data_size value will unlikely change due to reasons described +// above. +// +static constexpr size_t mbuf_data_size = 4096; + +static constexpr uint16_t mbuf_overhead = + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM; +// +// We'll allocate 2K data buffers for an inline case because this would require +// a single page per mbuf. If we used 4K data buffers here it would require 2 +// pages for a single buffer (due to "mbuf_overhead") and this is a much more +// demanding memory constraint. +// +static constexpr size_t inline_mbuf_data_size = 2048; + + +// (INLINE_MBUF_DATA_SIZE(2K)*32 = 64K = Max TSO/LRO size) + 1 mbuf for headers +static constexpr uint8_t max_frags = 32 + 1; + +// +// Intel's 40G NIC HW limit for a number of fragments in an xmit segment. +// +// See Chapter 8.4.1 "Transmit Packet in System Memory" of the xl710 devices +// spec. for more details. +// +static constexpr uint8_t i40e_max_xmit_segment_frags = 8; + +// +// VMWare's virtual NIC limit for a number of fragments in an xmit segment. +// +// see drivers/net/vmxnet3/base/vmxnet3_defs.h VMXNET3_MAX_TXD_PER_PKT +// +static constexpr uint8_t vmxnet3_max_xmit_segment_frags = 16; + +static constexpr uint16_t inline_mbuf_size = inline_mbuf_data_size + mbuf_overhead; + +static size_t huge_page_size = 512 * CEPH_PAGE_SIZE; + +uint32_t qp_mempool_obj_size() +{ + uint32_t mp_size = 0; + struct rte_mempool_objsz mp_obj_sz = {}; + + // + // We will align each size to huge page size because DPDK allocates + // physically contiguous memory region for each pool object. + // + + // Rx + mp_size += align_up(rte_mempool_calc_obj_size(mbuf_overhead, 0, &mp_obj_sz)+ + sizeof(struct rte_pktmbuf_pool_private), + huge_page_size); + + //Tx + std::memset(&mp_obj_sz, 0, sizeof(mp_obj_sz)); + mp_size += align_up(rte_mempool_calc_obj_size(inline_mbuf_size, 0, + &mp_obj_sz)+ + sizeof(struct rte_pktmbuf_pool_private), + huge_page_size); + return mp_size; +} + +static constexpr const char* pktmbuf_pool_name = "dpdk_net_pktmbuf_pool"; + +/* + * When doing reads from the NIC queues, use this batch size + */ +static constexpr uint8_t packet_read_size = 32; +/******************************************************************************/ + +int DPDKDevice::init_port_start() +{ + ceph_assert(_port_idx < rte_eth_dev_count()); + + rte_eth_dev_info_get(_port_idx, &_dev_info); + + // + // This is a workaround for a missing handling of a HW limitation in the + // DPDK i40e driver. This and all related to _is_i40e_device code should be + // removed once this handling is added. + // + if (std::string("rte_i40evf_pmd") == _dev_info.driver_name || + std::string("rte_i40e_pmd") == _dev_info.driver_name) { + ldout(cct, 1) << __func__ << " Device is an Intel's 40G NIC. Enabling 8 fragments hack!" << dendl; + _is_i40e_device = true; + } + + if (std::string("rte_vmxnet3_pmd") == _dev_info.driver_name) { + ldout(cct, 1) << __func__ << " Device is a VMWare Virtual NIC. Enabling 16 fragments hack!" << dendl; + _is_vmxnet3_device = true; + } + + // + // Another workaround: this time for a lack of number of RSS bits. + // ixgbe PF NICs support up to 16 RSS queues. + // ixgbe VF NICs support up to 4 RSS queues. + // i40e PF NICs support up to 64 RSS queues. + // i40e VF NICs support up to 16 RSS queues. 
+ // + if (std::string("rte_ixgbe_pmd") == _dev_info.driver_name) { + _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)16); + } else if (std::string("rte_ixgbevf_pmd") == _dev_info.driver_name) { + _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)4); + } else if (std::string("rte_i40e_pmd") == _dev_info.driver_name) { + _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)64); + } else if (std::string("rte_i40evf_pmd") == _dev_info.driver_name) { + _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)16); + } + + // Clear txq_flags - we want to support all available offload features + // except for multi-mempool and refcnt'ing which we don't need + _dev_info.default_txconf.txq_flags = + ETH_TXQ_FLAGS_NOMULTMEMP | ETH_TXQ_FLAGS_NOREFCOUNT; + + // + // Disable features that are not supported by port's HW + // + if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_UDP_CKSUM)) { + _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMUDP; + } + + if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)) { + _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMTCP; + } + + if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM)) { + _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOXSUMSCTP; + } + + if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) { + _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL; + } + + if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_VLAN_INSERT)) { + _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOVLANOFFL; + } + + if (!(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO)) { + _dev_info.default_txconf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS; + } + + /* for port configuration all features are off by default */ + rte_eth_conf port_conf = { 0 }; + + ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": max_rx_queues " + << _dev_info.max_rx_queues << " max_tx_queues " + << _dev_info.max_tx_queues << dendl; + + _num_queues = std::min({_num_queues, _dev_info.max_rx_queues, _dev_info.max_tx_queues}); + + ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": using " + << _num_queues << " queues" << dendl;; + + // Set RSS mode: enable RSS if seastar is configured with more than 1 CPU. + // Even if port has a single queue we still want the RSS feature to be + // available in order to make HW calculate RSS hash for us. + if (_num_queues > 1) { + if (_dev_info.hash_key_size == 40) { + _rss_key = default_rsskey_40bytes; + } else if (_dev_info.hash_key_size == 52) { + _rss_key = default_rsskey_52bytes; + } else if (_dev_info.hash_key_size != 0) { + // WTF?!! 
+ rte_exit(EXIT_FAILURE, + "Port %d: We support only 40 or 52 bytes RSS hash keys, %d bytes key requested", + _port_idx, _dev_info.hash_key_size); + } else { + _rss_key = default_rsskey_40bytes; + _dev_info.hash_key_size = 40; + } + + port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS; + port_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK; + if (_dev_info.hash_key_size) { + port_conf.rx_adv_conf.rss_conf.rss_key = const_cast<uint8_t *>(_rss_key.data()); + port_conf.rx_adv_conf.rss_conf.rss_key_len = _dev_info.hash_key_size; + } + } else { + port_conf.rxmode.mq_mode = ETH_MQ_RX_NONE; + } + + if (_num_queues > 1) { + if (_dev_info.reta_size) { + // RETA size should be a power of 2 + ceph_assert((_dev_info.reta_size & (_dev_info.reta_size - 1)) == 0); + + // Set the RSS table to the correct size + _redir_table.resize(_dev_info.reta_size); + _rss_table_bits = std::lround(std::log2(_dev_info.reta_size)); + ldout(cct, 5) << __func__ << " Port " << int(_port_idx) + << ": RSS table size is " << _dev_info.reta_size << dendl; + } else { + // FIXME: same with sw_reta + _redir_table.resize(128); + _rss_table_bits = std::lround(std::log2(128)); + } + } else { + _redir_table.push_back(0); + } + + // Set Rx VLAN stripping + if (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) { + port_conf.rxmode.hw_vlan_strip = 1; + } + + // Enable HW CRC stripping + port_conf.rxmode.hw_strip_crc = 1; + +#ifdef RTE_ETHDEV_HAS_LRO_SUPPORT + // Enable LRO + if (_use_lro && (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO)) { + ldout(cct, 1) << __func__ << " LRO is on" << dendl; + port_conf.rxmode.enable_lro = 1; + _hw_features.rx_lro = true; + } else +#endif + ldout(cct, 1) << __func__ << " LRO is off" << dendl; + + // Check that all CSUM features are either all set all together or not set + // all together. If this assumption breaks we need to rework the below logic + // by splitting the csum offload feature bit into separate bits for IPv4, + // TCP. + ceph_assert(((_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) && + (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) || + (!(_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) && + !(_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM))); + + // Set Rx checksum checking + if ((_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) && + (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) { + ldout(cct, 1) << __func__ << " RX checksum offload supported" << dendl; + port_conf.rxmode.hw_ip_checksum = 1; + _hw_features.rx_csum_offload = 1; + } + + if ((_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) { + ldout(cct, 1) << __func__ << " TX ip checksum offload supported" << dendl; + _hw_features.tx_csum_ip_offload = 1; + } + + // TSO is supported starting from DPDK v1.8 + if (_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) { + ldout(cct, 1) << __func__ << " TSO is supported" << dendl; + _hw_features.tx_tso = 1; + } + + // Check that Tx TCP CSUM features are either all set all together + // or not set all together. If this assumption breaks we need to rework the + // below logic by splitting the csum offload feature bit into separate bits + // for TCP. 
+ ceph_assert((_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) || + !(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)); + + if (_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) { + ldout(cct, 1) << __func__ << " TX TCP checksum offload supported" << dendl; + _hw_features.tx_csum_l4_offload = 1; + } + + int retval; + + ldout(cct, 1) << __func__ << " Port " << int(_port_idx) << " init ... " << dendl; + + /* + * Standard DPDK port initialisation - config port, then set up + * rx and tx rings. + */ + if ((retval = rte_eth_dev_configure(_port_idx, _num_queues, _num_queues, + &port_conf)) != 0) { + lderr(cct) << __func__ << " failed to configure port " << (int)_port_idx + << " rx/tx queues " << _num_queues << " error " << cpp_strerror(retval) << dendl; + return retval; + } + + //rte_eth_promiscuous_enable(port_num); + ldout(cct, 1) << __func__ << " done." << dendl; + + return 0; +} + +void DPDKDevice::set_hw_flow_control() +{ + // Read the port's current/default flow control settings + struct rte_eth_fc_conf fc_conf; + auto ret = rte_eth_dev_flow_ctrl_get(_port_idx, &fc_conf); + + if (ret == -ENOTSUP) { + ldout(cct, 1) << __func__ << " port " << int(_port_idx) + << ": not support to get hardware flow control settings: " << ret << dendl; + goto not_supported; + } + + if (ret < 0) { + lderr(cct) << __func__ << " port " << int(_port_idx) + << ": failed to get hardware flow control settings: " << ret << dendl; + ceph_abort(); + } + + if (_enable_fc) { + fc_conf.mode = RTE_FC_FULL; + } else { + fc_conf.mode = RTE_FC_NONE; + } + + ret = rte_eth_dev_flow_ctrl_set(_port_idx, &fc_conf); + if (ret == -ENOTSUP) { + ldout(cct, 1) << __func__ << " port " << int(_port_idx) + << ": not support to set hardware flow control settings: " << ret << dendl; + goto not_supported; + } + + if (ret < 0) { + lderr(cct) << __func__ << " port " << int(_port_idx) + << ": failed to set hardware flow control settings: " << ret << dendl; + ceph_abort(); + } + + ldout(cct, 1) << __func__ << " port " << int(_port_idx) << ": HW FC " << _enable_fc << dendl; + return; + +not_supported: + ldout(cct, 1) << __func__ << " port " << int(_port_idx) << ": changing HW FC settings is not supported" << dendl; +} + +int DPDKDevice::init_port_fini() +{ + // Changing FC requires HW reset, so set it before the port is initialized. 
+ set_hw_flow_control(); + + if (rte_eth_dev_start(_port_idx) != 0) { + lderr(cct) << __func__ << " can't start port " << _port_idx << dendl; + return -1; + } + + if (_num_queues > 1) { + if (!rte_eth_dev_filter_supported(_port_idx, RTE_ETH_FILTER_HASH)) { + ldout(cct, 5) << __func__ << " Port " << _port_idx << ": HASH FILTER configuration is supported" << dendl; + + // Setup HW touse the TOEPLITZ hash function as an RSS hash function + struct rte_eth_hash_filter_info info = {}; + + info.info_type = RTE_ETH_HASH_FILTER_GLOBAL_CONFIG; + info.info.global_conf.hash_func = RTE_ETH_HASH_FUNCTION_TOEPLITZ; + + if (rte_eth_dev_filter_ctrl(_port_idx, RTE_ETH_FILTER_HASH, + RTE_ETH_FILTER_SET, &info) < 0) { + lderr(cct) << __func__ << " cannot set hash function on a port " << _port_idx << dendl; + return -1; + } + } + + set_rss_table(); + } + + // Wait for a link + if (check_port_link_status() < 0) { + lderr(cct) << __func__ << " port link up failed " << _port_idx << dendl; + return -1; + } + + ldout(cct, 5) << __func__ << " created DPDK device" << dendl; + return 0; +} + +void DPDKQueuePair::configure_proxies(const std::map<unsigned, float>& cpu_weights) { + ceph_assert(!cpu_weights.empty()); + if (cpu_weights.size() == 1 && cpu_weights.begin()->first == _qid) { + // special case queue sending to self only, to avoid requiring a hash value + return; + } + register_packet_provider([this] { + Tub<Packet> p; + if (!_proxy_packetq.empty()) { + p = std::move(_proxy_packetq.front()); + _proxy_packetq.pop_front(); + } + return p; + }); + build_sw_reta(cpu_weights); +} + +void DPDKQueuePair::build_sw_reta(const std::map<unsigned, float>& cpu_weights) { + float total_weight = 0; + for (auto&& x : cpu_weights) { + total_weight += x.second; + } + float accum = 0; + unsigned idx = 0; + std::array<uint8_t, 128> reta; + for (auto&& entry : cpu_weights) { + auto cpu = entry.first; + auto weight = entry.second; + accum += weight; + while (idx < (accum / total_weight * reta.size() - 0.5)) { + reta[idx++] = cpu; + } + } + _sw_reta = reta; +} + + +bool DPDKQueuePair::init_rx_mbuf_pool() +{ + std::string name = std::string(pktmbuf_pool_name) + std::to_string(_qid) + "_rx"; + + // reserve the memory for Rx buffers containers + _rx_free_pkts.reserve(mbufs_per_queue_rx); + _rx_free_bufs.reserve(mbufs_per_queue_rx); + + _pktmbuf_pool_rx = rte_mempool_lookup(name.c_str()); + if (!_pktmbuf_pool_rx) { + ldout(cct, 1) << __func__ << " Creating Rx mbuf pool '" << name.c_str() + << "' [" << mbufs_per_queue_rx << " mbufs] ..."<< dendl; + + // + // Don't pass single-producer/single-consumer flags to mbuf create as it + // seems faster to use a cache instead. 
+ // + struct rte_pktmbuf_pool_private roomsz = {}; + roomsz.mbuf_data_room_size = mbuf_data_size + RTE_PKTMBUF_HEADROOM; + _pktmbuf_pool_rx = rte_mempool_create( + name.c_str(), + mbufs_per_queue_rx, mbuf_overhead + mbuf_data_size, + mbuf_cache_size, + sizeof(struct rte_pktmbuf_pool_private), + rte_pktmbuf_pool_init, as_cookie(roomsz), + rte_pktmbuf_init, nullptr, + rte_socket_id(), 0); + if (!_pktmbuf_pool_rx) { + lderr(cct) << __func__ << " Failed to create mempool for rx" << dendl; + return false; + } + + // + // allocate more data buffer + int bufs_count = cct->_conf->ms_dpdk_rx_buffer_count_per_core - mbufs_per_queue_rx; + int mz_flags = RTE_MEMZONE_1GB|RTE_MEMZONE_SIZE_HINT_ONLY; + std::string mz_name = "rx_buffer_data" + std::to_string(_qid); + const struct rte_memzone *mz = rte_memzone_reserve_aligned(mz_name.c_str(), + mbuf_data_size*bufs_count, _pktmbuf_pool_rx->socket_id, mz_flags, mbuf_data_size); + ceph_assert(mz); + void* m = mz->addr; + for (int i = 0; i < bufs_count; i++) { + ceph_assert(m); + _alloc_bufs.push_back(m); + m += mbuf_data_size; + } + + if (rte_eth_rx_queue_setup(_dev_port_idx, _qid, default_ring_size, + rte_eth_dev_socket_id(_dev_port_idx), + _dev->def_rx_conf(), _pktmbuf_pool_rx) < 0) { + lderr(cct) << __func__ << " cannot initialize rx queue" << dendl; + return false; + } + } + + return _pktmbuf_pool_rx != nullptr; +} + +int DPDKDevice::check_port_link_status() +{ + int count = 0; + + ldout(cct, 20) << __func__ << dendl; + const int sleep_time = 100 * 1000; + const int max_check_time = 90; /* 9s (90 * 100ms) in total */ + while (true) { + struct rte_eth_link link; + memset(&link, 0, sizeof(link)); + rte_eth_link_get_nowait(_port_idx, &link); + + if (true) { + if (link.link_status) { + ldout(cct, 5) << __func__ << " done port " + << static_cast<unsigned>(_port_idx) + << " link Up - speed " << link.link_speed + << " Mbps - " + << ((link.link_duplex == ETH_LINK_FULL_DUPLEX) ? ("full-duplex") : ("half-duplex\n")) + << dendl; + break; + } else if (count++ < max_check_time) { + ldout(cct, 20) << __func__ << " not ready, continue to wait." << dendl; + usleep(sleep_time); + } else { + lderr(cct) << __func__ << " done port " << _port_idx << " link down" << dendl; + return -1; + } + } + } + return 0; +} + +class C_handle_dev_stats : public EventCallback { + DPDKQueuePair *_qp; + public: + C_handle_dev_stats(DPDKQueuePair *qp): _qp(qp) { } + void do_request(uint64_t id) { + _qp->handle_stats(); + } +}; + +DPDKQueuePair::DPDKQueuePair(CephContext *c, EventCenter *cen, DPDKDevice* dev, uint8_t qid) + : cct(c), _dev(dev), _dev_port_idx(dev->port_idx()), center(cen), _qid(qid), + _tx_poller(this), _rx_gc_poller(this), _tx_buf_factory(c, dev, qid), + _tx_gc_poller(this) +{ + if (!init_rx_mbuf_pool()) { + lderr(cct) << __func__ << " cannot initialize mbuf pools" << dendl; + ceph_abort(); + } + + static_assert(offsetof(tx_buf, private_end) - + offsetof(tx_buf, private_start) <= RTE_PKTMBUF_HEADROOM, + "RTE_PKTMBUF_HEADROOM is less than DPDKQueuePair::tx_buf size! 
" + "Increase the headroom size in the DPDK configuration"); + static_assert(offsetof(tx_buf, _mbuf) == 0, + "There is a pad at the beginning of the tx_buf before _mbuf " + "field!"); + static_assert((inline_mbuf_data_size & (inline_mbuf_data_size - 1)) == 0, + "inline_mbuf_data_size has to be a power of two!"); + + std::string name(std::string("queue") + std::to_string(qid)); + PerfCountersBuilder plb(cct, name, l_dpdk_qp_first, l_dpdk_qp_last); + + plb.add_u64_counter(l_dpdk_qp_rx_packets, "dpdk_receive_packets", "DPDK received packets"); + plb.add_u64_counter(l_dpdk_qp_tx_packets, "dpdk_send_packets", "DPDK sendd packets"); + plb.add_u64_counter(l_dpdk_qp_rx_bad_checksum_errors, "dpdk_receive_bad_checksum_errors", "DPDK received bad checksum packets"); + plb.add_u64_counter(l_dpdk_qp_rx_no_memory_errors, "dpdk_receive_no_memory_errors", "DPDK received no memory packets"); + plb.add_u64_counter(l_dpdk_qp_rx_bytes, "dpdk_receive_bytes", "DPDK received bytes", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_u64_counter(l_dpdk_qp_tx_bytes, "dpdk_send_bytes", "DPDK sendd bytes", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_u64_counter(l_dpdk_qp_rx_last_bunch, "dpdk_receive_last_bunch", "DPDK last received bunch"); + plb.add_u64_counter(l_dpdk_qp_tx_last_bunch, "dpdk_send_last_bunch", "DPDK last send bunch"); + plb.add_u64_counter(l_dpdk_qp_rx_fragments, "dpdk_receive_fragments", "DPDK received total fragments"); + plb.add_u64_counter(l_dpdk_qp_tx_fragments, "dpdk_send_fragments", "DPDK sendd total fragments"); + plb.add_u64_counter(l_dpdk_qp_rx_copy_ops, "dpdk_receive_copy_ops", "DPDK received copy operations"); + plb.add_u64_counter(l_dpdk_qp_tx_copy_ops, "dpdk_send_copy_ops", "DPDK sendd copy operations"); + plb.add_u64_counter(l_dpdk_qp_rx_copy_bytes, "dpdk_receive_copy_bytes", "DPDK received copy bytes", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_u64_counter(l_dpdk_qp_tx_copy_bytes, "dpdk_send_copy_bytes", "DPDK send copy bytes", NULL, 0, unit_t(UNIT_BYTES)); + plb.add_u64_counter(l_dpdk_qp_rx_linearize_ops, "dpdk_receive_linearize_ops", "DPDK received linearize operations"); + plb.add_u64_counter(l_dpdk_qp_tx_linearize_ops, "dpdk_send_linearize_ops", "DPDK send linearize operations"); + plb.add_u64_counter(l_dpdk_qp_tx_queue_length, "dpdk_send_queue_length", "DPDK send queue length"); + + perf_logger = plb.create_perf_counters(); + cct->get_perfcounters_collection()->add(perf_logger); + + if (!_qid) + device_stat_time_fd = center->create_time_event(1000*1000, new C_handle_dev_stats(this)); +} + +void DPDKQueuePair::handle_stats() +{ + ldout(cct, 20) << __func__ << " started." 
<< dendl; + rte_eth_stats rte_stats = {}; + int rc = rte_eth_stats_get(_dev_port_idx, &rte_stats); + + if (rc) { + ldout(cct, 0) << __func__ << " failed to get port statistics: " << cpp_strerror(rc) << dendl; + return ; + } + +#if RTE_VERSION < RTE_VERSION_NUM(16,7,0,0) + _dev->perf_logger->set(l_dpdk_dev_rx_mcast, rte_stats.imcasts); + _dev->perf_logger->set(l_dpdk_dev_rx_badcrc_errors, rte_stats.ibadcrc); +#endif + _dev->perf_logger->set(l_dpdk_dev_rx_dropped_errors, rte_stats.imissed); + _dev->perf_logger->set(l_dpdk_dev_rx_nombuf_errors, rte_stats.rx_nombuf); + + _dev->perf_logger->set(l_dpdk_dev_rx_total_errors, rte_stats.ierrors); + _dev->perf_logger->set(l_dpdk_dev_tx_total_errors, rte_stats.oerrors); + device_stat_time_fd = center->create_time_event(1000*1000, new C_handle_dev_stats(this)); +} + +bool DPDKQueuePair::poll_tx() { + bool nonloopback = !cct->_conf->ms_dpdk_debug_allow_loopback; +#ifdef CEPH_PERF_DEV + uint64_t start = Cycles::rdtsc(); +#endif + uint32_t total_work = 0; + if (_tx_packetq.size() < 16) { + // refill send queue from upper layers + uint32_t work; + do { + work = 0; + for (auto&& pr : _pkt_providers) { + auto p = pr(); + if (p) { + work++; + if (likely(nonloopback)) { + // ldout(cct, 0) << __func__ << " len: " << p->len() << " frags: " << p->nr_frags() << dendl; + _tx_packetq.push_back(std::move(*p)); + } else { + auto th = p->get_header<eth_hdr>(0); + if (th->dst_mac == th->src_mac) { + _dev->l2receive(_qid, std::move(*p)); + } else { + _tx_packetq.push_back(std::move(*p)); + } + } + if (_tx_packetq.size() == 128) { + break; + } + } + } + total_work += work; + } while (work && total_work < 256 && _tx_packetq.size() < 128); + } + if (!_tx_packetq.empty()) { + uint64_t c = send(_tx_packetq); + perf_logger->inc(l_dpdk_qp_tx_packets, c); + perf_logger->set(l_dpdk_qp_tx_last_bunch, c); +#ifdef CEPH_PERF_DEV + tx_count += total_work; + tx_cycles += Cycles::rdtsc() - start; +#endif + return true; + } + + return false; +} + +inline Tub<Packet> DPDKQueuePair::from_mbuf_lro(rte_mbuf* m) +{ + _frags.clear(); + _bufs.clear(); + + for (; m != nullptr; m = m->next) { + char* data = rte_pktmbuf_mtod(m, char*); + + _frags.emplace_back(fragment{data, rte_pktmbuf_data_len(m)}); + _bufs.push_back(data); + } + + auto del = std::bind( + [this](std::vector<char*> &bufs) { + for (auto&& b : bufs) { _alloc_bufs.push_back(b); } + }, std::move(_bufs)); + return Packet( + _frags.begin(), _frags.end(), make_deleter(std::move(del))); +} + +inline Tub<Packet> DPDKQueuePair::from_mbuf(rte_mbuf* m) +{ + _rx_free_pkts.push_back(m); + _num_rx_free_segs += m->nb_segs; + + if (!_dev->hw_features_ref().rx_lro || rte_pktmbuf_is_contiguous(m)) { + char* data = rte_pktmbuf_mtod(m, char*); + + return Packet(fragment{data, rte_pktmbuf_data_len(m)}, + make_deleter([this, data] { _alloc_bufs.push_back(data); })); + } else { + return from_mbuf_lro(m); + } +} + +inline bool DPDKQueuePair::refill_one_cluster(rte_mbuf* head) +{ + for (; head != nullptr; head = head->next) { + if (!refill_rx_mbuf(head, mbuf_data_size, _alloc_bufs)) { + // + // If we failed to allocate a new buffer - push the rest of the + // cluster back to the free_packets list for a later retry. 
+ // + _rx_free_pkts.push_back(head); + return false; + } + _rx_free_bufs.push_back(head); + } + + return true; +} + +bool DPDKQueuePair::rx_gc(bool force) +{ + if (_num_rx_free_segs >= rx_gc_thresh || force) { + ldout(cct, 10) << __func__ << " free segs " << _num_rx_free_segs + << " thresh " << rx_gc_thresh + << " free pkts " << _rx_free_pkts.size() + << dendl; + + while (!_rx_free_pkts.empty()) { + // + // Use back() + pop_back() semantics to avoid an extra + // _rx_free_pkts.clear() at the end of the function - clear() has a + // linear complexity. + // + auto m = _rx_free_pkts.back(); + _rx_free_pkts.pop_back(); + + if (!refill_one_cluster(m)) { + ldout(cct, 1) << __func__ << " get new mbuf failed " << dendl; + break; + } + } + for (auto&& m : _rx_free_bufs) { + rte_pktmbuf_prefree_seg(m); + } + + if (_rx_free_bufs.size()) { + rte_mempool_put_bulk(_pktmbuf_pool_rx, + (void **)_rx_free_bufs.data(), + _rx_free_bufs.size()); + + // TODO: ceph_assert() in a fast path! Remove me ASAP! + ceph_assert(_num_rx_free_segs >= _rx_free_bufs.size()); + + _num_rx_free_segs -= _rx_free_bufs.size(); + _rx_free_bufs.clear(); + + // TODO: ceph_assert() in a fast path! Remove me ASAP! + ceph_assert((_rx_free_pkts.empty() && !_num_rx_free_segs) || + (!_rx_free_pkts.empty() && _num_rx_free_segs)); + } + } + + return _num_rx_free_segs >= rx_gc_thresh; +} + + +void DPDKQueuePair::process_packets( + struct rte_mbuf **bufs, uint16_t count) +{ + uint64_t nr_frags = 0, bytes = 0; + + for (uint16_t i = 0; i < count; i++) { + struct rte_mbuf *m = bufs[i]; + offload_info oi; + + Tub<Packet> p = from_mbuf(m); + + // Drop the packet if translation above has failed + if (!p) { + perf_logger->inc(l_dpdk_qp_rx_no_memory_errors); + continue; + } + // ldout(cct, 0) << __func__ << " len " << p->len() << " " << dendl; + + nr_frags += m->nb_segs; + bytes += m->pkt_len; + + // Set stipped VLAN value if available + if ((_dev->_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) && + (m->ol_flags & PKT_RX_VLAN_STRIPPED)) { + oi.vlan_tci = m->vlan_tci; + } + + if (_dev->get_hw_features().rx_csum_offload) { + if (m->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) { + // Packet with bad checksum, just drop it. + perf_logger->inc(l_dpdk_qp_rx_bad_checksum_errors); + continue; + } + // Note that when _hw_features.rx_csum_offload is on, the receive + // code for ip, tcp and udp will assume they don't need to check + // the checksum again, because we did this here. 
+ } + + p->set_offload_info(oi); + if (m->ol_flags & PKT_RX_RSS_HASH) { + p->set_rss_hash(m->hash.rss); + } + + _dev->l2receive(_qid, std::move(*p)); + } + + perf_logger->inc(l_dpdk_qp_rx_packets, count); + perf_logger->set(l_dpdk_qp_rx_last_bunch, count); + perf_logger->inc(l_dpdk_qp_rx_fragments, nr_frags); + perf_logger->inc(l_dpdk_qp_rx_bytes, bytes); +} + +bool DPDKQueuePair::poll_rx_once() +{ + struct rte_mbuf *buf[packet_read_size]; + + /* read a port */ +#ifdef CEPH_PERF_DEV + uint64_t start = Cycles::rdtsc(); +#endif + uint16_t count = rte_eth_rx_burst(_dev_port_idx, _qid, + buf, packet_read_size); + + /* Now process the NIC packets read */ + if (likely(count > 0)) { + process_packets(buf, count); +#ifdef CEPH_PERF_DEV + rx_cycles = Cycles::rdtsc() - start; + rx_count += count; +#endif + } +#ifdef CEPH_PERF_DEV + else { + if (rx_count > 10000 && tx_count) { + ldout(cct, 0) << __func__ << " rx count=" << rx_count << " avg rx=" << Cycles::to_nanoseconds(rx_cycles)/rx_count << "ns " + << " tx count=" << tx_count << " avg tx=" << Cycles::to_nanoseconds(tx_cycles)/tx_count << "ns" + << dendl; + rx_count = rx_cycles = tx_count = tx_cycles = 0; + } + } +#endif + + return count; +} + +DPDKQueuePair::tx_buf_factory::tx_buf_factory(CephContext *c, + DPDKDevice *dev, uint8_t qid): cct(c) +{ + std::string name = std::string(pktmbuf_pool_name) + std::to_string(qid) + "_tx"; + + _pool = rte_mempool_lookup(name.c_str()); + if (!_pool) { + ldout(cct, 0) << __func__ << " Creating Tx mbuf pool '" << name.c_str() + << "' [" << mbufs_per_queue_tx << " mbufs] ..." << dendl; + // + // We are going to push the buffers from the mempool into + // the circular_buffer and then poll them from there anyway, so + // we prefer to make a mempool non-atomic in this case. + // + _pool = rte_mempool_create(name.c_str(), + mbufs_per_queue_tx, inline_mbuf_size, + mbuf_cache_size, + sizeof(struct rte_pktmbuf_pool_private), + rte_pktmbuf_pool_init, nullptr, + rte_pktmbuf_init, nullptr, + rte_socket_id(), 0); + + if (!_pool) { + lderr(cct) << __func__ << " Failed to create mempool for Tx" << dendl; + ceph_abort(); + } + if (rte_eth_tx_queue_setup(dev->port_idx(), qid, default_ring_size, + rte_eth_dev_socket_id(dev->port_idx()), + dev->def_tx_conf()) < 0) { + lderr(cct) << __func__ << " cannot initialize tx queue" << dendl; + ceph_abort(); + } + } + + // + // Fill the factory with the buffers from the mempool allocated + // above. + // + init_factory(); +} + +bool DPDKQueuePair::tx_buf::i40e_should_linearize(rte_mbuf *head) +{ + bool is_tso = head->ol_flags & PKT_TX_TCP_SEG; + + // For a non-TSO case: number of fragments should not exceed 8 + if (!is_tso){ + return head->nb_segs > i40e_max_xmit_segment_frags; + } + + // + // For a TSO case each MSS window should not include more than 8 + // fragments including headers. + // + + // Calculate the number of frags containing headers. + // + // Note: we support neither VLAN nor tunneling thus headers size + // accounting is super simple. + // + size_t headers_size = head->l2_len + head->l3_len + head->l4_len; + unsigned hdr_frags = 0; + size_t cur_payload_len = 0; + rte_mbuf *cur_seg = head; + + while (cur_seg && cur_payload_len < headers_size) { + cur_payload_len += cur_seg->data_len; + cur_seg = cur_seg->next; + hdr_frags++; + } + + // + // Header fragments will be used for each TSO segment, thus the + // maximum number of data segments will be 8 minus the number of + // header fragments. 
+ // + // It's unclear from the spec how the first TSO segment is treated + // if the last fragment with headers contains some data bytes: + // whether this fragment will be accounted as a single fragment or + // as two separate fragments. We prefer to play it safe and assume + // that this fragment will be accounted as two separate fragments. + // + size_t max_win_size = i40e_max_xmit_segment_frags - hdr_frags; + + if (head->nb_segs <= max_win_size) { + return false; + } + + // Get the data (without headers) part of the first data fragment + size_t prev_frag_data = cur_payload_len - headers_size; + auto mss = head->tso_segsz; + + while (cur_seg) { + unsigned frags_in_seg = 0; + size_t cur_seg_size = 0; + + if (prev_frag_data) { + cur_seg_size = prev_frag_data; + frags_in_seg++; + prev_frag_data = 0; + } + + while (cur_seg_size < mss && cur_seg) { + cur_seg_size += cur_seg->data_len; + cur_seg = cur_seg->next; + frags_in_seg++; + + if (frags_in_seg > max_win_size) { + return true; + } + } + + if (cur_seg_size > mss) { + prev_frag_data = cur_seg_size - mss; + } + } + + return false; +} + +void DPDKQueuePair::tx_buf::set_cluster_offload_info(const Packet& p, const DPDKQueuePair& qp, rte_mbuf* head) +{ + // Handle TCP checksum offload + auto oi = p.offload_info(); + if (oi.needs_ip_csum) { + head->ol_flags |= PKT_TX_IP_CKSUM; + // TODO: Take a VLAN header into an account here + head->l2_len = sizeof(struct ether_hdr); + head->l3_len = oi.ip_hdr_len; + } + if (qp.port().get_hw_features().tx_csum_l4_offload) { + if (oi.protocol == ip_protocol_num::tcp) { + head->ol_flags |= PKT_TX_TCP_CKSUM; + // TODO: Take a VLAN header into an account here + head->l2_len = sizeof(struct ether_hdr); + head->l3_len = oi.ip_hdr_len; + + if (oi.tso_seg_size) { + ceph_assert(oi.needs_ip_csum); + head->ol_flags |= PKT_TX_TCP_SEG; + head->l4_len = oi.tcp_hdr_len; + head->tso_segsz = oi.tso_seg_size; + } + } + } +} + +DPDKQueuePair::tx_buf* DPDKQueuePair::tx_buf::from_packet_zc( + CephContext *cct, Packet&& p, DPDKQueuePair& qp) +{ + // Too fragmented - linearize + if (p.nr_frags() > max_frags) { + p.linearize(); + qp.perf_logger->inc(l_dpdk_qp_tx_linearize_ops); + } + + build_mbuf_cluster: + rte_mbuf *head = nullptr, *last_seg = nullptr; + unsigned nsegs = 0; + + // + // Create a HEAD of the fragmented packet: check if frag0 has to be + // copied and if yes - send it in a copy way + // + if (!check_frag0(p)) { + if (!copy_one_frag(qp, p.frag(0), head, last_seg, nsegs)) { + ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(0).size << dendl; + return nullptr; + } + } else if (!translate_one_frag(qp, p.frag(0), head, last_seg, nsegs)) { + ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(0).size << dendl; + return nullptr; + } + + unsigned total_nsegs = nsegs; + + for (unsigned i = 1; i < p.nr_frags(); i++) { + rte_mbuf *h = nullptr, *new_last_seg = nullptr; + if (!translate_one_frag(qp, p.frag(i), h, new_last_seg, nsegs)) { + ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(i).size << dendl; + me(head)->recycle(); + return nullptr; + } + + total_nsegs += nsegs; + + // Attach a new buffers' chain to the packet chain + last_seg->next = h; + last_seg = new_last_seg; + } + + // Update the HEAD buffer with the packet info + head->pkt_len = p.len(); + head->nb_segs = total_nsegs; + + set_cluster_offload_info(p, qp, head); + + // + // If a packet hasn't been linearized already and the resulting + // cluster requires the linearisation due to HW limitation: + // + // - Recycle the 
cluster. + // - Linearize the packet. + // - Build the cluster once again + // + if (head->nb_segs > max_frags || + (p.nr_frags() > 1 && qp.port().is_i40e_device() && i40e_should_linearize(head)) || + (p.nr_frags() > vmxnet3_max_xmit_segment_frags && qp.port().is_vmxnet3_device())) { + me(head)->recycle(); + p.linearize(); + qp.perf_logger->inc(l_dpdk_qp_tx_linearize_ops); + + goto build_mbuf_cluster; + } + + me(last_seg)->set_packet(std::move(p)); + + return me(head); +} + +void DPDKQueuePair::tx_buf::copy_packet_to_cluster(const Packet& p, rte_mbuf* head) +{ + rte_mbuf* cur_seg = head; + size_t cur_seg_offset = 0; + unsigned cur_frag_idx = 0; + size_t cur_frag_offset = 0; + + while (true) { + size_t to_copy = std::min(p.frag(cur_frag_idx).size - cur_frag_offset, + inline_mbuf_data_size - cur_seg_offset); + + memcpy(rte_pktmbuf_mtod_offset(cur_seg, void*, cur_seg_offset), + p.frag(cur_frag_idx).base + cur_frag_offset, to_copy); + + cur_frag_offset += to_copy; + cur_seg_offset += to_copy; + + if (cur_frag_offset >= p.frag(cur_frag_idx).size) { + ++cur_frag_idx; + if (cur_frag_idx >= p.nr_frags()) { + // + // We are done - set the data size of the last segment + // of the cluster. + // + cur_seg->data_len = cur_seg_offset; + break; + } + + cur_frag_offset = 0; + } + + if (cur_seg_offset >= inline_mbuf_data_size) { + cur_seg->data_len = inline_mbuf_data_size; + cur_seg = cur_seg->next; + cur_seg_offset = 0; + + // FIXME: assert in a fast-path - remove!!! + ceph_assert(cur_seg); + } + } +} + +DPDKQueuePair::tx_buf* DPDKQueuePair::tx_buf::from_packet_copy(Packet&& p, DPDKQueuePair& qp) +{ + // sanity + if (!p.len()) { + return nullptr; + } + + /* + * Here we are going to use the fact that the inline data size is a + * power of two. + * + * We will first try to allocate the cluster and only if we are + * successful - we will go and copy the data. + */ + auto aligned_len = align_up((size_t)p.len(), inline_mbuf_data_size); + unsigned nsegs = aligned_len / inline_mbuf_data_size; + rte_mbuf *head = nullptr, *last_seg = nullptr; + + tx_buf* buf = qp.get_tx_buf(); + if (!buf) { + return nullptr; + } + + head = buf->rte_mbuf_p(); + last_seg = head; + for (unsigned i = 1; i < nsegs; i++) { + buf = qp.get_tx_buf(); + if (!buf) { + me(head)->recycle(); + return nullptr; + } + + last_seg->next = buf->rte_mbuf_p(); + last_seg = last_seg->next; + } + + // + // If we've got here means that we have succeeded already! + // We only need to copy the data and set the head buffer with the + // relevant info. + // + head->pkt_len = p.len(); + head->nb_segs = nsegs; + + copy_packet_to_cluster(p, head); + set_cluster_offload_info(p, qp, head); + + return me(head); +} + +size_t DPDKQueuePair::tx_buf::copy_one_data_buf( + DPDKQueuePair& qp, rte_mbuf*& m, char* data, size_t buf_len) +{ + tx_buf* buf = qp.get_tx_buf(); + if (!buf) { + return 0; + } + + size_t len = std::min(buf_len, inline_mbuf_data_size); + + m = buf->rte_mbuf_p(); + + // mbuf_put() + m->data_len = len; + m->pkt_len = len; + + qp.perf_logger->inc(l_dpdk_qp_tx_copy_ops); + qp.perf_logger->inc(l_dpdk_qp_tx_copy_bytes, len); + + memcpy(rte_pktmbuf_mtod(m, void*), data, len); + + return len; +} + +void DPDKDevice::set_rss_table() +{ + // always fill our local indirection table. 
+ unsigned i = 0; + for (auto& r : _redir_table) { + r = i++ % _num_queues; + } + + if (_dev_info.reta_size == 0) + return; + + int reta_conf_size = std::max(1, _dev_info.reta_size / RTE_RETA_GROUP_SIZE); + rte_eth_rss_reta_entry64 reta_conf[reta_conf_size]; + + // Configure the HW indirection table + i = 0; + for (auto& x : reta_conf) { + x.mask = ~0ULL; + for (auto& r: x.reta) { + r = i++ % _num_queues; + } + } + + if (rte_eth_dev_rss_reta_update(_port_idx, reta_conf, _dev_info.reta_size)) { + rte_exit(EXIT_FAILURE, "Port %d: Failed to update an RSS indirection table", _port_idx); + } +} + +/******************************** Interface functions *************************/ + +std::unique_ptr<DPDKDevice> create_dpdk_net_device( + CephContext *cct, + unsigned cores, + uint8_t port_idx, + bool use_lro, + bool enable_fc) +{ + // Check that we have at least one DPDK-able port + if (rte_eth_dev_count() == 0) { + rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n"); + } else { + ldout(cct, 10) << __func__ << " ports number: " << int(rte_eth_dev_count()) << dendl; + } + + return std::unique_ptr<DPDKDevice>( + new DPDKDevice(cct, port_idx, cores, use_lro, enable_fc)); +} diff --git a/src/msg/async/dpdk/DPDK.h b/src/msg/async/dpdk/DPDK.h new file mode 100644 index 00000000..fa12af6b --- /dev/null +++ b/src/msg/async/dpdk/DPDK.h @@ -0,0 +1,918 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
+ */ + +#ifndef CEPH_DPDK_DEV_H +#define CEPH_DPDK_DEV_H + +#include <memory> +#include <functional> +#include <rte_config.h> +#include <rte_common.h> +#include <rte_ethdev.h> +#include <rte_malloc.h> +#include <rte_version.h> + +#include "include/page.h" +#include "common/Tub.h" +#include "common/perf_counters.h" +#include "msg/async/Event.h" +#include "const.h" +#include "circular_buffer.h" +#include "ethernet.h" +#include "Packet.h" +#include "stream.h" +#include "net.h" +#include "toeplitz.h" + + +struct free_deleter { + void operator()(void* p) { ::free(p); } +}; + + +enum { + l_dpdk_dev_first = 58800, + l_dpdk_dev_rx_mcast, + l_dpdk_dev_rx_total_errors, + l_dpdk_dev_tx_total_errors, + l_dpdk_dev_rx_badcrc_errors, + l_dpdk_dev_rx_dropped_errors, + l_dpdk_dev_rx_nombuf_errors, + l_dpdk_dev_last +}; + +enum { + l_dpdk_qp_first = 58900, + l_dpdk_qp_rx_packets, + l_dpdk_qp_tx_packets, + l_dpdk_qp_rx_bad_checksum_errors, + l_dpdk_qp_rx_no_memory_errors, + l_dpdk_qp_rx_bytes, + l_dpdk_qp_tx_bytes, + l_dpdk_qp_rx_last_bunch, + l_dpdk_qp_tx_last_bunch, + l_dpdk_qp_rx_fragments, + l_dpdk_qp_tx_fragments, + l_dpdk_qp_rx_copy_ops, + l_dpdk_qp_tx_copy_ops, + l_dpdk_qp_rx_copy_bytes, + l_dpdk_qp_tx_copy_bytes, + l_dpdk_qp_rx_linearize_ops, + l_dpdk_qp_tx_linearize_ops, + l_dpdk_qp_tx_queue_length, + l_dpdk_qp_last +}; + +class DPDKDevice; +class DPDKWorker; + +class DPDKQueuePair { + using packet_provider_type = std::function<Tub<Packet> ()>; + public: + void configure_proxies(const std::map<unsigned, float>& cpu_weights); + // build REdirection TAble for cpu_weights map: target cpu -> weight + void build_sw_reta(const std::map<unsigned, float>& cpu_weights); + void proxy_send(Packet p) { + _proxy_packetq.push_back(std::move(p)); + } + void register_packet_provider(packet_provider_type func) { + _pkt_providers.push_back(std::move(func)); + } + bool poll_tx(); + friend class DPDKDevice; + + class tx_buf_factory; + + class tx_buf { + friend class DPDKQueuePair; + public: + static tx_buf* me(rte_mbuf* mbuf) { + return reinterpret_cast<tx_buf*>(mbuf); + } + + private: + /** + * Checks if the original packet of a given cluster should be linearized + * due to HW limitations. + * + * @param head head of a cluster to check + * + * @return TRUE if a packet should be linearized. + */ + static bool i40e_should_linearize(rte_mbuf *head); + + /** + * Sets the offload info in the head buffer of an rte_mbufs cluster. + * + * @param p an original packet the cluster is built for + * @param qp QP handle + * @param head a head of an rte_mbufs cluster + */ + static void set_cluster_offload_info(const Packet& p, const DPDKQueuePair& qp, rte_mbuf* head); + + /** + * Creates a tx_buf cluster representing a given packet in a "zero-copy" + * way. + * + * @param p packet to translate + * @param qp DPDKQueuePair handle + * + * @return the HEAD tx_buf of the cluster or nullptr in case of a + * failure + */ + static tx_buf* from_packet_zc( + CephContext *cct, Packet&& p, DPDKQueuePair& qp); + + /** + * Copy the contents of the "packet" into the given cluster of + * rte_mbuf's. + * + * @note Size of the cluster has to be big enough to accommodate all the + * contents of the given packet. + * + * @param p packet to copy + * @param head head of the rte_mbuf's cluster + */ + static void copy_packet_to_cluster(const Packet& p, rte_mbuf* head); + + /** + * Creates a tx_buf cluster representing a given packet in a "copy" way. 
+ * + * @param p packet to translate + * @param qp DPDKQueuePair handle + * + * @return the HEAD tx_buf of the cluster or nullptr in case of a + * failure + */ + static tx_buf* from_packet_copy(Packet&& p, DPDKQueuePair& qp); + + /** + * Zero-copy handling of a single fragment. + * + * @param do_one_buf Functor responsible for a single rte_mbuf + * handling + * @param qp DPDKQueuePair handle (in) + * @param frag Fragment to copy (in) + * @param head Head of the cluster (out) + * @param last_seg Last segment of the cluster (out) + * @param nsegs Number of segments in the cluster (out) + * + * @return TRUE in case of success + */ + template <class DoOneBufFunc> + static bool do_one_frag(DoOneBufFunc do_one_buf, DPDKQueuePair& qp, + fragment& frag, rte_mbuf*& head, + rte_mbuf*& last_seg, unsigned& nsegs) { + size_t len, left_to_set = frag.size; + char* base = frag.base; + + rte_mbuf* m; + + // TODO: ceph_assert() in a fast path! Remove me ASAP! + ceph_assert(frag.size); + + // Create a HEAD of mbufs' cluster and set the first bytes into it + len = do_one_buf(qp, head, base, left_to_set); + if (!len) { + return false; + } + + left_to_set -= len; + base += len; + nsegs = 1; + + // + // Set the rest of the data into the new mbufs and chain them to + // the cluster. + // + rte_mbuf* prev_seg = head; + while (left_to_set) { + len = do_one_buf(qp, m, base, left_to_set); + if (!len) { + me(head)->recycle(); + return false; + } + + left_to_set -= len; + base += len; + nsegs++; + + prev_seg->next = m; + prev_seg = m; + } + + // Return the last mbuf in the cluster + last_seg = prev_seg; + + return true; + } + + /** + * Zero-copy handling of a single fragment. + * + * @param qp DPDKQueuePair handle (in) + * @param frag Fragment to copy (in) + * @param head Head of the cluster (out) + * @param last_seg Last segment of the cluster (out) + * @param nsegs Number of segments in the cluster (out) + * + * @return TRUE in case of success + */ + static bool translate_one_frag(DPDKQueuePair& qp, fragment& frag, + rte_mbuf*& head, rte_mbuf*& last_seg, + unsigned& nsegs) { + return do_one_frag(set_one_data_buf, qp, frag, head, + last_seg, nsegs); + } + + /** + * Copies one fragment into the cluster of rte_mbuf's. + * + * @param qp DPDKQueuePair handle (in) + * @param frag Fragment to copy (in) + * @param head Head of the cluster (out) + * @param last_seg Last segment of the cluster (out) + * @param nsegs Number of segments in the cluster (out) + * + * We return the "last_seg" to avoid traversing the cluster in order to get + * it. + * + * @return TRUE in case of success + */ + static bool copy_one_frag(DPDKQueuePair& qp, fragment& frag, + rte_mbuf*& head, rte_mbuf*& last_seg, + unsigned& nsegs) { + return do_one_frag(copy_one_data_buf, qp, frag, head, + last_seg, nsegs); + } + + /** + * Allocates a single rte_mbuf and sets it to point to a given data + * buffer. 
+ * + * @param qp DPDKQueuePair handle (in) + * @param m New allocated rte_mbuf (out) + * @param va virtual address of a data buffer (in) + * @param buf_len length of the data to copy (in) + * + * @return The actual number of bytes that has been set in the mbuf + */ + static size_t set_one_data_buf( + DPDKQueuePair& qp, rte_mbuf*& m, char* va, size_t buf_len) { + static constexpr size_t max_frag_len = 15 * 1024; // 15K + + // FIXME: current all tx buf is allocated without rte_malloc + return copy_one_data_buf(qp, m, va, buf_len); + // + // Currently we break a buffer on a 15K boundary because 82599 + // devices have a 15.5K limitation on a maximum single fragment + // size. + // + rte_iova_t pa = rte_malloc_virt2iova(va); + if (!pa) + return copy_one_data_buf(qp, m, va, buf_len); + + ceph_assert(buf_len); + tx_buf* buf = qp.get_tx_buf(); + if (!buf) { + return 0; + } + + size_t len = std::min(buf_len, max_frag_len); + + buf->set_zc_info(va, pa, len); + m = buf->rte_mbuf_p(); + + return len; + } + + /** + * Allocates a single rte_mbuf and copies a given data into it. + * + * @param qp DPDKQueuePair handle (in) + * @param m New allocated rte_mbuf (out) + * @param data Data to copy from (in) + * @param buf_len length of the data to copy (in) + * + * @return The actual number of bytes that has been copied + */ + static size_t copy_one_data_buf( + DPDKQueuePair& qp, rte_mbuf*& m, char* data, size_t buf_len); + + /** + * Checks if the first fragment of the given packet satisfies the + * zero-copy flow requirement: its first 128 bytes should not cross the + * 4K page boundary. This is required in order to avoid splitting packet + * headers. + * + * @param p packet to check + * + * @return TRUE if packet is ok and FALSE otherwise. + */ + static bool check_frag0(Packet& p) + { + // + // First frag is special - it has headers that should not be split. + // If the addressing is such that the first fragment has to be + // split, then send this packet in a (non-zero) copy flow. We'll + // check if the first 128 bytes of the first fragment reside in the + // physically contiguous area. If that's the case - we are good to + // go. + // + if (p.frag(0).size < 128) + return false; + + return true; + } + + public: + tx_buf(tx_buf_factory& fc) : _fc(fc) { + + _buf_physaddr = _mbuf.buf_physaddr; + _data_off = _mbuf.data_off; + } + + rte_mbuf* rte_mbuf_p() { return &_mbuf; } + + void set_zc_info(void* va, phys_addr_t pa, size_t len) { + // mbuf_put() + _mbuf.data_len = len; + _mbuf.pkt_len = len; + + // Set the mbuf to point to our data + _mbuf.buf_addr = va; + _mbuf.buf_physaddr = pa; + _mbuf.data_off = 0; + _is_zc = true; + } + + void reset_zc() { + + // + // If this mbuf was the last in a cluster and contains an + // original packet object then call the destructor of the + // original packet object. + // + if (_p) { + // + // Reset the std::optional. This in particular is going + // to call the "packet"'s destructor and reset the + // "optional" state to "nonengaged". 
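+ // That destructor is what releases the sender's buffers (and runs
+ // any deleter captured with the packet) exactly once.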
+ // + _p.destroy(); + + } else if (!_is_zc) { + return; + } + + // Restore the rte_mbuf fields we trashed in set_zc_info() + _mbuf.buf_physaddr = _buf_physaddr; + _mbuf.buf_addr = rte_mbuf_to_baddr(&_mbuf); + _mbuf.data_off = _data_off; + + _is_zc = false; + } + + void recycle() { + struct rte_mbuf *m = &_mbuf, *m_next; + + while (m != nullptr) { + m_next = m->next; + rte_pktmbuf_reset(m); + _fc.put(me(m)); + m = m_next; + } + } + + void set_packet(Packet&& p) { + _p = std::move(p); + } + + private: + struct rte_mbuf _mbuf; + MARKER private_start; + Tub<Packet> _p; + phys_addr_t _buf_physaddr; + uint16_t _data_off; + // TRUE if underlying mbuf has been used in the zero-copy flow + bool _is_zc = false; + // buffers' factory the buffer came from + tx_buf_factory& _fc; + MARKER private_end; + }; + + class tx_buf_factory { + // + // Number of buffers to free in each GC iteration: + // We want the buffers to be allocated from the mempool as many as + // possible. + // + // On the other hand if there is no Tx for some time we want the + // completions to be eventually handled. Thus we choose the smallest + // possible packets count number here. + // + static constexpr int gc_count = 1; + public: + tx_buf_factory(CephContext *c, DPDKDevice *dev, uint8_t qid); + ~tx_buf_factory() { + // put all mbuf back into mempool in order to make the next factory work + while (gc()); + rte_mempool_put_bulk(_pool, (void**)_ring.data(), + _ring.size()); + } + + + /** + * @note Should not be called if there are no free tx_buf's + * + * @return a free tx_buf object + */ + tx_buf* get() { + // Take completed from the HW first + tx_buf *pkt = get_one_completed(); + if (pkt) { + pkt->reset_zc(); + return pkt; + } + + // + // If there are no completed at the moment - take from the + // factory's cache. + // + if (_ring.empty()) { + return nullptr; + } + + pkt = _ring.back(); + _ring.pop_back(); + + return pkt; + } + + void put(tx_buf* buf) { + buf->reset_zc(); + _ring.push_back(buf); + } + + bool gc() { + for (int cnt = 0; cnt < gc_count; ++cnt) { + auto tx_buf_p = get_one_completed(); + if (!tx_buf_p) { + return false; + } + + put(tx_buf_p); + } + + return true; + } + private: + /** + * Fill the mbufs circular buffer: after this the _pool will become + * empty. We will use it to catch the completed buffers: + * + * - Underlying PMD drivers will "free" the mbufs once they are + * completed. + * - We will poll the _pktmbuf_pool_tx till it's empty and release + * all the buffers from the freed mbufs. + */ + void init_factory() { + while (rte_mbuf* mbuf = rte_pktmbuf_alloc(_pool)) { + _ring.push_back(new(tx_buf::me(mbuf)) tx_buf{*this}); + } + } + + /** + * PMD puts the completed buffers back into the mempool they have + * originally come from. + * + * @note rte_pktmbuf_alloc() resets the mbuf so there is no need to call + * rte_pktmbuf_reset() here again. + * + * @return a single tx_buf that has been completed by HW. 
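+ * Returns nullptr when the PMD has not yet released any buffers back
+ * to the mempool (init_factory() drains the pool up front). A typical
+ * reap-and-recycle cycle, as done in get() and gc(), looks like:
+ *
+ * @code
+ * tx_buf* pkt = get_one_completed();
+ * if (pkt) {
+ *   pkt->reset_zc(); // restore the fields set_zc_info() trashed
+ * }
+ * @endcode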
+ */ + tx_buf* get_one_completed() { + return tx_buf::me(rte_pktmbuf_alloc(_pool)); + } + + private: + CephContext *cct; + std::vector<tx_buf*> _ring; + rte_mempool* _pool = nullptr; + }; + + public: + explicit DPDKQueuePair(CephContext *c, EventCenter *cen, DPDKDevice* dev, uint8_t qid); + ~DPDKQueuePair() { + if (device_stat_time_fd) { + center->delete_time_event(device_stat_time_fd); + } + rx_gc(true); + } + + void rx_start() { + _rx_poller.construct(this); + } + + uint32_t send(circular_buffer<Packet>& pb) { + // Zero-copy send + return _send(pb, [&] (Packet&& p) { + return tx_buf::from_packet_zc(cct, std::move(p), *this); + }); + } + + DPDKDevice& port() const { return *_dev; } + tx_buf* get_tx_buf() { return _tx_buf_factory.get(); } + + void handle_stats(); + + private: + template <class Func> + uint32_t _send(circular_buffer<Packet>& pb, Func &&packet_to_tx_buf_p) { + if (_tx_burst.size() == 0) { + for (auto&& p : pb) { + // TODO: ceph_assert() in a fast path! Remove me ASAP! + ceph_assert(p.len()); + + tx_buf* buf = packet_to_tx_buf_p(std::move(p)); + if (!buf) { + break; + } + + _tx_burst.push_back(buf->rte_mbuf_p()); + } + } + + uint16_t sent = rte_eth_tx_burst(_dev_port_idx, _qid, + _tx_burst.data() + _tx_burst_idx, + _tx_burst.size() - _tx_burst_idx); + + uint64_t nr_frags = 0, bytes = 0; + + for (int i = 0; i < sent; i++) { + rte_mbuf* m = _tx_burst[_tx_burst_idx + i]; + bytes += m->pkt_len; + nr_frags += m->nb_segs; + pb.pop_front(); + } + + perf_logger->inc(l_dpdk_qp_tx_fragments, nr_frags); + perf_logger->inc(l_dpdk_qp_tx_bytes, bytes); + + _tx_burst_idx += sent; + + if (_tx_burst_idx == _tx_burst.size()) { + _tx_burst_idx = 0; + _tx_burst.clear(); + } + + return sent; + } + + /** + * Allocate a new data buffer and set the mbuf to point to it. + * + * Do some DPDK hacks to work on PMD: it assumes that the buf_addr + * points to the private data of RTE_PKTMBUF_HEADROOM before the actual + * data buffer. + * + * @param m mbuf to update + */ + static bool refill_rx_mbuf(rte_mbuf* m, size_t size, + std::vector<void*> &datas) { + if (datas.empty()) + return false; + void *data = datas.back(); + datas.pop_back(); + + // + // Set the mbuf to point to our data. + // + // Do some DPDK hacks to work on PMD: it assumes that the buf_addr + // points to the private data of RTE_PKTMBUF_HEADROOM before the + // actual data buffer. + // + m->buf_addr = (char*)data - RTE_PKTMBUF_HEADROOM; + m->buf_physaddr = rte_mem_virt2phy(data) - RTE_PKTMBUF_HEADROOM; + return true; + } + + bool init_rx_mbuf_pool(); + bool rx_gc(bool force=false); + bool refill_one_cluster(rte_mbuf* head); + + /** + * Polls for a burst of incoming packets. This function will not block and + * will immediately return after processing all available packets. + * + */ + bool poll_rx_once(); + + /** + * Translates an rte_mbuf's into packet and feeds them to _rx_stream. + * + * @param bufs An array of received rte_mbuf's + * @param count Number of buffers in the bufs[] + */ + void process_packets(struct rte_mbuf **bufs, uint16_t count); + + /** + * Translate rte_mbuf into the "packet". + * @param m mbuf to translate + * + * @return a "optional" object representing the newly received data if in an + * "engaged" state or an error if in a "disengaged" state. + */ + Tub<Packet> from_mbuf(rte_mbuf* m); + + /** + * Transform an LRO rte_mbuf cluster into the "packet" object. 
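+ * Each mbuf segment of the cluster becomes one fragment of the
+ * resulting packet, so the large LRO payload need not be linearized.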
+ * @param m HEAD of the mbufs' cluster to transform + * + * @return a "optional" object representing the newly received LRO packet if + * in an "engaged" state or an error if in a "disengaged" state. + */ + Tub<Packet> from_mbuf_lro(rte_mbuf* m); + + private: + CephContext *cct; + std::vector<packet_provider_type> _pkt_providers; + Tub<std::array<uint8_t, 128>> _sw_reta; + circular_buffer<Packet> _proxy_packetq; + stream<Packet> _rx_stream; + circular_buffer<Packet> _tx_packetq; + std::vector<void*> _alloc_bufs; + + PerfCounters *perf_logger; + DPDKDevice* _dev; + uint8_t _dev_port_idx; + EventCenter *center; + uint8_t _qid; + rte_mempool *_pktmbuf_pool_rx; + std::vector<rte_mbuf*> _rx_free_pkts; + std::vector<rte_mbuf*> _rx_free_bufs; + std::vector<fragment> _frags; + std::vector<char*> _bufs; + size_t _num_rx_free_segs = 0; + uint64_t device_stat_time_fd = 0; + +#ifdef CEPH_PERF_DEV + uint64_t rx_cycles = 0; + uint64_t rx_count = 0; + uint64_t tx_cycles = 0; + uint64_t tx_count = 0; +#endif + + class DPDKTXPoller : public EventCenter::Poller { + DPDKQueuePair *qp; + + public: + explicit DPDKTXPoller(DPDKQueuePair *qp) + : EventCenter::Poller(qp->center, "DPDK::DPDKTXPoller"), qp(qp) {} + + virtual int poll() { + return qp->poll_tx(); + } + } _tx_poller; + + class DPDKRXGCPoller : public EventCenter::Poller { + DPDKQueuePair *qp; + + public: + explicit DPDKRXGCPoller(DPDKQueuePair *qp) + : EventCenter::Poller(qp->center, "DPDK::DPDKRXGCPoller"), qp(qp) {} + + virtual int poll() { + return qp->rx_gc(); + } + } _rx_gc_poller; + tx_buf_factory _tx_buf_factory; + class DPDKRXPoller : public EventCenter::Poller { + DPDKQueuePair *qp; + + public: + explicit DPDKRXPoller(DPDKQueuePair *qp) + : EventCenter::Poller(qp->center, "DPDK::DPDKRXPoller"), qp(qp) {} + + virtual int poll() { + return qp->poll_rx_once(); + } + }; + Tub<DPDKRXPoller> _rx_poller; + class DPDKTXGCPoller : public EventCenter::Poller { + DPDKQueuePair *qp; + + public: + explicit DPDKTXGCPoller(DPDKQueuePair *qp) + : EventCenter::Poller(qp->center, "DPDK::DPDKTXGCPoller"), qp(qp) {} + + virtual int poll() { + return qp->_tx_buf_factory.gc(); + } + } _tx_gc_poller; + std::vector<rte_mbuf*> _tx_burst; + uint16_t _tx_burst_idx = 0; +}; + +class DPDKDevice { + public: + CephContext *cct; + PerfCounters *perf_logger; + std::vector<std::unique_ptr<DPDKQueuePair>> _queues; + std::vector<DPDKWorker*> workers; + size_t _rss_table_bits = 0; + uint8_t _port_idx; + uint16_t _num_queues; + unsigned cores; + hw_features _hw_features; + uint8_t _queues_ready = 0; + unsigned _home_cpu; + bool _use_lro; + bool _enable_fc; + std::vector<uint8_t> _redir_table; + rss_key_type _rss_key; + bool _is_i40e_device = false; + bool _is_vmxnet3_device = false; + + public: + rte_eth_dev_info _dev_info = {}; + + /** + * The final stage of a port initialization. + * @note Must be called *after* all queues from stage (2) have been + * initialized. + */ + int init_port_fini(); + + private: + /** + * Port initialization consists of 3 main stages: + * 1) General port initialization which ends with a call to + * rte_eth_dev_configure() where we request the needed number of Rx and + * Tx queues. + * 2) Individual queues initialization. This is done in the constructor of + * DPDKQueuePair class. In particular the memory pools for queues are allocated + * in this stage. + * 3) The final stage of the initialization which starts with the call of + * rte_eth_dev_start() after which the port becomes fully functional. 
We
+ * will also wait for a link to get up in this stage.
+ */
+
+
+ /**
+ * First stage of the port initialization.
+ *
+ * @return 0 in case of success and an appropriate error code in case of an
+ * error.
+ */
+ int init_port_start();
+
+ /**
+ * Check the link status of our port for up to 9 seconds, and print the
+ * final status.
+ */
+ int check_port_link_status();
+
+ /**
+ * Configures the HW Flow Control
+ */
+ void set_hw_flow_control();
+
+ public:
+ DPDKDevice(CephContext *c, uint8_t port_idx, uint16_t num_queues, bool use_lro, bool enable_fc):
+ cct(c), _port_idx(port_idx), _num_queues(num_queues),
+ _home_cpu(0), _use_lro(use_lro),
+ _enable_fc(enable_fc) {
+ _queues = std::vector<std::unique_ptr<DPDKQueuePair>>(_num_queues);
+ /* now initialise the port we will use */
+ int ret = init_port_start();
+ if (ret != 0) {
+ rte_exit(EXIT_FAILURE, "Cannot initialise port %u\n", _port_idx);
+ }
+ string name(std::string("port") + std::to_string(port_idx));
+ PerfCountersBuilder plb(cct, name, l_dpdk_dev_first, l_dpdk_dev_last);
+
+ plb.add_u64_counter(l_dpdk_dev_rx_mcast, "dpdk_device_receive_multicast_packets", "DPDK received multicast packets");
+ plb.add_u64_counter(l_dpdk_dev_rx_badcrc_errors, "dpdk_device_receive_badcrc_errors", "DPDK received bad crc errors");
+
+ plb.add_u64_counter(l_dpdk_dev_rx_total_errors, "dpdk_device_receive_total_errors", "DPDK received total errors");
+ plb.add_u64_counter(l_dpdk_dev_tx_total_errors, "dpdk_device_send_total_errors", "DPDK send total errors");
+ plb.add_u64_counter(l_dpdk_dev_rx_dropped_errors, "dpdk_device_receive_dropped_errors", "DPDK received dropped errors");
+ plb.add_u64_counter(l_dpdk_dev_rx_nombuf_errors, "dpdk_device_receive_nombuf_errors", "DPDK received RX mbuf allocation errors");
+
+ perf_logger = plb.create_perf_counters();
+ cct->get_perfcounters_collection()->add(perf_logger);
+ }
+
+ ~DPDKDevice() {
+ rte_eth_dev_stop(_port_idx);
+ }
+
+ DPDKQueuePair& queue_for_cpu(unsigned cpu) { return *_queues[cpu]; }
+ void l2receive(int qid, Packet p) {
+ _queues[qid]->_rx_stream.produce(std::move(p));
+ }
+ subscription<Packet> receive(unsigned cpuid, std::function<int (Packet)> next_packet) {
+ auto sub = _queues[cpuid]->_rx_stream.listen(std::move(next_packet));
+ _queues[cpuid]->rx_start();
+ return std::move(sub);
+ }
+ ethernet_address hw_address() {
+ struct ether_addr mac;
+ rte_eth_macaddr_get(_port_idx, &mac);
+
+ return mac.addr_bytes;
+ }
+ hw_features get_hw_features() {
+ return _hw_features;
+ }
+ const rss_key_type& rss_key() const { return _rss_key; }
+ uint16_t hw_queues_count() { return _num_queues; }
+ std::unique_ptr<DPDKQueuePair> init_local_queue(CephContext *c, EventCenter *center, string hugepages, uint16_t qid) {
+ std::unique_ptr<DPDKQueuePair> qp;
+ qp = std::unique_ptr<DPDKQueuePair>(new DPDKQueuePair(c, center, this, qid));
+ return std::move(qp);
+ }
+ unsigned hash2qid(uint32_t hash) {
+ // return hash % hw_queues_count();
+ return _redir_table[hash & (_redir_table.size() - 1)];
+ }
+ void set_local_queue(unsigned i, std::unique_ptr<DPDKQueuePair> qp) {
+ ceph_assert(!_queues[i]);
+ _queues[i] = std::move(qp);
+ }
+ void unset_local_queue(unsigned i) {
+ ceph_assert(_queues[i]);
+ _queues[i].reset();
+ }
+ template <typename Func>
+ unsigned forward_dst(unsigned src_cpuid, Func&& hashfn) {
+ auto& qp = queue_for_cpu(src_cpuid);
+ if (!qp._sw_reta)
+ return src_cpuid;
+
+ ceph_assert(qp._sw_reta);
+ auto hash = hashfn() >> _rss_table_bits;
+ auto& reta = *qp._sw_reta;
+ return reta[hash % reta.size()];
+ }
+
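+ /**
+ * Map an RSS hash to the cpu that should handle the packet: first to a
+ * HW queue via the device redirection table (hash2qid()), then through
+ * that queue's software RETA if one was built with build_sw_reta().
+ *
+ * A minimal sketch of the dispatch path (illustrative only; `dev` is a
+ * DPDKDevice and `hash_data` stands for the connection fields fed to
+ * the Toeplitz hash):
+ *
+ * @code
+ * forward_hash hash_data;
+ * // ... push_back() src/dst address and ports ...
+ * uint32_t hash = toeplitz_hash(dev.rss_key(), hash_data);
+ * unsigned cpu = dev.hash2cpu(hash); // == hash2qid(hash) if no sw reta
+ * @endcode
+ */
+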
unsigned hash2cpu(uint32_t hash) { + // there is an assumption here that qid == get_id() which will + // not necessary be true in the future + return forward_dst(hash2qid(hash), [hash] { return hash; }); + } + + hw_features& hw_features_ref() { return _hw_features; } + + const rte_eth_rxconf* def_rx_conf() const { + return &_dev_info.default_rxconf; + } + + const rte_eth_txconf* def_tx_conf() const { + return &_dev_info.default_txconf; + } + + /** + * Set the RSS table in the device and store it in the internal vector. + */ + void set_rss_table(); + + uint8_t port_idx() { return _port_idx; } + bool is_i40e_device() const { + return _is_i40e_device; + } + bool is_vmxnet3_device() const { + return _is_vmxnet3_device; + } +}; + + +std::unique_ptr<DPDKDevice> create_dpdk_net_device( + CephContext *c, unsigned cores, uint8_t port_idx = 0, + bool use_lro = true, bool enable_fc = true); + + +/** + * @return Number of bytes needed for mempool objects of each QP. + */ +uint32_t qp_mempool_obj_size(); + +#endif // CEPH_DPDK_DEV_H diff --git a/src/msg/async/dpdk/DPDKStack.cc b/src/msg/async/dpdk/DPDKStack.cc new file mode 100644 index 00000000..3101ae57 --- /dev/null +++ b/src/msg/async/dpdk/DPDKStack.cc @@ -0,0 +1,281 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include <memory> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +#include <tuple> + +#include "common/ceph_argparse.h" +#include "dpdk_rte.h" +#include "DPDKStack.h" +#include "DPDK.h" +#include "IP.h" +#include "TCP-Stack.h" + +#include "common/dout.h" +#include "include/ceph_assert.h" +#include "common/Cond.h" + +#define dout_subsys ceph_subsys_dpdk +#undef dout_prefix +#define dout_prefix *_dout << "dpdkstack " + +static int dpdk_thread_adaptor(void* f) +{ + (*static_cast<std::function<void ()>*>(f))(); + return 0; +} + +void DPDKWorker::initialize() +{ + static enum { + WAIT_DEVICE_STAGE, + WAIT_PORT_FIN_STAGE, + DONE + } create_stage = WAIT_DEVICE_STAGE; + static Mutex lock("DPDKStack::lock"); + static Cond cond; + static unsigned queue_init_done = 0; + static unsigned cores = 0; + static std::shared_ptr<DPDKDevice> sdev; + + unsigned i = center.get_id(); + if (i == 0) { + // Hardcoded port index 0. 
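+ // (only a default: the call below passes cct->_conf->ms_dpdk_port_id)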
+ cores = cct->_conf->ms_async_op_threads;
+ std::unique_ptr<DPDKDevice> dev = create_dpdk_net_device(
+ cct, cores, cct->_conf->ms_dpdk_port_id,
+ cct->_conf->ms_dpdk_lro,
+ cct->_conf->ms_dpdk_hw_flow_control);
+ sdev = std::shared_ptr<DPDKDevice>(dev.release());
+ sdev->workers.resize(cores);
+ ldout(cct, 1) << __func__ << " using " << cores << " cores " << dendl;
+
+ Mutex::Locker l(lock);
+ create_stage = WAIT_PORT_FIN_STAGE;
+ cond.Signal();
+ } else {
+ Mutex::Locker l(lock);
+ while (create_stage <= WAIT_DEVICE_STAGE)
+ cond.Wait(lock);
+ }
+ ceph_assert(sdev);
+ if (i < sdev->hw_queues_count()) {
+ auto qp = sdev->init_local_queue(cct, &center, cct->_conf->ms_dpdk_hugepages, i);
+ std::map<unsigned, float> cpu_weights;
+ for (unsigned j = sdev->hw_queues_count() + i % sdev->hw_queues_count();
+ j < cores; j+= sdev->hw_queues_count())
+ cpu_weights[j] = 1;
+ cpu_weights[i] = cct->_conf->ms_dpdk_hw_queue_weight;
+ qp->configure_proxies(cpu_weights);
+ sdev->set_local_queue(i, std::move(qp));
+ Mutex::Locker l(lock);
+ ++queue_init_done;
+ cond.Signal();
+ } else {
+ // auto master = qid % sdev->hw_queues_count();
+ // sdev->set_local_queue(create_proxy_net_device(master, sdev.get()));
+ ceph_abort();
+ }
+ if (i == 0) {
+ {
+ Mutex::Locker l(lock);
+ while (queue_init_done < cores)
+ cond.Wait(lock);
+ }
+
+ if (sdev->init_port_fini() < 0) {
+ lderr(cct) << __func__ << " init_port_fini failed " << dendl;
+ ceph_abort();
+ }
+ Mutex::Locker l(lock);
+ create_stage = DONE;
+ cond.Signal();
+ } else {
+ Mutex::Locker l(lock);
+ while (create_stage <= WAIT_PORT_FIN_STAGE)
+ cond.Wait(lock);
+ }
+
+ sdev->workers[i] = this;
+ _impl = std::unique_ptr<DPDKWorker::Impl>(
+ new DPDKWorker::Impl(cct, i, &center, sdev));
+ {
+ Mutex::Locker l(lock);
+ if (!--queue_init_done) {
+ create_stage = WAIT_DEVICE_STAGE;
+ sdev.reset();
+ }
+ }
+}
+
+using AvailableIPAddress = std::tuple<string, string, string>;
+static bool parse_available_address(
+ const string &ips, const string &gates, const string &masks, vector<AvailableIPAddress> &res)
+{
+ vector<string> ip_vec, gate_vec, mask_vec;
+ string_to_vec(ip_vec, ips);
+ string_to_vec(gate_vec, gates);
+ string_to_vec(mask_vec, masks);
+ if (ip_vec.empty() || ip_vec.size() != gate_vec.size() || ip_vec.size() != mask_vec.size())
+ return false;
+
+ for (size_t i = 0; i < ip_vec.size(); ++i) {
+ res.push_back(AvailableIPAddress{ip_vec[i], gate_vec[i], mask_vec[i]});
+ }
+ return true;
+}
+
+static bool match_available_address(const vector<AvailableIPAddress> &avails,
+ const entity_addr_t &ip, int &res)
+{
+ for (size_t i = 0; i < avails.size(); ++i) {
+ entity_addr_t addr;
+ auto a = std::get<0>(avails[i]).c_str();
+ if (!addr.parse(a))
+ continue;
+ if (addr.is_same_host(ip)) {
+ res = i;
+ return true;
+ }
+ }
+ return false;
+}
+
+DPDKWorker::Impl::Impl(CephContext *cct, unsigned i, EventCenter *c, std::shared_ptr<DPDKDevice> dev)
+ : id(i), _netif(cct, dev, c), _dev(dev), _inet(cct, c, &_netif)
+{
+ vector<AvailableIPAddress> tuples;
+ bool parsed = parse_available_address(cct->_conf.get_val<std::string>("ms_dpdk_host_ipv4_addr"),
+ cct->_conf.get_val<std::string>("ms_dpdk_gateway_ipv4_addr"),
+ cct->_conf.get_val<std::string>("ms_dpdk_netmask_ipv4_addr"), tuples);
+ if (!parsed) {
+ lderr(cct) << __func__ << " no available address "
+ << cct->_conf.get_val<std::string>("ms_dpdk_host_ipv4_addr") << ", "
+ << cct->_conf.get_val<std::string>("ms_dpdk_gateway_ipv4_addr") << ", "
+ <<
cct->_conf.get_val<std::string>("ms_dpdk_netmask_ipv4_addr") << ", " + << dendl; + ceph_abort(); + } + _inet.set_host_address(ipv4_address(std::get<0>(tuples[0]))); + _inet.set_gw_address(ipv4_address(std::get<1>(tuples[0]))); + _inet.set_netmask_address(ipv4_address(std::get<2>(tuples[0]))); +} + +DPDKWorker::Impl::~Impl() +{ + _dev->unset_local_queue(id); +} + +int DPDKWorker::listen(entity_addr_t &sa, const SocketOptions &opt, + ServerSocket *sock) +{ + ceph_assert(sa.get_family() == AF_INET); + ceph_assert(sock); + + ldout(cct, 10) << __func__ << " addr " << sa << dendl; + // vector<AvailableIPAddress> tuples; + // bool parsed = parse_available_address(cct->_conf->ms_dpdk_host_ipv4_addr, + // cct->_conf->ms_dpdk_gateway_ipv4_addr, + // cct->_conf->ms_dpdk_netmask_ipv4_addr, tuples); + // if (!parsed) { + // lderr(cct) << __func__ << " no available address " + // << cct->_conf->ms_dpdk_host_ipv4_addr << ", " + // << cct->_conf->ms_dpdk_gateway_ipv4_addr << ", " + // << cct->_conf->ms_dpdk_netmask_ipv4_addr << ", " + // << dendl; + // return -EINVAL; + // } + // int idx; + // parsed = match_available_address(tuples, sa, idx); + // if (!parsed) { + // lderr(cct) << __func__ << " no matched address for " << sa << dendl; + // return -EINVAL; + // } + // _inet.set_host_address(ipv4_address(std::get<0>(tuples[idx]))); + // _inet.set_gw_address(ipv4_address(std::get<1>(tuples[idx]))); + // _inet.set_netmask_address(ipv4_address(std::get<2>(tuples[idx]))); + return tcpv4_listen(_impl->_inet.get_tcp(), sa.get_port(), opt, sa.get_type(), + sock); +} + +int DPDKWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) +{ + // ceph_assert(addr.get_family() == AF_INET); + int r = tcpv4_connect(_impl->_inet.get_tcp(), addr, socket); + ldout(cct, 10) << __func__ << " addr " << addr << dendl; + return r; +} + +void DPDKStack::spawn_worker(unsigned i, std::function<void ()> &&func) +{ + // create a extra master thread + // + funcs[i] = std::move(func); + int r = 0; + r = dpdk::eal::init(cct); + if (r < 0) { + lderr(cct) << __func__ << " init dpdk rte failed, r=" << r << dendl; + ceph_abort(); + } + // if dpdk::eal::init already called by NVMEDevice, we will select 1..n + // cores + ceph_assert(rte_lcore_count() >= i + 1); + unsigned core_id; + int j = i; + RTE_LCORE_FOREACH_SLAVE(core_id) { + if (i-- == 0) { + break; + } + } + dpdk::eal::execute_on_master([&]() { + r = rte_eal_remote_launch(dpdk_thread_adaptor, static_cast<void*>(&funcs[j]), core_id); + if (r < 0) { + lderr(cct) << __func__ << " remote launch failed, r=" << r << dendl; + ceph_abort(); + } + }); +} + +void DPDKStack::join_worker(unsigned i) +{ + dpdk::eal::execute_on_master([&]() { + rte_eal_wait_lcore(i+1); + }); +} diff --git a/src/msg/async/dpdk/DPDKStack.h b/src/msg/async/dpdk/DPDKStack.h new file mode 100644 index 00000000..a44ae383 --- /dev/null +++ b/src/msg/async/dpdk/DPDKStack.h @@ -0,0 +1,257 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+#ifndef CEPH_MSG_DPDKSTACK_H
+#define CEPH_MSG_DPDKSTACK_H
+
+#include <functional>
+
+#include "common/ceph_context.h"
+#include "common/Tub.h"
+
+#include "msg/async/Stack.h"
+#include "net.h"
+#include "const.h"
+#include "IP.h"
+#include "Packet.h"
+
+class interface;
+
+template <typename Protocol>
+class NativeConnectedSocketImpl;
+
+// DPDKServerSocketImpl
+template <typename Protocol>
+class DPDKServerSocketImpl : public ServerSocketImpl {
+ typename Protocol::listener _listener;
+ public:
+ DPDKServerSocketImpl(Protocol& proto, uint16_t port, const SocketOptions &opt,
+ int type);
+ int listen() {
+ return _listener.listen();
+ }
+ virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+ virtual void abort_accept() override;
+ virtual int fd() const override {
+ return _listener.fd();
+ }
+};
+
+// NativeConnectedSocketImpl
+template <typename Protocol>
+class NativeConnectedSocketImpl : public ConnectedSocketImpl {
+ typename Protocol::connection _conn;
+ uint32_t _cur_frag = 0;
+ uint32_t _cur_off = 0;
+ Tub<Packet> _buf;
+ Tub<bufferptr> _cache_ptr;
+
+ public:
+ explicit NativeConnectedSocketImpl(typename Protocol::connection conn)
+ : _conn(std::move(conn)) {}
+ NativeConnectedSocketImpl(NativeConnectedSocketImpl &&rhs)
+ : _conn(std::move(rhs._conn)), _buf(std::move(rhs._buf)) {}
+ virtual int is_connected() override {
+ return _conn.is_connected();
+ }
+
+ virtual ssize_t read(char *buf, size_t len) override {
+ size_t left = len;
+ ssize_t r = 0;
+ size_t off = 0;
+ while (left > 0) {
+ if (!_cache_ptr) {
+ _cache_ptr.construct();
+ r = zero_copy_read(*_cache_ptr);
+ if (r <= 0) {
+ _cache_ptr.destroy();
+ if (r == -EAGAIN)
+ break;
+ return r;
+ }
+ }
+ if (_cache_ptr->length() <= left) {
+ _cache_ptr->copy_out(0, _cache_ptr->length(), buf+off);
+ left -= _cache_ptr->length();
+ off += _cache_ptr->length();
+ _cache_ptr.destroy();
+ } else {
+ _cache_ptr->copy_out(0, left, buf+off);
+ _cache_ptr->set_offset(_cache_ptr->offset() + left);
+ _cache_ptr->set_length(_cache_ptr->length() - left);
+ left = 0;
+ break;
+ }
+ }
+ return len - left ? len - left : -EAGAIN;
+ }
+
+ virtual ssize_t zero_copy_read(bufferptr &data) override {
+ auto err = _conn.get_errno();
+ if (err <= 0)
+ return err;
+
+ if (!_buf) {
+ _buf = std::move(_conn.read());
+ if (!_buf)
+ return -EAGAIN;
+ }
+
+ fragment &f = _buf->frag(_cur_frag);
+ Packet p = _buf->share(_cur_off, f.size);
+ auto del = std::bind(
+ [](Packet &p) {}, std::move(p));
+ data = buffer::claim_buffer(
+ f.size, f.base, make_deleter(std::move(del)));
+ if (++_cur_frag == _buf->nr_frags()) {
+ _cur_frag = 0;
+ _cur_off = 0;
+ _buf.destroy();
+ } else {
+ _cur_off += f.size;
+ }
+ ceph_assert(data.length());
+ return data.length();
+ }
+ virtual ssize_t send(bufferlist &bl, bool more) override {
+ auto err = _conn.get_errno();
+ if (err < 0)
+ return (ssize_t)err;
+
+ size_t available = _conn.peek_sent_available();
+ if (available == 0) {
+ return 0;
+ }
+
+ std::vector<fragment> frags;
+ std::list<bufferptr>::const_iterator pb = bl.buffers().begin();
+ uint64_t left_pbrs = bl.buffers().size();
+ uint64_t len = 0;
+ uint64_t seglen = 0;
+ while (len < available && left_pbrs--) {
+ seglen = pb->length();
+ if (len + seglen > available) {
+ // don't continue if we already have at least 1 fragment, since
+ // there is no available space for the next ptr.
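+ // (for the first fragment, seglen is clamped below so we still
+ // queue a partial fragment and make progress)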
+ if (len > 0) + break; + seglen = std::min(seglen, available); + } + len += seglen; + frags.push_back(fragment{(char*)pb->c_str(), seglen}); + ++pb; + } + + if (len != bl.length()) { + bufferlist swapped; + bl.splice(0, len, &swapped); + auto del = std::bind( + [](bufferlist &bl) {}, std::move(swapped)); + return _conn.send(Packet(std::move(frags), make_deleter(std::move(del)))); + } else { + auto del = std::bind( + [](bufferlist &bl) {}, std::move(bl)); + + return _conn.send(Packet(std::move(frags), make_deleter(std::move(del)))); + } + } + virtual void shutdown() override { + _conn.close_write(); + } + // FIXME need to impl close + virtual void close() override { + _conn.close_write(); + } + virtual int fd() const override { + return _conn.fd(); + } + virtual int socket_fd() const override { + return _conn.fd(); + } + +}; + +template <typename Protocol> +DPDKServerSocketImpl<Protocol>::DPDKServerSocketImpl( + Protocol& proto, uint16_t port, const SocketOptions &opt, int type) + : ServerSocketImpl(type), _listener(proto.listen(port)) {} + +template <typename Protocol> +int DPDKServerSocketImpl<Protocol>::accept(ConnectedSocket *s, const SocketOptions &options, entity_addr_t *out, Worker *w) { + if (_listener.get_errno() < 0) + return _listener.get_errno(); + auto c = _listener.accept(); + if (!c) + return -EAGAIN; + + if (out) { + *out = c->remote_addr(); + out->set_type(addr_type); + } + std::unique_ptr<NativeConnectedSocketImpl<Protocol>> csi( + new NativeConnectedSocketImpl<Protocol>(std::move(*c))); + *s = ConnectedSocket(std::move(csi)); + return 0; +} + +template <typename Protocol> +void DPDKServerSocketImpl<Protocol>::abort_accept() { + _listener.abort_accept(); +} + +class DPDKWorker : public Worker { + struct Impl { + unsigned id; + interface _netif; + std::shared_ptr<DPDKDevice> _dev; + ipv4 _inet; + Impl(CephContext *cct, unsigned i, EventCenter *c, std::shared_ptr<DPDKDevice> dev); + ~Impl(); + }; + std::unique_ptr<Impl> _impl; + + virtual void initialize() override; + void set_ipv4_packet_filter(ip_packet_filter* filter) { + _impl->_inet.set_packet_filter(filter); + } + using tcp4 = tcp<ipv4_traits>; + + public: + explicit DPDKWorker(CephContext *c, unsigned i): Worker(c, i) {} + virtual int listen(entity_addr_t &addr, const SocketOptions &opts, ServerSocket *) override; + virtual int connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) override; + void arp_learn(ethernet_address l2, ipv4_address l3) { + _impl->_inet.learn(l2, l3); + } + virtual void destroy() override { + _impl.reset(); + } + + friend class DPDKServerSocketImpl<tcp4>; +}; + +class DPDKStack : public NetworkStack { + vector<std::function<void()> > funcs; + public: + explicit DPDKStack(CephContext *cct, const string &t): NetworkStack(cct, t) { + funcs.resize(cct->_conf->ms_async_max_op_threads); + } + virtual bool support_zero_copy_read() const override { return true; } + virtual bool support_local_listen_table() const override { return true; } + + virtual void spawn_worker(unsigned i, std::function<void ()> &&func) override; + virtual void join_worker(unsigned i) override; +}; + +#endif diff --git a/src/msg/async/dpdk/EventDPDK.cc b/src/msg/async/dpdk/EventDPDK.cc new file mode 100644 index 00000000..5d291716 --- /dev/null +++ b/src/msg/async/dpdk/EventDPDK.cc @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + 
*
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include "DPDKStack.h"
+#include "EventDPDK.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "DPDKDriver."
+
+int DPDKDriver::init(EventCenter *c, int nevent)
+{
+ return 0;
+}
+
+int DPDKDriver::add_event(int fd, int cur_mask, int add_mask)
+{
+ ldout(cct, 20) << __func__ << " add event fd=" << fd << " cur_mask=" << cur_mask
+ << " add_mask=" << add_mask << dendl;
+
+ int r = manager.listen(fd, add_mask);
+ if (r < 0) {
+ lderr(cct) << __func__ << " add fd=" << fd << " failed. "
+ << cpp_strerror(-r) << dendl;
+ return r;
+ }
+
+ return 0;
+}
+
+int DPDKDriver::del_event(int fd, int cur_mask, int delmask)
+{
+ ldout(cct, 20) << __func__ << " del event fd=" << fd << " cur_mask=" << cur_mask
+ << " delmask=" << delmask << dendl;
+ int r = 0;
+
+ if (delmask != EVENT_NONE) {
+ if ((r = manager.unlisten(fd, delmask)) < 0) {
+ lderr(cct) << __func__ << " delete fd=" << fd << " delmask=" << delmask
+ << " failed." << cpp_strerror(-r) << dendl;
+ return r;
+ }
+ }
+ return 0;
+}
+
+int DPDKDriver::resize_events(int newsize)
+{
+ return 0;
+}
+
+int DPDKDriver::event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tvp)
+{
+ int num_events = 512;
+ int events[num_events];
+ int masks[num_events];
+
+ int retval = manager.poll(events, masks, num_events, tvp);
+ if (retval > 0) {
+ fired_events.resize(retval);
+ for (int i = 0; i < retval; i++) {
+ fired_events[i].fd = events[i];
+ fired_events[i].mask = masks[i];
+ }
+ }
+ return retval;
+}
diff --git a/src/msg/async/dpdk/EventDPDK.h b/src/msg/async/dpdk/EventDPDK.h
new file mode 100644
index 00000000..541c2210
--- /dev/null
+++ b/src/msg/async/dpdk/EventDPDK.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * + */ + +#ifndef CEPH_EVENTDPDK_H +#define CEPH_EVENTDPDK_H + +#include "msg/async/Event.h" +#include "msg/async/Stack.h" +#include "UserspaceEvent.h" + +class DPDKDriver : public EventDriver { + CephContext *cct; + + public: + UserspaceEventManager manager; + + explicit DPDKDriver(CephContext *c): cct(c), manager(c) {} + virtual ~DPDKDriver() { } + + int init(EventCenter *c, int nevent) override; + int add_event(int fd, int cur_mask, int add_mask) override; + int del_event(int fd, int cur_mask, int del_mask) override; + int resize_events(int newsize) override; + int event_wait(vector<FiredFileEvent> &fired_events, struct timeval *tp) override; + bool need_wakeup() override { return false; } +}; + +#endif //CEPH_EVENTDPDK_H diff --git a/src/msg/async/dpdk/IP.cc b/src/msg/async/dpdk/IP.cc new file mode 100644 index 00000000..f730cded --- /dev/null +++ b/src/msg/async/dpdk/IP.cc @@ -0,0 +1,470 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + * + */ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "common/perf_counters.h" + +#include "capture.h" +#include "IP.h" +#include "toeplitz.h" + +#include "common/dout.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_dpdk +#undef dout_prefix +#define dout_prefix *_dout << "dpdk " + +std::ostream& operator<<(std::ostream& os, const ipv4_address& a) { + auto ip = a.ip; + return os << ((ip >> 24) & 0xff) << "." << ((ip >> 16) & 0xff) + << "." << ((ip >> 8) & 0xff) << "." 
<< ((ip >> 0) & 0xff); +} + +utime_t ipv4::_frag_timeout = utime_t(30, 0); +constexpr uint32_t ipv4::_frag_low_thresh; +constexpr uint32_t ipv4::_frag_high_thresh; + +class C_handle_frag_timeout : public EventCallback { + ipv4 *_ipv4; + + public: + C_handle_frag_timeout(ipv4 *i): _ipv4(i) {} + void do_request(uint64_t fd_or_id) { + _ipv4->frag_timeout(); + } +}; + +enum { + l_dpdk_qp_first = 99000, + l_dpdk_total_linearize_operations, + l_dpdk_qp_last +}; + +ipv4::ipv4(CephContext *c, EventCenter *cen, interface* netif) + : cct(c), center(cen), _netif(netif), _global_arp(netif), + _arp(c, _global_arp, cen), + _host_address(0), _gw_address(0), _netmask(0), + _l3(netif, eth_protocol_num::ipv4, [this] { return get_packet(); }), + _rx_packets( + _l3.receive( + [this] (Packet p, ethernet_address ea) { + return handle_received_packet(std::move(p), ea); + }, + [this] (forward_hash& out_hash_data, Packet& p, size_t off) { + return forward(out_hash_data, p, off); + } + ) + ), + _tcp(*this, cen), _icmp(c, *this), + _l4({{ uint8_t(ip_protocol_num::tcp), &_tcp }, + { uint8_t(ip_protocol_num::icmp), &_icmp }}), + _packet_filter(nullptr) +{ + PerfCountersBuilder plb(cct, "ipv4", l_dpdk_qp_first, l_dpdk_qp_last); + plb.add_u64_counter(l_dpdk_total_linearize_operations, "dpdk_ip_linearize_operations", "DPDK IP Packet linearization operations"); + perf_logger = plb.create_perf_counters(); + cct->get_perfcounters_collection()->add(perf_logger); + frag_handler = new C_handle_frag_timeout(this); +} + +bool ipv4::forward(forward_hash& out_hash_data, Packet& p, size_t off) +{ + auto iph = p.get_header<ip_hdr>(off); + + out_hash_data.push_back(iph->src_ip.ip); + out_hash_data.push_back(iph->dst_ip.ip); + + auto h = iph->ntoh(); + auto l4 = _l4[h.ip_proto]; + if (l4) { + if (h.mf() == false && h.offset() == 0) { + // This IP datagram is atomic, forward according to tcp connection hash + l4->forward(out_hash_data, p, off + sizeof(ip_hdr)); + } + // else forward according to ip fields only + } + return true; +} + +int ipv4::handle_received_packet(Packet p, ethernet_address from) +{ + auto iph = p.get_header<ip_hdr>(0); + if (!iph) { + return 0; + } + + // Skip checking csum of reassembled IP datagram + if (!get_hw_features().rx_csum_offload && !p.offload_info_ref().reassembled) { + checksummer csum; + csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph)); + if (csum.get() != 0) { + return 0; + } + } + + auto h = iph->ntoh(); + unsigned ip_len = h.len; + unsigned ip_hdr_len = h.ihl * 4; + unsigned pkt_len = p.len(); + auto offset = h.offset(); + + ldout(cct, 10) << __func__ << " get " << std::hex << int(h.ip_proto) + << std::dec << " packet from " + << h.src_ip << " -> " << h.dst_ip << " id=" << h.id + << " ip_len=" << ip_len << " ip_hdr_len=" << ip_hdr_len + << " pkt_len=" << pkt_len << " offset=" << offset << dendl; + + if (pkt_len > ip_len) { + // Trim extra data in the packet beyond IP total length + p.trim_back(pkt_len - ip_len); + } else if (pkt_len < ip_len) { + // Drop if it contains less than IP total length + return 0; + } + // Drop if the reassembled datagram will be larger than maximum IP size + if (offset + p.len() > ip_packet_len_max) { + return 0; + } + + // FIXME: process options + if (in_my_netmask(h.src_ip) && h.src_ip != _host_address) { + ldout(cct, 20) << __func__ << " learn mac " << from << " with " << h.src_ip << dendl; + _arp.learn(from, h.src_ip); + } + + if (_packet_filter) { + bool handled = false; + _packet_filter->handle(p, &h, from, handled); + if (handled) { + return 0; + } + } + + if 
(h.dst_ip != _host_address) { + // FIXME: forward + return 0; + } + + // Does this IP datagram need reassembly + auto mf = h.mf(); + if (mf == true || offset != 0) { + frag_limit_mem(); + auto frag_id = ipv4_frag_id{h.src_ip, h.dst_ip, h.id, h.ip_proto}; + auto& frag = _frags[frag_id]; + if (mf == false) { + frag.last_frag_received = true; + } + // This is a newly created frag_id + if (frag.mem_size == 0) { + _frags_age.push_back(frag_id); + frag.rx_time = ceph_clock_now(); + } + auto added_size = frag.merge(h, offset, std::move(p)); + _frag_mem += added_size; + if (frag.is_complete()) { + // All the fragments are received + auto dropped_size = frag.mem_size; + auto& ip_data = frag.data.map.begin()->second; + // Choose a cpu to forward this packet + auto cpu_id = center->get_id(); + auto l4 = _l4[h.ip_proto]; + if (l4) { + size_t l4_offset = 0; + forward_hash hash_data; + hash_data.push_back(hton(h.src_ip.ip)); + hash_data.push_back(hton(h.dst_ip.ip)); + l4->forward(hash_data, ip_data, l4_offset); + cpu_id = _netif->hash2cpu(toeplitz_hash(_netif->rss_key(), hash_data)); + } + + // No need to forward if the dst cpu is the current cpu + if (cpu_id == center->get_id()) { + l4->received(std::move(ip_data), h.src_ip, h.dst_ip); + } else { + auto to = _netif->hw_address(); + auto pkt = frag.get_assembled_packet(from, to); + _netif->forward(center, cpu_id, std::move(pkt)); + } + + // Delete this frag from _frags and _frags_age + frag_drop(frag_id, dropped_size); + _frags_age.remove(frag_id); + perf_logger->set(l_dpdk_total_linearize_operations, + ipv4_packet_merger::linearizations()); + } else { + // Some of the fragments are missing + if (frag_timefd) { + frag_arm(); + } + } + return 0; + } + + auto l4 = _l4[h.ip_proto]; + if (l4) { + // Trim IP header and pass to upper layer + p.trim_front(ip_hdr_len); + l4->received(std::move(p), h.src_ip, h.dst_ip); + } + return 0; +} + +void ipv4::wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb) { + // Figure out where to send the packet to. If it is a directly connected + // host, send to it directly, otherwise send to the default gateway. 
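+ // Either way the neighbor we ARP for below is on-link.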
+ ipv4_address dst; + if (in_my_netmask(to)) { + dst = to; + } else { + dst = _gw_address; + } + + _arp.wait(std::move(dst), std::move(p), std::move(cb)); +} + +const hw_features& ipv4::get_hw_features() const +{ + return _netif->get_hw_features(); +} + +void ipv4::send(ipv4_address to, ip_protocol_num proto_num, + Packet p, ethernet_address e_dst) { + auto needs_frag = this->needs_frag(p, proto_num, get_hw_features()); + + auto send_pkt = [this, to, proto_num, needs_frag, e_dst] (Packet& pkt, uint16_t remaining, uint16_t offset) mutable { + static uint16_t id = 0; + auto iph = pkt.prepend_header<ip_hdr>(); + iph->ihl = sizeof(*iph) / 4; + iph->ver = 4; + iph->dscp = 0; + iph->ecn = 0; + iph->len = pkt.len(); + // FIXME: a proper id + iph->id = id++; + if (needs_frag) { + uint16_t mf = remaining > 0; + // The fragment offset is measured in units of 8 octets (64 bits) + auto off = offset / 8; + iph->frag = (mf << uint8_t(ip_hdr::frag_bits::mf)) | off; + } else { + iph->frag = 0; + } + iph->ttl = 64; + iph->ip_proto = (uint8_t)proto_num; + iph->csum = 0; + iph->src_ip = _host_address; + iph->dst_ip = to; + ldout(cct, 20) << " ipv4::send " << " id=" << iph->id << " " << _host_address << " -> " << to + << " len " << pkt.len() << dendl; + *iph = iph->hton(); + + if (get_hw_features().tx_csum_ip_offload) { + iph->csum = 0; + pkt.offload_info_ref().needs_ip_csum = true; + } else { + checksummer csum; + csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph)); + iph->csum = csum.get(); + } + + _packetq.push_back( + l3_protocol::l3packet{eth_protocol_num::ipv4, e_dst, std::move(pkt)}); + }; + + if (needs_frag) { + uint16_t offset = 0; + uint16_t remaining = p.len(); + auto mtu = get_hw_features().mtu; + + while (remaining) { + auto can_send = std::min(uint16_t(mtu - ipv4_hdr_len_min), remaining); + remaining -= can_send; + auto pkt = p.share(offset, can_send); + send_pkt(pkt, remaining, offset); + offset += can_send; + } + } else { + // The whole packet can be send in one shot + send_pkt(p, 0, 0); + } +} + +Tub<l3_protocol::l3packet> ipv4::get_packet() { + // _packetq will be mostly empty here unless it hold remnants of previously + // fragmented packet + if (_packetq.empty()) { + for (size_t i = 0; i < _pkt_providers.size(); i++) { + auto l4p = _pkt_providers[_pkt_provider_idx++](); + if (_pkt_provider_idx == _pkt_providers.size()) { + _pkt_provider_idx = 0; + } + if (l4p) { + ldout(cct, 20) << " ipv4::get_packet len " << l4p->p.len() << dendl; + send(l4p->to, l4p->proto_num, std::move(l4p->p), l4p->e_dst); + break; + } + } + } + + Tub<l3_protocol::l3packet> p; + if (!_packetq.empty()) { + p = std::move(_packetq.front()); + _packetq.pop_front(); + } + return p; +} + +void ipv4::frag_limit_mem() { + if (_frag_mem <= _frag_high_thresh) { + return; + } + auto drop = _frag_mem - _frag_low_thresh; + while (drop) { + if (_frags_age.empty()) { + return; + } + // Drop the oldest frag (first element) from _frags_age + auto frag_id = _frags_age.front(); + _frags_age.pop_front(); + + // Drop from _frags as well + auto& frag = _frags[frag_id]; + auto dropped_size = frag.mem_size; + frag_drop(frag_id, dropped_size); + + drop -= std::min(drop, dropped_size); + } +} + +void ipv4::frag_timeout() { + if (_frags.empty()) { + return; + } + auto now = ceph_clock_now(); + for (auto it = _frags_age.begin(); it != _frags_age.end();) { + auto frag_id = *it; + auto& frag = _frags[frag_id]; + if (now > frag.rx_time + _frag_timeout) { + auto dropped_size = frag.mem_size; + // Drop from _frags + frag_drop(frag_id, dropped_size); 
+ // Drop from _frags_age
+ it = _frags_age.erase(it);
+ } else {
+ // The further items can only be younger
+ break;
+ }
+ }
+ if (_frags.size() != 0) {
+ frag_arm(now);
+ } else {
+ _frag_mem = 0;
+ }
+}
+
+int32_t ipv4::frag::merge(ip_hdr &h, uint16_t offset, Packet p) {
+ uint32_t old = mem_size;
+ unsigned ip_hdr_len = h.ihl * 4;
+ // Store IP header
+ if (offset == 0) {
+ header = p.share(0, ip_hdr_len);
+ }
+ // Store IP payload
+ p.trim_front(ip_hdr_len);
+ data.merge(offset, std::move(p));
+ // Update mem size
+ mem_size = header.memory();
+ for (const auto& x : data.map) {
+ mem_size += x.second.memory();
+ }
+ auto added_size = mem_size - old;
+ return added_size;
+}
+
+bool ipv4::frag::is_complete() {
+ // If all the fragments are received, ipv4::frag::merge() should merge all
+ // the fragments into a single packet
+ auto offset = data.map.begin()->first;
+ auto nr_packet = data.map.size();
+ return last_frag_received && nr_packet == 1 && offset == 0;
+}
+
+Packet ipv4::frag::get_assembled_packet(ethernet_address from, ethernet_address to) {
+ auto& ip_header = header;
+ auto& ip_data = data.map.begin()->second;
+ // Append an ethernet header, needed for forwarding
+ auto eh = ip_header.prepend_header<eth_hdr>();
+ eh->src_mac = from;
+ eh->dst_mac = to;
+ eh->eth_proto = uint16_t(eth_protocol_num::ipv4);
+ *eh = eh->hton();
+ // Prepare a packet that contains the ethernet header, ip header and ip data
+ ip_header.append(std::move(ip_data));
+ auto pkt = std::move(ip_header);
+ auto iph = pkt.get_header<ip_hdr>(sizeof(eth_hdr));
+ // len is the sum of each fragment
+ iph->len = hton(uint16_t(pkt.len() - sizeof(eth_hdr)));
+ // No fragmentation for the assembled datagram
+ iph->frag = 0;
+ // Since each fragment's csum is checked, no need to csum
+ // again for the assembled datagram
+ offload_info oi;
+ oi.reassembled = true;
+ pkt.set_offload_info(oi);
+ return pkt;
+}
+
+void icmp::received(Packet p, ipaddr from, ipaddr to) {
+ auto hdr = p.get_header<icmp_hdr>(0);
+ if (!hdr || hdr->type != icmp_hdr::msg_type::echo_request) {
+ return;
+ }
+ hdr->type = icmp_hdr::msg_type::echo_reply;
+ hdr->code = 0;
+ hdr->csum = 0;
+ checksummer csum;
+ csum.sum(reinterpret_cast<char*>(hdr), p.len());
+ hdr->csum = csum.get();
+
+ if (_queue_space.get_or_fail(p.len())) { // drop packets that do not fit the queue
+ auto cb = [this, from] (const ethernet_address e_dst, Packet p, int r) mutable {
+ if (r == 0) {
+ _packetq.emplace_back(ipv4_traits::l4packet{from, std::move(p), e_dst, ip_protocol_num::icmp});
+ }
+ };
+ _inet.wait_l2_dst_address(from, std::move(p), cb);
+ }
+}
diff --git a/src/msg/async/dpdk/IP.h b/src/msg/async/dpdk/IP.h
new file mode 100644
index 00000000..480b4b95
--- /dev/null
+++ b/src/msg/async/dpdk/IP.h
@@ -0,0 +1,414 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + * + */ + +#ifndef CEPH_MSG_IP_H_ +#define CEPH_MSG_IP_H_ + +#include <arpa/inet.h> +#include <unordered_map> +#include <cstdint> +#include <array> +#include <map> +#include <list> +#include <chrono> + +#include "msg/async/Event.h" +#include "common/Throttle.h" + +#include "array_map.h" +#include "ARP.h" +#include "IPChecksum.h" +#include "ip_types.h" +#include "const.h" +#include "net.h" +#include "PacketUtil.h" +#include "toeplitz.h" + +class ipv4; +template <ip_protocol_num ProtoNum> +class ipv4_l4; + +template <typename InetTraits> +class tcp; + +struct ipv4_traits { + using address_type = ipv4_address; + using inet_type = ipv4_l4<ip_protocol_num::tcp>; + struct l4packet { + ipv4_address to; + Packet p; + ethernet_address e_dst; + ip_protocol_num proto_num; + }; + using packet_provider_type = std::function<Tub<l4packet> ()>; + static void tcp_pseudo_header_checksum(checksummer& csum, ipv4_address src, ipv4_address dst, uint16_t len) { + csum.sum_many(src.ip, dst.ip, uint8_t(0), uint8_t(ip_protocol_num::tcp), len); + } + static constexpr uint8_t ip_hdr_len_min = ipv4_hdr_len_min; +}; + +template <ip_protocol_num ProtoNum> +class ipv4_l4 { + public: + ipv4& _inet; + public: + ipv4_l4(ipv4& inet) : _inet(inet) {} + void register_packet_provider(ipv4_traits::packet_provider_type func); + void wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb); +}; + +class ip_protocol { + public: + virtual ~ip_protocol() {} + virtual void received(Packet p, ipv4_address from, ipv4_address to) = 0; + virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) { return true; } +}; + +template <typename InetTraits> +struct l4connid { + using ipaddr = typename InetTraits::address_type; + using inet_type = typename InetTraits::inet_type; + struct connid_hash; + + ipaddr local_ip; + ipaddr foreign_ip; + uint16_t local_port; + uint16_t foreign_port; + + bool operator==(const l4connid& x) const { + return local_ip == x.local_ip + && foreign_ip == x.foreign_ip + && local_port == x.local_port + && foreign_port == x.foreign_port; + } + + uint32_t hash(const rss_key_type& rss_key) { + forward_hash hash_data; + hash_data.push_back(hton(foreign_ip.ip)); + hash_data.push_back(hton(local_ip.ip)); + hash_data.push_back(hton(foreign_port)); + hash_data.push_back(hton(local_port)); + return toeplitz_hash(rss_key, hash_data); + } +}; + +class ipv4_tcp final : public ip_protocol { + ipv4_l4<ip_protocol_num::tcp> _inet_l4; + std::unique_ptr<tcp<ipv4_traits>> _tcp; + public: + ipv4_tcp(ipv4& inet, EventCenter *c); + ~ipv4_tcp(); + virtual void received(Packet p, ipv4_address from, ipv4_address to) override; + virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) override; + friend class ipv4; +}; + +struct icmp_hdr { + enum class msg_type : uint8_t { + echo_reply = 0, + echo_request = 8, + }; + msg_type type; + uint8_t code; + uint16_t csum; + uint32_t rest; +} __attribute__((packed)); + + +class icmp { + public: + using ipaddr = ipv4_address; + using inet_type = ipv4_l4<ip_protocol_num::icmp>; + explicit icmp(CephContext *c, inet_type& inet) + : cct(c), _inet(inet), _queue_space(c, "DPDK::icmp::_queue_space", 212992) { + _inet.register_packet_provider([this] { + Tub<ipv4_traits::l4packet> l4p; + if (!_packetq.empty()) { + l4p = std::move(_packetq.front()); + _packetq.pop_front(); + _queue_space.put(l4p->p.len()); + } + 
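+ // an unengaged Tub tells the IP layer there is nothing to send
+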
return l4p; + }); + } + void received(Packet p, ipaddr from, ipaddr to); + + private: + CephContext *cct; + // ipv4_l4<ip_protocol_num::icmp> + inet_type& _inet; + circular_buffer<ipv4_traits::l4packet> _packetq; + Throttle _queue_space; +}; + +class ipv4_icmp final : public ip_protocol { + CephContext *cct; + ipv4_l4<ip_protocol_num::icmp> _inet_l4; + icmp _icmp; + public: + ipv4_icmp(CephContext *c, ipv4& inet) : cct(c), _inet_l4(inet), _icmp(c, _inet_l4) {} + virtual void received(Packet p, ipv4_address from, ipv4_address to) override { + _icmp.received(std::move(p), from, to); + } + friend class ipv4; +}; + +struct ip_hdr; + +struct ip_packet_filter { + virtual ~ip_packet_filter() {}; + virtual void handle(Packet& p, ip_hdr* iph, ethernet_address from, bool & handled) = 0; +}; + +struct ipv4_frag_id { + struct hash; + ipv4_address src_ip; + ipv4_address dst_ip; + uint16_t identification; + uint8_t protocol; + bool operator==(const ipv4_frag_id& x) const { + return src_ip == x.src_ip && + dst_ip == x.dst_ip && + identification == x.identification && + protocol == x.protocol; + } +}; + +struct ipv4_frag_id::hash : private std::hash<ipv4_address>, + private std::hash<uint16_t>, private std::hash<uint8_t> { + size_t operator()(const ipv4_frag_id& id) const noexcept { + using h1 = std::hash<ipv4_address>; + using h2 = std::hash<uint16_t>; + using h3 = std::hash<uint8_t>; + return h1::operator()(id.src_ip) ^ + h1::operator()(id.dst_ip) ^ + h2::operator()(id.identification) ^ + h3::operator()(id.protocol); + } +}; + +struct ipv4_tag {}; +using ipv4_packet_merger = packet_merger<uint32_t, ipv4_tag>; + +class interface; + +class ipv4 { + public: + using address_type = ipv4_address; + using proto_type = uint16_t; + static address_type broadcast_address() { return ipv4_address(0xffffffff); } + static proto_type arp_protocol_type() { return proto_type(eth_protocol_num::ipv4); } + CephContext *cct; + EventCenter *center; + + private: + interface* _netif; + std::vector<ipv4_traits::packet_provider_type> _pkt_providers; + Tub<uint64_t> frag_timefd; + EventCallbackRef frag_handler; + arp _global_arp; + arp_for<ipv4> _arp; + ipv4_address _host_address; + ipv4_address _gw_address; + ipv4_address _netmask; + l3_protocol _l3; + subscription<Packet, ethernet_address> _rx_packets; + ipv4_tcp _tcp; + ipv4_icmp _icmp; + array_map<ip_protocol*, 256> _l4; + ip_packet_filter *_packet_filter; + struct frag { + Packet header; + ipv4_packet_merger data; + utime_t rx_time; + uint32_t mem_size = 0; + // fragment with MF == 0 inidates it is the last fragment + bool last_frag_received = false; + + Packet get_assembled_packet(ethernet_address from, ethernet_address to); + int32_t merge(ip_hdr &h, uint16_t offset, Packet p); + bool is_complete(); + }; + std::unordered_map<ipv4_frag_id, frag, ipv4_frag_id::hash> _frags; + std::list<ipv4_frag_id> _frags_age; + static utime_t _frag_timeout; + static constexpr uint32_t _frag_low_thresh{3 * 1024 * 1024}; + static constexpr uint32_t _frag_high_thresh{4 * 1024 * 1024}; + uint32_t _frag_mem = 0; + circular_buffer<l3_protocol::l3packet> _packetq; + unsigned _pkt_provider_idx = 0; + PerfCounters *perf_logger; + + private: + int handle_received_packet(Packet p, ethernet_address from); + bool forward(forward_hash& out_hash_data, Packet& p, size_t off); + Tub<l3_protocol::l3packet> get_packet(); + bool in_my_netmask(ipv4_address a) const { + return !((a.ip ^ _host_address.ip) & _netmask.ip); + } + void frag_limit_mem(); + void frag_drop(ipv4_frag_id frag_id, uint32_t dropped_size) { 
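+ // forget the datagram and return its bytes to the fragment budget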
+ _frags.erase(frag_id); + _frag_mem -= dropped_size; + } + void frag_arm(utime_t now) { + auto tp = now + _frag_timeout; + frag_timefd.construct(center->create_time_event(tp.to_nsec() / 1000, frag_handler)); + } + void frag_arm() { + auto now = ceph_clock_now(); + frag_timefd.construct(center->create_time_event(now.to_nsec() / 1000, frag_handler)); + } + + public: + void frag_timeout(); + + public: + explicit ipv4(CephContext *c, EventCenter *cen, interface* netif); + ~ipv4() { + delete frag_handler; + } + void set_host_address(ipv4_address ip) { + _host_address = ip; + _arp.set_self_addr(ip); + } + ipv4_address host_address() { + return _host_address; + } + void set_gw_address(ipv4_address ip) { + _gw_address = ip; + } + ipv4_address gw_address() const { + return _gw_address; + } + void set_netmask_address(ipv4_address ip) { + _netmask = ip; + } + ipv4_address netmask_address() const { + return _netmask; + } + interface *netif() const { + return _netif; + } + // TODO or something. Should perhaps truly be a list + // of filters. With ordering. And blackjack. Etc. + // But for now, a simple single raw pointer suffices + void set_packet_filter(ip_packet_filter *f) { + _packet_filter = f; + } + ip_packet_filter * packet_filter() const { + return _packet_filter; + } + void send(ipv4_address to, ip_protocol_num proto_num, Packet p, ethernet_address e_dst); + tcp<ipv4_traits>& get_tcp() { return *_tcp._tcp; } + void register_l4(proto_type id, ip_protocol* handler); + const hw_features& get_hw_features() const; + static bool needs_frag(Packet& p, ip_protocol_num proto_num, hw_features hw_features) { + if (p.len() + ipv4_hdr_len_min <= hw_features.mtu) + return false; + + if ((proto_num == ip_protocol_num::tcp && hw_features.tx_tso)) + return false; + + return true; + } + void learn(ethernet_address l2, ipv4_address l3) { + _arp.learn(l2, l3); + } + void register_packet_provider(ipv4_traits::packet_provider_type&& func) { + _pkt_providers.push_back(std::move(func)); + } + void wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb); +}; + +template <ip_protocol_num ProtoNum> +inline void ipv4_l4<ProtoNum>::register_packet_provider( + ipv4_traits::packet_provider_type func) { + _inet.register_packet_provider([func] { + auto l4p = func(); + if (l4p) { + (*l4p).proto_num = ProtoNum; + } + return l4p; + }); +} + +template <ip_protocol_num ProtoNum> +inline void ipv4_l4<ProtoNum>::wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb) { + _inet.wait_l2_dst_address(to, std::move(p), std::move(cb)); +} + +struct ip_hdr { + uint8_t ihl : 4; + uint8_t ver : 4; + uint8_t dscp : 6; + uint8_t ecn : 2; + uint16_t len; + uint16_t id; + uint16_t frag; + enum class frag_bits : uint8_t { mf = 13, df = 14, reserved = 15, offset_shift = 3 }; + uint8_t ttl; + uint8_t ip_proto; + uint16_t csum; + ipv4_address src_ip; + ipv4_address dst_ip; + uint8_t options[0]; + ip_hdr hton() { + ip_hdr hdr = *this; + hdr.len = ::hton(len); + hdr.id = ::hton(id); + hdr.frag = ::hton(frag); + hdr.csum = ::hton(csum); + hdr.src_ip.ip = ::hton(src_ip.ip); + hdr.dst_ip.ip = ::hton(dst_ip.ip); + return hdr; + } + ip_hdr ntoh() { + ip_hdr hdr = *this; + hdr.len = ::ntoh(len); + hdr.id = ::ntoh(id); + hdr.frag = ::ntoh(frag); + hdr.csum = ::ntoh(csum); + hdr.src_ip = src_ip.ntoh(); + hdr.dst_ip = dst_ip.ntoh(); + return hdr; + } + + bool mf() { return frag & (1 << uint8_t(frag_bits::mf)); } + bool df() { return frag & (1 << uint8_t(frag_bits::df)); } + uint16_t offset() { return frag << uint8_t(frag_bits::offset_shift); 
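// [Editor's note] A worked example of the fragment-field decoding in
// ip_hdr above (editor's sketch, not part of this commit). The 13-bit
// fragment offset is stored in units of 8 bytes, with MF/DF at bits
// 13/14; shifting left by 3 multiplies the offset by 8 while the flag
// bits fall out of the truncated uint16_t result:
//
//   uint16_t frag = (1 << 13) | 100;     // MF set, offset = 100 * 8 bytes
//   bool mf = frag & (1 << 13);          // true: more fragments follow
//   uint16_t off = uint16_t(frag << 3);  // 800: byte offset, flags shifted out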
} +} __attribute__((packed)); + +template <typename InetTraits> +struct l4connid<InetTraits>::connid_hash : private std::hash<ipaddr>, private std::hash<uint16_t> { + size_t operator()(const l4connid<InetTraits>& id) const noexcept { + using h1 = std::hash<ipaddr>; + using h2 = std::hash<uint16_t>; + return h1::operator()(id.local_ip) + ^ h1::operator()(id.foreign_ip) + ^ h2::operator()(id.local_port) + ^ h2::operator()(id.foreign_port); + } +}; + +#endif /* CEPH_MSG_IP_H */ diff --git a/src/msg/async/dpdk/IPChecksum.cc b/src/msg/async/dpdk/IPChecksum.cc new file mode 100644 index 00000000..7a3253c1 --- /dev/null +++ b/src/msg/async/dpdk/IPChecksum.cc @@ -0,0 +1,70 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#include <arpa/inet.h> +#include "net.h" +#include "IPChecksum.h" + +void checksummer::sum(const char* data, size_t len) { + auto orig_len = len; + if (odd) { + csum += uint8_t(*data++); + --len; + } + auto p64 = reinterpret_cast<const uint64_t*>(data); + while (len >= 8) { + csum += ntohq(*p64++); + len -= 8; + } + auto p16 = reinterpret_cast<const uint16_t*>(p64); + while (len >= 2) { + csum += ntohs(*p16++); + len -= 2; + } + auto p8 = reinterpret_cast<const uint8_t*>(p16); + if (len) { + csum += *p8++ << 8; + len -= 1; + } + odd ^= orig_len & 1; +} + +uint16_t checksummer::get() const { + __int128 csum1 = (csum & 0xffffffffffffffff) + (csum >> 64); + uint64_t csum = (csum1 & 0xffffffffffffffff) + (csum1 >> 64); + csum = (csum & 0xffff) + ((csum >> 16) & 0xffff) + ((csum >> 32) & 0xffff) + (csum >> 48); + csum = (csum & 0xffff) + (csum >> 16); + csum = (csum & 0xffff) + (csum >> 16); + return htons(~csum); +} + +void checksummer::sum(const Packet& p) { + for (auto&& f : p.fragments()) { + sum(f.base, f.size); + } +} + +uint16_t ip_checksum(const void* data, size_t len) { + checksummer cksum; + cksum.sum(reinterpret_cast<const char*>(data), len); + return cksum.get(); +} diff --git a/src/msg/async/dpdk/IPChecksum.h b/src/msg/async/dpdk/IPChecksum.h new file mode 100644 index 00000000..9af4a86b --- /dev/null +++ b/src/msg/async/dpdk/IPChecksum.h @@ -0,0 +1,72 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. 
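// [Editor's note] Illustration of the carry folding in checksummer::get()
// above (editor's sketch, not part of this commit). One's-complement
// addition wraps carries back into the low bits before the final
// complement; for the two 16-bit words 0xffff and 0x0001:
//
//   uint32_t sum = 0xffffu + 0x0001u;    // 0x10000: carry out of 16 bits
//   sum = (sum & 0xffff) + (sum >> 16);  // 0x0001: carry folded back in
//   uint16_t cksum = ~uint16_t(sum);     // 0xfffe: final checksum
//
// The wide __int128 accumulator simply defers this folding until get().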
+ * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_CHECKSUM_H_ +#define CEPH_MSG_CHECKSUM_H_ + +#include <cstdint> +#include <cstddef> +#include <arpa/inet.h> + +#include "Packet.h" + +uint16_t ip_checksum(const void* data, size_t len); + +struct checksummer { + __int128 csum = 0; + bool odd = false; + void sum(const char* data, size_t len); + void sum(const Packet& p); + void sum(uint8_t data) { + if (!odd) { + csum += data << 8; + } else { + csum += data; + } + odd = !odd; + } + void sum(uint16_t data) { + if (odd) { + sum(uint8_t(data >> 8)); + sum(uint8_t(data)); + } else { + csum += data; + } + } + void sum(uint32_t data) { + if (odd) { + sum(uint16_t(data)); + sum(uint16_t(data >> 16)); + } else { + csum += data; + } + } + void sum_many() {} + template <typename T0, typename... T> + void sum_many(T0 data, T... rest) { + sum(data); + sum_many(rest...); + } + uint16_t get() const; +}; + +#endif /* CEPH_MSG_CHECKSUM_H_ */ diff --git a/src/msg/async/dpdk/Packet.cc b/src/msg/async/dpdk/Packet.cc new file mode 100644 index 00000000..6c2320a0 --- /dev/null +++ b/src/msg/async/dpdk/Packet.cc @@ -0,0 +1,146 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
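// [Editor's note] Hedged usage sketch for the checksummer declared above
// (the variable names are illustrative, not from this commit). sum_many()
// folds a heterogeneous list of 8/16/32-bit fields, which suits
// pseudo-header checksums where addresses, protocol and length are mixed
// with payload bytes:
//
//   checksummer csum;
//   csum.sum_many(src_ip, dst_ip,          // uint32_t addresses
//                 uint8_t(0), uint8_t(6),  // zero pad + protocol (TCP)
//                 tcp_len);                // uint16_t segment length
//   csum.sum(segment_bytes, tcp_len);      // then header + payload bytes
//   uint16_t result = csum.get();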
+ * + */ + +#include <iostream> +#include <algorithm> +#include <cctype> + +#include "capture.h" +#include "Packet.h" + +constexpr size_t Packet::internal_data_size; +constexpr size_t Packet::default_nr_frags; + +void Packet::linearize(size_t at_frag, size_t desired_size) { + _impl->unuse_internal_data(); + size_t nr_frags = 0; + size_t accum_size = 0; + while (accum_size < desired_size) { + accum_size += _impl->frags[at_frag + nr_frags].size; + ++nr_frags; + } + char *new_frag = new char[accum_size]; + auto p = new_frag; + for (size_t i = 0; i < nr_frags; ++i) { + auto& f = _impl->frags[at_frag + i]; + p = std::copy(f.base, f.base + f.size, p); + } + // collapse nr_frags into one fragment + std::copy(_impl->frags + at_frag + nr_frags, _impl->frags + _impl->_nr_frags, + _impl->frags + at_frag + 1); + _impl->_nr_frags -= nr_frags - 1; + _impl->frags[at_frag] = fragment{new_frag, accum_size}; + if (at_frag == 0 && desired_size == len()) { + // We can drop the old buffer safely + auto x = std::move(_impl->_deleter); + _impl->_deleter = make_deleter([new_frag] { delete []new_frag; }); + } else { + auto del = std::bind( + [new_frag](deleter &d) { delete []new_frag; }, std::move(_impl->_deleter)); + _impl->_deleter = make_deleter(std::move(del)); + } +} + +class C_free_on_cpu : public EventCallback { + deleter del; + std::function<void()> cb; + public: + C_free_on_cpu(deleter &&d, std::function<void()> &&c): + del(std::move(d)), cb(std::move(c)) {} + void do_request(uint64_t fd) { + // deleter needs to be moved from lambda capture to be destroyed here + // otherwise deleter destructor will be called on a cpu that called + // create_external_event when work_item is destroyed. + deleter xxx(std::move(del)); + cb(); + delete this; + } +}; + +Packet Packet::free_on_cpu(EventCenter *center, std::function<void()> cb) +{ + auto del = std::bind( + [center, cb] (deleter &del) mutable { + center->dispatch_event_external(new C_free_on_cpu(std::move(del), std::move(cb))); + }, std::move(_impl->_deleter)); + // make new deleter that runs old deleter on an origin cpu + _impl->_deleter = make_deleter(deleter(), std::move(del)); + + return Packet(impl::copy(_impl.get())); +} + +std::ostream& operator<<(std::ostream& os, const Packet& p) { + os << "Packet{"; + bool first = true; + for (auto&& frag : p.fragments()) { + if (!first) { + os << ", "; + } + first = false; + if (std::all_of(frag.base, frag.base + frag.size, [] (int c) { return c >= 9 && c <= 0x7f; })) { + os << '"'; + for (auto p = frag.base; p != frag.base + frag.size; ++p) { + auto c = *p; + if (isprint(c)) { + os << c; + } else if (c == '\r') { + os << "\\r"; + } else if (c == '\n') { + os << "\\n"; + } else if (c == '\t') { + os << "\\t"; + } else { + uint8_t b = c; + os << "\\x" << (b / 16) << (b % 16); + } + } + os << '"'; + } else { + os << "{"; + bool nfirst = true; + for (auto p = frag.base; p != frag.base + frag.size; ++p) { + if (!nfirst) { + os << " "; + } + nfirst = false; + uint8_t b = *p; + os << b; + } + os << "}"; + } + } + os << "}"; + return os; +} diff --git a/src/msg/async/dpdk/Packet.h b/src/msg/async/dpdk/Packet.h new file mode 100644 index 00000000..db9cd2a7 --- /dev/null +++ b/src/msg/async/dpdk/Packet.h @@ -0,0 +1,550 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). 
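// [Editor's note] What Packet::free_on_cpu() above buys us (editor's
// sketch; the scenario is illustrative, not from this commit). A packet
// whose buffers belong to one core's allocator may be released on another
// core after transmission; wrapping the deleter bounces destruction back
// to the owning core via dispatch_event_external():
//
//   Packet q = p.free_on_cpu(origin_center, [] { /* notify sender */ });
//   // q may now be destroyed on any core: its deleter only dispatches
//   // C_free_on_cpu to origin_center, where the real deleter runs.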
See the NOTICE file + distributed with this work for additional information regarding copyright + ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_PACKET_H_ +#define CEPH_MSG_PACKET_H_ + +#include <vector> +#include <algorithm> +#include <iosfwd> + +#include "include/types.h" +#include "common/Tub.h" +#include "common/deleter.h" +#include "msg/async/Event.h" + +#include "const.h" + +struct fragment { + char* base; + size_t size; +}; + +struct offload_info { + ip_protocol_num protocol = ip_protocol_num::unused; + bool needs_csum = false; + uint8_t ip_hdr_len = 20; + uint8_t tcp_hdr_len = 20; + uint8_t udp_hdr_len = 8; + bool needs_ip_csum = false; + bool reassembled = false; + uint16_t tso_seg_size = 0; + // HW stripped VLAN header (CPU order) + Tub<uint16_t> vlan_tci; +}; + +// Zero-copy friendly packet class +// +// For implementing zero-copy, we need a flexible destructor that can +// destroy packet data in different ways: decrementing a reference count, +// or calling a free()-like function. +// +// Moreover, we need different destructors for each set of fragments within +// a single packet. For example, a header and trailer might need delete[] +// to be called, while the internal data needs a reference count to be +// released. Matters are complicated in that fragments can be split +// (due to virtual/physical translation). +// +// To implement this, we associate each packet with a single destructor, +// but allow composing a packet from another packet plus a fragment to +// be added, with its own destructor, causing the destructors to be chained. +// +// The downside is that the data needed for the destructor is duplicated, +// if it is already available in the fragment itself. +// +// As an optimization, when we allocate small fragments, we allocate some +// extra space, so prepending to the packet does not require extra +// allocations. This is useful when adding headers.
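// [Editor's note] A minimal usage sketch of the composition described
// above, using constructors declared later in this header; the mbuf
// handling is an illustrative assumption, not part of this commit:
//
//   rte_mbuf* m = ...;                    // payload owned by a DPDK pool
//   char* payload = rte_pktmbuf_mtod(m, char*);
//   Packet p(fragment{payload, len},
//            make_deleter([m] { rte_pktmbuf_free(m); }));  // zero-copy
//   auto h = p.prepend_header<tcp_hdr>(); // lands in internal headroom
//
// Destroying p runs the chained deleters exactly once, so the mbuf is
// freed only after every owner (including share()d copies) is gone.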
+// +class Packet { + // enough for lots of headers, not quite two cache lines: + static constexpr size_t internal_data_size = 128 - 16; + static constexpr size_t default_nr_frags = 4; + + struct pseudo_vector { + fragment* _start; + fragment* _finish; + pseudo_vector(fragment* start, size_t nr) + : _start(start), _finish(_start + nr) {} + fragment* begin() { return _start; } + fragment* end() { return _finish; } + fragment& operator[](size_t idx) { return _start[idx]; } + }; + + struct impl { + // when destroyed, virtual destructor will reclaim resources + deleter _deleter; + unsigned _len = 0; + uint16_t _nr_frags = 0; + uint16_t _allocated_frags; + offload_info _offload_info; + Tub<uint32_t> rss_hash; + char data[internal_data_size]; // only frags[0] may use + unsigned headroom = internal_data_size; // in data + // FIXME: share data/frags space + + fragment frags[]; + + explicit impl(size_t nr_frags = default_nr_frags); + impl(const impl&) = delete; + impl(fragment frag, size_t nr_frags = default_nr_frags); + + pseudo_vector fragments() { return { frags, _nr_frags }; } + + static std::unique_ptr<impl> allocate(size_t nr_frags) { + nr_frags = std::max(nr_frags, default_nr_frags); + return std::unique_ptr<impl>(new (nr_frags) impl(nr_frags)); + } + + static std::unique_ptr<impl> copy(impl* old, size_t nr) { + auto n = allocate(nr); + n->_deleter = std::move(old->_deleter); + n->_len = old->_len; + n->_nr_frags = old->_nr_frags; + n->headroom = old->headroom; + n->_offload_info = old->_offload_info; + n->rss_hash.construct(old->rss_hash); + std::copy(old->frags, old->frags + old->_nr_frags, n->frags); + old->copy_internal_fragment_to(n.get()); + return std::move(n); + } + + static std::unique_ptr<impl> copy(impl* old) { + return copy(old, old->_nr_frags); + } + + static std::unique_ptr<impl> allocate_if_needed(std::unique_ptr<impl> old, size_t extra_frags) { + if (old->_allocated_frags >= old->_nr_frags + extra_frags) { + return std::move(old); + } + return copy(old.get(), std::max<size_t>(old->_nr_frags + extra_frags, 2 * old->_nr_frags)); + } + void* operator new(size_t size, size_t nr_frags = default_nr_frags) { + ceph_assert(nr_frags == uint16_t(nr_frags)); + return ::operator new(size + nr_frags * sizeof(fragment)); + } + // Matching the operator new above + void operator delete(void* ptr, size_t nr_frags) { + return ::operator delete(ptr); + } + // Since the above "placement delete" hides the global one, expose it + void operator delete(void* ptr) { + return ::operator delete(ptr); + } + + bool using_internal_data() const { + return _nr_frags + && frags[0].base >= data + && frags[0].base < data + internal_data_size; + } + + void unuse_internal_data() { + if (!using_internal_data()) { + return; + } + auto buf = static_cast<char*>(::malloc(frags[0].size)); + if (!buf) { + throw std::bad_alloc(); + } + deleter d = make_free_deleter(buf); + std::copy(frags[0].base, frags[0].base + frags[0].size, buf); + frags[0].base = buf; + _deleter.append(std::move(d)); + headroom = internal_data_size; + } + void copy_internal_fragment_to(impl* to) { + if (!using_internal_data()) { + return; + } + to->frags[0].base = to->data + headroom; + std::copy(frags[0].base, frags[0].base + frags[0].size, + to->frags[0].base); + } + }; + explicit Packet(std::unique_ptr<impl>&& impl) : _impl(std::move(impl)) {} + std::unique_ptr<impl> _impl; +public: + static Packet from_static_data(const char* data, size_t len) { + return {fragment{const_cast<char*>(data), len}, deleter()}; + } + + // build empty Packet + 
Packet(); + // build empty Packet with nr_frags allocated + explicit Packet(size_t nr_frags); + // move existing Packet + Packet(Packet&& x) noexcept; + // copy data into Packet + Packet(const char* data, size_t len); + // copy data into Packet + explicit Packet(fragment frag); + // zero-copy single fragment + Packet(fragment frag, deleter del); + // zero-copy multiple fragments + Packet(std::vector<fragment> frag, deleter del); + // build Packet with iterator + template <typename Iterator> + Packet(Iterator begin, Iterator end, deleter del); + // append fragment (copying new fragment) + Packet(Packet&& x, fragment frag); + // prepend fragment (copying new fragment, with header optimization) + Packet(fragment frag, Packet&& x); + // prepend fragment (zero-copy) + Packet(fragment frag, deleter del, Packet&& x); + // append fragment (zero-copy) + Packet(Packet&& x, fragment frag, deleter d); + // append deleter + Packet(Packet&& x, deleter d); + + Packet& operator=(Packet&& x) { + if (this != &x) { + this->~Packet(); + new (this) Packet(std::move(x)); + } + return *this; + } + + unsigned len() const { return _impl->_len; } + unsigned memory() const { return len() + sizeof(Packet::impl); } + + fragment frag(unsigned idx) const { return _impl->frags[idx]; } + fragment& frag(unsigned idx) { return _impl->frags[idx]; } + + unsigned nr_frags() const { return _impl->_nr_frags; } + pseudo_vector fragments() const { return { _impl->frags, _impl->_nr_frags }; } + fragment* fragment_array() const { return _impl->frags; } + + // share Packet data (reference counted, non COW) + Packet share(); + Packet share(size_t offset, size_t len); + + void append(Packet&& p); + + void trim_front(size_t how_much); + void trim_back(size_t how_much); + + // get a header pointer, linearizing if necessary + template <typename Header> + Header* get_header(size_t offset = 0); + + // get a header pointer, linearizing if necessary + char* get_header(size_t offset, size_t size); + + // prepend a header (default-initializing it) + template <typename Header> + Header* prepend_header(size_t extra_size = 0); + + // prepend a header (uninitialized!) 
+ char* prepend_uninitialized_header(size_t size); + + Packet free_on_cpu(EventCenter *c, std::function<void()> cb = []{}); + + void linearize() { return linearize(0, len()); } + + void reset() { _impl.reset(); } + + void reserve(int n_frags) { + if (n_frags > _impl->_nr_frags) { + auto extra = n_frags - _impl->_nr_frags; + _impl = impl::allocate_if_needed(std::move(_impl), extra); + } + } + Tub<uint32_t> rss_hash() { + return _impl->rss_hash; + } + void set_rss_hash(uint32_t hash) { + _impl->rss_hash.construct(hash); + } +private: + void linearize(size_t at_frag, size_t desired_size); + bool allocate_headroom(size_t size); +public: + class offload_info offload_info() const { return _impl->_offload_info; } + class offload_info& offload_info_ref() { return _impl->_offload_info; } + void set_offload_info(class offload_info oi) { _impl->_offload_info = oi; } +}; + +std::ostream& operator<<(std::ostream& os, const Packet& p); + +inline Packet::Packet(Packet&& x) noexcept + : _impl(std::move(x._impl)) { +} + +inline Packet::impl::impl(size_t nr_frags) + : _len(0), _allocated_frags(nr_frags) { +} + +inline Packet::impl::impl(fragment frag, size_t nr_frags) + : _len(frag.size), _allocated_frags(nr_frags) { + ceph_assert(_allocated_frags > _nr_frags); + if (frag.size <= internal_data_size) { + headroom -= frag.size; + frags[0] = { data + headroom, frag.size }; + } else { + auto buf = static_cast<char*>(::malloc(frag.size)); + if (!buf) { + throw std::bad_alloc(); + } + deleter d = make_free_deleter(buf); + frags[0] = { buf, frag.size }; + _deleter.append(std::move(d)); + } + std::copy(frag.base, frag.base + frag.size, frags[0].base); + ++_nr_frags; +} + +inline Packet::Packet(): _impl(impl::allocate(1)) { +} + +inline Packet::Packet(size_t nr_frags): _impl(impl::allocate(nr_frags)) { +} + +inline Packet::Packet(fragment frag): _impl(new impl(frag)) { +} + +inline Packet::Packet(const char* data, size_t size): + Packet(fragment{const_cast<char*>(data), size}) { +} + +inline Packet::Packet(fragment frag, deleter d) + : _impl(impl::allocate(1)) { + _impl->_deleter = std::move(d); + _impl->frags[_impl->_nr_frags++] = frag; + _impl->_len = frag.size; +} + +inline Packet::Packet(std::vector<fragment> frag, deleter d) + : _impl(impl::allocate(frag.size())) { + _impl->_deleter = std::move(d); + std::copy(frag.begin(), frag.end(), _impl->frags); + _impl->_nr_frags = frag.size(); + _impl->_len = 0; + for (auto&& f : _impl->fragments()) { + _impl->_len += f.size; + } +} + +template <typename Iterator> +inline Packet::Packet(Iterator begin, Iterator end, deleter del) { + unsigned nr_frags = 0, len = 0; + nr_frags = std::distance(begin, end); + std::for_each(begin, end, [&] (fragment& frag) { len += frag.size; }); + _impl = impl::allocate(nr_frags); + _impl->_deleter = std::move(del); + _impl->_len = len; + _impl->_nr_frags = nr_frags; + std::copy(begin, end, _impl->frags); +} + +inline Packet::Packet(Packet&& x, fragment frag) + : _impl(impl::allocate_if_needed(std::move(x._impl), 1)) { + _impl->_len += frag.size; + char* buf = new char[frag.size]; + std::copy(frag.base, frag.base + frag.size, buf); + _impl->frags[_impl->_nr_frags++] = {buf, frag.size}; + _impl->_deleter = make_deleter(std::move(_impl->_deleter), [buf] { + delete[] buf; + }); +} + +inline bool Packet::allocate_headroom(size_t size) { + if (_impl->headroom >= size) { + _impl->_len += size; + if (!_impl->using_internal_data()) { + _impl = impl::allocate_if_needed(std::move(_impl), 1); + std::copy_backward(_impl->frags, _impl->frags + 
_impl->_nr_frags, + _impl->frags + _impl->_nr_frags + 1); + _impl->frags[0] = { _impl->data + internal_data_size, 0 }; + ++_impl->_nr_frags; + } + _impl->headroom -= size; + _impl->frags[0].base -= size; + _impl->frags[0].size += size; + return true; + } else { + return false; + } +} + + +inline Packet::Packet(fragment frag, Packet&& x) + : _impl(std::move(x._impl)) { + // try to prepend into existing internal fragment + if (allocate_headroom(frag.size)) { + std::copy(frag.base, frag.base + frag.size, _impl->frags[0].base); + return; + } else { + // didn't work out, allocate and copy + _impl->unuse_internal_data(); + _impl = impl::allocate_if_needed(std::move(_impl), 1); + _impl->_len += frag.size; + char *buf = new char[frag.size]; + std::copy(frag.base, frag.base + frag.size, buf); + std::copy_backward(_impl->frags, _impl->frags + _impl->_nr_frags, + _impl->frags + _impl->_nr_frags + 1); + ++_impl->_nr_frags; + _impl->frags[0] = {buf, frag.size}; + _impl->_deleter = make_deleter( + std::move(_impl->_deleter), [buf] { delete []buf; }); + } +} + +inline Packet::Packet(Packet&& x, fragment frag, deleter d) + : _impl(impl::allocate_if_needed(std::move(x._impl), 1)) { + _impl->_len += frag.size; + _impl->frags[_impl->_nr_frags++] = frag; + d.append(std::move(_impl->_deleter)); + _impl->_deleter = std::move(d); +} + +inline Packet::Packet(Packet&& x, deleter d): _impl(std::move(x._impl)) { + _impl->_deleter.append(std::move(d)); +} + +inline void Packet::append(Packet&& p) { + if (!_impl->_len) { + *this = std::move(p); + return; + } + _impl = impl::allocate_if_needed(std::move(_impl), p._impl->_nr_frags); + _impl->_len += p._impl->_len; + p._impl->unuse_internal_data(); + std::copy(p._impl->frags, p._impl->frags + p._impl->_nr_frags, + _impl->frags + _impl->_nr_frags); + _impl->_nr_frags += p._impl->_nr_frags; + p._impl->_deleter.append(std::move(_impl->_deleter)); + _impl->_deleter = std::move(p._impl->_deleter); +} + +inline char* Packet::get_header(size_t offset, size_t size) { + if (offset + size > _impl->_len) { + return nullptr; + } + size_t i = 0; + while (i != _impl->_nr_frags && offset >= _impl->frags[i].size) { + offset -= _impl->frags[i++].size; + } + if (i == _impl->_nr_frags) { + return nullptr; + } + if (offset + size > _impl->frags[i].size) { + linearize(i, offset + size); + } + return _impl->frags[i].base + offset; +} + +template <typename Header> +inline Header* Packet::get_header(size_t offset) { + return reinterpret_cast<Header*>(get_header(offset, sizeof(Header))); +} + +inline void Packet::trim_front(size_t how_much) { + ceph_assert(how_much <= _impl->_len); + _impl->_len -= how_much; + size_t i = 0; + while (how_much && how_much >= _impl->frags[i].size) { + how_much -= _impl->frags[i++].size; + } + std::copy(_impl->frags + i, _impl->frags + _impl->_nr_frags, _impl->frags); + _impl->_nr_frags -= i; + if (!_impl->using_internal_data()) { + _impl->headroom = internal_data_size; + } + if (how_much) { + if (_impl->using_internal_data()) { + _impl->headroom += how_much; + } + _impl->frags[0].base += how_much; + _impl->frags[0].size -= how_much; + } +} + +inline void Packet::trim_back(size_t how_much) { + ceph_assert(how_much <= _impl->_len); + _impl->_len -= how_much; + size_t i = _impl->_nr_frags - 1; + while (how_much && how_much >= _impl->frags[i].size) { + how_much -= _impl->frags[i--].size; + } + _impl->_nr_frags = i + 1; + if (how_much) { + _impl->frags[i].size -= how_much; + if (i == 0 && _impl->using_internal_data()) { + _impl->headroom += how_much; + } + } +} + 
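// [Editor's note] How the headroom path above behaves for small packets
// (editor's sketch, not part of this commit). internal_data_size is
// 128 - 16 = 112 bytes, and a copied-in fragment is placed at the end of
// that buffer so later prepends are mere pointer adjustments:
//
//   Packet p("hi", 2);                   // headroom: 112 -> 110
//   p.prepend_uninitialized_header(20);  // headroom: 110 -> 90, no copy
//   p.prepend_uninitialized_header(14);  // headroom:  90 -> 76, no copy
//
// Only when the headroom is exhausted does allocate_headroom() fail and
// the slow path allocate a fresh buffer for the new header.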
+template <typename Header> +Header* Packet::prepend_header(size_t extra_size) { + auto h = prepend_uninitialized_header(sizeof(Header) + extra_size); + return new (h) Header{}; +} + +// prepend a header (uninitialized!) +inline char* Packet::prepend_uninitialized_header(size_t size) { + if (!allocate_headroom(size)) { + // didn't work out, allocate and copy + _impl->unuse_internal_data(); + // try again, after unuse_internal_data we may have space after all + if (!allocate_headroom(size)) { + // failed + _impl->_len += size; + _impl = impl::allocate_if_needed(std::move(_impl), 1); + char *buf = new char[size]; + std::copy_backward(_impl->frags, _impl->frags + _impl->_nr_frags, + _impl->frags + _impl->_nr_frags + 1); + ++_impl->_nr_frags; + _impl->frags[0] = {buf, size}; + _impl->_deleter = make_deleter(std::move(_impl->_deleter), + [buf] { delete []buf; }); + } + } + return _impl->frags[0].base; +} + +inline Packet Packet::share() { + return share(0, _impl->_len); +} + +inline Packet Packet::share(size_t offset, size_t len) { + _impl->unuse_internal_data(); // FIXME: eliminate? + Packet n; + n._impl = impl::allocate_if_needed(std::move(n._impl), _impl->_nr_frags); + size_t idx = 0; + while (offset > 0 && offset >= _impl->frags[idx].size) { + offset -= _impl->frags[idx++].size; + } + while (n._impl->_len < len) { + auto& f = _impl->frags[idx++]; + auto fsize = std::min(len - n._impl->_len, f.size - offset); + n._impl->frags[n._impl->_nr_frags++] = { f.base + offset, fsize }; + n._impl->_len += fsize; + offset = 0; + } + n._impl->_offload_info = _impl->_offload_info; + ceph_assert(!n._impl->_deleter); + n._impl->_deleter = _impl->_deleter.share(); + return n; +} + +#endif /* CEPH_MSG_PACKET_H_ */ diff --git a/src/msg/async/dpdk/PacketUtil.h b/src/msg/async/dpdk/PacketUtil.h new file mode 100644 index 00000000..118218e6 --- /dev/null +++ b/src/msg/async/dpdk/PacketUtil.h @@ -0,0 +1,154 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
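// [Editor's note] One caution worth spelling out for Packet::share()
// above (editor's sketch, not part of this commit): sharing is reference
// counting, not copy-on-write, so both packets alias the same bytes:
//
//   Packet a(buf, 64);            // private copy of buf
//   Packet b = a.share(16, 32);   // b's fragment points into a's buffer
//   b.frag(0).base[0] = 'X';      // also visible through a, at offset 16
//
// The shared deleter keeps the buffer alive until both a and b are gone.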
+ */ + +#ifndef CEPH_MSG_PACKET_UTIL_H_ +#define CEPH_MSG_PACKET_UTIL_H_ + +#include <map> +#include <iostream> + +#include "Packet.h" + +template <typename Offset, typename Tag> +class packet_merger { + private: + static uint64_t& linearizations_ref() { + static thread_local uint64_t linearization_count; + return linearization_count; + } + public: + std::map<Offset, Packet> map; + + static uint64_t linearizations() { + return linearizations_ref(); + } + + void merge(Offset offset, Packet p) { + bool insert = true; + auto beg = offset; + auto end = beg + p.len(); + // First, try to merge the packet with an existing segment + for (auto it = map.begin(); it != map.end();) { + auto& seg_pkt = it->second; + auto seg_beg = it->first; + auto seg_end = seg_beg + seg_pkt.len(); + // There are 6 cases: + if (seg_beg <= beg && end <= seg_end) { + // 1) seg_beg beg end seg_end + // We already have data in this packet + return; + } else if (beg <= seg_beg && seg_end <= end) { + // 2) beg seg_beg seg_end end + // The new segment contains more data than this old segment + // Delete the old one, insert the new one + it = map.erase(it); + insert = true; + break; + } else if (beg < seg_beg && seg_beg <= end && end <= seg_end) { + // 3) beg seg_beg end seg_end + // Merge two segments, trim front of old segment + auto trim = end - seg_beg; + seg_pkt.trim_front(trim); + p.append(std::move(seg_pkt)); + // Delete the old one, insert the new one + it = map.erase(it); + insert = true; + break; + } else if (seg_beg <= beg && beg <= seg_end && seg_end < end) { + // 4) seg_beg beg seg_end end + // Merge two segments, trim front of new segment + auto trim = seg_end - beg; + p.trim_front(trim); + // Append new data to the old segment, keep the old segment + seg_pkt.append(std::move(p)); + seg_pkt.linearize(); + ++linearizations_ref(); + insert = false; + break; + } else { + // 5) beg end < seg_beg seg_end + // or + // 6) seg_beg seg_end < beg end + // Cannot merge with this segment, keep looking + it++; + insert = true; + } + } + + if (insert) { + p.linearize(); + ++linearizations_ref(); + map.emplace(beg, std::move(p)); + } + + // Second, merge adjacent segments after this packet has been merged, + // because this packet might fill a "hole" and make two adjacent + // segments mergeable + for (auto it = map.begin(); it != map.end();) { + // The first segment + auto& seg_pkt = it->second; + auto seg_beg = it->first; + auto seg_end = seg_beg + seg_pkt.len(); + + // The second segment + auto it_next = it; + it_next++; + if (it_next == map.end()) { + break; + } + auto& p = it_next->second; + auto beg = it_next->first; + auto end = beg + p.len(); + + // Merge the second segment into the first segment if possible + if (seg_beg <= beg && beg <= seg_end && seg_end < end) { + // Merge two segments, trim front of second segment + auto trim = seg_end - beg; + p.trim_front(trim); + // Append new data to the first segment, keep the first segment + seg_pkt.append(std::move(p)); + + // Delete the second segment + map.erase(it_next); + + // Keep merging this first segment with its new next packet + // So we do not update the iterator: it + continue; + } else if (end <= seg_end) { + // The first segment has all the data in the second segment + // Delete the second segment + map.erase(it_next); + continue; + } else if (seg_end < beg) { + // Cannot merge first segment with second segment + it = it_next; + continue; + } else { + // If we reach here, we have a bug with merge.
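// [Editor's note] Worked example of the six interval cases handled above
// (editor's sketch, not part of this commit). With one existing segment
// covering [100, 200):
//   merge(120, p.len()=50)  -> case 1: [120,170) already covered, dropped
//   merge(50,  p.len()=300) -> case 2: new [50,350) replaces the old one
//   merge(80,  p.len()=40)  -> case 3: old segment front-trimmed by 20,
//                              result is one segment [80,200)
//   merge(180, p.len()=60)  -> case 4: new segment front-trimmed by 20 and
//                              appended to the old one, result [100,240)
//   merge(250, p.len()=10)  -> cases 5/6: disjoint, inserted on its own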
+ std::cout << "packet_merger: merge error\n"; + abort(); + } + } + } +}; + +#endif diff --git a/src/msg/async/dpdk/TCP-Stack.h b/src/msg/async/dpdk/TCP-Stack.h new file mode 100644 index 00000000..996ae93c --- /dev/null +++ b/src/msg/async/dpdk/TCP-Stack.h @@ -0,0 +1,40 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +// tcp/network-stack integration + +#ifndef CEPH_MSG_DPDK_TCP_STACK_H +#define CEPH_MSG_DPDK_TCP_STACK_H + +class ServerSocket; +class ConnectedSocket; + +class ipv4_traits; +template <typename InetTraits> +class tcp; + +int tcpv4_listen(tcp<ipv4_traits>& tcpv4, uint16_t port, const SocketOptions &opts, + int type, ServerSocket *sa); + +int tcpv4_connect(tcp<ipv4_traits>& tcpv4, const entity_addr_t &addr, + ConnectedSocket *sa); + +#endif diff --git a/src/msg/async/dpdk/TCP.cc b/src/msg/async/dpdk/TCP.cc new file mode 100644 index 00000000..c6397709 --- /dev/null +++ b/src/msg/async/dpdk/TCP.cc @@ -0,0 +1,840 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
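// [Editor's note] Hedged sketch of how the TCP-Stack.h entry points above
// are meant to be driven by the stack glue (the call site and option
// values are illustrative assumptions, not from this commit):
//
//   ServerSocket srv;
//   SocketOptions opts;   // nodelay, buffer sizes, etc.
//   int r = tcpv4_listen(inet.get_tcp(), 6800, opts, SOCK_STREAM, &srv);
//   if (r < 0) {
//     // e.g. another listener already owns the port
//   }
//
// tcpv4_connect() is the mirror image, yielding a ConnectedSocket.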
+ */ + +#include "align.h" +#include "TCP.h" +#include "IP.h" +#include "DPDKStack.h" + +#include "common/dout.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_dpdk +#undef dout_prefix +#define dout_prefix *_dout << "tcp " + +void tcp_option::parse(uint8_t* beg, uint8_t* end) +{ + while (beg < end) { + auto kind = option_kind(*beg); + if (kind != option_kind::nop && kind != option_kind::eol) { + // Make sure there is enough room for this option + auto len = *(beg + 1); + if (beg + len > end) { + return; + } + } + switch (kind) { + case option_kind::mss: + _mss_received = true; + _remote_mss = ntoh(reinterpret_cast<mss*>(beg)->mss); + beg += option_len::mss; + break; + case option_kind::win_scale: + _win_scale_received = true; + _remote_win_scale = reinterpret_cast<win_scale*>(beg)->shift; + // We can turn on win_scale option, 7 is Linux's default win scale size + _local_win_scale = 7; + beg += option_len::win_scale; + break; + case option_kind::sack: + _sack_received = true; + beg += option_len::sack; + break; + case option_kind::nop: + beg += option_len::nop; + break; + case option_kind::eol: + return; + default: + // Ignore options we do not understand + auto len = *(beg + 1); + beg += len; + // Prevent infinite loop + if (len == 0) { + return; + } + break; + } + } +} + +uint8_t tcp_option::fill(tcp_hdr* th, uint8_t options_size) +{ + auto hdr = reinterpret_cast<uint8_t*>(th); + auto off = hdr + sizeof(tcp_hdr); + uint8_t size = 0; + bool syn_on = th->f_syn; + bool ack_on = th->f_ack; + + if (syn_on) { + if (_mss_received || !ack_on) { + auto mss = new (off) tcp_option::mss; + mss->mss = _local_mss; + off += mss->len; + size += mss->len; + *mss = mss->hton(); + } + if (_win_scale_received || !ack_on) { + auto win_scale = new (off) tcp_option::win_scale; + win_scale->shift = _local_win_scale; + off += win_scale->len; + size += win_scale->len; + } + } + if (size > 0) { + // Insert NOP option + auto size_max = align_up(uint8_t(size + 1), tcp_option::align); + while (size < size_max - uint8_t(option_len::eol)) { + new (off) tcp_option::nop; + off += option_len::nop; + size += option_len::nop; + } + new (off) tcp_option::eol; + size += option_len::eol; + } + ceph_assert(size == options_size); + + return size; +} + +uint8_t tcp_option::get_size(bool syn_on, bool ack_on) +{ + uint8_t size = 0; + if (syn_on) { + if (_mss_received || !ack_on) { + size += option_len::mss; + } + if (_win_scale_received || !ack_on) { + size += option_len::win_scale; + } + } + if (size > 0) { + size += option_len::eol; + // Insert NOP option to align on 32-bit + size = align_up(size, tcp_option::align); + } + return size; +} + +ipv4_tcp::ipv4_tcp(ipv4& inet, EventCenter *c) + : _inet_l4(inet), _tcp(std::unique_ptr<tcp<ipv4_traits>>(new tcp<ipv4_traits>(inet.cct, _inet_l4, c))) +{ } + +ipv4_tcp::~ipv4_tcp() { } + +void ipv4_tcp::received(Packet p, ipv4_address from, ipv4_address to) +{ + _tcp->received(std::move(p), from, to); +} + +bool ipv4_tcp::forward(forward_hash& out_hash_data, Packet& p, size_t off) +{ + return _tcp->forward(out_hash_data, p, off); +} + +int tcpv4_listen(tcp<ipv4_traits>& tcpv4, uint16_t port, const SocketOptions &opts, + int type, ServerSocket *sock) +{ + auto p = new DPDKServerSocketImpl<tcp<ipv4_traits>>(tcpv4, port, opts, type); + int r = p->listen(); + if (r < 0) { + delete p; + return r; + } + *sock = ServerSocket(std::unique_ptr<ServerSocketImpl>(p)); + return 0; +} + +int tcpv4_connect(tcp<ipv4_traits>& tcpv4, const entity_addr_t &addr, + ConnectedSocket *sock) +{ 
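// [Editor's note] Byte-level result of tcp_option::fill() above for an
// initial SYN (editor's sketch, not part of this commit), assuming
// mss = 1460 and a window scale shift of 7:
//
//   02 04 05 b4   mss: kind 2, len 4, value 1460
//   03 03 07      win_scale: kind 3, len 3, shift 7
//   00            eol
//
// 8 bytes total, already 4-byte aligned, so no NOP padding is emitted and
// data_offset grows by two 32-bit words.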
+ auto conn = tcpv4.connect(addr); + *sock = ConnectedSocket(std::unique_ptr<ConnectedSocketImpl>( + new NativeConnectedSocketImpl<tcp<ipv4_traits>>(std::move(conn)))); + return 0; +} + +template <typename InetTraits> +void tcp<InetTraits>::respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip) +{ + ldout(cct, 20) << __func__ << " tcp header rst=" << bool(rth->f_rst) << " fin=" << bool(rth->f_fin) + << " syn=" << bool(rth->f_syn) << dendl; + if (rth->f_rst) { + return; + } + Packet p; + auto th = p.prepend_header<tcp_hdr>(); + th->src_port = rth->dst_port; + th->dst_port = rth->src_port; + if (rth->f_ack) { + th->seq = rth->ack; + } + // If this RST packet is in response to a SYN packet. We ACK the ISN. + if (rth->f_syn) { + th->ack = rth->seq + 1; + th->f_ack = true; + } + th->f_rst = true; + th->data_offset = sizeof(*th) / 4; + th->checksum = 0; + *th = th->hton(); + + checksummer csum; + offload_info oi; + InetTraits::tcp_pseudo_header_checksum(csum, local_ip, foreign_ip, sizeof(*th)); + if (get_hw_features().tx_csum_l4_offload) { + th->checksum = ~csum.get(); + oi.needs_csum = true; + } else { + csum.sum(p); + th->checksum = csum.get(); + oi.needs_csum = false; + } + + oi.protocol = ip_protocol_num::tcp; + oi.tcp_hdr_len = sizeof(tcp_hdr); + p.set_offload_info(oi); + + send_packet_without_tcb(local_ip, foreign_ip, std::move(p)); +} + +#undef dout_prefix +#define dout_prefix _prefix(_dout) +template<typename InetTraits> +ostream& tcp<InetTraits>::tcb::_prefix(std::ostream *_dout) { + return *_dout << "tcp " << _local_ip << ":" << _local_port << " -> " << _foreign_ip << ":" << _foreign_port + << " tcb(" << this << " fd=" << fd << " s=" << _state << ")."; +} + +template<typename InetTraits> +void tcp<InetTraits>::tcb::input_handle_listen_state(tcp_hdr* th, Packet p) +{ + auto opt_len = th->data_offset * 4 - sizeof(tcp_hdr); + auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, th->data_offset * 4)) + sizeof(tcp_hdr); + auto opt_end = opt_start + opt_len; + p.trim_front(th->data_offset * 4); + tcp_sequence seg_seq = th->seq; + + // Set RCV.NXT to SEG.SEQ+1, IRS is set to SEG.SEQ + _rcv.next = seg_seq + 1; + _rcv.initial = seg_seq; + + // ISS should be selected and a SYN segment sent of the form: + // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK> + // SND.NXT is set to ISS+1 and SND.UNA to ISS + // NOTE: In previous code, _snd.next is set to ISS + 1 only when SYN is + // ACKed. Now, we set _snd.next to ISS + 1 here, so in output_one(): we + // have + // th->seq = syn_on ? _snd.initial : _snd.next + // to make sure retransmitted SYN has correct SEQ number. 
+ do_setup_isn(); + + _rcv.urgent = _rcv.next; + + ldout(_tcp.cct, 10) << __func__ << " listen: LISTEN -> SYN_RECEIVED" << dendl; + init_from_options(th, opt_start, opt_end); + do_syn_received(); +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::input_handle_syn_sent_state(tcp_hdr* th, Packet p) +{ + auto opt_len = th->data_offset * 4 - sizeof(tcp_hdr); + auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, th->data_offset * 4)) + sizeof(tcp_hdr); + auto opt_end = opt_start + opt_len; + p.trim_front(th->data_offset * 4); + tcp_sequence seg_seq = th->seq; + auto seg_ack = th->ack; + + ldout(_tcp.cct, 20) << __func__ << " tcp header seq " << seg_seq.raw << " ack " << seg_ack.raw + << " fin=" << bool(th->f_fin) << " syn=" << bool(th->f_syn) << dendl; + + bool acceptable = false; + // 3.1 first check the ACK bit + if (th->f_ack) { + // If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless the + // RST bit is set, if so drop the segment and return) + if (seg_ack <= _snd.initial || seg_ack > _snd.next) { + return respond_with_reset(th); + } + + // If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable. + acceptable = _snd.unacknowledged <= seg_ack && seg_ack <= _snd.next; + } + + // 3.2 second check the RST bit + if (th->f_rst) { + // If the ACK was acceptable then signal the user "error: connection + // reset", drop the segment, enter CLOSED state, delete TCB, and + // return. Otherwise (no ACK) drop the segment and return. + if (acceptable) { + return do_reset(); + } else { + return; + } + } + + // 3.3 third check the security and precedence + // NOTE: Ignored for now + + // 3.4 fourth check the SYN bit + if (th->f_syn) { + // RCV.NXT is set to SEG.SEQ+1, IRS is set to SEG.SEQ. SND.UNA should + // be advanced to equal SEG.ACK (if there is an ACK), and any segments + // on the retransmission queue which are thereby acknowledged should be + // removed. + _rcv.next = seg_seq + 1; + _rcv.initial = seg_seq; + if (th->f_ack) { + // TODO: clean retransmission queue + _snd.unacknowledged = seg_ack; + } + if (_snd.unacknowledged > _snd.initial) { + // If SND.UNA > ISS (our SYN has been ACKed), change the connection + // state to ESTABLISHED, form an ACK segment + // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK> + ldout(_tcp.cct, 20) << __func__ << " syn: SYN_SENT -> ESTABLISHED" << dendl; + init_from_options(th, opt_start, opt_end); + do_established(); + output(); + } else { + // Otherwise enter SYN_RECEIVED, form a SYN,ACK segment + // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK> + ldout(_tcp.cct, 20) << __func__ << " syn: SYN_SENT -> SYN_RECEIVED" << dendl; + do_syn_received(); + } + } + + // 3.5 fifth, if neither of the SYN or RST bits is set then drop the + // segment and return. 
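// [Editor's note] Numeric illustration of the step 3.1 acceptability test
// above (editor's sketch, not part of this commit). With ISS = 1000 and
// our SYN sent, SND.UNA = 1000 and SND.NXT = 1001:
//   SEG.ACK = 1000 -> rejected (SEG.ACK <= ISS): acks nothing we sent
//   SEG.ACK = 1001 -> acceptable: it exactly covers our SYN
//   SEG.ACK = 1002 -> rejected (SEG.ACK > SND.NXT): acks unsent data
// Both rejections take the respond_with_reset() path unless RST is set.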
+ return; +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::input_handle_other_state(tcp_hdr* th, Packet p) +{ + p.trim_front(th->data_offset * 4); + bool do_output = false; + bool do_output_data = false; + tcp_sequence seg_seq = th->seq; + auto seg_ack = th->ack; + auto seg_len = p.len(); + ldout(_tcp.cct, 20) << __func__ << " tcp header seq " << seg_seq.raw << " ack " << seg_ack.raw + << " snd next " << _snd.next.raw << " unack " << _snd.unacknowledged.raw + << " rcv next " << _rcv.next.raw << " len " << seg_len + << " fin=" << bool(th->f_fin) << " syn=" << bool(th->f_syn) << dendl; + + // 4.1 first check sequence number + if (!segment_acceptable(seg_seq, seg_len)) { + //<SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK> + return output(); + } + + // In the following it is assumed that the segment is the idealized + // segment that begins at RCV.NXT and does not exceed the window. + if (seg_seq < _rcv.next) { + // ignore already acknowledged data + auto dup = std::min(uint32_t(_rcv.next - seg_seq), seg_len); + ldout(_tcp.cct, 10) << __func__ << " dup segment len " << dup << dendl; + p.trim_front(dup); + seg_len -= dup; + seg_seq += dup; + } + // FIXME: We should trim data outside the right edge of the receive window as well + + if (seg_seq != _rcv.next) { + ldout(_tcp.cct, 10) << __func__ << " out of order, expect " << _rcv.next.raw + << " actual " << seg_seq.raw + << " out of order size " << _rcv.out_of_order.map.size() + << dendl; + insert_out_of_order(seg_seq, std::move(p)); + // A TCP receiver SHOULD send an immediate duplicate ACK + // when an out-of-order segment arrives. + return output(); + } + + // 4.2 second check the RST bit + if (th->f_rst) { + if (in_state(SYN_RECEIVED)) { + // If this connection was initiated with a passive OPEN (i.e., + // came from the LISTEN state), then return this connection to + // LISTEN state and return. The user need not be informed. If + // this connection was initiated with an active OPEN (i.e., came + // from SYN_SENT state) then the connection was refused, signal + // the user "connection refused". In either case, all segments + // on the retransmission queue should be removed. And in the + // active OPEN case, enter the CLOSED state and delete the TCB, + // and return. + errno = -ECONNREFUSED; + return do_reset(); + } + if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2 | CLOSE_WAIT)) { + // If the RST bit is set then, any outstanding RECEIVEs and SEND + // should receive "reset" responses. All segment queues should be + // flushed. Users should also receive an unsolicited general + // "connection reset" signal. Enter the CLOSED state, delete the + // TCB, and return. + return do_reset(); + } + if (in_state(CLOSING | LAST_ACK | TIME_WAIT)) { + // If the RST bit is set then, enter the CLOSED state, delete the + // TCB, and return. + return do_closed(); + } + } + + // 4.3 third check security and precedence + // NOTE: Ignored for now + + // 4.4 fourth, check the SYN bit + if (th->f_syn) { + // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2 + // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT + + // If the SYN is in the window it is an error, send a reset, any + // outstanding RECEIVEs and SEND should receive "reset" responses, + // all segment queues should be flushed, the user should also + // receive an unsolicited general "connection reset" signal, enter + // the CLOSED state, delete the TCB, and return. 
+ respond_with_reset(th); + return do_reset(); + + // If the SYN is not in the window this step would not be reached + // and an ack would have been sent in the first step (sequence + // number check). + } + + // 4.5 fifth check the ACK field + if (!th->f_ack) { + // if the ACK bit is off drop the segment and return + return; + } else { + // SYN_RECEIVED STATE + if (in_state(SYN_RECEIVED)) { + // If SND.UNA =< SEG.ACK =< SND.NXT then enter ESTABLISHED state + // and continue processing. + if (_snd.unacknowledged <= seg_ack && seg_ack <= _snd.next) { + ldout(_tcp.cct, 20) << __func__ << " SYN_RECEIVED -> ESTABLISHED" << dendl; + do_established(); + if (_tcp.push_listen_queue(_local_port, this)) { + ldout(_tcp.cct, 20) << __func__ << " successfully accepting socket" << dendl; + } else { + ldout(_tcp.cct, 5) << __func__ << " no listener exists or its queue is full, reset" << dendl; + return respond_with_reset(th); + } + } else { + // <SEQ=SEG.ACK><CTL=RST> + return respond_with_reset(th); + } + } + auto update_window = [this, th, seg_seq, seg_ack] { + ldout(_tcp.cct, 20) << __func__ << " window update seg_seq=" << seg_seq + << " seg_ack=" << seg_ack << " old window=" << th->window + << " new window=" << int(_snd.window_scale) << dendl; + _snd.window = th->window << _snd.window_scale; + _snd.wl1 = seg_seq; + _snd.wl2 = seg_ack; + if (_snd.window == 0) { + _persist_time_out = _rto; + start_persist_timer(); + } else { + stop_persist_timer(); + } + }; + // ESTABLISHED STATE or + // CLOSE_WAIT STATE: Do the same processing as for the ESTABLISHED state. + if (in_state(ESTABLISHED | CLOSE_WAIT)) { + // If SND.UNA < SEG.ACK =< SND.NXT then, set SND.UNA <- SEG.ACK. + if (_snd.unacknowledged < seg_ack && seg_ack <= _snd.next) { + // Remote ACKed data we sent + auto acked_bytes = data_segment_acked(seg_ack); + + // If SND.UNA < SEG.ACK =< SND.NXT, the send window should be updated. + if (_snd.wl1 < seg_seq || (_snd.wl1 == seg_seq && _snd.wl2 <= seg_ack)) { + update_window(); + } + + // some data is acked, try to send more data + do_output_data = true; + + auto set_retransmit_timer = [this] { + if (_snd.data.empty()) { + // All outstanding segments are acked, turn off the timer. + stop_retransmit_timer(); + // Signal the waiter of this event + signal_all_data_acked(); + } else { + // Restart the timer because new data is acked. + start_retransmit_timer(); + } + }; + + if (_snd.dupacks >= 3) { + // We are in fast retransmit / fast recovery phase + uint32_t smss = _snd.mss; + if (seg_ack > _snd.recover) { + ldout(_tcp.cct, 20) << __func__ << " ack: full_ack" << dendl; + // Set cwnd to min (ssthresh, max(FlightSize, SMSS) + SMSS) + _snd.cwnd = std::min(_snd.ssthresh, std::max(flight_size(), smss) + smss); + // Exit the fast recovery procedure + exit_fast_recovery(); + set_retransmit_timer(); + } else { + ldout(_tcp.cct, 20) << __func__ << " ack: partial_ack" << dendl; + // Retransmit the first unacknowledged segment + fast_retransmit(); + // Deflate the congestion window by the amount of new data + // acknowledged by the Cumulative Acknowledgment field + _snd.cwnd -= acked_bytes; + // If the partial ACK acknowledges at least one SMSS of new + // data, then add back SMSS bytes to the congestion window + if (acked_bytes >= smss) { + _snd.cwnd += smss; + } + // Send a new segment if permitted by the new value of + // cwnd. Do not exit the fast recovery procedure. For + // the first partial ACK that arrives during fast + // recovery, also reset the retransmit timer.
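// [Editor's note] Worked numbers for the NewReno recovery logic above
// (editor's sketch, not part of this commit). Take SMSS = 1460,
// ssthresh = 8760, and 14600 bytes in flight when loss was detected:
//   full ACK (seg_ack > recover):
//     cwnd = min(8760, max(14600, 1460) + 1460) = 8760, exit recovery
//   partial ACK acknowledging 2920 bytes:
//     retransmit the next hole, cwnd -= 2920, then cwnd += 1460
//     (the partial ACK covered at least one SMSS of new data)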
+ if (++_snd.partial_ack == 1) { + start_retransmit_timer(); + } + } + } else { + // RFC5681: The fast retransmit algorithm uses the arrival + // of 3 duplicate ACKs (as defined in section 2, without + // any intervening ACKs which move SND.UNA) as an + // indication that a segment has been lost. + // + // So, here we reset dupacks to zero because this ACK moves + // SND.UNA. + exit_fast_recovery(); + set_retransmit_timer(); + } + } else if (!_snd.data.empty() && seg_len == 0 && + th->f_fin == 0 && th->f_syn == 0 && + th->ack == _snd.unacknowledged && + uint32_t(th->window << _snd.window_scale) == _snd.window) { + // Note: + // RFC793 states: + // If the ACK is a duplicate (SEG.ACK < SND.UNA), it can be ignored + // RFC5681 states: + // The TCP sender SHOULD use the "fast retransmit" algorithm to detect + // and repair loss, based on incoming duplicate ACKs. + // Here, we follow RFC5681. + _snd.dupacks++; + uint32_t smss = _snd.mss; + // 3 duplicated ACKs trigger a fast retransmit + if (_snd.dupacks == 1 || _snd.dupacks == 2) { + // RFC5681 Step 3.1 + // Send cwnd + 2 * smss per RFC3042 + do_output_data = true; + } else if (_snd.dupacks == 3) { + // RFC6582 Step 3.2 + if (seg_ack - 1 > _snd.recover) { + _snd.recover = _snd.next - 1; + // RFC5681 Step 3.2 + _snd.ssthresh = std::max((flight_size() - _snd.limited_transfer) / 2, 2 * smss); + fast_retransmit(); + } else { + // Do not enter fast retransmit and do not reset ssthresh + } + // RFC5681 Step 3.3 + _snd.cwnd = _snd.ssthresh + 3 * smss; + } else if (_snd.dupacks > 3) { + // RFC5681 Step 3.4 + _snd.cwnd += smss; + // RFC5681 Step 3.5 + do_output_data = true; + } + } else if (seg_ack > _snd.next) { + // If the ACK acks something not yet sent (SEG.ACK > SND.NXT) + // then send an ACK, drop the segment, and return + return output(); + } else if (_snd.window == 0 && th->window > 0) { + update_window(); + do_output_data = true; + } + } + // FIN_WAIT_1 STATE + if (in_state(FIN_WAIT_1)) { + // In addition to the processing for the ESTABLISHED state, if + // our FIN is now acknowledged then enter FIN-WAIT-2 and continue + // processing in that state. + if (seg_ack == _snd.next + 1) { + ldout(_tcp.cct, 20) << __func__ << " ack: FIN_WAIT_1 -> FIN_WAIT_2" << dendl; + _state = FIN_WAIT_2; + do_local_fin_acked(); + } + } + // FIN_WAIT_2 STATE + if (in_state(FIN_WAIT_2)) { + // In addition to the processing for the ESTABLISHED state, if + // the retransmission queue is empty, the user’s CLOSE can be + // acknowledged ("ok") but do not delete the TCB. + // TODO + } + // CLOSING STATE + if (in_state(CLOSING)) { + if (seg_ack == _snd.next + 1) { + ldout(_tcp.cct, 20) << __func__ << " ack: CLOSING -> TIME_WAIT" << dendl; + do_local_fin_acked(); + return do_time_wait(); + } else { + return; + } + } + // LAST_ACK STATE + if (in_state(LAST_ACK)) { + if (seg_ack == _snd.next + 1) { + ldout(_tcp.cct, 20) << __func__ << " ack: LAST_ACK -> CLOSED" << dendl; + do_local_fin_acked(); + return do_closed(); + } + } + // TIME_WAIT STATE + if (in_state(TIME_WAIT)) { + // The only thing that can arrive in this state is a + // retransmission of the remote FIN. Acknowledge it, and restart + // the 2 MSL timeout.
+ // TODO + } + } + + // 4.6 sixth, check the URG bit + if (th->f_urg) { + // TODO + } + + // 4.7 seventh, process the segment text + if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2)) { + if (p.len()) { + // Once the TCP takes responsibility for the data it advances + // RCV.NXT over the data accepted, and adjusts RCV.WND as + // appropriate to the current buffer availability. The total of + // RCV.NXT and RCV.WND should not be reduced. + _rcv.data.push_back(std::move(p)); + _rcv.next += seg_len; + auto merged = merge_out_of_order(); + signal_data_received(); + // Send an acknowledgment of the form: + // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK> + // This acknowledgment should be piggybacked on a segment being + // transmitted if possible without incurring undue delay. + if (merged) { + // TCP receiver SHOULD send an immediate ACK when the + // incoming segment fills in all or part of a gap in the + // sequence space. + do_output = true; + } else { + do_output = should_send_ack(seg_len); + } + ldout(_tcp.cct, 20) << __func__ << " merged=" << merged << " do_output=" << do_output << dendl; + } + } else if (in_state(CLOSE_WAIT | CLOSING | LAST_ACK | TIME_WAIT)) { + // This should not occur, since a FIN has been received from the + // remote side. Ignore the segment text. + return; + } + + // 4.8 eighth, check the FIN bit + if (th->f_fin) { + if (in_state(CLOSED | LISTEN | SYN_SENT)) { + // Do not process the FIN if the state is CLOSED, LISTEN or SYN-SENT + // since the SEG.SEQ cannot be validated; drop the segment and return. + return; + } + auto fin_seq = seg_seq + seg_len; + if (fin_seq == _rcv.next) { + _rcv.next = fin_seq + 1; + + // If this <FIN> packet contains data as well, we can ACK both data + // and <FIN> in a single packet, so cancel the previous ACK. + clear_delayed_ack(); + do_output = false; + // Send ACK for the FIN! + output(); + signal_data_received(); + _errno = 0; + + if (in_state(SYN_RECEIVED | ESTABLISHED)) { + ldout(_tcp.cct, 20) << __func__ << " fin: SYN_RECEIVED or ESTABLISHED -> CLOSE_WAIT" << dendl; + _state = CLOSE_WAIT; + // EOF + } + if (in_state(FIN_WAIT_1)) { + // If our FIN has been ACKed (perhaps in this segment), then + // enter TIME-WAIT, start the time-wait timer, turn off the other + // timers; otherwise enter the CLOSING state. + // Note: If our FIN has been ACKed, we should be in FIN_WAIT_2 + // not FIN_WAIT_1 if we reach here. + ldout(_tcp.cct, 20) << __func__ << " fin: FIN_WAIT_1 -> CLOSING" << dendl; + _state = CLOSING; + } + if (in_state(FIN_WAIT_2)) { + ldout(_tcp.cct, 20) << __func__ << " fin: FIN_WAIT_2 -> TIME_WAIT" << dendl; + return do_time_wait(); + } + } + } + if (do_output || (do_output_data && can_send())) { + // Since we will do output, we can cancel the scheduled delayed ACK. + clear_delayed_ack(); + output(); + } +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::connect() +{ + ldout(_tcp.cct, 20) << __func__ << dendl; + // An initial send sequence number (ISS) is selected. A SYN segment of the + // form <SEQ=ISS><CTL=SYN> is sent. Set SND.UNA to ISS, SND.NXT to ISS+1, + // enter SYN-SENT state, and return.
+  do_setup_isn();
+
+  // Local receive window scale factor
+  _rcv.window_scale = _option._local_win_scale = 7;
+  // Maximum segment size local can receive
+  _rcv.mss = _option._local_mss = local_mss();
+  // Linux's default window size
+  _rcv.window = 29200 << _rcv.window_scale;
+
+  do_syn_sent();
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::close_final_cleanup()
+{
+  if (_snd._all_data_acked_fd >= 0) {
+    center->delete_file_event(_snd._all_data_acked_fd, EVENT_READABLE);
+    _tcp.manager.close(_snd._all_data_acked_fd);
+    _snd._all_data_acked_fd = -1;
+  }
+
+  _snd.closed = true;
+  signal_data_received();
+  ldout(_tcp.cct, 20) << __func__ << " unsent_len=" << _snd.unsent_len << dendl;
+  if (in_state(CLOSE_WAIT)) {
+    ldout(_tcp.cct, 20) << __func__ << " CLOSE_WAIT -> LAST_ACK" << dendl;
+    _state = LAST_ACK;
+  } else if (in_state(ESTABLISHED)) {
+    ldout(_tcp.cct, 20) << __func__ << " ESTABLISHED -> FIN_WAIT_1" << dendl;
+    _state = FIN_WAIT_1;
+  }
+  // Send <FIN> to remote
+  // Note: we call output_one() to make sure a packet carrying the FIN is
+  // actually generated. If we only called output() while _packetq is not
+  // empty, tcp::tcb::get_packet() would drain the queue without building
+  // a new packet, and the FIN would never be sent.
+  output_one();
+  output();
+  center->delete_file_event(fd, EVENT_READABLE|EVENT_WRITABLE);
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::retransmit()
+{
+  auto output_update_rto = [this] {
+    output();
+    // According to RFC6298, update RTO <- RTO * 2 to perform binary exponential back-off
+    this->_rto = std::min(this->_rto * 2, this->_rto_max);
+    start_retransmit_timer();
+  };
+
+  // Retransmit SYN
+  if (syn_needs_on()) {
+    if (_snd.syn_retransmit++ < _max_nr_retransmit) {
+      output_update_rto();
+    } else {
+      ldout(_tcp.cct, 5) << __func__ << " syn retransmit exceed max "
+                         << _max_nr_retransmit << dendl;
+      _errno = -ETIMEDOUT;
+      cleanup();
+      return;
+    }
+  }
+
+  // Retransmit FIN
+  if (fin_needs_on()) {
+    if (_snd.fin_retransmit++ < _max_nr_retransmit) {
+      output_update_rto();
+    } else {
+      ldout(_tcp.cct, 5) << __func__ << " fin retransmit exceed max "
+                         << _max_nr_retransmit << dendl;
+      _errno = -ETIMEDOUT;
+      cleanup();
+      return;
+    }
+  }
+
+  // Retransmit Data
+  if (_snd.data.empty()) {
+    return;
+  }
+
+  // If there is unacked data, retransmit the earliest segment
+  auto& unacked_seg = _snd.data.front();
+
+  // According to RFC5681, update ssthresh only for the first retransmit
+  uint32_t smss = _snd.mss;
+  if (unacked_seg.nr_transmits == 0) {
+    _snd.ssthresh = std::max(flight_size() / 2, 2 * smss);
+  }
+  // RFC6582 Step 4
+  _snd.recover = _snd.next - 1;
+  // Start the slow start process
+  _snd.cwnd = smss;
+  // End fast recovery
+  exit_fast_recovery();
+
+  ldout(_tcp.cct, 20) << __func__ << " unack data size " << _snd.data.size()
+                      << " nr=" << unacked_seg.nr_transmits << dendl;
+  if (unacked_seg.nr_transmits < _max_nr_retransmit) {
+    unacked_seg.nr_transmits++;
+  } else {
+    // Delete connection when max num of retransmission is reached
+    ldout(_tcp.cct, 5) << __func__ << " seg retransmit exceed max "
+                       << _max_nr_retransmit << dendl;
+    _errno = -ETIMEDOUT;
+    cleanup();
+    return;
+  }
+  retransmit_one();
+
+  output_update_rto();
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::persist() {
+  ldout(_tcp.cct, 20) << __func__ << " persist timer fired" << dendl;
+  // Send a 1-byte packet to probe peer's window size
+  _snd.window_probe = true;
+  output_one();
+  _snd.window_probe = false;
+
+  output();
+  // Perform binary exponential back-off per
RFC1122 + _persist_time_out = std::min(_persist_time_out * 2, _rto_max); + start_persist_timer(); +} diff --git a/src/msg/async/dpdk/TCP.h b/src/msg/async/dpdk/TCP.h new file mode 100644 index 00000000..b7bd7132 --- /dev/null +++ b/src/msg/async/dpdk/TCP.h @@ -0,0 +1,1503 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_DPDK_TCP_H_ +#define CEPH_DPDK_TCP_H_ + +#include <unordered_map> +#include <map> +#include <queue> +#include <functional> +#include <deque> +#include <chrono> +#include <stdexcept> +#include <system_error> + +#include "msg/async/dpdk/EventDPDK.h" + +#include "include/utime.h" +#include "common/Throttle.h" +#include "common/ceph_time.h" +#include "common/ceph_crypto.h" +#include "msg/async/Event.h" +#include "IPChecksum.h" +#include "IP.h" +#include "const.h" +#include "byteorder.h" +#include "shared_ptr.h" +#include "PacketUtil.h" + +#include "include/random.h" + +struct tcp_hdr; + +enum class tcp_state : uint16_t { + CLOSED = (1 << 0), + LISTEN = (1 << 1), + SYN_SENT = (1 << 2), + SYN_RECEIVED = (1 << 3), + ESTABLISHED = (1 << 4), + FIN_WAIT_1 = (1 << 5), + FIN_WAIT_2 = (1 << 6), + CLOSE_WAIT = (1 << 7), + CLOSING = (1 << 8), + LAST_ACK = (1 << 9), + TIME_WAIT = (1 << 10) +}; + +inline tcp_state operator|(tcp_state s1, tcp_state s2) { + return tcp_state(uint16_t(s1) | uint16_t(s2)); +} + +inline std::ostream & operator<<(std::ostream & str, const tcp_state& s) { + switch (s) { + case tcp_state::CLOSED: return str << "CLOSED"; + case tcp_state::LISTEN: return str << "LISTEN"; + case tcp_state::SYN_SENT: return str << "SYN_SENT"; + case tcp_state::SYN_RECEIVED: return str << "SYN_RECEIVED"; + case tcp_state::ESTABLISHED: return str << "ESTABLISHED"; + case tcp_state::FIN_WAIT_1: return str << "FIN_WAIT_1"; + case tcp_state::FIN_WAIT_2: return str << "FIN_WAIT_2"; + case tcp_state::CLOSE_WAIT: return str << "CLOSE_WAIT"; + case tcp_state::CLOSING: return str << "CLOSING"; + case tcp_state::LAST_ACK: return str << "LAST_ACK"; + case tcp_state::TIME_WAIT: return str << "TIME_WAIT"; + default: return str << "UNKNOWN"; + } +} + +struct tcp_option { + // The kind and len field are fixed and defined in TCP protocol + enum class option_kind: uint8_t { mss = 2, win_scale = 3, sack = 4, timestamps = 8, nop = 1, eol = 0 }; + enum class option_len: uint8_t { mss = 4, win_scale = 3, sack = 2, timestamps = 10, nop = 1, eol = 1 }; + struct mss { + option_kind kind = option_kind::mss; + option_len len = option_len::mss; + uint16_t mss; + struct mss hton() { + struct mss m = *this; + m.mss = ::hton(m.mss); + return m; + } + } __attribute__((packed)); + struct win_scale { + option_kind kind = option_kind::win_scale; + option_len len = option_len::win_scale; + 
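+    // shift: window scale factor; the peer computes the effective window
+    // as the advertised 16-bit window << shift (cf. th->window <<
+    // _snd.window_scale in init_from_options() below).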
uint8_t shift; + } __attribute__((packed)); + struct sack { + option_kind kind = option_kind::sack; + option_len len = option_len::sack; + } __attribute__((packed)); + struct timestamps { + option_kind kind = option_kind::timestamps; + option_len len = option_len::timestamps; + uint32_t t1; + uint32_t t2; + } __attribute__((packed)); + struct nop { + option_kind kind = option_kind::nop; + } __attribute__((packed)); + struct eol { + option_kind kind = option_kind::eol; + } __attribute__((packed)); + static const uint8_t align = 4; + + void parse(uint8_t* beg, uint8_t* end); + uint8_t fill(tcp_hdr* th, uint8_t option_size); + uint8_t get_size(bool syn_on, bool ack_on); + + // For option negotiattion + bool _mss_received = false; + bool _win_scale_received = false; + bool _timestamps_received = false; + bool _sack_received = false; + + // Option data + uint16_t _remote_mss = 536; + uint16_t _local_mss; + uint8_t _remote_win_scale = 0; + uint8_t _local_win_scale = 0; +}; +inline uint8_t*& operator+=(uint8_t*& x, tcp_option::option_len len) { x += uint8_t(len); return x; } +inline uint8_t& operator+=(uint8_t& x, tcp_option::option_len len) { x += uint8_t(len); return x; } + +struct tcp_sequence { + uint32_t raw; +}; + +tcp_sequence ntoh(tcp_sequence ts) { + return tcp_sequence { ::ntoh(ts.raw) }; +} + +tcp_sequence hton(tcp_sequence ts) { + return tcp_sequence { ::hton(ts.raw) }; +} + +inline std::ostream& operator<<(std::ostream& os, const tcp_sequence& s) { + return os << s.raw; +} + +inline tcp_sequence make_seq(uint32_t raw) { return tcp_sequence{raw}; } +inline tcp_sequence& operator+=(tcp_sequence& s, int32_t n) { s.raw += n; return s; } +inline tcp_sequence& operator-=(tcp_sequence& s, int32_t n) { s.raw -= n; return s; } +inline tcp_sequence operator+(tcp_sequence s, int32_t n) { return s += n; } +inline tcp_sequence operator-(tcp_sequence s, int32_t n) { return s -= n; } +inline int32_t operator-(tcp_sequence s, tcp_sequence q) { return s.raw - q.raw; } +inline bool operator==(tcp_sequence s, tcp_sequence q) { return s.raw == q.raw; } +inline bool operator!=(tcp_sequence s, tcp_sequence q) { return !(s == q); } +inline bool operator<(tcp_sequence s, tcp_sequence q) { return s - q < 0; } +inline bool operator>(tcp_sequence s, tcp_sequence q) { return q < s; } +inline bool operator<=(tcp_sequence s, tcp_sequence q) { return !(s > q); } +inline bool operator>=(tcp_sequence s, tcp_sequence q) { return !(s < q); } + +struct tcp_hdr { + uint16_t src_port; + uint16_t dst_port; + tcp_sequence seq; + tcp_sequence ack; + uint8_t rsvd1 : 4; + uint8_t data_offset : 4; + uint8_t f_fin : 1; + uint8_t f_syn : 1; + uint8_t f_rst : 1; + uint8_t f_psh : 1; + uint8_t f_ack : 1; + uint8_t f_urg : 1; + uint8_t rsvd2 : 2; + uint16_t window; + uint16_t checksum; + uint16_t urgent; + + tcp_hdr hton() { + tcp_hdr hdr = *this; + hdr.src_port = ::hton(src_port); + hdr.dst_port = ::hton(dst_port); + hdr.seq = ::hton(seq); + hdr.ack = ::hton(ack); + hdr.window = ::hton(window); + hdr.checksum = ::hton(checksum); + hdr.urgent = ::hton(urgent); + return hdr; + } + + tcp_hdr ntoh() { + tcp_hdr hdr = *this; + hdr.src_port = ::ntoh(src_port); + hdr.dst_port = ::ntoh(dst_port); + hdr.seq = ::ntoh(seq); + hdr.ack = ::ntoh(ack); + hdr.window = ::ntoh(window); + hdr.checksum = ::ntoh(checksum); + hdr.urgent = ::ntoh(urgent); + return hdr; + } +} __attribute__((packed)); + +struct tcp_tag {}; +using tcp_packet_merger = packet_merger<tcp_sequence, tcp_tag>; + +template <typename InetTraits> +class tcp { + public: + using 
ipaddr = typename InetTraits::address_type; + using inet_type = typename InetTraits::inet_type; + using connid = l4connid<InetTraits>; + using connid_hash = typename connid::connid_hash; + class connection; + class listener; + private: + class tcb; + + class C_handle_delayed_ack : public EventCallback { + tcb *tc; + + public: + C_handle_delayed_ack(tcb *t): tc(t) { } + void do_request(uint64_t r) { + tc->_nr_full_seg_received = 0; + tc->output(); + } + }; + + class C_handle_retransmit : public EventCallback { + tcb *tc; + + public: + C_handle_retransmit(tcb *t): tc(t) { } + void do_request(uint64_t r) { + tc->retransmit(); + } + }; + + class C_handle_persist : public EventCallback { + tcb *tc; + + public: + C_handle_persist(tcb *t): tc(t) { } + void do_request(uint64_t r) { + tc->persist(); + } + }; + + class C_all_data_acked : public EventCallback { + tcb *tc; + + public: + C_all_data_acked(tcb *t): tc(t) {} + void do_request(uint64_t fd_or_id) { + tc->close_final_cleanup(); + } + }; + + class C_actual_remove_tcb : public EventCallback { + lw_shared_ptr<tcb> tc; + public: + C_actual_remove_tcb(tcb *t): tc(t->shared_from_this()) {} + void do_request(uint64_t r) { + delete this; + } + }; + + class tcb : public enable_lw_shared_from_this<tcb> { + using clock_type = ceph::coarse_real_clock; + static constexpr tcp_state CLOSED = tcp_state::CLOSED; + static constexpr tcp_state LISTEN = tcp_state::LISTEN; + static constexpr tcp_state SYN_SENT = tcp_state::SYN_SENT; + static constexpr tcp_state SYN_RECEIVED = tcp_state::SYN_RECEIVED; + static constexpr tcp_state ESTABLISHED = tcp_state::ESTABLISHED; + static constexpr tcp_state FIN_WAIT_1 = tcp_state::FIN_WAIT_1; + static constexpr tcp_state FIN_WAIT_2 = tcp_state::FIN_WAIT_2; + static constexpr tcp_state CLOSE_WAIT = tcp_state::CLOSE_WAIT; + static constexpr tcp_state CLOSING = tcp_state::CLOSING; + static constexpr tcp_state LAST_ACK = tcp_state::LAST_ACK; + static constexpr tcp_state TIME_WAIT = tcp_state::TIME_WAIT; + tcp_state _state = CLOSED; + tcp& _tcp; + UserspaceEventManager &manager; + connection* _conn = nullptr; + bool _connect_done = false; + ipaddr _local_ip; + ipaddr _foreign_ip; + uint16_t _local_port; + uint16_t _foreign_port; + struct unacked_segment { + Packet p; + uint16_t data_len; + unsigned nr_transmits; + clock_type::time_point tx_time; + }; + struct send { + tcp_sequence unacknowledged; + tcp_sequence next; + uint32_t window; + uint8_t window_scale; + uint16_t mss; + tcp_sequence urgent; + tcp_sequence wl1; + tcp_sequence wl2; + tcp_sequence initial; + std::deque<unacked_segment> data; + std::deque<Packet> unsent; + uint32_t unsent_len = 0; + uint32_t queued_len = 0; + bool closed = false; + // Wait for all data are acked + int _all_data_acked_fd = -1; + // Limit number of data queued into send queue + Throttle user_queue_space; + // Round-trip time variation + std::chrono::microseconds rttvar; + // Smoothed round-trip time + std::chrono::microseconds srtt; + bool first_rto_sample = true; + clock_type::time_point syn_tx_time; + // Congestion window + uint32_t cwnd; + // Slow start threshold + uint32_t ssthresh; + // Duplicated ACKs + uint16_t dupacks = 0; + unsigned syn_retransmit = 0; + unsigned fin_retransmit = 0; + uint32_t limited_transfer = 0; + uint32_t partial_ack = 0; + tcp_sequence recover; + bool window_probe = false; + send(CephContext *c): user_queue_space(c, "DPDK::tcp::tcb::user_queue_space", 81920) {} + } _snd; + struct receive { + tcp_sequence next; + uint32_t window; + uint8_t window_scale; + uint16_t 
mss; + tcp_sequence urgent; + tcp_sequence initial; + std::deque<Packet> data; + tcp_packet_merger out_of_order; + } _rcv; + EventCenter *center; + int fd; + // positive means no errno, 0 means eof, nagetive means error + int16_t _errno = 1; + tcp_option _option; + EventCallbackRef delayed_ack_event; + Tub<uint64_t> _delayed_ack_fd; + // Retransmission timeout + std::chrono::microseconds _rto{1000*1000}; + std::chrono::microseconds _persist_time_out{1000*1000}; + static constexpr std::chrono::microseconds _rto_min{1000*1000}; + static constexpr std::chrono::microseconds _rto_max{60000*1000}; + // Clock granularity + static constexpr std::chrono::microseconds _rto_clk_granularity{1000}; + static constexpr uint16_t _max_nr_retransmit{5}; + EventCallbackRef retransmit_event; + Tub<uint64_t> retransmit_fd; + EventCallbackRef persist_event; + EventCallbackRef all_data_ack_event; + Tub<uint64_t> persist_fd; + uint16_t _nr_full_seg_received = 0; + struct isn_secret { + // 512 bits secretkey for ISN generating + uint32_t key[16]; + isn_secret () { + for (auto& k : key) { + k = ceph::util::generate_random_number<uint32_t>(0, std::numeric_limits<uint32_t>::max()); + } + } + }; + static isn_secret _isn_secret; + tcp_sequence get_isn(); + circular_buffer<typename InetTraits::l4packet> _packetq; + bool _poll_active = false; + public: + // callback + void close_final_cleanup(); + ostream& _prefix(std::ostream *_dout); + + public: + tcb(tcp& t, connid id); + ~tcb(); + void input_handle_listen_state(tcp_hdr* th, Packet p); + void input_handle_syn_sent_state(tcp_hdr* th, Packet p); + void input_handle_other_state(tcp_hdr* th, Packet p); + void output_one(bool data_retransmit = false); + bool is_all_data_acked(); + int send(Packet p); + void connect(); + Tub<Packet> read(); + void close(); + void remove_from_tcbs() { + auto id = connid{_local_ip, _foreign_ip, _local_port, _foreign_port}; + _tcp._tcbs.erase(id); + } + Tub<typename InetTraits::l4packet> get_packet(); + void output() { + if (!_poll_active) { + _poll_active = true; + + auto tcb = this->shared_from_this(); + _tcp._inet.wait_l2_dst_address(_foreign_ip, Packet(), [tcb] (const ethernet_address &dst, Packet p, int r) { + if (r == 0) { + tcb->_tcp.poll_tcb(dst, std::move(tcb)); + } else if (r == -ETIMEDOUT) { + // in other states connection should time out + if (tcb->in_state(SYN_SENT)) { + tcb->_errno = -ETIMEDOUT; + tcb->cleanup(); + } + } else if (r == -EBUSY) { + // retry later + tcb->_poll_active = false; + tcb->start_retransmit_timer(); + } + }); + } + } + + int16_t get_errno() const { + return _errno; + } + + tcp_state& state() { + return _state; + } + + uint64_t peek_sent_available() { + if (!in_state(ESTABLISHED)) + return 0; + uint64_t left = _snd.user_queue_space.get_max() - _snd.user_queue_space.get_current(); + return left; + } + + int is_connected() const { + if (_errno <= 0) + return _errno; + return _connect_done; + } + + private: + void respond_with_reset(tcp_hdr* th); + bool merge_out_of_order(); + void insert_out_of_order(tcp_sequence seq, Packet p); + void trim_receive_data_after_window(); + bool should_send_ack(uint16_t seg_len); + void clear_delayed_ack(); + Packet get_transmit_packet(); + void retransmit_one() { + bool data_retransmit = true; + output_one(data_retransmit); + } + void start_retransmit_timer() { + if (retransmit_fd) + center->delete_time_event(*retransmit_fd); + retransmit_fd.construct(center->create_time_event(_rto.count(), retransmit_event)); + }; + void stop_retransmit_timer() { + if (retransmit_fd) { + 
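+        // the timer is armed: cancel the pending time event and clear the handle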
center->delete_time_event(*retransmit_fd); + retransmit_fd.destroy(); + } + }; + void start_persist_timer() { + if (persist_fd) + center->delete_time_event(*persist_fd); + persist_fd.construct(center->create_time_event(_persist_time_out.count(), persist_event)); + }; + void stop_persist_timer() { + if (persist_fd) { + center->delete_time_event(*persist_fd); + persist_fd.destroy(); + } + }; + void persist(); + void retransmit(); + void fast_retransmit(); + void update_rto(clock_type::time_point tx_time); + void update_cwnd(uint32_t acked_bytes); + void cleanup(); + uint32_t can_send() { + if (_snd.window_probe) { + return 1; + } + // Can not send more than advertised window allows + auto x = std::min(uint32_t(_snd.unacknowledged + _snd.window - _snd.next), _snd.unsent_len); + // Can not send more than congestion window allows + x = std::min(_snd.cwnd, x); + if (_snd.dupacks == 1 || _snd.dupacks == 2) { + // RFC5681 Step 3.1 + // Send cwnd + 2 * smss per RFC3042 + auto flight = flight_size(); + auto max = _snd.cwnd + 2 * _snd.mss; + x = flight <= max ? std::min(x, max - flight) : 0; + _snd.limited_transfer += x; + } else if (_snd.dupacks >= 3) { + // RFC5681 Step 3.5 + // Sent 1 full-sized segment at most + x = std::min(uint32_t(_snd.mss), x); + } + return x; + } + uint32_t flight_size() { + uint32_t size = 0; + std::for_each(_snd.data.begin(), _snd.data.end(), + [&] (unacked_segment& seg) { size += seg.p.len(); }); + return size; + } + uint16_t local_mss() { + return _tcp.get_hw_features().mtu - tcp_hdr_len_min - InetTraits::ip_hdr_len_min; + } + void queue_packet(Packet p) { + _packetq.emplace_back( + typename InetTraits::l4packet{_foreign_ip, std::move(p)}); + } + void signal_data_received() { + manager.notify(fd, EVENT_READABLE); + } + void signal_all_data_acked() { + if (_snd._all_data_acked_fd >= 0 && _snd.unsent_len == 0 && _snd.queued_len == 0) + manager.notify(_snd._all_data_acked_fd, EVENT_READABLE); + } + void do_syn_sent() { + _state = SYN_SENT; + _snd.syn_tx_time = clock_type::now(); + // Send <SYN> to remote + output(); + } + void do_syn_received() { + _state = SYN_RECEIVED; + _snd.syn_tx_time = clock_type::now(); + // Send <SYN,ACK> to remote + output(); + } + void do_established() { + _state = ESTABLISHED; + update_rto(_snd.syn_tx_time); + _connect_done = true; + manager.notify(fd, EVENT_READABLE|EVENT_WRITABLE); + } + void do_reset() { + _state = CLOSED; + // Free packets to be sent which are waiting for user_queue_space + _snd.user_queue_space.reset(); + cleanup(); + _errno = -ECONNRESET; + manager.notify(fd, EVENT_READABLE); + + if (_snd._all_data_acked_fd >= 0) + manager.notify(_snd._all_data_acked_fd, EVENT_READABLE); + } + void do_time_wait() { + // FIXME: Implement TIME_WAIT state timer + _state = TIME_WAIT; + cleanup(); + } + void do_closed() { + _state = CLOSED; + cleanup(); + } + void do_setup_isn() { + _snd.initial = get_isn(); + _snd.unacknowledged = _snd.initial; + _snd.next = _snd.initial + 1; + _snd.recover = _snd.initial; + } + void do_local_fin_acked() { + _snd.unacknowledged += 1; + _snd.next += 1; + } + bool syn_needs_on() { + return in_state(SYN_SENT | SYN_RECEIVED); + } + bool fin_needs_on() { + return in_state(FIN_WAIT_1 | CLOSING | LAST_ACK) && _snd.closed && + _snd.unsent_len == 0 && _snd.queued_len == 0; + } + bool ack_needs_on() { + return !in_state(CLOSED | LISTEN | SYN_SENT); + } + bool foreign_will_not_send() { + return in_state(CLOSING | TIME_WAIT | CLOSE_WAIT | LAST_ACK | CLOSED); + } + bool in_state(tcp_state state) { + return uint16_t(_state) 
& uint16_t(state); + } + void exit_fast_recovery() { + _snd.dupacks = 0; + _snd.limited_transfer = 0; + _snd.partial_ack = 0; + } + uint32_t data_segment_acked(tcp_sequence seg_ack); + bool segment_acceptable(tcp_sequence seg_seq, unsigned seg_len); + void init_from_options(tcp_hdr* th, uint8_t* opt_start, uint8_t* opt_end); + friend class connection; + + friend class C_handle_delayed_ack; + friend class C_handle_retransmit; + friend class C_handle_persist; + friend class C_all_data_acked; + }; + + CephContext *cct; + // ipv4_l4<ip_protocol_num::tcp> + inet_type& _inet; + EventCenter *center; + UserspaceEventManager &manager; + std::unordered_map<connid, lw_shared_ptr<tcb>, connid_hash> _tcbs; + std::unordered_map<uint16_t, listener*> _listening; + std::random_device _rd; + std::default_random_engine _e; + std::uniform_int_distribution<uint16_t> _port_dist{41952, 65535}; + circular_buffer<std::pair<lw_shared_ptr<tcb>, ethernet_address>> _poll_tcbs; + // queue for packets that do not belong to any tcb + circular_buffer<ipv4_traits::l4packet> _packetq; + Throttle _queue_space; + // Limit number of data queued into send queue + public: + class connection { + lw_shared_ptr<tcb> _tcb; + public: + explicit connection(lw_shared_ptr<tcb> tcbp) : _tcb(std::move(tcbp)) { _tcb->_conn = this; } + connection(const connection&) = delete; + connection(connection&& x) noexcept : _tcb(std::move(x._tcb)) { + _tcb->_conn = this; + } + ~connection(); + void operator=(const connection&) = delete; + connection& operator=(connection&& x) { + if (this != &x) { + this->~connection(); + new (this) connection(std::move(x)); + } + return *this; + } + int fd() const { + return _tcb->fd; + } + int send(Packet p) { + return _tcb->send(std::move(p)); + } + Tub<Packet> read() { + return _tcb->read(); + } + int16_t get_errno() const { + return _tcb->get_errno(); + } + void close_read(); + void close_write(); + entity_addr_t remote_addr() const { + entity_addr_t addr; + auto net_ip = _tcb->_foreign_ip.hton(); + memcpy((void*)&addr.in4_addr().sin_addr.s_addr, + &net_ip, sizeof(addr.in4_addr().sin_addr.s_addr)); + addr.set_family(AF_INET); + return addr; + } + uint64_t peek_sent_available() { + return _tcb->peek_sent_available(); + } + int is_connected() const { return _tcb->is_connected(); } + }; + class listener { + tcp& _tcp; + uint16_t _port; + int _fd = -1; + int16_t _errno; + queue<connection> _q; + size_t _q_max_length; + + private: + listener(tcp& t, uint16_t port, size_t queue_length) + : _tcp(t), _port(port), _errno(0), _q(), _q_max_length(queue_length) { + } + public: + listener(const listener&) = delete; + void operator=(const listener&) = delete; + listener(listener&& x) + : _tcp(x._tcp), _port(x._port), _fd(std::move(x._fd)), _errno(x._errno), + _q(std::move(x._q)) { + if (_fd >= 0) + _tcp._listening[_port] = this; + } + ~listener() { + abort_accept(); + } + int listen() { + if (_tcp._listening.find(_port) != _tcp._listening.end()) + return -EADDRINUSE; + _tcp._listening.emplace(_port, this); + _fd = _tcp.manager.get_eventfd(); + return 0; + } + Tub<connection> accept() { + Tub<connection> c; + if (!_q.empty()) { + c = std::move(_q.front()); + _q.pop(); + } + return c; + } + void abort_accept() { + while (!_q.empty()) + _q.pop(); + if (_fd >= 0) { + _tcp._listening.erase(_port); + _tcp.manager.close(_fd); + _fd = -1; + } + } + int16_t get_errno() const { + return _errno; + } + bool full() const { + return _q.size() == _q_max_length; + } + int fd() const { + return _fd; + } + friend class tcp; + }; + public: + 
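+  // tcp owns the per-core TCB table (_tcbs) and registers itself with the
+  // inet layer as a packet provider in the constructor defined below.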
explicit tcp(CephContext *c, inet_type& inet, EventCenter *cen); + void received(Packet p, ipaddr from, ipaddr to); + bool forward(forward_hash& out_hash_data, Packet& p, size_t off); + listener listen(uint16_t port, size_t queue_length = 100); + connection connect(const entity_addr_t &addr); + const hw_features& get_hw_features() const { return _inet._inet.get_hw_features(); } + void poll_tcb(const ethernet_address &dst, lw_shared_ptr<tcb> tcb) { + _poll_tcbs.emplace_back(std::move(tcb), dst); + } + bool push_listen_queue(uint16_t port, tcb *t) { + auto listener = _listening.find(port); + if (listener == _listening.end() || listener->second->full()) { + return false; + } + listener->second->_q.push(connection(t->shared_from_this())); + manager.notify(listener->second->_fd, EVENT_READABLE); + return true; + } + + private: + void send_packet_without_tcb(ipaddr from, ipaddr to, Packet p); + void respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip); + friend class listener; +}; + +template <typename InetTraits> +tcp<InetTraits>::tcp(CephContext *c, inet_type& inet, EventCenter *cen) + : cct(c), _inet(inet), center(cen), + manager(static_cast<DPDKDriver*>(cen->get_driver())->manager), + _e(_rd()), _queue_space(cct, "DPDK::tcp::queue_space", 81920) { + int tcb_polled = 0u; + _inet.register_packet_provider([this, tcb_polled] () mutable { + Tub<typename InetTraits::l4packet> l4p; + auto c = _poll_tcbs.size(); + if (!_packetq.empty() && (!(tcb_polled % 128) || c == 0)) { + l4p = std::move(_packetq.front()); + _packetq.pop_front(); + _queue_space.put(l4p->p.len()); + } else { + while (c--) { + tcb_polled++; + lw_shared_ptr<tcb> tcb; + ethernet_address dst; + std::tie(tcb, dst) = std::move(_poll_tcbs.front()); + _poll_tcbs.pop_front(); + l4p = std::move(tcb->get_packet()); + if (l4p) { + l4p->e_dst = dst; + break; + } + } + } + return l4p; + }); +} + +template <typename InetTraits> +auto tcp<InetTraits>::listen(uint16_t port, size_t queue_length) -> listener { + return listener(*this, port, queue_length); +} + +template <typename InetTraits> +typename tcp<InetTraits>::connection tcp<InetTraits>::connect(const entity_addr_t &addr) { + uint16_t src_port; + connid id; + auto src_ip = _inet._inet.host_address(); + auto dst_ip = ipv4_address(addr); + auto dst_port = addr.get_port(); + + do { + src_port = _port_dist(_e); + id = connid{src_ip, dst_ip, src_port, (uint16_t)dst_port}; + if (_tcbs.find(id) == _tcbs.end()) { + if (_inet._inet.netif()->hw_queues_count() == 1 || + _inet._inet.netif()->hash2cpu( + id.hash(_inet._inet.netif()->rss_key())) == center->get_id()) + break; + } + } while (true); + + auto tcbp = make_lw_shared<tcb>(*this, id); + _tcbs.insert({id, tcbp}); + tcbp->connect(); + return connection(tcbp); +} + +template <typename InetTraits> +bool tcp<InetTraits>::forward(forward_hash& out_hash_data, Packet& p, size_t off) { + auto th = p.get_header<tcp_hdr>(off); + if (th) { + out_hash_data.push_back(th->src_port); + out_hash_data.push_back(th->dst_port); + } + return true; +} + +template <typename InetTraits> +void tcp<InetTraits>::received(Packet p, ipaddr from, ipaddr to) { + auto th = p.get_header<tcp_hdr>(0); + if (!th) { + return; + } + // th->data_offset is correct even before ntoh() + if (unsigned(th->data_offset * 4) < sizeof(*th)) { + return; + } + + if (!get_hw_features().rx_csum_offload) { + checksummer csum; + InetTraits::tcp_pseudo_header_checksum(csum, from, to, p.len()); + csum.sum(p); + if (csum.get() != 0) { + return; + } + } + auto h = th->ntoh(); + auto id = 
connid{to, from, h.dst_port, h.src_port}; + auto tcbi = _tcbs.find(id); + lw_shared_ptr<tcb> tcbp; + if (tcbi == _tcbs.end()) { + auto listener = _listening.find(id.local_port); + if (listener == _listening.end() || listener->second->full()) { + // 1) In CLOSE state + // 1.1 all data in the incoming segment is discarded. An incoming + // segment containing a RST is discarded. An incoming segment not + // containing a RST causes a RST to be sent in response. + // FIXME: + // if ACK off: <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK> + // if ACK on: <SEQ=SEG.ACK><CTL=RST> + return respond_with_reset(&h, id.local_ip, id.foreign_ip); + } else { + // 2) In LISTEN state + // 2.1 first check for an RST + if (h.f_rst) { + // An incoming RST should be ignored + return; + } + // 2.2 second check for an ACK + if (h.f_ack) { + // Any acknowledgment is bad if it arrives on a connection + // still in the LISTEN state. + // <SEQ=SEG.ACK><CTL=RST> + return respond_with_reset(&h, id.local_ip, id.foreign_ip); + } + // 2.3 third check for a SYN + if (h.f_syn) { + // check the security + // NOTE: Ignored for now + tcbp = make_lw_shared<tcb>(*this, id); + _tcbs.insert({id, tcbp}); + return tcbp->input_handle_listen_state(&h, std::move(p)); + } + // 2.4 fourth other text or control + // So you are unlikely to get here, but if you do, drop the + // segment, and return. + return; + } + } else { + tcbp = tcbi->second; + if (tcbp->state() == tcp_state::SYN_SENT) { + // 3) In SYN_SENT State + return tcbp->input_handle_syn_sent_state(&h, std::move(p)); + } else { + // 4) In other state, can be one of the following: + // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2 + // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT + return tcbp->input_handle_other_state(&h, std::move(p)); + } + } +} + +// Send packet does not belong to any tcb +template <typename InetTraits> +void tcp<InetTraits>::send_packet_without_tcb(ipaddr from, ipaddr to, Packet p) { + if (_queue_space.get_or_fail(p.len())) { // drop packets that do not fit the queue + _inet.wait_l2_dst_address(to, std::move(p), [this, to] (const ethernet_address &e_dst, Packet p, int r) mutable { + if (r == 0) + _packetq.emplace_back(ipv4_traits::l4packet{to, std::move(p), e_dst, ip_protocol_num::tcp}); + }); + } +} + +template <typename InetTraits> +tcp<InetTraits>::connection::~connection() { + if (_tcb) { + _tcb->_conn = nullptr; + close_read(); + close_write(); + } +} + +template <typename InetTraits> +tcp<InetTraits>::tcb::tcb(tcp& t, connid id) + : _tcp(t), manager(t.manager), _local_ip(id.local_ip) , _foreign_ip(id.foreign_ip), + _local_port(id.local_port), _foreign_port(id.foreign_port), + _snd(_tcp.cct), + center(t.center), + fd(t.manager.get_eventfd()), + delayed_ack_event(new tcp<InetTraits>::C_handle_delayed_ack(this)), + retransmit_event(new tcp<InetTraits>::C_handle_retransmit(this)), + persist_event(new tcp<InetTraits>::C_handle_persist(this)), + all_data_ack_event(new tcp<InetTraits>::C_all_data_acked(this)) {} + +template <typename InetTraits> +tcp<InetTraits>::tcb::~tcb() +{ + if (_delayed_ack_fd) + center->delete_time_event(*_delayed_ack_fd); + if (retransmit_fd) + center->delete_time_event(*retransmit_fd); + if (persist_fd) + center->delete_time_event(*persist_fd); + delete delayed_ack_event; + delete retransmit_event; + delete persist_event; + delete all_data_ack_event; + manager.close(fd); + fd = -1; +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::respond_with_reset(tcp_hdr* rth) +{ + _tcp.respond_with_reset(rth, _local_ip, _foreign_ip); +} + 
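+// Illustrative note (not part of this commit): the tcp_sequence operators
+// defined above use a signed 32-bit difference, i.e. serial-number
+// arithmetic, which keeps comparisons such as SND.UNA <= SEG.ACK correct
+// across sequence-number wrap-around. For example:
+//
+//   tcp_sequence a = make_seq(0xfffffff0);
+//   tcp_sequence b = a + 0x20;   // raw value wraps to 0x00000010
+//   ceph_assert(a < b);          // a - b == -32, so ordering survives
+//   ceph_assert(b - a == 0x20);
+//
+// data_segment_acked() below relies on exactly this property.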
+template <typename InetTraits> +uint32_t tcp<InetTraits>::tcb::data_segment_acked(tcp_sequence seg_ack) { + uint32_t total_acked_bytes = 0; + // Full ACK of segment + while (!_snd.data.empty() + && (_snd.unacknowledged + _snd.data.front().p.len() <= seg_ack)) { + auto acked_bytes = _snd.data.front().p.len(); + _snd.unacknowledged += acked_bytes; + // Ignore retransmitted segments when setting the RTO + if (_snd.data.front().nr_transmits == 0) { + update_rto(_snd.data.front().tx_time); + } + update_cwnd(acked_bytes); + total_acked_bytes += acked_bytes; + _snd.user_queue_space.put(_snd.data.front().data_len); + manager.notify(fd, EVENT_WRITABLE); + _snd.data.pop_front(); + } + // Partial ACK of segment + if (_snd.unacknowledged < seg_ack) { + auto acked_bytes = seg_ack - _snd.unacknowledged; + if (!_snd.data.empty()) { + auto& unacked_seg = _snd.data.front(); + unacked_seg.p.trim_front(acked_bytes); + } + _snd.unacknowledged = seg_ack; + update_cwnd(acked_bytes); + total_acked_bytes += acked_bytes; + } + return total_acked_bytes; +} + +template <typename InetTraits> +bool tcp<InetTraits>::tcb::segment_acceptable(tcp_sequence seg_seq, unsigned seg_len) { + if (seg_len == 0 && _rcv.window == 0) { + // SEG.SEQ = RCV.NXT + return seg_seq == _rcv.next; + } else if (seg_len == 0 && _rcv.window > 0) { + // RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND + return (_rcv.next <= seg_seq) && (seg_seq < _rcv.next + _rcv.window); + } else if (seg_len > 0 && _rcv.window > 0) { + // RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND + // or + // RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND + bool x = (_rcv.next <= seg_seq) && seg_seq < (_rcv.next + _rcv.window); + bool y = (_rcv.next <= seg_seq + seg_len - 1) && (seg_seq + seg_len - 1 < _rcv.next + _rcv.window); + return x || y; + } else { + // SEG.LEN > 0 RCV.WND = 0, not acceptable + return false; + } +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::init_from_options(tcp_hdr* th, uint8_t* opt_start, uint8_t* opt_end) { + // Handle tcp options + _option.parse(opt_start, opt_end); + + // Remote receive window scale factor + _snd.window_scale = _option._remote_win_scale; + // Local receive window scale factor + _rcv.window_scale = _option._local_win_scale; + + // Maximum segment size remote can receive + _snd.mss = _option._remote_mss; + // Maximum segment size local can receive + _rcv.mss = _option._local_mss = local_mss(); + + // Linux's default window size + _rcv.window = 29200 << _rcv.window_scale; + _snd.window = th->window << _snd.window_scale; + + // Segment sequence number used for last window update + _snd.wl1 = th->seq; + // Segment acknowledgment number used for last window update + _snd.wl2 = th->ack; + + // Setup initial congestion window + if (2190 < _snd.mss) { + _snd.cwnd = 2 * _snd.mss; + } else if (1095 < _snd.mss && _snd.mss <= 2190) { + _snd.cwnd = 3 * _snd.mss; + } else { + _snd.cwnd = 4 * _snd.mss; + } + + // Setup initial slow start threshold + _snd.ssthresh = th->window << _snd.window_scale; +} + +template <typename InetTraits> +Packet tcp<InetTraits>::tcb::get_transmit_packet() { + // easy case: empty queue + if (_snd.unsent.empty()) { + return Packet(); + } + auto can_send = this->can_send(); + // Max number of TCP payloads we can pass to NIC + uint32_t len; + if (_tcp.get_hw_features().tx_tso) { + // FIXME: Info tap device the size of the split packet + len = _tcp.get_hw_features().max_packet_len - tcp_hdr_len_min - InetTraits::ip_hdr_len_min; + } else { + len = std::min(uint16_t(_tcp.get_hw_features().mtu - tcp_hdr_len_min - 
+                   InetTraits::ip_hdr_len_min), _snd.mss);
+  }
+  can_send = std::min(can_send, len);
+  // easy case: one small packet
+  if (_snd.unsent.front().len() <= can_send) {
+    auto p = std::move(_snd.unsent.front());
+    _snd.unsent.pop_front();
+    _snd.unsent_len -= p.len();
+    return p;
+  }
+  // moderate case: need to split one packet
+  if (_snd.unsent.front().len() > can_send) {
+    auto p = _snd.unsent.front().share(0, can_send);
+    _snd.unsent.front().trim_front(can_send);
+    _snd.unsent_len -= p.len();
+    return p;
+  }
+  // hard case: merge some packets, possibly split the last
+  auto p = std::move(_snd.unsent.front());
+  _snd.unsent.pop_front();
+  can_send -= p.len();
+  while (!_snd.unsent.empty()
+         && _snd.unsent.front().len() <= can_send) {
+    can_send -= _snd.unsent.front().len();
+    p.append(std::move(_snd.unsent.front()));
+    _snd.unsent.pop_front();
+  }
+  // FIXME: this would invoke the packet's "deleter" and free managed
+  // objects that are still needed later
+  // if (!_snd.unsent.empty() && can_send) {
+  //   auto& q = _snd.unsent.front();
+  //   p.append(q.share(0, can_send));
+  //   q.trim_front(can_send);
+  // }
+  _snd.unsent_len -= p.len();
+  return p;
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::output_one(bool data_retransmit) {
+  if (in_state(CLOSED)) {
+    return;
+  }
+
+  Packet p = data_retransmit ? _snd.data.front().p.share() : get_transmit_packet();
+  Packet clone = p.share();  // early clone to prevent share() from calling packet::unuse_internal_data() on header.
+  uint16_t len = p.len();
+  bool syn_on = syn_needs_on();
+  bool ack_on = ack_needs_on();
+
+  auto options_size = _option.get_size(syn_on, ack_on);
+  auto th = p.prepend_header<tcp_hdr>(options_size);
+
+  th->src_port = _local_port;
+  th->dst_port = _foreign_port;
+
+  th->f_syn = syn_on;
+  th->f_ack = ack_on;
+  if (ack_on) {
+    clear_delayed_ack();
+  }
+  th->f_urg = false;
+  th->f_psh = false;
+
+  tcp_sequence seq;
+  if (data_retransmit) {
+    seq = _snd.unacknowledged;
+  } else {
+    seq = syn_on ? _snd.initial : _snd.next;
+    _snd.next += len;
+  }
+  th->seq = seq;
+  th->ack = _rcv.next;
+  th->data_offset = (sizeof(*th) + options_size) / 4;
+  th->window = _rcv.window >> _rcv.window_scale;
+  th->checksum = 0;
+
+  // FIXME: does the FIN have to fit in the window?
+  bool fin_on = fin_needs_on();
+  th->f_fin = fin_on;
+
+  // Add tcp options
+  _option.fill(th, options_size);
+  *th = th->hton();
+
+  offload_info oi;
+  checksummer csum;
+  uint16_t pseudo_hdr_seg_len = 0;
+
+  oi.tcp_hdr_len = sizeof(tcp_hdr) + options_size;
+
+  if (_tcp.get_hw_features().tx_csum_l4_offload) {
+    oi.needs_csum = true;
+
+    //
+    // tx checksum offloading: both virtio-net's VIRTIO_NET_F_CSUM and
+    // dpdk's PKT_TX_TCP_CKSUM require th->checksum to be initialized to
+    // the ones' complement sum of the pseudo header.
+    //
+    // For TSO the csum should be calculated for a pseudo header with
+    // segment length set to 0. All the rest is the same as for a TCP Tx
+    // CSUM offload case.
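+    // E.g. with checksum offload but no TSO, pseudo_hdr_seg_len below is
+    // sizeof(tcp_hdr) + options + payload, and th->checksum is seeded with
+    // the ones' complement (~csum.get()) of the pseudo-header sum; for TSO
+    // the segment length stays 0, as noted above.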
+ // + if (_tcp.get_hw_features().tx_tso && len > _snd.mss) { + oi.tso_seg_size = _snd.mss; + } else { + pseudo_hdr_seg_len = sizeof(*th) + options_size + len; + } + } else { + pseudo_hdr_seg_len = sizeof(*th) + options_size + len; + oi.needs_csum = false; + } + + InetTraits::tcp_pseudo_header_checksum(csum, _local_ip, _foreign_ip, + pseudo_hdr_seg_len); + + if (_tcp.get_hw_features().tx_csum_l4_offload) { + th->checksum = ~csum.get(); + } else { + csum.sum(p); + th->checksum = csum.get(); + } + + oi.protocol = ip_protocol_num::tcp; + + p.set_offload_info(oi); + + if (!data_retransmit && (len || syn_on || fin_on)) { + auto now = clock_type::now(); + if (len) { + unsigned nr_transmits = 0; + _snd.data.emplace_back(unacked_segment{std::move(clone), + len, nr_transmits, now}); + } + if (!retransmit_fd) { + start_retransmit_timer(); + } + } + + queue_packet(std::move(p)); +} + +template <typename InetTraits> +bool tcp<InetTraits>::tcb::is_all_data_acked() { + if (_snd.data.empty() && _snd.unsent_len == 0 && _snd.queued_len == 0) { + return true; + } + return false; +} + +template <typename InetTraits> +Tub<Packet> tcp<InetTraits>::tcb::read() { + Tub<Packet> p; + if (_rcv.data.empty()) + return p; + + p.construct(); + for (auto&& q : _rcv.data) { + p->append(std::move(q)); + } + _rcv.data.clear(); + return p; +} + +template <typename InetTraits> +int tcp<InetTraits>::tcb::send(Packet p) { + // We can not send after the connection is closed + ceph_assert(!_snd.closed); + + if (in_state(CLOSED)) + return -ECONNRESET; + + auto len = p.len(); + if (!_snd.user_queue_space.get_or_fail(len)) { + // note: caller must ensure enough queue space to send + ceph_abort(); + } + // TODO: Handle p.len() > max user_queue_space case + _snd.queued_len += len; + _snd.unsent_len += len; + _snd.queued_len -= len; + _snd.unsent.push_back(std::move(p)); + if (can_send() > 0) { + output(); + } + return len; +} + +template <typename InetTraits> +void tcp<InetTraits>::tcb::close() { + if (in_state(CLOSED) || _snd.closed) { + return ; + } + // TODO: We should make this asynchronous + + _errno = -EPIPE; + center->delete_file_event(fd, EVENT_READABLE|EVENT_WRITABLE); + bool acked = is_all_data_acked(); + if (!acked) { + _snd._all_data_acked_fd = manager.get_eventfd(); + center->create_file_event(_snd._all_data_acked_fd, EVENT_READABLE, all_data_ack_event); + } else { + close_final_cleanup(); + } +} + +template <typename InetTraits> +bool tcp<InetTraits>::tcb::should_send_ack(uint16_t seg_len) { + // We've received a TSO packet, do ack immediately + if (seg_len > _rcv.mss) { + _nr_full_seg_received = 0; + if (_delayed_ack_fd) { + center->delete_time_event(*_delayed_ack_fd); + _delayed_ack_fd.destroy(); + } + return true; + } + + // We've received a full sized segment, ack for every second full sized segment + if (seg_len == _rcv.mss) { + if (_nr_full_seg_received++ >= 1) { + _nr_full_seg_received = 0; + if (_delayed_ack_fd) { + center->delete_time_event(*_delayed_ack_fd); + _delayed_ack_fd.destroy(); + } + return true; + } + } + + // If the timer is armed and its callback hasn't been run. + if (_delayed_ack_fd) { + return false; + } + + // If the timer is not armed, schedule a delayed ACK. + // The maximum delayed ack timer allowed by RFC1122 is 500ms, most + // implementations use 200ms. 
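+  // (create_time_event() takes microseconds, so 200*1000 below is 200 ms,
+  // within RFC1122's 500 ms cap.)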
+  _delayed_ack_fd.construct(center->create_time_event(200*1000, delayed_ack_event));
+  return false;
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::clear_delayed_ack() {
+  if (_delayed_ack_fd) {
+    center->delete_time_event(*_delayed_ack_fd);
+    _delayed_ack_fd.destroy();
+  }
+}
+
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::merge_out_of_order() {
+  bool merged = false;
+  if (_rcv.out_of_order.map.empty()) {
+    return merged;
+  }
+  for (auto it = _rcv.out_of_order.map.begin(); it != _rcv.out_of_order.map.end();) {
+    auto& p = it->second;
+    auto seg_beg = it->first;
+    auto seg_len = p.len();
+    auto seg_end = seg_beg + seg_len;
+    if (seg_beg <= _rcv.next && seg_end > _rcv.next) {
+      // This segment has been received out of order and its previous
+      // segment has been received now
+      auto trim = _rcv.next - seg_beg;
+      if (trim) {
+        p.trim_front(trim);
+        seg_len -= trim;
+      }
+      _rcv.next += seg_len;
+      _rcv.data.push_back(std::move(p));
+      // Since C++11, erase() returns an iterator to the following element
+      it = _rcv.out_of_order.map.erase(it);
+      merged = true;
+    } else if (_rcv.next >= seg_end) {
+      // This segment has been received already, drop it
+      it = _rcv.out_of_order.map.erase(it);
+    } else {
+      // seg_beg > _rcv.next, can not merge. Note, seg_beg can only grow,
+      // so we can stop looking here.
+      it++;
+      break;
+    }
+  }
+  return merged;
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::insert_out_of_order(tcp_sequence seg, Packet p) {
+  _rcv.out_of_order.merge(seg, std::move(p));
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::trim_receive_data_after_window() {
+  abort();
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::fast_retransmit() {
+  if (!_snd.data.empty()) {
+    auto& unacked_seg = _snd.data.front();
+    unacked_seg.nr_transmits++;
+    retransmit_one();
+    output();
+  }
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::update_rto(clock_type::time_point tx_time) {
+  // Update RTO according to RFC6298
+  auto R = std::chrono::duration_cast<std::chrono::microseconds>(clock_type::now() - tx_time);
+  if (_snd.first_rto_sample) {
+    _snd.first_rto_sample = false;
+    // RTTVAR <- R/2
+    // SRTT <- R
+    _snd.rttvar = R / 2;
+    _snd.srtt = R;
+  } else {
+    // RTTVAR <- (1 - beta) * RTTVAR + beta * |SRTT - R'|
+    // SRTT <- (1 - alpha) * SRTT + alpha * R'
+    // where alpha = 1/8 and beta = 1/4
+    auto delta = _snd.srtt > R ? (_snd.srtt - R) : (R - _snd.srtt);
+    _snd.rttvar = _snd.rttvar * 3 / 4 + delta / 4;
+    _snd.srtt = _snd.srtt * 7 / 8 + R / 8;
+  }
+  // RTO <- SRTT + max(G, K * RTTVAR)
+  _rto = _snd.srtt + std::max(_rto_clk_granularity, 4 * _snd.rttvar);
+
+  // Make sure 1 sec <= _rto <= 60 sec
+  _rto = std::max(_rto, _rto_min);
+  _rto = std::min(_rto, _rto_max);
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::update_cwnd(uint32_t acked_bytes) {
+  uint32_t smss = _snd.mss;
+  if (_snd.cwnd < _snd.ssthresh) {
+    // In slow start phase
+    _snd.cwnd += std::min(acked_bytes, smss);
+  } else {
+    // In congestion avoidance phase
+    uint32_t round_up = 1;
+    _snd.cwnd += std::max(round_up, smss * smss / _snd.cwnd);
+  }
+}
+
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::cleanup() {
+  manager.notify(fd, EVENT_READABLE);
+  _snd.closed = true;
+  _snd.unsent.clear();
+  _snd.data.clear();
+  _rcv.out_of_order.map.clear();
+  _rcv.data.clear();
+  stop_retransmit_timer();
+  clear_delayed_ack();
+  center->dispatch_event_external(new tcp<InetTraits>::C_actual_remove_tcb(this));
+  remove_from_tcbs();
+}
+
+template <typename InetTraits>
+tcp_sequence tcp<InetTraits>::tcb::get_isn() {
+  // Per RFC6528, TCP SHOULD generate its Initial Sequence Numbers
+  // with the expression:
+  //   ISN = M + F(localip, localport, remoteip, remoteport, secretkey)
+  // M is the 4 microsecond timer
+  using namespace std::chrono;
+  uint32_t hash[4];
+  hash[0] = _local_ip.ip;
+  hash[1] = _foreign_ip.ip;
+  hash[2] = (_local_port << 16) + _foreign_port;
+  hash[3] = _isn_secret.key[15];
+  ceph::crypto::MD5 md5;
+  md5.Update((const unsigned char*)_isn_secret.key, sizeof(_isn_secret.key));
+  md5.Final((unsigned char*)hash);
+  auto seq = hash[0];
+  auto m = duration_cast<microseconds>(clock_type::now().time_since_epoch());
+  seq += m.count() / 4;
+  return make_seq(seq);
+}
+
+template <typename InetTraits>
+Tub<typename InetTraits::l4packet> tcp<InetTraits>::tcb::get_packet() {
+  _poll_active = false;
+  if (_packetq.empty()) {
+    output_one();
+  }
+
+  Tub<typename InetTraits::l4packet> p;
+  if (in_state(CLOSED)) {
+    return p;
+  }
+
+  ceph_assert(!_packetq.empty());
+
+  p = std::move(_packetq.front());
+  _packetq.pop_front();
+  if (!_packetq.empty() || (_snd.dupacks < 3 && can_send() > 0)) {
+    // If there are packets to send in the queue, or the tcb is allowed to
+    // send more, add the tcb back to the polling set to keep sending. In
+    // addition, dupacks >= 3 is an indication that a segment was lost, so
+    // stop sending more in that case.
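+    // (The actual fast-retransmit response to 3 dupacks is driven from the
+    // ACK-processing path in input_handle_other_state(), not from here.)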
+    output();
+  }
+  return p;
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::connection::close_read() {
+  // do nothing
+  // _tcb->manager.notify(_tcb->fd, EVENT_READABLE);
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::connection::close_write() {
+  _tcb->close();
+}
+
+template <typename InetTraits>
+constexpr uint16_t tcp<InetTraits>::tcb::_max_nr_retransmit;
+
+template <typename InetTraits>
+constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_min;
+
+template <typename InetTraits>
+constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_max;
+
+template <typename InetTraits>
+constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_clk_granularity;
+
+template <typename InetTraits>
+typename tcp<InetTraits>::tcb::isn_secret tcp<InetTraits>::tcb::_isn_secret;
+
+
+#endif /* CEPH_DPDK_TCP_H_ */
diff --git a/src/msg/async/dpdk/UserspaceEvent.cc b/src/msg/async/dpdk/UserspaceEvent.cc
new file mode 100644
index 00000000..282dcef1
--- /dev/null
+++ b/src/msg/async/dpdk/UserspaceEvent.cc
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "UserspaceEvent.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdk "
+
+int UserspaceEventManager::get_eventfd()
+{
+  int fd;
+  if (!unused_fds.empty()) {
+    fd = unused_fds.front();
+    unused_fds.pop_front();
+  } else {
+    fd = ++max_fd;
+    fds.resize(fd + 1);
+  }
+
+  Tub<UserspaceFDImpl> &impl = fds[fd];
+  ceph_assert(!impl);
+  impl.construct();
+  ldout(cct, 20) << __func__ << " fd=" << fd << dendl;
+  return fd;
+}
+
+int UserspaceEventManager::notify(int fd, int mask)
+{
+  ldout(cct, 20) << __func__ << " fd=" << fd << " mask=" << mask << dendl;
+  if ((size_t)fd >= fds.size())
+    return -ENOENT;
+
+  Tub<UserspaceFDImpl> &impl = fds[fd];
+  if (!impl)
+    return -ENOENT;
+
+  ldout(cct, 20) << __func__ << " activating=" << int(impl->activating_mask)
+                 << " listening=" << int(impl->listening_mask)
+                 << " waiting_idx=" << int(impl->waiting_idx) << dendl;
+
+  impl->activating_mask |= mask;
+  if (impl->waiting_idx)
+    return 0;
+
+  if (impl->listening_mask & mask) {
+    if (waiting_fds.size() <= max_wait_idx)
+      waiting_fds.resize(waiting_fds.size()*2);
+    impl->waiting_idx = ++max_wait_idx;
+    waiting_fds[max_wait_idx] = fd;
+  }
+
+  ldout(cct, 20) << __func__ << " activating=" << int(impl->activating_mask)
+                 << " listening=" << int(impl->listening_mask)
+                 << " waiting_idx=" << int(impl->waiting_idx) << " done " << dendl;
+  return 0;
+}
+
+void UserspaceEventManager::close(int fd)
+{
+  ldout(cct, 20) << __func__ << " fd=" << fd << dendl;
+  if ((size_t)fd >= fds.size())
+    return;
+
+  Tub<UserspaceFDImpl> &impl = fds[fd];
+  if (!impl)
+    return;
+
+  if (fd == max_fd)
+    --max_fd;
+  else
+    unused_fds.push_back(fd);
+
+  if (impl->activating_mask) {
+    if (waiting_fds[max_wait_idx] == fd) {
+      ceph_assert(impl->waiting_idx == max_wait_idx);
+      --max_wait_idx;
+    }
+    waiting_fds[impl->waiting_idx] = -1;
+  }
+  impl.destroy();
+}
+
+int UserspaceEventManager::poll(int *events, int *masks, int num_events, struct timeval *tp)
+{
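+  // Walk waiting_fds[1..max_wait_idx]: for each ready fd report
+  // (fd, listening & activating) into events/masks, clear the consumed
+  // activating bits, and compact what remains to the front of the queue.
+  // Returns the number of events reported; tp is currently unused.
+  //
+  // A minimal usage sketch (hypothetical driver loop, not in this commit):
+  //
+  //   int events[32], masks[32];
+  //   int n = manager.poll(events, masks, 32, nullptr);
+  //   for (int k = 0; k < n; ++k)
+  //     dispatch(events[k], masks[k]);   // hypothetical dispatch()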
+ int fd; + uint32_t i = 0; + int count = 0; + ceph_assert(num_events); + // leave zero slot for waiting_fds + while (i < max_wait_idx) { + fd = waiting_fds[++i]; + if (fd == -1) + continue; + + events[count] = fd; + Tub<UserspaceFDImpl> &impl = fds[fd]; + ceph_assert(impl); + masks[count] = impl->listening_mask & impl->activating_mask; + ceph_assert(masks[count]); + ldout(cct, 20) << __func__ << " fd=" << fd << " mask=" << masks[count] << dendl; + impl->activating_mask &= (~masks[count]); + impl->waiting_idx = 0; + if (++count >= num_events) + break; + } + if (i < max_wait_idx) { + memmove(&waiting_fds[1], &waiting_fds[i+1], sizeof(int)*(max_wait_idx-i)); + } + max_wait_idx -= i; + return count; +} diff --git a/src/msg/async/dpdk/UserspaceEvent.h b/src/msg/async/dpdk/UserspaceEvent.h new file mode 100644 index 00000000..7e89517d --- /dev/null +++ b/src/msg/async/dpdk/UserspaceEvent.h @@ -0,0 +1,106 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_USERSPACEEVENT_H +#define CEPH_USERSPACEEVENT_H + +#include <cstddef> +#include <errno.h> +#include <string.h> + +#include <vector> +#include <list> + +#include "include/ceph_assert.h" +#include "include/int_types.h" +#include "common/Tub.h" + +class CephContext; + +class UserspaceEventManager { + struct UserspaceFDImpl { + uint32_t waiting_idx = 0; + int16_t read_errno = 0; + int16_t write_errno = 0; + int8_t listening_mask = 0; + int8_t activating_mask = 0; + uint32_t magic = 4921; + }; + CephContext *cct; + int max_fd = 0; + uint32_t max_wait_idx = 0; + std::vector<Tub<UserspaceFDImpl> > fds; + std::vector<int> waiting_fds; + std::list<uint32_t> unused_fds; + + public: + explicit UserspaceEventManager(CephContext *c): cct(c) { + waiting_fds.resize(1024); + } + + int get_eventfd(); + + int listen(int fd, int mask) { + if ((size_t)fd >= fds.size()) + return -ENOENT; + + Tub<UserspaceFDImpl> &impl = fds[fd]; + if (!impl) + return -ENOENT; + + impl->listening_mask |= mask; + if (impl->activating_mask & impl->listening_mask && !impl->waiting_idx) { + if (waiting_fds.size() <= max_wait_idx) + waiting_fds.resize(waiting_fds.size()*2); + impl->waiting_idx = ++max_wait_idx; + waiting_fds[max_wait_idx] = fd; + } + return 0; + } + + int unlisten(int fd, int mask) { + if ((size_t)fd >= fds.size()) + return -ENOENT; + + Tub<UserspaceFDImpl> &impl = fds[fd]; + if (!impl) + return -ENOENT; + + impl->listening_mask &= (~mask); + if (!(impl->activating_mask & impl->listening_mask) && impl->waiting_idx) { + if (waiting_fds[max_wait_idx] == fd) { + ceph_assert(impl->waiting_idx == max_wait_idx); + --max_wait_idx; + } + waiting_fds[impl->waiting_idx] = -1; + impl->waiting_idx = 0; + } + return 0; + } + + int notify(int fd, int mask); + void close(int fd); + int poll(int *events, int *masks, int num_events, struct timeval *tp); + + bool check() { + for (auto &&m : fds) { + if (m && m->magic != 4921) + return false; + } + return true; + } +}; + +#endif //CEPH_USERSPACEEVENT_H diff --git a/src/msg/async/dpdk/align.h b/src/msg/async/dpdk/align.h new file mode 100644 index 00000000..3b48f789 --- /dev/null +++ b/src/msg/async/dpdk/align.h @@ -0,0 +1,50 @@ +/* 
+ * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_DPDK_ALIGN_HH_ +#define CEPH_MSG_DPDK_ALIGN_HH_ + +#include <cstdint> +#include <cstdlib> + +template <typename T> +inline constexpr T align_up(T v, T align) { + return (v + align - 1) & ~(align - 1); +} + +template <typename T> +inline constexpr T* align_up(T* v, size_t align) { + static_assert(sizeof(T) == 1, "align byte pointers only"); + return reinterpret_cast<T*>(align_up(reinterpret_cast<uintptr_t>(v), align)); +} + +template <typename T> +inline constexpr T align_down(T v, T align) { + return v & ~(align - 1); +} + +template <typename T> +inline constexpr T* align_down(T* v, size_t align) { + static_assert(sizeof(T) == 1, "align byte pointers only"); + return reinterpret_cast<T*>(align_down(reinterpret_cast<uintptr_t>(v), align)); +} + +#endif /* CEPH_MSG_DPDK_ALIGN_HH_ */ diff --git a/src/msg/async/dpdk/array_map.h b/src/msg/async/dpdk/array_map.h new file mode 100644 index 00000000..40f7728d --- /dev/null +++ b/src/msg/async/dpdk/array_map.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
+ */ + +#ifndef CEPH_ARRAY_MAP_HH_ +#define CEPH_ARRAY_MAP_HH_ + +#include <array> + +// unordered_map implemented as a simple array + +template <typename Value, size_t Max> +class array_map { + std::array<Value, Max> _a {}; + public: + array_map(std::initializer_list<std::pair<size_t, Value>> i) { + for (auto kv : i) { + _a[kv.first] = kv.second; + } + } + Value& operator[](size_t key) { return _a[key]; } + const Value& operator[](size_t key) const { return _a[key]; } + + Value& at(size_t key) { + if (key >= Max) { + throw std::out_of_range(std::to_string(key) + " >= " + std::to_string(Max)); + } + return _a[key]; + } +}; + +#endif /* ARRAY_MAP_HH_ */ diff --git a/src/msg/async/dpdk/byteorder.h b/src/msg/async/dpdk/byteorder.h new file mode 100644 index 00000000..a996ec07 --- /dev/null +++ b/src/msg/async/dpdk/byteorder.h @@ -0,0 +1,58 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_BYTEORDER_H_ +#define CEPH_MSG_BYTEORDER_H_ + +#include <arpa/inet.h> // for ntohs() and friends +#include <iosfwd> +#include <utility> + +inline uint64_t ntohq(uint64_t v) { + return __builtin_bswap64(v); +} +inline uint64_t htonq(uint64_t v) { + return __builtin_bswap64(v); +} + +inline void ntoh() {} +inline void hton() {} + +inline uint8_t ntoh(uint8_t x) { return x; } +inline uint8_t hton(uint8_t x) { return x; } +inline uint16_t ntoh(uint16_t x) { return ntohs(x); } +inline uint16_t hton(uint16_t x) { return htons(x); } +inline uint32_t ntoh(uint32_t x) { return ntohl(x); } +inline uint32_t hton(uint32_t x) { return htonl(x); } +inline uint64_t ntoh(uint64_t x) { return ntohq(x); } +inline uint64_t hton(uint64_t x) { return htonq(x); } + +inline int8_t ntoh(int8_t x) { return x; } +inline int8_t hton(int8_t x) { return x; } +inline int16_t ntoh(int16_t x) { return ntohs(x); } +inline int16_t hton(int16_t x) { return htons(x); } +inline int32_t ntoh(int32_t x) { return ntohl(x); } +inline int32_t hton(int32_t x) { return htonl(x); } +inline int64_t ntoh(int64_t x) { return ntohq(x); } +inline int64_t hton(int64_t x) { return htonq(x); } + +#endif /* CEPH_MSG_BYTEORDER_H_ */ diff --git a/src/msg/async/dpdk/capture.h b/src/msg/async/dpdk/capture.h new file mode 100644 index 00000000..1ace8eeb --- /dev/null +++ b/src/msg/async/dpdk/capture.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. 
See file COPYING. + * + */ + +#ifndef CEPH_MSG_DPDK_CAPTURE_H +#define CEPH_MSG_DPDK_CAPTURE_H + +#include <utility> + +template <typename T, typename F> +class capture_impl { + T x; + F f; + public: + capture_impl(capture_impl &) = delete; + capture_impl( T && x, F && f ) + : x{std::forward<T>(x)}, f{std::forward<F>(f)} + {} + + template <typename ...Ts> auto operator()( Ts&&...args ) + -> decltype(f( x, std::forward<Ts>(args)... )) + { + return f( x, std::forward<Ts>(args)... ); + } + + template <typename ...Ts> auto operator()( Ts&&...args ) const + -> decltype(f( x, std::forward<Ts>(args)... )) + { + return f( x, std::forward<Ts>(args)... ); + } +}; + +template <typename T, typename F> +capture_impl<T,F> capture( T && x, F && f ) { + return capture_impl<T,F>( + std::forward<T>(x), std::forward<F>(f) ); +} + +#endif //CEPH_MSG_DPDK_CAPTURE_H diff --git a/src/msg/async/dpdk/circular_buffer.h b/src/msg/async/dpdk/circular_buffer.h new file mode 100644 index 00000000..2c92c120 --- /dev/null +++ b/src/msg/async/dpdk/circular_buffer.h @@ -0,0 +1,347 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_CIRCULAR_BUFFER_HH_ +#define CEPH_CIRCULAR_BUFFER_HH_ + +// A growable double-ended queue container that can be efficiently +// extended (and shrunk) from both ends. Implementation is a single +// storage vector. +// +// Similar to libstdc++'s std::deque, except that it uses a single level +// store, and so is more efficient for simple stored items. +// Similar to boost::circular_buffer_space_optimized, except it uses +// uninitialized storage for unoccupied elements (and thus move/copy +// constructors instead of move/copy assignments, which are less efficient). + +#include <memory> +#include <algorithm> + +#include "transfer.h" + +template <typename T, typename Alloc = std::allocator<T>> +class circular_buffer { + struct impl : Alloc { + T* storage = nullptr; + // begin, end interpreted (mod capacity) + size_t begin = 0; + size_t end = 0; + size_t capacity = 0; + }; + impl _impl; + public: + using value_type = T; + using size_type = size_t; + using reference = T&; + using pointer = T*; + using const_reference = const T&; + using const_pointer = const T*; + public: + circular_buffer() = default; + circular_buffer(circular_buffer&& X); + circular_buffer(const circular_buffer& X) = delete; + ~circular_buffer(); + circular_buffer& operator=(const circular_buffer&) = delete; + circular_buffer& operator=(circular_buffer&&) = delete; + void push_front(const T& data); + void push_front(T&& data); + template <typename... A> + void emplace_front(A&&... args); + void push_back(const T& data); + void push_back(T&& data); + template <typename... A> + void emplace_back(A&&... 
args); + T& front(); + T& back(); + void pop_front(); + void pop_back(); + bool empty() const; + size_t size() const; + size_t capacity() const; + T& operator[](size_t idx); + template <typename Func> + void for_each(Func func); + // access an element, may return wrong or destroyed element + // only useful if you do not rely on data accuracy (e.g. prefetch) + T& access_element_unsafe(size_t idx); + private: + void expand(); + void maybe_expand(size_t nr = 1); + size_t mask(size_t idx) const; + + template<typename CB, typename ValueType> + struct cbiterator : std::iterator<std::random_access_iterator_tag, ValueType> { + typedef std::iterator<std::random_access_iterator_tag, ValueType> super_t; + + ValueType& operator*() const { return cb->_impl.storage[cb->mask(idx)]; } + ValueType* operator->() const { return &cb->_impl.storage[cb->mask(idx)]; } + // prefix + cbiterator<CB, ValueType>& operator++() { + idx++; + return *this; + } + // postfix + cbiterator<CB, ValueType> operator++(int unused) { + auto v = *this; + idx++; + return v; + } + // prefix + cbiterator<CB, ValueType>& operator--() { + idx--; + return *this; + } + // postfix + cbiterator<CB, ValueType> operator--(int unused) { + auto v = *this; + idx--; + return v; + } + cbiterator<CB, ValueType> operator+(typename super_t::difference_type n) const { + return cbiterator<CB, ValueType>(cb, idx + n); + } + cbiterator<CB, ValueType> operator-(typename super_t::difference_type n) const { + return cbiterator<CB, ValueType>(cb, idx - n); + } + cbiterator<CB, ValueType>& operator+=(typename super_t::difference_type n) { + idx += n; + return *this; + } + cbiterator<CB, ValueType>& operator-=(typename super_t::difference_type n) { + idx -= n; + return *this; + } + bool operator==(const cbiterator<CB, ValueType>& rhs) const { + return idx == rhs.idx; + } + bool operator!=(const cbiterator<CB, ValueType>& rhs) const { + return idx != rhs.idx; + } + bool operator<(const cbiterator<CB, ValueType>& rhs) const { + return idx < rhs.idx; + } + bool operator>(const cbiterator<CB, ValueType>& rhs) const { + return idx > rhs.idx; + } + bool operator>=(const cbiterator<CB, ValueType>& rhs) const { + return idx >= rhs.idx; + } + bool operator<=(const cbiterator<CB, ValueType>& rhs) const { + return idx <= rhs.idx; + } + typename super_t::difference_type operator-(const cbiterator<CB, ValueType>& rhs) const { + return idx - rhs.idx; + } + private: + CB* cb; + size_t idx; + cbiterator<CB, ValueType>(CB* b, size_t i) : cb(b), idx(i) {} + friend class circular_buffer; + }; + friend class iterator; + + public: + typedef cbiterator<circular_buffer, T> iterator; + typedef cbiterator<const circular_buffer, const T> const_iterator; + + iterator begin() { + return iterator(this, _impl.begin); + } + const_iterator begin() const { + return const_iterator(this, _impl.begin); + } + iterator end() { + return iterator(this, _impl.end); + } + const_iterator end() const { + return const_iterator(this, _impl.end); + } + const_iterator cbegin() const { + return const_iterator(this, _impl.begin); + } + const_iterator cend() const { + return const_iterator(this, _impl.end); + } +}; + +template <typename T, typename Alloc> +inline size_t circular_buffer<T, Alloc>::mask(size_t idx) const { + return idx & (_impl.capacity - 1); +} + +template <typename T, typename Alloc> +inline bool circular_buffer<T, Alloc>::empty() const { + return _impl.begin == _impl.end; +} + +template <typename T, typename Alloc> +inline size_t circular_buffer<T, Alloc>::size() const { + return _impl.end - 
_impl.begin; +} + +template <typename T, typename Alloc> +inline size_t circular_buffer<T, Alloc>::capacity() const { + return _impl.capacity; +} + +template <typename T, typename Alloc> +inline circular_buffer<T, Alloc>::circular_buffer(circular_buffer&& x) + : _impl(std::move(x._impl)) { + x._impl = {}; +} + +template <typename T, typename Alloc> +template <typename Func> +inline void circular_buffer<T, Alloc>::for_each(Func func) { + auto s = _impl.storage; + auto m = _impl.capacity - 1; + for (auto i = _impl.begin; i != _impl.end; ++i) { + func(s[i & m]); + } +} + +template <typename T, typename Alloc> +inline circular_buffer<T, Alloc>::~circular_buffer() { + for_each([this] (T& obj) { + _impl.destroy(&obj); + }); + _impl.deallocate(_impl.storage, _impl.capacity); +} + +template <typename T, typename Alloc> +void circular_buffer<T, Alloc>::expand() { + auto new_cap = std::max<size_t>(_impl.capacity * 2, 1); + auto new_storage = _impl.allocate(new_cap); + auto p = new_storage; + try { + for_each([this, &p] (T& obj) { + transfer_pass1(_impl, &obj, p); + p++; + }); + } catch (...) { + while (p != new_storage) { + _impl.destroy(--p); + } + _impl.deallocate(new_storage, new_cap); + throw; + } + p = new_storage; + for_each([this, &p] (T& obj) { + transfer_pass2(_impl, &obj, p++); + }); + std::swap(_impl.storage, new_storage); + std::swap(_impl.capacity, new_cap); + _impl.begin = 0; + _impl.end = p - _impl.storage; + _impl.deallocate(new_storage, new_cap); +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::maybe_expand(size_t nr) { + if (_impl.end - _impl.begin + nr > _impl.capacity) { + expand(); + } +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::push_front(const T& data) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.begin - 1)]; + _impl.construct(p, data); + --_impl.begin; +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::push_front(T&& data) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.begin - 1)]; + _impl.construct(p, std::move(data)); + --_impl.begin; +} + +template <typename T, typename Alloc> +template <typename... Args> +inline void circular_buffer<T, Alloc>::emplace_front(Args&&... args) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.begin - 1)]; + _impl.construct(p, std::forward<Args>(args)...); + --_impl.begin; +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::push_back(const T& data) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.end)]; + _impl.construct(p, data); + ++_impl.end; +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::push_back(T&& data) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.end)]; + _impl.construct(p, std::move(data)); + ++_impl.end; +} + +template <typename T, typename Alloc> +template <typename... Args> +inline void circular_buffer<T, Alloc>::emplace_back(Args&&... 
args) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.end)]; + _impl.construct(p, std::forward<Args>(args)...); + ++_impl.end; +} + +template <typename T, typename Alloc> +inline T& circular_buffer<T, Alloc>::front() { + return _impl.storage[mask(_impl.begin)]; +} + +template <typename T, typename Alloc> +inline T& circular_buffer<T, Alloc>::back() { + return _impl.storage[mask(_impl.end - 1)]; +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::pop_front() { + _impl.destroy(&front()); + ++_impl.begin; +} + +template <typename T, typename Alloc> +inline void circular_buffer<T, Alloc>::pop_back() { + _impl.destroy(&back()); + --_impl.end; +} + +template <typename T, typename Alloc> +inline T& circular_buffer<T, Alloc>::operator[](size_t idx) { + return _impl.storage[mask(_impl.begin + idx)]; +} + +template <typename T, typename Alloc> +inline T& circular_buffer<T, Alloc>::access_element_unsafe(size_t idx) { + return _impl.storage[mask(_impl.begin + idx)]; +} + +#endif /* CEPH_CIRCULAR_BUFFER_HH_ */ diff --git a/src/msg/async/dpdk/const.h b/src/msg/async/dpdk/const.h new file mode 100644 index 00000000..ea5dc49e --- /dev/null +++ b/src/msg/async/dpdk/const.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_CONST_H_ +#define CEPH_MSG_CONST_H_ + +#include <stdint.h> + +enum class ip_protocol_num : uint8_t { + icmp = 1, tcp = 6, unused = 255 +}; + +enum class eth_protocol_num : uint16_t { + ipv4 = 0x0800, arp = 0x0806, ipv6 = 0x86dd +}; + +const uint8_t eth_hdr_len = 14; +const uint8_t tcp_hdr_len_min = 20; +const uint8_t ipv4_hdr_len_min = 20; +const uint8_t ipv6_hdr_len_min = 40; +const uint16_t ip_packet_len_max = 65535; + +#endif diff --git a/src/msg/async/dpdk/dpdk_rte.cc b/src/msg/async/dpdk/dpdk_rte.cc new file mode 100644 index 00000000..9f9d343b --- /dev/null +++ b/src/msg/async/dpdk/dpdk_rte.cc @@ -0,0 +1,154 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
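A minimal usage sketch of the circular_buffer defined above: capacity grows in powers of two via expand(), and both ends support O(1) push/pop because mask() wraps indices instead of shifting elements.

  #include <cassert>
  #include "circular_buffer.h"

  int main() {
    circular_buffer<int> cb;
    cb.push_back(2);
    cb.push_back(3);
    cb.push_front(1);               // wraps below begin; no element moves
    assert(cb.size() == 3);
    assert(cb.front() == 1 && cb.back() == 3);
    cb.pop_front();
    assert(cb[0] == 2);             // operator[] is relative to the current begin
    return 0;
  }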
+ */ + +#include <bitset> + +#include <rte_config.h> +#include <rte_common.h> +#include <rte_ethdev.h> +#include <rte_version.h> + +#include "DPDK.h" +#include "dpdk_rte.h" + +namespace dpdk { + + static inline std::vector<char> string2vector(std::string str) { + auto v = std::vector<char>(str.begin(), str.end()); + v.push_back('\0'); + return v; + } + + bool eal::initialized = false; + std::thread eal::t; + std::mutex eal::lock; + std::condition_variable eal::cond; + std::list<std::function<void()>> eal::funcs; + + static int bitcount(unsigned long long n) + { + return std::bitset<CHAR_BIT * sizeof(n)>{n}.count(); + } + + int eal::init(CephContext *c) + { + if (initialized) { + return 1; + } + + bool done = false; + auto num = std::stoull(c->_conf.get_val<std::string>("ms_dpdk_coremask"), + nullptr, 16); + unsigned int coremaskbit = bitcount(num); + + ceph_assert(coremaskbit > c->_conf->ms_async_op_threads); + + t = std::thread([&]() { + // TODO: Inherit these from the app parameters - "opts" + std::vector<std::vector<char>> args { + string2vector(string("ceph")), + string2vector("-c"), string2vector(c->_conf.get_val<std::string>("ms_dpdk_coremask")), + string2vector("-n"), string2vector(c->_conf->ms_dpdk_memory_channel), + }; + + Tub<std::string> hugepages_path; + if (!c->_conf->ms_dpdk_hugepages.empty()) { + hugepages_path.construct(c->_conf->ms_dpdk_hugepages); + } + + // If "hugepages" is not provided and DPDK PMD drivers mode is requested - + // use the default DPDK huge tables configuration. + if (hugepages_path) { + args.push_back(string2vector("--huge-dir")); + args.push_back(string2vector(*hugepages_path)); + + // + // We don't know what is going to be our networking configuration so we + // assume there is going to be a queue per-CPU. Plus we'll give a DPDK + // 64MB for "other stuff". + // + unsigned int x; + std::stringstream ss; + ss << std::hex << "fffefffe"; + ss >> x; + size_t size_MB = mem_size(bitcount(x)) >> 20; + std::stringstream size_MB_str; + size_MB_str << size_MB; + + args.push_back(string2vector("-m")); + args.push_back(string2vector(size_MB_str.str())); + } else if (!c->_conf->ms_dpdk_pmd.empty()) { + args.push_back(string2vector("--no-huge")); + } + + std::string rte_file_prefix; + rte_file_prefix = "rte_"; + rte_file_prefix += c->_conf->name.to_str(); + args.push_back(string2vector("--file-prefix")); + args.push_back(string2vector(rte_file_prefix)); + + std::vector<char*> cargs; + + for (auto&& a: args) { + cargs.push_back(a.data()); + } + /* initialise the EAL for all */ + int ret = rte_eal_init(cargs.size(), cargs.data()); + if (ret < 0) + return ret; + + std::unique_lock<std::mutex> l(lock); + initialized = true; + done = true; + cond.notify_all(); + while (true) { + if (!funcs.empty()) { + auto f = std::move(funcs.front()); + funcs.pop_front(); + f(); + cond.notify_all(); + } else { + cond.wait(l); + } + } + }); + t.detach(); + std::unique_lock<std::mutex> l(lock); + while (!done) + cond.wait(l); + return 0; + } + + size_t eal::mem_size(int num_cpus) + { + size_t memsize = 0; + // + // PMD mempool memory: + // + // We don't know what is going to be our networking configuration so we + // assume there is going to be a queue per-CPU. + // + memsize += num_cpus * qp_mempool_obj_size(); + + // Plus we'll give a DPDK 64MB for "other stuff". 
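+  // For example, mem_size(4) = 4 * qp_mempool_obj_size() + (64 << 20)
+  // bytes: the per-queue mempool term was added above, and the fixed
+  // 64MB "other stuff" term is added below.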
+    memsize += (64UL << 20);
+
+    return memsize;
+  }
+
+} // namespace dpdk
diff --git a/src/msg/async/dpdk/dpdk_rte.h b/src/msg/async/dpdk/dpdk_rte.h
new file mode 100644
index 00000000..4aa83899
--- /dev/null
+++ b/src/msg/async/dpdk/dpdk_rte.h
@@ -0,0 +1,74 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef CEPH_DPDK_RTE_H_
+#define CEPH_DPDK_RTE_H_
+
+
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+
+#include <bitset>
+#include <rte_config.h>
+#include <rte_version.h>
+#include <boost/program_options.hpp>
+
+/*********************** Compat section ***************************************/
+// We currently support only DPDK 2.0 and above.
+#if (RTE_VERSION < RTE_VERSION_NUM(2,0,0,0))
+#error "DPDK version 2.0.0 or above is required"
+#endif
+
+#if defined(RTE_MBUF_REFCNT_ATOMIC)
+#warning "CONFIG_RTE_MBUF_REFCNT_ATOMIC should be disabled in DPDK's " \
+         "config/common_linuxapp"
+#endif
+/******************************************************************************/
+
+namespace dpdk {
+
+// DPDK Environment Abstraction Layer
+class eal {
+ public:
+  using cpuset = std::bitset<RTE_MAX_LCORE>;
+
+  static std::mutex lock;
+  static std::condition_variable cond;
+  static std::list<std::function<void()>> funcs;
+  static int init(CephContext *c);
+  static void execute_on_master(std::function<void()> &&f) {
+    bool done = false;
+    std::unique_lock<std::mutex> l(lock);
+    funcs.emplace_back([&]() { f(); done = true; });
+    cond.notify_all();
+    while (!done)
+      cond.wait(l);
+  }
+  /**
+   * Returns the amount of memory needed for DPDK
+   * @param num_cpus Number of CPUs the application is going to use
+   *
+   * @return the required memory size in bytes
+   */
+  static size_t mem_size(int num_cpus);
+  static bool initialized;
+  static std::thread t;
+};
+
+} // namespace dpdk
+#endif // CEPH_DPDK_RTE_H_
diff --git a/src/msg/async/dpdk/ethernet.cc b/src/msg/async/dpdk/ethernet.cc
new file mode 100644
index 00000000..9aca5078
--- /dev/null
+++ b/src/msg/async/dpdk/ethernet.cc
@@ -0,0 +1,16 @@
+#include <iomanip>
+
+#include "ethernet.h"
+
+std::ostream& operator<<(std::ostream& os, const ethernet_address& ea) {
+  auto& m = ea.mac;
+  using u = uint32_t;
+  os << std::hex << std::setfill('0')
+     << std::setw(2) << u(m[0]) << ":"
+     << std::setw(2) << u(m[1]) << ":"
+     << std::setw(2) << u(m[2]) << ":"
+     << std::setw(2) << u(m[3]) << ":"
+     << std::setw(2) << u(m[4]) << ":"
+     << std::setw(2) << u(m[5]);
+  return os;
+}
diff --git a/src/msg/async/dpdk/ethernet.h b/src/msg/async/dpdk/ethernet.h
new file mode 100644
index 00000000..b007425f
--- /dev/null
+++ b/src/msg/async/dpdk/ethernet.h
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership.
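A usage sketch for the eal wrapper above (assuming an initialized CephContext* cct): all RTE work is funneled to the single EAL thread spawned by init(), so callers hand it closures via execute_on_master().

  void example(CephContext *cct) {
    dpdk::eal::init(cct);               // spawns the EAL thread once
    dpdk::eal::execute_on_master([] {
      // runs on the EAL thread; the caller blocks until it completes
    });
  }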
You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#ifndef CEPH_MSG_ETHERNET_H_ +#define CEPH_MSG_ETHERNET_H_ + +#include <array> +#include <sstream> + +#include "include/ceph_assert.h" +#include "byteorder.h" + +struct ethernet_address { + ethernet_address() {} + + ethernet_address(const uint8_t *eaddr) { + std::copy(eaddr, eaddr + 6, mac.begin()); + } + + ethernet_address(std::initializer_list<uint8_t> eaddr) { + ceph_assert(eaddr.size() == mac.size()); + std::copy(eaddr.begin(), eaddr.end(), mac.begin()); + } + + ethernet_address ntoh() { + return *this; + } + ethernet_address hton() { + return *this; + } + std::array<uint8_t, 6> mac; +} __attribute__((packed)); + +inline bool operator==(const ethernet_address& a, const ethernet_address& b) { + return a.mac == b.mac; +} +std::ostream& operator<<(std::ostream& os, const ethernet_address& ea); + +struct ethernet { + using address = ethernet_address; + static address broadcast_address() { + return {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + } + static constexpr uint16_t arp_hardware_type() { return 1; } +}; + +struct eth_hdr { + ethernet_address dst_mac; + ethernet_address src_mac; + uint16_t eth_proto; + eth_hdr hton() { + eth_hdr hdr = *this; + hdr.eth_proto = ::hton(eth_proto); + return hdr; + } + eth_hdr ntoh() { + eth_hdr hdr = *this; + hdr.eth_proto = ::ntoh(eth_proto); + return hdr; + } +} __attribute__((packed)); + +ethernet_address parse_ethernet_address(std::string addr); + +#endif /* CEPH_MSG_ETHERNET_H_ */ diff --git a/src/msg/async/dpdk/ip_types.h b/src/msg/async/dpdk/ip_types.h new file mode 100644 index 00000000..356d8fd6 --- /dev/null +++ b/src/msg/async/dpdk/ip_types.h @@ -0,0 +1,109 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + * + */ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
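A small sketch of the ethernet_address type above (MAC value is hypothetical); the zero-padded output relies on the per-byte setw()/setfill() in ethernet.cc:

  #include <iostream>
  #include "ethernet.h"

  int main() {
    ethernet_address mac{0x52, 0x54, 0x00, 0x12, 0x34, 0x56};
    std::cout << mac << std::endl;                       // 52:54:00:12:34:56
    std::cout << (mac == ethernet::broadcast_address())  // 0: not broadcast
              << std::endl;
    return 0;
  }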
+ * + */ + +#ifndef CEPH_IP_TYPES_H_H +#define CEPH_IP_TYPES_H_H + +#include <boost/asio/ip/address_v4.hpp> +#include <string> + +class Packet; +class ethernet_address; +using resolution_cb = std::function<void (const ethernet_address&, Packet, int)>; + +struct ipv4_addr { + uint32_t ip; + uint16_t port; + + ipv4_addr() : ip(0), port(0) {} + ipv4_addr(uint32_t ip, uint16_t port) : ip(ip), port(port) {} + ipv4_addr(uint16_t port) : ip(0), port(port) {} + ipv4_addr(const std::string &addr); + ipv4_addr(const std::string &addr, uint16_t port); + + ipv4_addr(const entity_addr_t &ad) { + ip = ntoh(ad.in4_addr().sin_addr.s_addr); + port = ad.get_port(); + } + + ipv4_addr(entity_addr_t &&addr) : ipv4_addr(addr) {} +}; + +struct ipv4_address { + ipv4_address() : ip(0) {} + explicit ipv4_address(uint32_t ip) : ip(ip) {} + explicit ipv4_address(const std::string& addr) { + ip = static_cast<uint32_t>(boost::asio::ip::address_v4::from_string(addr).to_ulong()); + } + ipv4_address(ipv4_addr addr) { + ip = addr.ip; + } + + uint32_t ip; + + ipv4_address hton() { + ipv4_address addr; + addr.ip = ::hton(ip); + return addr; + } + ipv4_address ntoh() { + ipv4_address addr; + addr.ip = ::ntoh(ip); + return addr; + } + + friend bool operator==(ipv4_address x, ipv4_address y) { + return x.ip == y.ip; + } + friend bool operator!=(ipv4_address x, ipv4_address y) { + return x.ip != y.ip; + } +} __attribute__((packed)); + +static inline bool is_unspecified(ipv4_address addr) { return addr.ip == 0; } + +std::ostream& operator<<(std::ostream& os, const ipv4_address& a); + +namespace std { + + template <> + struct hash<ipv4_address> { + size_t operator()(ipv4_address a) const { return a.ip; } + }; + +} + +#endif //CEPH_IP_TYPES_H_H diff --git a/src/msg/async/dpdk/net.cc b/src/msg/async/dpdk/net.cc new file mode 100644 index 00000000..6e361f18 --- /dev/null +++ b/src/msg/async/dpdk/net.cc @@ -0,0 +1,205 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
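A round-trip sketch for the address types above:

  #include <cassert>
  #include "ip_types.h"

  int main() {
    ipv4_address a("192.168.0.1");      // parsed via boost::asio
    ipv4_address wire = a.hton();       // network byte order for headers
    assert(wire.ntoh() == a);           // hton/ntoh are inverses
    assert(is_unspecified(ipv4_address()) && !is_unspecified(a));
    return 0;
  }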
+ */ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 XSky <haomai@xsky.com> + * + * Author: Haomai Wang <haomaiwang@gmail.com> + * + */ + +#include "net.h" +#include "DPDK.h" +#include "DPDKStack.h" + +#include "common/dout.h" +#include "include/ceph_assert.h" + +#define dout_subsys ceph_subsys_dpdk +#undef dout_prefix +#define dout_prefix *_dout << "net " + +interface::interface(CephContext *cct, std::shared_ptr<DPDKDevice> dev, EventCenter *center) + : cct(cct), _dev(dev), + _rx(_dev->receive( + center->get_id(), + [center, this] (Packet p) { + return dispatch_packet(center, std::move(p)); + } + )), + _hw_address(_dev->hw_address()), + _hw_features(_dev->get_hw_features()) { + auto idx = 0u; + unsigned qid = center->get_id(); + dev->queue_for_cpu(center->get_id()).register_packet_provider([this, idx, qid] () mutable { + Tub<Packet> p; + for (size_t i = 0; i < _pkt_providers.size(); i++) { + auto l3p = _pkt_providers[idx++](); + if (idx == _pkt_providers.size()) + idx = 0; + if (l3p) { + auto l3pv = std::move(*l3p); + auto eh = l3pv.p.prepend_header<eth_hdr>(); + eh->dst_mac = l3pv.to; + eh->src_mac = _hw_address; + eh->eth_proto = uint16_t(l3pv.proto_num); + *eh = eh->hton(); + ldout(this->cct, 10) << "=== tx === proto " << std::hex << uint16_t(l3pv.proto_num) + << " " << _hw_address << " -> " << l3pv.to + << " length " << std::dec << l3pv.p.len() << dendl; + p = std::move(l3pv.p); + return p; + } + } + return p; + }); +} + +subscription<Packet, ethernet_address> interface::register_l3( + eth_protocol_num proto_num, + std::function<int (Packet p, ethernet_address from)> next, + std::function<bool (forward_hash&, Packet& p, size_t)> forward) +{ + auto i = _proto_map.emplace(std::piecewise_construct, std::make_tuple(uint16_t(proto_num)), std::forward_as_tuple(std::move(forward))); + ceph_assert(i.second); + l3_rx_stream& l3_rx = i.first->second; + return l3_rx.packet_stream.listen(std::move(next)); +} + +unsigned interface::hash2cpu(uint32_t hash) { + return _dev->hash2cpu(hash); +} + +const rss_key_type& interface::rss_key() const { + return _dev->rss_key(); +} + +uint16_t interface::hw_queues_count() const { + return _dev->hw_queues_count(); +} + +class C_handle_l2forward : public EventCallback { + std::shared_ptr<DPDKDevice> sdev; + unsigned &queue_depth; + Packet p; + unsigned dst; + + public: + C_handle_l2forward(std::shared_ptr<DPDKDevice> &p, unsigned &qd, Packet pkt, unsigned target) + : sdev(p), queue_depth(qd), p(std::move(pkt)), dst(target) {} + void do_request(uint64_t fd) { + sdev->l2receive(dst, std::move(p)); + queue_depth--; + delete this; + } +}; + +void interface::forward(EventCenter *source, unsigned target, Packet p) { + static __thread unsigned queue_depth; + + if (queue_depth < 1000) { + queue_depth++; + // FIXME: need ensure this event not be called after EventCenter destruct + _dev->workers[target]->center.dispatch_event_external( + new C_handle_l2forward(_dev, queue_depth, std::move(p.free_on_cpu(source)), target)); + } +} + +int interface::dispatch_packet(EventCenter *center, Packet p) { + auto eh = p.get_header<eth_hdr>(); + if (eh) { + auto i = _proto_map.find(ntoh(eh->eth_proto)); + auto hwrss = p.rss_hash(); + if (hwrss) { + ldout(cct, 10) << __func__ << " === rx === proto " << std::hex << ::ntoh(eh->eth_proto) + << " "<< eh->src_mac.ntoh() << " -> " << eh->dst_mac.ntoh() + << " length " << std::dec << p.len() << " rss_hash " << *p.rss_hash() << dendl; + } else { + ldout(cct, 10) << __func__ << " === rx === proto " << std::hex << 
::ntoh(eh->eth_proto) + << " "<< eh->src_mac.ntoh() << " -> " << eh->dst_mac.ntoh() + << " length " << std::dec << p.len() << dendl; + } + if (i != _proto_map.end()) { + l3_rx_stream& l3 = i->second; + auto fw = _dev->forward_dst(center->get_id(), [&p, &l3, this] () { + auto hwrss = p.rss_hash(); + if (hwrss) { + return *hwrss; + } else { + forward_hash data; + if (l3.forward(data, p, sizeof(eth_hdr))) { + return toeplitz_hash(rss_key(), data); + } + return 0u; + } + }); + if (fw != center->get_id()) { + ldout(cct, 1) << __func__ << " forward to " << fw << dendl; + forward(center, fw, std::move(p)); + } else { + auto h = eh->ntoh(); + auto from = h.src_mac; + p.trim_front(sizeof(*eh)); + // avoid chaining, since queue length is unlimited + // drop instead. + if (l3.ready()) { + return l3.packet_stream.produce(std::move(p), from); + } + } + } + } + return 0; +} + +class C_arp_learn : public EventCallback { + DPDKWorker *worker; + ethernet_address l2_addr; + ipv4_address l3_addr; + + public: + C_arp_learn(DPDKWorker *w, ethernet_address l2, ipv4_address l3) + : worker(w), l2_addr(l2), l3_addr(l3) {} + void do_request(uint64_t id) { + worker->arp_learn(l2_addr, l3_addr); + delete this; + } +}; + +void interface::arp_learn(ethernet_address l2, ipv4_address l3) +{ + for (auto &&w : _dev->workers) { + w->center.dispatch_event_external( + new C_arp_learn(w, l2, l3)); + } +} + +l3_protocol::l3_protocol(interface* netif, eth_protocol_num proto_num, packet_provider_type func) + : _netif(netif), _proto_num(proto_num) { + _netif->register_packet_provider(std::move(func)); +} + +subscription<Packet, ethernet_address> l3_protocol::receive( + std::function<int (Packet, ethernet_address)> rx_fn, + std::function<bool (forward_hash &h, Packet &p, size_t s)> forward) { + return _netif->register_l3(_proto_num, std::move(rx_fn), std::move(forward)); +}; diff --git a/src/msg/async/dpdk/net.h b/src/msg/async/dpdk/net.h new file mode 100644 index 00000000..63f0422b --- /dev/null +++ b/src/msg/async/dpdk/net.h @@ -0,0 +1,138 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
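A sketch of the hashing step dispatch_packet() performs above when the NIC did not supply an RSS hash: the protocol's forward() callback feeds the flow fields into a forward_hash (declared in net.h below), which is then Toeplitz-hashed with the device's RSS key. The field names here are hypothetical, and the push order must match what the hardware hashes:

  #include "net.h"
  #include "toeplitz.h"

  uint32_t flow_hash(const rss_key_type& key,
                     uint32_t src_ip, uint32_t dst_ip,
                     uint16_t src_port, uint16_t dst_port) {
    forward_hash fh;
    fh.push_back(src_ip);
    fh.push_back(dst_ip);
    fh.push_back(src_port);
    fh.push_back(dst_port);
    return toeplitz_hash(key, fh);   // same hash the NIC would compute
  }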
+ */
+
+#ifndef CEPH_MSG_DPDK_NET_H
+#define CEPH_MSG_DPDK_NET_H
+
+#include "const.h"
+#include "ethernet.h"
+#include "Packet.h"
+#include "stream.h"
+#include "toeplitz.h"
+
+struct hw_features {
+  // Enable tx ip header checksum offload
+  bool tx_csum_ip_offload = false;
+  // Enable tx l4 (TCP or UDP) checksum offload
+  bool tx_csum_l4_offload = false;
+  // Enable rx checksum offload
+  bool rx_csum_offload = false;
+  // LRO is enabled
+  bool rx_lro = false;
+  // Enable tx TCP segment offload
+  bool tx_tso = false;
+  // Enable tx UDP fragmentation offload
+  bool tx_ufo = false;
+  // Maximum Transmission Unit
+  uint16_t mtu = 1500;
+  // Maximum packet len when TCP/UDP offload is enabled
+  uint16_t max_packet_len = ip_packet_len_max - eth_hdr_len;
+};
+
+class forward_hash {
+  uint8_t data[64];
+  size_t end_idx = 0;
+ public:
+  size_t size() const {
+    return end_idx;
+  }
+  void push_back(uint8_t b) {
+    ceph_assert(end_idx < sizeof(data));
+    data[end_idx++] = b;
+  }
+  void push_back(uint16_t b) {
+    push_back(uint8_t(b));
+    push_back(uint8_t(b >> 8));
+  }
+  void push_back(uint32_t b) {
+    push_back(uint16_t(b));
+    push_back(uint16_t(b >> 16));
+  }
+  const uint8_t& operator[](size_t idx) const {
+    return data[idx];
+  }
+};
+
+class interface;
+
+class l3_protocol {
+ public:
+  struct l3packet {
+    eth_protocol_num proto_num;
+    ethernet_address to;
+    Packet p;
+  };
+  using packet_provider_type = std::function<Tub<l3packet> ()>;
+
+ private:
+  interface* _netif;
+  eth_protocol_num _proto_num;
+
+ public:
+  explicit l3_protocol(interface* netif, eth_protocol_num proto_num, packet_provider_type func);
+  subscription<Packet, ethernet_address> receive(
+      std::function<int (Packet, ethernet_address)> rx_fn,
+      std::function<bool (forward_hash &h, Packet &p, size_t s)> forward);
+
+ private:
+  friend class interface;
+};
+
+class DPDKDevice;
+struct ipv4_address;
+
+class interface {
+  CephContext *cct;
+  struct l3_rx_stream {
+    stream<Packet, ethernet_address> packet_stream;
+    std::function<bool (forward_hash&, Packet&, size_t)> forward;
+    bool ready() { return packet_stream.started(); }
+    explicit l3_rx_stream(std::function<bool (forward_hash&, Packet&, size_t)>&& fw) : forward(fw) {}
+  };
+  std::unordered_map<uint16_t, l3_rx_stream> _proto_map;
+  std::shared_ptr<DPDKDevice> _dev;
+  subscription<Packet> _rx;
+  ethernet_address _hw_address;
+  struct hw_features _hw_features;
+  std::vector<l3_protocol::packet_provider_type> _pkt_providers;
+
+ private:
+  int dispatch_packet(EventCenter *c, Packet p);
+ public:
+  explicit interface(CephContext *cct, std::shared_ptr<DPDKDevice> dev, EventCenter *center);
+  ethernet_address hw_address() { return _hw_address; }
+  const struct hw_features& get_hw_features() const { return _hw_features; }
+  subscription<Packet, ethernet_address> register_l3(
+      eth_protocol_num proto_num,
+      std::function<int (Packet, ethernet_address)> next,
+      std::function<bool (forward_hash&, Packet&, size_t)> forward);
+  void forward(EventCenter *source, unsigned target, Packet p);
+  unsigned hash2cpu(uint32_t hash);
+  void register_packet_provider(l3_protocol::packet_provider_type func) {
+    _pkt_providers.push_back(std::move(func));
+  }
+  const rss_key_type& rss_key() const;
+  uint16_t hw_queues_count() const;
+  void arp_learn(ethernet_address l2, ipv4_address l3);
+  friend class l3_protocol;
+};
+
+#endif //CEPH_MSG_DPDK_NET_H
diff --git a/src/msg/async/dpdk/queue.h b/src/msg/async/dpdk/queue.h
new file mode 100644
index 00000000..984ddca1
--- /dev/null
+++ b/src/msg/async/dpdk/queue.h
@@ -0,0 +1,96 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_DPDK_QUEUE_H_
+#define CEPH_MSG_DPDK_QUEUE_H_
+
+#include <queue>
+
+#include "circular_buffer.h"
+
+template <typename T>
+class queue {
+  std::queue<T, circular_buffer<T>> _q;
+  size_t _max;
+
+ public:
+  explicit queue(size_t size): _max(size) {}
+
+  // Push an item.
+  //
+  // Returns false if the queue was full and the item was not pushed.
+  bool push(T&& a);
+
+  // Pops an item.
+  T pop();
+
+  // Consumes items from the queue, passing them to @func, until @func
+  // returns false or the queue is empty
+  //
+  // Returns false if func returned false.
+  template <typename Func>
+  bool consume(Func&& func);
+
+  // Returns true when the queue is empty.
+  bool empty() const;
+
+  // Returns true when the queue is full.
+  bool full() const;
+
+  size_t size() const { return _q.size(); }
+
+  // Destroy any items in the queue
+  void clear() {
+    while (!_q.empty()) {
+      _q.pop();
+    }
+  }
+};
+
+template <typename T>
+inline bool queue<T>::push(T&& data) {
+  if (_q.size() < _max) {
+    _q.push(std::move(data));
+    // (the seastar original notified a not-empty waiter here; this port has no such hook)
+    return true;
+  } else {
+    return false;
+  }
+}
+
+template <typename T>
+inline T queue<T>::pop() {
+  T data = std::move(_q.front());
+  _q.pop();
+  return data;
+}
+
+template <typename T>
+inline bool queue<T>::empty() const {
+  return _q.empty();
+}
+
+template <typename T>
+inline bool queue<T>::full() const {
+  return _q.size() == _max;
+}
+
+#endif /* CEPH_MSG_DPDK_QUEUE_H_ */
diff --git a/src/msg/async/dpdk/shared_ptr.h b/src/msg/async/dpdk/shared_ptr.h
new file mode 100644
index 00000000..d078063b
--- /dev/null
+++ b/src/msg/async/dpdk/shared_ptr.h
@@ -0,0 +1,391 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:4; indent-tabs-mode:nil -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
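A bounded-queue sketch for the wrapper above: push() fails rather than grow past the limit given at construction.

  #include <cassert>
  #include "queue.h"

  int main() {
    queue<int> q(2);
    assert(q.push(1) && q.push(2));
    assert(q.full());
    assert(!q.push(3));     // full: rejected, not resized
    assert(q.pop() == 1);   // FIFO order
    q.clear();
    assert(q.empty());
    return 0;
  }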
+ */
+
+#ifndef CEPH_LW_SHARED_PTR_H_
+#define CEPH_LW_SHARED_PTR_H_
+
+#include <utility>
+#include <type_traits>
+#include <functional>
+#include <iostream>
+
+// This header defines a shared pointer facility, lw_shared_ptr<>,
+// modeled after std::shared_ptr<>.
+//
+// Unlike std::shared_ptr<>, this implementation is not thread
+// safe, and two pointers sharing the same object must not be used in
+// different threads.
+//
+// lw_shared_ptr<> is the more lightweight variant, with a lw_shared_ptr<>
+// occupying just one machine word, and adding just one word to the shared
+// object. However, it does not support polymorphism.
+//
+// It supports shared_from_this() via enable_lw_shared_from_this<>.
+//
+
+template <typename T>
+class lw_shared_ptr;
+
+template <typename T>
+class enable_lw_shared_from_this;
+
+template <typename T>
+class enable_shared_from_this;
+
+template <typename T, typename... A>
+lw_shared_ptr<T> make_lw_shared(A&&... a);
+
+template <typename T>
+lw_shared_ptr<T> make_lw_shared(T&& a);
+
+template <typename T>
+lw_shared_ptr<T> make_lw_shared(T& a);
+
+struct lw_shared_ptr_counter_base {
+    long _count = 0;
+};
+
+
+namespace internal {
+
+template <class T, class U>
+struct lw_shared_ptr_accessors;
+
+template <class T>
+struct lw_shared_ptr_accessors_esft;
+
+template <class T>
+struct lw_shared_ptr_accessors_no_esft;
+
+}
+
+
+// We want to support two use cases for shared_ptr<T>:
+//
+//   1. T is any type (primitive or class type)
+//
+//   2. T is a class type that inherits from enable_shared_from_this<T>.
+//
+// In the first case, we must wrap T in an object containing the counter,
+// since T may be a primitive type and cannot be a base class.
+//
+// In the second case, we want T to reach the counter through its
+// enable_shared_from_this<> base class, so that we can implement
+// shared_from_this().
+//
+// To implement those two conflicting requirements (T alongside its counter;
+// T inherits from an object containing the counter) we use std::conditional<>
+// and some accessor functions to select between two implementations.
+
+
+// CRTP from this to enable shared_from_this:
+template <typename T>
+class enable_lw_shared_from_this : private lw_shared_ptr_counter_base {
+    using ctor = T;
+protected:
+    enable_lw_shared_from_this() noexcept {}
+    enable_lw_shared_from_this(enable_lw_shared_from_this&&) noexcept {}
+    enable_lw_shared_from_this(const enable_lw_shared_from_this&) noexcept {}
+    enable_lw_shared_from_this& operator=(const enable_lw_shared_from_this&) noexcept { return *this; }
+    enable_lw_shared_from_this& operator=(enable_lw_shared_from_this&&) noexcept { return *this; }
+public:
+    lw_shared_ptr<T> shared_from_this();
+    lw_shared_ptr<const T> shared_from_this() const;
+
+    template <typename X>
+    friend class lw_shared_ptr;
+    template <typename X>
+    friend class ::internal::lw_shared_ptr_accessors_esft;
+    template <typename X, class Y>
+    friend class ::internal::lw_shared_ptr_accessors;
+};
+
+template <typename T>
+struct shared_ptr_no_esft : private lw_shared_ptr_counter_base {
+    T _value;
+
+    shared_ptr_no_esft() = default;
+    shared_ptr_no_esft(const T& x) : _value(x) {}
+    shared_ptr_no_esft(T&& x) : _value(std::move(x)) {}
+    template <typename... A>
+    shared_ptr_no_esft(A&&... a) : _value(std::forward<A>(a)...)
{} + + template <typename X> + friend class lw_shared_ptr; + template <typename X> + friend class ::internal::lw_shared_ptr_accessors_no_esft; + template <typename X, class Y> + friend class ::internal::lw_shared_ptr_accessors; +}; + + +/// Extension point: the user may override this to change how \ref lw_shared_ptr objects are destroyed, +/// primarily so that incomplete classes can be used. +/// +/// Customizing the deleter requires that \c T be derived from \c enable_lw_shared_from_this<T>. +/// The specialization must be visible for all uses of \c lw_shared_ptr<T>. +/// +/// To customize, the template must have a `static void dispose(T*)` operator that disposes of +/// the object. +template <typename T> +struct lw_shared_ptr_deleter; // No generic implementation + +namespace internal { + +template <typename T> +struct lw_shared_ptr_accessors_esft { + using concrete_type = std::remove_const_t<T>; + static T* to_value(lw_shared_ptr_counter_base* counter) { + return static_cast<T*>(counter); + } + static void dispose(lw_shared_ptr_counter_base* counter) { + delete static_cast<T*>(counter); + } + static void instantiate_to_value(lw_shared_ptr_counter_base* p) { + // since to_value() is defined above, we don't need to do anything special + // to force-instantiate it + } +}; + +template <typename T> +struct lw_shared_ptr_accessors_no_esft { + using concrete_type = shared_ptr_no_esft<T>; + static T* to_value(lw_shared_ptr_counter_base* counter) { + return &static_cast<concrete_type*>(counter)->_value; + } + static void dispose(lw_shared_ptr_counter_base* counter) { + delete static_cast<concrete_type*>(counter); + } + static void instantiate_to_value(lw_shared_ptr_counter_base* p) { + // since to_value() is defined above, we don't need to do anything special + // to force-instantiate it + } +}; + +// Generic case: lw_shared_ptr_deleter<T> is not specialized, select +// implementation based on whether T inherits from enable_lw_shared_from_this<T>. +template <typename T, typename U = void> +struct lw_shared_ptr_accessors : std::conditional_t< + std::is_base_of<enable_lw_shared_from_this<T>, T>::value, + lw_shared_ptr_accessors_esft<T>, + lw_shared_ptr_accessors_no_esft<T>> { +}; + +// Overload when lw_shared_ptr_deleter<T> specialized +template <typename T> +struct lw_shared_ptr_accessors<T, std::void_t<decltype(lw_shared_ptr_deleter<T>{})>> { + using concrete_type = T; + static T* to_value(lw_shared_ptr_counter_base* counter); + static void dispose(lw_shared_ptr_counter_base* counter) { + lw_shared_ptr_deleter<T>::dispose(to_value(counter)); + } + static void instantiate_to_value(lw_shared_ptr_counter_base* p) { + // instantiate to_value(); must be defined by shared_ptr_incomplete.hh + to_value(p); + } +}; + +} + +template <typename T> +class lw_shared_ptr { + using accessors = ::internal::lw_shared_ptr_accessors<std::remove_const_t<T>>; + using concrete_type = typename accessors::concrete_type; + mutable lw_shared_ptr_counter_base* _p = nullptr; +private: + lw_shared_ptr(lw_shared_ptr_counter_base* p) noexcept : _p(p) { + if (_p) { + ++_p->_count; + } + } + template <typename... A> + static lw_shared_ptr make(A&&... 
a) { + auto p = new concrete_type(std::forward<A>(a)...); + accessors::instantiate_to_value(p); + return lw_shared_ptr(p); + } +public: + using element_type = T; + + lw_shared_ptr() noexcept = default; + lw_shared_ptr(std::nullptr_t) noexcept : lw_shared_ptr() {} + lw_shared_ptr(const lw_shared_ptr& x) noexcept : _p(x._p) { + if (_p) { + ++_p->_count; + } + } + lw_shared_ptr(lw_shared_ptr&& x) noexcept : _p(x._p) { + x._p = nullptr; + } + [[gnu::always_inline]] + ~lw_shared_ptr() { + if (_p && !--_p->_count) { + accessors::dispose(_p); + } + } + lw_shared_ptr& operator=(const lw_shared_ptr& x) noexcept { + if (_p != x._p) { + this->~lw_shared_ptr(); + new (this) lw_shared_ptr(x); + } + return *this; + } + lw_shared_ptr& operator=(lw_shared_ptr&& x) noexcept { + if (_p != x._p) { + this->~lw_shared_ptr(); + new (this) lw_shared_ptr(std::move(x)); + } + return *this; + } + lw_shared_ptr& operator=(std::nullptr_t) noexcept { + return *this = lw_shared_ptr(); + } + lw_shared_ptr& operator=(T&& x) noexcept { + this->~lw_shared_ptr(); + new (this) lw_shared_ptr(make_lw_shared<T>(std::move(x))); + return *this; + } + + T& operator*() const noexcept { return *accessors::to_value(_p); } + T* operator->() const noexcept { return accessors::to_value(_p); } + T* get() const noexcept { + if (_p) { + return accessors::to_value(_p); + } else { + return nullptr; + } + } + + long int use_count() const noexcept { + if (_p) { + return _p->_count; + } else { + return 0; + } + } + + operator lw_shared_ptr<const T>() const noexcept { + return lw_shared_ptr<const T>(_p); + } + + explicit operator bool() const noexcept { + return _p; + } + + bool owned() const noexcept { + return _p->_count == 1; + } + + bool operator==(const lw_shared_ptr<const T>& x) const { + return _p == x._p; + } + + bool operator!=(const lw_shared_ptr<const T>& x) const { + return !operator==(x); + } + + bool operator==(const lw_shared_ptr<std::remove_const_t<T>>& x) const { + return _p == x._p; + } + + bool operator!=(const lw_shared_ptr<std::remove_const_t<T>>& x) const { + return !operator==(x); + } + + bool operator<(const lw_shared_ptr<const T>& x) const { + return _p < x._p; + } + + bool operator<(const lw_shared_ptr<std::remove_const_t<T>>& x) const { + return _p < x._p; + } + + template <typename U> + friend class lw_shared_ptr; + + template <typename X, typename... A> + friend lw_shared_ptr<X> make_lw_shared(A&&...); + + template <typename U> + friend lw_shared_ptr<U> make_lw_shared(U&&); + + template <typename U> + friend lw_shared_ptr<U> make_lw_shared(U&); + + template <typename U> + friend class enable_lw_shared_from_this; +}; + +template <typename T, typename... A> +inline +lw_shared_ptr<T> make_lw_shared(A&&... 
a) {
+    return lw_shared_ptr<T>::make(std::forward<A>(a)...);
+}
+
+template <typename T>
+inline
+lw_shared_ptr<T> make_lw_shared(T&& a) {
+    return lw_shared_ptr<T>::make(std::move(a));
+}
+
+template <typename T>
+inline
+lw_shared_ptr<T> make_lw_shared(T& a) {
+    return lw_shared_ptr<T>::make(a);
+}
+
+template <typename T>
+inline
+lw_shared_ptr<T>
+enable_lw_shared_from_this<T>::shared_from_this() {
+    return lw_shared_ptr<T>(this);
+}
+
+template <typename T>
+inline
+lw_shared_ptr<const T>
+enable_lw_shared_from_this<T>::shared_from_this() const {
+    return lw_shared_ptr<const T>(const_cast<enable_lw_shared_from_this*>(this));
+}
+
+template <typename T>
+static inline
+std::ostream& operator<<(std::ostream& out, const lw_shared_ptr<T>& p) {
+    if (!p) {
+        return out << "null";
+    }
+    return out << *p;
+}
+
+namespace std {
+
+template <typename T>
+struct hash<lw_shared_ptr<T>> : private hash<T*> {
+    size_t operator()(const lw_shared_ptr<T>& p) const {
+        return hash<T*>::operator()(p.get());
+    }
+};
+
+}
+
+#endif /* CEPH_LW_SHARED_PTR_H_ */
diff --git a/src/msg/async/dpdk/stream.h b/src/msg/async/dpdk/stream.h
new file mode 100644
index 00000000..1898e8f8
--- /dev/null
+++ b/src/msg/async/dpdk/stream.h
@@ -0,0 +1,155 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_STREAM_H_
+#define CEPH_MSG_STREAM_H_
+
+#include <exception>
+#include <cassert>
+
+// A stream<> is the producer side. It may call produce() as long
+// as the value returned from the previous invocation indicates that
+// the consumer is ready. To signify no more data is available, call
+// close().
+//
+// A subscription<> is the consumer side. It is created by a call
+// to stream::listen(). Calling subscription::start(),
+// which registers the data processing callback, starts processing
+// events. End-of-stream and error events are reported through
+// subscription::done(), which returns non-zero once the stream is
+// closed or has failed.
+//
+// The consumer can pause generation of new data by returning a
+// positive integer; when it becomes ready again, the producer
+// will resume processing.
+
+template <typename... T>
+class subscription;
+
+template <typename... T>
+class stream {
+  subscription<T...>* _sub = nullptr;
+  int done = 0;        // initialized: read via done() before any close()
+  bool ready = false;  // initialized: read via started() before any start()
+ public:
+  using next_fn = std::function<int (T...)>;
+  stream() = default;
+  stream(const stream&) = delete;
+  stream(stream&&) = delete;
+  ~stream() {
+    if (_sub) {
+      _sub->_stream = nullptr;
+    }
+  }
+
+  void operator=(const stream&) = delete;
+  void operator=(stream&&) = delete;
+
+  // Returns a subscription that reads value from this
+  // stream.
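+  //
+  // A short usage sketch (values are hypothetical):
+  //
+  //   stream<int> s;
+  //   auto sub = s.listen([] (int v) { return 0; });  // 0 = keep consuming
+  //   if (s.started()) {
+  //     s.produce(42);   // invokes the callback registered above
+  //   }
+  //   s.close();         // done() on the subscription now returns 1
+  //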
+ subscription<T...> listen() { + return subscription<T...>(this); + } + + // Returns a subscription that reads value from this + // stream, and also sets up the listen function. + subscription<T...> listen(next_fn next) { + auto sub = subscription<T...>(this); + sub.start(std::move(next)); + return sub; + } + + // Becomes ready when the listener is ready to accept + // values. Call only once, when beginning to produce + // values. + bool started() { + return ready; + } + + // Produce a value. Call only after started(), and after + // a previous produce() is ready. + int produce(T... data) { + return _sub->_next(std::move(data)...); + } + + // End the stream. Call only after started(), and after + // a previous produce() is ready. No functions may be called + // after this. + void close() { + done = 1; + } + + // Signal an error. Call only after started(), and after + // a previous produce() is ready. No functions may be called + // after this. + void set_exception(int error) { + done = error; + } + private: + void start(); + friend class subscription<T...>; +}; + +template <typename... T> +class subscription { + public: + using next_fn = typename stream<T...>::next_fn; + private: + stream<T...>* _stream; + next_fn _next; + private: + explicit subscription(stream<T...>* s): _stream(s) { + ceph_assert(!_stream->_sub); + _stream->_sub = this; + } + + public: + subscription(subscription&& x) + : _stream(x._stream), _next(std::move(x._next)) { + x._stream = nullptr; + if (_stream) { + _stream->_sub = this; + } + } + ~subscription() { + if (_stream) { + _stream->_sub = nullptr; + } + } + + /// \brief Start receiving events from the stream. + /// + /// \param next Callback to call for each event + void start(std::function<int (T...)> next) { + _next = std::move(next); + _stream->ready = true; + } + + // Becomes ready when the stream is empty, or when an error + // happens (in that case, an exception is held). + int done() { + return _stream->done; + } + + friend class stream<T...>; +}; + +#endif /* CEPH_MSG_STREAM_H_ */ diff --git a/src/msg/async/dpdk/toeplitz.h b/src/msg/async/dpdk/toeplitz.h new file mode 100644 index 00000000..3ca38808 --- /dev/null +++ b/src/msg/async/dpdk/toeplitz.h @@ -0,0 +1,92 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/*- + * Copyright (c) 2010 David Malone <dwmalone@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef CEPH_MSG_TOEPLITZ_H_ +#define CEPH_MSG_TOEPLITZ_H_ + +#include <vector> + +using rss_key_type = std::vector<uint8_t>; + +// Mellanox Linux's driver key +static const rss_key_type default_rsskey_40bytes = { + 0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b, + 0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb, + 0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c, + 0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9, + 0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc +}; + +// Intel's i40e PMD default RSS key +static const rss_key_type default_rsskey_52bytes = { + 0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23, + 0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30, + 0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02, + 0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c, + 0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55, + 0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e, + 0x81, 0x15, 0x03, 0x66 +}; + +template<typename T> +static inline uint32_t toeplitz_hash(const rss_key_type& key, const T& data) +{ + uint32_t hash = 0, v; + u_int i, b; + + /* XXXRW: Perhaps an assertion about key length vs. data length? */ + + v = (key[0]<<24) + (key[1]<<16) + (key[2] <<8) + key[3]; + for (i = 0; i < data.size(); i++) { + for (b = 0; b < 8; b++) { + if (data[i] & (1<<(7-b))) + hash ^= v; + v <<= 1; + if ((i + 4) < key.size() && + (key[i+4] & (1<<(7-b)))) + v |= 1; + } + } + return (hash); +} +#endif diff --git a/src/msg/async/dpdk/transfer.h b/src/msg/async/dpdk/transfer.h new file mode 100644 index 00000000..599db5bd --- /dev/null +++ b/src/msg/async/dpdk/transfer.h @@ -0,0 +1,64 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
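A worked sketch of toeplitz_hash() above: any byte container with size() and operator[] can be hashed, for example a hand-packed 5-tuple (example bytes are hypothetical).

  #include <cstdint>
  #include "toeplitz.h"

  uint32_t example() {
    std::vector<uint8_t> tuple = {
      192, 168, 0, 1,    // source IP
      10,  0,  0, 2,     // destination IP
      0x1f, 0x90,        // source port 8080
      0x00, 0x50,        // destination port 80
    };
    return toeplitz_hash(default_rsskey_40bytes, tuple);
  }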
+ */
+
+#ifndef CEPH_TRANSFER_H_
+#define CEPH_TRANSFER_H_
+
+// Helper functions for copying or moving multiple objects in an exception
+// safe manner, then destroying the sources.
+//
+// To transfer, call transfer_pass1(allocator, &from, &to) on all object pairs,
+// (this copies the object from @from to @to). If no exceptions are encountered,
+// call transfer_pass2(allocator, &from, &to). This destroys the object at the
+// origin. If exceptions were encountered, simply destroy all copied objects.
+//
+// As an optimization, if the objects are moveable without throwing (noexcept)
+// transfer_pass1() simply moves the objects and destroys the source, and
+// transfer_pass2() does nothing.
+
+#include <type_traits>
+#include <utility>
+
+template <typename T, typename Alloc>
+inline void transfer_pass1(Alloc& a, T* from, T* to,
+                           typename std::enable_if<std::is_nothrow_move_constructible<T>::value>::type* = nullptr) {
+  a.construct(to, std::move(*from));
+  a.destroy(from);
+}
+
+template <typename T, typename Alloc>
+inline void transfer_pass2(Alloc& a, T* from, T* to,
+                           typename std::enable_if<std::is_nothrow_move_constructible<T>::value>::type* = nullptr) {
+}
+
+template <typename T, typename Alloc>
+inline void transfer_pass1(Alloc& a, T* from, T* to,
+                           typename std::enable_if<!std::is_nothrow_move_constructible<T>::value>::type* = nullptr) {
+  a.construct(to, *from);
+}
+
+template <typename T, typename Alloc>
+inline void transfer_pass2(Alloc& a, T* from, T* to,
+                           typename std::enable_if<!std::is_nothrow_move_constructible<T>::value>::type* = nullptr) {
+  a.destroy(from);
+}
+
+#endif /* CEPH_TRANSFER_H_ */
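Finally, a sketch of how the two passes compose (mirroring what circular_buffer::expand() does above): run pass 1 over every element, and only once nothing has thrown run pass 2 to destroy the sources; on an exception, destroy the already-made copies instead.

  #include <cstddef>
  #include <memory>
  #include "transfer.h"

  // Relocate n elements from src to dst exception-safely.
  template <typename T, typename Alloc>
  void relocate(Alloc& a, T* src, T* dst, size_t n) {
    size_t i = 0;
    try {
      for (; i < n; ++i)
        transfer_pass1(a, src + i, dst + i);  // move (noexcept) or copy
    } catch (...) {
      while (i > 0)
        a.destroy(dst + --i);                 // roll back partial copies
      throw;
    }
    for (i = 0; i < n; ++i)
      transfer_pass2(a, src + i, dst + i);    // copy case: destroy sources
  }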