summaryrefslogtreecommitdiffstats
path: root/src/msg/async/dpdk
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
commite6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree64f88b554b444a49f656b6c656111a145cbbaa28 /src/msg/async/dpdk
parentInitial commit. (diff)
downloadceph-upstream/18.2.2.tar.xz
ceph-upstream/18.2.2.zip
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/msg/async/dpdk')
-rw-r--r--src/msg/async/dpdk/ARP.cc89
-rw-r--r--src/msg/async/dpdk/ARP.h301
-rw-r--r--src/msg/async/dpdk/DPDK.cc1376
-rw-r--r--src/msg/async/dpdk/DPDK.h937
-rw-r--r--src/msg/async/dpdk/DPDKStack.cc284
-rw-r--r--src/msg/async/dpdk/DPDKStack.h272
-rw-r--r--src/msg/async/dpdk/EventDPDK.cc85
-rw-r--r--src/msg/async/dpdk/EventDPDK.h40
-rw-r--r--src/msg/async/dpdk/IP.cc481
-rw-r--r--src/msg/async/dpdk/IP.h403
-rw-r--r--src/msg/async/dpdk/IPChecksum.cc70
-rw-r--r--src/msg/async/dpdk/IPChecksum.h72
-rw-r--r--src/msg/async/dpdk/Packet.cc146
-rw-r--r--src/msg/async/dpdk/Packet.h549
-rw-r--r--src/msg/async/dpdk/PacketUtil.h154
-rw-r--r--src/msg/async/dpdk/TCP-Stack.h40
-rw-r--r--src/msg/async/dpdk/TCP.cc841
-rw-r--r--src/msg/async/dpdk/TCP.h1506
-rw-r--r--src/msg/async/dpdk/UserspaceEvent.cc127
-rw-r--r--src/msg/async/dpdk/UserspaceEvent.h106
-rw-r--r--src/msg/async/dpdk/align.h50
-rw-r--r--src/msg/async/dpdk/array_map.h50
-rw-r--r--src/msg/async/dpdk/byteorder.h58
-rw-r--r--src/msg/async/dpdk/capture.h50
-rw-r--r--src/msg/async/dpdk/circular_buffer.h347
-rw-r--r--src/msg/async/dpdk/const.h42
-rw-r--r--src/msg/async/dpdk/dpdk_rte.cc204
-rw-r--r--src/msg/async/dpdk/dpdk_rte.h79
-rw-r--r--src/msg/async/dpdk/ethernet.cc16
-rw-r--r--src/msg/async/dpdk/ethernet.h84
-rw-r--r--src/msg/async/dpdk/ip_types.h109
-rw-r--r--src/msg/async/dpdk/net.cc205
-rw-r--r--src/msg/async/dpdk/net.h138
-rw-r--r--src/msg/async/dpdk/queue.h96
-rw-r--r--src/msg/async/dpdk/shared_ptr.h391
-rw-r--r--src/msg/async/dpdk/stream.h155
-rw-r--r--src/msg/async/dpdk/toeplitz.h92
-rw-r--r--src/msg/async/dpdk/transfer.h64
38 files changed, 10109 insertions, 0 deletions
diff --git a/src/msg/async/dpdk/ARP.cc b/src/msg/async/dpdk/ARP.cc
new file mode 100644
index 000000000..f73eed40c
--- /dev/null
+++ b/src/msg/async/dpdk/ARP.cc
@@ -0,0 +1,89 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include "ARP.h"
+
+// Binds this handler to its ARP dispatcher and registers it there under
+// the given protocol number.
+arp_for_protocol::arp_for_protocol(arp& a, uint16_t proto_num)
+  : _arp(a),
+    _proto_num(proto_num)
+{
+  // _proto_num has just been initialised from proto_num, so this registers
+  // the same key the destructor later removes.
+  _arp.add(_proto_num, this);
+}
+
+arp_for_protocol::~arp_for_protocol() // Removes the registration made by the constructor.
+{
+ _arp.del(_proto_num); // erases the map entry that holds a raw pointer to this object
+}
+
+// Attaches the ARP dispatcher to an interface: _proto is created for
+// eth_protocol_num::arp with get_packet() as its packet source, and
+// _rx_packets holds the subscription whose callbacks route into
+// process_packet() and forward().
+arp::arp(interface* netif)
+  : _netif(netif),
+    _proto(netif, eth_protocol_num::arp,
+           [this] { return get_packet(); }),
+    _rx_packets(_proto.receive(
+      [this] (Packet pkt, ethernet_address src) {
+        return process_packet(std::move(pkt), src);
+      },
+      [this] (forward_hash& hash_data, Packet& pkt, size_t offset) {
+        return forward(hash_data, pkt, offset);
+      }))
+{
+}
+
+// Pops the oldest queued outgoing ARP packet, or returns an empty optional
+// when nothing is queued.
+std::optional<l3_protocol::l3packet> arp::get_packet()
+{
+  if (_packetq.empty()) {
+    return std::nullopt;
+  }
+  std::optional<l3_protocol::l3packet> next(std::move(_packetq.front()));
+  _packetq.pop_front();
+  return next;
+}
+
+// Hands the forwarding decision to the handler registered for the packet's
+// ARP protocol type.
+//
+// out_hash_data: passed through to the handler's forward().
+// p:             packet being inspected.
+// off:           offset of the ARP header inside p.
+//
+// Returns the handler's result, or false when no handler is registered for
+// the protocol type or the packet is too short to hold an ARP header.
+bool arp::forward(forward_hash& out_hash_data, Packet& p, size_t off)
+{
+  auto ah = p.get_header<arp_hdr>(off);
+  // get_header() can come back null (arp_for::received checks for it), so
+  // do not dereference before testing.
+  if (!ah) {
+    return false;
+  }
+  auto i = _arp_for_protocol.find(ntoh(ah->ptype));
+  if (i != _arp_for_protocol.end()) {
+    return i->second->forward(out_hash_data, p, off);
+  }
+  return false;
+}
+
+// Registers (or replaces) the handler for an ARP protocol type.
+void arp::add(uint16_t proto_num, arp_for_protocol* afp)
+{
+  auto& slot = _arp_for_protocol[proto_num];
+  slot = afp;
+}
+
+// Drops the handler registered for an ARP protocol type, if there is one.
+void arp::del(uint16_t proto_num)
+{
+  auto entry = _arp_for_protocol.find(proto_num);
+  if (entry != _arp_for_protocol.end()) {
+    _arp_for_protocol.erase(entry);
+  }
+}
+
+// Receive path: reads the ARP protocol type from the packet and passes the
+// packet to the handler registered for it.
+//
+// p:    received packet, starting at the ARP header.
+// from: source hardware address (not used here).
+//
+// Always returns 0. Packets that are too short to hold an ARP header, or
+// whose protocol type has no handler, are dropped.
+int arp::process_packet(Packet p, ethernet_address from)
+{
+  auto raw = p.get_header<arp_hdr>();
+  // get_header() can come back null (arp_for::received checks for it), so
+  // do not dereference before testing.
+  if (!raw) {
+    return 0;
+  }
+  auto ah = raw->ntoh();
+  auto i = _arp_for_protocol.find(ah.ptype);
+  if (i != _arp_for_protocol.end()) {
+    i->second->received(std::move(p));
+  }
+  return 0;
+}
diff --git a/src/msg/async/dpdk/ARP.h b/src/msg/async/dpdk/ARP.h
new file mode 100644
index 000000000..e2f6dfe98
--- /dev/null
+++ b/src/msg/async/dpdk/ARP.h
@@ -0,0 +1,301 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+
+#ifndef CEPH_MSG_ARP_H_
+#define CEPH_MSG_ARP_H_
+
+#include <errno.h>
+
+#include <unordered_map>
+#include <functional>
+
+#include "msg/async/Event.h"
+
+#include "ethernet.h"
+#include "circular_buffer.h"
+#include "ip_types.h"
+#include "net.h"
+#include "Packet.h"
+
+class arp;
+template <typename L3>
+class arp_for;
+
+class arp_for_protocol { // Base class for per-protocol ARP handlers; registers itself with an arp dispatcher on construction.
+ protected:
+ arp& _arp; // dispatcher this handler is registered with
+ uint16_t _proto_num; // key used for arp::add() / arp::del()
+ public:
+ arp_for_protocol(arp& a, uint16_t proto_num);
+ virtual ~arp_for_protocol();
+ virtual int received(Packet p) = 0; // invoked by arp::process_packet() when the packet's ptype matches the registered key
+ virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) { return false; } // default implementation ignores its arguments and returns false
+};
+
+class interface;
+
+class arp { // Protocol-independent ARP front end: routes ARP packets to handlers keyed by protocol type.
+ interface* _netif; // interface used for hw_address() here and for arp_learn() in arp_for
+ l3_protocol _proto; // constructed in arp::arp for eth_protocol_num::arp
+ subscription<Packet, ethernet_address> _rx_packets; // result of _proto.receive(...) in arp::arp
+ std::unordered_map<uint16_t, arp_for_protocol*> _arp_for_protocol; // protocol type (host byte order) -> handler; raw pointers, handlers remove themselves in their destructor
+ circular_buffer<l3_protocol::l3packet> _packetq; // outgoing packets pushed by arp_for::send(), popped by get_packet()
+ private:
+ struct arp_hdr { // only the two leading ARP header fields, enough to read the protocol type
+ uint16_t htype;
+ uint16_t ptype;
+ arp_hdr ntoh() { // returns a copy with both fields converted network -> host order
+ arp_hdr hdr = *this;
+ hdr.htype = ::ntoh(htype);
+ hdr.ptype = ::ntoh(ptype);
+ return hdr;
+ }
+ arp_hdr hton() { // returns a copy with both fields converted host -> network order
+ arp_hdr hdr = *this;
+ hdr.htype = ::hton(htype);
+ hdr.ptype = ::hton(ptype);
+ return hdr;
+ }
+ };
+ public:
+ explicit arp(interface* netif);
+ void add(uint16_t proto_num, arp_for_protocol* afp); // register/replace the handler for proto_num
+ void del(uint16_t proto_num); // remove the handler for proto_num
+ private:
+ ethernet_address l2self() { return _netif->hw_address(); } // hardware address of the attached interface
+ int process_packet(Packet p, ethernet_address from);
+ bool forward(forward_hash& out_hash_data, Packet& p, size_t off);
+ std::optional<l3_protocol::l3packet> get_packet();
+ template <class l3_proto>
+ friend class arp_for; // arp_for reaches into _packetq, _netif and l2self()
+};
+
+template <typename L3>
+class arp_for : public arp_for_protocol { // ARP resolver for one L3 protocol: keeps an l3addr -> l2addr table and the set of resolutions still in flight.
+ public:
+ using l2addr = ethernet_address;
+ using l3addr = typename L3::address_type;
+ private:
+ static constexpr auto max_waiters = 512; // per-address cap on queued waiters; wait() answers -EBUSY once reached
+ enum oper {
+ op_request = 1,
+ op_reply = 2,
+ };
+ struct arp_hdr { // full ARP header for this L2/L3 address pair
+ uint16_t htype;
+ uint16_t ptype;
+ uint8_t hlen;
+ uint8_t plen;
+ uint16_t oper;
+ l2addr sender_hwaddr;
+ l3addr sender_paddr;
+ l2addr target_hwaddr;
+ l3addr target_paddr;
+
+ arp_hdr ntoh() { // copy with multi-byte fields and addresses converted network -> host; hlen/plen are single bytes and copied as-is
+ arp_hdr hdr = *this;
+ hdr.htype = ::ntoh(htype);
+ hdr.ptype = ::ntoh(ptype);
+ hdr.oper = ::ntoh(oper);
+ hdr.sender_hwaddr = sender_hwaddr.ntoh();
+ hdr.sender_paddr = sender_paddr.ntoh();
+ hdr.target_hwaddr = target_hwaddr.ntoh();
+ hdr.target_paddr = target_paddr.ntoh();
+ return hdr;
+ }
+
+ arp_hdr hton() { // inverse of ntoh(): copy converted host -> network
+ arp_hdr hdr = *this;
+ hdr.htype = ::hton(htype);
+ hdr.ptype = ::hton(ptype);
+ hdr.oper = ::hton(oper);
+ hdr.sender_hwaddr = sender_hwaddr.hton();
+ hdr.sender_paddr = sender_paddr.hton();
+ hdr.target_hwaddr = target_hwaddr.hton();
+ hdr.target_paddr = target_paddr.hton();
+ return hdr;
+ }
+ };
+ struct resolution { // state of one unresolved address
+ std::vector<std::pair<resolution_cb, Packet>> _waiters; // callbacks and the packets they are waiting to send
+ uint64_t timeout_fd; // value returned by center->create_time_event() for the retry timer
+ };
+ class C_handle_arp_timeout : public EventCallback { // timer callback: retry the query and time out the current waiters
+ arp_for *arp;
+ l3addr paddr;
+ bool first_request; // stored by the constructor; not read anywhere in this class
+
+ public:
+ C_handle_arp_timeout(arp_for *a, l3addr addr, bool first):
+ arp(a), paddr(addr), first_request(first) {}
+ void do_request(uint64_t r) {
+ arp->send_query(paddr); // re-send the query for the still unresolved address
+ auto &res = arp->_in_progress[paddr]; // note: operator[] would create an entry if none existed
+
+ for (auto& p : res._waiters) {
+ p.first(ethernet_address(), std::move(p.second), -ETIMEDOUT); // fail every waiter queued so far
+ }
+ res._waiters.clear();
+ res.timeout_fd = arp->center->create_time_event(
+ 1*1000*1000, this); // re-arm with this same callback object; presumably microseconds (1s) - confirm against EventCenter
+ }
+ };
+ friend class C_handle_arp_timeout;
+
+ private:
+ CephContext *cct;
+ EventCenter *center; // used to create and delete the retry timers
+ l3addr _l3self = L3::broadcast_address(); // own L3 address; handle_request() does not reply while it still equals broadcast_address()
+ std::unordered_map<l3addr, l2addr> _table; // resolved addresses
+ std::unordered_map<l3addr, resolution> _in_progress; // addresses with an outstanding query
+ private:
+ Packet make_query_packet(l3addr paddr);
+ virtual int received(Packet p) override;
+ int handle_request(arp_hdr* ah);
+ l2addr l2self() { return _arp.l2self(); }
+ void send(l2addr to, Packet &&p);
+ public:
+ void send_query(const l3addr& paddr);
+ explicit arp_for(CephContext *c, arp& a, EventCenter *cen)
+ : arp_for_protocol(a, L3::arp_protocol_type()), cct(c), center(cen) {
+ _table[L3::broadcast_address()] = ethernet::broadcast_address(); // L3 broadcast resolves to L2 broadcast without a query
+ }
+ ~arp_for() {
+ for (auto && p : _in_progress)
+ center->delete_time_event(p.second.timeout_fd); // cancel retry timers; NOTE(review): no delete of the C_handle_arp_timeout objects allocated in wait() is visible here
+ }
+ void wait(const l3addr& addr, Packet p, resolution_cb cb);
+ void learn(l2addr l2, l3addr l3);
+ void run();
+ void set_self_addr(l3addr addr) {
+ _table.erase(_l3self); // NOTE(review): on the first call _l3self is still broadcast_address(), so this removes the entry the constructor added
+ _table[addr] = l2self();
+ _l3self = addr;
+ }
+ friend class arp;
+};
+
+// Queues an ARP packet addressed to `to` on the shared dispatcher queue.
+template <typename L3>
+void arp_for<L3>::send(l2addr to, Packet &&p) {
+  l3_protocol::l3packet outgoing{eth_protocol_num::arp, to, std::move(p)};
+  _arp._packetq.push_back(std::move(outgoing));
+}
+
+// Builds an ARP request for `paddr`: the header is filled in host order,
+// converted with hton(), and its bytes are handed to the Packet constructor.
+template <typename L3>
+Packet arp_for<L3>::make_query_packet(l3addr paddr) {
+  arp_hdr query;
+  query.htype = ethernet::arp_hardware_type();
+  query.ptype = L3::arp_protocol_type();
+  query.hlen = sizeof(l2addr);
+  query.plen = sizeof(l3addr);
+  query.oper = op_request;
+  query.sender_hwaddr = l2self();
+  query.sender_paddr = _l3self;
+  query.target_hwaddr = ethernet::broadcast_address();
+  query.target_paddr = paddr;
+
+  arp_hdr wire = query.hton();
+  return Packet(reinterpret_cast<char*>(&wire), sizeof(wire));
+}
+
+// Sends an ARP request for `paddr` to the L2 broadcast address.
+template <typename L3>
+void arp_for<L3>::send_query(const l3addr& paddr) {
+  Packet query = make_query_packet(paddr);
+  send(ethernet::broadcast_address(), std::move(query));
+}
+
+// Records paddr -> hwaddr and, if a resolution for paddr was in flight,
+// cancels its timer, completes every waiter with status 0 and forgets it.
+template <typename L3>
+void arp_for<L3>::learn(l2addr hwaddr, l3addr paddr) {
+  _table[paddr] = hwaddr;
+
+  auto pending = _in_progress.find(paddr);
+  if (pending == _in_progress.end()) {
+    return;
+  }
+
+  auto& res = pending->second;
+  center->delete_time_event(res.timeout_fd);
+  for (auto& waiter : res._waiters) {
+    waiter.first(hwaddr, std::move(waiter.second), 0);
+  }
+  _in_progress.erase(pending);
+}
+
+// Resolves `paddr` for packet `p`.
+//  - Known address: cb is called immediately with status 0.
+//  - Otherwise the (cb, p) pair is queued; the first request for an address
+//    also arms the retry timer and sends the query.
+//  - If max_waiters pairs are already queued, cb is called with -EBUSY.
+template <typename L3>
+void arp_for<L3>::wait(const l3addr& paddr, Packet p, resolution_cb cb) {
+  auto known = _table.find(paddr);
+  if (known != _table.end()) {
+    cb(known->second, std::move(p), 0);
+    return;
+  }
+
+  auto pending = _in_progress.find(paddr);
+  const bool first_request = (pending == _in_progress.end());
+  auto* res = first_request ? &_in_progress[paddr] : &pending->second;
+
+  if (first_request) {
+    res->timeout_fd = center->create_time_event(
+      1*1000*1000, new C_handle_arp_timeout(this, paddr, first_request));
+    send_query(paddr);
+  }
+
+  if (res->_waiters.size() < max_waiters) {
+    res->_waiters.emplace_back(cb, std::move(p));
+  } else {
+    cb(ethernet_address(), std::move(p), -EBUSY);
+  }
+}
+
+// Handles an incoming ARP packet for this protocol. Packets without a full
+// header or with unexpected address lengths are ignored. Requests go to
+// handle_request(); replies are passed to the interface's arp_learn().
+// Returns handle_request()'s result for requests and 0 otherwise.
+template <typename L3>
+int arp_for<L3>::received(Packet p) {
+  auto raw = p.get_header<arp_hdr>();
+  if (!raw) {
+    return 0;
+  }
+
+  auto h = raw->ntoh();
+  const bool sizes_match = !(h.hlen != sizeof(l2addr) || h.plen != sizeof(l3addr));
+  if (!sizes_match) {
+    return 0;
+  }
+
+  if (h.oper == op_request) {
+    return handle_request(&h);
+  }
+  if (h.oper == op_reply) {
+    _arp._netif->arp_learn(h.sender_hwaddr, h.sender_paddr);
+  }
+  return 0;
+}
+
+// Answers an ARP request aimed at our own address. Nothing is sent while
+// _l3self is still the broadcast address or when the request targets
+// someone else. The header is rewritten in place into the reply.
+// Always returns 0.
+template <typename L3>
+int arp_for<L3>::handle_request(arp_hdr* ah) {
+  const bool for_us = ah->target_paddr == _l3self
+    && _l3self != L3::broadcast_address();
+  if (!for_us) {
+    return 0;
+  }
+
+  // Turn the request into a reply: requester becomes the target, we become
+  // the sender.
+  ah->oper = op_reply;
+  ah->target_hwaddr = ah->sender_hwaddr;
+  ah->target_paddr = ah->sender_paddr;
+  ah->sender_hwaddr = l2self();
+  ah->sender_paddr = _l3self;
+  *ah = ah->hton();
+  send(ah->target_hwaddr, Packet(reinterpret_cast<char*>(ah), sizeof(*ah)));
+  return 0;
+}
+
+#endif /* CEPH_MSG_ARP_H_ */
diff --git a/src/msg/async/dpdk/DPDK.cc b/src/msg/async/dpdk/DPDK.cc
new file mode 100644
index 000000000..a10c6ec96
--- /dev/null
+++ b/src/msg/async/dpdk/DPDK.cc
@@ -0,0 +1,1376 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include <atomic>
+#include <vector>
+#include <queue>
+
+#include <rte_config.h>
+#include <rte_common.h>
+#include <rte_eal.h>
+#include <rte_pci.h>
+#include <rte_ethdev.h>
+#include <rte_ether.h>
+#include <rte_cycles.h>
+#include <rte_memzone.h>
+
+#include "include/page.h"
+#include "align.h"
+#include "IP.h"
+#include "const.h"
+#include "dpdk_rte.h"
+#include "DPDK.h"
+#include "toeplitz.h"
+
+#include "common/Cycles.h"
+#include "common/dout.h"
+#include "common/errno.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdk "
+
+
+// Returns the address of a pktmbuf pool private descriptor as an untyped
+// pointer.
+void* as_cookie(struct rte_pktmbuf_pool_private& p) {
+  return static_cast<void*>(&p);
+}
+
+/******************* Net device related constants *****************************/
+static constexpr uint16_t default_ring_size = 512; // descriptor count passed to rte_eth_rx_queue_setup()
+
+//
+// We need 2 times the ring size of buffers because of the way PMDs
+// refill the ring.
+//
+static constexpr uint16_t mbufs_per_queue_rx = 2 * default_ring_size;
+static constexpr uint16_t rx_gc_thresh = 64;
+
+//
+// No need to keep more descriptors in the air than can be sent in a single
+// rte_eth_tx_burst() call.
+//
+static constexpr uint16_t mbufs_per_queue_tx = 2 * default_ring_size;
+
+static constexpr uint16_t mbuf_cache_size = 512; // cache_size argument given to rte_mempool_create()
+//
+// Size of the data buffer in the non-inline case.
+//
+// We may want to change (increase) this value in future, while the
+// inline_mbuf_data_size value will unlikely change due to reasons described
+// below (see the comment on inline_mbuf_data_size).
+//
+static constexpr size_t mbuf_data_size = 4096;
+
+static constexpr uint16_t mbuf_overhead =
+ sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM;
+//
+// We'll allocate 2K data buffers for an inline case because this would require
+// a single page per mbuf. If we used 4K data buffers here it would require 2
+// pages for a single buffer (due to "mbuf_overhead") and this is a much more
+// demanding memory constraint.
+//
+static constexpr size_t inline_mbuf_data_size = 2048;
+
+
+// (INLINE_MBUF_DATA_SIZE(2K)*32 = 64K = Max TSO/LRO size) + 1 mbuf for headers
+static constexpr uint8_t max_frags = 32 + 1;
+
+//
+// Intel's 40G NIC HW limit for a number of fragments in an xmit segment.
+//
+// See Chapter 8.4.1 "Transmit Packet in System Memory" of the xl710 devices
+// spec. for more details.
+//
+static constexpr uint8_t i40e_max_xmit_segment_frags = 8;
+
+//
+// VMWare's virtual NIC limit for a number of fragments in an xmit segment.
+//
+// see drivers/net/vmxnet3/base/vmxnet3_defs.h VMXNET3_MAX_TXD_PER_PKT
+//
+static constexpr uint8_t vmxnet3_max_xmit_segment_frags = 16;
+
+static constexpr uint16_t inline_mbuf_size = inline_mbuf_data_size + mbuf_overhead;
+
+static size_t huge_page_size = 512 * CEPH_PAGE_SIZE; // 512 pages; 2 MiB when CEPH_PAGE_SIZE is 4 KiB
+
+// Returns the memory footprint accounted for one queue pair: an Rx
+// contribution (element size mbuf_overhead) plus a Tx contribution
+// (element size inline_mbuf_size), each including the pktmbuf pool private
+// area and rounded up to huge_page_size.
+uint32_t qp_mempool_obj_size()
+{
+  //
+  // Each size is aligned to the huge page size because DPDK allocates a
+  // physically contiguous memory region for each pool object.
+  //
+  auto aligned_pool_size = [](uint32_t elt_size) {
+    struct rte_mempool_objsz obj_sz = {};
+    return align_up(rte_mempool_calc_obj_size(elt_size, 0, &obj_sz) +
+                    sizeof(struct rte_pktmbuf_pool_private),
+                    huge_page_size);
+  };
+
+  uint32_t total = 0;
+  total += aligned_pool_size(mbuf_overhead);     // Rx
+  total += aligned_pool_size(inline_mbuf_size);  // Tx
+  return total;
+}
+
+static constexpr const char* pktmbuf_pool_name = "dpdk_net_pktmbuf_pool"; // prefix; init_rx_mbuf_pool() appends the queue id and "_rx"
+
+/*
+ * When doing reads from the NIC queues, use this batch size
+ */
+static constexpr uint8_t packet_read_size = 32;
+/******************************************************************************/
+
+int DPDKDevice::init_port_start() // Reads device capabilities, derives queue count / RSS / offload settings and configures the port. Returns 0, -EINVAL (unsupported RSS key size) or the rte_eth_dev_configure() error.
+{
+ ceph_assert(_port_idx < rte_eth_dev_count_avail());
+
+ rte_eth_dev_info_get(_port_idx, &_dev_info);
+
+ //
+ // This is a workaround for a missing handling of a HW limitation in the
+ // DPDK i40e driver. This and all related to _is_i40e_device code should be
+ // removed once this handling is added.
+ //
+ if (std::string("rte_i40evf_pmd") == _dev_info.driver_name ||
+ std::string("rte_i40e_pmd") == _dev_info.driver_name) {
+ ldout(cct, 1) << __func__ << " Device is an Intel's 40G NIC. Enabling 8 fragments hack!" << dendl;
+ _is_i40e_device = true;
+ }
+
+ if (std::string("rte_vmxnet3_pmd") == _dev_info.driver_name) {
+ ldout(cct, 1) << __func__ << " Device is a VMWare Virtual NIC. Enabling 16 fragments hack!" << dendl;
+ _is_vmxnet3_device = true;
+ }
+
+ //
+ // Another workaround: this time for a lack of number of RSS bits.
+ // ixgbe PF NICs support up to 16 RSS queues.
+ // ixgbe VF NICs support up to 4 RSS queues.
+ // i40e PF NICs support up to 64 RSS queues.
+ // i40e VF NICs support up to 16 RSS queues.
+ //
+ if (std::string("rte_ixgbe_pmd") == _dev_info.driver_name) {
+ _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)16);
+ } else if (std::string("rte_ixgbevf_pmd") == _dev_info.driver_name) {
+ _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)4);
+ } else if (std::string("rte_i40e_pmd") == _dev_info.driver_name) {
+ _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)64);
+ } else if (std::string("rte_i40evf_pmd") == _dev_info.driver_name) {
+ _dev_info.max_rx_queues = std::min(_dev_info.max_rx_queues, (uint16_t)16);
+ }
+
+ // Hardware offload capabilities
+ // https://github.com/DPDK/dpdk/blob/v19.05/lib/librte_ethdev/rte_ethdev.h#L993-L1074
+ // We want to support all available offload features
+ // TODO: below features are implemented in 17.05, should support new ones
+ const uint64_t tx_offloads_wanted =
+ DEV_TX_OFFLOAD_VLAN_INSERT |
+ DEV_TX_OFFLOAD_IPV4_CKSUM |
+ DEV_TX_OFFLOAD_UDP_CKSUM |
+ DEV_TX_OFFLOAD_TCP_CKSUM |
+ DEV_TX_OFFLOAD_SCTP_CKSUM |
+ DEV_TX_OFFLOAD_TCP_TSO |
+ DEV_TX_OFFLOAD_UDP_TSO |
+ DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM |
+ DEV_TX_OFFLOAD_QINQ_INSERT |
+ DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+ DEV_TX_OFFLOAD_GRE_TNL_TSO |
+ DEV_TX_OFFLOAD_IPIP_TNL_TSO |
+ DEV_TX_OFFLOAD_GENEVE_TNL_TSO |
+ DEV_TX_OFFLOAD_MACSEC_INSERT;
+
+ _dev_info.default_txconf.offloads =
+ _dev_info.tx_offload_capa & tx_offloads_wanted; // keep only the wanted offloads the device actually reports
+
+ /* for port configuration all features are off by default */
+ rte_eth_conf port_conf = { 0 };
+
+ /* setting tx offloads for port */
+ port_conf.txmode.offloads = _dev_info.default_txconf.offloads;
+
+ ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": max_rx_queues "
+ << _dev_info.max_rx_queues << " max_tx_queues "
+ << _dev_info.max_tx_queues << dendl;
+
+ _num_queues = std::min({_num_queues, _dev_info.max_rx_queues, _dev_info.max_tx_queues}); // clamp the requested queue count to what the device offers
+
+ ldout(cct, 5) << __func__ << " Port " << int(_port_idx) << ": using "
+ << _num_queues << " queues" << dendl;
+
+ // Set RSS mode: enable RSS if seastar is configured with more than 1 CPU.
+ // Even if port has a single queue we still want the RSS feature to be
+ // available in order to make HW calculate RSS hash for us.
+ if (_num_queues > 1) {
+ if (_dev_info.hash_key_size == 40) {
+ _rss_key = default_rsskey_40bytes;
+ } else if (_dev_info.hash_key_size == 52) {
+ _rss_key = default_rsskey_52bytes;
+ } else if (_dev_info.hash_key_size != 0) {
+ lderr(cct) << "Port " << int(_port_idx)
+ << ": We support only 40 or 52 bytes RSS hash keys, "
+ << int(_dev_info.hash_key_size) << " bytes key requested"
+ << dendl;
+ return -EINVAL;
+ } else {
+ _rss_key = default_rsskey_40bytes; // device reported key size 0: fall back to the 40-byte key
+ _dev_info.hash_key_size = 40;
+ }
+
+ port_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
+ /* enable all supported rss offloads */
+ port_conf.rx_adv_conf.rss_conf.rss_hf = _dev_info.flow_type_rss_offloads;
+ if (_dev_info.hash_key_size) {
+ port_conf.rx_adv_conf.rss_conf.rss_key = const_cast<uint8_t *>(_rss_key.data());
+ port_conf.rx_adv_conf.rss_conf.rss_key_len = _dev_info.hash_key_size;
+ }
+ } else {
+ port_conf.rxmode.mq_mode = ETH_MQ_RX_NONE;
+ }
+
+ if (_num_queues > 1) {
+ if (_dev_info.reta_size) {
+ // RETA size should be a power of 2
+ ceph_assert((_dev_info.reta_size & (_dev_info.reta_size - 1)) == 0);
+
+ // Set the RSS table to the correct size
+ _redir_table.resize(_dev_info.reta_size);
+ _rss_table_bits = std::lround(std::log2(_dev_info.reta_size));
+ ldout(cct, 5) << __func__ << " Port " << int(_port_idx)
+ << ": RSS table size is " << _dev_info.reta_size << dendl;
+ } else {
+ // FIXME: same with sw_reta
+ _redir_table.resize(128);
+ _rss_table_bits = std::lround(std::log2(128));
+ }
+ } else {
+ _redir_table.push_back(0); // single queue: one entry pointing at queue 0
+ }
+
+ // Set Rx VLAN stripping
+ if (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) {
+ port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
+ }
+
+#ifdef RTE_ETHDEV_HAS_LRO_SUPPORT
+ // Enable LRO
+ if (_use_lro && (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO)) {
+ ldout(cct, 1) << __func__ << " LRO is on" << dendl;
+ port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_TCP_LRO;
+ _hw_features.rx_lro = true;
+ } else
+#endif
+ ldout(cct, 1) << __func__ << " LRO is off" << dendl; // else-branch when LRO support is compiled in; unconditional otherwise
+
+ // Check that all CSUM features are either all set all together or not set
+ // all together. If this assumption breaks we need to rework the below logic
+ // by splitting the csum offload feature bit into separate bits for IPv4,
+ // TCP.
+ ceph_assert(((_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
+ (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) ||
+ (!(_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
+ !(_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)));
+
+ // Set Rx checksum checking
+ if ((_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) &&
+ (_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_CKSUM)) {
+ ldout(cct, 1) << __func__ << " RX checksum offload supported" << dendl;
+ port_conf.rxmode.offloads |= DEV_RX_OFFLOAD_CHECKSUM;
+ _hw_features.rx_csum_offload = 1;
+ }
+
+ if ((_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
+ ldout(cct, 1) << __func__ << " TX ip checksum offload supported" << dendl;
+ _hw_features.tx_csum_ip_offload = 1;
+ }
+
+ // TSO is supported starting from DPDK v1.8
+ // TSO is abnormal in some DPDK versions (eg.dpdk-20.11-3.e18.aarch64), try
+ // disable TSO by ms_dpdk_enable_tso=false
+ if ((_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) &&
+ cct->_conf.get_val<bool>("ms_dpdk_enable_tso")) {
+ ldout(cct, 1) << __func__ << " TSO is supported" << dendl;
+ _hw_features.tx_tso = 1;
+ }
+
+ // Check that Tx TCP CSUM features are either all set all together
+ // or not set all together. If this assumption breaks we need to rework the
+ // below logic by splitting the csum offload feature bit into separate bits
+ // for TCP.
+ ceph_assert((_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) ||
+ !(_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM)); // NOTE(review): condition has the form (X || !X) and can never fail
+
+ if (_dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_CKSUM) {
+ ldout(cct, 1) << __func__ << " TX TCP checksum offload supported" << dendl;
+ _hw_features.tx_csum_l4_offload = 1;
+ }
+
+ int retval;
+
+ ldout(cct, 1) << __func__ << " Port " << int(_port_idx) << " init ... " << dendl;
+
+ /*
+ * Standard DPDK port initialisation - config port, then set up
+ * rx and tx rings.
+ */
+ if ((retval = rte_eth_dev_configure(_port_idx, _num_queues, _num_queues,
+ &port_conf)) != 0) {
+ lderr(cct) << __func__ << " failed to configure port " << (int)_port_idx
+ << " rx/tx queues " << _num_queues << " error " << cpp_strerror(retval) << dendl;
+ return retval;
+ }
+
+ //rte_eth_promiscuous_enable(port_num);
+ ldout(cct, 1) << __func__ << " done." << dendl;
+
+ return 0;
+}
+
+// Applies the configured hardware flow-control mode (_enable_fc) to the
+// port. A device answering -ENOTSUP to either the read or the write is
+// logged and left alone; any other negative result aborts.
+void DPDKDevice::set_hw_flow_control()
+{
+  // Start from the port's current/default flow control settings.
+  struct rte_eth_fc_conf fc_conf;
+  auto ret = rte_eth_dev_flow_ctrl_get(_port_idx, &fc_conf);
+
+  if (ret == -ENOTSUP) {
+    ldout(cct, 1) << __func__ << " port " << int(_port_idx)
+                  << ": not support to get hardware flow control settings: " << ret << dendl;
+    ldout(cct, 1) << __func__ << " port " << int(_port_idx) << ": changing HW FC settings is not supported" << dendl;
+    return;
+  }
+  if (ret < 0) {
+    lderr(cct) << __func__ << " port " << int(_port_idx)
+               << ": failed to get hardware flow control settings: " << ret << dendl;
+    ceph_abort();
+  }
+
+  fc_conf.mode = _enable_fc ? RTE_FC_FULL : RTE_FC_NONE;
+
+  ret = rte_eth_dev_flow_ctrl_set(_port_idx, &fc_conf);
+  if (ret == -ENOTSUP) {
+    ldout(cct, 1) << __func__ << " port " << int(_port_idx)
+                  << ": not support to set hardware flow control settings: " << ret << dendl;
+    ldout(cct, 1) << __func__ << " port " << int(_port_idx) << ": changing HW FC settings is not supported" << dendl;
+    return;
+  }
+  if (ret < 0) {
+    lderr(cct) << __func__ << " port " << int(_port_idx)
+               << ": failed to set hardware flow control settings: " << ret << dendl;
+    ceph_abort();
+  }
+
+  ldout(cct, 1) << __func__ << " port " << int(_port_idx) << ": HW FC " << _enable_fc << dendl;
+}
+
+// Admin socket hook that dumps NIC statistics of a DPDKDevice:
+// "show_pmd_stats" -> nic_stats_dump(), "show_pmd_xstats" ->
+// nic_xstats_dump(). Any other prefix is a no-op. call() always returns 0.
+class XstatSocketHook : public AdminSocketHook {
+  DPDKDevice *dev;
+ public:
+  explicit XstatSocketHook(DPDKDevice *d) : dev(d) {}
+  int call(std::string_view prefix, const cmdmap_t& cmdmap,
+           Formatter *f,
+           std::ostream& ss,
+           bufferlist& out) override {
+    if (prefix == "show_pmd_xstats") {
+      dev->nic_xstats_dump(f);
+      return 0;
+    }
+    if (prefix == "show_pmd_stats") {
+      dev->nic_stats_dump(f);
+    }
+    return 0;
+  }
+};
+
+// Second stage of port bring-up: applies flow control, starts the port,
+// installs the RSS rule when more than one queue is used, waits for the
+// link and registers the "show_pmd_stats" / "show_pmd_xstats" admin socket
+// commands.
+//
+// Returns 0 on success, -1 if the port cannot be started or the link does
+// not come up. Asserts if an admin socket command cannot be registered.
+int DPDKDevice::init_port_fini()
+{
+  // Changing FC requires HW reset, so set it before the port is initialized.
+  set_hw_flow_control();
+
+  if (rte_eth_dev_start(_port_idx) != 0) {
+    // Stream the index as an integer, like the other log lines in this
+    // file, so a char-sized index is not printed as a character.
+    lderr(cct) << __func__ << " can't start port " << int(_port_idx) << dendl;
+    return -1;
+  }
+
+  if (_num_queues > 1)
+    set_rss_table();
+
+  // Wait for a link
+  if (check_port_link_status() < 0) {
+    lderr(cct) << __func__ << " port link up failed " << int(_port_idx) << dendl;
+    return -1;
+  }
+
+  ldout(cct, 5) << __func__ << " created DPDK device" << dendl;
+  AdminSocket *admin_socket = cct->get_admin_socket();
+  dfx_hook = std::make_unique<XstatSocketHook>(this);
+  int r = admin_socket->register_command("show_pmd_stats", dfx_hook.get(),
+                                         "show pmd stats statistics");
+  ceph_assert(r == 0);
+  r = admin_socket->register_command("show_pmd_xstats", dfx_hook.get(),
+                                     "show pmd xstats statistics");
+  ceph_assert(r == 0);
+  return 0;
+}
+
+// Fills the redirection table round-robin over the configured queues and
+// installs an rte_flow RSS rule (Toeplitz hash over fragmented IPv4 and
+// non-fragmented IPv4/TCP) for ingress traffic. If the driver rejects the
+// rule at validation, or creation fails, this is logged and _flow is left
+// or set to null.
+void DPDKDevice::set_rss_table()
+{
+  // Value-initialise every structure handed to rte_flow so that fields we
+  // do not set explicitly (spec/last/mask, unused action conf, ...) are
+  // zero rather than indeterminate.
+  struct rte_flow_attr attr = {};
+  struct rte_flow_item pattern[1] = {};
+  struct rte_flow_action action[2] = {};
+  struct rte_flow_action_rss rss_conf = {};
+
+  /*
+   * set the rule attribute.
+   * in this case only ingress packets will be checked.
+   */
+  attr.ingress = 1;
+
+  /* the final level must be always type end */
+  pattern[0].type = RTE_FLOW_ITEM_TYPE_END;
+
+  /*
+   * create the action sequence.
+   * one action only, set rss hash func to toeplitz.
+   */
+  uint16_t i = 0;
+  for (auto& r : _redir_table) {
+    r = i++ % _num_queues;
+  }
+  rss_conf.func = RTE_ETH_HASH_FUNCTION_TOEPLITZ;
+  rss_conf.types = ETH_RSS_FRAG_IPV4 | ETH_RSS_NONFRAG_IPV4_TCP;
+  rss_conf.queue_num = _num_queues;
+  rss_conf.queue = const_cast<uint16_t *>(_redir_table.data());
+  rss_conf.key_len = _dev_info.hash_key_size;
+  rss_conf.key = const_cast<uint8_t *>(_rss_key.data());
+  rss_conf.level = 0;
+  action[0].type = RTE_FLOW_ACTION_TYPE_RSS;
+  action[0].conf = &rss_conf;
+  action[1].type = RTE_FLOW_ACTION_TYPE_END;
+
+  if (rte_flow_validate(_port_idx, &attr, pattern, action, nullptr) == 0) {
+    _flow = rte_flow_create(_port_idx, &attr, pattern, action, nullptr);
+    if (!_flow) {
+      ldout(cct, 0) << __func__ << " Port " << int(_port_idx)
+                    << ": flow rss rule creation failed"
+                    << dendl;
+    }
+  } else {
+    // Stream the index as an integer, like the other log lines in this file.
+    ldout(cct, 0) << __func__ << " Port " << int(_port_idx)
+                  << ": flow rss func configuration is unsupported"
+                  << dendl;
+  }
+}
+
+// Sets up proxying of packets for other CPUs. When the only weighted CPU
+// is this queue itself nothing is registered; otherwise a packet provider
+// draining _proxy_packetq is installed and the software redirection table
+// is built from the weights. cpu_weights must not be empty.
+void DPDKQueuePair::configure_proxies(const std::map<unsigned, float>& cpu_weights) {
+  ceph_assert(!cpu_weights.empty());
+
+  // special case queue sending to self only, to avoid requiring a hash value
+  const bool self_only =
+    cpu_weights.size() == 1 && cpu_weights.begin()->first == _qid;
+  if (self_only) {
+    return;
+  }
+
+  register_packet_provider([this] {
+    if (_proxy_packetq.empty()) {
+      return std::optional<Packet>();
+    }
+    std::optional<Packet> next(std::move(_proxy_packetq.front()));
+    _proxy_packetq.pop_front();
+    return next;
+  });
+  build_sw_reta(cpu_weights);
+}
+
+// Builds the 128-entry software redirection table from per-CPU weights:
+// each CPU receives a share of consecutive slots proportional to its
+// weight relative to the total.
+//
+// cpu_weights: CPU id -> weight. Iterated in key order.
+//
+// The table starts zero-filled, so slots the loop does not reach (for
+// example when the weights sum to zero) hold 0 rather than indeterminate
+// bytes, and the fill loop is bounded by the table size so unusual weights
+// cannot write past the end.
+void DPDKQueuePair::build_sw_reta(const std::map<unsigned, float>& cpu_weights) {
+  float total_weight = 0;
+  for (auto&& x : cpu_weights) {
+    total_weight += x.second;
+  }
+  float accum = 0;
+  unsigned idx = 0;
+  std::array<uint8_t, 128> reta{};
+  for (auto&& entry : cpu_weights) {
+    auto cpu = entry.first;
+    auto weight = entry.second;
+    accum += weight;
+    // The -0.5 rounds each CPU's cumulative boundary to the nearest slot.
+    while (idx < reta.size() &&
+           idx < (accum / total_weight * reta.size() - 0.5)) {
+      reta[idx++] = cpu;
+    }
+  }
+  _sw_reta = reta;
+}
+
+
+// Creates (or looks up) this queue's Rx mbuf pool.
+//
+// When the pool does not exist yet it is created, an additional memzone of
+// data buffers is reserved and recorded in _alloc_bufs, and the device Rx
+// queue is set up on the new pool. When a pool with the expected name
+// already exists, it is reused and none of those steps run.
+//
+// Returns true when a pool is available, false if pool creation or Rx
+// queue setup fails. Asserts if the extra memzone cannot be reserved.
+bool DPDKQueuePair::init_rx_mbuf_pool()
+{
+  std::string name = std::string(pktmbuf_pool_name) + std::to_string(_qid) + "_rx";
+
+  // reserve the memory for Rx buffers containers
+  _rx_free_pkts.reserve(mbufs_per_queue_rx);
+  _rx_free_bufs.reserve(mbufs_per_queue_rx);
+
+  _pktmbuf_pool_rx = rte_mempool_lookup(name.c_str());
+  if (!_pktmbuf_pool_rx) {
+    ldout(cct, 1) << __func__ << " Creating Rx mbuf pool '" << name.c_str()
+                  << "' [" << mbufs_per_queue_rx << " mbufs] ..."<< dendl;
+
+    //
+    // Don't pass single-producer/single-consumer flags to mbuf create as it
+    // seems faster to use a cache instead.
+    //
+    struct rte_pktmbuf_pool_private roomsz = {};
+    roomsz.mbuf_data_room_size = mbuf_data_size + RTE_PKTMBUF_HEADROOM;
+    _pktmbuf_pool_rx = rte_mempool_create(
+        name.c_str(),
+        mbufs_per_queue_rx, mbuf_overhead + mbuf_data_size,
+        mbuf_cache_size,
+        sizeof(struct rte_pktmbuf_pool_private),
+        rte_pktmbuf_pool_init, as_cookie(roomsz),
+        rte_pktmbuf_init, nullptr,
+        rte_socket_id(), 0);
+    if (!_pktmbuf_pool_rx) {
+      lderr(cct) << __func__ << " Failed to create mempool for rx" << dendl;
+      return false;
+    }
+
+    //
+    // allocate more data buffer
+    int bufs_count = cct->_conf->ms_dpdk_rx_buffer_count_per_core - mbufs_per_queue_rx;
+    int mz_flags = RTE_MEMZONE_1GB|RTE_MEMZONE_SIZE_HINT_ONLY;
+    std::string mz_name = "rx_buffer_data" + std::to_string(_qid);
+    const struct rte_memzone *mz = rte_memzone_reserve_aligned(mz_name.c_str(),
+        mbuf_data_size*bufs_count, _pktmbuf_pool_rx->socket_id, mz_flags, mbuf_data_size);
+    ceph_assert(mz);
+    // Walk the zone in mbuf_data_size steps. A char* is used because
+    // arithmetic on void* is a compiler extension, not standard C++.
+    char* m = static_cast<char*>(mz->addr);
+    for (int i = 0; i < bufs_count; i++) {
+      ceph_assert(m);
+      _alloc_bufs.push_back(static_cast<void*>(m));
+      m += mbuf_data_size;
+    }
+
+    if (rte_eth_rx_queue_setup(_dev_port_idx, _qid, default_ring_size,
+                               rte_eth_dev_socket_id(_dev_port_idx),
+                               _dev->def_rx_conf(), _pktmbuf_pool_rx) < 0) {
+      lderr(cct) << __func__ << " cannot initialize rx queue" << dendl;
+      return false;
+    }
+  }
+
+  return _pktmbuf_pool_rx != nullptr;
+}
+
+/**
+ * Polls the link state of the port until it is reported up.
+ *
+ * The state is sampled with rte_eth_link_get_nowait(); between samples the
+ * caller sleeps for sleep_time, and it gives up after max_check_time retries.
+ *
+ * @return 0 once the link is up, -1 if it is still down after the last retry.
+ */
+int DPDKDevice::check_port_link_status()
+{
+  int count = 0;
+
+  ldout(cct, 20) << __func__ << dendl;
+  const int sleep_time = 100 * 1000;
+  const int max_check_time = 90;  /* 9s (90 * 100ms) in total */
+  while (true) {
+    struct rte_eth_link link;
+    memset(&link, 0, sizeof(link));
+    rte_eth_link_get_nowait(_port_idx, &link);
+
+    if (link.link_status) {
+      ldout(cct, 5) << __func__ << " done port "
+                    << static_cast<unsigned>(_port_idx)
+                    << " link Up - speed " << link.link_speed
+                    << " Mbps - "
+                    << ((link.link_duplex == ETH_LINK_FULL_DUPLEX) ? ("full-duplex") : ("half-duplex"))
+                    << dendl;
+      break;
+    }
+
+    if (count++ >= max_check_time) {
+      // Cast the port index like the "link Up" message does so it is printed
+      // as a number.
+      lderr(cct) << __func__ << " done port "
+                 << static_cast<unsigned>(_port_idx) << " link down" << dendl;
+      return -1;
+    }
+
+    ldout(cct, 20) << __func__ << " not ready, continue to wait." << dendl;
+    usleep(sleep_time);
+  }
+  return 0;
+}
+
+/**
+ * Event callback that forwards to DPDKQueuePair::handle_stats() of the queue
+ * pair it was created for.
+ */
+class C_handle_dev_stats : public EventCallback {
+  DPDKQueuePair *_qp;
+ public:
+  explicit C_handle_dev_stats(DPDKQueuePair *qp): _qp(qp) { }
+  // The id argument is not used.
+  void do_request(uint64_t id) override {
+    _qp->handle_stats();
+  }
+};
+
+/**
+ * Constructs the queue pair with index qid on device dev.
+ *
+ * Initializes the Rx mbuf pool (aborting on failure), checks the tx_buf layout
+ * invariants at compile time, registers the per-queue perf counters and, for
+ * queue 0 only, arms the time event that runs handle_stats().
+ *
+ * @param c   Ceph context used for logging, configuration and perf counters
+ * @param cen event center the statistics time event is created on
+ * @param dev device this queue pair belongs to
+ * @param qid index of this queue pair
+ */
+DPDKQueuePair::DPDKQueuePair(CephContext *c, EventCenter *cen, DPDKDevice* dev, uint8_t qid)
+  : cct(c), _dev(dev), _dev_port_idx(dev->port_idx()), center(cen), _qid(qid),
+    _tx_poller(this), _rx_gc_poller(this), _tx_buf_factory(c, dev, qid),
+    _tx_gc_poller(this)
+{
+  if (!init_rx_mbuf_pool()) {
+    lderr(cct) << __func__ << " cannot initialize mbuf pools" << dendl;
+    ceph_abort();
+  }
+
+  static_assert(offsetof(tx_buf, private_end) -
+                offsetof(tx_buf, private_start) <= RTE_PKTMBUF_HEADROOM,
+                "RTE_PKTMBUF_HEADROOM is less than DPDKQueuePair::tx_buf size! "
+                "Increase the headroom size in the DPDK configuration");
+  static_assert(offsetof(tx_buf, _mbuf) == 0,
+                "There is a pad at the beginning of the tx_buf before _mbuf "
+                "field!");
+  static_assert((inline_mbuf_data_size & (inline_mbuf_data_size - 1)) == 0,
+                "inline_mbuf_data_size has to be a power of two!");
+
+  std::string name(std::string("queue") + std::to_string(qid));
+  PerfCountersBuilder plb(cct, name, l_dpdk_qp_first, l_dpdk_qp_last);
+
+  // Counter names are left untouched; only the misspelled descriptions
+  // ("sendd") are corrected.
+  plb.add_u64_counter(l_dpdk_qp_rx_packets, "dpdk_receive_packets", "DPDK received packets");
+  plb.add_u64_counter(l_dpdk_qp_tx_packets, "dpdk_send_packets", "DPDK sent packets");
+  plb.add_u64_counter(l_dpdk_qp_rx_bad_checksum_errors, "dpdk_receive_bad_checksum_errors", "DPDK received bad checksum packets");
+  plb.add_u64_counter(l_dpdk_qp_rx_no_memory_errors, "dpdk_receive_no_memory_errors", "DPDK received no memory packets");
+  plb.add_u64_counter(l_dpdk_qp_rx_bytes, "dpdk_receive_bytes", "DPDK received bytes", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_dpdk_qp_tx_bytes, "dpdk_send_bytes", "DPDK sent bytes", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_dpdk_qp_rx_last_bunch, "dpdk_receive_last_bunch", "DPDK last received bunch");
+  plb.add_u64_counter(l_dpdk_qp_tx_last_bunch, "dpdk_send_last_bunch", "DPDK last send bunch");
+  plb.add_u64_counter(l_dpdk_qp_rx_fragments, "dpdk_receive_fragments", "DPDK received total fragments");
+  plb.add_u64_counter(l_dpdk_qp_tx_fragments, "dpdk_send_fragments", "DPDK sent total fragments");
+  plb.add_u64_counter(l_dpdk_qp_rx_copy_ops, "dpdk_receive_copy_ops", "DPDK received copy operations");
+  plb.add_u64_counter(l_dpdk_qp_tx_copy_ops, "dpdk_send_copy_ops", "DPDK sent copy operations");
+  plb.add_u64_counter(l_dpdk_qp_rx_copy_bytes, "dpdk_receive_copy_bytes", "DPDK received copy bytes", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_dpdk_qp_tx_copy_bytes, "dpdk_send_copy_bytes", "DPDK send copy bytes", NULL, 0, unit_t(UNIT_BYTES));
+  plb.add_u64_counter(l_dpdk_qp_rx_linearize_ops, "dpdk_receive_linearize_ops", "DPDK received linearize operations");
+  plb.add_u64_counter(l_dpdk_qp_tx_linearize_ops, "dpdk_send_linearize_ops", "DPDK send linearize operations");
+  plb.add_u64_counter(l_dpdk_qp_tx_queue_length, "dpdk_send_queue_length", "DPDK send queue length");
+
+  perf_logger = plb.create_perf_counters();
+  cct->get_perfcounters_collection()->add(perf_logger);
+
+  // Only queue 0 arms the statistics time event.
+  if (!_qid)
+    device_stat_time_fd = center->create_time_event(1000*1000, new C_handle_dev_stats(this));
+}
+
+/**
+ * Dumps port statistics into the formatter as three sections: "RX", "TX" and
+ * "stats".
+ *
+ * Packet rates are derived from the packet-counter and TSC deltas since the
+ * previous call for the same port; the previous values live in function-local
+ * static arrays indexed by _port_idx. The first call for a port reports a
+ * rate of 0.
+ *
+ * @param f formatter receiving the dump
+ */
+void DPDKDevice::nic_stats_dump(Formatter *f)
+{
+  static uint64_t prev_pkts_rx[RTE_MAX_ETHPORTS];
+  static uint64_t prev_pkts_tx[RTE_MAX_ETHPORTS];
+  static uint64_t prev_cycles[RTE_MAX_ETHPORTS];
+  size_t tx_fragments = 0;
+  size_t rx_fragments = 0;
+  size_t tx_free_cnt = 0;
+  size_t rx_free_cnt = 0;
+
+  for (auto &qp: _queues) {
+    tx_fragments += qp->perf_logger->get(l_dpdk_qp_tx_fragments);
+    rx_fragments += qp->perf_logger->get(l_dpdk_qp_rx_fragments);
+    tx_free_cnt += qp->_tx_buf_factory.ring_size();
+    rx_free_cnt += rte_mempool_avail_count(qp->_pktmbuf_pool_rx);
+  }
+  // Zero-initialize so that a failing rte_eth_stats_get() cannot make this
+  // function dump indeterminate values.
+  struct rte_eth_stats stats = {};
+  rte_eth_stats_get(_port_idx, &stats);
+  f->open_object_section("RX");
+  f->dump_unsigned("in_packets", stats.ipackets);
+  f->dump_unsigned("recv_packets", rx_fragments);
+  f->dump_unsigned("in_bytes", stats.ibytes);
+  f->dump_unsigned("missed", stats.imissed);
+  f->dump_unsigned("errors", stats.ierrors);
+  f->close_section();
+
+  f->open_object_section("TX");
+  f->dump_unsigned("out_packets", stats.opackets);
+  f->dump_unsigned("send_packets", tx_fragments);
+  f->dump_unsigned("out_bytes", stats.obytes);
+  f->dump_unsigned("errors", stats.oerrors);
+  f->close_section();
+
+  f->open_object_section("stats");
+  f->dump_unsigned("RX_nombuf", stats.rx_nombuf);
+  f->dump_unsigned("RX_avail_mbufs", rx_free_cnt);
+  f->dump_unsigned("TX_avail_mbufs", tx_free_cnt);
+
+  // A stored value of 0 means "no previous sample": diff_cycles then stays 0
+  // and both rates below are reported as 0.
+  uint64_t diff_cycles = prev_cycles[_port_idx];
+  prev_cycles[_port_idx] = rte_rdtsc();
+  if (diff_cycles > 0) {
+    diff_cycles = prev_cycles[_port_idx] - diff_cycles;
+  }
+
+  uint64_t diff_pkts_rx = (stats.ipackets > prev_pkts_rx[_port_idx]) ?
+    (stats.ipackets - prev_pkts_rx[_port_idx]) : 0;
+  uint64_t diff_pkts_tx = (stats.opackets > prev_pkts_tx[_port_idx]) ?
+    (stats.opackets - prev_pkts_tx[_port_idx]) : 0;
+  prev_pkts_rx[_port_idx] = stats.ipackets;
+  prev_pkts_tx[_port_idx] = stats.opackets;
+  // Packets per second (not millions): packets * TSC frequency / elapsed cycles.
+  uint64_t pps_rx = diff_cycles > 0 ? diff_pkts_rx * rte_get_tsc_hz() / diff_cycles : 0;
+  uint64_t pps_tx = diff_cycles > 0 ? diff_pkts_tx * rte_get_tsc_hz() / diff_cycles : 0;
+  f->dump_unsigned("Rx_pps", pps_rx);
+  f->dump_unsigned("Tx_pps", pps_tx);
+  f->close_section();
+}
+
+/**
+ * Dumps the extended statistics of the port as one "xstats" section, one
+ * unsigned entry per statistic keyed by its name.
+ *
+ * Nothing is written to the formatter if the count, the names or the values
+ * cannot be retrieved; a level-1 log message is emitted instead.
+ *
+ * @param f formatter receiving the dump
+ */
+void DPDKDevice::nic_xstats_dump(Formatter *f)
+{
+  // Ask only for the number of available statistics
+  const int total = rte_eth_xstats_get_names(_port_idx, NULL, 0);
+  if (total < 0) {
+    ldout(cct, 1) << "Error: Cannot get count of xstats" << dendl;
+    return;
+  }
+
+  // Name table, indexed like the value table below
+  std::vector<struct rte_eth_xstat_name> names(total);
+  const int got_names = rte_eth_xstats_get_names(_port_idx, names.data(), total);
+  if (got_names != total) {
+    ldout(cct, 1) << "Error: Cannot get xstats lookup" << dendl;
+    return;
+  }
+
+  // The values themselves
+  std::vector<struct rte_eth_xstat> values(total);
+  const int got_values = rte_eth_xstats_get(_port_idx, values.data(), total);
+  if (got_values != total) {
+    ldout(cct, 1) << "Error: Unable to get xstats" << dendl;
+    return;
+  }
+
+  f->open_object_section("xstats");
+  for (int n = 0; n < total; ++n) {
+    f->dump_unsigned(names[n].name, values[n].value);
+  }
+  f->close_section();
+}
+
+/**
+ * Reads the port statistics and publishes the error counters through the
+ * device perf counters, then schedules the next run of itself.
+ *
+ * If the statistics cannot be read, the failure is logged and no further run
+ * is scheduled from here.
+ */
+void DPDKQueuePair::handle_stats()
+{
+  ldout(cct, 20) << __func__ << " started." << dendl;
+
+  rte_eth_stats port_stats = {};
+  const int err = rte_eth_stats_get(_dev_port_idx, &port_stats);
+  if (err != 0) {
+    ldout(cct, 0) << __func__ << " failed to get port statistics: " << cpp_strerror(err) << dendl;
+    return;
+  }
+
+  auto& dev_counters = _dev->perf_logger;
+#if RTE_VERSION < RTE_VERSION_NUM(16,7,0,0)
+  dev_counters->set(l_dpdk_dev_rx_mcast, port_stats.imcasts);
+  dev_counters->set(l_dpdk_dev_rx_badcrc_errors, port_stats.ibadcrc);
+#endif
+  dev_counters->set(l_dpdk_dev_rx_dropped_errors, port_stats.imissed);
+  dev_counters->set(l_dpdk_dev_rx_nombuf_errors, port_stats.rx_nombuf);
+
+  dev_counters->set(l_dpdk_dev_rx_total_errors, port_stats.ierrors);
+  dev_counters->set(l_dpdk_dev_tx_total_errors, port_stats.oerrors);
+
+  // Re-arm the time event with the same period the constructor uses.
+  device_stat_time_fd = center->create_time_event(1000*1000, new C_handle_dev_stats(this));
+}
+
+bool DPDKQueuePair::poll_tx() { /*
+ * Refills _tx_packetq from the registered packet providers when it holds
+ * fewer than 16 packets, then passes the queue to send().
+ *
+ * @return true when _tx_packetq was non-empty and send() was invoked,
+ *         false when there was nothing to transmit.
+ */
+ bool nonloopback = !cct->_conf->ms_dpdk_debug_allow_loopback;
+#ifdef CEPH_PERF_DEV
+ uint64_t start = Cycles::rdtsc();
+#endif
+ uint32_t total_work = 0;
+ if (_tx_packetq.size() < 16) {
+ // refill send queue from upper layers
+ uint32_t work;
+ do {
+ work = 0; // packets obtained during this pass over the providers
+ for (auto&& pr : _pkt_providers) {
+ auto p = pr();
+ if (p) {
+ work++;
+ if (likely(nonloopback)) {
+ // ldout(cct, 0) << __func__ << " len: " << p->len() << " frags: " << p->nr_frags() << dendl;
+ _tx_packetq.push_back(std::move(*p));
+ } else {
+ auto th = p->get_header<eth_hdr>(0);
+ if (th->dst_mac == th->src_mac) { // loopback debugging: deliver locally instead of queueing for TX
+ _dev->l2receive(_qid, std::move(*p));
+ } else {
+ _tx_packetq.push_back(std::move(*p));
+ }
+ }
+ if (_tx_packetq.size() == 128) {
+ break; // leaves only the provider loop; the do/while condition below then ends the refill
+ }
+ }
+ }
+ total_work += work;
+ } while (work && total_work < 256 && _tx_packetq.size() < 128); // stop when a pass yields nothing or a limit is reached
+ }
+ if (!_tx_packetq.empty()) {
+ uint64_t c = send(_tx_packetq);
+ perf_logger->inc(l_dpdk_qp_tx_packets, c);
+ perf_logger->set(l_dpdk_qp_tx_last_bunch, c);
+#ifdef CEPH_PERF_DEV
+ tx_count += total_work;
+ tx_cycles += Cycles::rdtsc() - start;
+#endif
+ return true;
+ }
+
+ return false;
+}
+
+inline std::optional<Packet> DPDKQueuePair::from_mbuf_lro(rte_mbuf* m) /*
+ * Builds a multi-fragment Packet from a chain of mbufs: one fragment per
+ * segment, pointing at the segment's data.
+ *
+ * @param m head of the mbuf chain
+ * @return the Packet; its deleter pushes every segment's data pointer back
+ *         to _alloc_bufs.
+ */
+{
+ _frags.clear();
+ _bufs.clear();
+
+ for (; m != nullptr; m = m->next) {
+ char* data = rte_pktmbuf_mtod(m, char*);
+
+ _frags.emplace_back(fragment{data, rte_pktmbuf_data_len(m)});
+ _bufs.push_back(data);
+ }
+
+ auto del = std::bind(
+ [this](std::vector<char*> &bufs) {
+ for (auto&& b : bufs) { _alloc_bufs.push_back(b); }
+ }, std::move(_bufs)); // the collected pointers are moved into the bound deleter
+ return Packet(
+ _frags.begin(), _frags.end(), make_deleter(std::move(del)));
+}
+
+/**
+ * Wraps a received mbuf into a Packet.
+ *
+ * The mbuf is first recorded in _rx_free_pkts and its segment count is added
+ * to _num_rx_free_segs. A chained mbuf on a device with Rx LRO enabled is
+ * delegated to from_mbuf_lro(); every other mbuf becomes a single-fragment
+ * Packet whose deleter pushes the data pointer back to _alloc_bufs.
+ *
+ * @param m received mbuf
+ * @return the resulting Packet
+ */
+inline std::optional<Packet> DPDKQueuePair::from_mbuf(rte_mbuf* m)
+{
+  _rx_free_pkts.push_back(m);
+  _num_rx_free_segs += m->nb_segs;
+
+  const bool chained_lro =
+    _dev->hw_features_ref().rx_lro && !rte_pktmbuf_is_contiguous(m);
+  if (chained_lro) {
+    return from_mbuf_lro(m);
+  }
+
+  char* payload = rte_pktmbuf_mtod(m, char*);
+  return Packet(fragment{payload, rte_pktmbuf_data_len(m)},
+                make_deleter([this, payload] { _alloc_bufs.push_back(payload); }));
+}
+
+/**
+ * Gives every mbuf of a cluster a new data buffer via refill_rx_mbuf().
+ *
+ * Refilled mbufs are appended to _rx_free_bufs. If one mbuf cannot be
+ * refilled, it (and thereby the remainder of the chain it heads) is pushed
+ * back to _rx_free_pkts for a later retry.
+ *
+ * @param head first mbuf of the cluster
+ * @return true if the whole cluster was refilled, false otherwise.
+ */
+inline bool DPDKQueuePair::refill_one_cluster(rte_mbuf* head)
+{
+  rte_mbuf* seg = head;
+  while (seg != nullptr) {
+    const bool refilled = refill_rx_mbuf(seg, mbuf_data_size, _alloc_bufs);
+    if (!refilled) {
+      // Keep the unprocessed tail around so that it can be retried later.
+      _rx_free_pkts.push_back(seg);
+      return false;
+    }
+    _rx_free_bufs.push_back(seg);
+    seg = seg->next;
+  }
+
+  return true;
+}
+
+bool DPDKQueuePair::rx_gc(bool force) /*
+ * Recycles consumed Rx mbufs: refills the clusters queued in _rx_free_pkts
+ * and returns the refilled mbufs to _pktmbuf_pool_rx in one bulk operation.
+ *
+ * @param force run even if _num_rx_free_segs is below rx_gc_thresh
+ * @return true if _num_rx_free_segs is still at or above rx_gc_thresh
+ *         afterwards.
+ */
+{
+ if (_num_rx_free_segs >= rx_gc_thresh || force) {
+ ldout(cct, 10) << __func__ << " free segs " << _num_rx_free_segs
+ << " thresh " << rx_gc_thresh
+ << " free pkts " << _rx_free_pkts.size()
+ << dendl;
+
+ while (!_rx_free_pkts.empty()) {
+ //
+ // Use back() + pop_back() semantics to avoid an extra
+ // _rx_free_pkts.clear() at the end of the function - clear() has a
+ // linear complexity.
+ //
+ auto m = _rx_free_pkts.back();
+ _rx_free_pkts.pop_back();
+
+ if (!refill_one_cluster(m)) { // on failure the unrefilled tail is already back in _rx_free_pkts
+ ldout(cct, 1) << __func__ << " get new mbuf failed " << dendl;
+ break;
+ }
+ }
+ for (auto&& m : _rx_free_bufs) {
+ rte_pktmbuf_prefree_seg(m);
+ }
+
+ if (_rx_free_bufs.size()) {
+ rte_mempool_put_bulk(_pktmbuf_pool_rx,
+ (void **)_rx_free_bufs.data(),
+ _rx_free_bufs.size());
+
+ // TODO: ceph_assert() in a fast path! Remove me ASAP!
+ ceph_assert(_num_rx_free_segs >= _rx_free_bufs.size());
+
+ _num_rx_free_segs -= _rx_free_bufs.size();
+ _rx_free_bufs.clear();
+
+ // TODO: ceph_assert() in a fast path! Remove me ASAP!
+ ceph_assert((_rx_free_pkts.empty() && !_num_rx_free_segs) ||
+ (!_rx_free_pkts.empty() && _num_rx_free_segs)); // pending packets and pending segments must agree
+ }
+ }
+
+ return _num_rx_free_segs >= rx_gc_thresh;
+}
+
+
+void DPDKQueuePair::process_packets(
+ struct rte_mbuf **bufs, uint16_t count) /*
+ * Converts each received mbuf into a Packet, attaches offload info and the
+ * RSS hash when present, and passes it to the device via l2receive().
+ * Packets flagged with a bad checksum are dropped when Rx checksum offload
+ * is enabled.
+ *
+ * @param bufs  array of received mbufs
+ * @param count number of entries in bufs
+ */
+{
+ uint64_t nr_frags = 0, bytes = 0;
+
+ for (uint16_t i = 0; i < count; i++) {
+ struct rte_mbuf *m = bufs[i];
+ offload_info oi;
+
+ std::optional<Packet> p = from_mbuf(m);
+
+ // Drop the packet if translation above has failed
+ if (!p) {
+ perf_logger->inc(l_dpdk_qp_rx_no_memory_errors);
+ continue;
+ }
+ // ldout(cct, 0) << __func__ << " len " << p->len() << " " << dendl;
+
+ nr_frags += m->nb_segs; // counted before the checksum check below, so dropped packets are included
+ bytes += m->pkt_len;
+
+ // Set stripped VLAN value if available
+ if ((_dev->_dev_info.rx_offload_capa & DEV_RX_OFFLOAD_VLAN_STRIP) &&
+ (m->ol_flags & PKT_RX_VLAN_STRIPPED)) {
+ oi.vlan_tci = m->vlan_tci;
+ }
+
+ if (_dev->get_hw_features().rx_csum_offload) {
+ if (m->ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
+ // Packet with bad checksum, just drop it.
+ perf_logger->inc(l_dpdk_qp_rx_bad_checksum_errors);
+ continue;
+ }
+ // Note that when _hw_features.rx_csum_offload is on, the receive
+ // code for ip, tcp and udp will assume they don't need to check
+ // the checksum again, because we did this here.
+ }
+
+ p->set_offload_info(oi);
+ if (m->ol_flags & PKT_RX_RSS_HASH) {
+ p->set_rss_hash(m->hash.rss);
+ }
+
+ _dev->l2receive(_qid, std::move(*p));
+ }
+
+ perf_logger->inc(l_dpdk_qp_rx_packets, count); // counts every mbuf of the burst, including dropped ones
+ perf_logger->set(l_dpdk_qp_rx_last_bunch, count);
+ perf_logger->inc(l_dpdk_qp_rx_fragments, nr_frags);
+ perf_logger->inc(l_dpdk_qp_rx_bytes, bytes);
+}
+
+/**
+ * Reads one burst of at most packet_read_size packets from the Rx queue and
+ * processes it.
+ *
+ * With CEPH_PERF_DEV defined, the cycles spent in non-empty bursts are
+ * accumulated, and the average Rx/Tx cost per packet is logged (and the
+ * accumulators reset) on an empty burst once enough samples exist.
+ *
+ * @return true if at least one packet was received.
+ */
+bool DPDKQueuePair::poll_rx_once()
+{
+  struct rte_mbuf *buf[packet_read_size];
+
+  /* read a port */
+#ifdef CEPH_PERF_DEV
+  uint64_t start = Cycles::rdtsc();
+#endif
+  uint16_t count = rte_eth_rx_burst(_dev_port_idx, _qid,
+                                    buf, packet_read_size);
+
+  /* Now process the NIC packets read */
+  if (likely(count > 0)) {
+    process_packets(buf, count);
+#ifdef CEPH_PERF_DEV
+    // Accumulate, like tx_cycles in poll_tx(): the report below divides
+    // rx_cycles by the accumulated rx_count, so overwriting the value here
+    // would make the logged average meaningless.
+    rx_cycles += Cycles::rdtsc() - start;
+    rx_count += count;
+#endif
+  }
+#ifdef CEPH_PERF_DEV
+  else {
+    if (rx_count > 10000 && tx_count) {
+      ldout(cct, 0) << __func__ << " rx count=" << rx_count << " avg rx=" << Cycles::to_nanoseconds(rx_cycles)/rx_count << "ns "
+                    << " tx count=" << tx_count << " avg tx=" << Cycles::to_nanoseconds(tx_cycles)/tx_count << "ns"
+                    << dendl;
+      rx_count = rx_cycles = tx_count = tx_cycles = 0;
+    }
+  }
+#endif
+
+  return count;
+}
+
+DPDKQueuePair::tx_buf_factory::tx_buf_factory(CephContext *c,
+ DPDKDevice *dev, uint8_t qid): cct(c) /*
+ * Looks up the Tx mbuf mempool for queue qid, creating it (and setting up
+ * the device Tx queue) when it does not exist yet, then fills the factory
+ * via init_factory(). Aborts if the pool or the Tx queue cannot be set up.
+ *
+ * @param c   Ceph context used for logging
+ * @param dev device owning the Tx queue
+ * @param qid queue index; also part of the mempool name
+ */
+{
+ std::string name = std::string(pktmbuf_pool_name) + std::to_string(qid) + "_tx";
+
+ _pool = rte_mempool_lookup(name.c_str());
+ if (!_pool) {
+ ldout(cct, 0) << __func__ << " Creating Tx mbuf pool '" << name.c_str()
+ << "' [" << mbufs_per_queue_tx << " mbufs] ..." << dendl;
+ //
+ // We are going to push the buffers from the mempool into
+ // the circular_buffer and then poll them from there anyway, so
+ // we prefer to make a mempool non-atomic in this case.
+ //
+ _pool = rte_mempool_create(name.c_str(),
+ mbufs_per_queue_tx, inline_mbuf_size,
+ mbuf_cache_size,
+ sizeof(struct rte_pktmbuf_pool_private),
+ rte_pktmbuf_pool_init, nullptr,
+ rte_pktmbuf_init, nullptr,
+ rte_socket_id(), 0); // NOTE(review): flags argument is 0 although the comment above prefers a non-atomic pool - confirm intent
+
+ if (!_pool) {
+ lderr(cct) << __func__ << " Failed to create mempool for Tx" << dendl;
+ ceph_abort();
+ }
+ if (rte_eth_tx_queue_setup(dev->port_idx(), qid, default_ring_size,
+ rte_eth_dev_socket_id(dev->port_idx()),
+ dev->def_tx_conf()) < 0) {
+ lderr(cct) << __func__ << " cannot initialize tx queue" << dendl;
+ ceph_abort();
+ }
+ }
+
+ //
+ // Fill the factory with the buffers from the mempool allocated
+ // above.
+ //
+ init_factory();
+}
+
+bool DPDKQueuePair::tx_buf::i40e_should_linearize(rte_mbuf *head) /*
+ * Decides whether the packet behind an mbuf cluster has to be linearized
+ * to respect the i40e per-packet / per-TSO-window fragment limit
+ * (i40e_max_xmit_segment_frags).
+ *
+ * @param head head of the mbuf cluster; its ol_flags, header lengths and
+ *             tso_segsz are expected to be set already
+ * @return true if the cluster exceeds the limit and should be linearized.
+ */
+{
+ bool is_tso = head->ol_flags & PKT_TX_TCP_SEG;
+
+ // For a non-TSO case: number of fragments should not exceed 8
+ if (!is_tso){
+ return head->nb_segs > i40e_max_xmit_segment_frags;
+ }
+
+ //
+ // For a TSO case each MSS window should not include more than 8
+ // fragments including headers.
+ //
+
+ // Calculate the number of frags containing headers.
+ //
+ // Note: we support neither VLAN nor tunneling thus headers size
+ // accounting is super simple.
+ //
+ size_t headers_size = head->l2_len + head->l3_len + head->l4_len;
+ unsigned hdr_frags = 0;
+ size_t cur_payload_len = 0;
+ rte_mbuf *cur_seg = head;
+
+ while (cur_seg && cur_payload_len < headers_size) { // walk segments until all header bytes are covered
+ cur_payload_len += cur_seg->data_len;
+ cur_seg = cur_seg->next;
+ hdr_frags++;
+ }
+
+ //
+ // Header fragments will be used for each TSO segment, thus the
+ // maximum number of data segments will be 8 minus the number of
+ // header fragments.
+ //
+ // It's unclear from the spec how the first TSO segment is treated
+ // if the last fragment with headers contains some data bytes:
+ // whether this fragment will be accounted as a single fragment or
+ // as two separate fragments. We prefer to play it safe and assume
+ // that this fragment will be accounted as two separate fragments.
+ //
+ size_t max_win_size = i40e_max_xmit_segment_frags - hdr_frags; // NOTE(review): would wrap if hdr_frags exceeded the limit - confirm that cannot happen
+
+ if (head->nb_segs <= max_win_size) {
+ return false;
+ }
+
+ // Get the data (without headers) part of the first data fragment
+ size_t prev_frag_data = cur_payload_len - headers_size;
+ auto mss = head->tso_segsz;
+
+ while (cur_seg) { // each iteration examines one MSS-sized window
+ unsigned frags_in_seg = 0;
+ size_t cur_seg_size = 0;
+
+ if (prev_frag_data) { // bytes left over from the previous fragment count as one fragment of this window
+ cur_seg_size = prev_frag_data;
+ frags_in_seg++;
+ prev_frag_data = 0;
+ }
+
+ while (cur_seg_size < mss && cur_seg) {
+ cur_seg_size += cur_seg->data_len;
+ cur_seg = cur_seg->next;
+ frags_in_seg++;
+
+ if (frags_in_seg > max_win_size) {
+ return true;
+ }
+ }
+
+ if (cur_seg_size > mss) {
+ prev_frag_data = cur_seg_size - mss; // spill-over carried into the next window
+ }
+ }
+
+ return false;
+}
+
+/**
+ * Copies the Tx offload requests of a packet into the head mbuf of its
+ * cluster: IP checksum, and - when the port offers L4 Tx checksum offload and
+ * the packet is TCP - TCP checksum and TSO.
+ *
+ * @param p    packet the cluster was built for
+ * @param qp   queue pair whose port's hardware features are consulted
+ * @param head head mbuf of the cluster
+ */
+void DPDKQueuePair::tx_buf::set_cluster_offload_info(const Packet& p, const DPDKQueuePair& qp, rte_mbuf* head)
+{
+  auto info = p.offload_info();
+
+  // IP header checksum
+  if (info.needs_ip_csum) {
+    head->ol_flags |= PKT_TX_IP_CKSUM;
+    // TODO: Take a VLAN header into an account here
+    head->l2_len = sizeof(struct rte_ether_hdr);
+    head->l3_len = info.ip_hdr_len;
+  }
+
+  // Everything below concerns TCP on a port with L4 checksum offload only.
+  if (!qp.port().get_hw_features().tx_csum_l4_offload) {
+    return;
+  }
+  if (info.protocol != ip_protocol_num::tcp) {
+    return;
+  }
+
+  head->ol_flags |= PKT_TX_TCP_CKSUM;
+  // TODO: Take a VLAN header into an account here
+  head->l2_len = sizeof(struct rte_ether_hdr);
+  head->l3_len = info.ip_hdr_len;
+
+  if (!info.tso_seg_size) {
+    return;
+  }
+
+  // TSO is only requested together with IP checksum offload.
+  ceph_assert(info.needs_ip_csum);
+  head->ol_flags |= PKT_TX_TCP_SEG;
+  head->l4_len = info.tcp_hdr_len;
+  head->tso_segsz = info.tso_seg_size;
+}
+
+DPDKQueuePair::tx_buf* DPDKQueuePair::tx_buf::from_packet_zc(
+ CephContext *cct, Packet&& p, DPDKQueuePair& qp) /*
+ * Builds an mbuf cluster for a packet fragment by fragment. The packet is
+ * linearized first when it has more than max_frags fragments, and the
+ * cluster is rebuilt from a linearized packet when the finished cluster
+ * violates a device limit.
+ *
+ * @param cct Ceph context used for logging
+ * @param p   packet to translate; moved into the last segment on success
+ * @param qp  queue pair providing the tx buffers
+ * @return head tx_buf of the cluster, or nullptr if no mbuf was available.
+ */
+{
+ // Too fragmented - linearize
+ if (p.nr_frags() > max_frags) {
+ p.linearize();
+ qp.perf_logger->inc(l_dpdk_qp_tx_linearize_ops);
+ }
+
+ build_mbuf_cluster:
+ rte_mbuf *head = nullptr, *last_seg = nullptr;
+ unsigned nsegs = 0;
+
+ //
+ // Create a HEAD of the fragmented packet: check if frag0 has to be
+ // copied and if yes - send it in a copy way
+ //
+ if (!check_frag0(p)) {
+ if (!copy_one_frag(qp, p.frag(0), head, last_seg, nsegs)) {
+ ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(0).size << dendl;
+ return nullptr;
+ }
+ } else if (!translate_one_frag(qp, p.frag(0), head, last_seg, nsegs)) {
+ ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(0).size << dendl;
+ return nullptr;
+ }
+
+ unsigned total_nsegs = nsegs;
+
+ for (unsigned i = 1; i < p.nr_frags(); i++) {
+ rte_mbuf *h = nullptr, *new_last_seg = nullptr;
+ if (!translate_one_frag(qp, p.frag(i), h, new_last_seg, nsegs)) {
+ ldout(cct, 1) << __func__ << " no available mbuf for " << p.frag(i).size << dendl;
+ me(head)->recycle(); // give back what has been chained so far
+ return nullptr;
+ }
+
+ total_nsegs += nsegs; // nsegs holds the segment count of fragment i only
+
+ // Attach a new buffers' chain to the packet chain
+ last_seg->next = h;
+ last_seg = new_last_seg;
+ }
+
+ // Update the HEAD buffer with the packet info
+ head->pkt_len = p.len();
+ head->nb_segs = total_nsegs;
+ // tx_pkt_burst loops until the next pointer is null, so last_seg->next must
+ // be null.
+ last_seg->next = nullptr;
+
+ set_cluster_offload_info(p, qp, head);
+
+ //
+ // If a packet hasn't been linearized already and the resulting
+ // cluster requires the linearisation due to HW limitation:
+ //
+ // - Recycle the cluster.
+ // - Linearize the packet.
+ // - Build the cluster once again
+ //
+ if (head->nb_segs > max_frags ||
+ (p.nr_frags() > 1 && qp.port().is_i40e_device() && i40e_should_linearize(head)) ||
+ (p.nr_frags() > vmxnet3_max_xmit_segment_frags && qp.port().is_vmxnet3_device())) {
+ me(head)->recycle();
+ p.linearize();
+ qp.perf_logger->inc(l_dpdk_qp_tx_linearize_ops);
+
+ goto build_mbuf_cluster;
+ }
+
+ me(last_seg)->set_packet(std::move(p)); // the packet is stored on the last segment's tx_buf
+
+ return me(head);
+}
+
+void DPDKQueuePair::tx_buf::copy_packet_to_cluster(const Packet& p, rte_mbuf* head) /*
+ * Copies all fragments of a packet into an mbuf cluster, filling each
+ * segment with up to inline_mbuf_data_size bytes and setting data_len on
+ * every segment it writes.
+ *
+ * @param p    packet to copy
+ * @param head head of the cluster; it must provide enough segments for the
+ *             whole packet (asserted while advancing)
+ */
+{
+ rte_mbuf* cur_seg = head;
+ size_t cur_seg_offset = 0;
+ unsigned cur_frag_idx = 0;
+ size_t cur_frag_offset = 0;
+
+ while (true) {
+ size_t to_copy = std::min(p.frag(cur_frag_idx).size - cur_frag_offset,
+ inline_mbuf_data_size - cur_seg_offset); // limited by what is left in the fragment and in the segment
+
+ memcpy(rte_pktmbuf_mtod_offset(cur_seg, void*, cur_seg_offset),
+ p.frag(cur_frag_idx).base + cur_frag_offset, to_copy);
+
+ cur_frag_offset += to_copy;
+ cur_seg_offset += to_copy;
+
+ if (cur_frag_offset >= p.frag(cur_frag_idx).size) { // current fragment fully copied
+ ++cur_frag_idx;
+ if (cur_frag_idx >= p.nr_frags()) {
+ //
+ // We are done - set the data size of the last segment
+ // of the cluster.
+ //
+ cur_seg->data_len = cur_seg_offset;
+ break;
+ }
+
+ cur_frag_offset = 0;
+ }
+
+ if (cur_seg_offset >= inline_mbuf_data_size) { // current segment full: move on to the next one
+ cur_seg->data_len = inline_mbuf_data_size;
+ cur_seg = cur_seg->next;
+ cur_seg_offset = 0;
+
+ // FIXME: assert in a fast-path - remove!!!
+ ceph_assert(cur_seg);
+ }
+ }
+}
+
+DPDKQueuePair::tx_buf* DPDKQueuePair::tx_buf::from_packet_copy(Packet&& p, DPDKQueuePair& qp) /*
+ * Builds an mbuf cluster for a packet by copying its data: allocates as
+ * many tx buffers as the packet length requires, copies the data and sets
+ * the offload info on the head.
+ *
+ * @param p  packet to copy
+ * @param qp queue pair providing the tx buffers
+ * @return head tx_buf of the cluster, or nullptr if the packet is empty or
+ *         a tx buffer could not be obtained.
+ */
+{
+ // sanity
+ if (!p.len()) {
+ return nullptr;
+ }
+
+ /*
+ * Here we are going to use the fact that the inline data size is a
+ * power of two.
+ *
+ * We will first try to allocate the cluster and only if we are
+ * successful - we will go and copy the data.
+ */
+ auto aligned_len = align_up((size_t)p.len(), inline_mbuf_data_size);
+ unsigned nsegs = aligned_len / inline_mbuf_data_size; // number of segments needed for the whole packet
+ rte_mbuf *head = nullptr, *last_seg = nullptr;
+
+ tx_buf* buf = qp.get_tx_buf();
+ if (!buf) {
+ return nullptr;
+ }
+
+ head = buf->rte_mbuf_p();
+ last_seg = head;
+ for (unsigned i = 1; i < nsegs; i++) {
+ buf = qp.get_tx_buf();
+ if (!buf) {
+ me(head)->recycle(); // give back the segments allocated so far
+ return nullptr;
+ }
+
+ last_seg->next = buf->rte_mbuf_p();
+ last_seg = last_seg->next;
+ }
+
+ //
+ // If we've got here means that we have succeeded already!
+ // We only need to copy the data and set the head buffer with the
+ // relevant info.
+ //
+ head->pkt_len = p.len();
+ head->nb_segs = nsegs;
+ // tx_pkt_burst loops until the next pointer is null, so last_seg->next must
+ // be null.
+ last_seg->next = nullptr;
+
+ copy_packet_to_cluster(p, head);
+ set_cluster_offload_info(p, qp, head);
+
+ return me(head);
+}
+
+/**
+ * Obtains one tx buffer and copies up to inline_mbuf_data_size bytes of data
+ * into its mbuf, updating the Tx copy perf counters.
+ *
+ * @param qp      queue pair providing the tx buffer and the perf counters
+ * @param m       receives the mbuf that was filled (out)
+ * @param data    source bytes
+ * @param buf_len number of bytes available at data
+ * @return number of bytes copied, or 0 if no tx buffer was available.
+ */
+size_t DPDKQueuePair::tx_buf::copy_one_data_buf(
+    DPDKQueuePair& qp, rte_mbuf*& m, char* data, size_t buf_len)
+{
+  tx_buf* txb = qp.get_tx_buf();
+  if (txb == nullptr) {
+    return 0;
+  }
+
+  // One mbuf holds at most inline_mbuf_data_size bytes.
+  const size_t copied = std::min(buf_len, inline_mbuf_data_size);
+
+  m = txb->rte_mbuf_p();
+
+  // Equivalent of mbuf_put(): both lengths describe just this buffer.
+  m->data_len = copied;
+  m->pkt_len = copied;
+
+  qp.perf_logger->inc(l_dpdk_qp_tx_copy_ops);
+  qp.perf_logger->inc(l_dpdk_qp_tx_copy_bytes, copied);
+
+  memcpy(rte_pktmbuf_mtod(m, void*), data, copied);
+
+  return copied;
+}
+
+/******************************** Interface functions *************************/
+
+/**
+ * Creates the DPDK network device for a port.
+ *
+ * Asserts that DPDK reports at least one available Ethernet port.
+ *
+ * @param cct       Ceph context
+ * @param cores     forwarded to the DPDKDevice constructor
+ * @param port_idx  index of the port to use
+ * @param use_lro   forwarded to the DPDKDevice constructor
+ * @param enable_fc forwarded to the DPDKDevice constructor
+ * @return the newly created device
+ */
+std::unique_ptr<DPDKDevice> create_dpdk_net_device(
+    CephContext *cct,
+    unsigned cores,
+    uint8_t port_idx,
+    bool use_lro,
+    bool enable_fc)
+{
+  // There has to be at least one DPDK-able port
+  if (rte_eth_dev_count_avail() != 0) {
+    ldout(cct, 10) << __func__ << " ports number: " << int(rte_eth_dev_count_avail()) << dendl;
+  } else {
+    ceph_assert(false && "No Ethernet ports - bye\n");
+  }
+
+  std::unique_ptr<DPDKDevice> device(
+    new DPDKDevice(cct, port_idx, cores, use_lro, enable_fc));
+  return device;
+}
diff --git a/src/msg/async/dpdk/DPDK.h b/src/msg/async/dpdk/DPDK.h
new file mode 100644
index 000000000..ec1d707ff
--- /dev/null
+++ b/src/msg/async/dpdk/DPDK.h
@@ -0,0 +1,937 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_DPDK_DEV_H
+#define CEPH_DPDK_DEV_H
+
+#include <functional>
+#include <memory>
+#include <optional>
+#include <rte_config.h>
+#include <rte_common.h>
+#include <rte_ethdev.h>
+#include <rte_ether.h>
+#include <rte_malloc.h>
+#include <rte_version.h>
+
+#include "include/page.h"
+#include "common/perf_counters.h"
+#include "common/admin_socket.h"
+#include "msg/async/Event.h"
+#include "const.h"
+#include "circular_buffer.h"
+#include "ethernet.h"
+#include "Packet.h"
+#include "stream.h"
+#include "net.h"
+#include "toeplitz.h"
+
+/**
+ * Deleter that releases memory with ::free(), for use with smart pointers
+ * holding malloc-family allocations.
+ */
+struct free_deleter {
+  // const + noexcept: the deleter has no state and ::free() does not throw,
+  // so it can also be invoked through a const deleter object.
+  void operator()(void* p) const noexcept { ::free(p); }
+};
+
+
+enum { // indices of the per-device perf counters
+ l_dpdk_dev_first = 58800, // lower bound marker - presumably not a counter itself; confirm against the builder call
+ l_dpdk_dev_rx_mcast,
+ l_dpdk_dev_rx_total_errors,
+ l_dpdk_dev_tx_total_errors,
+ l_dpdk_dev_rx_badcrc_errors,
+ l_dpdk_dev_rx_dropped_errors,
+ l_dpdk_dev_rx_nombuf_errors,
+ l_dpdk_dev_last // upper bound marker
+};
+
+enum { // indices of the per-queue-pair perf counters registered in the DPDKQueuePair constructor
+ l_dpdk_qp_first = 58900, // passed as the lower bound to PerfCountersBuilder
+ l_dpdk_qp_rx_packets,
+ l_dpdk_qp_tx_packets,
+ l_dpdk_qp_rx_bad_checksum_errors,
+ l_dpdk_qp_rx_no_memory_errors,
+ l_dpdk_qp_rx_bytes,
+ l_dpdk_qp_tx_bytes,
+ l_dpdk_qp_rx_last_bunch,
+ l_dpdk_qp_tx_last_bunch,
+ l_dpdk_qp_rx_fragments,
+ l_dpdk_qp_tx_fragments,
+ l_dpdk_qp_rx_copy_ops,
+ l_dpdk_qp_tx_copy_ops,
+ l_dpdk_qp_rx_copy_bytes,
+ l_dpdk_qp_tx_copy_bytes,
+ l_dpdk_qp_rx_linearize_ops,
+ l_dpdk_qp_tx_linearize_ops,
+ l_dpdk_qp_tx_queue_length,
+ l_dpdk_qp_last // passed as the upper bound to PerfCountersBuilder
+};
+
+class DPDKDevice;
+class DPDKWorker;
+
+
+#ifndef MARKER
+typedef void *MARKER[0]; /**< generic marker for a point in a structure */
+#endif
+
+class DPDKQueuePair {
+ using packet_provider_type = std::function<std::optional<Packet> ()>;
+ public:
+ void configure_proxies(const std::map<unsigned, float>& cpu_weights);
+ // build REdirection TAble for cpu_weights map: target cpu -> weight
+ void build_sw_reta(const std::map<unsigned, float>& cpu_weights);
+ void proxy_send(Packet p) { // appends p to the back of _proxy_packetq
+ _proxy_packetq.push_back(std::move(p));
+ }
+ void register_packet_provider(packet_provider_type func) { // adds func to _pkt_providers, which poll_tx() calls to obtain packets
+ _pkt_providers.push_back(std::move(func));
+ }
+ bool poll_tx();
+ friend class DPDKDevice;
+
+ class tx_buf_factory;
+
+ class tx_buf {
+ friend class DPDKQueuePair;
+ public:
+ static tx_buf* me(rte_mbuf* mbuf) {
+   // View the mbuf pointer as a pointer to its tx_buf.
+   auto* self = reinterpret_cast<tx_buf*>(mbuf);
+   return self;
+ }
+
+ private:
+ /**
+ * Checks if the original packet of a given cluster should be linearized
+ * due to HW limitations.
+ *
+ * @param head head of a cluster to check
+ *
+ * @return TRUE if a packet should be linearized.
+ */
+ static bool i40e_should_linearize(rte_mbuf *head);
+
+ /**
+ * Sets the offload info in the head buffer of an rte_mbufs cluster.
+ *
+ * @param p an original packet the cluster is built for
+ * @param qp QP handle
+ * @param head a head of an rte_mbufs cluster
+ */
+ static void set_cluster_offload_info(const Packet& p, const DPDKQueuePair& qp, rte_mbuf* head);
+
+ /**
+ * Creates a tx_buf cluster representing a given packet in a "zero-copy"
+ * way.
+ *
+ * @param p packet to translate
+ * @param qp DPDKQueuePair handle
+ *
+ * @return the HEAD tx_buf of the cluster or nullptr in case of a
+ * failure
+ */
+ static tx_buf* from_packet_zc(
+ CephContext *cct, Packet&& p, DPDKQueuePair& qp);
+
+ /**
+ * Copy the contents of the "packet" into the given cluster of
+ * rte_mbuf's.
+ *
+ * @note Size of the cluster has to be big enough to accommodate all the
+ * contents of the given packet.
+ *
+ * @param p packet to copy
+ * @param head head of the rte_mbuf's cluster
+ */
+ static void copy_packet_to_cluster(const Packet& p, rte_mbuf* head);
+
+ /**
+ * Creates a tx_buf cluster representing a given packet in a "copy" way.
+ *
+ * @param p packet to translate
+ * @param qp DPDKQueuePair handle
+ *
+ * @return the HEAD tx_buf of the cluster or nullptr in case of a
+ * failure
+ */
+ static tx_buf* from_packet_copy(Packet&& p, DPDKQueuePair& qp);
+
+ /**
+ * Builds a chain of rte_mbufs for a single fragment by calling do_one_buf
+ * repeatedly until the whole fragment is consumed. Whether the data is
+ * referenced or copied is decided by the functor that is passed in.
+ *
+ * @param do_one_buf Functor responsible for a single rte_mbuf
+ * handling
+ * @param qp DPDKQueuePair handle (in)
+ * @param frag Fragment to copy (in)
+ * @param head Head of the cluster (out)
+ * @param last_seg Last segment of the cluster (out)
+ * @param nsegs Number of segments in the cluster (out)
+ *
+ * @return TRUE in case of success
+ */
+ template <class DoOneBufFunc>
+ static bool do_one_frag(DoOneBufFunc do_one_buf, DPDKQueuePair& qp,
+ fragment& frag, rte_mbuf*& head,
+ rte_mbuf*& last_seg, unsigned& nsegs) {
+ size_t len, left_to_set = frag.size;
+ char* base = frag.base;
+
+ rte_mbuf* m;
+
+ // TODO: ceph_assert() in a fast path! Remove me ASAP!
+ ceph_assert(frag.size);
+
+ // Create a HEAD of mbufs' cluster and set the first bytes into it
+ len = do_one_buf(qp, head, base, left_to_set);
+ if (!len) { // a zero length means the functor could not provide an mbuf
+ return false;
+ }
+
+ left_to_set -= len;
+ base += len;
+ nsegs = 1;
+
+ //
+ // Set the rest of the data into the new mbufs and chain them to
+ // the cluster.
+ //
+ rte_mbuf* prev_seg = head;
+ while (left_to_set) {
+ len = do_one_buf(qp, m, base, left_to_set);
+ if (!len) {
+ me(head)->recycle(); // NOTE(review): the tail's ->next is not reset here - confirm get_tx_buf() hands out mbufs with next == nullptr
+ return false;
+ }
+
+ left_to_set -= len;
+ base += len;
+ nsegs++;
+
+ prev_seg->next = m;
+ prev_seg = m;
+ }
+
+ // Return the last mbuf in the cluster
+ last_seg = prev_seg;
+
+ return true;
+ }
+
+ /**
+ * Builds the rte_mbuf chain for one fragment, using set_one_data_buf() as
+ * the per-buffer step of do_one_frag().
+ *
+ * @param qp DPDKQueuePair handle (in)
+ * @param frag Fragment to translate (in)
+ * @param head Head of the resulting chain (out)
+ * @param last_seg Last segment of the resulting chain (out)
+ * @param nsegs Number of segments in the resulting chain (out)
+ *
+ * @return TRUE in case of success
+ */
+ static bool translate_one_frag(DPDKQueuePair& qp, fragment& frag,
+     rte_mbuf*& head, rte_mbuf*& last_seg,
+     unsigned& nsegs) {
+   const bool translated =
+     do_one_frag(set_one_data_buf, qp, frag, head, last_seg, nsegs);
+   return translated;
+ }
+
+ /**
+ * Builds the rte_mbuf chain for one fragment by copying its data, using
+ * copy_one_data_buf() as the per-buffer step of do_one_frag().
+ *
+ * @param qp DPDKQueuePair handle (in)
+ * @param frag Fragment to copy (in)
+ * @param head Head of the resulting chain (out)
+ * @param last_seg Last segment of the resulting chain (out)
+ * @param nsegs Number of segments in the resulting chain (out)
+ *
+ * The last segment is handed back so that callers do not have to walk the
+ * chain to find it.
+ *
+ * @return TRUE in case of success
+ */
+ static bool copy_one_frag(DPDKQueuePair& qp, fragment& frag,
+     rte_mbuf*& head, rte_mbuf*& last_seg,
+     unsigned& nsegs) {
+   const bool copied =
+     do_one_frag(copy_one_data_buf, qp, frag, head, last_seg, nsegs);
+   return copied;
+ }
+
+ /**
+ * Allocates a single rte_mbuf and sets it to point to a given data
+ * buffer.
+ *
+ * @param qp DPDKQueuePair handle (in)
+ * @param m New allocated rte_mbuf (out)
+ * @param va virtual address of a data buffer (in)
+ * @param buf_len length of the data to copy (in)
+ *
+ * @return The actual number of bytes that has been set in the mbuf
+ */
+ static size_t set_one_data_buf(
+ DPDKQueuePair& qp, rte_mbuf*& m, char* va, size_t buf_len) {
+ // Upper bound for the size of one zero-copy segment (see below).
+ static constexpr size_t max_frag_len = 15 * 1024; // 15K
+
+ // FIXME: currently all tx bufs are allocated without rte_malloc, so the
+ // zero-copy path is disabled and every call is served by a copy.
+ return copy_one_data_buf(qp, m, va, buf_len);
+ //
+ // NOTE: everything below the unconditional return above is unreachable.
+ // It is the zero-copy implementation, presumably kept for when tx
+ // buffers are rte_malloc-backed again.
+ //
+ // Currently we break a buffer on a 15K boundary because 82599
+ // devices have a 15.5K limitation on a maximum single fragment
+ // size.
+ //
+ rte_iova_t pa = rte_malloc_virt2iova(va);
+ // No IO address for this memory: fall back to copying.
+ if (!pa)
+ return copy_one_data_buf(qp, m, va, buf_len);
+
+ ceph_assert(buf_len);
+ tx_buf* buf = qp.get_tx_buf();
+ if (!buf) {
+ return 0;
+ }
+
+ // Never reference more than max_frag_len bytes from a single mbuf.
+ size_t len = std::min(buf_len, max_frag_len);
+
+ buf->set_zc_info(va, pa, len);
+ m = buf->rte_mbuf_p();
+
+ return len;
+ }
+
+ /**
+ * Allocates a single rte_mbuf and copies a given data into it.
+ *
+ * @param qp DPDKQueuePair handle (in)
+ * @param m New allocated rte_mbuf (out)
+ * @param data Data to copy from (in)
+ * @param buf_len length of the data to copy (in)
+ *
+ * @return The actual number of bytes that has been copied
+ */
+ static size_t copy_one_data_buf(
+ DPDKQueuePair& qp, rte_mbuf*& m, char* data, size_t buf_len);
+
+  /**
+   * Decides whether the given packet may take the zero-copy flow, judged by
+   * its first fragment only.
+   *
+   * The first fragment carries the headers, which must not be split. The
+   * intent is that its first 128 bytes lie in one physically contiguous
+   * area; the only condition tested here is that the first fragment holds
+   * at least 128 bytes.
+   *
+   * @param p packet to check
+   *
+   * @return TRUE if the first fragment is at least 128 bytes long and FALSE
+   *         otherwise (the packet should then be sent in the copy flow).
+   */
+  static bool check_frag0(Packet& p)
+  {
+    // A short first fragment cannot hold the whole 128-byte header area.
+    return !(p.frag(0).size < 128);
+  }
+
+ public:
+ tx_buf(tx_buf_factory& fc) : _fc(fc) {
+
+ _buf_physaddr = _mbuf.buf_iova;
+ _data_off = _mbuf.data_off;
+ }
+
+ rte_mbuf* rte_mbuf_p() { return &_mbuf; }
+
+ // Points the embedded mbuf at an external data buffer (zero-copy).
+ //
+ // va:  virtual address of the data
+ // pa:  IO/physical address of the same data
+ // len: number of bytes the mbuf should describe
+ //
+ // The overwritten buf_iova/buf_addr/data_off are restored by reset_zc().
+ void set_zc_info(void* va, phys_addr_t pa, size_t len) {
+ // mbuf_put()
+ _mbuf.data_len = len;
+ _mbuf.pkt_len = len;
+
+ // Set the mbuf to point to our data
+ _mbuf.buf_addr = va;
+ _mbuf.buf_iova = pa;
+ _mbuf.data_off = 0;
+ // Remember that the mbuf fields above must be restored later.
+ _is_zc = true;
+ }
+
+  void reset_zc() {
+    // Nothing to undo unless this buffer either holds the original packet
+    // (it was the last mbuf of a cluster) or was used in the zero-copy flow.
+    if (!_p && !_is_zc) {
+      return;
+    }
+
+    //
+    // Disengage the std::optional. When a packet is held this runs the
+    // packet's destructor; when none is held it does nothing.
+    //
+    _p.reset();
+
+    // Put back the rte_mbuf fields that set_zc_info() overwrote.
+    _mbuf.buf_iova = _buf_physaddr;
+    _mbuf.buf_addr = rte_mbuf_to_baddr(&_mbuf);
+    _mbuf.data_off = _data_off;
+
+    _is_zc = false;
+  }
+
+  // Returns this buffer and every mbuf chained behind it to the factory.
+  void recycle() {
+    rte_mbuf* following = nullptr;
+    for (rte_mbuf* seg = &_mbuf; seg != nullptr; seg = following) {
+      // Read the link first: rte_pktmbuf_reset() is about to run on seg.
+      following = seg->next;
+      rte_pktmbuf_reset(seg);
+      _fc.put(me(seg));
+    }
+  }
+
+ void set_packet(Packet&& p) {
+ _p = std::move(p);
+ }
+
+ private:
+ struct rte_mbuf _mbuf;
+ MARKER private_start;
+ std::optional<Packet> _p;
+ phys_addr_t _buf_physaddr;
+ uint16_t _data_off;
+ // TRUE if underlying mbuf has been used in the zero-copy flow
+ bool _is_zc = false;
+ // buffers' factory the buffer came from
+ tx_buf_factory& _fc;
+ MARKER private_end;
+ };
+
+ class tx_buf_factory {
+ //
+ // Number of buffers to free in each GC iteration:
+ // We want the buffers to be allocated from the mempool as many as
+ // possible.
+ //
+ // On the other hand if there is no Tx for some time we want the
+ // completions to be eventually handled. Thus we choose the smallest
+ // possible packets count number here.
+ //
+ static constexpr int gc_count = 1;
+ public:
+ tx_buf_factory(CephContext *c, DPDKDevice *dev, uint8_t qid);
+ ~tx_buf_factory() {
+ // put all mbuf back into mempool in order to make the next factory work
+ // First drain whatever can still be pulled from the pool into _ring ...
+ while (gc());
+ // ... then hand every cached buffer back to the pool in one call.
+ // NOTE(review): the tx_buf pointers are passed as the mempool objects;
+ // this presumably relies on a tx_buf and its rte_mbuf sharing one
+ // address - confirm against tx_buf::me().
+ rte_mempool_put_bulk(_pool, (void**)_ring.data(),
+ _ring.size());
+ }
+
+
+  /**
+   * Hands out one tx_buf, preferring a buffer just pulled from the pool
+   * over one from the factory's own cache.
+   *
+   * @return a free tx_buf object, or nullptr when neither a completed nor a
+   *         cached buffer is available
+   */
+  tx_buf* get() {
+    // Buffers completed by the HW come first; undo any zero-copy state.
+    if (tx_buf* completed = get_one_completed()) {
+      completed->reset_zc();
+      return completed;
+    }
+
+    // Nothing completed right now: fall back to the factory's cache.
+    if (_ring.empty()) {
+      return nullptr;
+    }
+
+    tx_buf* cached = _ring.back();
+    _ring.pop_back();
+    return cached;
+  }
+
+ void put(tx_buf* buf) {
+ buf->reset_zc();
+ _ring.push_back(buf);
+ }
+
+ unsigned ring_size() const {
+ return _ring.size();
+ }
+
+  // Moves up to gc_count completed buffers into the factory's cache.
+  // Returns false as soon as no completed buffer is available, true when
+  // the full quota was collected.
+  bool gc() {
+    int collected = 0;
+    while (collected < gc_count) {
+      tx_buf* done = get_one_completed();
+      if (done == nullptr) {
+        return false;
+      }
+      put(done);
+      ++collected;
+    }
+    return true;
+  }
+ private:
+ /**
+ * Fill the mbufs circular buffer: after this the _pool will become
+ * empty. We will use it to catch the completed buffers:
+ *
+ * - Underlying PMD drivers will "free" the mbufs once they are
+ * completed.
+ * - We will poll the _pktmbuf_pool_tx till it's empty and release
+ * all the buffers from the freed mbufs.
+ */
+ void init_factory() {
+ while (rte_mbuf* mbuf = rte_pktmbuf_alloc(_pool)) {
+ _ring.push_back(new(tx_buf::me(mbuf)) tx_buf{*this});
+ }
+ }
+
+ /**
+ * PMD puts the completed buffers back into the mempool they have
+ * originally come from.
+ *
+ * @note rte_pktmbuf_alloc() resets the mbuf so there is no need to call
+ * rte_pktmbuf_reset() here again.
+ *
+ * @return a single tx_buf that has been completed by HW.
+ */
+ tx_buf* get_one_completed() {
+ return tx_buf::me(rte_pktmbuf_alloc(_pool));
+ }
+
+ private:
+ CephContext *cct;
+ std::vector<tx_buf*> _ring;
+ rte_mempool* _pool = nullptr;
+ };
+
+ public:
+ explicit DPDKQueuePair(CephContext *c, EventCenter *cen, DPDKDevice* dev, uint8_t qid);
+ ~DPDKQueuePair() {
+ if (device_stat_time_fd) {
+ center->delete_time_event(device_stat_time_fd);
+ }
+ rx_gc(true);
+ }
+
+ void rx_start() {
+ _rx_poller.emplace(this);
+ }
+
+ uint32_t send(circular_buffer<Packet>& pb) {
+ // Zero-copy send
+ return _send(pb, [&] (Packet&& p) {
+ return tx_buf::from_packet_zc(cct, std::move(p), *this);
+ });
+ }
+
+ DPDKDevice& port() const { return *_dev; }
+ tx_buf* get_tx_buf() { return _tx_buf_factory.get(); }
+
+ void handle_stats();
+
+ private:
+ // Transmits packets from pb through rte_eth_tx_burst().
+ //
+ // pb:                 queue of packets to send; one packet is popped from
+ //                     its front for every mbuf the device accepted
+ // packet_to_tx_buf_p: callable turning a Packet&& into a tx_buf* (nullptr
+ //                     on failure)
+ //
+ // Returns the number of packets accepted by rte_eth_tx_burst() in this call.
+ template <class Func>
+ uint32_t _send(circular_buffer<Packet>& pb, Func &&packet_to_tx_buf_p) {
+ // Only build a new burst when the previous one has been fully sent;
+ // otherwise retry the unsent remainder of the old burst first.
+ if (_tx_burst.size() == 0) {
+ for (auto&& p : pb) {
+ // TODO: ceph_assert() in a fast path! Remove me ASAP!
+ ceph_assert(p.len());
+
+ tx_buf* buf = packet_to_tx_buf_p(std::move(p));
+ // Conversion failed: send what has been gathered so far.
+ if (!buf) {
+ break;
+ }
+
+ _tx_burst.push_back(buf->rte_mbuf_p());
+ }
+ }
+
+ // Offer the not-yet-sent part of the burst to the device.
+ uint16_t sent = rte_eth_tx_burst(_dev_port_idx, _qid,
+ _tx_burst.data() + _tx_burst_idx,
+ _tx_burst.size() - _tx_burst_idx);
+
+ uint64_t nr_frags = 0, bytes = 0;
+
+ // Account for every accepted mbuf and drop its packet from pb.
+ for (int i = 0; i < sent; i++) {
+ rte_mbuf* m = _tx_burst[_tx_burst_idx + i];
+ bytes += m->pkt_len;
+ nr_frags += m->nb_segs;
+ pb.pop_front();
+ }
+
+ perf_logger->inc(l_dpdk_qp_tx_fragments, nr_frags);
+ perf_logger->inc(l_dpdk_qp_tx_bytes, bytes);
+
+ _tx_burst_idx += sent;
+
+ // Whole burst is out: start from scratch on the next call.
+ if (_tx_burst_idx == _tx_burst.size()) {
+ _tx_burst_idx = 0;
+ _tx_burst.clear();
+ }
+
+ return sent;
+ }
+
+ /**
+ * Take a data buffer from the given list and set the mbuf to point to it.
+ *
+ * Do some DPDK hacks to work on PMD: it assumes that the buf_addr
+ * points to the private data of RTE_PKTMBUF_HEADROOM before the actual
+ * data buffer.
+ *
+ * @param m mbuf to update
+ * @param size not used by this function
+ * @param datas pool of free data buffers; the last one is removed and used
+ *
+ * @return false if datas is empty (m is left untouched), true otherwise
+ */
+ static bool refill_rx_mbuf(rte_mbuf* m, size_t size,
+ std::vector<void*> &datas) {
+ if (datas.empty())
+ return false;
+ void *data = datas.back();
+ datas.pop_back();
+
+ //
+ // Set the mbuf to point to our data.
+ //
+ // Do some DPDK hacks to work on PMD: it assumes that the buf_addr
+ // points to the private data of RTE_PKTMBUF_HEADROOM before the
+ // actual data buffer.
+ //
+ m->buf_addr = (char*)data - RTE_PKTMBUF_HEADROOM;
+ m->buf_iova = rte_mem_virt2iova(data) - RTE_PKTMBUF_HEADROOM;
+ return true;
+ }
+
+ bool init_rx_mbuf_pool();
+ bool rx_gc(bool force=false);
+ bool refill_one_cluster(rte_mbuf* head);
+
+ /**
+ * Polls for a burst of incoming packets. This function will not block and
+ * will immediately return after processing all available packets.
+ *
+ */
+ bool poll_rx_once();
+
+ /**
+ * Translates an rte_mbuf's into packet and feeds them to _rx_stream.
+ *
+ * @param bufs An array of received rte_mbuf's
+ * @param count Number of buffers in the bufs[]
+ */
+ void process_packets(struct rte_mbuf **bufs, uint16_t count);
+
+ /**
+ * Translate rte_mbuf into the "packet".
+ * @param m mbuf to translate
+ *
+ * @return a "optional" object representing the newly received data if in an
+ * "engaged" state or an error if in a "disengaged" state.
+ */
+ std::optional<Packet> from_mbuf(rte_mbuf* m);
+
+ /**
+ * Transform an LRO rte_mbuf cluster into the "packet" object.
+ * @param m HEAD of the mbufs' cluster to transform
+ *
+ * @return a "optional" object representing the newly received LRO packet if
+ * in an "engaged" state or an error if in a "disengaged" state.
+ */
+ std::optional<Packet> from_mbuf_lro(rte_mbuf* m);
+
+ private:
+ CephContext *cct;
+ std::vector<packet_provider_type> _pkt_providers;
+ std::optional<std::array<uint8_t, 128>> _sw_reta;
+ circular_buffer<Packet> _proxy_packetq;
+ stream<Packet> _rx_stream;
+ circular_buffer<Packet> _tx_packetq;
+ std::vector<void*> _alloc_bufs;
+
+ PerfCounters *perf_logger;
+ DPDKDevice* _dev;
+ uint8_t _dev_port_idx;
+ EventCenter *center;
+ uint8_t _qid;
+ rte_mempool *_pktmbuf_pool_rx;
+ std::vector<rte_mbuf*> _rx_free_pkts;
+ std::vector<rte_mbuf*> _rx_free_bufs;
+ std::vector<fragment> _frags;
+ std::vector<char*> _bufs;
+ size_t _num_rx_free_segs = 0;
+ uint64_t device_stat_time_fd = 0;
+
+#ifdef CEPH_PERF_DEV
+ uint64_t rx_cycles = 0;
+ uint64_t rx_count = 0;
+ uint64_t tx_cycles = 0;
+ uint64_t tx_count = 0;
+#endif
+
+ class DPDKTXPoller : public EventCenter::Poller {
+ DPDKQueuePair *qp;
+
+ public:
+ explicit DPDKTXPoller(DPDKQueuePair *qp)
+ : EventCenter::Poller(qp->center, "DPDK::DPDKTXPoller"), qp(qp) {}
+
+ virtual int poll() {
+ return qp->poll_tx();
+ }
+ } _tx_poller;
+
+ class DPDKRXGCPoller : public EventCenter::Poller {
+ DPDKQueuePair *qp;
+
+ public:
+ explicit DPDKRXGCPoller(DPDKQueuePair *qp)
+ : EventCenter::Poller(qp->center, "DPDK::DPDKRXGCPoller"), qp(qp) {}
+
+ virtual int poll() {
+ return qp->rx_gc();
+ }
+ } _rx_gc_poller;
+ tx_buf_factory _tx_buf_factory;
+ class DPDKRXPoller : public EventCenter::Poller {
+ DPDKQueuePair *qp;
+
+ public:
+ explicit DPDKRXPoller(DPDKQueuePair *qp)
+ : EventCenter::Poller(qp->center, "DPDK::DPDKRXPoller"), qp(qp) {}
+
+ virtual int poll() {
+ return qp->poll_rx_once();
+ }
+ };
+ std::optional<DPDKRXPoller> _rx_poller;
+ class DPDKTXGCPoller : public EventCenter::Poller {
+ DPDKQueuePair *qp;
+
+ public:
+ explicit DPDKTXGCPoller(DPDKQueuePair *qp)
+ : EventCenter::Poller(qp->center, "DPDK::DPDKTXGCPoller"), qp(qp) {}
+
+ virtual int poll() {
+ return qp->_tx_buf_factory.gc();
+ }
+ } _tx_gc_poller;
+ std::vector<rte_mbuf*> _tx_burst;
+ uint16_t _tx_burst_idx = 0;
+};
+
+class DPDKDevice {
+ public:
+ CephContext *cct;
+ PerfCounters *perf_logger;
+ std::vector<std::unique_ptr<DPDKQueuePair>> _queues;
+ std::vector<DPDKWorker*> workers;
+ size_t _rss_table_bits = 0;
+ uint8_t _port_idx;
+ uint16_t _num_queues;
+ unsigned cores;
+ hw_features _hw_features;
+ uint8_t _queues_ready = 0;
+ unsigned _home_cpu;
+ bool _use_lro;
+ bool _enable_fc;
+ std::vector<uint16_t> _redir_table;
+ rss_key_type _rss_key;
+ struct rte_flow *_flow = nullptr;
+ bool _is_i40e_device = false;
+ bool _is_vmxnet3_device = false;
+ std::unique_ptr<AdminSocketHook> dfx_hook;
+
+ public:
+ rte_eth_dev_info _dev_info = {};
+
+ /**
+ * The final stage of a port initialization.
+ * @note Must be called *after* all queues from stage (2) have been
+ * initialized.
+ */
+ int init_port_fini();
+
+ void nic_stats_dump(Formatter *f);
+ void nic_xstats_dump(Formatter *f);
+ private:
+ /**
+ * Port initialization consists of 3 main stages:
+ * 1) General port initialization which ends with a call to
+ * rte_eth_dev_configure() where we request the needed number of Rx and
+ * Tx queues.
+ * 2) Individual queues initialization. This is done in the constructor of
+ * DPDKQueuePair class. In particular the memory pools for queues are allocated
+ * in this stage.
+ * 3) The final stage of the initialization which starts with the call of
+ * rte_eth_dev_start() after which the port becomes fully functional. We
+ * will also wait for a link to get up in this stage.
+ */
+
+
+ /**
+ * First stage of the port initialization.
+ *
+ * @return 0 in case of success and an appropriate error code in case of an
+ * error.
+ */
+ int init_port_start();
+
+ /**
+ * Check the link status of our port for up to 9s, and finally print it.
+ */
+ int check_port_link_status();
+
+ /**
+ * Configures the HW Flow Control
+ */
+ void set_hw_flow_control();
+
+ public:
+ // Builds the device object for one port: sizes the queue table, runs the
+ // first stage of port initialization and registers the per-port perf
+ // counters. Asserts if init_port_start() fails.
+ DPDKDevice(CephContext *c, uint8_t port_idx, uint16_t num_queues, bool use_lro, bool enable_fc):
+ cct(c), _port_idx(port_idx), _num_queues(num_queues),
+ _home_cpu(0), _use_lro(use_lro),
+ _enable_fc(enable_fc) {
+ // One (initially empty) slot per queue; filled by set_local_queue().
+ _queues = std::vector<std::unique_ptr<DPDKQueuePair>>(_num_queues);
+ /* now initialise the port we will use */
+ // NOTE(review): perf_logger is only assigned further down, so it is not
+ // yet set while init_port_start() runs - confirm it is not used there.
+ int ret = init_port_start();
+ if (ret != 0) {
+ ceph_assert(false && "Cannot initialise port\n");
+ }
+ // Perf counters are published under the name "port<port_idx>".
+ std::string name(std::string("port") + std::to_string(port_idx));
+ PerfCountersBuilder plb(cct, name, l_dpdk_dev_first, l_dpdk_dev_last);
+
+ plb.add_u64_counter(l_dpdk_dev_rx_mcast, "dpdk_device_receive_multicast_packets", "DPDK received multicast packets");
+ plb.add_u64_counter(l_dpdk_dev_rx_badcrc_errors, "dpdk_device_receive_badcrc_errors", "DPDK received bad crc errors");
+
+ plb.add_u64_counter(l_dpdk_dev_rx_total_errors, "dpdk_device_receive_total_errors", "DPDK received total_errors");
+ plb.add_u64_counter(l_dpdk_dev_tx_total_errors, "dpdk_device_send_total_errors", "DPDK sendd total_errors");
+ plb.add_u64_counter(l_dpdk_dev_rx_dropped_errors, "dpdk_device_receive_dropped_errors", "DPDK received dropped errors");
+ plb.add_u64_counter(l_dpdk_dev_rx_nombuf_errors, "dpdk_device_receive_nombuf_errors", "DPDK received RX mbuf allocation errors");
+
+ perf_logger = plb.create_perf_counters();
+ cct->get_perfcounters_collection()->add(perf_logger);
+ }
+
+ // Unregisters the admin socket hook, destroys the flow rule if one was
+ // created and stops the port.
+ // NOTE(review): perf_logger, added to the perf counters collection in the
+ // constructor, is not removed or deleted here - confirm it is released
+ // elsewhere.
+ ~DPDKDevice() {
+ cct->get_admin_socket()->unregister_commands(dfx_hook.get());
+ dfx_hook.reset();
+ if (_flow)
+ rte_flow_destroy(_port_idx, _flow, nullptr);
+ rte_eth_dev_stop(_port_idx);
+ }
+
+ DPDKQueuePair& queue_for_cpu(unsigned cpu) { return *_queues[cpu]; }
+ void l2receive(int qid, Packet p) {
+ _queues[qid]->_rx_stream.produce(std::move(p));
+ }
+ subscription<Packet> receive(unsigned cpuid, std::function<int (Packet)> next_packet) {
+ auto sub = _queues[cpuid]->_rx_stream.listen(std::move(next_packet));
+ _queues[cpuid]->rx_start();
+ return sub;
+ }
+ ethernet_address hw_address() {
+ struct rte_ether_addr mac;
+ rte_eth_macaddr_get(_port_idx, &mac);
+
+ return mac.addr_bytes;
+ }
+ hw_features get_hw_features() {
+ return _hw_features;
+ }
+ const rss_key_type& rss_key() const { return _rss_key; }
+ uint16_t hw_queues_count() { return _num_queues; }
+  // Creates the queue pair for queue `qid` of this device, bound to the
+  // given event center. The `hugepages` argument is accepted but not used.
+  std::unique_ptr<DPDKQueuePair> init_local_queue(CephContext *c,
+          EventCenter *center, std::string hugepages, uint16_t qid) {
+    return std::unique_ptr<DPDKQueuePair>(
+        new DPDKQueuePair(c, center, this, qid));
+  }
+ // Maps an RSS hash to a queue id through the redirection table.
+ // NOTE(review): masking with size() - 1 only selects a valid slot
+ // uniformly when _redir_table.size() is a non-zero power of two -
+ // confirm against set_rss_table().
+ unsigned hash2qid(uint32_t hash) {
+ // return hash % hw_queues_count();
+ return _redir_table[hash & (_redir_table.size() - 1)];
+ }
+ void set_local_queue(unsigned i, std::unique_ptr<DPDKQueuePair> qp) {
+ ceph_assert(!_queues[i]);
+ _queues[i] = std::move(qp);
+ }
+ void unset_local_queue(unsigned i) {
+ ceph_assert(_queues[i]);
+ _queues[i].reset();
+ }
+  /**
+   * Picks the CPU that should process a packet received on src_cpuid.
+   *
+   * @param src_cpuid CPU (queue) the packet arrived on
+   * @param hashfn    callable returning the packet's hash; only invoked when
+   *                  the source queue has a software redirection table
+   *
+   * @return src_cpuid when the queue has no software redirection table,
+   *         otherwise the table entry selected by the hash
+   */
+  template <typename Func>
+  unsigned forward_dst(unsigned src_cpuid, Func&& hashfn) {
+    auto& qp = queue_for_cpu(src_cpuid);
+    if (!qp._sw_reta)
+      return src_cpuid;
+
+    // From here on the table is known to be engaged. The assertion that used
+    // to sit here checked the opposite (!qp._sw_reta) and therefore fired on
+    // every call that reached this point.
+    const auto& reta = *qp._sw_reta;
+    // Drop the bits already used for the RSS table before indexing.
+    auto hash = hashfn() >> _rss_table_bits;
+    return reta[hash % reta.size()];
+  }
+ unsigned hash2cpu(uint32_t hash) {
+ // there is an assumption here that qid == get_id() which will
+ // not necessary be true in the future
+ return forward_dst(hash2qid(hash), [hash] { return hash; });
+ }
+
+ hw_features& hw_features_ref() { return _hw_features; }
+
+ const rte_eth_rxconf* def_rx_conf() const {
+ return &_dev_info.default_rxconf;
+ }
+
+ const rte_eth_txconf* def_tx_conf() const {
+ return &_dev_info.default_txconf;
+ }
+
+ /**
+ * Set the RSS table in the device and store it in the internal vector.
+ */
+ void set_rss_table();
+
+ uint8_t port_idx() { return _port_idx; }
+ bool is_i40e_device() const {
+ return _is_i40e_device;
+ }
+ bool is_vmxnet3_device() const {
+ return _is_vmxnet3_device;
+ }
+};
+
+
+std::unique_ptr<DPDKDevice> create_dpdk_net_device(
+ CephContext *c, unsigned cores, uint8_t port_idx = 0,
+ bool use_lro = true, bool enable_fc = true);
+
+
+/**
+ * @return Number of bytes needed for mempool objects of each QP.
+ */
+uint32_t qp_mempool_obj_size();
+
+#endif // CEPH_DPDK_DEV_H
diff --git a/src/msg/async/dpdk/DPDKStack.cc b/src/msg/async/dpdk/DPDKStack.cc
new file mode 100644
index 000000000..1543a530d
--- /dev/null
+++ b/src/msg/async/dpdk/DPDKStack.cc
@@ -0,0 +1,284 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <memory>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <tuple>
+
+#include "common/ceph_argparse.h"
+#include "dpdk_rte.h"
+#include "DPDKStack.h"
+#include "DPDK.h"
+#include "IP.h"
+#include "TCP-Stack.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+#include "common/Cond.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdkstack "
+
+static int dpdk_thread_adaptor(void* f)
+{
+ (*static_cast<std::function<void ()>*>(f))();
+ return 0;
+}
+
+/**
+ * Per-worker initialization; all workers rendezvous here through
+ * function-local static state guarded by `lock`/`cond`:
+ *
+ *  1. worker 0 creates the device, everyone else waits for it;
+ *  2. every worker creates and registers its own queue pair;
+ *  3. worker 0 waits for all queues, finishes the port setup and releases
+ *     the others;
+ *  4. every worker builds its Impl; the last one to finish resets the
+ *     shared state.
+ */
+void DPDKWorker::initialize()
+{
+  static enum {
+    WAIT_DEVICE_STAGE,
+    WAIT_PORT_FIN_STAGE,
+    DONE
+  } create_stage = WAIT_DEVICE_STAGE;
+  static ceph::mutex lock = ceph::make_mutex("DPDKStack::lock");
+  static ceph::condition_variable cond;
+  static unsigned queue_init_done = 0;
+  static unsigned cores = 0;
+  static std::shared_ptr<DPDKDevice> sdev;
+
+  unsigned i = center.get_id();
+  if (i == 0) {
+    // Port id, LRO and flow-control settings all come from the configuration.
+    cores = cct->_conf->ms_async_op_threads;
+    std::unique_ptr<DPDKDevice> dev = create_dpdk_net_device(
+        cct, cores, cct->_conf->ms_dpdk_port_id,
+        cct->_conf->ms_dpdk_lro,
+        cct->_conf->ms_dpdk_hw_flow_control);
+    sdev = std::shared_ptr<DPDKDevice>(dev.release());
+    sdev->workers.resize(cores);
+    ldout(cct, 1) << __func__ << " using " << cores << " cores " << dendl;
+
+    std::lock_guard l{lock};
+    create_stage = WAIT_PORT_FIN_STAGE;
+    cond.notify_all();
+  } else {
+    // Wait until worker 0 has published the device.
+    std::unique_lock l{lock};
+    cond.wait(l, [] { return create_stage > WAIT_DEVICE_STAGE; });
+  }
+  ceph_assert(sdev);
+  if (i < sdev->hw_queues_count()) {
+    auto qp = sdev->init_local_queue(cct, &center, cct->_conf->ms_dpdk_hugepages, i);
+    std::map<unsigned, float> cpu_weights;
+    // Every core without a HW queue of its own that maps onto this queue
+    // gets weight 1. The loop variable is j; the entry used to be written
+    // to cpu_weights[i], which left the proxied cores out of the map.
+    for (unsigned j = sdev->hw_queues_count() + i % sdev->hw_queues_count();
+         j < cores; j += sdev->hw_queues_count())
+      cpu_weights[j] = 1;
+    cpu_weights[i] = cct->_conf->ms_dpdk_hw_queue_weight;
+    qp->configure_proxies(cpu_weights);
+    sdev->set_local_queue(i, std::move(qp));
+    std::lock_guard l{lock};
+    ++queue_init_done;
+    cond.notify_all();
+  } else {
+    // Workers without a HW queue (proxy devices) are not supported.
+    // auto master = qid % sdev->hw_queues_count();
+    // sdev->set_local_queue(create_proxy_net_device(master, sdev.get()));
+    ceph_abort();
+  }
+  if (i == 0) {
+    {
+      // All queues must exist before the port can be started.
+      std::unique_lock l{lock};
+      cond.wait(l, [] { return queue_init_done >= cores; });
+    }
+
+    if (sdev->init_port_fini() < 0) {
+      lderr(cct) << __func__ << " init_port_fini failed " << dendl;
+      ceph_abort();
+    }
+    std::lock_guard l{lock};
+    create_stage = DONE;
+    cond.notify_all();
+  } else {
+    std::unique_lock l{lock};
+    cond.wait(l, [&] { return create_stage > WAIT_PORT_FIN_STAGE; });
+  }
+
+  sdev->workers[i] = this;
+  _impl = std::unique_ptr<DPDKWorker::Impl>(
+          new DPDKWorker::Impl(cct, i, &center, sdev));
+  {
+    // The last worker through resets the shared state and drops the static
+    // reference to the device.
+    std::lock_guard l{lock};
+    if (!--queue_init_done) {
+      create_stage = WAIT_DEVICE_STAGE;
+      sdev.reset();
+    }
+  }
+}
+
+using AvailableIPAddress = std::tuple<std::string, std::string, std::string>;
+// Splits the three configuration strings with string_to_vec() and appends
+// one (ip, gateway, netmask) tuple per position to res.
+// Returns false, leaving res untouched, when no IP is given or the three
+// lists differ in length.
+static bool parse_available_address(
+  const std::string &ips, const std::string &gates,
+  const std::string &masks, std::vector<AvailableIPAddress> &res)
+{
+  std::vector<std::string> ip_vec, gate_vec, mask_vec;
+  string_to_vec(ip_vec, ips);
+  string_to_vec(gate_vec, gates);
+  string_to_vec(mask_vec, masks);
+
+  const size_t count = ip_vec.size();
+  const bool same_length = gate_vec.size() == count && mask_vec.size() == count;
+  if (count == 0 || !same_length)
+    return false;
+
+  for (size_t idx = 0; idx != count; ++idx) {
+    res.emplace_back(ip_vec[idx], gate_vec[idx], mask_vec[idx]);
+  }
+  return true;
+}
+
+// Looks for the first configured address that parses and is the same host
+// as ip. On a match, stores its index in res and returns true; entries that
+// fail to parse are skipped. Returns false (res untouched) when none match.
+static bool match_available_address(const std::vector<AvailableIPAddress> &avails,
+                                    const entity_addr_t &ip, int &res)
+{
+  size_t idx = 0;
+  for (const auto &avail : avails) {
+    entity_addr_t candidate;
+    const char *text = std::get<0>(avail).c_str();
+    if (candidate.parse(text) && candidate.is_same_host(ip)) {
+      res = idx;
+      return true;
+    }
+    ++idx;
+  }
+  return false;
+}
+
+// Builds the per-worker network state (interface and IPv4 stack) on top of
+// the shared device and configures host/gateway/netmask from the
+// ms_dpdk_*_ipv4_addr options. Aborts if those options cannot be parsed
+// into at least one complete (ip, gateway, netmask) triple.
+DPDKWorker::Impl::Impl(CephContext *cct, unsigned i, EventCenter *c, std::shared_ptr<DPDKDevice> dev)
+ : id(i), _netif(cct, dev, c), _dev(dev), _inet(cct, c, &_netif)
+{
+ std::vector<AvailableIPAddress> tuples;
+ bool parsed = parse_available_address(cct->_conf.get_val<std::string>("ms_dpdk_host_ipv4_addr"),
+ cct->_conf.get_val<std::string>("ms_dpdk_gateway_ipv4_addr"),
+ cct->_conf.get_val<std::string>("ms_dpdk_netmask_ipv4_addr"), tuples);
+ if (!parsed) {
+ lderr(cct) << __func__ << " no available address "
+ << cct->_conf.get_val<std::string>("ms_dpdk_host_ipv4_addr") << ", "
+ << cct->_conf.get_val<std::string>("ms_dpdk_gateway_ipv4_addr") << ", "
+ << cct->_conf.get_val<std::string>("ms_dpdk_netmask_ipv4_addr") << ", "
+ << dendl;
+ ceph_abort();
+ }
+ // Only the first configured triple is used.
+ _inet.set_host_address(ipv4_address(std::get<0>(tuples[0])));
+ _inet.set_gw_address(ipv4_address(std::get<1>(tuples[0])));
+ _inet.set_netmask_address(ipv4_address(std::get<2>(tuples[0])));
+}
+
+DPDKWorker::Impl::~Impl()
+{
+ _dev->unset_local_queue(id);
+}
+
+// Starts listening for TCP/IPv4 connections on the port of `sa`.
+//
+// sa:        address to listen on; must be AF_INET. Only its port and type
+//            are passed on to tcpv4_listen().
+// addr_slot: forwarded to tcpv4_listen()
+// opt:       socket options, forwarded to tcpv4_listen()
+// sock:      receives the server socket; must not be null
+//
+// Returns whatever tcpv4_listen() returns.
+int DPDKWorker::listen(entity_addr_t &sa,
+ unsigned addr_slot,
+ const SocketOptions &opt,
+ ServerSocket *sock)
+{
+ ceph_assert(sa.get_family() == AF_INET);
+ ceph_assert(sock);
+
+ ldout(cct, 10) << __func__ << " addr " << sa << dendl;
+ // Disabled: per-listen selection of the host/gateway/netmask triple that
+ // matches `sa`. The addresses are set once in DPDKWorker::Impl::Impl()
+ // instead.
+ // vector<AvailableIPAddress> tuples;
+ // bool parsed = parse_available_address(cct->_conf->ms_dpdk_host_ipv4_addr,
+ // cct->_conf->ms_dpdk_gateway_ipv4_addr,
+ // cct->_conf->ms_dpdk_netmask_ipv4_addr, tuples);
+ // if (!parsed) {
+ // lderr(cct) << __func__ << " no available address "
+ // << cct->_conf->ms_dpdk_host_ipv4_addr << ", "
+ // << cct->_conf->ms_dpdk_gateway_ipv4_addr << ", "
+ // << cct->_conf->ms_dpdk_netmask_ipv4_addr << ", "
+ // << dendl;
+ // return -EINVAL;
+ // }
+ // int idx;
+ // parsed = match_available_address(tuples, sa, idx);
+ // if (!parsed) {
+ // lderr(cct) << __func__ << " no matched address for " << sa << dendl;
+ // return -EINVAL;
+ // }
+ // _inet.set_host_address(ipv4_address(std::get<0>(tuples[idx])));
+ // _inet.set_gw_address(ipv4_address(std::get<1>(tuples[idx])));
+ // _inet.set_netmask_address(ipv4_address(std::get<2>(tuples[idx])));
+ return tcpv4_listen(_impl->_inet.get_tcp(), sa.get_port(), opt, sa.get_type(),
+ addr_slot, sock);
+}
+
+// Opens a TCP/IPv4 connection to `addr` through this worker's stack and
+// stores it in `socket`. `opts` is not used here. Returns the result of
+// tcpv4_connect(); the address is logged after the attempt.
+int DPDKWorker::connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket)
+{
+ // NOTE: unlike listen(), the address family is not checked here.
+ // ceph_assert(addr.get_family() == AF_INET);
+ int r = tcpv4_connect(_impl->_inet.get_tcp(), addr, socket);
+ ldout(cct, 10) << __func__ << " addr " << addr << dendl;
+ return r;
+}
+
+// Stores `func` and launches it on a DPDK slave lcore.
+//
+// The function object is kept in `funcs`; the launched lcore receives a
+// pointer to that stored element, so the element must stay where it is
+// for as long as the worker runs.
+void DPDKStack::spawn_worker(std::function<void ()> &&func)
+{
+ // create an extra master thread
+ //
+ funcs.push_back(std::move(func));
+ int r = 0;
+ r = eal.start();
+ if (r < 0) {
+ lderr(cct) << __func__ << " start dpdk rte failed, r=" << r << dendl;
+ ceph_abort();
+ }
+ // if eal.start already called by NVMEDevice, we will select 1..n
+ // cores
+ unsigned nr_worker = funcs.size();
+ ceph_assert(rte_lcore_count() >= nr_worker);
+ // Walk the slave lcores and stop at the nr_worker-th one: the k-th
+ // spawned worker lands on the k-th slave lcore.
+ unsigned core_id;
+ RTE_LCORE_FOREACH_SLAVE(core_id) {
+ if (--nr_worker == 0) {
+ break;
+ }
+ }
+ // Type-erased pointer to the stored std::function; dpdk_thread_adaptor()
+ // casts it back and invokes it.
+ void *adapted_func = static_cast<void*>(&funcs.back());
+ eal.execute_on_master([adapted_func, core_id, this]() {
+ int r = rte_eal_remote_launch(dpdk_thread_adaptor, adapted_func, core_id);
+ if (r < 0) {
+ lderr(cct) << __func__ << " remote launch failed, r=" << r << dendl;
+ ceph_abort();
+ }
+ });
+}
+
+// Waits for worker `i` to finish; after the last worker has been joined the
+// EAL is stopped.
+void DPDKStack::join_worker(unsigned i)
+{
+ // NOTE(review): this assumes worker i runs on lcore id i+1, whereas
+ // spawn_worker() picks lcores via RTE_LCORE_FOREACH_SLAVE - confirm the
+ // two always agree. The lambda captures i by reference, so
+ // execute_on_master() is presumably synchronous - TODO confirm.
+ eal.execute_on_master([&]() {
+ rte_eal_wait_lcore(i+1);
+ });
+ if (i+1 == get_num_worker())
+ eal.stop();
+}
diff --git a/src/msg/async/dpdk/DPDKStack.h b/src/msg/async/dpdk/DPDKStack.h
new file mode 100644
index 000000000..3f64f5669
--- /dev/null
+++ b/src/msg/async/dpdk/DPDKStack.h
@@ -0,0 +1,272 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#ifndef CEPH_MSG_DPDKSTACK_H
+#define CEPH_MSG_DPDKSTACK_H
+
+#include <functional>
+#include <optional>
+
+#include "common/ceph_context.h"
+
+#include "msg/async/Stack.h"
+#include "net.h"
+#include "const.h"
+#include "IP.h"
+#include "Packet.h"
+#include "dpdk_rte.h"
+
+class interface;
+
+template <typename Protocol>
+class NativeConnectedSocketImpl;
+
+// DPDKServerSocketImpl
+//
+// ServerSocketImpl backed by a userspace protocol listener
+// (Protocol::listener). All operations are forwarded to that listener.
+template <typename Protocol>
+class DPDKServerSocketImpl : public ServerSocketImpl {
+ // Listener obtained from the protocol for the requested port.
+ typename Protocol::listener _listener;
+ public:
+ DPDKServerSocketImpl(Protocol& proto, uint16_t port, const SocketOptions &opt,
+ int type, unsigned addr_slot);
+ // Starts listening; returns the listener's result code.
+ int listen() {
+ return _listener.listen();
+ }
+ virtual int accept(ConnectedSocket *s, const SocketOptions &opts, entity_addr_t *out, Worker *w) override;
+ virtual void abort_accept() override;
+ // Descriptor of the underlying listener.
+ virtual int fd() const override {
+ return _listener.fd();
+ }
+ // Priorities are not supported by this implementation: no-op.
+ virtual void set_priority(int sd, int prio, int domain) override {}
+};
+
+// NativeConnectedSocketImpl
+template <typename Protocol>
+class NativeConnectedSocketImpl : public ConnectedSocketImpl {
+ typename Protocol::connection _conn;
+ uint32_t _cur_frag = 0;
+ uint32_t _cur_off = 0;
+ std::optional<Packet> _buf;
+ std::optional<bufferptr> _cache_ptr;
+
+ public:
+ explicit NativeConnectedSocketImpl(typename Protocol::connection conn)
+ : _conn(std::move(conn)) {}
+  // Move constructor: takes over the connection together with the complete
+  // read state. The member is named _buf (the previous initializer referred
+  // to a non-existent rhs.buf), and the fragment cursor and cached pointer
+  // travel with the packet they describe so a partially consumed packet is
+  // continued at the right position.
+  NativeConnectedSocketImpl(NativeConnectedSocketImpl &&rhs)
+    : _conn(std::move(rhs._conn)),
+      _cur_frag(rhs._cur_frag),
+      _cur_off(rhs._cur_off),
+      _buf(std::move(rhs._buf)),
+      _cache_ptr(std::move(rhs._cache_ptr)) {}
+ virtual int is_connected() override {
+ return _conn.is_connected();
+ }
+
+ // Copies up to `len` bytes of received data into `buf`.
+ //
+ // Data is pulled one chunk at a time with zero_copy_read(); a chunk that
+ // does not fit completely is kept in _cache_ptr and continued on the
+ // next call.
+ //
+ // Returns the number of bytes copied, -EAGAIN if nothing could be copied,
+ // or the value of zero_copy_read() when that is <= 0 and not -EAGAIN.
+ // NOTE(review): on that last path bytes already copied into `buf` during
+ // this call are not reported to the caller - confirm this is intended.
+ virtual ssize_t read(char *buf, size_t len) override {
+ size_t left = len;
+ ssize_t r = 0;
+ size_t off = 0;
+ while (left > 0) {
+ // No pending chunk: fetch the next one.
+ if (!_cache_ptr) {
+ _cache_ptr.emplace();
+ r = zero_copy_read(*_cache_ptr);
+ if (r <= 0) {
+ _cache_ptr.reset();
+ if (r == -EAGAIN)
+ break;
+ return r;
+ }
+ }
+ if (_cache_ptr->length() <= left) {
+ // The whole chunk fits: copy it and drop it.
+ _cache_ptr->copy_out(0, _cache_ptr->length(), buf+off);
+ left -= _cache_ptr->length();
+ off += _cache_ptr->length();
+ _cache_ptr.reset();
+ } else {
+ // Only part of the chunk fits: copy that part and shrink the cached
+ // pointer to the unread remainder.
+ _cache_ptr->copy_out(0, left, buf+off);
+ _cache_ptr->set_offset(_cache_ptr->offset() + left);
+ _cache_ptr->set_length(_cache_ptr->length() - left);
+ left = 0;
+ break;
+ }
+ }
+ return len - left ? len - left : -EAGAIN;
+ }
+
+private:
+ // Exposes the next received fragment as a bufferptr without copying.
+ //
+ // Returns the fragment length, -EAGAIN when no packet is available, or
+ // the connection's get_errno() value when that is <= 0.
+ ssize_t zero_copy_read(bufferptr &data) {
+ auto err = _conn.get_errno();
+ if (err <= 0)
+ return err;
+
+ // Fetch a new packet once the previous one has been fully consumed.
+ if (!_buf) {
+ _buf = std::move(_conn.read());
+ if (!_buf)
+ return -EAGAIN;
+ }
+
+ fragment &f = _buf->frag(_cur_frag);
+ // The shared packet is bound into the deleter below; presumably this
+ // keeps the fragment's memory alive until the bufferptr is released.
+ Packet p = _buf->share(_cur_off, f.size);
+ auto del = std::bind(
+ [](Packet &p) {}, std::move(p));
+ data = buffer::claim_buffer(
+ f.size, f.base, make_deleter(std::move(del)));
+ // Advance the cursor; drop the packet after its last fragment.
+ if (++_cur_frag == _buf->nr_frags()) {
+ _cur_frag = 0;
+ _cur_off = 0;
+ _buf.reset();
+ } else {
+ _cur_off += f.size;
+ }
+ ceph_assert(data.length());
+ return data.length();
+ }
+ // Queues as much of `bl` as the connection currently accepts.
+ //
+ // Whole buffers are taken while they fit into the available space; only
+ // when the very first buffer alone is too large is it truncated to the
+ // available size. The consumed part is removed from `bl`. `more` is not
+ // used.
+ //
+ // Returns the connection's error if it is negative, 0 when no space is
+ // available, otherwise the result of _conn.send().
+ virtual ssize_t send(bufferlist &bl, bool more) override {
+ auto err = _conn.get_errno();
+ if (err < 0)
+ return (ssize_t)err;
+
+ size_t available = _conn.peek_sent_available();
+ if (available == 0) {
+ return 0;
+ }
+
+ std::vector<fragment> frags;
+ auto pb = bl.buffers().begin();
+ uint64_t len = 0;
+ uint64_t seglen = 0;
+ while (len < available && pb != bl.buffers().end()) {
+ seglen = pb->length();
+ // Buffer length is zero, no need to send, so skip it
+ if (seglen == 0) {
+ ++pb;
+ continue;
+ }
+ if (len + seglen > available) {
+ // Stop here if at least one fragment has already been gathered: there
+ // is no space left for the next ptr.
+ if (len > 0)
+ break;
+ seglen = std::min(seglen, available);
+ }
+ len += seglen;
+ frags.push_back(fragment{(char*)pb->c_str(), seglen});
+ ++pb;
+ }
+
+ // In both branches the bytes being sent are moved into a bufferlist that
+ // is bound into the packet's deleter; presumably this keeps the memory
+ // referenced by `frags` alive until the packet is destroyed.
+ if (len != bl.length()) {
+ // Partial send: split the first `len` bytes off `bl`.
+ bufferlist swapped;
+ bl.splice(0, len, &swapped);
+ auto del = std::bind(
+ [](bufferlist &bl) {}, std::move(swapped));
+ return _conn.send(Packet(std::move(frags), make_deleter(std::move(del))));
+ } else {
+ // Everything is sent: hand over the whole list.
+ auto del = std::bind(
+ [](bufferlist &bl) {}, std::move(bl));
+
+ return _conn.send(Packet(std::move(frags), make_deleter(std::move(del))));
+ }
+ }
+
+public:
+ virtual void shutdown() override {
+ _conn.close_write();
+ }
+ // FIXME need to impl close
+ virtual void close() override {
+ _conn.close_write();
+ }
+ virtual int fd() const override {
+ return _conn.fd();
+ }
+};
+
+template <typename Protocol>
+DPDKServerSocketImpl<Protocol>::DPDKServerSocketImpl(
+ Protocol& proto, uint16_t port, const SocketOptions &opt,
+ int type, unsigned addr_slot)
+ : ServerSocketImpl(type, addr_slot), _listener(proto.listen(port)) {}
+
+// Accepts one pending connection from the listener.
+//
+// s:       receives the connected socket on success
+// options: not used
+// out:     if non-null, receives the peer address (typed with addr_type)
+// w:       not used
+//
+// Returns 0 on success, the listener's error if it is negative, or -EAGAIN
+// when no connection is pending.
+template <typename Protocol>
+int DPDKServerSocketImpl<Protocol>::accept(ConnectedSocket *s, const SocketOptions &options, entity_addr_t *out, Worker *w) {
+  if (_listener.get_errno() < 0) {
+    return _listener.get_errno();
+  }
+
+  auto conn = _listener.accept();
+  if (!conn) {
+    return -EAGAIN;
+  }
+
+  if (out != nullptr) {
+    *out = conn->remote_addr();
+    out->set_type(addr_type);
+  }
+
+  // Wrap the accepted connection and transfer it to the caller's socket.
+  using SocketImpl = NativeConnectedSocketImpl<Protocol>;
+  std::unique_ptr<SocketImpl> impl(new SocketImpl(std::move(*conn)));
+  *s = ConnectedSocket(std::move(impl));
+  return 0;
+}
+
+template <typename Protocol>
+void DPDKServerSocketImpl<Protocol>::abort_accept() {
+ _listener.abort_accept();
+}
+
+// Worker that runs connections over the userspace DPDK network stack.
+class DPDKWorker : public Worker {
+ // Per-worker network state, created in initialize() and released in
+ // destroy().
+ struct Impl {
+ // Worker id; also the index of this worker's queue in the device.
+ unsigned id;
+ interface _netif;
+ std::shared_ptr<DPDKDevice> _dev;
+ ipv4 _inet;
+ Impl(CephContext *cct, unsigned i, EventCenter *c, std::shared_ptr<DPDKDevice> dev);
+ // Unregisters this worker's queue from the device.
+ ~Impl();
+ };
+ std::unique_ptr<Impl> _impl;
+
+ virtual void initialize() override;
+ // Installs a packet filter on this worker's IPv4 stack.
+ void set_ipv4_packet_filter(ip_packet_filter* filter) {
+ _impl->_inet.set_packet_filter(filter);
+ }
+ using tcp4 = tcp<ipv4_traits>;
+
+ public:
+ explicit DPDKWorker(CephContext *c, unsigned i): Worker(c, i) {}
+ virtual int listen(entity_addr_t &addr, unsigned addr_slot,
+ const SocketOptions &opts, ServerSocket *) override;
+ virtual int connect(const entity_addr_t &addr, const SocketOptions &opts, ConnectedSocket *socket) override;
+ // Hands an (L2 address, L3 address) pair to this worker's IPv4 stack.
+ void arp_learn(ethernet_address l2, ipv4_address l3) {
+ _impl->_inet.learn(l2, l3);
+ }
+ // Drops the per-worker network state.
+ virtual void destroy() override {
+ _impl.reset();
+ }
+
+ friend class DPDKServerSocketImpl<tcp4>;
+};
+
+using namespace dpdk;
+// NetworkStack whose workers run on DPDK lcores instead of regular threads.
+class DPDKStack : public NetworkStack {
+ // Worker entry points. spawn_worker() passes the address of the stored
+ // element to the launched lcore, so elements must not move.
+ std::vector<std::function<void()> > funcs;
+
+ virtual Worker* create_worker(CephContext *c, unsigned worker_id) override {
+ return new DPDKWorker(c, worker_id);
+ }
+ // Thread names are not set for this stack: no-op.
+ virtual void rename_thread(unsigned id) override {}
+
+ public:
+ explicit DPDKStack(CephContext *cct): NetworkStack(cct), eal(cct) {
+ // Reserve up front so push_back() in spawn_worker() does not reallocate
+ // (and thereby move already-launched functions) for up to
+ // ms_async_op_threads workers.
+ funcs.reserve(cct->_conf->ms_async_op_threads);
+ }
+ virtual bool support_local_listen_table() const override { return true; }
+
+ virtual void spawn_worker(std::function<void ()> &&func) override;
+ virtual void join_worker(unsigned i) override;
+ private:
+ dpdk::eal eal;
+};
+
+#endif
diff --git a/src/msg/async/dpdk/EventDPDK.cc b/src/msg/async/dpdk/EventDPDK.cc
new file mode 100644
index 000000000..a38ddcc99
--- /dev/null
+++ b/src/msg/async/dpdk/EventDPDK.cc
@@ -0,0 +1,85 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/errno.h"
+#include "DPDKStack.h"
+#include "EventDPDK.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_ms
+
+#undef dout_prefix
+#define dout_prefix *_dout << "DPDKDriver."
+
+// No driver-level initialisation is performed; both arguments are ignored
+// and 0 is always returned.
+int DPDKDriver::init(EventCenter *c, int nevent)
+{
+ return 0;
+}
+
+/*
+ * Register interest in add_mask events on fd with the userspace event
+ * manager. cur_mask is only logged.
+ *
+ * Returns 0 on success, or the negative error code reported by
+ * manager.listen() on failure.
+ */
+int DPDKDriver::add_event(int fd, int cur_mask, int add_mask)
+{
+  ldout(cct, 20) << __func__ << " add event fd=" << fd << " cur_mask=" << cur_mask
+                 << " add_mask=" << add_mask << dendl;
+
+  const int r = manager.listen(fd, add_mask);
+  if (r < 0) {
+    lderr(cct) << __func__ << " add fd=" << fd << " failed. "
+               << cpp_strerror(-r) << dendl;
+    // The failure reason is carried in r (it is what gets logged above).
+    // errno is unrelated here and may be stale or even 0, which would turn
+    // a failure into an apparent success.
+    return r;
+  }
+
+  return 0;
+}
+
+/*
+ * Stop watching the delmask events on fd. cur_mask is only logged.
+ *
+ * Returns 0 on success (including when delmask is EVENT_NONE), otherwise the
+ * negative code returned by manager.unlisten().
+ */
+int DPDKDriver::del_event(int fd, int cur_mask, int delmask)
+{
+  ldout(cct, 20) << __func__ << " del event fd=" << fd << " cur_mask=" << cur_mask
+                 << " delmask=" << delmask << dendl;
+
+  // An empty mask leaves nothing to unregister.
+  if (delmask == EVENT_NONE) {
+    return 0;
+  }
+
+  const int rc = manager.unlisten(fd, delmask);
+  if (rc < 0) {
+    lderr(cct) << __func__ << " delete fd=" << fd << " delmask=" << delmask
+               << " failed." << cpp_strerror(-rc) << dendl;
+    return rc;
+  }
+  return 0;
+}
+
+// Resizing is a no-op for this driver; newsize is ignored and 0 is returned.
+int DPDKDriver::resize_events(int newsize)
+{
+ return 0;
+}
+
+/*
+ * Poll the userspace event manager and report the fired events.
+ *
+ * At most 512 events are collected per call. When manager.poll() returns a
+ * positive count, fired_events is resized to that count and filled with the
+ * fd/mask pairs; otherwise fired_events is left untouched.
+ *
+ * Returns whatever manager.poll() returned.
+ */
+int DPDKDriver::event_wait(std::vector<FiredFileEvent> &fired_events, struct timeval *tvp)
+{
+  // A compile-time bound keeps the scratch buffers ordinary fixed-size
+  // arrays; sizing them from a non-const int makes them variable-length
+  // arrays, which are a compiler extension rather than standard C++.
+  constexpr int num_events = 512;
+  int events[num_events];
+  int masks[num_events];
+
+  const int retval = manager.poll(events, masks, num_events, tvp);
+  if (retval > 0) {
+    fired_events.resize(retval);
+    for (int i = 0; i < retval; i++) {
+      fired_events[i].fd = events[i];
+      fired_events[i].mask = masks[i];
+    }
+  }
+  return retval;
+}
diff --git a/src/msg/async/dpdk/EventDPDK.h b/src/msg/async/dpdk/EventDPDK.h
new file mode 100644
index 000000000..ccf2cd28d
--- /dev/null
+++ b/src/msg/async/dpdk/EventDPDK.h
@@ -0,0 +1,40 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_EVENTDPDK_H
+#define CEPH_EVENTDPDK_H
+
+#include "msg/async/Event.h"
+#include "msg/async/Stack.h"
+#include "UserspaceEvent.h"
+
+// EventDriver implementation backed by a UserspaceEventManager instead of a
+// kernel polling facility.
+class DPDKDriver : public EventDriver {
+ CephContext *cct;
+
+ public:
+ // Publicly accessible event manager that add_event/del_event/event_wait
+ // delegate to.
+ UserspaceEventManager manager;
+
+ explicit DPDKDriver(CephContext *c): cct(c), manager(c) {}
+ virtual ~DPDKDriver() { }
+
+ int init(EventCenter *c, int nevent) override;
+ int add_event(int fd, int cur_mask, int add_mask) override;
+ int del_event(int fd, int cur_mask, int del_mask) override;
+ int resize_events(int newsize) override;
+ int event_wait(std::vector<FiredFileEvent> &fired_events, struct timeval *tp) override;
+ // This driver never asks to be woken up.
+ bool need_wakeup() override { return false; }
+};
+
+#endif //CEPH_EVENTDPDK_H
diff --git a/src/msg/async/dpdk/IP.cc b/src/msg/async/dpdk/IP.cc
new file mode 100644
index 000000000..0bfb21b16
--- /dev/null
+++ b/src/msg/async/dpdk/IP.cc
@@ -0,0 +1,481 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "common/perf_counters.h"
+
+#include "capture.h"
+#include "IP.h"
+#include "toeplitz.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdk "
+
+// Writes the address in dotted-quad notation, most significant octet first.
+std::ostream& operator<<(std::ostream& os, const ipv4_address& a) {
+  const auto addr = a.ip;
+  os << ((addr >> 24) & 0xff) << ".";
+  os << ((addr >> 16) & 0xff) << ".";
+  os << ((addr >> 8) & 0xff) << ".";
+  os << ((addr >> 0) & 0xff);
+  return os;
+}
+
+// Out-of-class definitions of ipv4's static data members. _frag_timeout is
+// the age after which frag_timeout() drops an incomplete reassembly entry.
+utime_t ipv4::_frag_timeout = utime_t(30, 0);
+constexpr uint32_t ipv4::_frag_low_thresh;
+constexpr uint32_t ipv4::_frag_high_thresh;
+
+// Event callback that runs the fragment-expiry pass of the ipv4 instance it
+// was created for. The ipv4 pointer is not owned.
+class C_handle_frag_timeout : public EventCallback {
+  ipv4 *_ipv4;
+
+ public:
+  // explicit: an ipv4* must not convert silently into a callback object.
+  explicit C_handle_frag_timeout(ipv4 *i): _ipv4(i) {}
+  // fd_or_id is unused; the callback always targets the stored instance.
+  void do_request(uint64_t fd_or_id) override {
+    _ipv4->frag_timeout();
+  }
+};
+
+// Perf counter indices for the "ipv4" PerfCounters instance; the first/last
+// entries are the bounds passed to PerfCountersBuilder.
+enum {
+ l_dpdk_qp_first = 99000,
+ l_dpdk_total_linearize_operations,
+ l_dpdk_qp_last
+};
+
+// Packed ICMP header as overlaid on packet data. Only echo request/reply are
+// named because those are the only message types icmp::received() handles.
+struct icmp_hdr {
+ enum class msg_type : uint8_t {
+ echo_reply = 0,
+ echo_request = 8,
+ };
+ msg_type type;
+ uint8_t code;
+ uint16_t csum;
+ // Remaining four header bytes; not interpreted by the code in this file.
+ uint32_t rest;
+} __attribute__((packed));
+
+// Builds the IPv4 layer on top of netif: registers as the ipv4 L3 protocol,
+// subscribes to received packets, wires up the TCP and ICMP handlers, creates
+// the "ipv4" perf counters and allocates the fragment-timeout callback.
+// Addresses start out as 0 and are set later through the set_*_address calls.
+ipv4::ipv4(CephContext *c, EventCenter *cen, interface* netif)
+ : cct(c), center(cen), _netif(netif), _global_arp(netif),
+ _arp(c, _global_arp, cen),
+ _host_address(0), _gw_address(0), _netmask(0),
+ // get_packet() is the source of outgoing packets for the L3 layer.
+ _l3(netif, eth_protocol_num::ipv4, [this] { return get_packet(); }),
+ _rx_packets(
+ _l3.receive(
+ [this] (Packet p, ethernet_address ea) {
+ return handle_received_packet(std::move(p), ea);
+ },
+ [this] (forward_hash& out_hash_data, Packet& p, size_t off) {
+ return forward(out_hash_data, p, off);
+ }
+ )
+ ),
+ _tcp(*this, cen), _icmp(c, *this),
+ // Protocol-number -> handler table; unlisted protocols have no handler.
+ _l4({{ uint8_t(ip_protocol_num::tcp), &_tcp },
+ { uint8_t(ip_protocol_num::icmp), &_icmp }}),
+ _packet_filter(nullptr)
+{
+ PerfCountersBuilder plb(cct, "ipv4", l_dpdk_qp_first, l_dpdk_qp_last);
+ plb.add_u64_counter(l_dpdk_total_linearize_operations, "dpdk_ip_linearize_operations", "DPDK IP Packet linearization operations");
+ perf_logger = plb.create_perf_counters();
+ cct->get_perfcounters_collection()->add(perf_logger);
+ // Released in ~ipv4().
+ frag_handler = new C_handle_frag_timeout(this);
+}
+
+/*
+ * Collect the fields used to hash a packet onto a CPU.
+ *
+ * Appends the source and destination addresses (as found in the packet) to
+ * out_hash_data and, for unfragmented datagrams with a registered L4
+ * handler, lets that handler append its own fields.
+ *
+ * off is the offset of the IP header inside p. Returns false when p is too
+ * short to contain an IP header at off, true otherwise.
+ */
+bool ipv4::forward(forward_hash& out_hash_data, Packet& p, size_t off)
+{
+  auto iph = p.get_header<ip_hdr>(off);
+  if (!iph) {
+    // get_header() yields null for a packet without a complete header;
+    // dereferencing it below would crash on a truncated frame.
+    // NOTE(review): callers presumably treat false as "no hash data
+    // available" - confirm against the l3 forward callback's user.
+    return false;
+  }
+
+  out_hash_data.push_back(iph->src_ip.ip);
+  out_hash_data.push_back(iph->dst_ip.ip);
+
+  auto h = iph->ntoh();
+  auto l4 = _l4[h.ip_proto];
+  if (l4) {
+    if (h.mf() == false && h.offset() == 0) {
+      // This IP datagram is atomic, forward according to tcp connection hash
+      l4->forward(out_hash_data, p, off + sizeof(ip_hdr));
+    }
+    // else forward according to ip fields only
+  }
+  return true;
+}
+
+/*
+ * Process one received IPv4 packet.
+ *
+ * Validates the header (checksum unless offloaded or already reassembled,
+ * header length, total length), learns the sender's MAC for on-link
+ * sources, gives the optional packet filter a chance to consume the packet,
+ * then either feeds it into fragment reassembly or hands the payload to the
+ * L4 handler registered for its protocol.
+ *
+ * from is the sender's ethernet address. Always returns 0; packets that
+ * fail a check or have no handler are silently dropped.
+ */
+int ipv4::handle_received_packet(Packet p, ethernet_address from)
+{
+  auto iph = p.get_header<ip_hdr>(0);
+  if (!iph) {
+    return 0;
+  }
+
+  // Skip checking csum of reassembled IP datagram
+  if (!get_hw_features().rx_csum_offload && !p.offload_info_ref().reassembled) {
+    checksummer csum;
+    csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph));
+    if (csum.get() != 0) {
+      return 0;
+    }
+  }
+
+  auto h = iph->ntoh();
+  unsigned ip_len = h.len;
+  unsigned ip_hdr_len = h.ihl * 4;
+  unsigned pkt_len = p.len();
+  auto offset = h.offset();
+
+  ldout(cct, 10) << __func__ << " get " << std::hex << int(h.ip_proto)
+                 << std::dec << " packet from "
+                 << h.src_ip << " -> " << h.dst_ip << " id=" << h.id
+                 << " ip_len=" << ip_len << " ip_hdr_len=" << ip_hdr_len
+                 << " pkt_len=" << pkt_len << " offset=" << offset << dendl;
+
+  // Drop packets whose header length is impossible: shorter than the fixed
+  // header, or longer than the datagram itself. ip_hdr_len comes straight
+  // off the wire and is later used to trim and share the packet, so it must
+  // not exceed the data actually present.
+  if (ip_hdr_len < sizeof(ip_hdr) || ip_hdr_len > ip_len) {
+    return 0;
+  }
+
+  if (pkt_len > ip_len) {
+    // Trim extra data in the packet beyond IP total length
+    p.trim_back(pkt_len - ip_len);
+  } else if (pkt_len < ip_len) {
+    // Drop if it contains less than IP total length
+    return 0;
+  }
+  // Drop if the reassembled datagram will be larger than maximum IP size
+  if (offset + p.len() > ip_packet_len_max) {
+    return 0;
+  }
+
+  // FIXME: process options
+  if (in_my_netmask(h.src_ip) && h.src_ip != _host_address) {
+    ldout(cct, 20) << __func__ << " learn mac " << from << " with " << h.src_ip << dendl;
+    _arp.learn(from, h.src_ip);
+  }
+
+  if (_packet_filter) {
+    bool handled = false;
+    _packet_filter->handle(p, &h, from, handled);
+    if (handled) {
+      return 0;
+    }
+  }
+
+  if (h.dst_ip != _host_address) {
+    // FIXME: forward
+    return 0;
+  }
+
+  // Does this IP datagram need reassembly
+  auto mf = h.mf();
+  if (mf == true || offset != 0) {
+    frag_limit_mem();
+    auto frag_id = ipv4_frag_id{h.src_ip, h.dst_ip, h.id, h.ip_proto};
+    auto& frag = _frags[frag_id];
+    if (mf == false) {
+      frag.last_frag_received = true;
+    }
+    // This is a newly created frag_id
+    if (frag.mem_size == 0) {
+      _frags_age.push_back(frag_id);
+      frag.rx_time = ceph_clock_now();
+    }
+    auto added_size = frag.merge(h, offset, std::move(p));
+    _frag_mem += added_size;
+    if (frag.is_complete()) {
+      // All the fragments are received
+      auto dropped_size = frag.mem_size;
+      auto& ip_data = frag.data.map.begin()->second;
+      // Choose a cpu to forward this packet
+      auto cpu_id = center->get_id();
+      auto l4 = _l4[h.ip_proto];
+      if (l4) {
+        size_t l4_offset = 0;
+        forward_hash hash_data;
+        hash_data.push_back(hton(h.src_ip.ip));
+        hash_data.push_back(hton(h.dst_ip.ip));
+        l4->forward(hash_data, ip_data, l4_offset);
+        cpu_id = _netif->hash2cpu(toeplitz_hash(_netif->rss_key(), hash_data));
+      }
+
+      // No need to forward if the dst cpu is the current cpu
+      if (cpu_id == center->get_id()) {
+        // l4 is null when no handler is registered for this protocol; in
+        // that case cpu_id was never changed and we always land here, so
+        // the handler must be checked before use. The datagram is then
+        // discarded, as on the unfragmented path below.
+        if (l4) {
+          l4->received(std::move(ip_data), h.src_ip, h.dst_ip);
+        }
+      } else {
+        auto to = _netif->hw_address();
+        auto pkt = frag.get_assembled_packet(from, to);
+        _netif->forward(center, cpu_id, std::move(pkt));
+      }
+
+      // Delete this frag from _frags and _frags_age
+      frag_drop(frag_id, dropped_size);
+      _frags_age.remove(frag_id);
+      perf_logger->set(l_dpdk_total_linearize_operations,
+                       ipv4_packet_merger::linearizations());
+    } else {
+      // Some of the fragments are missing
+      // NOTE(review): the timer is (re)armed only when frag_timefd already
+      // holds a value - confirm whether the intent was to arm it when no
+      // timer is pending yet.
+      if (frag_timefd) {
+        frag_arm();
+      }
+    }
+    return 0;
+  }
+
+  auto l4 = _l4[h.ip_proto];
+  if (l4) {
+    // Trim IP header and pass to upper layer
+    p.trim_front(ip_hdr_len);
+    l4->received(std::move(p), h.src_ip, h.dst_ip);
+  }
+  return 0;
+}
+
+// Resolves the link-layer destination for a packet addressed to `to` and
+// hands packet and callback to the ARP layer. Hosts inside our own netmask
+// are resolved directly; everything else goes via the default gateway.
+void ipv4::wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb) {
+  ipv4_address next_hop = in_my_netmask(to) ? to : _gw_address;
+  _arp.wait(std::move(next_hop), std::move(p), std::move(cb));
+}
+
+// Returns the hardware feature set reported by the underlying interface.
+const hw_features& ipv4::get_hw_features() const
+{
+ return _netif->get_hw_features();
+}
+
+/*
+ * Wrap an L4 payload in IPv4 header(s) and queue the result on _packetq.
+ *
+ * to        destination IP address
+ * proto_num L4 protocol number written into the header
+ * p         payload; split into several datagrams when needs_frag() says so
+ * e_dst     already-resolved destination ethernet address
+ *
+ * The header checksum is either computed here or flagged for the hardware,
+ * depending on tx_csum_ip_offload.
+ */
+void ipv4::send(ipv4_address to, ip_protocol_num proto_num,
+                Packet p, ethernet_address e_dst) {
+  auto needs_frag = this->needs_frag(p, proto_num, get_hw_features());
+
+  // Prepends and fills one IP header, then queues the packet. `remaining` is
+  // the payload still to be sent after this piece and `offset` the byte
+  // offset of this piece; both only matter when fragmenting.
+  auto send_pkt = [this, to, proto_num, needs_frag, e_dst] (Packet& pkt, uint16_t remaining, uint16_t offset) mutable {
+    static uint16_t id = 0;
+    auto iph = pkt.prepend_header<ip_hdr>();
+    iph->ihl = sizeof(*iph) / 4;
+    iph->ver = 4;
+    iph->dscp = 0;
+    iph->ecn = 0;
+    iph->len = pkt.len();
+    // FIXME: a proper id
+    iph->id = id++;
+    if (needs_frag) {
+      uint16_t mf = remaining > 0;
+      // The fragment offset is measured in units of 8 octets (64 bits)
+      auto off = offset / 8;
+      iph->frag = (mf << uint8_t(ip_hdr::frag_bits::mf)) | off;
+    } else {
+      iph->frag = 0;
+    }
+    iph->ttl = 64;
+    iph->ip_proto = (uint8_t)proto_num;
+    iph->csum = 0;
+    iph->src_ip = _host_address;
+    iph->dst_ip = to;
+    ldout(cct, 20) << " ipv4::send " << " id=" << iph->id << " " << _host_address << " -> " << to
+                   << " len " << pkt.len() << dendl;
+    *iph = iph->hton();
+
+    if (get_hw_features().tx_csum_ip_offload) {
+      iph->csum = 0;
+      pkt.offload_info_ref().needs_ip_csum = true;
+    } else {
+      checksummer csum;
+      csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph));
+      iph->csum = csum.get();
+    }
+
+    _packetq.push_back(
+      l3_protocol::l3packet{eth_protocol_num::ipv4, e_dst, std::move(pkt)});
+  };
+
+  if (needs_frag) {
+    uint16_t offset = 0;
+    uint16_t remaining = p.len();
+    auto mtu = get_hw_features().mtu;
+    // Offsets are encoded in 8-octet units, so every fragment but the last
+    // must carry a multiple of 8 payload bytes. Without rounding down, an
+    // MTU where (mtu - header) is not a multiple of 8 (e.g. 9000) yields
+    // offsets that get truncated by the division in send_pkt.
+    const auto max_payload = uint16_t((mtu - ipv4_hdr_len_min) & ~7);
+
+    while (remaining) {
+      auto can_send = std::min(max_payload, remaining);
+      remaining -= can_send;
+      auto pkt = p.share(offset, can_send);
+      send_pkt(pkt, remaining, offset);
+      offset += can_send;
+    }
+  } else {
+    // The whole packet can be sent in one shot
+    send_pkt(p, 0, 0);
+  }
+}
+
+// Supplies the next outgoing L3 packet, or an empty optional when there is
+// nothing to send. If the queue is empty the registered L4 providers are
+// polled round-robin, starting after the one served last, until one of them
+// yields a packet; that packet is then run through send(), which fills the
+// queue.
+std::optional<l3_protocol::l3packet> ipv4::get_packet() {
+  // The queue is usually empty here; it only holds leftovers when an
+  // earlier packet was fragmented into several datagrams.
+  if (_packetq.empty()) {
+    for (size_t tried = 0; tried < _pkt_providers.size(); ++tried) {
+      const auto idx = _pkt_provider_idx++;
+      auto l4p = _pkt_providers[idx]();
+      if (_pkt_provider_idx == _pkt_providers.size()) {
+        _pkt_provider_idx = 0;
+      }
+      if (!l4p) {
+        continue;
+      }
+      ldout(cct, 20) << " ipv4::get_packet len " << l4p->p.len() << dendl;
+      send(l4p->to, l4p->proto_num, std::move(l4p->p), l4p->e_dst);
+      break;
+    }
+  }
+
+  if (_packetq.empty()) {
+    return std::nullopt;
+  }
+  std::optional<l3_protocol::l3packet> next(std::move(_packetq.front()));
+  _packetq.pop_front();
+  return next;
+}
+
+// Keeps reassembly memory bounded: once usage exceeds the high threshold,
+// the oldest reassembly entries are discarded until usage would be back at
+// the low threshold or no entries remain.
+void ipv4::frag_limit_mem() {
+  if (_frag_mem <= _frag_high_thresh) {
+    return;
+  }
+  auto to_free = _frag_mem - _frag_low_thresh;
+  while (to_free != 0 && !_frags_age.empty()) {
+    // _frags_age is ordered oldest first.
+    auto oldest = _frags_age.front();
+    _frags_age.pop_front();
+
+    // Remove the matching reassembly state and account for its memory.
+    auto freed = _frags[oldest].mem_size;
+    frag_drop(oldest, freed);
+
+    to_free -= std::min(to_free, freed);
+  }
+}
+
+// Timer callback body: discards every reassembly entry older than
+// _frag_timeout, then re-arms the timer if entries remain or resets the
+// memory accounting if none do.
+void ipv4::frag_timeout() {
+  if (_frags.empty()) {
+    return;
+  }
+  auto now = ceph_clock_now();
+  auto it = _frags_age.begin();
+  while (it != _frags_age.end()) {
+    auto frag_id = *it;
+    auto& entry = _frags[frag_id];
+    if (!(now > entry.rx_time + _frag_timeout)) {
+      // The list is ordered by age, so everything after this is younger.
+      break;
+    }
+    auto dropped_size = entry.mem_size;
+    // Remove the reassembly state, then its age-list node.
+    frag_drop(frag_id, dropped_size);
+    it = _frags_age.erase(it);
+  }
+  if (_frags.size() != 0) {
+    frag_arm(now);
+  } else {
+    _frag_mem = 0;
+  }
+}
+
+// Adds one fragment to this reassembly entry and returns by how much the
+// entry's memory footprint changed. h is the fragment's header in host
+// order, offset its byte offset within the datagram, p the whole fragment
+// including its IP header.
+int32_t ipv4::frag::merge(ip_hdr &h, uint16_t offset, Packet p) {
+  const uint32_t prev_size = mem_size;
+  const unsigned hdr_bytes = h.ihl * 4;
+  // Keep the IP header of the first fragment
+  if (offset == 0) {
+    header = p.share(0, hdr_bytes);
+  }
+  // Store the IP payload
+  p.trim_front(hdr_bytes);
+  data.merge(offset, std::move(p));
+  // Recompute the footprint: stored header plus every stored payload piece
+  uint32_t total = header.memory();
+  for (const auto& piece : data.map) {
+    total += piece.second.memory();
+  }
+  mem_size = total;
+  return mem_size - prev_size;
+}
+
+// True once the datagram is fully reassembled. merge() coalesces adjacent
+// pieces, so a complete datagram shows up as exactly one piece starting at
+// offset 0 with the final (MF == 0) fragment already seen.
+bool ipv4::frag::is_complete() {
+  const auto first_piece = data.map.begin();
+  const auto first_offset = first_piece->first;
+  const bool single_piece = data.map.size() == 1;
+  return last_frag_received && single_piece && first_offset == 0;
+}
+
+// Builds a single ethernet frame (eth header + IP header + reassembled
+// payload) from this entry so it can be forwarded to another CPU. The stored
+// header and payload are moved out, so the entry must not be reused
+// afterwards. The result is flagged as reassembled so the receiver skips the
+// IP checksum.
+Packet ipv4::frag::get_assembled_packet(ethernet_address from, ethernet_address to) {
+ auto& ip_header = header;
+ auto& ip_data = data.map.begin()->second;
+ // Append a ethernet header, needed for forwarding
+ auto eh = ip_header.prepend_header<eth_hdr>();
+ eh->src_mac = from;
+ eh->dst_mac = to;
+ eh->eth_proto = uint16_t(eth_protocol_num::ipv4);
+ *eh = eh->hton();
+ // Prepare a packet contains both ethernet header, ip header and ip data
+ ip_header.append(std::move(ip_data));
+ auto pkt = std::move(ip_header);
+ auto iph = pkt.get_header<ip_hdr>(sizeof(eth_hdr));
+ // len is the sum of each fragment
+ iph->len = hton(uint16_t(pkt.len() - sizeof(eth_hdr)));
+ // No fragmentation for the assembled datagram
+ iph->frag = 0;
+ // Since each fragment's csum is checked, no need to csum
+ // again for the assembled datagram
+ offload_info oi;
+ oi.reassembled = true;
+ pkt.set_offload_info(oi);
+ return pkt;
+}
+
+/*
+ * Answer ICMP echo requests; every other ICMP message is ignored.
+ *
+ * The request is turned into a reply in place (type/code rewritten and the
+ * checksum recomputed), then queued for transmission back to `from` once
+ * its link-layer address has been resolved. Replies that do not fit into
+ * the queue budget are dropped. `to` is unused.
+ */
+void icmp::received(Packet p, ipaddr from, ipaddr to) {
+  auto hdr = p.get_header<icmp_hdr>(0);
+  if (!hdr || hdr->type != icmp_hdr::msg_type::echo_request) {
+    return;
+  }
+  hdr->type = icmp_hdr::msg_type::echo_reply;
+  hdr->code = 0;
+  hdr->csum = 0;
+  checksummer csum;
+  // Sum fragment by fragment. hdr is only known to cover the ICMP header
+  // contiguously; treating it as p.len() contiguous bytes reads past the
+  // first buffer when the packet is made of several fragments.
+  csum.sum(p);
+  hdr->csum = csum.get();
+
+  if (_queue_space.get_or_fail(p.len())) { // drop packets that do not fit the queue
+    auto cb = [this, from] (const ethernet_address e_dst, Packet p, int r) mutable {
+      if (r == 0) {
+        _packetq.emplace_back(ipv4_traits::l4packet{from, std::move(p), e_dst, ip_protocol_num::icmp});
+      }
+    };
+    _inet.wait_l2_dst_address(from, std::move(p), cb);
+  }
+}
diff --git a/src/msg/async/dpdk/IP.h b/src/msg/async/dpdk/IP.h
new file mode 100644
index 000000000..e0e62f122
--- /dev/null
+++ b/src/msg/async/dpdk/IP.h
@@ -0,0 +1,403 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+
+#ifndef CEPH_MSG_IP_H_
+#define CEPH_MSG_IP_H_
+
+#include <arpa/inet.h>
+#include <unordered_map>
+#include <cstdint>
+#include <array>
+#include <map>
+#include <list>
+#include <chrono>
+
+#include "msg/async/Event.h"
+#include "common/Throttle.h"
+
+#include "array_map.h"
+#include "ARP.h"
+#include "IPChecksum.h"
+#include "ip_types.h"
+#include "const.h"
+#include "net.h"
+#include "PacketUtil.h"
+#include "toeplitz.h"
+
+class ipv4;
+template <ip_protocol_num ProtoNum>
+class ipv4_l4;
+
+template <typename InetTraits>
+class tcp;
+
+// Traits bundle that parameterises the generic L4 code (e.g. tcp<>) for
+// IPv4.
+struct ipv4_traits {
+ using address_type = ipv4_address;
+ using inet_type = ipv4_l4<ip_protocol_num::tcp>;
+ // An L4 payload ready to be wrapped into an IP packet: destination IP,
+ // payload, resolved destination MAC and L4 protocol number.
+ struct l4packet {
+ ipv4_address to;
+ Packet p;
+ ethernet_address e_dst;
+ ip_protocol_num proto_num;
+ };
+ // Callable polled for the next outgoing L4 packet; empty when none.
+ using packet_provider_type = std::function<std::optional<l4packet> ()>;
+ // Feeds the TCP pseudo header (addresses, zero byte, protocol, length)
+ // into csum.
+ static void tcp_pseudo_header_checksum(checksummer& csum, ipv4_address src, ipv4_address dst, uint16_t len) {
+ csum.sum_many(src.ip, dst.ip, uint8_t(0), uint8_t(ip_protocol_num::tcp), len);
+ }
+ static constexpr uint8_t ip_hdr_len_min = ipv4_hdr_len_min;
+};
+
+// Thin per-protocol view of an ipv4 instance handed to L4 implementations.
+// ProtoNum is stamped onto every packet produced through
+// register_packet_provider().
+template <ip_protocol_num ProtoNum>
+class ipv4_l4 {
+ public:
+ ipv4& _inet;
+ public:
+ ipv4_l4(ipv4& inet) : _inet(inet) {}
+ void register_packet_provider(ipv4_traits::packet_provider_type func);
+ void wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb);
+};
+
+// Interface implemented by every L4 protocol handler stored in ipv4::_l4.
+class ip_protocol {
+ public:
+ virtual ~ip_protocol() {}
+ // Called with the L4 payload (IP header already removed).
+ virtual void received(Packet p, ipv4_address from, ipv4_address to) = 0;
+ // Lets the protocol append its own fields to the CPU-selection hash input;
+ // the default contributes nothing.
+ virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) { return true; }
+};
+
+// Connection identifier: the local/foreign address and port 4-tuple.
+template <typename InetTraits>
+struct l4connid {
+ using ipaddr = typename InetTraits::address_type;
+ using inet_type = typename InetTraits::inet_type;
+ struct connid_hash;
+
+ ipaddr local_ip;
+ ipaddr foreign_ip;
+ uint16_t local_port;
+ uint16_t foreign_port;
+
+ bool operator==(const l4connid& x) const {
+ return local_ip == x.local_ip
+ && foreign_ip == x.foreign_ip
+ && local_port == x.local_port
+ && foreign_port == x.foreign_port;
+ }
+
+ // Toeplitz hash of the tuple, fed in the order foreign ip, local ip,
+ // foreign port, local port, each converted with hton().
+ uint32_t hash(const rss_key_type& rss_key) {
+ forward_hash hash_data;
+ hash_data.push_back(hton(foreign_ip.ip));
+ hash_data.push_back(hton(local_ip.ip));
+ hash_data.push_back(hton(foreign_port));
+ hash_data.push_back(hton(local_port));
+ return toeplitz_hash(rss_key, hash_data);
+ }
+};
+
+// ip_protocol adapter that owns the TCP implementation for an ipv4 instance.
+class ipv4_tcp final : public ip_protocol {
+ ipv4_l4<ip_protocol_num::tcp> _inet_l4;
+ std::unique_ptr<tcp<ipv4_traits>> _tcp;
+ public:
+ ipv4_tcp(ipv4& inet, EventCenter *c);
+ ~ipv4_tcp();
+ virtual void received(Packet p, ipv4_address from, ipv4_address to) override;
+ virtual bool forward(forward_hash& out_hash_data, Packet& p, size_t off) override;
+ // ipv4::get_tcp() reaches into _tcp.
+ friend class ipv4;
+};
+
+
+// Minimal ICMP implementation. Outgoing packets are queued in _packetq and
+// handed to the IP layer through the provider registered in the constructor;
+// _queue_space bounds the total bytes queued.
+class icmp {
+ public:
+ using ipaddr = ipv4_address;
+ using inet_type = ipv4_l4<ip_protocol_num::icmp>;
+ explicit icmp(CephContext *c, inet_type& inet)
+ : cct(c), _inet(inet), _queue_space(c, "DPDK::icmp::_queue_space", 212992) {
+ // Provider: pop the oldest queued packet, if any, and give its bytes
+ // back to the throttle.
+ _inet.register_packet_provider([this] {
+ std::optional<ipv4_traits::l4packet> l4p;
+ if (!_packetq.empty()) {
+ l4p = std::move(_packetq.front());
+ _packetq.pop_front();
+ _queue_space.put(l4p->p.len());
+ }
+ return l4p;
+ });
+ }
+ void received(Packet p, ipaddr from, ipaddr to);
+
+ private:
+ CephContext *cct;
+ // ipv4_l4<ip_protocol_num::icmp>
+ inet_type& _inet;
+ circular_buffer<ipv4_traits::l4packet> _packetq;
+ Throttle _queue_space;
+};
+
+// ip_protocol adapter that forwards received packets to the icmp
+// implementation it owns.
+class ipv4_icmp final : public ip_protocol {
+ CephContext *cct;
+ ipv4_l4<ip_protocol_num::icmp> _inet_l4;
+ // Holds a reference to _inet_l4, which is declared (and so constructed)
+ // before it.
+ icmp _icmp;
+ public:
+ ipv4_icmp(CephContext *c, ipv4& inet) : cct(c), _inet_l4(inet), _icmp(c, _inet_l4) {}
+ virtual void received(Packet p, ipv4_address from, ipv4_address to) override {
+ _icmp.received(std::move(p), from, to);
+ }
+ friend class ipv4;
+};
+
+struct ip_hdr;
+
+// Hook consulted by ipv4::handle_received_packet() before normal processing.
+struct ip_packet_filter {
+ virtual ~ip_packet_filter() {};
+ // Set handled to true to make the IP layer stop processing the packet.
+ virtual void handle(Packet& p, ip_hdr* iph, ethernet_address from, bool & handled) = 0;
+};
+
+// Key identifying the datagram a fragment belongs to: source, destination,
+// IP identification field and protocol.
+struct ipv4_frag_id {
+ struct hash;
+ ipv4_address src_ip;
+ ipv4_address dst_ip;
+ uint16_t identification;
+ uint8_t protocol;
+ bool operator==(const ipv4_frag_id& x) const {
+ return src_ip == x.src_ip &&
+ dst_ip == x.dst_ip &&
+ identification == x.identification &&
+ protocol == x.protocol;
+ }
+};
+
+// Hash functor for ipv4_frag_id: XOR of the std::hash values of its four
+// fields.
+struct ipv4_frag_id::hash : private std::hash<ipv4_address>,
+ private std::hash<uint16_t>, private std::hash<uint8_t> {
+ size_t operator()(const ipv4_frag_id& id) const noexcept {
+ using h1 = std::hash<ipv4_address>;
+ using h2 = std::hash<uint16_t>;
+ using h3 = std::hash<uint8_t>;
+ return h1::operator()(id.src_ip) ^
+ h1::operator()(id.dst_ip) ^
+ h2::operator()(id.identification) ^
+ h3::operator()(id.protocol);
+ }
+};
+
+// Tag type giving IPv4 its own packet_merger instantiation; fragment offsets
+// are keyed as uint32_t.
+struct ipv4_tag {};
+using ipv4_packet_merger = packet_merger<uint32_t, ipv4_tag>;
+
+class interface;
+
+/*
+ * IPv4 layer bound to one network interface.
+ *
+ * Receives packets from the L3 layer, reassembles fragments, dispatches
+ * payloads to the registered L4 handlers (TCP, ICMP) and turns L4 packets
+ * pulled from the registered providers into outgoing IP datagrams,
+ * fragmenting them when required.
+ */
+class ipv4 {
+ public:
+  using address_type = ipv4_address;
+  using proto_type = uint16_t;
+  static address_type broadcast_address() { return ipv4_address(0xffffffff); }
+  static proto_type arp_protocol_type() { return proto_type(eth_protocol_num::ipv4); }
+  CephContext *cct;
+  EventCenter *center;
+
+ private:
+  interface* _netif;
+  std::vector<ipv4_traits::packet_provider_type> _pkt_providers;
+  // Id of the most recently created fragment-timeout event, if any.
+  std::optional<uint64_t> frag_timefd;
+  EventCallbackRef frag_handler;
+  arp _global_arp;
+  arp_for<ipv4> _arp;
+  ipv4_address _host_address;
+  ipv4_address _gw_address;
+  ipv4_address _netmask;
+  l3_protocol _l3;
+  subscription<Packet, ethernet_address> _rx_packets;
+  ipv4_tcp _tcp;
+  ipv4_icmp _icmp;
+  // Handler per IP protocol number; null where none is registered.
+  array_map<ip_protocol*, 256> _l4;
+  ip_packet_filter *_packet_filter;
+  // Reassembly state of one fragmented datagram.
+  struct frag {
+    Packet header;
+    ipv4_packet_merger data;
+    utime_t rx_time;
+    uint32_t mem_size = 0;
+    // fragment with MF == 0 indicates it is the last fragment
+    bool last_frag_received = false;
+
+    Packet get_assembled_packet(ethernet_address from, ethernet_address to);
+    int32_t merge(ip_hdr &h, uint16_t offset, Packet p);
+    bool is_complete();
+  };
+  std::unordered_map<ipv4_frag_id, frag, ipv4_frag_id::hash> _frags;
+  // Keys of _frags ordered by first arrival, oldest first.
+  std::list<ipv4_frag_id> _frags_age;
+  static utime_t _frag_timeout;
+  static constexpr uint32_t _frag_low_thresh{3 * 1024 * 1024};
+  static constexpr uint32_t _frag_high_thresh{4 * 1024 * 1024};
+  // Total bytes currently held by all reassembly entries.
+  uint32_t _frag_mem = 0;
+  circular_buffer<l3_protocol::l3packet> _packetq;
+  unsigned _pkt_provider_idx = 0;
+  PerfCounters *perf_logger;
+
+ private:
+  int handle_received_packet(Packet p, ethernet_address from);
+  bool forward(forward_hash& out_hash_data, Packet& p, size_t off);
+  std::optional<l3_protocol::l3packet> get_packet();
+  // True when a shares our network prefix.
+  bool in_my_netmask(ipv4_address a) const {
+    return !((a.ip ^ _host_address.ip) & _netmask.ip);
+  }
+  void frag_limit_mem();
+  // Removes one reassembly entry and subtracts its size from the total.
+  void frag_drop(ipv4_frag_id frag_id, uint32_t dropped_size) {
+    _frags.erase(frag_id);
+    _frag_mem -= dropped_size;
+  }
+  // Schedules frag_handler to run once _frag_timeout has elapsed.
+  // create_time_event() takes a delay relative to the current time, in
+  // microseconds, so the timeout itself is passed rather than a timestamp;
+  // passing "now" (or "now + timeout") would push expiry decades into the
+  // future. The argument is kept for existing callers.
+  void frag_arm(utime_t /* now */) {
+    frag_arm();
+  }
+  void frag_arm() {
+    frag_timefd = center->create_time_event(_frag_timeout.to_nsec() / 1000, frag_handler);
+  }
+
+ public:
+  void frag_timeout();
+
+ public:
+  explicit ipv4(CephContext *c, EventCenter *cen, interface* netif);
+  // NOTE(review): perf_logger is registered with the perf counters
+  // collection in the constructor but is not released here - confirm
+  // whether it is cleaned up elsewhere.
+  ~ipv4() {
+    delete frag_handler;
+  }
+  void set_host_address(ipv4_address ip) {
+    _host_address = ip;
+    _arp.set_self_addr(ip);
+  }
+  ipv4_address host_address() const {
+    return _host_address;
+  }
+  void set_gw_address(ipv4_address ip) {
+    _gw_address = ip;
+  }
+  ipv4_address gw_address() const {
+    return _gw_address;
+  }
+  void set_netmask_address(ipv4_address ip) {
+    _netmask = ip;
+  }
+  ipv4_address netmask_address() const {
+    return _netmask;
+  }
+  interface *netif() const {
+    return _netif;
+  }
+  // TODO or something. Should perhaps truly be a list
+  // of filters. With ordering. And blackjack. Etc.
+  // But for now, a simple single raw pointer suffices
+  void set_packet_filter(ip_packet_filter *f) {
+    _packet_filter = f;
+  }
+  ip_packet_filter * packet_filter() const {
+    return _packet_filter;
+  }
+  void send(ipv4_address to, ip_protocol_num proto_num, Packet p, ethernet_address e_dst);
+  tcp<ipv4_traits>& get_tcp() { return *_tcp._tcp; }
+  void register_l4(proto_type id, ip_protocol* handler);
+  const hw_features& get_hw_features() const;
+  // A packet needs fragmenting when payload plus minimal IP header exceeds
+  // the MTU, unless it is TCP and the hardware segments it (tx_tso).
+  static bool needs_frag(Packet& p, ip_protocol_num proto_num, hw_features hw_features) {
+    if (p.len() + ipv4_hdr_len_min <= hw_features.mtu)
+      return false;
+
+    if ((proto_num == ip_protocol_num::tcp && hw_features.tx_tso))
+      return false;
+
+    return true;
+  }
+  void learn(ethernet_address l2, ipv4_address l3) {
+    _arp.learn(l2, l3);
+  }
+  void register_packet_provider(ipv4_traits::packet_provider_type&& func) {
+    _pkt_providers.push_back(std::move(func));
+  }
+  void wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb);
+};
+
+// Registers func with the ipv4 instance, wrapped so that every packet it
+// yields is tagged with this layer's protocol number.
+template <ip_protocol_num ProtoNum>
+inline void ipv4_l4<ProtoNum>::register_packet_provider(
+ ipv4_traits::packet_provider_type func) {
+ _inet.register_packet_provider([func] {
+ auto l4p = func();
+ if (l4p) {
+ (*l4p).proto_num = ProtoNum;
+ }
+ return l4p;
+ });
+}
+
+// Forwards the link-layer address resolution request to the ipv4 instance.
+template <ip_protocol_num ProtoNum>
+inline void ipv4_l4<ProtoNum>::wait_l2_dst_address(ipv4_address to, Packet p, resolution_cb cb) {
+ _inet.wait_l2_dst_address(to, std::move(p), std::move(cb));
+}
+
+// Packed IPv4 header as overlaid on packet data. hton()/ntoh() return a
+// byte-swapped copy and leave *this untouched.
+// NOTE(review): the ihl/ver and dscp/ecn bit-field order presumably relies on
+// the compiler's little-endian bit-field layout - confirm for other targets.
+struct ip_hdr {
+ uint8_t ihl : 4;
+ uint8_t ver : 4;
+ uint8_t dscp : 6;
+ uint8_t ecn : 2;
+ uint16_t len;
+ uint16_t id;
+ // Flags in bits 13..15 (see frag_bits), fragment offset in the bits below.
+ uint16_t frag;
+ enum class frag_bits : uint8_t { mf = 13, df = 14, reserved = 15, offset_shift = 3 };
+ uint8_t ttl;
+ uint8_t ip_proto;
+ uint16_t csum;
+ ipv4_address src_ip;
+ ipv4_address dst_ip;
+ uint8_t options[0];
+ ip_hdr hton() {
+ ip_hdr hdr = *this;
+ hdr.len = ::hton(len);
+ hdr.id = ::hton(id);
+ hdr.frag = ::hton(frag);
+ hdr.csum = ::hton(csum);
+ hdr.src_ip.ip = ::hton(src_ip.ip);
+ hdr.dst_ip.ip = ::hton(dst_ip.ip);
+ return hdr;
+ }
+ ip_hdr ntoh() {
+ ip_hdr hdr = *this;
+ hdr.len = ::ntoh(len);
+ hdr.id = ::ntoh(id);
+ hdr.frag = ::ntoh(frag);
+ hdr.csum = ::ntoh(csum);
+ hdr.src_ip = src_ip.ntoh();
+ hdr.dst_ip = dst_ip.ntoh();
+ return hdr;
+ }
+
+ // The accessors below expect frag in host byte order.
+ bool mf() { return frag & (1 << uint8_t(frag_bits::mf)); }
+ bool df() { return frag & (1 << uint8_t(frag_bits::df)); }
+ // Shifting left by 3 and truncating to 16 bits discards the three flag
+ // bits and scales the 8-octet offset units to bytes.
+ uint16_t offset() { return frag << uint8_t(frag_bits::offset_shift); }
+} __attribute__((packed));
+
+// Hash functor for l4connid: XOR of the std::hash values of both addresses
+// and both ports.
+template <typename InetTraits>
+struct l4connid<InetTraits>::connid_hash : private std::hash<ipaddr>, private std::hash<uint16_t> {
+ size_t operator()(const l4connid<InetTraits>& id) const noexcept {
+ using h1 = std::hash<ipaddr>;
+ using h2 = std::hash<uint16_t>;
+ return h1::operator()(id.local_ip)
+ ^ h1::operator()(id.foreign_ip)
+ ^ h2::operator()(id.local_port)
+ ^ h2::operator()(id.foreign_port);
+ }
+};
+
+#endif /* CEPH_MSG_IP_H_ */
diff --git a/src/msg/async/dpdk/IPChecksum.cc b/src/msg/async/dpdk/IPChecksum.cc
new file mode 100644
index 000000000..7a3253c1e
--- /dev/null
+++ b/src/msg/async/dpdk/IPChecksum.cc
@@ -0,0 +1,70 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include <arpa/inet.h>
+#include "net.h"
+#include "IPChecksum.h"
+
+/*
+ * Add len bytes starting at data to the running Internet checksum.
+ *
+ * Calls may be chained over consecutive buffers of any length: `odd`
+ * remembers whether the bytes summed so far end in the middle of a 16-bit
+ * word, in which case the first byte of the next buffer completes that
+ * word.
+ */
+void checksummer::sum(const char* data, size_t len) {
+  // An empty buffer contributes nothing. Returning here also prevents the
+  // odd-byte step below from reading a byte that does not exist and
+  // wrapping len around to a huge value.
+  if (len == 0) {
+    return;
+  }
+  auto orig_len = len;
+  if (odd) {
+    // Low byte of the word left unfinished by the previous call.
+    csum += uint8_t(*data++);
+    --len;
+  }
+  auto p64 = reinterpret_cast<const uint64_t*>(data);
+  while (len >= 8) {
+    csum += ntohq(*p64++);
+    len -= 8;
+  }
+  auto p16 = reinterpret_cast<const uint16_t*>(p64);
+  while (len >= 2) {
+    csum += ntohs(*p16++);
+    len -= 2;
+  }
+  auto p8 = reinterpret_cast<const uint8_t*>(p16);
+  if (len) {
+    // Trailing byte is the high byte of a word still to be completed.
+    csum += *p8++ << 8;
+    len -= 1;
+  }
+  odd ^= orig_len & 1;
+}
+
+// Folds the 128-bit accumulator down to 16 bits with end-around carries and
+// returns its complement, converted with htons().
+uint16_t checksummer::get() const {
+  constexpr uint64_t low64 = 0xffffffffffffffff;
+  // 128 -> 64 bits, twice so the carry of the first fold is absorbed.
+  const __int128 folded = (csum & low64) + (csum >> 64);
+  uint64_t acc = (folded & low64) + (folded >> 64);
+  // 64 -> 16 bits: add the four 16-bit lanes ...
+  acc = (acc & 0xffff) + ((acc >> 16) & 0xffff) + ((acc >> 32) & 0xffff) + (acc >> 48);
+  // ... then fold the carries back in, twice.
+  for (int round = 0; round < 2; ++round) {
+    acc = (acc & 0xffff) + (acc >> 16);
+  }
+  return htons(~acc);
+}
+
+// Adds every fragment of p, in order, to the running checksum.
+void checksummer::sum(const Packet& p) {
+  for (const auto& piece : p.fragments()) {
+    sum(piece.base, piece.size);
+  }
+}
+
+// One-shot helper: checksum of a single contiguous buffer of len bytes.
+uint16_t ip_checksum(const void* data, size_t len) {
+  const auto* bytes = reinterpret_cast<const char*>(data);
+  checksummer acc;
+  acc.sum(bytes, len);
+  return acc.get();
+}
diff --git a/src/msg/async/dpdk/IPChecksum.h b/src/msg/async/dpdk/IPChecksum.h
new file mode 100644
index 000000000..9af4a86b9
--- /dev/null
+++ b/src/msg/async/dpdk/IPChecksum.h
@@ -0,0 +1,72 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_CHECKSUM_H_
+#define CEPH_MSG_CHECKSUM_H_
+
+#include <cstdint>
+#include <cstddef>
+#include <arpa/inet.h>
+
+#include "Packet.h"
+
+uint16_t ip_checksum(const void* data, size_t len); // checksum of one contiguous buffer, computed with a temporary checksummer
+
+struct checksummer { // Incremental 16-bit checksum accumulator: feed data with sum()/sum_many(), read the result with get().
+ __int128 csum = 0; // running sum, kept wide so that folding to 16 bits can be deferred to get()
+ bool odd = false; // true when an odd number of bytes has been consumed so far
+ void sum(const char* data, size_t len);
+ void sum(const Packet& p);
+ void sum(uint8_t data) { // add one byte at the current byte position
+ if (!odd) {
+ csum += data << 8; // even position: high byte of a 16-bit word
+ } else {
+ csum += data; // odd position: low byte
+ }
+ odd = !odd;
+ }
+ void sum(uint16_t data) { // add a 16-bit value as-is (callers presumably pass host-order values -- TODO confirm)
+ if (odd) {
+ sum(uint8_t(data >> 8)); // mid-word: feed byte by byte, high byte first
+ sum(uint8_t(data));
+ } else {
+ csum += data;
+ }
+ }
+ void sum(uint32_t data) { // add a 32-bit value as-is (same host-order assumption as above -- TODO confirm)
+ if (odd) {
+ sum(uint16_t(data)); // low half goes first; each half leaves `odd` unchanged, so the total does not depend on this order
+ sum(uint16_t(data >> 16));
+ } else {
+ csum += data; // added unfolded; get() folds the upper bits later
+ }
+ }
+ void sum_many() {} // recursion terminator for the variadic overload below
+ template <typename T0, typename... T>
+ void sum_many(T0 data, T... rest) { // sum each argument in order; the sum() overload is chosen by argument type
+ sum(data);
+ sum_many(rest...);
+ }
+ uint16_t get() const;
+};
+
+#endif /* CEPH_MSG_CHECKSUM_H_ */
diff --git a/src/msg/async/dpdk/Packet.cc b/src/msg/async/dpdk/Packet.cc
new file mode 100644
index 000000000..6c2320a01
--- /dev/null
+++ b/src/msg/async/dpdk/Packet.cc
@@ -0,0 +1,146 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <iostream>
+#include <algorithm>
+#include <cctype>
+
+#include "capture.h"
+#include "Packet.h"
+
+constexpr size_t Packet::internal_data_size; // out-of-class definition so the constant can be ODR-used (redundant since C++17)
+constexpr size_t Packet::default_nr_frags; // likewise; std::max() in impl::allocate() binds it by reference
+
+void Packet::linearize(size_t at_frag, size_t desired_size) { // Merge fragments starting at at_frag into one new heap buffer covering at least desired_size bytes.
+ _impl->unuse_internal_data(); // make sure frags[0] does not point into impl::data before fragments are rewritten
+ size_t nr_frags = 0;
+ size_t accum_size = 0;
+ while (accum_size < desired_size) { // NOTE(review): no check against _nr_frags; assumes enough bytes exist from at_frag onward
+ accum_size += _impl->frags[at_frag + nr_frags].size;
+ ++nr_frags;
+ }
+ char *new_frag = new char[accum_size]; // whole fragments are merged, so this may exceed desired_size
+ auto p = new_frag;
+ for (size_t i = 0; i < nr_frags; ++i) {
+ auto& f = _impl->frags[at_frag + i];
+ p = std::copy(f.base, f.base + f.size, p);
+ }
+ // collapse nr_frags into one fragment
+ std::copy(_impl->frags + at_frag + nr_frags, _impl->frags + _impl->_nr_frags,
+ _impl->frags + at_frag + 1); // shift the untouched tail down next to the merged slot
+ _impl->_nr_frags -= nr_frags - 1; // NOTE(review): nr_frags == 0 (desired_size == 0) makes this subtraction wrap -- confirm callers never pass 0
+ _impl->frags[at_frag] = fragment{new_frag, accum_size};
+ if (at_frag == 0 && desired_size == len()) {
+ // We can drop the old buffer safely
+ auto x = std::move(_impl->_deleter); // the old deleter chain is destroyed when x goes out of scope
+ _impl->_deleter = make_deleter([new_frag] { delete []new_frag; });
+ } else {
+ auto del = std::bind(
+ [new_frag](deleter &d) { delete []new_frag; }, std::move(_impl->_deleter)); // the old deleter is stored as the bound argument, so the old buffers live as long as the new one
+ _impl->_deleter = make_deleter(std::move(del));
+ }
+}
+
+class C_free_on_cpu : public EventCallback { // One-shot event that destroys a deleter and runs a callback inside do_request(), then deletes itself.
+ deleter del; // deleter to be destroyed when the event runs
+ std::function<void()> cb; // invoked from do_request()
+ public:
+ C_free_on_cpu(deleter &&d, std::function<void()> &&c):
+ del(std::move(d)), cb(std::move(c)) {}
+ void do_request(uint64_t fd) { // fd is not used
+ // deleter needs to be moved from lambda capture to be destroyed here
+ // otherwise deleter destructor will be called on a cpu that called
+ // create_external_event when work_item is destroyed.
+ deleter xxx(std::move(del)); // destroyed when this function returns, i.e. after cb() has run
+ cb();
+ delete this; // self-deleting: instances must be allocated with new
+ }
+};
+
+Packet Packet::free_on_cpu(EventCenter *center, std::function<void()> cb)
+{ // Returns a Packet with the same fragments whose original deleter is released through an event dispatched to `center`, after which cb is called.
+ auto del = std::bind(
+ [center, cb] (deleter &del) mutable {
+ center->dispatch_event_external(new C_free_on_cpu(std::move(del), std::move(cb))); // the event object deletes itself in do_request()
+ }, std::move(_impl->_deleter)); // the current deleter becomes the bound argument handed to the lambda
+ // make new deleter that runs old deleter on an origin cpu
+ _impl->_deleter = make_deleter(deleter(), std::move(del));
+
+ return Packet(impl::copy(_impl.get())); // impl::copy() moves the deleter into the returned Packet; *this keeps its fragment entries but no deleter
+}
+
+std::ostream& operator<<(std::ostream& os, const Packet& p) { // Debug dump: Packet{frag, frag, ...}; text-like fragments are quoted, all others are printed as hex bytes.
+ os << "Packet{";
+ bool first = true;
+ for (auto&& frag : p.fragments()) {
+ if (!first) {
+ os << ", ";
+ }
+ first = false;
+ if (std::all_of(frag.base, frag.base + frag.size, [] (int c) { return c >= 9 && c <= 0x7f; })) { // every byte is in [0x09, 0x7f]: render as a quoted string
+ os << '"';
+ for (auto p = frag.base; p != frag.base + frag.size; ++p) {
+ auto c = *p;
+ if (isprint(c)) {
+ os << c;
+ } else if (c == '\r') {
+ os << "\\r";
+ } else if (c == '\n') {
+ os << "\\n";
+ } else if (c == '\t') {
+ os << "\\t";
+ } else {
+ uint8_t b = c;
+ os << "\\x" << "0123456789abcdef"[b / 16] << "0123456789abcdef"[b % 16]; // two hex digits; streaming the int nibbles printed them in decimal
+ }
+ }
+ os << '"';
+ } else {
+ os << "{";
+ bool nfirst = true;
+ for (auto p = frag.base; p != frag.base + frag.size; ++p) {
+ if (!nfirst) {
+ os << " ";
+ }
+ nfirst = false;
+ uint8_t b = *p;
+ os << "0123456789abcdef"[b / 16] << "0123456789abcdef"[b % 16]; // hex byte; streaming a uint8_t would insert the raw character
+ }
+ os << "}";
+ }
+ }
+ os << "}";
+ return os;
+}
diff --git a/src/msg/async/dpdk/Packet.h b/src/msg/async/dpdk/Packet.h
new file mode 100644
index 000000000..2aa65f6e1
--- /dev/null
+++ b/src/msg/async/dpdk/Packet.h
@@ -0,0 +1,549 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_PACKET_H_
+#define CEPH_MSG_PACKET_H_
+
+#include <vector>
+#include <algorithm>
+#include <iosfwd>
+
+#include "include/types.h"
+#include "common/deleter.h"
+#include "msg/async/Event.h"
+
+#include "const.h"
+
+struct fragment { // One contiguous byte range of a packet; a plain pointer/length pair with no destructor of its own.
+ char* base; // first byte of the range
+ size_t size; // length of the range in bytes
+};
+
+struct offload_info { // Per-packet metadata stored in Packet::impl::_offload_info; how each field is consumed is not visible in this file.
+ ip_protocol_num protocol = ip_protocol_num::unused;
+ bool needs_csum = false;
+ uint8_t ip_hdr_len = 20; // header lengths in bytes (presumably; the defaults match option-less IPv4/TCP/UDP headers -- TODO confirm)
+ uint8_t tcp_hdr_len = 20;
+ uint8_t udp_hdr_len = 8;
+ bool needs_ip_csum = false;
+ bool reassembled = false;
+ uint16_t tso_seg_size = 0;
+ // HW stripped VLAN header (CPU order)
+ std::optional<uint16_t> vlan_tci; // empty unless a value has been stored
+};
+
+// Zero-copy friendly packet class
+//
+// For implementing zero-copy, we need a flexible destructor that can
+// destroy packet data in different ways: decrementing a reference count,
+// or calling a free()-like function.
+//
+// Moreover, we need different destructors for each set of fragments within
+// a single packet. For example, a header and trailer might need delete[]
+// to be called, while the internal data needs a reference count to be
+// released. Matters are complicated in that fragments can be split
+// (due to virtual/physical translation).
+//
+// To implement this, we associate each packet with a single destructor,
+// but allow composing a packet from another packet plus a fragment to
+// be added, with its own destructor, causing the destructors to be chained.
+//
+// The downside is that the data needed for the destructor is duplicated,
+// if it is already available in the fragment itself.
+//
+// As an optimization, when we allocate small fragments, we allocate some
+// extra space, so prepending to the packet does not require extra
+// allocations. This is useful when adding headers.
+//
+class Packet { // Move-only handle to a heap-allocated impl that holds the fragment list, the deleter chain and per-packet metadata.
+ // enough for lots of headers, not quite two cache lines:
+ static constexpr size_t internal_data_size = 128 - 16;
+ static constexpr size_t default_nr_frags = 4; // minimum number of fragment slots impl::allocate() reserves
+
+ struct pseudo_vector { // non-owning [begin, end) view over a fragment array
+ fragment* _start;
+ fragment* _finish;
+ pseudo_vector(fragment* start, size_t nr)
+ : _start(start), _finish(_start + nr) {}
+ fragment* begin() { return _start; }
+ fragment* end() { return _finish; }
+ fragment& operator[](size_t idx) { return _start[idx]; }
+ };
+
+ struct impl { // packet state; the trailing frags[] array is sized by the custom operator new below
+ // when destroyed, virtual destructor will reclaim resources
+ deleter _deleter;
+ unsigned _len = 0; // total payload bytes across all fragments
+ uint16_t _nr_frags = 0; // fragments currently in use
+ uint16_t _allocated_frags; // capacity of frags[]
+ offload_info _offload_info;
+ std::optional<uint32_t> rss_hash;
+ char data[internal_data_size]; // only frags[0] may use
+ unsigned headroom = internal_data_size; // in data
+ // FIXME: share data/frags space
+
+ fragment frags[]; // flexible array member; storage comes from operator new(size, nr_frags)
+
+ explicit impl(size_t nr_frags = default_nr_frags);
+ impl(const impl&) = delete;
+ impl(fragment frag, size_t nr_frags = default_nr_frags);
+
+ pseudo_vector fragments() { return { frags, _nr_frags }; }
+
+ static std::unique_ptr<impl> allocate(size_t nr_frags) { // never reserves fewer than default_nr_frags slots
+ nr_frags = std::max(nr_frags, default_nr_frags);
+ return std::unique_ptr<impl>(new (nr_frags) impl(nr_frags));
+ }
+
+ static std::unique_ptr<impl> copy(impl* old, size_t nr) { // new impl with room for nr fragments; takes over old's deleter
+ auto n = allocate(nr);
+ n->_deleter = std::move(old->_deleter); // old keeps its fragment entries but no longer holds the deleter
+ n->_len = old->_len;
+ n->_nr_frags = old->_nr_frags;
+ n->headroom = old->headroom;
+ n->_offload_info = old->_offload_info;
+ n->rss_hash = old->rss_hash;
+ std::copy(old->frags, old->frags + old->_nr_frags, n->frags);
+ old->copy_internal_fragment_to(n.get()); // re-point frags[0] at n->data if it lived in old->data
+ return n;
+ }
+
+ static std::unique_ptr<impl> copy(impl* old) {
+ return copy(old, old->_nr_frags);
+ }
+
+ static std::unique_ptr<impl> allocate_if_needed(std::unique_ptr<impl> old, size_t extra_frags) { // returns old unchanged when extra_frags more slots already fit
+ if (old->_allocated_frags >= old->_nr_frags + extra_frags) {
+ return old;
+ }
+ return copy(old.get(), std::max<size_t>(old->_nr_frags + extra_frags, 2 * old->_nr_frags)); // grow to at least twice the current fragment count
+ }
+ void* operator new(size_t size, size_t nr_frags = default_nr_frags) { // allocates the object plus nr_frags trailing fragment slots
+ ceph_assert(nr_frags == uint16_t(nr_frags)); // must fit in uint16_t, the type of _allocated_frags
+ return ::operator new(size + nr_frags * sizeof(fragment));
+ }
+ // Matching the operator new above
+ void operator delete(void* ptr, size_t nr_frags) {
+ return ::operator delete(ptr);
+ }
+ // Since the above "placement delete" hides the global one, expose it
+ void operator delete(void* ptr) {
+ return ::operator delete(ptr);
+ }
+
+ bool using_internal_data() const { // true when frags[0] points into data[]
+ return _nr_frags
+ && frags[0].base >= data
+ && frags[0].base < data + internal_data_size;
+ }
+
+ void unuse_internal_data() { // move frags[0] out of data[] into a malloc'ed buffer; no-op if it is not there
+ if (!using_internal_data()) {
+ return;
+ }
+ auto buf = static_cast<char*>(::malloc(frags[0].size));
+ if (!buf) {
+ throw std::bad_alloc();
+ }
+ deleter d = make_free_deleter(buf);
+ std::copy(frags[0].base, frags[0].base + frags[0].size, buf);
+ frags[0].base = buf;
+ _deleter.append(std::move(d));
+ headroom = internal_data_size; // data[] is entirely free again
+ }
+ void copy_internal_fragment_to(impl* to) { // duplicate an internal frags[0] into to->data at the same headroom offset
+ if (!using_internal_data()) {
+ return;
+ }
+ to->frags[0].base = to->data + headroom; // uses this->headroom; copy() sets to->headroom to the same value
+ std::copy(frags[0].base, frags[0].base + frags[0].size,
+ to->frags[0].base);
+ }
+ };
+ explicit Packet(std::unique_ptr<impl>&& impl) : _impl(std::move(impl)) {}
+ std::unique_ptr<impl> _impl; // null after reset() or after being moved from
+public:
+ static Packet from_static_data(const char* data, size_t len) { // wraps data without copying and with an empty deleter
+ return {fragment{const_cast<char*>(data), len}, deleter()};
+ }
+
+ // build empty Packet
+ Packet();
+ // build empty Packet with nr_frags allocated
+ explicit Packet(size_t nr_frags);
+ // move existing Packet
+ Packet(Packet&& x) noexcept;
+ // copy data into Packet
+ Packet(const char* data, size_t len);
+ // copy data into Packet
+ explicit Packet(fragment frag);
+ // zero-copy single fragment
+ Packet(fragment frag, deleter del);
+ // zero-copy multiple fragments
+ Packet(std::vector<fragment> frag, deleter del);
+ // build Packet with iterator
+ template <typename Iterator>
+ Packet(Iterator begin, Iterator end, deleter del);
+ // append fragment (copying new fragment)
+ Packet(Packet&& x, fragment frag);
+ // prepend fragment (copying new fragment, with header optimization)
+ Packet(fragment frag, Packet&& x);
+ // prepend fragment (zero-copy)
+ Packet(fragment frag, deleter del, Packet&& x);
+ // append fragment (zero-copy)
+ Packet(Packet&& x, fragment frag, deleter d);
+ // append deleter
+ Packet(Packet&& x, deleter d);
+
+ Packet& operator=(Packet&& x) { // destroy-and-reconstruct in place; the move constructor is noexcept
+ if (this != &x) {
+ this->~Packet();
+ new (this) Packet(std::move(x));
+ }
+ return *this;
+ }
+
+ unsigned len() const { return _impl->_len; }
+ unsigned memory() const { return len() + sizeof(Packet::impl); } // payload plus sizeof(impl); the frags[] slots are not counted
+
+ fragment frag(unsigned idx) const { return _impl->frags[idx]; } // no bounds check
+ fragment& frag(unsigned idx) { return _impl->frags[idx]; }
+
+ unsigned nr_frags() const { return _impl->_nr_frags; }
+ pseudo_vector fragments() const { return { _impl->frags, _impl->_nr_frags }; }
+ fragment* fragment_array() const { return _impl->frags; }
+
+ // share Packet data (reference counted, non COW)
+ Packet share();
+ Packet share(size_t offset, size_t len);
+
+ void append(Packet&& p);
+
+ void trim_front(size_t how_much);
+ void trim_back(size_t how_much);
+
+ // get a header pointer, linearizing if necessary
+ template <typename Header>
+ Header* get_header(size_t offset = 0);
+
+ // get a header pointer, linearizing if necessary
+ char* get_header(size_t offset, size_t size);
+
+ // prepend a header (default-initializing it)
+ template <typename Header>
+ Header* prepend_header(size_t extra_size = 0);
+
+ // prepend a header (uninitialized!)
+ char* prepend_uninitialized_header(size_t size);
+
+ Packet free_on_cpu(EventCenter *c, std::function<void()> cb = []{});
+
+ void linearize() { return linearize(0, len()); } // collapse the whole packet into a single fragment
+
+ void reset() { _impl.reset(); } // drops the impl; most accessors dereference _impl and must not be called afterwards
+
+ void reserve(int n_frags) { // ensure room for at least n_frags fragments in total
+ if (n_frags > _impl->_nr_frags) {
+ auto extra = n_frags - _impl->_nr_frags;
+ _impl = impl::allocate_if_needed(std::move(_impl), extra);
+ }
+ }
+ std::optional<uint32_t> rss_hash() {
+ return _impl->rss_hash;
+ }
+ void set_rss_hash(uint32_t hash) {
+ _impl->rss_hash = hash;
+ }
+private:
+ void linearize(size_t at_frag, size_t desired_size);
+ bool allocate_headroom(size_t size);
+public:
+ class offload_info offload_info() const { return _impl->_offload_info; } // returns a copy; `class` disambiguates the type from this member function
+ class offload_info& offload_info_ref() { return _impl->_offload_info; }
+ void set_offload_info(class offload_info oi) { _impl->_offload_info = oi; }
+};
+
+std::ostream& operator<<(std::ostream& os, const Packet& p); // debug dump of a Packet's fragments
+
+inline Packet::Packet(Packet&& x) noexcept // takes over x's impl; x._impl is left null
+ : _impl(std::move(x._impl)) {
+}
+
+inline Packet::impl::impl(size_t nr_frags) // empty impl; nr_frags is recorded as the capacity of frags[]
+ : _len(0), _allocated_frags(nr_frags) {
+}
+
+inline Packet::impl::impl(fragment frag, size_t nr_frags) // impl holding a private copy of frag's bytes as its only fragment
+ : _len(frag.size), _allocated_frags(nr_frags) {
+ ceph_assert(_allocated_frags > _nr_frags); // _nr_frags is still 0 here, so this requires at least one slot
+ if (frag.size <= internal_data_size) {
+ headroom -= frag.size; // place the copy at the end of data[], leaving the space before it as headroom
+ frags[0] = { data + headroom, frag.size };
+ } else {
+ auto buf = static_cast<char*>(::malloc(frag.size)); // too large for data[]: use a malloc'ed buffer released through the deleter
+ if (!buf) {
+ throw std::bad_alloc();
+ }
+ deleter d = make_free_deleter(buf);
+ frags[0] = { buf, frag.size };
+ _deleter.append(std::move(d));
+ }
+ std::copy(frag.base, frag.base + frag.size, frags[0].base);
+ ++_nr_frags;
+}
+
+inline Packet::Packet(): _impl(impl::allocate(1)) { // empty packet; allocate() rounds the request up to default_nr_frags slots
+}
+
+inline Packet::Packet(size_t nr_frags): _impl(impl::allocate(nr_frags)) { // empty packet with room for at least nr_frags fragments
+}
+
+inline Packet::Packet(fragment frag): _impl(new impl(frag)) { // copies frag's bytes; operator new's default reserves default_nr_frags slots
+}
+
+inline Packet::Packet(const char* data, size_t size): // copying constructor; delegates to Packet(fragment)
+ Packet(fragment{const_cast<char*>(data), size}) { // const_cast only adapts to fragment::base; the bytes are copied, not modified
+}
+
+inline Packet::Packet(fragment frag, deleter d) // zero-copy: the packet points at frag's memory and stores d as its deleter
+ : _impl(impl::allocate(1)) {
+ _impl->_deleter = std::move(d);
+ _impl->frags[_impl->_nr_frags++] = frag;
+ _impl->_len = frag.size;
+}
+
+inline Packet::Packet(std::vector<fragment> frag, deleter d) // zero-copy: adopt all fragment descriptors, store d as the deleter
+ : _impl(impl::allocate(frag.size())) {
+ _impl->_deleter = std::move(d);
+ std::copy(frag.begin(), frag.end(), _impl->frags);
+ _impl->_nr_frags = frag.size();
+ _impl->_len = 0;
+ for (auto&& f : _impl->fragments()) { // total length is the sum of the fragment sizes
+ _impl->_len += f.size;
+ }
+}
+
+template <typename Iterator>
+inline Packet::Packet(Iterator begin, Iterator end, deleter del) { // zero-copy: build from the fragment range [begin, end), store del as the deleter
+ unsigned nr_frags = 0, len = 0;
+ nr_frags = std::distance(begin, end);
+ std::for_each(begin, end, [&] (fragment& frag) { len += frag.size; }); // the lambda takes fragment&, so the iterators must yield non-const fragments
+ _impl = impl::allocate(nr_frags);
+ _impl->_deleter = std::move(del);
+ _impl->_len = len;
+ _impl->_nr_frags = nr_frags;
+ std::copy(begin, end, _impl->frags);
+}
+
+inline Packet::Packet(Packet&& x, fragment frag) // take over x and append a private copy of frag's bytes as a new last fragment
+ : _impl(impl::allocate_if_needed(std::move(x._impl), 1)) {
+ _impl->_len += frag.size;
+ char* buf = new char[frag.size];
+ std::copy(frag.base, frag.base + frag.size, buf);
+ _impl->frags[_impl->_nr_frags++] = {buf, frag.size};
+ _impl->_deleter = make_deleter(std::move(_impl->_deleter), [buf] { // chain: the previous deleter plus delete[] of the new copy
+ delete[] buf;
+ });
+}
+
+inline bool Packet::allocate_headroom(size_t size) { // Try to grow the packet by size bytes at the front using impl::data; returns false if the headroom is too small.
+ if (_impl->headroom >= size) {
+ _impl->_len += size;
+ if (!_impl->using_internal_data()) { // frags[0] is not in data[]: open an empty internal fragment in front of the others
+ _impl = impl::allocate_if_needed(std::move(_impl), 1);
+ std::copy_backward(_impl->frags, _impl->frags + _impl->_nr_frags,
+ _impl->frags + _impl->_nr_frags + 1);
+ _impl->frags[0] = { _impl->data + internal_data_size, 0 }; // zero-length, positioned at the end of data[]
+ ++_impl->_nr_frags;
+ }
+ _impl->headroom -= size;
+ _impl->frags[0].base -= size; // extend the first fragment backwards into the headroom
+ _impl->frags[0].size += size;
+ return true;
+ } else {
+ return false; // nothing was modified
+ }
+}
+
+
+inline Packet::Packet(fragment frag, Packet&& x) // take over x and prepend a copy of frag's bytes
+ : _impl(std::move(x._impl)) {
+ // try to prepend into existing internal fragment
+ if (allocate_headroom(frag.size)) {
+ std::copy(frag.base, frag.base + frag.size, _impl->frags[0].base); // frags[0].base now points at the newly reserved bytes
+ return;
+ } else {
+ // didn't work out, allocate and copy
+ _impl->unuse_internal_data();
+ _impl = impl::allocate_if_needed(std::move(_impl), 1);
+ _impl->_len += frag.size;
+ char *buf = new char[frag.size];
+ std::copy(frag.base, frag.base + frag.size, buf);
+ std::copy_backward(_impl->frags, _impl->frags + _impl->_nr_frags,
+ _impl->frags + _impl->_nr_frags + 1); // shift every fragment up by one to free slot 0
+ ++_impl->_nr_frags;
+ _impl->frags[0] = {buf, frag.size};
+ _impl->_deleter = make_deleter(
+ std::move(_impl->_deleter), [buf] { delete []buf; });
+ }
+}
+
+inline Packet::Packet(Packet&& x, fragment frag, deleter d) // take over x and append frag without copying; d is combined with x's deleter
+ : _impl(impl::allocate_if_needed(std::move(x._impl), 1)) {
+ _impl->_len += frag.size;
+ _impl->frags[_impl->_nr_frags++] = frag;
+ d.append(std::move(_impl->_deleter)); // fold the existing deleter into d, then install d as the packet's deleter
+ _impl->_deleter = std::move(d);
+}
+
+inline Packet::Packet(Packet&& x, deleter d): _impl(std::move(x._impl)) { // take over x and attach one more deleter; fragments are unchanged
+ _impl->_deleter.append(std::move(d));
+}
+
+inline void Packet::append(Packet&& p) { // Append p's fragments after this packet's without copying payload (except p's internal fragment, if any).
+ if (!_impl->_len) { // this packet holds no bytes: just become p
+ *this = std::move(p);
+ return;
+ }
+ _impl = impl::allocate_if_needed(std::move(_impl), p._impl->_nr_frags);
+ _impl->_len += p._impl->_len;
+ p._impl->unuse_internal_data(); // p's frags[0] must not point into p's impl, which does not outlive p
+ std::copy(p._impl->frags, p._impl->frags + p._impl->_nr_frags,
+ _impl->frags + _impl->_nr_frags);
+ _impl->_nr_frags += p._impl->_nr_frags;
+ p._impl->_deleter.append(std::move(_impl->_deleter)); // combine both deleters and keep the result in this packet
+ _impl->_deleter = std::move(p._impl->_deleter);
+}
+
+inline char* Packet::get_header(size_t offset, size_t size) { // Pointer to size contiguous bytes at byte offset `offset`, or nullptr if the packet is too short.
+ if (offset + size > _impl->_len) {
+ return nullptr;
+ }
+ size_t i = 0;
+ while (i != _impl->_nr_frags && offset >= _impl->frags[i].size) { // find the fragment containing `offset`; offset becomes relative to it
+ offset -= _impl->frags[i++].size;
+ }
+ if (i == _impl->_nr_frags) {
+ return nullptr;
+ }
+ if (offset + size > _impl->frags[i].size) { // the range straddles fragments: merge them starting at fragment i
+ linearize(i, offset + size);
+ }
+ return _impl->frags[i].base + offset;
+}
+
+template <typename Header>
+inline Header* Packet::get_header(size_t offset) { // typed wrapper: sizeof(Header) bytes at offset, or nullptr if the packet is too short
+ return reinterpret_cast<Header*>(get_header(offset, sizeof(Header)));
+}
+
+inline void Packet::trim_front(size_t how_much) { // Drop the first how_much bytes (how_much <= len() is asserted); fragment descriptors are adjusted, payload is not copied.
+ ceph_assert(how_much <= _impl->_len);
+ _impl->_len -= how_much;
+ size_t i = 0;
+ while (how_much && how_much >= _impl->frags[i].size) { // count the leading fragments that are consumed entirely
+ how_much -= _impl->frags[i++].size;
+ }
+ std::copy(_impl->frags + i, _impl->frags + _impl->_nr_frags, _impl->frags); // shift the surviving fragments down to index 0
+ _impl->_nr_frags -= i;
+ if (!_impl->using_internal_data()) {
+ _impl->headroom = internal_data_size; // frags[0] is not in data[], so all of data[] counts as free
+ }
+ if (how_much) { // partial trim of the new first fragment
+ if (_impl->using_internal_data()) {
+ _impl->headroom += how_much;
+ }
+ _impl->frags[0].base += how_much;
+ _impl->frags[0].size -= how_much;
+ }
+}
+
+inline void Packet::trim_back(size_t how_much) { // Drop the last how_much bytes (how_much <= len() is asserted); fragment descriptors are adjusted, payload is not copied.
+ ceph_assert(how_much <= _impl->_len);
+ _impl->_len -= how_much;
+ size_t i = _impl->_nr_frags - 1;
+ while (how_much && how_much >= _impl->frags[i].size) { // walk backwards over fragments that are consumed entirely
+ how_much -= _impl->frags[i--].size;
+ }
+ _impl->_nr_frags = i + 1; // if every fragment was consumed, i wrapped below zero and this yields 0
+ if (how_much) { // shorten the new last fragment
+ _impl->frags[i].size -= how_much;
+ if (i == 0 && _impl->using_internal_data()) {
+ _impl->headroom += how_much; // NOTE(review): bytes are removed from the tail of frags[0] yet counted as headroom -- confirm this is intended
+ }
+ }
+}
+
+template <typename Header>
+Header* Packet::prepend_header(size_t extra_size) { // Reserve sizeof(Header) + extra_size bytes at the front and value-initialize a Header at their start.
+ auto h = prepend_uninitialized_header(sizeof(Header) + extra_size);
+ return new (h) Header{}; // only the Header itself is initialized; the extra_size bytes after it are left as-is
+}
+
+// prepend a header (uninitialized!); returns a pointer to the start of the new first fragment
+inline char* Packet::prepend_uninitialized_header(size_t size) {
+ if (!allocate_headroom(size)) {
+ // didn't work out, allocate and copy
+ _impl->unuse_internal_data();
+ // try again, after unuse_internal_data we may have space after all
+ if (!allocate_headroom(size)) {
+ // failed
+ _impl->_len += size;
+ _impl = impl::allocate_if_needed(std::move(_impl), 1);
+ char *buf = new char[size]; // size does not fit in data[]: use a separate heap fragment
+ std::copy_backward(_impl->frags, _impl->frags + _impl->_nr_frags,
+ _impl->frags + _impl->_nr_frags + 1); // shift every fragment up by one to free slot 0
+ ++_impl->_nr_frags;
+ _impl->frags[0] = {buf, size};
+ _impl->_deleter = make_deleter(std::move(_impl->_deleter),
+ [buf] { delete []buf; });
+ }
+ }
+ return _impl->frags[0].base;
+}
+
+inline Packet Packet::share() { // share the whole packet: equivalent to share(0, len())
+ return share(0, _impl->_len);
+}
+
+inline Packet Packet::share(size_t offset, size_t len) { // New Packet viewing bytes [offset, offset + len) of this one without copying payload.
+ _impl->unuse_internal_data(); // FIXME: eliminate?
+ Packet n;
+ n._impl = impl::allocate_if_needed(std::move(n._impl), _impl->_nr_frags);
+ size_t idx = 0;
+ while (offset > 0 && offset >= _impl->frags[idx].size) { // skip the fragments that lie entirely before offset
+ offset -= _impl->frags[idx++].size;
+ }
+ while (n._impl->_len < len) { // NOTE(review): no check against _nr_frags; assumes offset + len <= this->len()
+ auto& f = _impl->frags[idx++];
+ auto fsize = std::min(len - n._impl->_len, f.size - offset);
+ n._impl->frags[n._impl->_nr_frags++] = { f.base + offset, fsize };
+ n._impl->_len += fsize;
+ offset = 0; // only the first shared fragment starts mid-way
+ }
+ n._impl->_offload_info = _impl->_offload_info; // rss_hash is not carried over
+ ceph_assert(!n._impl->_deleter);
+ n._impl->_deleter = _impl->_deleter.share(); // both packets now hold the same deleter
+ return n;
+}
+
+#endif /* CEPH_MSG_PACKET_H_ */
diff --git a/src/msg/async/dpdk/PacketUtil.h b/src/msg/async/dpdk/PacketUtil.h
new file mode 100644
index 000000000..118218e66
--- /dev/null
+++ b/src/msg/async/dpdk/PacketUtil.h
@@ -0,0 +1,154 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_PACKET_UTIL_H_
+#define CEPH_MSG_PACKET_UTIL_H_
+
+#include <map>
+#include <iostream>
+
+#include "Packet.h"
+
+template <typename Offset, typename Tag>
+class packet_merger { // Keeps received segments keyed by start offset and merges overlapping or adjacent ones.
+ private:
+ static uint64_t& linearizations_ref() { // per-thread counter, separate for each <Offset, Tag> instantiation
+ static thread_local uint64_t linearization_count;
+ return linearization_count;
+ }
+ public:
+ std::map<Offset, Packet> map; // start offset -> segment data
+
+ static uint64_t linearizations() {
+ return linearizations_ref();
+ }
+
+ void merge(Offset offset, Packet p) { // insert segment p covering [offset, offset + p.len()), then coalesce neighbours
+ bool insert = true;
+ auto beg = offset;
+ auto end = beg + p.len();
+ // First, try to merge the packet with existing segment
+ for (auto it = map.begin(); it != map.end();) {
+ auto& seg_pkt = it->second;
+ auto seg_beg = it->first;
+ auto seg_end = seg_beg + seg_pkt.len();
+ // There are 6 cases:
+ if (seg_beg <= beg && end <= seg_end) {
+ // 1) seg_beg beg end seg_end
+ // We already have data in this packet
+ return;
+ } else if (beg <= seg_beg && seg_end <= end) {
+ // 2) beg seg_beg seg_end end
+ // The new segment contains more data than this old segment
+ // Delete the old one, insert the new one
+ it = map.erase(it);
+ insert = true;
+ break; // any further segments covered by p are removed by the second pass below
+ } else if (beg < seg_beg && seg_beg <= end && end <= seg_end) {
+ // 3) beg seg_beg end seg_end
+ // Merge two segments, trim front of old segment
+ auto trim = end - seg_beg;
+ seg_pkt.trim_front(trim);
+ p.append(std::move(seg_pkt));
+ // Delete the old one, insert the new one
+ it = map.erase(it);
+ insert = true;
+ break;
+ } else if (seg_beg <= beg && beg <= seg_end && seg_end < end) {
+ // 4) seg_beg beg seg_end end
+ // Merge two segments, trim front of new segment
+ auto trim = seg_end - beg;
+ p.trim_front(trim);
+ // Append new data to the old segment, keep the old segment
+ seg_pkt.append(std::move(p));
+ seg_pkt.linearize();
+ ++linearizations_ref();
+ insert = false; // p was absorbed into the existing map entry
+ break;
+ } else {
+ // 5) beg end < seg_beg seg_end
+ // or
+ // 6) seg_beg seg_end < beg end
+ // Can not merge with this segment, keep looking
+ it++;
+ insert = true;
+ }
+ }
+
+ if (insert) {
+ p.linearize();
+ ++linearizations_ref();
+ map.emplace(beg, std::move(p));
+ }
+
+ // Second, merge adjacent segments after this packet has been merged,
+ // because this packet might fill a "hole" and make two adjacent
+ // segments mergeable
+ for (auto it = map.begin(); it != map.end();) {
+ // The first segment
+ auto& seg_pkt = it->second;
+ auto seg_beg = it->first;
+ auto seg_end = seg_beg + seg_pkt.len();
+
+ // The second segment
+ auto it_next = it;
+ it_next++;
+ if (it_next == map.end()) {
+ break;
+ }
+ auto& p = it_next->second; // shadows the parameter p inside this loop
+ auto beg = it_next->first;
+ auto end = beg + p.len();
+
+ // Merge the second segment into first segment if possible
+ if (seg_beg <= beg && beg <= seg_end && seg_end < end) {
+ // Merge two segments, trim front of second segment
+ auto trim = seg_end - beg;
+ p.trim_front(trim);
+ // Append new data to the first segment, keep the first segment
+ seg_pkt.append(std::move(p));
+
+ // Delete the second segment
+ map.erase(it_next);
+
+ // Keep merging this first segment with its new next packet
+ // So we do not update the iterator: it
+ continue;
+ } else if (end <= seg_end) {
+ // The first segment has all the data in the second segment
+ // Delete the second segment
+ map.erase(it_next);
+ continue;
+ } else if (seg_end < beg) {
+ // Can not merge first segment with second segment
+ it = it_next;
+ continue;
+ } else {
+ // If we reach here, we have a bug with merge.
+ std::cout << "packet_merger: merge error\n";
+ abort();
+ }
+ }
+ }
+};
+
+#endif
diff --git a/src/msg/async/dpdk/TCP-Stack.h b/src/msg/async/dpdk/TCP-Stack.h
new file mode 100644
index 000000000..edcf4d803
--- /dev/null
+++ b/src/msg/async/dpdk/TCP-Stack.h
@@ -0,0 +1,40 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+// tcp/network-stack integration
+
+#ifndef CEPH_MSG_DPDK_TCP_STACK_H
+#define CEPH_MSG_DPDK_TCP_STACK_H
+
+class ServerSocket; // forward declarations only; the definitions live elsewhere
+class ConnectedSocket;
+
+class ipv4_traits;
+template <typename InetTraits>
+class tcp;
+
+int tcpv4_listen(tcp<ipv4_traits>& tcpv4, uint16_t port, const SocketOptions &opts,
+ int type, unsigned addr_slot, ServerSocket *sa); // NOTE(review): uint16_t and SocketOptions are neither included nor declared in this header; the includer must provide them
+
+int tcpv4_connect(tcp<ipv4_traits>& tcpv4, const entity_addr_t &addr,
+ ConnectedSocket *sa); // NOTE(review): entity_addr_t likewise has to be declared before this header is included
+
+#endif
diff --git a/src/msg/async/dpdk/TCP.cc b/src/msg/async/dpdk/TCP.cc
new file mode 100644
index 000000000..86c80487c
--- /dev/null
+++ b/src/msg/async/dpdk/TCP.cc
@@ -0,0 +1,841 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#include "align.h"
+#include "TCP.h"
+#include "IP.h"
+#include "DPDKStack.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "tcp "
+
+// Parse the TCP options stored in [beg, end) and record the ones we
+// understand (MSS, window scale, SACK) in this tcp_option.
+//
+// The bytes come from the peer and are untrusted, so every read is bounds
+// checked: parsing stops silently at the first truncated or malformed option
+// (missing length byte, length below 2, or length running past end).
+// Options that were parsed before that point stay recorded.
+void tcp_option::parse(uint8_t* beg, uint8_t* end)
+{
+  while (beg < end) {
+    auto kind = option_kind(*beg);
+    // Length byte of the current option; stays 0 for the single-byte NOP/EOL.
+    uint8_t len = 0;
+    if (kind != option_kind::nop && kind != option_kind::eol) {
+      // Every other option is <kind><length>[data]: make sure the length
+      // byte itself lies inside the buffer before reading it.
+      if (end - beg < 2) {
+        return;
+      }
+      len = *(beg + 1);
+      // The length counts the kind and length bytes, so a value below 2 is
+      // illegal (and 0 would loop forever). Also make sure there is enough
+      // room for this option.
+      if (len < 2 || len > end - beg) {
+        return;
+      }
+    }
+    switch (kind) {
+    case option_kind::mss:
+      // Do not read the fixed-size payload of an option declared too short.
+      if (len < uint8_t(option_len::mss)) {
+        return;
+      }
+      _mss_received = true;
+      _remote_mss = ntoh(reinterpret_cast<mss*>(beg)->mss);
+      beg += option_len::mss;
+      break;
+    case option_kind::win_scale:
+      if (len < uint8_t(option_len::win_scale)) {
+        return;
+      }
+      _win_scale_received = true;
+      _remote_win_scale = reinterpret_cast<win_scale*>(beg)->shift;
+      // We can turn on win_scale option, 7 is Linux's default win scale size
+      _local_win_scale = 7;
+      beg += option_len::win_scale;
+      break;
+    case option_kind::sack:
+      _sack_received = true;
+      beg += option_len::sack;
+      break;
+    case option_kind::nop:
+      beg += option_len::nop;
+      break;
+    case option_kind::eol:
+      return;
+    default:
+      // Ignore options we do not understand. len >= 2 was verified above,
+      // so this always makes progress.
+      beg += len;
+      break;
+    }
+  }
+}
+
+// Write the local TCP options directly behind the fixed header th.
+// Options are only emitted when SYN is set: MSS and window scale are written
+// when the matching *_received flag is set, or unconditionally when ACK is
+// clear. A non-empty option list is padded with NOPs and closed with EOL so
+// that its size is a multiple of tcp_option::align.
+// Returns the number of option bytes written, asserted to equal options_size.
+uint8_t tcp_option::fill(tcp_hdr* th, uint8_t options_size)
+{
+  uint8_t* cursor = reinterpret_cast<uint8_t*>(th) + sizeof(tcp_hdr);
+  uint8_t written = 0;
+  const bool is_syn = th->f_syn;
+  const bool is_ack = th->f_ack;
+
+  if (is_syn && (_mss_received || !is_ack)) {
+    auto opt = new (cursor) tcp_option::mss;
+    opt->mss = _local_mss;
+    cursor += opt->len;
+    written += opt->len;
+    // Apply hton() last, once the option's len has been consumed.
+    *opt = opt->hton();
+  }
+  if (is_syn && (_win_scale_received || !is_ack)) {
+    auto opt = new (cursor) tcp_option::win_scale;
+    opt->shift = _local_win_scale;
+    cursor += opt->len;
+    written += opt->len;
+  }
+
+  if (written > 0) {
+    // Pad with NOPs up to the aligned size, leaving room for the final EOL.
+    const auto padded_size = align_up(uint8_t(written + 1), tcp_option::align);
+    const auto nop_limit = padded_size - uint8_t(option_len::eol);
+    while (written < nop_limit) {
+      new (cursor) tcp_option::nop;
+      cursor += option_len::nop;
+      written += option_len::nop;
+    }
+    new (cursor) tcp_option::eol;
+    written += option_len::eol;
+  }
+  ceph_assert(written == options_size);
+
+  return written;
+}
+
+// Number of option bytes fill() emits for a segment with the given SYN/ACK
+// flags: 0 unless SYN is set, otherwise the selected options plus EOL,
+// rounded up to tcp_option::align.
+uint8_t tcp_option::get_size(bool syn_on, bool ack_on)
+{
+  uint8_t total = 0;
+  // Options are only attached when SYN is set.
+  if (!syn_on) {
+    return total;
+  }
+  if (!ack_on || _mss_received) {
+    total += option_len::mss;
+  }
+  if (!ack_on || _win_scale_received) {
+    total += option_len::win_scale;
+  }
+  if (total == 0) {
+    return total;
+  }
+  // Account for the terminating EOL, then pad (with NOPs) to the alignment.
+  total += option_len::eol;
+  total = align_up(total, tcp_option::align);
+  return total;
+}
+
+// Initializes _inet_l4 from inet and heap-allocates the tcp<ipv4_traits>
+// engine held by _tcp, constructing it with inet.cct, _inet_l4 and the
+// event center c.
+ipv4_tcp::ipv4_tcp(ipv4& inet, EventCenter *c)
+ : _inet_l4(inet), _tcp(std::unique_ptr<tcp<ipv4_traits>>(new tcp<ipv4_traits>(inet.cct, _inet_l4, c)))
+{ }
+
+// No manual teardown is needed; the members release what they hold.
+ipv4_tcp::~ipv4_tcp() = default;
+
+// Pass an incoming packet, together with its from/to addresses, on to the
+// tcp engine. The packet is moved into the callee.
+void ipv4_tcp::received(Packet p, ipv4_address from, ipv4_address to)
+{
+  auto& engine = *_tcp;
+  engine.received(std::move(p), from, to);
+}
+
+// Delegate the forwarding decision for packet p (starting at offset off) to
+// the tcp engine and report its verdict; out_hash_data is passed through for
+// the engine to fill.
+bool ipv4_tcp::forward(forward_hash& out_hash_data, Packet& p, size_t off)
+{
+  const bool verdict = _tcp->forward(out_hash_data, p, off);
+  return verdict;
+}
+
+// Create a DPDK server socket on tcpv4 for the given port and options and
+// start listening on it.
+//
+// On success the new socket is stored in *sock and 0 is returned. If
+// listen() fails, its negative return value is passed back and *sock is not
+// modified.
+int tcpv4_listen(tcp<ipv4_traits>& tcpv4, uint16_t port, const SocketOptions &opts,
+                 int type, unsigned addr_slot, ServerSocket *sock)
+{
+  // Own the implementation through a unique_ptr from the start so that it
+  // is released on every exit path, including an exception out of listen().
+  auto impl = std::make_unique<DPDKServerSocketImpl<tcp<ipv4_traits>>>(
+      tcpv4, port, opts, type, addr_slot);
+  const int r = impl->listen();
+  if (r < 0) {
+    return r;
+  }
+  *sock = ServerSocket(std::unique_ptr<ServerSocketImpl>(std::move(impl)));
+  return 0;
+}
+
+// Start a connection to addr through tcpv4, wrap it in a
+// NativeConnectedSocketImpl and hand the resulting ConnectedSocket back
+// through *sock. Always returns 0.
+int tcpv4_connect(tcp<ipv4_traits>& tcpv4, const entity_addr_t &addr,
+                  ConnectedSocket *sock)
+{
+  using impl_type = NativeConnectedSocketImpl<tcp<ipv4_traits>>;
+
+  auto connection = tcpv4.connect(addr);
+  std::unique_ptr<ConnectedSocketImpl> impl(new impl_type(std::move(connection)));
+  *sock = ConnectedSocket(std::move(impl));
+  return 0;
+}
+
+// Answer the segment described by rth with a bare RST segment, sent from
+// local_ip to foreign_ip without going through a tcb. Nothing is sent when
+// the offending segment itself carries RST.
+template <typename InetTraits>
+void tcp<InetTraits>::respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip)
+{
+  ldout(cct, 20) << __func__ << " tcp header rst=" << bool(rth->f_rst) << " fin=" << bool(rth->f_fin)
+                 << " syn=" << bool(rth->f_syn) << dendl;
+  // Never reply to a reset with another reset.
+  if (rth->f_rst) {
+    return;
+  }
+
+  Packet reply;
+  auto hdr = reply.prepend_header<tcp_hdr>();
+
+  // Mirror the ports of the offending segment.
+  hdr->src_port = rth->dst_port;
+  hdr->dst_port = rth->src_port;
+  hdr->f_rst = true;
+  hdr->data_offset = sizeof(*hdr) / 4;
+  hdr->checksum = 0;
+  if (rth->f_ack) {
+    hdr->seq = rth->ack;
+  }
+  // If this RST packet is in response to a SYN packet. We ACK the ISN.
+  if (rth->f_syn) {
+    hdr->ack = rth->seq + 1;
+    hdr->f_ack = true;
+  }
+  *hdr = hdr->hton();
+
+  checksummer csum;
+  InetTraits::tcp_pseudo_header_checksum(csum, local_ip, foreign_ip, sizeof(*hdr));
+
+  offload_info offload;
+  if (!get_hw_features().tx_csum_l4_offload) {
+    // No L4 checksum offload: sum the whole packet here.
+    csum.sum(reply);
+    hdr->checksum = csum.get();
+    offload.needs_csum = false;
+  } else {
+    // Offload available: store the inverted pseudo-header sum and flag the
+    // packet as still needing its checksum.
+    hdr->checksum = ~csum.get();
+    offload.needs_csum = true;
+  }
+  offload.protocol = ip_protocol_num::tcp;
+  offload.tcp_hdr_len = sizeof(tcp_hdr);
+  reply.set_offload_info(offload);
+
+  send_packet_without_tcb(local_ip, foreign_ip, std::move(reply));
+}
+
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+// Log prefix used by dout inside tcb methods: prints the connection's
+// address/port pair, the tcb pointer, its fd and its current state.
+template<typename InetTraits>
+std::ostream& tcp<InetTraits>::tcb::_prefix(std::ostream *_dout) {
+  std::ostream& out = *_dout;
+  out << "tcp " << _local_ip << ":" << _local_port
+      << " -> " << _foreign_ip << ":" << _foreign_port;
+  out << " tcb(" << this << " fd=" << fd << " s=" << _state << ").";
+  return out;
+}
+
+// Handle a segment that arrives while this tcb is in LISTEN: record the
+// peer's initial sequence number, choose our own ISN, apply the segment's
+// options and move on to SYN_RECEIVED.
+template<typename InetTraits>
+void tcp<InetTraits>::tcb::input_handle_listen_state(tcp_hdr* th, Packet p)
+{
+  // Locate the option bytes that follow the fixed header before the whole
+  // header is stripped from the packet.
+  const auto hdr_len = th->data_offset * 4;
+  auto hdr_base = reinterpret_cast<uint8_t*>(p.get_header(0, hdr_len));
+  auto opt_start = hdr_base + sizeof(tcp_hdr);
+  auto opt_end = opt_start + (hdr_len - sizeof(tcp_hdr));
+  p.trim_front(hdr_len);
+
+  tcp_sequence peer_isn = th->seq;
+
+  // Set RCV.NXT to SEG.SEQ+1, IRS is set to SEG.SEQ
+  _rcv.next = peer_isn + 1;
+  _rcv.initial = peer_isn;
+
+  // ISS should be selected and a SYN segment sent of the form:
+  // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK>
+  // SND.NXT is set to ISS+1 and SND.UNA to ISS
+  // NOTE: In previous code, _snd.next is set to ISS + 1 only when SYN is
+  // ACKed. Now, we set _snd.next to ISS + 1 here, so in output_one(): we
+  // have
+  //     th->seq = syn_on ? _snd.initial : _snd.next
+  // to make sure retransmitted SYN has correct SEQ number.
+  do_setup_isn();
+
+  _rcv.urgent = _rcv.next;
+
+  ldout(_tcp.cct, 10) << __func__ << " listen: LISTEN -> SYN_RECEIVED" << dendl;
+  init_from_options(th, opt_start, opt_end);
+  do_syn_received();
+}
+
+// Handle a segment that arrives while this tcb is in SYN_SENT, following the
+// numbered RFC steps quoted in the comments: validate the ACK, honour RST,
+// then on SYN either complete the handshake (-> ESTABLISHED) or fall back to
+// SYN_RECEIVED. Segments with neither SYN nor RST are dropped.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::input_handle_syn_sent_state(tcp_hdr* th, Packet p)
+{
+  // Locate the option bytes before stripping the header from the packet.
+  const auto hdr_len = th->data_offset * 4;
+  auto hdr_base = reinterpret_cast<uint8_t*>(p.get_header(0, hdr_len));
+  auto opt_start = hdr_base + sizeof(tcp_hdr);
+  auto opt_end = opt_start + (hdr_len - sizeof(tcp_hdr));
+  p.trim_front(hdr_len);
+
+  tcp_sequence seg_seq = th->seq;
+  auto seg_ack = th->ack;
+
+  ldout(_tcp.cct, 20) << __func__ << " tcp header seq " << seg_seq.raw << " ack " << seg_ack.raw
+                      << " fin=" << bool(th->f_fin) << " syn=" << bool(th->f_syn) << dendl;
+
+  // 3.1 first check the ACK bit
+  bool ack_acceptable = false;
+  if (th->f_ack) {
+    // If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless the
+    // RST bit is set, if so drop the segment and return)
+    if (seg_ack <= _snd.initial || seg_ack > _snd.next) {
+      respond_with_reset(th);
+      return;
+    }
+
+    // If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable.
+    ack_acceptable = _snd.unacknowledged <= seg_ack && seg_ack <= _snd.next;
+  }
+
+  // 3.2 second check the RST bit
+  if (th->f_rst) {
+    // If the ACK was acceptable then signal the user "error: connection
+    // reset", drop the segment, enter CLOSED state, delete TCB, and
+    // return.  Otherwise (no ACK) drop the segment and return.
+    if (ack_acceptable) {
+      do_reset();
+    }
+    return;
+  }
+
+  // 3.3 third check the security and precedence
+  // NOTE: Ignored for now
+
+  // 3.5 (checked early) if neither of the SYN or RST bits is set then drop
+  // the segment and return.
+  if (!th->f_syn) {
+    return;
+  }
+
+  // 3.4 fourth check the SYN bit
+  // RCV.NXT is set to SEG.SEQ+1, IRS is set to SEG.SEQ.  SND.UNA should
+  // be advanced to equal SEG.ACK (if there is an ACK), and any segments
+  // on the retransmission queue which are thereby acknowledged should be
+  // removed.
+  _rcv.next = seg_seq + 1;
+  _rcv.initial = seg_seq;
+  if (th->f_ack) {
+    // TODO: clean retransmission queue
+    _snd.unacknowledged = seg_ack;
+  }
+
+  const bool our_syn_acked = _snd.unacknowledged > _snd.initial;
+  if (!our_syn_acked) {
+    // Enter SYN_RECEIVED, form a SYN,ACK segment
+    // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK>
+    ldout(_tcp.cct, 20) << __func__ << " syn: SYN_SENT -> SYN_RECEIVED" << dendl;
+    do_syn_received();
+    return;
+  }
+
+  // SND.UNA > ISS (our SYN has been ACKed): change the connection
+  // state to ESTABLISHED, form an ACK segment
+  // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+  ldout(_tcp.cct, 20) << __func__ << " syn: SYN_SENT -> ESTABLISHED" << dendl;
+  init_from_options(th, opt_start, opt_end);
+  do_established();
+  output();
+}
+
+// Handle a segment for a connection that is past LISTEN/SYN_SENT
+// (SYN_RECEIVED, ESTABLISHED and the closing states). The body follows the
+// numbered segment-arrival steps quoted in the comments:
+//   4.1 sequence number check (duplicates trimmed, out-of-order data queued)
+//   4.2 RST, 4.3 security (ignored), 4.4 SYN, 4.5 ACK processing including
+//   window updates and fast retransmit / fast recovery, 4.6 URG (TODO),
+//   4.7 segment text, 4.8 FIN.
+// th is the segment's TCP header; p is the packet, whose TCP header is
+// stripped first so that only the payload remains.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::input_handle_other_state(tcp_hdr* th, Packet p)
+{
+  p.trim_front(th->data_offset * 4);
+  bool do_output = false;
+  bool do_output_data = false;
+  tcp_sequence seg_seq = th->seq;
+  auto seg_ack = th->ack;
+  auto seg_len = p.len();
+  ldout(_tcp.cct, 20) << __func__ << " tcp header seq " << seg_seq.raw << " ack " << seg_ack.raw
+                      << " snd next " << _snd.next.raw << " unack " << _snd.unacknowledged.raw
+                      << " rcv next " << _rcv.next.raw << " len " << seg_len
+                      << " fin=" << bool(th->f_fin) << " syn=" << bool(th->f_syn) << dendl;
+
+  // 4.1 first check sequence number
+  if (!segment_acceptable(seg_seq, seg_len)) {
+    //<SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+    return output();
+  }
+
+  // In the following it is assumed that the segment is the idealized
+  // segment that begins at RCV.NXT and does not exceed the window.
+  if (seg_seq < _rcv.next) {
+    // ignore already acknowledged data
+    auto dup = std::min(uint32_t(_rcv.next - seg_seq), seg_len);
+    ldout(_tcp.cct, 10) << __func__ << " dup segment len " << dup << dendl;
+    p.trim_front(dup);
+    seg_len -= dup;
+    seg_seq += dup;
+  }
+  // FIXME: We should trim data outside the right edge of the receive window as well
+
+  if (seg_seq != _rcv.next) {
+    ldout(_tcp.cct, 10) << __func__ << " out of order, expect " << _rcv.next.raw
+                        << " actual " << seg_seq.raw
+                        << " out of order size " << _rcv.out_of_order.map.size()
+                        << dendl;
+    insert_out_of_order(seg_seq, std::move(p));
+    // A TCP receiver SHOULD send an immediate duplicate ACK
+    // when an out-of-order segment arrives.
+    return output();
+  }
+
+  // 4.2 second check the RST bit
+  if (th->f_rst) {
+    if (in_state(SYN_RECEIVED)) {
+      // If this connection was initiated with a passive OPEN (i.e.,
+      // came from the LISTEN state), then return this connection to
+      // LISTEN state and return.  The user need not be informed.  If
+      // this connection was initiated with an active OPEN (i.e., came
+      // from SYN_SENT state) then the connection was refused, signal
+      // the user "connection refused".  In either case, all segments
+      // on the retransmission queue should be removed.  And in the
+      // active OPEN case, enter the CLOSED state and delete the TCB,
+      // and return.
+      //
+      // Record the error on the connection (_errno), not in the C library's
+      // global errno, which must never hold a negative value.
+      _errno = -ECONNREFUSED;
+      return do_reset();
+    }
+    if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2 | CLOSE_WAIT)) {
+      // If the RST bit is set then, any outstanding RECEIVEs and SEND
+      // should receive "reset" responses.  All segment queues should be
+      // flushed.  Users should also receive an unsolicited general
+      // "connection reset" signal.  Enter the CLOSED state, delete the
+      // TCB, and return.
+      return do_reset();
+    }
+    if (in_state(CLOSING | LAST_ACK | TIME_WAIT)) {
+      // If the RST bit is set then, enter the CLOSED state, delete the
+      // TCB, and return.
+      return do_closed();
+    }
+  }
+
+  // 4.3 third check security and precedence
+  // NOTE: Ignored for now
+
+  // 4.4 fourth, check the SYN bit
+  if (th->f_syn) {
+    // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2
+    // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT
+
+    // If the SYN is in the window it is an error, send a reset, any
+    // outstanding RECEIVEs and SEND should receive "reset" responses,
+    // all segment queues should be flushed, the user should also
+    // receive an unsolicited general "connection reset" signal, enter
+    // the CLOSED state, delete the TCB, and return.
+    respond_with_reset(th);
+    return do_reset();
+
+    // If the SYN is not in the window this step would not be reached
+    // and an ack would have been sent in the first step (sequence
+    // number check).
+  }
+
+  // 4.5 fifth check the ACK field
+  if (!th->f_ack) {
+    // if the ACK bit is off drop the segment and return
+    return;
+  } else {
+    // SYN_RECEIVED STATE
+    if (in_state(SYN_RECEIVED)) {
+      // If SND.UNA =< SEG.ACK =< SND.NXT then enter ESTABLISHED state
+      // and continue processing.
+      if (_snd.unacknowledged <= seg_ack && seg_ack <= _snd.next) {
+        ldout(_tcp.cct, 20) << __func__ << " SYN_RECEIVED -> ESTABLISHED" << dendl;
+        do_established();
+        if (_tcp.push_listen_queue(_local_port, this)) {
+          ldout(_tcp.cct, 20) << __func__ << " successfully accepting socket" << dendl;
+        } else {
+          ldout(_tcp.cct, 5) << __func__ << " not exist listener or full queue, reset" << dendl;
+          return respond_with_reset(th);
+        }
+      } else {
+        // <SEQ=SEG.ACK><CTL=RST>
+        return respond_with_reset(th);
+      }
+    }
+    // Adopt the peer's advertised window (scaled) and remember which
+    // segment/ack it came from; a zero window arms the persist timer.
+    auto update_window = [this, th, seg_seq, seg_ack] {
+      ldout(_tcp.cct, 20) << __func__ << " window update seg_seq=" << seg_seq
+                          << " seg_ack=" << seg_ack << " old window=" << th->window
+                          << " new window=" << int(_snd.window_scale) << dendl;
+      _snd.window = th->window << _snd.window_scale;
+      _snd.wl1 = seg_seq;
+      _snd.wl2 = seg_ack;
+      if (_snd.window == 0) {
+        _persist_time_out = _rto;
+        start_persist_timer();
+      } else {
+        stop_persist_timer();
+      }
+    };
+    // ESTABLISHED STATE or
+    // CLOSE_WAIT STATE: Do the same processing as for the ESTABLISHED state.
+    if (in_state(ESTABLISHED | CLOSE_WAIT)) {
+      // If SND.UNA < SEG.ACK =< SND.NXT then, set SND.UNA <- SEG.ACK.
+      if (_snd.unacknowledged < seg_ack && seg_ack <= _snd.next) {
+        // Remote ACKed data we sent
+        auto acked_bytes = data_segment_acked(seg_ack);
+
+        // If SND.UNA < SEG.ACK =< SND.NXT, the send window should be updated.
+        if (_snd.wl1 < seg_seq || (_snd.wl1 == seg_seq && _snd.wl2 <= seg_ack)) {
+          update_window();
+        }
+
+        // some data is acked, try send more data
+        do_output_data = true;
+
+        auto set_retransmit_timer = [this] {
+          if (_snd.data.empty()) {
+            // All outstanding segments are acked, turn off the timer.
+            stop_retransmit_timer();
+            // Signal the waiter of this event
+            signal_all_data_acked();
+          } else {
+            // Restart the timer because new data is acked.
+            start_retransmit_timer();
+          }
+        };
+
+        if (_snd.dupacks >= 3) {
+          // We are in fast retransmit / fast recovery phase
+          uint32_t smss = _snd.mss;
+          if (seg_ack > _snd.recover) {
+            ldout(_tcp.cct, 20) << __func__ << " ack: full_ack" << dendl;
+            // Set cwnd to min (ssthresh, max(FlightSize, SMSS) + SMSS)
+            _snd.cwnd = std::min(_snd.ssthresh, std::max(flight_size(), smss) + smss);
+            // Exit the fast recovery procedure
+            exit_fast_recovery();
+            set_retransmit_timer();
+          } else {
+            ldout(_tcp.cct, 20) << __func__ << " ack: partial_ack" << dendl;
+            // Retransmit the first unacknowledged segment
+            fast_retransmit();
+            // Deflate the congestion window by the amount of new data
+            // acknowledged by the Cumulative Acknowledgment field
+            _snd.cwnd -= acked_bytes;
+            // If the partial ACK acknowledges at least one SMSS of new
+            // data, then add back SMSS bytes to the congestion window
+            if (acked_bytes >= smss) {
+              _snd.cwnd += smss;
+            }
+            // Send a new segment if permitted by the new value of
+            // cwnd.  Do not exit the fast recovery procedure For
+            // the first partial ACK that arrives during fast
+            // recovery, also reset the retransmit timer.
+            if (++_snd.partial_ack == 1) {
+              start_retransmit_timer();
+            }
+          }
+        } else {
+          // RFC5681: The fast retransmit algorithm uses the arrival
+          // of 3 duplicate ACKs (as defined in section 2, without
+          // any intervening ACKs which move SND.UNA) as an
+          // indication that a segment has been lost.
+          //
+          // So, here we reset dupacks to zero because this ACK moves
+          // SND.UNA.
+          exit_fast_recovery();
+          set_retransmit_timer();
+        }
+      } else if (!_snd.data.empty() && seg_len == 0 &&
+                 th->f_fin == 0 && th->f_syn == 0 &&
+                 th->ack == _snd.unacknowledged &&
+                 uint32_t(th->window << _snd.window_scale) == _snd.window) {
+        // Note:
+        // RFC793 states:
+        // If the ACK is a duplicate (SEG.ACK < SND.UNA), it can be ignored
+        // RFC5681 states:
+        // The TCP sender SHOULD use the "fast retransmit" algorithm to detect
+        // and repair loss, based on incoming duplicate ACKs.
+        // Here, We follow RFC5681.
+        _snd.dupacks++;
+        uint32_t smss = _snd.mss;
+        // 3 duplicated ACKs trigger a fast retransmit
+        if (_snd.dupacks == 1 || _snd.dupacks == 2) {
+          // RFC5681 Step 3.1
+          // Send cwnd + 2 * smss per RFC3042
+          do_output_data = true;
+        } else if (_snd.dupacks == 3) {
+          // RFC6582 Step 3.2
+          if (seg_ack - 1 > _snd.recover) {
+            _snd.recover = _snd.next - 1;
+            // RFC5681 Step 3.2
+            _snd.ssthresh = std::max((flight_size() - _snd.limited_transfer) / 2, 2 * smss);
+            fast_retransmit();
+          } else {
+            // Do not enter fast retransmit and do not reset ssthresh
+          }
+          // RFC5681 Step 3.3
+          _snd.cwnd = _snd.ssthresh + 3 * smss;
+        } else if (_snd.dupacks > 3) {
+          // RFC5681 Step 3.4
+          _snd.cwnd += smss;
+          // RFC5681 Step 3.5
+          do_output_data = true;
+        }
+      } else if (seg_ack > _snd.next) {
+        // If the ACK acks something not yet sent (SEG.ACK > SND.NXT)
+        // then send an ACK, drop the segment, and return
+        return output();
+      } else if (_snd.window == 0 && th->window > 0) {
+        update_window();
+        do_output_data = true;
+      }
+    }
+    // FIN_WAIT_1 STATE
+    if (in_state(FIN_WAIT_1)) {
+      // In addition to the processing for the ESTABLISHED state, if
+      // our FIN is now acknowledged then enter FIN-WAIT-2 and continue
+      // processing in that state.
+      if (seg_ack == _snd.next + 1) {
+        ldout(_tcp.cct, 20) << __func__ << " ack: FIN_WAIT_1 -> FIN_WAIT_2" << dendl;
+        _state = FIN_WAIT_2;
+        do_local_fin_acked();
+      }
+    }
+    // FIN_WAIT_2 STATE
+    if (in_state(FIN_WAIT_2)) {
+      // In addition to the processing for the ESTABLISHED state, if
+      // the retransmission queue is empty, the user's CLOSE can be
+      // acknowledged ("ok") but do not delete the TCB.
+      // TODO
+    }
+    // CLOSING STATE
+    if (in_state(CLOSING)) {
+      if (seg_ack == _snd.next + 1) {
+        ldout(_tcp.cct, 20) << __func__ << " ack: CLOSING -> TIME_WAIT" << dendl;
+        do_local_fin_acked();
+        return do_time_wait();
+      } else {
+        return;
+      }
+    }
+    // LAST_ACK STATE
+    if (in_state(LAST_ACK)) {
+      if (seg_ack == _snd.next + 1) {
+        ldout(_tcp.cct, 20) << __func__ << " ack: LAST_ACK -> CLOSED" << dendl;
+        do_local_fin_acked();
+        return do_closed();
+      }
+    }
+    // TIME_WAIT STATE
+    if (in_state(TIME_WAIT)) {
+      // The only thing that can arrive in this state is a
+      // retransmission of the remote FIN. Acknowledge it, and restart
+      // the 2 MSL timeout.
+      // TODO
+    }
+  }
+
+  // 4.6 sixth, check the URG bit
+  if (th->f_urg) {
+    // TODO
+  }
+
+  // 4.7 seventh, process the segment text
+  if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2)) {
+    if (p.len()) {
+      // Once the TCP takes responsibility for the data it advances
+      // RCV.NXT over the data accepted, and adjusts RCV.WND as
+      // appropriate to the current buffer availability.  The total of
+      // RCV.NXT and RCV.WND should not be reduced.
+      _rcv.data.push_back(std::move(p));
+      _rcv.next += seg_len;
+      auto merged = merge_out_of_order();
+      signal_data_received();
+      // Send an acknowledgment of the form:
+      // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+      // This acknowledgment should be piggybacked on a segment being
+      // transmitted if possible without incurring undue delay.
+      if (merged) {
+        // TCP receiver SHOULD send an immediate ACK when the
+        // incoming segment fills in all or part of a gap in the
+        // sequence space.
+        do_output = true;
+      } else {
+        do_output = should_send_ack(seg_len);
+      }
+      ldout(_tcp.cct, 20) << __func__ << " merged=" << merged << " do_output=" << do_output << dendl;
+    }
+  } else if (in_state(CLOSE_WAIT | CLOSING | LAST_ACK | TIME_WAIT)) {
+    // This should not occur, since a FIN has been received from the
+    // remote side. Ignore the segment text.
+    return;
+  }
+
+  // 4.8 eighth, check the FIN bit
+  if (th->f_fin) {
+    if (in_state(CLOSED | LISTEN | SYN_SENT)) {
+      // Do not process the FIN if the state is CLOSED, LISTEN or SYN-SENT
+      // since the SEG.SEQ cannot be validated; drop the segment and return.
+      return;
+    }
+    auto fin_seq = seg_seq + seg_len;
+    if (fin_seq == _rcv.next) {
+      _rcv.next = fin_seq + 1;
+
+      // If this <FIN> packet contains data as well, we can ACK both data
+      // and <FIN> in a single packet, so cancel the previous ACK.
+      clear_delayed_ack();
+      do_output = false;
+      // Send ACK for the FIN!
+      output();
+      signal_data_received();
+      _errno = 0;
+
+      if (in_state(SYN_RECEIVED | ESTABLISHED)) {
+        ldout(_tcp.cct, 20) << __func__ << " fin: SYN_RECEIVED or ESTABLISHED -> CLOSE_WAIT" << dendl;
+        _state = CLOSE_WAIT;
+        // EOF
+      }
+      if (in_state(FIN_WAIT_1)) {
+        // If our FIN has been ACKed (perhaps in this segment), then
+        // enter TIME-WAIT, start the time-wait timer, turn off the other
+        // timers; otherwise enter the CLOSING state.
+        // Note: If our FIN has been ACKed, we should be in FIN_WAIT_2
+        // not FIN_WAIT_1 if we reach here.
+        ldout(_tcp.cct, 20) << __func__ << " fin: FIN_WAIT_1 -> CLOSING" << dendl;
+        _state = CLOSING;
+      }
+      if (in_state(FIN_WAIT_2)) {
+        ldout(_tcp.cct, 20) << __func__ << " fin: FIN_WAIT_2 -> TIME_WAIT" << dendl;
+        return do_time_wait();
+      }
+    }
+  }
+  if (do_output || (do_output_data && can_send())) {
+    // Since we will do output, we can cancel scheduled delayed ACK.
+    clear_delayed_ack();
+    output();
+  }
+}
+
+// Active open: choose the initial send sequence number, set up the local
+// receive parameters (window scale, MSS, window) and enter SYN_SENT.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::connect()
+{
+  ldout(_tcp.cct, 20) << __func__ << dendl;
+  // An initial send sequence number (ISS) is selected.  A SYN segment of the
+  // form <SEQ=ISS><CTL=SYN> is sent.  Set SND.UNA to ISS, SND.NXT to ISS+1,
+  // enter SYN-SENT state, and return.
+  do_setup_isn();
+
+  // Local receive window scale factor, mirrored into the options we send
+  _option._local_win_scale = 7;
+  _rcv.window_scale = _option._local_win_scale;
+
+  // Maximum segment size local can receive
+  _option._local_mss = local_mss();
+  _rcv.mss = _option._local_mss;
+
+  // Linux's default window size
+  _rcv.window = 29200 << _rcv.window_scale;
+
+  do_syn_sent();
+}
+
+// Final step of a local close: drop the "all data acked" notification fd if
+// one exists, mark the send side closed, advance the state machine
+// (ESTABLISHED -> FIN_WAIT_1, CLOSE_WAIT -> LAST_ACK), push out the FIN and
+// unregister this tcb's fd from the event center.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::close_final_cleanup()
+{
+  auto& acked_fd = _snd._all_data_acked_fd;
+  if (acked_fd >= 0) {
+    center->delete_file_event(acked_fd, EVENT_READABLE);
+    _tcp.manager.close(acked_fd);
+    acked_fd = -1;
+  }
+
+  _snd.closed = true;
+  signal_data_received();
+  ldout(_tcp.cct, 20) << __func__ << " unsent_len=" << _snd.unsent_len << dendl;
+
+  // The two source states are distinct, so at most one branch applies.
+  if (in_state(ESTABLISHED)) {
+    ldout(_tcp.cct, 20) << __func__ << " ESTABLISHED -> FIN_WAIT_1" << dendl;
+    _state = FIN_WAIT_1;
+  } else if (in_state(CLOSE_WAIT)) {
+    ldout(_tcp.cct, 20) << __func__ << " CLOSE_WAIT -> LAST_ACK" << dendl;
+    _state = LAST_ACK;
+  }
+
+  // Send <FIN> to remote
+  // Note: we call output_one to make sure a packet with FIN actually
+  // sent out. If we only call output() and _packetq is not empty,
+  // tcp::tcb::get_packet(), packet with FIN will not be generated.
+  output_one();
+  output();
+  center->delete_file_event(fd, EVENT_READABLE|EVENT_WRITABLE);
+}
+
+// Retransmit whatever is still outstanding: a pending SYN, a pending FIN,
+// then the oldest unacknowledged data segment. Every retransmission doubles
+// the RTO (capped at _rto_max) and restarts the retransmit timer. Once any of
+// the three has been retried _max_nr_retransmit times, the connection is
+// given up: _errno is set to -ETIMEDOUT and cleanup() is called.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::retransmit()
+{
+  auto output_update_rto = [this] {
+    output();
+    // According to RFC6298, Update RTO <- RTO * 2 to perform binary exponential back-off
+    this->_rto = std::min(this->_rto * 2, this->_rto_max);
+    start_retransmit_timer();
+  };
+
+  // Retransmit SYN
+  if (syn_needs_on()) {
+    if (_snd.syn_retransmit++ < _max_nr_retransmit) {
+      output_update_rto();
+    } else {
+      ldout(_tcp.cct, 5) << __func__ << " syn retransmit exceed max "
+                         << _max_nr_retransmit << dendl;
+      // A previous "_errno = -ECONNABORTED" here was a dead store: it was
+      // overwritten by -ETIMEDOUT before anything could read it.
+      _errno = -ETIMEDOUT;
+      cleanup();
+      return;
+    }
+  }
+
+  // Retransmit FIN
+  if (fin_needs_on()) {
+    if (_snd.fin_retransmit++ < _max_nr_retransmit) {
+      output_update_rto();
+    } else {
+      ldout(_tcp.cct, 5) << __func__ << " fin retransmit exceed max "
+                         << _max_nr_retransmit << dendl;
+      _errno = -ETIMEDOUT;
+      cleanup();
+      return;
+    }
+  }
+
+  // Retransmit Data
+  if (_snd.data.empty()) {
+    return;
+  }
+
+  // If there are unacked data, retransmit the earliest segment
+  auto& unacked_seg = _snd.data.front();
+
+  // According to RFC5681
+  // Update ssthresh only for the first retransmit
+  uint32_t smss = _snd.mss;
+  if (unacked_seg.nr_transmits == 0) {
+    _snd.ssthresh = std::max(flight_size() / 2, 2 * smss);
+  }
+  // RFC6582 Step 4
+  _snd.recover = _snd.next - 1;
+  // Start the slow start process
+  _snd.cwnd = smss;
+  // End fast recovery
+  exit_fast_recovery();
+
+  ldout(_tcp.cct, 20) << __func__ << " unack data size " << _snd.data.size()
+                      << " nr=" << unacked_seg.nr_transmits << dendl;
+  if (unacked_seg.nr_transmits < _max_nr_retransmit) {
+    unacked_seg.nr_transmits++;
+  } else {
+    // Delete connection when max num of retransmission is reached
+    ldout(_tcp.cct, 5) << __func__ << " seg retransmit exceed max "
+                       << _max_nr_retransmit << dendl;
+    _errno = -ETIMEDOUT;
+    cleanup();
+    return;
+  }
+  retransmit_one();
+
+  output_update_rto();
+}
+
+// Persist-timer action: probe the peer's window, flush any regular output,
+// then re-arm the persist timer with a doubled timeout capped at _rto_max.
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::persist() {
+  ldout(_tcp.cct, 20) << __func__ << " persist timer fired" << dendl;
+
+  // Send 1 byte packet to probe peer's window size; the probe flag is only
+  // raised around this single output_one() call.
+  _snd.window_probe = true;
+  output_one();
+  _snd.window_probe = false;
+
+  output();
+
+  // Perform binary exponential back-off per RFC1122
+  auto next_time_out = _persist_time_out * 2;
+  if (_rto_max < next_time_out) {
+    next_time_out = _rto_max;
+  }
+  _persist_time_out = next_time_out;
+  start_persist_timer();
+}
diff --git a/src/msg/async/dpdk/TCP.h b/src/msg/async/dpdk/TCP.h
new file mode 100644
index 000000000..cf76d3279
--- /dev/null
+++ b/src/msg/async/dpdk/TCP.h
@@ -0,0 +1,1506 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_DPDK_TCP_H_
+#define CEPH_DPDK_TCP_H_
+
+#include <unordered_map>
+#include <map>
+#include <queue>
+#include <functional>
+#include <deque>
+#include <chrono>
+#include <stdexcept>
+#include <system_error>
+
+#include "msg/async/dpdk/EventDPDK.h"
+
+#include "include/utime.h"
+#include "common/Throttle.h"
+#include "common/ceph_time.h"
+#include "common/ceph_crypto.h"
+#include "msg/async/Event.h"
+#include "IPChecksum.h"
+#include "IP.h"
+#include "const.h"
+#include "byteorder.h"
+#include "shared_ptr.h"
+#include "PacketUtil.h"
+
+#include "include/random.h"
+
+struct tcp_hdr;
+
+ // TCP connection states. Each state is a distinct bit so that several
+ // states can be OR-ed together into a mask.
+ enum class tcp_state : uint16_t {
+   CLOSED       = 0x0001,
+   LISTEN       = 0x0002,
+   SYN_SENT     = 0x0004,
+   SYN_RECEIVED = 0x0008,
+   ESTABLISHED  = 0x0010,
+   FIN_WAIT_1   = 0x0020,
+   FIN_WAIT_2   = 0x0040,
+   CLOSE_WAIT   = 0x0080,
+   CLOSING      = 0x0100,
+   LAST_ACK     = 0x0200,
+   TIME_WAIT    = 0x0400
+ };
+
+ // Combine two states (or state masks) into one mask.
+ inline tcp_state operator|(tcp_state s1, tcp_state s2) {
+   const auto lhs = static_cast<uint16_t>(s1);
+   const auto rhs = static_cast<uint16_t>(s2);
+   return static_cast<tcp_state>(lhs | rhs);
+ }
+
+ // Print the name of a single state; anything else (including OR-ed masks)
+ // is printed as "UNKNOWN".
+ inline std::ostream & operator<<(std::ostream & str, const tcp_state& s) {
+   const char* name = "UNKNOWN";
+   switch (s) {
+   case tcp_state::CLOSED:       name = "CLOSED"; break;
+   case tcp_state::LISTEN:       name = "LISTEN"; break;
+   case tcp_state::SYN_SENT:     name = "SYN_SENT"; break;
+   case tcp_state::SYN_RECEIVED: name = "SYN_RECEIVED"; break;
+   case tcp_state::ESTABLISHED:  name = "ESTABLISHED"; break;
+   case tcp_state::FIN_WAIT_1:   name = "FIN_WAIT_1"; break;
+   case tcp_state::FIN_WAIT_2:   name = "FIN_WAIT_2"; break;
+   case tcp_state::CLOSE_WAIT:   name = "CLOSE_WAIT"; break;
+   case tcp_state::CLOSING:      name = "CLOSING"; break;
+   case tcp_state::LAST_ACK:     name = "LAST_ACK"; break;
+   case tcp_state::TIME_WAIT:    name = "TIME_WAIT"; break;
+   default: break;
+   }
+   return str << name;
+ }
+
+ struct tcp_option { // TCP option wire layouts plus the option state kept for one connection
+ // The kind and len values of each option are fixed by the TCP protocol
+ enum class option_kind: uint8_t { mss = 2, win_scale = 3, sack = 4, timestamps = 8, nop = 1, eol = 0 };
+ enum class option_len: uint8_t { mss = 4, win_scale = 3, sack = 2, timestamps = 10, nop = 1, eol = 1 };
+ struct mss {
+ option_kind kind = option_kind::mss;
+ option_len len = option_len::mss;
+ uint16_t mss;
+ struct mss hton() { // returns a copy whose mss field has been passed through ::hton
+ struct mss m = *this;
+ m.mss = ::hton(m.mss);
+ return m;
+ }
+ } __attribute__((packed));
+ struct win_scale {
+ option_kind kind = option_kind::win_scale;
+ option_len len = option_len::win_scale;
+ uint8_t shift;
+ } __attribute__((packed));
+ struct sack {
+ option_kind kind = option_kind::sack;
+ option_len len = option_len::sack;
+ } __attribute__((packed));
+ struct timestamps {
+ option_kind kind = option_kind::timestamps;
+ option_len len = option_len::timestamps;
+ uint32_t t1;
+ uint32_t t2;
+ } __attribute__((packed));
+ struct nop {
+ option_kind kind = option_kind::nop;
+ } __attribute__((packed));
+ struct eol {
+ option_kind kind = option_kind::eol;
+ } __attribute__((packed));
+ static const uint8_t align = 4;
+
+ void parse(uint8_t* beg, uint8_t* end);
+ uint8_t fill(tcp_hdr* th, uint8_t option_size);
+ uint8_t get_size(bool syn_on, bool ack_on);
+
+ // For option negotiation
+ bool _mss_received = false;
+ bool _win_scale_received = false;
+ bool _timestamps_received = false;
+ bool _sack_received = false;
+
+ // Option data
+ uint16_t _remote_mss = 536; // value used until parse() sees an MSS option (presumably; parse() is defined elsewhere)
+ uint16_t _local_mss; // no default here; tcb::init_from_options() assigns it from local_mss()
+ uint8_t _remote_win_scale = 0;
+ uint8_t _local_win_scale = 0;
+ };
+ // Advance a byte pointer by the length of an option.
+ inline uint8_t*& operator+=(uint8_t*& x, tcp_option::option_len len) {
+   x = x + static_cast<uint8_t>(len);
+   return x;
+ }
+ // Add the length of an option to a byte-sized running total.
+ inline uint8_t& operator+=(uint8_t& x, tcp_option::option_len len) {
+   x = static_cast<uint8_t>(x + static_cast<uint8_t>(len));
+   return x;
+ }
+
+ struct tcp_sequence { // strong type for a 32-bit TCP sequence number
+ uint32_t raw; // the plain sequence value; free operators define wrap-around arithmetic and ordering on it
+ };
+
+ // Byte-order conversions for sequence numbers, applied to the raw value.
+ // Declared inline: these are defined in a header, and a non-inline
+ // definition would be emitted by every translation unit that includes it.
+ inline tcp_sequence ntoh(tcp_sequence ts) {
+   return tcp_sequence { ::ntoh(ts.raw) };
+ }
+
+ inline tcp_sequence hton(tcp_sequence ts) {
+   return tcp_sequence { ::hton(ts.raw) };
+ }
+
+ // Print a sequence number as its raw 32-bit value.
+ inline std::ostream& operator<<(std::ostream& os, const tcp_sequence& s) {
+   os << s.raw;
+   return os;
+ }
+
+ // Helpers for TCP sequence numbers. Arithmetic is done on the raw 32-bit
+ // value; ordering is decided by the sign of the 32-bit difference, so it
+ // keeps working across wrap-around.
+ inline tcp_sequence make_seq(uint32_t raw) {
+   return tcp_sequence{raw};
+ }
+ inline tcp_sequence& operator+=(tcp_sequence& s, int32_t n) {
+   s.raw += n;
+   return s;
+ }
+ inline tcp_sequence& operator-=(tcp_sequence& s, int32_t n) {
+   s.raw -= n;
+   return s;
+ }
+ inline tcp_sequence operator+(tcp_sequence s, int32_t n) {
+   s += n;
+   return s;
+ }
+ inline tcp_sequence operator-(tcp_sequence s, int32_t n) {
+   s -= n;
+   return s;
+ }
+ // Signed distance from q to s
+ inline int32_t operator-(tcp_sequence s, tcp_sequence q) {
+   return s.raw - q.raw;
+ }
+ inline bool operator==(tcp_sequence s, tcp_sequence q) {
+   return s.raw == q.raw;
+ }
+ inline bool operator!=(tcp_sequence s, tcp_sequence q) {
+   return s.raw != q.raw;
+ }
+ inline bool operator<(tcp_sequence s, tcp_sequence q) {
+   return (s - q) < 0;
+ }
+ inline bool operator>(tcp_sequence s, tcp_sequence q) {
+   return (q - s) < 0;
+ }
+ inline bool operator<=(tcp_sequence s, tcp_sequence q) {
+   return !((q - s) < 0);
+ }
+ inline bool operator>=(tcp_sequence s, tcp_sequence q) {
+   return !((s - q) < 0);
+ }
+
+ struct tcp_hdr { // fixed TCP header (20 bytes, packed); options follow it in the packet
+ uint16_t src_port;
+ uint16_t dst_port;
+ tcp_sequence seq;
+ tcp_sequence ack;
+ uint8_t rsvd1 : 4; // NOTE(review): bit-field order presumably assumes a little-endian ABI — confirm
+ uint8_t data_offset : 4; // header length in 32-bit words (callers multiply by 4 to get bytes)
+ uint8_t f_fin : 1;
+ uint8_t f_syn : 1;
+ uint8_t f_rst : 1;
+ uint8_t f_psh : 1;
+ uint8_t f_ack : 1;
+ uint8_t f_urg : 1;
+ uint8_t rsvd2 : 2;
+ uint16_t window;
+ uint16_t checksum;
+ uint16_t urgent;
+
+ tcp_hdr hton() { // returns a copy with every multi-byte field passed through ::hton; single-byte fields are copied as is
+ tcp_hdr hdr = *this;
+ hdr.src_port = ::hton(src_port);
+ hdr.dst_port = ::hton(dst_port);
+ hdr.seq = ::hton(seq);
+ hdr.ack = ::hton(ack);
+ hdr.window = ::hton(window);
+ hdr.checksum = ::hton(checksum);
+ hdr.urgent = ::hton(urgent);
+ return hdr;
+ }
+
+ tcp_hdr ntoh() { // inverse of hton(): returns a copy with every multi-byte field passed through ::ntoh
+ tcp_hdr hdr = *this;
+ hdr.src_port = ::ntoh(src_port);
+ hdr.dst_port = ::ntoh(dst_port);
+ hdr.seq = ::ntoh(seq);
+ hdr.ack = ::ntoh(ack);
+ hdr.window = ::ntoh(window);
+ hdr.checksum = ::ntoh(checksum);
+ hdr.urgent = ::ntoh(urgent);
+ return hdr;
+ }
+ } __attribute__((packed));
+
+ struct tcp_tag {}; // empty tag type, used only as a template argument below
+ using tcp_packet_merger = packet_merger<tcp_sequence, tcp_tag>; // merger keyed by sequence number; holds the out-of-order receive data in tcb
+
+template <typename InetTraits>
+class tcp {
+ public:
+ using ipaddr = typename InetTraits::address_type;
+ using inet_type = typename InetTraits::inet_type;
+ using connid = l4connid<InetTraits>;
+ using connid_hash = typename connid::connid_hash;
+ class connection;
+ class listener;
+ private:
+ class tcb;
+
+ // Time-event callback for the delayed-ACK timer of one tcb.
+ class C_handle_delayed_ack : public EventCallback {
+  public:
+   C_handle_delayed_ack(tcb *t)
+     : owner(t) {}
+
+   void do_request(uint64_t r) {
+     // The time event has fired, so its id is no longer valid
+     owner->_delayed_ack_fd.reset();
+     owner->_nr_full_seg_received = 0;
+     owner->output();
+   }
+
+  private:
+   tcb *owner;
+ };
+
+ // Time-event callback for the retransmission timer of one tcb.
+ class C_handle_retransmit : public EventCallback {
+  public:
+   C_handle_retransmit(tcb *t)
+     : owner(t) {}
+
+   void do_request(uint64_t r) {
+     // The time event has fired, so its id is no longer valid
+     owner->retransmit_fd.reset();
+     owner->retransmit();
+   }
+
+  private:
+   tcb *owner;
+ };
+
+ // Time-event callback for the persist (window probe) timer of one tcb.
+ class C_handle_persist : public EventCallback {
+  public:
+   C_handle_persist(tcb *t)
+     : owner(t) {}
+
+   void do_request(uint64_t r) {
+     // The time event has fired, so its id is no longer valid
+     owner->persist_fd.reset();
+     owner->persist();
+   }
+
+  private:
+   tcb *owner;
+ };
+
+ // Event callback that runs the final close step of a tcb.
+ class C_all_data_acked : public EventCallback {
+  public:
+   C_all_data_acked(tcb *t)
+     : owner(t) {}
+
+   void do_request(uint64_t fd_or_id) {
+     owner->close_final_cleanup();
+   }
+
+  private:
+   tcb *owner;
+ };
+
+ // Keeps a tcb alive until the event fires: the callback owns one shared
+ // reference and drops it by deleting itself.
+ class C_actual_remove_tcb : public EventCallback {
+  public:
+   C_actual_remove_tcb(tcb *t)
+     : keep_alive(t->shared_from_this()) {}
+
+   void do_request(uint64_t r) {
+     delete this;
+   }
+
+  private:
+   lw_shared_ptr<tcb> keep_alive;
+ };
+
+ class tcb : public enable_lw_shared_from_this<tcb> {
+ using clock_type = ceph::coarse_real_clock;
+ static constexpr tcp_state CLOSED = tcp_state::CLOSED;
+ static constexpr tcp_state LISTEN = tcp_state::LISTEN;
+ static constexpr tcp_state SYN_SENT = tcp_state::SYN_SENT;
+ static constexpr tcp_state SYN_RECEIVED = tcp_state::SYN_RECEIVED;
+ static constexpr tcp_state ESTABLISHED = tcp_state::ESTABLISHED;
+ static constexpr tcp_state FIN_WAIT_1 = tcp_state::FIN_WAIT_1;
+ static constexpr tcp_state FIN_WAIT_2 = tcp_state::FIN_WAIT_2;
+ static constexpr tcp_state CLOSE_WAIT = tcp_state::CLOSE_WAIT;
+ static constexpr tcp_state CLOSING = tcp_state::CLOSING;
+ static constexpr tcp_state LAST_ACK = tcp_state::LAST_ACK;
+ static constexpr tcp_state TIME_WAIT = tcp_state::TIME_WAIT;
+ tcp_state _state = CLOSED;
+ tcp& _tcp;
+ UserspaceEventManager &manager;
+ connection* _conn = nullptr;
+ bool _connect_done = false;
+ ipaddr _local_ip;
+ ipaddr _foreign_ip;
+ uint16_t _local_port;
+ uint16_t _foreign_port;
+ struct unacked_segment { // one transmitted segment that has not been fully acknowledged yet
+ Packet p; // segment payload; a retransmit sends a share() of it
+ uint16_t data_len; // amount handed back to user_queue_space once the segment is fully acked
+ unsigned nr_transmits; // retransmission count; 0 means the segment has not been retransmitted
+ clock_type::time_point tx_time; // given to update_rto() when the segment is acked without a retransmit
+ };
+ struct send { // sender-side state of the connection
+ tcp_sequence unacknowledged; // oldest sequence number not yet acknowledged
+ tcp_sequence next; // sequence number of the next new segment
+ uint32_t window; // peer's advertised window after applying window_scale
+ uint8_t window_scale; // shift applied to the window field received from the peer
+ uint16_t mss; // maximum segment size the peer can receive
+ tcp_sequence urgent;
+ tcp_sequence wl1; // segment sequence number used for last window update
+ tcp_sequence wl2; // segment acknowledgment number used for last window update
+ tcp_sequence initial; // initial send sequence number
+ std::deque<unacked_segment> data; // transmitted segments awaiting acknowledgment, oldest first
+ std::deque<Packet> unsent; // packets queued for transmission but not sent yet
+ uint32_t unsent_len = 0; // total length of the packets in unsent
+ uint32_t queued_len = 0;
+ bool closed = false;
+ // Event fd notified once all data has been acked (-1 while unset)
+ int _all_data_acked_fd = -1;
+ // Limits the amount of data queued into the send queue
+ Throttle user_queue_space;
+ // Round-trip time variation
+ std::chrono::microseconds rttvar;
+ // Smoothed round-trip time
+ std::chrono::microseconds srtt;
+ bool first_rto_sample = true;
+ clock_type::time_point syn_tx_time; // when the SYN (or SYN,ACK) was sent; used for the first RTO sample
+ // Congestion window
+ uint32_t cwnd;
+ // Slow start threshold
+ uint32_t ssthresh;
+ // Duplicated ACKs
+ uint16_t dupacks = 0;
+ unsigned syn_retransmit = 0;
+ unsigned fin_retransmit = 0;
+ uint32_t limited_transfer = 0;
+ uint32_t partial_ack = 0;
+ tcp_sequence recover;
+ bool window_probe = false; // while true, can_send() allows exactly one byte
+ send(CephContext *c): user_queue_space(c, "DPDK::tcp::tcb::user_queue_space", 81920) {} // 81920 is presumably the throttle maximum — confirm against Throttle
+ } _snd;
+ struct receive { // receiver-side state of the connection
+ tcp_sequence next; // next sequence number expected from the peer
+ uint32_t window; // local receive window, already scaled
+ uint8_t window_scale; // shift applied when advertising the window
+ uint16_t mss; // maximum segment size the local side can receive
+ tcp_sequence urgent;
+ tcp_sequence initial;
+ std::deque<Packet> data;
+ tcp_packet_merger out_of_order; // segments that arrived ahead of next
+ } _rcv;
+ EventCenter *center;
+ int fd;
+ // positive means no error, 0 means EOF, negative means error
+ int16_t _errno = 1;
+ tcp_option _option;
+ EventCallbackRef delayed_ack_event;
+ std::optional<uint64_t> _delayed_ack_fd;
+ // Retransmission timeout
+ std::chrono::microseconds _rto{1000*1000};
+ std::chrono::microseconds _persist_time_out{1000*1000};
+ static constexpr std::chrono::microseconds _rto_min{1000*1000};
+ static constexpr std::chrono::microseconds _rto_max{60000*1000};
+ // Clock granularity
+ static constexpr std::chrono::microseconds _rto_clk_granularity{1000};
+ static constexpr uint16_t _max_nr_retransmit{5};
+ EventCallbackRef retransmit_event;
+ std::optional<uint64_t> retransmit_fd;
+ EventCallbackRef persist_event;
+ EventCallbackRef all_data_ack_event;
+ std::optional<uint64_t> persist_fd;
+ uint16_t _nr_full_seg_received = 0;
+ struct isn_secret {
+   // 512-bit secret key used when generating initial sequence numbers
+   uint32_t key[16];
+   isn_secret () {
+     // Fill every word of the key with a random 32-bit value
+     constexpr size_t nr_words = sizeof(key) / sizeof(key[0]);
+     for (size_t i = 0; i < nr_words; ++i) {
+       key[i] = ceph::util::generate_random_number<uint32_t>(0, std::numeric_limits<uint32_t>::max());
+     }
+   }
+ };
+ static isn_secret _isn_secret;
+ tcp_sequence get_isn();
+ circular_buffer<typename InetTraits::l4packet> _packetq;
+ bool _poll_active = false;
+ public:
+ // callback
+ void close_final_cleanup();
+ std::ostream& _prefix(std::ostream *_dout);
+
+ public:
+ tcb(tcp& t, connid id);
+ ~tcb();
+ void input_handle_listen_state(tcp_hdr* th, Packet p);
+ void input_handle_syn_sent_state(tcp_hdr* th, Packet p);
+ void input_handle_other_state(tcp_hdr* th, Packet p);
+ void output_one(bool data_retransmit = false);
+ bool is_all_data_acked();
+ int send(Packet p);
+ void connect();
+ std::optional<Packet> read();
+ void close();
+ // Drop this tcb from the owning tcp's connection table, which is keyed by
+ // the (local ip, foreign ip, local port, foreign port) tuple.
+ void remove_from_tcbs() {
+   _tcp._tcbs.erase(connid{_local_ip, _foreign_ip, _local_port, _foreign_port});
+ }
+ std::optional<typename InetTraits::l4packet> get_packet();
+ void output() { // ask for this tcb to be polled for outgoing packets once the peer's L2 address is known
+ if (!_poll_active) { // a request is already outstanding otherwise
+ _poll_active = true;
+
+ auto tcb = this->shared_from_this(); // the callback holds a reference to this tcb
+ _tcp._inet.wait_l2_dst_address(_foreign_ip, Packet(), [tcb] (const ethernet_address &dst, Packet p, int r) {
+ if (r == 0) { // address resolved: queue the tcb for polling
+ tcb->_tcp.poll_tcb(dst, std::move(tcb));
+ } else if (r == -ETIMEDOUT) {
+ // in other states connection should time out
+ if (tcb->in_state(SYN_SENT)) {
+ tcb->_errno = -ETIMEDOUT;
+ tcb->cleanup();
+ }
+ } else if (r == -EBUSY) {
+ // retry later
+ tcb->_poll_active = false; // allow a later output() call to issue a new request
+ tcb->start_retransmit_timer();
+ }
+ });
+ }
+ }
+
+ int16_t get_errno() const { // current value of _errno (initialised to 1; set to negative errno codes on failure)
+ return _errno;
+ }
+
+ tcp_state& state() { // mutable reference to the connection state
+ return _state;
+ }
+
+ // Remaining room in the user send queue; 0 unless the connection is
+ // established.
+ uint64_t peek_sent_available() {
+   if (in_state(ESTABLISHED)) {
+     auto& space = _snd.user_queue_space;
+     return space.get_max() - space.get_current();
+   }
+   return 0;
+ }
+
+ // Returns _errno when it is zero or negative; otherwise 1 once the
+ // connection has been established and 0 before that.
+ int is_connected() const {
+   if (_errno > 0) {
+     return _connect_done;
+   }
+   return _errno;
+ }
+
+ private:
+ void respond_with_reset(tcp_hdr* th);
+ bool merge_out_of_order();
+ void insert_out_of_order(tcp_sequence seq, Packet p);
+ void trim_receive_data_after_window();
+ bool should_send_ack(uint16_t seg_len);
+ void clear_delayed_ack();
+ Packet get_transmit_packet();
+ // Send the oldest unacknowledged segment again.
+ void retransmit_one() {
+   output_one(/* data_retransmit = */ true);
+ }
+ // (Re)arm the retransmission timer with the current RTO.
+ void start_retransmit_timer() {
+   // Cancel the pending time event, if any, before creating a new one
+   if (retransmit_fd) {
+     center->delete_time_event(*retransmit_fd);
+   }
+   retransmit_fd = center->create_time_event(_rto.count(), retransmit_event);
+ }
+ // Cancel the retransmission timer if it is armed.
+ void stop_retransmit_timer() {
+   if (!retransmit_fd) {
+     return;
+   }
+   center->delete_time_event(*retransmit_fd);
+   retransmit_fd.reset();
+ }
+ // (Re)arm the persist timer with the current persist time-out.
+ void start_persist_timer() {
+   // Cancel the pending time event, if any, before creating a new one
+   if (persist_fd) {
+     center->delete_time_event(*persist_fd);
+   }
+   persist_fd = center->create_time_event(_persist_time_out.count(), persist_event);
+ }
+ // Cancel the persist timer if it is armed.
+ void stop_persist_timer() {
+   if (!persist_fd) {
+     return;
+   }
+   center->delete_time_event(*persist_fd);
+   persist_fd.reset();
+ }
+ void persist();
+ void retransmit();
+ void fast_retransmit();
+ void update_rto(clock_type::time_point tx_time);
+ void update_cwnd(uint32_t acked_bytes);
+ void cleanup();
+ uint32_t can_send() { // number of bytes that may be sent now; note it also updates _snd.limited_transfer
+ if (_snd.window_probe) { // a window probe always carries exactly one byte
+ return 1;
+ }
+ // Can not send more than advertised window allows
+ auto x = std::min(uint32_t(_snd.unacknowledged + _snd.window - _snd.next), _snd.unsent_len);
+ // Can not send more than congestion window allows
+ x = std::min(_snd.cwnd, x);
+ if (_snd.dupacks == 1 || _snd.dupacks == 2) {
+ // RFC5681 Step 3.1
+ // Send cwnd + 2 * smss per RFC3042
+ auto flight = flight_size();
+ auto max = _snd.cwnd + 2 * _snd.mss;
+ x = flight <= max ? std::min(x, max - flight) : 0;
+ _snd.limited_transfer += x; // side effect: accumulates the bytes allowed under limited transmit
+ } else if (_snd.dupacks >= 3) {
+ // RFC5681 Step 3.5
+ // Sent 1 full-sized segment at most
+ x = std::min(uint32_t(_snd.mss), x);
+ }
+ return x;
+ }
+ // Total length of the segments that were sent but are not yet acked.
+ uint32_t flight_size() {
+   uint32_t in_flight = 0;
+   for (auto& seg : _snd.data) {
+     in_flight += seg.p.len();
+   }
+   return in_flight;
+ }
+ // Largest TCP payload that fits into one MTU with minimal IP and TCP headers.
+ uint16_t local_mss() {
+   const auto& hw = _tcp.get_hw_features();
+   return hw.mtu - tcp_hdr_len_min - InetTraits::ip_hdr_len_min;
+ }
+ // Append a packet, addressed to the foreign ip, to this tcb's packet queue.
+ void queue_packet(Packet p) {
+   typename InetTraits::l4packet l4p{_foreign_ip, std::move(p)};
+   _packetq.emplace_back(std::move(l4p));
+ }
+ void signal_data_received() { // mark the connection's event fd readable
+ manager.notify(fd, EVENT_READABLE);
+ }
+ // Notify the "all data acked" event fd, provided one is set and nothing is
+ // left unsent or queued.
+ void signal_all_data_acked() {
+   const bool nothing_pending = _snd.unsent_len == 0 && _snd.queued_len == 0;
+   if (_snd._all_data_acked_fd >= 0 && nothing_pending) {
+     manager.notify(_snd._all_data_acked_fd, EVENT_READABLE);
+   }
+ }
+ // Enter SYN_SENT and send <SYN> to the remote side.
+ void do_syn_sent() {
+   // Remember when the SYN went out; do_established() uses it for the RTO
+   _snd.syn_tx_time = clock_type::now();
+   _state = SYN_SENT;
+   output();
+ }
+ // Enter SYN_RECEIVED and send <SYN,ACK> to the remote side.
+ void do_syn_received() {
+   // Remember when the SYN,ACK went out; do_established() uses it for the RTO
+   _snd.syn_tx_time = clock_type::now();
+   _state = SYN_RECEIVED;
+   output();
+ }
+ void do_established() { // enter ESTABLISHED and wake up the user of the connection
+ _state = ESTABLISHED;
+ update_rto(_snd.syn_tx_time); // the handshake round trip is the first RTO sample
+ _connect_done = true; // is_connected() reports 1 from now on
+ manager.notify(fd, EVENT_READABLE|EVENT_WRITABLE);
+ }
+ void do_reset() { // tear the connection down and report ECONNRESET to the user
+ _state = CLOSED;
+ // Free packets to be sent which are waiting for user_queue_space
+ _snd.user_queue_space.reset();
+ cleanup();
+ _errno = -ECONNRESET; // set after cleanup() has run
+ manager.notify(fd, EVENT_READABLE);
+
+ if (_snd._all_data_acked_fd >= 0) // also wake whoever waits for all data to be acked
+ manager.notify(_snd._all_data_acked_fd, EVENT_READABLE);
+ }
+ void do_time_wait() { // enter TIME_WAIT; without a timer the tcb is cleaned up immediately
+ // FIXME: Implement TIME_WAIT state timer
+ _state = TIME_WAIT;
+ cleanup();
+ }
+ void do_closed() { // enter CLOSED and clean the tcb up
+ _state = CLOSED;
+ cleanup();
+ }
+ // Pick the initial send sequence number and derive the send pointers from it.
+ void do_setup_isn() {
+   const tcp_sequence isn = get_isn();
+   _snd.initial = isn;
+   _snd.unacknowledged = isn;
+   _snd.recover = isn;
+   // One past the ISN
+   _snd.next = isn + 1;
+ }
+ // The local FIN was acknowledged: advance both send pointers by one.
+ void do_local_fin_acked() {
+   _snd.next += 1;
+   _snd.unacknowledged += 1;
+ }
+ // The SYN flag is set on outgoing segments while the handshake is in progress.
+ bool syn_needs_on() {
+   const tcp_state handshake_states = SYN_SENT | SYN_RECEIVED;
+   return in_state(handshake_states);
+ }
+ // The FIN flag is set once the local side has closed, the state calls for
+ // a FIN, and no data is left unsent or queued.
+ bool fin_needs_on() {
+   if (!in_state(FIN_WAIT_1 | CLOSING | LAST_ACK)) {
+     return false;
+   }
+   if (!_snd.closed) {
+     return false;
+   }
+   return _snd.unsent_len == 0 && _snd.queued_len == 0;
+ }
+ // The ACK flag is set in every state except CLOSED, LISTEN and SYN_SENT.
+ bool ack_needs_on() {
+   const bool no_ack_state = in_state(CLOSED | LISTEN | SYN_SENT);
+   return !no_ack_state;
+ }
+ // True in the states where no more data is expected from the peer.
+ bool foreign_will_not_send() {
+   const tcp_state peer_done_states =
+     CLOSING | TIME_WAIT | CLOSE_WAIT | LAST_ACK | CLOSED;
+   return in_state(peer_done_states);
+ }
+ // True if the current state is one of the states contained in the mask.
+ bool in_state(tcp_state state) {
+   const auto current = static_cast<uint16_t>(_state);
+   const auto mask = static_cast<uint16_t>(state);
+   return (current & mask) != 0;
+ }
+ // Reset the counters that track duplicate ACKs and fast recovery.
+ void exit_fast_recovery() {
+   _snd.partial_ack = 0;
+   _snd.limited_transfer = 0;
+   _snd.dupacks = 0;
+ }
+ uint32_t data_segment_acked(tcp_sequence seg_ack);
+ bool segment_acceptable(tcp_sequence seg_seq, unsigned seg_len);
+ void init_from_options(tcp_hdr* th, uint8_t* opt_start, uint8_t* opt_end);
+ friend class connection;
+
+ friend class C_handle_delayed_ack;
+ friend class C_handle_retransmit;
+ friend class C_handle_persist;
+ friend class C_all_data_acked;
+ };
+
+ CephContext *cct;
+ // ipv4_l4<ip_protocol_num::tcp>
+ inet_type& _inet;
+ EventCenter *center;
+ UserspaceEventManager &manager;
+ std::unordered_map<connid, lw_shared_ptr<tcb>, connid_hash> _tcbs;
+ std::unordered_map<uint16_t, listener*> _listening;
+ std::random_device _rd;
+ std::default_random_engine _e;
+ std::uniform_int_distribution<uint16_t> _port_dist{41952, 65535};
+ circular_buffer<std::pair<lw_shared_ptr<tcb>, ethernet_address>> _poll_tcbs;
+ // queue for packets that do not belong to any tcb
+ circular_buffer<ipv4_traits::l4packet> _packetq;
+ Throttle _queue_space;
+ // Limit number of data queued into send queue
+ public:
+ // User-facing handle of one TCP connection. It shares ownership of the
+ // tcb and keeps the tcb's back pointer (_conn) aimed at the live handle.
+ // Move-only; a moved-from handle holds no tcb.
+ class connection {
+   lw_shared_ptr<tcb> _tcb;
+  public:
+   explicit connection(lw_shared_ptr<tcb> tcbp) : _tcb(std::move(tcbp)) { _tcb->_conn = this; }
+   connection(const connection&) = delete;
+   connection(connection&& x) noexcept : _tcb(std::move(x._tcb)) {
+     // x may itself be moved-from, in which case there is no tcb to re-point
+     if (_tcb) {
+       _tcb->_conn = this;
+     }
+   }
+   ~connection();
+   void operator=(const connection&) = delete;
+   connection& operator=(connection&& x) {
+     if (this != &x) {
+       // Destroy the current handle in place, then move-construct from x
+       this->~connection();
+       new (this) connection(std::move(x));
+     }
+     return *this;
+   }
+   // Event fd of the underlying tcb
+   int fd() const {
+     return _tcb->fd;
+   }
+   int send(Packet p) {
+     return _tcb->send(std::move(p));
+   }
+   std::optional<Packet> read() {
+     return _tcb->read();
+   }
+   int16_t get_errno() const {
+     return _tcb->get_errno();
+   }
+   void close_read();
+   void close_write();
+   // Address of the peer. Only the IPv4 address and the family are filled in.
+   entity_addr_t remote_addr() const {
+     entity_addr_t addr;
+     auto net_ip = _tcb->_foreign_ip.hton();
+     memcpy((void*)&addr.in4_addr().sin_addr.s_addr,
+            &net_ip, sizeof(addr.in4_addr().sin_addr.s_addr));
+     addr.set_family(AF_INET);
+     return addr;
+   }
+   uint64_t peek_sent_available() {
+     return _tcb->peek_sent_available();
+   }
+   int is_connected() const { return _tcb->is_connected(); }
+ };
+ // A listening endpoint on one port. After listen() it is registered in
+ // tcp::_listening and owns an event fd that is notified when a connection
+ // is queued for accept(). Move-only.
+ class listener {
+   tcp& _tcp;
+   uint16_t _port;
+   int _fd = -1;
+   int16_t _errno;
+   std::queue<connection> _q;
+   size_t _q_max_length;
+
+  private:
+   listener(tcp& t, uint16_t port, size_t queue_length)
+     : _tcp(t), _port(port), _errno(0), _q(), _q_max_length(queue_length) {
+   }
+  public:
+   listener(const listener&) = delete;
+   void operator=(const listener&) = delete;
+   listener(listener&& x)
+     : _tcp(x._tcp), _port(x._port), _fd(x._fd), _errno(x._errno),
+       _q(std::move(x._q)), _q_max_length(x._q_max_length) {
+     // Take the fd away from x: otherwise x's destructor would unregister
+     // the port and close the fd that now belongs to this object.
+     x._fd = -1;
+     if (_fd >= 0)
+       _tcp._listening[_port] = this;
+   }
+   ~listener() {
+     abort_accept();
+   }
+   // Register this listener for its port.
+   // Returns 0 on success, -EADDRINUSE if the port already has a listener.
+   int listen() {
+     if (_tcp._listening.find(_port) != _tcp._listening.end())
+       return -EADDRINUSE;
+     _tcp._listening.emplace(_port, this);
+     _fd = _tcp.manager.get_eventfd();
+     return 0;
+   }
+   // Take the oldest queued connection, or an empty optional if none is queued.
+   std::optional<connection> accept() {
+     std::optional<connection> c;
+     if (!_q.empty()) {
+       c = std::move(_q.front());
+       _q.pop();
+     }
+     return c;
+   }
+   // Drop all queued connections and, if listening, unregister the port and
+   // close the event fd.
+   void abort_accept() {
+     while (!_q.empty())
+       _q.pop();
+     if (_fd >= 0) {
+       _tcp._listening.erase(_port);
+       _tcp.manager.close(_fd);
+       _fd = -1;
+     }
+   }
+   int16_t get_errno() const {
+     return _errno;
+   }
+   // True once the accept queue has reached its maximum length
+   bool full() const {
+     return _q.size() >= _q_max_length;
+   }
+   int fd() const {
+     return _fd;
+   }
+   friend class tcp;
+ };
+ public:
+ explicit tcp(CephContext *c, inet_type& inet, EventCenter *cen);
+ void received(Packet p, ipaddr from, ipaddr to);
+ bool forward(forward_hash& out_hash_data, Packet& p, size_t off);
+ listener listen(uint16_t port, size_t queue_length = 100);
+ connection connect(const entity_addr_t &addr);
+ const hw_features& get_hw_features() const { return _inet._inet.get_hw_features(); } // forwards to the underlying inet layer
+ void poll_tcb(const ethernet_address &dst, lw_shared_ptr<tcb> tcb) { // queue a tcb, with its resolved L2 destination, for the packet provider
+ _poll_tcbs.emplace_back(std::move(tcb), dst);
+ }
+ // Queue a new connection for the listener on the given port and notify the
+ // listener's event fd. Returns false if nobody listens on the port or the
+ // listener's accept queue is full.
+ bool push_listen_queue(uint16_t port, tcb *t) {
+   auto it = _listening.find(port);
+   if (it == _listening.end()) {
+     return false;
+   }
+   auto* target = it->second;
+   if (target->full()) {
+     return false;
+   }
+   target->_q.push(connection(t->shared_from_this()));
+   manager.notify(target->_fd, EVENT_READABLE);
+   return true;
+ }
+
+ private:
+ void send_packet_without_tcb(ipaddr from, ipaddr to, Packet p);
+ void respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip);
+ friend class listener;
+};
+
+ // Set up the TCP layer on top of the given inet layer and register the
+ // packet provider that the inet layer calls to fetch outgoing packets.
+ template <typename InetTraits>
+ tcp<InetTraits>::tcp(CephContext *c, inet_type& inet, EventCenter *cen)
+   : cct(c), _inet(inet), center(cen),
+     manager(static_cast<DPDKDriver*>(cen->get_driver())->manager),
+     _e(_rd()), _queue_space(cct, "DPDK::tcp::queue_space", 81920) {
+   // Number of tcbs polled so far. Unsigned on purpose: it grows without
+   // bound and is only used modulo 128, so it must wrap instead of
+   // overflowing a signed integer.
+   unsigned tcb_polled = 0;
+   _inet.register_packet_provider([this, tcb_polled] () mutable {
+     std::optional<typename InetTraits::l4packet> l4p;
+     auto c = _poll_tcbs.size();
+     // Packets that belong to no tcb are served when no tcb is waiting, or
+     // whenever the poll counter is a multiple of 128.
+     if (!_packetq.empty() && (!(tcb_polled % 128) || c == 0)) {
+       l4p = std::move(_packetq.front());
+       _packetq.pop_front();
+       // The packet left the queue: give its space back to the throttle
+       _queue_space.put(l4p->p.len());
+     } else {
+       // Poll the queued tcbs until one of them produces a packet
+       while (c--) {
+         tcb_polled++;
+         lw_shared_ptr<tcb> tcb;
+         ethernet_address dst;
+         std::tie(tcb, dst) = std::move(_poll_tcbs.front());
+         _poll_tcbs.pop_front();
+         l4p = tcb->get_packet();
+         if (l4p) {
+           l4p->e_dst = dst;
+           break;
+         }
+       }
+     }
+     return l4p;
+   });
+ }
+
+ // Create a listener object for the port. The listener is not registered
+ // until listener::listen() is called on it.
+ template <typename InetTraits>
+ typename tcp<InetTraits>::listener
+ tcp<InetTraits>::listen(uint16_t port, size_t queue_length) {
+   return listener(*this, port, queue_length);
+ }
+
+ // Open an active connection to addr: pick a free local port, create the
+ // tcb, register it and start the handshake.
+ template <typename InetTraits>
+ typename tcp<InetTraits>::connection tcp<InetTraits>::connect(const entity_addr_t &addr) {
+   auto src_ip = _inet._inet.host_address();
+   auto dst_ip = ipv4_address(addr);
+   auto dst_port = addr.get_port();
+
+   uint16_t src_port;
+   connid id;
+   for (;;) {
+     src_port = _port_dist(_e);
+     id = connid{src_ip, dst_ip, src_port, (uint16_t)dst_port};
+     if (_tcbs.find(id) != _tcbs.end()) {
+       // This 4-tuple is already in use, draw another port
+       continue;
+     }
+     // With a single hardware queue any port will do; otherwise accept the
+     // port only if the connection id hashes to this event center's id.
+     if (_inet._inet.netif()->hw_queues_count() == 1) {
+       break;
+     }
+     auto cpu = _inet._inet.netif()->hash2cpu(
+       id.hash(_inet._inet.netif()->rss_key()));
+     if (cpu == center->get_id()) {
+       break;
+     }
+   }
+
+   auto tcbp = make_lw_shared<tcb>(*this, id);
+   _tcbs.insert({id, tcbp});
+   tcbp->connect();
+   return connection(tcbp);
+ }
+
+ // Add the TCP ports of the packet, if the header is available at off, to
+ // the forwarding hash data. Always returns true.
+ template <typename InetTraits>
+ bool tcp<InetTraits>::forward(forward_hash& out_hash_data, Packet& p, size_t off) {
+   if (auto th = p.get_header<tcp_hdr>(off)) {
+     out_hash_data.push_back(th->src_port);
+     out_hash_data.push_back(th->dst_port);
+   }
+   return true;
+ }
+
+ // Entry point for an incoming TCP segment: validate it, find the tcb it
+ // belongs to (or the listener for its port) and dispatch on the state.
+ template <typename InetTraits>
+ void tcp<InetTraits>::received(Packet p, ipaddr from, ipaddr to) {
+   auto th = p.get_header<tcp_hdr>(0);
+   if (!th) {
+     return;
+   }
+   // th->data_offset is correct even before ntoh()
+   if (unsigned(th->data_offset * 4) < sizeof(*th)) {
+     return;
+   }
+
+   // Verify the checksum in software unless the hardware already did
+   if (!get_hw_features().rx_csum_offload) {
+     checksummer csum;
+     InetTraits::tcp_pseudo_header_checksum(csum, from, to, p.len());
+     csum.sum(p);
+     if (csum.get() != 0) {
+       return;
+     }
+   }
+
+   auto h = th->ntoh();
+   auto id = connid{to, from, h.dst_port, h.src_port};
+   auto tcbi = _tcbs.find(id);
+
+   if (tcbi != _tcbs.end()) {
+     lw_shared_ptr<tcb> tcbp = tcbi->second;
+     if (tcbp->state() == tcp_state::SYN_SENT) {
+       // 3) In SYN_SENT State
+       tcbp->input_handle_syn_sent_state(&h, std::move(p));
+     } else {
+       // 4) In other state, can be one of the following:
+       // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2
+       // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT
+       tcbp->input_handle_other_state(&h, std::move(p));
+     }
+     return;
+   }
+
+   // No tcb for this segment: look for a listener on the local port
+   auto lit = _listening.find(id.local_port);
+   if (lit == _listening.end() || lit->second->full()) {
+     // 1) In CLOSE state
+     // 1.1 all data in the incoming segment is discarded. An incoming
+     // segment containing a RST is discarded. An incoming segment not
+     // containing a RST causes a RST to be sent in response.
+     // FIXME:
+     // if ACK off: <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK>
+     // if ACK on: <SEQ=SEG.ACK><CTL=RST>
+     respond_with_reset(&h, id.local_ip, id.foreign_ip);
+     return;
+   }
+
+   // 2) In LISTEN state
+   // 2.1 first check for an RST
+   if (h.f_rst) {
+     // An incoming RST should be ignored
+     return;
+   }
+   // 2.2 second check for an ACK
+   if (h.f_ack) {
+     // Any acknowledgment is bad if it arrives on a connection
+     // still in the LISTEN state.
+     // <SEQ=SEG.ACK><CTL=RST>
+     respond_with_reset(&h, id.local_ip, id.foreign_ip);
+     return;
+   }
+   // 2.3 third check for a SYN
+   if (h.f_syn) {
+     // check the security
+     // NOTE: Ignored for now
+     lw_shared_ptr<tcb> tcbp = make_lw_shared<tcb>(*this, id);
+     _tcbs.insert({id, tcbp});
+     tcbp->input_handle_listen_state(&h, std::move(p));
+     return;
+   }
+   // 2.4 fourth other text or control
+   // So you are unlikely to get here, but if you do, drop the
+   // segment, and return.
+ }
+
+ // Send a packet that does not belong to any tcb (for example a reset).
+ // The packet is dropped if it does not fit into the queue budget or if the
+ // L2 address of the destination cannot be resolved.
+ template <typename InetTraits>
+ void tcp<InetTraits>::send_packet_without_tcb(ipaddr from, ipaddr to, Packet p) {
+   // Remember the length: p is moved away below, and the same amount has to
+   // be handed back to the throttle if the packet is never queued.
+   const auto len = p.len();
+   if (!_queue_space.get_or_fail(len)) {
+     // drop packets that do not fit the queue
+     return;
+   }
+   _inet.wait_l2_dst_address(to, std::move(p), [this, to, len] (const ethernet_address &e_dst, Packet p, int r) mutable {
+     if (r == 0) {
+       // Space is released by the packet provider when the packet is dequeued
+       _packetq.emplace_back(ipv4_traits::l4packet{to, std::move(p), e_dst, ip_protocol_num::tcp});
+     } else {
+       // Resolution failed and the packet is dropped: release its reserved
+       // space, otherwise the queue budget shrinks permanently.
+       _queue_space.put(len);
+     }
+   });
+ }
+
+ // Detach from the tcb and close both directions. A handle without a tcb
+ // (moved-from) has nothing to do.
+ template <typename InetTraits>
+ tcp<InetTraits>::connection::~connection() {
+   if (!_tcb) {
+     return;
+   }
+   _tcb->_conn = nullptr;
+   close_read();
+   close_write();
+ }
+
+ template <typename InetTraits>
+ tcp<InetTraits>::tcb::tcb(tcp& t, connid id) // builds the control block for the connection identified by id
+ : _tcp(t), manager(t.manager), _local_ip(id.local_ip) , _foreign_ip(id.foreign_ip),
+ _local_port(id.local_port), _foreign_port(id.foreign_port),
+ _snd(_tcp.cct),
+ center(t.center),
+ fd(t.manager.get_eventfd()), // event fd used to signal this connection's user
+ delayed_ack_event(new tcp<InetTraits>::C_handle_delayed_ack(this)), // the four callbacks are deleted in ~tcb()
+ retransmit_event(new tcp<InetTraits>::C_handle_retransmit(this)),
+ persist_event(new tcp<InetTraits>::C_handle_persist(this)),
+ all_data_ack_event(new tcp<InetTraits>::C_all_data_acked(this)) {}
+
+ // Cancel the timers that are still armed, free the event callbacks and
+ // close the connection's event fd.
+ template <typename InetTraits>
+ tcp<InetTraits>::tcb::~tcb()
+ {
+   auto cancel_if_armed = [this] (const std::optional<uint64_t>& id) {
+     if (id) {
+       center->delete_time_event(*id);
+     }
+   };
+   cancel_if_armed(_delayed_ack_fd);
+   cancel_if_armed(retransmit_fd);
+   cancel_if_armed(persist_fd);
+
+   // The callbacks were allocated with new in the constructor
+   delete delayed_ack_event;
+   delete retransmit_event;
+   delete persist_event;
+   delete all_data_ack_event;
+
+   manager.close(fd);
+   fd = -1;
+ }
+
+ template <typename InetTraits>
+ void tcp<InetTraits>::tcb::respond_with_reset(tcp_hdr* rth) // forwards to tcp::respond_with_reset with this connection's addresses
+ {
+ _tcp.respond_with_reset(rth, _local_ip, _foreign_ip);
+ }
+
+ template <typename InetTraits>
+ uint32_t tcp<InetTraits>::tcb::data_segment_acked(tcp_sequence seg_ack) { // processes an ACK up to seg_ack; returns the number of newly acked bytes
+ uint32_t total_acked_bytes = 0;
+ // Full ACK of segment: pop every segment that seg_ack covers completely
+ while (!_snd.data.empty()
+ && (_snd.unacknowledged + _snd.data.front().p.len() <= seg_ack)) {
+ auto acked_bytes = _snd.data.front().p.len();
+ _snd.unacknowledged += acked_bytes;
+ // Ignore retransmitted segments when setting the RTO
+ if (_snd.data.front().nr_transmits == 0) {
+ update_rto(_snd.data.front().tx_time);
+ }
+ update_cwnd(acked_bytes);
+ total_acked_bytes += acked_bytes;
+ _snd.user_queue_space.put(_snd.data.front().data_len); // give the segment's space back to the send throttle
+ manager.notify(fd, EVENT_WRITABLE); // mark the connection's event fd writable
+ _snd.data.pop_front();
+ }
+ // Partial ACK of segment: seg_ack falls inside the oldest remaining segment
+ if (_snd.unacknowledged < seg_ack) {
+ auto acked_bytes = seg_ack - _snd.unacknowledged;
+ if (!_snd.data.empty()) {
+ auto& unacked_seg = _snd.data.front();
+ unacked_seg.p.trim_front(acked_bytes); // drop the acked prefix; data_len is left untouched
+ }
+ _snd.unacknowledged = seg_ack;
+ update_cwnd(acked_bytes);
+ total_acked_bytes += acked_bytes;
+ }
+ return total_acked_bytes;
+ }
+
+ // Acceptability test for an incoming segment, based on its sequence
+ // number, its length and the current receive window.
+ template <typename InetTraits>
+ bool tcp<InetTraits>::tcb::segment_acceptable(tcp_sequence seg_seq, unsigned seg_len) {
+   if (_rcv.window == 0) {
+     // Zero window: only an empty segment with SEG.SEQ = RCV.NXT is
+     // acceptable; SEG.LEN > 0 is never acceptable.
+     return seg_len == 0 && seg_seq == _rcv.next;
+   }
+
+   // RCV.NXT =< s < RCV.NXT+RCV.WND
+   auto in_window = [this] (tcp_sequence s) {
+     return (_rcv.next <= s) && (s < _rcv.next + _rcv.window);
+   };
+
+   if (seg_len == 0) {
+     return in_window(seg_seq);
+   }
+   // Either the first or the last byte of the segment must be in the window:
+   // RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
+   // or
+   // RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND
+   const bool first_in = in_window(seg_seq);
+   const bool last_in = in_window(seg_seq + seg_len - 1);
+   return first_in || last_in;
+ }
+
+ // Initialise window, MSS and congestion state from the options and the
+ // header of the segment received during the handshake.
+ template <typename InetTraits>
+ void tcp<InetTraits>::tcb::init_from_options(tcp_hdr* th, uint8_t* opt_start, uint8_t* opt_end) {
+   _option.parse(opt_start, opt_end);
+
+   // Window scale factors: remote one for sending, local one for receiving
+   _snd.window_scale = _option._remote_win_scale;
+   _rcv.window_scale = _option._local_win_scale;
+
+   // Maximum segment size the remote can receive
+   _snd.mss = _option._remote_mss;
+   // Maximum segment size the local side can receive
+   _option._local_mss = local_mss();
+   _rcv.mss = _option._local_mss;
+
+   // Receive window: Linux's default window size, scaled
+   _rcv.window = 29200 << _rcv.window_scale;
+   // Send window: the window advertised by the peer, scaled
+   const uint32_t peer_window = th->window << _snd.window_scale;
+   _snd.window = peer_window;
+
+   // Sequence and acknowledgment number used for the last window update
+   _snd.wl1 = th->seq;
+   _snd.wl2 = th->ack;
+
+   // Initial congestion window: 2, 3 or 4 segments depending on the MSS
+   uint32_t initial_segments = 4;
+   if (2190 < _snd.mss) {
+     initial_segments = 2;
+   } else if (1095 < _snd.mss) {
+     initial_segments = 3;
+   }
+   _snd.cwnd = initial_segments * _snd.mss;
+
+   // Initial slow start threshold: the peer's advertised window
+   _snd.ssthresh = peer_window;
+ }
+
+ // Take the next chunk of unsent data, limited by can_send() and by what
+ // the NIC accepts in one packet. Returns an empty Packet if nothing is
+ // queued. The head of the unsent queue is either sent whole or split.
+ //
+ // Queued packets are never merged into one segment. The merge path that
+ // used to follow was unreachable and has been removed. Its tail-splitting
+ // step had already been disabled with this note:
+ // FIXME: this will result in calling "deleter" of packet which free managed
+ // objects will used later
+ template <typename InetTraits>
+ Packet tcp<InetTraits>::tcb::get_transmit_packet() {
+   // easy case: empty queue
+   if (_snd.unsent.empty()) {
+     return Packet();
+   }
+   auto can_send = this->can_send();
+   // Max number of TCP payloads we can pass to NIC
+   uint32_t len;
+   if (_tcp.get_hw_features().tx_tso) {
+     // FIXME: Info tap device the size of the split packet
+     len = _tcp.get_hw_features().max_packet_len - tcp_hdr_len_min - InetTraits::ip_hdr_len_min;
+   } else {
+     len = std::min(uint16_t(_tcp.get_hw_features().mtu - tcp_hdr_len_min - InetTraits::ip_hdr_len_min), _snd.mss);
+   }
+   can_send = std::min(can_send, len);
+
+   auto& head = _snd.unsent.front();
+   // easy case: the head packet fits, send it whole
+   if (head.len() <= can_send) {
+     auto p = std::move(head);
+     _snd.unsent.pop_front();
+     _snd.unsent_len -= p.len();
+     return p;
+   }
+   // moderate case: the head packet is too large, send only its first
+   // can_send bytes and keep the rest queued
+   auto p = head.share(0, can_send);
+   head.trim_front(can_send);
+   _snd.unsent_len -= p.len();
+   return p;
+ }
+
+ // Build one outgoing TCP segment and hand it to queue_packet().
+ //
+ // data_retransmit - when true, the payload is a share of the oldest
+ //                   unacknowledged segment (_snd.data.front()) and the
+ //                   sequence number is _snd.unacknowledged; _snd.next and
+ //                   the retransmit bookkeeping are left untouched.
+ //                   When false, the payload comes from get_transmit_packet().
+ //
+ // Returns immediately when the connection is in CLOSED state.
+ template <typename InetTraits>
+ void tcp<InetTraits>::tcb::output_one(bool data_retransmit) {
+ if (in_state(CLOSED)) {
+ return;
+ }
+
+ Packet p = data_retransmit ? _snd.data.front().p.share() : get_transmit_packet();
+ Packet clone = p.share(); // early clone to prevent share() from calling packet::unuse_internal_data() on header.
+ uint16_t len = p.len();
+ bool syn_on = syn_needs_on();
+ bool ack_on = ack_needs_on();
+
+ // Reserve room for the TCP header plus the options needed for these flags
+ auto options_size = _option.get_size(syn_on, ack_on);
+ auto th = p.prepend_header<tcp_hdr>(options_size);
+
+ th->src_port = _local_port;
+ th->dst_port = _foreign_port;
+
+ th->f_syn = syn_on;
+ th->f_ack = ack_on;
+ if (ack_on) {
+ // This segment carries the ACK, so a pending delayed ACK is not needed
+ clear_delayed_ack();
+ }
+ th->f_urg = false;
+ th->f_psh = false;
+
+ tcp_sequence seq;
+ if (data_retransmit) {
+ seq = _snd.unacknowledged;
+ } else {
+ seq = syn_on ? _snd.initial : _snd.next;
+ _snd.next += len;
+ }
+ th->seq = seq;
+ th->ack = _rcv.next;
+ // data_offset is expressed in 32-bit words
+ th->data_offset = (sizeof(*th) + options_size) / 4;
+ th->window = _rcv.window >> _rcv.window_scale;
+ th->checksum = 0;
+
+ // FIXME: does the FIN have to fit in the window?
+ bool fin_on = fin_needs_on();
+ th->f_fin = fin_on;
+
+ // Add tcp options
+ _option.fill(th, options_size);
+ // Convert the header fields to network byte order
+ *th = th->hton();
+
+ offload_info oi;
+ checksummer csum;
+ uint16_t pseudo_hdr_seg_len = 0;
+
+ oi.tcp_hdr_len = sizeof(tcp_hdr) + options_size;
+
+ if (_tcp.get_hw_features().tx_csum_l4_offload) {
+ oi.needs_csum = true;
+
+ //
+ // tx checksum offloading: both virtio-net's VIRTIO_NET_F_CSUM and dpdk's
+ // PKT_TX_TCP_CKSUM require th->checksum to be initialized to the ones'
+ // complement sum of the pseudo header.
+ //
+ // For TSO the csum should be calculated for a pseudo header with
+ // segment length set to 0. All the rest is the same as for a TCP Tx
+ // CSUM offload case.
+ //
+ if (_tcp.get_hw_features().tx_tso && len > _snd.mss) {
+ oi.tso_seg_size = _snd.mss;
+ } else {
+ pseudo_hdr_seg_len = sizeof(*th) + options_size + len;
+ }
+ } else {
+ pseudo_hdr_seg_len = sizeof(*th) + options_size + len;
+ oi.needs_csum = false;
+ }
+
+ InetTraits::tcp_pseudo_header_checksum(csum, _local_ip, _foreign_ip,
+ pseudo_hdr_seg_len);
+
+ if (_tcp.get_hw_features().tx_csum_l4_offload) {
+ // Offload: store only the complemented pseudo-header sum
+ th->checksum = ~csum.get();
+ } else {
+ // No offload: sum the whole packet in software
+ csum.sum(p);
+ th->checksum = csum.get();
+ }
+
+ oi.protocol = ip_protocol_num::tcp;
+
+ p.set_offload_info(oi);
+
+ // A first transmission that carries payload, SYN or FIN needs the
+ // retransmit timer; payload is additionally remembered in _snd.data.
+ if (!data_retransmit && (len || syn_on || fin_on)) {
+ auto now = clock_type::now();
+ if (len) {
+ unsigned nr_transmits = 0;
+ _snd.data.emplace_back(unacked_segment{std::move(clone),
+ len, nr_transmits, now});
+ }
+ if (!retransmit_fd) {
+ start_retransmit_timer();
+ }
+ }
+
+ queue_packet(std::move(p));
+ }
+
+ // True when there is no unacknowledged segment left and both the unsent
+ // and the queued byte counters are zero.
+ template <typename InetTraits>
+ bool tcp<InetTraits>::tcb::is_all_data_acked() {
+   return _snd.data.empty() && _snd.unsent_len == 0 && _snd.queued_len == 0;
+ }
+
+ // Drain the receive queue.
+ //
+ // Returns an empty optional when no data is buffered; otherwise one Packet
+ // made by appending every buffered packet in order. The queue is empty
+ // afterwards.
+ template <typename InetTraits>
+ std::optional<Packet> tcp<InetTraits>::tcb::read() {
+   std::optional<Packet> merged;
+   if (!_rcv.data.empty()) {
+     merged.emplace();
+     for (auto&& chunk : _rcv.data) {
+       merged->append(std::move(chunk));
+     }
+     _rcv.data.clear();
+   }
+   return merged;
+ }
+
+ // Queue a packet for transmission and kick the output path when sending
+ // is currently possible.
+ //
+ // Returns -ECONNRESET when the connection is in CLOSED state, otherwise the
+ // length of the queued packet. Asserts that the send side has not been
+ // closed and aborts when the user queue space cannot hold the packet.
+ template <typename InetTraits>
+ int tcp<InetTraits>::tcb::send(Packet p) {
+   // We can not send after the connection is closed
+   ceph_assert(!_snd.closed);
+
+   if (in_state(CLOSED)) {
+     return -ECONNRESET;
+   }
+
+   auto nbytes = p.len();
+   const bool reserved = _snd.user_queue_space.get_or_fail(nbytes);
+   if (!reserved) {
+     // note: caller must ensure enough queue space to send
+     ceph_abort();
+   }
+   // TODO: Handle p.len() > max user_queue_space case
+   // The bytes are counted as queued only for the duration of the hand-over
+   // to the unsent queue.
+   _snd.queued_len += nbytes;
+   _snd.unsent_len += nbytes;
+   _snd.queued_len -= nbytes;
+   _snd.unsent.push_back(std::move(p));
+   if (can_send() > 0) {
+     output();
+   }
+   return nbytes;
+ }
+
+ // Close the sending side of the connection.
+ //
+ // No-op when the connection is already CLOSED or the send side was closed
+ // before. Otherwise records -EPIPE, stops watching the connection fd and
+ // either finishes the close right away (everything acknowledged) or arms
+ // an event that waits for the outstanding data to be acknowledged.
+ template <typename InetTraits>
+ void tcp<InetTraits>::tcb::close() {
+   if (in_state(CLOSED) || _snd.closed) {
+     return;
+   }
+   // TODO: We should make this asynchronous
+
+   _errno = -EPIPE;
+   center->delete_file_event(fd, EVENT_READABLE|EVENT_WRITABLE);
+   if (is_all_data_acked()) {
+     close_final_cleanup();
+   } else {
+     _snd._all_data_acked_fd = manager.get_eventfd();
+     center->create_file_event(_snd._all_data_acked_fd, EVENT_READABLE, all_data_ack_event);
+   }
+ }
+
+ // Decide whether a received segment of seg_len bytes must be acknowledged
+ // right now.
+ //
+ // Returns true for a segment larger than the receive MSS and for every
+ // second full-sized segment; in both cases a pending delayed-ACK timer is
+ // cancelled. Otherwise returns false and makes sure a delayed-ACK timer is
+ // armed.
+ template <typename InetTraits>
+ bool tcp<InetTraits>::tcb::should_send_ack(uint16_t seg_len) {
+   // We've received a TSO packet, do ack immediately
+   if (seg_len > _rcv.mss) {
+     _nr_full_seg_received = 0;
+     clear_delayed_ack();
+     return true;
+   }
+
+   // We've received a full sized segment, ack for every second full sized
+   // segment. The counter is only touched for full sized segments.
+   if (seg_len == _rcv.mss && _nr_full_seg_received++ >= 1) {
+     _nr_full_seg_received = 0;
+     clear_delayed_ack();
+     return true;
+   }
+
+   // Arm the delayed ACK timer unless one is already pending.
+   // The maximum delayed ack timer allowed by RFC1122 is 500ms, most
+   // implementations use 200ms.
+   if (!_delayed_ack_fd) {
+     _delayed_ack_fd.emplace(center->create_time_event(200*1000, delayed_ack_event));
+   }
+   return false;
+ }
+
+ // Cancel the delayed-ACK timer if one is armed.
+ template <typename InetTraits>
+ void tcp<InetTraits>::tcb::clear_delayed_ack() {
+   if (!_delayed_ack_fd) {
+     return;
+   }
+   center->delete_time_event(*_delayed_ack_fd);
+   _delayed_ack_fd.reset();
+ }
+
+ // Move segments from the out-of-order map into the in-order receive queue
+ // as far as they continue at _rcv.next.
+ //
+ // Segments that lie entirely before _rcv.next are dropped; the walk stops
+ // at the first segment that starts after _rcv.next.
+ // Returns true when at least one segment was appended to _rcv.data.
+ template <typename InetTraits>
+ bool tcp<InetTraits>::tcb::merge_out_of_order() {
+ bool merged = false;
+ if (_rcv.out_of_order.map.empty()) {
+ return merged;
+ }
+ for (auto it = _rcv.out_of_order.map.begin(); it != _rcv.out_of_order.map.end();) {
+ auto& p = it->second;
+ auto seg_beg = it->first;
+ auto seg_len = p.len();
+ auto seg_end = seg_beg + seg_len;
+ if (seg_beg <= _rcv.next && seg_end > _rcv.next) {
+ // This segment has been received out of order and its previous
+ // segment has been received now
+ // Cut off the part that overlaps data we already have
+ auto trim = _rcv.next - seg_beg;
+ if (trim) {
+ p.trim_front(trim);
+ seg_len -= trim;
+ }
+ _rcv.next += seg_len;
+ _rcv.data.push_back(std::move(p));
+ // Since c++11, erase() always returns the value of the following element
+ it = _rcv.out_of_order.map.erase(it);
+ merged = true;
+ } else if (_rcv.next >= seg_end) {
+ // This segment has been received already, drop it
+ it = _rcv.out_of_order.map.erase(it);
+ } else {
+ // seg_beg > _rcv.next, can not merge. Note, seg_beg can grow only,
+ // so we can stop looking here.
+ it++;
+ break;
+ }
+ }
+ return merged;
+ }
+
+ // Store a segment that arrived ahead of _rcv.next: packet p, starting at
+ // sequence number seg, is handed to _rcv.out_of_order.merge().
+ template <typename InetTraits>
+ void tcp<InetTraits>::tcb::insert_out_of_order(tcp_sequence seg, Packet p) {
+ _rcv.out_of_order.merge(seg, std::move(p));
+ }
+
+ // Not implemented: calling this function terminates the process via abort().
+ template <typename InetTraits>
+ void tcp<InetTraits>::tcb::trim_receive_data_after_window() {
+ abort();
+ }
+
+ // Retransmit the oldest unacknowledged segment right away.
+ // Does nothing when no segment is awaiting acknowledgment.
+ template <typename InetTraits>
+ void tcp<InetTraits>::tcb::fast_retransmit() {
+   if (_snd.data.empty()) {
+     return;
+   }
+   // Account for the extra transmission of the front segment
+   ++_snd.data.front().nr_transmits;
+   retransmit_one();
+   output();
+ }
+
+ // Recompute the retransmission timeout from a new round-trip sample,
+ // following RFC6298.
+ //
+ // tx_time - the time the measured segment was transmitted; the sample is
+ //           now() - tx_time, truncated to microseconds.
+ template <typename InetTraits>
+ void tcp<InetTraits>::tcb::update_rto(clock_type::time_point tx_time) {
+   const auto sample = std::chrono::duration_cast<std::chrono::microseconds>(clock_type::now() - tx_time);
+   if (!_snd.first_rto_sample) {
+     // Subsequent samples, with alpha = 1/8 and beta = 1/4:
+     //   RTTVAR <- (1 - beta) * RTTVAR + beta * |SRTT - R'|
+     //   SRTT   <- (1 - alpha) * SRTT + alpha * R'
+     const auto deviation = _snd.srtt > sample ? (_snd.srtt - sample) : (sample - _snd.srtt);
+     _snd.rttvar = _snd.rttvar * 3 / 4 + deviation / 4;
+     _snd.srtt = _snd.srtt * 7 / 8 + sample / 8;
+   } else {
+     // First sample:
+     //   RTTVAR <- R/2
+     //   SRTT   <- R
+     _snd.first_rto_sample = false;
+     _snd.rttvar = sample / 2;
+     _snd.srtt = sample;
+   }
+   // RTO <- SRTT + max(G, K * RTTVAR), with K = 4
+   _rto = _snd.srtt + std::max(_rto_clk_granularity, 4 * _snd.rttvar);
+
+   // Keep the result within [_rto_min, _rto_max]
+   _rto = std::min(std::max(_rto, _rto_min), _rto_max);
+ }
+
+ // Grow the congestion window after acked_bytes bytes were acknowledged.
+ //
+ // Below ssthresh (slow start) the window grows by min(acked_bytes, MSS);
+ // otherwise (congestion avoidance) by MSS*MSS/cwnd, but at least 1.
+ template <typename InetTraits>
+ void tcp<InetTraits>::tcb::update_cwnd(uint32_t acked_bytes) {
+   const uint32_t smss = _snd.mss;
+   if (_snd.cwnd >= _snd.ssthresh) {
+     // In congestion avoidance phase
+     const uint32_t min_step = 1;
+     _snd.cwnd += std::max(min_step, smss * smss / _snd.cwnd);
+     return;
+   }
+   // In slow start phase
+   _snd.cwnd += std::min(acked_bytes, smss);
+ }
+
+
+ // Tear the connection state down: signal readability on the connection fd,
+ // mark the send side closed, drop all queued send/receive data, stop the
+ // retransmit and delayed-ACK timers, schedule a C_actual_remove_tcb event
+ // for this tcb and remove it from the tcb table.
+ template <typename InetTraits>
+ void tcp<InetTraits>::tcb::cleanup() {
+ manager.notify(fd, EVENT_READABLE);
+ _snd.closed = true;
+ _snd.unsent.clear();
+ _snd.data.clear();
+ _rcv.out_of_order.map.clear();
+ _rcv.data.clear();
+ stop_retransmit_timer();
+ clear_delayed_ack();
+ // NOTE(review): the event object is allocated with new and handed to the
+ // event center; presumably the center deletes it after running it - confirm.
+ center->dispatch_event_external(new tcp<InetTraits>::C_actual_remove_tcb(this));
+ remove_from_tcbs();
+ }
+
+ // Generate the initial sequence number for this connection.
+ //
+ // Per RFC6528, TCP SHOULD generate its Initial Sequence Numbers
+ // with the expression:
+ //   ISN = M + F(localip, localport, remoteip, remoteport, secretkey)
+ // M is the 4 microsecond timer; F is taken from the MD5 digest of the
+ // connection 4-tuple followed by the secret key.
+ template <typename InetTraits>
+ tcp_sequence tcp<InetTraits>::tcb::get_isn() {
+   using namespace std::chrono;
+   uint32_t hash[4];
+   static_assert(sizeof(hash) == 16, "hash[] receives the 16-byte MD5 digest");
+   hash[0] = _local_ip.ip;
+   hash[1] = _foreign_ip.ip;
+   // Widen before shifting so that ports >= 0x8000 do not shift into the
+   // sign bit of a promoted int.
+   hash[2] = (uint32_t(_local_port) << 16) + _foreign_port;
+   hash[3] = _isn_secret.key[15];
+   ceph::crypto::MD5 md5;
+   // Feed the 4-tuple as well as the secret; hashing the secret alone would
+   // give every connection the same offset.
+   md5.Update((const unsigned char*)hash, sizeof(hash));
+   md5.Update((const unsigned char*)_isn_secret.key, sizeof(_isn_secret.key));
+   md5.Final((unsigned char*)hash);
+   auto seq = hash[0];
+   auto m = duration_cast<microseconds>(clock_type::now().time_since_epoch());
+   seq += m.count() / 4;
+   return make_seq(seq);
+ }
+
+ // Hand the next queued L4 packet to the caller.
+ //
+ // Clears _poll_active, produces a segment via output_one() when the packet
+ // queue is empty, and returns an empty optional when the connection is in
+ // CLOSED state. Otherwise pops and returns the front of _packetq and calls
+ // output() again when more can be sent.
+ template <typename InetTraits>
+ std::optional<typename InetTraits::l4packet> tcp<InetTraits>::tcb::get_packet() {
+ _poll_active = false;
+ if (_packetq.empty()) {
+ output_one();
+ }
+
+ std::optional<typename InetTraits::l4packet> p;
+ if (in_state(CLOSED)) {
+ return p;
+ }
+
+ ceph_assert(!_packetq.empty());
+
+ p = std::move(_packetq.front());
+ _packetq.pop_front();
+ if (!_packetq.empty() || (_snd.dupacks < 3 && can_send() > 0)) {
+ // If there are packets to send in the queue or tcb is allowed to send
+ // more, add tcp back to polling set to keep sending. In addition, dupacks >= 3
+ // is an indication that a segment is lost, stop sending more in this case.
+ output();
+ }
+ return p;
+ }
+
+ // Closing the read side is currently a no-op; the notification below is
+ // commented out.
+ template <typename InetTraits>
+ void tcp<InetTraits>::connection::close_read() {
+ // do nothing
+ // _tcb->manager.notify(_tcb->fd, EVENT_READABLE);
+ }
+
+ // Closing the write side forwards to tcb::close().
+ template <typename InetTraits>
+ void tcp<InetTraits>::connection::close_write() {
+ _tcb->close();
+ }
+
+ // Out-of-class definitions for the static data members of tcb. The
+ // constexpr ones are declared with their values inside the class (not
+ // visible here); _isn_secret is the secret used by get_isn().
+ template <typename InetTraits>
+ constexpr uint16_t tcp<InetTraits>::tcb::_max_nr_retransmit;
+
+ template <typename InetTraits>
+ constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_min;
+
+ template <typename InetTraits>
+ constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_max;
+
+ template <typename InetTraits>
+ constexpr std::chrono::microseconds tcp<InetTraits>::tcb::_rto_clk_granularity;
+
+ template <typename InetTraits>
+ typename tcp<InetTraits>::tcb::isn_secret tcp<InetTraits>::tcb::_isn_secret;
+
+
+#endif /* TCP_HH_ */
diff --git a/src/msg/async/dpdk/UserspaceEvent.cc b/src/msg/async/dpdk/UserspaceEvent.cc
new file mode 100644
index 000000000..e0c57fd9b
--- /dev/null
+++ b/src/msg/async/dpdk/UserspaceEvent.cc
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "UserspaceEvent.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "dpdk "
+
+ // Allocate a userspace event fd.
+ //
+ // A previously closed fd is recycled when one is available; otherwise the
+ // next number above max_fd is used and the fd table is resized to hold it.
+ // Returns the fd; asserts that its table slot was unoccupied.
+ int UserspaceEventManager::get_eventfd()
+ {
+   int fd;
+   if (unused_fds.empty()) {
+     fd = ++max_fd;
+     fds.resize(fd + 1);
+   } else {
+     fd = unused_fds.front();
+     unused_fds.pop_front();
+   }
+
+   std::optional<UserspaceFDImpl> &slot = fds[fd];
+   ceph_assert(!slot);
+   slot.emplace();
+   ldout(cct, 20) << __func__ << " fd=" << fd << dendl;
+   return fd;
+ }
+
+ // Mark the events in mask as active on fd.
+ //
+ // When the fd listens for any of them and is not already queued, it is
+ // appended to waiting_fds so that poll() reports it.
+ // Returns 0 on success and -ENOENT when fd is out of range or not open.
+ int UserspaceEventManager::notify(int fd, int mask)
+ {
+   ldout(cct, 20) << __func__ << " fd=" << fd << " mask=" << mask << dendl;
+   if ((size_t)fd >= fds.size())
+     return -ENOENT;
+
+   std::optional<UserspaceFDImpl> &impl = fds[fd];
+   if (!impl)
+     return -ENOENT;
+
+   ldout(cct, 20) << __func__ << " activing=" << int(impl->activating_mask)
+                  << " listening=" << int(impl->listening_mask)
+                  << " waiting_idx=" << int(impl->waiting_idx) << dendl;
+
+   impl->activating_mask |= mask;
+   // Already queued for poll(): nothing more to do
+   if (impl->waiting_idx)
+     return 0;
+
+   if (impl->listening_mask & mask) {
+     // The new entry goes to index max_wait_idx + 1, so that index must be
+     // valid; comparing against max_wait_idx alone would write one element
+     // past the end when the vector is exactly full.
+     if (waiting_fds.size() <= max_wait_idx + 1)
+       waiting_fds.resize(waiting_fds.size()*2);
+     impl->waiting_idx = ++max_wait_idx;
+     waiting_fds[max_wait_idx] = fd;
+   }
+
+   ldout(cct, 20) << __func__ << " activing=" << int(impl->activating_mask)
+                  << " listening=" << int(impl->listening_mask)
+                  << " waiting_idx=" << int(impl->waiting_idx) << " done " << dendl;
+   return 0;
+ }
+
+ // Release fd.
+ //
+ // Unknown or already closed fds are ignored. The number is either given
+ // back by lowering max_fd (when it is the highest one) or remembered in
+ // unused_fds for reuse. When the fd has active events, its slot in
+ // waiting_fds is invalidated.
+ void UserspaceEventManager::close(int fd)
+ {
+   ldout(cct, 20) << __func__ << " fd=" << fd << dendl;
+   if ((size_t)fd >= fds.size()) {
+     return;
+   }
+
+   std::optional<UserspaceFDImpl> &entry = fds[fd];
+   if (!entry) {
+     return;
+   }
+
+   if (fd != max_fd) {
+     unused_fds.push_back(fd);
+   } else {
+     --max_fd;
+   }
+
+   if (entry->activating_mask) {
+     // Shrink the waiting list when this fd occupies its last slot
+     if (waiting_fds[max_wait_idx] == fd) {
+       ceph_assert(entry->waiting_idx == max_wait_idx);
+       --max_wait_idx;
+     }
+     waiting_fds[entry->waiting_idx] = -1;
+   }
+   entry.reset();
+ }
+
+ // Report pending events.
+ //
+ // events, masks - output arrays with room for num_events entries; entry k
+ //                 receives an fd and the events that are both listened
+ //                 for and active on it.
+ // num_events    - capacity of the output arrays, must be non-zero.
+ // tp            - not used by this implementation.
+ //
+ // Reported events are cleared from the fd's active mask and the fd leaves
+ // the waiting list. Returns the number of entries written.
+ int UserspaceEventManager::poll(int *events, int *masks, int num_events, struct timeval *tp)
+ {
+   int fd;
+   uint32_t i = 0;
+   int count = 0;
+   ceph_assert(num_events);
+   // leave zero slot for waiting_fds
+   while (i < max_wait_idx) {
+     fd = waiting_fds[++i];
+     if (fd == -1)
+       continue;
+
+     events[count] = fd;
+     std::optional<UserspaceFDImpl> &impl = fds[fd];
+     ceph_assert(impl);
+     masks[count] = impl->listening_mask & impl->activating_mask;
+     ceph_assert(masks[count]);
+     ldout(cct, 20) << __func__ << " fd=" << fd << " mask=" << masks[count] << dendl;
+     impl->activating_mask &= (~masks[count]);
+     impl->waiting_idx = 0;
+     if (++count >= num_events)
+       break;
+   }
+   // Move the entries that were not consumed to the front of the list and
+   // record their new position in each fd, so that a later unlisten()/close()
+   // invalidates the right slot.
+   const uint32_t remaining = max_wait_idx - i;
+   for (uint32_t k = 1; k <= remaining; ++k) {
+     const int moved = waiting_fds[i + k];
+     waiting_fds[k] = moved;
+     if (moved != -1) {
+       std::optional<UserspaceFDImpl> &moved_impl = fds[moved];
+       ceph_assert(moved_impl);
+       moved_impl->waiting_idx = k;
+     }
+   }
+   max_wait_idx = remaining;
+   return count;
+ }
diff --git a/src/msg/async/dpdk/UserspaceEvent.h b/src/msg/async/dpdk/UserspaceEvent.h
new file mode 100644
index 000000000..49308aca4
--- /dev/null
+++ b/src/msg/async/dpdk/UserspaceEvent.h
@@ -0,0 +1,106 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_USERSPACEEVENT_H
+#define CEPH_USERSPACEEVENT_H
+
+#include <cstddef>
+#include <errno.h>
+#include <string.h>
+
+#include <list>
+#include <optional>
+#include <vector>
+
+#include "include/ceph_assert.h"
+#include "include/int_types.h"
+
+class CephContext;
+
+ // Bookkeeping for userspace "file descriptors": small integers handed out
+ // by get_eventfd() on which events can be listened for (listen/unlisten),
+ // raised (notify) and collected (poll).
+ class UserspaceEventManager {
+   // State of one open fd.
+   struct UserspaceFDImpl {
+     uint32_t waiting_idx = 0;    // position in waiting_fds, 0 = not queued
+     int16_t read_errno = 0;
+     int16_t write_errno = 0;
+     int8_t listening_mask = 0;   // events the owner wants to hear about
+     int8_t activating_mask = 0;  // events raised and not yet reported
+     uint32_t magic = 4921;       // fixed marker verified by check()
+   };
+   CephContext *cct;
+   int max_fd = 0;
+   // Number of used slots in waiting_fds; slot 0 is never used.
+   uint32_t max_wait_idx = 0;
+   std::vector<std::optional<UserspaceFDImpl> > fds;
+   std::vector<int> waiting_fds;
+   std::list<uint32_t> unused_fds;
+
+  public:
+   explicit UserspaceEventManager(CephContext *c): cct(c) {
+     waiting_fds.resize(1024);
+   }
+
+   int get_eventfd();
+
+   // Start listening for the events in mask on fd. When one of the listened
+   // events is already active and the fd is not queued yet, it is queued for
+   // poll(). Returns 0, or -ENOENT when fd is out of range or not open.
+   int listen(int fd, int mask) {
+     if ((size_t)fd >= fds.size())
+       return -ENOENT;
+
+     std::optional<UserspaceFDImpl> &impl = fds[fd];
+     if (!impl)
+       return -ENOENT;
+
+     impl->listening_mask |= mask;
+     if ((impl->activating_mask & impl->listening_mask) && !impl->waiting_idx) {
+       // The new entry goes to index max_wait_idx + 1, so that index must be
+       // valid; comparing against max_wait_idx alone would write one element
+       // past the end when the vector is exactly full.
+       if (waiting_fds.size() <= max_wait_idx + 1)
+         waiting_fds.resize(waiting_fds.size()*2);
+       impl->waiting_idx = ++max_wait_idx;
+       waiting_fds[max_wait_idx] = fd;
+     }
+     return 0;
+   }
+
+   // Stop listening for the events in mask on fd. When no listened event
+   // remains active, the fd's slot in waiting_fds is invalidated.
+   // Returns 0, or -ENOENT when fd is out of range or not open.
+   int unlisten(int fd, int mask) {
+     if ((size_t)fd >= fds.size())
+       return -ENOENT;
+
+     std::optional<UserspaceFDImpl> &impl = fds[fd];
+     if (!impl)
+       return -ENOENT;
+
+     impl->listening_mask &= (~mask);
+     if (!(impl->activating_mask & impl->listening_mask) && impl->waiting_idx) {
+       if (waiting_fds[max_wait_idx] == fd) {
+         ceph_assert(impl->waiting_idx == max_wait_idx);
+         --max_wait_idx;
+       }
+       waiting_fds[impl->waiting_idx] = -1;
+       impl->waiting_idx = 0;
+     }
+     return 0;
+   }
+
+   int notify(int fd, int mask);
+   void close(int fd);
+   int poll(int *events, int *masks, int num_events, struct timeval *tp);
+
+   // Sanity check: true when every open fd still carries the expected magic.
+   bool check() {
+     for (auto &&m : fds) {
+       if (m && m->magic != 4921)
+         return false;
+     }
+     return true;
+   }
+ };
+
+#endif //CEPH_USERSPACEEVENT_H
diff --git a/src/msg/async/dpdk/align.h b/src/msg/async/dpdk/align.h
new file mode 100644
index 000000000..3b48f7899
--- /dev/null
+++ b/src/msg/async/dpdk/align.h
@@ -0,0 +1,50 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_DPDK_ALIGN_HH_
+#define CEPH_MSG_DPDK_ALIGN_HH_
+
+#include <cstdint>
+#include <cstdlib>
+
+ // Round v up to the next multiple of align.
+ // The bit trick requires align to be a power of two.
+ template <typename T>
+ inline constexpr T align_up(T v, T align) {
+   return (v + align - 1) & ~(align - 1);
+ }
+
+ // Round a byte pointer up to the next address that is a multiple of align
+ // (a power of two). Only pointers to 1-byte types are accepted.
+ template <typename T>
+ inline constexpr T* align_up(T* v, size_t align) {
+   static_assert(sizeof(T) == 1, "align byte pointers only");
+   // The template argument is spelled out: deducing it from a uintptr_t and a
+   // size_t argument fails on platforms where these are distinct types.
+   return reinterpret_cast<T*>(align_up<uintptr_t>(reinterpret_cast<uintptr_t>(v), align));
+ }
+
+ // Round v down to a multiple of align.
+ // The bit trick requires align to be a power of two.
+ template <typename T>
+ inline constexpr T align_down(T v, T align) {
+   return v & ~(align - 1);
+ }
+
+ // Round a byte pointer down to an address that is a multiple of align
+ // (a power of two). Only pointers to 1-byte types are accepted.
+ template <typename T>
+ inline constexpr T* align_down(T* v, size_t align) {
+   static_assert(sizeof(T) == 1, "align byte pointers only");
+   // The template argument is spelled out: deducing it from a uintptr_t and a
+   // size_t argument fails on platforms where these are distinct types.
+   return reinterpret_cast<T*>(align_down<uintptr_t>(reinterpret_cast<uintptr_t>(v), align));
+ }
+
+#endif /* CEPH_MSG_DPDK_ALIGN_HH_ */
diff --git a/src/msg/async/dpdk/array_map.h b/src/msg/async/dpdk/array_map.h
new file mode 100644
index 000000000..40f7728dc
--- /dev/null
+++ b/src/msg/async/dpdk/array_map.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_ARRAY_MAP_HH_
+#define CEPH_ARRAY_MAP_HH_
+
+#include <array>
+
+// unordered_map implemented as a simple array
+
+ // Fixed-size map from size_t keys in [0, Max) to Value, stored in a
+ // std::array. Keys without an explicit entry hold a value-initialized Value.
+ template <typename Value, size_t Max>
+ class array_map {
+   std::array<Value, Max> _a {};
+  public:
+   // Build the map from {key, value} pairs.
+   // Throws std::out_of_range when a key is not below Max.
+   array_map(std::initializer_list<std::pair<size_t, Value>> i) {
+     for (const auto& kv : i) {
+       at(kv.first) = kv.second;
+     }
+   }
+   // Unchecked access; key must be below Max.
+   Value& operator[](size_t key) { return _a[key]; }
+   const Value& operator[](size_t key) const { return _a[key]; }
+
+   // Checked access; throws std::out_of_range when key is not below Max.
+   Value& at(size_t key) {
+     if (key >= Max) {
+       throw std::out_of_range(std::to_string(key) + " >= " + std::to_string(Max));
+     }
+     return _a[key];
+   }
+   const Value& at(size_t key) const {
+     if (key >= Max) {
+       throw std::out_of_range(std::to_string(key) + " >= " + std::to_string(Max));
+     }
+     return _a[key];
+   }
+ };
+
+ #endif /* CEPH_ARRAY_MAP_HH_ */
diff --git a/src/msg/async/dpdk/byteorder.h b/src/msg/async/dpdk/byteorder.h
new file mode 100644
index 000000000..a996ec077
--- /dev/null
+++ b/src/msg/async/dpdk/byteorder.h
@@ -0,0 +1,58 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_BYTEORDER_H_
+#define CEPH_MSG_BYTEORDER_H_
+
+#include <arpa/inet.h> // for ntohs() and friends
+#include <iosfwd>
+#include <utility>
+
+ // Convert a 64-bit value from network (big-endian) to host byte order.
+ // On big-endian hosts the value is returned unchanged; elsewhere its bytes
+ // are reversed.
+ inline uint64_t ntohq(uint64_t v) {
+ #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+   return v;
+ #else
+   return __builtin_bswap64(v);
+ #endif
+ }
+ // Convert a 64-bit value from host to network (big-endian) byte order.
+ inline uint64_t htonq(uint64_t v) {
+ #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+   return v;
+ #else
+   return __builtin_bswap64(v);
+ #endif
+ }
+
+ // Overload set so that ntoh()/hton() can be called on any fixed-width
+ // integer: 8-bit values are returned unchanged, wider ones go through
+ // ntohs/ntohl/ntohq (resp. htons/htonl/htonq).
+ // The argument-less overloads do nothing; presumably they terminate
+ // variadic helpers defined elsewhere - confirm against callers.
+ inline void ntoh() {}
+ inline void hton() {}
+
+ inline uint8_t ntoh(uint8_t x) { return x; }
+ inline uint8_t hton(uint8_t x) { return x; }
+ inline uint16_t ntoh(uint16_t x) { return ntohs(x); }
+ inline uint16_t hton(uint16_t x) { return htons(x); }
+ inline uint32_t ntoh(uint32_t x) { return ntohl(x); }
+ inline uint32_t hton(uint32_t x) { return htonl(x); }
+ inline uint64_t ntoh(uint64_t x) { return ntohq(x); }
+ inline uint64_t hton(uint64_t x) { return htonq(x); }
+
+ // Signed variants reuse the unsigned conversions
+ inline int8_t ntoh(int8_t x) { return x; }
+ inline int8_t hton(int8_t x) { return x; }
+ inline int16_t ntoh(int16_t x) { return ntohs(x); }
+ inline int16_t hton(int16_t x) { return htons(x); }
+ inline int32_t ntoh(int32_t x) { return ntohl(x); }
+ inline int32_t hton(int32_t x) { return htonl(x); }
+ inline int64_t ntoh(int64_t x) { return ntohq(x); }
+ inline int64_t hton(int64_t x) { return htonq(x); }
+
+#endif /* CEPH_MSG_BYTEORDER_H_ */
diff --git a/src/msg/async/dpdk/capture.h b/src/msg/async/dpdk/capture.h
new file mode 100644
index 000000000..1ace8eeb0
--- /dev/null
+++ b/src/msg/async/dpdk/capture.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_MSG_DPDK_CAPTURE_H
+#define CEPH_MSG_DPDK_CAPTURE_H
+
+#include <utility>
+
+ // Callable that stores a value x together with a callable f and invokes
+ // f(x, args...) when called. Created through capture().
+ //
+ // T and F are used as the member types as given; when capture() deduces
+ // them from lvalues they are reference types, so the members are references.
+ template <typename T, typename F>
+ class capture_impl {
+ T x;
+ F f;
+ public:
+ // Copying from a non-const object is deleted
+ capture_impl(capture_impl &) = delete;
+ capture_impl( T && x, F && f )
+ : x{std::forward<T>(x)}, f{std::forward<F>(f)}
+ {}
+
+ // Call f with the stored value as first argument, followed by args
+ template <typename ...Ts> auto operator()( Ts&&...args )
+ -> decltype(f( x, std::forward<Ts>(args)... ))
+ {
+ return f( x, std::forward<Ts>(args)... );
+ }
+
+ // Same, for a const capture_impl
+ template <typename ...Ts> auto operator()( Ts&&...args ) const
+ -> decltype(f( x, std::forward<Ts>(args)... ))
+ {
+ return f( x, std::forward<Ts>(args)... );
+ }
+ };
+
+ // Bundle x and f into a capture_impl, so that calling the result with
+ // args... invokes f(x, args...). Both arguments are perfectly forwarded.
+ template <typename T, typename F>
+ capture_impl<T,F> capture( T && x, F && f ) {
+ return capture_impl<T,F>(
+ std::forward<T>(x), std::forward<F>(f) );
+ }
+#endif //CEPH_MSG_DPDK_CAPTURE_H
diff --git a/src/msg/async/dpdk/circular_buffer.h b/src/msg/async/dpdk/circular_buffer.h
new file mode 100644
index 000000000..2c92c1204
--- /dev/null
+++ b/src/msg/async/dpdk/circular_buffer.h
@@ -0,0 +1,347 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_CIRCULAR_BUFFER_HH_
+#define CEPH_CIRCULAR_BUFFER_HH_
+
+// A growable double-ended queue container that can be efficiently
+// extended (and shrunk) from both ends. Implementation is a single
+// storage vector.
+//
+// Similar to libstdc++'s std::deque, except that it uses a single level
+// store, and so is more efficient for simple stored items.
+// Similar to boost::circular_buffer_space_optimized, except it uses
+// uninitialized storage for unoccupied elements (and thus move/copy
+// constructors instead of move/copy assignments, which are less efficient).
+
+#include <memory>
+#include <algorithm>
+
+#include "transfer.h"
+
+ // Growable double-ended queue backed by a single storage array.
+ //
+ // begin and end are free-running indices; an element position is obtained
+ // by masking an index with capacity - 1 (see mask()). The buffer can be
+ // moved from but is neither copyable nor assignable.
+ template <typename T, typename Alloc = std::allocator<T>>
+ class circular_buffer {
+ // The allocator is the base class of the state struct
+ struct impl : Alloc {
+ T* storage = nullptr;
+ // begin, end interpreted (mod capacity)
+ size_t begin = 0;
+ size_t end = 0;
+ size_t capacity = 0;
+ };
+ impl _impl;
+ public:
+ using value_type = T;
+ using size_type = size_t;
+ using reference = T&;
+ using pointer = T*;
+ using const_reference = const T&;
+ using const_pointer = const T*;
+ public:
+ circular_buffer() = default;
+ circular_buffer(circular_buffer&& X);
+ circular_buffer(const circular_buffer& X) = delete;
+ ~circular_buffer();
+ circular_buffer& operator=(const circular_buffer&) = delete;
+ circular_buffer& operator=(circular_buffer&&) = delete;
+ void push_front(const T& data);
+ void push_front(T&& data);
+ template <typename... A>
+ void emplace_front(A&&... args);
+ void push_back(const T& data);
+ void push_back(T&& data);
+ template <typename... A>
+ void emplace_back(A&&... args);
+ T& front();
+ T& back();
+ void pop_front();
+ void pop_back();
+ bool empty() const;
+ size_t size() const;
+ size_t capacity() const;
+ T& operator[](size_t idx);
+ // Call func on every element, from front to back
+ template <typename Func>
+ void for_each(Func func);
+ // access an element, may return wrong or destroyed element
+ // only useful if you do not rely on data accuracy (e.g. prefetch)
+ T& access_element_unsafe(size_t idx);
+ private:
+ void expand();
+ void maybe_expand(size_t nr = 1);
+ size_t mask(size_t idx) const;
+
+ // Random access iterator holding a pointer to the buffer and a
+ // free-running index; comparisons look at the index only.
+ template<typename CB, typename ValueType>
+ struct cbiterator : std::iterator<std::random_access_iterator_tag, ValueType> {
+ typedef std::iterator<std::random_access_iterator_tag, ValueType> super_t;
+
+ ValueType& operator*() const { return cb->_impl.storage[cb->mask(idx)]; }
+ ValueType* operator->() const { return &cb->_impl.storage[cb->mask(idx)]; }
+ // prefix
+ cbiterator<CB, ValueType>& operator++() {
+ idx++;
+ return *this;
+ }
+ // postfix
+ cbiterator<CB, ValueType> operator++(int unused) {
+ auto v = *this;
+ idx++;
+ return v;
+ }
+ // prefix
+ cbiterator<CB, ValueType>& operator--() {
+ idx--;
+ return *this;
+ }
+ // postfix
+ cbiterator<CB, ValueType> operator--(int unused) {
+ auto v = *this;
+ idx--;
+ return v;
+ }
+ cbiterator<CB, ValueType> operator+(typename super_t::difference_type n) const {
+ return cbiterator<CB, ValueType>(cb, idx + n);
+ }
+ cbiterator<CB, ValueType> operator-(typename super_t::difference_type n) const {
+ return cbiterator<CB, ValueType>(cb, idx - n);
+ }
+ cbiterator<CB, ValueType>& operator+=(typename super_t::difference_type n) {
+ idx += n;
+ return *this;
+ }
+ cbiterator<CB, ValueType>& operator-=(typename super_t::difference_type n) {
+ idx -= n;
+ return *this;
+ }
+ bool operator==(const cbiterator<CB, ValueType>& rhs) const {
+ return idx == rhs.idx;
+ }
+ bool operator!=(const cbiterator<CB, ValueType>& rhs) const {
+ return idx != rhs.idx;
+ }
+ bool operator<(const cbiterator<CB, ValueType>& rhs) const {
+ return idx < rhs.idx;
+ }
+ bool operator>(const cbiterator<CB, ValueType>& rhs) const {
+ return idx > rhs.idx;
+ }
+ bool operator>=(const cbiterator<CB, ValueType>& rhs) const {
+ return idx >= rhs.idx;
+ }
+ bool operator<=(const cbiterator<CB, ValueType>& rhs) const {
+ return idx <= rhs.idx;
+ }
+ typename super_t::difference_type operator-(const cbiterator<CB, ValueType>& rhs) const {
+ return idx - rhs.idx;
+ }
+ private:
+ CB* cb;
+ size_t idx;
+ // Only circular_buffer creates iterators
+ cbiterator<CB, ValueType>(CB* b, size_t i) : cb(b), idx(i) {}
+ friend class circular_buffer;
+ };
+ friend class iterator;
+
+ public:
+ typedef cbiterator<circular_buffer, T> iterator;
+ typedef cbiterator<const circular_buffer, const T> const_iterator;
+
+ iterator begin() {
+ return iterator(this, _impl.begin);
+ }
+ const_iterator begin() const {
+ return const_iterator(this, _impl.begin);
+ }
+ iterator end() {
+ return iterator(this, _impl.end);
+ }
+ const_iterator end() const {
+ return const_iterator(this, _impl.end);
+ }
+ const_iterator cbegin() const {
+ return const_iterator(this, _impl.begin);
+ }
+ const_iterator cend() const {
+ return const_iterator(this, _impl.end);
+ }
+ };
+
+ // Map a free-running index to a position in the storage array by keeping
+ // the bits selected by capacity - 1.
+ template <typename T, typename Alloc>
+ inline size_t circular_buffer<T, Alloc>::mask(size_t idx) const {
+ return idx & (_impl.capacity - 1);
+ }
+
+ // True when the buffer holds no element (begin equals end).
+ template <typename T, typename Alloc>
+ inline bool circular_buffer<T, Alloc>::empty() const {
+ return _impl.begin == _impl.end;
+ }
+
+ // Number of stored elements (end - begin).
+ template <typename T, typename Alloc>
+ inline size_t circular_buffer<T, Alloc>::size() const {
+ return _impl.end - _impl.begin;
+ }
+
+ // Number of element slots currently allocated.
+ template <typename T, typename Alloc>
+ inline size_t circular_buffer<T, Alloc>::capacity() const {
+ return _impl.capacity;
+ }
+
+ // Move constructor: takes over x's state and resets x's state to the
+ // defaults (null storage, zero indices and capacity).
+ template <typename T, typename Alloc>
+ inline circular_buffer<T, Alloc>::circular_buffer(circular_buffer&& x)
+ : _impl(std::move(x._impl)) {
+ x._impl = {};
+ }
+
+ // Call func(element) for every stored element, from begin to end.
+ template <typename T, typename Alloc>
+ template <typename Func>
+ inline void circular_buffer<T, Alloc>::for_each(Func func) {
+ auto s = _impl.storage;
+ // Same mask as mask(): capacity - 1
+ auto m = _impl.capacity - 1;
+ for (auto i = _impl.begin; i != _impl.end; ++i) {
+ func(s[i & m]);
+ }
+ }
+
+ // Destroy every stored element and release the storage.
+ template <typename T, typename Alloc>
+ inline circular_buffer<T, Alloc>::~circular_buffer() {
+   for_each([this] (T& obj) {
+     // Go through allocator_traits: a member destroy() is optional for
+     // allocators, and std::allocator deprecates it in C++17 and drops it
+     // in C++20.
+     std::allocator_traits<Alloc>::destroy(_impl, &obj);
+   });
+   _impl.deallocate(_impl.storage, _impl.capacity);
+ }
+
+ // Grow the storage to twice the current capacity (1 when it was 0) and
+ // relocate the elements to the start of the new storage.
+ //
+ // The relocation runs in two passes through transfer_pass1()/transfer_pass2()
+ // (from transfer.h; presumably pass 1 constructs the copies and pass 2
+ // disposes of the sources - confirm there). When pass 1 throws, the
+ // elements created so far are destroyed, the new storage is released and
+ // the exception is rethrown.
+ template <typename T, typename Alloc>
+ void circular_buffer<T, Alloc>::expand() {
+ auto new_cap = std::max<size_t>(_impl.capacity * 2, 1);
+ auto new_storage = _impl.allocate(new_cap);
+ auto p = new_storage;
+ try {
+ for_each([this, &p] (T& obj) {
+ transfer_pass1(_impl, &obj, p);
+ p++;
+ });
+ } catch (...) {
+ while (p != new_storage) {
+ _impl.destroy(--p);
+ }
+ _impl.deallocate(new_storage, new_cap);
+ throw;
+ }
+ p = new_storage;
+ for_each([this, &p] (T& obj) {
+ transfer_pass2(_impl, &obj, p++);
+ });
+ // After the swaps new_storage/new_cap name the old storage and capacity
+ std::swap(_impl.storage, new_storage);
+ std::swap(_impl.capacity, new_cap);
+ _impl.begin = 0;
+ _impl.end = p - _impl.storage;
+ _impl.deallocate(new_storage, new_cap);
+ }
+
+ // Grow the storage (one expand() step) when nr more elements would not fit
+ // into the current capacity.
+ template <typename T, typename Alloc>
+ inline void circular_buffer<T, Alloc>::maybe_expand(size_t nr) {
+ if (_impl.end - _impl.begin + nr > _impl.capacity) {
+ expand();
+ }
+ }
+
+ // Copy-construct data in the slot just before the current front, growing
+ // the storage first when it is full.
+ template <typename T, typename Alloc>
+ inline void circular_buffer<T, Alloc>::push_front(const T& data) {
+   maybe_expand();
+   auto p = &_impl.storage[mask(_impl.begin - 1)];
+   // allocator_traits instead of Alloc::construct(): the member is optional
+   // for allocators and std::allocator drops it in C++20.
+   std::allocator_traits<Alloc>::construct(_impl, p, data);
+   --_impl.begin;
+ }
+
+ // Move-construct data in the slot just before the current front, growing
+ // the storage first when it is full.
+ template <typename T, typename Alloc>
+ inline void circular_buffer<T, Alloc>::push_front(T&& data) {
+   maybe_expand();
+   auto p = &_impl.storage[mask(_impl.begin - 1)];
+   // allocator_traits instead of Alloc::construct(): the member is optional
+   // for allocators and std::allocator drops it in C++20.
+   std::allocator_traits<Alloc>::construct(_impl, p, std::move(data));
+   --_impl.begin;
+ }
+
+ // Construct a new element from args in the slot just before the current
+ // front, growing the storage first when it is full.
+ template <typename T, typename Alloc>
+ template <typename... Args>
+ inline void circular_buffer<T, Alloc>::emplace_front(Args&&... args) {
+   maybe_expand();
+   auto p = &_impl.storage[mask(_impl.begin - 1)];
+   // allocator_traits instead of Alloc::construct(): the member is optional
+   // for allocators and std::allocator drops it in C++20.
+   std::allocator_traits<Alloc>::construct(_impl, p, std::forward<Args>(args)...);
+   --_impl.begin;
+ }
+
+ // Copy-construct data in the slot after the current back, growing the
+ // storage first when it is full.
+ template <typename T, typename Alloc>
+ inline void circular_buffer<T, Alloc>::push_back(const T& data) {
+   maybe_expand();
+   auto p = &_impl.storage[mask(_impl.end)];
+   // allocator_traits instead of Alloc::construct(): the member is optional
+   // for allocators and std::allocator drops it in C++20.
+   std::allocator_traits<Alloc>::construct(_impl, p, data);
+   ++_impl.end;
+ }
+
+ // Move-construct data in the slot after the current back, growing the
+ // storage first when it is full.
+ template <typename T, typename Alloc>
+ inline void circular_buffer<T, Alloc>::push_back(T&& data) {
+   maybe_expand();
+   auto p = &_impl.storage[mask(_impl.end)];
+   // allocator_traits instead of Alloc::construct(): the member is optional
+   // for allocators and std::allocator drops it in C++20.
+   std::allocator_traits<Alloc>::construct(_impl, p, std::move(data));
+   ++_impl.end;
+ }
+
+ // Construct a new element from args in the slot after the current back,
+ // growing the storage first when it is full.
+ template <typename T, typename Alloc>
+ template <typename... Args>
+ inline void circular_buffer<T, Alloc>::emplace_back(Args&&... args) {
+   maybe_expand();
+   auto p = &_impl.storage[mask(_impl.end)];
+   // allocator_traits instead of Alloc::construct(): the member is optional
+   // for allocators and std::allocator drops it in C++20.
+   std::allocator_traits<Alloc>::construct(_impl, p, std::forward<Args>(args)...);
+   ++_impl.end;
+ }
+
+ // Reference to the first element. No emptiness check is performed.
+ template <typename T, typename Alloc>
+ inline T& circular_buffer<T, Alloc>::front() {
+ return _impl.storage[mask(_impl.begin)];
+ }
+
+ // Reference to the last element. No emptiness check is performed.
+ template <typename T, typename Alloc>
+ inline T& circular_buffer<T, Alloc>::back() {
+ return _impl.storage[mask(_impl.end - 1)];
+ }
+
+ // Destroy the first element and advance the front index.
+ // No emptiness check is performed.
+ template <typename T, typename Alloc>
+ inline void circular_buffer<T, Alloc>::pop_front() {
+   // allocator_traits instead of Alloc::destroy(): the member is optional
+   // for allocators and std::allocator drops it in C++20.
+   std::allocator_traits<Alloc>::destroy(_impl, &front());
+   ++_impl.begin;
+ }
+
+ // Destroy the last element and step the back index down.
+ // No emptiness check is performed.
+ template <typename T, typename Alloc>
+ inline void circular_buffer<T, Alloc>::pop_back() {
+   // allocator_traits instead of Alloc::destroy(): the member is optional
+   // for allocators and std::allocator drops it in C++20.
+   std::allocator_traits<Alloc>::destroy(_impl, &back());
+   --_impl.end;
+ }
+
+ // Reference to the element idx positions after the front.
+ // idx is not checked against size().
+ template <typename T, typename Alloc>
+ inline T& circular_buffer<T, Alloc>::operator[](size_t idx) {
+ return _impl.storage[mask(_impl.begin + idx)];
+ }
+
+ // Same address computation as operator[]; per the class declaration the
+ // result may be a wrong or destroyed element, so it is only suitable for
+ // uses that do not rely on the data (e.g. prefetch).
+ template <typename T, typename Alloc>
+ inline T& circular_buffer<T, Alloc>::access_element_unsafe(size_t idx) {
+ return _impl.storage[mask(_impl.begin + idx)];
+ }
+
+#endif /* CEPH_CIRCULAR_BUFFER_HH_ */
diff --git a/src/msg/async/dpdk/const.h b/src/msg/async/dpdk/const.h
new file mode 100644
index 000000000..ea5dc49e5
--- /dev/null
+++ b/src/msg/async/dpdk/const.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_CONST_H_
+#define CEPH_MSG_CONST_H_
+
+#include <stdint.h>
+
+enum class ip_protocol_num : uint8_t { // IP protocol numbers; icmp = 1 and tcp = 6 match the IANA assignments
+ icmp = 1, tcp = 6, unused = 255
+};
+
+enum class eth_protocol_num : uint16_t { // EtherType values as host-order integers; net.cc applies hton() when writing eth_hdr::eth_proto
+ ipv4 = 0x0800, arp = 0x0806, ipv6 = 0x86dd
+};
+
+const uint8_t eth_hdr_len = 14; // equals the packed eth_hdr layout: two 6-byte MACs + 16-bit protocol
+const uint8_t tcp_hdr_len_min = 20; // presumably bytes (TCP header without options) -- TODO confirm
+const uint8_t ipv4_hdr_len_min = 20; // presumably bytes (IPv4 header without options) -- TODO confirm
+const uint8_t ipv6_hdr_len_min = 40; // presumably bytes (fixed IPv6 header) -- TODO confirm
+const uint16_t ip_packet_len_max = 65535; // largest value representable in 16 bits
+
+#endif
diff --git a/src/msg/async/dpdk/dpdk_rte.cc b/src/msg/async/dpdk/dpdk_rte.cc
new file mode 100644
index 000000000..1b110c939
--- /dev/null
+++ b/src/msg/async/dpdk/dpdk_rte.cc
@@ -0,0 +1,204 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <bitset>
+
+#include <rte_config.h>
+#include <rte_common.h>
+#include <rte_ethdev.h>
+#include <rte_version.h>
+
+#include "include/str_map.h"
+
+#include "DPDK.h"
+#include "dpdk_rte.h"
+
+namespace dpdk {
+
+  // Copy str into a mutable char buffer with a trailing NUL, so that data()
+  // can be passed where a writable C string (e.g. an argv entry) is needed.
+  static inline std::vector<char> string2vector(std::string str) {
+    return std::vector<char>(str.c_str(), str.c_str() + str.size() + 1);
+  }
+
+  // Number of set bits in n.
+  static int bitcount(unsigned long long n) {
+    return static_cast<int>(std::bitset<CHAR_BIT * sizeof(n)>(n).count());
+  }
+
+  // Count the set bits in the value of one hexadecimal digit character.
+  // c is not validated here; coremask_bitcount() checks isxdigit() first.
+  static int hex2bitcount(unsigned char c)
+  {
+    int nibble;
+    if (isdigit(c)) {
+      nibble = c - '0';
+    } else {
+      nibble = c - (isupper(c) ? 'A' : 'a') + 10;
+    }
+    return bitcount(nibble);
+  }
+
+  // Count the cores selected by a hex core mask (optional "0x"/"0X" prefix).
+  // Returns the number of set bits, or -EINVAL on a non-hex character.
+  static int coremask_bitcount(const char *buf)
+  {
+    if (buf[0] == '0' && (buf[1] == 'x' || buf[1] == 'X'))
+      buf += 2;
+    int count = 0;
+    for (; *buf != '\0'; ++buf) {
+      // <cctype> classifiers are undefined for negative plain-char values.
+      const unsigned char c = static_cast<unsigned char>(*buf);
+      if (isxdigit(c) == 0)
+        return -EINVAL;
+      count += hex2bitcount(c);
+    }
+    return count;
+  }
+
+ bool eal::rte_initialized = false;
+
+  // Start the DPDK EAL on a dedicated thread and wait for its initialization.
+  // Returns 1 if this eal is already initialized, 0 on success, -EINVAL if
+  // ms_dpdk_coremask is malformed, selects no core or selects fewer cores
+  // than ms_async_op_threads, and -EIO if rte_eal_init() fails.
+  int eal::start()
+  {
+    if (initialized) {
+      return 1;
+    }
+
+    bool done = false;
+    auto coremask = cct->_conf.get_val<std::string>("ms_dpdk_coremask");
+    int coremaskbit = coremask_bitcount(coremask.c_str());
+
+    if (coremaskbit <= 0
+        || static_cast<uint64_t>(coremaskbit) < cct->_conf->ms_async_op_threads)
+      return -EINVAL;
+
+    // Capturing `done` by reference is safe: start() waits for it below.
+    t = std::thread([&]() {
+      // TODO: Inherit these from the app parameters - "opts"
+      std::vector<std::vector<char>> args {
+        string2vector("ceph"),
+        string2vector("-c"), string2vector(cct->_conf.get_val<std::string>("ms_dpdk_coremask")),
+        string2vector("-n"), string2vector(cct->_conf->ms_dpdk_memory_channel),
+      };
+      std::optional<std::string> hugepages_path;
+      if (!cct->_conf->ms_dpdk_hugepages.empty()) {
+        hugepages_path.emplace(cct->_conf->ms_dpdk_hugepages);
+      }
+
+      // If "hugepages" is not provided and DPDK PMD drivers mode is requested -
+      // use the default DPDK huge tables configuration.
+      // NOTE(review): the else-branch below adds --no-huge when ms_dpdk_pmd is
+      // non-empty, which reads as the opposite of the comment above -- confirm.
+      if (hugepages_path) {
+        args.push_back(string2vector("--huge-dir"));
+        args.push_back(string2vector(*hugepages_path));
+        // We don't know what is going to be our networking configuration so we
+        // assume there is going to be a queue per-CPU. Plus we'll give a DPDK
+        // 64MB for "other stuff". The CPU count is the number of bits set in
+        // the fixed mask 0xfffefffe.
+        const size_t size_MB = mem_size(bitcount(0xfffefffeULL)) >> 20;
+        std::stringstream size_MB_str;
+        size_MB_str << size_MB;
+        args.push_back(string2vector("-m"));
+        args.push_back(string2vector(size_MB_str.str()));
+      } else if (!cct->_conf->ms_dpdk_pmd.empty()) {
+        args.push_back(string2vector("--no-huge"));
+      }
+      for_each_pair(cct->_conf.get_val<std::string>("ms_dpdk_devs_allowlist"), " ",
+                    [&args] (std::string_view key, std::string_view val) {
+        args.push_back(string2vector(std::string(key)));
+        if (!val.empty()) {
+          args.push_back(string2vector(std::string(val)));
+        }
+      });
+      std::string rte_file_prefix = "rte_";
+      rte_file_prefix += cct->_conf->name.to_str();
+      args.push_back(string2vector("--file-prefix"));
+      args.push_back(string2vector(rte_file_prefix));
+
+      std::vector<char*> cargs;
+      for (auto&& a: args) {
+        cargs.push_back(a.data());
+      }
+      if (!rte_initialized) {
+        /* initialise the EAL for all */
+        int ret = rte_eal_init(cargs.size(), cargs.data());
+        if (ret < 0) {
+          std::unique_lock locker(lock);
+          done = true;
+          cond.notify_all();
+          return;  // `initialized` stays false, so start() reports -EIO
+        }
+        rte_initialized = true;
+      }
+      std::unique_lock locker(lock);
+      initialized = true;
+      done = true;
+      cond.notify_all();
+      // Serve queued functions (run with `lock` held) until stop() is requested.
+      while (!stopped) {
+        cond.wait(locker, [this] { return !funcs.empty() || stopped; });
+        if (!funcs.empty()) {
+          auto f = std::move(funcs.front());
+          funcs.pop_front();
+          f();
+          cond.notify_all();
+        }
+      }
+    });
+    std::unique_lock locker(lock);
+    cond.wait(locker, [&] { return done; });
+    if (initialized) return 0;
+    // Initialization failed and the thread is exiting: join it so `t` is not
+    // left joinable (destroying a joinable std::thread calls std::terminate).
+    locker.unlock();
+    t.join();
+    return -EIO;
+  }
+
+  // Estimate the memory (in bytes; start() shifts the result down to MB)
+  // that DPDK needs when the application uses num_cpus CPUs.
+  size_t eal::mem_size(int num_cpus)
+  {
+    //
+    // PMD mempool memory:
+    //
+    // We don't know what is going to be our networking configuration so we
+    // assume there is going to be a queue per-CPU.
+    //
+    const size_t mempool_bytes = num_cpus * qp_mempool_obj_size();
+
+    // Plus we'll give a DPDK 64MB for "other stuff".
+    const size_t other_bytes = 64UL << 20;
+    return mempool_bytes + other_bytes;
+  }
+
+  // Ask the EAL thread to leave its service loop and wait for it to exit.
+  void eal::stop() {
+    assert(initialized);
+    assert(!stopped);
+    { std::lock_guard<std::mutex> l(lock); stopped = true; }  // under lock: the waiter cannot miss the wake-up
+    cond.notify_all();
+    t.join();
+  }
+
+} // namespace dpdk
diff --git a/src/msg/async/dpdk/dpdk_rte.h b/src/msg/async/dpdk/dpdk_rte.h
new file mode 100644
index 000000000..6784af6d4
--- /dev/null
+++ b/src/msg/async/dpdk/dpdk_rte.h
@@ -0,0 +1,79 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef CEPH_DPDK_RTE_H_
+#define CEPH_DPDK_RTE_H_
+
+
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+
+#include <bitset>
+#include <rte_config.h>
+#include <rte_version.h>
+#include <boost/program_options.hpp>
+
+/*********************** Compat section ***************************************/
+// We currently support only versions 2.0 and above.
+#if (RTE_VERSION < RTE_VERSION_NUM(2,0,0,0))
+#error "DPDK version above 2.0.0 is required"
+#endif
+
+#if defined(RTE_MBUF_REFCNT_ATOMIC)
+#warning "CONFIG_RTE_MBUF_REFCNT_ATOMIC should be disabled in DPDK's " \
+ "config/common_linuxapp"
+#endif
+/******************************************************************************/
+
+namespace dpdk {
+
+// DPDK Environment Abstraction Layer: owns the thread that start() launches,
+// on which the EAL is initialized and queued functions are executed.
+class eal {
+ public:
+  using cpuset = std::bitset<RTE_MAX_LCORE>;
+  explicit eal(CephContext *cct) : cct(cct) {}
+  int start();
+  void stop();
+  // Queue f for the thread launched by start() and block until it has run.
+  void execute_on_master(std::function<void()> &&f) {
+    bool finished = false;
+    std::unique_lock<std::mutex> guard(lock);
+    funcs.emplace_back([&]() { f(); finished = true; });
+    cond.notify_all();
+    cond.wait(guard, [&finished] { return finished; });
+  }
+  /**
+   * Returns the amount of memory needed for DPDK
+   * @param num_cpus Number of CPUs the application is going to use
+   * @return the estimated size (start() converts it to MB with >> 20)
+   */
+  size_t mem_size(int num_cpus);
+  static bool rte_initialized;
+ private:
+  CephContext *cct;
+  bool initialized = false;
+  bool stopped = false;
+  std::thread t;
+  std::mutex lock;
+  std::condition_variable cond;
+  std::list<std::function<void()>> funcs;
+};
+
+} // namespace dpdk
+#endif // CEPH_DPDK_RTE_H_
diff --git a/src/msg/async/dpdk/ethernet.cc b/src/msg/async/dpdk/ethernet.cc
new file mode 100644
index 000000000..9aca50788
--- /dev/null
+++ b/src/msg/async/dpdk/ethernet.cc
@@ -0,0 +1,16 @@
+#include <iomanip>
+
+#include "ethernet.h"
+
+// Print as six zero-padded hex octets ("0a:1b:..."); the caller's flags and fill are restored.
+std::ostream& operator<<(std::ostream& os, const ethernet_address& ea) {
+  const auto saved_flags = os.flags();
+  const auto saved_fill = os.fill('0');
+  os << std::hex;
+  for (size_t i = 0; i < ea.mac.size(); ++i)
+    // setw() applies to the next insertion only, so it is repeated per octet.
+    (i ? os << ":" : os) << std::setw(2) << uint32_t(ea.mac[i]);
+  os.fill(saved_fill);
+  os.flags(saved_flags);
+  return os;
+}
diff --git a/src/msg/async/dpdk/ethernet.h b/src/msg/async/dpdk/ethernet.h
new file mode 100644
index 000000000..b007425fe
--- /dev/null
+++ b/src/msg/async/dpdk/ethernet.h
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_ETHERNET_H_
+#define CEPH_MSG_ETHERNET_H_
+
+#include <array>
+#include <sstream>
+
+#include "include/ceph_assert.h"
+#include "byteorder.h"
+
+// A 48-bit Ethernet (MAC) address stored as six raw octets.
+struct ethernet_address {
+  // Zero-fill so a default-constructed address never holds indeterminate bytes.
+  ethernet_address() : mac{} {}
+  // Copies six octets starting at eaddr.
+  ethernet_address(const uint8_t *eaddr) {
+    std::copy(eaddr, eaddr + 6, mac.begin());
+  }
+  // Asserts that the list holds exactly mac.size() octets.
+  ethernet_address(std::initializer_list<uint8_t> eaddr) {
+    ceph_assert(eaddr.size() == mac.size());
+    std::copy(eaddr.begin(), eaddr.end(), mac.begin());
+  }
+  // Both byte-order helpers return the address unchanged; they are const so
+  // they can also be called on const objects.
+  ethernet_address ntoh() const { return *this; }
+  ethernet_address hton() const { return *this; }
+
+  std::array<uint8_t, 6> mac;
+} __attribute__((packed));
+
+inline bool operator==(const ethernet_address& a, const ethernet_address& b) {
+ return a.mac == b.mac; // equal when all six octets match
+}
+std::ostream& operator<<(std::ostream& os, const ethernet_address& ea);
+
+// Link-layer constants for Ethernet.
+struct ethernet {
+  using address = ethernet_address;
+  // Address with all six octets set to 0xff.
+  static address broadcast_address() { return {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; }
+  static constexpr uint16_t arp_hardware_type() { return 1; }
+};
+
+struct eth_hdr { // packed layout: destination MAC, source MAC, 16-bit protocol field
+ ethernet_address dst_mac;
+ ethernet_address src_mac;
+ uint16_t eth_proto;
+ eth_hdr hton() { // returns a copy whose eth_proto went through ::hton(); *this is unchanged
+ eth_hdr hdr = *this;
+ hdr.eth_proto = ::hton(eth_proto);
+ return hdr;
+ }
+ eth_hdr ntoh() { // returns a copy whose eth_proto went through ::ntoh(); *this is unchanged
+ eth_hdr hdr = *this;
+ hdr.eth_proto = ::ntoh(eth_proto);
+ return hdr;
+ }
+} __attribute__((packed));
+
+ethernet_address parse_ethernet_address(std::string addr);
+
+#endif /* CEPH_MSG_ETHERNET_H_ */
diff --git a/src/msg/async/dpdk/ip_types.h b/src/msg/async/dpdk/ip_types.h
new file mode 100644
index 000000000..356d8fd6e
--- /dev/null
+++ b/src/msg/async/dpdk/ip_types.h
@@ -0,0 +1,109 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_IP_TYPES_H_H
+#define CEPH_IP_TYPES_H_H
+
+#include <boost/asio/ip/address_v4.hpp>
+#include <string>
+
+class Packet;
+class ethernet_address;
+using resolution_cb = std::function<void (const ethernet_address&, Packet, int)>;
+
+// IPv4 address/port pair. The entity_addr_t constructor stores s_addr after
+// passing it through ntoh().
+struct ipv4_addr {
+  uint32_t ip;
+  uint16_t port;
+
+  ipv4_addr() : ip(0), port(0) {}
+  ipv4_addr(uint32_t ip, uint16_t port) : ip(ip), port(port) {}
+  ipv4_addr(uint16_t port) : ip(0), port(port) {}
+  ipv4_addr(const std::string &addr);
+  ipv4_addr(const std::string &addr, uint16_t port);
+
+  ipv4_addr(const entity_addr_t &ad)
+    : ip(ntoh(ad.in4_addr().sin_addr.s_addr)), port(ad.get_port()) {}
+  // Delegates to the const-reference constructor (addr is an lvalue here).
+  ipv4_addr(entity_addr_t &&addr) : ipv4_addr(addr) {}
+};
+
+// A bare IPv4 address held as one 32-bit integer. The struct is packed, like
+// the header structs in this directory that are declared the same way.
+struct ipv4_address {
+  ipv4_address() : ip(0) {}
+  explicit ipv4_address(uint32_t ip) : ip(ip) {}
+  // Parses the text with boost::asio::ip::address_v4::from_string() and
+  // stores the value of to_ulong().
+  explicit ipv4_address(const std::string& addr)
+    : ip(static_cast<uint32_t>(boost::asio::ip::address_v4::from_string(addr).to_ulong())) {}
+  // Keeps only the address part of an address/port pair.
+  ipv4_address(ipv4_addr addr) : ip(addr.ip) {}
+
+  uint32_t ip;
+
+  // Byte-order conversions return a converted copy; *this is not modified.
+  ipv4_address hton() {
+    return ipv4_address(::hton(ip));
+  }
+  ipv4_address ntoh() {
+    return ipv4_address(::ntoh(ip));
+  }
+
+  // Addresses compare by their integer value.
+  friend bool operator==(ipv4_address x, ipv4_address y) {
+    return x.ip == y.ip;
+  }
+  friend bool operator!=(ipv4_address x, ipv4_address y) {
+    return !(x.ip == y.ip);
+  }
+} __attribute__((packed));
+
+static inline bool is_unspecified(ipv4_address addr) { return addr.ip == 0; } // true when the stored value is 0 (0.0.0.0)
+
+std::ostream& operator<<(std::ostream& os, const ipv4_address& a);
+
+namespace std {
+// std::hash specialization for ipv4_address; the hash is the raw 32-bit value.
+ template <>
+ struct hash<ipv4_address> {
+ size_t operator()(ipv4_address a) const { return a.ip; }
+ };
+
+}
+
+#endif //CEPH_IP_TYPES_H_H
diff --git a/src/msg/async/dpdk/net.cc b/src/msg/async/dpdk/net.cc
new file mode 100644
index 000000000..c429c426c
--- /dev/null
+++ b/src/msg/async/dpdk/net.cc
@@ -0,0 +1,205 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ */
+
+#include "net.h"
+#include "DPDK.h"
+#include "DPDKStack.h"
+
+#include "common/dout.h"
+#include "include/ceph_assert.h"
+
+#define dout_subsys ceph_subsys_dpdk
+#undef dout_prefix
+#define dout_prefix *_dout << "net "
+
+// Subscribe dispatch_packet() to dev's receive path and register the TX provider on this CPU's queue.
+interface::interface(CephContext *cct, std::shared_ptr<DPDKDevice> dev, EventCenter *center)
+  : cct(cct), _dev(dev),
+    _rx(_dev->receive(center->get_id(), [center, this] (Packet p) {
+      return dispatch_packet(center, std::move(p));
+    })),
+    _hw_address(_dev->hw_address()),
+    _hw_features(_dev->get_hw_features()) {
+  auto idx = 0u;
+  unsigned qid = center->get_id();
+  dev->queue_for_cpu(center->get_id()).register_packet_provider([this, idx, qid] () mutable {
+    // Poll each registered L3 provider at most once per call, continuing after
+    // the one polled last (idx persists in the closure); first packet wins.
+    std::optional<Packet> p;
+    for (size_t i = 0; i < _pkt_providers.size(); i++) {
+      auto l3p = _pkt_providers[idx++]();
+      if (idx == _pkt_providers.size())
+        idx = 0;
+      if (!l3p)
+        continue;
+      auto l3pv = std::move(*l3p);
+      auto eh = l3pv.p.prepend_header<eth_hdr>();
+      eh->dst_mac = l3pv.to;
+      eh->src_mac = _hw_address;
+      eh->eth_proto = uint16_t(l3pv.proto_num);
+      *eh = eh->hton();
+      ldout(this->cct, 10) << "=== tx === proto " << std::hex << uint16_t(l3pv.proto_num)
+                           << " " << _hw_address << " -> " << l3pv.to
+                           << " length " << std::dec << l3pv.p.len() << dendl;
+      p = std::move(l3pv.p);
+      return p;
+    }
+    return p;
+  });
+}
+
+// Register the callbacks for one EtherType; asserts that it was not registered before.
+subscription<Packet, ethernet_address> interface::register_l3(
+    eth_protocol_num proto_num,
+    std::function<int (Packet p, ethernet_address from)> next,
+    std::function<bool (forward_hash&, Packet& p, size_t)> forward)
+{
+  auto inserted = _proto_map.emplace(std::piecewise_construct, std::make_tuple(uint16_t(proto_num)), std::forward_as_tuple(std::move(forward)));
+  ceph_assert(inserted.second);
+  return inserted.first->second.packet_stream.listen(std::move(next));
+}
+
+unsigned interface::hash2cpu(uint32_t hash) {
+ return _dev->hash2cpu(hash); // forwards to the underlying DPDKDevice
+}
+
+const rss_key_type& interface::rss_key() const {
+ return _dev->rss_key(); // forwards to the underlying DPDKDevice
+}
+
+uint16_t interface::hw_queues_count() const {
+ return _dev->hw_queues_count(); // forwards to the underlying DPDKDevice
+}
+
+// One-shot event: passes `p` to sdev->l2receive(dst, ...), decrements the sender's counter, deletes itself.
+class C_handle_l2forward : public EventCallback {
+  std::shared_ptr<DPDKDevice> sdev;
+  unsigned &queue_depth;  // NOTE(review): bound to forward()'s __thread counter; if do_request runs on another thread this is unsynchronized -- confirm
+  Packet p;
+  unsigned dst;
+ public:
+  C_handle_l2forward(std::shared_ptr<DPDKDevice> &p, unsigned &qd, Packet pkt, unsigned target)
+    : sdev(p), queue_depth(qd), p(std::move(pkt)), dst(target) {}
+  void do_request(uint64_t fd) override {
+    sdev->l2receive(dst, std::move(p));
+    queue_depth--;
+    delete this;
+  }
+};
+
+// Queue `p` for worker `target`; dropped once 1000 forwards counted by this thread are outstanding.
+void interface::forward(EventCenter *source, unsigned target, Packet p) {
+  static __thread unsigned queue_depth;
+  if (queue_depth >= 1000)
+    return;  // p is destroyed here without being forwarded
+  queue_depth++;
+  // FIXME: need ensure this event not be called after EventCenter destruct
+  _dev->workers[target]->center.dispatch_event_external(
+    new C_handle_l2forward(_dev, queue_depth, std::move(p.free_on_cpu(source)), target));
+}
+
+// Log one received frame, then forward it to another queue, pass it to the L3
+// stream registered for its EtherType, or drop it; only the stream path can return non-zero.
+int interface::dispatch_packet(EventCenter *center, Packet p) {
+  auto eh = p.get_header<eth_hdr>();
+  if (!eh) return 0;
+  auto i = _proto_map.find(ntoh(eh->eth_proto));
+  auto hwrss = p.rss_hash();
+  if (hwrss) {
+    ldout(cct, 10) << __func__ << " === rx === proto " << std::hex << ::ntoh(eh->eth_proto)
+                   << " "<< eh->src_mac.ntoh() << " -> " << eh->dst_mac.ntoh()
+                   << " length " << std::dec << p.len() << " rss_hash " << *p.rss_hash() << dendl;
+  } else {
+    ldout(cct, 10) << __func__ << " === rx === proto " << std::hex << ::ntoh(eh->eth_proto)
+                   << " "<< eh->src_mac.ntoh() << " -> " << eh->dst_mac.ntoh()
+                   << " length " << std::dec << p.len() << dendl;
+  }
+  if (i == _proto_map.end())
+    return 0;  // nothing registered for this EtherType
+  l3_rx_stream& l3 = i->second;
+  auto fw = _dev->forward_dst(center->get_id(), [&p, &l3, this] () {
+    auto hwrss = p.rss_hash();
+    if (hwrss) {
+      return *hwrss;
+    } else {
+      forward_hash data;
+      if (l3.forward(data, p, sizeof(eth_hdr))) {
+        return toeplitz_hash(rss_key(), data);
+      }
+      return 0u;
+    }
+  });
+  if (fw != center->get_id()) {
+    ldout(cct, 1) << __func__ << " forward to " << fw << dendl;
+    forward(center, fw, std::move(p));
+    return 0;
+  }
+  auto h = eh->ntoh();
+  auto from = h.src_mac;
+  p.trim_front(sizeof(*eh));
+  // avoid chaining, since queue length is unlimited
+  // drop instead.
+  if (!l3.ready())
+    return 0;
+  return l3.packet_stream.produce(std::move(p), from);
+}
+
+// One-shot event that calls worker->arp_learn(l2_addr, l3_addr) and then deletes itself.
+class C_arp_learn : public EventCallback {
+  DPDKWorker *worker;
+  ethernet_address l2_addr;
+  ipv4_address l3_addr;
+ public:
+  C_arp_learn(DPDKWorker *w, ethernet_address l2, ipv4_address l3)
+    : worker(w), l2_addr(l2), l3_addr(l3) {}
+  void do_request(uint64_t id) override {
+    worker->arp_learn(l2_addr, l3_addr);
+    delete this;
+  }
+};
+
+// Queue one C_arp_learn event carrying this l2/l3 pair on every worker of the device.
+void interface::arp_learn(ethernet_address l2, ipv4_address l3)
+{
+  for (auto &&w : _dev->workers) {
+    w->center.dispatch_event_external(new C_arp_learn(w, l2, l3));
+  }
+}
+
+l3_protocol::l3_protocol(interface* netif, eth_protocol_num proto_num, packet_provider_type func)
+ : _netif(netif), _proto_num(proto_num) {
+ _netif->register_packet_provider(std::move(func)); // func is appended to the interface's provider list
+}
+
+subscription<Packet, ethernet_address> l3_protocol::receive(  // registers both callbacks under this protocol's number
+    std::function<int (Packet, ethernet_address)> rx_fn,
+    std::function<bool (forward_hash &h, Packet &p, size_t s)> forward) {
+  return _netif->register_l3(_proto_num, std::move(rx_fn), std::move(forward));
+}
diff --git a/src/msg/async/dpdk/net.h b/src/msg/async/dpdk/net.h
new file mode 100644
index 000000000..1966f847c
--- /dev/null
+++ b/src/msg/async/dpdk/net.h
@@ -0,0 +1,138 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_DPDK_NET_H
+#define CEPH_MSG_DPDK_NET_H
+
+#include "const.h"
+#include "ethernet.h"
+#include "Packet.h"
+#include "stream.h"
+#include "toeplitz.h"
+
+struct hw_features { // offload capabilities and size limits; every offload defaults to off
+ // Enable tx ip header checksum offload
+ bool tx_csum_ip_offload = false;
+ // Enable tx l4 (TCP or UDP) checksum offload
+ bool tx_csum_l4_offload = false;
+ // Enable rx checksum offload
+ bool rx_csum_offload = false;
+ // LRO is enabled
+ bool rx_lro = false;
+ // Enable tx TCP segment offload
+ bool tx_tso = false;
+ // Enable tx UDP fragmentation offload
+ bool tx_ufo = false;
+ // Maximum Transmission Unit
+ uint16_t mtu = 1500;
+ // Maximum packet length when TCP/UDP offload is enabled
+ uint16_t max_packet_len = ip_packet_len_max - eth_hdr_len;
+};
+
+// 64-byte scratch buffer for the bytes handed to toeplitz_hash(); multi-byte
+// values are appended least-significant byte first.
+class forward_hash {
+  uint8_t data[64];
+  size_t end_idx = 0;
+ public:
+  size_t size() const { return end_idx; }
+  // Asserts that the buffer is not already full.
+  void push_back(uint8_t b) {
+    ceph_assert(end_idx < sizeof(data));
+    data[end_idx++] = b;
+  }
+  void push_back(uint16_t b) {
+    for (unsigned shift = 0; shift < 16; shift += 8)
+      push_back(uint8_t(b >> shift));
+  }
+  void push_back(uint32_t b) {
+    for (unsigned shift = 0; shift < 32; shift += 16)
+      push_back(uint16_t(b >> shift));
+  }
+  // Unchecked read of the idx-th stored byte.
+  const uint8_t& operator[](size_t idx) const { return data[idx]; }
+};
+
+class interface;
+
+class l3_protocol { // ties one eth_protocol_num to an interface
+ public:
+ struct l3packet { // an outgoing packet plus the protocol number and destination MAC used for its Ethernet header
+ eth_protocol_num proto_num;
+ ethernet_address to;
+ Packet p;
+ };
+ using packet_provider_type = std::function<std::optional<l3packet> ()>;
+ // A provider returns the next packet to send, or an empty optional when it has none.
+ private:
+ interface* _netif;
+ eth_protocol_num _proto_num;
+ // The constructor registers func with netif; receive() forwards to netif->register_l3().
+ public:
+ explicit l3_protocol(interface* netif, eth_protocol_num proto_num, packet_provider_type func);
+ subscription<Packet, ethernet_address> receive(
+ std::function<int (Packet, ethernet_address)> rx_fn,
+ std::function<bool (forward_hash &h, Packet &p, size_t s)> forward);
+
+ private:
+ friend class interface;
+};
+
+class DPDKDevice;
+struct ipv4_address;
+
+// L2 endpoint bound to one DPDKDevice: dispatches received frames to the L3
+// streams registered per EtherType and polls the registered packet providers.
+class interface {
+  CephContext *cct;
+  struct l3_rx_stream {
+    stream<Packet, ethernet_address> packet_stream;
+    std::function<bool (forward_hash&, Packet&, size_t)> forward;
+    bool ready() { return packet_stream.started(); }
+    // Move (rather than copy) the callback out of the rvalue-reference parameter.
+    explicit l3_rx_stream(std::function<bool (forward_hash&, Packet&, size_t)>&& fw) : forward(std::move(fw)) {}
+  };
+  std::unordered_map<uint16_t, l3_rx_stream> _proto_map;
+  std::shared_ptr<DPDKDevice> _dev;
+  subscription<Packet> _rx;
+  ethernet_address _hw_address;
+  struct hw_features _hw_features;
+  std::vector<l3_protocol::packet_provider_type> _pkt_providers;
+ private:
+  int dispatch_packet(EventCenter *c, Packet p);
+ public:
+  explicit interface(CephContext *cct, std::shared_ptr<DPDKDevice> dev, EventCenter *center);
+  ethernet_address hw_address() { return _hw_address; }
+  const struct hw_features& get_hw_features() const { return _hw_features; }
+  subscription<Packet, ethernet_address> register_l3(
+      eth_protocol_num proto_num,
+      std::function<int (Packet, ethernet_address)> next,
+      std::function<bool (forward_hash&, Packet&, size_t)> forward);
+  void forward(EventCenter *source, unsigned target, Packet p);
+  unsigned hash2cpu(uint32_t hash);
+  void register_packet_provider(l3_protocol::packet_provider_type func) { _pkt_providers.push_back(std::move(func)); }
+  const rss_key_type& rss_key() const;
+  uint16_t hw_queues_count() const;
+  void arp_learn(ethernet_address l2, ipv4_address l3);
+  friend class l3_protocol;
+};
+
+#endif //CEPH_MSG_DPDK_NET_H
diff --git a/src/msg/async/dpdk/queue.h b/src/msg/async/dpdk/queue.h
new file mode 100644
index 000000000..984ddca13
--- /dev/null
+++ b/src/msg/async/dpdk/queue.h
@@ -0,0 +1,96 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_DPDK_QUEUE_H_
+#define CEPH_MSG_DPDK_QUEUE_H_
+
+#include <queue>
+
+#include "circular_buffer.h"
+
+template <typename T>
+class queue { // bounded FIFO: a std::queue over circular_buffer, capped at _max items by push()
+ std::queue<T, circular_buffer<T>> _q;
+ size_t _max;
+
+ public:
+ explicit queue(size_t size): _max(size) {}
+
+ // Push an item.
+ //
+ // Returns false if the queue was full and the item was not pushed.
+ bool push(T&& a);
+
+ // Pops an item; the definition reads _q.front() without an emptiness check.
+ T pop();
+
+ // Consumes items from the queue, passing them to @func, until @func
+ // returns false or the queue is empty.
+ // Returns false if func returned false.
+ // NOTE(review): only declared; this header contains no definition of consume().
+ template <typename Func>
+ bool consume(Func&& func);
+
+ // Returns true when the queue is empty.
+ bool empty() const;
+
+ // Returns true when the queue is full.
+ bool full() const;
+
+ size_t size() const { return _q.size(); }
+
+ // Destroy any items in the queue
+ void clear() {
+ while (!_q.empty()) {
+ _q.pop();
+ }
+ }
+};
+
+// Push an item; returns false, leaving `data` untouched, when _max items are already queued.
+template <typename T>
+inline bool queue<T>::push(T&& data) {
+  if (_q.size() >= _max) {
+    return false;
+  }
+  // No notify_not_empty() here: nothing by that name is declared in this header.
+  _q.push(std::move(data));
+  return true;
+}
+
+template <typename T>
+inline T queue<T>::pop() {
+ T data = std::move(_q.front()); // move the oldest item out before removing its slot; no emptiness check here
+ _q.pop();
+ return data;
+}
+
+template <typename T>
+inline bool queue<T>::empty() const {
+ return _q.empty(); // forwards to the underlying std::queue
+}
+
+template <typename T>
+inline bool queue<T>::full() const {
+ return _q.size() == _max; // full exactly when the size reaches the limit given to the constructor
+}
+
+#endif /* CEPH_MSG_DPDK_QUEUE_H_ */
diff --git a/src/msg/async/dpdk/shared_ptr.h b/src/msg/async/dpdk/shared_ptr.h
new file mode 100644
index 000000000..d078063b3
--- /dev/null
+++ b/src/msg/async/dpdk/shared_ptr.h
@@ -0,0 +1,391 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:4; indent-tabs-mode:nil -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_LW_SHARED_PTR_H_
+#define CEPH_LW_SHARED_PTR_H_
+
+#include <utility>
+#include <type_traits>
+#include <functional>
+#include <iostream>
+
+// This header defines a shared pointer facility, lw_shared_ptr<>,
+// modeled after std::shared_ptr<>.
+//
+// Unlike std::shared_ptr<>, this implementation is not thread
+// safe, and two pointers sharing the same object must not be used in
+// different threads.
+//
+// lw_shared_ptr<> is the more lightweight variant, with a lw_shared_ptr<>
+// occupying just one machine word, and adding just one word to the shared
+// object. However, it does not support polymorphism.
+//
+// It supports shared_from_this() via enable_shared_from_this<>
+// and enable_lw_shared_from_this<>.
+//
+
+template <typename T>
+class lw_shared_ptr;
+
+template <typename T>
+class enable_lw_shared_from_this;
+
+template <typename T>
+class enable_shared_from_this;
+
+template <typename T, typename... A>
+lw_shared_ptr<T> make_lw_shared(A&&... a);
+
+template <typename T>
+lw_shared_ptr<T> make_lw_shared(T&& a);
+
+template <typename T>
+lw_shared_ptr<T> make_lw_shared(T& a);
+
+// Reference counter shared by all lw_shared_ptr<> instances that point at
+// the same object; it starts out at zero.
+struct lw_shared_ptr_counter_base {
+    long _count{0};
+};
+
+
+namespace internal {
+
+template <class T, class U>
+struct lw_shared_ptr_accessors;
+
+template <class T>
+struct lw_shared_ptr_accessors_esft;
+
+template <class T>
+struct lw_shared_ptr_accessors_no_esft;
+
+}
+
+
+// We want to support two use cases for lw_shared_ptr<T>:
+//
+// 1. T is any type (primitive or class type)
+//
+// 2. T is a class type that inherits from enable_lw_shared_from_this<T>.
+//
+// In the first case, we must wrap T in an object containing the counter,
+// since T may be a primitive type and cannot be a base class.
+//
+// In the second case, we want T to reach the counter through its
+// enable_lw_shared_from_this<> base class, so that we can implement
+// shared_from_this().
+//
+// To implement those two conflicting requirements (T alongside its counter;
+// T inherits from an object containing the counter) we use std::conditional<>
+// and some accessor functions to select between two implementations.
+
+
+// CRTP from this to enable shared_from_this:
+//
+// The reference counter lives in the private lw_shared_ptr_counter_base
+// base class; lw_shared_ptr<> and the accessor helpers are befriended
+// below so they can reach it.
+template <typename T>
+class enable_lw_shared_from_this : private lw_shared_ptr_counter_base {
+ using ctor = T;
+protected:
+ // None of the copy/move operations below forward to the counter base:
+ // a newly constructed object gets a default-initialized counter, and an
+ // assigned-to object keeps the counter it already has.
+ enable_lw_shared_from_this() noexcept {}
+ enable_lw_shared_from_this(enable_lw_shared_from_this&&) noexcept {}
+ enable_lw_shared_from_this(const enable_lw_shared_from_this&) noexcept {}
+ enable_lw_shared_from_this& operator=(const enable_lw_shared_from_this&) noexcept { return *this; }
+ enable_lw_shared_from_this& operator=(enable_lw_shared_from_this&&) noexcept { return *this; }
+public:
+ // Return a new lw_shared_ptr that shares this object's counter.
+ lw_shared_ptr<T> shared_from_this();
+ lw_shared_ptr<const T> shared_from_this() const;
+
+ // Friends that need access to the private counter base.
+ template <typename X>
+ friend class lw_shared_ptr;
+ template <typename X>
+ friend class ::internal::lw_shared_ptr_accessors_esft;
+ template <typename X, class Y>
+ friend class ::internal::lw_shared_ptr_accessors;
+};
+
+// Holder that pairs a value of type T with a reference counter: the counter
+// is the private base class and the value is stored in _value.
+template <typename T>
+struct shared_ptr_no_esft : private lw_shared_ptr_counter_base {
+ T _value;
+
+ // Value-construction helpers: default, copy from T, move from T, or
+ // forward arbitrary constructor arguments to T.
+ shared_ptr_no_esft() = default;
+ shared_ptr_no_esft(const T& x) : _value(x) {}
+ shared_ptr_no_esft(T&& x) : _value(std::move(x)) {}
+ template <typename... A>
+ shared_ptr_no_esft(A&&... a) : _value(std::forward<A>(a)...) {}
+
+ // Friends that need access to the private counter base.
+ template <typename X>
+ friend class lw_shared_ptr;
+ template <typename X>
+ friend class ::internal::lw_shared_ptr_accessors_no_esft;
+ template <typename X, class Y>
+ friend class ::internal::lw_shared_ptr_accessors;
+};
+
+
+/// Extension point: the user may override this to change how \ref lw_shared_ptr objects are destroyed,
+/// primarily so that incomplete classes can be used.
+///
+/// Customizing the deleter requires that \c T be derived from \c enable_lw_shared_from_this<T>.
+/// The specialization must be visible for all uses of \c lw_shared_ptr<T>.
+///
+/// To customize, the template must have a `static void dispose(T*)` operator that disposes of
+/// the object.
+template <typename T>
+struct lw_shared_ptr_deleter; // No generic implementation
+
+namespace internal {
+
+// Accessors for the case where the counter is a base class of T itself:
+// a static_cast is enough to get from the counter back to the object.
+template <typename T>
+struct lw_shared_ptr_accessors_esft {
+ using concrete_type = std::remove_const_t<T>;
+ // Converts the counter pointer back to the T that contains it.
+ static T* to_value(lw_shared_ptr_counter_base* counter) {
+ return static_cast<T*>(counter);
+ }
+ // Deletes the object through its T* view.
+ static void dispose(lw_shared_ptr_counter_base* counter) {
+ delete static_cast<T*>(counter);
+ }
+ static void instantiate_to_value(lw_shared_ptr_counter_base* p) {
+ // since to_value() is defined above, we don't need to do anything special
+ // to force-instantiate it
+ }
+};
+
+// Accessors for the case where T is wrapped in shared_ptr_no_esft<T>:
+// the counter is the wrapper's base class and the value is its _value member.
+template <typename T>
+struct lw_shared_ptr_accessors_no_esft {
+ using concrete_type = shared_ptr_no_esft<T>;
+ // Returns the address of the value stored inside the wrapper.
+ static T* to_value(lw_shared_ptr_counter_base* counter) {
+ return &static_cast<concrete_type*>(counter)->_value;
+ }
+ // Deletes the whole wrapper (counter and value together).
+ static void dispose(lw_shared_ptr_counter_base* counter) {
+ delete static_cast<concrete_type*>(counter);
+ }
+ static void instantiate_to_value(lw_shared_ptr_counter_base* p) {
+ // since to_value() is defined above, we don't need to do anything special
+ // to force-instantiate it
+ }
+};
+
+// Generic case: lw_shared_ptr_deleter<T> is not specialized, select
+// implementation based on whether T inherits from enable_lw_shared_from_this<T>.
+template <typename T, typename U = void>
+struct lw_shared_ptr_accessors : std::conditional_t<
+ std::is_base_of<enable_lw_shared_from_this<T>, T>::value,
+ lw_shared_ptr_accessors_esft<T>,
+ lw_shared_ptr_accessors_no_esft<T>> {
+};
+
+// Overload when lw_shared_ptr_deleter<T> specialized
+// (chosen when the expression lw_shared_ptr_deleter<T>{} is well-formed).
+template <typename T>
+struct lw_shared_ptr_accessors<T, std::void_t<decltype(lw_shared_ptr_deleter<T>{})>> {
+ using concrete_type = T;
+ // Only declared here; the definition has to be supplied elsewhere.
+ static T* to_value(lw_shared_ptr_counter_base* counter);
+ // Hands the object to the user-provided deleter instead of calling delete.
+ static void dispose(lw_shared_ptr_counter_base* counter) {
+ lw_shared_ptr_deleter<T>::dispose(to_value(counter));
+ }
+ static void instantiate_to_value(lw_shared_ptr_counter_base* p) {
+ // instantiate to_value(); must be defined by shared_ptr_incomplete.hh
+ to_value(p);
+ }
+};
+
+}
+
+// Reference-counted smart pointer that stores a single raw pointer.
+//
+// _p points at the counter that is either a base class of the managed
+// object or of the wrapper holding it; the `accessors` helper converts it
+// back to a T* and knows how to destroy the object.  The counter is a plain
+// `long`, so pointers sharing an object must not be used from different
+// threads.
+template <typename T>
+class lw_shared_ptr {
+    using accessors = ::internal::lw_shared_ptr_accessors<std::remove_const_t<T>>;
+    using concrete_type = typename accessors::concrete_type;
+    // Counter of the managed object, or nullptr when this pointer is empty.
+    mutable lw_shared_ptr_counter_base* _p = nullptr;
+private:
+    // Attaches to an existing counter and takes one more reference on it.
+    lw_shared_ptr(lw_shared_ptr_counter_base* p) noexcept : _p(p) {
+        if (_p) {
+            ++_p->_count;
+        }
+    }
+    // Heap-allocates the concrete object from the forwarded arguments and
+    // returns the first pointer to it.
+    template <typename... A>
+    static lw_shared_ptr make(A&&... a) {
+        auto p = new concrete_type(std::forward<A>(a)...);
+        accessors::instantiate_to_value(p);
+        return lw_shared_ptr(p);
+    }
+public:
+    using element_type = T;
+
+    // Empty pointer.
+    lw_shared_ptr() noexcept = default;
+    lw_shared_ptr(std::nullptr_t) noexcept : lw_shared_ptr() {}
+    // Copy: shares x's object and increments its reference count.
+    lw_shared_ptr(const lw_shared_ptr& x) noexcept : _p(x._p) {
+        if (_p) {
+            ++_p->_count;
+        }
+    }
+    // Move: steals x's reference and leaves x empty.
+    lw_shared_ptr(lw_shared_ptr&& x) noexcept : _p(x._p) {
+        x._p = nullptr;
+    }
+    // Drops one reference; the object is disposed of when the count hits zero.
+    [[gnu::always_inline]]
+    ~lw_shared_ptr() {
+        if (_p && !--_p->_count) {
+            accessors::dispose(_p);
+        }
+    }
+    // Assignment is implemented as destroy + re-construct in place; it is
+    // skipped when both sides already refer to the same counter.
+    lw_shared_ptr& operator=(const lw_shared_ptr& x) noexcept {
+        if (_p != x._p) {
+            this->~lw_shared_ptr();
+            new (this) lw_shared_ptr(x);
+        }
+        return *this;
+    }
+    lw_shared_ptr& operator=(lw_shared_ptr&& x) noexcept {
+        if (_p != x._p) {
+            this->~lw_shared_ptr();
+            new (this) lw_shared_ptr(std::move(x));
+        }
+        return *this;
+    }
+    lw_shared_ptr& operator=(std::nullptr_t) noexcept {
+        return *this = lw_shared_ptr();
+    }
+    // Replaces the managed object with a freshly allocated copy of x.
+    //
+    // The new object is built before the old reference is released, so x
+    // may refer to the object currently owned by this pointer.
+    lw_shared_ptr& operator=(T&& x) noexcept {
+        lw_shared_ptr replacement = make_lw_shared<T>(std::move(x));
+        return *this = std::move(replacement);
+    }
+
+    // Dereferencing requires a non-empty pointer; no check is made.
+    T& operator*() const noexcept { return *accessors::to_value(_p); }
+    T* operator->() const noexcept { return accessors::to_value(_p); }
+    // Raw pointer to the managed object, or nullptr when empty.
+    T* get() const noexcept {
+        if (_p) {
+            return accessors::to_value(_p);
+        } else {
+            return nullptr;
+        }
+    }
+
+    // Number of lw_shared_ptr instances sharing the object; 0 when empty.
+    long int use_count() const noexcept {
+        if (_p) {
+            return _p->_count;
+        } else {
+            return 0;
+        }
+    }
+
+    // Conversion to a pointer-to-const that shares the same counter.
+    operator lw_shared_ptr<const T>() const noexcept {
+        return lw_shared_ptr<const T>(_p);
+    }
+
+    // True when the pointer is not empty.
+    explicit operator bool() const noexcept {
+        return _p;
+    }
+
+    // True when this is the only pointer referring to the object.  An empty
+    // pointer owns nothing, so it reports false instead of dereferencing
+    // nullptr (matching the null handling of get() and use_count()).
+    bool owned() const noexcept {
+        return _p && _p->_count == 1;
+    }
+
+    // Comparisons are by identity of the shared counter, not by value.
+    bool operator==(const lw_shared_ptr<const T>& x) const {
+        return _p == x._p;
+    }
+
+    bool operator!=(const lw_shared_ptr<const T>& x) const {
+        return !operator==(x);
+    }
+
+    bool operator==(const lw_shared_ptr<std::remove_const_t<T>>& x) const {
+        return _p == x._p;
+    }
+
+    bool operator!=(const lw_shared_ptr<std::remove_const_t<T>>& x) const {
+        return !operator==(x);
+    }
+
+    bool operator<(const lw_shared_ptr<const T>& x) const {
+        return _p < x._p;
+    }
+
+    bool operator<(const lw_shared_ptr<std::remove_const_t<T>>& x) const {
+        return _p < x._p;
+    }
+
+    // Friends that need the private constructor, make() or _p.
+    template <typename U>
+    friend class lw_shared_ptr;
+
+    template <typename X, typename... A>
+    friend lw_shared_ptr<X> make_lw_shared(A&&...);
+
+    template <typename U>
+    friend lw_shared_ptr<U> make_lw_shared(U&&);
+
+    template <typename U>
+    friend lw_shared_ptr<U> make_lw_shared(U&);
+
+    template <typename U>
+    friend class enable_lw_shared_from_this;
+};
+
+// Allocates a new T constructed from the forwarded arguments and returns
+// the pointer that refers to it.
+template <typename T, typename... A>
+inline lw_shared_ptr<T> make_lw_shared(A&&... a) {
+    auto owner = lw_shared_ptr<T>::make(std::forward<A>(a)...);
+    return owner;
+}
+
+// Allocates a new T move-constructed from @a and returns the pointer that
+// refers to it.
+template <typename T>
+inline lw_shared_ptr<T> make_lw_shared(T&& a) {
+    auto owner = lw_shared_ptr<T>::make(std::move(a));
+    return owner;
+}
+
+// Allocates a new T copy-constructed from @a and returns the pointer that
+// refers to it.
+template <typename T>
+inline lw_shared_ptr<T> make_lw_shared(T& a) {
+    auto owner = lw_shared_ptr<T>::make(a);
+    return owner;
+}
+
+// Builds a new lw_shared_ptr<T> on top of this object's own counter,
+// taking one more reference on it.
+template <typename T>
+inline lw_shared_ptr<T> enable_lw_shared_from_this<T>::shared_from_this() {
+    lw_shared_ptr_counter_base* const counter = this;
+    return lw_shared_ptr<T>(counter);
+}
+
+// Const overload: casts away constness to reach the counter and returns a
+// pointer-to-const that shares it.
+template <typename T>
+inline lw_shared_ptr<const T> enable_lw_shared_from_this<T>::shared_from_this() const {
+    auto* const self = const_cast<enable_lw_shared_from_this*>(this);
+    return lw_shared_ptr<const T>(self);
+}
+
+// Streams the pointed-to object, or the text "null" for an empty pointer.
+template <typename T>
+static inline
+std::ostream& operator<<(std::ostream& out, const lw_shared_ptr<T>& p) {
+    if (p) {
+        return out << *p;
+    }
+    return out << "null";
+}
+
+namespace std {
+
+    // Hashes an lw_shared_ptr<T> the same way as the raw pointer returned
+    // by its get() member.
+    template <typename T>
+    struct hash<lw_shared_ptr<T>> : private hash<T*> {
+        size_t operator()(const lw_shared_ptr<T>& p) const {
+            T* const raw = p.get();
+            return hash<T*>::operator()(raw);
+        }
+    };
+
+}
+
+#endif /* CEPH_LW_SHARED_PTR_H_ */
diff --git a/src/msg/async/dpdk/stream.h b/src/msg/async/dpdk/stream.h
new file mode 100644
index 000000000..1898e8f86
--- /dev/null
+++ b/src/msg/async/dpdk/stream.h
@@ -0,0 +1,155 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_MSG_STREAM_H_
+#define CEPH_MSG_STREAM_H_
+
+#include <exception>
+#include <cassert>
+
+// A stream<> is the producer side. It may call produce() again once
+// the value returned by the previous invocation shows the consumer is ready.
+// To signify no more data is available, call close().
+//
+// A subscription<> is the consumer side. It is created by a call
+// to stream::listen(). Calling subscription::start(),
+// which registers the data processing callback, starts processing
+// events. It may check for end-of-stream by calling done(),
+// which also reports error codes passed to
+// stream::set_exception().
+//
+// The consumer can pause generation of new data by returning
+// a positive integer; when it becomes ready, the producer
+// will resume processing.
+
+template <typename... T>
+class subscription;
+
+// Producer half of a stream/subscription pair.
+//
+// The stream keeps a raw pointer to the subscription attached to it and
+// forwards every produced value to that subscription's callback.
+template <typename... T>
+class stream {
+  // The subscription currently attached, or nullptr.
+  subscription<T...>* _sub = nullptr;
+  // 0 until close() stores 1 or set_exception() stores the caller's error
+  // code.  Initialized so subscription::done() never reads an
+  // indeterminate value.
+  int done = 0;
+  // Set by subscription::start() once a callback has been registered.
+  // Initialized so started() is well defined before any listener starts.
+  bool ready = false;
+ public:
+  using next_fn = std::function<int (T...)>;
+  stream() = default;
+  stream(const stream&) = delete;
+  stream(stream&&) = delete;
+  // Detaches the subscription (if any) so that it does not keep a
+  // dangling pointer back to this stream.
+  ~stream() {
+    if (_sub) {
+      _sub->_stream = nullptr;
+    }
+  }
+
+  void operator=(const stream&) = delete;
+  void operator=(stream&&) = delete;
+
+  // Returns a subscription that reads values from this
+  // stream.
+  subscription<T...> listen() {
+    return subscription<T...>(this);
+  }
+
+  // Returns a subscription that reads values from this
+  // stream, and also sets up the listen function.
+  subscription<T...> listen(next_fn next) {
+    auto sub = subscription<T...>(this);
+    sub.start(std::move(next));
+    return sub;
+  }
+
+  // Returns true once the listener has registered its callback through
+  // subscription::start() and is therefore ready to accept values.
+  bool started() {
+    return ready;
+  }
+
+  // Produce a value by passing it to the subscription's callback and
+  // returning the callback's result.  Call only after started(); a
+  // subscription must be attached, since no null check is made.
+  int produce(T... data) {
+    return _sub->_next(std::move(data)...);
+  }
+
+  // End the stream. Call only after started(), and after
+  // a previous produce() is ready. No functions may be called
+  // after this.
+  void close() {
+    done = 1;
+  }
+
+  // Signal an error by recording @error as the completion status.
+  // Call only after started(), and after a previous produce() is ready.
+  // No functions may be called after this.
+  void set_exception(int error) {
+    done = error;
+  }
+ private:
+  void start();
+  friend class subscription<T...>;
+};
+
+// Consumer half of a stream/subscription pair.
+template <typename... T>
+class subscription {
+ public:
+  using next_fn = typename stream<T...>::next_fn;
+
+  // Takes over x's attachment: the stream (if any) is re-pointed at this
+  // object and x is left detached.
+  subscription(subscription&& x)
+    : _stream(x._stream), _next(std::move(x._next)) {
+    x._stream = nullptr;
+    if (_stream != nullptr) {
+      _stream->_sub = this;
+    }
+  }
+
+  // Detaches from the stream, if one is still attached.
+  ~subscription() {
+    if (_stream == nullptr) {
+      return;
+    }
+    _stream->_sub = nullptr;
+  }
+
+  /// \brief Start receiving events from the stream.
+  ///
+  /// Stores the callback and marks the stream as ready.
+  ///
+  /// \param next Callback to call for each event
+  void start(next_fn next) {
+    _next = std::move(next);
+    _stream->ready = true;
+  }
+
+  // Returns the completion status recorded in the stream's `done` field.
+  // The stream must still be attached; no null check is made.
+  int done() {
+    return _stream->done;
+  }
+
+ private:
+  // Registers this subscription with @s, asserting that the stream has no
+  // other subscription attached.
+  explicit subscription(stream<T...>* s) : _stream(s) {
+    ceph_assert(!_stream->_sub);
+    _stream->_sub = this;
+  }
+
+  stream<T...>* _stream;  // attached stream, or nullptr once detached
+  next_fn _next;          // callback invoked for each produced value
+
+  friend class stream<T...>;
+};
+
+#endif /* CEPH_MSG_STREAM_H_ */
diff --git a/src/msg/async/dpdk/toeplitz.h b/src/msg/async/dpdk/toeplitz.h
new file mode 100644
index 000000000..3ca388082
--- /dev/null
+++ b/src/msg/async/dpdk/toeplitz.h
@@ -0,0 +1,92 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*-
+ * Copyright (c) 2010 David Malone <dwmalone@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef CEPH_MSG_TOEPLITZ_H_
+#define CEPH_MSG_TOEPLITZ_H_
+
+#include <vector>
+
+using rss_key_type = std::vector<uint8_t>;
+
+// Mellanox Linux's driver key
+// 40 bytes, listed below as five rows of eight.
+static const rss_key_type default_rsskey_40bytes = {
+ 0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
+ 0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
+ 0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
+ 0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
+ 0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
+};
+
+// Intel's i40e PMD default RSS key
+// 52 bytes, listed below as six rows of eight plus a final row of four.
+static const rss_key_type default_rsskey_52bytes = {
+ 0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
+ 0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
+ 0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
+ 0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
+ 0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
+ 0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
+ 0x81, 0x15, 0x03, 0x66
+};
+
+// Computes the 32-bit Toeplitz hash of @data under @key.
+//
+// A 32-bit window slides over the key one bit per input bit; whenever the
+// input bit is set, the current window is XORed into the hash.
+//
+// @key   hash key; key bytes past its end are treated as zero, both when
+//        seeding the window and while sliding it.
+// @data  any container with size() and operator[] yielding byte values.
+// Returns the accumulated hash (0 for empty @data).
+template<typename T>
+static inline uint32_t toeplitz_hash(const rss_key_type& key, const T& data)
+{
+  using index_type = rss_key_type::size_type;
+
+  // Key byte widened to 32 bits, or 0 once the key is exhausted.  Widening
+  // first matters: a plain `key[0] << 24` shifts a promoted signed int and
+  // overflows for bytes >= 0x80.
+  const auto key_byte = [&key](index_type idx) -> uint32_t {
+    return idx < key.size() ? static_cast<uint32_t>(key[idx]) : 0u;
+  };
+
+  /* XXXRW: Perhaps an assertion about key length vs. data length? */
+
+  uint32_t hash = 0;
+  uint32_t v = (key_byte(0) << 24) | (key_byte(1) << 16) |
+               (key_byte(2) << 8) | key_byte(3);
+  for (index_type i = 0; i < data.size(); i++) {
+    for (unsigned b = 0; b < 8; b++) {
+      const unsigned mask = 1u << (7 - b);  // bits are consumed MSB first
+      if (data[i] & mask)
+        hash ^= v;
+      // Slide the window: shift out the oldest key bit, shift in the next.
+      v <<= 1;
+      if (key_byte(i + 4) & mask)
+        v |= 1;
+    }
+  }
+  return hash;
+}
+#endif
diff --git a/src/msg/async/dpdk/transfer.h b/src/msg/async/dpdk/transfer.h
new file mode 100644
index 000000000..599db5bd0
--- /dev/null
+++ b/src/msg/async/dpdk/transfer.h
@@ -0,0 +1,64 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#ifndef CEPH_TRANSFER_H_
+#define CEPH_TRANSFER_H_
+
+// Helper functions for copying or moving multiple objects in an exception
+// safe manner, then destroying the sources.
+//
+// To transfer, call transfer_pass1(allocator, &from, &to) on all object pairs,
+// (this copies the object from @from to @to). If no exceptions are encountered,
+// call transfer_pass2(allocator, &from, &to). This destroys the object at the
+// origin. If exceptions were encountered, simply destroy all copied objects.
+//
+// As an optimization, if the objects are moveable without throwing (noexcept)
+// transfer_pass1() simply moves the objects and destroys the source, and
+// transfer_pass2() does nothing.
+
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+// First pass for types whose move constructor cannot throw: move-construct
+// *to from *from and destroy the source right away, so the matching
+// transfer_pass2() has nothing left to do.
+//
+// Construction and destruction go through std::allocator_traits, which
+// calls the allocator's own construct()/destroy() when it has them and
+// falls back to the default behaviour when it does not.
+template <typename T, typename Alloc>
+inline void transfer_pass1(Alloc& a, T* from, T* to,
+        typename std::enable_if<std::is_nothrow_move_constructible<T>::value>::type* = nullptr) {
+    using traits = std::allocator_traits<Alloc>;
+    traits::construct(a, to, std::move(*from));
+    traits::destroy(a, from);
+}
+
+// Second pass for types whose move constructor cannot throw: intentionally
+// empty, because the matching transfer_pass1() already destroyed the source.
+template <typename T, typename Alloc>
+inline void transfer_pass2(Alloc& a, T* from, T* to,
+        std::enable_if_t<std::is_nothrow_move_constructible<T>::value>* = nullptr) {
+    // nothing to do
+}
+
+// First pass for types whose move constructor may throw: copy-construct *to
+// from *from and leave the source intact, so the caller can still roll back
+// if a later copy throws.  The source is destroyed by transfer_pass2().
+//
+// Construction goes through std::allocator_traits, which calls the
+// allocator's own construct() when it has one and falls back to the default
+// behaviour when it does not.
+template <typename T, typename Alloc>
+inline void transfer_pass1(Alloc& a, T* from, T* to,
+        typename std::enable_if<!std::is_nothrow_move_constructible<T>::value>::type* = nullptr) {
+    std::allocator_traits<Alloc>::construct(a, to, *from);
+}
+
+// Second pass for types whose move constructor may throw: destroy the source
+// object that transfer_pass1() copied from.  @to is not touched.
+//
+// Destruction goes through std::allocator_traits, which calls the
+// allocator's own destroy() when it has one and falls back to the default
+// behaviour when it does not.
+template <typename T, typename Alloc>
+inline void transfer_pass2(Alloc& a, T* from, T* to,
+        typename std::enable_if<!std::is_nothrow_move_constructible<T>::value>::type* = nullptr) {
+    std::allocator_traits<Alloc>::destroy(a, from);
+}
+
+#endif /* CEPH_TRANSFER_H_ */