Diffstat (limited to '')
-rw-r--r--  src/seastar/include/seastar/core/abort_on_ebadf.hh | 36
-rw-r--r--  src/seastar/include/seastar/core/abort_source.hh | 139
-rw-r--r--  src/seastar/include/seastar/core/alien.hh | 184
-rw-r--r--  src/seastar/include/seastar/core/align.hh | 55
-rw-r--r--  src/seastar/include/seastar/core/aligned_buffer.hh | 45
-rw-r--r--  src/seastar/include/seastar/core/app-template.hh | 100
-rw-r--r--  src/seastar/include/seastar/core/apply.hh | 64
-rw-r--r--  src/seastar/include/seastar/core/array_map.hh | 50
-rw-r--r--  src/seastar/include/seastar/core/bitops.hh | 73
-rw-r--r--  src/seastar/include/seastar/core/bitset-iter.hh | 198
-rw-r--r--  src/seastar/include/seastar/core/byteorder.hh | 127
-rw-r--r--  src/seastar/include/seastar/core/cacheline.hh | 42
-rw-r--r--  src/seastar/include/seastar/core/checked_ptr.hh | 199
-rw-r--r--  src/seastar/include/seastar/core/chunked_fifo.hh | 626
-rw-r--r--  src/seastar/include/seastar/core/circular_buffer.hh | 511
-rw-r--r--  src/seastar/include/seastar/core/circular_buffer_fixed_capacity.hh | 378
-rw-r--r--  src/seastar/include/seastar/core/condition-variable.hh | 171
-rw-r--r--  src/seastar/include/seastar/core/coroutine.hh | 196
-rw-r--r--  src/seastar/include/seastar/core/deleter.hh | 281
-rw-r--r--  src/seastar/include/seastar/core/distributed.hh | 32
-rw-r--r--  src/seastar/include/seastar/core/do_with.hh | 153
-rw-r--r--  src/seastar/include/seastar/core/dpdk_rte.hh | 64
-rw-r--r--  src/seastar/include/seastar/core/enum.hh | 46
-rw-r--r--  src/seastar/include/seastar/core/exception_hacks.hh | 26
-rw-r--r--  src/seastar/include/seastar/core/execution_stage.hh | 543
-rw-r--r--  src/seastar/include/seastar/core/expiring_fifo.hh | 217
-rw-r--r--  src/seastar/include/seastar/core/fair_queue.hh | 247
-rw-r--r--  src/seastar/include/seastar/core/file-types.hh | 140
-rw-r--r--  src/seastar/include/seastar/core/file.hh | 586
-rw-r--r--  src/seastar/include/seastar/core/fsqual.hh | 30
-rw-r--r--  src/seastar/include/seastar/core/fstream.hh | 151
-rw-r--r--  src/seastar/include/seastar/core/function_traits.hh | 68
-rw-r--r--  src/seastar/include/seastar/core/future-util.hh | 31
-rw-r--r--  src/seastar/include/seastar/core/future.hh | 2196
-rw-r--r--  src/seastar/include/seastar/core/gate.hh | 170
-rw-r--r--  src/seastar/include/seastar/core/idle_cpu_handler.hh | 59
-rw-r--r--  src/seastar/include/seastar/core/internal/api-level.hh | 82
-rw-r--r--  src/seastar/include/seastar/core/internal/buffer_allocator.hh | 43
-rw-r--r--  src/seastar/include/seastar/core/internal/io_desc.hh | 35
-rw-r--r--  src/seastar/include/seastar/core/internal/io_request.hh | 266
-rw-r--r--  src/seastar/include/seastar/core/internal/poll.hh | 59
-rw-r--r--  src/seastar/include/seastar/core/internal/pollable_fd.hh | 219
-rw-r--r--  src/seastar/include/seastar/core/io_queue.hh | 170
-rw-r--r--  src/seastar/include/seastar/core/iostream-impl.hh | 535
-rw-r--r--  src/seastar/include/seastar/core/iostream.hh | 360
-rw-r--r--  src/seastar/include/seastar/core/layered_file.hh | 67
-rw-r--r--  src/seastar/include/seastar/core/linux-aio.hh | 234
-rw-r--r--  src/seastar/include/seastar/core/loop.hh | 715
-rw-r--r--  src/seastar/include/seastar/core/lowres_clock.hh | 160
-rw-r--r--  src/seastar/include/seastar/core/make_task.hh | 62
-rw-r--r--  src/seastar/include/seastar/core/manual_clock.hh | 51
-rw-r--r--  src/seastar/include/seastar/core/map_reduce.hh | 254
-rw-r--r--  src/seastar/include/seastar/core/memory.hh | 370
-rw-r--r--  src/seastar/include/seastar/core/metrics.hh | 587
-rw-r--r--  src/seastar/include/seastar/core/metrics_api.hh | 386
-rw-r--r--  src/seastar/include/seastar/core/metrics_registration.hh | 173
-rw-r--r--  src/seastar/include/seastar/core/metrics_types.hh | 83
-rw-r--r--  src/seastar/include/seastar/core/on_internal_error.hh | 56
-rw-r--r--  src/seastar/include/seastar/core/pipe.hh | 267
-rw-r--r--  src/seastar/include/seastar/core/polymorphic_temporary_buffer.hh | 43
-rw-r--r--  src/seastar/include/seastar/core/posix.hh | 492
-rw-r--r--  src/seastar/include/seastar/core/preempt.hh | 58
-rw-r--r--  src/seastar/include/seastar/core/prefetch.hh | 115
-rw-r--r--  src/seastar/include/seastar/core/print.hh | 148
-rw-r--r--  src/seastar/include/seastar/core/prometheus.hh | 51
-rw-r--r--  src/seastar/include/seastar/core/queue.hh | 279
-rw-r--r--  src/seastar/include/seastar/core/ragel.hh | 140
-rw-r--r--  src/seastar/include/seastar/core/reactor.hh | 755
-rw-r--r--  src/seastar/include/seastar/core/reactor_config.hh | 47
-rw-r--r--  src/seastar/include/seastar/core/report_exception.hh | 31
-rw-r--r--  src/seastar/include/seastar/core/resource.hh | 94
-rw-r--r--  src/seastar/include/seastar/core/rwlock.hh | 180
-rw-r--r--  src/seastar/include/seastar/core/scattered_message.hh | 112
-rw-r--r--  src/seastar/include/seastar/core/scheduling.hh | 366
-rw-r--r--  src/seastar/include/seastar/core/scheduling_specific.hh | 189
-rw-r--r--  src/seastar/include/seastar/core/scollectd.hh | 848
-rw-r--r--  src/seastar/include/seastar/core/scollectd_api.hh | 35
-rw-r--r--  src/seastar/include/seastar/core/seastar.hh | 386
-rw-r--r--  src/seastar/include/seastar/core/semaphore.hh | 572
-rw-r--r--  src/seastar/include/seastar/core/sharded.hh | 909
-rw-r--r--  src/seastar/include/seastar/core/shared_future.hh | 299
-rw-r--r--  src/seastar/include/seastar/core/shared_mutex.hh | 183
-rw-r--r--  src/seastar/include/seastar/core/shared_ptr.hh | 868
-rw-r--r--  src/seastar/include/seastar/core/shared_ptr_debug_helper.hh | 70
-rw-r--r--  src/seastar/include/seastar/core/shared_ptr_incomplete.hh | 44
-rw-r--r--  src/seastar/include/seastar/core/simple-stream.hh | 639
-rw-r--r--  src/seastar/include/seastar/core/slab.hh | 568
-rw-r--r--  src/seastar/include/seastar/core/sleep.hh | 93
-rw-r--r--  src/seastar/include/seastar/core/smp.hh | 444
-rw-r--r--  src/seastar/include/seastar/core/sstring.hh | 779
-rw-r--r--  src/seastar/include/seastar/core/stall_sampler.hh | 53
-rw-r--r--  src/seastar/include/seastar/core/std-coroutine.hh | 94
-rw-r--r--  src/seastar/include/seastar/core/stream.hh | 174
-rw-r--r--  src/seastar/include/seastar/core/systemwide_memory_barrier.hh | 37
-rw-r--r--  src/seastar/include/seastar/core/task.hh | 67
-rw-r--r--  src/seastar/include/seastar/core/temporary_buffer.hh | 246
-rw-r--r--  src/seastar/include/seastar/core/thread.hh | 285
-rw-r--r--  src/seastar/include/seastar/core/thread_cputime_clock.hh | 48
-rw-r--r--  src/seastar/include/seastar/core/thread_impl.hh | 84
-rw-r--r--  src/seastar/include/seastar/core/timed_out_error.hh | 42
-rw-r--r--  src/seastar/include/seastar/core/timer-set.hh | 253
-rw-r--r--  src/seastar/include/seastar/core/timer.hh | 225
-rw-r--r--  src/seastar/include/seastar/core/transfer.hh | 75
-rw-r--r--  src/seastar/include/seastar/core/unaligned.hh | 78
-rw-r--r--  src/seastar/include/seastar/core/units.hh | 30
-rw-r--r--  src/seastar/include/seastar/core/vector-data-sink.hh | 47
-rw-r--r--  src/seastar/include/seastar/core/weak_ptr.hh | 120
-rw-r--r--  src/seastar/include/seastar/core/when_all.hh | 562
-rw-r--r--  src/seastar/include/seastar/core/with_scheduling_group.hh | 77
-rw-r--r--  src/seastar/include/seastar/core/with_timeout.hh | 73
-rw-r--r--  src/seastar/include/seastar/http/api_docs.hh | 340
-rw-r--r--  src/seastar/include/seastar/http/common.hh | 74
-rw-r--r--  src/seastar/include/seastar/http/exception.hh | 142
-rw-r--r--  src/seastar/include/seastar/http/file_handler.hh | 172
-rw-r--r--  src/seastar/include/seastar/http/function_handlers.hh | 132
-rw-r--r--  src/seastar/include/seastar/http/handlers.hh | 73
-rw-r--r--  src/seastar/include/seastar/http/httpd.hh | 243
-rw-r--r--  src/seastar/include/seastar/http/json_path.hh | 190
-rw-r--r--  src/seastar/include/seastar/http/matcher.hh | 111
-rw-r--r--  src/seastar/include/seastar/http/matchrules.hh | 122
-rw-r--r--  src/seastar/include/seastar/http/mime_types.hh | 33
-rw-r--r--  src/seastar/include/seastar/http/reply.hh | 182
-rw-r--r--  src/seastar/include/seastar/http/request.hh | 134
-rw-r--r--  src/seastar/include/seastar/http/routes.hh | 289
-rw-r--r--  src/seastar/include/seastar/http/transformers.hh | 59
-rw-r--r--  src/seastar/include/seastar/json/formatter.hh | 335
-rw-r--r--  src/seastar/include/seastar/json/json_elements.hh | 355
-rw-r--r--  src/seastar/include/seastar/net/api.hh | 392
-rw-r--r--  src/seastar/include/seastar/net/arp.hh | 296
-rw-r--r--  src/seastar/include/seastar/net/byteorder.hh | 128
-rw-r--r--  src/seastar/include/seastar/net/config.hh | 65
-rw-r--r--  src/seastar/include/seastar/net/const.hh | 44
-rw-r--r--  src/seastar/include/seastar/net/dhcp.hh | 84
-rw-r--r--  src/seastar/include/seastar/net/dns.hh | 156
-rw-r--r--  src/seastar/include/seastar/net/dpdk.hh | 54
-rw-r--r--  src/seastar/include/seastar/net/ethernet.hh | 96
-rw-r--r--  src/seastar/include/seastar/net/inet_address.hh | 128
-rw-r--r--  src/seastar/include/seastar/net/ip.hh | 529
-rw-r--r--  src/seastar/include/seastar/net/ip_checksum.hh | 76
-rw-r--r--  src/seastar/include/seastar/net/native-stack.hh | 36
-rw-r--r--  src/seastar/include/seastar/net/net.hh | 308
-rw-r--r--  src/seastar/include/seastar/net/packet-data-source.hh | 53
-rw-r--r--  src/seastar/include/seastar/net/packet-util.hh | 158
-rw-r--r--  src/seastar/include/seastar/net/packet.hh | 622
-rw-r--r--  src/seastar/include/seastar/net/posix-stack.hh | 223
-rw-r--r--  src/seastar/include/seastar/net/proxy.hh | 32
-rw-r--r--  src/seastar/include/seastar/net/socket_defs.hh | 186
-rw-r--r--  src/seastar/include/seastar/net/stack.hh | 103
-rw-r--r--  src/seastar/include/seastar/net/tcp-stack.hh | 50
-rw-r--r--  src/seastar/include/seastar/net/tcp.hh | 2135
-rw-r--r--  src/seastar/include/seastar/net/tls.hh | 346
-rw-r--r--  src/seastar/include/seastar/net/toeplitz.hh | 99
-rw-r--r--  src/seastar/include/seastar/net/udp.hh | 59
-rw-r--r--  src/seastar/include/seastar/net/unix_address.hh | 75
-rw-r--r--  src/seastar/include/seastar/net/virtio-interface.hh | 131
-rw-r--r--  src/seastar/include/seastar/net/virtio.hh | 33
-rw-r--r--  src/seastar/include/seastar/rpc/lz4_compressor.hh | 48
-rw-r--r--  src/seastar/include/seastar/rpc/lz4_fragmented_compressor.hh | 44
-rw-r--r--  src/seastar/include/seastar/rpc/multi_algo_compressor_factory.hh | 80
-rw-r--r--  src/seastar/include/seastar/rpc/rpc.hh | 870
-rw-r--r--  src/seastar/include/seastar/rpc/rpc_impl.hh | 881
-rw-r--r--  src/seastar/include/seastar/rpc/rpc_types.hh | 386
-rw-r--r--  src/seastar/include/seastar/testing/entry_point.hh | 33
-rw-r--r--  src/seastar/include/seastar/testing/exchanger.hh | 88
-rw-r--r--  src/seastar/include/seastar/testing/on_internal_error.hh | 40
-rw-r--r--  src/seastar/include/seastar/testing/perf_tests.hh | 259
-rw-r--r--  src/seastar/include/seastar/testing/seastar_test.hh | 68
-rw-r--r--  src/seastar/include/seastar/testing/test_case.hh | 36
-rw-r--r--  src/seastar/include/seastar/testing/test_runner.hh | 59
-rw-r--r--  src/seastar/include/seastar/testing/thread_test_case.hh | 46
-rw-r--r--  src/seastar/include/seastar/util/alloc_failure_injector.hh | 132
-rw-r--r--  src/seastar/include/seastar/util/attribute-compat.hh | 27
-rw-r--r--  src/seastar/include/seastar/util/backtrace.hh | 230
-rw-r--r--  src/seastar/include/seastar/util/bool_class.hh | 110
-rw-r--r--  src/seastar/include/seastar/util/concepts.hh | 34
-rw-r--r--  src/seastar/include/seastar/util/conversions.hh | 48
-rw-r--r--  src/seastar/include/seastar/util/critical_alloc_section.hh | 70
-rw-r--r--  src/seastar/include/seastar/util/defer.hh | 58
-rw-r--r--  src/seastar/include/seastar/util/eclipse.hh | 36
-rw-r--r--  src/seastar/include/seastar/util/exceptions.hh | 45
-rw-r--r--  src/seastar/include/seastar/util/file.hh | 45
-rw-r--r--  src/seastar/include/seastar/util/function_input_iterator.hh | 72
-rw-r--r--  src/seastar/include/seastar/util/gcc6-concepts.hh | 25
-rw-r--r--  src/seastar/include/seastar/util/indirect.hh | 74
-rw-r--r--  src/seastar/include/seastar/util/is_smart_ptr.hh | 34
-rw-r--r--  src/seastar/include/seastar/util/later.hh | 44
-rw-r--r--  src/seastar/include/seastar/util/lazy.hh | 153
-rw-r--r--  src/seastar/include/seastar/util/log-cli.hh | 80
-rw-r--r--  src/seastar/include/seastar/util/log-impl.hh | 117
-rw-r--r--  src/seastar/include/seastar/util/log.hh | 464
-rw-r--r--  src/seastar/include/seastar/util/memory_diagnostics.hh | 85
-rw-r--r--  src/seastar/include/seastar/util/noncopyable_function.hh | 225
-rw-r--r--  src/seastar/include/seastar/util/optimized_optional.hh | 101
-rw-r--r--  src/seastar/include/seastar/util/print_safe.hh | 113
-rw-r--r--  src/seastar/include/seastar/util/program-options.hh | 99
-rw-r--r--  src/seastar/include/seastar/util/read_first_line.hh | 14
-rw-r--r--  src/seastar/include/seastar/util/reference_wrapper.hh | 74
-rw-r--r--  src/seastar/include/seastar/util/spinlock.hh | 104
-rw-r--r--  src/seastar/include/seastar/util/std-compat.hh | 54
-rw-r--r--  src/seastar/include/seastar/util/tmp_file.hh | 200
-rw-r--r--  src/seastar/include/seastar/util/transform_iterator.hh | 57
-rw-r--r--  src/seastar/include/seastar/util/tuple_utils.hh | 174
-rw-r--r--  src/seastar/include/seastar/util/used_size.hh | 36
-rw-r--r--  src/seastar/include/seastar/util/variant_utils.hh | 102
204 files changed, 41932 insertions, 0 deletions
diff --git a/src/seastar/include/seastar/core/abort_on_ebadf.hh b/src/seastar/include/seastar/core/abort_on_ebadf.hh
new file mode 100644
index 000000000..7cb8c05b1
--- /dev/null
+++ b/src/seastar/include/seastar/core/abort_on_ebadf.hh
@@ -0,0 +1,36 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+#pragma once
+
+namespace seastar {
+
+/// Determines whether seastar should throw or abort when an operation performed
+/// by seastar fails because the target file descriptor is not valid. This is
+/// detected when the underlying system calls return EBADF or ENOTSOCK.
+/// The default behavior is to throw std::system_error.
+void set_abort_on_ebadf(bool do_abort);
+
+/// Queries the current setting for seastar's behavior on invalid file descriptor access.
+/// See set_abort_on_ebadf().
+bool is_abort_on_ebadf_enabled();
+
+}
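
A minimal usage sketch for the header above (configure_fd_error_policy() is a hypothetical helper; the setting applies process-wide):

    #include <seastar/core/abort_on_ebadf.hh>

    // Opt into aborting (and core-dumping) on EBADF/ENOTSOCK instead of the
    // default behaviour of throwing std::system_error, so use-after-close
    // bugs fail loudly.
    void configure_fd_error_policy(bool strict) {
        seastar::set_abort_on_ebadf(strict);
        if (seastar::is_abort_on_ebadf_enabled()) {
            // the stricter policy is now active
        }
    }
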
diff --git a/src/seastar/include/seastar/core/abort_source.hh b/src/seastar/include/seastar/core/abort_source.hh
new file mode 100644
index 000000000..e2a4f66bf
--- /dev/null
+++ b/src/seastar/include/seastar/core/abort_source.hh
@@ -0,0 +1,139 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2017 ScyllaDB.
+ */
+
+#pragma once
+
+#include <seastar/util/noncopyable_function.hh>
+#include <seastar/util/optimized_optional.hh>
+#include <seastar/util/std-compat.hh>
+
+#include <boost/intrusive/list.hpp>
+
+#include <exception>
+
+namespace bi = boost::intrusive;
+
+namespace seastar {
+
+/// \addtogroup fiber-module
+/// @{
+
+/// Exception thrown when an \ref abort_source object has been
+/// notified by the \ref abort_source::request_abort() method.
+class abort_requested_exception : public std::exception {
+public:
+ virtual const char* what() const noexcept override {
+ return "abort requested";
+ }
+};
+
+/// Facility to communicate a cancellation request to a fiber.
+/// Callbacks can be registered with the \c abort_source, which are called
+/// atomically with a call to request_abort().
+class abort_source {
+ using subscription_callback_type = noncopyable_function<void() noexcept>;
+
+public:
+ /// Represents a handle to the callback registered by a given fiber. Ending the
+ /// lifetime of the \c subscription will unregister the callback, if it hasn't
+ /// been invoked yet.
+ class subscription : public bi::list_base_hook<bi::link_mode<bi::auto_unlink>> {
+ friend class abort_source;
+
+ abort_source* _as = nullptr;
+ subscription_callback_type _target;
+
+ explicit subscription(abort_source& as, subscription_callback_type target)
+ : _as(&as)
+ , _target(std::move(target)) {
+ as._subscriptions->push_back(*this);
+ }
+
+ void on_abort() {
+ _target();
+ }
+
+ public:
+ subscription() = default;
+
+ subscription(subscription&& other) noexcept(std::is_nothrow_move_constructible<subscription_callback_type>::value)
+ : _as(other._as)
+ , _target(std::move(other._target)) {
+ subscription_list_type::node_algorithms::swap_nodes(other.this_ptr(), this_ptr());
+ }
+
+ subscription& operator=(subscription&& other) noexcept(std::is_nothrow_move_assignable<subscription_callback_type>::value) {
+ if (this != &other) {
+ _target = std::move(other._target);
+ _as = other._as;
+ if (is_linked()) {
+ subscription_list_type::node_algorithms::unlink(this_ptr());
+ }
+ subscription_list_type::node_algorithms::swap_nodes(other.this_ptr(), this_ptr());
+ }
+ return *this;
+ }
+
+ explicit operator bool() const noexcept {
+ return _as != nullptr;
+ }
+ };
+
+private:
+ using subscription_list_type = bi::list<subscription, bi::constant_time_size<false>>;
+ std::optional<subscription_list_type> _subscriptions = subscription_list_type();
+
+public:
+ /// Delays the invocation of the callback \c f until \ref request_abort() is called.
+ /// \returns an engaged \ref optimized_optional containing a \ref subscription that can be used to control
+ /// the lifetime of the callback \c f, if \ref abort_requested() is \c false. Otherwise,
+ /// returns a disengaged \ref optimized_optional.
+ optimized_optional<subscription> subscribe(subscription_callback_type f) noexcept(std::is_nothrow_move_constructible<subscription_callback_type>::value) {
+ if (abort_requested()) {
+ return { };
+ }
+ return { subscription(*this, std::move(f)) };
+ }
+
+ /// Requests that the target operation be aborted. Current subscriptions
+ /// are invoked inline with this call, and no new ones can be registered.
+ void request_abort() {
+ _subscriptions->clear_and_dispose([] (subscription* s) { s->on_abort(); });
+ _subscriptions = { };
+ }
+
+ /// Returns whether an abort has been requested.
+ bool abort_requested() const noexcept {
+ return !_subscriptions;
+ }
+
+
+ /// Throws a \ref abort_requested_exception if cancellation has been requested.
+ void check() const {
+ if (abort_requested()) {
+ throw abort_requested_exception();
+ }
+ }
+};
+
+/// @}
+
+}
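
A usage sketch for abort_source (nap_until_aborted() is a hypothetical fiber; assumes the caller keeps the abort_source alive and calls request_abort() on the same shard):

    #include <seastar/core/abort_source.hh>
    #include <seastar/core/loop.hh>
    #include <seastar/core/sleep.hh>
    #include <chrono>

    seastar::future<> nap_until_aborted(seastar::abort_source& as) {
        // The callback runs inline from request_abort() and must not throw.
        auto sub = as.subscribe([]() noexcept { /* e.g. note the cancellation */ });
        return seastar::do_until(
            [&as] { return as.abort_requested(); },
            [] { return seastar::sleep(std::chrono::milliseconds(10)); }
        ).finally([sub = std::move(sub)] {}); // keep the subscription registered
    }
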
diff --git a/src/seastar/include/seastar/core/alien.hh b/src/seastar/include/seastar/core/alien.hh
new file mode 100644
index 000000000..43510d4ea
--- /dev/null
+++ b/src/seastar/include/seastar/core/alien.hh
@@ -0,0 +1,184 @@
+// -*- mode:C++; tab-width:4; c-basic-offset:4; indent-tabs-mode:nil -*-
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2018 Red Hat
+ */
+
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <future>
+#include <memory>
+
+#include <boost/lockfree/queue.hpp>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/cacheline.hh>
+#include <seastar/core/sstring.hh>
+#include <seastar/core/metrics_registration.hh>
+
+/// \file
+
+namespace seastar {
+
+class reactor;
+
+/// \brief Integration with non-seastar applications.
+namespace alien {
+
+class message_queue {
+ static constexpr size_t batch_size = 128;
+ static constexpr size_t prefetch_cnt = 2;
+ struct work_item;
+ struct lf_queue_remote {
+ reactor* remote;
+ };
+ using lf_queue_base = boost::lockfree::queue<work_item*>;
+ // use inheritance to control placement order
+ struct lf_queue : lf_queue_remote, lf_queue_base {
+ lf_queue(reactor* remote)
+ : lf_queue_remote{remote}, lf_queue_base{batch_size} {}
+ void maybe_wakeup();
+ } _pending;
+ struct alignas(seastar::cache_line_size) {
+ std::atomic<size_t> value{0};
+ } _sent;
+ // keep this between two structures with statistics
+ // this makes sure that they have at least one cache line
+ // between them, so hw prefetcher will not accidentally prefetch
+ // cache line used by another cpu.
+ metrics::metric_groups _metrics;
+ struct alignas(seastar::cache_line_size) {
+ size_t _received = 0;
+ size_t _last_rcv_batch = 0;
+ };
+ struct work_item {
+ virtual ~work_item() = default;
+ virtual void process() = 0;
+ };
+ template <typename Func>
+ struct async_work_item : work_item {
+ Func _func;
+ async_work_item(Func&& func) : _func(std::move(func)) {}
+ void process() override {
+ _func();
+ }
+ };
+ template<typename Func>
+ size_t process_queue(lf_queue& q, Func process);
+ void submit_item(std::unique_ptr<work_item> wi);
+public:
+ message_queue(reactor *to);
+ void start();
+ void stop();
+ template <typename Func>
+ void submit(Func&& func) {
+ auto wi = std::make_unique<async_work_item<Func>>(std::forward<Func>(func));
+ submit_item(std::move(wi));
+ }
+ size_t process_incoming();
+ bool pure_poll_rx() const;
+};
+
+class smp {
+ struct qs_deleter {
+ unsigned count;
+ qs_deleter(unsigned n = 0) : count(n) {}
+ qs_deleter(const qs_deleter& d) : count(d.count) {}
+ void operator()(message_queue* qs) const;
+ };
+ using qs = std::unique_ptr<message_queue[], qs_deleter>;
+public:
+ static qs create_qs(const std::vector<reactor*>& reactors);
+ static qs _qs;
+ static bool poll_queues();
+ static bool pure_poll_queues();
+};
+
+/// Runs a function on a remote shard from an alien thread where engine() is not available.
+///
+/// \param shard designates the shard to run the function on
+/// \param func a callable to run on \c shard. If \c func is a temporary object,
+/// its lifetime will be extended by moving it. If \c func is a reference,
+/// the caller must guarantee that it will survive the call.
+/// \note \c func must not throw and should return \c void. Since we cannot identify the
+/// alien thread, we are not able to post the fulfilled promise to a message
+/// queue owned by it, so the return value (or an exception) cannot be
+/// delivered back to the caller. Please use \c submit_to() instead if
+/// \c func may throw.
+template <typename Func>
+void run_on(unsigned shard, Func func) {
+ smp::_qs[shard].submit(std::move(func));
+}
+
+namespace internal {
+template<typename Func>
+using return_value_t = typename futurize<std::invoke_result_t<Func>>::value_type;
+
+template<typename Func,
+ bool = std::is_empty_v<return_value_t<Func>>>
+struct return_type_of {
+ using type = void;
+ static void set(std::promise<void>& p, return_value_t<Func>&&) {
+ p.set_value();
+ }
+};
+template<typename Func>
+struct return_type_of<Func, false> {
+ using return_tuple_t = typename futurize<std::invoke_result_t<Func>>::tuple_type;
+ using type = std::tuple_element_t<0, return_tuple_t>;
+ static void set(std::promise<type>& p, return_value_t<Func>&& t) {
+#if SEASTAR_API_LEVEL < 5
+ p.set_value(std::get<0>(std::move(t)));
+#else
+ p.set_value(std::move(t));
+#endif
+ }
+};
+template <typename Func> using return_type_t = typename return_type_of<Func>::type;
+}
+
+/// Runs a function on a remote shard from an alien thread where engine() is not available.
+///
+/// \param shard designates the shard to run the function on
+/// \param func a callable to run on \c shard. If \c func is a temporary object,
+/// its lifetime will be extended by moving it. If \c func is a reference,
+/// the caller must guarantee that it will survive the call.
+/// \return whatever \c func returns, as a \c std::future<>
+/// \note the caller must keep the returned future alive until \c func returns
+template<typename Func, typename T = internal::return_type_t<Func>>
+std::future<T> submit_to(unsigned shard, Func func) {
+ std::promise<T> pr;
+ auto fut = pr.get_future();
+ run_on(shard, [pr = std::move(pr), func = std::move(func)] () mutable {
+ // The result reaches the caller through the std::promise/std::future pair
+ // above, so the seastar future returned by then_wrapped() can be discarded.
+ (void)func().then_wrapped([pr = std::move(pr)] (auto&& result) mutable {
+ try {
+ internal::return_type_of<Func>::set(pr, result.get());
+ } catch (...) {
+ pr.set_exception(std::current_exception());
+ }
+ });
+ });
+ return fut;
+}
+
+}
+}
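
A sketch of calling into the reactor from a non-seastar thread via submit_to() (query_shard0_from_alien_thread() is hypothetical; assumes the Seastar application is already running):

    #include <seastar/core/alien.hh>
    #include <seastar/core/future.hh>
    #include <future>

    int query_shard0_from_alien_thread() {
        std::future<int> f = seastar::alien::submit_to(0, [] {
            // Runs on the reactor of shard 0 and returns a seastar future.
            return seastar::make_ready_future<int>(42);
        });
        return f.get(); // blocks the alien thread, never a reactor thread
    }
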
diff --git a/src/seastar/include/seastar/core/align.hh b/src/seastar/include/seastar/core/align.hh
new file mode 100644
index 000000000..5dd07c279
--- /dev/null
+++ b/src/seastar/include/seastar/core/align.hh
@@ -0,0 +1,55 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <cstdlib>
+
+namespace seastar {
+
+template <typename T>
+inline constexpr
+T align_up(T v, T align) {
+ return (v + align - 1) & ~(align - 1);
+}
+
+template <typename T>
+inline constexpr
+T* align_up(T* v, size_t align) {
+ static_assert(sizeof(T) == 1, "align byte pointers only");
+ return reinterpret_cast<T*>(align_up(reinterpret_cast<uintptr_t>(v), align));
+}
+
+template <typename T>
+inline constexpr
+T align_down(T v, T align) {
+ return v & ~(align - 1);
+}
+
+template <typename T>
+inline constexpr
+T* align_down(T* v, size_t align) {
+ static_assert(sizeof(T) == 1, "align byte pointers only");
+ return reinterpret_cast<T*>(align_down(reinterpret_cast<uintptr_t>(v), align));
+}
+
+}
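
Behaviour sketch (align must be a power of two for the bit tricks above to hold):

    #include <seastar/core/align.hh>
    #include <cassert>
    #include <cstddef>

    void align_examples() {
        assert(seastar::align_up<size_t>(5000, 4096) == 8192);   // round up to the next 4 KiB
        assert(seastar::align_down<size_t>(5000, 4096) == 4096); // round down
        assert(seastar::align_up<size_t>(8192, 4096) == 8192);   // already aligned: unchanged
    }
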
diff --git a/src/seastar/include/seastar/core/aligned_buffer.hh b/src/seastar/include/seastar/core/aligned_buffer.hh
new file mode 100644
index 000000000..2cc0f5a74
--- /dev/null
+++ b/src/seastar/include/seastar/core/aligned_buffer.hh
@@ -0,0 +1,45 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 ScyllaDB.
+ */
+#pragma once
+#include <stdlib.h>
+#include <memory>
+#include <stdexcept>
+
+namespace seastar {
+
+namespace internal {
+void* allocate_aligned_buffer_impl(size_t size, size_t align);
+}
+
+struct free_deleter {
+ void operator()(void* p) { ::free(p); }
+};
+
+template <typename CharType>
+inline
+std::unique_ptr<CharType[], free_deleter> allocate_aligned_buffer(size_t size, size_t align) {
+ static_assert(sizeof(CharType) == 1, "must allocate byte type");
+ void* ret = internal::allocate_aligned_buffer_impl(size, align);
+ return std::unique_ptr<CharType[], free_deleter>(reinterpret_cast<CharType *>(ret));
+}
+
+
+}
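
Usage sketch (a hypothetical 4 KiB buffer, e.g. for O_DIRECT-style I/O; free_deleter releases it with ::free()):

    #include <seastar/core/aligned_buffer.hh>
    #include <cstring>

    void aligned_buffer_example() {
        // 4 KiB buffer whose address is a multiple of 4096.
        auto buf = seastar::allocate_aligned_buffer<char>(4096, 4096);
        std::memset(buf.get(), 0, 4096);
    } // freed here via ::free()
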
diff --git a/src/seastar/include/seastar/core/app-template.hh b/src/seastar/include/seastar/core/app-template.hh
new file mode 100644
index 000000000..97fb65c38
--- /dev/null
+++ b/src/seastar/include/seastar/core/app-template.hh
@@ -0,0 +1,100 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+#pragma once
+
+#include <boost/program_options.hpp>
+#include <functional>
+#include <seastar/core/future.hh>
+#include <seastar/core/sstring.hh>
+#include <chrono>
+
+namespace seastar {
+
+class app_template {
+public:
+ struct config {
+ /// The name of the application.
+ ///
+ /// Will be used in the --help output to distinguish command line args
+ /// registered by the application, as opposed to those registered by
+ /// seastar and its subsystems.
+ sstring name = "App";
+ /// The description of the application.
+ ///
+ /// Will be printed on the top of the --help output. Lines should be
+ /// hard-wrapped for 80 chars.
+ sstring description = "";
+ std::chrono::duration<double> default_task_quota = std::chrono::microseconds(500);
+ /// \brief Handle SIGINT/SIGTERM by calling reactor::stop()
+ ///
+ /// When true, Seastar will set up signal handlers for SIGINT/SIGTERM that call
+ /// reactor::stop(). The reactor will then execute callbacks installed by
+ /// reactor::at_exit().
+ ///
+ /// When false, Seastar will not set up signal handlers for SIGINT/SIGTERM
+ /// automatically. The default behavior (terminate the program) will be kept.
+ /// You can adjust the behavior of SIGINT/SIGTERM by installing signal handlers
+ /// via reactor::handle_signal().
+ bool auto_handle_sigint_sigterm = true;
+ config() {}
+ };
+
+ using configuration_reader = std::function<void (boost::program_options::variables_map&)>;
+private:
+ config _cfg;
+ boost::program_options::options_description _opts;
+ boost::program_options::options_description _opts_conf_file;
+ boost::program_options::positional_options_description _pos_opts;
+ std::optional<boost::program_options::variables_map> _configuration;
+ configuration_reader _conf_reader;
+
+ configuration_reader get_default_configuration_reader();
+public:
+ struct positional_option {
+ const char* name;
+ const boost::program_options::value_semantic* value_semantic;
+ const char* help;
+ int max_count;
+ };
+public:
+ explicit app_template(config cfg = config());
+
+ boost::program_options::options_description& get_options_description();
+ boost::program_options::options_description& get_conf_file_options_description();
+ boost::program_options::options_description_easy_init add_options();
+ void add_positional_options(std::initializer_list<positional_option> options);
+ boost::program_options::variables_map& configuration();
+ int run_deprecated(int ac, char ** av, std::function<void ()>&& func);
+
+ void set_configuration_reader(configuration_reader conf_reader);
+
+ // Runs given function and terminates the application when the future it
+ // returns resolves. The value with which the future resolves will be
+ // returned by this function.
+ int run(int ac, char ** av, std::function<future<int> ()>&& func);
+
+ // Like run() which takes std::function<future<int>()>, but returns
+ // with exit code 0 when the future returned by func resolves
+ // successfully.
+ int run(int ac, char ** av, std::function<future<> ()>&& func);
+};
+
+}
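
A sketch of the canonical entry point using run(): the process exits with the value the returned future resolves to.

    #include <seastar/core/app-template.hh>
    #include <iostream>

    int main(int argc, char** argv) {
        seastar::app_template app;
        return app.run(argc, argv, [] {
            // Runs on shard 0 once the reactor is up.
            std::cout << "hello from shard 0\n";
            return seastar::make_ready_future<int>(0);
        });
    }
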
diff --git a/src/seastar/include/seastar/core/apply.hh b/src/seastar/include/seastar/core/apply.hh
new file mode 100644
index 000000000..0a6ae6e87
--- /dev/null
+++ b/src/seastar/include/seastar/core/apply.hh
@@ -0,0 +1,64 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <tuple>
+#include <utility>
+#include <cstddef>
+
+namespace seastar {
+
+template <typename Func, typename Args, typename IndexList>
+struct apply_helper;
+
+template <typename Func, typename Tuple, size_t... I>
+struct apply_helper<Func, Tuple, std::index_sequence<I...>> {
+ static auto apply(Func&& func, Tuple args) {
+ return func(std::get<I>(std::forward<Tuple>(args))...);
+ }
+};
+
+template <typename Func, typename... T>
+[[deprecated("use std::apply() instead")]]
+inline
+auto apply(Func&& func, std::tuple<T...>&& args) {
+ using helper = apply_helper<Func, std::tuple<T...>&&, std::index_sequence_for<T...>>;
+ return helper::apply(std::forward<Func>(func), std::move(args));
+}
+
+template <typename Func, typename... T>
+[[deprecated("use std::apply() instead")]]
+inline
+auto apply(Func&& func, std::tuple<T...>& args) {
+ using helper = apply_helper<Func, std::tuple<T...>&, std::index_sequence_for<T...>>;
+ return helper::apply(std::forward<Func>(func), args);
+}
+
+template <typename Func, typename... T>
+[[deprecated("use std::apply() instead")]]
+inline
+auto apply(Func&& func, const std::tuple<T...>& args) {
+ using helper = apply_helper<Func, const std::tuple<T...>&, std::index_sequence_for<T...>>;
+ return helper::apply(std::forward<Func>(func), args);
+}
+
+}
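
As the deprecation notices say, std::apply() is the drop-in replacement; a sketch of the migration (join_demo() is illustrative):

    #include <tuple>
    #include <string>

    std::string join_demo() {
        auto args = std::make_tuple(std::string("x"), std::string("y"));
        // Previously: seastar::apply(lambda, std::move(args));
        return std::apply([] (std::string a, std::string b) { return a + b; },
                          std::move(args));
    }
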
diff --git a/src/seastar/include/seastar/core/array_map.hh b/src/seastar/include/seastar/core/array_map.hh
new file mode 100644
index 000000000..14ec4118a
--- /dev/null
+++ b/src/seastar/include/seastar/core/array_map.hh
@@ -0,0 +1,50 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <array>
+
+namespace seastar {
+
+// unordered_map implemented as a simple array
+
+template <typename Value, size_t Max>
+class array_map {
+ std::array<Value, Max> _a {};
+public:
+ array_map(std::initializer_list<std::pair<size_t, Value>> i) {
+ for (auto kv : i) {
+ _a[kv.first] = kv.second;
+ }
+ }
+ Value& operator[](size_t key) { return _a[key]; }
+ const Value& operator[](size_t key) const { return _a[key]; }
+
+ Value& at(size_t key) {
+ if (key >= Max) {
+ throw std::out_of_range(std::to_string(key) + " >= " + std::to_string(Max));
+ }
+ return _a[key];
+ }
+};
+
+}
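
Usage sketch (keys must be small dense indices below Max; unset slots are value-initialized):

    #include <seastar/core/array_map.hh>
    #include <cassert>

    void array_map_example() {
        seastar::array_map<int, 8> m{{0, 10}, {3, 30}};
        assert(m[3] == 30);
        assert(m[1] == 0);     // never inserted, so value-initialized
        // m.at(9) would throw std::out_of_range
    }
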
diff --git a/src/seastar/include/seastar/core/bitops.hh b/src/seastar/include/seastar/core/bitops.hh
new file mode 100644
index 000000000..3ea178536
--- /dev/null
+++ b/src/seastar/include/seastar/core/bitops.hh
@@ -0,0 +1,73 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <limits>
+
+namespace seastar {
+
+inline
+constexpr unsigned count_leading_zeros(unsigned x) {
+ return __builtin_clz(x);
+}
+
+inline
+constexpr unsigned count_leading_zeros(unsigned long x) {
+ return __builtin_clzl(x);
+}
+
+inline
+constexpr unsigned count_leading_zeros(unsigned long long x) {
+ return __builtin_clzll(x);
+}
+
+inline
+constexpr unsigned count_trailing_zeros(unsigned x) {
+ return __builtin_ctz(x);
+}
+
+inline
+constexpr unsigned count_trailing_zeros(unsigned long x) {
+ return __builtin_ctzl(x);
+}
+
+inline
+constexpr unsigned count_trailing_zeros(unsigned long long x) {
+ return __builtin_ctzll(x);
+}
+
+template<typename T>
+//requires stdx::is_integral_v<T>
+inline constexpr unsigned log2ceil(T n) {
+ if (n == 1) {
+ return 0;
+ }
+ return std::numeric_limits<T>::digits - count_leading_zeros(n - 1);
+}
+
+template<typename T>
+//requires stdx::is_integral_v<T>
+inline constexpr unsigned log2floor(T n) {
+ return std::numeric_limits<T>::digits - count_leading_zeros(n) - 1;
+}
+
+}
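
Behaviour sketch (all of these are undefined for a zero argument, like the underlying builtins):

    #include <seastar/core/bitops.hh>
    #include <cassert>

    void bitops_example() {
        assert(seastar::log2floor(48u) == 5);            // 2^5 = 32 <= 48
        assert(seastar::log2ceil(48u) == 6);             // 2^6 = 64 >= 48
        assert(seastar::log2ceil(64u) == 6);             // exact powers map to themselves
        assert(seastar::count_trailing_zeros(64u) == 6);
    }
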
diff --git a/src/seastar/include/seastar/core/bitset-iter.hh b/src/seastar/include/seastar/core/bitset-iter.hh
new file mode 100644
index 000000000..d00744dd0
--- /dev/null
+++ b/src/seastar/include/seastar/core/bitset-iter.hh
@@ -0,0 +1,198 @@
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+/*
+ * Imported from OSv:
+ *
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+#pragma once
+
+#include <bitset>
+#include <limits>
+
+namespace seastar {
+
+namespace bitsets {
+
+static constexpr int ulong_bits = std::numeric_limits<unsigned long>::digits;
+
+/**
+ * Returns the number of leading zeros in value's binary representation.
+ *
+ * If value == 0 the result is undefined. If T is signed and value is negative
+ * the result is undefined.
+ *
+ * The highest value that can be returned is std::numeric_limits<T>::digits - 1,
+ * which is returned when value == 1.
+ */
+template<typename T>
+inline size_t count_leading_zeros(T value) noexcept;
+
+/**
+ * Returns the number of trailing zeros in value's binary representation.
+ *
+ * If value == 0 the result is undefined. If T is signed and value is negative
+ * the result is undefined.
+ *
+ * The highest value that can be returned is std::numeric_limits<T>::digits - 1.
+ */
+template<typename T>
+static inline size_t count_trailing_zeros(T value) noexcept;
+
+template<>
+inline size_t count_leading_zeros<unsigned long>(unsigned long value) noexcept
+{
+ return __builtin_clzl(value);
+}
+
+template<>
+inline size_t count_leading_zeros<long>(long value) noexcept
+{
+ return __builtin_clzl((unsigned long)value) - 1;
+}
+
+template<>
+inline size_t count_leading_zeros<unsigned long long>(unsigned long long value) noexcept
+{
+ return __builtin_clzll(value);
+}
+
+template<>
+inline size_t count_leading_zeros<long long>(long long value) noexcept
+{
+ return __builtin_clzll((unsigned long long)value) - 1;
+}
+
+template<>
+inline
+size_t count_trailing_zeros<unsigned long>(unsigned long value) noexcept
+{
+ return __builtin_ctzl(value);
+}
+
+template<>
+inline
+size_t count_trailing_zeros<long>(long value) noexcept
+{
+ return __builtin_ctzl((unsigned long)value);
+}
+
+template<>
+inline
+size_t count_trailing_zeros<unsigned long long>(unsigned long long value) noexcept
+{
+ return __builtin_ctzll(value);
+}
+
+template<>
+inline
+size_t count_trailing_zeros<long long>(long long value) noexcept
+{
+ return __builtin_ctzll((unsigned long long)value);
+}
+
+/**
+ * Returns the index of the first set bit.
+ * Result is undefined if bitset.any() == false.
+ */
+template<size_t N>
+static inline size_t get_first_set(const std::bitset<N>& bitset) noexcept
+{
+ static_assert(N <= ulong_bits, "bitset too large");
+ return count_trailing_zeros(bitset.to_ulong());
+}
+
+/**
+ * Returns the index of the last set bit in the bitset.
+ * Result is undefined if bitset.any() == false.
+ */
+template<size_t N>
+static inline size_t get_last_set(const std::bitset<N>& bitset) noexcept
+{
+ static_assert(N <= ulong_bits, "bitset too large");
+ return ulong_bits - 1 - count_leading_zeros(bitset.to_ulong());
+}
+
+template<size_t N>
+class set_iterator : public std::iterator<std::input_iterator_tag, int>
+{
+private:
+ void advance() noexcept
+ {
+ if (_bitset.none()) {
+ _index = -1;
+ } else {
+ auto shift = get_first_set(_bitset) + 1;
+ _index += shift;
+ _bitset >>= shift;
+ }
+ }
+public:
+ set_iterator(std::bitset<N> bitset, int offset = 0) noexcept
+ : _bitset(bitset)
+ , _index(offset - 1)
+ {
+ static_assert(N <= ulong_bits, "This implementation is inefficient for large bitsets");
+ _bitset >>= offset;
+ advance();
+ }
+
+ void operator++() noexcept
+ {
+ advance();
+ }
+
+ int operator*() const noexcept
+ {
+ return _index;
+ }
+
+ bool operator==(const set_iterator& other) const noexcept
+ {
+ return _index == other._index;
+ }
+
+ bool operator!=(const set_iterator& other) const noexcept
+ {
+ return !(*this == other);
+ }
+private:
+ std::bitset<N> _bitset;
+ int _index;
+};
+
+template<size_t N>
+class set_range
+{
+public:
+ using iterator = set_iterator<N>;
+ using value_type = int;
+
+ set_range(std::bitset<N> bitset, int offset = 0) noexcept
+ : _bitset(bitset)
+ , _offset(offset)
+ {
+ }
+
+ iterator begin() const noexcept { return iterator(_bitset, _offset); }
+ iterator end() const noexcept { return iterator(0); }
+private:
+ std::bitset<N> _bitset;
+ int _offset;
+};
+
+template<size_t N>
+static inline set_range<N> for_each_set(std::bitset<N> bitset, int offset = 0) noexcept
+{
+ return set_range<N>(bitset, offset);
+}
+
+}
+
+}
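
Usage sketch for for_each_set(), which visits the indices of the set bits in ascending order:

    #include <seastar/core/bitset-iter.hh>
    #include <bitset>
    #include <cassert>

    void bitset_iter_example() {
        std::bitset<16> b;
        b.set(1); b.set(4); b.set(9);
        int sum = 0;
        for (int idx : seastar::bitsets::for_each_set(b)) {
            sum += idx;        // visits 1, 4 and 9
        }
        assert(sum == 14);
    }
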
diff --git a/src/seastar/include/seastar/core/byteorder.hh b/src/seastar/include/seastar/core/byteorder.hh
new file mode 100644
index 000000000..8b5bc9276
--- /dev/null
+++ b/src/seastar/include/seastar/core/byteorder.hh
@@ -0,0 +1,127 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2015 Scylladb, Ltd.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <endian.h>
+#include <seastar/core/unaligned.hh>
+
+namespace seastar {
+
+inline uint8_t cpu_to_le(uint8_t x) noexcept { return x; }
+inline uint8_t le_to_cpu(uint8_t x) noexcept { return x; }
+inline uint16_t cpu_to_le(uint16_t x) noexcept { return htole16(x); }
+inline uint16_t le_to_cpu(uint16_t x) noexcept { return le16toh(x); }
+inline uint32_t cpu_to_le(uint32_t x) noexcept { return htole32(x); }
+inline uint32_t le_to_cpu(uint32_t x) noexcept { return le32toh(x); }
+inline uint64_t cpu_to_le(uint64_t x) noexcept { return htole64(x); }
+inline uint64_t le_to_cpu(uint64_t x) noexcept { return le64toh(x); }
+
+inline int8_t cpu_to_le(int8_t x) noexcept { return x; }
+inline int8_t le_to_cpu(int8_t x) noexcept { return x; }
+inline int16_t cpu_to_le(int16_t x) noexcept { return htole16(x); }
+inline int16_t le_to_cpu(int16_t x) noexcept { return le16toh(x); }
+inline int32_t cpu_to_le(int32_t x) noexcept { return htole32(x); }
+inline int32_t le_to_cpu(int32_t x) noexcept { return le32toh(x); }
+inline int64_t cpu_to_le(int64_t x) noexcept { return htole64(x); }
+inline int64_t le_to_cpu(int64_t x) noexcept { return le64toh(x); }
+
+inline uint8_t cpu_to_be(uint8_t x) noexcept { return x; }
+inline uint8_t be_to_cpu(uint8_t x) noexcept { return x; }
+inline uint16_t cpu_to_be(uint16_t x) noexcept { return htobe16(x); }
+inline uint16_t be_to_cpu(uint16_t x) noexcept { return be16toh(x); }
+inline uint32_t cpu_to_be(uint32_t x) noexcept { return htobe32(x); }
+inline uint32_t be_to_cpu(uint32_t x) noexcept { return be32toh(x); }
+inline uint64_t cpu_to_be(uint64_t x) noexcept { return htobe64(x); }
+inline uint64_t be_to_cpu(uint64_t x) noexcept { return be64toh(x); }
+
+inline int8_t cpu_to_be(int8_t x) noexcept { return x; }
+inline int8_t be_to_cpu(int8_t x) noexcept { return x; }
+inline int16_t cpu_to_be(int16_t x) noexcept { return htobe16(x); }
+inline int16_t be_to_cpu(int16_t x) noexcept { return be16toh(x); }
+inline int32_t cpu_to_be(int32_t x) noexcept { return htobe32(x); }
+inline int32_t be_to_cpu(int32_t x) noexcept { return be32toh(x); }
+inline int64_t cpu_to_be(int64_t x) noexcept { return htobe64(x); }
+inline int64_t be_to_cpu(int64_t x) noexcept { return be64toh(x); }
+
+template <typename T>
+inline T cpu_to_le(const unaligned<T>& v) noexcept {
+ return cpu_to_le(T(v));
+}
+
+template <typename T>
+inline T le_to_cpu(const unaligned<T>& v) noexcept {
+ return le_to_cpu(T(v));
+}
+
+template <typename T>
+inline
+T
+read_le(const char* p) noexcept {
+ T datum;
+ std::copy_n(p, sizeof(T), reinterpret_cast<char*>(&datum));
+ return le_to_cpu(datum);
+}
+
+template <typename T>
+inline
+void
+write_le(char* p, T datum) noexcept {
+ datum = cpu_to_le(datum);
+ std::copy_n(reinterpret_cast<const char*>(&datum), sizeof(T), p);
+}
+
+template <typename T>
+inline
+T
+read_be(const char* p) noexcept {
+ T datum;
+ std::copy_n(p, sizeof(T), reinterpret_cast<char*>(&datum));
+ return be_to_cpu(datum);
+}
+
+template <typename T>
+inline
+void
+write_be(char* p, T datum) noexcept {
+ datum = cpu_to_be(datum);
+ std::copy_n(reinterpret_cast<const char*>(&datum), sizeof(T), p);
+}
+
+template <typename T>
+inline
+T
+consume_be(const char*& p) noexcept {
+ auto ret = read_be<T>(p);
+ p += sizeof(T);
+ return ret;
+}
+
+template <typename T>
+inline
+void
+produce_be(char*& p, T datum) noexcept {
+ write_be<T>(p, datum);
+ p += sizeof(T);
+}
+
+}
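
A round-trip sketch with the big-endian helpers (produce_be()/consume_be() advance the cursor by sizeof(T)):

    #include <seastar/core/byteorder.hh>
    #include <cstdint>
    #include <cassert>

    void byteorder_example() {
        char buf[8];
        char* out = buf;
        seastar::produce_be<uint32_t>(out, 0x01020304);  // writes network order, advances out
        seastar::produce_be<uint32_t>(out, 0xdeadbeef);
        const char* in = buf;
        assert(seastar::consume_be<uint32_t>(in) == 0x01020304);
        assert(seastar::consume_be<uint32_t>(in) == 0xdeadbeef);
    }
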
diff --git a/src/seastar/include/seastar/core/cacheline.hh b/src/seastar/include/seastar/core/cacheline.hh
new file mode 100644
index 000000000..89bb3846c
--- /dev/null
+++ b/src/seastar/include/seastar/core/cacheline.hh
@@ -0,0 +1,42 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2017 IBM.
+ */
+
+#pragma once
+
+#include <cstddef>
+
+namespace seastar {
+
+// Platform-dependent cache line size for alignment and padding purposes.
+static constexpr size_t cache_line_size =
+#if defined(__x86_64__) || defined(__i386__)
+ 64;
+#elif defined(__s390x__) || defined(__zarch__)
+ 256;
+#elif defined(__PPC64__)
+ 128;
+#elif defined(__aarch64__)
+ 128; // from Linux, may vary among different microarchitectures?
+#else
+#error "cache_line_size not defined for this architecture"
+#endif
+
+}
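
A typical use of the constant: padding per-CPU counters onto separate cache lines to avoid false sharing (struct names here are illustrative):

    #include <seastar/core/cacheline.hh>
    #include <atomic>
    #include <cstdint>

    // Each counter occupies its own cache line, so writers on different
    // CPUs do not invalidate each other's lines.
    struct alignas(seastar::cache_line_size) padded_counter {
        std::atomic<uint64_t> value{0};
    };

    struct stats {
        padded_counter produced;
        padded_counter consumed;
    };
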
diff --git a/src/seastar/include/seastar/core/checked_ptr.hh b/src/seastar/include/seastar/core/checked_ptr.hh
new file mode 100644
index 000000000..a4d10d26d
--- /dev/null
+++ b/src/seastar/include/seastar/core/checked_ptr.hh
@@ -0,0 +1,199 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+#pragma once
+
+/// \file
+/// \brief Contains a seastar::checked_ptr class implementation.
+
+#include <exception>
+#include <seastar/util/concepts.hh>
+
+/// \namespace seastar
+namespace seastar {
+
+/// The exception thrown by a default_null_deref_action.
+class checked_ptr_is_null_exception : public std::exception {};
+
+/// \brief
+/// Default not engaged seastar::checked_ptr dereferencing action (functor).
+///
+/// Throws a seastar::checked_ptr_is_null_exception.
+///
+struct default_null_deref_action {
+ /// \throw seastar::checked_ptr_is_null_exception
+ void operator()() const {
+ throw checked_ptr_is_null_exception();
+ }
+};
+
+/// \cond internal
+/// \namespace seastar::internal
+namespace internal {
+
+/// \name seastar::checked_ptr::get() helpers
+/// Helper functions that simplify the seastar::checked_ptr::get() implementation.
+/// @{
+
+/// Invokes the get() method of a smart pointer object.
+/// \param ptr A smart pointer object
+/// \return A pointer to the underlying object
+template <typename T>
+/// cond SEASTAR_CONCEPT_DOC - nested '\ cond' doesn't seem to work (bug 736553), so working around it
+SEASTAR_CONCEPT( requires requires (T ptr) {
+ ptr.get();
+})
+/// endcond
+inline typename std::pointer_traits<std::remove_const_t<T>>::element_type* checked_ptr_do_get(T& ptr) {
+ return ptr.get();
+}
+
+/// Return a pointer itself for a naked pointer argument.
+/// \param ptr A naked pointer object
+/// \return An input naked pointer object
+template <typename T>
+inline T* checked_ptr_do_get(T* ptr) noexcept {
+ return ptr;
+}
+/// @}
+}
+/// \endcond
+
+/// \class seastar::checked_ptr
+/// \brief
+/// seastar::checked_ptr class is a wrapper class that may be used with any pointer type
+/// (smart like std::unique_ptr or raw pointers like int*).
+///
+/// The seastar::checked_ptr object will invoke the NullDerefAction functor if
+/// it is dereferenced when the underlying pointer is not engaged.
+///
+/// It may still be assigned, compared to other seastar::checked_ptr objects or
+/// moved without limitations.
+///
+/// The default NullDerefAction will throw a seastar::checked_ptr_is_null_exception.
+///
+/// \tparam NullDerefAction a functor that is invoked when a user tries to dereference a not engaged pointer.
+///
+template<typename Ptr, typename NullDerefAction = default_null_deref_action>
+/// \cond SEASTAR_CONCEPT_DOC
+SEASTAR_CONCEPT( requires std::is_default_constructible<NullDerefAction>::value && requires (NullDerefAction action) {
+ NullDerefAction();
+})
+/// \endcond
+class checked_ptr {
+public:
+ /// Underlying element type
+ using element_type = typename std::pointer_traits<Ptr>::element_type;
+
+ /// Type of the pointer to the underlying element
+ using pointer = element_type*;
+
+private:
+ Ptr _ptr = nullptr;
+
+private:
+ /// Invokes a NullDerefAction functor if the underlying pointer is not engaged.
+ void check() const {
+ if (!_ptr) {
+ NullDerefAction()();
+ }
+ }
+
+public:
+ checked_ptr() noexcept(noexcept(Ptr(nullptr))) = default;
+ checked_ptr(std::nullptr_t) noexcept(std::is_nothrow_default_constructible<checked_ptr<Ptr, NullDerefAction>>::value) : checked_ptr() {}
+ checked_ptr(Ptr&& ptr) noexcept(std::is_nothrow_move_constructible<Ptr>::value) : _ptr(std::move(ptr)) {}
+ checked_ptr(const Ptr& p) noexcept(std::is_nothrow_copy_constructible<Ptr>::value) : _ptr(p) {}
+
+ /// \name Checked Methods
+ /// These methods start with invoking a NullDerefAction functor if the underlying pointer is not engaged.
+ /// @{
+
+ /// Invokes the get() method of the underlying smart pointer or returns the pointer itself for a raw pointer (const variant).
+ /// \return The pointer to the underlying object
+ pointer get() const {
+ check();
+ return internal::checked_ptr_do_get(_ptr);
+ }
+
+ /// Gets a reference to the underlying pointer object (const variant).
+ /// \return The underlying pointer object
+ const Ptr& operator->() const {
+ check();
+ return _ptr;
+ }
+
+ /// Gets a reference to the underlying pointer object.
+ /// \return The underlying pointer object
+ Ptr& operator->() {
+ check();
+ return _ptr;
+ }
+
+ /// Gets the reference to the underlying object (const variant).
+ /// \return The reference to the underlying object
+ const element_type& operator*() const {
+ check();
+ return *_ptr;
+ }
+
+ /// Gets the reference to the underlying object.
+ /// \return The reference to the underlying object
+ element_type& operator*() {
+ check();
+ return *_ptr;
+ }
+ /// @}
+
+ /// \name Unchecked methods
+ /// These methods may be invoked when the underlying pointer is not engaged.
+ /// @{
+
+ /// Checks if the underlying pointer is engaged.
+ /// \return TRUE if the underlying pointer is engaged
+ explicit operator bool() const { return bool(_ptr); }
+
+ bool operator==(const checked_ptr& other) const { return _ptr == other._ptr; }
+ bool operator!=(const checked_ptr& other) const { return _ptr != other._ptr; }
+
+ /// Gets the hash value for the underlying pointer object.
+ /// \return The hash value for the underlying pointer object
+ size_t hash() const {
+ return std::hash<Ptr>()(_ptr);
+ }
+ ///@}
+};
+
+}
+
+namespace std {
+/// std::hash specialization for seastar::checked_ptr class
+template<typename T>
+struct hash<seastar::checked_ptr<T>> {
+ /// Get the hash value for the given seastar::checked_ptr object.
+ /// The hash will calculated using the seastar::checked_ptr::hash method.
+ /// \param p object for hash value calculation
+ /// \return The hash value for the given object
+ size_t operator()(const seastar::checked_ptr<T>& p) const {
+ return p.hash();
+ }
+};
+}
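
Usage sketch wrapping a std::unique_ptr (dereferencing a disengaged checked_ptr throws checked_ptr_is_null_exception instead of being undefined behaviour):

    #include <seastar/core/checked_ptr.hh>
    #include <memory>
    #include <cassert>

    void checked_ptr_example() {
        seastar::checked_ptr<std::unique_ptr<int>> p(std::make_unique<int>(7));
        assert(*p == 7);

        seastar::checked_ptr<std::unique_ptr<int>> empty;
        try {
            (void)*empty;      // not engaged: default_null_deref_action fires
            assert(false);
        } catch (const seastar::checked_ptr_is_null_exception&) {
            // expected
        }
    }
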
diff --git a/src/seastar/include/seastar/core/chunked_fifo.hh b/src/seastar/include/seastar/core/chunked_fifo.hh
new file mode 100644
index 000000000..cfe4a7eaf
--- /dev/null
+++ b/src/seastar/include/seastar/core/chunked_fifo.hh
@@ -0,0 +1,626 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 ScyllaDB Ltd.
+ */
+
+#pragma once
+
+#include <memory>
+#include <algorithm>
+
+namespace seastar {
+
+// An unbounded FIFO queue of objects of type T.
+//
+// It provides operations to push items in one end of the queue, and pop them
+// from the other end of the queue - both operations are guaranteed O(1)
+// (not just amortized O(1)). The size() operation is also O(1).
+// chunked_fifo also guarantees that the largest contiguous memory allocation
+// it does is O(1). The total memory used is, of course, O(N).
+//
+// How does chunked_fifo differ from std::list<>, circular_buffer<> and
+// std::deque()?
+//
+// std::list<> can also make all the above guarantees, but is inefficient -
+// both at run speed (every operation requires an allocation), and in memory
+// use. Much more efficient than std::list<> is our circular_buffer<>, which
+// allocates a contiguous array to hold the items and only reallocates it,
+// exponentially, when the queue grows. On one test of several different
+// push/pop scenarios, circular_buffer<> was between 5 and 20 times faster
+// than std::list, and also used considerably less memory.
+// The problem with circular_buffer<> is that it gives up on the last guarantee
+// we made above: circular_buffer<> allocates all the items in one large
+// contiguous allocation - that might not be possible when the memory is
+// highly fragmented.
+// std::deque<> aims to solve the contiguous allocation problem by allocating
+// smaller chunks of the queue, and keeping a list of them in an array. This
+// array is necessary to allow for O(1) random access to any element, a
+// feature which we do not need; but this array is itself contiguous, so
+// std::deque<> attempts larger contiguous allocations the larger the queue
+// gets: std::deque<>'s contiguous allocation is still O(N) and in fact
+// exactly 1/64 of the size of circular_buffer<>'s contiguous allocation.
+// So it's an improvement over circular_buffer<>, but not a full solution.
+//
+// chunked_fifo<> is such a solution: it also allocates the queue in fixed-
+// size chunks (just like std::deque) but holds them in a linked list, not
+// a contiguous array, so there are no large contiguous allocations.
+//
+// Unlike std::deque<> or circular_buffer<>, chunked_fifo only provides the
+// operations needed by std::queue, i.e.,: empty(), size(), front(), back(),
+// push_back() and pop_front(). For simplicity, we do *not* implement other
+// possible operations, like inserting or deleting elements from the "wrong"
+// side of the queue or from the middle, nor random-access to items in the
+// middle of the queue. However, chunked_fifo does allow iterating over all
+// of the queue's elements without popping them, a feature which std::queue
+// is missing.
+//
+// Another feature of chunked_fifo which std::deque is missing is the ability
+// to control the chunk size, as a template parameter. In std::deque the
+// chunk size is undocumented and fixed - in gcc, it is always 512 bytes.
+// chunked_fifo, on the other hand, makes the chunk size (in number of items
+// instead of bytes) a template parameter; in situations where the queue is
+// expected to become very long, using a larger chunk size might make sense
+// because it will result in fewer allocations.
+//
+// chunked_fifo uses uninitialized storage for unoccupied elements, and can
+// therefore use move/copy constructors instead of the less efficient
+// move/copy assignments.
+
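+// A minimal usage sketch (illustrative only; consume() is an assumed helper,
+// not part of this header):
+//
+//   seastar::chunked_fifo<int> q;   // default chunk size of 128 items
+//   q.push_back(1);
+//   q.emplace_back(2);
+//   for (auto& v : q) {             // iterate without popping
+//       consume(v);
+//   }
+//   while (!q.empty()) {
+//       q.pop_front();
+//   }
+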
+template <typename T, size_t items_per_chunk = 128>
+class chunked_fifo {
+ static_assert((items_per_chunk & (items_per_chunk - 1)) == 0,
+ "chunked_fifo chunk size must be power of two");
+ union maybe_item {
+ maybe_item() noexcept {}
+ ~maybe_item() {}
+ T data;
+ };
+ struct chunk {
+ maybe_item items[items_per_chunk];
+ struct chunk* next;
+ // begin and end interpreted mod items_per_chunk
+ unsigned begin;
+ unsigned end;
+ };
+ // We pop from the chunk at _front_chunk. This chunk is then linked to
+ // the following chunks via the "next" link. _back_chunk points to the
+ // last chunk in this list, and it is where we push.
+ chunk* _front_chunk = nullptr; // where we pop
+ chunk* _back_chunk = nullptr; // where we push
+ // We want an O(1) size but don't want to maintain a size() counter
+ // because this will slow down every push and pop operation just for
+ // the rare size() call. Instead, we just keep a count of chunks (which
+ // doesn't change on every push or pop), from which we can calculate
+ // size() when needed, and still be O(1).
+ // This relies on the invariant that all chunks except the front and the
+ // back one are always full.
+ size_t _nchunks = 0;
+ // A list of freed chunks, to support reserve() and to improve
+ // performance of repeated push and pop, especially on an empty queue.
+ // It is a performance/memory tradeoff how many freed chunks to keep
+ // here (see save_free_chunks constant below).
+ chunk* _free_chunks = nullptr;
+ size_t _nfree_chunks = 0;
+public:
+ using value_type = T;
+ using size_type = size_t;
+ using reference = T&;
+ using pointer = T*;
+ using const_reference = const T&;
+ using const_pointer = const T*;
+
+private:
+ template <typename U>
+ class basic_iterator {
+ friend class chunked_fifo;
+
+ public:
+ using iterator_category = std::forward_iterator_tag;
+ using difference_type = std::ptrdiff_t;
+ using value_type = U;
+ using pointer = U*;
+ using reference = U&;
+
+ protected:
+ chunk* _chunk = nullptr;
+ size_t _item_index = 0;
+
+ protected:
+ inline explicit basic_iterator(chunk* c) noexcept;
+ inline basic_iterator(chunk* c, size_t item_index) noexcept;
+
+ public:
+ inline bool operator==(const basic_iterator& o) const noexcept;
+ inline bool operator!=(const basic_iterator& o) const noexcept;
+ inline pointer operator->() const noexcept;
+ inline reference operator*() const noexcept;
+ inline basic_iterator operator++(int) noexcept;
+ basic_iterator& operator++() noexcept;
+ };
+
+public:
+ class iterator : public basic_iterator<T> {
+ using basic_iterator<T>::basic_iterator;
+ public:
+ iterator() noexcept = default;
+ };
+ class const_iterator : public basic_iterator<const T> {
+ using basic_iterator<const T>::basic_iterator;
+ public:
+ const_iterator() noexcept = default;
+ inline const_iterator(iterator o) noexcept;
+ };
+
+public:
+ chunked_fifo() noexcept = default;
+ chunked_fifo(chunked_fifo&& x) noexcept;
+ chunked_fifo(const chunked_fifo& X) = delete;
+ ~chunked_fifo();
+ chunked_fifo& operator=(const chunked_fifo&) = delete;
+ chunked_fifo& operator=(chunked_fifo&&) noexcept;
+ inline void push_back(const T& data);
+ inline void push_back(T&& data);
+ T& back() noexcept;
+ const T& back() const noexcept;
+ template <typename... A>
+ inline void emplace_back(A&&... args);
+ inline T& front() const noexcept;
+ inline void pop_front() noexcept;
+ inline bool empty() const noexcept;
+ inline size_t size() const noexcept;
+ void clear() noexcept;
+ // reserve(n) ensures that at least (n - size()) further push() calls can
+ // be served without needing new memory allocation.
+ // Calling pop() between these push()es is also allowed and does not
+ // alter this guarantee.
+ // Note that reserve() does not reduce the amount of memory already
+ // reserved - use shrink_to_fit() for that.
+ void reserve(size_t n);
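+ // For example (an illustrative sketch, assuming the default chunk size of
+ // 128 items):
+ //   chunked_fifo<int> q;
+ //   q.reserve(100);                 // allocates at most one chunk up front
+ //   for (int i = 0; i < 100; ++i) {
+ //       q.push_back(i);             // no further allocation is needed here
+ //   }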
+ // shrink_to_fit() frees memory held, but unused, by the queue. Such
+ // unused memory might exist after pops, or because of reserve().
+ void shrink_to_fit() noexcept;
+ inline iterator begin() noexcept;
+ inline iterator end() noexcept;
+ inline const_iterator begin() const noexcept;
+ inline const_iterator end() const noexcept;
+ inline const_iterator cbegin() const noexcept;
+ inline const_iterator cend() const noexcept;
+private:
+ void back_chunk_new();
+ void front_chunk_delete() noexcept;
+ inline void ensure_room_back();
+ void undo_room_back() noexcept;
+ static inline size_t mask(size_t idx) noexcept;
+
+};
+
+template <typename T, size_t items_per_chunk>
+template <typename U>
+inline
+chunked_fifo<T, items_per_chunk>::basic_iterator<U>::basic_iterator(chunk* c) noexcept : _chunk(c), _item_index(_chunk ? _chunk->begin : 0) {
+}
+
+template <typename T, size_t items_per_chunk>
+template <typename U>
+inline
+chunked_fifo<T, items_per_chunk>::basic_iterator<U>::basic_iterator(chunk* c, size_t item_index) noexcept : _chunk(c), _item_index(item_index) {
+}
+
+template <typename T, size_t items_per_chunk>
+template <typename U>
+inline bool
+chunked_fifo<T, items_per_chunk>::basic_iterator<U>::operator==(const basic_iterator& o) const noexcept {
+ return _chunk == o._chunk && _item_index == o._item_index;
+}
+
+template <typename T, size_t items_per_chunk>
+template <typename U>
+inline bool
+chunked_fifo<T, items_per_chunk>::basic_iterator<U>::operator!=(const basic_iterator& o) const noexcept {
+ return !(*this == o);
+}
+
+template <typename T, size_t items_per_chunk>
+template <typename U>
+inline typename chunked_fifo<T, items_per_chunk>::template basic_iterator<U>::pointer
+chunked_fifo<T, items_per_chunk>::basic_iterator<U>::operator->() const noexcept {
+ return &_chunk->items[chunked_fifo::mask(_item_index)].data;
+}
+
+template <typename T, size_t items_per_chunk>
+template <typename U>
+inline typename chunked_fifo<T, items_per_chunk>::template basic_iterator<U>::reference
+chunked_fifo<T, items_per_chunk>::basic_iterator<U>::operator*() const noexcept {
+ return _chunk->items[chunked_fifo::mask(_item_index)].data;
+}
+
+template <typename T, size_t items_per_chunk>
+template <typename U>
+inline typename chunked_fifo<T, items_per_chunk>::template basic_iterator<U>
+chunked_fifo<T, items_per_chunk>::basic_iterator<U>::operator++(int) noexcept {
+ auto it = *this;
+ ++(*this);
+ return it;
+}
+
+template <typename T, size_t items_per_chunk>
+template <typename U>
+typename chunked_fifo<T, items_per_chunk>::template basic_iterator<U>&
+chunked_fifo<T, items_per_chunk>::basic_iterator<U>::operator++() noexcept {
+ ++_item_index;
+ if (_item_index == _chunk->end) {
+ _chunk = _chunk->next;
+ _item_index = _chunk ? _chunk->begin : 0;
+ }
+ return *this;
+}
+
+template <typename T, size_t items_per_chunk>
+inline
+chunked_fifo<T, items_per_chunk>::const_iterator::const_iterator(chunked_fifo<T, items_per_chunk>::iterator o) noexcept
+ : basic_iterator<const T>(o._chunk, o._item_index) {
+}
+
+template <typename T, size_t items_per_chunk>
+inline
+chunked_fifo<T, items_per_chunk>::chunked_fifo(chunked_fifo&& x) noexcept
+ : _front_chunk(x._front_chunk)
+ , _back_chunk(x._back_chunk)
+ , _nchunks(x._nchunks)
+ , _free_chunks(x._free_chunks)
+ , _nfree_chunks(x._nfree_chunks) {
+ x._front_chunk = nullptr;
+ x._back_chunk = nullptr;
+ x._nchunks = 0;
+ x._free_chunks = nullptr;
+ x._nfree_chunks = 0;
+}
+
+template <typename T, size_t items_per_chunk>
+inline
+chunked_fifo<T, items_per_chunk>&
+chunked_fifo<T, items_per_chunk>::operator=(chunked_fifo&& x) noexcept {
+ if (&x != this) {
+ this->~chunked_fifo();
+ new (this) chunked_fifo(std::move(x));
+ }
+ return *this;
+}
+
+template <typename T, size_t items_per_chunk>
+inline size_t
+chunked_fifo<T, items_per_chunk>::mask(size_t idx) noexcept {
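+ // items_per_chunk is a power of two (enforced by the static_assert in the
+ // class definition), so reducing an index modulo the chunk size is a single
+ // bitwise AND.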
+ return idx & (items_per_chunk - 1);
+}
+
+template <typename T, size_t items_per_chunk>
+inline bool
+chunked_fifo<T, items_per_chunk>::empty() const noexcept {
+ return _front_chunk == nullptr;
+}
+
+template <typename T, size_t items_per_chunk>
+inline size_t
+chunked_fifo<T, items_per_chunk>::size() const noexcept {
+ if (_front_chunk == nullptr) {
+ return 0;
+ } else if (_back_chunk == _front_chunk) {
+ // Single chunk.
+ return _front_chunk->end - _front_chunk->begin;
+ } else {
+ return _front_chunk->end - _front_chunk->begin
+ + _back_chunk->end - _back_chunk->begin
+ + (_nchunks - 2) * items_per_chunk;
+ }
+}
+
+template <typename T, size_t items_per_chunk>
+void chunked_fifo<T, items_per_chunk>::clear() noexcept {
+#if 1
+ while (!empty()) {
+ pop_front();
+ }
+#else
+ // This is specialized code to free the contents of all the chunks and the
+ // chunks themselves. But since destroying a very full queue is not an
+ // important use case to optimize, the simple loop above is preferable.
+ if (!_front_chunk) {
+ // Empty, nothing to do
+ return;
+ }
+ // Delete front chunk (partially filled)
+ for (auto i = _front_chunk->begin; i != _front_chunk->end; ++i) {
+ _front_chunk->items[mask(i)].data.~T();
+ }
+ chunk *p = _front_chunk->next;
+ delete _front_chunk;
+ // Delete all the middle chunks (all completely filled)
+ if (p) {
+ while (p != _back_chunk) {
+ // These are full chunks
+ chunk *nextp = p->next;
+ for (size_t i = 0; i != items_per_chunk; ++i) {
+ // Note we delete out of order (we don't start with p->begin).
+ // That should be fine..
+ p->items[i].data.~T();
+ }
+ delete p;
+ p = nextp;
+ }
+ // Finally delete back chunk (partially filled)
+ for (auto i = _back_chunk->begin; i != _back_chunk->end; ++i) {
+ _back_chunk->items[mask(i)].data.~T();
+ }
+ delete _back_chunk;
+ }
+ _front_chunk = nullptr;
+ _back_chunk = nullptr;
+ _nchunks = 0;
+#endif
+}
+
+template <typename T, size_t items_per_chunk> void
+chunked_fifo<T, items_per_chunk>::shrink_to_fit() noexcept {
+ while (_free_chunks) {
+ auto next = _free_chunks->next;
+ delete _free_chunks;
+ _free_chunks = next;
+ }
+ _nfree_chunks = 0;
+}
+
+template <typename T, size_t items_per_chunk>
+chunked_fifo<T, items_per_chunk>::~chunked_fifo() {
+ clear();
+ shrink_to_fit();
+}
+
+template <typename T, size_t items_per_chunk>
+void
+chunked_fifo<T, items_per_chunk>::back_chunk_new() {
+ chunk *old = _back_chunk;
+ if (_free_chunks) {
+ _back_chunk = _free_chunks;
+ _free_chunks = _free_chunks->next;
+ --_nfree_chunks;
+ } else {
+ _back_chunk = new chunk;
+ }
+ _back_chunk->next = nullptr;
+ _back_chunk->begin = 0;
+ _back_chunk->end = 0;
+ if (old) {
+ old->next = _back_chunk;
+ }
+ if (_front_chunk == nullptr) {
+ _front_chunk = _back_chunk;
+ }
+ _nchunks++;
+}
+
+
+template <typename T, size_t items_per_chunk>
+inline void
+chunked_fifo<T, items_per_chunk>::ensure_room_back() {
+ // If we don't have a back chunk or it's full, we need to create a new one
+ if (_back_chunk == nullptr ||
+ (_back_chunk->end - _back_chunk->begin) == items_per_chunk) {
+ back_chunk_new();
+ }
+}
+
+template <typename T, size_t items_per_chunk>
+void
+chunked_fifo<T, items_per_chunk>::undo_room_back() noexcept {
+ // If we failed creating a new item after ensure_room_back() created a
+ // new empty chunk, we must remove it, or empty() will be incorrect
+ // (either immediately, if the fifo was empty, or when all the items are
+ // popped, if it already had items).
+ if (_back_chunk->begin == _back_chunk->end) {
+ delete _back_chunk;
+ --_nchunks;
+ if (_nchunks == 0) {
+ _back_chunk = nullptr;
+ _front_chunk = nullptr;
+ } else {
+ // Because we don't usually pop from the back, we don't have a "prev"
+ // pointer, so we need to find the previous chunk the hard and slow
+ // way.
+ chunk *old = _back_chunk;
+ _back_chunk = _front_chunk;
+ while (_back_chunk->next != old) {
+ _back_chunk = _back_chunk->next;
+ }
+ _back_chunk->next = nullptr;
+ }
+ }
+
+}
+
+template <typename T, size_t items_per_chunk>
+template <typename... Args>
+inline void
+chunked_fifo<T, items_per_chunk>::emplace_back(Args&&... args) {
+ ensure_room_back();
+ auto p = &_back_chunk->items[mask(_back_chunk->end)].data;
+ try {
+ new(p) T(std::forward<Args>(args)...);
+ } catch(...) {
+ undo_room_back();
+ throw;
+ }
+ ++_back_chunk->end;
+}
+
+template <typename T, size_t items_per_chunk>
+inline void
+chunked_fifo<T, items_per_chunk>::push_back(const T& data) {
+ ensure_room_back();
+ auto p = &_back_chunk->items[mask(_back_chunk->end)].data;
+ try {
+ new(p) T(data);
+ } catch(...) {
+ undo_room_back();
+ throw;
+ }
+ ++_back_chunk->end;
+}
+
+template <typename T, size_t items_per_chunk>
+inline void
+chunked_fifo<T, items_per_chunk>::push_back(T&& data) {
+ ensure_room_back();
+ auto p = &_back_chunk->items[mask(_back_chunk->end)].data;
+ try {
+ new(p) T(std::move(data));
+ } catch(...) {
+ undo_room_back();
+ throw;
+ }
+ ++_back_chunk->end;
+}
+
+template <typename T, size_t items_per_chunk>
+inline
+T&
+chunked_fifo<T, items_per_chunk>::back() noexcept {
+ return _back_chunk->items[mask(_back_chunk->end - 1)].data;
+}
+
+template <typename T, size_t items_per_chunk>
+inline
+const T&
+chunked_fifo<T, items_per_chunk>::back() const noexcept {
+ return _back_chunk->items[mask(_back_chunk->end - 1)].data;
+}
+
+template <typename T, size_t items_per_chunk>
+inline T&
+chunked_fifo<T, items_per_chunk>::front() const noexcept {
+ return _front_chunk->items[mask(_front_chunk->begin)].data;
+}
+
+template <typename T, size_t items_per_chunk>
+inline void
+chunked_fifo<T, items_per_chunk>::front_chunk_delete() noexcept {
+ chunk *next = _front_chunk->next;
+ // Certain use cases may need to repeatedly allocate and free a chunk -
+ // an obvious example is an empty queue to which we push, and then pop,
+ // repeatedly. Another example is pushing and popping on a non-empty queue:
+ // we push and pop at different chunks, so we need to free and allocate a
+ // chunk every items_per_chunk operations.
+ // The solution is to keep a list of freed chunks instead of freeing them
+ // immediately. There is a performance/memory tradeoff of how many freed
+ // chunks to save: If we save them all, the queue can never shrink from
+ // its maximum memory use (this is how circular_buffer behaves).
+ // The ad-hoc choice made here is to limit the number of saved chunks to 1,
+ // but this could easily be made a configuration option.
+ static constexpr int save_free_chunks = 1;
+ if (_nfree_chunks < save_free_chunks) {
+ _front_chunk->next = _free_chunks;
+ _free_chunks = _front_chunk;
+ ++_nfree_chunks;
+ } else {
+ delete _front_chunk;
+ }
+ // If we only had one chunk, _back_chunk is gone too.
+ if (_back_chunk == _front_chunk) {
+ _back_chunk = nullptr;
+ }
+ _front_chunk = next;
+ --_nchunks;
+}
+
+template <typename T, size_t items_per_chunk>
+inline void
+chunked_fifo<T, items_per_chunk>::pop_front() noexcept {
+ front().~T();
+ // If the front chunk has become empty, we need to remove it and use
+ // the next one.
+ if (++_front_chunk->begin == _front_chunk->end) {
+ front_chunk_delete();
+ }
+}
+
+template <typename T, size_t items_per_chunk>
+void chunked_fifo<T, items_per_chunk>::reserve(size_t n) {
+ // reserve() guarantees that (n - size()) additional push()es will
+ // succeed without reallocation:
+ if (n <= size()) {
+ return;
+ }
+ size_t need = n - size();
+ // If we already have a back chunk, it might have room for some pushes
+ // before filling up, so decrease "need":
+ if (_back_chunk) {
+ size_t back_chunk_n = items_per_chunk - (_back_chunk->end - _back_chunk->begin);
+ need -= std::min(back_chunk_n, need);
+ }
+ size_t needed_chunks = (need + items_per_chunk - 1) / items_per_chunk;
+ // If we already have some freed chunks saved, we need to allocate fewer
+ // additional chunks, or none at all
+ if (needed_chunks <= _nfree_chunks) {
+ return;
+ }
+ needed_chunks -= _nfree_chunks;
+ while (needed_chunks--) {
+ chunk *c = new chunk;
+ c->next = _free_chunks;
+ _free_chunks = c;
+ ++_nfree_chunks;
+ }
+}
+
+template <typename T, size_t items_per_chunk>
+inline typename chunked_fifo<T, items_per_chunk>::iterator
+chunked_fifo<T, items_per_chunk>::begin() noexcept {
+ return iterator(_front_chunk);
+}
+
+template <typename T, size_t items_per_chunk>
+inline typename chunked_fifo<T, items_per_chunk>::iterator
+chunked_fifo<T, items_per_chunk>::end() noexcept {
+ return iterator(nullptr);
+}
+
+template <typename T, size_t items_per_chunk>
+inline typename chunked_fifo<T, items_per_chunk>::const_iterator
+chunked_fifo<T, items_per_chunk>::begin() const noexcept {
+ return const_iterator(_front_chunk);
+}
+
+template <typename T, size_t items_per_chunk>
+inline typename chunked_fifo<T, items_per_chunk>::const_iterator
+chunked_fifo<T, items_per_chunk>::end() const noexcept {
+ return const_iterator(nullptr);
+}
+
+template <typename T, size_t items_per_chunk>
+inline typename chunked_fifo<T, items_per_chunk>::const_iterator
+chunked_fifo<T, items_per_chunk>::cbegin() const noexcept {
+ return const_iterator(_front_chunk);
+}
+
+template <typename T, size_t items_per_chunk>
+inline typename chunked_fifo<T, items_per_chunk>::const_iterator
+chunked_fifo<T, items_per_chunk>::cend() const noexcept {
+ return const_iterator(nullptr);
+}
+
+}
diff --git a/src/seastar/include/seastar/core/circular_buffer.hh b/src/seastar/include/seastar/core/circular_buffer.hh
new file mode 100644
index 000000000..d1444e4c2
--- /dev/null
+++ b/src/seastar/include/seastar/core/circular_buffer.hh
@@ -0,0 +1,511 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/transfer.hh>
+#include <seastar/core/bitops.hh>
+#include <seastar/util/concepts.hh>
+#include <memory>
+#include <algorithm>
+
+namespace seastar {
+
+/// A growable double-ended queue container that can be efficiently
+/// extended (and shrunk) from both ends. Implementation is a single
+/// storage vector.
+///
+/// Similar to libstdc++'s std::deque, except that it uses a single
+/// level store, and so is more efficient for simple stored items.
+/// Similar to boost::circular_buffer_space_optimized, except it uses
+/// uninitialized storage for unoccupied elements (and thus move/copy
+/// constructors instead of move/copy assignments, which are less
+/// efficient).
+///
+/// The storage of the circular_buffer is expanded automatically in
+/// exponential increments.
+/// When adding new elements:
+/// * if size + 1 > capacity: all iterators and references are
+/// invalidated,
+/// * otherwise only the begin() or end() iterator is invalidated:
+/// * push_front() and emplace_front() will invalidate begin() and
+/// * push_back() and emplace_back() will invalidate end().
+///
+/// Removing elements never invalidates any references and only
+/// invalidates begin() or end() iterators:
+/// * pop_front() will invalidate begin() and
+/// * pop_back() will invalidate end().
+///
+/// reserve() may also invalidate all iterators and references.
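+///
+/// A minimal usage sketch (illustrative only):
+/// \code
+/// seastar::circular_buffer<int> q;
+/// q.push_back(2);   // may grow the storage; growth invalidates everything
+/// q.push_front(1);  // otherwise only begin() is invalidated
+/// assert(q.front() == 1 && q.back() == 2);
+/// q.pop_back();     // never invalidates references, only end()
+/// \endcode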
+template <typename T, typename Alloc = std::allocator<T>>
+class circular_buffer {
+ struct impl : Alloc {
+ T* storage = nullptr;
+ // begin, end interpreted (mod capacity)
+ size_t begin = 0;
+ size_t end = 0;
+ size_t capacity = 0;
+
+ impl(Alloc a) noexcept : Alloc(std::move(a)) { }
+ void reset() {
+ storage = {};
+ begin = 0;
+ end = 0;
+ capacity = 0;
+ }
+ };
+ static_assert(!std::is_default_constructible_v<Alloc>
+ || std::is_nothrow_default_constructible_v<Alloc>);
+ static_assert(std::is_nothrow_move_constructible_v<Alloc>);
+ impl _impl;
+public:
+ using value_type = T;
+ using size_type = size_t;
+ using reference = T&;
+ using pointer = T*;
+ using const_reference = const T&;
+ using const_pointer = const T*;
+public:
+ circular_buffer() noexcept SEASTAR_CONCEPT(requires std::default_initializable<Alloc>) : circular_buffer(Alloc()) {}
+ circular_buffer(Alloc alloc) noexcept;
+ circular_buffer(circular_buffer&& X) noexcept;
+ circular_buffer(const circular_buffer& X) = delete;
+ ~circular_buffer();
+ circular_buffer& operator=(const circular_buffer&) = delete;
+ circular_buffer& operator=(circular_buffer&& b) noexcept;
+ void push_front(const T& data);
+ void push_front(T&& data);
+ template <typename... A>
+ void emplace_front(A&&... args);
+ void push_back(const T& data);
+ void push_back(T&& data);
+ template <typename... A>
+ void emplace_back(A&&... args);
+ T& front() noexcept;
+ const T& front() const noexcept;
+ T& back() noexcept;
+ const T& back() const noexcept;
+ void pop_front() noexcept;
+ void pop_back() noexcept;
+ bool empty() const;
+ size_t size() const;
+ size_t capacity() const;
+ void reserve(size_t);
+ void clear();
+ T& operator[](size_t idx) noexcept;
+ const T& operator[](size_t idx) const noexcept;
+ template <typename Func>
+ void for_each(Func func);
+ // access an element, may return wrong or destroyed element
+ // only useful if you do not rely on data accuracy (e.g. prefetch)
+ T& access_element_unsafe(size_t idx) noexcept;
+private:
+ void expand();
+ void expand(size_t);
+ void maybe_expand(size_t nr = 1);
+ size_t mask(size_t idx) const;
+
+ template<typename CB, typename ValueType>
+ struct cbiterator : std::iterator<std::random_access_iterator_tag, ValueType> {
+ typedef std::iterator<std::random_access_iterator_tag, ValueType> super_t;
+
+ ValueType& operator*() const noexcept { return cb->_impl.storage[cb->mask(idx)]; }
+ ValueType* operator->() const noexcept { return &cb->_impl.storage[cb->mask(idx)]; }
+ // prefix
+ cbiterator<CB, ValueType>& operator++() noexcept {
+ idx++;
+ return *this;
+ }
+ // postfix
+ cbiterator<CB, ValueType> operator++(int unused) noexcept {
+ auto v = *this;
+ idx++;
+ return v;
+ }
+ // prefix
+ cbiterator<CB, ValueType>& operator--() noexcept {
+ idx--;
+ return *this;
+ }
+ // postfix
+ cbiterator<CB, ValueType> operator--(int unused) noexcept {
+ auto v = *this;
+ idx--;
+ return v;
+ }
+ cbiterator<CB, ValueType> operator+(typename super_t::difference_type n) const noexcept {
+ return cbiterator<CB, ValueType>(cb, idx + n);
+ }
+ cbiterator<CB, ValueType> operator-(typename super_t::difference_type n) const noexcept {
+ return cbiterator<CB, ValueType>(cb, idx - n);
+ }
+ cbiterator<CB, ValueType>& operator+=(typename super_t::difference_type n) noexcept {
+ idx += n;
+ return *this;
+ }
+ cbiterator<CB, ValueType>& operator-=(typename super_t::difference_type n) noexcept {
+ idx -= n;
+ return *this;
+ }
+ bool operator==(const cbiterator<CB, ValueType>& rhs) const noexcept {
+ return idx == rhs.idx;
+ }
+ bool operator!=(const cbiterator<CB, ValueType>& rhs) const noexcept {
+ return idx != rhs.idx;
+ }
+ bool operator<(const cbiterator<CB, ValueType>& rhs) const noexcept {
+ return idx < rhs.idx;
+ }
+ bool operator>(const cbiterator<CB, ValueType>& rhs) const noexcept {
+ return idx > rhs.idx;
+ }
+ bool operator>=(const cbiterator<CB, ValueType>& rhs) const noexcept {
+ return idx >= rhs.idx;
+ }
+ bool operator<=(const cbiterator<CB, ValueType>& rhs) const noexcept {
+ return idx <= rhs.idx;
+ }
+ typename super_t::difference_type operator-(const cbiterator<CB, ValueType>& rhs) const noexcept {
+ return idx - rhs.idx;
+ }
+ private:
+ CB* cb;
+ size_t idx;
+ cbiterator(CB* b, size_t i) noexcept : cb(b), idx(i) {}
+ friend class circular_buffer;
+ };
+ friend class iterator;
+
+public:
+ typedef cbiterator<circular_buffer, T> iterator;
+ typedef cbiterator<const circular_buffer, const T> const_iterator;
+
+ iterator begin() noexcept {
+ return iterator(this, _impl.begin);
+ }
+ const_iterator begin() const noexcept {
+ return const_iterator(this, _impl.begin);
+ }
+ iterator end() noexcept {
+ return iterator(this, _impl.end);
+ }
+ const_iterator end() const noexcept {
+ return const_iterator(this, _impl.end);
+ }
+ const_iterator cbegin() const noexcept {
+ return const_iterator(this, _impl.begin);
+ }
+ const_iterator cend() const noexcept {
+ return const_iterator(this, _impl.end);
+ }
+ iterator erase(iterator first, iterator last) noexcept;
+};
+
+template <typename T, typename Alloc>
+inline
+size_t
+circular_buffer<T, Alloc>::mask(size_t idx) const {
+ return idx & (_impl.capacity - 1);
+}
+
+template <typename T, typename Alloc>
+inline
+bool
+circular_buffer<T, Alloc>::empty() const {
+ return _impl.begin == _impl.end;
+}
+
+template <typename T, typename Alloc>
+inline
+size_t
+circular_buffer<T, Alloc>::size() const {
+ return _impl.end - _impl.begin;
+}
+
+template <typename T, typename Alloc>
+inline
+size_t
+circular_buffer<T, Alloc>::capacity() const {
+ return _impl.capacity;
+}
+
+template <typename T, typename Alloc>
+inline
+void
+circular_buffer<T, Alloc>::reserve(size_t size) {
+ if (capacity() < size) {
+ // Make sure that the new capacity is a power of two.
+ expand(size_t(1) << log2ceil(size));
+ }
+}
+
+template <typename T, typename Alloc>
+inline
+void
+circular_buffer<T, Alloc>::clear() {
+ erase(begin(), end());
+}
+
+template <typename T, typename Alloc>
+inline
+circular_buffer<T, Alloc>::circular_buffer(Alloc alloc) noexcept
+ : _impl(std::move(alloc)) {
+}
+
+template <typename T, typename Alloc>
+inline
+circular_buffer<T, Alloc>::circular_buffer(circular_buffer&& x) noexcept
+ : _impl(std::move(x._impl)) {
+ x._impl.reset();
+}
+
+template <typename T, typename Alloc>
+inline
+circular_buffer<T, Alloc>& circular_buffer<T, Alloc>::operator=(circular_buffer&& x) noexcept {
+ if (this != &x) {
+ this->~circular_buffer();
+ new (this) circular_buffer(std::move(x));
+ }
+ return *this;
+}
+
+template <typename T, typename Alloc>
+template <typename Func>
+inline
+void
+circular_buffer<T, Alloc>::for_each(Func func) {
+ auto s = _impl.storage;
+ auto m = _impl.capacity - 1;
+ for (auto i = _impl.begin; i != _impl.end; ++i) {
+ func(s[i & m]);
+ }
+}
+
+template <typename T, typename Alloc>
+inline
+circular_buffer<T, Alloc>::~circular_buffer() {
+ for_each([this] (T& obj) {
+ std::allocator_traits<Alloc>::destroy(_impl, &obj);
+ });
+ _impl.deallocate(_impl.storage, _impl.capacity);
+}
+
+template <typename T, typename Alloc>
+void
+circular_buffer<T, Alloc>::expand() {
+ expand(std::max<size_t>(_impl.capacity * 2, 1));
+}
+
+template <typename T, typename Alloc>
+void
+circular_buffer<T, Alloc>::expand(size_t new_cap) {
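+ // Two-pass growth (see transfer.hh): transfer_pass1() constructs every
+ // element in the new storage and transfer_pass2() completes the transfer.
+ // If a constructor throws during the first pass, the catch block below
+ // destroys whatever was already built and releases the new storage, leaving
+ // the buffer unchanged.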
+ auto new_storage = _impl.allocate(new_cap);
+ auto p = new_storage;
+ try {
+ for_each([this, &p] (T& obj) {
+ transfer_pass1(_impl, &obj, p);
+ p++;
+ });
+ } catch (...) {
+ while (p != new_storage) {
+ std::allocator_traits<Alloc>::destroy(_impl, --p);
+ }
+ _impl.deallocate(new_storage, new_cap);
+ throw;
+ }
+ p = new_storage;
+ for_each([this, &p] (T& obj) {
+ transfer_pass2(_impl, &obj, p++);
+ });
+ std::swap(_impl.storage, new_storage);
+ std::swap(_impl.capacity, new_cap);
+ _impl.begin = 0;
+ _impl.end = p - _impl.storage;
+ _impl.deallocate(new_storage, new_cap);
+}
+
+template <typename T, typename Alloc>
+inline
+void
+circular_buffer<T, Alloc>::maybe_expand(size_t nr) {
+ if (_impl.end - _impl.begin + nr > _impl.capacity) {
+ expand();
+ }
+}
+
+template <typename T, typename Alloc>
+inline
+void
+circular_buffer<T, Alloc>::push_front(const T& data) {
+ maybe_expand();
+ auto p = &_impl.storage[mask(_impl.begin - 1)];
+ std::allocator_traits<Alloc>::construct(_impl, p, data);
+ --_impl.begin;
+}
+
+template <typename T, typename Alloc>
+inline
+void
+circular_buffer<T, Alloc>::push_front(T&& data) {
+ maybe_expand();
+ auto p = &_impl.storage[mask(_impl.begin - 1)];
+ std::allocator_traits<Alloc>::construct(_impl, p, std::move(data));
+ --_impl.begin;
+}
+
+template <typename T, typename Alloc>
+template <typename... Args>
+inline
+void
+circular_buffer<T, Alloc>::emplace_front(Args&&... args) {
+ maybe_expand();
+ auto p = &_impl.storage[mask(_impl.begin - 1)];
+ std::allocator_traits<Alloc>::construct(_impl, p, std::forward<Args>(args)...);
+ --_impl.begin;
+}
+
+template <typename T, typename Alloc>
+inline
+void
+circular_buffer<T, Alloc>::push_back(const T& data) {
+ maybe_expand();
+ auto p = &_impl.storage[mask(_impl.end)];
+ std::allocator_traits<Alloc>::construct(_impl, p, data);
+ ++_impl.end;
+}
+
+template <typename T, typename Alloc>
+inline
+void
+circular_buffer<T, Alloc>::push_back(T&& data) {
+ maybe_expand();
+ auto p = &_impl.storage[mask(_impl.end)];
+ std::allocator_traits<Alloc>::construct(_impl, p, std::move(data));
+ ++_impl.end;
+}
+
+template <typename T, typename Alloc>
+template <typename... Args>
+inline
+void
+circular_buffer<T, Alloc>::emplace_back(Args&&... args) {
+ maybe_expand();
+ auto p = &_impl.storage[mask(_impl.end)];
+ std::allocator_traits<Alloc>::construct(_impl, p, std::forward<Args>(args)...);
+ ++_impl.end;
+}
+
+template <typename T, typename Alloc>
+inline
+T&
+circular_buffer<T, Alloc>::front() noexcept {
+ return _impl.storage[mask(_impl.begin)];
+}
+
+template <typename T, typename Alloc>
+inline
+const T&
+circular_buffer<T, Alloc>::front() const noexcept {
+ return _impl.storage[mask(_impl.begin)];
+}
+
+template <typename T, typename Alloc>
+inline
+T&
+circular_buffer<T, Alloc>::back() noexcept {
+ return _impl.storage[mask(_impl.end - 1)];
+}
+
+template <typename T, typename Alloc>
+inline
+const T&
+circular_buffer<T, Alloc>::back() const noexcept {
+ return _impl.storage[mask(_impl.end - 1)];
+}
+
+template <typename T, typename Alloc>
+inline
+void
+circular_buffer<T, Alloc>::pop_front() noexcept {
+ std::allocator_traits<Alloc>::destroy(_impl, &front());
+ ++_impl.begin;
+}
+
+template <typename T, typename Alloc>
+inline
+void
+circular_buffer<T, Alloc>::pop_back() noexcept {
+ std::allocator_traits<Alloc>::destroy(_impl, &back());
+ --_impl.end;
+}
+
+template <typename T, typename Alloc>
+inline
+T&
+circular_buffer<T, Alloc>::operator[](size_t idx) noexcept {
+ return _impl.storage[mask(_impl.begin + idx)];
+}
+
+template <typename T, typename Alloc>
+inline
+const T&
+circular_buffer<T, Alloc>::operator[](size_t idx) const noexcept {
+ return _impl.storage[mask(_impl.begin + idx)];
+}
+
+template <typename T, typename Alloc>
+inline
+T&
+circular_buffer<T, Alloc>::access_element_unsafe(size_t idx) noexcept {
+ return _impl.storage[mask(_impl.begin + idx)];
+}
+
+template <typename T, typename Alloc>
+inline
+typename circular_buffer<T, Alloc>::iterator
+circular_buffer<T, Alloc>::erase(iterator first, iterator last) noexcept {
+ static_assert(std::is_nothrow_move_assignable<T>::value, "erase() assumes move assignment does not throw");
+ if (first == last) {
+ return last;
+ }
+ // Move to the left or right depending on which would result in least amount of moves.
+ // This also guarantees that iterators will be stable when removing from either front or back.
+ if (std::distance(begin(), first) < std::distance(last, end())) {
+ auto new_start = std::move_backward(begin(), first, last);
+ auto i = begin();
+ while (i < new_start) {
+ std::allocator_traits<Alloc>::destroy(_impl, &*i++);
+ }
+ _impl.begin = new_start.idx;
+ return last;
+ } else {
+ auto new_end = std::move(last, end(), first);
+ auto i = new_end;
+ auto e = end();
+ while (i < e) {
+ std::allocator_traits<Alloc>::destroy(_impl, &*i++);
+ }
+ _impl.end = new_end.idx;
+ return first;
+ }
+}
+
+}
diff --git a/src/seastar/include/seastar/core/circular_buffer_fixed_capacity.hh b/src/seastar/include/seastar/core/circular_buffer_fixed_capacity.hh
new file mode 100644
index 000000000..7e7c093e2
--- /dev/null
+++ b/src/seastar/include/seastar/core/circular_buffer_fixed_capacity.hh
@@ -0,0 +1,378 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+#pragma once
+
+// A fixed capacity double-ended queue container that can be efficiently
+// extended (and shrunk) from both ends. Implementation is a single
+// storage vector.
+//
+// Similar to libstdc++'s std::deque, except that it uses a single level
+// store, and so is more efficient for simple stored items.
+
+#include <type_traits>
+#include <cstddef>
+#include <iterator>
+#include <utility>
+
+
+/// \file
+
+namespace seastar {
+
+/// A fixed-capacity container (like boost::static_vector) that can insert
+/// and remove at both ends (like std::deque). Does not allocate.
+///
+/// Does not perform overflow checking when size exceeds capacity.
+///
+/// \tparam T type of objects stored in the container; must be noexcept move enabled
+/// \tparam Capacity maximum number of objects that can be stored in the container; must be a power of 2
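+///
+/// A minimal usage sketch (illustrative only):
+/// \code
+/// seastar::circular_buffer_fixed_capacity<int, 8> q;  // Capacity must be a power of 2
+/// q.push_back(1);
+/// q.push_front(0);
+/// assert(q.size() == 2);  // the caller must keep size() <= Capacity; there is no overflow check
+/// q.pop_back();
+/// \endcode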
+template <typename T, size_t Capacity>
+class circular_buffer_fixed_capacity {
+ size_t _begin = 0;
+ size_t _end = 0;
+ union maybe_storage {
+ T data;
+ maybe_storage() noexcept {}
+ ~maybe_storage() {}
+ };
+ maybe_storage _storage[Capacity];
+private:
+ static size_t mask(size_t idx) { return idx % Capacity; }
+ T* obj(size_t idx) { return &_storage[mask(idx)].data; }
+ const T* obj(size_t idx) const { return &_storage[mask(idx)].data; }
+public:
+ static_assert((Capacity & (Capacity - 1)) == 0, "capacity must be a power of two");
+ static_assert(std::is_nothrow_move_constructible<T>::value && std::is_nothrow_move_assignable<T>::value,
+ "circular_buffer_fixed_capacity only supports nothrow-move value types");
+ using value_type = T;
+ using size_type = size_t;
+ using reference = T&;
+ using pointer = T*;
+ using const_reference = const T&;
+ using const_pointer = const T*;
+ using difference_type = ssize_t;
+public:
+ template <typename ValueType>
+ class cbiterator {
+ using holder = std::conditional_t<std::is_const<ValueType>::value, const maybe_storage, maybe_storage>;
+ holder* _start;
+ size_t _idx;
+ private:
+ cbiterator(holder* start, size_t idx) noexcept : _start(start), _idx(idx) {}
+ public:
+ using iterator_category = std::random_access_iterator_tag;
+ using value_type = ValueType;
+ using difference_type = ssize_t;
+ using pointer = ValueType*;
+ using reference = ValueType&;
+ public:
+ cbiterator();
+ ValueType& operator*() const { return _start[mask(_idx)].data; }
+ ValueType* operator->() const { return &operator*(); }
+ // prefix
+ cbiterator& operator++() {
+ ++_idx;
+ return *this;
+ }
+ // postfix
+ cbiterator operator++(int) {
+ auto v = *this;
+ ++_idx;
+ return v;
+ }
+ // prefix
+ cbiterator& operator--() {
+ --_idx;
+ return *this;
+ }
+ // postfix
+ cbiterator operator--(int) {
+ auto v = *this;
+ --_idx;
+ return v;
+ }
+ cbiterator operator+(difference_type n) const {
+ return cbiterator{_start, _idx + n};
+ }
+ friend cbiterator operator+(difference_type n, cbiterator i) {
+ return i + n;
+ }
+ cbiterator operator-(difference_type n) const {
+ return cbiterator{_start, _idx - n};
+ }
+ cbiterator& operator+=(difference_type n) {
+ _idx += n;
+ return *this;
+ }
+ cbiterator& operator-=(difference_type n) {
+ _idx -= n;
+ return *this;
+ }
+ bool operator==(const cbiterator& rhs) const {
+ return _idx == rhs._idx;
+ }
+ bool operator!=(const cbiterator& rhs) const {
+ return _idx != rhs._idx;
+ }
+ bool operator<(const cbiterator& rhs) const {
+ return ssize_t(_idx - rhs._idx) < 0;
+ }
+ bool operator>(const cbiterator& rhs) const {
+ return ssize_t(_idx - rhs._idx) > 0;
+ }
+ bool operator<=(const cbiterator& rhs) const {
+ return ssize_t(_idx - rhs._idx) <= 0;
+ }
+ bool operator>=(const cbiterator& rhs) const {
+ return ssize_t(_idx - rhs._idx) >= 0;
+ }
+ difference_type operator-(const cbiterator& rhs) const {
+ return _idx - rhs._idx;
+ }
+ friend class circular_buffer_fixed_capacity;
+ };
+public:
+ using iterator = cbiterator<T>;
+ using const_iterator = cbiterator<const T>;
+public:
+ circular_buffer_fixed_capacity() = default;
+ circular_buffer_fixed_capacity(circular_buffer_fixed_capacity&& x) noexcept;
+ ~circular_buffer_fixed_capacity();
+ circular_buffer_fixed_capacity& operator=(circular_buffer_fixed_capacity&& x) noexcept;
+ void push_front(const T& data);
+ void push_front(T&& data);
+ template <typename... A>
+ T& emplace_front(A&&... args);
+ void push_back(const T& data);
+ void push_back(T&& data);
+ template <typename... A>
+ T& emplace_back(A&&... args);
+ T& front();
+ T& back();
+ void pop_front();
+ void pop_back();
+ bool empty() const;
+ size_t size() const;
+ size_t capacity() const;
+ T& operator[](size_t idx);
+ void clear();
+ iterator begin() {
+ return iterator(_storage, _begin);
+ }
+ const_iterator begin() const {
+ return const_iterator(_storage, _begin);
+ }
+ iterator end() {
+ return iterator(_storage, _end);
+ }
+ const_iterator end() const {
+ return const_iterator(_storage, _end);
+ }
+ const_iterator cbegin() const {
+ return const_iterator(_storage, _begin);
+ }
+ const_iterator cend() const {
+ return const_iterator(_storage, _end);
+ }
+ iterator erase(iterator first, iterator last);
+};
+
+template <typename T, size_t Capacity>
+inline
+bool
+circular_buffer_fixed_capacity<T, Capacity>::empty() const {
+ return _begin == _end;
+}
+
+template <typename T, size_t Capacity>
+inline
+size_t
+circular_buffer_fixed_capacity<T, Capacity>::size() const {
+ return _end - _begin;
+}
+
+template <typename T, size_t Capacity>
+inline
+size_t
+circular_buffer_fixed_capacity<T, Capacity>::capacity() const {
+ return Capacity;
+}
+
+template <typename T, size_t Capacity>
+inline
+circular_buffer_fixed_capacity<T, Capacity>::circular_buffer_fixed_capacity(circular_buffer_fixed_capacity&& x) noexcept
+ : _begin(x._begin), _end(x._end) {
+ // This is std::uninitialized_move, but that is C++17 only
+ auto dest = begin();
+ for (auto& obj : x) {
+ new (&*dest++) T(std::move(obj));
+ }
+}
+
+template <typename T, size_t Capacity>
+inline
+circular_buffer_fixed_capacity<T, Capacity>&
+circular_buffer_fixed_capacity<T, Capacity>::operator=(circular_buffer_fixed_capacity&& x) noexcept {
+ if (this != &x) {
+ this->~circular_buffer_fixed_capacity();
+ new (this) circular_buffer_fixed_capacity(std::move(x));
+ }
+ return *this;
+}
+
+template <typename T, size_t Capacity>
+inline
+circular_buffer_fixed_capacity<T, Capacity>::~circular_buffer_fixed_capacity() {
+ clear();
+}
+
+template <typename T, size_t Capacity>
+inline
+void
+circular_buffer_fixed_capacity<T, Capacity>::push_front(const T& data) {
+ new (obj(_begin - 1)) T(data);
+ --_begin;
+}
+
+template <typename T, size_t Capacity>
+inline
+void
+circular_buffer_fixed_capacity<T, Capacity>::push_front(T&& data) {
+ new (obj(_begin - 1)) T(std::move(data));
+ --_begin;
+}
+
+template <typename T, size_t Capacity>
+template <typename... Args>
+inline
+T&
+circular_buffer_fixed_capacity<T, Capacity>::emplace_front(Args&&... args) {
+ auto p = new (obj(_begin - 1)) T(std::forward<Args>(args)...);
+ --_begin;
+ return *p;
+}
+
+template <typename T, size_t Capacity>
+inline
+void
+circular_buffer_fixed_capacity<T, Capacity>::push_back(const T& data) {
+ new (obj(_end)) T(data);
+ ++_end;
+}
+
+template <typename T, size_t Capacity>
+inline
+void
+circular_buffer_fixed_capacity<T, Capacity>::push_back(T&& data) {
+ new (obj(_end)) T(std::move(data));
+ ++_end;
+}
+
+template <typename T, size_t Capacity>
+template <typename... Args>
+inline
+T&
+circular_buffer_fixed_capacity<T, Capacity>::emplace_back(Args&&... args) {
+ auto p = new (obj(_end)) T(std::forward<Args>(args)...);
+ ++_end;
+ return *p;
+}
+
+template <typename T, size_t Capacity>
+inline
+T&
+circular_buffer_fixed_capacity<T, Capacity>::front() {
+ return *obj(_begin);
+}
+
+template <typename T, size_t Capacity>
+inline
+T&
+circular_buffer_fixed_capacity<T, Capacity>::back() {
+ return *obj(_end - 1);
+}
+
+template <typename T, size_t Capacity>
+inline
+void
+circular_buffer_fixed_capacity<T, Capacity>::pop_front() {
+ obj(_begin)->~T();
+ ++_begin;
+}
+
+template <typename T, size_t Capacity>
+inline
+void
+circular_buffer_fixed_capacity<T, Capacity>::pop_back() {
+ obj(_end - 1)->~T();
+ --_end;
+}
+
+template <typename T, size_t Capacity>
+inline
+T&
+circular_buffer_fixed_capacity<T, Capacity>::operator[](size_t idx) {
+ return *obj(_begin + idx);
+}
+
+template <typename T, size_t Capacity>
+inline
+typename circular_buffer_fixed_capacity<T, Capacity>::iterator
+circular_buffer_fixed_capacity<T, Capacity>::erase(iterator first, iterator last) {
+ static_assert(std::is_nothrow_move_assignable<T>::value, "erase() assumes move assignment does not throw");
+ if (first == last) {
+ return last;
+ }
+ // Move to the left or right depending on which would result in least amount of moves.
+ // This also guarantees that iterators will be stable when removing from either front or back.
+ if (std::distance(begin(), first) < std::distance(last, end())) {
+ auto new_start = std::move_backward(begin(), first, last);
+ auto i = begin();
+ while (i < new_start) {
+ (*i++).~T();
+ }
+ _begin = new_start._idx;
+ return last;
+ } else {
+ auto new_end = std::move(last, end(), first);
+ auto i = new_end;
+ auto e = end();
+ while (i < e) {
+ (*i++).~T();
+ }
+ _end = new_end._idx;
+ return first;
+ }
+}
+
+template <typename T, size_t Capacity>
+inline
+void
+circular_buffer_fixed_capacity<T, Capacity>::clear() {
+ for (auto& obj : *this) {
+ obj.~T();
+ }
+ _begin = _end = 0;
+}
+
+}
+
diff --git a/src/seastar/include/seastar/core/condition-variable.hh b/src/seastar/include/seastar/core/condition-variable.hh
new file mode 100644
index 000000000..fec28f285
--- /dev/null
+++ b/src/seastar/include/seastar/core/condition-variable.hh
@@ -0,0 +1,171 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 ScyllaDB, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/semaphore.hh>
+#include <seastar/core/loop.hh>
+
+namespace seastar {
+
+/// \addtogroup fiber-module
+/// @{
+
+/// Exception thrown when a condition variable is broken by
+/// \ref condition_variable::broken().
+class broken_condition_variable : public std::exception {
+public:
+ /// Reports the exception reason.
+ virtual const char* what() const noexcept;
+};
+
+/// Exception thrown when wait() operation times out
+/// \ref condition_variable::wait(time_point timeout).
+class condition_variable_timed_out : public std::exception {
+public:
+ /// Reports the exception reason.
+ virtual const char* what() const noexcept;
+};
+
+/// \brief Condition variable.
+///
+/// This is a standard computer science condition variable sans locking,
+/// since in seastar access to variables is atomic anyway, adapted
+/// for futures. You can wait for the variable to be notified.
+///
+/// To support exceptional conditions, a \ref broken() method
+/// is provided, which causes all current waiters to stop waiting,
+/// with an exceptional future returned. This allows causing all
+/// fibers that are blocked on a condition variable to continue.
+/// This is similar to POSIX's `pthread_cancel()`, with \ref wait()
+/// acting as a cancellation point.
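+///
+/// A minimal usage sketch (illustrative only; `cv`, `items` and the fiber
+/// structure around them are assumptions, not part of this header):
+///
+/// \code
+/// seastar::condition_variable cv;
+/// std::deque<int> items;
+///
+/// // consumer fiber: wait until an item is available, then take it
+/// seastar::future<> consume_one() {
+///     return cv.wait([&] { return !items.empty(); }).then([&] {
+///         items.pop_front();
+///     });
+/// }
+///
+/// // producer: publish an item and wake up one waiter
+/// void produce(int v) {
+///     items.push_back(v);
+///     cv.signal();
+/// }
+/// \endcode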
+
+class condition_variable {
+ using duration = semaphore::duration;
+ using clock = semaphore::clock;
+ using time_point = semaphore::time_point;
+ struct condition_variable_exception_factory {
+ static condition_variable_timed_out timeout() noexcept;
+ static broken_condition_variable broken() noexcept;
+ };
+ basic_semaphore<condition_variable_exception_factory> _sem;
+public:
+ /// Constructs a condition_variable object.
+ /// Initializes the semaphore with a value of 0 to ensure that
+ /// a wait() issued before any signal() does not complete immediately.
+ condition_variable() noexcept : _sem(0) {}
+
+ /// Waits until the condition variable is signaled; it may wake up without the condition being met.
+ ///
+ /// \return a future that becomes ready when \ref signal() is called.
+ /// If the condition variable was \ref broken(), it will return a \ref broken_condition_variable
+ /// exception.
+ future<> wait() noexcept {
+ return _sem.wait();
+ }
+
+ /// Waits until condition variable is signaled or timeout is reached
+ ///
+ /// \param timeout time point at which wait will exit with a timeout
+ ///
+ /// \return a future that becomes ready when \ref signal() is called.
+ /// If the condition variable was \ref broken(), it will return a \ref broken_condition_variable
+ /// exception. If the timeout is reached, it will return a \ref condition_variable_timed_out exception.
+ future<> wait(time_point timeout) noexcept {
+ return _sem.wait(timeout);
+ }
+
+ /// Waits until condition variable is signaled or timeout is reached
+ ///
+ /// \param timeout duration after which wait will exit with a timeout
+ ///
+ /// \return a future that becomes ready when \ref signal() is called.
+ /// If the condition variable was \ref broken(), it will return a \ref broken_condition_variable
+ /// exception. If the timeout expires, it will return a \ref condition_variable_timed_out exception.
+ future<> wait(duration timeout) noexcept {
+ return _sem.wait(timeout);
+ }
+
+ /// Waits until condition variable is notified and pred() == true, otherwise
+ /// wait again.
+ ///
+ /// \param pred predicate that checks that awaited condition is true
+ ///
+ /// \return a future that becomes ready when \ref signal() is called.
+ /// If the condition variable was \ref broken(), the future may contain an exception.
+ template<typename Pred>
+ future<> wait(Pred&& pred) noexcept {
+ return do_until(std::forward<Pred>(pred), [this] {
+ return wait();
+ });
+ }
+
+ /// Waits until condition variable is notified and pred() == true or timeout is reached, otherwise
+ /// wait again.
+ ///
+ /// \param timeout time point at which wait will exit with a timeout
+ /// \param pred predicate that checks that awaited condition is true
+ ///
+ /// \return a future that becomes ready when \ref signal() is called.
+ /// If the condition variable was \ref broken(), it will return a \ref broken_condition_variable
+ /// exception. If the timeout is reached, it will return a \ref condition_variable_timed_out exception.
+ template<typename Pred>
+ future<> wait(time_point timeout, Pred&& pred) noexcept {
+ return do_until(std::forward<Pred>(pred), [this, timeout] () mutable {
+ return wait(timeout);
+ });
+ }
+
+ /// Waits until condition variable is notified and pred() == true or timeout is reached, otherwise
+ /// wait again.
+ ///
+ /// \param timeout duration after which wait will exit with a timeout
+ /// \param pred predicate that checks that awaited condition is true
+ ///
+ /// \return a future that becomes ready when \ref signal() is called.
+ /// If the condition variable was \ref broken(), it will return a \ref broken_condition_variable
+ /// exception. If the timeout expires, it will return a \ref condition_variable_timed_out exception.
+ template<typename Pred>
+ future<> wait(duration timeout, Pred&& pred) noexcept {
+ return wait(clock::now() + timeout, std::forward<Pred>(pred));
+ }
+ /// Notify the variable and wake up a waiter, if there is one
+ void signal() noexcept {
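+ // Only bump the semaphore when somebody is actually waiting; otherwise
+ // the signal would be remembered by the semaphore and a later wait()
+ // would return immediately even though no new signal() preceded it.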
+ if (_sem.waiters()) {
+ _sem.signal();
+ }
+ }
+ /// Notify the variable and wake up all waiters
+ void broadcast() noexcept {
+ _sem.signal(_sem.waiters());
+ }
+
+ /// Signal to waiters that an error occurred. \ref wait() will see
+ /// an exceptional future<> containing a \ref broken_condition_variable exception.
+ /// The future is made available immediately.
+ void broken() noexcept {
+ _sem.broken();
+ }
+};
+
+/// @}
+
+}
diff --git a/src/seastar/include/seastar/core/coroutine.hh b/src/seastar/include/seastar/core/coroutine.hh
new file mode 100644
index 000000000..98e6e6794
--- /dev/null
+++ b/src/seastar/include/seastar/core/coroutine.hh
@@ -0,0 +1,196 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2019 ScyllaDB Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+
+#ifndef SEASTAR_COROUTINES_ENABLED
+#error Coroutines support disabled.
+#endif
+
+#include <seastar/core/std-coroutine.hh>
+
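+// A minimal usage sketch (illustrative only): with this header included, a
+// function returning seastar::future<> (or future<T>) can be written as a
+// coroutine, co_await-ing other futures and co_return-ing its result.
+//
+//   seastar::future<int> add_one(seastar::future<int> f) {
+//       int v = co_await std::move(f);
+//       co_return v + 1;
+//   }
+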
+namespace seastar {
+
+namespace internal {
+
+template <typename T = void>
+class coroutine_traits_base {
+public:
+ class promise_type final : public seastar::task {
+ seastar::promise<T> _promise;
+ public:
+ promise_type() = default;
+ promise_type(promise_type&&) = delete;
+ promise_type(const promise_type&) = delete;
+
+ template<typename... U>
+ void return_value(U&&... value) {
+ _promise.set_value(std::forward<U>(value)...);
+ }
+
+ void return_value(future<T>&& fut) noexcept {
+ fut.forward_to(std::move(_promise));
+ }
+
+ void unhandled_exception() noexcept {
+ _promise.set_exception(std::current_exception());
+ }
+
+ seastar::future<T> get_return_object() noexcept {
+ return _promise.get_future();
+ }
+
+ SEASTAR_INTERNAL_COROUTINE_NAMESPACE::suspend_never initial_suspend() noexcept { return { }; }
+ SEASTAR_INTERNAL_COROUTINE_NAMESPACE::suspend_never final_suspend() noexcept { return { }; }
+
+ virtual void run_and_dispose() noexcept override {
+ auto handle = SEASTAR_INTERNAL_COROUTINE_NAMESPACE::coroutine_handle<promise_type>::from_promise(*this);
+ handle.resume();
+ }
+
+ task* waiting_task() noexcept override { return _promise.waiting_task(); }
+ };
+};
+
+template <>
+class coroutine_traits_base<> {
+public:
+ class promise_type final : public seastar::task {
+ seastar::promise<> _promise;
+ public:
+ promise_type() = default;
+ promise_type(promise_type&&) = delete;
+ promise_type(const promise_type&) = delete;
+
+ void return_void() noexcept {
+ _promise.set_value();
+ }
+
+// Clang complains if both return_value and return_void are defined
+#if !defined(__clang__)
+ void return_value(future<>&& fut) noexcept {
+ fut.forward_to(std::move(_promise));
+ }
+#endif
+
+ void unhandled_exception() noexcept {
+ _promise.set_exception(std::current_exception());
+ }
+
+ seastar::future<> get_return_object() noexcept {
+ return _promise.get_future();
+ }
+
+ SEASTAR_INTERNAL_COROUTINE_NAMESPACE::suspend_never initial_suspend() noexcept { return { }; }
+ SEASTAR_INTERNAL_COROUTINE_NAMESPACE::suspend_never final_suspend() noexcept { return { }; }
+
+ virtual void run_and_dispose() noexcept override {
+ auto handle = SEASTAR_INTERNAL_COROUTINE_NAMESPACE::coroutine_handle<promise_type>::from_promise(*this);
+ handle.resume();
+ }
+
+ task* waiting_task() noexcept override { return _promise.waiting_task(); }
+ };
+};
+
+template<typename... T>
+struct awaiter {
+ seastar::future<T...> _future;
+public:
+ explicit awaiter(seastar::future<T...>&& f) noexcept : _future(std::move(f)) { }
+
+ awaiter(const awaiter&) = delete;
+ awaiter(awaiter&&) = delete;
+
+ bool await_ready() const noexcept {
+ return _future.available();
+ }
+
+ template<typename U>
+ void await_suspend(SEASTAR_INTERNAL_COROUTINE_NAMESPACE::coroutine_handle<U> hndl) noexcept {
+ _future.set_coroutine(hndl.promise());
+ }
+
+ std::tuple<T...> await_resume() { return _future.get(); }
+};
+
+template<typename T>
+struct awaiter<T> {
+ seastar::future<T> _future;
+public:
+ explicit awaiter(seastar::future<T>&& f) noexcept : _future(std::move(f)) { }
+
+ awaiter(const awaiter&) = delete;
+ awaiter(awaiter&&) = delete;
+
+ bool await_ready() const noexcept {
+ return _future.available();
+ }
+
+ template<typename U>
+ void await_suspend(SEASTAR_INTERNAL_COROUTINE_NAMESPACE::coroutine_handle<U> hndl) noexcept {
+ _future.set_coroutine(hndl.promise());
+ }
+
+ T await_resume() { return _future.get0(); }
+};
+
+template<>
+struct awaiter<> {
+ seastar::future<> _future;
+public:
+ explicit awaiter(seastar::future<>&& f) noexcept : _future(std::move(f)) { }
+
+ awaiter(const awaiter&) = delete;
+ awaiter(awaiter&&) = delete;
+
+ bool await_ready() const noexcept {
+ return _future.available();
+ }
+
+ template<typename U>
+ void await_suspend(SEASTAR_INTERNAL_COROUTINE_NAMESPACE::coroutine_handle<U> hndl) noexcept {
+ _future.set_coroutine(hndl.promise());
+ }
+
+ void await_resume() { _future.get(); }
+};
+
+} // seastar::internal
+
+template<typename... T>
+auto operator co_await(future<T...> f) noexcept {
+ return internal::awaiter<T...>(std::move(f));
+}
+
+} // seastar
+
+
+namespace SEASTAR_INTERNAL_COROUTINE_NAMESPACE {
+
+template<typename... T, typename... Args>
+class coroutine_traits<seastar::future<T...>, Args...> : public seastar::internal::coroutine_traits_base<T...> {
+};
+
+} // SEASTAR_INTERNAL_COROUTINE_NAMESPACE
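+
+// A minimal usage sketch (illustrative, not part of this header): with the
+// coroutine_traits specialization above, a function returning seastar::future<>
+// or seastar::future<T> can be written as a coroutine that awaits futures:
+//
+//   seastar::future<int> add_one(seastar::future<int> f) {
+//       int v = co_await std::move(f);
+//       co_return v + 1;
+//   }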
+
diff --git a/src/seastar/include/seastar/core/deleter.hh b/src/seastar/include/seastar/core/deleter.hh
new file mode 100644
index 000000000..b637336e1
--- /dev/null
+++ b/src/seastar/include/seastar/core/deleter.hh
@@ -0,0 +1,281 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <memory>
+#include <cstdlib>
+#include <assert.h>
+#include <type_traits>
+
+namespace seastar {
+
+/// \addtogroup memory-module
+/// @{
+
+/// Provides a mechanism for managing the lifetime of a buffer.
+///
+/// A \c deleter is an object that is used to inform the consumer
+/// of some buffer (not referenced by the deleter itself) how to
+/// delete the buffer. This can be by calling an arbitrary function
+/// or destroying an object carried by the deleter. Examples of
+/// a deleter's encapsulated actions are:
+///
+/// - calling \c std::free(p) on some captured pointer, p
+/// - calling \c delete \c p on some captured pointer, p
+/// - decrementing a reference count somewhere
+///
+/// A deleter performs its action from its destructor.
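+///
+/// A minimal usage sketch (illustrative; the malloc'd buffer below is an
+/// assumption for demonstration, not part of this header):
+/// ```
+/// char* buf = static_cast<char*>(std::malloc(4096));
+/// deleter d = make_free_deleter(buf); // frees buf when d is destroyed
+/// deleter d2 = d.share();             // buf is freed only after both d and d2 are destroyed
+/// ```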
+class deleter final {
+public:
+ /// \cond internal
+ struct impl;
+ struct raw_object_tag {};
+ /// \endcond
+private:
+ // if bit 0 set, point to object to be freed directly.
+ impl* _impl = nullptr;
+public:
+ /// Constructs an empty deleter that does nothing in its destructor.
+ deleter() noexcept = default;
+ deleter(const deleter&) = delete;
+ /// Moves a deleter.
+ deleter(deleter&& x) noexcept : _impl(x._impl) { x._impl = nullptr; }
+ /// \cond internal
+ explicit deleter(impl* i) noexcept : _impl(i) {}
+ deleter(raw_object_tag tag, void* object) noexcept
+ : _impl(from_raw_object(object)) {}
+ /// \endcond
+ /// Destroys the deleter and carries out the encapsulated action.
+ ~deleter();
+ deleter& operator=(deleter&& x) noexcept;
+ deleter& operator=(deleter&) = delete;
+ /// Performs a sharing operation. The encapsulated action will only
+ /// be carried out after both the original deleter and the returned
+    /// deleter are destroyed.
+ ///
+ /// \return a deleter with the same encapsulated action as this one.
+ deleter share();
+ /// Checks whether the deleter has an associated action.
+ explicit operator bool() const noexcept { return bool(_impl); }
+ /// \cond internal
+ void reset(impl* i) {
+ this->~deleter();
+ new (this) deleter(i);
+ }
+ /// \endcond
+ /// Appends another deleter to this deleter. When this deleter is
+ /// destroyed, both encapsulated actions will be carried out.
+ void append(deleter d);
+private:
+ static bool is_raw_object(impl* i) noexcept {
+ auto x = reinterpret_cast<uintptr_t>(i);
+ return x & 1;
+ }
+ bool is_raw_object() const noexcept {
+ return is_raw_object(_impl);
+ }
+ static void* to_raw_object(impl* i) noexcept {
+ auto x = reinterpret_cast<uintptr_t>(i);
+ return reinterpret_cast<void*>(x & ~uintptr_t(1));
+ }
+ void* to_raw_object() const noexcept {
+ return to_raw_object(_impl);
+ }
+ impl* from_raw_object(void* object) noexcept {
+ auto x = reinterpret_cast<uintptr_t>(object);
+ return reinterpret_cast<impl*>(x | 1);
+ }
+};
+
+/// \cond internal
+struct deleter::impl {
+ unsigned refs = 1;
+ deleter next;
+ impl(deleter next) : next(std::move(next)) {}
+ virtual ~impl() {}
+};
+/// \endcond
+
+inline
+deleter::~deleter() {
+ if (is_raw_object()) {
+ std::free(to_raw_object());
+ return;
+ }
+ if (_impl && --_impl->refs == 0) {
+ delete _impl;
+ }
+}
+
+inline
+deleter& deleter::operator=(deleter&& x) noexcept {
+ if (this != &x) {
+ this->~deleter();
+ new (this) deleter(std::move(x));
+ }
+ return *this;
+}
+
+/// \cond internal
+template <typename Deleter>
+struct lambda_deleter_impl final : deleter::impl {
+ Deleter del;
+ lambda_deleter_impl(deleter next, Deleter&& del)
+ : impl(std::move(next)), del(std::move(del)) {}
+ virtual ~lambda_deleter_impl() override { del(); }
+};
+
+template <typename Object>
+struct object_deleter_impl final : deleter::impl {
+ Object obj;
+ object_deleter_impl(deleter next, Object&& obj)
+ : impl(std::move(next)), obj(std::move(obj)) {}
+};
+
+template <typename Object>
+inline
+object_deleter_impl<Object>* make_object_deleter_impl(deleter next, Object obj) {
+ return new object_deleter_impl<Object>(std::move(next), std::move(obj));
+}
+/// \endcond
+
+/// Makes a \ref deleter that encapsulates the action of
+/// destroying an object, as well as running another deleter. The input
+/// object is moved to the deleter, and destroyed when the deleter is destroyed.
+///
+/// \param next deleter that will become part of the new deleter's encapsulated action
+/// \param o object whose destructor becomes part of the new deleter's encapsulated action
+/// \related deleter
+template <typename Object>
+deleter
+make_deleter(deleter next, Object o) {
+ return deleter(new lambda_deleter_impl<Object>(std::move(next), std::move(o)));
+}
+
+/// Makes a \ref deleter that encapsulates the action of destroying an object. The input
+/// object is moved to the deleter, and destroyed when the deleter is destroyed.
+///
+/// \param o object whose destructor becomes the new deleter's encapsulated action
+/// \related deleter
+template <typename Object>
+deleter
+make_deleter(Object o) {
+ return make_deleter(deleter(), std::move(o));
+}
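+
+// A minimal usage sketch (illustrative; the heap allocation is an assumption for
+// demonstration). The callable passed to make_deleter() is invoked when the
+// resulting deleter is destroyed:
+//
+//   auto d = make_deleter([p = new int(42)] () noexcept { delete p; });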
+
+/// \cond internal
+struct free_deleter_impl final : deleter::impl {
+ void* obj;
+ free_deleter_impl(void* obj) : impl(deleter()), obj(obj) {}
+ virtual ~free_deleter_impl() override { std::free(obj); }
+};
+/// \endcond
+
+inline
+deleter
+deleter::share() {
+ if (!_impl) {
+ return deleter();
+ }
+ if (is_raw_object()) {
+ _impl = new free_deleter_impl(to_raw_object());
+ }
+ ++_impl->refs;
+ return deleter(_impl);
+}
+
+// Appends 'd' to the chain of deleters. Avoids allocation if possible. For
+// performance reasons the current chain should be shorter and 'd' should be
+// longer.
+inline
+void deleter::append(deleter d) {
+ if (!d._impl) {
+ return;
+ }
+ impl* next_impl = _impl;
+ deleter* next_d = this;
+ while (next_impl) {
+ if (next_impl == d._impl) {
+ return; // Already appended
+ }
+ if (is_raw_object(next_impl)) {
+ next_d->_impl = next_impl = new free_deleter_impl(to_raw_object(next_impl));
+ }
+
+ if (next_impl->refs != 1) {
+ next_d->_impl = next_impl = make_object_deleter_impl(deleter(next_impl), std::move(d));
+ return;
+ }
+
+ next_d = &next_impl->next;
+ next_impl = next_d->_impl;
+ }
+ next_d->_impl = d._impl;
+ d._impl = nullptr;
+}
+
+/// Makes a deleter that calls \c std::free() when it is destroyed.
+///
+/// \param obj object to free.
+/// \related deleter
+inline
+deleter
+make_free_deleter(void* obj) {
+ if (!obj) {
+ return deleter();
+ }
+ return deleter(deleter::raw_object_tag(), obj);
+}
+
+/// Makes a deleter that calls \c std::free() when it is destroyed, as well
+/// as invoking the encapsulated action of another deleter.
+///
+/// \param next deleter to invoke.
+/// \param obj object to free.
+/// \related deleter
+inline
+deleter
+make_free_deleter(deleter next, void* obj) {
+ return make_deleter(std::move(next), [obj] () mutable { std::free(obj); });
+}
+
+/// \see make_deleter(Object)
+/// \related deleter
+template <typename T>
+inline
+deleter
+make_object_deleter(T&& obj) {
+ return deleter{make_object_deleter_impl(deleter(), std::move(obj))};
+}
+
+/// \see make_deleter(deleter, Object)
+/// \related deleter
+template <typename T>
+inline
+deleter
+make_object_deleter(deleter d, T&& obj) {
+ return deleter{make_object_deleter_impl(std::move(d), std::move(obj))};
+}
+
+/// @}
+
+}
diff --git a/src/seastar/include/seastar/core/distributed.hh b/src/seastar/include/seastar/core/distributed.hh
new file mode 100644
index 000000000..c2641f612
--- /dev/null
+++ b/src/seastar/include/seastar/core/distributed.hh
@@ -0,0 +1,32 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/sharded.hh>
+
+namespace seastar {
+
+
+template <typename Service>
+using distributed = sharded<Service>;
+
+}
diff --git a/src/seastar/include/seastar/core/do_with.hh b/src/seastar/include/seastar/core/do_with.hh
new file mode 100644
index 000000000..758d75388
--- /dev/null
+++ b/src/seastar/include/seastar/core/do_with.hh
@@ -0,0 +1,153 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <utility>
+#include <memory>
+#include <tuple>
+
+namespace seastar {
+
+
+/// \cond internal
+
+namespace internal {
+
+template <typename HeldState, typename Future>
+class do_with_state final : public continuation_base_from_future<Future>::type {
+ HeldState _held;
+ typename Future::promise_type _pr;
+public:
+ template<typename... T>
+ explicit do_with_state(T&&... args) : _held(std::forward<T>(args)...) {}
+ virtual void run_and_dispose() noexcept override {
+ _pr.set_urgent_state(std::move(this->_state));
+ delete this;
+ }
+ task* waiting_task() noexcept override {
+ return _pr.waiting_task();
+ }
+ HeldState& data() {
+ return _held;
+ }
+ Future get_future() {
+ return _pr.get_future();
+ }
+};
+
+}
+/// \endcond
+
+namespace internal {
+template <typename Tuple, size_t... Idx>
+inline
+auto
+cherry_pick_tuple(std::index_sequence<Idx...>, Tuple&& tuple) {
+ return std::forward_as_tuple(std::get<Idx>(std::forward<Tuple>(tuple))...);
+}
+
+template <typename Tuple, typename Seq>
+struct subtuple;
+
+template <typename Tuple, size_t... Idx>
+struct subtuple<Tuple, std::index_sequence<Idx...>> {
+ using type = std::tuple<std::decay_t<std::tuple_element_t<Idx, Tuple>>...>;
+};
+
+template <typename T1, typename T2, typename... More>
+inline
+auto
+do_with_impl(T1&& rv1, T2&& rv2, More&&... more) {
+ auto all = std::forward_as_tuple(
+ std::forward<T1>(rv1),
+ std::forward<T2>(rv2),
+ std::forward<More>(more)...);
+ constexpr size_t nr = std::tuple_size<decltype(all)>::value - 1;
+ using idx = std::make_index_sequence<nr>;
+ auto&& just_values = cherry_pick_tuple(idx(), std::move(all));
+ auto&& just_func = std::move(std::get<nr>(std::move(all)));
+ using value_tuple = typename subtuple<decltype(all), idx>::type;
+ using ret_type = decltype(std::apply(just_func, std::declval<value_tuple&>()));
+ auto task = std::apply(
+ [](auto&&... x) {
+ return std::make_unique<internal::do_with_state<value_tuple, ret_type>>(std::forward<decltype(x)>(x)...);
+ },
+ std::move(just_values));
+ auto fut = std::apply(just_func, task->data());
+ if (fut.available()) {
+ return fut;
+ }
+ auto ret = task->get_future();
+ internal::set_callback(fut, task.release());
+ return ret;
+}
+}
+
+/// \addtogroup future-util
+/// @{
+
+/// do_with() holds one or more objects alive until a future completes, and
+/// allows the code involved in making the future complete to have easy
+/// access to these objects.
+///
+/// do_with() takes multiple arguments: the last is a function
+/// returning a future. The others are temporary objects (rvalues). The
+/// function is given (a moved copy of) these temporary objects, by
+/// reference, and it is ensured that the objects will not be
+/// destructed until the completion of the future returned by the
+/// function.
+///
+/// do_with() returns a future which resolves to whatever value the given future
+/// (returned by the given function) resolves to. This returned value must not
+/// contain references to the temporary objects, as at that point the temporaries
+/// are destructed.
+///
+/// \return whatever the function returns
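+///
+/// A minimal usage sketch (illustrative; \c do_something_async() is a hypothetical
+/// helper, not part of this header):
+/// ```
+/// future<> f() {
+///     return do_with(std::vector<int>({1, 2, 3}), [] (std::vector<int>& v) {
+///         // v stays alive until the future returned by the lambda resolves
+///         return do_something_async(v);
+///     });
+/// }
+/// ```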
+template <typename T1, typename T2, typename... More>
+inline
+auto
+do_with(T1&& rv1, T2&& rv2, More&&... more) noexcept {
+ auto func = internal::do_with_impl<T1, T2, More...>;
+ return futurize_invoke(func, std::forward<T1>(rv1), std::forward<T2>(rv2), std::forward<More>(more)...);
+}
+
+/// Executes the function \c func making sure the lock \c lock is taken,
+/// and later on properly released.
+///
+/// \param lock the lock, which is any object providing lock() / unlock() semantics.
+/// Caller must make sure that it outlives \c func.
+/// \param func function to be executed
+/// \returns whatever \c func returns
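+///
+/// A minimal usage sketch (illustrative; \c my_lock stands for any type with a
+/// future-returning lock() and a plain unlock(), it is not defined in this header):
+/// ```
+/// future<> g(my_lock& l) {
+///     return with_lock(l, [] {
+///         return make_ready_future<>(); // work performed while l is held
+///     });
+/// }
+/// ```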
+template<typename Lock, typename Func>
+inline
+auto with_lock(Lock& lock, Func&& func) {
+ return lock.lock().then([&lock, func = std::forward<Func>(func)] () mutable {
+ return futurize_invoke(func).finally([&lock] {
+ lock.unlock();
+ });
+ });
+}
+
+/// @}
+
+}
diff --git a/src/seastar/include/seastar/core/dpdk_rte.hh b/src/seastar/include/seastar/core/dpdk_rte.hh
new file mode 100644
index 000000000..24c3d97a5
--- /dev/null
+++ b/src/seastar/include/seastar/core/dpdk_rte.hh
@@ -0,0 +1,64 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#pragma once
+
+#ifdef SEASTAR_HAVE_DPDK
+
+#include <bitset>
+#include <rte_config.h>
+#include <rte_ethdev.h>
+#include <rte_version.h>
+#include <boost/program_options.hpp>
+
+/*********************** Compat section ***************************************/
+// We currently support only versions 2.0 and above.
+#if (RTE_VERSION < RTE_VERSION_NUM(2,0,0,0))
+#error "DPDK version 2.0.0 or above is required"
+#endif
+
+#if defined(RTE_MBUF_REFCNT_ATOMIC)
+#warning "CONFIG_RTE_MBUF_REFCNT_ATOMIC should be disabled in DPDK's " \
+ "config/common_linuxapp"
+#endif
+/******************************************************************************/
+
+namespace seastar {
+
+namespace dpdk {
+
+// DPDK Environment Abstraction Layer
+class eal {
+public:
+ using cpuset = std::bitset<RTE_MAX_LCORE>;
+
+ static void init(cpuset cpus, boost::program_options::variables_map opts);
+ /**
+ * Returns the amount of memory needed for DPDK
+ * @param num_cpus Number of CPUs the application is going to use
+ *
+     * @return the amount of memory needed for DPDK
+ */
+ static size_t mem_size(int num_cpus, bool hugetlbfs_membackend = true);
+ static bool initialized;
+};
+
+} // namespace dpdk
+
+}
+
+#endif // SEASTAR_HAVE_DPDK
diff --git a/src/seastar/include/seastar/core/enum.hh b/src/seastar/include/seastar/core/enum.hh
new file mode 100644
index 000000000..1ea342322
--- /dev/null
+++ b/src/seastar/include/seastar/core/enum.hh
@@ -0,0 +1,46 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+/*
+ * This header file defines a hash function for enum types, using the
+ * standard hash function of the underlying type (such as int). This makes
+ * it possible to inherit from this type when specializing std::hash for a
+ * specific enum type, or to use it directly as the hash functor of an
+ * unordered container keyed by an enum.
+ */
+
+#include <type_traits>
+#include <functional>
+#include <cstddef>
+
+namespace seastar {
+
+template <typename T>
+class enum_hash {
+ static_assert(std::is_enum<T>::value, "must be an enum");
+public:
+ std::size_t operator()(const T& e) const {
+ using utype = typename std::underlying_type<T>::type;
+ return std::hash<utype>()(static_cast<utype>(e));
+ }
+};
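+
+// A minimal usage sketch (illustrative; the `color` enum is an assumption for
+// demonstration):
+//
+//   enum class color { red, green, blue };
+//   std::unordered_map<color, int, seastar::enum_hash<color>> counts;
+//   counts[color::red]++;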
+
+}
diff --git a/src/seastar/include/seastar/core/exception_hacks.hh b/src/seastar/include/seastar/core/exception_hacks.hh
new file mode 100644
index 000000000..5a8d54035
--- /dev/null
+++ b/src/seastar/include/seastar/core/exception_hacks.hh
@@ -0,0 +1,26 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+#pragma once
+
+namespace seastar {
+void init_phdr_cache();
+}
diff --git a/src/seastar/include/seastar/core/execution_stage.hh b/src/seastar/include/seastar/core/execution_stage.hh
new file mode 100644
index 000000000..f3e4a7310
--- /dev/null
+++ b/src/seastar/include/seastar/core/execution_stage.hh
@@ -0,0 +1,543 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2017 ScyllaDB Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/chunked_fifo.hh>
+#include <seastar/core/function_traits.hh>
+#include <seastar/core/sstring.hh>
+#include <seastar/core/metrics.hh>
+#include <seastar/core/scheduling.hh>
+#include <seastar/util/reference_wrapper.hh>
+#include <seastar/util/concepts.hh>
+#include <seastar/util/noncopyable_function.hh>
+#include <seastar/util/tuple_utils.hh>
+#include <seastar/util/defer.hh>
+#include <seastar/util/std-compat.hh>
+#include <fmt/format.h>
+#include <fmt/ostream.h>
+#include <vector>
+#include <boost/range/irange.hpp>
+#include <boost/range/adaptor/transformed.hpp>
+#include <boost/container/static_vector.hpp>
+
+namespace seastar {
+
+/// \defgroup execution-stages Execution Stages
+///
+/// \brief
+/// Execution stages provide an infrastructure for processing function calls in
+/// batches in order to improve instruction cache locality.
+///
+/// When the application logic becomes more and more complex and the length
+/// of the data processing pipeline grows, it may happen that the most
+/// significant bottleneck is instruction cache misses. The solution for that
+/// problem may be processing similar operations in batches so that instruction
+/// cache locality is improved at the cost of potentially higher latencies and
+/// worse data cache locality.
+///
+/// Execution stages allow batching calls to the specified function object.
+/// Every time concrete_execution_stage::operator()() is used the function call
+/// is added to the queue and a future is returned. Once the number of queued
+/// calls reaches a certain threshold, the stage is flushed and a task which
+/// would execute these function calls is scheduled. Execution stages are also
+/// flushed when the reactor polls for events.
+///
+/// When calling a function that is wrapped inside an execution stage it is
+/// important to remember that the actual function call will happen at some
+/// later time and it has to be guaranteed that the objects passed by lvalue
+/// reference are still alive. In order to avoid accidental passing of a
+/// temporary object by lvalue reference, the interface of execution stages
+/// accepts only lvalue references wrapped in reference_wrapper. It is safe to
+/// pass rvalue references; they are decayed and the objects are moved. See
+/// concrete_execution_stage::operator()() for more details.
+
+/// \addtogroup execution-stages
+/// @{
+
+/// \cond internal
+namespace internal {
+
+// Execution stages wrap lvalue references in reference_wrapper so that the caller
+// is forced to use seastar::ref(). Then, when the function is actually called, the
+// reference is unwrapped. However, we need to distinguish between functions
+// whose argument is an lvalue reference and functions that take
+// reference_wrapper<> as an argument, and not unwrap the latter. To solve this
+// issue the reference_wrapper_for_es type is used for wrappings done automatically
+// by the execution stage.
+template<typename T>
+struct reference_wrapper_for_es : reference_wrapper<T> {
+ reference_wrapper_for_es(reference_wrapper <T> rw) noexcept
+ : reference_wrapper<T>(std::move(rw)) {}
+};
+
+template<typename T>
+struct wrap_for_es {
+ using type = T;
+};
+
+template<typename T>
+struct wrap_for_es<T&> {
+ using type = reference_wrapper_for_es<T>;
+};
+
+template<typename T>
+struct wrap_for_es<T&&> {
+ using type = T;
+};
+
+template<typename T>
+decltype(auto) unwrap_for_es(T&& object) {
+ return std::forward<T>(object);
+}
+
+template<typename T>
+std::reference_wrapper<T> unwrap_for_es(reference_wrapper_for_es<T> ref) {
+ return std::reference_wrapper<T>(ref.get());
+}
+
+}
+/// \endcond
+
+/// Base execution stage class
+class execution_stage {
+public:
+ struct stats {
+ uint64_t tasks_scheduled = 0;
+ uint64_t tasks_preempted = 0;
+ uint64_t function_calls_enqueued = 0;
+ uint64_t function_calls_executed = 0;
+ };
+protected:
+ bool _empty = true;
+ bool _flush_scheduled = false;
+ scheduling_group _sg;
+ stats _stats;
+ sstring _name;
+ metrics::metric_group _metric_group;
+protected:
+ virtual void do_flush() noexcept = 0;
+public:
+ explicit execution_stage(const sstring& name, scheduling_group sg = {});
+ virtual ~execution_stage();
+
+ execution_stage(const execution_stage&) = delete;
+
+ /// Move constructor
+ ///
+ /// \warning It is illegal to move execution_stage after any operation has
+ /// been pushed to it. The only reason why the move constructor is not
+ /// deleted is the fact that C++14 does not guarantee return value
+ /// optimisation which is required by make_execution_stage().
+ execution_stage(execution_stage&&);
+
+ /// Returns execution stage name
+ const sstring& name() const noexcept { return _name; }
+
+ /// Returns execution stage usage statistics
+ const stats& get_stats() const noexcept { return _stats; }
+
+ /// Flushes execution stage
+ ///
+ /// Ensures that a task which would execute all queued operations is
+ /// scheduled. Does not schedule a new task if there is one already pending
+ /// or the queue is empty.
+ ///
+ /// \return true if a new task has been scheduled
+ bool flush() noexcept;
+
+ /// Checks whether there are pending operations.
+ ///
+ /// \return true if there is at least one queued operation
+ bool poll() const noexcept {
+ return !_empty;
+ }
+};
+
+/// \cond internal
+namespace internal {
+
+class execution_stage_manager {
+ std::vector<execution_stage*> _execution_stages;
+ std::unordered_map<sstring, execution_stage*> _stages_by_name;
+private:
+ execution_stage_manager() = default;
+ execution_stage_manager(const execution_stage_manager&) = delete;
+ execution_stage_manager(execution_stage_manager&&) = delete;
+public:
+ void register_execution_stage(execution_stage& stage);
+ void unregister_execution_stage(execution_stage& stage) noexcept;
+ void update_execution_stage_registration(execution_stage& old_es, execution_stage& new_es) noexcept;
+ execution_stage* get_stage(const sstring& name);
+ bool flush() noexcept;
+ bool poll() const noexcept;
+public:
+ static execution_stage_manager& get() noexcept;
+};
+
+}
+/// \endcond
+
+/// \brief Concrete execution stage class
+///
+/// \note The recommended way of creating execution stages is to use
+/// make_execution_stage().
+///
+/// \tparam ReturnType return type of the function object
+/// \tparam Args argument pack containing arguments to the function object, needs
+/// to have a move constructor that doesn't throw
+template<typename ReturnType, typename... Args>
+SEASTAR_CONCEPT(requires std::is_nothrow_move_constructible<std::tuple<Args...>>::value)
+class concrete_execution_stage final : public execution_stage {
+ using args_tuple = std::tuple<Args...>;
+ static_assert(std::is_nothrow_move_constructible<args_tuple>::value,
+ "Function arguments need to be nothrow move constructible");
+
+ static constexpr size_t flush_threshold = 128;
+ static constexpr size_t max_queue_length = 1024;
+
+ using return_type = futurize_t<ReturnType>;
+ using promise_type = typename return_type::promise_type;
+ using input_type = typename tuple_map_types<internal::wrap_for_es, args_tuple>::type;
+
+ struct work_item {
+ input_type _in;
+ promise_type _ready;
+
+ work_item(typename internal::wrap_for_es<Args>::type... args) : _in(std::move(args)...) { }
+
+ work_item(work_item&& other) = delete;
+ work_item(const work_item&) = delete;
+ work_item(work_item&) = delete;
+ };
+ chunked_fifo<work_item, flush_threshold> _queue;
+
+ noncopyable_function<ReturnType (Args...)> _function;
+private:
+ auto unwrap(input_type&& in) {
+ return tuple_map(std::move(in), [] (auto&& obj) {
+ return internal::unwrap_for_es(std::forward<decltype(obj)>(obj));
+ });
+ }
+
+ virtual void do_flush() noexcept override {
+ while (!_queue.empty()) {
+ auto& wi = _queue.front();
+ auto wi_in = std::move(wi._in);
+ auto wi_ready = std::move(wi._ready);
+ _queue.pop_front();
+ futurize<ReturnType>::apply(_function, unwrap(std::move(wi_in))).forward_to(std::move(wi_ready));
+ _stats.function_calls_executed++;
+
+ if (need_preempt()) {
+ _stats.tasks_preempted++;
+ break;
+ }
+ }
+ _empty = _queue.empty();
+ }
+public:
+ explicit concrete_execution_stage(const sstring& name, scheduling_group sg, noncopyable_function<ReturnType (Args...)> f)
+ : execution_stage(name, sg)
+ , _function(std::move(f))
+ {
+ _queue.reserve(flush_threshold);
+ }
+ explicit concrete_execution_stage(const sstring& name, noncopyable_function<ReturnType (Args...)> f)
+ : concrete_execution_stage(name, scheduling_group(), std::move(f)) {
+ }
+
+ /// Enqueues a call to the stage's function
+ ///
+ /// Adds a function call to the queue. Objects passed by value are moved,
+ /// rvalue references are decayed and the objects are moved, lvalue
+ /// references need to be explicitly wrapped using seastar::ref().
+ ///
+ /// Usage example:
+ /// ```
+ /// void do_something(int&, int, std::vector<int>&&);
+ /// thread_local auto stage = seastar::make_execution_stage("execution-stage", do_something);
+ ///
+ /// int global_value;
+ ///
+ /// future<> func(std::vector<int> vec) {
+ /// //return stage(global_value, 42, std::move(vec)); // fail: use seastar::ref to pass references
+ /// return stage(seastar::ref(global_value), 42, std::move(vec)); // ok
+ /// }
+ /// ```
+ ///
+ /// \param args arguments passed to the stage's function
+ /// \return future containing the result of the call to the stage's function
+ return_type operator()(typename internal::wrap_for_es<Args>::type... args) {
+ if (_queue.size() >= max_queue_length) {
+ do_flush();
+ }
+ _queue.emplace_back(std::move(args)...);
+ _empty = false;
+ _stats.function_calls_enqueued++;
+ auto f = _queue.back()._ready.get_future();
+ flush();
+ return f;
+ }
+};
+
+/// \brief Base class for execution stages with support for automatic \ref scheduling_group inheritance
+class inheriting_execution_stage {
+public:
+ struct per_scheduling_group_stats {
+ scheduling_group sg;
+ execution_stage::stats stats;
+ };
+ using stats = boost::container::static_vector<per_scheduling_group_stats, max_scheduling_groups()>;
+};
+
+/// \brief Concrete execution stage class, with support for automatic \ref scheduling_group inheritance
+///
+/// A variation of \ref concrete_execution_stage that inherits the \ref scheduling_group
+/// from the caller. Each call (of `operator()`) can be in its own scheduling group.
+///
+/// \tparam ReturnType return type of the function object
+/// \tparam Args argument pack containing arguments to the function object, needs
+/// to have a move constructor that doesn't throw
+template<typename ReturnType, typename... Args>
+SEASTAR_CONCEPT(requires std::is_nothrow_move_constructible<std::tuple<Args...>>::value)
+class inheriting_concrete_execution_stage final : public inheriting_execution_stage {
+ using return_type = futurize_t<ReturnType>;
+ using args_tuple = std::tuple<Args...>;
+ using per_group_stage_type = concrete_execution_stage<ReturnType, Args...>;
+
+ static_assert(std::is_nothrow_move_constructible<args_tuple>::value,
+ "Function arguments need to be nothrow move constructible");
+
+ sstring _name;
+ noncopyable_function<ReturnType (Args...)> _function;
+ std::vector<std::optional<per_group_stage_type>> _stage_for_group{max_scheduling_groups()};
+private:
+ per_group_stage_type make_stage_for_group(scheduling_group sg) {
+ // We can't use std::ref(function), because reference_wrapper decays to noncopyable_function& and
+ // that selects the noncopyable_function copy constructor. Use a lambda instead.
+ auto wrapped_function = [&_function = _function] (Args... args) {
+ return _function(std::forward<Args>(args)...);
+ };
+ auto name = fmt::format("{}.{}", _name, sg.name());
+ return per_group_stage_type(name, sg, wrapped_function);
+ }
+public:
+ /// Construct an inheriting concrete execution stage.
+ ///
+ /// \param name A name for the execution stage; must be unique
+ /// \param f Function to be called in response to operator(). The function
+ /// call will be deferred and batched with similar calls to increase
+ /// instruction cache hit rate.
+ inheriting_concrete_execution_stage(const sstring& name, noncopyable_function<ReturnType (Args...)> f)
+ : _name(std::move(name)),_function(std::move(f)) {
+ }
+
+ /// Enqueues a call to the stage's function
+ ///
+ /// Adds a function call to the queue. Objects passed by value are moved,
+ /// rvalue references are decayed and the objects are moved, lvalue
+ /// references need to be explicitly wrapped using seastar::ref().
+ ///
+ /// The caller's \ref scheduling_group will be preserved across the call.
+ ///
+ /// Usage example:
+ /// ```
+ /// void do_something(int);
+ /// thread_local auto stage = seastar::inheriting_concrete_execution_stage<int>("execution-stage", do_something);
+ ///
+ /// future<> func(int x) {
+ /// return stage(x);
+ /// }
+ /// ```
+ ///
+ /// \param args arguments passed to the stage's function
+ /// \return future containing the result of the call to the stage's function
+ return_type operator()(typename internal::wrap_for_es<Args>::type... args) {
+ auto sg = current_scheduling_group();
+ auto sg_id = internal::scheduling_group_index(sg);
+ auto& slot = _stage_for_group[sg_id];
+ if (!slot) {
+ slot.emplace(make_stage_for_group(sg));
+ }
+ return (*slot)(std::move(args)...);
+ }
+
+ /// Returns summary of individual execution stage usage statistics
+ ///
+ /// \returns a vector of the stats of the individual per-scheduling group
+    /// execution stages. Each element in the vector is a pair composed of
+ /// the scheduling group and the stats for the respective execution
+ /// stage. Scheduling groups that have had no respective calls enqueued
+ /// yet are omitted.
+ inheriting_execution_stage::stats get_stats() const noexcept {
+ inheriting_execution_stage::stats summary;
+ for (unsigned sg_id = 0; sg_id != _stage_for_group.size(); ++sg_id) {
+ auto sg = internal::scheduling_group_from_index(sg_id);
+ if (_stage_for_group[sg_id]) {
+ summary.push_back({sg, _stage_for_group[sg_id]->get_stats()});
+ }
+ }
+ return summary;
+ }
+};
+
+
+/// \cond internal
+namespace internal {
+
+template <typename Ret, typename ArgsTuple>
+struct concrete_execution_stage_helper;
+
+template <typename Ret, typename... Args>
+struct concrete_execution_stage_helper<Ret, std::tuple<Args...>> {
+ using type = concrete_execution_stage<Ret, Args...>;
+};
+
+}
+/// \endcond
+
+/// Creates a new execution stage
+///
+/// Wraps a given function object in a concrete_execution_stage. All arguments
+/// of the function object are required to have move constructors that do not
+/// throw. The function object may return a future, an immediate object, or void.
+///
+/// Moving execution stages is discouraged and illegal after the first function
+/// call is enqueued.
+///
+/// Usage example:
+/// ```
+/// double do_something(int);
+/// thread_local auto stage1 = seastar::make_execution_stage("execution-stage1", do_something);
+///
+/// future<double> func1(int val) {
+/// return stage1(val);
+/// }
+///
+/// future<double> do_some_io(int);
+/// thread_local auto stage2 = seastar::make_execution_stage("execution-stage2", do_some_io);
+///
+/// future<double> func2(int val) {
+/// return stage2(val);
+/// }
+/// ```
+///
+/// \param name unique name of the execution stage
+/// \param sg scheduling group to run under
+/// \param fn function to be executed by the stage
+/// \return concrete_execution_stage
+///
+template<typename Function>
+auto make_execution_stage(const sstring& name, scheduling_group sg, Function&& fn) {
+ using traits = function_traits<Function>;
+ using ret_type = typename traits::return_type;
+ using args_as_tuple = typename traits::args_as_tuple;
+ using concrete_execution_stage = typename internal::concrete_execution_stage_helper<ret_type, args_as_tuple>::type;
+ return concrete_execution_stage(name, sg, std::forward<Function>(fn));
+}
+
+/// Creates a new execution stage (variant not taking \ref scheduling_group)
+///
+/// Wraps a given function object in a concrete_execution_stage. All arguments
+/// of the function object are required to have move constructors that do not
+/// throw. The function object may return a future, an immediate object, or void.
+///
+/// Moving execution stages is discouraged and illegal after the first function
+/// call is enqueued.
+///
+/// Usage example:
+/// ```
+/// double do_something(int);
+/// thread_local auto stage1 = seastar::make_execution_stage("execution-stage1", do_something);
+///
+/// future<double> func1(int val) {
+/// return stage1(val);
+/// }
+///
+/// future<double> do_some_io(int);
+/// thread_local auto stage2 = seastar::make_execution_stage("execution-stage2", do_some_io);
+///
+/// future<double> func2(int val) {
+/// return stage2(val);
+/// }
+/// ```
+///
+/// \param name unique name of the execution stage
+/// \param fn function to be executed by the stage
+/// \return concrete_execution_stage
+///
+template<typename Function>
+auto make_execution_stage(const sstring& name, Function&& fn) {
+ return make_execution_stage(name, scheduling_group(), std::forward<Function>(fn));
+}
+
+/// Creates a new execution stage from a member function
+///
+/// Wraps a pointer to member function in a concrete_execution_stage. When
+/// a function call is pushed to the stage the first argument should be a
+/// pointer to the object the function is a member of.
+///
+/// Usage example:
+/// ```
+/// struct foo {
+/// void do_something(int);
+/// };
+///
+/// thread_local auto stage = seastar::make_execution_stage("execution-stage", &foo::do_something);
+///
+/// future<> func(foo& obj, int val) {
+/// return stage(&obj, val);
+/// }
+/// ```
+///
+/// \see make_execution_stage(const sstring&, Function&&)
+/// \param name unique name of the execution stage
+/// \param fn member function to be executed by the stage
+/// \return concrete_execution_stage
+template<typename Ret, typename Object, typename... Args>
+concrete_execution_stage<Ret, Object*, Args...>
+make_execution_stage(const sstring& name, scheduling_group sg, Ret (Object::*fn)(Args...)) {
+ return concrete_execution_stage<Ret, Object*, Args...>(name, sg, std::mem_fn(fn));
+}
+
+template<typename Ret, typename Object, typename... Args>
+concrete_execution_stage<Ret, const Object*, Args...>
+make_execution_stage(const sstring& name, scheduling_group sg, Ret (Object::*fn)(Args...) const) {
+ return concrete_execution_stage<Ret, const Object*, Args...>(name, sg, std::mem_fn(fn));
+}
+
+template<typename Ret, typename Object, typename... Args>
+concrete_execution_stage<Ret, Object*, Args...>
+make_execution_stage(const sstring& name, Ret (Object::*fn)(Args...)) {
+ return make_execution_stage(name, scheduling_group(), fn);
+}
+
+template<typename Ret, typename Object, typename... Args>
+concrete_execution_stage<Ret, const Object*, Args...>
+make_execution_stage(const sstring& name, Ret (Object::*fn)(Args...) const) {
+ return make_execution_stage(name, scheduling_group(), fn);
+}
+
+/// @}
+
+}
diff --git a/src/seastar/include/seastar/core/expiring_fifo.hh b/src/seastar/include/seastar/core/expiring_fifo.hh
new file mode 100644
index 000000000..acc747c29
--- /dev/null
+++ b/src/seastar/include/seastar/core/expiring_fifo.hh
@@ -0,0 +1,217 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/chunked_fifo.hh>
+#include <stdexcept>
+#include <exception>
+#include <memory>
+#include <seastar/core/timer.hh>
+#include <seastar/core/lowres_clock.hh>
+#include <seastar/core/timed_out_error.hh>
+
+namespace seastar {
+
+template<typename T>
+struct dummy_expiry {
+ void operator()(T&) noexcept {};
+};
+
+template<typename... T>
+struct promise_expiry {
+ void operator()(promise<T...>& pr) noexcept {
+ pr.set_exception(std::make_exception_ptr(timed_out_error()));
+ };
+};
+
+/// Container for elements with support for expiration of entries.
+///
+/// OnExpiry is a functor which will be called with a reference to T right before it expires.
+/// T is removed from the container and destroyed immediately after OnExpiry returns.
+/// The OnExpiry callback must not modify the container; it may only modify its argument.
+///
+/// The container can only be moved before any elements are pushed.
+///
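+/// A minimal usage sketch (illustrative; the promise payload and the one-second
+/// timeout are assumptions for demonstration):
+/// ```
+/// expiring_fifo<promise<>, promise_expiry<>> waiters;
+/// promise<> pr;
+/// auto f = pr.get_future();
+/// waiters.push_back(std::move(pr), lowres_clock::now() + std::chrono::seconds(1));
+/// // if the entry is not popped within one second, promise_expiry resolves the
+/// // promise with timed_out_error and the entry is dropped from the queue
+/// ```
+///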
+template <typename T, typename OnExpiry = dummy_expiry<T>, typename Clock = lowres_clock>
+class expiring_fifo {
+public:
+ using clock = Clock;
+ using time_point = typename Clock::time_point;
+private:
+ struct entry {
+ std::optional<T> payload; // disengaged means that it's expired
+ timer<Clock> tr;
+ entry(T&& payload_) : payload(std::move(payload_)) {}
+ entry(const T& payload_) : payload(payload_) {}
+ entry(T payload_, expiring_fifo& ef, time_point timeout)
+ : payload(std::move(payload_))
+ , tr([this, &ef] {
+ ef._on_expiry(*payload);
+ payload = std::nullopt;
+ --ef._size;
+ ef.drop_expired_front();
+ })
+ {
+ tr.arm(timeout);
+ }
+ entry(entry&& x) = delete;
+ entry(const entry& x) = delete;
+ };
+
+ // If engaged, represents the first element.
+ // This is to avoid large allocations done by chunked_fifo for single-element cases.
+ // expiring_fifo is used to implement wait lists in synchronization primitives
+ // and in some uses it's common to have at most one waiter.
+ std::unique_ptr<entry> _front;
+
+ // There is an invariant that the front element is never expired.
+ chunked_fifo<entry> _list;
+ OnExpiry _on_expiry;
+ size_t _size = 0;
+
+ // Ensures that front() is not expired by dropping expired elements from the front.
+ void drop_expired_front() noexcept {
+ while (!_list.empty() && !_list.front().payload) {
+ _list.pop_front();
+ }
+ if (_front && !_front->payload) {
+ _front.reset();
+ }
+ }
+public:
+ expiring_fifo() noexcept = default;
+ expiring_fifo(OnExpiry on_expiry) noexcept(std::is_nothrow_move_constructible_v<OnExpiry>) : _on_expiry(std::move(on_expiry)) {}
+
+ expiring_fifo(expiring_fifo&& o) noexcept
+ : expiring_fifo(std::move(o._on_expiry)) {
+ // entry objects hold a reference to this so non-empty containers cannot be moved.
+ assert(o._size == 0);
+ }
+
+ expiring_fifo& operator=(expiring_fifo&& o) noexcept {
+ if (this != &o) {
+ this->~expiring_fifo();
+ new (this) expiring_fifo(std::move(o));
+ }
+ return *this;
+ }
+
+ /// Checks if container contains any elements
+ ///
+ /// \note Inside OnExpiry callback, the expired element is still contained.
+ ///
+ /// \return true if and only if there are any elements contained.
+ bool empty() const noexcept {
+ return _size == 0;
+ }
+
+ /// Equivalent to !empty()
+ explicit operator bool() const noexcept {
+ return !empty();
+ }
+
+ /// Returns a reference to the element in the front.
+ /// Valid only when !empty().
+ T& front() noexcept {
+ if (_front) {
+ return *_front->payload;
+ }
+ return *_list.front().payload;
+ }
+
+ /// Returns a reference to the element in the front.
+ /// Valid only when !empty().
+ const T& front() const noexcept {
+ if (_front) {
+ return *_front->payload;
+ }
+ return *_list.front().payload;
+ }
+
+ /// Returns the number of elements contained.
+ ///
+ /// \note Expired elements are not contained. Expiring element is still contained when OnExpiry is called.
+ size_t size() const noexcept {
+ return _size;
+ }
+
+ /// Reserves storage in the container for at least 'size' elements.
+ /// Note that expired elements may also take space when they are not in the front of the queue.
+ ///
+ /// Doesn't give any guarantees about exception safety of subsequent push_back().
+ void reserve(size_t size) {
+ return _list.reserve(size);
+ }
+
+ /// Adds element to the back of the queue.
+ /// The element will never expire.
+ void push_back(const T& payload) {
+ if (_size == 0) {
+ _front = std::make_unique<entry>(payload);
+ } else {
+ _list.emplace_back(payload);
+ }
+ ++_size;
+ }
+
+ /// Adds element to the back of the queue.
+ /// The element will never expire.
+ void push_back(T&& payload) {
+ if (_size == 0) {
+ _front = std::make_unique<entry>(std::move(payload));
+ } else {
+ _list.emplace_back(std::move(payload));
+ }
+ ++_size;
+ }
+
+ /// Adds element to the back of the queue.
+ /// The element will expire when timeout is reached, unless it is time_point::max(), in which
+ /// case it never expires.
+ void push_back(T&& payload, time_point timeout) {
+ if (timeout == time_point::max()) {
+ push_back(std::move(payload));
+ return;
+ }
+ if (_size == 0) {
+ _front = std::make_unique<entry>(std::move(payload), *this, timeout);
+ } else {
+ _list.emplace_back(std::move(payload), *this, timeout);
+ }
+ ++_size;
+ }
+
+ /// Removes the element at the front.
+ /// Can be called only if !empty().
+ void pop_front() noexcept {
+ if (_front) {
+ _front.reset();
+ } else {
+ _list.pop_front();
+ }
+ --_size;
+ drop_expired_front();
+ }
+};
+
+}
diff --git a/src/seastar/include/seastar/core/fair_queue.hh b/src/seastar/include/seastar/core/fair_queue.hh
new file mode 100644
index 000000000..85e47b2aa
--- /dev/null
+++ b/src/seastar/include/seastar/core/fair_queue.hh
@@ -0,0 +1,247 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (C) 2016 ScyllaDB
+ */
+#pragma once
+
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/circular_buffer.hh>
+#include <seastar/util/noncopyable_function.hh>
+#include <queue>
+#include <chrono>
+#include <unordered_set>
+
+namespace seastar {
+
+/// \brief describes a request that passes through the \ref fair_queue.
+///
+/// A ticket is specified by a \c weight and a \c size. For example, one can specify a request of \c weight
+/// 1 and \c size 16kB. If the \ref fair_queue accepts one such request per second, it will sustain 1 IOPS
+/// at 16kB/s bandwidth.
+///
+/// \related fair_queue
+class fair_queue_ticket {
+ uint32_t _weight = 0; ///< the total weight of these requests for capacity purposes (IOPS).
+ uint32_t _size = 0; ///< the total effective size of these requests
+public:
+ /// Constructs a fair_queue_ticket with a given \c weight and a given \c size
+ ///
+ /// \param weight the weight of the request
+ /// \param size the size of the request
+ fair_queue_ticket(uint32_t weight, uint32_t size);
+ fair_queue_ticket() {}
+ fair_queue_ticket operator+(fair_queue_ticket desc) const;
+ fair_queue_ticket operator-(fair_queue_ticket desc) const;
+ /// Increase the quantity represented in this ticket by the amount represented by \c desc
+    /// \param desc another \ref fair_queue_ticket whose \c weight and \c size will be added to this one
+ fair_queue_ticket& operator+=(fair_queue_ticket desc);
+ /// Decreases the quantity represented in this ticket by the amount represented by \c desc
+    /// \param desc another \ref fair_queue_ticket whose \c weight and \c size will be decremented from this one
+ fair_queue_ticket& operator-=(fair_queue_ticket desc);
+
+ /// \returns true if this fair_queue_ticket is strictly less than \c rhs.
+ ///
+ /// For a fair_queue_ticket to be considered strictly less than another, both its quantities need to be
+    /// less than the other. Note that there is no total ordering between two fair_queue_tickets.
+    ///
+ /// \param rhs another \ref fair_queue_ticket to be compared to this one.
+ bool strictly_less(fair_queue_ticket rhs) const;
+
+ /// \returns true if the fair_queue_ticket represents a non-zero quantity.
+ ///
+ /// For a fair_queue ticket to be non-zero, at least one of its represented quantities need to
+    /// For a fair_queue_ticket to be non-zero, at least one of its represented quantities needs to
+    /// be non-zero.
+
+ friend std::ostream& operator<<(std::ostream& os, fair_queue_ticket t);
+
+ /// \returns the normalized value of this \ref fair_queue_ticket along a base axis
+ ///
+ /// The normalization function itself is an implementation detail, but one can expect either weight or
+ /// size to have more or less relative importance depending on which of the dimensions in the
+ /// denominator is relatively higher. For example, given this request a, and two other requests
+    /// b and c, such that c has the same \c weight but a higher \c size than b, one can expect
+ /// the \c size component of this request to play a larger role.
+ ///
+ /// It is legal for the numerator to have one of the quantities set to zero, in which case only
+ /// the other quantity is taken into consideration.
+ ///
+ /// It is however not legal for the axis to have any quantity set to zero.
+    /// \param axis another \ref fair_queue_ticket to be used as a base vector against which to normalize this fair_queue_ticket.
+ float normalize(fair_queue_ticket axis) const;
+};
+
+/// \addtogroup io-module
+/// @{
+
+/// \cond internal
+class priority_class {
+ struct request {
+ noncopyable_function<void()> func;
+ fair_queue_ticket desc;
+ };
+ friend class fair_queue;
+ uint32_t _shares = 0;
+ float _accumulated = 0;
+ circular_buffer<request> _queue;
+ bool _queued = false;
+
+ friend struct shared_ptr_no_esft<priority_class>;
+ explicit priority_class(uint32_t shares) noexcept : _shares(std::max(shares, 1u)) {}
+
+public:
+ /// \brief return the current amount of shares for this priority class
+ uint32_t shares() const noexcept {
+ return _shares;
+ }
+
+ void update_shares(uint32_t shares) noexcept {
+ _shares = (std::max(shares, 1u));
+ }
+};
+/// \endcond
+
+/// \brief Priority class, to be used with a given \ref fair_queue
+///
+/// An instance of this class is associated with a given \ref fair_queue. When registering
+/// a class, the caller will receive a \ref lw_shared_ptr to an object of this class. All its methods
+/// are private, so the only thing the caller is expected to do with it is to pass it later
+/// to the \ref fair_queue to identify a given class.
+///
+/// \related fair_queue
+using priority_class_ptr = lw_shared_ptr<priority_class>;
+
+/// \brief Fair queuing class
+///
+/// This is a fair queue, allowing multiple request producers to queue requests
+/// that will then be served proportionally to their classes' shares.
+///
+/// To each request, a weight can also be associated. A request of weight 1 will consume
+/// 1 share. Higher weights for a request will consume a proportionally higher amount of
+/// shares.
+///
+/// The user of this interface is expected to register multiple `priority_class`
+/// objects, which will each have a shares attribute.
+///
+/// Internally, each priority class may keep a separate queue of requests.
+/// Requests pertaining to a class can go through even if they are over its
+/// share limit, provided that the other classes have empty queues.
+///
+/// When the classes that lag behind start seeing requests, the fair queue will serve
+/// them first, until balance is restored. This balancing is expected to happen within
+/// a certain time window that obeys an exponential decay.
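+///
+/// A minimal usage sketch (illustrative; the shares, the ticket values and the
+/// request body are assumptions for demonstration):
+/// ```
+/// fair_queue fq(fair_queue::config{});
+/// priority_class_ptr pc = fq.register_priority_class(100);
+/// fair_queue_ticket t(1, 4096);
+/// fq.queue(pc, t, [t, &fq] {
+///     // ... issue the I/O request, then report completion:
+///     fq.notify_requests_finished(t);
+/// });
+/// fq.dispatch_requests();
+/// ```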
+class fair_queue {
+public:
+ /// \brief Fair Queue configuration structure.
+ ///
+    /// Sets the operation parameters of a \ref fair_queue.
+ /// \related fair_queue
+ struct config {
+ std::chrono::microseconds tau = std::chrono::milliseconds(100);
+ unsigned max_req_count = std::numeric_limits<unsigned>::max();
+ unsigned max_bytes_count = std::numeric_limits<unsigned>::max();
+ };
+private:
+ friend priority_class;
+
+ struct class_compare {
+ bool operator() (const priority_class_ptr& lhs, const priority_class_ptr& rhs) const {
+ return lhs->_accumulated > rhs->_accumulated;
+ }
+ };
+
+ config _config;
+ fair_queue_ticket _maximum_capacity;
+ fair_queue_ticket _current_capacity;
+ fair_queue_ticket _resources_executing;
+ fair_queue_ticket _resources_queued;
+ unsigned _requests_executing = 0;
+ unsigned _requests_queued = 0;
+ using clock_type = std::chrono::steady_clock::time_point;
+ clock_type _base;
+ using prioq = std::priority_queue<priority_class_ptr, std::vector<priority_class_ptr>, class_compare>;
+ prioq _handles;
+ std::unordered_set<priority_class_ptr> _all_classes;
+
+ void push_priority_class(priority_class_ptr pc);
+
+ priority_class_ptr pop_priority_class();
+
+ float normalize_factor() const;
+
+ void normalize_stats();
+
+ bool can_dispatch() const;
+public:
+ /// Constructs a fair queue with configuration parameters \c cfg.
+ ///
+ /// \param cfg an instance of the class \ref config
+ explicit fair_queue(config cfg);
+
+ /// Constructs a fair queue with a given \c capacity, expressed in IOPS.
+ ///
+ /// \param capacity how many concurrent requests are allowed in this queue.
+ /// \param tau the queue exponential decay parameter, as in exp(-1/tau * t)
+ explicit fair_queue(unsigned capacity, std::chrono::microseconds tau = std::chrono::milliseconds(100))
+ : fair_queue(config{tau, capacity}) {}
+
+ /// Registers a priority class against this fair queue.
+ ///
+ /// \param shares how many shares to create this class with
+ priority_class_ptr register_priority_class(uint32_t shares);
+
+ /// Unregister a priority class.
+ ///
+    /// It is illegal to unregister a priority class that still has pending requests.
+ void unregister_priority_class(priority_class_ptr pclass);
+
+ /// \return how many waiters are currently queued for all classes.
+ [[deprecated("fair_queue users should not track individual requests, but resources (weight, size) passing through the queue")]]
+ size_t waiters() const;
+
+ /// \return the number of requests currently executing
+ [[deprecated("fair_queue users should not track individual requests, but resources (weight, size) passing through the queue")]]
+ size_t requests_currently_executing() const;
+
+ /// \return how much resources (weight, size) are currently queued for all classes.
+ fair_queue_ticket resources_currently_waiting() const;
+
+ /// \return the amount of resources (weight, size) currently executing
+ fair_queue_ticket resources_currently_executing() const;
+
+ /// Queue the function \c func through this class' \ref fair_queue, with weight \c weight
+ ///
+ /// It is expected that \c func doesn't throw. If it does throw, the request will simply be
+ /// removed from the queue and discarded.
+ ///
+ /// The user of this interface is supposed to call \ref notify_requests_finished when the
+ /// request finishes executing - regardless of success or failure.
+ void queue(priority_class_ptr pc, fair_queue_ticket desc, noncopyable_function<void()> func);
+
+ /// Notifies that one request finished
+ /// \param desc an instance of \c fair_queue_ticket structure describing the request that just finished.
+ void notify_requests_finished(fair_queue_ticket desc, unsigned nr = 1) noexcept;
+
+ /// Try to execute new requests if there is capacity left in the queue.
+ void dispatch_requests();
+};
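+
+/// Example usage (illustrative sketch only; it assumes that \c fair_queue_ticket
+/// is constructible from a (weight, size) pair, as defined earlier in this header):
+///
+/// \code
+/// fair_queue fq(fair_queue::config{});
+/// auto pc = fq.register_priority_class(100);
+/// fair_queue_ticket ticket(1, 4096);
+/// fq.queue(pc, ticket, [&fq, ticket] {
+///     // perform the I/O; once it completes, regardless of success or failure:
+///     fq.notify_requests_finished(ticket);
+/// });
+/// fq.dispatch_requests();
+/// \endcode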
+/// @}
+
+}
diff --git a/src/seastar/include/seastar/core/file-types.hh b/src/seastar/include/seastar/core/file-types.hh
new file mode 100644
index 000000000..38052c897
--- /dev/null
+++ b/src/seastar/include/seastar/core/file-types.hh
@@ -0,0 +1,140 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+#pragma once
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <type_traits>
+
+namespace seastar {
+
+/// \addtogroup fileio-module
+/// @{
+
+/// Enumeration describing how a file is to be opened.
+///
+/// \see file::open_file_dma()
+enum class open_flags {
+ rw = O_RDWR,
+ ro = O_RDONLY,
+ wo = O_WRONLY,
+ create = O_CREAT,
+ truncate = O_TRUNC,
+ exclusive = O_EXCL,
+ dsync = O_DSYNC,
+};
+
+inline open_flags operator|(open_flags a, open_flags b) {
+ return open_flags(std::underlying_type_t<open_flags>(a) | std::underlying_type_t<open_flags>(b));
+}
+
+inline void operator|=(open_flags& a, open_flags b) {
+ a = (a | b);
+}
+
+inline open_flags operator&(open_flags a, open_flags b) {
+ return open_flags(std::underlying_type_t<open_flags>(a) & std::underlying_type_t<open_flags>(b));
+}
+
+inline void operator&=(open_flags& a, open_flags b) {
+ a = (a & b);
+}
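+
+// Example (illustrative): flags combine with the bitwise operators above, e.g. to
+// open a file for writing, creating it if necessary and truncating existing contents:
+//
+//     open_flags flags = open_flags::wo | open_flags::create | open_flags::truncate;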
+
+/// Enumeration describing the type of a directory entry being listed.
+///
+/// \see file::list_directory()
+enum class directory_entry_type {
+ unknown,
+ block_device,
+ char_device,
+ directory,
+ fifo,
+ link,
+ regular,
+ socket,
+};
+
+/// Enumeration describing the type of a particular filesystem
+enum class fs_type {
+ other,
+ xfs,
+ ext2,
+ ext3,
+ ext4,
+ btrfs,
+ hfs,
+ tmpfs,
+};
+
+// Access flags for files/directories
+enum class access_flags {
+ exists = F_OK,
+ read = R_OK,
+ write = W_OK,
+ execute = X_OK,
+
+ // alias for directory access
+ lookup = execute,
+};
+
+inline access_flags operator|(access_flags a, access_flags b) {
+ return access_flags(std::underlying_type_t<access_flags>(a) | std::underlying_type_t<access_flags>(b));
+}
+
+inline access_flags operator&(access_flags a, access_flags b) {
+ return access_flags(std::underlying_type_t<access_flags>(a) & std::underlying_type_t<access_flags>(b));
+}
+
+// Permissions for files/directories
+enum class file_permissions {
+ user_read = S_IRUSR, // Read by owner
+ user_write = S_IWUSR, // Write by owner
+ user_execute = S_IXUSR, // Execute by owner
+
+ group_read = S_IRGRP, // Read by group
+ group_write = S_IWGRP, // Write by group
+ group_execute = S_IXGRP, // Execute by group
+
+ others_read = S_IROTH, // Read by others
+ others_write = S_IWOTH, // Write by others
+ others_execute = S_IXOTH, // Execute by others
+
+ user_permissions = user_read | user_write | user_execute,
+ group_permissions = group_read | group_write | group_execute,
+ others_permissions = others_read | others_write | others_execute,
+ all_permissions = user_permissions | group_permissions | others_permissions,
+
+ default_file_permissions = user_read | user_write | group_read | group_write | others_read | others_write, // 0666
+ default_dir_permissions = all_permissions, // 0777
+};
+
+inline constexpr file_permissions operator|(file_permissions a, file_permissions b) {
+ return file_permissions(std::underlying_type_t<file_permissions>(a) | std::underlying_type_t<file_permissions>(b));
+}
+
+inline constexpr file_permissions operator&(file_permissions a, file_permissions b) {
+ return file_permissions(std::underlying_type_t<file_permissions>(a) & std::underlying_type_t<file_permissions>(b));
+}
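+
+// Example (illustrative): permissions combine the same way; the classic 0644 mode
+// can be expressed as:
+//
+//     file_permissions mode = file_permissions::user_read | file_permissions::user_write
+//                           | file_permissions::group_read | file_permissions::others_read;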
+
+/// @}
+
+} // namespace seastar
diff --git a/src/seastar/include/seastar/core/file.hh b/src/seastar/include/seastar/core/file.hh
new file mode 100644
index 000000000..74a56cb1d
--- /dev/null
+++ b/src/seastar/include/seastar/core/file.hh
@@ -0,0 +1,586 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+#pragma once
+
+#include <seastar/core/do_with.hh>
+#include <seastar/core/stream.hh>
+#include <seastar/core/sstring.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/align.hh>
+#include <seastar/core/fair_queue.hh>
+#include <seastar/core/file-types.hh>
+#include <seastar/util/std-compat.hh>
+#include <system_error>
+#include <sys/statvfs.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+namespace seastar {
+
+/// \addtogroup fileio-module
+/// @{
+
+/// A directory entry being listed.
+struct directory_entry {
+ /// Name of the file in a directory entry. Will never be "." or "..". Only the last component is included.
+ sstring name;
+ /// Type of the directory entry, if known.
+ std::optional<directory_entry_type> type;
+};
+
+/// Filesystem object stat information
+struct stat_data {
+ uint64_t device_id; // ID of device containing file
+ uint64_t inode_number; // Inode number
+ uint64_t mode; // File type and mode
+ directory_entry_type type;
+ uint64_t number_of_links;// Number of hard links
+ uint64_t uid; // User ID of owner
+ uint64_t gid; // Group ID of owner
+ uint64_t rdev; // Device ID (if special file)
+ uint64_t size; // Total size, in bytes
+ uint64_t block_size; // Block size for filesystem I/O
+ uint64_t allocated_size; // Total size of allocated storage, in bytes
+
+ std::chrono::system_clock::time_point time_accessed; // Time of last content access
+ std::chrono::system_clock::time_point time_modified; // Time of last content modification
+ std::chrono::system_clock::time_point time_changed; // Time of last status change (either content or attributes)
+};
+
+/// File open options
+///
+/// Options used to configure an open file.
+///
+/// \ref file
+struct file_open_options {
+ uint64_t extent_allocation_size_hint = 1 << 20; ///< Allocate this much disk space when extending the file
+ bool sloppy_size = false; ///< Allow the file size not to track the amount of data written until a flush
+ uint64_t sloppy_size_hint = 1 << 20; ///< Hint as to what the eventual file size will be
+ file_permissions create_permissions = file_permissions::default_file_permissions; ///< File permissions to use when creating a file
+};
+
+/// \cond internal
+class io_queue;
+using io_priority_class_id = unsigned;
+class io_priority_class {
+ io_priority_class_id _id;
+ friend io_queue;
+
+ io_priority_class() = delete;
+ explicit io_priority_class(io_priority_class_id id) noexcept
+ : _id(id)
+ { }
+
+public:
+ io_priority_class_id id() const {
+ return _id;
+ }
+};
+
+const io_priority_class& default_priority_class();
+
+class file;
+class file_impl;
+
+class file_handle;
+
+// A handle that can be transported across shards and used to
+// create a dup(2)-like `file` object referring to the same underlying file
+class file_handle_impl {
+public:
+ virtual ~file_handle_impl() = default;
+ virtual std::unique_ptr<file_handle_impl> clone() const = 0;
+ virtual shared_ptr<file_impl> to_file() && = 0;
+};
+
+class file_impl {
+protected:
+ static file_impl* get_file_impl(file& f);
+public:
+ unsigned _memory_dma_alignment = 4096;
+ unsigned _disk_read_dma_alignment = 4096;
+ unsigned _disk_write_dma_alignment = 4096;
+public:
+ virtual ~file_impl() {}
+
+ virtual future<size_t> write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) = 0;
+ virtual future<size_t> write_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) = 0;
+ virtual future<size_t> read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) = 0;
+ virtual future<size_t> read_dma(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc) = 0;
+ virtual future<> flush(void) = 0;
+ virtual future<struct stat> stat(void) = 0;
+ virtual future<> truncate(uint64_t length) = 0;
+ virtual future<> discard(uint64_t offset, uint64_t length) = 0;
+ virtual future<> allocate(uint64_t position, uint64_t length) = 0;
+ virtual future<uint64_t> size(void) = 0;
+ virtual future<> close() = 0;
+ virtual std::unique_ptr<file_handle_impl> dup();
+ virtual subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next) = 0;
+ virtual future<temporary_buffer<uint8_t>> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc) = 0;
+
+ friend class reactor;
+};
+
+future<shared_ptr<file_impl>> make_file_impl(int fd, file_open_options options, int oflags) noexcept;
+
+/// \endcond
+
+/// A data file on persistent storage.
+///
+/// File objects represent uncached, unbuffered files. As such great care
+/// must be taken to cache data at the application layer; neither seastar
+/// nor the OS will cache these files.
+///
+/// Data is transferred using direct memory access (DMA). This imposes
+/// restrictions on file offsets and data pointers. The former must be aligned
+/// on a 4096 byte boundary, while a 512 byte boundary suffices for the latter.
+class file {
+ shared_ptr<file_impl> _file_impl;
+public:
+ /// Default constructor constructs an uninitialized file object.
+ ///
+ /// A default constructor is useful for the common practice of declaring
+ /// a variable, and only assigning to it later. The uninitialized file
+ /// must not be used, or undefined behavior will result (currently, a null
+ /// pointer dereference).
+ ///
+ /// One can check whether a file object is in uninitialized state with
+ /// \ref operator bool(). One can reset a file back to uninitialized state
+ /// by assigning file() to it.
+ file() noexcept : _file_impl(nullptr) {}
+
+ file(shared_ptr<file_impl> impl) noexcept
+ : _file_impl(std::move(impl)) {}
+
+ /// Constructs a file object from a \ref file_handle obtained from another shard
+ explicit file(file_handle&& handle) noexcept;
+
+ /// Checks whether the file object was initialized.
+ ///
+ /// \return false if the file object is uninitialized (default
+ /// constructed), true if the file object refers to an actual file.
+ explicit operator bool() const noexcept { return bool(_file_impl); }
+
+ /// Copies a file object. The new and old objects refer to the
+ /// same underlying file.
+ ///
+ /// \param x file object to be copied
+ file(const file& x) = default;
+ /// Moves a file object.
+ file(file&& x) noexcept : _file_impl(std::move(x._file_impl)) {}
+ /// Assigns a file object. After assignment, the destination and source refer
+ /// to the same underlying file.
+ ///
+ /// \param x file object to assign to `this`.
+ file& operator=(const file& x) noexcept = default;
+ /// Move-assigns a file object.
+ file& operator=(file&& x) noexcept = default;
+
+ // O_DIRECT reading requires that buffer, offset, and read length are
+ // all aligned. Alignment of 4096 was necessary in the past, but no longer
+ // is - 512 is usually enough, but we'd need to use the BLKSSZGET ioctl to
+ // be sure it is really enough on this filesystem. 4096 is always safe.
+ // In addition, if we start reading in things outside page boundaries,
+ // we will end up with various pages around, some of them with
+ // overlapping ranges. Those would be very challenging to cache.
+
+ /// Alignment requirement for file offsets (for reads)
+ uint64_t disk_read_dma_alignment() const noexcept {
+ return _file_impl->_disk_read_dma_alignment;
+ }
+
+ /// Alignment requirement for file offsets (for writes)
+ uint64_t disk_write_dma_alignment() const noexcept {
+ return _file_impl->_disk_write_dma_alignment;
+ }
+
+ /// Alignment requirement for data buffers
+ uint64_t memory_dma_alignment() const noexcept {
+ return _file_impl->_memory_dma_alignment;
+ }
+
+
+ /**
+ * Perform a single DMA read operation.
+ *
+ * @param aligned_pos offset to begin reading at (should be aligned)
+ * @param aligned_buffer output buffer (should be aligned)
+ * @param aligned_len number of bytes to read (should be aligned)
+ * @param pc the IO priority class under which to queue this operation
+ *
+ * Alignment is HW dependent but use 4KB alignment to be on the safe side as
+ * explained above.
+ *
+ * @return number of bytes actually read
+ * or exceptional future in case of I/O error
+ */
+ template <typename CharType>
+ future<size_t>
+ dma_read(uint64_t aligned_pos, CharType* aligned_buffer, size_t aligned_len, const io_priority_class& pc = default_priority_class()) noexcept {
+ return dma_read_impl(aligned_pos, reinterpret_cast<uint8_t*>(aligned_buffer), aligned_len, pc);
+ }
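+
+ /// Example (illustrative sketch; \c f is assumed to be an open \ref file, and
+ /// \c temporary_buffer<CharType>::aligned() is the aligned-buffer factory already
+ /// used elsewhere in this header):
+ ///
+ /// \code
+ /// auto buf = temporary_buffer<char>::aligned(f.memory_dma_alignment(), 4096);
+ /// // note: buf must stay alive until the returned future resolves
+ /// future<size_t> bytes = f.dma_read(0, buf.get_write(), buf.size());
+ /// \endcode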
+
+ /**
+ * Read the requested amount of bytes starting from the given offset.
+ *
+ * @param pos offset to begin reading from
+ * @param len number of bytes to read
+ * @param pc the IO priority class under which to queue this operation
+ *
+ * @return temporary buffer containing the requested data.
+ * or exceptional future in case of I/O error
+ *
+ * This function doesn't require any alignment for either "pos" or "len"
+ *
+ * @note size of the returned buffer may be smaller than "len" if EOF is
+ * reached or in case of I/O error.
+ */
+ template <typename CharType>
+ future<temporary_buffer<CharType>> dma_read(uint64_t pos, size_t len, const io_priority_class& pc = default_priority_class()) noexcept {
+ return dma_read_impl(pos, len, pc).then([] (temporary_buffer<uint8_t> t) {
+ return temporary_buffer<CharType>(reinterpret_cast<CharType*>(t.get_write()), t.size(), t.release());
+ });
+ }
+
+ /// Error thrown when attempting to read past end-of-file
+ /// with \ref dma_read_exactly().
+ class eof_error : public std::exception {};
+
+ /**
+ * Read the exact amount of bytes.
+ *
+ * @param pos offset in a file to begin reading from
+ * @param len number of bytes to read
+ * @param pc the IO priority class under which to queue this operation
+ *
+ * @return temporary buffer containing the read data
+ * or exceptional future in case of an error, holding:
+ * eof_error if EOF is reached, file_io_error or
+ * std::system_error in case of I/O error.
+ */
+ template <typename CharType>
+ future<temporary_buffer<CharType>>
+ dma_read_exactly(uint64_t pos, size_t len, const io_priority_class& pc = default_priority_class()) noexcept {
+ return dma_read_exactly_impl(pos, len, pc).then([] (temporary_buffer<uint8_t> t) {
+ return temporary_buffer<CharType>(reinterpret_cast<CharType*>(t.get_write()), t.size(), t.release());
+ });
+ }
+
+ /// Performs a DMA read into the specified iovec.
+ ///
+ /// \param pos offset to read from. Must be aligned to \ref disk_read_dma_alignment.
+ /// \param iov vector of address/size pairs to read into. Addresses must be
+ /// aligned.
+ /// \param pc the IO priority class under which to queue this operation
+ ///
+ /// \return a future representing the number of bytes actually read. A short
+ /// read may happen due to end-of-file or an I/O error.
+ future<size_t> dma_read(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc = default_priority_class()) noexcept;
+
+ /// Performs a DMA write from the specified buffer.
+ ///
+ /// \param pos offset to write into. Must be aligned to \ref disk_write_dma_alignment.
+ /// \param buffer aligned address of buffer to read from. Buffer must exist
+ /// until the future is made ready.
+ /// \param len number of bytes to write. Must be aligned.
+ /// \param pc the IO priority class under which to queue this operation
+ ///
+ /// \return a future representing the number of bytes actually written. A short
+ /// write may happen due to an I/O error.
+ template <typename CharType>
+ future<size_t> dma_write(uint64_t pos, const CharType* buffer, size_t len, const io_priority_class& pc = default_priority_class()) noexcept {
+ return dma_write_impl(pos, reinterpret_cast<const uint8_t*>(buffer), len, pc);
+ }
+
+ /// Performs a DMA write to the specified iovec.
+ ///
+ /// \param pos offset to write into. Must be aligned to \ref disk_write_dma_alignment.
+ /// \param iov vector of address/size pairs to write from. Addresses must be
+ /// aligned.
+ /// \param pc the IO priority class under which to queue this operation
+ ///
+ /// \return a future representing the number of bytes actually written. A short
+ /// write may happen due to an I/O error.
+ future<size_t> dma_write(uint64_t pos, std::vector<iovec> iov, const io_priority_class& pc = default_priority_class()) noexcept;
+
+ /// Causes any previously written data to be made stable on persistent storage.
+ ///
+ /// Prior to a flush, written data may or may not survive a power failure. After
+ /// a flush, data is guaranteed to be on disk.
+ future<> flush() noexcept;
+
+ /// Returns \c stat information about the file.
+ future<struct stat> stat() noexcept;
+
+ /// Truncates the file to a specified length.
+ future<> truncate(uint64_t length) noexcept;
+
+ /// Preallocate disk blocks for a specified byte range.
+ ///
+ /// Requests the file system to allocate disk blocks to
+ /// back the specified range (\c length bytes starting at
+ /// \c position). The range may be outside the current file
+ /// size; the blocks can then be used when appending to the
+ /// file.
+ ///
+ /// \param position beginning of the range at which to allocate
+ /// blocks.
+ /// \param length length of range to allocate.
+ /// \return future that becomes ready when the operation completes.
+ future<> allocate(uint64_t position, uint64_t length) noexcept;
+
+ /// Discard unneeded data from the file.
+ ///
+ /// The discard operation tells the file system that a range of offsets
+ /// (which must be aligned) is no longer needed and can be reused.
+ future<> discard(uint64_t offset, uint64_t length) noexcept;
+
+ /// Gets the file size.
+ future<uint64_t> size() const noexcept;
+
+ /// Closes the file.
+ ///
+ /// Flushes any pending operations and releases any resources associated with
+ /// the file (except for stable storage).
+ ///
+ /// \note
+ /// to ensure file data reaches stable storage, you must call \ref flush()
+ /// before calling \c close().
+ future<> close() noexcept;
+
+ /// Returns a directory listing, given that this file object is a directory.
+ subscription<directory_entry> list_directory(std::function<future<> (directory_entry de)> next);
+
+ /**
+ * Read a bulk of data containing the provided address range, starting at
+ * the given offset and ending at either the address aligned to
+ * dma_alignment (4KB) or at the end of the file.
+ *
+ * @param offset starting address of the range the read bulk should contain
+ * @param range_size size of the addresses range
+ * @param pc the IO priority class under which to queue this operation
+ *
+ * @return temporary buffer containing the read data bulk.
+ * or exceptional future holding:
+ * system_error exception in case of I/O error or eof_error when
+ * "offset" is beyond EOF.
+ */
+ template <typename CharType>
+ future<temporary_buffer<CharType>>
+ dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc = default_priority_class()) noexcept {
+ return dma_read_bulk_impl(offset, range_size, pc).then([] (temporary_buffer<uint8_t> t) {
+ return temporary_buffer<CharType>(reinterpret_cast<CharType*>(t.get_write()), t.size(), t.release());
+ });
+ }
+
+ /// \brief Creates a handle that can be transported across shards.
+ ///
+ /// Creates a handle that can be transported across shards, and then
+ /// used to create a new shard-local \ref file object that refers to
+ /// the same on-disk file.
+ ///
+ /// \note Use on read-only files.
+ ///
+ file_handle dup();
+
+ template <typename CharType>
+ struct read_state;
+private:
+ future<temporary_buffer<uint8_t>>
+ dma_read_bulk_impl(uint64_t offset, size_t range_size, const io_priority_class& pc) noexcept;
+
+ future<size_t>
+ dma_write_impl(uint64_t pos, const uint8_t* buffer, size_t len, const io_priority_class& pc) noexcept;
+
+ future<temporary_buffer<uint8_t>>
+ dma_read_impl(uint64_t pos, size_t len, const io_priority_class& pc) noexcept;
+
+ future<size_t>
+ dma_read_impl(uint64_t aligned_pos, uint8_t* aligned_buffer, size_t aligned_len, const io_priority_class& pc) noexcept;
+
+ future<temporary_buffer<uint8_t>>
+ dma_read_exactly_impl(uint64_t pos, size_t len, const io_priority_class& pc) noexcept;
+
+ friend class reactor;
+ friend class file_impl;
+};
+
+/// \brief Helper for ensuring a file is closed after \c func is called.
+///
+/// The file provided by the \c file_fut future is passed to \c func.
+///
+/// \param file_fut A future that produces a file
+/// \param func A function that uses a file
+/// \returns the future returned by \c func, or an exceptional future if either \c file_fut or closing the file failed.
+template <typename Func>
+SEASTAR_CONCEPT( requires std::invocable<Func, file&> && std::is_nothrow_move_constructible_v<Func> )
+auto with_file(future<file> file_fut, Func func) noexcept {
+ static_assert(std::is_nothrow_move_constructible_v<Func>, "Func's move constructor must not throw");
+ return file_fut.then([func = std::move(func)] (file f) mutable {
+ return do_with(std::move(f), [func = std::move(func)] (file& f) mutable {
+ return futurize_invoke(func, f).finally([&f] {
+ return f.close();
+ });
+ });
+ });
+}
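+
+/// Example (illustrative sketch; \c open_file_dma() is assumed to be the usual
+/// file-opening entry point declared in seastar/core/seastar.hh):
+///
+/// \code
+/// future<uint64_t> sz = with_file(open_file_dma("data.bin", open_flags::ro), [] (file& f) {
+///     return f.size();
+/// });
+/// \endcode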
+
+/// \brief Helper for ensuring a file is closed if \c func fails.
+///
+/// The file provided by the \c file_fut future is passed to \c func.
+/// * If func throws an exception E, the file is closed and we return
+/// a failed future with E.
+/// * If func returns a value V, the file is not closed and we return
+/// a future with V.
+/// Note that when an exception is not thrown, it is the
+/// responsibility of func to make sure the file will be closed. It
+/// can close the file itself, return it, or store it somewhere.
+///
+/// \param file_fut A future that produces a file
+/// \param func A function that uses a file
+/// \returns the future returned by \c func, or an exceptional future if \c file_fut failed or a nested exception if closing the file failed.
+template <typename Func>
+SEASTAR_CONCEPT( requires std::invocable<Func, file&> && std::is_nothrow_move_constructible_v<Func> )
+auto with_file_close_on_failure(future<file> file_fut, Func func) noexcept {
+ static_assert(std::is_nothrow_move_constructible_v<Func>, "Func's move constructor must not throw");
+ return file_fut.then([func = std::move(func)] (file f) mutable {
+ return do_with(std::move(f), [func = std::move(func)] (file& f) mutable {
+ return futurize_invoke(std::move(func), f).then_wrapped([&f] (auto ret) mutable {
+ if (!ret.failed()) {
+ return ret;
+ }
+ return ret.finally([&f] {
+ // If f.close() fails, return that as nested exception.
+ return f.close();
+ });
+ });
+ });
+ });
+}
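+
+/// Example (illustrative sketch; \c open_file_dma() and the API-level-3
+/// \c make_file_output_stream() from seastar/core/fstream.hh are assumed; the
+/// stream takes ownership of the file on success, so the file must not be
+/// closed here):
+///
+/// \code
+/// future<output_stream<char>> out =
+///     with_file_close_on_failure(open_file_dma("out.bin", open_flags::wo | open_flags::create),
+///                                [] (file& f) {
+///         return make_file_output_stream(std::move(f));
+///     });
+/// \endcode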
+
+/// \example file_demo.cc
+/// A program demonstrating the use of \ref seastar::with_file
+/// and \ref seastar::with_file_close_on_failure
+
+/// \brief A shard-transportable handle to a file
+///
+/// If you need to access a file (for reads only) across multiple shards,
+/// you can use the file::dup() method to create a `file_handle`, transport
+/// this file handle to another shard, and use the handle to create \ref file
+/// object on that shard. This is more efficient than calling open_file_dma()
+/// again.
+class file_handle {
+ std::unique_ptr<file_handle_impl> _impl;
+private:
+ explicit file_handle(std::unique_ptr<file_handle_impl> impl) : _impl(std::move(impl)) {}
+public:
+ /// Copies a file handle object
+ file_handle(const file_handle&);
+ /// Moves a file handle object
+ file_handle(file_handle&&) noexcept;
+ /// Assigns a file handle object
+ file_handle& operator=(const file_handle&);
+ /// Move-assigns a file handle object
+ file_handle& operator=(file_handle&&) noexcept;
+ /// Converts the file handle object to a \ref file.
+ file to_file() const &;
+ /// Converts the file handle object to a \ref file.
+ file to_file() &&;
+
+ friend class file;
+};
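+
+/// Example (illustrative sketch; assumes \c smp::submit_to() from
+/// seastar/core/smp.hh and an \c other_shard shard id):
+///
+/// \code
+/// file_handle h = f.dup();
+/// return smp::submit_to(other_shard, [h = std::move(h)] () mutable {
+///     file f2 = std::move(h).to_file();
+///     return f2.size();
+/// });
+/// \endcode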
+
+/// \cond internal
+
+template <typename CharType>
+struct file::read_state {
+ typedef temporary_buffer<CharType> tmp_buf_type;
+
+ read_state(uint64_t offset, uint64_t front, size_t to_read,
+ size_t memory_alignment, size_t disk_alignment)
+ : buf(tmp_buf_type::aligned(memory_alignment,
+ align_up(to_read, disk_alignment)))
+ , _offset(offset)
+ , _to_read(to_read)
+ , _front(front) {}
+
+ bool done() const {
+ return eof || pos >= _to_read;
+ }
+
+ /**
+ * Trim the buffer to the actual number of read bytes and cut the
+ * bytes from offset 0 till "_front".
+ *
+ * @note this function has to be called only if we read bytes beyond
+ * "_front".
+ */
+ void trim_buf_before_ret() {
+ if (have_good_bytes()) {
+ buf.trim(pos);
+ buf.trim_front(_front);
+ } else {
+ buf.trim(0);
+ }
+ }
+
+ uint64_t cur_offset() const {
+ return _offset + pos;
+ }
+
+ size_t left_space() const {
+ return buf.size() - pos;
+ }
+
+ size_t left_to_read() const {
+ // positive as long as (done() == false)
+ return _to_read - pos;
+ }
+
+ void append_new_data(tmp_buf_type& new_data) {
+ auto to_copy = std::min(left_space(), new_data.size());
+
+ std::memcpy(buf.get_write() + pos, new_data.get(), to_copy);
+ pos += to_copy;
+ }
+
+ bool have_good_bytes() const {
+ return pos > _front;
+ }
+
+public:
+ bool eof = false;
+ tmp_buf_type buf;
+ size_t pos = 0;
+private:
+ uint64_t _offset;
+ size_t _to_read;
+ uint64_t _front;
+};
+
+/// \endcond
+
+/// @}
+
+}
diff --git a/src/seastar/include/seastar/core/fsqual.hh b/src/seastar/include/seastar/core/fsqual.hh
new file mode 100644
index 000000000..24a3d1d72
--- /dev/null
+++ b/src/seastar/include/seastar/core/fsqual.hh
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2017 ScyllaDB
+ */
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <seastar/core/sstring.hh>
+
+namespace seastar {
+
+bool filesystem_has_good_aio_support(sstring directory, bool verbose = false);
+
+}
diff --git a/src/seastar/include/seastar/core/fstream.hh b/src/seastar/include/seastar/core/fstream.hh
new file mode 100644
index 000000000..67d59abfd
--- /dev/null
+++ b/src/seastar/include/seastar/core/fstream.hh
@@ -0,0 +1,151 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+/// \file
+
+// File <-> streams adapters
+//
+// Seastar files are block-based due to the reliance on DMA - you must read
+// on sector boundaries. The adapters in this file provide a byte stream
+// interface to files, while retaining the zero-copy characteristics of
+// seastar files.
+
+#include <seastar/core/file.hh>
+#include <seastar/core/iostream.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/internal/api-level.hh>
+
+namespace seastar {
+
+class file_input_stream_history {
+ static constexpr uint64_t window_size = 4 * 1024 * 1024;
+ struct window {
+ uint64_t total_read = 0;
+ uint64_t unused_read = 0;
+ };
+ window current_window;
+ window previous_window;
+ unsigned read_ahead = 1;
+
+ friend class file_data_source_impl;
+};
+
+/// Data structure describing options for opening a file input stream
+struct file_input_stream_options {
+ size_t buffer_size = 8192; ///< I/O buffer size
+ unsigned read_ahead = 0; ///< Maximum number of extra read-ahead operations
+ ::seastar::io_priority_class io_priority_class = default_priority_class();
+ lw_shared_ptr<file_input_stream_history> dynamic_adjustments = { }; ///< Input stream history, if null dynamic adjustments are disabled
+};
+
+/// \brief Creates an input_stream to read a portion of a file.
+///
+/// \param file File to read; multiple streams for the same file may coexist
+/// \param offset Starting offset to read from (no alignment restrictions)
+/// \param len Maximum number of bytes to read; the stream will stop at end-of-file
+/// even if `offset + len` is beyond end-of-file.
+/// \param options A set of options controlling the stream.
+///
+/// \note Multiple input streams may exist concurrently for the same file.
+input_stream<char> make_file_input_stream(
+ file file, uint64_t offset, uint64_t len, file_input_stream_options options = {});
+
+// Create an input_stream for a given file, with the specified options.
+// Multiple fibers of execution (continuations) may safely open
+// multiple input streams concurrently for the same file.
+input_stream<char> make_file_input_stream(
+ file file, uint64_t offset, file_input_stream_options = {});
+
+// Create an input_stream for reading starting at a given position of the
+// given file. Multiple fibers of execution (continuations) may safely open
+// multiple input streams concurrently for the same file.
+input_stream<char> make_file_input_stream(
+ file file, file_input_stream_options = {});
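+
+// Example (illustrative sketch; `f` is assumed to be an already-open, readable file):
+//
+//   file_input_stream_options opts;
+//   opts.buffer_size = 32 << 10;
+//   opts.read_ahead = 4;
+//   input_stream<char> in = make_file_input_stream(std::move(f), 0, opts);
+//   // in.read() / in.read_exactly(n) now yield byte-oriented, zero-copy buffers.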
+
+struct file_output_stream_options {
+ // For small files, setting preallocation_size can make it impossible for XFS to find
+ // an aligned extent. On the other hand, without it, XFS will divide the file into
+ // file_size/buffer_size extents. To avoid fragmentation, we set the default buffer_size
+ // to 64k (so each extent will be a minimum of 64k) and preallocation_size to 0 (to avoid
+ // extent allocation problems).
+ //
+ // Large files should increase both buffer_size and preallocation_size.
+ unsigned buffer_size = 65536;
+ unsigned preallocation_size = 0; ///< Preallocate extents. For large files, set to a large number (a few megabytes) to reduce fragmentation
+ unsigned write_behind = 1; ///< Number of buffers to write in parallel
+ ::seastar::io_priority_class io_priority_class = default_priority_class();
+};
+
+SEASTAR_INCLUDE_API_V2 namespace api_v2 {
+
+/// Create an output_stream for writing starting at the position zero of a
+/// newly created file.
+/// NOTE: flush() should be the last thing to be called on a file output stream.
+[[deprecated("use Seastar_API_LEVEL=3 instead")]]
+output_stream<char> make_file_output_stream(
+ file file,
+ uint64_t buffer_size = 8192);
+
+/// Create an output_stream for writing starting at the position zero of a
+/// newly created file.
+/// NOTE: flush() should be the last thing to be called on a file output stream.
+[[deprecated("use Seastar_API_LEVEL=3 instead")]]
+output_stream<char> make_file_output_stream(
+ file file,
+ file_output_stream_options options);
+
+/// Create a data_sink for writing starting at the position zero of a
+/// newly created file.
+[[deprecated("use Seastar_API_LEVEL=3 instead")]]
+data_sink make_file_data_sink(file, file_output_stream_options);
+
+}
+
+SEASTAR_INCLUDE_API_V3 namespace api_v3 {
+inline namespace and_newer {
+
+/// Create an output_stream for writing starting at the position zero of a
+/// newly created file.
+/// NOTE: flush() should be the last thing to be called on a file output stream.
+/// Closes the file if the stream creation fails.
+future<output_stream<char>> make_file_output_stream(
+ file file,
+ uint64_t buffer_size = 8192) noexcept;
+
+/// Create an output_stream for writing starting at the position zero of a
+/// newly created file.
+/// NOTE: flush() should be the last thing to be called on a file output stream.
+/// Closes the file if the stream creation fails.
+future<output_stream<char>> make_file_output_stream(
+ file file,
+ file_output_stream_options options) noexcept;
+
+/// Create a data_sink for writing starting at the position zero of a
+/// newly created file.
+/// Closes the file if the sink creation fails.
+future<data_sink> make_file_data_sink(file, file_output_stream_options) noexcept;
+
+}
+}
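+
+// Example (illustrative sketch) of writing through the API-level-3 interface; per the
+// notes above, flush() is the last write-side call before close(). `f` is assumed to be
+// a newly created file and `data` a temporary_buffer<char> holding the payload.
+//
+//   return make_file_output_stream(std::move(f)).then([data = std::move(data)] (output_stream<char> out) mutable {
+//       return do_with(std::move(out), std::move(data), [] (output_stream<char>& out, temporary_buffer<char>& data) {
+//           return out.write(data.get(), data.size())
+//                     .then([&out] { return out.flush(); })
+//                     .finally([&out] { return out.close(); });
+//       });
+//   });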
+
+}
diff --git a/src/seastar/include/seastar/core/function_traits.hh b/src/seastar/include/seastar/core/function_traits.hh
new file mode 100644
index 000000000..a3b9b9d31
--- /dev/null
+++ b/src/seastar/include/seastar/core/function_traits.hh
@@ -0,0 +1,68 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+#pragma once
+
+#include <tuple>
+
+namespace seastar {
+
+template<typename T>
+struct function_traits;
+
+template<typename Ret, typename... Args>
+struct function_traits<Ret(Args...)>
+{
+ using return_type = Ret;
+ using args_as_tuple = std::tuple<Args...>;
+ using signature = Ret (Args...);
+
+ static constexpr std::size_t arity = sizeof...(Args);
+
+ template <std::size_t N>
+ struct arg
+ {
+ static_assert(N < arity, "no such parameter index.");
+ using type = typename std::tuple_element<N, std::tuple<Args...>>::type;
+ };
+};
+
+template<typename Ret, typename... Args>
+struct function_traits<Ret(*)(Args...)> : public function_traits<Ret(Args...)>
+{};
+
+template <typename T, typename Ret, typename... Args>
+struct function_traits<Ret(T::*)(Args...)> : public function_traits<Ret(Args...)>
+{};
+
+template <typename T, typename Ret, typename... Args>
+struct function_traits<Ret(T::*)(Args...) const> : public function_traits<Ret(Args...)>
+{};
+
+template <typename T>
+struct function_traits : public function_traits<decltype(&T::operator())>
+{};
+
+template<typename T>
+struct function_traits<T&> : public function_traits<std::remove_reference_t<T>>
+{};
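+
+// Example (illustrative): extracting types from a callable's signature.
+//
+//   using traits = function_traits<int (double, char)>;
+//   static_assert(std::is_same_v<traits::return_type, int>);
+//   static_assert(traits::arity == 2);
+//   static_assert(std::is_same_v<traits::arg<1>::type, char>);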
+
+}
diff --git a/src/seastar/include/seastar/core/future-util.hh b/src/seastar/include/seastar/core/future-util.hh
new file mode 100644
index 000000000..3252accf5
--- /dev/null
+++ b/src/seastar/include/seastar/core/future-util.hh
@@ -0,0 +1,31 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/do_with.hh>
+#include <seastar/core/with_scheduling_group.hh>
+#include <seastar/core/loop.hh>
+#include <seastar/core/when_all.hh>
+#include <seastar/core/map_reduce.hh>
+#include <seastar/core/with_timeout.hh>
+#include <seastar/util/later.hh>
diff --git a/src/seastar/include/seastar/core/future.hh b/src/seastar/include/seastar/core/future.hh
new file mode 100644
index 000000000..81ee18528
--- /dev/null
+++ b/src/seastar/include/seastar/core/future.hh
@@ -0,0 +1,2196 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/task.hh>
+#include <seastar/core/thread_impl.hh>
+#include <stdexcept>
+#include <atomic>
+#include <memory>
+#include <type_traits>
+#include <assert.h>
+#include <cstdlib>
+#include <seastar/core/function_traits.hh>
+#include <seastar/util/critical_alloc_section.hh>
+#include <seastar/util/attribute-compat.hh>
+#include <seastar/util/concepts.hh>
+#include <seastar/util/noncopyable_function.hh>
+#include <seastar/util/backtrace.hh>
+
+#if __cplusplus > 201703L
+#include <concepts>
+#endif
+
+namespace seastar {
+
+struct nested_exception : public std::exception {
+ std::exception_ptr inner;
+ std::exception_ptr outer;
+ nested_exception(std::exception_ptr inner, std::exception_ptr outer) noexcept;
+ nested_exception(nested_exception&&) noexcept;
+ nested_exception(const nested_exception&) noexcept;
+ [[noreturn]] void rethrow_nested() const;
+ virtual const char* what() const noexcept override;
+};
+
+/// \defgroup future-module Futures and Promises
+///
+/// \brief
+/// Futures and promises are the basic tools for asynchronous
+/// programming in seastar. A future represents a result that
+/// may not have been computed yet, for example a buffer that
+/// is being read from the disk, or the result of a function
+/// that is executed on another cpu. A promise object allows
+/// the future to be eventually resolved by assigning it a value.
+///
+/// \brief
+/// Another way to look at futures and promises is as the reader
+/// and writer sides, respectively, of a single-item, single-use
+/// queue. You read from the future, and write to the promise,
+/// and the system takes care that it works no matter what the
+/// order of operations is.
+///
+/// \brief
+/// The normal way of working with futures is to chain continuations
+/// to them. A continuation is a block of code (usually a lambda)
+/// that is called when the future is assigned a value (the future
+/// is resolved); the continuation can then access the actual value.
+///
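+/// \brief
+/// A minimal illustration (an informal sketch; it assumes the usual
+/// \c seastar::sleep() helper from seastar/core/sleep.hh):
+///
+/// \code
+/// future<int> slow_answer() {
+///     return sleep(std::chrono::seconds(1)).then([] {
+///         return 42;
+///     });
+/// }
+///
+/// future<> consume() {
+///     return slow_answer().then([] (int v) {
+///         // v is available here once the sleep has completed
+///     });
+/// }
+/// \endcode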
+
+/// \defgroup future-module-impl Implementation overview
+/// \ingroup future-module
+///
+/// A future has a stored value. Semantically, the value is a
+/// std::optional<std::variant<T, std::exception_ptr>>. The actual
+/// type of the value in the implementation is future_state<T>.
+///
+/// A future without an initial value can be created by first creating
+/// a promise and then calling promise::get_future. The promise also
+/// stores a future_state<T> in case promise::set_value is called
+/// before get_future.
+///
+/// In addition to the future_state<T>, the promise and the future
+/// point to each other and the pointers are updated when either is
+/// moved.
+///
+/// If a future is consumed by future::then before the future is
+/// ready, a continuation is dynamically allocated. The continuation
+/// also has a future_state<T>, but unlike a future it is never
+/// moved.
+///
+/// After a future creates a continuation, the corresponding promise
+/// points to the newly allocated continuation. When
+/// promise::set_value is called, the continuation is ready and is
+/// scheduled.
+///
+/// A promise then consists of
+/// * A future_state<T> for use when there is no corresponding future
+/// or continuation (_local_state).
+/// * A pointer to a future to allow updates when the promise is moved
+/// (_future).
+/// * A pointer to the continuation (_task).
+/// * A pointer to future_state<T> (_state) that can point to
+/// 1. The future_state<T> in the promise itself
+/// 2. The future_state<T> in the future
+/// 3. The future_state<T> in the continuation
+///
+/// A special case is when a future blocks inside a thread. In that
+/// case we still need a continuation, but that continuation doesn't
+/// need a future_state<T> since the original future still exists on
+/// the stack.
+///
+/// So the valid states for a promise are:
+///
+/// 1. A newly created promise. _state points to _local_state and
+/// _task and _future are null.
+/// 2. After get_future is called. _state points to the state in the
+/// future, _future points to the future and _task is null.
+/// 3. The future has been consumed by future::then. Now the _state
+/// points to the state in the continuation, _future is null and
+/// _task points to the continuation.
+/// 4. A call to future::get is blocked in a thread. This is a mix of
+/// cases 2 and 3. Like 2, there is a valid future and _future and
+/// _state point to the future and its state. Like 3, there is a
+/// valid continuation and _task points to it, but that
+/// continuation has no state of its own.
+
+/// \defgroup future-util Future Utilities
+/// \ingroup future-module
+///
+/// \brief
+/// These utilities are provided to help perform operations on futures.
+
+
+/// \addtogroup future-module
+/// @{
+
+#if SEASTAR_API_LEVEL < 6
+template <class... T>
+#else
+template <class T = void>
+#endif
+class promise;
+
+template <class SEASTAR_ELLIPSIS T>
+class future;
+
+template <typename... T>
+class shared_future;
+
+struct future_state_base;
+
+/// \brief Creates a \ref future in an available, value state.
+///
+/// Creates a \ref future object that is already resolved. This
+/// is useful when it is determined that no I/O needs to be performed
+/// to perform a computation (for example, because the data is cached
+/// in some buffer).
+template <typename... T, typename... A>
+future<T...> make_ready_future(A&&... value) noexcept;
+
+/// \brief Creates a \ref future in an available, failed state.
+///
+/// Creates a \ref future object that is already resolved in a failed
+/// state. This is useful when no I/O needs to be performed to perform
+/// a computation (for example, because the connection is closed and
+/// we cannot read from it).
+template <typename... T>
+future<T...> make_exception_future(std::exception_ptr&& value) noexcept;
+
+template <typename... T>
+future<T...> make_exception_future(const std::exception_ptr& ex) noexcept {
+ return make_exception_future<T...>(std::exception_ptr(ex));
+}
+
+template <typename... T>
+future<T...> make_exception_future(std::exception_ptr& ex) noexcept {
+ return make_exception_future<T...>(static_cast<const std::exception_ptr&>(ex));
+}
+
+/// \cond internal
+void engine_exit(std::exception_ptr eptr = {});
+
+void report_failed_future(const std::exception_ptr& ex) noexcept;
+
+void report_failed_future(const future_state_base& state) noexcept;
+
+void with_allow_abandoned_failed_futures(unsigned count, noncopyable_function<void ()> func);
+
+/// \endcond
+
+/// \brief Exception type for broken promises
+///
+/// When a promise is broken, i.e. a promise object with an attached
+/// continuation is destroyed before setting any value or exception, an
+/// exception of `broken_promise` type is propagated to that abandoned
+/// continuation.
+struct broken_promise : std::logic_error {
+ broken_promise();
+};
+
+/// \brief Returns std::current_exception() wrapped in a future
+///
+/// This is equivalent to
+/// make_exception_future(std::current_exception()), but expands to
+/// less code.
+template <typename... T>
+future<T...> current_exception_as_future() noexcept;
+
+extern template
+future<> current_exception_as_future() noexcept;
+
+namespace internal {
+#if SEASTAR_API_LEVEL < 6
+template <class... T>
+#else
+template <class T = void>
+#endif
+class promise_base_with_type;
+class promise_base;
+
+struct monostate {};
+
+template <typename... T>
+struct future_stored_type;
+
+template <>
+struct future_stored_type<> {
+#if SEASTAR_API_LEVEL < 5
+ using type = std::tuple<>;
+#else
+ using type = monostate;
+#endif
+};
+
+template <typename T>
+struct future_stored_type<T> {
+#if SEASTAR_API_LEVEL < 5
+ using type = std::tuple<T>;
+#else
+ using type = std::conditional_t<std::is_void_v<T>, internal::monostate, T>;
+#endif
+};
+
+template <typename... T>
+using future_stored_type_t = typename future_stored_type<T...>::type;
+
+template<typename T>
+#if SEASTAR_API_LEVEL < 5
+using future_tuple_type_t = T;
+#else
+using future_tuple_type_t = std::conditional_t<std::is_same_v<T, monostate>, std::tuple<>, std::tuple<T>>;
+#endif
+
+// It doesn't seem to be possible to use std::tuple_element_t with an empty tuple. There is a static_assert in it that
+// fails the build even if it is in the non enabled side of std::conditional.
+template <typename T>
+struct get0_return_type;
+
+template <>
+struct get0_return_type<std::tuple<>> {
+ using type = void;
+ static type get0(std::tuple<> v) { }
+};
+
+template <typename T0, typename... T>
+struct get0_return_type<std::tuple<T0, T...>> {
+ using type = T0;
+ static type get0(std::tuple<T0, T...> v) { return std::get<0>(std::move(v)); }
+};
+
+template<typename T>
+using maybe_wrap_ref = std::conditional_t<std::is_reference_v<T>, std::reference_wrapper<std::remove_reference_t<T>>, T>;
+
+/// \brief Wrapper for keeping uninitialized values of non default constructible types.
+///
+/// This is similar to a std::optional<T>, but it doesn't know if it is holding a value or not, so the user is
+/// responsible for calling constructors and destructors.
+///
+/// The advantage over just using a union directly is that this uses inheritance when possible and so benefits from the
+/// empty base optimization.
+template <typename T, bool is_trivial_class>
+struct uninitialized_wrapper_base;
+
+template <typename T>
+struct uninitialized_wrapper_base<T, false> {
+ using tuple_type = future_tuple_type_t<T>;
+ union any {
+ any() noexcept {}
+ ~any() {}
+ // T can be a reference, so wrap it.
+ maybe_wrap_ref<T> value;
+ } _v;
+
+public:
+ uninitialized_wrapper_base() noexcept = default;
+ template<typename... U>
+ std::enable_if_t<!std::is_same_v<std::tuple<std::remove_cv_t<U>...>, std::tuple<tuple_type>>, void>
+ uninitialized_set(U&&... vs) {
+ new (&_v.value) maybe_wrap_ref<T>{T(std::forward<U>(vs)...)};
+ }
+ void uninitialized_set(tuple_type&& v) {
+ uninitialized_set(std::move(std::get<0>(v)));
+ }
+ void uninitialized_set(const tuple_type& v) {
+ uninitialized_set(std::get<0>(v));
+ }
+ maybe_wrap_ref<T>& uninitialized_get() {
+ return _v.value;
+ }
+ const maybe_wrap_ref<T>& uninitialized_get() const {
+ return _v.value;
+ }
+};
+
+template <typename T> struct uninitialized_wrapper_base<T, true> : private T {
+ using tuple_type = future_tuple_type_t<T>;
+ uninitialized_wrapper_base() noexcept = default;
+ template<typename... U>
+ std::enable_if_t<!std::is_same_v<std::tuple<std::remove_cv_t<U>...>, std::tuple<tuple_type>>, void>
+ uninitialized_set(U&&... vs) {
+ new (this) T(std::forward<U>(vs)...);
+ }
+ void uninitialized_set(tuple_type&& v) {
+ if constexpr (std::tuple_size_v<tuple_type> != 0) {
+ uninitialized_set(std::move(std::get<0>(v)));
+ }
+ }
+ void uninitialized_set(const tuple_type& v) {
+ if constexpr (std::tuple_size_v<tuple_type> != 0) {
+ uninitialized_set(std::get<0>(v));
+ }
+ }
+ T& uninitialized_get() {
+ return *this;
+ }
+ const T& uninitialized_get() const {
+ return *this;
+ }
+};
+
+template <typename T>
+constexpr bool can_inherit =
+#ifdef _LIBCPP_VERSION
+// We expect std::tuple<> to be trivially constructible and
+// destructible. That is not the case with libc++
+// (https://bugs.llvm.org/show_bug.cgi?id=41714). We could avoid this
+// optimization when using libc++ and relax the asserts, but
+// inspection suggests that std::tuple<> is trivial; it is just not
+// marked as such.
+ std::is_same<std::tuple<>, T>::value ||
+#endif
+ (std::is_trivially_destructible<T>::value && std::is_trivially_constructible<T>::value &&
+ std::is_class<T>::value && !std::is_final<T>::value);
+
+// The objective is to avoid extra space for empty types like std::tuple<>. We could use std::is_empty_v, but it is
+// better to check that both the constructor and destructor can be skipped.
+template <typename T>
+struct uninitialized_wrapper
+ : public uninitialized_wrapper_base<T, can_inherit<T>> {};
+
+template <typename T>
+struct is_trivially_move_constructible_and_destructible {
+ static constexpr bool value = std::is_trivially_move_constructible<T>::value && std::is_trivially_destructible<T>::value;
+};
+
+template <bool... v>
+struct all_true : std::false_type {};
+
+template <>
+struct all_true<> : std::true_type {};
+
+template <bool... v>
+struct all_true<true, v...> : public all_true<v...> {};
+
+template<typename T>
+struct is_tuple_effectively_trivially_move_constructible_and_destructible_helper;
+
+template <typename... T>
+struct is_tuple_effectively_trivially_move_constructible_and_destructible_helper<std::tuple<T...>> {
+ static constexpr bool value = all_true<is_trivially_move_constructible_and_destructible<T>::value...>::value;
+};
+
+template <typename T>
+static constexpr bool is_tuple_effectively_trivially_move_constructible_and_destructible =
+ is_tuple_effectively_trivially_move_constructible_and_destructible_helper<T>::value;
+
+}
+
+//
+// A future/promise pair maintain one logical value (a future_state).
+// There are up to three places that can store it, but only one is
+// active at any time.
+//
+// - in the promise _local_state member variable
+//
+// This is necessary because a promise is created first and there
+// would be nowhere else to put the value.
+//
+// - in the future _state variable
+//
+// This is used any time a future exists and then() has not been called
+// yet. This guarantees simple access to the value for any code
+// that already has a future.
+//
+// - in the task associated with the .then() clause (after .then() is called,
+// if a value was not set)
+//
+//
+// The promise maintains a pointer to the state, which is modified as
+// the state moves to a new location due to events (such as .then() or
+// get_future being called) or due to the promise or future being
+// moved around.
+//
+
+// non templated base class to reduce code duplication
+struct future_state_base {
+ static_assert(sizeof(std::exception_ptr) == sizeof(void*), "exception_ptr not a pointer");
+ enum class state : uintptr_t {
+ invalid = 0,
+ future = 1,
+ // the substate is intended to decouple the run-time prevention
+ // for duplicative result extraction (calling e.g. then() twice
+ // ends up in abandoned()) from the wrapped object's destruction
+ // handling which is orchestrated by future_state. Instead of
+ // creating a temporary future_state just for the sake of setting
+ // the "invalid" in the source instance, result_unavailable can
+ // be set to ensure future_state_base::available() returns false.
+ result_unavailable = 2,
+ result = 3,
+ exception_min = 4, // or anything greater
+ };
+ union any {
+ any() noexcept { st = state::future; }
+ any(state s) noexcept { st = s; }
+ void set_exception(std::exception_ptr&& e) noexcept {
+ new (&ex) std::exception_ptr(std::move(e));
+ assert(st >= state::exception_min);
+ }
+ any(std::exception_ptr&& e) noexcept {
+ set_exception(std::move(e));
+ }
+ // From a user's perspective, a result_unavailable is not valid
+ bool valid() const noexcept { return st != state::invalid && st != state::result_unavailable; }
+ bool available() const noexcept { return st == state::result || st >= state::exception_min; }
+ bool failed() const noexcept { return __builtin_expect(st >= state::exception_min, false); }
+ void check_failure() noexcept;
+ ~any() noexcept { }
+ std::exception_ptr take_exception() noexcept {
+ std::exception_ptr ret(std::move(ex));
+ // Unfortunately in libstdc++ ~exception_ptr is defined out of line. We know that it does nothing for
+ // moved out values, so we omit calling it. This is critical for the code quality produced for this
+ // function. Without the out of line call, gcc can figure out that both sides of the if produce
+ // identical code and merges them.
+ // We don't make any assumptions about other C++ libraries.
+ // There is a request for gcc to define it inline: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90295
+#ifndef __GLIBCXX__
+ ex.~exception_ptr();
+#endif
+ st = state::invalid;
+ return ret;
+ }
+ void move_it(any&& x) noexcept {
+#ifdef __GLIBCXX__
+ // Unfortunately gcc cannot fully optimize the regular
+ // implementation:
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95014
+ // Given what we know about the libstdc++ implementation
+ // (see the comment in take_exception), we can just
+ // memmove and zero x. We use memmove to guarantee
+ // valid results if &x == this.
+ memmove(static_cast<void*>(this), &x, sizeof(any));
+ x.st = state::invalid;
+#else
+ if (x.st < state::exception_min) {
+ st = x.st;
+ x.st = state::invalid;
+ } else {
+ new (&ex) std::exception_ptr(x.take_exception());
+ }
+#endif
+ }
+ any(any&& x) noexcept {
+ move_it(std::move(x));
+ }
+ any& operator=(any&& x) noexcept {
+ check_failure();
+ // If this is a self move assignment, check_failure
+ // guarantees that we don't have an exception and calling
+ // move_it is safe.
+ move_it(std::move(x));
+ return *this;
+ }
+ bool has_result() const noexcept {
+ return st == state::result || st == state::result_unavailable;
+ }
+ state st;
+ std::exception_ptr ex;
+ } _u;
+
+ future_state_base() noexcept = default;
+ future_state_base(state st) noexcept : _u(st) { }
+ future_state_base(std::exception_ptr&& ex) noexcept : _u(std::move(ex)) { }
+ future_state_base(future_state_base&& x) noexcept : _u(std::move(x._u)) { }
+
+ // We never need to destruct this polymorphicly, so we can make it
+ // protected instead of virtual.
+protected:
+ struct current_exception_future_marker {};
+ future_state_base(current_exception_future_marker) noexcept;
+ struct nested_exception_marker {};
+ future_state_base(nested_exception_marker, future_state_base&& old) noexcept;
+ future_state_base(nested_exception_marker, future_state_base&& n, future_state_base&& old) noexcept;
+ ~future_state_base() noexcept = default;
+
+ void rethrow_exception() &&;
+ void rethrow_exception() const&;
+
+public:
+
+ bool valid() const noexcept { return _u.valid(); }
+ bool available() const noexcept { return _u.available(); }
+ bool failed() const noexcept { return _u.failed(); }
+
+ void ignore() noexcept;
+
+ void set_exception(std::exception_ptr&& ex) noexcept {
+ assert(_u.st == state::future);
+ _u.set_exception(std::move(ex));
+ }
+ future_state_base& operator=(future_state_base&& x) noexcept = default;
+ void set_exception(future_state_base&& state) noexcept {
+ assert(_u.st == state::future);
+ *this = std::move(state);
+ }
+ std::exception_ptr get_exception() && noexcept {
+ assert(_u.st >= state::exception_min);
+ // Move ex out so future::~future() knows we've handled it
+ return _u.take_exception();
+ }
+ const std::exception_ptr& get_exception() const& noexcept {
+ assert(_u.st >= state::exception_min);
+ return _u.ex;
+ }
+ template <typename U>
+ friend struct future_state;
+ template <typename... U>
+ friend future<U...> current_exception_as_future() noexcept;
+ template <typename SEASTAR_ELLIPSIS U>
+ friend class future;
+ template <typename T>
+ friend struct futurize;
+};
+
+void report_failed_future(future_state_base::any&& state) noexcept;
+
+inline void future_state_base::any::check_failure() noexcept {
+ if (failed()) {
+ report_failed_future(std::move(*this));
+ }
+}
+
+struct ready_future_marker {};
+struct exception_future_marker {};
+struct future_for_get_promise_marker {};
+
+/// \cond internal
+template <typename T>
+struct future_state : public future_state_base, private internal::uninitialized_wrapper<T> {
+ static constexpr bool copy_noexcept = std::is_nothrow_copy_constructible<T>::value;
+#if SEASTAR_API_LEVEL < 5
+ static constexpr bool has_trivial_move_and_destroy = internal::is_tuple_effectively_trivially_move_constructible_and_destructible<T>;
+#else
+ static constexpr bool has_trivial_move_and_destroy = internal::is_trivially_move_constructible_and_destructible<T>::value;
+#endif
+ static_assert(std::is_nothrow_move_constructible<T>::value,
+ "Types must be no-throw move constructible");
+ static_assert(std::is_nothrow_destructible<T>::value,
+ "Types must be no-throw destructible");
+ future_state() noexcept = default;
+ void move_it(future_state&& x) noexcept {
+ if constexpr (has_trivial_move_and_destroy) {
+ memmove(reinterpret_cast<char*>(&this->uninitialized_get()),
+ &x.uninitialized_get(),
+ internal::used_size<internal::maybe_wrap_ref<T>>::value);
+ } else if (_u.has_result()) {
+ this->uninitialized_set(std::move(x.uninitialized_get()));
+ std::destroy_at(&x.uninitialized_get());
+ }
+ }
+
+ [[gnu::always_inline]]
+ future_state(future_state&& x) noexcept : future_state_base(std::move(x)) {
+ move_it(std::move(x));
+ }
+
+ void clear() noexcept {
+ if (_u.has_result()) {
+ std::destroy_at(&this->uninitialized_get());
+ } else {
+ _u.check_failure();
+ }
+ }
+ __attribute__((always_inline))
+ ~future_state() noexcept {
+ clear();
+ }
+ future_state& operator=(future_state&& x) noexcept {
+ clear();
+ future_state_base::operator=(std::move(x));
+ // If &x == this, _u.st is now state::invalid and so it is
+ // safe to call move_it.
+ move_it(std::move(x));
+ return *this;
+ }
+ template <typename... A>
+ future_state(ready_future_marker, A&&... a) noexcept : future_state_base(state::result) {
+ try {
+ this->uninitialized_set(std::forward<A>(a)...);
+ } catch (...) {
+ new (this) future_state(current_exception_future_marker());
+ }
+ }
+ template <typename... A>
+ void set(A&&... a) noexcept {
+ assert(_u.st == state::future);
+ new (this) future_state(ready_future_marker(), std::forward<A>(a)...);
+ }
+ future_state(exception_future_marker m, std::exception_ptr&& ex) noexcept : future_state_base(std::move(ex)) { }
+ future_state(exception_future_marker m, future_state_base&& state) noexcept : future_state_base(std::move(state)) { }
+ future_state(current_exception_future_marker m) noexcept : future_state_base(m) { }
+ future_state(nested_exception_marker m, future_state_base&& old) noexcept : future_state_base(m, std::move(old)) { }
+ future_state(nested_exception_marker m, future_state_base&& n, future_state_base&& old) noexcept : future_state_base(m, std::move(n), std::move(old)) { }
+ T&& get_value() && noexcept {
+ assert(_u.st == state::result);
+ return static_cast<T&&>(this->uninitialized_get());
+ }
+ T&& take_value() && noexcept {
+ assert(_u.st == state::result);
+ _u.st = state::result_unavailable;
+ return static_cast<T&&>(this->uninitialized_get());
+ }
+ template<typename U = T>
+ const std::enable_if_t<std::is_copy_constructible<U>::value, U>& get_value() const& noexcept(copy_noexcept) {
+ assert(_u.st == state::result);
+ return this->uninitialized_get();
+ }
+ T&& take() && {
+ assert(available());
+ if (_u.st >= state::exception_min) {
+ std::move(*this).rethrow_exception();
+ }
+ _u.st = state::result_unavailable;
+ return static_cast<T&&>(this->uninitialized_get());
+ }
+ T&& get() && {
+ assert(available());
+ if (_u.st >= state::exception_min) {
+ std::move(*this).rethrow_exception();
+ }
+ return static_cast<T&&>(this->uninitialized_get());
+ }
+ const T& get() const& {
+ assert(available());
+ if (_u.st >= state::exception_min) {
+ rethrow_exception();
+ }
+ return this->uninitialized_get();
+ }
+ using get0_return_type = typename internal::get0_return_type<internal::future_tuple_type_t<T>>::type;
+ static get0_return_type get0(T&& x) {
+ return internal::get0_return_type<T>::get0(std::move(x));
+ }
+
+ get0_return_type get0() {
+#if SEASTAR_API_LEVEL < 5
+ return get0(std::move(*this).get());
+#else
+ return std::move(*this).get();
+#endif
+ }
+};
+
+#if SEASTAR_API_LEVEL < 6
+template <typename... T>
+#else
+template <typename T = void>
+#endif
+class continuation_base : public task {
+protected:
+ using future_state = seastar::future_state<internal::future_stored_type_t<T SEASTAR_ELLIPSIS>>;
+ future_state _state;
+ using future_type = future<T SEASTAR_ELLIPSIS>;
+ using promise_type = promise<T SEASTAR_ELLIPSIS>;
+public:
+ continuation_base() noexcept = default;
+ void set_state(future_state&& state) noexcept {
+ _state = std::move(state);
+ }
+ // This override of waiting_task() is needed here because there are cases
+ // when a backtrace is obtained from the destructor of this class and objects
+ // of derived classes are already destroyed at that time. If we didn't
+ // have this override, we would get a "pure virtual function call" exception.
+ virtual task* waiting_task() noexcept override { return nullptr; }
+ friend class internal::promise_base_with_type<T SEASTAR_ELLIPSIS>;
+ friend class promise<T SEASTAR_ELLIPSIS>;
+ friend class future<T SEASTAR_ELLIPSIS>;
+};
+
+// Given a future type, find the corresponding continuation_base.
+template <typename Future>
+struct continuation_base_from_future;
+
+template <typename... T>
+struct continuation_base_from_future<future<T...>> {
+ using type = continuation_base<T...>;
+};
+
+template <typename Future>
+using continuation_base_from_future_t = typename continuation_base_from_future<Future>::type;
+
+#if SEASTAR_API_LEVEL < 6
+template <typename Promise, typename... T>
+#else
+template <typename Promise, typename T = void>
+#endif
+class continuation_base_with_promise : public continuation_base<T SEASTAR_ELLIPSIS> {
+ friend class internal::promise_base_with_type<T SEASTAR_ELLIPSIS>;
+protected:
+ continuation_base_with_promise(Promise&& pr) noexcept : _pr(std::move(pr)) {
+ task::make_backtrace();
+ }
+ virtual task* waiting_task() noexcept override;
+ Promise _pr;
+};
+
+#if SEASTAR_API_LEVEL < 6
+template <typename Promise, typename Func, typename Wrapper, typename... T>
+#else
+template <typename Promise, typename Func, typename Wrapper, typename T = void>
+#endif
+struct continuation final : continuation_base_with_promise<Promise, T SEASTAR_ELLIPSIS> {
+ // Func is the original function passed to then/then_wrapped. The
+ // Wrapper is a helper function that implements the specific logic
+ // needed by then/then_wrapped. We call the wrapper passing it the
+ // original function, promise and state.
+ // Note that if Func's move constructor throws, this will call
+ // std::unexpected. We could try to require Func to be nothrow
+ // move constructible, but that will cause a lot of churn. Since
+ // we can't support a failure to create a continuation, calling
+ // std::unexpected as close to the failure as possible is the best
+ // we can do.
+ continuation(Promise&& pr, Func&& func, Wrapper&& wrapper) noexcept
+ : continuation_base_with_promise<Promise, T SEASTAR_ELLIPSIS>(std::move(pr))
+ , _func(std::move(func))
+ , _wrapper(std::move(wrapper)) {}
+ virtual void run_and_dispose() noexcept override {
+ try {
+ _wrapper(std::move(this->_pr), _func, std::move(this->_state));
+ } catch (...) {
+ this->_pr.set_to_current_exception();
+ }
+ delete this;
+ }
+ Func _func;
+ [[no_unique_address]] Wrapper _wrapper;
+};
+
+#if SEASTAR_API_LEVEL < 4
+
+// This is an internal future<> payload for seastar::when_all_succeed(). It is used
+// to return a variadic future (when two or more of its input futures were non-void),
+// but with variadic futures deprecated and soon gone this is no longer possible.
+//
+// Instead, we use this tuple type, and future::then() knows to unpack it.
+//
+// The whole thing is temporary for a transition period.
+template <typename... T>
+struct when_all_succeed_tuple : std::tuple<T...> {
+ using std::tuple<T...>::tuple;
+ when_all_succeed_tuple(std::tuple<T...>&& t)
+ noexcept(std::is_nothrow_move_constructible<std::tuple<T...>>::value)
+ : std::tuple<T...>(std::move(t)) {}
+};
+
+#endif
+
+namespace internal {
+
+template <typename... T>
+future<T...> make_exception_future(future_state_base&& state) noexcept;
+
+template <typename... T, typename U>
+void set_callback(future<T...>& fut, U* callback) noexcept;
+
+class future_base;
+
+class promise_base {
+protected:
+ enum class urgent { no, yes };
+ future_base* _future = nullptr;
+
+ // This points to the future_state that is currently being
+ // used. See comment above the future_state struct definition for
+ // details.
+ future_state_base* _state;
+
+ task* _task = nullptr;
+
+ promise_base(const promise_base&) = delete;
+ promise_base(future_state_base* state) noexcept : _state(state) {}
+ promise_base(future_base* future, future_state_base* state) noexcept;
+ void move_it(promise_base&& x) noexcept;
+ promise_base(promise_base&& x) noexcept;
+
+ void clear() noexcept;
+
+ // We never need to destruct this polymorphically, so we can make it
+ // protected instead of virtual
+ ~promise_base() noexcept {
+ clear();
+ }
+
+ void operator=(const promise_base&) = delete;
+ promise_base& operator=(promise_base&& x) noexcept;
+
+ template<urgent Urgent>
+ void make_ready() noexcept;
+
+ template<typename T>
+ void set_exception_impl(T&& val) noexcept {
+ if (_state) {
+ _state->set_exception(std::move(val));
+ make_ready<urgent::no>();
+ } else {
+ // We get here if promise::get_future is called and the
+ // returned future is destroyed without creating a
+ // continuation.
+ // In older versions of seastar we would store a local
+ // copy of ex and warn in the promise destructor.
+ // Since there isn't any way for the user to clear
+ // the exception, we issue the warning from here.
+ report_failed_future(val);
+ }
+ }
+
+ void set_exception(future_state_base&& state) noexcept {
+ set_exception_impl(std::move(state));
+ }
+
+ void set_exception(std::exception_ptr&& ex) noexcept {
+ set_exception_impl(std::move(ex));
+ }
+
+ void set_exception(const std::exception_ptr& ex) noexcept {
+ set_exception(std::exception_ptr(ex));
+ }
+
+ template<typename Exception>
+ std::enable_if_t<!std::is_same<std::remove_reference_t<Exception>, std::exception_ptr>::value, void> set_exception(Exception&& e) noexcept {
+ set_exception(make_exception_ptr(std::forward<Exception>(e)));
+ }
+
+ friend class future_base;
+ template <typename SEASTAR_ELLIPSIS U> friend class seastar::future;
+
+public:
+ /// Set this promise to the current exception.
+ ///
+ /// This is equivalent to set_exception(std::current_exception()),
+ /// but expands to less code.
+ void set_to_current_exception() noexcept;
+
+ /// Returns the task which is waiting for this promise to resolve, or nullptr.
+ task* waiting_task() const noexcept { return _task; }
+};
+
+/// \brief A promise with type but no local data.
+///
+/// This is a promise without any local data. We use this when the
+/// future is created first, so we know the promise always has an
+/// external place to point to. We cannot just use promise_base
+/// because we need to know the type that is being stored.
+template <typename SEASTAR_ELLIPSIS T>
+class promise_base_with_type : protected internal::promise_base {
+protected:
+ using future_state = seastar::future_state<future_stored_type_t<T SEASTAR_ELLIPSIS>>;
+ future_state* get_state() noexcept {
+ return static_cast<future_state*>(_state);
+ }
+ static constexpr bool copy_noexcept = future_state::copy_noexcept;
+public:
+ promise_base_with_type(future_state_base* state) noexcept : promise_base(state) { }
+ promise_base_with_type(future<T SEASTAR_ELLIPSIS>* future) noexcept : promise_base(future, &future->_state) { }
+ promise_base_with_type(promise_base_with_type&& x) noexcept = default;
+ promise_base_with_type(const promise_base_with_type&) = delete;
+ promise_base_with_type& operator=(promise_base_with_type&& x) noexcept = default;
+ void operator=(const promise_base_with_type&) = delete;
+
+ void set_urgent_state(future_state&& state) noexcept {
+ auto* ptr = get_state();
+ // The state can be null if the corresponding future has been
+ // destroyed without producing a continuation.
+ if (ptr) {
+ // FIXME: This is a fairly expensive assert. It would be a
+ // good candidate for being disabled in release builds if
+ // we had such an assert.
+ assert(ptr->_u.st == future_state_base::state::future);
+ new (ptr) future_state(std::move(state));
+ make_ready<urgent::yes>();
+ }
+ }
+
+ template <typename... A>
+ void set_value(A&&... a) noexcept {
+ if (auto *s = get_state()) {
+ s->set(std::forward<A>(a)...);
+ make_ready<urgent::no>();
+ }
+ }
+
+ /// Set this promise to the current exception.
+ ///
+ /// This is equivalent to set_exception(std::current_exception()),
+ /// but expands to less code.
+ void set_to_current_exception() noexcept {
+ internal::promise_base::set_to_current_exception();
+ }
+
+ /// Returns the task which is waiting for this promise to resolve, or nullptr.
+ using internal::promise_base::waiting_task;
+
+private:
+
+ template <typename SEASTAR_ELLIPSIS U>
+ friend class seastar::future;
+
+ friend future_state;
+};
+}
+/// \endcond
+
+/// \brief promise - allows a future value to be made available at a later time.
+///
+/// \tparam T A list of types to be carried as the result of the associated future.
+/// A list with two or more types is deprecated; use
+/// \c promise<std::tuple<T...>> instead.
+template <typename SEASTAR_ELLIPSIS T>
+class promise : private internal::promise_base_with_type<T SEASTAR_ELLIPSIS> {
+ using future_state = typename internal::promise_base_with_type<T SEASTAR_ELLIPSIS>::future_state;
+ future_state _local_state;
+
+public:
+ /// \brief Constructs an empty \c promise.
+ ///
+ /// Creates promise with no associated future yet (see get_future()).
+ promise() noexcept : internal::promise_base_with_type<T SEASTAR_ELLIPSIS>(&_local_state) {}
+
+ /// \brief Moves a \c promise object.
+ void move_it(promise&& x) noexcept;
+ promise(promise&& x) noexcept : internal::promise_base_with_type<T SEASTAR_ELLIPSIS>(std::move(x)) {
+ move_it(std::move(x));
+ }
+ promise(const promise&) = delete;
+ promise& operator=(promise&& x) noexcept {
+ internal::promise_base_with_type<T SEASTAR_ELLIPSIS>::operator=(std::move(x));
+ // If this is a self-move, _state is now nullptr and it is
+ // safe to call move_it.
+ move_it(std::move(x));
+ return *this;
+ }
+ void operator=(const promise&) = delete;
+
+ /// Set this promise to the current exception.
+ ///
+ /// This is equivalent to set_exception(std::current_exception()),
+ /// but expands to less code.
+ void set_to_current_exception() noexcept {
+ internal::promise_base::set_to_current_exception();
+ }
+
+ /// Returns the task which is waiting for this promise to resolve, or nullptr.
+ using internal::promise_base::waiting_task;
+
+ /// \brief Gets the promise's associated future.
+ ///
+ /// The future and promise will remember each other, even if either or
+ /// both are moved. When \c set_value() or \c set_exception() are called
+ /// on the promise, the future will become ready, and if a continuation
+ /// was attached to the future, it will run.
+ future<T SEASTAR_ELLIPSIS> get_future() noexcept;
+
+ /// \brief Sets the promise's value
+ ///
+ /// Forwards the arguments and makes them available to the associated
+ /// future. May be called either before or after \c get_future().
+ ///
+ /// The arguments can have either the types the promise is
+ /// templated with, or a corresponding std::tuple. That is, given
+ /// a promise<int, double>, both calls are valid:
+ ///
+ /// pr.set_value(42, 43.0);
+ /// pr.set_value(std::tuple<int, double>(42, 43.0))
+ template <typename... A>
+ void set_value(A&&... a) noexcept {
+ internal::promise_base_with_type<T SEASTAR_ELLIPSIS>::set_value(std::forward<A>(a)...);
+ }
+
+ /// \brief Marks the promise as failed
+ ///
+ /// Forwards the exception argument to the future and makes it
+ /// available. May be called either before or after \c get_future().
+ void set_exception(std::exception_ptr&& ex) noexcept {
+ internal::promise_base::set_exception(std::move(ex));
+ }
+
+ void set_exception(const std::exception_ptr& ex) noexcept {
+ internal::promise_base::set_exception(ex);
+ }
+
+ /// \brief Marks the promise as failed
+ ///
+ /// Forwards the exception argument to the future and makes it
+ /// available. May be called either before or after \c get_future().
+ template<typename Exception>
+ std::enable_if_t<!std::is_same<std::remove_reference_t<Exception>, std::exception_ptr>::value, void> set_exception(Exception&& e) noexcept {
+ internal::promise_base::set_exception(std::forward<Exception>(e));
+ }
+
+ using internal::promise_base_with_type<T SEASTAR_ELLIPSIS>::set_urgent_state;
+
+ template <typename SEASTAR_ELLIPSIS U>
+ friend class future;
+};
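+
+// Sketch of a promise/future pair: the producer side holds the promise and
+// the consumer side holds the future (compute_async() and the literal value
+// are hypothetical, for illustration only):
+//
+//   future<int> compute_async() {
+//       promise<int> pr;
+//       future<int> fut = pr.get_future();
+//       pr.set_value(42);   // makes fut ready; an attached continuation would run
+//       return fut;
+//   }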
+
+#if SEASTAR_API_LEVEL < 6
+/// \brief Specialization of \c promise<void>
+///
+/// This is an alias for \c promise<>, for generic programming purposes.
+/// For example, You may have a \c promise<T> where \c T can legally be
+/// \c void.
+template<>
+class promise<void> : public promise<> {};
+#endif
+
+/// @}
+
+/// \addtogroup future-util
+/// @{
+
+
+/// \brief Check whether a type is a future
+///
+/// This is a type trait evaluating to \c true if the given type is a
+/// future.
+///
+template <typename... T> struct is_future : std::false_type {};
+
+/// \cond internal
+/// \addtogroup future-util
+template <typename... T> struct is_future<future<T...>> : std::true_type {};
+
+/// \endcond
+
+
+/// \brief Converts a type to a future type, if it isn't already.
+///
+/// \return Result in member type 'type'.
+template <typename T>
+struct futurize;
+
+SEASTAR_CONCEPT(
+
+template <typename T>
+concept Future = is_future<T>::value;
+
+template <typename Func, typename... T>
+concept CanInvoke = std::invocable<Func, T...>;
+
+// Deprecated alias
+template <typename Func, typename... T>
+concept CanApply = CanInvoke<Func, T...>;
+
+template <typename Func, typename... T>
+concept CanApplyTuple
+ = sizeof...(T) == 1
+ && requires (Func func, std::tuple<T...> wrapped_val) {
+ { std::apply(func, std::get<0>(std::move(wrapped_val))) };
+ };
+
+template <typename Func, typename Return, typename... T>
+concept InvokeReturns = requires (Func f, T... args) {
+ { f(std::forward<T>(args)...) } -> std::same_as<Return>;
+};
+
+// Deprecated alias
+template <typename Func, typename Return, typename... T>
+concept ApplyReturns = InvokeReturns<Func, Return, T...>;
+
+template <typename Func, typename... T>
+concept InvokeReturnsAnyFuture = requires (Func f, T... args) {
+ requires is_future<decltype(f(std::forward<T>(args)...))>::value;
+};
+
+// Deprecated alias
+template <typename Func, typename... T>
+concept ApplyReturnsAnyFuture = InvokeReturnsAnyFuture<Func, T...>;
+
+)
+
+/// \endcond
+
+// Converts a type to a future type, if it isn't already.
+template <typename T>
+using futurize_t = typename futurize<T>::type;
+
+/// @}
+
+template<typename Func, typename... Args>
+auto futurize_invoke(Func&& func, Args&&... args) noexcept;
+
+template<typename Func, typename... Args>
+auto futurize_apply(Func&& func, std::tuple<Args...>&& args) noexcept;
+
+/// \addtogroup future-module
+/// @{
+namespace internal {
+class future_base {
+protected:
+ promise_base* _promise;
+ future_base() noexcept : _promise(nullptr) {}
+ future_base(promise_base* promise, future_state_base* state) noexcept : _promise(promise) {
+ _promise->_future = this;
+ _promise->_state = state;
+ }
+
+ void move_it(future_base&& x, future_state_base* state) noexcept {
+ _promise = x._promise;
+ if (auto* p = _promise) {
+ x.detach_promise();
+ p->_future = this;
+ p->_state = state;
+ }
+ }
+
+ future_base(future_base&& x, future_state_base* state) noexcept {
+ move_it(std::move(x), state);
+ }
+
+ void clear() noexcept {
+ if (_promise) {
+ detach_promise();
+ }
+ }
+
+ ~future_base() noexcept {
+ clear();
+ }
+
+ promise_base* detach_promise() noexcept {
+ _promise->_state = nullptr;
+ _promise->_future = nullptr;
+ return std::exchange(_promise, nullptr);
+ }
+
+ void schedule(task* tws, future_state_base* state) noexcept {
+ promise_base* p = detach_promise();
+ p->_state = state;
+ p->_task = tws;
+ }
+
+ void do_wait() noexcept;
+
+#ifdef SEASTAR_COROUTINES_ENABLED
+ void set_coroutine(task& coroutine) noexcept;
+#endif
+
+ friend class promise_base;
+};
+
+template <typename Func, typename... T>
+struct future_result {
+ using type = std::invoke_result_t<Func, T...>;
+ using future_type = futurize_t<type>;
+ using func_type = future_type (T&&...);
+};
+
+template <typename Func>
+struct future_result<Func, void> {
+ using type = std::invoke_result_t<Func>;
+ using future_type = futurize_t<type>;
+ using func_type = future_type ();
+};
+
+template <typename Func, typename SEASTAR_ELLIPSIS T>
+using future_result_t = typename future_result<Func, T SEASTAR_ELLIPSIS>::type;
+
+template <typename Func, typename T>
+auto future_invoke(Func&& func, T&& v) {
+ if constexpr (std::is_same_v<T, monostate>) {
+ return std::invoke(std::forward<Func>(func));
+ } else {
+ return std::invoke(std::forward<Func>(func), std::forward<T>(v));
+ }
+}
+
+// This is a customization point for future::then()'s implementation.
+// It behaves differently when the future value type is a when_all_succeed_tuple
+// instantiation, indicating we need to unpack the tuple into multiple lambda
+// arguments.
+template <typename Future>
+struct call_then_impl;
+
+// Generic case - the input is not a future<when_all_succeed_tuple<...>>, so
+// we just forward everything to future::then_impl.
+template <typename... T>
+struct call_then_impl<future<T...>> {
+ template <typename Func>
+ using result_type = typename future_result<Func, T...>::future_type;
+
+ template <typename Func>
+ using func_type = typename future_result<Func, T...>::func_type;
+
+ template <typename Func>
+ static result_type<Func> run(future<T...>& fut, Func&& func) noexcept {
+ return fut.then_impl(std::forward<Func>(func));
+ }
+};
+
+#if SEASTAR_API_LEVEL < 4
+
+// Special case: we unpack the tuple before calling the function
+template <typename... T>
+struct call_then_impl<future<when_all_succeed_tuple<T...>>> {
+ template <typename Func>
+ using result_type = futurize_t<std::invoke_result_t<Func, T&&...>>;
+
+ template <typename Func>
+ using func_type = result_type<Func> (T&&...);
+
+ using was_tuple = when_all_succeed_tuple<T...>;
+ using std_tuple = std::tuple<T...>;
+
+ template <typename Func>
+ static auto run(future<was_tuple>& fut, Func&& func) noexcept {
+ // constructing func in the lambda can throw, but there's nothing we can do
+ // about it, similar to #84.
+ return fut.then_impl([func = std::forward<Func>(func)] (was_tuple&& t) mutable {
+ return std::apply(func, static_cast<std_tuple&&>(std::move(t)));
+ });
+ }
+};
+
+#endif
+
+template <typename Func, typename... Args>
+using call_then_impl_result_type = typename call_then_impl<future<Args...>>::template result_type<Func>;
+
+SEASTAR_CONCEPT(
+template <typename Func, typename... Args>
+concept CanInvokeWhenAllSucceed = requires {
+ typename call_then_impl_result_type<Func, Args...>;
+};
+)
+
+template <typename Func, typename... T>
+struct result_of_apply {
+ // no "type" member if not a function call signature or not a tuple
+};
+
+template <typename Func, typename... T>
+struct result_of_apply<Func, std::tuple<T...>> : std::invoke_result<Func, T...> {
+ // Let std::invoke_result_t determine the result if the input is a tuple
+};
+
+template <typename Func, typename... T>
+using result_of_apply_t = typename result_of_apply<Func, T...>::type;
+
+}
+
+template <typename Promise, typename SEASTAR_ELLIPSIS T>
+task* continuation_base_with_promise<Promise, T SEASTAR_ELLIPSIS>::waiting_task() noexcept {
+ return _pr.waiting_task();
+}
+
+/// \brief A representation of a possibly not-yet-computed value.
+///
+/// A \c future represents a value that has not yet been computed
+/// (an asynchronous computation). It can be in one of several
+/// states:
+/// - unavailable: the computation has not been completed yet
+/// - value: the computation has been completed successfully and a
+/// value is available.
+/// - failed: the computation completed with an exception.
+///
+/// methods in \c future allow querying the state and, most importantly,
+/// scheduling a \c continuation to be executed when the future becomes
+/// available. Only one such continuation may be scheduled.
+///
+/// A \ref future should not be discarded before it is waited upon and
+/// its result is extracted. Discarding a \ref future means that the
+/// computed value becomes inaccessible, but more importantly, any
+/// exceptions raised from the computation will disappear unchecked as
+/// well. Another very important consequence is potentially unbounded
+/// resource consumption due to the launcher of the deserted
+/// continuation not being able to track the number of in-progress
+/// continuations, nor their individual resource consumption.
+/// To prevent accidental discarding of futures, \ref future is
+/// declared `[[nodiscard]]` if the compiler supports it. Also, when a
+/// discarded \ref future resolves with an error a warning is logged
+/// (at runtime).
+/// That said there can be legitimate cases where a \ref future is
+/// discarded. The most prominent example is launching a new
+/// [fiber](\ref fiber-module), or in other words, moving a continuation
+/// chain to the background (off the current [fiber](\ref fiber-module)).
+/// Even if a \ref future is discarded purposefully, it is still strongly
+/// advisable to wait on it indirectly (via a \ref gate or
+/// \ref semaphore), to control its concurrency and resource consumption,
+/// and to handle any errors raised from it.
+///
+/// \tparam T A list of types to be carried as the result of the future,
+/// similar to \c std::tuple<T...>. An empty list (\c future<>)
+/// means that there is no result, and an available future only
+/// contains a success/failure indication (and in the case of a
+/// failure, an exception).
+/// A list with two or more types is deprecated; use
+/// \c future<std::tuple<T...>> instead.
+template <typename SEASTAR_ELLIPSIS T>
+class SEASTAR_NODISCARD future : private internal::future_base {
+ using future_state = seastar::future_state<internal::future_stored_type_t<T SEASTAR_ELLIPSIS>>;
+ future_state _state;
+ static constexpr bool copy_noexcept = future_state::copy_noexcept;
+ using call_then_impl = internal::call_then_impl<future>;
+
+private:
+ // This constructor creates a future that is not ready but has no
+ // associated promise yet. The use case is to have a less flexible
+ // but more efficient future/promise pair where we know that
+ // promise::set_value cannot possibly be called without a matching
+ // future, and so the promise doesn't need to store a
+ // future_state.
+ future(future_for_get_promise_marker m) noexcept { }
+
+ future(promise<T SEASTAR_ELLIPSIS>* pr) noexcept : future_base(pr, &_state), _state(std::move(pr->_local_state)) { }
+ template <typename... A>
+ future(ready_future_marker m, A&&... a) noexcept : _state(m, std::forward<A>(a)...) { }
+ future(future_state_base::current_exception_future_marker m) noexcept : _state(m) {}
+ future(future_state_base::nested_exception_marker m, future_state_base&& old) noexcept : _state(m, std::move(old)) {}
+ future(future_state_base::nested_exception_marker m, future_state_base&& n, future_state_base&& old) noexcept : _state(m, std::move(n), std::move(old)) {}
+ future(exception_future_marker m, std::exception_ptr&& ex) noexcept : _state(m, std::move(ex)) { }
+ future(exception_future_marker m, future_state_base&& state) noexcept : _state(m, std::move(state)) { }
+ [[gnu::always_inline]]
+ explicit future(future_state&& state) noexcept
+ : _state(std::move(state)) {
+ }
+ internal::promise_base_with_type<T SEASTAR_ELLIPSIS> get_promise() noexcept {
+ assert(!_promise);
+ return internal::promise_base_with_type<T SEASTAR_ELLIPSIS>(this);
+ }
+ internal::promise_base_with_type<T SEASTAR_ELLIPSIS>* detach_promise() noexcept {
+ return static_cast<internal::promise_base_with_type<T SEASTAR_ELLIPSIS>*>(future_base::detach_promise());
+ }
+ void schedule(continuation_base<T SEASTAR_ELLIPSIS>* tws) noexcept {
+ future_base::schedule(tws, &tws->_state);
+ }
+ template <typename Pr, typename Func, typename Wrapper>
+ void schedule(Pr&& pr, Func&& func, Wrapper&& wrapper) noexcept {
+ // If this new throws a std::bad_alloc there is nothing that
+ // can be done about it. The corresponding future is not ready
+ // and we cannot break the chain. Since this function is
+ // noexcept, it will call std::terminate if new throws.
+ memory::scoped_critical_alloc_section _;
+ auto tws = new continuation<Pr, Func, Wrapper, T SEASTAR_ELLIPSIS>(std::move(pr), std::move(func), std::move(wrapper));
+ // In a debug build we schedule ready futures, but not in
+ // other build modes.
+#ifdef SEASTAR_DEBUG
+ if (_state.available()) {
+ tws->set_state(std::move(_state));
+ ::seastar::schedule(tws);
+ return;
+ }
+#endif
+ schedule(tws);
+ _state._u.st = future_state_base::state::invalid;
+ }
+
+ [[gnu::always_inline]]
+ future_state&& get_available_state_ref() noexcept {
+ if (_promise) {
+ detach_promise();
+ }
+ return std::move(_state);
+ }
+
+ future<T SEASTAR_ELLIPSIS> rethrow_with_nested(future_state_base&& n) noexcept {
+ return future<T SEASTAR_ELLIPSIS>(future_state_base::nested_exception_marker(), std::move(n), std::move(_state));
+ }
+
+ future<T SEASTAR_ELLIPSIS> rethrow_with_nested() noexcept {
+ return future<T SEASTAR_ELLIPSIS>(future_state_base::nested_exception_marker(), std::move(_state));
+ }
+
+ template<typename... U>
+ friend class shared_future;
+public:
+ /// \brief The data type carried by the future.
+ using value_type = internal::future_stored_type_t<T SEASTAR_ELLIPSIS>;
+ using tuple_type = internal::future_tuple_type_t<value_type>;
+ /// \brief The data type carried by the future.
+ using promise_type = promise<T SEASTAR_ELLIPSIS>;
+ /// \brief Moves the future into a new object.
+ [[gnu::always_inline]]
+ future(future&& x) noexcept : future_base(std::move(x), &_state), _state(std::move(x._state)) { }
+ future(const future&) = delete;
+ future& operator=(future&& x) noexcept {
+ clear();
+ move_it(std::move(x), &_state);
+ _state = std::move(x._state);
+ return *this;
+ }
+ void operator=(const future&) = delete;
+ /// \brief gets the value returned by the computation
+ ///
+ /// Requires that the future be available. If the value
+ /// was computed successfully, it is returned (as an
+ /// \c std::tuple). Otherwise, an exception is thrown.
+ ///
+ /// If get() is called in a \ref seastar::thread context,
+ /// then it need not be available; instead, the thread will
+ /// be paused until the future becomes available.
+ [[gnu::always_inline]]
+ value_type&& get() {
+ wait();
+ return get_available_state_ref().take();
+ }
+
+ [[gnu::always_inline]]
+ std::exception_ptr get_exception() noexcept {
+ return get_available_state_ref().get_exception();
+ }
+
+ /// Gets the value returned by the computation.
+ ///
+ /// Similar to \ref get(), but instead of returning a
+ /// tuple, returns the first value of the tuple. This is
+ /// useful for the common case of a \c future<T> with exactly
+ /// one type parameter.
+ ///
+ /// Equivalent to: \c std::get<0>(f.get()).
+ using get0_return_type = typename future_state::get0_return_type;
+ get0_return_type get0() {
+#if SEASTAR_API_LEVEL < 5
+ return future_state::get0(get());
+#else
+ return (get0_return_type)get();
+#endif
+ }
+
+ /// Wait for the future to be available (in a seastar::thread)
+ ///
+ /// When called from a seastar::thread, this function blocks the
+ /// thread until the future is available. Other threads and
+ /// continuations continue to execute; only the thread is blocked.
+ void wait() noexcept {
+ if (_state.available()) {
+ return;
+ }
+ do_wait();
+ }
+
+ /// \brief Checks whether the future is available.
+ ///
+ /// \return \c true if the future has a value, or has failed.
+ [[gnu::always_inline]]
+ bool available() const noexcept {
+ return _state.available();
+ }
+
+ /// \brief Checks whether the future has failed.
+ ///
+ /// \return \c true if the future is available and has failed.
+ [[gnu::always_inline]]
+ bool failed() const noexcept {
+ return _state.failed();
+ }
+
+ /// \brief Schedule a block of code to run when the future is ready.
+ ///
+ /// Schedules a function (often a lambda) to run when the future becomes
+ /// available. The function is called with the result of this future's
+ /// computation as parameters. The return value of the function becomes
+ /// the return value of then(), itself as a future; this allows then()
+ /// calls to be chained.
+ ///
+ /// If the future failed, the function is not called, and the exception
+ /// is propagated into the return value of then().
+ ///
+ /// \param func - function to be called when the future becomes available,
+ /// unless it has failed.
+ /// \return a \c future representing the return value of \c func, applied
+ /// to the eventual value of this future.
+ template <typename Func, typename Result = futurize_t<typename call_then_impl::template result_type<Func>>>
+ SEASTAR_CONCEPT( requires std::invocable<Func, T SEASTAR_ELLIPSIS> || internal::CanInvokeWhenAllSucceed<Func, T SEASTAR_ELLIPSIS>)
+ Result
+ then(Func&& func) noexcept {
+ // The implementation of then() is customized via the call_then_impl helper
+ // template, in order to special case the results of when_all_succeed().
+ // when_all_succeed() used to return a variadic future, which is deprecated, so
+ // now it returns a when_all_succeed_tuple, which we intercept in call_then_impl,
+ // and treat it as a variadic future.
+#ifndef SEASTAR_TYPE_ERASE_MORE
+ return call_then_impl::run(*this, std::move(func));
+#else
+ using func_type = typename call_then_impl::template func_type<Func>;
+ noncopyable_function<func_type> ncf;
+ {
+ memory::scoped_critical_alloc_section _;
+ ncf = noncopyable_function<func_type>([func = std::forward<Func>(func)](auto&&... args) mutable {
+ return futurize_invoke(func, std::forward<decltype(args)>(args)...);
+ });
+ }
+ return call_then_impl::run(*this, std::move(ncf));
+#endif
+ }
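+
+ // Sketch of then() chaining (read_value() is a hypothetical function
+ // returning future<int>):
+ //
+ //   read_value().then([] (int v) {
+ //       return v * 2;                  // a plain value is futurized into future<int>
+ //   }).then([] (int doubled) {
+ //       // runs only if the previous steps succeeded; a failure skips the
+ //       // callbacks and propagates the exception instead
+ //   });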
+
+ /// \brief Schedule a block of code to run when the future is ready, unpacking tuples.
+ ///
+ /// Schedules a function (often a lambda) to run when the future becomes
+ /// available. The function is called with the result of this future's
+ /// computation as parameters. The return value of the function becomes
+ /// the return value of then(), itself as a future; this allows then()
+ /// calls to be chained.
+ ///
+ /// This member function is only available if the payload is a std::tuple;
+ /// the tuple elements are passed as individual arguments to `func`, which
+ /// must have the same arity as the tuple.
+ ///
+ /// If the future failed, the function is not called, and the exception
+ /// is propagated into the return value of then().
+ ///
+ /// \param func - function to be called when the future becomes available,
+ /// unless it has failed.
+ /// \return a \c future representing the return value of \c func, applied
+ /// to the eventual value of this future.
+ template <typename Func, typename Result = futurize_t<internal::result_of_apply_t<Func, T SEASTAR_ELLIPSIS>>>
+ SEASTAR_CONCEPT( requires ::seastar::CanApplyTuple<Func, T SEASTAR_ELLIPSIS>)
+ Result
+ then_unpack(Func&& func) noexcept {
+ return then([func = std::forward<Func>(func)] (T&& SEASTAR_ELLIPSIS tuple) mutable {
+ // sizeof...(tuple) is required to be 1
+ return std::apply(func, std::move(tuple) SEASTAR_ELLIPSIS);
+ });
+ }
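+
+ // Sketch of then_unpack(), assuming the future carries a single std::tuple
+ // (for example one produced by when_all_succeed()); load_pair() is a
+ // hypothetical function:
+ //
+ //   future<std::tuple<int, double>> f = load_pair();
+ //   f.then_unpack([] (int i, double d) {
+ //       // the tuple elements arrive as separate arguments
+ //   });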
+
+private:
+
+ // Keep this simple so that Named Return Value Optimization is used.
+ template <typename Func, typename Result>
+ Result then_impl_nrvo(Func&& func) noexcept {
+ using futurator = futurize<internal::future_result_t<Func, T SEASTAR_ELLIPSIS>>;
+ typename futurator::type fut(future_for_get_promise_marker{});
+ using pr_type = decltype(fut.get_promise());
+ schedule(fut.get_promise(), std::move(func), [](pr_type&& pr, Func& func, future_state&& state) {
+ if (state.failed()) {
+ pr.set_exception(static_cast<future_state_base&&>(std::move(state)));
+ } else {
+ futurator::satisfy_with_result_of(std::move(pr), [&func, &state] {
+#if SEASTAR_API_LEVEL < 5
+ return std::apply(func, std::move(state).get_value());
+#else
+ // clang thinks that "state" is not used, below, for future<>.
+ // Make it think it is used to avoid an unused-lambda-capture warning.
+ (void)state;
+ return internal::future_invoke(func, std::move(state).get_value());
+#endif
+ });
+ }
+ });
+ return fut;
+ }
+
+ template <typename Func, typename Result = futurize_t<internal::future_result_t<Func, T SEASTAR_ELLIPSIS>>>
+ Result
+ then_impl(Func&& func) noexcept {
+#ifndef SEASTAR_DEBUG
+ using futurator = futurize<internal::future_result_t<Func, T SEASTAR_ELLIPSIS>>;
+ if (failed()) {
+ return futurator::make_exception_future(static_cast<future_state_base&&>(get_available_state_ref()));
+ } else if (available()) {
+#if SEASTAR_API_LEVEL < 5
+ return futurator::apply(std::forward<Func>(func), get_available_state_ref().take_value());
+#else
+ return futurator::invoke(std::forward<Func>(func), get_available_state_ref().take_value());
+#endif
+ }
+#endif
+ return then_impl_nrvo<Func, Result>(std::forward<Func>(func));
+ }
+
+public:
+ /// \brief Schedule a block of code to run when the future is ready, allowing
+ /// for exception handling.
+ ///
+ /// Schedules a function (often a lambda) to run when the future becomes
+ /// available. The function is called with the this future as a parameter;
+ /// it will be in an available state. The return value of the function becomes
+ /// the return value of then_wrapped(), itself as a future; this allows
+ /// then_wrapped() calls to be chained.
+ ///
+ /// Unlike then(), the function will be called for both value and exceptional
+ /// futures.
+ ///
+ /// \param func - function to be called when the future becomes available,
+ /// \return a \c future representing the return value of \c func, applied
+ /// to the eventual value of this future.
+ template <typename Func, typename FuncResult = std::invoke_result_t<Func, future>>
+ SEASTAR_CONCEPT( requires std::invocable<Func, future> )
+ futurize_t<FuncResult>
+ then_wrapped(Func&& func) & noexcept {
+ return then_wrapped_maybe_erase<false, FuncResult>(std::forward<Func>(func));
+ }
+
+ template <typename Func, typename FuncResult = std::invoke_result_t<Func, future&&>>
+ SEASTAR_CONCEPT( requires std::invocable<Func, future&&> )
+ futurize_t<FuncResult>
+ then_wrapped(Func&& func) && noexcept {
+ return then_wrapped_maybe_erase<true, FuncResult>(std::forward<Func>(func));
+ }
+
+private:
+
+ template <bool AsSelf, typename FuncResult, typename Func>
+ futurize_t<FuncResult>
+ then_wrapped_maybe_erase(Func&& func) noexcept {
+#ifndef SEASTAR_TYPE_ERASE_MORE
+ return then_wrapped_common<AsSelf, FuncResult>(std::forward<Func>(func));
+#else
+ using futurator = futurize<FuncResult>;
+ using WrapFuncResult = typename futurator::type;
+ noncopyable_function<WrapFuncResult (future&&)> ncf;
+ {
+ memory::scoped_critical_alloc_section _;
+ ncf = noncopyable_function<WrapFuncResult(future &&)>([func = std::forward<Func>(func)](future&& f) mutable {
+ return futurator::invoke(func, std::move(f));
+ });
+ }
+ return then_wrapped_common<AsSelf, WrapFuncResult>(std::move(ncf));
+#endif
+ }
+
+ // Keep this simple so that Named Return Value Optimization is used.
+ template <typename FuncResult, typename Func>
+ futurize_t<FuncResult>
+ then_wrapped_nrvo(Func&& func) noexcept {
+ using futurator = futurize<FuncResult>;
+ typename futurator::type fut(future_for_get_promise_marker{});
+ using pr_type = decltype(fut.get_promise());
+ schedule(fut.get_promise(), std::move(func), [](pr_type&& pr, Func& func, future_state&& state) {
+ futurator::satisfy_with_result_of(std::move(pr), [&func, &state] {
+ return func(future(std::move(state)));
+ });
+ });
+ return fut;
+ }
+
+
+ template <bool AsSelf, typename FuncResult, typename Func>
+ futurize_t<FuncResult>
+ then_wrapped_common(Func&& func) noexcept {
+#ifndef SEASTAR_DEBUG
+ using futurator = futurize<FuncResult>;
+ if (available()) {
+ if constexpr (AsSelf) {
+ if (_promise) {
+ detach_promise();
+ }
+ return futurator::invoke(std::forward<Func>(func), std::move(*this));
+ } else {
+ return futurator::invoke(std::forward<Func>(func), future(get_available_state_ref()));
+ }
+ }
+#endif
+ return then_wrapped_nrvo<FuncResult, Func>(std::forward<Func>(func));
+ }
+
+ void forward_to(internal::promise_base_with_type<T SEASTAR_ELLIPSIS>&& pr) noexcept {
+ if (_state.available()) {
+ pr.set_urgent_state(std::move(_state));
+ } else {
+ *detach_promise() = std::move(pr);
+ }
+ }
+
+public:
+ /// \brief Satisfy some \ref promise object with this future as a result.
+ ///
+ /// Arranges so that when this future is resolved, it will be used to
+ /// satisfy an unrelated promise. This is similar to scheduling a
+ /// continuation that moves the result of this future into the promise
+ /// (using promise::set_value() or promise::set_exception()), except
+ /// that it is more efficient.
+ ///
+ /// \param pr a promise that will be fulfilled with the results of this
+ /// future.
+ void forward_to(promise<T SEASTAR_ELLIPSIS>&& pr) noexcept {
+ if (_state.available()) {
+ pr.set_urgent_state(std::move(_state));
+ } else if (&pr._local_state != pr._state) {
+ // The only case when _state points to _local_state is
+ // when get_future was never called. Given that pr will
+ // soon be destroyed, we know get_future will never be
+ // called and we can just ignore this request.
+ *detach_promise() = std::move(pr);
+ }
+ }
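+
+ // Sketch of forward_to(): the result of an already-running computation is
+ // routed into an unrelated promise instead of attaching a continuation
+ // (produce_value() is a hypothetical function returning future<int>):
+ //
+ //   promise<int> pr;
+ //   future<int> consumer = pr.get_future();
+ //   produce_value().forward_to(std::move(pr)); // consumer resolves with the result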
+
+
+
+ /**
+ * Finally continuation for statements that require waiting for the result.
+ * I.e. you need to "finally" call a function that returns a possibly
+ * unavailable future. The returned future will be "waited for"; any
+ * exception generated will be propagated, but its return value is ignored.
+ * I.e. the original return value (the future upon which you are making this
+ * call) will be preserved.
+ *
+ * If the original return value or the callback return value is an
+ * exceptional future it will be propagated.
+ *
+ * If both of them are exceptional - the std::nested_exception exception
+ * with the callback exception on top and the original future exception
+ * nested will be propagated.
+ */
+ template <typename Func>
+ SEASTAR_CONCEPT( requires std::invocable<Func> )
+ future<T SEASTAR_ELLIPSIS> finally(Func&& func) noexcept {
+ return then_wrapped(finally_body<Func, is_future<std::invoke_result_t<Func>>::value>(std::forward<Func>(func)));
+ }
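+
+ // Sketch of finally(): do_io() and release_lock() are hypothetical
+ // functions returning futures. release_lock() is waited for, its value is
+ // ignored, and the original result (or exception) of do_io() is preserved:
+ //
+ //   do_io().finally([] {
+ //       return release_lock();
+ //   });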
+
+
+ template <typename Func, bool FuncReturnsFuture>
+ struct finally_body;
+
+ template <typename Func>
+ struct finally_body<Func, true> {
+ Func _func;
+
+ finally_body(Func&& func) noexcept : _func(std::forward<Func>(func))
+ { }
+
+ future<T SEASTAR_ELLIPSIS> operator()(future<T SEASTAR_ELLIPSIS>&& result) noexcept {
+ return futurize_invoke(_func).then_wrapped([result = std::move(result)](auto&& f_res) mutable {
+ if (!f_res.failed()) {
+ return std::move(result);
+ } else {
+ return result.rethrow_with_nested(std::move(f_res._state));
+ }
+ });
+ }
+ };
+
+ template <typename Func>
+ struct finally_body<Func, false> {
+ Func _func;
+
+ finally_body(Func&& func) noexcept : _func(std::forward<Func>(func))
+ { }
+
+ future<T SEASTAR_ELLIPSIS> operator()(future<T SEASTAR_ELLIPSIS>&& result) noexcept {
+ try {
+ _func();
+ return std::move(result);
+ } catch (...) {
+ return result.rethrow_with_nested();
+ }
+ };
+ };
+
+ /// \brief Terminate the program if this future fails.
+ ///
+ /// Terminates the entire program if this future resolves
+ /// to an exception. Use with caution.
+ future<> or_terminate() noexcept {
+ return then_wrapped([] (auto&& f) {
+ try {
+ f.get();
+ } catch (...) {
+ engine_exit(std::current_exception());
+ }
+ });
+ }
+
+ /// \brief Discards the value carried by this future.
+ ///
+ /// Converts the future into a no-value \c future<>, by
+ /// ignoring any result. Exceptions are propagated unchanged.
+ future<> discard_result() noexcept {
+ // We need the generic variadic lambda, below, because then() behaves differently
+ // when value_type is when_all_succeed_tuple
+ return then([] (auto&&...) {});
+ }
+
+ /// \brief Handle the exception carried by this future.
+ ///
+ /// When the future resolves, if it resolves with an exception,
+ /// handle_exception(func) replaces the exception with the value
+ /// returned by func. The exception is passed (as a std::exception_ptr)
+ /// as a parameter to func; func may return the replacement value
+ /// immediately (T or std::tuple<T...>) or in the future (future<T...>)
+ /// and is even allowed to return (or throw) its own exception.
+ ///
+ /// The idiom fut.discard_result().handle_exception(...) can be used
+ /// to handle an exception (if there is one) without caring about the
+ /// successful value; because handle_exception() is used here on a
+ /// future<>, the handler function does not need to return anything.
+ template <typename Func>
+ /* Broken?
+ SEASTAR_CONCEPT( requires ::seastar::InvokeReturns<Func, future<T...>, std::exception_ptr>
+ || (sizeof...(T) == 0 && ::seastar::InvokeReturns<Func, void, std::exception_ptr>)
+ || (sizeof...(T) == 1 && ::seastar::InvokeReturns<Func, T..., std::exception_ptr>)
+ ) */
+ future<T SEASTAR_ELLIPSIS> handle_exception(Func&& func) noexcept {
+ return then_wrapped([func = std::forward<Func>(func)]
+ (auto&& fut) mutable -> future<T SEASTAR_ELLIPSIS> {
+ if (!fut.failed()) {
+ return make_ready_future<T SEASTAR_ELLIPSIS>(fut.get());
+ } else {
+ return futurize_invoke(func, fut.get_exception());
+ }
+ });
+ }
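+
+ // Sketch of handle_exception(): substituting a fallback value when a
+ // hypothetical read_int() (returning future<int>) fails:
+ //
+ //   read_int().handle_exception([] (std::exception_ptr ep) {
+ //       return -1;   // may also return a future<int>, or rethrow
+ //   });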
+
+ /// \brief Handle the exception of a certain type carried by this future.
+ ///
+ /// When the future resolves, if it resolves with an exception of the type
+ /// that the provided callback receives as a parameter, handle_exception_type(func)
+ /// replaces the exception with the value returned by func. The exception is
+ /// passed (by reference) as a parameter to func; func may return the
+ /// replacement value immediately (T or std::tuple<T...>) or in the future
+ /// (future<T...>) and is even allowed to return (or throw) its own exception.
+ /// If the exception held by the future does not match the type of func's
+ /// parameter, it is propagated as-is.
+ template <typename Func>
+ future<T SEASTAR_ELLIPSIS> handle_exception_type(Func&& func) noexcept {
+ using trait = function_traits<Func>;
+ static_assert(trait::arity == 1, "func can take only one parameter");
+ using ex_type = typename trait::template arg<0>::type;
+ return then_wrapped([func = std::forward<Func>(func)]
+ (auto&& fut) mutable -> future<T SEASTAR_ELLIPSIS> {
+ try {
+ return make_ready_future<T SEASTAR_ELLIPSIS>(fut.get());
+ } catch(ex_type& ex) {
+ return futurize_invoke(func, ex);
+ }
+ });
+ }
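+
+ // Sketch of handle_exception_type(): only std::system_error is intercepted;
+ // any other exception keeps propagating (read_int() is hypothetical and
+ // returns future<int>):
+ //
+ //   read_int().handle_exception_type([] (const std::system_error& e) {
+ //       return -1;
+ //   });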
+
+ /// \brief Ignore any result held by this future
+ ///
+ /// Ignore any result (value or exception) held by this future.
+ /// Use with caution since usually ignoring exceptions is not what
+ /// you want.
+ void ignore_ready_future() noexcept {
+ _state.ignore();
+ }
+
+#ifdef SEASTAR_COROUTINES_ENABLED
+ using future_base::set_coroutine;
+#endif
+private:
+ void set_callback(continuation_base<T SEASTAR_ELLIPSIS>* callback) noexcept {
+ if (_state.available()) {
+ callback->set_state(get_available_state_ref());
+ ::seastar::schedule(callback);
+ } else {
+ assert(_promise);
+ schedule(callback);
+ }
+
+ }
+
+ /// \cond internal
+ template <typename SEASTAR_ELLIPSIS U>
+ friend class future;
+ template <typename SEASTAR_ELLIPSIS U>
+ friend class promise;
+ template <typename U>
+ friend struct futurize;
+ template <typename SEASTAR_ELLIPSIS U>
+ friend class internal::promise_base_with_type;
+ template <typename... U, typename... A>
+ friend future<U...> make_ready_future(A&&... value) noexcept;
+ template <typename... U>
+ friend future<U...> make_exception_future(std::exception_ptr&& ex) noexcept;
+ template <typename... U, typename Exception>
+ friend future<U...> make_exception_future(Exception&& ex) noexcept;
+ template <typename... U>
+ friend future<U...> internal::make_exception_future(future_state_base&& state) noexcept;
+ template <typename... U>
+ friend future<U...> current_exception_as_future() noexcept;
+ template <typename... U, typename V>
+ friend void internal::set_callback(future<U...>&, V*) noexcept;
+ template <typename Future>
+ friend struct internal::call_then_impl;
+ /// \endcond
+};
+
+
+namespace internal {
+template <typename T>
+struct futurize_base {
+ /// If \c T is a future, \c T; otherwise \c future<T>
+ using type = future<T>;
+ /// The promise type associated with \c type.
+ using promise_type = promise<T>;
+ using promise_base_with_type = internal::promise_base_with_type<T>;
+
+ /// Convert a value or a future to a future
+ static inline type convert(T&& value) { return make_ready_future<T>(std::move(value)); }
+ static inline type convert(type&& value) { return std::move(value); }
+
+ /// Makes an exceptional future of type \ref type.
+ template <typename Arg>
+ static inline type make_exception_future(Arg&& arg) noexcept;
+};
+
+template <>
+struct futurize_base<void> {
+ using type = future<>;
+ using promise_type = promise<>;
+ using promise_base_with_type = internal::promise_base_with_type<>;
+
+ static inline type convert(type&& value) {
+ return std::move(value);
+ }
+ template <typename Arg>
+ static inline type make_exception_future(Arg&& arg) noexcept;
+};
+
+template <typename T>
+struct futurize_base<future<T>> : public futurize_base<T> {};
+
+template <>
+struct futurize_base<future<>> : public futurize_base<void> {};
+}
+
+template <typename T>
+struct futurize : public internal::futurize_base<T> {
+ using base = internal::futurize_base<T>;
+ using type = typename base::type;
+ using promise_type = typename base::promise_type;
+ using promise_base_with_type = typename base::promise_base_with_type;
+ /// The value tuple type associated with \c type
+ using value_type = typename type::value_type;
+ using tuple_type = typename type::tuple_type;
+ using base::convert;
+ using base::make_exception_future;
+
+ /// Apply a function to an argument list (expressed as a tuple)
+ /// and return the result, as a future (if it wasn't already).
+ template<typename Func, typename... FuncArgs>
+ static inline type apply(Func&& func, std::tuple<FuncArgs...>&& args) noexcept;
+
+ /// Invoke a function to an argument list
+ /// and return the result, as a future (if it wasn't already).
+ template<typename Func, typename... FuncArgs>
+ static inline type invoke(Func&& func, FuncArgs&&... args) noexcept;
+
+ template<typename Func>
+ static inline type invoke(Func&& func, internal::monostate) noexcept {
+ return invoke(std::forward<Func>(func));
+ }
+
+ /// Deprecated alias of invoke
+ template<typename Func, typename... FuncArgs>
+ [[deprecated("Use invoke for varargs")]]
+ static inline type apply(Func&& func, FuncArgs&&... args) noexcept {
+ return invoke(std::forward<Func>(func), std::forward<FuncArgs>(args)...);
+ }
+
+ static type current_exception_as_future() noexcept {
+ return type(future_state_base::current_exception_future_marker());
+ }
+
+ /// Convert the tuple representation into a future
+ static type from_tuple(tuple_type&& value) {
+ return type(ready_future_marker(), std::move(value));
+ }
+ /// Convert the tuple representation into a future
+ static type from_tuple(const tuple_type& value) {
+ return type(ready_future_marker(), value);
+ }
+
+#if SEASTAR_API_LEVEL >= 5
+ /// Convert the tuple representation into a future
+ static type from_tuple(value_type&& value) {
+ return type(ready_future_marker(), std::move(value));
+ }
+ /// Convert the tuple representation into a future
+ static type from_tuple(const value_type& value) {
+ return type(ready_future_marker(), value);
+ }
+#endif
+private:
+ /// Forwards the result of, or exception thrown by, func() to the
+ /// promise. This avoids creating a future if func() doesn't
+ /// return one.
+ template<typename Func>
+ SEASTAR_CONCEPT( requires std::invocable<Func> )
+ static void satisfy_with_result_of(promise_base_with_type&&, Func&& func);
+
+ template <typename SEASTAR_ELLIPSIS U>
+ friend class future;
+};
+
+inline internal::promise_base::promise_base(future_base* future, future_state_base* state) noexcept
+ : _future(future), _state(state) {
+ _future->_promise = this;
+}
+
+template <typename SEASTAR_ELLIPSIS T>
+inline
+future<T SEASTAR_ELLIPSIS>
+promise<T SEASTAR_ELLIPSIS>::get_future() noexcept {
+ assert(!this->_future && this->_state && !this->_task);
+ return future<T SEASTAR_ELLIPSIS>(this);
+}
+
+template <typename SEASTAR_ELLIPSIS T>
+inline
+void promise<T SEASTAR_ELLIPSIS>::move_it(promise&& x) noexcept {
+ if (this->_state == &x._local_state) {
+ this->_state = &_local_state;
+ new (&_local_state) future_state(std::move(x._local_state));
+ }
+}
+
+template <typename... T, typename... A>
+inline
+future<T...> make_ready_future(A&&... value) noexcept {
+ return future<T...>(ready_future_marker(), std::forward<A>(value)...);
+}
+
+template <typename... T>
+inline
+future<T...> make_exception_future(std::exception_ptr&& ex) noexcept {
+ return future<T...>(exception_future_marker(), std::move(ex));
+}
+
+template <typename... T>
+inline
+future<T...> internal::make_exception_future(future_state_base&& state) noexcept {
+ return future<T...>(exception_future_marker(), std::move(state));
+}
+
+template <typename... T>
+future<T...> current_exception_as_future() noexcept {
+ return future<T...>(future_state_base::current_exception_future_marker());
+}
+
+void log_exception_trace() noexcept;
+
+/// \brief Creates a \ref future in an available, failed state.
+///
+/// Creates a \ref future object that is already resolved in a failed
+/// state. This is useful when no I/O needs to be performed in order
+/// to carry out a computation
+/// (for example, because the connection is closed and we cannot read
+/// from it).
+template <typename... T, typename Exception>
+inline
+future<T...> make_exception_future(Exception&& ex) noexcept {
+ log_exception_trace();
+ return make_exception_future<T...>(std::make_exception_ptr(std::forward<Exception>(ex)));
+}
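+
+// Sketch of returning an already-failed future without throwing
+// (parse_header() and its argument are hypothetical):
+//
+//   future<int> parse_header(bool connection_closed) {
+//       if (connection_closed) {
+//           return make_exception_future<int>(std::runtime_error("connection closed"));
+//       }
+//       return make_ready_future<int>(0);
+//   }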
+
+template <typename... T, typename Exception>
+future<T...> make_exception_future_with_backtrace(Exception&& ex) noexcept {
+ return make_exception_future<T...>(make_backtraced_exception_ptr<Exception>(std::forward<Exception>(ex)));
+}
+
+/// @}
+
+/// \cond internal
+
+template<typename T>
+template<typename Func, typename... FuncArgs>
+typename futurize<T>::type futurize<T>::apply(Func&& func, std::tuple<FuncArgs...>&& args) noexcept {
+ try {
+ using ret_t = decltype(std::apply(std::forward<Func>(func), std::move(args)));
+ if constexpr (std::is_void_v<ret_t>) {
+ std::apply(std::forward<Func>(func), std::move(args));
+ return make_ready_future<>();
+ } else if constexpr (is_future<ret_t>::value){
+ return std::apply(std::forward<Func>(func), std::move(args));
+ } else {
+ return convert(std::apply(std::forward<Func>(func), std::move(args)));
+ }
+ } catch (...) {
+ return current_exception_as_future();
+ }
+}
+
+template<typename T>
+template<typename Func>
+SEASTAR_CONCEPT( requires std::invocable<Func> )
+void futurize<T>::satisfy_with_result_of(promise_base_with_type&& pr, Func&& func) {
+ using ret_t = decltype(func());
+ if constexpr (std::is_void_v<ret_t>) {
+ func();
+ pr.set_value();
+ } else if constexpr (is_future<ret_t>::value) {
+ func().forward_to(std::move(pr));
+ } else {
+ pr.set_value(func());
+ }
+}
+
+template<typename T>
+template<typename Func, typename... FuncArgs>
+typename futurize<T>::type futurize<T>::invoke(Func&& func, FuncArgs&&... args) noexcept {
+ try {
+ using ret_t = decltype(func(std::forward<FuncArgs>(args)...));
+ if constexpr (std::is_void_v<ret_t>) {
+ func(std::forward<FuncArgs>(args)...);
+ return make_ready_future<>();
+ } else if constexpr (is_future<ret_t>::value) {
+ return func(std::forward<FuncArgs>(args)...);
+ } else {
+ return convert(func(std::forward<FuncArgs>(args)...));
+ }
+ } catch (...) {
+ return current_exception_as_future();
+ }
+}
+
+template <typename T>
+template <typename Arg>
+inline
+future<T>
+internal::futurize_base<T>::make_exception_future(Arg&& arg) noexcept {
+ using ::seastar::make_exception_future;
+ using ::seastar::internal::make_exception_future;
+ return make_exception_future<T>(std::forward<Arg>(arg));
+}
+
+template <typename Arg>
+inline
+future<>
+internal::futurize_base<void>::make_exception_future(Arg&& arg) noexcept {
+ using ::seastar::make_exception_future;
+ using ::seastar::internal::make_exception_future;
+ return make_exception_future<>(std::forward<Arg>(arg));
+}
+
+template<typename Func, typename... Args>
+auto futurize_invoke(Func&& func, Args&&... args) noexcept {
+ using futurator = futurize<std::invoke_result_t<Func, Args&&...>>;
+ return futurator::invoke(std::forward<Func>(func), std::forward<Args>(args)...);
+}
+
+template<typename Func, typename... Args>
+[[deprecated("Use futurize_invoke for varargs")]]
+auto futurize_apply(Func&& func, Args&&... args) noexcept {
+ return futurize_invoke(std::forward<Func>(func), std::forward<Args>(args)...);
+}
+
+template<typename Func, typename... Args>
+auto futurize_apply(Func&& func, std::tuple<Args...>&& args) noexcept {
+ using futurator = futurize<std::invoke_result_t<Func, Args&&...>>;
+ return futurator::apply(std::forward<Func>(func), std::move(args));
+}
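+
+// Illustrative usage sketch (not part of this header): futurize_invoke() wraps a
+// plain callable so that a thrown exception becomes a failed future and a plain
+// return value becomes a ready future. The lambda below is hypothetical.
+//
+//     future<int> f = futurize_invoke([] (int x) {
+//         if (x < 0) {
+//             throw std::invalid_argument("negative"); // delivered as an exceptional future
+//         }
+//         return x * 2;                                // delivered as a ready future<int>
+//     }, 21);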
+
+namespace internal {
+
+template <typename... T, typename U>
+inline
+void set_callback(future<T...>& fut, U* callback) noexcept {
+ // It would be better to use continuation_base<T...> for U, but
+ // then a derived class of continuation_base<T...> won't be matched
+ return fut.set_callback(callback);
+}
+
+}
+
+
+/// \endcond
+
+}
diff --git a/src/seastar/include/seastar/core/gate.hh b/src/seastar/include/seastar/core/gate.hh
new file mode 100644
index 000000000..5d440c07f
--- /dev/null
+++ b/src/seastar/include/seastar/core/gate.hh
@@ -0,0 +1,170 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2014 Cloudius Systems
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/util/std-compat.hh>
+#include <exception>
+
+namespace seastar {
+
+/// \addtogroup fiber-module
+/// @{
+
+/// Exception thrown when a \ref gate object has been closed
+/// by the \ref gate::close() method.
+class gate_closed_exception : public std::exception {
+public:
+ virtual const char* what() const noexcept override {
+ return "gate closed";
+ }
+};
+
+/// Facility to stop new requests, and to tell when existing requests are done.
+///
+/// When stopping a service that serves asynchronous requests, we are faced with
+/// two problems: preventing new requests from coming in, and knowing when existing
+/// requests have completed. The \c gate class provides a solution.
+class gate {
+ size_t _count = 0;
+ std::optional<promise<>> _stopped;
+public:
+ /// Tries to register an in-progress request.
+ ///
+ /// If the gate is not closed, the request is registered and the function returns `true`.
+ /// Otherwise, the function returns `false` and has no other effect.
+ bool try_enter() noexcept {
+ bool opened = !_stopped;
+ if (opened) {
+ ++_count;
+ }
+ return opened;
+ }
+ /// Registers an in-progress request.
+ ///
+ /// If the gate is not closed, the request is registered. Otherwise,
+ /// a \ref gate_closed_exception is thrown.
+ void enter() {
+ if (!try_enter()) {
+ throw gate_closed_exception();
+ }
+ }
+ /// Unregisters an in-progress request.
+ ///
+ /// If the gate is closed, and there are no more in-progress requests,
+ /// the `_stopped` promise will be fulfilled.
+ void leave() noexcept {
+ --_count;
+ if (!_count && _stopped) {
+ _stopped->set_value();
+ }
+ }
+ /// Potentially stop an in-progress request.
+ ///
+ /// If the gate is already closed, a \ref gate_closed_exception is thrown.
+ /// By using \ref enter() and \ref leave(), the program can ensure that
+ /// no further requests are serviced. However, long-running requests may
+ /// continue to run. The check() method allows such a long operation to
+ /// voluntarily stop itself after the gate is closed, by making calls to
+ /// check() in appropriate places. check() will throw an exception and
+ /// bail out of the long-running code if the gate is closed.
+ void check() {
+ if (_stopped) {
+ throw gate_closed_exception();
+ }
+ }
+ /// Closes the gate.
+ ///
+ /// Future calls to \ref enter() will fail with an exception, and when
+ /// all current requests call \ref leave(), the returned future will be
+ /// made ready.
+ future<> close() noexcept {
+ assert(!_stopped && "seastar::gate::close() cannot be called more than once");
+ _stopped = std::make_optional(promise<>());
+ if (!_count) {
+ _stopped->set_value();
+ }
+ return _stopped->get_future();
+ }
+
+ /// Returns a current number of registered in-progress requests.
+ size_t get_count() const noexcept {
+ return _count;
+ }
+
+ /// Returns whether the gate is closed.
+ bool is_closed() const noexcept {
+ return bool(_stopped);
+ }
+};
+
+namespace internal {
+
+template <typename Func>
+inline
+auto
+invoke_func_with_gate(gate& g, Func&& func) noexcept {
+ return futurize_invoke(std::forward<Func>(func)).finally([&g] { g.leave(); });
+}
+
+} // namespace internal
+
+/// Executes the function \c func making sure the gate \c g is properly entered
+/// and later on, properly left.
+///
+/// \param func function to be executed
+/// \param g the gate. Caller must make sure that it outlives this function.
+/// \returns whatever \c func returns
+///
+/// \relates gate
+template <typename Func>
+inline
+auto
+with_gate(gate& g, Func&& func) {
+ g.enter();
+ return internal::invoke_func_with_gate(g, std::forward<Func>(func));
+}
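+
+// Illustrative usage sketch (not part of this header): a service keeps a gate,
+// runs each request under with_gate(), and waits for the gate when stopping.
+// my_service, _gate and handle_request() are hypothetical names.
+//
+//     future<> my_service::serve_one() {
+//         return with_gate(_gate, [this] {
+//             return handle_request();  // not invoked if the gate is already closed
+//         });
+//     }
+//
+//     future<> my_service::stop() {
+//         return _gate.close();         // resolves once every entered request has left
+//     }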
+
+/// Executes the function \c func if the gate \c g can be entered
+/// and later on, properly left.
+///
+/// \param func function to be executed
+/// \param g the gate. Caller must make sure that it outlives this function.
+///
+/// If the gate is already closed, an exceptional future holding
+/// \ref gate_closed_exception is returned; otherwise
+/// \returns whatever \c func returns.
+///
+/// \relates gate
+template <typename Func>
+inline
+auto
+try_with_gate(gate& g, Func&& func) noexcept {
+ if (!g.try_enter()) {
+ using futurator = futurize<std::result_of_t<Func()>>;
+ return futurator::make_exception_future(gate_closed_exception());
+ }
+ return internal::invoke_func_with_gate(g, std::forward<Func>(func));
+}
+/// @}
+
+}
diff --git a/src/seastar/include/seastar/core/idle_cpu_handler.hh b/src/seastar/include/seastar/core/idle_cpu_handler.hh
new file mode 100644
index 000000000..793fc422a
--- /dev/null
+++ b/src/seastar/include/seastar/core/idle_cpu_handler.hh
@@ -0,0 +1,59 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2020 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/util/noncopyable_function.hh>
+
+/// \file
+
+namespace seastar {
+
+/// Indicates the outcome of a user callback installed to take advantage of
+/// idle CPU cycles.
+enum class idle_cpu_handler_result {
+ no_more_work, //!< The user callback has no more work to perform
+ interrupted_by_higher_priority_task //!< A call to the work_waiting_on_reactor parameter to idle_cpu_handler returned `true`
+};
+
+/// Signature of a callback provided by the reactor to a user callback installed to take
+/// advantage of idle cpu cycles, used to periodically check if the CPU is still idle.
+///
+/// \return true if the reactor has new work to do
+using work_waiting_on_reactor = const noncopyable_function<bool()>&;
+
+/// Signature of a callback provided by the user, that the reactor calls when it has idle cycles.
+///
+/// The `poll` parameter is a work_waiting_on_reactor function that should be periodically called
+/// to check if the idle callback should return with idle_cpu_handler_result::interrupted_by_higher_priority_task
+using idle_cpu_handler = noncopyable_function<idle_cpu_handler_result(work_waiting_on_reactor poll)>;
+
+/// Set a handler that will be called when there is no task to execute on the CPU.
+/// The handler should perform low-priority work.
+///
+/// The handler's return value determines whether it did any actual work. If no work was done,
+/// the reactor will go to sleep.
+///
+/// The handler's argument is a function that returns true if a task that should be executed on
+/// the CPU appears, and false otherwise. The handler should use this function to return early
+/// when such a task appears.
+void set_idle_cpu_handler(idle_cpu_handler&& handler);
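+
+// Illustrative usage sketch (not part of this header): install a handler that
+// performs small units of background work until the reactor has real work again.
+// do_one_background_step() is a hypothetical helper returning false when it runs
+// out of work.
+//
+//     set_idle_cpu_handler([] (work_waiting_on_reactor poll) {
+//         while (!poll()) {
+//             if (!do_one_background_step()) {
+//                 return idle_cpu_handler_result::no_more_work;
+//             }
+//         }
+//         return idle_cpu_handler_result::interrupted_by_higher_priority_task;
+//     });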
+
+}
diff --git a/src/seastar/include/seastar/core/internal/api-level.hh b/src/seastar/include/seastar/core/internal/api-level.hh
new file mode 100644
index 000000000..c18e03f82
--- /dev/null
+++ b/src/seastar/include/seastar/core/internal/api-level.hh
@@ -0,0 +1,82 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+#pragma once
+
+// For IDEs that don't see SEASTAR_API_LEVEL, generate a nice default
+#ifndef SEASTAR_API_LEVEL
+#define SEASTAR_API_LEVEL 3
+#endif
+
+#if SEASTAR_API_LEVEL == 6
+#define SEASTAR_INCLUDE_API_V6 inline
+#else
+#define SEASTAR_INCLUDE_API_V6
+#endif
+
+#if SEASTAR_API_LEVEL == 5
+#define SEASTAR_INCLUDE_API_V5 inline
+#else
+#define SEASTAR_INCLUDE_API_V5
+#endif
+
+#if SEASTAR_API_LEVEL == 4
+#define SEASTAR_INCLUDE_API_V4 inline
+#else
+#define SEASTAR_INCLUDE_API_V4
+#endif
+
+#if SEASTAR_API_LEVEL == 3
+#define SEASTAR_INCLUDE_API_V3 inline
+#else
+#define SEASTAR_INCLUDE_API_V3
+#endif
+
+#if SEASTAR_API_LEVEL == 2
+#define SEASTAR_INCLUDE_API_V2 inline
+#else
+#define SEASTAR_INCLUDE_API_V2
+#endif
+
+// Declare them here so we don't have to use the macros everywhere
+namespace seastar {
+ SEASTAR_INCLUDE_API_V2 namespace api_v2 {
+ }
+ SEASTAR_INCLUDE_API_V3 namespace api_v3 {
+ inline namespace and_newer {
+ }
+ }
+ SEASTAR_INCLUDE_API_V4 namespace api_v4 {
+ inline namespace and_newer {
+ using namespace api_v3::and_newer;
+ }
+ }
+ SEASTAR_INCLUDE_API_V5 namespace api_v5 {
+ inline namespace and_newer {
+ using namespace api_v4::and_newer;
+ }
+ }
+ SEASTAR_INCLUDE_API_V6 namespace api_v6 {
+ inline namespace and_newer {
+ using namespace api_v5::and_newer;
+ }
+ }
+}
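+
+// Illustrative sketch (not part of this header) of how the macros above are
+// typically used: a declaration placed in the namespace matching the selected
+// SEASTAR_API_LEVEL becomes an inline-namespace member, so it is reachable as
+// seastar::foo, while other versions stay behind their explicit api_vN name.
+// foo() is a hypothetical function.
+//
+//     namespace seastar {
+//     SEASTAR_INCLUDE_API_V5 namespace api_v5 {
+//         void foo();  // seastar::foo when SEASTAR_API_LEVEL == 5
+//     }
+//     SEASTAR_INCLUDE_API_V6 namespace api_v6 {
+//         void foo();  // seastar::foo when SEASTAR_API_LEVEL == 6
+//     }
+//     }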
diff --git a/src/seastar/include/seastar/core/internal/buffer_allocator.hh b/src/seastar/include/seastar/core/internal/buffer_allocator.hh
new file mode 100644
index 000000000..0cdd39e75
--- /dev/null
+++ b/src/seastar/include/seastar/core/internal/buffer_allocator.hh
@@ -0,0 +1,43 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+
+#pragma once
+
+namespace seastar {
+
+template <typename CharType>
+class temporary_buffer;
+
+namespace internal {
+
+// Internal interface for allocating buffers for reads. Used to decouple
+// allocation strategies (where to allocate from, and what sizes) from the
+// point where allocation happens, to make it as late as possible.
+class buffer_allocator {
+public:
+ virtual ~buffer_allocator() = default;
+ virtual temporary_buffer<char> allocate_buffer() = 0;
+};
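+
+// Illustrative sketch (not part of this header): a trivial implementation that
+// always returns fixed-size heap buffers. Real implementations choose the size
+// and memory source at allocation time, which is the point of this late-binding
+// interface. fixed_size_allocator is a hypothetical name.
+//
+//     class fixed_size_allocator final : public buffer_allocator {
+//         size_t _size;
+//     public:
+//         explicit fixed_size_allocator(size_t size) : _size(size) {}
+//         temporary_buffer<char> allocate_buffer() override {
+//             return temporary_buffer<char>(_size);
+//         }
+//     };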
+
+
+}
+
+}
diff --git a/src/seastar/include/seastar/core/internal/io_desc.hh b/src/seastar/include/seastar/core/internal/io_desc.hh
new file mode 100644
index 000000000..74d1cf90f
--- /dev/null
+++ b/src/seastar/include/seastar/core/internal/io_desc.hh
@@ -0,0 +1,35 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2019 ScyllaDB Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/linux-aio.hh>
+#include <exception>
+
+namespace seastar {
+
+class kernel_completion {
+protected:
+ ~kernel_completion() = default;
+public:
+ virtual void complete_with(ssize_t res) = 0;
+};
+}
diff --git a/src/seastar/include/seastar/core/internal/io_request.hh b/src/seastar/include/seastar/core/internal/io_request.hh
new file mode 100644
index 000000000..f8d3c9af8
--- /dev/null
+++ b/src/seastar/include/seastar/core/internal/io_request.hh
@@ -0,0 +1,266 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2020 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/core/sstring.hh>
+#include <seastar/core/linux-aio.hh>
+#include <seastar/core/internal/io_desc.hh>
+#include <sys/types.h>
+#include <sys/socket.h>
+
+namespace seastar {
+namespace internal {
+
+class io_request {
+public:
+ enum class operation { read, readv, write, writev, fdatasync, recv, recvmsg, send, sendmsg, accept, connect, poll_add, poll_remove, cancel };
+private:
+ operation _op;
+ int _fd;
+ union {
+ uint64_t pos;
+ int flags;
+ int events;
+ } _attr;
+ // the upper layers give us void pointers, but storing void pointers here is just
+ // dangerous. The constructors seem to be happy to convert other pointers to void*,
+ // even if they are marked as explicit, and then you end up losing approximately 3 hours
+ // and 15 minutes (hypothetically, of course), trying to chase the weirdest bug.
+ // Let's store a char* for safety, and cast it back to void* in the accessor.
+ union {
+ char* addr;
+ ::iovec* iovec;
+ ::msghdr* msghdr;
+ ::sockaddr* sockaddr;
+ } _ptr;
+
+ // accept wants a socklen_t*, connect wants a socklen_t
+ union {
+ size_t len;
+ socklen_t* socklen_ptr;
+ socklen_t socklen;
+ } _size;
+ kernel_completion* _kernel_completion;
+
+ explicit io_request(operation op, int fd, int flags, ::msghdr* msg)
+ : _op(op)
+ , _fd(fd)
+ {
+ _attr.flags = flags;
+ _ptr.msghdr = msg;
+ }
+
+ explicit io_request(operation op, int fd, sockaddr* sa, socklen_t sl)
+ : _op(op)
+ , _fd(fd)
+ {
+ _ptr.sockaddr = sa;
+ _size.socklen = sl;
+ }
+
+ explicit io_request(operation op, int fd, int flags, sockaddr* sa, socklen_t* sl)
+ : _op(op)
+ , _fd(fd)
+ {
+ _attr.flags = flags;
+ _ptr.sockaddr = sa;
+ _size.socklen_ptr = sl;
+ }
+ explicit io_request(operation op, int fd, uint64_t pos, char* ptr, size_t size)
+ : _op(op)
+ , _fd(fd)
+ {
+ _attr.pos = pos;
+ _ptr.addr = ptr;
+ _size.len = size;
+ }
+
+ explicit io_request(operation op, int fd, uint64_t pos, iovec* ptr, size_t size)
+ : _op(op)
+ , _fd(fd)
+ {
+ _attr.pos = pos;
+ _ptr.iovec = ptr;
+ _size.len = size;
+ }
+
+ explicit io_request(operation op, int fd)
+ : _op(op)
+ , _fd(fd)
+ {}
+ explicit io_request(operation op, int fd, int events)
+ : _op(op)
+ , _fd(fd)
+ {
+ _attr.events = events;
+ }
+
+ explicit io_request(operation op, int fd, char *ptr)
+ : _op(op)
+ , _fd(fd)
+ {
+ _ptr.addr = ptr;
+ }
+public:
+ bool is_read() const {
+ switch (_op) {
+ case operation::read:
+ case operation::readv:
+ case operation::recvmsg:
+ case operation::recv:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ bool is_write() const {
+ switch (_op) {
+ case operation::write:
+ case operation::writev:
+ case operation::send:
+ case operation::sendmsg:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ sstring opname() const;
+
+ operation opcode() const {
+ return _op;
+ }
+
+ int fd() const {
+ return _fd;
+ }
+
+ uint64_t pos() const {
+ return _attr.pos;
+ }
+
+ int flags() const {
+ return _attr.flags;
+ }
+
+ int events() const {
+ return _attr.events;
+ }
+
+ void* address() const {
+ return reinterpret_cast<void*>(_ptr.addr);
+ }
+
+ iovec* iov() const {
+ return _ptr.iovec;
+ }
+
+ ::sockaddr* posix_sockaddr() const {
+ return _ptr.sockaddr;
+ }
+
+ ::msghdr* msghdr() const {
+ return _ptr.msghdr;
+ }
+
+ size_t size() const {
+ return _size.len;
+ }
+
+ size_t iov_len() const {
+ return _size.len;
+ }
+
+ socklen_t socklen() const {
+ return _size.socklen;
+ }
+
+ socklen_t* socklen_ptr() const {
+ return _size.socklen_ptr;
+ }
+
+ void attach_kernel_completion(kernel_completion* kc) {
+ _kernel_completion = kc;
+ }
+
+ kernel_completion* get_kernel_completion() const {
+ return _kernel_completion;
+ }
+
+ static io_request make_read(int fd, uint64_t pos, void* address, size_t size) {
+ return io_request(operation::read, fd, pos, reinterpret_cast<char*>(address), size);
+ }
+
+ static io_request make_readv(int fd, uint64_t pos, std::vector<iovec>& iov) {
+ return io_request(operation::readv, fd, pos, iov.data(), iov.size());
+ }
+
+ static io_request make_recv(int fd, void* address, size_t size, int flags) {
+ return io_request(operation::recv, fd, flags, reinterpret_cast<char*>(address), size);
+ }
+
+ static io_request make_recvmsg(int fd, ::msghdr* msg, int flags) {
+ return io_request(operation::recvmsg, fd, flags, msg);
+ }
+
+ static io_request make_send(int fd, const void* address, size_t size, int flags) {
+ return io_request(operation::send, fd, flags, const_cast<char*>(reinterpret_cast<const char*>(address)), size);
+ }
+
+ static io_request make_sendmsg(int fd, ::msghdr* msg, int flags) {
+ return io_request(operation::sendmsg, fd, flags, msg);
+ }
+
+ static io_request make_write(int fd, uint64_t pos, const void* address, size_t size) {
+ return io_request(operation::write, fd, pos, const_cast<char*>(reinterpret_cast<const char*>(address)), size);
+ }
+
+ static io_request make_writev(int fd, uint64_t pos, std::vector<iovec>& iov) {
+ return io_request(operation::writev, fd, pos, iov.data(), iov.size());
+ }
+
+ static io_request make_fdatasync(int fd) {
+ return io_request(operation::fdatasync, fd);
+ }
+
+ static io_request make_accept(int fd, struct sockaddr* addr, socklen_t* addrlen, int flags) {
+ return io_request(operation::accept, fd, flags, addr, addrlen);
+ }
+
+ static io_request make_connect(int fd, struct sockaddr* addr, socklen_t addrlen) {
+ return io_request(operation::connect, fd, addr, addrlen);
+ }
+
+ static io_request make_poll_add(int fd, int events) {
+ return io_request(operation::poll_add, fd, events);
+ }
+
+ static io_request make_poll_remove(int fd, void *addr) {
+ return io_request(operation::poll_remove, fd, reinterpret_cast<char*>(addr));
+ }
+ static io_request make_cancel(int fd, void *addr) {
+ return io_request(operation::cancel, fd, reinterpret_cast<char*>(addr));
+ }
+};
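+
+// Illustrative usage sketch (not part of this header): the reactor constructs a
+// request through one of the factories above, attaches a completion object, and
+// hands it to the I/O backend, which later calls complete_with() on the attached
+// kernel_completion. submit_to_backend() and completion are hypothetical names.
+//
+//     io_request req = io_request::make_read(fd, pos, buffer, len);
+//     req.attach_kernel_completion(&completion);
+//     submit_to_backend(std::move(req));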
+}
+}
diff --git a/src/seastar/include/seastar/core/internal/poll.hh b/src/seastar/include/seastar/core/internal/poll.hh
new file mode 100644
index 000000000..1816a60de
--- /dev/null
+++ b/src/seastar/include/seastar/core/internal/poll.hh
@@ -0,0 +1,59 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+#pragma once
+
+namespace seastar {
+
+struct pollfn {
+ virtual ~pollfn() {}
+ // Returns true if work was done (false = idle)
+ virtual bool poll() = 0;
+ // Checks if work needs to be done, but without actually doing any;
+ // returns true if work needs to be done (false = idle)
+ virtual bool pure_poll() = 0;
+ // Tries to enter interrupt mode.
+ //
+ // If it returns true, then events from this poller will wake
+ // a sleeping idle loop, and exit_interrupt_mode() must be called
+ // to return to normal polling.
+ //
+ // If it returns false, the sleeping idle loop may not be entered.
+ virtual bool try_enter_interrupt_mode() = 0;
+ virtual void exit_interrupt_mode() = 0;
+};
+
+// The common case for a poller -- makes no difference between poll() and
+// pure_poll(), always/never agrees to go to sleep, and does nothing on
+// wakeup.
+template <bool Passive>
+struct simple_pollfn : public pollfn {
+ virtual bool pure_poll() override final {
+ return poll();
+ }
+ virtual bool try_enter_interrupt_mode() override final {
+ return Passive;
+ }
+ virtual void exit_interrupt_mode() override final {
+ }
+};
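+
+// Illustrative sketch (not part of this header): a poller that drains a
+// hypothetical queue. Deriving from simple_pollfn<true> makes it passive: it
+// always agrees to enter interrupt mode, so it never keeps the reactor awake.
+// my_queue and queue_poller are hypothetical names.
+//
+//     struct queue_poller final : simple_pollfn<true> {
+//         my_queue& q;
+//         explicit queue_poller(my_queue& q_) : q(q_) {}
+//         bool poll() override {
+//             return q.drain_some();  // true if any work was done
+//         }
+//     };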
+
+}
diff --git a/src/seastar/include/seastar/core/internal/pollable_fd.hh b/src/seastar/include/seastar/core/internal/pollable_fd.hh
new file mode 100644
index 000000000..c978877c2
--- /dev/null
+++ b/src/seastar/include/seastar/core/internal/pollable_fd.hh
@@ -0,0 +1,219 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/posix.hh>
+#include <vector>
+#include <tuple>
+#include <seastar/core/internal/io_desc.hh>
+#include <boost/intrusive_ptr.hpp>
+
+namespace seastar {
+
+class reactor;
+class pollable_fd;
+class pollable_fd_state;
+class socket_address;
+
+namespace internal {
+
+class buffer_allocator;
+
+}
+
+namespace net {
+
+class packet;
+
+}
+
+class pollable_fd_state;
+
+using pollable_fd_state_ptr = boost::intrusive_ptr<pollable_fd_state>;
+
+class pollable_fd_state {
+ unsigned _refs = 0;
+public:
+ virtual ~pollable_fd_state() {}
+ struct speculation {
+ int events = 0;
+ explicit speculation(int epoll_events_guessed = 0) : events(epoll_events_guessed) {}
+ };
+ pollable_fd_state(const pollable_fd_state&) = delete;
+ void operator=(const pollable_fd_state&) = delete;
+ void speculate_epoll(int events) { events_known |= events; }
+ file_desc fd;
+ bool events_rw = false; // single consumer for both read and write (accept())
+ bool no_more_recv = false; // For udp, there is no shutdown indication from the kernel
+ bool no_more_send = false; // For udp, there is no shutdown indication from the kernel
+ int events_requested = 0; // wanted by pollin/pollout promises
+ int events_epoll = 0; // installed in epoll
+ int events_known = 0; // returned from epoll
+
+ friend class reactor;
+ friend class pollable_fd;
+
+ future<size_t> read_some(char* buffer, size_t size);
+ future<size_t> read_some(uint8_t* buffer, size_t size);
+ future<size_t> read_some(const std::vector<iovec>& iov);
+ future<temporary_buffer<char>> read_some(internal::buffer_allocator* ba);
+ future<> write_all(const char* buffer, size_t size);
+ future<> write_all(const uint8_t* buffer, size_t size);
+ future<size_t> write_some(net::packet& p);
+ future<> write_all(net::packet& p);
+ future<> readable();
+ future<> writeable();
+ future<> readable_or_writeable();
+ void abort_reader();
+ void abort_writer();
+ future<std::tuple<pollable_fd, socket_address>> accept();
+ future<> connect(socket_address& sa);
+ future<size_t> sendmsg(struct msghdr *msg);
+ future<size_t> recvmsg(struct msghdr *msg);
+ future<size_t> sendto(socket_address addr, const void* buf, size_t len);
+
+protected:
+ explicit pollable_fd_state(file_desc fd, speculation speculate = speculation())
+ : fd(std::move(fd)), events_known(speculate.events) {}
+private:
+ void maybe_no_more_recv();
+ void maybe_no_more_send();
+ void forget(); // called on end-of-life
+
+ friend void intrusive_ptr_add_ref(pollable_fd_state* fd) {
+ ++fd->_refs;
+ }
+ friend void intrusive_ptr_release(pollable_fd_state* fd);
+};
+
+class pollable_fd {
+public:
+ using speculation = pollable_fd_state::speculation;
+ pollable_fd() = default;
+ pollable_fd(file_desc fd, speculation speculate = speculation());
+public:
+ future<size_t> read_some(char* buffer, size_t size) {
+ return _s->read_some(buffer, size);
+ }
+ future<size_t> read_some(uint8_t* buffer, size_t size) {
+ return _s->read_some(buffer, size);
+ }
+ future<size_t> read_some(const std::vector<iovec>& iov) {
+ return _s->read_some(iov);
+ }
+ future<temporary_buffer<char>> read_some(internal::buffer_allocator* ba) {
+ return _s->read_some(ba);
+ }
+ future<> write_all(const char* buffer, size_t size) {
+ return _s->write_all(buffer, size);
+ }
+ future<> write_all(const uint8_t* buffer, size_t size) {
+ return _s->write_all(buffer, size);
+ }
+ future<size_t> write_some(net::packet& p) {
+ return _s->write_some(p);
+ }
+ future<> write_all(net::packet& p) {
+ return _s->write_all(p);
+ }
+ future<> readable() {
+ return _s->readable();
+ }
+ future<> writeable() {
+ return _s->writeable();
+ }
+ future<> readable_or_writeable() {
+ return _s->readable_or_writeable();
+ }
+ void abort_reader() {
+ return _s->abort_reader();
+ }
+ void abort_writer() {
+ return _s->abort_writer();
+ }
+ future<std::tuple<pollable_fd, socket_address>> accept() {
+ return _s->accept();
+ }
+ future<> connect(socket_address& sa) {
+ return _s->connect(sa);
+ }
+ future<size_t> sendmsg(struct msghdr *msg) {
+ return _s->sendmsg(msg);
+ }
+ future<size_t> recvmsg(struct msghdr *msg) {
+ return _s->recvmsg(msg);
+ }
+ future<size_t> sendto(socket_address addr, const void* buf, size_t len) {
+ return _s->sendto(addr, buf, len);
+ }
+ file_desc& get_file_desc() const { return _s->fd; }
+ void shutdown(int how);
+ void close() { _s.reset(); }
+ explicit operator bool() const noexcept {
+ return bool(_s);
+ }
+protected:
+ int get_fd() const { return _s->fd.get(); }
+ void maybe_no_more_recv() { return _s->maybe_no_more_recv(); }
+ void maybe_no_more_send() { return _s->maybe_no_more_send(); }
+ friend class reactor;
+ friend class readable_eventfd;
+ friend class writeable_eventfd;
+ friend class aio_storage_context;
+private:
+ pollable_fd_state_ptr _s;
+};
+
+class writeable_eventfd;
+
+class readable_eventfd {
+ pollable_fd _fd;
+public:
+ explicit readable_eventfd(size_t initial = 0) : _fd(try_create_eventfd(initial)) {}
+ readable_eventfd(readable_eventfd&&) = default;
+ writeable_eventfd write_side();
+ future<size_t> wait();
+ int get_write_fd() { return _fd.get_fd(); }
+private:
+ explicit readable_eventfd(file_desc&& fd) : _fd(std::move(fd)) {}
+ static file_desc try_create_eventfd(size_t initial);
+
+ friend class writeable_eventfd;
+};
+
+class writeable_eventfd {
+ file_desc _fd;
+public:
+ explicit writeable_eventfd(size_t initial = 0) : _fd(try_create_eventfd(initial)) {}
+ writeable_eventfd(writeable_eventfd&&) = default;
+ readable_eventfd read_side();
+ void signal(size_t nr);
+ int get_read_fd() { return _fd.get(); }
+private:
+ explicit writeable_eventfd(file_desc&& fd) : _fd(std::move(fd)) {}
+ static file_desc try_create_eventfd(size_t initial);
+
+ friend class readable_eventfd;
+};
+
+}
diff --git a/src/seastar/include/seastar/core/io_queue.hh b/src/seastar/include/seastar/core/io_queue.hh
new file mode 100644
index 000000000..dd7c1b1ab
--- /dev/null
+++ b/src/seastar/include/seastar/core/io_queue.hh
@@ -0,0 +1,170 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/core/sstring.hh>
+#include <seastar/core/fair_queue.hh>
+#include <seastar/core/metrics_registration.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/internal/io_request.hh>
+#include <mutex>
+#include <array>
+
+namespace seastar {
+
+class io_priority_class;
+
+/// Renames an io priority class
+///
+/// Renames an `io_priority_class` previously created with register_one_priority_class().
+///
+/// The operation is global and affects all shards.
+/// The operation affects the exported statistics labels.
+///
+/// \param pc The io priority class to be renamed
+/// \param new_name The new name for the io priority class
+/// \return a future that is ready when the io priority class has been renamed
+future<>
+rename_priority_class(io_priority_class pc, sstring new_name);
+
+namespace internal {
+namespace linux_abi {
+
+struct io_event;
+struct iocb;
+
+}
+}
+
+using shard_id = unsigned;
+
+class io_priority_class;
+
+class io_queue {
+private:
+ struct priority_class_data {
+ priority_class_ptr ptr;
+ size_t bytes;
+ uint64_t ops;
+ uint32_t nr_queued;
+ std::chrono::duration<double> queue_time;
+ metrics::metric_groups _metric_groups;
+ priority_class_data(sstring name, sstring mountpoint, priority_class_ptr ptr, shard_id owner);
+ void rename(sstring new_name, sstring mountpoint, shard_id owner);
+ private:
+ void register_stats(sstring name, sstring mountpoint, shard_id owner);
+ };
+
+ std::vector<std::vector<std::unique_ptr<priority_class_data>>> _priority_classes;
+ fair_queue _fq;
+
+ static constexpr unsigned _max_classes = 2048;
+ static std::mutex _register_lock;
+ static std::array<uint32_t, _max_classes> _registered_shares;
+ static std::array<sstring, _max_classes> _registered_names;
+
+public:
+ static io_priority_class register_one_priority_class(sstring name, uint32_t shares);
+ static bool rename_one_priority_class(io_priority_class pc, sstring name);
+
+private:
+ priority_class_data& find_or_create_class(const io_priority_class& pc, shard_id owner);
+
+ fair_queue_ticket request_fq_ticket(const internal::io_request& req, size_t len) const;
+
+ // The fields below are going away; they are just here so we can implement deprecated
+ // functions that used to be provided by the fair_queue and are going away (from both
+ // the fair_queue and the io_queue). Double-accounting for now allows for easier
+ // decoupling and is temporary.
+ size_t _queued_requests = 0;
+ size_t _requests_executing = 0;
+public:
+ // We want to represent the fact that write requests are (maybe) more expensive
+ // than read requests. To avoid dealing with floating point math we will scale one
+ // read request to be counted by this amount.
+ //
+ // A write request that is 30% more expensive than a read will be accounted as
+ // (read_request_base_count * 130) / 100.
+ // It is also technically possible for reads to be the expensive ones, in which case
+ // writes will have an integer value lower than read_request_base_count.
+ static constexpr unsigned read_request_base_count = 128;
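+ // For example (illustrative arithmetic): with read_request_base_count == 128, a write
+ // that is 30% more expensive than a read is accounted as (128 * 130) / 100 == 166,
+ // i.e. roughly 1.3 read units.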
+
+ struct config {
+ dev_t devid;
+ shard_id coordinator;
+ unsigned capacity = std::numeric_limits<unsigned>::max();
+ unsigned max_req_count = std::numeric_limits<unsigned>::max();
+ unsigned max_bytes_count = std::numeric_limits<unsigned>::max();
+ unsigned disk_req_write_to_read_multiplier = read_request_base_count;
+ unsigned disk_bytes_write_to_read_multiplier = read_request_base_count;
+ sstring mountpoint = "undefined";
+ };
+
+ io_queue(config cfg);
+ ~io_queue();
+
+ future<size_t>
+ queue_request(const io_priority_class& pc, size_t len, internal::io_request req) noexcept;
+
+ [[deprecated("modern I/O queues should use a property file")]] size_t capacity() const {
+ return _config.capacity;
+ }
+
+ [[deprecated("I/O queue users should not track individual requests, but resources (weight, size) passing through the queue")]]
+ size_t queued_requests() const {
+ return _queued_requests;
+ }
+
+ // How many requests are sent to disk but not yet returned.
+ [[deprecated("I/O queue users should not track individual requests, but resources (weight, size) passing through the queue")]]
+ size_t requests_currently_executing() const {
+ return _requests_executing;
+ }
+
+ void notify_requests_finished(fair_queue_ticket& desc) noexcept;
+
+ // Dispatch requests that are pending in the I/O queue
+ void poll_io_queue() {
+ _fq.dispatch_requests();
+ }
+
+ sstring mountpoint() const {
+ return _config.mountpoint;
+ }
+
+ shard_id coordinator() const {
+ return _config.coordinator;
+ }
+
+ dev_t dev_id() const noexcept {
+ return _config.devid;
+ }
+
+ future<> update_shares_for_class(io_priority_class pc, size_t new_shares);
+ void rename_priority_class(io_priority_class pc, sstring new_name);
+
+private:
+ config _config;
+ static fair_queue::config make_fair_queue_config(config cfg);
+};
+
+}
diff --git a/src/seastar/include/seastar/core/iostream-impl.hh b/src/seastar/include/seastar/core/iostream-impl.hh
new file mode 100644
index 000000000..2b3354742
--- /dev/null
+++ b/src/seastar/include/seastar/core/iostream-impl.hh
@@ -0,0 +1,535 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+
+#pragma once
+
+#include <seastar/core/do_with.hh>
+#include <seastar/core/loop.hh>
+#include <seastar/net/packet.hh>
+#include <seastar/util/variant_utils.hh>
+
+namespace seastar {
+
+inline future<temporary_buffer<char>> data_source_impl::skip(uint64_t n)
+{
+ return do_with(uint64_t(n), [this] (uint64_t& n) {
+ return repeat_until_value([&] {
+ return get().then([&] (temporary_buffer<char> buffer) -> std::optional<temporary_buffer<char>> {
+ if (buffer.size() >= n) {
+ buffer.trim_front(n);
+ return buffer;
+ }
+ n -= buffer.size();
+ return { };
+ });
+ });
+ });
+}
+
+template<typename CharType>
+inline
+future<> output_stream<CharType>::write(const char_type* buf) {
+ return write(buf, strlen(buf));
+}
+
+template<typename CharType>
+template<typename StringChar, typename SizeType, SizeType MaxSize, bool NulTerminate>
+inline
+future<> output_stream<CharType>::write(const basic_sstring<StringChar, SizeType, MaxSize, NulTerminate>& s) {
+ return write(reinterpret_cast<const CharType *>(s.c_str()), s.size());
+}
+
+template<typename CharType>
+inline
+future<> output_stream<CharType>::write(const std::basic_string<CharType>& s) {
+ return write(s.c_str(), s.size());
+}
+
+template<typename CharType>
+future<> output_stream<CharType>::write(scattered_message<CharType> msg) {
+ return write(std::move(msg).release());
+}
+
+template<typename CharType>
+future<>
+output_stream<CharType>::zero_copy_put(net::packet p) {
+ // if flush is scheduled, disable it, so it will not try to write in parallel
+ _flush = false;
+ if (_flushing) {
+ // flush in progress, wait for it to end before continuing
+ return _in_batch.value().get_future().then([this, p = std::move(p)] () mutable {
+ return _fd.put(std::move(p));
+ });
+ } else {
+ return _fd.put(std::move(p));
+ }
+}
+
+// Writes @p in chunks of _size length. The last chunk is buffered if smaller.
+template <typename CharType>
+future<>
+output_stream<CharType>::zero_copy_split_and_put(net::packet p) {
+ return repeat([this, p = std::move(p)] () mutable {
+ if (p.len() < _size) {
+ if (p.len()) {
+ _zc_bufs = std::move(p);
+ } else {
+ _zc_bufs = net::packet::make_null_packet();
+ }
+ return make_ready_future<stop_iteration>(stop_iteration::yes);
+ }
+ auto chunk = p.share(0, _size);
+ p.trim_front(_size);
+ return zero_copy_put(std::move(chunk)).then([] {
+ return stop_iteration::no;
+ });
+ });
+}
+
+template<typename CharType>
+future<> output_stream<CharType>::write(net::packet p) {
+ static_assert(std::is_same<CharType, char>::value, "packet works on char");
+
+ if (p.len() != 0) {
+ assert(!_end && "Mixing buffered writes and zero-copy writes not supported yet");
+
+ if (_zc_bufs) {
+ _zc_bufs.append(std::move(p));
+ } else {
+ _zc_bufs = std::move(p);
+ }
+
+ if (_zc_bufs.len() >= _size) {
+ if (_trim_to_size) {
+ return zero_copy_split_and_put(std::move(_zc_bufs));
+ } else {
+ return zero_copy_put(std::move(_zc_bufs));
+ }
+ }
+ }
+ return make_ready_future<>();
+}
+
+template<typename CharType>
+future<> output_stream<CharType>::write(temporary_buffer<CharType> p) {
+ if (p.empty()) {
+ return make_ready_future<>();
+ }
+ assert(!_end && "Mixing buffered writes and zero-copy writes not supported yet");
+
+ return write(net::packet(std::move(p)));
+}
+
+template <typename CharType>
+future<temporary_buffer<CharType>>
+input_stream<CharType>::read_exactly_part(size_t n, tmp_buf out, size_t completed) {
+ if (available()) {
+ auto now = std::min(n - completed, available());
+ std::copy(_buf.get(), _buf.get() + now, out.get_write() + completed);
+ _buf.trim_front(now);
+ completed += now;
+ }
+ if (completed == n) {
+ return make_ready_future<tmp_buf>(std::move(out));
+ }
+
+ // _buf is now empty
+ return _fd.get().then([this, n, out = std::move(out), completed] (auto buf) mutable {
+ if (buf.size() == 0) {
+ _eof = true;
+ return make_ready_future<tmp_buf>(std::move(buf));
+ }
+ _buf = std::move(buf);
+ return this->read_exactly_part(n, std::move(out), completed);
+ });
+}
+
+template <typename CharType>
+future<temporary_buffer<CharType>>
+input_stream<CharType>::read_exactly(size_t n) {
+ if (_buf.size() == n) {
+ // easy case: steal buffer, return to caller
+ return make_ready_future<tmp_buf>(std::move(_buf));
+ } else if (_buf.size() > n) {
+ // buffer large enough, share it with caller
+ auto front = _buf.share(0, n);
+ _buf.trim_front(n);
+ return make_ready_future<tmp_buf>(std::move(front));
+ } else if (_buf.size() == 0) {
+ // buffer is empty: grab one and retry
+ return _fd.get().then([this, n] (auto buf) mutable {
+ if (buf.size() == 0) {
+ _eof = true;
+ return make_ready_future<tmp_buf>(std::move(buf));
+ }
+ _buf = std::move(buf);
+ return this->read_exactly(n);
+ });
+ } else {
+ // buffer too small: start copy/read loop
+ tmp_buf b(n);
+ return read_exactly_part(n, std::move(b), 0);
+ }
+}
+
+template <typename CharType>
+template <typename Consumer>
+SEASTAR_CONCEPT(requires InputStreamConsumer<Consumer, CharType> || ObsoleteInputStreamConsumer<Consumer, CharType>)
+future<>
+input_stream<CharType>::consume(Consumer&& consumer) {
+ return repeat([consumer = std::move(consumer), this] () mutable {
+ if (_buf.empty() && !_eof) {
+ return _fd.get().then([this] (tmp_buf buf) {
+ _buf = std::move(buf);
+ _eof = _buf.empty();
+ return make_ready_future<stop_iteration>(stop_iteration::no);
+ });
+ }
+ return consumer(std::move(_buf)).then([this] (consumption_result_type result) {
+ return seastar::visit(result.get(), [this] (const continue_consuming&) {
+ // If we're here, consumer consumed entire buffer and is ready for
+ // more now. So we do not return, and rather continue the loop.
+ //
+ // If we're at eof, we should stop.
+ return make_ready_future<stop_iteration>(stop_iteration(this->_eof));
+ }, [this] (stop_consuming<CharType>& stop) {
+ // consumer is done
+ this->_buf = std::move(stop.get_buffer());
+ return make_ready_future<stop_iteration>(stop_iteration::yes);
+ }, [this] (const skip_bytes& skip) {
+ return this->_fd.skip(skip.get_value()).then([this](tmp_buf buf) {
+ if (!buf.empty()) {
+ this->_buf = std::move(buf);
+ }
+ return make_ready_future<stop_iteration>(stop_iteration::no);
+ });
+ });
+ });
+ });
+}
+
+template <typename CharType>
+template <typename Consumer>
+SEASTAR_CONCEPT(requires InputStreamConsumer<Consumer, CharType> || ObsoleteInputStreamConsumer<Consumer, CharType>)
+future<>
+input_stream<CharType>::consume(Consumer& consumer) {
+ return consume(std::ref(consumer));
+}
+
+template <typename CharType>
+future<temporary_buffer<CharType>>
+input_stream<CharType>::read_up_to(size_t n) {
+ using tmp_buf = temporary_buffer<CharType>;
+ if (_buf.empty()) {
+ if (_eof) {
+ return make_ready_future<tmp_buf>();
+ } else {
+ return _fd.get().then([this, n] (tmp_buf buf) {
+ _eof = buf.empty();
+ _buf = std::move(buf);
+ return read_up_to(n);
+ });
+ }
+ } else if (_buf.size() <= n) {
+ // easy case: steal buffer, return to caller
+ return make_ready_future<tmp_buf>(std::move(_buf));
+ } else {
+ // buffer is larger than n, so share its head with a caller
+ auto front = _buf.share(0, n);
+ _buf.trim_front(n);
+ return make_ready_future<tmp_buf>(std::move(front));
+ }
+}
+
+template <typename CharType>
+future<temporary_buffer<CharType>>
+input_stream<CharType>::read() {
+ using tmp_buf = temporary_buffer<CharType>;
+ if (_eof) {
+ return make_ready_future<tmp_buf>();
+ }
+ if (_buf.empty()) {
+ return _fd.get().then([this] (tmp_buf buf) {
+ _eof = buf.empty();
+ return make_ready_future<tmp_buf>(std::move(buf));
+ });
+ } else {
+ return make_ready_future<tmp_buf>(std::move(_buf));
+ }
+}
+
+template <typename CharType>
+future<>
+input_stream<CharType>::skip(uint64_t n) {
+ auto skip_buf = std::min(n, _buf.size());
+ _buf.trim_front(skip_buf);
+ n -= skip_buf;
+ if (!n) {
+ return make_ready_future<>();
+ }
+ return _fd.skip(n).then([this] (temporary_buffer<CharType> buffer) {
+ _buf = std::move(buffer);
+ });
+}
+
+template <typename CharType>
+data_source
+input_stream<CharType>::detach() && {
+ if (_buf) {
+ throw std::logic_error("detach() called on a used input_stream");
+ }
+
+ return std::move(_fd);
+}
+
+// Writes @buf in chunks of _size length. The last chunk is buffered if smaller.
+template <typename CharType>
+future<>
+output_stream<CharType>::split_and_put(temporary_buffer<CharType> buf) {
+ assert(_end == 0);
+
+ return repeat([this, buf = std::move(buf)] () mutable {
+ if (buf.size() < _size) {
+ if (!_buf) {
+ _buf = _fd.allocate_buffer(_size);
+ }
+ std::copy(buf.get(), buf.get() + buf.size(), _buf.get_write());
+ _end = buf.size();
+ return make_ready_future<stop_iteration>(stop_iteration::yes);
+ }
+ auto chunk = buf.share(0, _size);
+ buf.trim_front(_size);
+ return put(std::move(chunk)).then([] {
+ return stop_iteration::no;
+ });
+ });
+}
+
+template <typename CharType>
+future<>
+output_stream<CharType>::write(const char_type* buf, size_t n) {
+ if (__builtin_expect(!_buf || n > _size - _end, false)) {
+ return slow_write(buf, n);
+ }
+ std::copy_n(buf, n, _buf.get_write() + _end);
+ _end += n;
+ return make_ready_future<>();
+}
+
+template <typename CharType>
+future<>
+output_stream<CharType>::slow_write(const char_type* buf, size_t n) {
+ assert(!_zc_bufs && "Mixing buffered writes and zero-copy writes not supported yet");
+ auto bulk_threshold = _end ? (2 * _size - _end) : _size;
+ if (n >= bulk_threshold) {
+ if (_end) {
+ auto now = _size - _end;
+ std::copy(buf, buf + now, _buf.get_write() + _end);
+ _end = _size;
+ temporary_buffer<char> tmp = _fd.allocate_buffer(n - now);
+ std::copy(buf + now, buf + n, tmp.get_write());
+ _buf.trim(_end);
+ _end = 0;
+ return put(std::move(_buf)).then([this, tmp = std::move(tmp)]() mutable {
+ if (_trim_to_size) {
+ return split_and_put(std::move(tmp));
+ } else {
+ return put(std::move(tmp));
+ }
+ });
+ } else {
+ temporary_buffer<char> tmp = _fd.allocate_buffer(n);
+ std::copy(buf, buf + n, tmp.get_write());
+ if (_trim_to_size) {
+ return split_and_put(std::move(tmp));
+ } else {
+ return put(std::move(tmp));
+ }
+ }
+ }
+
+ if (!_buf) {
+ _buf = _fd.allocate_buffer(_size);
+ }
+
+ auto now = std::min(n, _size - _end);
+ std::copy(buf, buf + now, _buf.get_write() + _end);
+ _end += now;
+ if (now == n) {
+ return make_ready_future<>();
+ } else {
+ temporary_buffer<char> next = _fd.allocate_buffer(_size);
+ std::copy(buf + now, buf + n, next.get_write());
+ _end = n - now;
+ std::swap(next, _buf);
+ return put(std::move(next));
+ }
+}
+
+template <typename CharType>
+future<>
+output_stream<CharType>::flush() {
+ if (!_batch_flushes) {
+ if (_end) {
+ _buf.trim(_end);
+ _end = 0;
+ return put(std::move(_buf)).then([this] {
+ return _fd.flush();
+ });
+ } else if (_zc_bufs) {
+ return zero_copy_put(std::move(_zc_bufs)).then([this] {
+ return _fd.flush();
+ });
+ }
+ } else {
+ if (_ex) {
+ // flush is a good time to deliver outstanding errors
+ return make_exception_future<>(std::move(_ex));
+ } else {
+ _flush = true;
+ if (!_in_batch) {
+ add_to_flush_poller(this);
+ _in_batch = promise<>();
+ }
+ }
+ }
+ return make_ready_future<>();
+}
+
+void add_to_flush_poller(output_stream<char>* x);
+
+template <typename CharType>
+future<>
+output_stream<CharType>::put(temporary_buffer<CharType> buf) {
+ // if flush is scheduled, disable it, so it will not try to write in parallel
+ _flush = false;
+ if (_flushing) {
+ // flush in progress, wait for it to end before continuing
+ return _in_batch.value().get_future().then([this, buf = std::move(buf)] () mutable {
+ return _fd.put(std::move(buf));
+ });
+ } else {
+ return _fd.put(std::move(buf));
+ }
+}
+
+template <typename CharType>
+void
+output_stream<CharType>::poll_flush() {
+ if (!_flush) {
+ // flush was canceled, do nothing
+ _flushing = false;
+ _in_batch.value().set_value();
+ _in_batch = std::nullopt;
+ return;
+ }
+
+ auto f = make_ready_future();
+ _flush = false;
+ _flushing = true; // make whoever wants to write into the fd to wait for flush to complete
+
+ if (_end) {
+ // send whatever is in the buffer right now
+ _buf.trim(_end);
+ _end = 0;
+ f = _fd.put(std::move(_buf));
+ } else if(_zc_bufs) {
+ f = _fd.put(std::move(_zc_bufs));
+ }
+
+ // FIXME: future is discarded
+ (void)f.then([this] {
+ return _fd.flush();
+ }).then_wrapped([this] (future<> f) {
+ try {
+ f.get();
+ } catch (...) {
+ _ex = std::current_exception();
+ }
+ // if flush() was called while flushing, flush once more
+ poll_flush();
+ });
+}
+
+template <typename CharType>
+future<>
+output_stream<CharType>::close() {
+ return flush().finally([this] {
+ if (_in_batch) {
+ return _in_batch.value().get_future();
+ } else {
+ return make_ready_future();
+ }
+ }).then([this] {
+ // report final exception as close error
+ if (_ex) {
+ std::rethrow_exception(_ex);
+ }
+ }).finally([this] {
+ return _fd.close();
+ });
+}
+
+template <typename CharType>
+data_sink
+output_stream<CharType>::detach() && {
+ if (_buf) {
+ throw std::logic_error("detach() called on a used output_stream");
+ }
+
+ return std::move(_fd);
+}
+
+namespace internal {
+
+/// \cond internal
+template <typename CharType>
+struct stream_copy_consumer {
+private:
+ output_stream<CharType>& _os;
+ using unconsumed_remainder = std::optional<temporary_buffer<CharType>>;
+public:
+ stream_copy_consumer(output_stream<CharType>& os) : _os(os) {
+ }
+ future<unconsumed_remainder> operator()(temporary_buffer<CharType> data) {
+ if (data.empty()) {
+ return make_ready_future<unconsumed_remainder>(std::move(data));
+ }
+ return _os.write(data.get(), data.size()).then([] () {
+ return make_ready_future<unconsumed_remainder>();
+ });
+ }
+};
+/// \endcond
+
+}
+
+extern template struct internal::stream_copy_consumer<char>;
+
+template <typename CharType>
+future<> copy(input_stream<CharType>& in, output_stream<CharType>& out) {
+ return in.consume(internal::stream_copy_consumer<CharType>(out));
+}
+
+extern template future<> copy<char>(input_stream<char>&, output_stream<char>&);
+}
diff --git a/src/seastar/include/seastar/core/iostream.hh b/src/seastar/include/seastar/core/iostream.hh
new file mode 100644
index 000000000..ae81ecd1d
--- /dev/null
+++ b/src/seastar/include/seastar/core/iostream.hh
@@ -0,0 +1,360 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+//
+// Buffered input and output streams
+//
+// Two abstract classes (data_source and data_sink) provide means
+// to acquire bulk data from, or push bulk data to, some provider.
+// These could be tied to a TCP connection, a disk file, or a memory
+// buffer.
+//
+// Two concrete classes (input_stream and output_stream) buffer data
+// from data_source and data_sink and provide easier means to process
+// it.
+//
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/temporary_buffer.hh>
+#include <seastar/core/scattered_message.hh>
+#include <seastar/util/std-compat.hh>
+
+namespace seastar {
+
+namespace net { class packet; }
+
+class data_source_impl {
+public:
+ virtual ~data_source_impl() {}
+ virtual future<temporary_buffer<char>> get() = 0;
+ virtual future<temporary_buffer<char>> skip(uint64_t n);
+ virtual future<> close() { return make_ready_future<>(); }
+};
+
+class data_source {
+ std::unique_ptr<data_source_impl> _dsi;
+protected:
+ data_source_impl* impl() const { return _dsi.get(); }
+public:
+ data_source() noexcept = default;
+ explicit data_source(std::unique_ptr<data_source_impl> dsi) noexcept : _dsi(std::move(dsi)) {}
+ data_source(data_source&& x) noexcept = default;
+ data_source& operator=(data_source&& x) noexcept = default;
+ future<temporary_buffer<char>> get() { return _dsi->get(); }
+ future<temporary_buffer<char>> skip(uint64_t n) { return _dsi->skip(n); }
+ future<> close() { return _dsi->close(); }
+};
+
+class data_sink_impl {
+public:
+ virtual ~data_sink_impl() {}
+ virtual temporary_buffer<char> allocate_buffer(size_t size) {
+ return temporary_buffer<char>(size);
+ }
+ virtual future<> put(net::packet data) = 0;
+ virtual future<> put(std::vector<temporary_buffer<char>> data) {
+ net::packet p;
+ p.reserve(data.size());
+ for (auto& buf : data) {
+ p = net::packet(std::move(p), net::fragment{buf.get_write(), buf.size()}, buf.release());
+ }
+ return put(std::move(p));
+ }
+ virtual future<> put(temporary_buffer<char> buf) {
+ return put(net::packet(net::fragment{buf.get_write(), buf.size()}, buf.release()));
+ }
+ virtual future<> flush() {
+ return make_ready_future<>();
+ }
+ virtual future<> close() = 0;
+};
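+
+// Illustrative sketch (not part of this header): the smallest possible sink, which
+// accepts and discards everything. Only put(net::packet) and close() are pure
+// virtual, so this is all a concrete implementation must provide.
+// null_data_sink_impl is a hypothetical name.
+//
+//     class null_data_sink_impl final : public data_sink_impl {
+//     public:
+//         future<> put(net::packet) override { return make_ready_future<>(); }
+//         future<> close() override { return make_ready_future<>(); }
+//     };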
+
+class data_sink {
+ std::unique_ptr<data_sink_impl> _dsi;
+public:
+ data_sink() noexcept = default;
+ explicit data_sink(std::unique_ptr<data_sink_impl> dsi) noexcept : _dsi(std::move(dsi)) {}
+ data_sink(data_sink&& x) noexcept = default;
+ data_sink& operator=(data_sink&& x) noexcept = default;
+ temporary_buffer<char> allocate_buffer(size_t size) {
+ return _dsi->allocate_buffer(size);
+ }
+ future<> put(std::vector<temporary_buffer<char>> data) {
+ return _dsi->put(std::move(data));
+ }
+ future<> put(temporary_buffer<char> data) {
+ return _dsi->put(std::move(data));
+ }
+ future<> put(net::packet p) {
+ return _dsi->put(std::move(p));
+ }
+ future<> flush() {
+ return _dsi->flush();
+ }
+ future<> close() { return _dsi->close(); }
+};
+
+struct continue_consuming {};
+
+template <typename CharType>
+class stop_consuming {
+public:
+ using tmp_buf = temporary_buffer<CharType>;
+ explicit stop_consuming(tmp_buf buf) : _buf(std::move(buf)) {}
+
+ tmp_buf& get_buffer() { return _buf; }
+ const tmp_buf& get_buffer() const { return _buf; }
+private:
+ tmp_buf _buf;
+};
+
+class skip_bytes {
+public:
+ explicit skip_bytes(uint64_t v) : _value(v) {}
+ uint64_t get_value() const { return _value; }
+private:
+ uint64_t _value;
+};
+
+template <typename CharType>
+class consumption_result {
+public:
+ using stop_consuming_type = stop_consuming<CharType>;
+ using consumption_variant = std::variant<continue_consuming, stop_consuming_type, skip_bytes>;
+ using tmp_buf = typename stop_consuming_type::tmp_buf;
+
+ /*[[deprecated]]*/ consumption_result(std::optional<tmp_buf> opt_buf) {
+ if (opt_buf) {
+ _result = stop_consuming_type{std::move(opt_buf.value())};
+ }
+ }
+
+ consumption_result(const continue_consuming&) {}
+ consumption_result(stop_consuming_type&& stop) : _result(std::move(stop)) {}
+ consumption_result(skip_bytes&& skip) : _result(std::move(skip)) {}
+
+ consumption_variant& get() { return _result; }
+ const consumption_variant& get() const { return _result; }
+
+private:
+ consumption_variant _result;
+};
+
+// Consumer concept, for consume() method
+SEASTAR_CONCEPT(
+// The consumer should operate on the data given to it, and
+// return a future "consumption result", which can be
+// - continue_consuming, if the consumer has consumed all the input given
+// to it and is ready for more
+// - stop_consuming, when the consumer is done (and in that case
+// the contained buffer is the unconsumed part of the last data buffer - this
+// can also happen to be empty).
+// - skip_bytes, when the consumer has consumed all the input given to it
+// and wants to skip before processing the next chunk
+//
+// For backward compatibility reasons, we also support the deprecated return value
+// of type "unconsumed remainder" which can be
+// - empty optional, if the consumer consumed all the input given to it
+// and is ready for more
+// - non-empty optional, when the consumer is done (and in that case
+// the value is the unconsumed part of the last data buffer - this
+// can also happen to be empty).
+
+template <typename Consumer, typename CharType>
+concept InputStreamConsumer = requires (Consumer c) {
+ { c(temporary_buffer<CharType>{}) } -> std::same_as<future<consumption_result<CharType>>>;
+};
+
+template <typename Consumer, typename CharType>
+concept ObsoleteInputStreamConsumer = requires (Consumer c) {
+ { c(temporary_buffer<CharType>{}) } -> std::same_as<future<std::optional<temporary_buffer<CharType>>>>;
+};
+)
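+
+// A minimal consumer sketch satisfying the InputStreamConsumer protocol
+// described above (the type name is illustrative, not part of this header);
+// it scans for a newline, stops consuming when one is found, and hands the
+// unconsumed tail back to the stream. It would be used as
+// in.consume(line_consumer{}).
+//
+//   struct line_consumer {
+//       future<consumption_result<char>> operator()(temporary_buffer<char> buf) {
+//           for (size_t i = 0; i < buf.size(); ++i) {
+//               if (buf[i] == '\n') {
+//                   buf.trim_front(i + 1);
+//                   return make_ready_future<consumption_result<char>>(
+//                           stop_consuming<char>(std::move(buf)));
+//               }
+//           }
+//           // Consumed everything we were given; ask for more.
+//           return make_ready_future<consumption_result<char>>(continue_consuming{});
+//       }
+//   };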
+
+/// Buffers data from a data_source and provides a stream interface to the user.
+///
+/// \note All methods must be called sequentially. That is, no method may be
+/// invoked before the previous method's returned future is resolved.
+template <typename CharType>
+class input_stream final {
+ static_assert(sizeof(CharType) == 1, "must buffer stream of bytes");
+ data_source _fd;
+ temporary_buffer<CharType> _buf;
+ bool _eof = false;
+private:
+ using tmp_buf = temporary_buffer<CharType>;
+ size_t available() const { return _buf.size(); }
+protected:
+ void reset() { _buf = {}; }
+ data_source* fd() { return &_fd; }
+public:
+ using consumption_result_type = consumption_result<CharType>;
+ // unconsumed_remainder is mapped for compatibility only; new code should use consumption_result_type
+ using unconsumed_remainder = std::optional<tmp_buf>;
+ using char_type = CharType;
+ input_stream() noexcept = default;
+ explicit input_stream(data_source fd) noexcept : _fd(std::move(fd)), _buf() {}
+ input_stream(input_stream&&) = default;
+ input_stream& operator=(input_stream&&) = default;
+ /// Reads n bytes from the stream, or fewer if reached the end of stream.
+ ///
+ /// \returns a future that waits until n bytes are available in the
+ /// stream and returns them. If the end of stream is reached before n
+ /// bytes were read, fewer than n bytes will be returned - so despite
+ /// the method's name, the caller must not assume the returned buffer
+ /// will always contain exactly n bytes.
+ ///
+ /// \throws if an I/O error occurs during the read. As explained above,
+ /// prematurely reaching the end of stream is *not* an I/O error.
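+ ///
+ /// Example (illustrative sketch):
+ ///
+ /// \code
+ /// return in.read_exactly(4).then([] (temporary_buffer<char> header) {
+ ///     if (header.size() < 4) {
+ ///         throw std::runtime_error("unexpected end of stream");
+ ///     }
+ ///     // parse the 4-byte header here
+ /// });
+ /// \endcode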
+ future<temporary_buffer<CharType>> read_exactly(size_t n);
+ template <typename Consumer>
+ SEASTAR_CONCEPT(requires InputStreamConsumer<Consumer, CharType> || ObsoleteInputStreamConsumer<Consumer, CharType>)
+ future<> consume(Consumer&& c);
+ template <typename Consumer>
+ SEASTAR_CONCEPT(requires InputStreamConsumer<Consumer, CharType> || ObsoleteInputStreamConsumer<Consumer, CharType>)
+ future<> consume(Consumer& c);
+ bool eof() const { return _eof; }
+ /// Returns some data from the stream, or an empty buffer on end of
+ /// stream.
+ future<tmp_buf> read();
+ /// Returns up to n bytes from the stream, or an empty buffer on end of
+ /// stream.
+ future<tmp_buf> read_up_to(size_t n);
+ /// Detaches the \c input_stream from the underlying data source.
+ ///
+ /// Waits for any background operations (for example, read-ahead) to
+ /// complete, so that any resources the stream is using can be
+ /// safely destroyed. An example is a \ref file resource used by
+ /// the stream returned by make_file_input_stream().
+ ///
+ /// \return a future that becomes ready when this stream no longer
+ /// needs the data source.
+ future<> close() {
+ return _fd.close();
+ }
+ /// Ignores n next bytes from the stream.
+ future<> skip(uint64_t n);
+
+ /// Detaches the underlying \c data_source from the \c input_stream.
+ ///
+ /// The intended usage is custom \c data_source_impl implementations
+ /// wrapping an existing \c input_stream, therefore it shouldn't be
+ /// called on an \c input_stream that was already used.
+ /// After calling \c detach() the \c input_stream is in an unusable,
+ /// moved-from state.
+ ///
+ /// \throws std::logic_error if called on a used stream
+ ///
+ /// \returns the data_source
+ data_source detach() &&;
+private:
+ future<temporary_buffer<CharType>> read_exactly_part(size_t n, tmp_buf buf, size_t completed);
+};
+
+/// Facilitates data buffering before it's handed over to data_sink.
+///
+/// When trim_to_size is true it's guaranteed that data sink will not receive
+/// chunks larger than the configured size, which could be the case when a
+/// single write call is made with data larger than the configured size.
+///
+/// The data sink will not receive empty chunks.
+///
+/// \note All methods must be called sequentially. That is, no method
+/// may be invoked before the previous method's returned future is
+/// resolved.
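+///
+/// Example (illustrative sketch; \c out is assumed to be an \c output_stream<char>
+/// obtained elsewhere, for example from a connected socket or a file stream):
+///
+/// \code
+/// return out.write("hello\n", 6).then([&out] {
+///     return out.flush();
+/// });
+/// \endcode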
+template <typename CharType>
+class output_stream final {
+ static_assert(sizeof(CharType) == 1, "must buffer stream of bytes");
+ data_sink _fd;
+ temporary_buffer<CharType> _buf;
+ net::packet _zc_bufs = net::packet::make_null_packet(); //zero copy buffers
+ size_t _size = 0;
+ size_t _begin = 0;
+ size_t _end = 0;
+ bool _trim_to_size = false;
+ bool _batch_flushes = false;
+ std::optional<promise<>> _in_batch;
+ bool _flush = false;
+ bool _flushing = false;
+ std::exception_ptr _ex;
+private:
+ size_t available() const { return _end - _begin; }
+ size_t possibly_available() const { return _size - _begin; }
+ future<> split_and_put(temporary_buffer<CharType> buf);
+ future<> put(temporary_buffer<CharType> buf);
+ void poll_flush();
+ future<> zero_copy_put(net::packet p);
+ future<> zero_copy_split_and_put(net::packet p);
+ [[gnu::noinline]]
+ future<> slow_write(const CharType* buf, size_t n);
+public:
+ using char_type = CharType;
+ output_stream() noexcept = default;
+ output_stream(data_sink fd, size_t size, bool trim_to_size = false, bool batch_flushes = false) noexcept
+ : _fd(std::move(fd)), _size(size), _trim_to_size(trim_to_size), _batch_flushes(batch_flushes) {}
+ output_stream(output_stream&&) noexcept = default;
+ output_stream& operator=(output_stream&&) noexcept = default;
+ ~output_stream() { assert(!_in_batch && "Was this stream properly closed?"); }
+ future<> write(const char_type* buf, size_t n);
+ future<> write(const char_type* buf);
+
+ template <typename StringChar, typename SizeType, SizeType MaxSize, bool NulTerminate>
+ future<> write(const basic_sstring<StringChar, SizeType, MaxSize, NulTerminate>& s);
+ future<> write(const std::basic_string<char_type>& s);
+
+ future<> write(net::packet p);
+ future<> write(scattered_message<char_type> msg);
+ future<> write(temporary_buffer<char_type>);
+ future<> flush();
+
+ /// Flushes the stream before closing it (and the underlying data sink) to
+ /// any further writes. The resulting future must be waited on before
+ /// destroying this object.
+ future<> close();
+
+ /// Detaches the underlying \c data_sink from the \c output_stream.
+ ///
+ /// The intended usage is custom \c data_sink_impl implementations
+ /// wrapping an existing \c output_stream, therefore it shouldn't be
+ /// called on an \c output_stream that was already used.
+ /// After calling \c detach() the \c output_stream is in an unusable,
+ /// moved-from state.
+ ///
+ /// \throws std::logic_error if called on a used stream
+ ///
+ /// \returns the data_sink
+ data_sink detach() &&;
+private:
+ friend class reactor;
+};
+
+/*!
+ * \brief copy all the content from the input stream to the output stream
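+ *
+ * Example (illustrative sketch; the streams are assumed to outlive the
+ * returned future, here by handing them to \c do_with):
+ *
+ * \code
+ * return do_with(std::move(in), std::move(out),
+ *         [] (input_stream<char>& in, output_stream<char>& out) {
+ *     return copy(in, out).then([&out] {
+ *         return out.close();
+ *     });
+ * });
+ * \endcode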
+ */
+template <typename CharType>
+future<> copy(input_stream<CharType>&, output_stream<CharType>&);
+
+}
+
+#include "iostream-impl.hh"
diff --git a/src/seastar/include/seastar/core/layered_file.hh b/src/seastar/include/seastar/core/layered_file.hh
new file mode 100644
index 000000000..aa150e37e
--- /dev/null
+++ b/src/seastar/include/seastar/core/layered_file.hh
@@ -0,0 +1,67 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2020 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/core/file.hh>
+
+namespace seastar {
+
+/// \addtogroup fileio-module
+/// @{
+
+/// Base class for layered file implementations.
+///
+/// A layered file implementation implements `file_impl` virtual
+/// functions such as dma_read() by forwarding them to another, existing
+/// file called the underlying file. This base class simplifies construction
+/// of layered files by performing standard tasks such as setting up the
+/// file alignment. Actual implementation of the I/O methods is left for the
+/// derived class.
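+///
+/// A minimal derivation sketch (the type name is illustrative; a real
+/// implementation must override the remaining \c file_impl virtual I/O
+/// functions, typically forwarding each one to underlying_file()):
+///
+/// \code
+/// class passthrough_file_impl : public layered_file_impl {
+/// public:
+///     using layered_file_impl::layered_file_impl;
+///     // Override file_impl's I/O virtuals here, e.g. implement reads in
+///     // terms of underlying_file().dma_read(...), writes in terms of
+///     // underlying_file().dma_write(...), and so on.
+/// };
+/// \endcode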
+class layered_file_impl : public file_impl {
+protected:
+ file _underlying_file;
+public:
+ /// Constructs a layered file. This sets up the underlying_file() method
+ /// and initializes alignment constants to be the same as the underlying file.
+ explicit layered_file_impl(file underlying_file) noexcept
+ : _underlying_file(std::move(underlying_file)) {
+ _memory_dma_alignment = _underlying_file.memory_dma_alignment();
+ _disk_read_dma_alignment = _underlying_file.disk_read_dma_alignment();
+ _disk_write_dma_alignment = _underlying_file.disk_write_dma_alignment();
+ }
+
+ /// The underlying file which can be used to back I/O methods.
+ file& underlying_file() noexcept {
+ return _underlying_file;
+ }
+
+ /// The underlying file which can be used to back I/O methods.
+ const file& underlying_file() const noexcept {
+ return _underlying_file;
+ }
+};
+
+
+/// @}
+
+
+}
diff --git a/src/seastar/include/seastar/core/linux-aio.hh b/src/seastar/include/seastar/core/linux-aio.hh
new file mode 100644
index 000000000..1e4eef211
--- /dev/null
+++ b/src/seastar/include/seastar/core/linux-aio.hh
@@ -0,0 +1,234 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+#pragma once
+
+#include <endian.h>
+#include <sys/time.h>
+#include <sys/uio.h>
+#include <signal.h>
+#include <cstdint>
+
+namespace seastar {
+
+namespace internal {
+
+namespace linux_abi {
+
+using aio_context_t = unsigned long;
+
+enum class iocb_cmd : uint16_t {
+ PREAD = 0,
+ PWRITE = 1,
+ FSYNC = 2,
+ FDSYNC = 3,
+ POLL = 5,
+ NOOP = 6,
+ PREADV = 7,
+ PWRITEV = 8,
+};
+
+struct io_event {
+ uint64_t data;
+ uint64_t obj;
+ int64_t res;
+ int64_t res2;
+};
+
+constexpr int IOCB_FLAG_RESFD = 1;
+
+struct iocb {
+ uint64_t aio_data;
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ uint32_t aio_key;
+ int32_t aio_rw_flags;
+#elif __BYTE_ORDER == __BIG_ENDIAN
+ int32_t aio_rw_flags;
+ uint32_t aio_key;
+#else
+#error bad byteorder
+#endif
+
+ iocb_cmd aio_lio_opcode;
+ int16_t aio_reqprio;
+ uint32_t aio_fildes;
+
+ uint64_t aio_buf;
+ uint64_t aio_nbytes;
+ int64_t aio_offset;
+
+ uint64_t aio_reserved2;
+
+ uint32_t aio_flags;
+
+ uint32_t aio_resfd;
+};
+
+struct aio_sigset {
+ const sigset_t *sigmask;
+ size_t sigsetsize;
+};
+
+}
+
+linux_abi::iocb make_read_iocb(int fd, uint64_t offset, void* buffer, size_t len);
+linux_abi::iocb make_write_iocb(int fd, uint64_t offset, const void* buffer, size_t len);
+linux_abi::iocb make_readv_iocb(int fd, uint64_t offset, const ::iovec* iov, size_t niov);
+linux_abi::iocb make_writev_iocb(int fd, uint64_t offset, const ::iovec* iov, size_t niov);
+linux_abi::iocb make_poll_iocb(int fd, uint32_t events);
+
+void set_user_data(linux_abi::iocb& iocb, void* data);
+void* get_user_data(const linux_abi::iocb& iocb);
+void set_nowait(linux_abi::iocb& iocb, bool nowait);
+
+void set_eventfd_notification(linux_abi::iocb& iocb, int eventfd);
+
+linux_abi::iocb* get_iocb(const linux_abi::io_event& ioev);
+
+int io_setup(int nr_events, linux_abi::aio_context_t* io_context);
+int io_destroy(linux_abi::aio_context_t io_context);
+int io_submit(linux_abi::aio_context_t io_context, long nr, linux_abi::iocb** iocbs);
+int io_cancel(linux_abi::aio_context_t io_context, linux_abi::iocb* iocb, linux_abi::io_event* result);
+int io_getevents(linux_abi::aio_context_t io_context, long min_nr, long nr, linux_abi::io_event* events, const ::timespec* timeout,
+ bool force_syscall = false);
+int io_pgetevents(linux_abi::aio_context_t io_context, long min_nr, long nr, linux_abi::io_event* events, const ::timespec* timeout, const sigset_t* sigmask,
+ bool force_syscall = false);
+
+void setup_aio_context(size_t nr, linux_abi::aio_context_t* io_context);
+
+}
+
+extern bool aio_nowait_supported;
+
+namespace internal {
+
+inline
+linux_abi::iocb
+make_read_iocb(int fd, uint64_t offset, void* buffer, size_t len) {
+ linux_abi::iocb iocb{};
+ iocb.aio_lio_opcode = linux_abi::iocb_cmd::PREAD;
+ iocb.aio_fildes = fd;
+ iocb.aio_offset = offset;
+ iocb.aio_buf = reinterpret_cast<uintptr_t>(buffer);
+ iocb.aio_nbytes = len;
+ return iocb;
+}
+
+inline
+linux_abi::iocb
+make_write_iocb(int fd, uint64_t offset, const void* buffer, size_t len) {
+ linux_abi::iocb iocb{};
+ iocb.aio_lio_opcode = linux_abi::iocb_cmd::PWRITE;
+ iocb.aio_fildes = fd;
+ iocb.aio_offset = offset;
+ iocb.aio_buf = reinterpret_cast<uintptr_t>(buffer);
+ iocb.aio_nbytes = len;
+ return iocb;
+}
+
+inline
+linux_abi::iocb
+make_readv_iocb(int fd, uint64_t offset, const ::iovec* iov, size_t niov) {
+ linux_abi::iocb iocb{};
+ iocb.aio_lio_opcode = linux_abi::iocb_cmd::PREADV;
+ iocb.aio_fildes = fd;
+ iocb.aio_offset = offset;
+ iocb.aio_buf = reinterpret_cast<uintptr_t>(iov);
+ iocb.aio_nbytes = niov;
+ return iocb;
+}
+
+inline
+linux_abi::iocb
+make_writev_iocb(int fd, uint64_t offset, const ::iovec* iov, size_t niov) {
+ linux_abi::iocb iocb{};
+ iocb.aio_lio_opcode = linux_abi::iocb_cmd::PWRITEV;
+ iocb.aio_fildes = fd;
+ iocb.aio_offset = offset;
+ iocb.aio_buf = reinterpret_cast<uintptr_t>(iov);
+ iocb.aio_nbytes = niov;
+ return iocb;
+}
+
+inline
+linux_abi::iocb
+make_poll_iocb(int fd, uint32_t events) {
+ linux_abi::iocb iocb{};
+ iocb.aio_lio_opcode = linux_abi::iocb_cmd::POLL;
+ iocb.aio_fildes = fd;
+ iocb.aio_buf = events;
+ return iocb;
+}
+
+inline
+linux_abi::iocb
+make_fdsync_iocb(int fd) {
+ linux_abi::iocb iocb{};
+ iocb.aio_lio_opcode = linux_abi::iocb_cmd::FDSYNC;
+ iocb.aio_fildes = fd;
+ return iocb;
+}
+
+inline
+void
+set_user_data(linux_abi::iocb& iocb, void* data) {
+ iocb.aio_data = reinterpret_cast<uintptr_t>(data);
+}
+
+inline
+void*
+get_user_data(const linux_abi::iocb& iocb) {
+ return reinterpret_cast<void*>(uintptr_t(iocb.aio_data));
+}
+
+inline
+void
+set_eventfd_notification(linux_abi::iocb& iocb, int eventfd) {
+ iocb.aio_flags |= linux_abi::IOCB_FLAG_RESFD;
+ iocb.aio_resfd = eventfd;
+}
+
+inline
+linux_abi::iocb*
+get_iocb(const linux_abi::io_event& ev) {
+ return reinterpret_cast<linux_abi::iocb*>(uintptr_t(ev.obj));
+}
+
+inline
+void
+set_nowait(linux_abi::iocb& iocb, bool nowait) {
+#ifdef RWF_NOWAIT
+ if (aio_nowait_supported) {
+ if (nowait) {
+ iocb.aio_rw_flags |= RWF_NOWAIT;
+ } else {
+ iocb.aio_rw_flags &= ~RWF_NOWAIT;
+ }
+ }
+#endif
+}
+
+}
+
+
+}
+
diff --git a/src/seastar/include/seastar/core/loop.hh b/src/seastar/include/seastar/core/loop.hh
new file mode 100644
index 000000000..a7a0145a8
--- /dev/null
+++ b/src/seastar/include/seastar/core/loop.hh
@@ -0,0 +1,715 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (C) 2020 ScyllaDB.
+ */
+
+#pragma once
+
+#include <iterator>
+#include <memory>
+#include <vector>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/task.hh>
+#include <seastar/util/bool_class.hh>
+#include <seastar/core/semaphore.hh>
+
+namespace seastar {
+
+/// \addtogroup future-util
+/// @{
+
+// The AsyncAction concept represents an action which can complete later than
+// the actual function invocation. It is represented by a function which
+// returns a future which resolves when the action is done.
+
+struct stop_iteration_tag { };
+using stop_iteration = bool_class<stop_iteration_tag>;
+
+namespace internal {
+
+template <typename AsyncAction>
+class repeater final : public continuation_base<stop_iteration> {
+ promise<> _promise;
+ AsyncAction _action;
+public:
+ explicit repeater(AsyncAction&& action) : _action(std::move(action)) {}
+ future<> get_future() { return _promise.get_future(); }
+ task* waiting_task() noexcept override { return _promise.waiting_task(); }
+ virtual void run_and_dispose() noexcept override {
+ if (_state.failed()) {
+ _promise.set_exception(std::move(_state).get_exception());
+ delete this;
+ return;
+ } else {
+ if (_state.get0() == stop_iteration::yes) {
+ _promise.set_value();
+ delete this;
+ return;
+ }
+ _state = {};
+ }
+ try {
+ do {
+ auto f = futurize_invoke(_action);
+ if (!f.available()) {
+ internal::set_callback(f, this);
+ return;
+ }
+ if (f.get0() == stop_iteration::yes) {
+ _promise.set_value();
+ delete this;
+ return;
+ }
+ } while (!need_preempt());
+ } catch (...) {
+ _promise.set_exception(std::current_exception());
+ delete this;
+ return;
+ }
+ _state.set(stop_iteration::no);
+ schedule(this);
+ }
+};
+
+} // namespace internal
+
+// Delete these overloads so that the actual implementation can use a
+// universal reference but still reject lvalue references.
+template<typename AsyncAction>
+future<> repeat(const AsyncAction& action) noexcept = delete;
+template<typename AsyncAction>
+future<> repeat(AsyncAction& action) noexcept = delete;
+
+/// Invokes given action until it fails or the function requests iteration to stop by returning
+/// \c stop_iteration::yes.
+///
+/// \param action a callable taking no arguments, returning a future<stop_iteration>. Will
+/// be called again as soon as the future resolves, unless the
+/// future fails, action throws, or it resolves with \c stop_iteration::yes.
+/// If \c action is an r-value it can be moved in the middle of iteration.
+/// \return a ready future if we stopped successfully, or a failed future if
+/// a call to \c action failed.
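+///
+/// Example (illustrative sketch; assumes <seastar/core/do_with.hh> for keeping
+/// the counter alive across iterations):
+///
+/// \code
+/// return do_with(0, [] (int& count) {
+///     return repeat([&count] {
+///         return make_ready_future<stop_iteration>(
+///                 count++ < 10 ? stop_iteration::no : stop_iteration::yes);
+///     });
+/// });
+/// \endcode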
+template<typename AsyncAction>
+SEASTAR_CONCEPT( requires seastar::InvokeReturns<AsyncAction, stop_iteration> || seastar::InvokeReturns<AsyncAction, future<stop_iteration>> )
+inline
+future<> repeat(AsyncAction&& action) noexcept {
+ using futurator = futurize<std::result_of_t<AsyncAction()>>;
+ static_assert(std::is_same<future<stop_iteration>, typename futurator::type>::value, "bad AsyncAction signature");
+ for (;;) {
+ // Do not type-erase here in case this is a short repeat()
+ auto f = futurator::invoke(action);
+
+ if (!f.available() || f.failed() || need_preempt()) {
+ return [&] () noexcept {
+ memory::scoped_critical_alloc_section _;
+ auto repeater = new internal::repeater<AsyncAction>(std::move(action));
+ auto ret = repeater->get_future();
+ internal::set_callback(f, repeater);
+ return ret;
+ }();
+ }
+
+ if (f.get0() == stop_iteration::yes) {
+ return make_ready_future<>();
+ }
+ }
+}
+
+/// \cond internal
+
+template <typename T>
+struct repeat_until_value_type_helper;
+
+/// Type helper for repeat_until_value()
+template <typename T>
+struct repeat_until_value_type_helper<future<std::optional<T>>> {
+ /// The type of the value we are computing
+ using value_type = T;
+ /// Type used by \c AsyncAction while looping
+ using optional_type = std::optional<T>;
+ /// Return type of repeat_until_value()
+ using future_type = future<value_type>;
+};
+
+/// Return value of repeat_until_value()
+template <typename AsyncAction>
+using repeat_until_value_return_type
+ = typename repeat_until_value_type_helper<typename futurize<std::result_of_t<AsyncAction()>>::type>::future_type;
+
+/// \endcond
+
+namespace internal {
+
+template <typename AsyncAction, typename T>
+class repeat_until_value_state final : public continuation_base<std::optional<T>> {
+ promise<T> _promise;
+ AsyncAction _action;
+public:
+ explicit repeat_until_value_state(AsyncAction action) : _action(std::move(action)) {}
+ repeat_until_value_state(std::optional<T> st, AsyncAction action) : repeat_until_value_state(std::move(action)) {
+ this->_state.set(std::move(st));
+ }
+ future<T> get_future() { return _promise.get_future(); }
+ task* waiting_task() noexcept override { return _promise.waiting_task(); }
+ virtual void run_and_dispose() noexcept override {
+ if (this->_state.failed()) {
+ _promise.set_exception(std::move(this->_state).get_exception());
+ delete this;
+ return;
+ } else {
+ auto v = std::move(this->_state).get0();
+ if (v) {
+ _promise.set_value(std::move(*v));
+ delete this;
+ return;
+ }
+ this->_state = {};
+ }
+ try {
+ do {
+ auto f = futurize_invoke(_action);
+ if (!f.available()) {
+ internal::set_callback(f, this);
+ return;
+ }
+ auto ret = f.get0();
+ if (ret) {
+ _promise.set_value(std::move(*ret));
+ delete this;
+ return;
+ }
+ } while (!need_preempt());
+ } catch (...) {
+ _promise.set_exception(std::current_exception());
+ delete this;
+ return;
+ }
+ this->_state.set(std::nullopt);
+ schedule(this);
+ }
+};
+
+} // namespace internal
+
+/// Invokes given action until it fails or the function requests iteration to stop by returning
+/// an engaged \c future<std::optional<T>> or std::optional<T>. The value is extracted
+/// from the \c optional, and returned, as a future, from repeat_until_value().
+///
+/// \param action a callable taking no arguments, returning a future<std::optional<T>>
+/// or std::optional<T>. Will be called again as soon as the future
+/// resolves, unless the future fails, action throws, or it resolves with
+/// an engaged \c optional. If \c action is an r-value it can be moved
+/// in the middle of iteration.
+/// \return a ready future if we stopped successfully, or a failed future if
+/// a call to \c action failed. The \c optional's value is returned.
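+///
+/// Example (illustrative sketch; \c try_dequeue() is a hypothetical function
+/// returning \c std::optional<int>):
+///
+/// \code
+/// future<int> first_item() {
+///     return repeat_until_value([] () -> std::optional<int> {
+///         return try_dequeue();
+///     });
+/// }
+/// \endcode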
+template<typename AsyncAction>
+SEASTAR_CONCEPT( requires requires (AsyncAction aa) {
+ bool(futurize_invoke(aa).get0());
+ futurize_invoke(aa).get0().value();
+} )
+repeat_until_value_return_type<AsyncAction>
+repeat_until_value(AsyncAction action) noexcept {
+ using futurator = futurize<std::result_of_t<AsyncAction()>>;
+ using type_helper = repeat_until_value_type_helper<typename futurator::type>;
+ // the "T" in the documentation
+ using value_type = typename type_helper::value_type;
+ using optional_type = typename type_helper::optional_type;
+ do {
+ auto f = futurator::invoke(action);
+
+ if (!f.available()) {
+ return [&] () noexcept {
+ memory::scoped_critical_alloc_section _;
+ auto state = new internal::repeat_until_value_state<AsyncAction, value_type>(std::move(action));
+ auto ret = state->get_future();
+ internal::set_callback(f, state);
+ return ret;
+ }();
+ }
+
+ if (f.failed()) {
+ return make_exception_future<value_type>(f.get_exception());
+ }
+
+ optional_type&& optional = std::move(f).get0();
+ if (optional) {
+ return make_ready_future<value_type>(std::move(optional.value()));
+ }
+ } while (!need_preempt());
+
+ try {
+ auto state = new internal::repeat_until_value_state<AsyncAction, value_type>(std::nullopt, std::move(action));
+ auto f = state->get_future();
+ schedule(state);
+ return f;
+ } catch (...) {
+ return make_exception_future<value_type>(std::current_exception());
+ }
+}
+
+namespace internal {
+
+template <typename StopCondition, typename AsyncAction>
+class do_until_state final : public continuation_base<> {
+ promise<> _promise;
+ StopCondition _stop;
+ AsyncAction _action;
+public:
+ explicit do_until_state(StopCondition stop, AsyncAction action) : _stop(std::move(stop)), _action(std::move(action)) {}
+ future<> get_future() { return _promise.get_future(); }
+ task* waiting_task() noexcept override { return _promise.waiting_task(); }
+ virtual void run_and_dispose() noexcept override {
+ if (_state.available()) {
+ if (_state.failed()) {
+ _promise.set_urgent_state(std::move(_state));
+ delete this;
+ return;
+ }
+ _state = {}; // allow next cycle to overrun state
+ }
+ try {
+ do {
+ if (_stop()) {
+ _promise.set_value();
+ delete this;
+ return;
+ }
+ auto f = _action();
+ if (!f.available()) {
+ internal::set_callback(f, this);
+ return;
+ }
+ if (f.failed()) {
+ f.forward_to(std::move(_promise));
+ delete this;
+ return;
+ }
+ } while (!need_preempt());
+ } catch (...) {
+ _promise.set_exception(std::current_exception());
+ delete this;
+ return;
+ }
+ schedule(this);
+ }
+};
+
+} // namespace internal
+
+/// Invokes given action until it fails or given condition evaluates to true.
+///
+/// \param stop_cond a callable taking no arguments, returning a boolean that
+/// evaluates to true when you don't want to call \c action
+/// any longer
+/// \param action a callable taking no arguments, returning a future<>. Will
+/// be called again as soon as the future resolves, unless the
+/// future fails, or \c stop_cond returns \c true.
+/// \return a ready future if we stopped successfully, or a failed future if
+/// a call to \c action failed.
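+///
+/// Example (illustrative sketch; \c in is an \c input_stream<char> assumed to
+/// outlive the loop):
+///
+/// \code
+/// return do_until([&in] { return in.eof(); }, [&in] {
+///     return in.read().then([] (temporary_buffer<char> buf) {
+///         // process buf here
+///     });
+/// });
+/// \endcode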
+template<typename AsyncAction, typename StopCondition>
+SEASTAR_CONCEPT( requires seastar::InvokeReturns<StopCondition, bool> && seastar::InvokeReturns<AsyncAction, future<>> )
+inline
+future<> do_until(StopCondition stop_cond, AsyncAction action) noexcept {
+ using namespace internal;
+ for (;;) {
+ if (stop_cond()) {
+ return make_ready_future<>();
+ }
+ auto f = futurize_invoke(action);
+ if (f.failed()) {
+ return f;
+ }
+ if (!f.available() || need_preempt()) {
+ return [&] () noexcept {
+ memory::scoped_critical_alloc_section _;
+ auto task = new do_until_state<StopCondition, AsyncAction>(std::move(stop_cond), std::move(action));
+ auto ret = task->get_future();
+ internal::set_callback(f, task);
+ return ret;
+ }();
+ }
+ }
+}
+
+/// Invoke given action until it fails.
+///
+/// Calls \c action repeatedly until it returns a failed future.
+///
+/// \param action a callable taking no arguments, returning a \c future<>
+/// that becomes ready when you wish it to be called again.
+/// \return a future<> that will resolve to the first failure of \c action
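+///
+/// Example (illustrative sketch; assumes <seastar/core/sleep.hh>):
+///
+/// \code
+/// // Runs the periodic work forever, until sleep() or the work itself fails.
+/// return keep_doing([] {
+///     return seastar::sleep(std::chrono::seconds(1)).then([] {
+///         // periodic work here
+///     });
+/// });
+/// \endcode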
+template<typename AsyncAction>
+SEASTAR_CONCEPT( requires seastar::InvokeReturns<AsyncAction, future<>> )
+inline
+future<> keep_doing(AsyncAction action) noexcept {
+ return repeat([action = std::move(action)] () mutable {
+ return action().then([] {
+ return stop_iteration::no;
+ });
+ });
+}
+
+namespace internal {
+template <typename Iterator, typename AsyncAction>
+class do_for_each_state final : public continuation_base<> {
+ Iterator _begin;
+ Iterator _end;
+ AsyncAction _action;
+ promise<> _pr;
+
+public:
+ do_for_each_state(Iterator begin, Iterator end, AsyncAction action, future<> first_unavailable)
+ : _begin(std::move(begin)), _end(std::move(end)), _action(std::move(action)) {
+ internal::set_callback(first_unavailable, this);
+ }
+ virtual void run_and_dispose() noexcept override {
+ std::unique_ptr<do_for_each_state> zis(this);
+ if (_state.failed()) {
+ _pr.set_urgent_state(std::move(_state));
+ return;
+ }
+ while (_begin != _end) {
+ auto f = futurize_invoke(_action, *_begin++);
+ if (f.failed()) {
+ f.forward_to(std::move(_pr));
+ return;
+ }
+ if (!f.available() || need_preempt()) {
+ _state = {};
+ internal::set_callback(f, this);
+ zis.release();
+ return;
+ }
+ }
+ _pr.set_value();
+ }
+ task* waiting_task() noexcept override {
+ return _pr.waiting_task();
+ }
+ future<> get_future() {
+ return _pr.get_future();
+ }
+};
+
+template<typename Iterator, typename AsyncAction>
+inline
+future<> do_for_each_impl(Iterator begin, Iterator end, AsyncAction action) {
+ while (begin != end) {
+ auto f = futurize_invoke(action, *begin++);
+ if (f.failed()) {
+ return f;
+ }
+ if (!f.available() || need_preempt()) {
+ auto* s = new internal::do_for_each_state<Iterator, AsyncAction>{
+ std::move(begin), std::move(end), std::move(action), std::move(f)};
+ return s->get_future();
+ }
+ }
+ return make_ready_future<>();
+}
+} // namespace internal
+
+/// \addtogroup future-util
+
+/// \brief Call a function for each item in a range, sequentially (iterator version).
+///
+/// For each item in a range, call a function, waiting for the previous
+/// invocation to complete before calling the next one.
+///
+/// \param begin an \c InputIterator designating the beginning of the range
+/// \param end an \c InputIterator designating the end of the range
+/// \param action a callable, taking a reference to objects from the range
+/// as a parameter, and returning a \c future<> that resolves
+/// when it is acceptable to process the next item.
+/// \return a ready future on success, or the first failed future if
+/// \c action failed.
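+///
+/// Example (illustrative sketch; \c bufs is a container of
+/// \c temporary_buffer<char> and \c out an \c output_stream<char>, both assumed
+/// to outlive the loop):
+///
+/// \code
+/// return do_for_each(bufs.begin(), bufs.end(), [&out] (temporary_buffer<char>& b) {
+///     return out.write(b.get(), b.size());
+/// });
+/// \endcode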
+template<typename Iterator, typename AsyncAction>
+SEASTAR_CONCEPT( requires requires (Iterator i, AsyncAction aa) {
+ { futurize_invoke(aa, *i) } -> std::same_as<future<>>;
+} )
+inline
+future<> do_for_each(Iterator begin, Iterator end, AsyncAction action) noexcept {
+ try {
+ return internal::do_for_each_impl(std::move(begin), std::move(end), std::move(action));
+ } catch (...) {
+ return current_exception_as_future();
+ }
+}
+
+/// \brief Call a function for each item in a range, sequentially (range version).
+///
+/// For each item in a range, call a function, waiting for the previous
+/// invocation to complete before calling the next one.
+///
+/// \param c a \c Container object designating the input range
+/// \param action a callable, taking a reference to objects from the range
+/// as a parameter, and returning a \c future<> that resolves
+/// when it is acceptable to process the next item.
+/// \return a ready future on success, or the first failed future if
+/// \c action failed.
+template<typename Container, typename AsyncAction>
+SEASTAR_CONCEPT( requires requires (Container c, AsyncAction aa) {
+ { futurize_invoke(aa, *c.begin()) } -> std::same_as<future<>>;
+} )
+inline
+future<> do_for_each(Container& c, AsyncAction action) noexcept {
+ try {
+ return internal::do_for_each_impl(std::begin(c), std::end(c), std::move(action));
+ } catch (...) {
+ return current_exception_as_future();
+ }
+}
+
+namespace internal {
+
+template <typename Iterator, typename IteratorCategory>
+inline
+size_t
+iterator_range_estimate_vector_capacity(Iterator begin, Iterator end, IteratorCategory category) {
+ // For InputIterators we can't estimate needed capacity
+ return 0;
+}
+
+template <typename Iterator>
+inline
+size_t
+iterator_range_estimate_vector_capacity(Iterator begin, Iterator end, std::forward_iterator_tag category) {
+ // May be linear time below random_access_iterator_tag, but still better than reallocation
+ return std::distance(begin, end);
+}
+
+} // namespace internal
+
+/// \cond internal
+
+class parallel_for_each_state final : private continuation_base<> {
+ std::vector<future<>> _incomplete;
+ promise<> _result;
+ std::exception_ptr _ex;
+private:
+ // Wait for one of the futures in _incomplete to complete, and then
+ // decide what to do: wait for another one, or deliver _result if all
+ // are complete.
+ void wait_for_one() noexcept;
+ virtual void run_and_dispose() noexcept override;
+ task* waiting_task() noexcept override { return _result.waiting_task(); }
+public:
+ parallel_for_each_state(size_t n);
+ void add_future(future<>&& f);
+ future<> get_future();
+};
+
+/// \endcond
+
+/// \brief Run tasks in parallel (iterator version).
+///
+/// Given a range [\c begin, \c end) of objects, run \c func on each \c *i in
+/// the range, and return a future<> that resolves when all the functions
+/// complete. \c func should return a future<> that indicates when it is
+/// complete. All invocations are performed in parallel. This allows the range
+/// to refer to stack objects, but means that unlike other loops this cannot
+/// check need_preempt and can only be used with small ranges.
+///
+/// \param begin an \c InputIterator designating the beginning of the range
+/// \param end an \c InputIterator designating the end of the range
+/// \param func Function to invoke with each element in the range (returning
+/// a \c future<>)
+/// \return a \c future<> that resolves when all the function invocations
+/// complete. If one or more return an exception, the return value
+/// contains one of the exceptions.
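+///
+/// Example (illustrative sketch; assumes remove_file() from
+/// <seastar/core/seastar.hh> and a \c std::vector<sstring> of names):
+///
+/// \code
+/// return parallel_for_each(names.begin(), names.end(), [] (const sstring& name) {
+///     return remove_file(name);
+/// });
+/// \endcode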
+template <typename Iterator, typename Func>
+SEASTAR_CONCEPT( requires requires (Func f, Iterator i) { { f(*i++) } -> std::same_as<future<>>; } )
+inline
+future<>
+parallel_for_each(Iterator begin, Iterator end, Func&& func) noexcept {
+ parallel_for_each_state* s = nullptr;
+ // Process all elements, giving each future the following treatment:
+ // - available, not failed: do nothing
+ // - available, failed: collect exception in ex
+ // - not available: collect in s (allocating it if needed)
+ while (begin != end) {
+ auto f = futurize_invoke(std::forward<Func>(func), *begin++);
+ if (!f.available() || f.failed()) {
+ if (!s) {
+ memory::scoped_critical_alloc_section _;
+ using itraits = std::iterator_traits<Iterator>;
+ auto n = (internal::iterator_range_estimate_vector_capacity(begin, end, typename itraits::iterator_category()) + 1);
+ s = new parallel_for_each_state(n);
+ }
+ s->add_future(std::move(f));
+ }
+ }
+ // If any futures were not available, hand off to parallel_for_each_state::start().
+ // Otherwise we can return a result immediately.
+ if (s) {
+ // s->get_future() takes ownership of s (and chains it to one of the futures it contains)
+ // so this isn't a leak
+ return s->get_future();
+ }
+ return make_ready_future<>();
+}
+
+/// \brief Run tasks in parallel (range version).
+///
+/// Given a \c range of objects, invoke \c func with each object
+/// in the range, and return a future<> that resolves when all
+/// the functions complete. \c func should return a future<> that indicates
+/// when it is complete. All invocations are performed in parallel. This allows
+/// the range to refer to stack objects, but means that unlike other loops this
+/// cannot check need_preempt and can only be used with small ranges.
+///
+/// \param range A range of objects to run \c func on
+/// \param func A callable, accepting reference to the range's
+/// \c value_type, and returning a \c future<>.
+/// \return a \c future<> that becomes ready when the entire range
+/// was processed. If one or more of the invocations of
+/// \c func returned an exceptional future, then the return
+/// value will contain one of those exceptions.
+
+namespace internal {
+
+template <typename Range, typename Func>
+inline
+future<>
+parallel_for_each_impl(Range&& range, Func&& func) {
+ return parallel_for_each(std::begin(range), std::end(range),
+ std::forward<Func>(func));
+}
+
+} // namespace internal
+
+template <typename Range, typename Func>
+SEASTAR_CONCEPT( requires requires (Func f, Range r) { { f(*r.begin()) } -> std::same_as<future<>>; } )
+inline
+future<>
+parallel_for_each(Range&& range, Func&& func) noexcept {
+ auto impl = internal::parallel_for_each_impl<Range, Func>;
+ return futurize_invoke(impl, std::forward<Range>(range), std::forward<Func>(func));
+}
+
+/// Run a maximum of \c max_concurrent tasks in parallel (iterator version).
+///
+/// Given a range [\c begin, \c end) of objects, run \c func on each \c *i in
+/// the range, and return a future<> that resolves when all the functions
+/// complete. \c func should return a future<> that indicates when it is
+/// complete. Up to \c max_concurrent invocations are performed in parallel.
+/// This does not allow the range to refer to stack objects. The caller
+/// must ensure that the range outlives the call to max_concurrent_for_each
+/// so it can be iterated in the background.
+///
+/// \param begin an \c InputIterator designating the beginning of the range
+/// \param end an \c InputIterator designating the end of the range
+/// \param max_concurrent maximum number of concurrent invocations of \c func, must be greater than zero.
+/// \param func Function to invoke with each element in the range (returning
+/// a \c future<>)
+/// \return a \c future<> that resolves when all the function invocations
+/// complete. If one or more return an exception, the return value
+/// contains one of the exceptions.
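+///
+/// Example (illustrative sketch; \c urls is assumed to outlive the call and
+/// \c fetch() is a hypothetical function returning \c future<>):
+///
+/// \code
+/// return max_concurrent_for_each(urls.begin(), urls.end(), 10, [] (const sstring& url) {
+///     return fetch(url);
+/// });
+/// \endcode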
+template <typename Iterator, typename Func>
+SEASTAR_CONCEPT( requires requires (Func f, Iterator i) { { f(*i++) } -> std::same_as<future<>>; } )
+inline
+future<>
+max_concurrent_for_each(Iterator begin, Iterator end, size_t max_concurrent, Func&& func) noexcept {
+ struct state {
+ Iterator begin;
+ Iterator end;
+ Func func;
+ size_t max_concurrent;
+ semaphore sem;
+ std::exception_ptr err;
+
+ state(Iterator begin_, Iterator end_, size_t max_concurrent_, Func func_)
+ : begin(std::move(begin_))
+ , end(std::move(end_))
+ , func(std::move(func_))
+ , max_concurrent(max_concurrent_)
+ , sem(max_concurrent_)
+ , err()
+ { }
+ };
+
+ assert(max_concurrent > 0);
+
+ try {
+ return do_with(state(std::move(begin), std::move(end), max_concurrent, std::forward<Func>(func)), [] (state& s) {
+ return do_until([&s] { return s.begin == s.end; }, [&s] {
+ return s.sem.wait().then([&s] () mutable noexcept {
+ // Possibly run in background and signal s.sem when the task is done.
+ // The background tasks are waited on using s.sem.
+ (void)futurize_invoke(s.func, *s.begin++).then_wrapped([&s] (future<> fut) {
+ if (fut.failed()) {
+ auto e = fut.get_exception();
+ if (!s.err) {
+ s.err = std::move(e);
+ }
+ }
+ s.sem.signal();
+ });
+ });
+ }).then([&s] {
+ // Wait for all background tasks to finish
+ // and signal the semaphore
+ return s.sem.wait(s.max_concurrent);
+ }).then([&s] {
+ if (!s.err) {
+ return make_ready_future<>();
+ }
+ return seastar::make_exception_future<>(std::move(s.err));
+ });
+ });
+ } catch (...) {
+ return current_exception_as_future();
+ }
+}
+
+/// Run a maximum of \c max_concurrent tasks in parallel (range version).
+///
+/// Given a \c range of objects, run \c func on each element in
+/// the range, and return a future<> that resolves when all the functions
+/// complete. \c func should return a future<> that indicates when it is
+/// complete. Up to \c max_concurrent invocations are performed in parallel.
+/// This does not allow the range to refer to stack objects. The caller
+/// must ensure that the range outlives the call to max_concurrent_for_each
+/// so it can be iterated in the background.
+///
+/// \param range a \c Range of objects to run \c func on
+/// \param max_concurrent maximum number of concurrent invocations of \c func, must be greater than zero.
+/// \param func Function to invoke with each element in the range (returning
+/// a \c future<>)
+/// \return a \c future<> that resolves when all the function invocations
+/// complete. If one or more return an exception, the return value
+/// contains one of the exceptions.
+template <typename Range, typename Func>
+SEASTAR_CONCEPT( requires std::ranges::range<Range> && requires (Func f, Range r) { { f(*r.begin()) } -> std::same_as<future<>>; } )
+inline
+future<>
+max_concurrent_for_each(Range&& range, size_t max_concurrent, Func&& func) noexcept {
+ try {
+ return max_concurrent_for_each(std::begin(range), std::end(range), max_concurrent, std::forward<Func>(func));
+ } catch (...) {
+ return current_exception_as_future();
+ }
+}
+
+/// @}
+
+} // namespace seastar
diff --git a/src/seastar/include/seastar/core/lowres_clock.hh b/src/seastar/include/seastar/core/lowres_clock.hh
new file mode 100644
index 000000000..2d683a87a
--- /dev/null
+++ b/src/seastar/include/seastar/core/lowres_clock.hh
@@ -0,0 +1,160 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/core/cacheline.hh>
+#include <seastar/core/timer.hh>
+
+#include <cstdint>
+
+#include <atomic>
+#include <chrono>
+
+namespace seastar {
+
+//
+// Forward declarations.
+//
+
+class lowres_clock;
+class lowres_system_clock;
+
+/// \cond internal
+
+class lowres_clock_impl final {
+public:
+ using base_steady_clock = std::chrono::steady_clock;
+ using base_system_clock = std::chrono::system_clock;
+
+ // The clocks' resolutions are 10 ms. However, to make it easier to do calculations with
+ // `std::chrono::milliseconds`, we make the clock period 1 ms instead of 10 ms.
+ using period = std::ratio<1, 1000>;
+
+ using steady_rep = base_steady_clock::rep;
+ using steady_duration = std::chrono::duration<steady_rep, period>;
+ using steady_time_point = std::chrono::time_point<lowres_clock, steady_duration>;
+
+ using system_rep = base_system_clock::rep;
+ using system_duration = std::chrono::duration<system_rep, period>;
+ using system_time_point = std::chrono::time_point<lowres_system_clock, system_duration>;
+
+ static steady_time_point steady_now() noexcept {
+ auto const nr = counters::_steady_now.load(std::memory_order_relaxed);
+ return steady_time_point(steady_duration(nr));
+ }
+
+ static system_time_point system_now() noexcept {
+ auto const nr = counters::_system_now.load(std::memory_order_relaxed);
+ return system_time_point(system_duration(nr));
+ }
+
+ // For construction.
+ friend class smp;
+private:
+ // Both counters are updated by cpu0 and read by other cpus. Place them on their own cache line to avoid false
+ // sharing.
+ struct alignas(seastar::cache_line_size) counters final {
+ static std::atomic<steady_rep> _steady_now;
+ static std::atomic<system_rep> _system_now;
+ };
+
+ // The timer expires every 10 ms.
+ static constexpr std::chrono::milliseconds _granularity{10};
+
+ // High-resolution timer to drive these low-resolution clocks.
+ timer<> _timer{};
+
+ static void update() noexcept;
+
+ // Private to ensure that static variables are only initialized once.
+ // might throw when arming timer.
+ lowres_clock_impl();
+};
+
+/// \endcond
+
+//
+/// \brief Low-resolution and efficient steady clock.
+///
+/// This is a monotonic clock with a granularity of 10 ms. Time points from this clock do not correspond to system
+/// time.
+///
+/// The primary benefit of this clock is that invoking \c now() is inexpensive compared to
+/// \c std::chrono::steady_clock::now().
+///
+/// \see \c lowres_system_clock for a low-resolution clock which produces time points corresponding to system time.
+///
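+/// Example (illustrative sketch):
+///
+/// \code
+/// auto start = lowres_clock::now();
+/// // ... do some work ...
+/// auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
+///         lowres_clock::now() - start);
+/// \endcode
+///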
+class lowres_clock final {
+public:
+ using rep = lowres_clock_impl::steady_rep;
+ using period = lowres_clock_impl::period;
+ using duration = lowres_clock_impl::steady_duration;
+ using time_point = lowres_clock_impl::steady_time_point;
+
+ static constexpr bool is_steady = true;
+
+ ///
+ /// \note Outside of a Seastar application, the result is undefined.
+ ///
+ static time_point now() noexcept {
+ return lowres_clock_impl::steady_now();
+ }
+};
+
+///
+/// \brief Low-resolution and efficient system clock.
+///
+/// This clock has the same granularity as \c lowres_clock, but it is not required to be monotonic and its time points
+/// correspond to system time.
+///
+/// The primary benefit of this clock is that invoking \c now() is inexpensive compared to
+/// \c std::chrono::system_clock::now().
+///
+class lowres_system_clock final {
+public:
+ using rep = lowres_clock_impl::system_rep;
+ using period = lowres_clock_impl::period;
+ using duration = lowres_clock_impl::system_duration;
+ using time_point = lowres_clock_impl::system_time_point;
+
+ static constexpr bool is_steady = lowres_clock_impl::base_system_clock::is_steady;
+
+ ///
+ /// \note Outside of a Seastar application, the result is undefined.
+ ///
+ static time_point now() noexcept {
+ return lowres_clock_impl::system_now();
+ }
+
+ static std::time_t to_time_t(time_point t) noexcept {
+ return std::chrono::duration_cast<std::chrono::seconds>(t.time_since_epoch()).count();
+ }
+
+ static time_point from_time_t(std::time_t t) noexcept {
+ return time_point(std::chrono::duration_cast<duration>(std::chrono::seconds(t)));
+ }
+};
+
+extern template class timer<lowres_clock>;
+
+}
+
diff --git a/src/seastar/include/seastar/core/make_task.hh b/src/seastar/include/seastar/core/make_task.hh
new file mode 100644
index 000000000..74c521b07
--- /dev/null
+++ b/src/seastar/include/seastar/core/make_task.hh
@@ -0,0 +1,62 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <memory>
+#include <seastar/core/task.hh>
+#include <seastar/core/future.hh>
+
+namespace seastar {
+
+template <typename Func>
+class lambda_task final : public task {
+ Func _func;
+ using futurator = futurize<std::result_of_t<Func()>>;
+ typename futurator::promise_type _result;
+public:
+ lambda_task(scheduling_group sg, const Func& func) : task(sg), _func(func) {}
+ lambda_task(scheduling_group sg, Func&& func) : task(sg), _func(std::move(func)) {}
+ typename futurator::type get_future() noexcept { return _result.get_future(); }
+ virtual void run_and_dispose() noexcept override {
+ futurator::invoke(_func).forward_to(std::move(_result));
+ delete this;
+ }
+ virtual task* waiting_task() noexcept override {
+ return _result.waiting_task();
+ }
+};
+
+template <typename Func>
+inline
+lambda_task<Func>*
+make_task(Func&& func) noexcept {
+ return new lambda_task<Func>(current_scheduling_group(), std::forward<Func>(func));
+}
+
+template <typename Func>
+inline
+lambda_task<Func>*
+make_task(scheduling_group sg, Func&& func) noexcept {
+ return new lambda_task<Func>(sg, std::forward<Func>(func));
+}
+
+}
diff --git a/src/seastar/include/seastar/core/manual_clock.hh b/src/seastar/include/seastar/core/manual_clock.hh
new file mode 100644
index 000000000..34bc5bec1
--- /dev/null
+++ b/src/seastar/include/seastar/core/manual_clock.hh
@@ -0,0 +1,51 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/core/timer.hh>
+
+#include <atomic>
+#include <chrono>
+
+namespace seastar {
+
+class manual_clock {
+public:
+ using rep = int64_t;
+ using period = std::chrono::nanoseconds::period;
+ using duration = std::chrono::duration<rep, period>;
+ using time_point = std::chrono::time_point<manual_clock, duration>;
+private:
+ static std::atomic<rep> _now;
+ static void expire_timers() noexcept;
+public:
+ manual_clock() noexcept;
+ static time_point now() noexcept {
+ return time_point(duration(_now.load(std::memory_order_relaxed)));
+ }
+ static void advance(duration d) noexcept;
+};
+
+extern template class timer<manual_clock>;
+
+}
+
diff --git a/src/seastar/include/seastar/core/map_reduce.hh b/src/seastar/include/seastar/core/map_reduce.hh
new file mode 100644
index 000000000..f9e1ad075
--- /dev/null
+++ b/src/seastar/include/seastar/core/map_reduce.hh
@@ -0,0 +1,254 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (C) 2020 ScyllaDB.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/shared_ptr.hh>
+
+namespace seastar {
+
+/// \addtogroup future-util
+/// @{
+
+/// \cond internal
+
+template <typename T, bool IsFuture>
+struct reducer_with_get_traits;
+
+template <typename T>
+struct reducer_with_get_traits<T, false> {
+ using result_type = decltype(std::declval<T>().get());
+ using future_type = future<result_type>;
+ static future_type maybe_call_get(future<> f, lw_shared_ptr<T> r) {
+ return f.then([r = std::move(r)] () mutable {
+ return make_ready_future<result_type>(std::move(*r).get());
+ });
+ }
+};
+
+template <typename T>
+struct reducer_with_get_traits<T, true> {
+ using future_type = decltype(std::declval<T>().get());
+ static future_type maybe_call_get(future<> f, lw_shared_ptr<T> r) {
+ return f.then([r = std::move(r)] () mutable {
+ return r->get();
+ }).then_wrapped([r] (future_type f) {
+ return f;
+ });
+ }
+};
+
+template <typename T, typename V = void>
+struct reducer_traits {
+ using future_type = future<>;
+ static future_type maybe_call_get(future<> f, lw_shared_ptr<T> r) {
+ return f.then([r = std::move(r)] {});
+ }
+};
+
+template <typename T>
+struct reducer_traits<T, decltype(std::declval<T>().get(), void())> : public reducer_with_get_traits<T, is_future<std::result_of_t<decltype(&T::get)(T)>>::value> {};
+
+/// \endcond
+
+/// Map a function over a range and reduce the result.
+///
+/// \param begin an \c InputIterator designating the beginning of the range
+/// \param end an \c InputIterator designating the end of the range
+/// \param mapper is a callable which transforms values from the iterator range into a future<T>
+/// \param r is an object which can be called with T as a parameter and yields a future<>.
+/// It may have a get() method which returns a value of type U which holds the result of reduction.
+/// \return The reduced value wrapped in a future.
+/// If the reducer has no get() method then this function returns future<>.
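+///
+/// Example (illustrative sketch of a reducer with a get() method; \c files is
+/// assumed to be a container of seastar::file):
+///
+/// \code
+/// struct adder {
+///     uint64_t total = 0;
+///     future<> operator()(uint64_t partial) {
+///         total += partial;
+///         return make_ready_future<>();
+///     }
+///     uint64_t get() const { return total; }
+/// };
+///
+/// // Sum the sizes of all files; yields future<uint64_t>.
+/// return map_reduce(files.begin(), files.end(),
+///                   std::mem_fn(&file::size), adder());
+/// \endcode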
+
+// TODO: specialize for non-deferring reducer
+template <typename Iterator, typename Mapper, typename Reducer>
+SEASTAR_CONCEPT( requires requires (Iterator i, Mapper mapper, Reducer reduce) {
+ *i++;
+ { i != i } -> std::convertible_to<bool>;
+ mapper(*i);
+ reduce(futurize_invoke(mapper, *i).get0());
+} )
+inline
+auto
+map_reduce(Iterator begin, Iterator end, Mapper&& mapper, Reducer&& r)
+ -> typename reducer_traits<Reducer>::future_type
+{
+ auto r_ptr = make_lw_shared(std::forward<Reducer>(r));
+ future<> ret = make_ready_future<>();
+ while (begin != end) {
+ ret = futurize_invoke(mapper, *begin++).then_wrapped([ret = std::move(ret), r_ptr] (auto f) mutable {
+ return ret.then_wrapped([f = std::move(f), r_ptr] (auto rf) mutable {
+ if (rf.failed()) {
+ f.ignore_ready_future();
+ return std::move(rf);
+ } else {
+ return futurize_invoke(*r_ptr, std::move(f.get0()));
+ }
+ });
+ });
+ }
+ return reducer_traits<Reducer>::maybe_call_get(std::move(ret), r_ptr);
+}
+
+/// Asynchronous map/reduce transformation.
+///
+/// Given a range of objects, an asynchronous unary function
+/// operating on these objects, an initial value, and a
+/// binary function for reducing, map_reduce() will
+/// transform each object in the range, then invoke
+/// the reducing function with the result.
+///
+/// Example:
+///
+/// Calculate the total size of several files:
+///
+/// \code
+/// map_reduce(files.begin(), files.end(),
+/// std::mem_fn(file::size),
+/// size_t(0),
+/// std::plus<size_t>())
+/// \endcode
+///
+/// Requirements:
+/// - Iterator: an InputIterator.
+/// - Mapper: unary function taking Iterator::value_type and producing a future<...>.
+/// - Initial: any value type
+/// - Reduce: a binary function taking two Initial values and returning an Initial
+///
+/// Return type:
+/// - future<Initial>
+///
+/// \param begin beginning of object range to operate on
+/// \param end end of object range to operate on
+/// \param mapper map function to call on each object, returning a future
+/// \param initial initial input value to reduce function
+/// \param reduce binary function for merging two result values from \c mapper
+///
+/// \return equivalent to \c reduce(reduce(initial, mapper(obj0)), mapper(obj1)) ...
+template <typename Iterator, typename Mapper, typename Initial, typename Reduce>
+SEASTAR_CONCEPT( requires requires (Iterator i, Mapper mapper, Initial initial, Reduce reduce) {
+ *i++;
+ { i != i} -> std::convertible_to<bool>;
+ mapper(*i);
+ requires is_future<decltype(mapper(*i))>::value;
+ { reduce(std::move(initial), mapper(*i).get0()) } -> std::convertible_to<Initial>;
+} )
+inline
+future<Initial>
+map_reduce(Iterator begin, Iterator end, Mapper&& mapper, Initial initial, Reduce reduce) {
+ struct state {
+ Initial result;
+ Reduce reduce;
+ };
+ auto s = make_lw_shared(state{std::move(initial), std::move(reduce)});
+ future<> ret = make_ready_future<>();
+ while (begin != end) {
+ ret = futurize_invoke(mapper, *begin++).then_wrapped([s = s.get(), ret = std::move(ret)] (auto f) mutable {
+ try {
+ s->result = s->reduce(std::move(s->result), std::move(f.get0()));
+ return std::move(ret);
+ } catch (...) {
+ return std::move(ret).then_wrapped([ex = std::current_exception()] (auto f) {
+ f.ignore_ready_future();
+ return make_exception_future<>(ex);
+ });
+ }
+ });
+ }
+ return ret.then([s] {
+ return make_ready_future<Initial>(std::move(s->result));
+ });
+}
+
+/// Asynchronous map/reduce transformation (range version).
+///
+/// Given a range of objects, an asynchronous unary function
+/// operating on these objects, an initial value, and a
+/// binary function for reducing, map_reduce() will
+/// transform each object in the range, then invoke
+/// the reducing function with the result.
+///
+/// Example:
+///
+/// Calculate the total size of several files:
+///
+/// \code
+/// std::vector<file> files = ...;
+/// map_reduce(files,
+/// std::mem_fn(file::size),
+/// size_t(0),
+/// std::plus<size_t>())
+/// \endcode
+///
+/// Requirements:
+/// - Iterator: an InputIterator.
+/// - Mapper: unary function taking Iterator::value_type and producing a future<...>.
+/// - Initial: any value type
+/// - Reduce: a binary function taking two Initial values and returning an Initial
+///
+/// Return type:
+/// - future<Initial>
+///
+/// \param range object range to operate on
+/// \param mapper map function to call on each object, returning a future
+/// \param initial initial input value to reduce function
+/// \param reduce binary function for merging two result values from \c mapper
+///
+/// \return equivalent to \c reduce(reduce(initial, mapper(obj0)), mapper(obj1)) ...
+template <typename Range, typename Mapper, typename Initial, typename Reduce>
+SEASTAR_CONCEPT( requires requires (Range range, Mapper mapper, Initial initial, Reduce reduce) {
+ std::begin(range);
+ std::end(range);
+ mapper(*std::begin(range));
+ requires is_future<std::remove_reference_t<decltype(mapper(*std::begin(range)))>>::value;
+ { reduce(std::move(initial), mapper(*std::begin(range)).get0()) } -> std::convertible_to<Initial>;
+} )
+inline
+future<Initial>
+map_reduce(Range&& range, Mapper&& mapper, Initial initial, Reduce reduce) {
+ return map_reduce(std::begin(range), std::end(range), std::forward<Mapper>(mapper),
+ std::move(initial), std::move(reduce));
+}
+
+/// Implements @Reducer concept. Calculates the result by
+/// adding elements to the accumulator.
+template <typename Result, typename Addend = Result>
+class adder {
+private:
+ Result _result;
+public:
+ future<> operator()(const Addend& value) {
+ _result += value;
+ return make_ready_future<>();
+ }
+ Result get() && {
+ return std::move(_result);
+ }
+};
+
+/// @}
+
+} // namespace seastar
diff --git a/src/seastar/include/seastar/core/memory.hh b/src/seastar/include/seastar/core/memory.hh
new file mode 100644
index 000000000..92b63cd2b
--- /dev/null
+++ b/src/seastar/include/seastar/core/memory.hh
@@ -0,0 +1,370 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/resource.hh>
+#include <seastar/core/bitops.hh>
+#include <new>
+#include <functional>
+#include <vector>
+
+namespace seastar {
+
+/// \defgroup memory-module Memory management
+///
+/// Functions and classes for managing memory.
+///
+/// Memory management in seastar consists of the following:
+///
+/// - Low-level memory management in the \ref memory namespace.
+/// - Various smart pointers: \ref shared_ptr, \ref lw_shared_ptr,
+/// and \ref foreign_ptr.
+/// - zero-copy support: \ref temporary_buffer and \ref deleter.
+
+/// Low-level memory management support
+///
+/// The \c memory namespace provides functions and classes for interfacing
+/// with the seastar memory allocator.
+///
+/// The seastar memory allocator splits system memory into a pool per
+/// logical core (lcore). Memory allocated on an lcore should be freed
+/// on the same lcore; failing to do so carries a severe performance
+/// penalty. It is possible to share memory with another core, but this
+/// should be limited to avoid cache coherency traffic.
+/// You can obtain the memory layout of the current shard with
+/// \ref get_memory_layout().
+///
+/// ## Critical allocation scopes
+///
+/// Seastar supports marking scopes as critical allocation scopes for the purpose
+/// of special treatment from various memory related utilities.
+/// See \ref scoped_critical_alloc_section.
+///
+/// ## Diagnostics and debugging features
+///
+/// ### Allocation failure injector
+///
+/// Allows injecting allocation failures for testing resiliency against
+/// allocation failures, or exceptions in general. See:
+/// * \ref alloc_failure_injector
+/// * \ref with_allocation_failures()
+///
+/// ### Large allocation warning
+///
+/// Large allocations put great pressure on the allocator which might be unable
+/// to serve them even if there is enough memory available, due to memory
+/// fragmentation. This is especially relevant for long-running applications,
+/// the kind of applications that are typically built with seastar. This feature
+/// allows finding these by logging a warning on large allocations, with
+/// the stacktrace of the allocation. See:
+/// * \ref set_large_allocation_warning_threshold()
+/// * \ref get_large_allocation_warning_threshold()
+/// * \ref scoped_large_allocation_warning_threshold
+/// * \ref scoped_large_allocation_warning_disable
+///
+/// ### Heap profiling
+///
+/// Heap profiling allows finding out how memory is used by your application, by
+/// recording the stacktrace of all allocations. See:
+/// * \ref set_heap_profiling_enabled()
+/// * \ref scoped_heap_profiling
+///
+/// ### Abort on allocation failure
+///
+/// Often, the best way to debug an allocation failure is a coredump. This
+/// feature allows dumping core on allocation failures, containing the stack of
+/// the failed allocation, by means of aborting. To enable, set the
+/// `abort_on_seastar_bad_alloc` configuration option or the respective command
+/// line flag.
+///
+/// ### Dump diagnostics report
+///
+/// Dump a diagnostic report of the state of the seastar allocator upon allocation
+/// failure. The report is dumped with the `seastar_memory` logger, with debug
+/// level.
+/// You can configure a report to be dumped with error level on certain allocation
+/// kinds, see:
+/// * set_dump_memory_diagnostics_on_alloc_failure_kind()
+/// * set_additional_diagnostics_producer()
+/// * generate_memory_diagnostics_report()
+///
+/// The diagnostics report dump can be configured with the command
+/// line/configuration file via the \p dump-memory-diagnostics-on-alloc-failure-kind
+/// command-line flag/configuration item.
+namespace memory {
+
+/// \cond internal
+
+#ifdef SEASTAR_OVERRIDE_ALLOCATOR_PAGE_SIZE
+#define SEASTAR_INTERNAL_ALLOCATOR_PAGE_SIZE (SEASTAR_OVERRIDE_ALLOCATOR_PAGE_SIZE)
+#else
+#define SEASTAR_INTERNAL_ALLOCATOR_PAGE_SIZE 4096
+#endif
+
+static constexpr size_t page_size = SEASTAR_INTERNAL_ALLOCATOR_PAGE_SIZE;
+static constexpr size_t page_bits = log2ceil(page_size);
+static constexpr size_t huge_page_size =
+#if defined(__x86_64__) || defined(__i386__) || defined(__s390x__) || defined(__zarch__)
+ 1 << 21; // 2M
+#elif defined(__aarch64__)
+ 1 << 21; // 2M
+#elif defined(__PPC__)
+ 1 << 24; // 16M
+#else
+#error "Huge page size is not defined for this architecture"
+#endif
+
+void configure(std::vector<resource::memory> m, bool mbind,
+ std::optional<std::string> hugetlbfs_path = {});
+
+void enable_abort_on_allocation_failure();
+
+class disable_abort_on_alloc_failure_temporarily {
+public:
+ disable_abort_on_alloc_failure_temporarily();
+ ~disable_abort_on_alloc_failure_temporarily() noexcept;
+};
+
+// Disables heap profiling as long as this object is alive.
+// Can be nested, in which case the profiling is re-enabled when all
+// the objects go out of scope.
+class disable_backtrace_temporarily {
+ bool _old;
+public:
+ disable_backtrace_temporarily();
+ ~disable_backtrace_temporarily();
+};
+
+enum class reclaiming_result {
+ reclaimed_nothing,
+ reclaimed_something
+};
+
+// Determines when reclaimer can be invoked
+enum class reclaimer_scope {
+ //
+ // Reclaimer is only invoked in its own fiber. That fiber will be
+ // given higher priority than regular application fibers.
+ //
+ async,
+
+ //
+ // Reclaimer may be invoked synchronously with allocation.
+ // It may also be invoked in async scope.
+ //
+ // Reclaimer may invoke allocation, though it is discouraged because
+ // the system may be low on memory and such allocations may fail.
+ // Reclaimers which allocate should be prepared for re-entry.
+ //
+ sync
+};
+
+class reclaimer {
+public:
+ struct request {
+        // The number of bytes that need to be released.
+ // The reclaimer can release a different amount.
+ // If less is released then the reclaimer may be invoked again.
+ size_t bytes_to_reclaim;
+ };
+ using reclaim_fn = std::function<reclaiming_result ()>;
+private:
+ std::function<reclaiming_result (request)> _reclaim;
+ reclaimer_scope _scope;
+public:
+ // Installs new reclaimer which will be invoked when system is falling
+ // low on memory. 'scope' determines when reclaimer can be executed.
+ reclaimer(std::function<reclaiming_result ()> reclaim, reclaimer_scope scope = reclaimer_scope::async);
+ reclaimer(std::function<reclaiming_result (request)> reclaim, reclaimer_scope scope = reclaimer_scope::async);
+ ~reclaimer();
+ reclaiming_result do_reclaim(size_t bytes_to_reclaim) { return _reclaim(request{bytes_to_reclaim}); }
+ reclaimer_scope scope() const { return _scope; }
+};
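
A hedged sketch (not part of the patch) of installing a reclaimer: the my_cache type and its evict_up_to() member are assumptions used only to illustrate the constructor taking a request-aware callback and a scope.

```cpp
#include <seastar/core/memory.hh>

// Minimal stand-in for an application-level cache; evict_up_to() returns how
// many bytes it actually released. Purely illustrative.
class my_cache {
public:
    size_t evict_up_to(size_t /*bytes*/) { return 0; }
};

class cache_owner {
    my_cache _cache;
    // The reclaimer registers itself on construction and unregisters on
    // destruction, so holding it as a member ties it to the cache lifetime.
    seastar::memory::reclaimer _reclaimer{
        [this] (seastar::memory::reclaimer::request req) {
            return _cache.evict_up_to(req.bytes_to_reclaim) > 0
                ? seastar::memory::reclaiming_result::reclaimed_something
                : seastar::memory::reclaiming_result::reclaimed_nothing;
        },
        seastar::memory::reclaimer_scope::sync};
};
```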
+
+extern std::pmr::polymorphic_allocator<char>* malloc_allocator;
+
+// Call periodically to recycle objects that were freed
+// on cpu other than the one they were allocated on.
+//
+// Returns @true if any work was actually performed.
+bool drain_cross_cpu_freelist();
+
+
+// We don't want the memory code calling back into the rest of
+// the system, so allow the rest of the system to tell the memory
+// code how to initiate reclaim.
+//
+// When memory is low, calling \c hook(fn) will result in fn being called
+// in a safe place wrt. allocations.
+void set_reclaim_hook(
+ std::function<void (std::function<void ()>)> hook);
+
+/// \endcond
+
+class statistics;
+
+/// Capture a snapshot of memory allocation statistics for this lcore.
+statistics stats();
+
+/// Memory allocation statistics.
+class statistics {
+ uint64_t _mallocs;
+ uint64_t _frees;
+ uint64_t _cross_cpu_frees;
+ size_t _total_memory;
+ size_t _free_memory;
+ uint64_t _reclaims;
+ uint64_t _large_allocs;
+
+ uint64_t _foreign_mallocs;
+ uint64_t _foreign_frees;
+ uint64_t _foreign_cross_frees;
+private:
+ statistics(uint64_t mallocs, uint64_t frees, uint64_t cross_cpu_frees,
+ uint64_t total_memory, uint64_t free_memory, uint64_t reclaims, uint64_t large_allocs,
+ uint64_t foreign_mallocs, uint64_t foreign_frees, uint64_t foreign_cross_frees)
+ : _mallocs(mallocs), _frees(frees), _cross_cpu_frees(cross_cpu_frees)
+ , _total_memory(total_memory), _free_memory(free_memory), _reclaims(reclaims), _large_allocs(large_allocs)
+ , _foreign_mallocs(foreign_mallocs), _foreign_frees(foreign_frees)
+ , _foreign_cross_frees(foreign_cross_frees) {}
+public:
+    /// Total number of memory allocation calls since the system was started.
+ uint64_t mallocs() const { return _mallocs; }
+    /// Total number of memory deallocation calls since the system was started.
+ uint64_t frees() const { return _frees; }
+    /// Total number of memory deallocations that occurred on a different lcore
+ /// than the one on which they were allocated.
+ uint64_t cross_cpu_frees() const { return _cross_cpu_frees; }
+ /// Total number of objects which were allocated but not freed.
+ size_t live_objects() const { return mallocs() - frees(); }
+ /// Total free memory (in bytes)
+ size_t free_memory() const { return _free_memory; }
+ /// Total allocated memory (in bytes)
+ size_t allocated_memory() const { return _total_memory - _free_memory; }
+ /// Total memory (in bytes)
+ size_t total_memory() const { return _total_memory; }
+ /// Number of reclaims performed due to low memory
+ uint64_t reclaims() const { return _reclaims; }
+ /// Number of allocations which violated the large allocation threshold
+ uint64_t large_allocations() const { return _large_allocs; }
+ /// Number of foreign allocations
+ uint64_t foreign_mallocs() const { return _foreign_mallocs; }
+ /// Number of foreign frees
+ uint64_t foreign_frees() const { return _foreign_frees; }
+ /// Number of foreign frees on reactor threads
+ uint64_t foreign_cross_frees() const { return _foreign_cross_frees; }
+ friend statistics stats();
+};
+
+struct memory_layout {
+ uintptr_t start;
+ uintptr_t end;
+};
+
+// Discover virtual address range used by the allocator on current shard.
+// Supported only when seastar allocator is enabled.
+memory::memory_layout get_memory_layout();
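
A hedged sketch (not part of the patch) showing how the per-shard statistics and memory layout declared above might be sampled, e.g. for periodic logging:

```cpp
#include <seastar/core/memory.hh>
#include <iostream>

void log_memory_state() {
    auto st = seastar::memory::stats();
    std::cout << "allocated " << st.allocated_memory()
              << " of " << st.total_memory() << " bytes, "
              << st.live_objects() << " live objects, "
              << st.cross_cpu_frees() << " cross-cpu frees\n";

    // Only meaningful when the seastar allocator is in use (see above).
    auto layout = seastar::memory::get_memory_layout();
    std::cout << std::hex << "shard heap spans [0x" << layout.start
              << ", 0x" << layout.end << ")\n" << std::dec;
}
```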
+
+/// Returns the value of free memory low water mark in bytes.
+/// When free memory is below this value, reclaimers are invoked until it goes above again.
+size_t min_free_memory();
+
+/// Sets the value of free memory low water mark in memory::page_size units.
+void set_min_free_pages(size_t pages);
+
+/// Enable the large allocation warning threshold.
+///
+/// Warn when allocations above a given threshold are performed.
+///
+/// \param threshold size (in bytes) above which an allocation will be logged
+void set_large_allocation_warning_threshold(size_t threshold);
+
+/// Gets the current large allocation warning threshold.
+size_t get_large_allocation_warning_threshold();
+
+/// Disable large allocation warnings.
+void disable_large_allocation_warning();
+
+/// Set a different large allocation warning threshold for a scope.
+class scoped_large_allocation_warning_threshold {
+ size_t _old_threshold;
+public:
+ explicit scoped_large_allocation_warning_threshold(size_t threshold)
+ : _old_threshold(get_large_allocation_warning_threshold()) {
+ set_large_allocation_warning_threshold(threshold);
+ }
+ scoped_large_allocation_warning_threshold(const scoped_large_allocation_warning_threshold&) = delete;
+ scoped_large_allocation_warning_threshold(scoped_large_allocation_warning_threshold&& x) = delete;
+ ~scoped_large_allocation_warning_threshold() {
+ if (_old_threshold) {
+ set_large_allocation_warning_threshold(_old_threshold);
+ }
+ }
+ void operator=(const scoped_large_allocation_warning_threshold&) const = delete;
+ void operator=(scoped_large_allocation_warning_threshold&&) = delete;
+};
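
A hedged usage sketch (not part of the patch): lowering the warning threshold around a code path suspected of making oversized allocations, so they show up in the log with a stacktrace; the 64 KiB value and rebuild_index() are illustrative assumptions.

```cpp
#include <seastar/core/memory.hh>

void rebuild_index() {
    // Any single allocation above 64 KiB inside this scope is logged.
    seastar::memory::scoped_large_allocation_warning_threshold guard(64 * 1024);
    // ... allocation-heavy work ...
}   // the previous threshold is restored when 'guard' goes out of scope
```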
+
+/// Disable large allocation warnings for a scope.
+class scoped_large_allocation_warning_disable {
+ size_t _old_threshold;
+public:
+ scoped_large_allocation_warning_disable()
+ : _old_threshold(get_large_allocation_warning_threshold()) {
+ disable_large_allocation_warning();
+ }
+ scoped_large_allocation_warning_disable(const scoped_large_allocation_warning_disable&) = delete;
+ scoped_large_allocation_warning_disable(scoped_large_allocation_warning_disable&& x) = delete;
+ ~scoped_large_allocation_warning_disable() {
+ if (_old_threshold) {
+ set_large_allocation_warning_threshold(_old_threshold);
+ }
+ }
+ void operator=(const scoped_large_allocation_warning_disable&) const = delete;
+ void operator=(scoped_large_allocation_warning_disable&&) = delete;
+};
+
+/// Enable/disable heap profiling.
+///
+/// In order to use heap profiling you have to define
+/// `SEASTAR_HEAPPROF`.
+/// Heap profiling data is not currently exposed via an API for
+/// inspection; instead, it was designed to be inspected from a
+/// debugger.
+/// For an example script that makes use of the heap profiling data
+/// see [scylla-gdb.py](https://github.com/scylladb/scylla/blob/e1b22b6a4c56b4f1d0adf65d1a11db4bcb51fe7d/scylla-gdb.py#L1439)
+/// This script can generate either textual representation of the data,
+/// or a zoomable flame graph ([flame graph generation instructions](https://github.com/scylladb/scylla/wiki/Seastar-heap-profiler),
+/// [example flame graph](https://user-images.githubusercontent.com/1389273/72920437-f0cf8a80-3d51-11ea-92f0-f3dbeb698871.png)).
+void set_heap_profiling_enabled(bool);
+
+/// Enable heap profiling for the duration of the scope.
+///
+/// For more information about heap profiling see
+/// \ref set_heap_profiling_enabled().
+class scoped_heap_profiling {
+public:
+ scoped_heap_profiling() noexcept;
+ ~scoped_heap_profiling();
+};
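
A hedged sketch (not part of the patch): enabling heap profiling only around a suspicious phase; it requires a build with SEASTAR_HEAPPROF defined, as noted above.

```cpp
#include <seastar/core/memory.hh>

void load_dataset() {
    seastar::memory::scoped_heap_profiling profiling;
    // ... allocations made here record their stacktraces for later
    //     inspection from a debugger (e.g. via scylla-gdb.py) ...
}   // profiling is disabled again when 'profiling' goes out of scope
```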
+
+}
+}
diff --git a/src/seastar/include/seastar/core/metrics.hh b/src/seastar/include/seastar/core/metrics.hh
new file mode 100644
index 000000000..88d1ad6a0
--- /dev/null
+++ b/src/seastar/include/seastar/core/metrics.hh
@@ -0,0 +1,587 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 ScyllaDB.
+ */
+
+#pragma once
+
+#include <functional>
+#include <seastar/core/sstring.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/metrics_registration.hh>
+#include <boost/lexical_cast.hpp>
+#include <map>
+#include <seastar/core/metrics_types.hh>
+#include <seastar/util/std-compat.hh>
+
+/*! \file metrics.hh
+ * \brief header for metrics creation.
+ *
+ * This header file contains the metrics creation methods and their helper functions.
+ * Include this file when you need to create metrics.
+ * Typically this will be in your source file.
+ *
+ * Code that is under the impl namespace should not be used directly.
+ *
+ */
+
+namespace seastar {
+
+/*!
+ * \addtogroup metrics
+ * @{
+ *
+ * \namespace seastar::metrics
+ * \brief metrics creation and registration
+ *
+ * the metrics namespace holds the relevant methods and classes to generate metrics.
+ *
+ * The metrics layer supports registering metrics that will later be
+ * exported via different API protocols.
+ *
+ * To be able to support multiple protocols the following simplifications were made:
+ * 1. The id of the metrics is based on the collectd id
+ * 2. A metric could be a single value, either a reference or a function
+ *
+ * To add metrics definitions to class A do the following:
+ * * Add a metric_groups member to A
+ * * Add a setup_metrics() method that would be called in the constructor.
+ *
+ *
+ * In A header file
+ * \code
+ * #include "core/metrics_registration.hh"
+ * class A {
+ *   metric_groups _metrics;
+ *
+ * void setup_metrics();
+ *
+ * };
+ * \endcode
+ *
+ * In A source file:
+ *
+ * \code
+ * include "core/metrics.hh"
+ *
+ * void A::setup_metrics() {
+ * namespace sm = seastar::metrics;
+ * _metrics = sm::create_metric_group();
+ * _metrics->add_group("cache", {sm::make_gauge("bytes", "used", [this] { return _region.occupancy().used_space(); })});
+ * }
+ * \endcode
+ */
+
+namespace metrics {
+
+class double_registration : public std::runtime_error {
+public:
+ double_registration(std::string what);
+};
+
+/*!
+ * \defgroup metrics_types metrics type definitions
+ * The following are for the metric layer's use; do not use them directly.
+ * Instead use make_counter, make_gauge, make_absolute and make_derive.
+ *
+ */
+using metric_type_def = sstring; /*!< Used to hold an inherit type (like bytes)*/
+using metric_name_type = sstring; /*!< The metric name*/
+using instance_id_type = sstring; /*!< typically used for the shard id*/
+
+/*!
+ * \brief Human-readable description of a metric/group.
+ *
+ *
+ * Uses a separate class to deal with type resolution
+ *
+ * Add this to metric creation:
+ *
+ * \code
+ * _metrics->add_group("groupname", {
+ * sm::make_gauge("metric_name", value, description("A documentation about the return value"))
+ * });
+ * \endcode
+ *
+ */
+class description {
+public:
+ description(sstring s = sstring()) : _s(std::move(s))
+ {}
+ const sstring& str() const {
+ return _s;
+ }
+private:
+ sstring _s;
+};
+
+/*!
+ * \brief Label a metric
+ *
+ * Labels are useful for adding information about a metric that
+ * you would later need to aggregate by.
+ * For example, if you have multiple queues on a shard.
+ * Adding the queue id as a Label will allow you to use the same name
+ * of the metrics with multiple id instances.
+ *
+ * label_instance holds an instance of a label, consisting of a key and a value.
+ *
+ * Typically you will not generate a label_instance yourself, but use a label
+ * object for that.
+ * @see label for more information
+ *
+ *
+ */
+class label_instance {
+ sstring _key;
+ sstring _value;
+public:
+ /*!
+ * \brief create a label_instance
+ * label instance consists of key and value.
+ * The key is an sstring.
+ * T - the value type can be any type that can be lexical_cast to string
+     * (i.e. if it supports the insertion operator into a stringstream).
+ *
+ * All primitive types are supported so all the following examples are valid:
+ * label_instance a("smp_queue", 1)
+ * label_instance a("my_key", "my_value")
+ * label_instance a("internal_id", -1)
+ */
+ template<typename T>
+ label_instance(const sstring& key, T v) : _key(key), _value(boost::lexical_cast<std::string>(v)){}
+
+ /*!
+ * \brief returns the label key
+ */
+ const sstring key() const {
+ return _key;
+ }
+
+ /*!
+ * \brief returns the label value
+ */
+ const sstring value() const {
+ return _value;
+ }
+ bool operator<(const label_instance&) const;
+ bool operator==(const label_instance&) const;
+ bool operator!=(const label_instance&) const;
+};
+
+
+/*!
+ * \brief Class that creates label instances
+ *
+ * A factory class to create label instances.
+ * Typically, the same label name is used in multiple places.
+ * label is a label factory; you create it once, and use it to create label_instance objects.
+ *
+ * In the example we would like to label the smp_queue with the queue owner
+ *
+ * seastar::metrics::label smp_owner("smp_owner");
+ *
+ * now, when creating a new smp metric we can add a label to it:
+ *
+ * sm::make_queue_length("send_batch_queue_length", _last_snt_batch, {smp_owner(cpuid)})
+ *
+ * where cpuid in this case is unsigned.
+ */
+class label {
+ sstring key;
+public:
+ using instance = label_instance;
+ /*!
+ * \brief creating a label
+ * key is the label name, it will be the key for all label_instance
+ * that will be created from this label.
+ */
+ explicit label(const sstring& key) : key(key) {
+ }
+
+ /*!
+ * \brief creating a label instance
+ *
+ * Use the function operator to create a new label instance.
+ * T - the value type can be any type that can be lexical_cast to string
+     * (i.e. if it supports the insertion operator into a stringstream).
+ *
+ * All primitive types are supported so if lab is a label, all the following examples are valid:
+ * lab(1)
+ * lab("my_value")
+ * lab(-1)
+ */
+ template<typename T>
+ instance operator()(T value) const {
+ return label_instance(key, std::forward<T>(value));
+ }
+
+ /*!
+ * \brief returns the label name
+ */
+ const sstring& name() const {
+ return key;
+ }
+};
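
Building on the smp_owner example above, a hedged sketch (not part of the patch): the queue class, its _queue_length member and the metric/description strings are illustrative assumptions, and make_queue_length() is defined further down in this header.

```cpp
#include <seastar/core/metrics.hh>

namespace sm = seastar::metrics;

// Declared once, reused for every queue instance.
static sm::label queue_owner_label("smp_owner");

class smp_queue {
    uint64_t _queue_length = 0;
    sm::metric_groups _metrics;   // declared last so it unregisters first
public:
    explicit smp_queue(unsigned owner_cpu) {
        _metrics.add_group("smp", {
            // The label instance distinguishes queues sharing the metric name.
            sm::make_queue_length("send_batch_queue_length", _queue_length,
                                  sm::description("Pending batches in this queue"),
                                  {queue_owner_label(owner_cpu)}),
        });
    }
};
```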
+
+/*!
+ * \namespace impl
+ * \brief holds the implementation parts of the metrics layer, do not use directly.
+ *
+ * The metrics layer defines a thin API for adding metrics.
+ * Some of the implementation details need to be in the header file; they should not be used directly.
+ */
+namespace impl {
+
+// The value binding data types
+enum class data_type : uint8_t {
+ COUNTER, // unsigned int 64
+ GAUGE, // double
+ DERIVE, // signed int 64
+ ABSOLUTE, // unsigned int 64
+ HISTOGRAM,
+};
+
+/*!
+ * \brief A helper class that is used to return metric values.
+ *
+ * Do not use directly @see metrics_creation
+ */
+struct metric_value {
+ std::variant<double, histogram> u;
+ data_type _type;
+ data_type type() const {
+ return _type;
+ }
+
+ double d() const {
+ return std::get<double>(u);
+ }
+
+ uint64_t ui() const {
+ return std::get<double>(u);
+ }
+
+ int64_t i() const {
+ return std::get<double>(u);
+ }
+
+ metric_value()
+ : _type(data_type::GAUGE) {
+ }
+
+ metric_value(histogram&& h, data_type t = data_type::HISTOGRAM) :
+ u(std::move(h)), _type(t) {
+ }
+ metric_value(const histogram& h, data_type t = data_type::HISTOGRAM) :
+ u(h), _type(t) {
+ }
+
+ metric_value(double d, data_type t)
+ : u(d), _type(t) {
+ }
+
+ metric_value& operator=(const metric_value& c) = default;
+
+ metric_value& operator+=(const metric_value& c) {
+ *this = *this + c;
+ return *this;
+ }
+
+ metric_value operator+(const metric_value& c);
+ const histogram& get_histogram() const {
+ return std::get<histogram>(u);
+ }
+};
+
+using metric_function = std::function<metric_value()>;
+
+struct metric_type {
+ data_type base_type;
+ metric_type_def type_name;
+};
+
+struct metric_definition_impl {
+ metric_name_type name;
+ metric_type type;
+ metric_function f;
+ description d;
+ bool enabled = true;
+ std::map<sstring, sstring> labels;
+ metric_definition_impl& operator ()(bool enabled);
+ metric_definition_impl& operator ()(const label_instance& label);
+ metric_definition_impl& set_type(const sstring& type_name);
+ metric_definition_impl(
+ metric_name_type name,
+ metric_type type,
+ metric_function f,
+ description d,
+ std::vector<label_instance> labels);
+};
+
+class metric_groups_def {
+public:
+ metric_groups_def() = default;
+ virtual ~metric_groups_def() = default;
+ metric_groups_def(const metric_groups_def&) = delete;
+ metric_groups_def(metric_groups_def&&) = default;
+ virtual metric_groups_def& add_metric(group_name_type name, const metric_definition& md) = 0;
+ virtual metric_groups_def& add_group(group_name_type name, const std::initializer_list<metric_definition>& l) = 0;
+ virtual metric_groups_def& add_group(group_name_type name, const std::vector<metric_definition>& l) = 0;
+};
+
+instance_id_type shard();
+
+template<typename T, typename En = std::true_type>
+struct is_callable;
+
+template<typename T>
+struct is_callable<T, typename std::integral_constant<bool, !std::is_void<typename std::result_of<T()>::type>::value>::type> : public std::true_type {
+};
+
+template<typename T>
+struct is_callable<T, typename std::enable_if<std::is_fundamental<T>::value, std::true_type>::type> : public std::false_type {
+};
+
+template<typename T, typename = std::enable_if_t<is_callable<T>::value>>
+metric_function make_function(T val, data_type dt) {
+ return [dt, val] {
+ return metric_value(val(), dt);
+ };
+}
+
+template<typename T, typename = std::enable_if_t<!is_callable<T>::value>>
+metric_function make_function(T& val, data_type dt) {
+ return [dt, &val] {
+ return metric_value(val, dt);
+ };
+}
+}
+
+extern const bool metric_disabled;
+
+extern label shard_label;
+
+/*
+ * The metric definitions are compatible with the collectd metric definitions.
+ * Typically you should use gauge or derive.
+ */
+
+
+/*!
+ * \brief Gauges are a general-purpose metric.
+ *
+ * They support floating point values and can increase or decrease.
+ */
+template<typename T>
+impl::metric_definition_impl make_gauge(metric_name_type name,
+ T&& val, description d=description(), std::vector<label_instance> labels = {}) {
+ return {name, {impl::data_type::GAUGE, "gauge"}, make_function(std::forward<T>(val), impl::data_type::GAUGE), d, labels};
+}
+
+/*!
+ * \brief Gauges are a general-purpose metric.
+ *
+ * They support floating point values and can increase or decrease.
+ */
+template<typename T>
+impl::metric_definition_impl make_gauge(metric_name_type name,
+ description d, T&& val) {
+ return {name, {impl::data_type::GAUGE, "gauge"}, make_function(std::forward<T>(val), impl::data_type::GAUGE), d, {}};
+}
+
+/*!
+ * \brief Gauges are a general-purpose metric.
+ *
+ * They support floating point values and can increase or decrease.
+ */
+template<typename T>
+impl::metric_definition_impl make_gauge(metric_name_type name,
+ description d, std::vector<label_instance> labels, T&& val) {
+ return {name, {impl::data_type::GAUGE, "gauge"}, make_function(std::forward<T>(val), impl::data_type::GAUGE), d, labels};
+}
+
+
+/*!
+ * \brief Derive metrics are used when the rate is more interesting than the value.
+ *
+ * A derive is an integer value that can increase or decrease; typically it is used when looking at the
+ * derivative of the value.
+ *
+ * It is OK to use it when counting things, and if no wrap-around is expected (it shouldn't be) it is preferred over a counter metric.
+ */
+template<typename T>
+impl::metric_definition_impl make_derive(metric_name_type name,
+ T&& val, description d=description(), std::vector<label_instance> labels = {}) {
+ return {name, {impl::data_type::DERIVE, "derive"}, make_function(std::forward<T>(val), impl::data_type::DERIVE), d, labels};
+}
+
+
+/*!
+ * \brief Derive metrics are used when the rate is more interesting than the value.
+ *
+ * A derive is an integer value that can increase or decrease; typically it is used when looking at the
+ * derivative of the value.
+ *
+ * It is OK to use it when counting things, and if no wrap-around is expected (it shouldn't be) it is preferred over a counter metric.
+ */
+template<typename T>
+impl::metric_definition_impl make_derive(metric_name_type name, description d,
+ T&& val) {
+ return {name, {impl::data_type::DERIVE, "derive"}, make_function(std::forward<T>(val), impl::data_type::DERIVE), d, {}};
+}
+
+
+/*!
+ * \brief Derive metrics are used when the rate is more interesting than the value.
+ *
+ * A derive is an integer value that can increase or decrease; typically it is used when looking at the
+ * derivative of the value.
+ *
+ * It is OK to use it when counting things, and if no wrap-around is expected (it shouldn't be) it is preferred over a counter metric.
+ */
+template<typename T>
+impl::metric_definition_impl make_derive(metric_name_type name, description d, std::vector<label_instance> labels,
+ T&& val) {
+ return {name, {impl::data_type::DERIVE, "derive"}, make_function(std::forward<T>(val), impl::data_type::DERIVE), d, labels};
+}
+
+
+/*!
+ * \brief create a counter metric
+ *
+ * Counters are similar to derive metrics, but they assume monotonicity, so if a counter value decreases in a series it is counted as a wrap-around.
+ * It is better to use a large enough data type than to use a counter.
+ *
+ */
+template<typename T>
+impl::metric_definition_impl make_counter(metric_name_type name,
+ T&& val, description d=description(), std::vector<label_instance> labels = {}) {
+ return {name, {impl::data_type::COUNTER, "counter"}, make_function(std::forward<T>(val), impl::data_type::COUNTER), d, labels};
+}
+
+/*!
+ * \brief create an absolute metric.
+ *
+ * Absolute metrics are used for values that are erased after each time they are read.
+ * They are here for compatibility reasons and should generally be avoided in most applications.
+ */
+template<typename T>
+impl::metric_definition_impl make_absolute(metric_name_type name,
+ T&& val, description d=description(), std::vector<label_instance> labels = {}) {
+ return {name, {impl::data_type::ABSOLUTE, "absolute"}, make_function(std::forward<T>(val), impl::data_type::ABSOLUTE), d, labels};
+}
+
+/*!
+ * \brief create a histogram metric.
+ *
+ * Histograms are a list of buckets with upper bounds and a counter for the number
+ * of entries in each bucket.
+ */
+template<typename T>
+impl::metric_definition_impl make_histogram(metric_name_type name,
+ T&& val, description d=description(), std::vector<label_instance> labels = {}) {
+ return {name, {impl::data_type::HISTOGRAM, "histogram"}, make_function(std::forward<T>(val), impl::data_type::HISTOGRAM), d, labels};
+}
+
+/*!
+ * \brief create a histogram metric.
+ *
+ * Histograms are a list of buckets with upper bounds and a counter for the number
+ * of entries in each bucket.
+ */
+template<typename T>
+impl::metric_definition_impl make_histogram(metric_name_type name,
+ description d, std::vector<label_instance> labels, T&& val) {
+ return {name, {impl::data_type::HISTOGRAM, "histogram"}, make_function(std::forward<T>(val), impl::data_type::HISTOGRAM), d, labels};
+}
+
+
+/*!
+ * \brief create a histogram metric.
+ *
+ * Histograms are a list of buckets with upper bounds and a counter for the number
+ * of entries in each bucket.
+ */
+template<typename T>
+impl::metric_definition_impl make_histogram(metric_name_type name,
+ description d, T&& val) {
+ return {name, {impl::data_type::HISTOGRAM, "histogram"}, make_function(std::forward<T>(val), impl::data_type::HISTOGRAM), d, {}};
+}
+
+
+/*!
+ * \brief create a total_bytes metric.
+ *
+ * total_bytes is used for ever-growing counters, like the total bytes
+ * passed on a network.
+ */
+
+template<typename T>
+impl::metric_definition_impl make_total_bytes(metric_name_type name,
+ T&& val, description d=description(), std::vector<label_instance> labels = {},
+ instance_id_type instance = impl::shard()) {
+ return make_derive(name, std::forward<T>(val), d, labels).set_type("total_bytes");
+}
+
+/*!
+ * \brief create a current_bytes metric.
+ *
+ * current_bytes is used to report a current value in bytes,
+ * for example the current free memory.
+ */
+
+template<typename T>
+impl::metric_definition_impl make_current_bytes(metric_name_type name,
+ T&& val, description d=description(), std::vector<label_instance> labels = {},
+ instance_id_type instance = impl::shard()) {
+ return make_derive(name, std::forward<T>(val), d, labels).set_type("bytes");
+}
+
+
+/*!
+ * \brief create a queue_length metric.
+ *
+ * queue_length is used to report on queue lengths.
+ */
+
+template<typename T>
+impl::metric_definition_impl make_queue_length(metric_name_type name,
+ T&& val, description d=description(), std::vector<label_instance> labels = {},
+ instance_id_type instance = impl::shard()) {
+ return make_gauge(name, std::forward<T>(val), d, labels).set_type("queue_length");
+}
+
+
+/*!
+ * \brief create a total operation metric.
+ *
+ * total_operations is used for an ever-growing operation counter.
+ */
+
+template<typename T>
+impl::metric_definition_impl make_total_operations(metric_name_type name,
+ T&& val, description d=description(), std::vector<label_instance> labels = {},
+ instance_id_type instance = impl::shard()) {
+ return make_derive(name, std::forward<T>(val), d, labels).set_type("total_operations");
+}
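
A hedged sketch (not part of the patch) mixing the convenience wrappers above in one group; the class, members and description strings are illustrative assumptions. The wrappers only differ from make_gauge()/make_derive() by the inherited type they set.

```cpp
#include <seastar/core/metrics.hh>

namespace sm = seastar::metrics;

class connection_stats {
    uint64_t _bytes_sent = 0;
    uint64_t _requests_served = 0;
    uint64_t _pending_requests = 0;
    sm::metric_groups _metrics;   // declared last so it unregisters first
public:
    connection_stats() {
        _metrics.add_group("rpc", {
            sm::make_total_bytes("sent_bytes", _bytes_sent,
                                 sm::description("Total bytes sent on this shard")),
            sm::make_total_operations("served", _requests_served,
                                      sm::description("Total requests served")),
            sm::make_queue_length("pending", _pending_requests,
                                  sm::description("Requests currently in flight")),
        });
    }
};
```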
+
+/*! @} */
+}
+}
diff --git a/src/seastar/include/seastar/core/metrics_api.hh b/src/seastar/include/seastar/core/metrics_api.hh
new file mode 100644
index 000000000..1343045f6
--- /dev/null
+++ b/src/seastar/include/seastar/core/metrics_api.hh
@@ -0,0 +1,386 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 ScyllaDB.
+ */
+
+#pragma once
+
+#include <seastar/core/metrics.hh>
+#include <unordered_map>
+#include <seastar/core/sharded.hh>
+#include <boost/functional/hash.hpp>
+/*!
+ * \file metrics_api.hh
+ * \brief header file for the metric API layer (like prometheus or collectd)
+ *
+ *
+ *
+ */
+namespace seastar {
+namespace metrics {
+namespace impl {
+
+using labels_type = std::map<sstring, sstring>;
+}
+}
+}
+
+namespace std {
+
+template<>
+struct hash<seastar::metrics::impl::labels_type> {
+ using argument_type = seastar::metrics::impl::labels_type;
+ using result_type = ::std::size_t;
+ result_type operator()(argument_type const& s) const {
+ result_type h = 0;
+ for (auto&& i : s) {
+ boost::hash_combine(h, std::hash<seastar::sstring>{}(i.second));
+ }
+ return h;
+ }
+};
+
+}
+
+namespace seastar {
+namespace metrics {
+namespace impl {
+
+/**
+ * Metrics are collected in groups that belong to some logical entity.
+ * For example, different measurements of the cpu will belong to the group "cpu".
+ *
+ * Name is the metric name like used_objects or used_bytes
+ *
+ * Inherit type allows customizing one of the basic types (gauge, counter, derive).
+ *
+ * Instance_id is used to differentiate multiple instances of the metrics.
+ * In the seastar environment it is typical to have a metric per shard.
+ *
+ */
+
+class metric_id {
+public:
+ metric_id() = default;
+ metric_id(group_name_type group, metric_name_type name,
+ labels_type labels = {})
+ : _group(std::move(group)), _name(
+ std::move(name)), _labels(labels) {
+ }
+ metric_id(metric_id &&) = default;
+ metric_id(const metric_id &) = default;
+
+ metric_id & operator=(metric_id &&) = default;
+ metric_id & operator=(const metric_id &) = default;
+
+ const group_name_type & group_name() const {
+ return _group;
+ }
+ void group_name(const group_name_type & name) {
+ _group = name;
+ }
+ const instance_id_type & instance_id() const {
+ return _labels.at(shard_label.name());
+ }
+ const metric_name_type & name() const {
+ return _name;
+ }
+ const labels_type& labels() const {
+ return _labels;
+ }
+ sstring full_name() const;
+
+ bool operator<(const metric_id&) const;
+ bool operator==(const metric_id&) const;
+private:
+ auto as_tuple() const {
+ return std::tie(group_name(), instance_id(), name(), labels());
+ }
+ group_name_type _group;
+ metric_name_type _name;
+ labels_type _labels;
+};
+}
+}
+}
+
+namespace std {
+
+template<>
+struct hash<seastar::metrics::impl::metric_id>
+{
+ typedef seastar::metrics::impl::metric_id argument_type;
+ typedef ::std::size_t result_type;
+ result_type operator()(argument_type const& s) const
+ {
+ result_type const h1 ( std::hash<seastar::sstring>{}(s.group_name()) );
+ result_type const h2 ( std::hash<seastar::sstring>{}(s.instance_id()) );
+ return h1 ^ (h2 << 1); // or use boost::hash_combine
+ }
+};
+
+}
+
+namespace seastar {
+namespace metrics {
+namespace impl {
+
+/*!
+ * \brief holds metadata information of a metric family
+ *
+ * Holds the information that is shared between all metrics
+ * that belongs to the same metric_family
+ */
+struct metric_family_info {
+ data_type type;
+ metric_type_def inherit_type;
+ description d;
+ sstring name;
+};
+
+
+/*!
+ * \brief holds metric metadata
+ */
+struct metric_info {
+ metric_id id;
+ bool enabled;
+};
+
+
+using metrics_registration = std::vector<metric_id>;
+
+class metric_groups_impl : public metric_groups_def {
+ metrics_registration _registration;
+public:
+ metric_groups_impl() = default;
+ ~metric_groups_impl();
+ metric_groups_impl(const metric_groups_impl&) = delete;
+ metric_groups_impl(metric_groups_impl&&) = default;
+ metric_groups_impl& add_metric(group_name_type name, const metric_definition& md);
+ metric_groups_impl& add_group(group_name_type name, const std::initializer_list<metric_definition>& l);
+ metric_groups_impl& add_group(group_name_type name, const std::vector<metric_definition>& l);
+};
+
+class impl;
+
+class registered_metric {
+ metric_info _info;
+ metric_function _f;
+ shared_ptr<impl> _impl;
+public:
+ registered_metric(metric_id id, metric_function f, bool enabled=true);
+ virtual ~registered_metric() {}
+ virtual metric_value operator()() const {
+ return _f();
+ }
+
+ bool is_enabled() const {
+ return _info.enabled;
+ }
+
+ void set_enabled(bool b) {
+ _info.enabled = b;
+ }
+
+ const metric_id& get_id() const {
+ return _info.id;
+ }
+
+ const metric_info& info() const {
+ return _info;
+ }
+ metric_function& get_function() {
+ return _f;
+ }
+};
+
+using register_ref = shared_ptr<registered_metric>;
+using metric_instances = std::map<labels_type, register_ref>;
+
+class metric_family {
+ metric_instances _instances;
+ metric_family_info _info;
+public:
+ using iterator = metric_instances::iterator;
+ using const_iterator = metric_instances::const_iterator;
+
+ metric_family() = default;
+ metric_family(const metric_family&) = default;
+ metric_family(const metric_instances& instances) : _instances(instances) {
+ }
+ metric_family(const metric_instances& instances, const metric_family_info& info) : _instances(instances), _info(info) {
+ }
+ metric_family(metric_instances&& instances, metric_family_info&& info) : _instances(std::move(instances)), _info(std::move(info)) {
+ }
+ metric_family(metric_instances&& instances) : _instances(std::move(instances)) {
+ }
+
+ register_ref& operator[](const labels_type& l) {
+ return _instances[l];
+ }
+
+ const register_ref& at(const labels_type& l) const {
+ return _instances.at(l);
+ }
+
+ metric_family_info& info() {
+ return _info;
+ }
+
+ const metric_family_info& info() const {
+ return _info;
+ }
+
+ iterator find(const labels_type& l) {
+ return _instances.find(l);
+ }
+
+ const_iterator find(const labels_type& l) const {
+ return _instances.find(l);
+ }
+
+ iterator begin() {
+ return _instances.begin();
+ }
+
+ const_iterator begin() const {
+ return _instances.cbegin();
+ }
+
+ iterator end() {
+ return _instances.end();
+ }
+
+ bool empty() const {
+ return _instances.empty();
+ }
+
+ iterator erase(const_iterator position) {
+ return _instances.erase(position);
+ }
+
+ const_iterator end() const {
+ return _instances.cend();
+ }
+
+ uint32_t size() const {
+ return _instances.size();
+ }
+
+};
+
+using value_map = std::map<sstring, metric_family>;
+
+using metric_metadata_vector = std::vector<metric_info>;
+
+/*!
+ * \brief holds a metric family metadata
+ *
+ * The metadata of a metric family is composed of the
+ * metadata of the family and a vector of the metadata for
+ * each of the metrics.
+ */
+struct metric_family_metadata {
+ metric_family_info mf;
+ metric_metadata_vector metrics;
+};
+
+using value_vector = std::vector<metric_value>;
+using metric_metadata = std::vector<metric_family_metadata>;
+using metric_values = std::vector<value_vector>;
+
+struct values_copy {
+ shared_ptr<metric_metadata> metadata;
+ metric_values values;
+};
+
+struct config {
+ sstring hostname;
+};
+
+class impl {
+ value_map _value_map;
+ config _config;
+ bool _dirty = true;
+ shared_ptr<metric_metadata> _metadata;
+ std::vector<std::vector<metric_function>> _current_metrics;
+public:
+ value_map& get_value_map() {
+ return _value_map;
+ }
+
+ const value_map& get_value_map() const {
+ return _value_map;
+ }
+
+ void add_registration(const metric_id& id, const metric_type& type, metric_function f, const description& d, bool enabled);
+ void remove_registration(const metric_id& id);
+ future<> stop() {
+ return make_ready_future<>();
+ }
+ const config& get_config() const {
+ return _config;
+ }
+ void set_config(const config& c) {
+ _config = c;
+ }
+
+ shared_ptr<metric_metadata> metadata();
+
+ std::vector<std::vector<metric_function>>& functions();
+
+ void update_metrics_if_needed();
+
+ void dirty() {
+ _dirty = true;
+ }
+};
+
+const value_map& get_value_map();
+using values_reference = shared_ptr<values_copy>;
+
+foreign_ptr<values_reference> get_values();
+
+shared_ptr<impl> get_local_impl();
+
+void unregister_metric(const metric_id & id);
+
+/*!
+ * \brief initialize metric group
+ *
+ * Create a metric_groups_def.
+ * No need to use it directly.
+ */
+std::unique_ptr<metric_groups_def> create_metric_groups();
+
+}
+/*!
+ * \brief set the metrics configuration
+ */
+future<> configure(const boost::program_options::variables_map & opts);
+
+/*!
+ * \brief get the metrics configuration description
+ */
+
+boost::program_options::options_description get_options_description();
+
+}
+}
diff --git a/src/seastar/include/seastar/core/metrics_registration.hh b/src/seastar/include/seastar/core/metrics_registration.hh
new file mode 100644
index 000000000..6f57b708b
--- /dev/null
+++ b/src/seastar/include/seastar/core/metrics_registration.hh
@@ -0,0 +1,173 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 ScyllaDB.
+ */
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include <seastar/core/sstring.hh>
+
+/*!
+ * \file metrics_registration.hh
+ * \brief holds the metric_groups definition needed by class that reports metrics
+ *
+ * If class A needs to report metrics,
+ * typically you include metrics_registration.hh in A's header file and add to A:
+ * * a metric_groups _metrics member
+ * * a setup_metrics() method that would be called in the constructor.
+ * \code
+ * class A {
+ *   metric_groups _metrics;
+ *
+ * void setup_metrics();
+ *
+ * };
+ * \endcode
+ * To define the metrics, include metrics.hh in your source file.
+ * @see metrics.hh for the definition for adding a metric.
+ */
+
+namespace seastar {
+
+namespace metrics {
+
+namespace impl {
+class metric_groups_def;
+struct metric_definition_impl;
+class metric_groups_impl;
+}
+
+using group_name_type = sstring; /*!< A group of logically related metrics */
+class metric_groups;
+
+class metric_definition {
+ std::unique_ptr<impl::metric_definition_impl> _impl;
+public:
+ metric_definition(const impl::metric_definition_impl& impl) noexcept;
+ metric_definition(metric_definition&& m) noexcept;
+ ~metric_definition();
+ friend metric_groups;
+ friend impl::metric_groups_impl;
+};
+
+class metric_group_definition {
+public:
+ group_name_type name;
+ std::initializer_list<metric_definition> metrics;
+ metric_group_definition(const group_name_type& name, std::initializer_list<metric_definition> l);
+ metric_group_definition(const metric_group_definition&) = delete;
+ ~metric_group_definition();
+};
+
+/*!
+ * metric_groups
+ * \brief holds the metric definition.
+ *
+ * Add multiple metric groups definitions.
+ * Initialization can be done in the constructor or with a call to add_group
+ * @see metrics.hh for example and supported metrics
+ */
+class metric_groups {
+ std::unique_ptr<impl::metric_groups_def> _impl;
+public:
+ metric_groups() noexcept;
+ metric_groups(metric_groups&&) = default;
+ virtual ~metric_groups();
+ metric_groups& operator=(metric_groups&&) = default;
+ /*!
+     * \brief add metrics belonging to the same group in the constructor.
+     *
+     * Combines the constructor with the add_group functionality.
+ */
+ metric_groups(std::initializer_list<metric_group_definition> mg);
+
+ /*!
+ * \brief Add metrics belonging to the same group.
+ *
+ * Use the metrics creation functions to add metrics.
+ *
+ * For example:
+ * _metrics.add_group("my_group", {
+ * make_counter("my_counter_name1", counter, description("my counter description")),
+ * make_counter("my_counter_name2", counter, description("my second counter description")),
+ * make_gauge("my_gauge_name1", gauge, description("my gauge description")),
+ * });
+ *
+ * Metric name should be unique inside the group.
+ * You can chain add_group calls like:
+ * _metrics.add_group("my group1", {...}).add_group("my group2", {...});
+ *
+ * This overload (with initializer_list) is needed because metric_definition
+ * has no copy constructor, so the other overload (with vector) cannot be
+ * invoked on a braced-init-list.
+ */
+ metric_groups& add_group(const group_name_type& name, const std::initializer_list<metric_definition>& l);
+
+ /*!
+ * \brief Add metrics belonging to the same group.
+ *
+ * Use the metrics creation functions to add metrics.
+ *
+ * For example:
+ * vector<metric_definition> v;
+ * v.push_back(make_counter("my_counter_name1", counter, description("my counter description")));
+ * v.push_back(make_counter("my_counter_name2", counter, description("my second counter description")));
+ * v.push_back(make_gauge("my_gauge_name1", gauge, description("my gauge description")));
+ * _metrics.add_group("my_group", v);
+ *
+ * Metric name should be unique inside the group.
+ * You can chain add_group calls like:
+ * _metrics.add_group("my group1", vec1).add_group("my group2", vec2);
+ */
+ metric_groups& add_group(const group_name_type& name, const std::vector<metric_definition>& l);
+
+ /*!
+ * \brief clear all metrics groups registrations.
+ */
+ void clear();
+};
+
+
+/*!
+ * \brief holds a single metric group
+ * Initialization is done in the constructor or
+ * with a call to add_group
+ */
+class metric_group : public metric_groups {
+public:
+ metric_group() noexcept;
+ metric_group(const metric_group&) = delete;
+ metric_group(metric_group&&) = default;
+ virtual ~metric_group();
+ metric_group& operator=(metric_group&&) = default;
+
+ /*!
+     * \brief add metrics belonging to the same group in the constructor.
+ *
+ *
+ */
+ metric_group(const group_name_type& name, std::initializer_list<metric_definition> l);
+};
+
+
+}
+}
diff --git a/src/seastar/include/seastar/core/metrics_types.hh b/src/seastar/include/seastar/core/metrics_types.hh
new file mode 100644
index 000000000..13d79cb21
--- /dev/null
+++ b/src/seastar/include/seastar/core/metrics_types.hh
@@ -0,0 +1,83 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 ScyllaDB
+ */
+
+#pragma once
+#include <vector>
+
+namespace seastar {
+namespace metrics {
+
+
+/*!
+ * \brief Histogram bucket type
+ *
+ * A histogram bucket contains an upper bound and the number
+ * of events in the bucket.
+ */
+struct histogram_bucket {
+ uint64_t count = 0; // number of events.
+ double upper_bound = 0; // Inclusive.
+};
+
+
+/*!
+ * \brief Histogram data type
+ *
+ * The histogram struct is a container for histogram values.
+ * It is not a histogram implementation, but it is used by histogram
+ * implementations to return their internal values.
+ */
+struct histogram {
+ uint64_t sample_count = 0;
+ double sample_sum = 0;
+ std::vector<histogram_bucket> buckets; // Ordered in increasing order of upper_bound, +Inf bucket is optional.
+
+ /*!
+     * \brief Addition-assign a histogram
+     *
+     * The histograms' bucket upper bounds must match
+     * or an exception will be thrown
+ */
+ histogram& operator+=(const histogram& h);
+
+ /*!
+     * \brief Add histograms
+     *
+     * Add two histograms and return the result as a new histogram.
+     * The histograms' bucket upper bounds must match
+     * or an exception will be thrown
+ */
+ histogram operator+(const histogram& h) const;
+
+ /*!
+     * \brief Add histograms
+     *
+     * Add two histograms and return the result as a new histogram.
+     * The histograms' bucket upper bounds must match
+     * or an exception will be thrown
+ */
+ histogram operator+(histogram&& h) const;
+
+};
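
A hedged sketch (not part of the patch) of a producer for this struct: a tiny fixed-bound latency recorder whose to_histogram() result could be returned from a make_histogram() callable. Cumulative bucket counts (each bucket counting all samples at or below its upper bound) are an assumption of this sketch.

```cpp
#include <seastar/core/metrics_types.hh>
#include <array>
#include <cstddef>

class latency_recorder {
    static constexpr std::array<double, 3> _bounds{{0.001, 0.01, 0.1}}; // seconds
    std::array<uint64_t, 3> _counts{};
    uint64_t _total = 0;
    double _sum = 0;
public:
    void record(double seconds) {
        ++_total;
        _sum += seconds;
        for (std::size_t i = 0; i < _bounds.size(); ++i) {
            if (seconds <= _bounds[i]) {
                ++_counts[i];   // cumulative: one increment per bound >= sample
            }
        }
    }
    seastar::metrics::histogram to_histogram() const {
        seastar::metrics::histogram h;
        h.sample_count = _total;
        h.sample_sum = _sum;
        for (std::size_t i = 0; i < _bounds.size(); ++i) {
            h.buckets.push_back({_counts[i], _bounds[i]});   // {count, upper_bound}
        }
        return h;
    }
};
```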
+
+}
+
+}
diff --git a/src/seastar/include/seastar/core/on_internal_error.hh b/src/seastar/include/seastar/core/on_internal_error.hh
new file mode 100644
index 000000000..5c56a9c45
--- /dev/null
+++ b/src/seastar/include/seastar/core/on_internal_error.hh
@@ -0,0 +1,56 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2020 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/util/std-compat.hh>
+
+namespace seastar {
+
+class logger;
+
+/// Controls whether on_internal_error() aborts or throws. The default
+/// is to throw.
+void set_abort_on_internal_error(bool do_abort);
+
+/// Report an internal error
+///
+/// Depending on the value passed to set_abort_on_internal_error, this
+/// will either log to \p logger and abort or throw a std::runtime_error.
+[[noreturn]] void on_internal_error(logger& logger, std::string_view reason);
+
+/// Report an internal error
+///
+/// Depending on the value passed to set_abort_on_internal_error, this
+/// will either log to \p logger and abort or throw the passed-in
+/// \p ex.
+/// This overload cannot attach a backtrace to the exception, so if the
+/// caller wishes to have one attached they have to do it themselves.
+[[noreturn]] void on_internal_error(logger& logger, std::exception_ptr ex);
+
+/// Report an internal error in a noexcept context
+///
+/// The error will be logged to \p logger and, if set_abort_on_internal_error
+/// was set to true, the program will be aborted. This overload can be used
+/// in noexcept contexts like destructors or noexcept functions.
+void on_internal_error_noexcept(logger& logger, std::string_view reason) noexcept;
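+
+// Illustrative sketch (assumed usage; `my_logger` is a hypothetical
+// seastar::logger instance defined elsewhere):
+//
+//   static seastar::logger my_logger("my-module");
+//
+//   void check_invariant(bool ok) {
+//       if (!ok) {
+//           seastar::on_internal_error(my_logger, "invariant violated");
+//       }
+//   }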
+
+}
diff --git a/src/seastar/include/seastar/core/pipe.hh b/src/seastar/include/seastar/core/pipe.hh
new file mode 100644
index 000000000..d69484384
--- /dev/null
+++ b/src/seastar/include/seastar/core/pipe.hh
@@ -0,0 +1,267 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/queue.hh>
+
+#include <seastar/util/std-compat.hh>
+
+/// \defgroup fiber-module Fibers
+///
+/// \brief Fibers of execution
+///
+/// Seastar continuations are normally short, but often chained to one
+/// another, so that one continuation does a bit of work and then schedules
+/// another continuation for later. Such chains can be long, and often even
+/// involve loops - see, for example, \ref repeat. We call such chains
+/// "fibers" of execution.
+///
+/// These fibers are not threads - each is just a string of continuations -
+/// but they share some common requirements with traditional threads.
+/// For example, we want to avoid one fiber getting starved while a second
+/// fiber continuously runs its continuations one after another.
+/// As another example, fibers may want to communicate - e.g., one fiber
+/// produces data that a second fiber consumes, and we wish to ensure that
+/// both fibers get a chance to run, and that if one stops prematurely,
+/// the other doesn't hang forever.
+///
+/// Consult the following table to see which APIs are useful for fiber tasks:
+///
+/// Task | APIs
+/// -----------------------------------------------|-------------------
+/// Repeat a blocking task indefinitely | \ref keep_doing()
+/// Repeat a blocking task, then exit | \ref repeat(), \ref do_until()
+/// Provide mutual exclusion between two tasks | \ref semaphore, \ref shared_mutex
+/// Pass a stream of data between two fibers | \ref seastar::pipe
+/// Safely shut down a resource | \ref seastar::gate
+/// Hold on to an object while a fiber is running | \ref do_with()
+///
+
+/// Seastar API namespace
+namespace seastar {
+
+/// \addtogroup fiber-module
+/// @{
+
+class broken_pipe_exception : public std::exception {
+public:
+ virtual const char* what() const noexcept {
+ return "Broken pipe";
+ }
+};
+
+class unread_overflow_exception : public std::exception {
+public:
+ virtual const char* what() const noexcept {
+ return "pipe_reader::unread() overflow";
+ }
+};
+
+/// \cond internal
+namespace internal {
+template <typename T>
+class pipe_buffer {
+private:
+ queue<std::optional<T>> _buf;
+ bool _read_open = true;
+ bool _write_open = true;
+public:
+ pipe_buffer(size_t size) : _buf(size) {}
+ future<std::optional<T>> read() {
+ return _buf.pop_eventually();
+ }
+ future<> write(T&& data) {
+ return _buf.push_eventually(std::move(data));
+ }
+ bool readable() const {
+ return _write_open || !_buf.empty();
+ }
+ bool writeable() const {
+ return _read_open;
+ }
+ bool close_read() {
+ // If a writer is blocked (on a full queue), we need to stop it.
+ if (_buf.full()) {
+ _buf.abort(std::make_exception_ptr(broken_pipe_exception()));
+ }
+ _read_open = false;
+ return !_write_open;
+ }
+ bool close_write() {
+ // If the queue is empty, write the EOF (disengaged optional) to the
+ // queue to wake a blocked reader. If the queue is not empty, there is
+ // no need to write the EOF to the queue - the reader will return an
+ // EOF when it sees that _write_open == false.
+ if (_buf.empty()) {
+ _buf.push({});
+ }
+ _write_open = false;
+ return !_read_open;
+ }
+};
+} // namespace internal
+/// \endcond
+
+template <typename T>
+class pipe;
+
+/// \brief Read side of a \ref seastar::pipe
+///
+/// The read side of a pipe, which allows only reading from the pipe.
+/// A pipe_reader object cannot be created separately, but only as part of a
+/// reader/writer pair through \ref seastar::pipe.
+template <typename T>
+class pipe_reader {
+private:
+ internal::pipe_buffer<T> *_bufp;
+ std::optional<T> _unread;
+ pipe_reader(internal::pipe_buffer<T> *bufp) : _bufp(bufp) { }
+ friend class pipe<T>;
+public:
+ /// \brief Read next item from the pipe
+ ///
+ /// Returns a future value, which is fulfilled when the pipe's buffer
+ /// becomes non-empty, or the write side is closed. The value returned
+ /// is an optional<T>, which is disengaged to mark an end of file
+ /// (i.e., the write side was closed, and we've read everything it sent).
+ future<std::optional<T>> read() {
+ if (_unread) {
+ auto ret = std::move(*_unread);
+ _unread = {};
+ return make_ready_future<std::optional<T>>(std::move(ret));
+ }
+ if (_bufp->readable()) {
+ return _bufp->read();
+ } else {
+ return make_ready_future<std::optional<T>>();
+ }
+ }
+ /// \brief Return an item to the front of the pipe
+ ///
+ /// Pushes the given item to the front of the pipe, so it will be
+ /// returned by the next read() call. The typical use case is to
+ /// unread() the last item returned by read().
+ /// More generally, it is legal to unread() any item, not just one
+ /// previously returned by read(), but note that unread() is limited
+ /// to just one item - two calls to unread() without an intervening call
+ /// to read() will cause an exception.
+ void unread(T&& item) {
+ if (_unread) {
+ throw unread_overflow_exception();
+ }
+ _unread = std::move(item);
+ }
+ ~pipe_reader() {
+ if (_bufp && _bufp->close_read()) {
+ delete _bufp;
+ }
+ }
+ // Allow move, but not copy, of pipe_reader
+ pipe_reader(pipe_reader&& other) : _bufp(other._bufp) {
+ other._bufp = nullptr;
+ }
+ pipe_reader& operator=(pipe_reader&& other) {
+ std::swap(_bufp, other._bufp);
+ return *this;
+ }
+};
+
+/// \brief Write side of a \ref seastar::pipe
+///
+/// The write side of a pipe, which allows only writing to the pipe.
+/// A pipe_writer object cannot be created separately, but only as part of a
+/// reader/writer pair through \ref seastar::pipe.
+template <typename T>
+class pipe_writer {
+private:
+ internal::pipe_buffer<T> *_bufp;
+ pipe_writer(internal::pipe_buffer<T> *bufp) : _bufp(bufp) { }
+ friend class pipe<T>;
+public:
+ /// \brief Write an item to the pipe
+ ///
+ /// Returns a future value, which is fulfilled when the data was written
+ /// to the buffer (when it becomes non-full). If the data could not be
+ /// written because the read side was closed, an exception
+ /// \ref broken_pipe_exception is returned in the future.
+ future<> write(T&& data) {
+ if (_bufp->writeable()) {
+ return _bufp->write(std::move(data));
+ } else {
+ return make_exception_future<>(broken_pipe_exception());
+ }
+ }
+ ~pipe_writer() {
+ if (_bufp && _bufp->close_write()) {
+ delete _bufp;
+ }
+ }
+ // Allow move, but not copy, of pipe_writer
+ pipe_writer(pipe_writer&& other) : _bufp(other._bufp) {
+ other._bufp = nullptr;
+ }
+ pipe_writer& operator=(pipe_writer&& other) {
+ std::swap(_bufp, other._bufp);
+ return *this;
+ }
+};
+
+/// \brief A fixed-size pipe for communicating between two fibers.
+///
+/// A pipe<T> is a mechanism to transfer data between two fibers, one
+/// producing data, and the other consuming it. The fixed-size buffer also
+/// ensures a balanced execution of the two fibers, because the producer
+/// fiber blocks when it writes to a full pipe, until the consumer fiber gets
+/// to run and read from the pipe.
+///
+/// A pipe<T> resembles a Unix pipe, in that it has a read side, a write side,
+/// and a fixed-size buffer between them, and allows either end to be closed
+/// independently (resulting in EOF or a broken pipe when the other side is used).
+/// A pipe<T> object holds the read and write sides of the pipe as two
+/// separate objects. These objects can be moved into two different fibers.
+/// Importantly, if one of the pipe ends is destroyed (i.e., the continuations
+/// capturing it end), the other end of the pipe will stop blocking, so the
+/// other fiber will not hang.
+///
+/// The pipe's read and write interfaces are future-based blocking. I.e., the
+/// write() and read() methods return a future which is fulfilled when the
+/// operation is complete. The pipe is single-reader single-writer, meaning
+/// that until the future returned by read() is fulfilled, read() must not be
+/// called again (and same for write).
+///
+/// Note: The pipe reader and writer are movable, but *not* copyable. It is
+/// often convenient to wrap each end in a shared pointer, so it can be
+/// copied (e.g., used in an std::function which needs to be copyable) or
+/// easily captured into multiple continuations.
+template <typename T>
+class pipe {
+public:
+ pipe_reader<T> reader;
+ pipe_writer<T> writer;
+ explicit pipe(size_t size) : pipe(new internal::pipe_buffer<T>(size)) { }
+private:
+ pipe(internal::pipe_buffer<T> *bufp) : reader(bufp), writer(bufp) { }
+};
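+
+// Illustrative sketch (assumed usage; the capacity and values are arbitrary
+// examples): one fiber writes into the pipe, another drains it until EOF.
+//
+//   seastar::pipe<int> p(10);
+//
+//   auto producer = seastar::do_with(std::move(p.writer), [] (auto& w) {
+//       return w.write(1).then([&w] {
+//           return w.write(2);
+//       });
+//   });   // writer destroyed when done -> reader sees EOF
+//
+//   auto consumer = seastar::do_with(std::move(p.reader), [] (auto& r) {
+//       return seastar::repeat([&r] {
+//           return r.read().then([] (std::optional<int> v) {
+//               return v ? seastar::stop_iteration::no : seastar::stop_iteration::yes;
+//           });
+//       });
+//   });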
+
+
+/// @}
+
+} // namespace seastar
diff --git a/src/seastar/include/seastar/core/polymorphic_temporary_buffer.hh b/src/seastar/include/seastar/core/polymorphic_temporary_buffer.hh
new file mode 100644
index 000000000..437bba47f
--- /dev/null
+++ b/src/seastar/include/seastar/core/polymorphic_temporary_buffer.hh
@@ -0,0 +1,43 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2019 Elazar Leibovich
+ */
+
+#pragma once
+
+#include <seastar/core/temporary_buffer.hh>
+#include <seastar/util/std-compat.hh>
+
+namespace seastar {
+
+/// Creates a `temporary_buffer` allocated by a custom allocator
+///
+/// \param allocator allocator to use when allocating the temporary_buffer
+/// \param size size of the temporary buffer
+template <typename CharType>
+temporary_buffer<CharType> make_temporary_buffer(std::pmr::polymorphic_allocator<CharType>* allocator, std::size_t size) {
+ if (allocator == memory::malloc_allocator) {
+ return temporary_buffer<CharType>(size);
+ }
+ CharType *buffer = allocator->allocate(size);
+ return temporary_buffer<CharType>(buffer, size,
+ make_deleter(deleter(), [allocator, buffer, size] () mutable { allocator->deallocate(buffer, size); }));
+}
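+
+// Illustrative sketch (assumed usage; the size is an arbitrary example). Note
+// that the allocator must outlive the returned buffer, since the deleter keeps
+// a pointer to it.
+//
+//   std::pmr::polymorphic_allocator<char> alloc(std::pmr::get_default_resource());
+//   seastar::temporary_buffer<char> buf = seastar::make_temporary_buffer<char>(&alloc, 4096);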
+
+}
diff --git a/src/seastar/include/seastar/core/posix.hh b/src/seastar/include/seastar/core/posix.hh
new file mode 100644
index 000000000..f8dece37c
--- /dev/null
+++ b/src/seastar/include/seastar/core/posix.hh
@@ -0,0 +1,492 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/core/sstring.hh>
+#include "abort_on_ebadf.hh"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <assert.h>
+#include <utility>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+#include <sys/timerfd.h>
+#include <sys/socket.h>
+#include <sys/epoll.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <system_error>
+#include <pthread.h>
+#include <signal.h>
+#include <memory>
+#include <chrono>
+#include <sys/uio.h>
+
+#include <seastar/net/socket_defs.hh>
+#include <seastar/util/std-compat.hh>
+
+namespace seastar {
+
+/// \file
+/// \defgroup posix-support POSIX Support
+///
+/// Mostly-internal APIs that provide C++ glue for the underlying POSIX platform;
+/// they can also be used by the application when they don't block.
+///
+/// \addtogroup posix-support
+/// @{
+
+inline void throw_system_error_on(bool condition, const char* what_arg = "");
+
+template <typename T>
+inline void throw_kernel_error(T r);
+
+struct mmap_deleter {
+ size_t _size;
+ void operator()(void* ptr) const;
+};
+
+using mmap_area = std::unique_ptr<char[], mmap_deleter>;
+
+mmap_area mmap_anonymous(void* addr, size_t length, int prot, int flags);
+
+class file_desc {
+ int _fd;
+public:
+ file_desc() = delete;
+ file_desc(const file_desc&) = delete;
+ file_desc(file_desc&& x) noexcept : _fd(x._fd) { x._fd = -1; }
+ ~file_desc() { if (_fd != -1) { ::close(_fd); } }
+ void operator=(const file_desc&) = delete;
+ file_desc& operator=(file_desc&& x) {
+ if (this != &x) {
+ std::swap(_fd, x._fd);
+ if (x._fd != -1) {
+ x.close();
+ }
+ }
+ return *this;
+ }
+ void close() {
+ assert(_fd != -1);
+ auto r = ::close(_fd);
+ throw_system_error_on(r == -1, "close");
+ _fd = -1;
+ }
+ int get() const { return _fd; }
+
+ static file_desc from_fd(int fd) {
+ return file_desc(fd);
+ }
+
+ static file_desc open(sstring name, int flags, mode_t mode = 0) {
+ int fd = ::open(name.c_str(), flags, mode);
+ throw_system_error_on(fd == -1, "open");
+ return file_desc(fd);
+ }
+ static file_desc socket(int family, int type, int protocol = 0) {
+ int fd = ::socket(family, type, protocol);
+ throw_system_error_on(fd == -1, "socket");
+ return file_desc(fd);
+ }
+ static file_desc eventfd(unsigned initval, int flags) {
+ int fd = ::eventfd(initval, flags);
+ throw_system_error_on(fd == -1, "eventfd");
+ return file_desc(fd);
+ }
+ static file_desc epoll_create(int flags = 0) {
+ int fd = ::epoll_create1(flags);
+ throw_system_error_on(fd == -1, "epoll_create1");
+ return file_desc(fd);
+ }
+ static file_desc timerfd_create(int clockid, int flags) {
+ int fd = ::timerfd_create(clockid, flags);
+ throw_system_error_on(fd == -1, "timerfd_create");
+ return file_desc(fd);
+ }
+ static file_desc temporary(sstring directory);
+ file_desc dup() const {
+ int fd = ::dup(get());
+ throw_system_error_on(fd == -1, "dup");
+ return file_desc(fd);
+ }
+ file_desc accept(socket_address& sa, int flags = 0) {
+ auto ret = ::accept4(_fd, &sa.as_posix_sockaddr(), &sa.addr_length, flags);
+ throw_system_error_on(ret == -1, "accept4");
+ return file_desc(ret);
+ }
+ static file_desc inotify_init(int flags);
+ // return nullopt if no connection is available to be accepted
+ std::optional<file_desc> try_accept(socket_address& sa, int flags = 0) {
+ auto ret = ::accept4(_fd, &sa.as_posix_sockaddr(), &sa.addr_length, flags);
+ if (ret == -1 && errno == EAGAIN) {
+ return {};
+ }
+ throw_system_error_on(ret == -1, "accept4");
+ return file_desc(ret);
+ }
+ void shutdown(int how) {
+ auto ret = ::shutdown(_fd, how);
+ if (ret == -1 && errno != ENOTCONN) {
+ throw_system_error_on(ret == -1, "shutdown");
+ }
+ }
+ void truncate(size_t size) {
+ auto ret = ::ftruncate(_fd, size);
+ throw_system_error_on(ret, "ftruncate");
+ }
+ int ioctl(int request) {
+ return ioctl(request, 0);
+ }
+ int ioctl(int request, int value) {
+ int r = ::ioctl(_fd, request, value);
+ throw_system_error_on(r == -1, "ioctl");
+ return r;
+ }
+ int ioctl(int request, unsigned int value) {
+ int r = ::ioctl(_fd, request, value);
+ throw_system_error_on(r == -1, "ioctl");
+ return r;
+ }
+ template <class X>
+ int ioctl(int request, X& data) {
+ int r = ::ioctl(_fd, request, &data);
+ throw_system_error_on(r == -1, "ioctl");
+ return r;
+ }
+ template <class X>
+ int ioctl(int request, X&& data) {
+ int r = ::ioctl(_fd, request, &data);
+ throw_system_error_on(r == -1, "ioctl");
+ return r;
+ }
+ template <class X>
+ int setsockopt(int level, int optname, X&& data) {
+ int r = ::setsockopt(_fd, level, optname, &data, sizeof(data));
+ throw_system_error_on(r == -1, "setsockopt");
+ return r;
+ }
+ int setsockopt(int level, int optname, const char* data) {
+ int r = ::setsockopt(_fd, level, optname, data, strlen(data) + 1);
+ throw_system_error_on(r == -1, "setsockopt");
+ return r;
+ }
+ int setsockopt(int level, int optname, const void* data, socklen_t len) {
+ int r = ::setsockopt(_fd, level, optname, data, len);
+ throw_system_error_on(r == -1, "setsockopt");
+ return r;
+ }
+ template <typename Data>
+ Data getsockopt(int level, int optname) {
+ Data data;
+ socklen_t len = sizeof(data);
+ memset(&data, 0, len);
+ int r = ::getsockopt(_fd, level, optname, &data, &len);
+ throw_system_error_on(r == -1, "getsockopt");
+ return data;
+ }
+ int getsockopt(int level, int optname, char* data, socklen_t len) {
+ int r = ::getsockopt(_fd, level, optname, data, &len);
+ throw_system_error_on(r == -1, "getsockopt");
+ return r;
+ }
+ size_t size() {
+ struct stat buf;
+ auto r = ::fstat(_fd, &buf);
+ throw_system_error_on(r == -1, "fstat");
+ return buf.st_size;
+ }
+ std::optional<size_t> read(void* buffer, size_t len) {
+ auto r = ::read(_fd, buffer, len);
+ if (r == -1 && errno == EAGAIN) {
+ return {};
+ }
+ throw_system_error_on(r == -1, "read");
+ return { size_t(r) };
+ }
+ std::optional<ssize_t> recv(void* buffer, size_t len, int flags) {
+ auto r = ::recv(_fd, buffer, len, flags);
+ if (r == -1 && errno == EAGAIN) {
+ return {};
+ }
+ throw_system_error_on(r == -1, "recv");
+ return { ssize_t(r) };
+ }
+ std::optional<size_t> recvmsg(msghdr* mh, int flags) {
+ auto r = ::recvmsg(_fd, mh, flags);
+ if (r == -1 && errno == EAGAIN) {
+ return {};
+ }
+ throw_system_error_on(r == -1, "recvmsg");
+ return { size_t(r) };
+ }
+ std::optional<size_t> send(const void* buffer, size_t len, int flags) {
+ auto r = ::send(_fd, buffer, len, flags);
+ if (r == -1 && errno == EAGAIN) {
+ return {};
+ }
+ throw_system_error_on(r == -1, "send");
+ return { size_t(r) };
+ }
+ std::optional<size_t> sendto(socket_address& addr, const void* buf, size_t len, int flags) {
+ auto r = ::sendto(_fd, buf, len, flags, &addr.u.sa, addr.length());
+ if (r == -1 && errno == EAGAIN) {
+ return {};
+ }
+ throw_system_error_on(r == -1, "sendto");
+ return { size_t(r) };
+ }
+ std::optional<size_t> sendmsg(const msghdr* msg, int flags) {
+ auto r = ::sendmsg(_fd, msg, flags);
+ if (r == -1 && errno == EAGAIN) {
+ return {};
+ }
+ throw_system_error_on(r == -1, "sendmsg");
+ return { size_t(r) };
+ }
+ void bind(sockaddr& sa, socklen_t sl) {
+ auto r = ::bind(_fd, &sa, sl);
+ throw_system_error_on(r == -1, "bind");
+ }
+ void connect(sockaddr& sa, socklen_t sl) {
+ auto r = ::connect(_fd, &sa, sl);
+ if (r == -1 && errno == EINPROGRESS) {
+ return;
+ }
+ throw_system_error_on(r == -1, "connect");
+ }
+ socket_address get_address() {
+ socket_address addr;
+ auto r = ::getsockname(_fd, &addr.u.sa, &addr.addr_length);
+ throw_system_error_on(r == -1, "getsockname");
+ return addr;
+ }
+ void listen(int backlog) {
+ auto fd = ::listen(_fd, backlog);
+ throw_system_error_on(fd == -1, "listen");
+ }
+ std::optional<size_t> write(const void* buf, size_t len) {
+ auto r = ::write(_fd, buf, len);
+ if (r == -1 && errno == EAGAIN) {
+ return {};
+ }
+ throw_system_error_on(r == -1, "write");
+ return { size_t(r) };
+ }
+ std::optional<size_t> writev(const iovec *iov, int iovcnt) {
+ auto r = ::writev(_fd, iov, iovcnt);
+ if (r == -1 && errno == EAGAIN) {
+ return {};
+ }
+ throw_system_error_on(r == -1, "writev");
+ return { size_t(r) };
+ }
+ size_t pread(void* buf, size_t len, off_t off) {
+ auto r = ::pread(_fd, buf, len, off);
+ throw_system_error_on(r == -1, "pread");
+ return size_t(r);
+ }
+ void timerfd_settime(int flags, const itimerspec& its) {
+ auto r = ::timerfd_settime(_fd, flags, &its, NULL);
+ throw_system_error_on(r == -1, "timerfd_settime");
+ }
+
+ mmap_area map(size_t size, unsigned prot, unsigned flags, size_t offset,
+ void* addr = nullptr) {
+ void *x = mmap(addr, size, prot, flags, _fd, offset);
+ throw_system_error_on(x == MAP_FAILED, "mmap");
+ return mmap_area(static_cast<char*>(x), mmap_deleter{size});
+ }
+
+ mmap_area map_shared_rw(size_t size, size_t offset) {
+ return map(size, PROT_READ | PROT_WRITE, MAP_SHARED, offset);
+ }
+
+ mmap_area map_shared_ro(size_t size, size_t offset) {
+ return map(size, PROT_READ, MAP_SHARED, offset);
+ }
+
+ mmap_area map_private_rw(size_t size, size_t offset) {
+ return map(size, PROT_READ | PROT_WRITE, MAP_PRIVATE, offset);
+ }
+
+ mmap_area map_private_ro(size_t size, size_t offset) {
+ return map(size, PROT_READ, MAP_PRIVATE, offset);
+ }
+
+private:
+ file_desc(int fd) : _fd(fd) {}
+ };
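+
+// Illustrative sketch (assumed usage; the path and payload are arbitrary
+// examples). Any failing syscall throws std::system_error via
+// throw_system_error_on(); the descriptor is closed by the destructor.
+//
+//   auto fd = seastar::file_desc::open("/tmp/example.txt", O_WRONLY | O_CREAT, 0600);
+//   const char msg[] = "hello\n";
+//   fd.write(msg, sizeof(msg) - 1);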
+
+
+namespace posix {
+
+/// Converts a duration value to a `timespec`
+///
+/// \param d a duration value to convert to the POSIX `timespec` format
+/// \return `d` as a `timespec` value
+template <typename Rep, typename Period>
+struct timespec
+to_timespec(std::chrono::duration<Rep, Period> d) {
+ auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(d).count();
+ struct timespec ts {};
+ ts.tv_sec = ns / 1000000000;
+ ts.tv_nsec = ns % 1000000000;
+ return ts;
+}
+
+/// Converts a relative start time and an interval to an `itimerspec`
+///
+/// \param base First expiration of the timer, relative to the current time
+/// \param interval period for re-arming the timer
+/// \return `base` and `interval` converted to an `itimerspec`
+template <typename Rep1, typename Period1, typename Rep2, typename Period2>
+struct itimerspec
+to_relative_itimerspec(std::chrono::duration<Rep1, Period1> base, std::chrono::duration<Rep2, Period2> interval) {
+ struct itimerspec its {};
+ its.it_interval = to_timespec(interval);
+ its.it_value = to_timespec(base);
+ return its;
+}
+
+
+/// Converts a time_point and a duration to an `itimerspec`
+///
+/// \param base base time for the timer; must use the same clock as the timer
+/// \param interval period for re-arming the timer
+/// \return `base` and `interval` converted to an `itimerspec`
+template <typename Clock, class Duration, class Rep, class Period>
+struct itimerspec
+to_absolute_itimerspec(std::chrono::time_point<Clock, Duration> base, std::chrono::duration<Rep, Period> interval) {
+ return to_relative_itimerspec(base.time_since_epoch(), interval);
+}
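+
+// Illustrative sketch (assumed usage; the durations are arbitrary examples):
+// arming a timerfd with a relative first expiration and a periodic interval.
+//
+//   using namespace std::chrono_literals;
+//   auto fd = seastar::file_desc::timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC);
+//   auto its = seastar::posix::to_relative_itimerspec(100ms, 1s);
+//   fd.timerfd_settime(0, its);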
+
+}
+
+class posix_thread {
+public:
+ class attr;
+private:
+ // must allocate, since this class is moveable
+ std::unique_ptr<std::function<void ()>> _func;
+ pthread_t _pthread;
+ bool _valid = true;
+ mmap_area _stack;
+private:
+ static void* start_routine(void* arg) noexcept;
+public:
+ posix_thread(std::function<void ()> func);
+ posix_thread(attr a, std::function<void ()> func);
+ posix_thread(posix_thread&& x);
+ ~posix_thread();
+ void join();
+public:
+ class attr {
+ public:
+ struct stack_size { size_t size = 0; };
+ attr() = default;
+ template <typename... A>
+ attr(A... a) {
+ set(std::forward<A>(a)...);
+ }
+ void set() {}
+ template <typename A, typename... Rest>
+ void set(A a, Rest... rest) {
+ set(std::forward<A>(a));
+ set(std::forward<Rest>(rest)...);
+ }
+ void set(stack_size ss) { _stack_size = ss; }
+ private:
+ stack_size _stack_size;
+ friend class posix_thread;
+ };
+};
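+
+// Illustrative sketch (assumed usage; the stack size is an arbitrary example):
+// starting a thread with a custom stack size and joining it.
+//
+//   seastar::posix_thread t(seastar::posix_thread::attr(seastar::posix_thread::attr::stack_size{1 << 20}),
+//                           [] { /* runs on the new thread */ });
+//   t.join();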
+
+
+inline
+void throw_system_error_on(bool condition, const char* what_arg) {
+ if (condition) {
+ if ((errno == EBADF || errno == ENOTSOCK) && is_abort_on_ebadf_enabled()) {
+ abort();
+ }
+ throw std::system_error(errno, std::system_category(), what_arg);
+ }
+}
+
+template <typename T>
+inline
+void throw_kernel_error(T r) {
+ static_assert(std::is_signed<T>::value, "kernel error variables must be signed");
+ if (r < 0) {
+ auto ec = -r;
+ if ((ec == EBADF || ec == ENOTSOCK) && is_abort_on_ebadf_enabled()) {
+ abort();
+ }
+ throw std::system_error(-r, std::system_category());
+ }
+}
+
+template <typename T>
+inline
+void throw_pthread_error(T r) {
+ if (r != 0) {
+ throw std::system_error(r, std::system_category());
+ }
+}
+
+inline
+sigset_t make_sigset_mask(int signo) {
+ sigset_t set;
+ sigemptyset(&set);
+ sigaddset(&set, signo);
+ return set;
+}
+
+inline
+sigset_t make_full_sigset_mask() {
+ sigset_t set;
+ sigfillset(&set);
+ return set;
+}
+
+inline
+sigset_t make_empty_sigset_mask() {
+ sigset_t set;
+ sigemptyset(&set);
+ return set;
+}
+
+inline
+void pin_this_thread(unsigned cpu_id) {
+ cpu_set_t cs;
+ CPU_ZERO(&cs);
+ CPU_SET(cpu_id, &cs);
+ auto r = pthread_setaffinity_np(pthread_self(), sizeof(cs), &cs);
+ assert(r == 0);
+ (void)r;
+}
+
+/// @}
+
+}
diff --git a/src/seastar/include/seastar/core/preempt.hh b/src/seastar/include/seastar/core/preempt.hh
new file mode 100644
index 000000000..722c7831e
--- /dev/null
+++ b/src/seastar/include/seastar/core/preempt.hh
@@ -0,0 +1,58 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 ScyllaDB.
+ */
+
+#pragma once
+#include <atomic>
+
+namespace seastar {
+
+namespace internal {
+
+struct preemption_monitor {
+ // We preempt when head != tail
+ // This happens to match the Linux aio completion ring, so we can have the
+ // kernel preempt a task by queuing a completion event to an io_context.
+ std::atomic<uint32_t> head;
+ std::atomic<uint32_t> tail;
+};
+
+}
+
+extern __thread const internal::preemption_monitor* g_need_preempt;
+
+inline bool need_preempt() noexcept {
+#ifndef SEASTAR_DEBUG
+ // prevent compiler from eliminating loads in a loop
+ std::atomic_signal_fence(std::memory_order_seq_cst);
+ auto np = g_need_preempt;
+ // We aren't reading anything from the ring, so we don't need
+ // any barriers.
+ auto head = np->head.load(std::memory_order_relaxed);
+ auto tail = np->tail.load(std::memory_order_relaxed);
+ // Possible optimization: read head and tail in a single 64-bit load,
+ // and find a funky way to compare the two 32-bit halves.
+ return __builtin_expect(head != tail, false);
+#else
+ return true;
+#endif
+}
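+
+// Illustrative sketch (assumed usage; work_left() and do_one_unit_of_work() are
+// hypothetical helpers): bounding a long-running loop by the preemption signal,
+// so the remainder can be rescheduled as a new task.
+//
+//   while (work_left()) {
+//       do_one_unit_of_work();
+//       if (seastar::need_preempt()) {
+//           break;   // yield; continue the loop from a later continuation
+//       }
+//   }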
+
+}
diff --git a/src/seastar/include/seastar/core/prefetch.hh b/src/seastar/include/seastar/core/prefetch.hh
new file mode 100644
index 000000000..73f9abd3f
--- /dev/null
+++ b/src/seastar/include/seastar/core/prefetch.hh
@@ -0,0 +1,115 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <atomic>
+#include <boost/mpl/range_c.hpp>
+#include <boost/mpl/for_each.hpp>
+#include <seastar/core/align.hh>
+#include <seastar/core/cacheline.hh>
+
+namespace seastar {
+
+template <size_t N, int RW, int LOC>
+struct prefetcher;
+
+template<int RW, int LOC>
+struct prefetcher<0, RW, LOC> {
+ prefetcher(uintptr_t ptr) {}
+};
+
+template <size_t N, int RW, int LOC>
+struct prefetcher {
+ prefetcher(uintptr_t ptr) {
+ __builtin_prefetch(reinterpret_cast<void*>(ptr), RW, LOC);
+ std::atomic_signal_fence(std::memory_order_seq_cst);
+ prefetcher<N-cache_line_size, RW, LOC>(ptr + cache_line_size);
+ }
+};
+
+// LOC is the locality hint from the __builtin_prefetch() gcc documentation:
+// "The value locality must be a compile-time constant integer between zero and three. A value of
+// zero means that the data has no temporal locality, so it need not be left in the cache after
+// the access. A value of three means that the data has a high degree of temporal locality and
+// should be left in all levels of cache possible. Values of one and two mean, respectively, a
+// low or moderate degree of temporal locality. The default is three."
+template<typename T, int LOC = 3>
+void prefetch(T* ptr) {
+ prefetcher<align_up(sizeof(T), cache_line_size), 0, LOC>(reinterpret_cast<uintptr_t>(ptr));
+}
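+
+// Illustrative sketch (assumed usage; my_node, lookup_next() and consume() are
+// hypothetical): prefetching an object before a latency-sensitive access.
+//
+//   my_node* node = lookup_next();
+//   seastar::prefetch<my_node, 3>(node);   // LOC = 3: keep in all cache levels
+//   // ... other work that hides the memory latency ...
+//   consume(node->value);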
+
+template<typename Iterator, int LOC = 3>
+void prefetch(Iterator begin, Iterator end) {
+ std::for_each(begin, end, [] (auto v) { prefetch<decltype(*v), LOC>(v); });
+}
+
+template<size_t C, typename T, int LOC = 3>
+void prefetch_n(T** pptr) {
+ boost::mpl::for_each< boost::mpl::range_c<size_t,0,C> >( [pptr] (size_t x) { prefetch<T, LOC>(*(pptr + x)); } );
+}
+
+template<size_t L, int LOC = 3>
+void prefetch(void* ptr) {
+ prefetcher<L*cache_line_size, 0, LOC>(reinterpret_cast<uintptr_t>(ptr));
+}
+
+template<size_t L, typename Iterator, int LOC = 3>
+void prefetch_n(Iterator begin, Iterator end) {
+ std::for_each(begin, end, [] (auto v) { prefetch<L, LOC>(v); });
+}
+
+template<size_t L, size_t C, typename T, int LOC = 3>
+void prefetch_n(T** pptr) {
+ boost::mpl::for_each< boost::mpl::range_c<size_t,0,C> >( [pptr] (size_t x) { prefetch<L, LOC>(*(pptr + x)); } );
+}
+
+template<typename T, int LOC = 3>
+void prefetchw(T* ptr) {
+ prefetcher<align_up(sizeof(T), cache_line_size), 1, LOC>(reinterpret_cast<uintptr_t>(ptr));
+}
+
+template<typename Iterator, int LOC = 3>
+void prefetchw_n(Iterator begin, Iterator end) {
+ std::for_each(begin, end, [] (auto v) { prefetchw<decltype(*v), LOC>(v); });
+}
+
+template<size_t C, typename T, int LOC = 3>
+void prefetchw_n(T** pptr) {
+ boost::mpl::for_each< boost::mpl::range_c<size_t,0,C> >( [pptr] (size_t x) { prefetchw<T, LOC>(*(pptr + x)); } );
+}
+
+template<size_t L, int LOC = 3>
+void prefetchw(void* ptr) {
+ prefetcher<L*cache_line_size, 1, LOC>(reinterpret_cast<uintptr_t>(ptr));
+}
+
+template<size_t L, typename Iterator, int LOC = 3>
+void prefetchw_n(Iterator begin, Iterator end) {
+ std::for_each(begin, end, [] (auto v) { prefetchw<L, LOC>(v); });
+}
+
+template<size_t L, size_t C, typename T, int LOC = 3>
+void prefetchw_n(T** pptr) {
+ boost::mpl::for_each< boost::mpl::range_c<size_t,0,C> >( [pptr] (size_t x) { prefetchw<L, LOC>(*(pptr + x)); } );
+}
+
+}
diff --git a/src/seastar/include/seastar/core/print.hh b/src/seastar/include/seastar/core/print.hh
new file mode 100644
index 000000000..72e3934db
--- /dev/null
+++ b/src/seastar/include/seastar/core/print.hh
@@ -0,0 +1,148 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <fmt/ostream.h>
+#include <fmt/printf.h>
+#include <iostream>
+#include <iomanip>
+#include <chrono>
+#include <sstream>
+#include <seastar/core/sstring.hh>
+
+#if 0
+inline
+std::ostream&
+operator<<(std::ostream& os, const void* ptr) {
+ auto flags = os.flags();
+ os << "0x" << std::hex << reinterpret_cast<uintptr_t>(ptr);
+ os.flags(flags);
+ return os;
+}
+#endif
+
+inline
+std::ostream&
+operator<<(std::ostream&& os, const void* ptr) {
+ return os << ptr; // selects non-rvalue version
+}
+
+namespace seastar {
+
+template <typename... A>
+std::ostream&
+fprint(std::ostream& os, const char* fmt, A&&... a) {
+ ::fmt::fprintf(os, fmt, std::forward<A>(a)...);
+ return os;
+}
+
+template <typename... A>
+void
+print(const char* fmt, A&&... a) {
+ ::fmt::printf(fmt, std::forward<A>(a)...);
+}
+
+template <typename... A>
+std::string
+sprint(const char* fmt, A&&... a) {
+ std::ostringstream os;
+ ::fmt::fprintf(os, fmt, std::forward<A>(a)...);
+ return os.str();
+}
+
+template <typename... A>
+std::string
+sprint(const sstring& fmt, A&&... a) {
+ std::ostringstream os;
+ ::fmt::fprintf(os, fmt.c_str(), std::forward<A>(a)...);
+ return os.str();
+}
+
+template <typename Iterator>
+std::string
+format_separated(Iterator b, Iterator e, const char* sep = ", ") {
+ std::string ret;
+ if (b == e) {
+ return ret;
+ }
+ ret += *b++;
+ while (b != e) {
+ ret += sep;
+ ret += *b++;
+ }
+ return ret;
+}
+
+template <typename TimePoint>
+struct usecfmt_wrapper {
+ TimePoint val;
+};
+
+template <typename TimePoint>
+inline
+usecfmt_wrapper<TimePoint>
+usecfmt(TimePoint tp) {
+ return { tp };
+};
+
+template <typename Clock, typename Rep, typename Period>
+std::ostream&
+operator<<(std::ostream& os, usecfmt_wrapper<std::chrono::time_point<Clock, std::chrono::duration<Rep, Period>>> tp) {
+ auto usec = std::chrono::duration_cast<std::chrono::microseconds>(tp.val.time_since_epoch()).count();
+ std::ostream tmp(os.rdbuf());
+ tmp << std::setw(12) << (usec / 1000000) << "." << std::setw(6) << std::setfill('0') << (usec % 1000000);
+ return os;
+}
+
+template <typename... A>
+void
+log(A&&... a) {
+ std::cout << usecfmt(std::chrono::high_resolution_clock::now()) << " ";
+ print(std::forward<A>(a)...);
+}
+
+/**
+ * Evaluate the formatted string in a native fmt library format
+ *
+ * @param fmt format string with the native fmt library syntax
+ * @param a positional parameters
+ *
+ * @return sstring object with the result of applying the given positional
+ * parameters on a given format string.
+ */
+template <typename... A>
+sstring
+format(const char* fmt, A&&... a) {
+ fmt::memory_buffer out;
+ fmt::format_to(out, fmt, std::forward<A>(a)...);
+ return sstring{out.data(), out.size()};
+}
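+
+// Illustrative sketch (assumed usage; the values are arbitrary examples):
+//
+//   seastar::sstring s = seastar::format("{} + {} = {}", 1, 2, 1 + 2);
+//   // s == "1 + 2 = 3"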
+
+// temporary, use fmt::print() instead
+template <typename... A>
+std::ostream&
+fmt_print(std::ostream& os, const char* format, A&&... a) {
+ fmt::print(os, format, std::forward<A>(a)...);
+ return os;
+}
+
+}
diff --git a/src/seastar/include/seastar/core/prometheus.hh b/src/seastar/include/seastar/core/prometheus.hh
new file mode 100644
index 000000000..70c18894f
--- /dev/null
+++ b/src/seastar/include/seastar/core/prometheus.hh
@@ -0,0 +1,51 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/http/httpd.hh>
+#include <seastar/core/metrics.hh>
+#include <seastar/util/std-compat.hh>
+
+namespace seastar {
+
+namespace prometheus {
+
+/*!
+ * Holds prometheus related configuration
+ */
+struct config {
+ sstring metric_help; //!< Default help message for the returned metrics
+ sstring hostname; //!< hostname is deprecated, use label instead
+ std::optional<metrics::label_instance> label; //!< A label that will be added to all metrics; we advise against using it and recommend setting it on the prometheus server instead
+ sstring prefix = "seastar"; //!< a prefix that will be added to metric names
+};
+
+future<> start(httpd::http_server_control& http_server, config ctx);
+
+/// \defgroup add_prometheus_routes adds a /metrics endpoint that returns prometheus metrics
+/// both in txt format and in protobuf according to the prometheus spec
+/// @{
+future<> add_prometheus_routes(distributed<http_server>& server, config ctx);
+future<> add_prometheus_routes(http_server& server, config ctx);
+/// @}
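+
+// Illustrative sketch (assumed usage; `server` is a hypothetical, already
+// configured seastar::httpd::http_server_control instance):
+//
+//   seastar::prometheus::config cfg;
+//   cfg.metric_help = "seastar metrics";
+//   cfg.prefix = "myapp";
+//   return seastar::prometheus::start(server, cfg);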
+}
+}
diff --git a/src/seastar/include/seastar/core/queue.hh b/src/seastar/include/seastar/core/queue.hh
new file mode 100644
index 000000000..b5bf8e049
--- /dev/null
+++ b/src/seastar/include/seastar/core/queue.hh
@@ -0,0 +1,279 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/circular_buffer.hh>
+#include <seastar/core/future.hh>
+#include <queue>
+#include <seastar/util/std-compat.hh>
+
+namespace seastar {
+
+/// Asynchronous single-producer single-consumer queue with limited capacity.
+/// There can be at most one producer-side and at most one consumer-side operation active at any time.
+/// Operations returning a future are considered to be active until the future resolves.
+template <typename T>
+class queue {
+ std::queue<T, circular_buffer<T>> _q;
+ size_t _max;
+ std::optional<promise<>> _not_empty;
+ std::optional<promise<>> _not_full;
+ std::exception_ptr _ex = nullptr;
+private:
+ void notify_not_empty();
+ void notify_not_full();
+public:
+ explicit queue(size_t size);
+
+ /// \brief Push an item.
+ ///
+ /// Returns false if the queue was full and the item was not pushed.
+ bool push(T&& a);
+
+ /// \brief Pop an item.
+ ///
+ /// Popping from an empty queue will result in undefined behavior.
+ T pop();
+
+ /// Consumes items from the queue, passing them to \c func, until \c func
+ /// returns false or the queue is empty.
+ ///
+ /// Returns false if func returned false.
+ template <typename Func>
+ bool consume(Func&& func);
+
+ /// Returns true when the queue is empty.
+ bool empty() const;
+
+ /// Returns true when the queue is full.
+ bool full() const;
+
+ /// Returns a future<> that becomes available when pop() or consume()
+ /// can be called.
+ /// A consumer-side operation. Cannot be called concurrently with other consumer-side operations.
+ future<> not_empty();
+
+ /// Returns a future<> that becomes available when push() can be called.
+ /// A producer-side operation. Cannot be called concurrently with other producer-side operations.
+ future<> not_full();
+
+ /// Pops element now or when there is some. Returns a future that becomes
+ /// available when some element is available.
+ /// If the queue is, or already was, abort()ed, the future resolves with
+ /// the exception provided to abort().
+ /// A consumer-side operation. Cannot be called concurrently with other consumer-side operations.
+ future<T> pop_eventually();
+
+ /// Pushes the element now or when there is room. Returns a future<> which
+ /// resolves when data was pushed.
+ /// If the queue is, or already was, abort()ed, the future resolves with
+ /// the exception provided to abort().
+ /// A producer-side operation. Cannot be called concurrently with other producer-side operations.
+ future<> push_eventually(T&& data);
+
+ /// Returns the number of items currently in the queue.
+ size_t size() const { return _q.size(); }
+
+ /// Returns the size limit imposed on the queue during its construction
+ /// or by a call to set_max_size(). If the queue contains max_size()
+ /// items (or more), further items cannot be pushed until some are popped.
+ size_t max_size() const { return _max; }
+
+ /// Set the maximum size to a new value. If the queue's max size is reduced,
+ /// items already in the queue will not be expunged and the queue will be temporarily
+ /// bigger than its max_size.
+ void set_max_size(size_t max) {
+ _max = max;
+ if (!full()) {
+ notify_not_full();
+ }
+ }
+
+ /// Destroy any items in the queue, and pass the provided exception to any
+ /// waiting readers or writers - or to any later read or write attempts.
+ void abort(std::exception_ptr ex) {
+ while (!_q.empty()) {
+ _q.pop();
+ }
+ _ex = ex;
+ if (_not_full) {
+ _not_full->set_exception(ex);
+ _not_full = std::nullopt;
+ }
+ if (_not_empty) {
+ _not_empty->set_exception(std::move(ex));
+ _not_empty = std::nullopt;
+ }
+ }
+
+ /// \brief Check if there is an active consumer
+ ///
+ /// Returns true if another fiber waits for an item to be pushed into the queue
+ bool has_blocked_consumer() const {
+ return bool(_not_empty);
+ }
+};
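+
+// Illustrative sketch (assumed usage; the capacity and values are arbitrary
+// examples): a single producer and a single consumer sharing a bounded queue.
+//
+//   seastar::queue<int> q(16);
+//
+//   auto producer = q.push_eventually(1).then([&q] {
+//       return q.push_eventually(2);
+//   });
+//
+//   auto consumer = q.pop_eventually().then([&q] (int) {
+//       return q.pop_eventually().discard_result();
+//   });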
+
+template <typename T>
+inline
+queue<T>::queue(size_t size)
+ : _max(size) {
+}
+
+template <typename T>
+inline
+void queue<T>::notify_not_empty() {
+ if (_not_empty) {
+ _not_empty->set_value();
+ _not_empty = std::optional<promise<>>();
+ }
+}
+
+template <typename T>
+inline
+void queue<T>::notify_not_full() {
+ if (_not_full) {
+ _not_full->set_value();
+ _not_full = std::optional<promise<>>();
+ }
+}
+
+template <typename T>
+inline
+bool queue<T>::push(T&& data) {
+ if (_q.size() < _max) {
+ _q.push(std::move(data));
+ notify_not_empty();
+ return true;
+ } else {
+ return false;
+ }
+}
+
+template <typename T>
+inline
+T queue<T>::pop() {
+ if (_q.size() == _max) {
+ notify_not_full();
+ }
+ T data = std::move(_q.front());
+ _q.pop();
+ return data;
+}
+
+template <typename T>
+inline
+future<T> queue<T>::pop_eventually() {
+ if (_ex) {
+ return make_exception_future<T>(_ex);
+ }
+ if (empty()) {
+ return not_empty().then([this] {
+ if (_ex) {
+ return make_exception_future<T>(_ex);
+ } else {
+ return make_ready_future<T>(pop());
+ }
+ });
+ } else {
+ return make_ready_future<T>(pop());
+ }
+}
+
+template <typename T>
+inline
+future<> queue<T>::push_eventually(T&& data) {
+ if (_ex) {
+ return make_exception_future<>(_ex);
+ }
+ if (full()) {
+ return not_full().then([this, data = std::move(data)] () mutable {
+ _q.push(std::move(data));
+ notify_not_empty();
+ });
+ } else {
+ _q.push(std::move(data));
+ notify_not_empty();
+ return make_ready_future<>();
+ }
+}
+
+template <typename T>
+template <typename Func>
+inline
+bool queue<T>::consume(Func&& func) {
+ if (_ex) {
+ std::rethrow_exception(_ex);
+ }
+ bool running = true;
+ while (!_q.empty() && running) {
+ running = func(std::move(_q.front()));
+ _q.pop();
+ }
+ if (!full()) {
+ notify_not_full();
+ }
+ return running;
+}
+
+template <typename T>
+inline
+bool queue<T>::empty() const {
+ return _q.empty();
+}
+
+template <typename T>
+inline
+bool queue<T>::full() const {
+ return _q.size() >= _max;
+}
+
+template <typename T>
+inline
+future<> queue<T>::not_empty() {
+ if (_ex) {
+ return make_exception_future<>(_ex);
+ }
+ if (!empty()) {
+ return make_ready_future<>();
+ } else {
+ _not_empty = promise<>();
+ return _not_empty->get_future();
+ }
+}
+
+template <typename T>
+inline
+future<> queue<T>::not_full() {
+ if (_ex) {
+ return make_exception_future<>(_ex);
+ }
+ if (!full()) {
+ return make_ready_future<>();
+ } else {
+ _not_full = promise<>();
+ return _not_full->get_future();
+ }
+}
+
+}
+
diff --git a/src/seastar/include/seastar/core/ragel.hh b/src/seastar/include/seastar/core/ragel.hh
new file mode 100644
index 000000000..de14035e9
--- /dev/null
+++ b/src/seastar/include/seastar/core/ragel.hh
@@ -0,0 +1,140 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/sstring.hh>
+#include <seastar/core/temporary_buffer.hh>
+#include <seastar/util/eclipse.hh>
+#include <algorithm>
+#include <memory>
+#include <cassert>
+#include <seastar/util/std-compat.hh>
+#include <seastar/core/future.hh>
+
+namespace seastar {
+
+// Support classes for Ragel parsers
+
+// Builds an sstring that can be scattered across multiple packets.
+//
+// Use an sstring_builder::guard variable to designate each scattered
+// char array, and call mark_start() and mark_end() at the start
+// and end points, respectively. sstring_builder will collect data
+// from intervening segments, if needed.
+//
+// After mark_end() has been called, use the get() method to obtain
+// the built string.
+//
+// FIXME: switch to string_view.
+//
+class sstring_builder {
+ sstring _value;
+ const char* _start = nullptr;
+public:
+ class guard;
+public:
+ sstring get() && {
+ return std::move(_value);
+ }
+ void reset() {
+ _value = {};
+ _start = nullptr;
+ }
+ friend class guard;
+};
+
+class sstring_builder::guard {
+ sstring_builder& _builder;
+ const char* _block_end;
+public:
+ guard(sstring_builder& builder, const char* block_start, const char* block_end)
+ : _builder(builder), _block_end(block_end) {
+ if (!_builder._value.empty()) {
+ mark_start(block_start);
+ }
+ }
+ ~guard() {
+ if (_builder._start) {
+ mark_end(_block_end);
+ }
+ }
+ void mark_start(const char* p) {
+ _builder._start = p;
+ }
+ void mark_end(const char* p) {
+ if (_builder._value.empty()) {
+ // avoid an allocation in the common case
+ _builder._value = sstring(_builder._start, p);
+ } else {
+ _builder._value += sstring(_builder._start, p);
+ }
+ _builder._start = nullptr;
+ }
+};
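+
+// Illustrative sketch (assumed usage, loosely following a generated Ragel
+// action; p and pe are the usual Ragel data pointers and _builder is a member
+// of the parser):
+//
+//   sstring_builder::guard g(_builder, p, pe);  // one guard per scattered block
+//   g.mark_start(p);        // in the token's entry action
+//   ...
+//   g.mark_end(p);          // in the token's exit action
+//   sstring value = std::move(_builder).get();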
+
+
+// CRTP
+template <typename ConcreteParser>
+class ragel_parser_base {
+protected:
+ int _fsm_cs;
+ std::unique_ptr<int[]> _fsm_stack = nullptr;
+ int _fsm_stack_size = 0;
+ int _fsm_top;
+ int _fsm_act;
+ char* _fsm_ts;
+ char* _fsm_te;
+ sstring_builder _builder;
+protected:
+ void init_base() {
+ _builder.reset();
+ }
+ void prepush() {
+ if (_fsm_top == _fsm_stack_size) {
+ auto old = _fsm_stack_size;
+ _fsm_stack_size = std::max(_fsm_stack_size * 2, 16);
+ assert(_fsm_stack_size > old);
+ std::unique_ptr<int[]> new_stack{new int[_fsm_stack_size]};
+ std::copy(_fsm_stack.get(), _fsm_stack.get() + _fsm_top, new_stack.get());
+ std::swap(_fsm_stack, new_stack);
+ }
+ }
+ void postpop() {}
+ sstring get_str() {
+ return std::move(_builder).get();
+ }
+public:
+ using unconsumed_remainder = std::optional<temporary_buffer<char>>;
+ future<unconsumed_remainder> operator()(temporary_buffer<char> buf) {
+ char* p = buf.get_write();
+ char* pe = p + buf.size();
+ char* eof = buf.empty() ? pe : nullptr;
+ char* parsed = static_cast<ConcreteParser*>(this)->parse(p, pe, eof);
+ if (parsed) {
+ buf.trim_front(parsed - p);
+ return make_ready_future<unconsumed_remainder>(std::move(buf));
+ }
+ return make_ready_future<unconsumed_remainder>();
+ }
+};
+
+}
diff --git a/src/seastar/include/seastar/core/reactor.hh b/src/seastar/include/seastar/core/reactor.hh
new file mode 100644
index 000000000..224193247
--- /dev/null
+++ b/src/seastar/include/seastar/core/reactor.hh
@@ -0,0 +1,755 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2014 Cloudius Systems
+ */
+
+#pragma once
+
+#include <seastar/core/seastar.hh>
+#include <seastar/core/iostream.hh>
+#include <seastar/core/aligned_buffer.hh>
+#include <seastar/core/cacheline.hh>
+#include <seastar/core/circular_buffer_fixed_capacity.hh>
+#include <seastar/core/idle_cpu_handler.hh>
+#include <memory>
+#include <type_traits>
+#include <sys/epoll.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <unordered_map>
+#include <netinet/ip.h>
+#include <cstring>
+#include <cassert>
+#include <stdexcept>
+#include <unistd.h>
+#include <vector>
+#include <queue>
+#include <algorithm>
+#include <thread>
+#include <system_error>
+#include <chrono>
+#include <ratio>
+#include <atomic>
+#include <stack>
+#include <seastar/util/std-compat.hh>
+#include <boost/next_prior.hpp>
+#include <boost/lockfree/spsc_queue.hpp>
+#include <boost/program_options.hpp>
+#include <boost/thread/barrier.hpp>
+#include <boost/container/static_vector.hpp>
+#include <set>
+#include <seastar/core/reactor_config.hh>
+#include <seastar/core/linux-aio.hh>
+#include <seastar/util/eclipse.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/posix.hh>
+#include <seastar/core/sstring.hh>
+#include <seastar/net/api.hh>
+#include <seastar/core/temporary_buffer.hh>
+#include <seastar/core/circular_buffer.hh>
+#include <seastar/core/file.hh>
+#include <seastar/core/semaphore.hh>
+#include <seastar/core/fair_queue.hh>
+#include <seastar/core/scattered_message.hh>
+#include <seastar/core/enum.hh>
+#include <seastar/core/memory.hh>
+#include <seastar/core/thread_cputime_clock.hh>
+#include <boost/range/irange.hpp>
+#include <seastar/core/timer.hh>
+#include <seastar/core/condition-variable.hh>
+#include <seastar/util/log.hh>
+#include <seastar/core/lowres_clock.hh>
+#include <seastar/core/manual_clock.hh>
+#include <seastar/core/metrics_registration.hh>
+#include <seastar/core/scheduling.hh>
+#include <seastar/core/scheduling_specific.hh>
+#include <seastar/core/smp.hh>
+#include <seastar/core/internal/io_request.hh>
+#include <seastar/core/make_task.hh>
+#include "internal/pollable_fd.hh"
+#include "internal/poll.hh"
+
+#ifdef HAVE_OSV
+#include <osv/sched.hh>
+#include <osv/mutex.h>
+#include <osv/condvar.h>
+#include <osv/newpoll.hh>
+#endif
+
+struct _Unwind_Exception;
+
+namespace seastar {
+
+using shard_id = unsigned;
+
+namespace alien {
+class message_queue;
+}
+class reactor;
+inline
+size_t iovec_len(const std::vector<iovec>& iov)
+{
+ size_t ret = 0;
+ for (auto&& e : iov) {
+ ret += e.iov_len;
+ }
+ return ret;
+}
+
+}
+
+namespace std {
+
+template <>
+struct hash<::sockaddr_in> {
+ size_t operator()(::sockaddr_in a) const {
+ return a.sin_port ^ a.sin_addr.s_addr;
+ }
+};
+
+}
+
+bool operator==(const ::sockaddr_in a, const ::sockaddr_in b);
+
+namespace seastar {
+
+void register_network_stack(sstring name, boost::program_options::options_description opts,
+ noncopyable_function<future<std::unique_ptr<network_stack>>(boost::program_options::variables_map opts)> create,
+ bool make_default = false);
+
+class thread_pool;
+class smp;
+
+class reactor_backend_selector;
+
+class reactor_backend;
+
+namespace internal {
+
+class reactor_stall_sampler;
+class cpu_stall_detector;
+class buffer_allocator;
+
+template <typename Func> // signature: bool ()
+std::unique_ptr<pollfn> make_pollfn(Func&& func);
+
+class poller {
+ std::unique_ptr<pollfn> _pollfn;
+ class registration_task;
+ class deregistration_task;
+ registration_task* _registration_task = nullptr;
+public:
+ template <typename Func> // signature: bool ()
+ static poller simple(Func&& poll) {
+ return poller(make_pollfn(std::forward<Func>(poll)));
+ }
+ poller(std::unique_ptr<pollfn> fn)
+ : _pollfn(std::move(fn)) {
+ do_register();
+ }
+ ~poller();
+ poller(poller&& x) noexcept;
+ poller& operator=(poller&& x) noexcept;
+ void do_register() noexcept;
+ friend class reactor;
+};
+
+}
+
+class kernel_completion;
+class io_queue;
+class disk_config_params;
+
+class io_completion : public kernel_completion {
+public:
+ virtual void complete_with(ssize_t res) final override;
+
+ virtual void complete(size_t res) noexcept = 0;
+ virtual void set_exception(std::exception_ptr eptr) noexcept = 0;
+};
+
+class reactor {
+ using sched_clock = std::chrono::steady_clock;
+private:
+ struct task_queue;
+ using task_queue_list = circular_buffer_fixed_capacity<task_queue*, max_scheduling_groups()>;
+ using pollfn = seastar::pollfn;
+
+ class signal_pollfn;
+ class batch_flush_pollfn;
+ class smp_pollfn;
+ class drain_cross_cpu_freelist_pollfn;
+ class lowres_timer_pollfn;
+ class manual_timer_pollfn;
+ class epoll_pollfn;
+ class reap_kernel_completions_pollfn;
+ class kernel_submit_work_pollfn;
+ class io_queue_submission_pollfn;
+ class syscall_pollfn;
+ class execution_stage_pollfn;
+ friend class manual_clock;
+ friend class file_data_source_impl; // for fstream statistics
+ friend class internal::reactor_stall_sampler;
+ friend class preempt_io_context;
+ friend struct hrtimer_aio_completion;
+ friend struct task_quota_aio_completion;
+ friend class reactor_backend_epoll;
+ friend class reactor_backend_aio;
+ friend class reactor_backend_selector;
+ friend class aio_storage_context;
+public:
+ using poller = internal::poller;
+ using idle_cpu_handler_result = seastar::idle_cpu_handler_result;
+ using work_waiting_on_reactor = seastar::work_waiting_on_reactor;
+ using idle_cpu_handler = seastar::idle_cpu_handler;
+
+ struct io_stats {
+ uint64_t aio_reads = 0;
+ uint64_t aio_read_bytes = 0;
+ uint64_t aio_writes = 0;
+ uint64_t aio_write_bytes = 0;
+ uint64_t aio_errors = 0;
+ uint64_t fstream_reads = 0;
+ uint64_t fstream_read_bytes = 0;
+ uint64_t fstream_reads_blocked = 0;
+ uint64_t fstream_read_bytes_blocked = 0;
+ uint64_t fstream_read_aheads_discarded = 0;
+ uint64_t fstream_read_ahead_discarded_bytes = 0;
+ };
+ friend void io_completion::complete_with(ssize_t);
+
+private:
+ reactor_config _cfg;
+ file_desc _notify_eventfd;
+ file_desc _task_quota_timer;
+#ifdef HAVE_OSV
+ reactor_backend_osv _backend;
+ sched::thread _timer_thread;
+ sched::thread *_engine_thread;
+ mutable mutex _timer_mutex;
+ condvar _timer_cond;
+ s64 _timer_due = 0;
+#else
+ std::unique_ptr<reactor_backend> _backend;
+#endif
+ sigset_t _active_sigmask; // holds sigmask while sleeping with sig disabled
+ std::vector<pollfn*> _pollers;
+
+ static constexpr unsigned max_aio_per_queue = 128;
+ static constexpr unsigned max_queues = 8;
+ static constexpr unsigned max_aio = max_aio_per_queue * max_queues;
+ friend disk_config_params;
+
+ // Not all reactors have IO queues. If the number of IO queues is less than the number of shards,
+ // some reactors will talk to foreign io_queues. If this reactor holds a valid IO queue, it will
+ // be stored here.
+ std::vector<std::unique_ptr<io_queue>> my_io_queues;
+ std::unordered_map<dev_t, io_queue*> _io_queues;
+
+ std::vector<noncopyable_function<future<> ()>> _exit_funcs;
+ unsigned _id = 0;
+ bool _stopping = false;
+ bool _stopped = false;
+ bool _finished_running_tasks = false;
+ condition_variable _stop_requested;
+ bool _handle_sigint = true;
+ std::optional<future<std::unique_ptr<network_stack>>> _network_stack_ready;
+ int _return = 0;
+ promise<> _start_promise;
+ semaphore _cpu_started;
+ internal::preemption_monitor _preemption_monitor{};
+ uint64_t _global_tasks_processed = 0;
+ uint64_t _polls = 0;
+ std::unique_ptr<internal::cpu_stall_detector> _cpu_stall_detector;
+
+ unsigned _max_task_backlog = 1000;
+ timer_set<timer<>, &timer<>::_link> _timers;
+ timer_set<timer<>, &timer<>::_link>::timer_list_t _expired_timers;
+ timer_set<timer<lowres_clock>, &timer<lowres_clock>::_link> _lowres_timers;
+ timer_set<timer<lowres_clock>, &timer<lowres_clock>::_link>::timer_list_t _expired_lowres_timers;
+ timer_set<timer<manual_clock>, &timer<manual_clock>::_link> _manual_timers;
+ timer_set<timer<manual_clock>, &timer<manual_clock>::_link>::timer_list_t _expired_manual_timers;
+ io_stats _io_stats;
+ uint64_t _fsyncs = 0;
+ uint64_t _cxx_exceptions = 0;
+ uint64_t _abandoned_failed_futures = 0;
+ struct task_queue {
+ explicit task_queue(unsigned id, sstring name, float shares);
+ int64_t _vruntime = 0;
+ float _shares;
+ int64_t _reciprocal_shares_times_2_power_32;
+ bool _current = false;
+ bool _active = false;
+ uint8_t _id;
+ sched_clock::time_point _ts; // to help calculating wait/starve-times
+ sched_clock::duration _runtime = {};
+ sched_clock::duration _waittime = {};
+ sched_clock::duration _starvetime = {};
+ uint64_t _tasks_processed = 0;
+ circular_buffer<task*> _q;
+ sstring _name;
+ int64_t to_vruntime(sched_clock::duration runtime) const;
+ void set_shares(float shares) noexcept;
+ struct indirect_compare;
+ sched_clock::duration _time_spent_on_task_quota_violations = {};
+ seastar::metrics::metric_groups _metrics;
+ void rename(sstring new_name);
+ private:
+ void register_stats();
+ };
+
+ circular_buffer<internal::io_request> _pending_io;
+ boost::container::static_vector<std::unique_ptr<task_queue>, max_scheduling_groups()> _task_queues;
+ internal::scheduling_group_specific_thread_local_data _scheduling_group_specific_data;
+ int64_t _last_vruntime = 0;
+ task_queue_list _active_task_queues;
+ task_queue_list _activating_task_queues;
+ task_queue* _at_destroy_tasks;
+ sched_clock::duration _task_quota;
+ task* _current_task = nullptr;
+ /// Handler that will be called when there is no task to execute on the cpu.
+ /// It represents low-priority work.
+ ///
+ /// The handler's return value determines whether it did any actual work. If no work was done, the reactor will go
+ /// to sleep.
+ ///
+ /// The handler's argument is a function that returns true if a task that should be executed on the cpu appears, and false
+ /// otherwise. The handler should use this function to return early when a task appears.
+ idle_cpu_handler _idle_cpu_handler{ [] (work_waiting_on_reactor) {return idle_cpu_handler_result::no_more_work;} };
+ std::unique_ptr<network_stack> _network_stack;
+ // _lowres_clock_impl will only be created on cpu 0
+ std::unique_ptr<lowres_clock_impl> _lowres_clock_impl;
+ lowres_clock::time_point _lowres_next_timeout;
+ std::optional<poller> _epoll_poller;
+ std::optional<pollable_fd> _aio_eventfd;
+ const bool _reuseport;
+ circular_buffer<double> _loads;
+ double _load = 0;
+ sched_clock::duration _total_idle{0};
+ sched_clock::duration _total_sleep;
+ sched_clock::time_point _start_time = sched_clock::now();
+ std::chrono::nanoseconds _max_poll_time = calculate_poll_time();
+ circular_buffer<output_stream<char>* > _flush_batching;
+ std::atomic<bool> _sleeping alignas(seastar::cache_line_size){0};
+ pthread_t _thread_id alignas(seastar::cache_line_size) = pthread_self();
+ bool _strict_o_direct = true;
+ bool _force_io_getevents_syscall = false;
+ bool _bypass_fsync = false;
+ bool _have_aio_fsync = false;
+ std::atomic<bool> _dying{false};
+private:
+ static std::chrono::nanoseconds calculate_poll_time();
+ static void block_notifier(int);
+ void wakeup();
+ size_t handle_aio_error(internal::linux_abi::iocb* iocb, int ec);
+ bool flush_pending_aio();
+ bool reap_kernel_completions();
+ bool flush_tcp_batches();
+ bool do_expire_lowres_timers() noexcept;
+ bool do_check_lowres_timers() const noexcept;
+ void expire_manual_timers() noexcept;
+ void start_aio_eventfd_loop();
+ void stop_aio_eventfd_loop();
+ template <typename T, typename E, typename EnableFunc>
+ void complete_timers(T&, E&, EnableFunc&& enable_fn) noexcept(noexcept(enable_fn()));
+
+ /**
+ * Returns TRUE if all pollers allow blocking.
+ *
+ * @return FALSE if at least one of the blockers requires a non-blocking
+ * execution.
+ */
+ bool poll_once();
+ bool pure_poll_once();
+public:
+ /// Register a user-defined signal handler
+ void handle_signal(int signo, noncopyable_function<void ()>&& handler);
+
+private:
+ class signals {
+ public:
+ signals();
+ ~signals();
+
+ bool poll_signal();
+ bool pure_poll_signal() const;
+ void handle_signal(int signo, noncopyable_function<void ()>&& handler);
+ void handle_signal_once(int signo, noncopyable_function<void ()>&& handler);
+ static void action(int signo, siginfo_t* siginfo, void* ignore);
+ static void failed_to_handle(int signo);
+ private:
+ struct signal_handler {
+ signal_handler(int signo, noncopyable_function<void ()>&& handler);
+ noncopyable_function<void ()> _handler;
+ };
+ std::atomic<uint64_t> _pending_signals;
+ std::unordered_map<int, signal_handler> _signal_handlers;
+
+ friend void reactor::handle_signal(int, noncopyable_function<void ()>&&);
+ };
+
+ signals _signals;
+ std::unique_ptr<thread_pool> _thread_pool;
+ friend class thread_pool;
+ friend class thread_context;
+ friend class internal::cpu_stall_detector;
+
+ uint64_t pending_task_count() const;
+ void run_tasks(task_queue& tq);
+ bool have_more_tasks() const;
+ bool posix_reuseport_detect();
+ void task_quota_timer_thread_fn();
+ void run_some_tasks();
+ void activate(task_queue& tq);
+ void insert_active_task_queue(task_queue* tq);
+ task_queue* pop_active_task_queue(sched_clock::time_point now);
+ void insert_activating_task_queues();
+ void account_runtime(task_queue& tq, sched_clock::duration runtime);
+ void account_idle(sched_clock::duration idletime);
+ void allocate_scheduling_group_specific_data(scheduling_group sg, scheduling_group_key key);
+ future<> init_scheduling_group(scheduling_group sg, sstring name, float shares);
+ future<> init_new_scheduling_group_key(scheduling_group_key key, scheduling_group_key_config cfg);
+ future<> destroy_scheduling_group(scheduling_group sg);
+ uint64_t tasks_processed() const;
+ uint64_t min_vruntime() const;
+ void request_preemption();
+ void start_handling_signal();
+ void reset_preemption_monitor();
+ void service_highres_timer() noexcept;
+
+ future<std::tuple<pollable_fd, socket_address>>
+ do_accept(pollable_fd_state& listen_fd);
+ future<> do_connect(pollable_fd_state& pfd, socket_address& sa);
+
+ future<size_t>
+ do_read_some(pollable_fd_state& fd, void* buffer, size_t size);
+ future<size_t>
+ do_read_some(pollable_fd_state& fd, const std::vector<iovec>& iov);
+ future<temporary_buffer<char>>
+ do_read_some(pollable_fd_state& fd, internal::buffer_allocator* ba);
+
+ future<size_t>
+ do_write_some(pollable_fd_state& fd, const void* buffer, size_t size);
+ future<size_t>
+ do_write_some(pollable_fd_state& fd, net::packet& p);
+public:
+ static boost::program_options::options_description get_options_description(reactor_config cfg);
+ explicit reactor(unsigned id, reactor_backend_selector rbs, reactor_config cfg);
+ reactor(const reactor&) = delete;
+ ~reactor();
+ void operator=(const reactor&) = delete;
+
+ sched_clock::duration uptime() {
+ return sched_clock::now() - _start_time;
+ }
+
+ io_queue& get_io_queue(dev_t devid = 0) {
+ auto queue = _io_queues.find(devid);
+ if (queue == _io_queues.end()) {
+ return *_io_queues[0];
+ } else {
+ return *(queue->second);
+ }
+ }
+
+ io_priority_class register_one_priority_class(sstring name, uint32_t shares);
+
+ /// \brief Updates the current amount of shares for a given priority class
+ ///
+ /// This can involve a cross-shard call if the I/O Queue that is responsible for
+ /// this class lives in a foreign shard.
+ ///
+ /// \param pc the priority class handle
+ /// \param shares the new shares value
+ /// \return a future that is ready when the share update is applied
+ future<> update_shares_for_class(io_priority_class pc, uint32_t shares);
+ static future<> rename_priority_class(io_priority_class pc, sstring new_name) noexcept;
+
+ void configure(boost::program_options::variables_map config);
+
+ server_socket listen(socket_address sa, listen_options opts = {});
+
+ future<connected_socket> connect(socket_address sa);
+ future<connected_socket> connect(socket_address, socket_address, transport proto = transport::TCP);
+
+ pollable_fd posix_listen(socket_address sa, listen_options opts = {});
+
+ bool posix_reuseport_available() const { return _reuseport; }
+
+ pollable_fd make_pollable_fd(socket_address sa, int proto);
+
+ future<> posix_connect(pollable_fd pfd, socket_address sa, socket_address local);
+
+ future<> write_all(pollable_fd_state& fd, const void* buffer, size_t size);
+
+ future<file> open_file_dma(std::string_view name, open_flags flags, file_open_options options = {}) noexcept;
+ future<file> open_directory(std::string_view name) noexcept;
+ future<> make_directory(std::string_view name, file_permissions permissions = file_permissions::default_dir_permissions) noexcept;
+ future<> touch_directory(std::string_view name, file_permissions permissions = file_permissions::default_dir_permissions) noexcept;
+ future<std::optional<directory_entry_type>> file_type(std::string_view name, follow_symlink = follow_symlink::yes) noexcept;
+ future<stat_data> file_stat(std::string_view pathname, follow_symlink) noexcept;
+ future<uint64_t> file_size(std::string_view pathname) noexcept;
+ future<bool> file_accessible(std::string_view pathname, access_flags flags) noexcept;
+ future<bool> file_exists(std::string_view pathname) noexcept {
+ return file_accessible(pathname, access_flags::exists);
+ }
+ future<fs_type> file_system_at(std::string_view pathname) noexcept;
+ future<struct statvfs> statvfs(std::string_view pathname) noexcept;
+ future<> remove_file(std::string_view pathname) noexcept;
+ future<> rename_file(std::string_view old_pathname, std::string_view new_pathname) noexcept;
+ future<> link_file(std::string_view oldpath, std::string_view newpath) noexcept;
+ future<> chmod(std::string_view name, file_permissions permissions) noexcept;
+
+ future<int> inotify_add_watch(int fd, std::string_view path, uint32_t flags);
+
+ // In the following three methods, prepare_io is not guaranteed to execute on the same processor
+ // on which it was generated. Therefore, care must be taken to avoid the use of objects that could
+ // be destroyed within or at exit of prepare_io.
+ void submit_io(io_completion* desc, internal::io_request req) noexcept;
+ future<size_t> submit_io_read(io_queue* ioq,
+ const io_priority_class& priority_class,
+ size_t len,
+ internal::io_request req) noexcept;
+ future<size_t> submit_io_write(io_queue* ioq,
+ const io_priority_class& priority_class,
+ size_t len,
+ internal::io_request req) noexcept;
+
+ int run();
+ void exit(int ret);
+ future<> when_started() { return _start_promise.get_future(); }
+ // The function waits for the timeout period for a reactor stop notification,
+ // which happens on termination signals or on a call to exit().
+ template <typename Rep, typename Period>
+ future<> wait_for_stop(std::chrono::duration<Rep, Period> timeout) {
+ return _stop_requested.wait(timeout, [this] { return _stopping; });
+ }
+
+ void at_exit(noncopyable_function<future<> ()> func);
+
+ template <typename Func>
+ void at_destroy(Func&& func) {
+ _at_destroy_tasks->_q.push_back(make_task(default_scheduling_group(), std::forward<Func>(func)));
+ }
+
+#ifdef SEASTAR_SHUFFLE_TASK_QUEUE
+ void shuffle(task*&, task_queue&);
+#endif
+ task* current_task() const { return _current_task; }
+
+ void add_task(task* t) noexcept {
+ auto sg = t->group();
+ auto* q = _task_queues[sg._id].get();
+ bool was_empty = q->_q.empty();
+ q->_q.push_back(std::move(t));
+#ifdef SEASTAR_SHUFFLE_TASK_QUEUE
+ shuffle(q->_q.back(), *q);
+#endif
+ if (was_empty) {
+ activate(*q);
+ }
+ }
+ void add_urgent_task(task* t) noexcept {
+ memory::scoped_critical_alloc_section _;
+ auto sg = t->group();
+ auto* q = _task_queues[sg._id].get();
+ bool was_empty = q->_q.empty();
+ q->_q.push_front(std::move(t));
+#ifdef SEASTAR_SHUFFLE_TASK_QUEUE
+ shuffle(q->_q.front(), *q);
+#endif
+ if (was_empty) {
+ activate(*q);
+ }
+ }
+
+ /// Set a handler that will be called when there is no task to execute on the cpu.
+ /// The handler should do low-priority work.
+ ///
+ /// The handler's return value determines whether it did any actual work. If no work was done, the reactor will go
+ /// to sleep.
+ ///
+ /// The handler's argument is a function that returns true if a task that should be executed on the cpu appears, and false
+ /// otherwise. The handler should use this function to return early when a task appears.
+ void set_idle_cpu_handler(idle_cpu_handler&& handler) {
+ _idle_cpu_handler = std::move(handler);
+ }
+ void force_poll();
+
+ void add_high_priority_task(task*) noexcept;
+
+ network_stack& net() { return *_network_stack; }
+
+ [[deprecated("Use this_shard_id")]]
+ shard_id cpu_id() const;
+
+ void sleep();
+
+ steady_clock_type::duration total_idle_time();
+ steady_clock_type::duration total_busy_time();
+ std::chrono::nanoseconds total_steal_time();
+
+ const io_stats& get_io_stats() const { return _io_stats; }
+ uint64_t abandoned_failed_futures() const { return _abandoned_failed_futures; }
+#ifdef HAVE_OSV
+ void timer_thread_func();
+ void set_timer(sched::timer &tmr, s64 t);
+#endif
+private:
+ /**
+ * Add a new "poller" - a non-blocking function returning a boolean, that
+ * will be called on every iteration of the main loop.
+ * If it returns FALSE then the reactor's main loop is forbidden to block in the
+ * current iteration.
+ *
+ * @param fn a new "poller" function to register
+ */
+ void register_poller(pollfn* p);
+ void unregister_poller(pollfn* p);
+ void replace_poller(pollfn* old, pollfn* neww);
+ void register_metrics();
+ future<> write_all_part(pollable_fd_state& fd, const void* buffer, size_t size, size_t completed);
+
+ future<> fdatasync(int fd) noexcept;
+
+ void add_timer(timer<steady_clock_type>*) noexcept;
+ bool queue_timer(timer<steady_clock_type>*) noexcept;
+ void del_timer(timer<steady_clock_type>*) noexcept;
+ void add_timer(timer<lowres_clock>*) noexcept;
+ bool queue_timer(timer<lowres_clock>*) noexcept;
+ void del_timer(timer<lowres_clock>*) noexcept;
+ void add_timer(timer<manual_clock>*) noexcept;
+ bool queue_timer(timer<manual_clock>*) noexcept;
+ void del_timer(timer<manual_clock>*) noexcept;
+
+ future<> run_exit_tasks();
+ void stop();
+ friend class alien::message_queue;
+ friend class pollable_fd;
+ friend class pollable_fd_state;
+ friend struct pollable_fd_state_deleter;
+ friend class posix_file_impl;
+ friend class blockdev_file_impl;
+ friend class readable_eventfd;
+ friend class timer<>;
+ friend class timer<lowres_clock>;
+ friend class timer<manual_clock>;
+ friend class smp;
+ friend class smp_message_queue;
+ friend class internal::poller;
+ friend class scheduling_group;
+ friend void add_to_flush_poller(output_stream<char>* os);
+ friend void seastar::log_exception_trace() noexcept;
+ friend void report_failed_future(const std::exception_ptr& eptr) noexcept;
+ friend void with_allow_abandoned_failed_futures(unsigned count, noncopyable_function<void ()> func);
+ metrics::metric_groups _metric_groups;
+ friend future<scheduling_group> create_scheduling_group(sstring name, float shares) noexcept;
+ friend future<> seastar::destroy_scheduling_group(scheduling_group) noexcept;
+ friend future<> seastar::rename_scheduling_group(scheduling_group sg, sstring new_name) noexcept;
+ friend future<scheduling_group_key> scheduling_group_key_create(scheduling_group_key_config cfg) noexcept;
+
+ template<typename T>
+ friend T* internal::scheduling_group_get_specific_ptr(scheduling_group sg, scheduling_group_key key) noexcept;
+ template<typename SpecificValType, typename Mapper, typename Reducer, typename Initial>
+ SEASTAR_CONCEPT( requires requires(SpecificValType specific_val, Mapper mapper, Reducer reducer, Initial initial) {
+ {reducer(initial, mapper(specific_val))} -> std::convertible_to<Initial>;
+ })
+ friend future<typename function_traits<Reducer>::return_type>
+ map_reduce_scheduling_group_specific(Mapper mapper, Reducer reducer, Initial initial_val, scheduling_group_key key);
+ template<typename SpecificValType, typename Reducer, typename Initial>
+ SEASTAR_CONCEPT( requires requires(SpecificValType specific_val, Reducer reducer, Initial initial) {
+ {reducer(initial, specific_val)} -> std::convertible_to<Initial>;
+ })
+ friend future<typename function_traits<Reducer>::return_type>
+ reduce_scheduling_group_specific(Reducer reducer, Initial initial_val, scheduling_group_key key);
+
+ future<struct stat> fstat(int fd) noexcept;
+ future<struct statfs> fstatfs(int fd) noexcept;
+ friend future<shared_ptr<file_impl>> make_file_impl(int fd, file_open_options options, int flags) noexcept;
+public:
+ future<> readable(pollable_fd_state& fd);
+ future<> writeable(pollable_fd_state& fd);
+ future<> readable_or_writeable(pollable_fd_state& fd);
+ void abort_reader(pollable_fd_state& fd);
+ void abort_writer(pollable_fd_state& fd);
+ void enable_timer(steady_clock_type::time_point when) noexcept;
+ /// Sets the "Strict DMA" flag.
+ ///
+ /// When true (default), file I/O operations must use DMA. This is
+ /// the most performant option, but does not work on some file systems
+ /// such as tmpfs or aufs (used in some Docker setups).
+ ///
+ /// When false, file I/O operations can fall back to buffered I/O if
+ /// DMA is not available. This can result in a dramatic reduction in
+ /// performance and an increase in memory consumption.
+ void set_strict_dma(bool value);
+ void set_bypass_fsync(bool value);
+ void update_blocked_reactor_notify_ms(std::chrono::milliseconds ms);
+ std::chrono::milliseconds get_blocked_reactor_notify_ms() const;
+ // For testing:
+ void set_stall_detector_report_function(std::function<void ()> report);
+ std::function<void ()> get_stall_detector_report_function() const;
+};
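For reference, a minimal usage sketch of the idle-CPU handler contract documented above. This is illustrative only and not part of the patch; the handler body is hypothetical, it must be installed from a reactor thread, and the enum values are taken from idle_cpu_handler.hh.

    // Install a low-priority background handler on the local reactor (sketch).
    seastar::engine().set_idle_cpu_handler(
        [] (seastar::work_waiting_on_reactor has_work) {
            for (int i = 0; i < 100; ++i) {
                if (has_work()) {
                    // A regular task appeared; yield back to the reactor.
                    return seastar::idle_cpu_handler_result::interrupted_by_higher_priority_work;
                }
                // ... perform one bounded unit of background work here ...
            }
            return seastar::idle_cpu_handler_result::maybe_more_work;
        });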
+
+template <typename Func> // signature: bool ()
+inline
+std::unique_ptr<seastar::pollfn>
+internal::make_pollfn(Func&& func) {
+ struct the_pollfn : simple_pollfn<false> {
+ the_pollfn(Func&& func) : func(std::forward<Func>(func)) {}
+ Func func;
+ virtual bool poll() override final {
+ return func();
+ }
+ };
+ return std::make_unique<the_pollfn>(std::forward<Func>(func));
+}
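A minimal sketch of the poller interface declared above (illustrative, not part of the patch). `pending` is a hypothetical per-shard work queue; the lambda must return true only when it made progress, so the reactor knows whether it may go to sleep. Note that internal::poller is an internal interface used by the reactor and its backends, not a general application API.

    // Register a simple poller that drains a queue on every reactor loop iteration.
    std::deque<int> pending;
    auto drain_poller = seastar::internal::poller::simple([&pending] {
        bool made_progress = false;
        while (!pending.empty()) {
            pending.pop_front();     // ... handle the item ...
            made_progress = true;
        }
        return made_progress;
    });
    // The poller is deregistered when `drain_poller` is destroyed.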
+
+extern __thread reactor* local_engine;
+extern __thread size_t task_quota;
+
+inline reactor& engine() {
+ return *local_engine;
+}
+
+inline bool engine_is_ready() {
+ return local_engine != nullptr;
+}
+
+inline
+size_t iovec_len(const iovec* begin, size_t len)
+{
+ size_t ret = 0;
+ auto end = begin + len;
+ while (begin != end) {
+ ret += begin++->iov_len;
+ }
+ return ret;
+}
+
+inline int hrtimer_signal() {
+ // We don't want to use SIGALRM, because the boost unit test library
+ // also plays with it.
+ return SIGRTMIN;
+}
+
+
+extern logger seastar_logger;
+
+}
diff --git a/src/seastar/include/seastar/core/reactor_config.hh b/src/seastar/include/seastar/core/reactor_config.hh
new file mode 100644
index 000000000..d7cc5f0a9
--- /dev/null
+++ b/src/seastar/include/seastar/core/reactor_config.hh
@@ -0,0 +1,47 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+
+#pragma once
+
+#include <chrono>
+
+namespace seastar {
+
+/// Configuration structure for reactor
+///
+/// This structure provides configuration items for the reactor. It is typically
+/// provided by \ref app_template, not the user.
+struct reactor_config {
+ std::chrono::duration<double> task_quota{0.5e-3}; ///< default time between polls
+ /// \brief Handle SIGINT/SIGTERM by calling reactor::stop()
+ ///
+ /// When true, Seastar will set up signal handlers for SIGINT/SIGTERM that call
+ /// reactor::stop(). The reactor will then execute callbacks installed by
+ /// reactor::at_exit().
+ ///
+ /// When false, Seastar will not set up signal handlers for SIGINT/SIGTERM
+ /// automatically. The default behavior (terminate the program) will be kept.
+ /// You can adjust the behavior of SIGINT/SIGTERM by installing signal handlers
+ /// via reactor::handle_signal().
+ bool auto_handle_sigint_sigterm = true; ///< automatically terminate on SIGINT/SIGTERM
+};
+
+}
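A brief sketch of how these pieces fit together (illustrative; in practice app_template builds the reactor_config from command-line options). Disabling auto_handle_sigint_sigterm and installing a custom handler via reactor::handle_signal are assumptions about typical usage, not code from the patch; <csignal> and <chrono> are assumed to be included.

    // Configure the reactor with a 1ms task quota and no built-in SIGINT/SIGTERM handling.
    seastar::reactor_config cfg;
    cfg.task_quota = std::chrono::duration<double>(1e-3);
    cfg.auto_handle_sigint_sigterm = false;

    // Later, on a reactor thread, install an explicit handler instead:
    seastar::engine().handle_signal(SIGINT, [] {
        seastar::engine().exit(0);   // request a clean shutdown
    });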
diff --git a/src/seastar/include/seastar/core/report_exception.hh b/src/seastar/include/seastar/core/report_exception.hh
new file mode 100644
index 000000000..7738d9bae
--- /dev/null
+++ b/src/seastar/include/seastar/core/report_exception.hh
@@ -0,0 +1,31 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2014 Cloudius Systems
+ */
+
+#pragma once
+
+#include <seastar/util/std-compat.hh>
+
+namespace seastar {
+
+void report_exception(std::string_view message, std::exception_ptr) noexcept;
+
+}
+
diff --git a/src/seastar/include/seastar/core/resource.hh b/src/seastar/include/seastar/core/resource.hh
new file mode 100644
index 000000000..5120c2aa4
--- /dev/null
+++ b/src/seastar/include/seastar/core/resource.hh
@@ -0,0 +1,94 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <cstdlib>
+#include <string>
+#include <seastar/util/std-compat.hh>
+#include <vector>
+#include <set>
+#include <sched.h>
+#include <boost/any.hpp>
+#include <unordered_map>
+
+namespace seastar {
+
+cpu_set_t cpuid_to_cpuset(unsigned cpuid);
+
+namespace resource {
+
+using std::optional;
+
+using cpuset = std::set<unsigned>;
+
+struct configuration {
+ optional<size_t> total_memory;
+ optional<size_t> reserve_memory; // if total_memory not specified
+ optional<size_t> cpus;
+ optional<cpuset> cpu_set;
+ std::unordered_map<dev_t, unsigned> num_io_queues;
+ bool assign_orphan_cpus = false;
+};
+
+struct memory {
+ size_t bytes;
+ unsigned nodeid;
+
+};
+
+// Since this is static information, we will keep a copy at each CPU.
+// This will allow us to easily find who is the IO coordinator for a given
+// node without a trip to a remote CPU.
+struct io_queue_topology {
+ std::vector<unsigned> shard_to_coordinator;
+ std::vector<unsigned> coordinator_to_idx;
+ std::vector<bool> coordinator_to_idx_valid; // for validity asserts
+ unsigned nr_coordinators;
+};
+
+struct cpu {
+ unsigned cpu_id;
+ std::vector<memory> mem;
+};
+
+struct resources {
+ std::vector<cpu> cpus;
+ std::unordered_map<dev_t, io_queue_topology> ioq_topology;
+};
+
+resources allocate(configuration c);
+unsigned nr_processing_units();
+}
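A small sketch of the resource-allocation API above (illustrative only; this is normally driven by app_template from command-line options such as --smp and --memory):

    // Restrict seastar to 2 cores and 1 GiB of memory and inspect the layout.
    seastar::resource::configuration cfg;
    cfg.cpus = 2;
    cfg.total_memory = size_t(1) << 30;
    seastar::resource::resources res = seastar::resource::allocate(cfg);
    for (const seastar::resource::cpu& c : res.cpus) {
        // c.cpu_id is the logical CPU; c.mem lists its NUMA-local memory chunks.
        (void)c;
    }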
+
+// We need a wrapper class, because boost::program_options wants validate()
+// (below) to be in the same namespace as the type it is validating.
+struct cpuset_bpo_wrapper {
+ resource::cpuset value;
+};
+
+// Overload for boost program options parsing/validation
+extern
+void validate(boost::any& v,
+ const std::vector<std::string>& values,
+ cpuset_bpo_wrapper* target_type, int);
+
+}
diff --git a/src/seastar/include/seastar/core/rwlock.hh b/src/seastar/include/seastar/core/rwlock.hh
new file mode 100644
index 000000000..58b046a2f
--- /dev/null
+++ b/src/seastar/include/seastar/core/rwlock.hh
@@ -0,0 +1,180 @@
+/*
+* This file is open source software, licensed to you under the terms
+* of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+* distributed with this work for additional information regarding copyright
+* ownership. You may not use this file except in compliance with the License.
+*
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied. See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/semaphore.hh>
+
+namespace seastar {
+
+/// \cond internal
+// lock / unlock semantics for rwlock, so it can be used with with_lock()
+template<typename Clock>
+class basic_rwlock;
+
+template<typename Clock = typename timer<>::clock>
+class rwlock_for_read {
+public:
+ future<> lock() {
+ return static_cast<basic_rwlock<Clock>*>(this)->read_lock();
+ }
+ void unlock() {
+ static_cast<basic_rwlock<Clock>*>(this)->read_unlock();
+ }
+ friend class basic_rwlock<Clock>;
+};
+
+template<typename Clock = typename timer<>::clock>
+class rwlock_for_write {
+public:
+ future<> lock() {
+ return static_cast<basic_rwlock<Clock>*>(this)->write_lock();
+ }
+ void unlock() {
+ static_cast<basic_rwlock<Clock>*>(this)->write_unlock();
+ }
+ friend class basic_rwlock<Clock>;
+};
+/// \endcond
+
+
+/// \addtogroup fiber-module
+/// @{
+
+/// Implements a read-write lock mechanism. Beware: this is not a cross-CPU
+/// lock, due to seastar's sharded architecture.
+/// Instead, it can be used to achieve rwlock semantics between two (or more)
+/// fibers running in the same CPU that may use the same resource.
+/// Acquiring the write lock effectively prevents all readers from executing
+/// until the write part is done.
+template<typename Clock = typename timer<>::clock>
+class basic_rwlock : private rwlock_for_read<Clock>, rwlock_for_write<Clock> {
+ using semaphore_type = basic_semaphore<semaphore_default_exception_factory, Clock>;
+
+ static constexpr size_t max_ops = semaphore_type::max_counter();
+
+ semaphore_type _sem;
+public:
+ basic_rwlock()
+ : _sem(max_ops) {
+ }
+
+ /// Cast this rwlock into a read lock object with lock semantics appropriate to be used
+ /// by "with_lock". The resulting object will have lock / unlock calls that, when called,
+ /// will acquire / release the lock in read mode.
+ rwlock_for_read<Clock>& for_read() {
+ return *this;
+ }
+
+ /// Cast this rwlock into a write lock object with lock semantics appropriate to be used
+ /// by "with_lock". The resulting object will have lock / unlock calls that, when called,
+ /// will acquire / release the lock in write mode.
+ rwlock_for_write<Clock>& for_write() {
+ return *this;
+ }
+
+ /// Acquires this lock in read mode. Many readers are allowed, but when
+ /// this future returns, and until \ref read_unlock is called, all fibers
+ /// waiting on \ref write_lock are guaranteed not to execute.
+ future<> read_lock(typename semaphore_type::time_point timeout = semaphore_type::time_point::max()) {
+ return _sem.wait(timeout);
+ }
+
+ /// Releases the lock, which must have been taken in read mode. After this
+ /// is called, one of the fibers waiting on \ref write_lock will be allowed
+ /// to proceed.
+ void read_unlock() {
+ assert(_sem.current() < max_ops);
+ _sem.signal();
+ }
+
+ /// Acquires this lock in write mode. Only one writer is allowed. When
+ /// this future returns, and until \ref write_unlock is called, all other
+ /// fibers waiting on either \ref read_lock or \ref write_lock are guaranteed
+ /// not to execute.
+ future<> write_lock(typename semaphore_type::time_point timeout = semaphore_type::time_point::max()) {
+ return _sem.wait(timeout, max_ops);
+ }
+
+ /// Releases the lock, which must have been taken in write mode. After this
+ /// is called, one of the other fibers waiting on \ref write_lock or the fibers
+ /// waiting on \ref read_lock will be allowed to proceed.
+ void write_unlock() {
+ assert(_sem.current() == 0);
+ _sem.signal(max_ops);
+ }
+
+ /// Tries to acquire the lock in read mode iff this can be done without waiting.
+ bool try_read_lock() {
+ return _sem.try_wait();
+ }
+
+ /// Tries to acquire the lock in write mode iff this can be done without waiting.
+ bool try_write_lock() {
+ return _sem.try_wait(max_ops);
+ }
+
+ using holder = semaphore_units<semaphore_default_exception_factory, Clock>;
+
+ /// hold_read_lock() waits for a read lock and returns an object which,
+ /// when destroyed, releases the lock. This makes it easy to ensure that
+ /// the lock is eventually undone, at any circumstance (even including
+ /// exceptions). The release() method can be used on the returned object
+ /// to release its ownership of the lock and avoid the automatic unlock.
+ /// Note that both hold_read_lock() and hold_write_lock() return an object
+ /// of the same type, rwlock::holder.
+ ///
+ /// hold_read_lock() may throw an exception (or, in other implementations,
+ /// return an exceptional future) when it failed to obtain the lock -
+ /// e.g., on allocation failure.
+ future<holder> hold_read_lock(typename semaphore_type::time_point timeout = semaphore_type::time_point::max()) {
+ return get_units(_sem, 1, timeout);
+ }
+
+ /// hold_write_lock() waits for a write lock and returns an object which,
+ /// when destroyed, releases the lock. This makes it easy to ensure that
+ /// the lock is eventually undone, at any circumstance (even including
+ /// exceptions). The release() method can be used on the returned object
+ /// to release its ownership of the lock and avoid the automatic unlock.
+ /// Note that both hold_read_lock() and hold_write_lock() return an object
+ /// of the same type, rwlock::holder.
+ ///
+ /// hold_write_lock() may throw an exception (or, in other implementations,
+ /// return an exceptional future) when it failed to obtain the lock -
+ /// e.g., on allocation failure.
+ future<holder> hold_write_lock(typename semaphore_type::time_point timeout = semaphore_type::time_point::max()) {
+ return get_units(_sem, max_ops, timeout);
+ }
+
+ /// Checks if any read or write locks are currently held.
+ bool locked() const {
+ return _sem.available_units() != max_ops;
+ }
+
+ friend class rwlock_for_read<Clock>;
+ friend class rwlock_for_write<Clock>;
+};
+
+using rwlock = basic_rwlock<>;
+
+/// @}
+
+}
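A minimal sketch of the rwlock in use (illustrative, not part of the patch). The `table` type and its methods are hypothetical; the lock only coordinates fibers on the same shard, as the documentation above notes.

    // Guard a per-shard container with the rwlock declared above.
    struct table {
        seastar::rwlock _lock;
        std::vector<int> _rows;

        seastar::future<int> lookup(size_t i) {
            return _lock.hold_read_lock().then([this, i] (seastar::rwlock::holder h) {
                // Any number of readers may run here concurrently.
                return _rows.at(i);   // lock released when `h` goes out of scope
            });
        }

        seastar::future<> insert(int v) {
            return _lock.hold_write_lock().then([this, v] (seastar::rwlock::holder h) {
                _rows.push_back(v);   // exclusive access while `h` is alive
            });
        }
    };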
diff --git a/src/seastar/include/seastar/core/scattered_message.hh b/src/seastar/include/seastar/core/scattered_message.hh
new file mode 100644
index 000000000..83655249f
--- /dev/null
+++ b/src/seastar/include/seastar/core/scattered_message.hh
@@ -0,0 +1,112 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/deleter.hh>
+#include <seastar/core/temporary_buffer.hh>
+#include <seastar/net/packet.hh>
+#include <seastar/core/sstring.hh>
+#include <memory>
+#include <vector>
+#include <seastar/util/std-compat.hh>
+
+namespace seastar {
+
+template <typename CharType>
+class scattered_message {
+private:
+ using fragment = net::fragment;
+ using packet = net::packet;
+ using char_type = CharType;
+ packet _p;
+public:
+ scattered_message() {}
+ scattered_message(scattered_message&&) = default;
+ scattered_message(const scattered_message&) = delete;
+
+ void append_static(const char_type* buf, size_t size) {
+ if (size) {
+ _p = packet(std::move(_p), fragment{(char_type*)buf, size}, deleter());
+ }
+ }
+
+ template <size_t N>
+ void append_static(const char_type(&s)[N]) {
+ append_static(s, N - 1);
+ }
+
+ void append_static(const char_type* s) {
+ append_static(s, strlen(s));
+ }
+
+ template <typename size_type, size_type max_size>
+ void append_static(const basic_sstring<char_type, size_type, max_size>& s) {
+ append_static(s.begin(), s.size());
+ }
+
+ void append_static(const std::string_view& s) {
+ append_static(s.data(), s.size());
+ }
+
+ void append(std::string_view v) {
+ if (v.size()) {
+ _p = packet(std::move(_p), temporary_buffer<char>::copy_of(v));
+ }
+ }
+
+ template <typename size_type, size_type max_size>
+ void append(basic_sstring<char_type, size_type, max_size> s) {
+ if (s.size()) {
+ _p = packet(std::move(_p), std::move(s).release());
+ }
+ }
+
+ template <typename size_type, size_type max_size, typename Callback>
+ void append(const basic_sstring<char_type, size_type, max_size>& s, Callback callback) {
+ if (s.size()) {
+ _p = packet(std::move(_p), fragment{s.begin(), s.size()}, make_deleter(std::move(callback)));
+ }
+ }
+
+ void reserve(int n_frags) {
+ _p.reserve(n_frags);
+ }
+
+ packet release() && {
+ return std::move(_p);
+ }
+
+ template <typename Callback>
+ void on_delete(Callback callback) {
+ _p = packet(std::move(_p), make_deleter(std::move(callback)));
+ }
+
+ operator bool() const {
+ return _p.len();
+ }
+
+ size_t size() {
+ return _p.len();
+ }
+};
+
+}
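A short usage sketch for scattered_message (illustrative only): composing a message from a static literal and a dynamically built sstring without copying the static part, then releasing the underlying packet for transmission.

    // Build a message from a static header and an owned body.
    seastar::scattered_message<char> msg;
    msg.append_static("HTTP/1.1 200 OK\r\n\r\n");   // zero-copy; literal has static storage
    msg.append(seastar::sstring("hello world"));     // ownership moves into the packet
    seastar::net::packet p = std::move(msg).release();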
diff --git a/src/seastar/include/seastar/core/scheduling.hh b/src/seastar/include/seastar/core/scheduling.hh
new file mode 100644
index 000000000..fb4c59e5a
--- /dev/null
+++ b/src/seastar/include/seastar/core/scheduling.hh
@@ -0,0 +1,366 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 Scylla DB Ltd
+ */
+
+#pragma once
+
+#include <typeindex>
+#include <seastar/core/sstring.hh>
+#include <seastar/core/function_traits.hh>
+#include <seastar/util/concepts.hh>
+
+/// \file
+
+namespace seastar {
+
+constexpr unsigned max_scheduling_groups() { return 16; }
+
+#if SEASTAR_API_LEVEL < 6
+#define SEASTAR_ELLIPSIS ...
+template <typename SEASTAR_ELLIPSIS T>
+#else
+#define SEASTAR_ELLIPSIS
+template <typename T = void>
+#endif
+class future;
+
+class reactor;
+
+class scheduling_group;
+class scheduling_group_key;
+
+namespace internal {
+
+// Returns an index between 0 and max_scheduling_groups()
+unsigned scheduling_group_index(scheduling_group sg) noexcept;
+scheduling_group scheduling_group_from_index(unsigned index) noexcept;
+
+unsigned long scheduling_group_key_id(scheduling_group_key) noexcept;
+
+template<typename T>
+T* scheduling_group_get_specific_ptr(scheduling_group sg, scheduling_group_key key) noexcept;
+
+}
+
+
+/// Creates a scheduling group with a specified number of shares.
+///
+/// The operation is global and affects all shards. The returned scheduling
+/// group can then be used in any shard.
+///
+/// \param name A name that identifies the group; will be used as a label
+/// in the group's metrics
+/// \param shares number of shares of the CPU time allotted to the group;
+/// Use numbers in the 1-1000 range (but can go above).
+/// \return a scheduling group that can be used on any shard
+future<scheduling_group> create_scheduling_group(sstring name, float shares) noexcept;
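A brief sketch of creating and using a scheduling group (illustrative, not part of the patch). The function name is hypothetical, and with_scheduling_group is assumed to be the Seastar helper declared elsewhere for running a callable under a given group.

    // Create a 200-share group for background work and run a task under it.
    seastar::future<> start_background_work() {
        return seastar::create_scheduling_group("background", 200).then(
            [] (seastar::scheduling_group sg) {
                return seastar::with_scheduling_group(sg, [] {
                    // work here is accounted to the "background" group
                    return seastar::make_ready_future<>();
                });
            });
    }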
+
+/// Destroys a scheduling group.
+///
+/// Destroys a \ref scheduling_group previously created with create_scheduling_group().
+/// The destroyed group must not be currently in use and must not be used later.
+///
+/// The operation is global and affects all shards.
+///
+/// \param sg The scheduling group to be destroyed
+/// \return a future that is ready when the scheduling group has been torn down
+future<> destroy_scheduling_group(scheduling_group sg) noexcept;
+
+/// Rename scheduling group.
+///
+/// Renames a \ref scheduling_group previously created with create_scheduling_group().
+///
+/// The operation is global and affects all shards.
+/// The operation affects the exported statistics labels.
+///
+/// \param sg The scheduling group to be renamed
+/// \param new_name The new name for the scheduling group.
+/// \return a future that is ready when the scheduling group has been renamed
+future<> rename_scheduling_group(scheduling_group sg, sstring new_name) noexcept;
+
+
+/**
+ * Represents a configuration for a specific scheduling group value,
+ * it contains all that is needed to maintain a scheduling group specific
+ * value when it needs to be created, due to, for example, a new
+ * \ref scheduling_group being created.
+ *
+ * @note it is recommended to use @ref make_scheduling_group_key_config in order to
+ * create and configure this structure. The only reason one might want not to use
+ * this method is a need for specific intervention in the construction or
+ * destruction of the value. Even then, it is recommended to first create the configuration
+ * with @ref make_scheduling_group_key_config and only then change it.
+ *
+ */
+struct scheduling_group_key_config {
+ /**
+ * Constructs a default configuration
+ */
+ scheduling_group_key_config() :
+ scheduling_group_key_config(typeid(void)) {}
+ /**
+ * Creates a configuration that is made for a specific type.
+ * It contains neither the right alignment and allocation sizes
+ * nor the correct construction or destruction logic, but only
+ * the indication of the intended type, which is used in debug mode
+ * to make sure that the correct type is referred to when accessing
+ * the value.
+ * @param type_info - the type information class (create with typeid(T)).
+ */
+ scheduling_group_key_config(const std::type_info& type_info) :
+ type_index(type_info) {}
+ /// The allocation size for the value (usually: sizeof(T))
+ size_t allocation_size;
+ /// The required alignment of the value (usually: alignof(T))
+ size_t alignment;
+ /// Holds the type information for debug mode runtime validation
+ std::type_index type_index;
+ /// A function that will be called for each newly allocated value
+ std::function<void (void*)> constructor;
+ /// A function that will be called for each element that is about
+ /// to be deallocated.
+ std::function<void (void*)> destructor;
+
+};
+
+
+/**
+ * A class that is intended to encapsulate the scheduling group specific
+ * key and "hide" its implementation concerns and details.
+ *
+ * @note this object can be copied across shards and scheduling groups.
+ */
+class scheduling_group_key {
+public:
+ /// The only user allowed operation on a key is copying.
+ scheduling_group_key(const scheduling_group_key&) noexcept = default;
+ scheduling_group_key(scheduling_group_key&&) noexcept = default;
+private:
+ scheduling_group_key(unsigned long id) noexcept :
+ _id(id) {}
+ unsigned long _id;
+ unsigned long id() const noexcept {
+ return _id;
+ }
+ friend class reactor;
+ friend future<scheduling_group_key> scheduling_group_key_create(scheduling_group_key_config cfg) noexcept;
+ template<typename T>
+ friend T* internal::scheduling_group_get_specific_ptr(scheduling_group sg, scheduling_group_key key) noexcept;
+ template<typename T>
+ friend T& scheduling_group_get_specific(scheduling_group_key key) noexcept;
+
+ friend unsigned long internal::scheduling_group_key_id(scheduling_group_key key) noexcept;
+};
+
+namespace internal {
+
+inline unsigned long scheduling_group_key_id(scheduling_group_key key) noexcept {
+ return key.id();
+}
+
+/**
+ * @brief A function in the spirit of C++17 std::apply, but specifically for constructors.
+ * This function is used in order to preserve support in C++14.
+ *
+ * @tparam ConstructorType - the constructor type, or in other words the type to be constructed
+ * @tparam Tuple - the constructor parameters tuple type (should be deduced)
+ * @tparam size_t...Idx - a sequence of indexes used to access the tuple's members at compile time.
+ * (should be deduced)
+ *
+ * @param pre_alocated_mem - a pointer to the pre-allocated memory chunk that will hold
+ * the initialized object.
+ * @param args - A tuple that holds the parameters for the constructor
+ * @param idx_seq - An index sequence that will be used to access the members of the tuple at compile
+ * time.
+ *
+ * @note this function is not intended to be called by users; it is only a utility function
+ * for supporting \ref make_scheduling_group_key_config
+ */
+template<typename ConstructorType, typename Tuple, size_t...Idx>
+void apply_constructor(void* pre_alocated_mem, Tuple args, std::index_sequence<Idx...> idx_seq) {
+ new (pre_alocated_mem) ConstructorType(std::get<Idx>(args)...);
+}
+}
+
+/**
+ * A template function that builds a scheduling group specific value configuration.
+ * This configuration is used by the infrastructure to allocate memory for the values
+ * and initialize or deinitialize them when they are created or destroyed.
+ *
+ * @tparam T - the type for the newly created value.
+ * @tparam ...ConstructorArgs - the types for the constructor parameters (should be deduced)
+ * @param args - The parameters for the constructor.
+ * @return a fully initialized \ref scheduling_group_key_config object.
+ */
+template <typename T, typename... ConstructorArgs>
+scheduling_group_key_config
+make_scheduling_group_key_config(ConstructorArgs... args) {
+ scheduling_group_key_config sgkc(typeid(T));
+ sgkc.allocation_size = sizeof(T);
+ sgkc.alignment = alignof(T);
+ sgkc.constructor = [args = std::make_tuple(args...)] (void* p) {
+ internal::apply_constructor<T>(p, args, std::make_index_sequence<sizeof...(ConstructorArgs)>());
+ };
+ sgkc.destructor = [] (void* p) {
+ static_cast<T*>(p)->~T();
+ };
+ return sgkc;
+}
+
+/**
+ * Returns a future that holds a scheduling key and resolves when this key can be used
+ * to access the scheduling group specific value it represents.
+ * @param cfg - A \ref scheduling_group_key_config object (by recommendation: initialized with
+ * \ref make_scheduling_group_key_config )
+ * @return A future containing \ref scheduling_group_key for the newly created specific value.
+ */
+future<scheduling_group_key> scheduling_group_key_create(scheduling_group_key_config cfg) noexcept;
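A minimal sketch tying the pieces above together (illustrative only): the `counter` struct and the function name are hypothetical, and the key is used to bump a per-scheduling-group value from within a running task.

    // A per-scheduling-group counter: each group gets its own `counter` instance.
    struct counter { uint64_t value = 0; };

    seastar::future<> count_something() {
        auto cfg = seastar::make_scheduling_group_key_config<counter>();
        return seastar::scheduling_group_key_create(cfg).then(
            [] (seastar::scheduling_group_key key) {
                // Bump the counter belonging to the currently running group.
                seastar::scheduling_group_get_specific<counter>(key).value++;
            });
    }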
+
+/**
+ * Returns a reference to the given scheduling group specific value
+ * @tparam T - the type of the scheduling group specific value (cannot be deduced)
+ * @param sg - the scheduling group whose specific value to retrieve
+ * @param key - the key of the value to retrieve.
+ * @return A reference to the scheduling specific value.
+ */
+template<typename T>
+T& scheduling_group_get_specific(scheduling_group sg, scheduling_group_key key);
+
+
+/// \brief Identifies function calls that are accounted as a group
+///
+/// A `scheduling_group` is a tag that can be used to mark a function call.
+/// Executions of such tagged calls are accounted as a group.
+class scheduling_group {
+ unsigned _id;
+private:
+ explicit scheduling_group(unsigned id) noexcept : _id(id) {}
+public:
+ /// Creates a `scheduling_group` object denoting the default group
+ constexpr scheduling_group() noexcept : _id(0) {} // must be constexpr for current_scheduling_group_holder
+ bool active() const noexcept;
+ const sstring& name() const noexcept;
+ bool operator==(scheduling_group x) const noexcept { return _id == x._id; }
+ bool operator!=(scheduling_group x) const noexcept { return _id != x._id; }
+ bool is_main() const noexcept { return _id == 0; }
+ template<typename T>
+ /**
+ * Returns a reference to this scheduling group specific value
+ * @tparam T - the type of the scheduling group specific value (cannot be deduced)
+ * @param key - the key of the value to retrieve.
+ * @return A reference to this scheduling group specific value.
+ */
+ T& get_specific(scheduling_group_key key) noexcept {
+ return *internal::scheduling_group_get_specific_ptr<T>(*this, key);
+ }
+ /// Adjusts the number of shares allotted to the group.
+ ///
+ /// Dynamically adjust the number of shares allotted to the group, increasing or
+ /// decreasing the amount of CPU bandwidth it gets. The adjustment is local to
+ /// the shard.
+ ///
+ /// This can be used to reduce a background job's interference with a foreground
+ /// load: the shares can be started at a low value, increased when the background
+ /// job's backlog increases, and reduced again when the backlog decreases.
+ ///
+ /// \param shares number of shares allotted to the group. Use numbers
+ /// in the 1-1000 range.
+ void set_shares(float shares) noexcept;
+ friend future<scheduling_group> create_scheduling_group(sstring name, float shares) noexcept;
+ friend future<> destroy_scheduling_group(scheduling_group sg) noexcept;
+ friend future<> rename_scheduling_group(scheduling_group sg, sstring new_name) noexcept;
+ friend class reactor;
+ friend unsigned internal::scheduling_group_index(scheduling_group sg) noexcept;
+ friend scheduling_group internal::scheduling_group_from_index(unsigned index) noexcept;
+
+ template<typename SpecificValType, typename Mapper, typename Reducer, typename Initial>
+ SEASTAR_CONCEPT( requires requires(SpecificValType specific_val, Mapper mapper, Reducer reducer, Initial initial) {
+ {reducer(initial, mapper(specific_val))} -> std::convertible_to<Initial>;
+ })
+ friend future<typename function_traits<Reducer>::return_type>
+ map_reduce_scheduling_group_specific(Mapper mapper, Reducer reducer, Initial initial_val, scheduling_group_key key);
+
+ template<typename SpecificValType, typename Reducer, typename Initial>
+ SEASTAR_CONCEPT( requires requires(SpecificValType specific_val, Reducer reducer, Initial initial) {
+ {reducer(initial, specific_val)} -> std::convertible_to<Initial>;
+ })
+ friend future<typename function_traits<Reducer>::return_type>
+ reduce_scheduling_group_specific(Reducer reducer, Initial initial_val, scheduling_group_key key);
+
+
+};
+
+/// \cond internal
+namespace internal {
+
+inline
+unsigned
+scheduling_group_index(scheduling_group sg) noexcept {
+ return sg._id;
+}
+
+inline
+scheduling_group
+scheduling_group_from_index(unsigned index) noexcept {
+ return scheduling_group(index);
+}
+
+inline
+scheduling_group*
+current_scheduling_group_ptr() noexcept {
+ // Slow unless constructor is constexpr
+ static thread_local scheduling_group sg;
+ return &sg;
+}
+
+}
+/// \endcond
+
+/// Returns the current scheduling group
+inline
+scheduling_group
+current_scheduling_group() noexcept {
+ return *internal::current_scheduling_group_ptr();
+}
+
+inline
+scheduling_group
+default_scheduling_group() noexcept {
+ return scheduling_group();
+}
+
+inline
+bool
+scheduling_group::active() const noexcept {
+ return *this == current_scheduling_group();
+}
+
+}
+
+namespace std {
+
+template <>
+struct hash<seastar::scheduling_group> {
+ size_t operator()(seastar::scheduling_group sg) const noexcept {
+ return seastar::internal::scheduling_group_index(sg);
+ }
+};
+
+}
diff --git a/src/seastar/include/seastar/core/scheduling_specific.hh b/src/seastar/include/seastar/core/scheduling_specific.hh
new file mode 100644
index 000000000..d2e3158f5
--- /dev/null
+++ b/src/seastar/include/seastar/core/scheduling_specific.hh
@@ -0,0 +1,189 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2019 Scylla DB Ltd
+ */
+
+#include <boost/range/adaptor/filtered.hpp>
+#include <seastar/core/scheduling.hh>
+#include <seastar/core/map_reduce.hh>
+#include <array>
+#include <vector>
+
+#pragma once
+
+namespace seastar {
+
+namespace internal {
+
+struct scheduling_group_specific_thread_local_data {
+ struct per_scheduling_group {
+ bool queue_is_initialized = false;
+ /**
+ * This array holds pointers to the scheduling group specific
+ * data. The pointer is not used as is but is cast to a reference
+ * to the appropriate type that is actually pointed to.
+ */
+ std::vector<void*> specific_vals;
+ };
+ std::array<per_scheduling_group, max_scheduling_groups()> per_scheduling_group_data;
+ std::vector<scheduling_group_key_config> scheduling_group_key_configs;
+};
+
+inline
+scheduling_group_specific_thread_local_data** get_scheduling_group_specific_thread_local_data_ptr() noexcept {
+ static thread_local scheduling_group_specific_thread_local_data* data;
+ return &data;
+}
+inline
+scheduling_group_specific_thread_local_data& get_scheduling_group_specific_thread_local_data() noexcept {
+ return **get_scheduling_group_specific_thread_local_data_ptr();
+}
+
+[[noreturn]] void no_such_scheduling_group(scheduling_group sg);
+
+/**
+ * Returns a pointer to the given scheduling group specific data.
+ * @param sg - The scheduling group whose data needs to be accessed
+ * @param key - The scheduling group key for the data to access
+ * @return A pointer of type T* to the data, if sg is valid and initialized.
+ *
+ * @note The parameter T has to be given since there is no way to deduce it.
+ */
+template<typename T>
+T* scheduling_group_get_specific_ptr(scheduling_group sg, scheduling_group_key key) noexcept {
+ auto& data = internal::get_scheduling_group_specific_thread_local_data();
+#ifdef SEASTAR_DEBUG
+ assert(std::type_index(typeid(T)) == data.scheduling_group_key_configs[key.id()].type_index);
+#endif
+ auto sg_id = internal::scheduling_group_index(sg);
+ if (__builtin_expect(sg_id < data.per_scheduling_group_data.size() &&
+ data.per_scheduling_group_data[sg_id].queue_is_initialized, true)) {
+ return reinterpret_cast<T*>(data.per_scheduling_group_data[sg_id].specific_vals[key.id()]);
+ }
+ return nullptr;
+}
+
+}
+
+/**
+ * Returns a reference to the given scheduling group specific data.
+ * @param sg - The scheduling group whose data needs to be accessed
+ * @param key - The scheduling group key for the data to access
+ * @return A reference of type T& to the data.
+ *
+ * @note The parameter T has to be given since there is no way to deduce it.
+ * May throw std::invalid_argument if sg does not exist or is uninitialized.
+ */
+template<typename T>
+T& scheduling_group_get_specific(scheduling_group sg, scheduling_group_key key) {
+ T* p = internal::scheduling_group_get_specific_ptr<T>(sg, std::move(key));
+ if (!p) {
+ internal::no_such_scheduling_group(sg);
+ }
+ return *p;
+}
+
+/**
+ * Returns a reference to the current specific data.
+ * @param key - The scheduling group key for the data to access
+ * @return A reference of type T& to the data.
+ *
+ * @note The parameter T has to be given since there is no way to deduce it.
+ */
+template<typename T>
+T& scheduling_group_get_specific(scheduling_group_key key) noexcept {
+ // Unlike internal::scheduling_group_get_specific_ptr, this can
+ // return a reference to an element whose queue_is_initialized is
+ // false.
+ auto& data = internal::get_scheduling_group_specific_thread_local_data();
+ assert(std::type_index(typeid(T)) == data.scheduling_group_key_configs[key.id()].type_index);
+ auto sg_id = internal::scheduling_group_index(current_scheduling_group());
+ return *reinterpret_cast<T*>(data.per_scheduling_group_data[sg_id].specific_vals[key.id()]);
+}
+
+/**
+ * A map reduce over all values of a specific scheduling group data.
+ * @param mapper - A functor SomeType(SpecificValType&) or SomeType(SpecificValType) that maps
+ * the specific data to a value of any type.
+ * @param reducer - A functor of type ConvertibleToInitial(Initial, MapperReturnType) that reduces
+ * a value of type Initial and of the mapper return type to a value of type convertible to Initial.
+ * @param initial_val - the initial value to pass in the first call to the reducer.
+ * @param key - the key to the specific data that the mapper should act upon.
+ * @return A future that resolves when the result of the map reduce is ready.
+ * @note The type of SpecificValType must be given because there is no way to deduce it in a *consistent*
+ * manner.
+ * @note Theoretically the parameter type of Mapper can be deduced to be the type (function_traits<Mapper>::arg<0>)
+ * but then there is a danger when the Mapper accepts a parameter type T where SpecificValType is convertible to
+ * T but is not SpecificValType itself.
+ */
+template<typename SpecificValType, typename Mapper, typename Reducer, typename Initial>
+SEASTAR_CONCEPT( requires requires(SpecificValType specific_val, Mapper mapper, Reducer reducer, Initial initial) {
+ {reducer(initial, mapper(specific_val))} -> std::convertible_to<Initial>;
+})
+future<typename function_traits<Reducer>::return_type>
+map_reduce_scheduling_group_specific(Mapper mapper, Reducer reducer,
+ Initial initial_val, scheduling_group_key key) {
+ using per_scheduling_group = internal::scheduling_group_specific_thread_local_data::per_scheduling_group;
+ auto& data = internal::get_scheduling_group_specific_thread_local_data();
+ auto wrapped_mapper = [key, mapper] (per_scheduling_group& psg) {
+ auto id = internal::scheduling_group_key_id(key);
+ return make_ready_future<typename function_traits<Mapper>::return_type>
+ (mapper(*reinterpret_cast<SpecificValType*>(psg.specific_vals[id])));
+ };
+
+ return map_reduce(
+ data.per_scheduling_group_data
+ | boost::adaptors::filtered(std::mem_fn(&per_scheduling_group::queue_is_initialized)),
+ wrapped_mapper, std::move(initial_val), reducer);
+}
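+
+// Illustrative usage sketch (not part of the original header): summing a per-group uint64_t
+// value registered under a hypothetical key `my_key` across all initialized scheduling groups.
+//
+//   future<uint64_t> total(scheduling_group_key my_key) {
+//       return map_reduce_scheduling_group_specific<uint64_t>(
+//               [] (uint64_t& v) { return v; },                     // mapper
+//               [] (uint64_t acc, uint64_t v) { return acc + v; },  // reducer
+//               uint64_t(0), my_key);
+//   }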
+
+/**
+ * A reduce over all values of a specific scheduling group data.
+ * @param reducer - A functor of type ConvertibleToInitial(Initial, SpecificValType) that reduces
+ * a value of type Initial and of the sg specific data type to a value of type convertible to Initial.
+ * @param initial_val - the initial value to pass in the first call to the reducer.
+ * @param key - the key to the specific data that the mapper should act upon.
+ * @return A future that resolves when the result of the reduce is ready.
+ * @note The type of SpecificValType must be given because there is no way to deduce it in a *consistent*
+ * manner.
+ * @note Theoretically the parameter type of Reducer can be deduced to be the type (function_traits<Reducer>::arg<0>)
+ * but then there is a danger when the Reducer accepts a parameter type T to which SpecificValType is merely
+ * convertible.
+ */
+template<typename SpecificValType, typename Reducer, typename Initial>
+SEASTAR_CONCEPT( requires requires(SpecificValType specific_val, Reducer reducer, Initial initial) {
+ {reducer(initial, specific_val)} -> std::convertible_to<Initial>;
+})
+future<typename function_traits<Reducer>::return_type>
+reduce_scheduling_group_specific(Reducer reducer, Initial initial_val, scheduling_group_key key) {
+ using per_scheduling_group = internal::scheduling_group_specific_thread_local_data::per_scheduling_group;
+ auto& data = internal::get_scheduling_group_specific_thread_local_data();
+
+ auto mapper = [key] (per_scheduling_group& psg) {
+ auto id = internal::scheduling_group_key_id(key);
+ return make_ready_future<SpecificValType>(*reinterpret_cast<SpecificValType*>(psg.specific_vals[id]));
+ };
+
+ return map_reduce(
+ data.per_scheduling_group_data
+ | boost::adaptors::filtered(std::mem_fn(&per_scheduling_group::queue_is_initialized)),
+ mapper, std::move(initial_val), reducer);
+}
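+
+// Illustrative sketch (not part of the original header): the reduce-only variant of the above,
+// again assuming a hypothetical key `my_key` bound to a uint64_t value.
+//
+//   future<uint64_t> total(scheduling_group_key my_key) {
+//       return reduce_scheduling_group_specific<uint64_t>(
+//               [] (uint64_t acc, uint64_t v) { return acc + v; }, uint64_t(0), my_key);
+//   }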
+
+}
diff --git a/src/seastar/include/seastar/core/scollectd.hh b/src/seastar/include/seastar/core/scollectd.hh
new file mode 100644
index 000000000..7909295bb
--- /dev/null
+++ b/src/seastar/include/seastar/core/scollectd.hh
@@ -0,0 +1,848 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <type_traits>
+#include <utility>
+#include <functional>
+#include <array>
+#include <iterator>
+#include <stdint.h>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <chrono>
+#include <boost/program_options.hpp>
+
+#include <seastar/core/future.hh>
+#include <seastar/net/byteorder.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/sstring.hh>
+#include <seastar/util/log.hh>
+
+#include <seastar/core/metrics_api.hh>
+
+namespace seastar {
+
+/**
+ * Implementation of rudimentary collectd data gathering.
+ *
+ * Usage is hopefully straightforward. Though, feel free to read
+ * https://collectd.org/wiki/index.php/Naming_schema
+ * for an explanation on the naming model.
+ *
+ * Typically, you'll add values something like:
+ *
+ * scollectd::type_instance_id typ("<pluginname>", "<instance_name>", "<type_name>", "<type_instance>");
+ * scollectd::add_polled_metric(typ, [<metric_var> | scollectd::make_typed(<data_type>, <metric_var>)] [, ...]);
+ *
+ * Where
+ * `<pluginname>` would be the overall 'module', e.g. "cpu"
+ * `<instance_name>` -> optional distinguisher between plugin instances. For cpu, the built-in
+ * scollectd::per_cpu_plugin_instance constant is a good choice, i.e. 0->N cpu.
+ * If there are no instances (e.g. only one), an empty value is appropriate (none)
+ * `<type_name>` is the 'type' of metric collected, for ex. "usage" (cpu/0/usage)
+ * `<type_instance>` is a distinguisher for metric parts of the type, e.g. "idle", "user", "kernel"
+ * -> cpu/0/usage/idle | cpu/0/usage/user | cpu/0/usage/kernel
+ *
+ * Each type instance can bind an arbitrary number of values, each representing some aspect of the instance in turn.
+ * The structure and interpretation is up to the producer/consumer
+ *
+ * There is a single "scollectd" instance per cpu, and values should be bound locally
+ * to this cpu. Polling is done at a frequency set in the seastar config (default once per second),
+ * and all registered values will be sent via UDP packets to the destination host(s)
+ *
+ * Note that the tuple { plugin, plugin_instance, type, type_instance } is considered a
+ * unique ID for a value registration, so using the same tuple twice will remove the previously
+ * registered values.
+ *
+ * Values can be unregistered at any time, though they must be unregistered on the same thread/cpu
+ * as they were registered. The "registration" anchor type provides RAII-style value unregistration
+ * semantics.
+ *
+ */
+
+namespace scollectd {
+
+extern seastar::logger logger;
+
+using data_type = seastar::metrics::impl::data_type;
+
+enum class known_type {
+ // from types.db. Defined collectd types (type_id) selection.
+ // This enum omits the very application specific types, such
+ // as mysql_* etc, since if you really are re-writing mysql
+ // in seastar, you probably know how to look the type up manually...
+
+ absolute,
+ backends,
+ bitrate,
+ blocked_clients,
+ bytes,
+ cache_eviction,
+ cache_operation,
+ cache_ratio,
+ cache_result,
+ cache_size,
+ capacity,
+ changes_since_last_save,
+ charge,
+ clock_last_meas,
+ clock_last_update,
+ clock_mode,
+ clock_reachability,
+ clock_skew_ppm,
+ clock_state,
+ clock_stratum,
+ compression,
+ compression_ratio,
+ connections,
+ conntrack,
+ contextswitch,
+ count,
+ counter,
+ cpu,
+ cpufreq,
+ current,
+ current_connections,
+ current_sessions,
+ delay,
+ derive,
+ df,
+ df_complex,
+ df_inodes,
+ disk_io_time,
+ disk_latency,
+ disk_merged,
+ disk_octets,
+ disk_ops,
+ disk_ops_complex,
+ disk_time,
+ dns_answer,
+ dns_notify,
+ dns_octets,
+ dns_opcode,
+ dns_qtype,
+ dns_qtype_cached,
+ dns_query,
+ dns_question,
+ dns_rcode,
+ dns_reject,
+ dns_request,
+ dns_resolver,
+ dns_response,
+ dns_transfer,
+ dns_update,
+ dns_zops,
+ drbd_resource,
+ duration,
+ email_check,
+ email_count,
+ email_size,
+ entropy,
+ evicted_keys,
+ expired_keys,
+ fanspeed,
+ file_handles,
+ file_size,
+ files,
+ flow,
+ fork_rate,
+ frequency,
+ frequency_error,
+ frequency_offset,
+ fscache_stat,
+ gauge,
+ hash_collisions,
+ http_request_methods,
+ http_requests,
+ http_response_codes,
+ humidity,
+ if_collisions,
+ if_dropped,
+ if_errors,
+ if_multicast,
+ if_octets,
+ if_packets,
+ if_rx_errors,
+ if_rx_octets,
+ if_tx_errors,
+ if_tx_octets,
+ invocations,
+ io_octets,
+ io_packets,
+ ipt_bytes,
+ ipt_packets,
+ irq,
+ latency,
+ links,
+ load,
+ md_disks,
+ memory,
+ memory_lua,
+ memory_throttle_count,
+ multimeter,
+ mutex_operations,
+ objects,
+ operations,
+ packets,
+ pending_operations,
+ percent,
+ percent_bytes,
+ percent_inodes,
+ ping,
+ ping_droprate,
+ ping_stddev,
+ players,
+ power,
+ pressure,
+ protocol_counter,
+ pubsub,
+ queue_length,
+ records,
+ requests,
+ response_code,
+ response_time,
+ root_delay,
+ root_dispersion,
+ route_etx,
+ route_metric,
+ routes,
+ segments,
+ serial_octets,
+ signal_noise,
+ signal_power,
+ signal_quality,
+ snr,
+ spl,
+ swap,
+ swap_io,
+ tcp_connections,
+ temperature,
+ threads,
+ time_dispersion,
+ time_offset,
+ time_offset_ntp,
+ time_offset_rms,
+ time_ref,
+ timeleft,
+ total_bytes,
+ total_connections,
+ total_objects,
+ total_operations,
+ total_requests,
+ total_sessions,
+ total_threads,
+ total_time_in_ms,
+ total_values,
+ uptime,
+ users,
+ vcl,
+ vcpu,
+ virt_cpu_total,
+ virt_vcpu,
+ vmpage_action,
+ vmpage_faults,
+ vmpage_io,
+ vmpage_number,
+ volatile_changes,
+ voltage,
+ voltage_threshold,
+ vs_memory,
+ vs_processes,
+ vs_threads,
+};
+
+// don't use directly. use make_typed.
+template<typename T>
+struct typed {
+ typed(data_type t, T && v)
+ : type(t), value(std::forward<T>(v)) {
+ }
+ data_type type;
+ T value;
+};
+
+template<typename T>
+static inline typed<T> make_typed(data_type type, T&& t) {
+ return typed<T>(type, std::forward<T>(t));
+}
+
+using plugin_id = seastar::metrics::group_name_type;
+using plugin_instance_id = seastar::metrics::instance_id_type;
+using type_id = seastar::metrics::metric_type_def;
+using type_instance = seastar::metrics::metric_name_type;
+
+type_id type_id_for(known_type);
+
+using description = seastar::metrics::description;
+
+static constexpr unsigned max_collectd_field_text_len = 63;
+
+class type_instance_id {
+ static thread_local unsigned _next_truncated_idx;
+
+ /// truncate a given field to the maximum allowed length
+ void truncate(sstring& field, const char* field_desc);
+public:
+ type_instance_id() = default;
+ type_instance_id(plugin_id p, plugin_instance_id pi, type_id t,
+ scollectd::type_instance ti = std::string())
+ : _plugin(std::move(p)), _plugin_instance(std::move(pi)), _type(
+ std::move(t)), _type_instance(std::move(ti)) {
+ // truncate strings to the maximum allowed length
+ truncate(_plugin, "plugin");
+ truncate(_plugin_instance, "plugin_instance");
+ truncate(_type, "type");
+ truncate(_type_instance, "type_instance");
+ }
+ type_instance_id(const seastar::metrics::impl::metric_id &id, const type_id& inherit_type) : _plugin(id.group_name()),
+ _plugin_instance(id.instance_id()), _type(inherit_type),
+ _type_instance(id.name()) {
+ }
+ type_instance_id(type_instance_id &&) = default;
+ type_instance_id(const type_instance_id &) = default;
+
+ type_instance_id & operator=(type_instance_id &&) = default;
+ type_instance_id & operator=(const type_instance_id &) = default;
+
+ const plugin_id & plugin() const {
+ return _plugin;
+ }
+ const plugin_instance_id & plugin_instance() const {
+ return _plugin_instance;
+ }
+ const type_id & type() const {
+ return _type;
+ }
+ const scollectd::type_instance & type_instance() const {
+ return _type_instance;
+ }
+ bool operator<(const type_instance_id&) const;
+ bool operator==(const type_instance_id&) const;
+private:
+ plugin_id _plugin;
+ plugin_instance_id _plugin_instance;
+ type_id _type;
+ scollectd::type_instance _type_instance;
+};
+
+extern const plugin_instance_id per_cpu_plugin_instance;
+
+void configure(const boost::program_options::variables_map&);
+boost::program_options::options_description get_options_description();
+void remove_polled_metric(const type_instance_id &);
+
+class plugin_instance_metrics;
+
+/**
+ * Anchor for polled registration.
+ * If the registered value is in some way non-persistent,
+ * use this as receiver of the reg and ensure it dies before the
+ * added value(s).
+ *
+ * Use:
+ * uint64_t v = 0;
+ * registration r = add_polled_metric(v);
+ * ++v;
+ * <scope end, above dies>
+ */
+struct registration {
+ registration() = default;
+ registration(const type_instance_id& id);
+ registration(type_instance_id&& id);
+ registration(const registration&) = delete;
+ registration(registration&&) = default;
+ ~registration();
+ registration & operator=(const registration&) = delete;
+ registration & operator=(registration&&) = default;
+
+ void unregister() {
+ remove_polled_metric(_id);
+ _id = type_instance_id();
+ }
+private:
+ friend class plugin_instance_metrics;
+ type_instance_id _id;
+ shared_ptr<seastar::metrics::impl::impl> _impl;
+};
+
+/**
+ * Helper type to make generating vectors of registration objects
+ * easier, since it constructs from an initializer list of
+ * type_instance_id:s, avoiding early conversion to registration objects,
+ * which, in the case of init lists, would use copy semantics, not move...
+ */
+class registrations
+ : public std::vector<registration>
+{
+public:
+ typedef std::vector<registration> vector_type;
+
+ registrations()
+ {}
+ registrations(vector_type&& v) : vector_type(std::move(v))
+ {}
+ registrations(const std::initializer_list<type_instance_id>& l)
+ : vector_type(l.begin(),l.end())
+ {}
+ registrations& operator=(vector_type&& v) {
+ vector_type::operator=(std::move(v));
+ return *this;
+ }
+ registrations& operator=(const std::initializer_list<type_instance_id>& l) {
+ return registrations::operator=(registrations(l));
+ }
+};
+
+class value_list;
+
+struct typed_value {
+ /**
+ * Wraps N values of a given type (type_id).
+ * Used to group types into a plugin_instance_metrics
+ */
+ template<typename... Args>
+ typed_value(const type_id& tid, const scollectd::type_instance& ti, description, Args&&... args);
+
+ template<typename... Args>
+ typed_value(const type_id& tid, const scollectd::type_instance& ti, Args&&... args)
+ : typed_value(tid, ti, description(), std::forward<Args>(args)...)
+ {}
+
+ const scollectd::type_instance& type_instance() const {
+ return _type_instance;
+ }
+ const shared_ptr<value_list>& values() const {
+ return _values;
+ }
+ const type_id & type() const {
+ return _type_id;
+ }
+private:
+ type_id _type_id;
+ scollectd::type_instance _type_instance;
+ shared_ptr<value_list> _values;
+};
+
+class plugin_instance_metrics {
+public:
+ template<typename... TypedValues>
+ plugin_instance_metrics(const plugin_id& p, const plugin_instance_id& pi, TypedValues&&... values)
+ : _plugin_id(p)
+ , _plugin_instance(pi)
+ , _registrations({ add_impl(values)... })
+ {}
+ std::vector<type_instance_id> bound_ids() const;
+ void add(const typed_value&);
+private:
+ type_instance_id add_impl(const typed_value&);
+
+ plugin_id _plugin_id;
+ plugin_instance_id _plugin_instance;
+ registrations _registrations;
+};
+
+/**
+ * Simplified wrapper for the common case of per-cpu plugin instances
+ * (i.e. distributed objects)
+ */
+class percpu_plugin_instance_metrics : public plugin_instance_metrics {
+public:
+ template<typename... TypedValues>
+ percpu_plugin_instance_metrics(const plugin_id& p, TypedValues&&... values)
+ : plugin_instance_metrics(p, per_cpu_plugin_instance, std::forward<TypedValues>(values)...)
+ {}
+};
+
+/**
+ * Template wrapper for type_id values, deriving the type_id string
+ * from the known_type enum, for auto-completion joy.
+ */
+template<known_type Type>
+struct typed_value_impl: public typed_value {
+ template<typename ... Args>
+ typed_value_impl(const scollectd::type_instance& ti, Args&& ... args)
+ : typed_value(type_id_for(Type), ti, std::forward<Args>(args)...)
+ {}
+
+ template<typename ... Args>
+ typed_value_impl(scollectd::type_instance ti, description d, Args&& ... args)
+ : typed_value(type_id_for(Type), std::move(ti), std::move(d), std::forward<Args>(args)...)
+ {}
+ template<typename ... Args>
+ typed_value_impl(description d, Args&& ... args)
+ : typed_value(type_id_for(Type), scollectd::type_instance(), std::move(d), std::forward<Args>(args)...)
+ {}
+};
+
+/*!
+ * \deprecated metrics registration should be done using the metrics layer
+ *
+ * Some typedefs for commonly used types. Feel free to add.
+ */
+typedef typed_value_impl<known_type::total_bytes> total_bytes;
+typedef typed_value_impl<known_type::total_connections> total_connections;
+typedef typed_value_impl<known_type::total_objects> total_objects;
+typedef typed_value_impl<known_type::total_operations> total_operations;
+typedef typed_value_impl<known_type::total_requests> total_requests;
+typedef typed_value_impl<known_type::total_sessions> total_sessions;
+typedef typed_value_impl<known_type::total_threads> total_threads;
+typedef typed_value_impl<known_type::total_time_in_ms> total_time_in_ms;
+typedef typed_value_impl<known_type::total_values> total_values;
+typedef typed_value_impl<known_type::queue_length> queue_length;
+typedef typed_value_impl<known_type::counter> counter;
+typedef typed_value_impl<known_type::count> count;
+typedef typed_value_impl<known_type::gauge> gauge;
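+
+// Illustrative sketch (not part of the original header, and deprecated in favour of the metrics
+// layer): binding a couple of polled values via the convenience typedefs above; names are made up.
+//
+//   uint64_t reqs = 0;
+//   uint64_t conns = 0;
+//   scollectd::plugin_instance_metrics pim("my_plugin", scollectd::per_cpu_plugin_instance,
+//           scollectd::total_requests("served", reqs),
+//           scollectd::total_connections("open", conns));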
+
+// lots of template junk to build typed value list tuples
+// for registered values.
+template<typename T, typename En = void>
+struct data_type_for;
+
+template<typename T, typename En = void>
+struct is_callable;
+
+template<typename T>
+struct is_callable<T,
+typename std::enable_if<
+!std::is_void<typename std::result_of<T()>::type>::value,
+void>::type> : public std::true_type {
+};
+
+template<typename T>
+struct is_callable<T,
+typename std::enable_if<std::is_fundamental<T>::value, void>::type> : public std::false_type {
+};
+
+template<typename T>
+struct data_type_for<T,
+typename std::enable_if<
+std::is_integral<T>::value && std::is_unsigned<T>::value,
+void>::type> : public std::integral_constant<data_type,
+data_type::COUNTER> {
+};
+template<typename T>
+struct data_type_for<T,
+typename std::enable_if<
+std::is_integral<T>::value && std::is_signed<T>::value, void>::type> : public std::integral_constant<
+data_type, data_type::DERIVE> {
+};
+template<typename T>
+struct data_type_for<T,
+typename std::enable_if<std::is_floating_point<T>::value, void>::type> : public std::integral_constant<
+data_type, data_type::GAUGE> {
+};
+template<typename T>
+struct data_type_for<T,
+typename std::enable_if<is_callable<T>::value, void>::type> : public data_type_for<
+typename std::result_of<T()>::type> {
+};
+template<typename T>
+struct data_type_for<typed<T>> : public data_type_for<T> {
+};
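+
+// Illustrative sketch (not part of the original header): the collectd data types deduced by the
+// traits above for a few common value types.
+//
+//   static_assert(data_type_for<uint64_t>::value == data_type::COUNTER, "unsigned -> COUNTER");
+//   static_assert(data_type_for<int64_t>::value == data_type::DERIVE, "signed -> DERIVE");
+//   static_assert(data_type_for<double>::value == data_type::GAUGE, "floating point -> GAUGE");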
+
+template<typename T>
+class value {
+public:
+ template<typename W>
+ struct wrap {
+ wrap(const W & v)
+ : _v(v) {
+ }
+ const W & operator()() const {
+ return _v;
+ }
+ const W & _v;
+ };
+
+ typedef typename std::remove_reference<T>::type value_type;
+ typedef typename std::conditional<
+ is_callable<typename std::remove_reference<T>::type>::value,
+ value_type, wrap<value_type> >::type stored_type;
+
+ value(const value_type & t)
+ : value<T>(data_type_for<value_type>::value, t) {
+ }
+ value(data_type type, const value_type & t)
+ : _type(type), _t(t) {
+ }
+ uint64_t operator()() const {
+ auto v = _t();
+ if (_type == data_type::GAUGE) {
+ return convert(double(v));
+ } else {
+ uint64_t u = v;
+ return convert(u);
+ }
+ }
+ operator uint64_t() const {
+ return (*this)();
+ }
+ operator data_type() const {
+ return _type;
+ }
+ data_type type() const {
+ return _type;
+ }
+private:
+ // not super quick value -> protocol endian 64-bit values.
+ template<typename _Iter>
+ void bpack(_Iter s, _Iter e, uint64_t v) const {
+ while (s != e) {
+ *s++ = (v & 0xff);
+ v >>= 8;
+ }
+ }
+ template<typename V>
+ typename std::enable_if<std::is_integral<V>::value, uint64_t>::type convert(
+ V v) const {
+ uint64_t i = v;
+ // network byte order
+ return ntohq(i);
+ }
+ template<typename V>
+ typename std::enable_if<std::is_floating_point<V>::value, uint64_t>::type convert(
+ V t) const {
+ union {
+ uint64_t i;
+ double v;
+ } v;
+ union {
+ uint64_t i;
+ uint8_t b[8];
+ } u;
+ v.v = t;
+ // intel byte order. could also obviously be faster.
+ // could be ignored if we just assume we're le (for now),
+ // but this is ok me thinks.
+ bpack(std::begin(u.b), std::end(u.b), v.i);
+ return u.i;
+ }
+ ;
+
+ const data_type _type;
+ const stored_type _t;
+};
+
+template<typename T>
+class value<typed<T>> : public value<T> {
+public:
+ value(const typed<T> & args)
+: value<T>(args.type, args.value) {
+ }
+};
+
+class value_list {
+ bool _enabled = true;
+public:
+ value_list(description d) : _description(std::move(d))
+ {}
+ value_list(value_list&&) = default;
+ virtual ~value_list() {}
+
+ virtual size_t size() const = 0;
+
+ virtual void types(data_type *) const = 0;
+ virtual void values(net::packed<uint64_t> *) const = 0;
+
+ const description& desc() const {
+ return _description;
+ }
+
+ bool empty() const {
+ return size() == 0;
+ }
+
+ bool is_enabled() const {
+ return _enabled;
+ }
+
+ void set_enabled(bool b) {
+ _enabled = b;
+ }
+private:
+ description _description;
+};
+
+template<typename ... Args>
+class values_impl: public value_list {
+public:
+ static const size_t num_values = sizeof...(Args);
+
+ values_impl(description d, Args&& ...args)
+ : value_list(std::move(d))
+ , _values(std::forward<Args>(args)...)
+ {}
+
+ values_impl(values_impl<Args...>&& a) = default;
+ values_impl(const values_impl<Args...>& a) = default;
+
+ size_t size() const override {
+ return num_values;
+ }
+ void types(data_type * p) const override {
+ unpack(_values, [p](Args... args) {
+ std::initializer_list<data_type> tmp = { args... };
+ std::copy(tmp.begin(), tmp.end(), p);
+ });
+ }
+ void values(net::packed<uint64_t> * p) const override {
+ unpack(_values, [p](Args... args) {
+ std::initializer_list<uint64_t> tmp = { args... };
+ std::copy(tmp.begin(), tmp.end(), p);
+ });
+ }
+private:
+ template<typename _Op>
+ void unpack(const std::tuple<Args...>& t, _Op&& op) const {
+ do_unpack(t, std::index_sequence_for<Args...> {}, std::forward<_Op>(op));
+ }
+
+ template<size_t ...S, typename _Op>
+ void do_unpack(const std::tuple<Args...>& t, const std::index_sequence<S...> &, _Op&& op) const {
+ op(std::get<S>(t)...);
+ }
+
+ std::tuple < Args... > _values;
+};
+
+void add_polled(const type_instance_id &, const shared_ptr<value_list> &, bool enabled = true);
+
+typedef std::function<void()> notify_function;
+template<typename... _Args>
+static auto make_type_instance(description d, _Args && ... args) -> values_impl < decltype(value<_Args>(std::forward<_Args>(args)))... >
+{
+ return values_impl<decltype(value<_Args>(std::forward<_Args>(args)))...>(
+ std::move(d), value<_Args>(std::forward<_Args>(args))...);
+}
+/*!
+ * \deprecated metrics registration should be done using the metrics layer
+ *
+ */
+template<typename ... _Args>
+[[deprecated("Use the metrics layer")]] static type_instance_id add_polled_metric(const plugin_id & plugin,
+ const plugin_instance_id & plugin_instance, const type_id & type,
+ const scollectd::type_instance & type_instance, _Args&& ... args) {
+ return add_polled_metric(plugin, plugin_instance, type, type_instance, description(),
+ std::forward<_Args>(args)...);
+}
+/*!
+ * \deprecated metrics registration should be done using the metrics layer
+ *
+ */
+template<typename ... _Args>
+[[deprecated("Use the metrics layer")]] static type_instance_id add_polled_metric(const plugin_id & plugin,
+ const plugin_instance_id & plugin_instance, const type_id & type,
+ const scollectd::type_instance & type_instance, description d, _Args&& ... args) {
+ return add_polled_metric(
+ type_instance_id(plugin, plugin_instance, type, type_instance), std::move(d),
+ std::forward<_Args>(args)...);
+}
+template<typename ... _Args>
+static future<> send_explicit_metric(const plugin_id & plugin,
+ const plugin_instance_id & plugin_instance, const type_id & type,
+ const scollectd::type_instance & type_instance, _Args&& ... args) {
+ return send_explicit_metric(
+ type_instance_id(plugin, plugin_instance, type, type_instance),
+ std::forward<_Args>(args)...);
+}
+template<typename ... _Args>
+static notify_function create_explicit_metric(const plugin_id & plugin,
+ const plugin_instance_id & plugin_instance, const type_id & type,
+ const scollectd::type_instance & type_instance, _Args&& ... args) {
+ return create_explicit_metric(
+ type_instance_id(plugin, plugin_instance, type, type_instance),
+ std::forward<_Args>(args)...);
+}
+
+seastar::metrics::impl::metric_id to_metrics_id(const type_instance_id & id);
+/*!
+ * \deprecated metrics registration should be done using the metrics layer
+ *
+ */
+template<typename Arg>
+[[deprecated("Use the metrics layer")]] static type_instance_id add_polled_metric(const type_instance_id & id, description d,
+ Arg&& arg, bool enabled = true) {
+ seastar::metrics::impl::get_local_impl()->add_registration(to_metrics_id(id), arg.type, seastar::metrics::impl::make_function(arg.value, arg.type), d, enabled);
+ return id;
+}
+/*!
+ * \deprecated metrics registration should be done using the metrics layer
+ *
+ */
+template<typename Arg>
+[[deprecated("Use the metrics layer")]] static type_instance_id add_polled_metric(const type_instance_id & id,
+ Arg&& arg) {
+    return add_polled_metric(id, description(), std::forward<Arg>(arg));
+}
+
+/*!
+ * \deprecated metrics registration should be done using the metrics layer
+ *
+ */
+template<typename Args>
+[[deprecated("Use the metrics layer")]] static type_instance_id add_disabled_polled_metric(const type_instance_id & id, description d,
+ Args&& arg) {
+ return add_polled_metric(id, d, std::forward<Args>(arg), false);
+}
+
+template<typename Args>
+static type_instance_id add_disabled_polled_metric(const type_instance_id & id,
+ Args&& args) {
+ return add_disabled_polled_metric(id, description(), std::forward<Args>(args));
+}
+
+template<typename ... Args>
+static type_instance_id add_disabled_polled_metric(const type_instance_id & id,
+ Args&& ... args) {
+ return add_disabled_polled_metric(id, description(), std::forward<Args>(args)...);
+}
+
+// "Explicit" metric sends. Sends a single value list as a message.
+// Obviously not super efficient either. But maybe someone needs it sometime.
+template<typename ... _Args>
+static future<> send_explicit_metric(const type_instance_id & id,
+ _Args&& ... args) {
+ return send_metric(id, make_type_instance(std::forward<_Args>(args)...));
+}
+template<typename ... _Args>
+static notify_function create_explicit_metric(const type_instance_id & id,
+ _Args&& ... args) {
+ auto list = make_type_instance(std::forward<_Args>(args)...);
+ return [id, list=std::move(list)]() {
+ send_metric(id, list);
+ };
+}
+
+template<typename... Args>
+typed_value::typed_value(const type_id& tid, const scollectd::type_instance& ti, description d, Args&&... args)
+ : _type_id(tid)
+ , _type_instance(ti)
+ , _values(::seastar::make_shared<decltype(make_type_instance(std::move(d), std::forward<Args>(args)...))>(make_type_instance(std::move(d), std::forward<Args>(args)...)))
+{}
+
+// Send a message packet (string)
+future<> send_notification(const type_instance_id & id, const sstring & msg);
+};
+
+}
diff --git a/src/seastar/include/seastar/core/scollectd_api.hh b/src/seastar/include/seastar/core/scollectd_api.hh
new file mode 100644
index 000000000..b4ca7eae9
--- /dev/null
+++ b/src/seastar/include/seastar/core/scollectd_api.hh
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+#pragma once
+
+#include <seastar/core/scollectd.hh>
+#include <seastar/core/metrics_api.hh>
+
+namespace seastar {
+
+namespace scollectd {
+
+using collectd_value = seastar::metrics::impl::metric_value;
+
+std::vector<collectd_value> get_collectd_value(
+ const scollectd::type_instance_id& id);
+
+std::vector<scollectd::type_instance_id> get_collectd_ids();
+
+sstring get_collectd_description_str(const scollectd::type_instance_id&);
+
+bool is_enabled(const scollectd::type_instance_id& id);
+/**
+ * Enable or disable collectd metrics on local instance
+ * @param id - the metric to enable or disable
+ * @param enable - whether the collectd metric should be enabled or disabled
+ */
+void enable(const scollectd::type_instance_id& id, bool enable);
+
+
+metrics::impl::value_map get_value_map();
+}
+
+}
diff --git a/src/seastar/include/seastar/core/seastar.hh b/src/seastar/include/seastar/core/seastar.hh
new file mode 100644
index 000000000..18130ae92
--- /dev/null
+++ b/src/seastar/include/seastar/core/seastar.hh
@@ -0,0 +1,386 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+/// \mainpage
+///
+/// Seastar is a high performance C++ application framework for high
+/// concurrency server applications.
+///
+/// A good place to start is the [Tutorial](tutorial.html) or [Multi-page version](split/).
+///
+/// Please see:
+/// - \ref future-module Documentation on futures and promises, which are
+/// the seastar building blocks.
+/// - \ref future-util Utility functions for working with futures
+/// - \ref memory-module Memory management
+/// - \ref networking-module TCP/IP networking
+/// - \ref fileio-module File Input/Output
+/// - \ref smp-module Multicore support
+/// - \ref fiber-module Utilities for managing loosely coupled chains of
+/// continuations, also known as fibers
+/// - \ref thread-module Support for traditional threaded execution
+/// - \ref rpc Build high-level communication protocols
+///
+/// View the [Seastar compatibility statement](./md_compatibility.html) for
+/// information about library evolution.
+
+#include <seastar/core/sstring.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/file-types.hh>
+#include <seastar/util/bool_class.hh>
+#include <seastar/util/std-compat.hh>
+#include "./internal/api-level.hh"
+
+namespace seastar {
+
+// iostream.hh
+template <class CharType> class input_stream;
+template <class CharType> class output_stream;
+
+class server_socket;
+class socket;
+class connected_socket;
+class socket_address;
+struct listen_options;
+enum class transport;
+
+// file.hh
+class file;
+struct file_open_options;
+struct stat_data;
+
+namespace net {
+
+class udp_channel;
+
+}
+
+// Networking API
+
+/// \defgroup networking-module Networking
+///
+/// Seastar provides a simple networking API, backed by two
+/// TCP/IP stacks: the POSIX stack, utilizing the kernel's
+/// BSD socket APIs, and the native stack, implemented fully
+/// within seastar and able to drive network cards directly.
+/// The native stack supports zero-copy on both transmit
+/// and receive, and is implemented using seastar's high
+/// performance, lockless sharded design. The network stack
+/// can be selected with the \c \--network-stack command-line
+/// parameter.
+
+/// \addtogroup networking-module
+/// @{
+
+/// Listen for connections on a given port
+///
+/// Starts listening on a given address for incoming connections.
+///
+/// \param sa socket address to listen on
+///
+/// \return \ref server_socket object ready to accept connections.
+///
+/// \see listen(socket_address sa, listen_options opts)
+server_socket listen(socket_address sa);
+
+/// Listen for connections on a given port
+///
+/// Starts listening on a given address for incoming connections.
+///
+/// \param sa socket address to listen on
+/// \param opts options controlling the listen operation
+///
+/// \return \ref server_socket object ready to accept connections.
+///
+/// \see listen(socket_address sa)
+server_socket listen(socket_address sa, listen_options opts);
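+
+/// Example (illustrative sketch, not part of the original header): listening on TCP port 1234
+/// with address reuse; \c make_ipv4_address comes from the net API headers and the
+/// \c listen_options field name is an assumption.
+///
+///     listen_options lo;
+///     lo.reuse_address = true;   // assumed field
+///     server_socket ss = listen(make_ipv4_address({1234}), lo);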
+
+/// Establishes a connection to a given address
+///
+/// Attempts to connect to the given address.
+///
+/// \param sa socket address to connect to
+///
+/// \return a \ref connected_socket object, or an exception
+future<connected_socket> connect(socket_address sa);
+
+/// Establishes a connection to a given address
+///
+/// Attempts to connect to the given address with a defined local endpoint
+///
+/// \param sa socket address to connect to
+/// \param local socket address for local endpoint
+/// \param proto transport protocol (TCP or SCTP)
+///
+/// \return a \ref connected_socket object, or an exception
+future<connected_socket> connect(socket_address sa, socket_address local, transport proto);
+
+
+/// Creates a socket object suitable for establishing stream-oriented connections
+///
+/// \return a \ref socket object that can be used for establishing connections
+socket make_socket();
+
+/// Creates a udp_channel object suitable for sending UDP packets
+///
+/// The channel is not bound to a local address, and thus can only be used
+/// for sending.
+///
+/// \return a \ref net::udp_channel object that can be used for UDP transfers.
+net::udp_channel make_udp_channel();
+
+
+/// Creates a udp_channel object suitable for sending and receiving UDP packets
+///
+/// \param local local address to bind to
+///
+/// \return a \ref net::udp_channel object that can be used for UDP transfers.
+net::udp_channel make_udp_channel(const socket_address& local);
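+
+/// Example (illustrative, not part of the original header): a channel bound to a local port so it
+/// can also receive; \c make_ipv4_address is assumed from the net API headers.
+///
+///     net::udp_channel chan = make_udp_channel(make_ipv4_address({5353}));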
+
+/// @}
+
+/// \defgroup fileio-module File Input/Output
+///
+/// Seastar provides a file API to deal with persistent storage.
+/// Unlike most file APIs, seastar offers unbuffered file I/O
+/// (similar to, and based on, \c O_DIRECT). Unbuffered I/O means
+/// that the application is required to do its own caching, but
+/// delivers better performance if this caching is done correctly.
+///
+/// For random I/O or sequential unbuffered I/O, the \ref file
+/// class provides a set of methods for reading, writing, discarding,
+/// or otherwise manipulating a file. For buffered sequential I/O,
+/// see \ref make_file_input_stream() and \ref make_file_output_stream().
+
+/// \addtogroup fileio-module
+/// @{
+
+/// Opens or creates a file. The "dma" in the name refers to the fact
+/// that data transfers are unbuffered and uncached.
+///
+/// \param name the name of the file to open or create
+/// \param flags various flags controlling the open process
+/// \return a \ref file object, as a future
+///
+/// \note
+/// The file name is not guaranteed to be stable on disk, unless the
+/// containing directory is sync'ed.
+///
+/// \relates file
+future<file> open_file_dma(std::string_view name, open_flags flags) noexcept;
+
+/// Opens or creates a file. The "dma" in the name refers to the fact
+/// that data transfers are unbuffered and uncached.
+///
+/// \param name the name of the file to open or create
+/// \param flags various flags controlling the open process
+/// \param options options for opening the file
+/// \return a \ref file object, as a future
+///
+/// \note
+/// The file name is not guaranteed to be stable on disk, unless the
+/// containing directory is sync'ed.
+///
+/// \relates file
+future<file> open_file_dma(std::string_view name, open_flags flags, file_open_options options) noexcept;
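+
+/// Example (illustrative sketch, not part of the original header): reading the first 4 KiB of a
+/// file opened for unbuffered I/O; \c dma_read and \c temporary_buffer are declared in other
+/// headers, and error handling is omitted.
+///
+///     open_file_dma("data.bin", open_flags::ro).then([] (file f) {
+///         return f.dma_read<char>(0, 4096).then([f] (temporary_buffer<char> buf) mutable {
+///             return f.close();
+///         });
+///     });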
+
+/// Checks if a given directory supports direct io
+///
+/// Seastar bypasses the Operating System caches and issues direct io to the
+/// underlying block devices. Projects using seastar should check if the directory
+/// lies in a filesystem that support such operations. This function can be used
+/// to do that.
+///
+/// The returned future resolves if direct io can be used, or fails with an std::system_error
+/// exception carrying the EINVAL error code otherwise.
+///
+/// A std::system_error with the respective error code is also thrown if \c path is
+/// not a directory.
+///
+/// \param path the directory we need to verify.
+future<> check_direct_io_support(std::string_view path) noexcept;
+
+/// Opens a directory.
+///
+/// \param name name of the directory to open
+///
+/// \return a \ref file object representing a directory. The only
+/// legal operations are \ref file::list_directory(),
+/// \ref file::flush(), and \ref file::close().
+///
+/// \relates file
+future<file> open_directory(std::string_view name) noexcept;
+
+/// Creates a new directory.
+///
+/// \param name name of the directory to create
+/// \param permissions optional file permissions of the directory to create.
+///
+/// \note
+/// The directory is not guaranteed to be stable on disk, unless the
+/// containing directory is sync'ed.
+future<> make_directory(std::string_view name, file_permissions permissions = file_permissions::default_dir_permissions) noexcept;
+
+/// Ensures a directory exists
+///
+/// Checks whether a directory exists, and if not, creates it. Only
+/// the last component of the directory name is created.
+///
+/// \param name name of the directory to potentially create
+/// \param permissions optional file permissions of the directory to create.
+///
+/// \note
+/// The directory is not guaranteed to be stable on disk, unless the
+/// containing directory is sync'ed.
+/// If the directory exists, the provided permissions are not applied.
+future<> touch_directory(std::string_view name, file_permissions permissions = file_permissions::default_dir_permissions) noexcept;
+
+/// Recursively ensures a directory exists
+///
+/// Checks whether each component of a directory exists, and if not, creates it.
+///
+/// \param name name of the directory to potentially create
+/// \param permissions optional file permissions of the directory to create.
+///
+/// \note
+/// This function fsyncs each component created, and is therefore guaranteed to be stable on disk.
+/// The provided permissions are applied only to the last component in the path, if it needs to be created;
+/// if intermediate directories do not exist, they are created with the default_dir_permissions.
+/// If any directory exists, the provided permissions are not applied.
+future<> recursive_touch_directory(std::string_view name, file_permissions permissions = file_permissions::default_dir_permissions) noexcept;
+
+/// Synchronizes a directory to disk
+///
+/// Makes sure the modifications in a directory are synchronized on disk.
+/// This is useful, for instance, after creating or removing a file inside the
+/// directory.
+///
+/// \param name name of the directory to synchronize
+future<> sync_directory(std::string_view name) noexcept;
+
+
+/// Removes (unlinks) a file or an empty directory
+///
+/// \param name name of the file or the directory to remove
+///
+/// \note
+/// The removal is not guaranteed to be stable on disk, unless the
+/// containing directory is sync'ed.
+future<> remove_file(std::string_view name) noexcept;
+
+/// Renames (moves) a file.
+///
+/// \param old_name existing file name
+/// \param new_name new file name
+///
+/// \note
+/// The rename is not guaranteed to be stable on disk, unless
+/// both containing directories are sync'ed.
+future<> rename_file(std::string_view old_name, std::string_view new_name) noexcept;
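+
+/// Example (illustrative, not part of the original header): making a rename durable by syncing the
+/// containing directory afterwards.
+///
+///     rename_file("dir/tmp.dat", "dir/data.dat").then([] {
+///         return sync_directory("dir");
+///     });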
+
+struct follow_symlink_tag { };
+using follow_symlink = bool_class<follow_symlink_tag>;
+
+/// Return stat information about a file.
+///
+/// \param name name of the file to return its stat information
+/// \param fs a follow_symlink flag to follow symbolic links.
+///
+/// \return stat_data of the file identified by name.
+/// If name identifies a symbolic link then stat_data is returned either for the target of the link,
+/// with follow_symlink::yes, or for the link itself, with follow_symlink::no.
+future<stat_data> file_stat(std::string_view name, follow_symlink fs = follow_symlink::yes) noexcept;
+
+/// Return the size of a file.
+///
+/// \param name name of the file to return the size
+///
+/// Note that file_size of a symlink is NOT the size of the symlink -
+/// which is the length of the pathname it contains -
+/// but rather the size of the file to which it points.
+future<uint64_t> file_size(std::string_view name) noexcept;
+
+/// Check file access.
+///
+/// \param name name of the file to check
+/// \param flags bit pattern containing type of access to check (read/write/execute or exists).
+///
+/// If only access_flags::exists is queried, returns true if the file exists, or false otherwise.
+/// Throws a std::filesystem::filesystem_error exception if any error other than ENOENT is encountered.
+///
+/// If any of the access_flags (read/write/execute) is set, returns true if the file exists and is
+/// accessible with the requested flags, or false if the file exists and is not accessible
+/// as queried.
+/// Throws a std::filesystem::filesystem_error exception if any error other than EACCES is encountered.
+/// Note that if any path component leading to the file is not searchable, the file is considered inaccessible
+/// with the requested mode and false will be returned.
+future<bool> file_accessible(std::string_view name, access_flags flags) noexcept;
+
+/// check if a file exists.
+///
+/// \param name name of the file to check
+future<bool> file_exists(std::string_view name) noexcept;
+
+/// Determine the type of a file (regular file, directory, etc.)
+///
+/// \param name name of the file for which type information is requested
+/// \param follow a follow_symlink flag that determines whether a trailing symbolic link should be followed or not
+///
+/// \return an engaged optional with the file type if lookup was successful; a disengaged optional
+/// if the file (or one of its parent directories) does not exist; an exceptional future on
+/// other errors.
+future<std::optional<directory_entry_type>> file_type(std::string_view name, follow_symlink follow = follow_symlink::yes) noexcept;
+
+
+/// Creates a hard link for a file
+///
+/// \param oldpath existing file name
+/// \param newpath name of link
+///
+future<> link_file(std::string_view oldpath, std::string_view newpath) noexcept;
+
+/// Changes the permissions mode of a file or directory
+///
+/// \param name name of the file or directory to change
+/// \param permissions permissions to set
+///
+future<> chmod(std::string_view name, file_permissions permissions) noexcept;
+
+/// Return information about the filesystem where a file is located.
+///
+/// \param name name of the file to inspect
+future<fs_type> file_system_at(std::string_view name) noexcept;
+
+/// Return space available to unprivileged users in filesystem where a file is located, in bytes.
+///
+/// \param name name of the file to inspect
+future<uint64_t> fs_avail(std::string_view name) noexcept;
+
+/// Return free space in filesystem where a file is located, in bytes.
+///
+/// \param name name of the file to inspect
+future<uint64_t> fs_free(std::string_view name) noexcept;
+/// @}
+
+}
diff --git a/src/seastar/include/seastar/core/semaphore.hh b/src/seastar/include/seastar/core/semaphore.hh
new file mode 100644
index 000000000..a4837da0d
--- /dev/null
+++ b/src/seastar/include/seastar/core/semaphore.hh
@@ -0,0 +1,572 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/chunked_fifo.hh>
+#include <stdexcept>
+#include <exception>
+#include <seastar/core/timer.hh>
+#include <seastar/core/expiring_fifo.hh>
+
+namespace seastar {
+
+/// \addtogroup fiber-module
+/// @{
+
+/// Exception thrown when a semaphore is broken by
+/// \ref semaphore::broken().
+class broken_semaphore : public std::exception {
+public:
+ /// Reports the exception reason.
+ virtual const char* what() const noexcept;
+};
+
+/// Exception thrown when a semaphore wait operation
+/// times out.
+///
+/// \see semaphore::wait(typename timer<>::duration timeout, size_t nr)
+class semaphore_timed_out : public std::exception {
+public:
+ /// Reports the exception reason.
+ virtual const char* what() const noexcept;
+};
+
+/// Exception Factory for standard semaphore
+///
+/// constructs standard semaphore exceptions
+/// \see semaphore_timed_out and broken_semaphore
+struct semaphore_default_exception_factory {
+ static semaphore_timed_out timeout() noexcept;
+ static broken_semaphore broken() noexcept;
+};
+
+class named_semaphore_timed_out : public semaphore_timed_out {
+ sstring _msg;
+public:
+ named_semaphore_timed_out(std::string_view msg) noexcept;
+ virtual const char* what() const noexcept;
+};
+
+class broken_named_semaphore : public broken_semaphore {
+ sstring _msg;
+public:
+ broken_named_semaphore(std::string_view msg) noexcept;
+ virtual const char* what() const noexcept;
+};
+
+// A factory of semaphore exceptions that contain additional context: the semaphore name
+// auto sem = named_semaphore(0, named_semaphore_exception_factory{"file_opening_limit_semaphore"});
+struct named_semaphore_exception_factory {
+ sstring name;
+ named_semaphore_timed_out timeout() const noexcept;
+ broken_named_semaphore broken() const noexcept;
+};
+
+/// \brief Counted resource guard.
+///
+/// This is a standard computer science semaphore, adapted
+/// for futures. You can deposit units into a counter,
+/// or take them away. Taking units from the counter may wait
+/// if not enough units are available.
+///
+/// To support exceptional conditions, a \ref broken() method
+/// is provided, which causes all current waiters to stop waiting,
+/// with an exceptional future returned. This allows causing all
+/// fibers that are blocked on a semaphore to continue. This is
+/// similar to POSIX's `pthread_cancel()`, with \ref wait() acting
+/// as a cancellation point.
+///
+/// \tparam ExceptionFactory template parameter allows modifying a semaphore to throw
+/// customized exceptions on timeout/broken(). It has to provide two functions
+/// ExceptionFactory::timeout() and ExceptionFactory::broken() which return corresponding
+/// exception object.
+template<typename ExceptionFactory, typename Clock = typename timer<>::clock>
+class basic_semaphore : private ExceptionFactory {
+public:
+ using duration = typename timer<Clock>::duration;
+ using clock = typename timer<Clock>::clock;
+ using time_point = typename timer<Clock>::time_point;
+ using exception_factory = ExceptionFactory;
+private:
+ ssize_t _count;
+ std::exception_ptr _ex;
+ struct entry {
+ promise<> pr;
+ size_t nr;
+ entry(promise<>&& pr_, size_t nr_) noexcept : pr(std::move(pr_)), nr(nr_) {}
+ };
+ using expiry_handler = std::function<void (entry&)>;
+ expiring_fifo<entry, expiry_handler, clock> _wait_list;
+ expiry_handler make_expiry_handler() noexcept {
+ return [this] (entry& e) noexcept {
+ try {
+ e.pr.set_exception(this->timeout());
+ } catch (...) {
+ e.pr.set_exception(semaphore_timed_out());
+ }
+ };
+ }
+ bool has_available_units(size_t nr) const noexcept {
+ return _count >= 0 && (static_cast<size_t>(_count) >= nr);
+ }
+ bool may_proceed(size_t nr) const noexcept {
+ return has_available_units(nr) && _wait_list.empty();
+ }
+public:
+ /// Returns the maximum number of units the semaphore counter can hold
+ static constexpr size_t max_counter() noexcept {
+ return std::numeric_limits<decltype(_count)>::max();
+ }
+
+ /// Constructs a semaphore object with a specific number of units
+ /// in its internal counter. E.g., starting it at 1 is suitable for use as
+ /// an unlocked mutex.
+ ///
+ /// \param count number of initial units present in the counter.
+ basic_semaphore(size_t count) noexcept(std::is_nothrow_default_constructible_v<exception_factory>)
+ : exception_factory()
+ , _count(count),
+ _wait_list(make_expiry_handler())
+ {}
+ basic_semaphore(size_t count, exception_factory&& factory) noexcept(std::is_nothrow_move_constructible_v<exception_factory>)
+ : exception_factory(std::move(factory))
+ , _count(count)
+ , _wait_list(make_expiry_handler())
+ {
+ static_assert(std::is_nothrow_move_constructible_v<expiry_handler>);
+ }
+ /// Waits until at least a specific number of units are available in the
+ /// counter, and reduces the counter by that amount of units.
+ ///
+ /// \note Waits are serviced in FIFO order, though if several are awakened
+ /// at once, they may be reordered by the scheduler.
+ ///
+ /// \param nr Amount of units to wait for (default 1).
+ /// \return a future that becomes ready when sufficient units are available
+ /// to satisfy the request. If the semaphore was \ref broken(), may
+ /// contain an exception.
+ future<> wait(size_t nr = 1) noexcept {
+ return wait(time_point::max(), nr);
+ }
+ /// Waits until at least a specific number of units are available in the
+ /// counter, and reduces the counter by that amount of units. If the request
+ /// cannot be satisfied in time, the request is aborted.
+ ///
+ /// \note Waits are serviced in FIFO order, though if several are awakened
+ /// at once, they may be reordered by the scheduler.
+ ///
+ /// \param timeout expiration time.
+ /// \param nr Amount of units to wait for (default 1).
+ /// \return a future that becomes ready when sufficient units are available
+ /// to satisfy the request. On timeout, the future contains a
+ /// \ref semaphore_timed_out exception. If the semaphore was
+ /// \ref broken(), may contain an exception.
+ future<> wait(time_point timeout, size_t nr = 1) noexcept {
+ if (may_proceed(nr)) {
+ _count -= nr;
+ return make_ready_future<>();
+ }
+ if (_ex) {
+ return make_exception_future(_ex);
+ }
+ entry e(promise<>(), nr);
+ auto fut = e.pr.get_future();
+ try {
+ _wait_list.push_back(std::move(e), timeout);
+ } catch (...) {
+ e.pr.set_exception(std::current_exception());
+ }
+ return fut;
+ }
+
+ /// Waits until at least a specific number of units are available in the
+ /// counter, and reduces the counter by that amount of units. If the request
+ /// cannot be satisfied in time, the request is aborted.
+ ///
+ /// \note Waits are serviced in FIFO order, though if several are awakened
+ /// at once, they may be reordered by the scheduler.
+ ///
+ /// \param timeout how long to wait.
+ /// \param nr Amount of units to wait for (default 1).
+ /// \return a future that becomes ready when sufficient units are available
+ /// to satisfy the request. On timeout, the future contains a
+ /// \ref semaphore_timed_out exception. If the semaphore was
+ /// \ref broken(), may contain an exception.
+ future<> wait(duration timeout, size_t nr = 1) noexcept {
+ return wait(clock::now() + timeout, nr);
+ }
+ /// Deposits a specified number of units into the counter.
+ ///
+ /// The counter is incremented by the specified number of units.
+ /// If the new counter value is sufficient to satisfy the request
+ /// of one or more waiters, their futures (in FIFO order) become
+ /// ready, and the value of the counter is reduced according to
+ /// the amount requested.
+ ///
+ /// \param nr Number of units to deposit (default 1).
+ void signal(size_t nr = 1) noexcept {
+ if (_ex) {
+ return;
+ }
+ _count += nr;
+ while (!_wait_list.empty() && has_available_units(_wait_list.front().nr)) {
+ auto& x = _wait_list.front();
+ _count -= x.nr;
+ x.pr.set_value();
+ _wait_list.pop_front();
+ }
+ }
+
+    /// Consume the specified number of units without blocking
+ //
+    /// Consume the specified number of units now, regardless of how many units are available
+ /// in the counter, and reduces the counter by that amount of units. This operation may
+ /// cause the counter to go negative.
+ ///
+ /// \param nr Amount of units to consume (default 1).
+ void consume(size_t nr = 1) noexcept {
+ if (_ex) {
+ return;
+ }
+ _count -= nr;
+ }
+
+ /// Attempts to reduce the counter value by a specified number of units.
+ ///
+ /// If sufficient units are available in the counter, and if no
+ /// other fiber is waiting, then the counter is reduced. Otherwise,
+ /// nothing happens. This is useful for "opportunistic" waits where
+ /// useful work can happen if the counter happens to be ready, but
+ /// when it is not worthwhile to wait.
+ ///
+ /// \param nr number of units to reduce the counter by (default 1).
+ /// \return `true` if the counter had sufficient units, and was decremented.
+ bool try_wait(size_t nr = 1) noexcept {
+ if (may_proceed(nr)) {
+ _count -= nr;
+ return true;
+ } else {
+ return false;
+ }
+ }
+ /// Returns the number of units available in the counter.
+ ///
+ /// Does not take into account any waiters.
+ size_t current() const noexcept { return std::max(_count, ssize_t(0)); }
+
+ /// Returns the number of available units.
+ ///
+ /// Takes into account units consumed using \ref consume() and therefore
+ /// may return a negative value.
+ ssize_t available_units() const noexcept { return _count; }
+
+ /// Returns the current number of waiters
+ size_t waiters() const noexcept { return _wait_list.size(); }
+
+ /// Signal to waiters that an error occurred. \ref wait() will see
+ /// an exceptional future<> containing a \ref broken_semaphore exception.
+ /// The future is made available immediately.
+ void broken() noexcept {
+ std::exception_ptr ep;
+ try {
+ ep = std::make_exception_ptr(exception_factory::broken());
+ } catch (...) {
+ ep = std::make_exception_ptr(broken_semaphore());
+ }
+ broken(std::move(ep));
+ }
+
+ /// Signal to waiters that an error occurred. \ref wait() will see
+ /// an exceptional future<> containing the provided exception parameter.
+ /// The future is made available immediately.
+ template <typename Exception>
+ void broken(const Exception& ex) noexcept {
+ broken(std::make_exception_ptr(ex));
+ }
+
+ /// Signal to waiters that an error occurred. \ref wait() will see
+ /// an exceptional future<> containing the provided exception parameter.
+ /// The future is made available immediately.
+ void broken(std::exception_ptr ex) noexcept;
+
+ /// Reserve memory for waiters so that wait() will not throw.
+ void ensure_space_for_waiters(size_t n) {
+ _wait_list.reserve(n);
+ }
+};
+
+template<typename ExceptionFactory, typename Clock>
+inline
+void
+basic_semaphore<ExceptionFactory, Clock>::broken(std::exception_ptr xp) noexcept {
+ static_assert(std::is_nothrow_copy_constructible_v<std::exception_ptr>);
+ _ex = xp;
+ _count = 0;
+ while (!_wait_list.empty()) {
+ auto& x = _wait_list.front();
+ x.pr.set_exception(xp);
+ _wait_list.pop_front();
+ }
+}
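+
+/// Example (illustrative sketch, not part of the original header): limiting concurrency, assuming
+/// the usual \c semaphore alias for basic_semaphore<semaphore_default_exception_factory> and a
+/// hypothetical future-returning \c do_work().
+///
+///     semaphore limit(10);
+///     future<> do_one() {
+///         return limit.wait().then([] {
+///             return do_work().finally([] { limit.signal(); });
+///         });
+///     }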
+
+template<typename ExceptionFactory = semaphore_default_exception_factory, typename Clock = typename timer<>::clock>
+class semaphore_units {
+ basic_semaphore<ExceptionFactory, Clock>* _sem;
+ size_t _n;
+
+ semaphore_units(basic_semaphore<ExceptionFactory, Clock>* sem, size_t n) noexcept : _sem(sem), _n(n) {}
+public:
+ semaphore_units() noexcept : semaphore_units(nullptr, 0) {}
+ semaphore_units(basic_semaphore<ExceptionFactory, Clock>& sem, size_t n) noexcept : semaphore_units(&sem, n) {}
+ semaphore_units(semaphore_units&& o) noexcept : _sem(o._sem), _n(std::exchange(o._n, 0)) {
+ }
+ semaphore_units& operator=(semaphore_units&& o) noexcept {
+ _sem = o._sem;
+ _n = std::exchange(o._n, 0);
+ return *this;
+ }
+ semaphore_units(const semaphore_units&) = delete;
+ ~semaphore_units() noexcept {
+ return_all();
+ }
+ /// Return ownership of some units to the semaphore. The semaphore will be signaled by the number of units returned.
+ ///
+ /// \param units number of units to subtract.
+ ///
+ /// \note throws exception if \c units is more than those protected by the semaphore
+ ///
+ /// \return the number of remaining units
+ size_t return_units(size_t units) {
+ if (units > _n) {
+ throw std::invalid_argument("Cannot take more units than those protected by the semaphore");
+ }
+ _n -= units;
+ _sem->signal(units);
+ return _n;
+ }
+ /// Return ownership of all units. The semaphore will be signaled by the number of units returned.
+ void return_all() noexcept {
+ if (_n) {
+ _sem->signal(_n);
+ _n = 0;
+ }
+ }
+ /// Releases ownership of the units. The semaphore will not be signalled.
+ ///
+ /// \return the number of units held
+ size_t release() noexcept {
+ return std::exchange(_n, 0);
+ }
+ /// Splits this instance into a \ref semaphore_units object holding the specified amount of units.
+ /// This object will continue holding the remaining units.
+ ///
+ /// \param units number of units to subtract.
+ ///
+ /// \note throws exception if \c units is more than those protected by the semaphore
+ ///
+ /// \return semaphore_units holding the specified number of units
+ semaphore_units split(size_t units) {
+ if (units > _n) {
+ throw std::invalid_argument("Cannot take more units than those protected by the semaphore");
+ }
+ _n -= units;
+ return semaphore_units(_sem, units);
+ }
+ /// The inverse of split(), in which the units held by the specified \ref semaphore_units
+ /// object are merged into the current one. The function assumes (and asserts) that both
+ /// are associated with the same \ref semaphore.
+    ///
+    /// \param other the \ref semaphore_units object whose units are merged into this one
+ void adopt(semaphore_units&& other) noexcept {
+ assert(other._sem == _sem);
+ _n += other.release();
+ }
+
+ /// Returns the number of units held
+ size_t count() const noexcept {
+ return _n;
+ }
+};
+
+/// \brief Take units from semaphore temporarily
+///
+/// Takes units from the semaphore and returns them when the \ref semaphore_units object goes out of scope.
+/// This provides a safe way to temporarily take units from a semaphore and ensure
+/// that they are eventually returned under all circumstances (exceptions, premature scope exits, etc).
+///
+/// Unlike with_semaphore(), the scope of unit holding is not limited to the scope of a single async lambda.
+///
+/// \param sem The semaphore to take units from
+/// \param units Number of units to take
+/// \return a \ref future<> holding \ref semaphore_units object. When the object goes out of scope
+/// the units are returned to the semaphore.
+///
+/// \note The caller must guarantee that \c sem is valid as long as
+/// \ref semaphore_units object is alive.
+///
+/// \related semaphore
+template<typename ExceptionFactory, typename Clock = typename timer<>::clock>
+future<semaphore_units<ExceptionFactory, Clock>>
+get_units(basic_semaphore<ExceptionFactory, Clock>& sem, size_t units) noexcept {
+ return sem.wait(units).then([&sem, units] {
+ return semaphore_units<ExceptionFactory, Clock>{ sem, units };
+ });
+}
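+
+// A minimal usage sketch (not part of the original header): `limit` and
+// `do_work()` are hypothetical names used only for illustration. The acquired
+// units are returned to the semaphore when the `units` object is destroyed.
+//
+//     seastar::semaphore limit(10);
+//
+//     seastar::future<> throttled() {
+//         return seastar::get_units(limit, 1).then([] (auto units) {
+//             // do_work() runs with one unit held; the unit is released
+//             // when the finally-captured `units` object goes out of scope.
+//             return do_work().finally([units = std::move(units)] {});
+//         });
+//     }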
+
+/// \brief Take units from semaphore temporarily with time bound on wait
+///
+/// Like \ref get_units(basic_semaphore<ExceptionFactory>&, size_t) but when
+/// timeout is reached before units are granted returns an exceptional future holding semaphore_timed_out.
+///
+/// \param sem The semaphore to take units from
+/// \param units Number of units to take
+/// \return a \ref future<> holding \ref semaphore_units object. When the object goes out of scope
+/// the units are returned to the semaphore.
+///
+/// \note The caller must guarantee that \c sem is valid as long as
+/// \ref semaphore_units object is alive.
+///
+/// \related semaphore
+template<typename ExceptionFactory, typename Clock = typename timer<>::clock>
+future<semaphore_units<ExceptionFactory, Clock>>
+get_units(basic_semaphore<ExceptionFactory, Clock>& sem, size_t units, typename basic_semaphore<ExceptionFactory, Clock>::time_point timeout) noexcept {
+ return sem.wait(timeout, units).then([&sem, units] {
+ return semaphore_units<ExceptionFactory, Clock>{ sem, units };
+ });
+}
+
+/// \brief Take units from semaphore temporarily with time bound on wait
+///
+/// Like \ref get_units(basic_semaphore<ExceptionFactory>&, size_t, basic_semaphore<ExceptionFactory>::time_point) but
+/// allow the timeout to be specified as a duration.
+///
+/// \param sem The semaphore to take units from
+/// \param units Number of units to take
+/// \param timeout a duration specifying when to timeout the current request
+/// \return a \ref future<> holding \ref semaphore_units object. When the object goes out of scope
+/// the units are returned to the semaphore.
+///
+/// \note The caller must guarantee that \c sem is valid as long as
+/// \ref semaphore_units object is alive.
+///
+/// \related semaphore
+template<typename ExceptionFactory, typename Clock>
+future<semaphore_units<ExceptionFactory, Clock>>
+get_units(basic_semaphore<ExceptionFactory, Clock>& sem, size_t units, typename basic_semaphore<ExceptionFactory, Clock>::duration timeout) noexcept {
+ return sem.wait(timeout, units).then([&sem, units] {
+ return semaphore_units<ExceptionFactory, Clock>{ sem, units };
+ });
+}
+
+
+/// \brief Consume units from semaphore temporarily
+///
+/// Consumes units from the semaphore and returns them when the \ref semaphore_units object goes out of scope.
+/// This provides a safe way to temporarily take units from a semaphore and ensure
+/// that they are eventually returned under all circumstances (exceptions, premature scope exits, etc).
+///
+/// Unlike get_units(), this calls the non-blocking consume() API.
+///
+/// Unlike with_semaphore(), the scope of unit holding is not limited to the scope of a single async lambda.
+///
+/// \param sem The semaphore to take units from
+/// \param units Number of units to consume
+template<typename ExceptionFactory, typename Clock = typename timer<>::clock>
+semaphore_units<ExceptionFactory, Clock>
+consume_units(basic_semaphore<ExceptionFactory, Clock>& sem, size_t units) noexcept {
+ sem.consume(units);
+ return semaphore_units<ExceptionFactory, Clock>{ sem, units };
+}
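+
+// An illustrative sketch (not part of the original header): `mem_tracker` is a
+// hypothetical semaphore used for memory accounting. consume() never blocks,
+// so the semaphore's available units may go negative; the accounting is undone
+// when the returned units object is destroyed.
+//
+//     seastar::semaphore mem_tracker(1 << 20);
+//
+//     seastar::semaphore_units<> track(size_t bytes) {
+//         return seastar::consume_units(mem_tracker, bytes);
+//     }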
+
+/// \brief Runs a function protected by a semaphore
+///
+/// Acquires a \ref semaphore, runs a function, and releases
+/// the semaphore, returning the return value of the function,
+/// as a \ref future.
+///
+/// \param sem The semaphore to be held while the \c func is
+/// running.
+/// \param units Number of units to acquire from \c sem (as
+/// with semaphore::wait())
+/// \param func The function to run; signature \c void() or
+/// \c future<>().
+/// \return a \ref future<> holding the function's return value
+/// or exception thrown; or a \ref future<> containing
+/// an exception from one of the semaphore::broken()
+/// variants.
+///
+/// \note The caller must guarantee that \c sem is valid until
+/// the future returned by with_semaphore() resolves.
+///
+/// \related semaphore
+template <typename ExceptionFactory, typename Func, typename Clock = typename timer<>::clock>
+inline
+futurize_t<std::result_of_t<Func()>>
+with_semaphore(basic_semaphore<ExceptionFactory, Clock>& sem, size_t units, Func&& func) noexcept {
+ return get_units(sem, units).then([func = std::forward<Func>(func)] (auto units) mutable {
+ return futurize_invoke(std::forward<Func>(func)).finally([units = std::move(units)] {});
+ });
+}
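+
+// An illustrative sketch (not part of the original header): `conn_limit`,
+// `connection` and `handle_connection()` are hypothetical names. At most 100
+// connections are handled concurrently; the unit is returned when the future
+// from handle_connection() resolves, even if it resolves with an exception.
+//
+//     seastar::semaphore conn_limit(100);
+//
+//     seastar::future<> accept_one(connection c) {
+//         return seastar::with_semaphore(conn_limit, 1, [c = std::move(c)] () mutable {
+//             return handle_connection(std::move(c));
+//         });
+//     }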
+
+/// \brief Runs a function protected by a semaphore with time bound on wait
+///
+/// If possible, acquires a \ref semaphore, runs a function, and releases
+/// the semaphore, returning the return value of the function,
+/// as a \ref future.
+///
+/// If the semaphore can't be acquired within the specified timeout, returns
+/// a semaphore_timed_out exception
+///
+/// \param sem The semaphore to be held while the \c func is
+/// running.
+/// \param units Number of units to acquire from \c sem (as
+/// with semaphore::wait())
+/// \param timeout a duration specifying when to timeout the current request
+/// \param func The function to run; signature \c void() or
+/// \c future<>().
+/// \return a \ref future<> holding the function's return value
+/// or exception thrown; or a \ref future<> containing
+/// an exception from one of the semaphore::broken()
+/// variants.
+///
+/// \note The caller must guarantee that \c sem is valid until
+/// the future returned by with_semaphore() resolves.
+///
+/// \related semaphore
+template <typename ExceptionFactory, typename Clock, typename Func>
+inline
+futurize_t<std::result_of_t<Func()>>
+with_semaphore(basic_semaphore<ExceptionFactory, Clock>& sem, size_t units, typename basic_semaphore<ExceptionFactory, Clock>::duration timeout, Func&& func) noexcept {
+ return get_units(sem, units, timeout).then([func = std::forward<Func>(func)] (auto units) mutable {
+ return futurize_invoke(std::forward<Func>(func)).finally([units = std::move(units)] {});
+ });
+}
+
+/// Default basic_semaphore specialization that throws semaphore-specific exceptions
+/// on error conditions.
+using semaphore = basic_semaphore<semaphore_default_exception_factory>;
+using named_semaphore = basic_semaphore<named_semaphore_exception_factory>;
+
+/// @}
+
+}
diff --git a/src/seastar/include/seastar/core/sharded.hh b/src/seastar/include/seastar/core/sharded.hh
new file mode 100644
index 000000000..c002476e1
--- /dev/null
+++ b/src/seastar/include/seastar/core/sharded.hh
@@ -0,0 +1,909 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/smp.hh>
+#include <seastar/core/loop.hh>
+#include <seastar/core/map_reduce.hh>
+#include <seastar/util/is_smart_ptr.hh>
+#include <seastar/util/tuple_utils.hh>
+#include <seastar/core/do_with.hh>
+#include <seastar/util/concepts.hh>
+#include <boost/iterator/counting_iterator.hpp>
+#include <functional>
+#if __has_include(<concepts>)
+#include <concepts>
+#endif
+
+/// \defgroup smp-module Multicore
+///
+/// \brief Support for exploiting multiple cores on a server.
+///
+/// Seastar supports multicore servers by using *sharding*. Each logical
+/// core (lcore) runs a separate event loop, with its own memory allocator,
+/// TCP/IP stack, and other services. Shards communicate by explicit message
+/// passing, rather than using locks and condition variables as with traditional
+/// threaded programming.
+
+namespace seastar {
+
+template <typename Func, typename... Param>
+class sharded_parameter;
+
+namespace internal {
+
+template <typename Func, typename... Param>
+auto unwrap_sharded_arg(sharded_parameter<Func, Param...> sp);
+
+using on_each_shard_func = std::function<future<> (unsigned shard)>;
+
+future<> sharded_parallel_for_each(unsigned nr_shards, on_each_shard_func on_each_shard) noexcept(std::is_nothrow_move_constructible_v<on_each_shard_func>);
+
+}
+
+/// \addtogroup smp-module
+/// @{
+
+template <typename T>
+class sharded;
+
+/// If a sharded service inherits from this class, sharded::stop() will wait
+/// until all references to the service on each shard have disappeared before
+/// returning. It is still the service's own responsibility to track its references
+/// in asynchronous code by calling shared_from_this() and keeping the returned smart
+/// pointer alive for as long as the object is in use.
+template<typename T>
+class async_sharded_service : public enable_shared_from_this<T> {
+protected:
+ std::function<void()> _delete_cb;
+ async_sharded_service() noexcept = default;
+ virtual ~async_sharded_service() {
+ if (_delete_cb) {
+ _delete_cb();
+ }
+ }
+ template <typename Service> friend class sharded;
+};
+
+
+/// \brief Provide a sharded service with access to its peers
+///
+/// If a service class inherits from this, it will gain a \code container()
+/// \endcode method that provides access to the \ref sharded object, with which
+/// it can call its peers.
+template <typename Service>
+class peering_sharded_service {
+ sharded<Service>* _container = nullptr;
+private:
+ template <typename T> friend class sharded;
+ void set_container(sharded<Service>* container) noexcept { _container = container; }
+public:
+ peering_sharded_service() noexcept = default;
+ peering_sharded_service(peering_sharded_service<Service>&&) noexcept = default;
+ peering_sharded_service(const peering_sharded_service<Service>&) = delete;
+ peering_sharded_service& operator=(const peering_sharded_service<Service>&) = delete;
+ sharded<Service>& container() noexcept { return *_container; }
+ const sharded<Service>& container() const noexcept { return *_container; }
+};
+
+
+/// Exception thrown when a \ref sharded object does not exist
+class no_sharded_instance_exception : public std::exception {
+public:
+ virtual const char* what() const noexcept override {
+ return "sharded instance does not exist";
+ }
+};
+
+/// Template helper to distribute a service across all logical cores.
+///
+/// The \c sharded template manages a sharded service, by creating
+/// a copy of the service on each logical core, providing mechanisms to communicate
+/// with each shard's copy, and a way to stop the service.
+///
+/// \tparam Service a class to be instantiated on each core. Must expose
+/// a \c stop() method that returns a \c future<>, to be called when
+/// the service is stopped.
+template <typename Service>
+class sharded {
+ struct entry {
+ shared_ptr<Service> service;
+ promise<> freed;
+ };
+ std::vector<entry> _instances;
+private:
+ using invoke_on_all_func_type = std::function<future<> (Service&)>;
+private:
+ void service_deleted() noexcept {
+ _instances[this_shard_id()].freed.set_value();
+ }
+ template <typename U, bool async>
+ friend struct shared_ptr_make_helper;
+
+ template <typename T>
+ std::enable_if_t<std::is_base_of<peering_sharded_service<T>, T>::value>
+ set_container(T& service) noexcept {
+ service.set_container(this);
+ }
+
+ template <typename T>
+ std::enable_if_t<!std::is_base_of<peering_sharded_service<T>, T>::value>
+ set_container(T& service) noexcept {
+ }
+
+ future<>
+ sharded_parallel_for_each(internal::on_each_shard_func func) noexcept(std::is_nothrow_move_constructible_v<internal::on_each_shard_func>) {
+ return internal::sharded_parallel_for_each(_instances.size(), std::move(func));
+ }
+public:
+ /// Constructs an empty \c sharded object. No instances of the service are
+ /// created.
+ sharded() noexcept {}
+ sharded(const sharded& other) = delete;
+ sharded& operator=(const sharded& other) = delete;
+    /// A sharded object whose \c Service type inherits from peering_sharded_service
+    /// cannot be moved safely, so move operations are disabled.
+ sharded(sharded&& other) = delete;
+ sharded& operator=(sharded&& other) = delete;
+    /// Destroys a \c sharded object. The object must not be in a started state.
+ ~sharded();
+
+ /// Starts \c Service by constructing an instance on every logical core
+ /// with a copy of \c args passed to the constructor.
+ ///
+ /// \param args Arguments to be forwarded to \c Service constructor
+ /// \return a \ref seastar::future<> that becomes ready when all instances have been
+ /// constructed.
+ template <typename... Args>
+ future<> start(Args&&... args) noexcept;
+
+ /// Starts \c Service by constructing an instance on a single logical core
+ /// with a copy of \c args passed to the constructor.
+ ///
+ /// \param args Arguments to be forwarded to \c Service constructor
+ /// \return a \ref seastar::future<> that becomes ready when the instance has been
+ /// constructed.
+ template <typename... Args>
+ future<> start_single(Args&&... args) noexcept;
+
+ /// Stops all started instances and destroys them.
+ ///
+ /// For every started instance, its \c stop() method is called, and then
+ /// it is destroyed.
+ future<> stop() noexcept;
+
+ /// Invoke a type-erased function on all instances of `Service`.
+ /// The return value becomes ready when all instances have processed
+ /// the message.
+ ///
+ /// \param options the options to forward to the \ref smp::submit_to()
+ /// called behind the scenes.
+ /// \param func Function to be invoked on all shards
+ /// \return Future that becomes ready once all calls have completed
+ future<> invoke_on_all(smp_submit_to_options options, std::function<future<> (Service&)> func) noexcept;
+
+ /// Invoke a type-erased function on all instances of `Service`.
+ /// The return value becomes ready when all instances have processed
+ /// the message.
+ /// Passes the default \ref smp_submit_to_options to the
+ /// \ref smp::submit_to() called behind the scenes.
+ future<> invoke_on_all(std::function<future<> (Service&)> func) noexcept {
+ try {
+ return invoke_on_all(smp_submit_to_options{}, std::move(func));
+ } catch (...) {
+ return current_exception_as_future();
+ }
+ }
+
+ /// Invoke a function on all instances of `Service`.
+ /// The return value becomes ready when all instances have processed
+ /// the message. The function can be a member pointer to function,
+ /// a free function, or a functor. The first argument of the function
+ /// will be a reference to the local service on the shard.
+ ///
+ /// For a non-static pointer-to-member-function, the first argument
+ /// becomes `this`, not the first declared parameter.
+ ///
+ /// \param options the options to forward to the \ref smp::submit_to()
+ /// called behind the scenes.
+ /// \param func invocable accepting a `Service&` as the first parameter
+ /// to be invoked on all shards
+ /// \return Future that becomes ready once all calls have completed
+ template <typename Func, typename... Args>
+ SEASTAR_CONCEPT(requires std::invocable<Func, Service&, Args...>)
+ future<> invoke_on_all(smp_submit_to_options options, Func func, Args... args) noexcept;
+
+ /// Invoke a function on all instances of `Service`.
+ /// The return value becomes ready when all instances have processed
+ /// the message.
+ /// Passes the default \ref smp_submit_to_options to the
+ /// \ref smp::submit_to() called behind the scenes.
+ template <typename Func, typename... Args>
+ SEASTAR_CONCEPT(requires std::invocable<Func, Service&, Args...>)
+ future<> invoke_on_all(Func func, Args... args) noexcept {
+ try {
+ return invoke_on_all(smp_submit_to_options{}, std::move(func), std::move(args)...);
+ } catch (...) {
+ return current_exception_as_future();
+ }
+ }
+
+ /// Invoke a callable on all instances of \c Service except the instance
+ /// which is allocated on current shard.
+ ///
+ /// \param options the options to forward to the \ref smp::submit_to()
+ /// called behind the scenes.
+ /// \param func a callable with the signature `void (Service&)`
+ /// or `future<> (Service&)`, to be called on each core
+ /// with the local instance as an argument.
+ /// \return a `future<>` that becomes ready when all cores but the current one have
+ /// processed the message.
+ template <typename Func, typename... Args>
+ SEASTAR_CONCEPT(requires std::invocable<Func, Service&, Args...>)
+ future<> invoke_on_others(smp_submit_to_options options, Func func, Args... args) noexcept;
+
+ /// Invoke a callable on all instances of \c Service except the instance
+ /// which is allocated on current shard.
+ ///
+ /// \param func a callable with the signature `void (Service&)`
+ /// or `future<> (Service&)`, to be called on each core
+ /// with the local instance as an argument.
+ /// \return a `future<>` that becomes ready when all cores but the current one have
+ /// processed the message.
+ ///
+ /// Passes the default \ref smp_submit_to_options to the
+ /// \ref smp::submit_to() called behind the scenes.
+ template <typename Func, typename... Args>
+ SEASTAR_CONCEPT(requires std::invocable<Func, Service&, Args...>)
+ future<> invoke_on_others(Func func, Args... args) noexcept {
+ try {
+ return invoke_on_others(smp_submit_to_options{}, std::move(func), std::move(args)...);
+ } catch (...) {
+ return current_exception_as_future();
+ }
+ }
+
+ /// Invoke a method on all instances of `Service` and reduce the results using
+ /// `Reducer`.
+ ///
+ /// \see map_reduce(Iterator begin, Iterator end, Mapper&& mapper, Reducer&& r)
+ template <typename Reducer, typename Ret, typename... FuncArgs, typename... Args>
+ inline
+ auto
+ map_reduce(Reducer&& r, Ret (Service::*func)(FuncArgs...), Args&&... args)
+ -> typename reducer_traits<Reducer>::future_type
+ {
+ return ::seastar::map_reduce(boost::make_counting_iterator<unsigned>(0),
+ boost::make_counting_iterator<unsigned>(_instances.size()),
+ [this, func, args = std::make_tuple(std::forward<Args>(args)...)] (unsigned c) mutable {
+ return smp::submit_to(c, [this, func, args] () mutable {
+ return std::apply([this, func] (Args&&... args) mutable {
+ auto inst = _instances[this_shard_id()].service;
+ if (inst) {
+ return ((*inst).*func)(std::forward<Args>(args)...);
+ } else {
+ throw no_sharded_instance_exception();
+ }
+ }, std::move(args));
+ });
+ }, std::forward<Reducer>(r));
+ }
+
+ /// Invoke a callable on all instances of `Service` and reduce the results using
+ /// `Reducer`.
+ ///
+ /// \see map_reduce(Iterator begin, Iterator end, Mapper&& mapper, Reducer&& r)
+ template <typename Reducer, typename Func>
+ inline
+ auto map_reduce(Reducer&& r, Func&& func) -> typename reducer_traits<Reducer>::future_type
+ {
+ return ::seastar::map_reduce(boost::make_counting_iterator<unsigned>(0),
+ boost::make_counting_iterator<unsigned>(_instances.size()),
+ [this, &func] (unsigned c) mutable {
+ return smp::submit_to(c, [this, func] () mutable {
+ auto inst = get_local_service();
+ return func(*inst);
+ });
+ }, std::forward<Reducer>(r));
+ }
+
+ /// Applies a map function to all shards, then reduces the output by calling a reducer function.
+ ///
+ /// \param map callable with the signature `Value (Service&)` or
+ /// `future<Value> (Service&)` (for some `Value` type).
+ /// used as the second input to \c reduce
+ /// \param initial initial value used as the first input to \c reduce.
+ /// \param reduce binary function used to left-fold the return values of \c map
+ /// into \c initial .
+ ///
+ /// Each \c map invocation runs on the shard associated with the service.
+ ///
+ /// \tparam Mapper unary function taking `Service&` and producing some result.
+ /// \tparam Initial any value type
+ /// \tparam Reduce a binary function taking two Initial values and returning an Initial
+ /// \return Result of invoking `map` with each instance in parallel, reduced by calling
+ /// `reduce()` on each adjacent pair of results.
+ template <typename Mapper, typename Initial, typename Reduce>
+ inline
+ future<Initial>
+ map_reduce0(Mapper map, Initial initial, Reduce reduce) {
+ auto wrapped_map = [this, map] (unsigned c) {
+ return smp::submit_to(c, [this, map] {
+ auto inst = get_local_service();
+ return map(*inst);
+ });
+ };
+ return ::seastar::map_reduce(smp::all_cpus().begin(), smp::all_cpus().end(),
+ std::move(wrapped_map),
+ std::move(initial),
+ std::move(reduce));
+ }
+
+ /// Applies a map function to all shards, and return a vector of the result.
+ ///
+ /// \param mapper callable with the signature `Value (Service&)` or
+ /// `future<Value> (Service&)` (for some `Value` type).
+ ///
+ /// Each \c map invocation runs on the shard associated with the service.
+ ///
+ /// \tparam Mapper unary function taking `Service&` and producing some result.
+ /// \return Result vector of invoking `map` with each instance in parallel
+ template <typename Mapper, typename Future = futurize_t<std::result_of_t<Mapper(Service&)>>, typename return_type = decltype(internal::untuple(std::declval<typename Future::tuple_type>()))>
+ inline future<std::vector<return_type>> map(Mapper mapper) {
+ return do_with(std::vector<return_type>(),
+ [&mapper, this] (std::vector<return_type>& vec) mutable {
+ vec.resize(smp::count);
+ return parallel_for_each(boost::irange<unsigned>(0, _instances.size()), [this, &vec, mapper] (unsigned c) {
+ return smp::submit_to(c, [this, mapper] {
+ auto inst = get_local_service();
+ return mapper(*inst);
+ }).then([&vec, c] (auto res) {
+ vec[c] = res;
+ });
+ }).then([&vec] {
+ return make_ready_future<std::vector<return_type>>(std::move(vec));
+ });
+ });
+ }
+
+ /// Invoke a callable on a specific instance of `Service`.
+ ///
+ /// \param id shard id to call
+ /// \param options the options to forward to the \ref smp::submit_to()
+ /// called behind the scenes.
+ /// \param func a callable with signature `Value (Service&, Args...)` or
+ /// `future<Value> (Service&, Args...)` (for some `Value` type), or a pointer
+ /// to a member function of Service
+ /// \param args parameters to the callable; will be copied or moved. To pass by reference,
+ /// use std::ref().
+ ///
+ /// \return result of calling `func(instance)` on the designated instance
+ template <typename Func, typename... Args, typename Ret = futurize_t<std::invoke_result_t<Func, Service&, Args...>>>
+ SEASTAR_CONCEPT(requires std::invocable<Func, Service&, Args&&...>)
+ Ret
+ invoke_on(unsigned id, smp_submit_to_options options, Func&& func, Args&&... args) {
+ return smp::submit_to(id, options, [this, func = std::forward<Func>(func), args = std::tuple(std::move(args)...)] () mutable {
+ auto inst = get_local_service();
+ return std::apply(std::forward<Func>(func), std::tuple_cat(std::forward_as_tuple(*inst), std::move(args)));
+ });
+ }
+
+ /// Invoke a callable on a specific instance of `Service`.
+ ///
+ /// \param id shard id to call
+ /// \param func a callable with signature `Value (Service&)` or
+ /// `future<Value> (Service&)` (for some `Value` type), or a pointer
+ /// to a member function of Service
+ /// \param args parameters to the callable
+ /// \return result of calling `func(instance)` on the designated instance
+ template <typename Func, typename... Args, typename Ret = futurize_t<std::invoke_result_t<Func, Service&, Args&&...>>>
+ SEASTAR_CONCEPT(requires std::invocable<Func, Service&, Args&&...>)
+ Ret
+ invoke_on(unsigned id, Func&& func, Args&&... args) {
+ return invoke_on(id, smp_submit_to_options(), std::forward<Func>(func), std::forward<Args>(args)...);
+ }
+
+ /// Gets a reference to the local instance.
+ const Service& local() const noexcept;
+
+ /// Gets a reference to the local instance.
+ Service& local() noexcept;
+
+ /// Gets a shared pointer to the local instance.
+ shared_ptr<Service> local_shared() noexcept;
+
+ /// Checks whether the local instance has been initialized.
+ bool local_is_initialized() const noexcept;
+
+private:
+ void track_deletion(shared_ptr<Service>& s, std::false_type) noexcept {
+ // do not wait for instance to be deleted since it is not going to notify us
+ service_deleted();
+ }
+
+ void track_deletion(shared_ptr<Service>& s, std::true_type) {
+ s->_delete_cb = std::bind(std::mem_fn(&sharded<Service>::service_deleted), this);
+ }
+
+ template <typename... Args>
+ shared_ptr<Service> create_local_service(Args&&... args) {
+ auto s = ::seastar::make_shared<Service>(std::forward<Args>(args)...);
+ set_container(*s);
+ track_deletion(s, std::is_base_of<async_sharded_service<Service>, Service>());
+ return s;
+ }
+
+ shared_ptr<Service> get_local_service() {
+ auto inst = _instances[this_shard_id()].service;
+ if (!inst) {
+ throw no_sharded_instance_exception();
+ }
+ return inst;
+ }
+};
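+
+// An illustrative sketch (not part of the original header): `my_service` is a
+// hypothetical user class exposing stop() and hit(). It shows the typical
+// lifecycle of a sharded service: start(), cross-shard invocation, stop().
+//
+//     seastar::sharded<my_service> svc;
+//
+//     seastar::future<> run() {
+//         return svc.start().then([] {
+//             // Runs once on every shard, against that shard's instance.
+//             return svc.invoke_on_all([] (my_service& s) {
+//                 s.hit();
+//                 return seastar::make_ready_future<>();
+//             });
+//         }).finally([] {
+//             // A started sharded<> must be stopped before it is destroyed.
+//             return svc.stop();
+//         });
+//     }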
+
+namespace internal {
+
+template <typename T>
+struct sharded_unwrap {
+ using type = T;
+};
+
+template <typename T>
+struct sharded_unwrap<std::reference_wrapper<sharded<T>>> {
+ using type = T&;
+};
+
+template <typename T>
+using sharded_unwrap_t = typename sharded_unwrap<T>::type;
+
+} // internal
+
+
+/// \brief Helper to pass a parameter to a `sharded<>` object that depends
+/// on the shard. It is evaluated on the shard, just before being
+/// passed to the local instance. It is useful when passing
+/// parameters to sharded::start().
+template <typename Func, typename... Params>
+class sharded_parameter {
+ Func _func;
+ std::tuple<Params...> _params;
+public:
+ /// Creates a sharded parameter, which evaluates differently based on
+ /// the shard it is executed on.
+ ///
+ /// \param func Function to be executed
+ /// \param params optional parameters to be passed to the function. Can
+ /// be std::ref(sharded<whatever>), in which case the local
+ /// instance will be passed. Anything else
+ /// will be passed by value unchanged.
+ explicit sharded_parameter(Func func, Params... params)
+ SEASTAR_CONCEPT(requires std::invocable<Func, internal::sharded_unwrap_t<Params>...>)
+ : _func(std::move(func)), _params(std::make_tuple(std::move(params)...)) {
+ }
+private:
+ auto evaluate() const;
+
+ template <typename Func_, typename... Param_>
+ friend auto internal::unwrap_sharded_arg(sharded_parameter<Func_, Param_...> sp);
+};
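+
+// An illustrative sketch (not part of the original header): `my_service` is a
+// hypothetical class whose constructor takes an unsigned value. The lambda is
+// evaluated on each shard, so every instance receives a shard-specific value.
+//
+//     seastar::sharded<my_service> svc;
+//
+//     seastar::future<> start_svc() {
+//         return svc.start(seastar::sharded_parameter([] {
+//             return seastar::this_shard_id() * 2;
+//         }));
+//     }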
+
+/// \example sharded_parameter_demo.cc
+///
+/// Example use of \ref sharded_parameter.
+
+/// @}
+
+template <typename Service>
+sharded<Service>::~sharded() {
+ assert(_instances.empty());
+}
+
+namespace internal {
+
+template <typename Service>
+class either_sharded_or_local {
+ sharded<Service>& _sharded;
+public:
+ either_sharded_or_local(sharded<Service>& s) : _sharded(s) {}
+ operator sharded<Service>& () { return _sharded; }
+ operator Service& () { return _sharded.local(); }
+};
+
+template <typename T>
+inline
+T&&
+unwrap_sharded_arg(T&& arg) {
+ return std::forward<T>(arg);
+}
+
+template <typename Service>
+either_sharded_or_local<Service>
+unwrap_sharded_arg(std::reference_wrapper<sharded<Service>> arg) {
+ return either_sharded_or_local<Service>(arg);
+}
+
+template <typename Func, typename... Param>
+auto
+unwrap_sharded_arg(sharded_parameter<Func, Param...> sp) {
+ return sp.evaluate();
+}
+
+}
+
+template <typename Func, typename... Param>
+auto
+sharded_parameter<Func, Param...>::evaluate() const {
+ auto unwrap_params_and_invoke = [this] (const auto&... params) {
+ return std::invoke(_func, internal::unwrap_sharded_arg(params)...);
+ };
+ return std::apply(unwrap_params_and_invoke, _params);
+}
+
+template <typename Service>
+template <typename... Args>
+future<>
+sharded<Service>::start(Args&&... args) noexcept {
+ try {
+ _instances.resize(smp::count);
+ return sharded_parallel_for_each(
+ [this, args = std::make_tuple(std::forward<Args>(args)...)] (unsigned c) mutable {
+ return smp::submit_to(c, [this, args] () mutable {
+ _instances[this_shard_id()].service = std::apply([this] (Args... args) {
+ return create_local_service(internal::unwrap_sharded_arg(std::forward<Args>(args))...);
+ }, args);
+ });
+ }).then_wrapped([this] (future<> f) {
+ try {
+ f.get();
+ return make_ready_future<>();
+ } catch (...) {
+ return this->stop().then([e = std::current_exception()] () mutable {
+ std::rethrow_exception(e);
+ });
+ }
+ });
+ } catch (...) {
+ return current_exception_as_future();
+ }
+}
+
+template <typename Service>
+template <typename... Args>
+future<>
+sharded<Service>::start_single(Args&&... args) noexcept {
+ try {
+ assert(_instances.empty());
+ _instances.resize(1);
+ return smp::submit_to(0, [this, args = std::make_tuple(std::forward<Args>(args)...)] () mutable {
+ _instances[0].service = std::apply([this] (Args... args) {
+ return create_local_service(internal::unwrap_sharded_arg(std::forward<Args>(args))...);
+ }, args);
+ }).then_wrapped([this] (future<> f) {
+ try {
+ f.get();
+ return make_ready_future<>();
+ } catch (...) {
+ return this->stop().then([e = std::current_exception()] () mutable {
+ std::rethrow_exception(e);
+ });
+ }
+ });
+ } catch (...) {
+ return current_exception_as_future();
+ }
+}
+
+namespace internal {
+
+// Helper to check if Service::stop exists
+
+struct sharded_has_stop {
+    // If a member named "stop" exists, try to call it, even if it doesn't
+ // have the correct signature. This is so that we don't ignore a function
+ // named stop() just because the signature is incorrect, and instead
+ // force the user to resolve the ambiguity.
+ template <typename Service>
+ constexpr static auto check(int) -> std::enable_if_t<(sizeof(&Service::stop) >= 0), bool> {
+ return true;
+ }
+
+ // Fallback in case Service::stop doesn't exist.
+ template<typename>
+ static constexpr auto check(...) -> bool {
+ return false;
+ }
+};
+
+template <bool stop_exists>
+struct sharded_call_stop {
+ template <typename Service>
+ static future<> call(Service& instance);
+};
+
+template <>
+template <typename Service>
+inline
+future<> sharded_call_stop<true>::call(Service& instance) {
+ return instance.stop();
+}
+
+template <>
+template <typename Service>
+inline
+future<> sharded_call_stop<false>::call(Service& instance) {
+ return make_ready_future<>();
+}
+
+template <typename Service>
+inline
+future<>
+stop_sharded_instance(Service& instance) {
+ constexpr bool has_stop = internal::sharded_has_stop::check<Service>(0);
+ return internal::sharded_call_stop<has_stop>::call(instance);
+}
+
+}
+
+template <typename Service>
+future<>
+sharded<Service>::stop() noexcept {
+ try {
+ return sharded_parallel_for_each([this] (unsigned c) mutable {
+ return smp::submit_to(c, [this] () mutable {
+ auto inst = _instances[this_shard_id()].service;
+ if (!inst) {
+ return make_ready_future<>();
+ }
+ return internal::stop_sharded_instance(*inst);
+ });
+ }).then_wrapped([this] (future<> fut) {
+ return sharded_parallel_for_each([this] (unsigned c) {
+ return smp::submit_to(c, [this] {
+ if (_instances[this_shard_id()].service == nullptr) {
+ return make_ready_future<>();
+ }
+ _instances[this_shard_id()].service = nullptr;
+ return _instances[this_shard_id()].freed.get_future();
+ });
+ }).finally([this, fut = std::move(fut)] () mutable {
+ _instances.clear();
+ _instances = std::vector<sharded<Service>::entry>();
+ return std::move(fut);
+ });
+ });
+ } catch (...) {
+ return current_exception_as_future();
+ }
+}
+
+template <typename Service>
+future<>
+sharded<Service>::invoke_on_all(smp_submit_to_options options, std::function<future<> (Service&)> func) noexcept {
+ try {
+ return sharded_parallel_for_each([this, options, func = std::move(func)] (unsigned c) {
+ return smp::submit_to(c, options, [this, func] {
+ return func(*get_local_service());
+ });
+ });
+ } catch (...) {
+ return current_exception_as_future();
+ }
+}
+
+template <typename Service>
+template <typename Func, typename... Args>
+SEASTAR_CONCEPT(requires std::invocable<Func, Service&, Args...>)
+inline
+future<>
+sharded<Service>::invoke_on_all(smp_submit_to_options options, Func func, Args... args) noexcept {
+ static_assert(std::is_same_v<futurize_t<std::invoke_result_t<Func, Service&, Args...>>, future<>>,
+ "invoke_on_all()'s func must return void or future<>");
+ try {
+ return invoke_on_all(options, invoke_on_all_func_type([func, args = std::tuple(std::move(args)...)] (Service& service) mutable {
+ return futurize_apply(func, std::tuple_cat(std::forward_as_tuple(service), args));
+ }));
+ } catch (...) {
+ return current_exception_as_future();
+ }
+}
+
+template <typename Service>
+template <typename Func, typename... Args>
+SEASTAR_CONCEPT(requires std::invocable<Func, Service&, Args...>)
+inline
+future<>
+sharded<Service>::invoke_on_others(smp_submit_to_options options, Func func, Args... args) noexcept {
+ static_assert(std::is_same_v<futurize_t<std::invoke_result_t<Func, Service&, Args...>>, future<>>,
+ "invoke_on_others()'s func must return void or future<>");
+ try {
+ return invoke_on_all(options, [orig = this_shard_id(), func = std::move(func), args = std::tuple(std::move(args)...)] (Service& s) -> future<> {
+            return this_shard_id() == orig ? make_ready_future<>() : futurize_apply(func, std::tuple_cat(std::forward_as_tuple(s), args));
+ });
+ } catch (...) {
+ return current_exception_as_future();
+ }
+}
+
+template <typename Service>
+const Service& sharded<Service>::local() const noexcept {
+ assert(local_is_initialized());
+ return *_instances[this_shard_id()].service;
+}
+
+template <typename Service>
+Service& sharded<Service>::local() noexcept {
+ assert(local_is_initialized());
+ return *_instances[this_shard_id()].service;
+}
+
+template <typename Service>
+shared_ptr<Service> sharded<Service>::local_shared() noexcept {
+ assert(local_is_initialized());
+ return _instances[this_shard_id()].service;
+}
+
+template <typename Service>
+inline bool sharded<Service>::local_is_initialized() const noexcept {
+ return _instances.size() > this_shard_id() &&
+ _instances[this_shard_id()].service;
+}
+
+/// \addtogroup smp-module
+/// @{
+
+/// Smart pointer wrapper which makes it safe to move across CPUs.
+///
+/// \c foreign_ptr<> is a smart pointer wrapper which, unlike
+/// \ref shared_ptr and \ref lw_shared_ptr, is safe to move to a
+/// different core.
+///
+/// As seastar avoids locking, any but the most trivial objects must
+/// be destroyed on the same core they were created on, so that,
+/// for example, their destructors can unlink references to the
+/// object from various containers. In addition, for performance
+/// reasons, the shared pointer types do not use atomic operations
+/// to manage their reference counts. As a result they cannot be
+/// used on multiple cores in parallel.
+///
+/// \c foreign_ptr<> provides a solution to that problem.
+/// \c foreign_ptr<> wraps any pointer type -- raw pointer,
+/// \ref seastar::shared_ptr<>, or similar, and remembers the core on which
+/// it was created. When the \c foreign_ptr<> object is destroyed, it
+/// sends a message to the original core so that the wrapped object
+/// can be safely destroyed.
+///
+/// \c foreign_ptr<> is a move-only object; it cannot be copied.
+///
+template <typename PtrType>
+class foreign_ptr {
+private:
+ PtrType _value;
+ unsigned _cpu;
+private:
+ void destroy(PtrType p, unsigned cpu) noexcept {
+ if (p && this_shard_id() != cpu) {
+ // `destroy()` is called from the destructor and other
+ // synchronous methods (like `reset()`), that have no way to
+ // wait for this future.
+ (void)smp::submit_to(cpu, [v = std::move(p)] () mutable {
+ // Destroy the contained pointer. We do this explicitly
+ // in the current shard, because the lambda is destroyed
+ // in the shard that submitted the task.
+ v = {};
+ });
+ }
+ }
+public:
+ using element_type = typename std::pointer_traits<PtrType>::element_type;
+ using pointer = element_type*;
+
+ /// Constructs a null \c foreign_ptr<>.
+ foreign_ptr() noexcept(std::is_nothrow_default_constructible_v<PtrType>)
+ : _value(PtrType())
+ , _cpu(this_shard_id()) {
+ }
+ /// Constructs a null \c foreign_ptr<>.
+ foreign_ptr(std::nullptr_t) noexcept(std::is_nothrow_default_constructible_v<foreign_ptr>) : foreign_ptr() {}
+ /// Wraps a pointer object and remembers the current core.
+ foreign_ptr(PtrType value) noexcept(std::is_nothrow_move_constructible_v<PtrType>)
+ : _value(std::move(value))
+ , _cpu(this_shard_id()) {
+ }
+    // The type is intentionally non-copyable: copies are expensive
+    // because each copy requires an across-CPU call.
+ foreign_ptr(const foreign_ptr&) = delete;
+ /// Moves a \c foreign_ptr<> to another object.
+ foreign_ptr(foreign_ptr&& other) noexcept(std::is_nothrow_move_constructible_v<PtrType>) = default;
+ /// Destroys the wrapped object on its original cpu.
+ ~foreign_ptr() {
+ destroy(std::move(_value), _cpu);
+ }
+ /// Creates a copy of this foreign ptr. Only works if the stored ptr is copyable.
+ future<foreign_ptr> copy() const noexcept {
+ return smp::submit_to(_cpu, [this] () mutable {
+ auto v = _value;
+ return make_foreign(std::move(v));
+ });
+ }
+ /// Accesses the wrapped object.
+ element_type& operator*() const noexcept(noexcept(*_value)) { return *_value; }
+ /// Accesses the wrapped object.
+ element_type* operator->() const noexcept(noexcept(&*_value)) { return &*_value; }
+ /// Access the raw pointer to the wrapped object.
+ pointer get() const noexcept(noexcept(&*_value)) { return &*_value; }
+ /// Return the owner-shard of this pointer.
+ ///
+ /// The owner shard of the pointer can change as a result of
+    /// move-assignment or a call to reset().
+ unsigned get_owner_shard() const noexcept { return _cpu; }
+ /// Checks whether the wrapped pointer is non-null.
+ operator bool() const noexcept(noexcept(static_cast<bool>(_value))) { return static_cast<bool>(_value); }
+ /// Move-assigns a \c foreign_ptr<>.
+ foreign_ptr& operator=(foreign_ptr&& other) noexcept(std::is_nothrow_move_constructible<PtrType>::value) {
+ destroy(std::move(_value), _cpu);
+ _value = std::move(other._value);
+ _cpu = other._cpu;
+ return *this;
+ }
+ /// Releases the owned pointer
+ ///
+ /// Warning: the caller is now responsible for destroying the
+ /// pointer on its owner shard. This method is best called on the
+ /// owner shard to avoid accidents.
+ PtrType release() noexcept(std::is_nothrow_default_constructible_v<PtrType>) {
+ return std::exchange(_value, {});
+ }
+ /// Replace the managed pointer with new_ptr.
+ ///
+ /// The previous managed pointer is destroyed on its owner shard.
+ void reset(PtrType new_ptr) noexcept(std::is_nothrow_move_constructible_v<PtrType>) {
+ auto old_ptr = std::move(_value);
+ auto old_cpu = _cpu;
+
+ _value = std::move(new_ptr);
+ _cpu = this_shard_id();
+
+ destroy(std::move(old_ptr), old_cpu);
+ }
+ /// Replace the managed pointer with a null value.
+ ///
+ /// The previous managed pointer is destroyed on its owner shard.
+ void reset(std::nullptr_t = nullptr) noexcept(std::is_nothrow_default_constructible_v<PtrType>) {
+ reset(PtrType());
+ }
+};
+
+/// Wraps a raw or smart pointer object in a \ref foreign_ptr<>.
+///
+/// \relates foreign_ptr
+template <typename T>
+foreign_ptr<T> make_foreign(T ptr) {
+ return foreign_ptr<T>(std::move(ptr));
+}
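+
+// An illustrative sketch (not part of the original header): `widget` is a
+// hypothetical type. The object is created on shard 0, wrapped in a
+// foreign_ptr, and may then be used from any shard; when the foreign_ptr is
+// destroyed, the widget is deleted back on shard 0.
+//
+//     seastar::future<seastar::foreign_ptr<seastar::lw_shared_ptr<widget>>>
+//     make_widget_on_shard0() {
+//         return seastar::smp::submit_to(0, [] {
+//             return seastar::make_foreign(seastar::make_lw_shared<widget>());
+//         });
+//     }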
+
+/// @}
+
+template<typename T>
+struct is_smart_ptr<foreign_ptr<T>> : std::true_type {};
+
+}
diff --git a/src/seastar/include/seastar/core/shared_future.hh b/src/seastar/include/seastar/core/shared_future.hh
new file mode 100644
index 000000000..6d8a8045e
--- /dev/null
+++ b/src/seastar/include/seastar/core/shared_future.hh
@@ -0,0 +1,299 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (C) 2015 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/expiring_fifo.hh>
+
+namespace seastar {
+
+/// \addtogroup future-module
+/// @{
+
+/// Changes the clock used by shared_future<> and shared_promise<> when passed as the first template parameter.
+template<typename Clock>
+struct with_clock {};
+
+/// \cond internal
+
+template <typename... T>
+struct future_option_traits;
+
+template <typename Clock, typename... T>
+struct future_option_traits<with_clock<Clock>, T...> {
+ using clock_type = Clock;
+
+ template<template <typename...> class Class>
+ struct parametrize {
+ using type = Class<T...>;
+ };
+};
+
+template <typename... T>
+struct future_option_traits {
+ using clock_type = lowres_clock;
+
+ template<template <typename...> class Class>
+ struct parametrize {
+ using type = Class<T...>;
+ };
+};
+
+/// \endcond
+
+/// \brief Like \ref future except the result can be waited for by many fibers.
+///
+/// Represents a value which may not yet be ready. A fiber can wait for the value using
+/// the \ref future obtained by calling \ref get_future() or casting to \ref future type.
+/// Multiple fibers are allowed to obtain a \ref future for the result using the same
+/// instance of \ref shared_future.
+///
+/// All futures obtained from shared_future should end up in the same state. However,
+/// if the value's copy constructor throws, some of the futures may end up in a failed state
+/// with an exception thrown from the copy constructor and end up with a state
+/// different than other futures.
+///
+/// A shared_future instance does not have to outlive the futures obtained from it.
+/// In that sense the returned futures are independent.
+///
+/// shared_future can be copied at any time and all copies will resolve with the same value.
+///
+/// shared_future can be in a disengaged state when it's default-constructed or moved-from.
+/// When it's in such a state we say it's invalid and obtaining futures must not be attempted.
+///
+/// The types in the parameter pack T must all be copy-constructible.
+///
+/// When the first type in the parameter pack is \ref with_clock then it has the effect
+/// of changing the clock used for timeouts by this instance. This type is omitted from
+/// the parameter of the future<> objects.
+///
+/// Example:
+///
+/// future<int> f;
+/// shared_future<with_clock<manual_clock>, int> sf(std::move(f));
+/// future<int> f2 = sf;
+///
+template<typename... T>
+class shared_future {
+ template <typename... U> friend class shared_promise;
+ using options = future_option_traits<T...>;
+public:
+ using clock = typename options::clock_type;
+ using time_point = typename clock::time_point;
+ using future_type = typename future_option_traits<T...>::template parametrize<future>::type;
+ using promise_type = typename future_option_traits<T...>::template parametrize<promise>::type;
+ using value_tuple_type = typename future_option_traits<T...>::template parametrize<std::tuple>::type;
+private:
+ using promise_expiry = typename future_option_traits<T...>::template parametrize<promise_expiry>::type;
+
+ /// \cond internal
+ class shared_state : public enable_lw_shared_from_this<shared_state> {
+ future_type _original_future;
+ expiring_fifo<promise_type, promise_expiry, clock> _peers;
+
+ public:
+ ~shared_state() {
+ // Don't warn if the shared future is exceptional. Any
+ // warnings will be reported by the futures returned by
+ // get_future.
+ if (_original_future.failed()) {
+ _original_future.ignore_ready_future();
+ }
+ }
+ explicit shared_state(future_type f) noexcept : _original_future(std::move(f)) { }
+ void resolve(future_type&& f) noexcept {
+ _original_future = std::move(f);
+ auto& state = _original_future._state;
+ if (_original_future.failed()) {
+ while (_peers) {
+ _peers.front().set_exception(state.get_exception());
+ _peers.pop_front();
+ }
+ } else {
+ while (_peers) {
+ auto& p = _peers.front();
+ try {
+ p.set_value(state.get_value());
+ } catch (...) {
+ p.set_exception(std::current_exception());
+ }
+ _peers.pop_front();
+ }
+ }
+ }
+
+ future_type get_future(time_point timeout = time_point::max()) noexcept {
+ // Note that some functions called below may throw,
+ // like pushing to _peers or copying _original_future's ready value.
+            // We'd rather terminate than propagate these errors, similar to
+            // .then()'s failure to allocate a continuation, as the caller cannot
+            // distinguish between an error returned by the original future and
+            // a failure to perform `get_future` itself.
+ memory::scoped_critical_alloc_section _;
+ if (!_original_future.available()) {
+ promise_type p;
+ auto f = p.get_future();
+ if (_original_future._state.valid()) {
+ // _original_future's result is forwarded to each peer.
+ (void)_original_future.then_wrapped([s = this->shared_from_this()] (future_type&& f) mutable {
+ s->resolve(std::move(f));
+ });
+ }
+ _peers.push_back(std::move(p), timeout);
+ return f;
+ } else if (_original_future.failed()) {
+ return future_type(exception_future_marker(), std::exception_ptr(_original_future._state.get_exception()));
+ } else {
+ return future_type(ready_future_marker(), _original_future._state.get_value());
+ }
+ }
+
+ bool available() const noexcept {
+ return _original_future.available();
+ }
+
+ bool failed() const noexcept {
+ return _original_future.failed();
+ }
+ };
+ /// \endcond
+ lw_shared_ptr<shared_state> _state;
+public:
+ /// \brief Forwards the result of future \c f into this shared_future.
+ shared_future(future_type f)
+ : _state(make_lw_shared<shared_state>(std::move(f))) { }
+
+ shared_future() = default; // noexcept, based on the respective lw_shared_ptr constructor
+ shared_future(const shared_future&) = default; // noexcept, based on the respective lw_shared_ptr constructor
+ shared_future& operator=(const shared_future&) = default; // noexcept, based on respective constructor
+ shared_future(shared_future&&) = default; // noexcept, based on the respective lw_shared_ptr constructor
+ shared_future& operator=(shared_future&&) = default; // noexcept, based on the respective constructor
+
+ /// \brief Creates a new \c future which will resolve with the result of this shared_future
+ ///
+ /// \param timeout When engaged, the returned future will resolve with \ref timed_out_error
+ /// if this shared_future doesn't resolve before timeout is reached.
+ ///
+ /// This object must be in a valid state.
+ future_type get_future(time_point timeout = time_point::max()) const noexcept {
+ return _state->get_future(timeout);
+ }
+
+ /// \brief Returns true if the future is available (ready or failed)
+ ///
+ /// \note This object must be in a valid state.
+ bool available() const noexcept {
+ return _state->available();
+ }
+
+ /// \brief Returns true if the future is failed
+ ///
+ /// \note This object must be in a valid state.
+ bool failed() const noexcept {
+ return _state->failed();
+ }
+
+ /// \brief Equivalent to \ref get_future()
+ operator future_type() const noexcept {
+ return get_future();
+ }
+
+ /// \brief Returns true if the instance is in valid state
+ bool valid() const noexcept {
+ return bool(_state);
+ }
+};
+
+/// \brief Like \ref promise except that its counterpart is \ref shared_future instead of \ref future
+///
+/// When the shared_promise is made ready, every waiter is also made ready.
+///
+/// Like the shared_future, the types in the parameter pack T must all be copy-constructible.
+template <typename... T>
+class shared_promise {
+public:
+ using shared_future_type = shared_future<T...>;
+ using future_type = typename shared_future_type::future_type;
+ using promise_type = typename shared_future_type::promise_type;
+ using clock = typename shared_future_type::clock;
+ using time_point = typename shared_future_type::time_point;
+ using value_tuple_type = typename shared_future_type::value_tuple_type;
+private:
+ promise_type _promise;
+ shared_future_type _shared_future;
+ static constexpr bool copy_noexcept = future_type::copy_noexcept;
+public:
+ shared_promise(const shared_promise&) = delete;
+ shared_promise(shared_promise&&) = default; // noexcept, based on the respective promise and shared_future constructors
+ shared_promise& operator=(shared_promise&&) = default; // noexcept, based on the respective promise and shared_future constructors
+ shared_promise() : _promise(), _shared_future(_promise.get_future()) {
+ }
+
+ /// \brief Gets new future associated with this promise.
+ /// If the promise is not resolved before timeout the returned future will resolve with \ref timed_out_error.
+ /// This instance doesn't have to be kept alive until the returned future resolves.
+ future_type get_shared_future(time_point timeout = time_point::max()) const noexcept {
+ return _shared_future.get_future(timeout);
+ }
+
+ /// \brief Sets the shared_promise's value (as tuple; by copying), same as normal promise
+ void set_value(const value_tuple_type& result) noexcept(copy_noexcept) {
+ _promise.set_value(result);
+ }
+
+ /// \brief Sets the shared_promise's value (as tuple; by moving), same as normal promise
+ void set_value(value_tuple_type&& result) noexcept {
+ _promise.set_value(std::move(result));
+ }
+
+ /// \brief Sets the shared_promise's value (variadic), same as normal promise
+ template <typename... A>
+ void set_value(A&&... a) noexcept {
+ _promise.set_value(std::forward<A>(a)...);
+ }
+
+ /// \brief Marks the shared_promise as failed, same as normal promise
+ void set_exception(std::exception_ptr ex) noexcept {
+ _promise.set_exception(std::move(ex));
+ }
+
+ /// \brief Marks the shared_promise as failed, same as normal promise
+ template<typename Exception>
+ void set_exception(Exception&& e) noexcept {
+ set_exception(make_exception_ptr(std::forward<Exception>(e)));
+ }
+
+ /// \brief Returns true if the underlying future is available (ready or failed)
+ bool available() const noexcept {
+ return _shared_future.available();
+ }
+
+ /// \brief Returns true if the underlying future is failed
+ bool failed() const noexcept {
+ return _shared_future.failed();
+ }
+};
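+
+// An illustrative sketch (not part of the original header): `compute()` is a
+// hypothetical function returning future<int>. Many fibers can wait on the
+// same shared_promise; each obtains an independent future for the same result.
+//
+//     seastar::shared_promise<int> sp;
+//
+//     seastar::future<int> wait_for_result() {
+//         return sp.get_shared_future();
+//     }
+//
+//     void produce_result() {
+//         // Resolving the promise makes every waiter's future ready.
+//         (void)compute().then([] (int v) { sp.set_value(v); });
+//     }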
+
+/// @}
+
+}
diff --git a/src/seastar/include/seastar/core/shared_mutex.hh b/src/seastar/include/seastar/core/shared_mutex.hh
new file mode 100644
index 000000000..553f47e74
--- /dev/null
+++ b/src/seastar/include/seastar/core/shared_mutex.hh
@@ -0,0 +1,183 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/circular_buffer.hh>
+
+namespace seastar {
+
+/// \addtogroup fiber-module
+/// @{
+
+/// \brief Shared/exclusive mutual exclusion.
+///
+/// Similar to \c std::shared_mutex, this class provides protection
+/// for a shared resource, with two levels of access protection: shared
+/// and exclusive. Shared access allows multiple tasks to access the
+/// shared resource concurrently, while exclusive access allows just
+/// one task to access the resource at a time.
+///
+/// Note that many seastar tasks do not require protection at all,
+/// since the seastar scheduler is not preemptive; however tasks that do
+/// (by waiting on a future) may require explicit locking.
+///
+/// The \ref with_shared(shared_mutex&, Func&&) and
+/// \ref with_lock(shared_mutex&, Func&&) provide exception-safe
+/// wrappers for use with \c shared_mutex.
+///
+/// \see semaphore simpler mutual exclusion
+class shared_mutex {
+ unsigned _readers = 0;
+ bool _writer = false;
+ struct waiter {
+ waiter(promise<>&& pr, bool for_write) : pr(std::move(pr)), for_write(for_write) {}
+ promise<> pr;
+ bool for_write;
+ };
+ circular_buffer<waiter> _waiters;
+public:
+ shared_mutex() = default;
+ shared_mutex(shared_mutex&&) = default;
+ shared_mutex& operator=(shared_mutex&&) = default;
+ shared_mutex(const shared_mutex&) = delete;
+ void operator=(const shared_mutex&) = delete;
+ /// Lock the \c shared_mutex for shared access
+ ///
+ /// \return a future that becomes ready when no exclusive access
+ /// is granted to anyone.
+ future<> lock_shared() {
+ if (try_lock_shared()) {
+ return make_ready_future<>();
+ }
+ _waiters.emplace_back(promise<>(), false);
+ return _waiters.back().pr.get_future();
+ }
+ /// Try to lock the \c shared_mutex for shared access
+ ///
+ /// \return true iff could acquire the lock for shared access.
+ bool try_lock_shared() noexcept {
+ if (!_writer && _waiters.empty()) {
+ ++_readers;
+ return true;
+ }
+ return false;
+ }
+ /// Unlocks a \c shared_mutex after a previous call to \ref lock_shared().
+ void unlock_shared() {
+ assert(_readers > 0);
+ --_readers;
+ wake();
+ }
+ /// Lock the \c shared_mutex for exclusive access
+ ///
+ /// \return a future that becomes ready when no access, shared or exclusive
+ /// is granted to anyone.
+ future<> lock() {
+ if (try_lock()) {
+ return make_ready_future<>();
+ }
+ _waiters.emplace_back(promise<>(), true);
+ return _waiters.back().pr.get_future();
+ }
+ /// Try to lock the \c shared_mutex for exclusive access
+ ///
+ /// \return true iff could acquire the lock for exclusive access.
+ bool try_lock() noexcept {
+ if (!_readers && !_writer) {
+ _writer = true;
+ return true;
+ }
+ return false;
+ }
+ /// Unlocks a \c shared_mutex after a previous call to \ref lock().
+ void unlock() {
+ assert(_writer);
+ _writer = false;
+ wake();
+ }
+private:
+ void wake() {
+ while (!_waiters.empty()) {
+ auto& w = _waiters.front();
+ // note: _writer == false in wake()
+ if (w.for_write) {
+ if (!_readers) {
+ _writer = true;
+ w.pr.set_value();
+ _waiters.pop_front();
+ }
+ break;
+ } else { // for read
+ ++_readers;
+ w.pr.set_value();
+ _waiters.pop_front();
+ }
+ }
+ }
+};
+
+/// Executes a function while holding shared access to a resource.
+///
+/// Executes a function while holding shared access to a resource. When
+/// the function returns, the mutex is automatically unlocked.
+///
+/// \param sm a \ref shared_mutex guarding access to the shared resource
+/// \param func callable object to invoke while the mutex is held for shared access
+/// \return whatever \c func returns, as a future
+///
+/// \relates shared_mutex
+template <typename Func>
+inline
+futurize_t<std::result_of_t<Func ()>>
+with_shared(shared_mutex& sm, Func&& func) {
+ return sm.lock_shared().then([&sm, func = std::forward<Func>(func)] () mutable {
+ return futurize_invoke(func).finally([&sm] {
+ sm.unlock_shared();
+ });
+ });
+}
+
+/// Executes a function while holding exclusive access to a resource.
+///
+/// Executes a function while holding exclusive access to a resource. When
+/// the function returns, the mutex is automatically unlocked.
+///
+/// \param sm a \ref shared_mutex guarding access to the shared resource
+/// \param func callable object to invoke while the mutex is held for exclusive access
+/// \return whatever \c func returns, as a future
+///
+/// \relates shared_mutex
+template <typename Func>
+inline
+futurize_t<std::result_of_t<Func ()>>
+with_lock(shared_mutex& sm, Func&& func) {
+ return sm.lock().then([&sm, func = std::forward<Func>(func)] () mutable {
+ return futurize_invoke(func).finally([&sm] {
+ sm.unlock();
+ });
+ });
+}
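+
+// Illustrative usage sketch (the names mtx, counter, read_counter and
+// increment_counter are hypothetical, not part of this header):
+//
+//   shared_mutex mtx;
+//   int counter = 0;
+//
+//   future<int> read_counter() {
+//       return with_shared(mtx, [&] { return counter; });   // readers may run concurrently
+//   }
+//
+//   future<> increment_counter() {
+//       return with_lock(mtx, [&] { counter += 1; });       // writers get exclusive access
+//   }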
+
+/// @}
+
+}
diff --git a/src/seastar/include/seastar/core/shared_ptr.hh b/src/seastar/include/seastar/core/shared_ptr.hh
new file mode 100644
index 000000000..f7a8843d4
--- /dev/null
+++ b/src/seastar/include/seastar/core/shared_ptr.hh
@@ -0,0 +1,868 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/shared_ptr_debug_helper.hh>
+#include <utility>
+#include <type_traits>
+#include <functional>
+#include <ostream>
+#include <seastar/util/is_smart_ptr.hh>
+#include <seastar/util/indirect.hh>
+
+#include <boost/intrusive/parent_from_member.hpp>
+
+namespace seastar {
+
+// This header defines two shared pointer facilities, lw_shared_ptr<> and
+// shared_ptr<>, both modeled after std::shared_ptr<>.
+//
+// Unlike std::shared_ptr<>, neither of these implementations are thread
+// safe, and two pointers sharing the same object must not be used in
+// different threads.
+//
+// lw_shared_ptr<> is the more lightweight variant, with a lw_shared_ptr<>
+// occupying just one machine word, and adding just one word to the shared
+// object. However, it does not support polymorphism.
+//
+// shared_ptr<> is more expensive, with a pointer occupying two machine
+// words, and with two words of overhead in the shared object. In return,
+// it does support polymorphism.
+//
+// Both variants support shared_from_this() via enable_shared_from_this<>
+// and enable_lw_shared_from_this<>, respectively.
+//
+
+#ifndef SEASTAR_DEBUG_SHARED_PTR
+using shared_ptr_counter_type = long;
+#else
+using shared_ptr_counter_type = debug_shared_ptr_counter_type;
+#endif
+
+template <typename T>
+class lw_shared_ptr;
+
+template <typename T>
+class shared_ptr;
+
+template <typename T>
+class enable_lw_shared_from_this;
+
+template <typename T>
+class enable_shared_from_this;
+
+template <typename T, typename... A>
+lw_shared_ptr<T> make_lw_shared(A&&... a);
+
+template <typename T>
+lw_shared_ptr<T> make_lw_shared(T&& a);
+
+template <typename T>
+lw_shared_ptr<T> make_lw_shared(T& a);
+
+template <typename T, typename... A>
+shared_ptr<T> make_shared(A&&... a);
+
+template <typename T>
+shared_ptr<T> make_shared(T&& a);
+
+template <typename T, typename U>
+shared_ptr<T> static_pointer_cast(const shared_ptr<U>& p);
+
+template <typename T, typename U>
+shared_ptr<T> dynamic_pointer_cast(const shared_ptr<U>& p);
+
+template <typename T, typename U>
+shared_ptr<T> const_pointer_cast(const shared_ptr<U>& p);
+
+struct lw_shared_ptr_counter_base {
+ shared_ptr_counter_type _count = 0;
+};
+
+
+namespace internal {
+
+template <class T, class U>
+struct lw_shared_ptr_accessors;
+
+template <class T>
+struct lw_shared_ptr_accessors_esft;
+
+template <class T>
+struct lw_shared_ptr_accessors_no_esft;
+
+}
+
+
+// We want to support two use cases for shared_ptr<T>:
+//
+// 1. T is any type (primitive or class type)
+//
+// 2. T is a class type that inherits from enable_shared_from_this<T>.
+//
+// In the first case, we must wrap T in an object containing the counter,
+// since T may be a primitive type and cannot be a base class.
+//
+// In the second case, we want T to reach the counter through its
+// enable_shared_from_this<> base class, so that we can implement
+// shared_from_this().
+//
+// To implement those two conflicting requirements (T alongside its counter;
+// T inherits from an object containing the counter) we use std::conditional<>
+// and some accessor functions to select between two implementations.
+
+
+// CRTP base class: derive from this to enable shared_from_this():
+template <typename T>
+class enable_lw_shared_from_this : private lw_shared_ptr_counter_base {
+ using ctor = T;
+protected:
+ enable_lw_shared_from_this() noexcept {}
+ enable_lw_shared_from_this(enable_lw_shared_from_this&&) noexcept {}
+ enable_lw_shared_from_this(const enable_lw_shared_from_this&) noexcept {}
+ enable_lw_shared_from_this& operator=(const enable_lw_shared_from_this&) noexcept { return *this; }
+ enable_lw_shared_from_this& operator=(enable_lw_shared_from_this&&) noexcept { return *this; }
+public:
+ lw_shared_ptr<T> shared_from_this();
+ lw_shared_ptr<const T> shared_from_this() const;
+ long use_count() const noexcept { return _count; }
+
+ template <typename X>
+ friend class lw_shared_ptr;
+ template <typename X>
+ friend struct internal::lw_shared_ptr_accessors_esft;
+ template <typename X, class Y>
+ friend struct internal::lw_shared_ptr_accessors;
+};
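+
+// Illustrative sketch of the CRTP base in use (the class name session is
+// hypothetical): an object that hands out owning pointers to itself.
+//
+//   class session : public enable_lw_shared_from_this<session> {
+//   public:
+//       lw_shared_ptr<session> self() { return shared_from_this(); }
+//   };
+//
+//   auto s = make_lw_shared<session>();
+//   auto s2 = s->self();   // shares ownership with s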
+
+template <typename T>
+struct shared_ptr_no_esft : private lw_shared_ptr_counter_base {
+ T _value;
+
+ shared_ptr_no_esft() = default;
+ shared_ptr_no_esft(const T& x) : _value(x) {}
+ shared_ptr_no_esft(T&& x) : _value(std::move(x)) {}
+ template <typename... A>
+ shared_ptr_no_esft(A&&... a) : _value(std::forward<A>(a)...) {}
+
+ template <typename X>
+ friend class lw_shared_ptr;
+ template <typename X>
+ friend struct internal::lw_shared_ptr_accessors_no_esft;
+ template <typename X, class Y>
+ friend struct internal::lw_shared_ptr_accessors;
+};
+
+
+/// Extension point: the user may override this to change how \ref lw_shared_ptr objects are destroyed,
+/// primarily so that incomplete classes can be used.
+///
+/// Customizing the deleter requires that \c T be derived from \c enable_lw_shared_from_this<T>.
+/// The specialization must be visible for all uses of \c lw_shared_ptr<T>.
+///
+/// To customize, the specialization must provide a `static void dispose(T*)` member function that disposes of
+/// the object.
+template <typename T>
+struct lw_shared_ptr_deleter; // No generic implementation
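+
+// Illustrative sketch of a specialization (the type my_type is hypothetical).
+// my_type must derive from enable_lw_shared_from_this<my_type>, and the
+// specialization must be visible wherever lw_shared_ptr<my_type> is used;
+// see shared_ptr_incomplete.hh for use with incomplete types.
+//
+//   class my_type;   // may be incomplete at this point
+//
+//   template <>
+//   struct lw_shared_ptr_deleter<my_type> {
+//       static void dispose(my_type* p);   // defined where my_type is complete, e.g. { delete p; }
+//   };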
+
+namespace internal {
+
+template <typename T>
+struct lw_shared_ptr_accessors_esft {
+ using concrete_type = std::remove_const_t<T>;
+ static T* to_value(lw_shared_ptr_counter_base* counter) {
+ return static_cast<T*>(counter);
+ }
+ static void dispose(lw_shared_ptr_counter_base* counter) {
+ dispose(static_cast<T*>(counter));
+ }
+ static void dispose(T* value_ptr) {
+ delete value_ptr;
+ }
+ static void instantiate_to_value(lw_shared_ptr_counter_base* p) {
+ // since to_value() is defined above, we don't need to do anything special
+ // to force-instantiate it
+ }
+};
+
+template <typename T>
+struct lw_shared_ptr_accessors_no_esft {
+ using concrete_type = shared_ptr_no_esft<T>;
+ static T* to_value(lw_shared_ptr_counter_base* counter) {
+ return &static_cast<concrete_type*>(counter)->_value;
+ }
+ static void dispose(lw_shared_ptr_counter_base* counter) {
+ delete static_cast<concrete_type*>(counter);
+ }
+ static void dispose(T* value_ptr) {
+ delete boost::intrusive::get_parent_from_member(value_ptr, &concrete_type::_value);
+ }
+ static void instantiate_to_value(lw_shared_ptr_counter_base* p) {
+ // since to_value() is defined above, we don't need to do anything special
+ // to force-instantiate it
+ }
+};
+
+// Generic case: lw_shared_ptr_deleter<T> is not specialized, select
+// implementation based on whether T inherits from enable_lw_shared_from_this<T>.
+template <typename T, typename U = void>
+struct lw_shared_ptr_accessors : std::conditional_t<
+ std::is_base_of<enable_lw_shared_from_this<T>, T>::value,
+ lw_shared_ptr_accessors_esft<T>,
+ lw_shared_ptr_accessors_no_esft<T>> {
+};
+
+// void_t is C++17, use this temporarily
+template <typename... T>
+using void_t = void;
+
+// Overload when lw_shared_ptr_deleter<T> specialized
+template <typename T>
+struct lw_shared_ptr_accessors<T, void_t<decltype(lw_shared_ptr_deleter<T>{})>> {
+ using concrete_type = T;
+ static T* to_value(lw_shared_ptr_counter_base* counter);
+ static void dispose(lw_shared_ptr_counter_base* counter) {
+ lw_shared_ptr_deleter<T>::dispose(to_value(counter));
+ }
+ static void instantiate_to_value(lw_shared_ptr_counter_base* p) {
+ // instantiate to_value(); must be defined by shared_ptr_incomplete.hh
+ to_value(p);
+ }
+};
+
+}
+
+template <typename T>
+class lw_shared_ptr {
+ using accessors = internal::lw_shared_ptr_accessors<std::remove_const_t<T>>;
+ using concrete_type = typename accessors::concrete_type;
+ mutable lw_shared_ptr_counter_base* _p = nullptr;
+private:
+ lw_shared_ptr(lw_shared_ptr_counter_base* p) noexcept : _p(p) {
+ if (_p) {
+ ++_p->_count;
+ }
+ }
+ template <typename... A>
+ static lw_shared_ptr make(A&&... a) {
+ auto p = new concrete_type(std::forward<A>(a)...);
+ accessors::instantiate_to_value(p);
+ return lw_shared_ptr(p);
+ }
+public:
+ using element_type = T;
+
+ // Destroys the object pointed to by p and disposes of its storage.
+ // The pointer to the object must have been obtained through release().
+ static void dispose(T* p) noexcept {
+ accessors::dispose(const_cast<std::remove_const_t<T>*>(p));
+ }
+
+ // A functor which calls dispose().
+ class disposer {
+ public:
+ void operator()(T* p) const noexcept {
+ dispose(p);
+ }
+ };
+
+ lw_shared_ptr() noexcept = default;
+ lw_shared_ptr(std::nullptr_t) noexcept : lw_shared_ptr() {}
+ lw_shared_ptr(const lw_shared_ptr& x) noexcept : _p(x._p) {
+ if (_p) {
+ ++_p->_count;
+ }
+ }
+ lw_shared_ptr(lw_shared_ptr&& x) noexcept : _p(x._p) {
+ x._p = nullptr;
+ }
+ [[gnu::always_inline]]
+ ~lw_shared_ptr() {
+ if (_p && !--_p->_count) {
+ accessors::dispose(_p);
+ }
+ }
+ lw_shared_ptr& operator=(const lw_shared_ptr& x) noexcept {
+ if (_p != x._p) {
+ this->~lw_shared_ptr();
+ new (this) lw_shared_ptr(x);
+ }
+ return *this;
+ }
+ lw_shared_ptr& operator=(lw_shared_ptr&& x) noexcept {
+ if (_p != x._p) {
+ this->~lw_shared_ptr();
+ new (this) lw_shared_ptr(std::move(x));
+ }
+ return *this;
+ }
+ lw_shared_ptr& operator=(std::nullptr_t) noexcept {
+ return *this = lw_shared_ptr();
+ }
+ lw_shared_ptr& operator=(T&& x) noexcept {
+ this->~lw_shared_ptr();
+ new (this) lw_shared_ptr(make_lw_shared<T>(std::move(x)));
+ return *this;
+ }
+
+ T& operator*() const noexcept { return *accessors::to_value(_p); }
+ T* operator->() const noexcept { return accessors::to_value(_p); }
+ T* get() const noexcept {
+ if (_p) {
+ return accessors::to_value(_p);
+ } else {
+ return nullptr;
+ }
+ }
+
+ // Releases ownership of the object without destroying it.
+ // If this was the last owner then returns an engaged unique_ptr
+ // which is now the sole owner of the object.
+ // Returns a disengaged pointer if there are still some owners.
+ //
+ // Note that in case the raw pointer is extracted from the unique_ptr
+ // using unique_ptr::release(), it must still be destroyed using
+ // lw_shared_ptr::disposer or lw_shared_ptr::dispose().
+ std::unique_ptr<T, disposer> release() noexcept {
+ auto p = std::exchange(_p, nullptr);
+ if (--p->_count) {
+ return nullptr;
+ } else {
+ return std::unique_ptr<T, disposer>(accessors::to_value(p));
+ }
+ }
+
+ long int use_count() const noexcept {
+ if (_p) {
+ return _p->_count;
+ } else {
+ return 0;
+ }
+ }
+
+ operator lw_shared_ptr<const T>() const noexcept {
+ return lw_shared_ptr<const T>(_p);
+ }
+
+ explicit operator bool() const noexcept {
+ return _p;
+ }
+
+ bool owned() const noexcept {
+ return _p->_count == 1;
+ }
+
+ bool operator==(const lw_shared_ptr<const T>& x) const {
+ return _p == x._p;
+ }
+
+ bool operator!=(const lw_shared_ptr<const T>& x) const {
+ return !operator==(x);
+ }
+
+ bool operator==(const lw_shared_ptr<std::remove_const_t<T>>& x) const {
+ return _p == x._p;
+ }
+
+ bool operator!=(const lw_shared_ptr<std::remove_const_t<T>>& x) const {
+ return !operator==(x);
+ }
+
+ bool operator<(const lw_shared_ptr<const T>& x) const {
+ return _p < x._p;
+ }
+
+ bool operator<(const lw_shared_ptr<std::remove_const_t<T>>& x) const {
+ return _p < x._p;
+ }
+
+ template <typename U>
+ friend class lw_shared_ptr;
+
+ template <typename X, typename... A>
+ friend lw_shared_ptr<X> make_lw_shared(A&&...);
+
+ template <typename U>
+ friend lw_shared_ptr<U> make_lw_shared(U&&);
+
+ template <typename U>
+ friend lw_shared_ptr<U> make_lw_shared(U&);
+
+ template <typename U>
+ friend class enable_lw_shared_from_this;
+};
+
+template <typename T, typename... A>
+inline
+lw_shared_ptr<T> make_lw_shared(A&&... a) {
+ return lw_shared_ptr<T>::make(std::forward<A>(a)...);
+}
+
+template <typename T>
+inline
+lw_shared_ptr<T> make_lw_shared(T&& a) {
+ return lw_shared_ptr<T>::make(std::move(a));
+}
+
+template <typename T>
+inline
+lw_shared_ptr<T> make_lw_shared(T& a) {
+ return lw_shared_ptr<T>::make(a);
+}
+
+template <typename T>
+inline
+lw_shared_ptr<T>
+enable_lw_shared_from_this<T>::shared_from_this() {
+ return lw_shared_ptr<T>(this);
+}
+
+template <typename T>
+inline
+lw_shared_ptr<const T>
+enable_lw_shared_from_this<T>::shared_from_this() const {
+ return lw_shared_ptr<const T>(const_cast<enable_lw_shared_from_this*>(this));
+}
+
+template <typename T>
+static inline
+std::ostream& operator<<(std::ostream& out, const lw_shared_ptr<T>& p) {
+ if (!p) {
+ return out << "null";
+ }
+ return out << *p;
+}
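+
+// Illustrative usage sketch (the struct config is hypothetical):
+//
+//   struct config { int level = 0; };
+//
+//   lw_shared_ptr<config> cfg = make_lw_shared<config>();
+//   auto alias = cfg;                  // copies the pointer, bumps the counter
+//   alias->level = 3;
+//   assert(cfg.use_count() == 2);
+//   assert(cfg->level == 3);           // both handles see the same object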
+
+// Polymorphic shared pointer class
+
+struct shared_ptr_count_base {
+ // destructor is responsible for fully-typed deletion
+ virtual ~shared_ptr_count_base() {}
+ shared_ptr_counter_type count = 0;
+};
+
+template <typename T>
+struct shared_ptr_count_for : shared_ptr_count_base {
+ T data;
+ template <typename... A>
+ shared_ptr_count_for(A&&... a) : data(std::forward<A>(a)...) {}
+};
+
+template <typename T>
+class enable_shared_from_this : private shared_ptr_count_base {
+public:
+ shared_ptr<T> shared_from_this();
+ shared_ptr<const T> shared_from_this() const;
+ long use_count() const noexcept { return count; }
+
+ template <typename U>
+ friend class shared_ptr;
+
+ template <typename U, bool esft>
+ friend struct shared_ptr_make_helper;
+};
+
+template <typename T>
+class shared_ptr {
+ mutable shared_ptr_count_base* _b = nullptr;
+ mutable T* _p = nullptr;
+private:
+ explicit shared_ptr(shared_ptr_count_for<T>* b) noexcept : _b(b), _p(&b->data) {
+ ++_b->count;
+ }
+ shared_ptr(shared_ptr_count_base* b, T* p) noexcept : _b(b), _p(p) {
+ if (_b) {
+ ++_b->count;
+ }
+ }
+ explicit shared_ptr(enable_shared_from_this<std::remove_const_t<T>>* p) noexcept : _b(p), _p(static_cast<T*>(p)) {
+ if (_b) {
+ ++_b->count;
+ }
+ }
+public:
+ using element_type = T;
+
+ shared_ptr() noexcept = default;
+ shared_ptr(std::nullptr_t) noexcept : shared_ptr() {}
+ shared_ptr(const shared_ptr& x) noexcept
+ : _b(x._b)
+ , _p(x._p) {
+ if (_b) {
+ ++_b->count;
+ }
+ }
+ shared_ptr(shared_ptr&& x) noexcept
+ : _b(x._b)
+ , _p(x._p) {
+ x._b = nullptr;
+ x._p = nullptr;
+ }
+ template <typename U, typename = std::enable_if_t<std::is_base_of<T, U>::value>>
+ shared_ptr(const shared_ptr<U>& x) noexcept
+ : _b(x._b)
+ , _p(x._p) {
+ if (_b) {
+ ++_b->count;
+ }
+ }
+ template <typename U, typename = std::enable_if_t<std::is_base_of<T, U>::value>>
+ shared_ptr(shared_ptr<U>&& x) noexcept
+ : _b(x._b)
+ , _p(x._p) {
+ x._b = nullptr;
+ x._p = nullptr;
+ }
+ ~shared_ptr() {
+ if (_b && !--_b->count) {
+ delete _b;
+ }
+ }
+ shared_ptr& operator=(const shared_ptr& x) noexcept {
+ if (this != &x) {
+ this->~shared_ptr();
+ new (this) shared_ptr(x);
+ }
+ return *this;
+ }
+ shared_ptr& operator=(shared_ptr&& x) noexcept {
+ if (this != &x) {
+ this->~shared_ptr();
+ new (this) shared_ptr(std::move(x));
+ }
+ return *this;
+ }
+ shared_ptr& operator=(std::nullptr_t) noexcept {
+ return *this = shared_ptr();
+ }
+ template <typename U, typename = std::enable_if_t<std::is_base_of<T, U>::value>>
+ shared_ptr& operator=(const shared_ptr<U>& x) noexcept {
+ if (*this != x) {
+ this->~shared_ptr();
+ new (this) shared_ptr(x);
+ }
+ return *this;
+ }
+ template <typename U, typename = std::enable_if_t<std::is_base_of<T, U>::value>>
+ shared_ptr& operator=(shared_ptr<U>&& x) noexcept {
+ if (*this != x) {
+ this->~shared_ptr();
+ new (this) shared_ptr(std::move(x));
+ }
+ return *this;
+ }
+ explicit operator bool() const noexcept {
+ return _p;
+ }
+ T& operator*() const noexcept {
+ return *_p;
+ }
+ T* operator->() const noexcept {
+ return _p;
+ }
+ T* get() const noexcept {
+ return _p;
+ }
+ long use_count() const noexcept {
+ if (_b) {
+ return _b->count;
+ } else {
+ return 0;
+ }
+ }
+
+ template <bool esft>
+ struct make_helper;
+
+ template <typename U, typename... A>
+ friend shared_ptr<U> make_shared(A&&... a);
+
+ template <typename U>
+ friend shared_ptr<U> make_shared(U&& a);
+
+ template <typename V, typename U>
+ friend shared_ptr<V> static_pointer_cast(const shared_ptr<U>& p);
+
+ template <typename V, typename U>
+ friend shared_ptr<V> dynamic_pointer_cast(const shared_ptr<U>& p);
+
+ template <typename V, typename U>
+ friend shared_ptr<V> const_pointer_cast(const shared_ptr<U>& p);
+
+ template <bool esft, typename... A>
+ static shared_ptr make(A&&... a);
+
+ template <typename U>
+ friend class enable_shared_from_this;
+
+ template <typename U, bool esft>
+ friend struct shared_ptr_make_helper;
+
+ template <typename U>
+ friend class shared_ptr;
+};
+
+template <typename U, bool esft>
+struct shared_ptr_make_helper;
+
+template <typename T>
+struct shared_ptr_make_helper<T, false> {
+ template <typename... A>
+ static shared_ptr<T> make(A&&... a) {
+ return shared_ptr<T>(new shared_ptr_count_for<T>(std::forward<A>(a)...));
+ }
+};
+
+template <typename T>
+struct shared_ptr_make_helper<T, true> {
+ template <typename... A>
+ static shared_ptr<T> make(A&&... a) {
+ auto p = new T(std::forward<A>(a)...);
+ return shared_ptr<T>(p, p);
+ }
+};
+
+template <typename T, typename... A>
+inline
+shared_ptr<T>
+make_shared(A&&... a) {
+ using helper = shared_ptr_make_helper<T, std::is_base_of<shared_ptr_count_base, T>::value>;
+ return helper::make(std::forward<A>(a)...);
+}
+
+template <typename T>
+inline
+shared_ptr<T>
+make_shared(T&& a) {
+ using helper = shared_ptr_make_helper<T, std::is_base_of<shared_ptr_count_base, T>::value>;
+ return helper::make(std::forward<T>(a));
+}
+
+template <typename T, typename U>
+inline
+shared_ptr<T>
+static_pointer_cast(const shared_ptr<U>& p) {
+ return shared_ptr<T>(p._b, static_cast<T*>(p._p));
+}
+
+template <typename T, typename U>
+inline
+shared_ptr<T>
+dynamic_pointer_cast(const shared_ptr<U>& p) {
+ auto q = dynamic_cast<T*>(p._p);
+ return shared_ptr<T>(q ? p._b : nullptr, q);
+}
+
+template <typename T, typename U>
+inline
+shared_ptr<T>
+const_pointer_cast(const shared_ptr<U>& p) {
+ return shared_ptr<T>(p._b, const_cast<T*>(p._p));
+}
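+
+// Illustrative sketch of the polymorphic variant and the pointer casts
+// (the types base and derived are hypothetical):
+//
+//   struct base { virtual ~base() = default; };
+//   struct derived : base { int x = 0; };
+//
+//   shared_ptr<derived> d = make_shared<derived>();
+//   shared_ptr<base> b = d;                          // implicit upcast, shares ownership
+//   auto d2 = dynamic_pointer_cast<derived>(b);      // engaged only if b really points to a derived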
+
+template <typename T>
+inline
+shared_ptr<T>
+enable_shared_from_this<T>::shared_from_this() {
+ auto unconst = reinterpret_cast<enable_shared_from_this<std::remove_const_t<T>>*>(this);
+ return shared_ptr<T>(unconst);
+}
+
+template <typename T>
+inline
+shared_ptr<const T>
+enable_shared_from_this<T>::shared_from_this() const {
+ auto esft = const_cast<enable_shared_from_this*>(this);
+ auto unconst = reinterpret_cast<enable_shared_from_this<std::remove_const_t<T>>*>(esft);
+ return shared_ptr<const T>(unconst);
+}
+
+template <typename T, typename U>
+inline
+bool
+operator==(const shared_ptr<T>& x, const shared_ptr<U>& y) {
+ return x.get() == y.get();
+}
+
+template <typename T>
+inline
+bool
+operator==(const shared_ptr<T>& x, std::nullptr_t) {
+ return x.get() == nullptr;
+}
+
+template <typename T>
+inline
+bool
+operator==(std::nullptr_t, const shared_ptr<T>& y) {
+ return nullptr == y.get();
+}
+
+template <typename T, typename U>
+inline
+bool
+operator!=(const shared_ptr<T>& x, const shared_ptr<U>& y) {
+ return x.get() != y.get();
+}
+
+template <typename T>
+inline
+bool
+operator!=(const shared_ptr<T>& x, std::nullptr_t) {
+ return x.get() != nullptr;
+}
+
+template <typename T>
+inline
+bool
+operator!=(std::nullptr_t, const shared_ptr<T>& y) {
+ return nullptr != y.get();
+}
+
+template <typename T, typename U>
+inline
+bool
+operator<(const shared_ptr<T>& x, const shared_ptr<U>& y) {
+ return x.get() < y.get();
+}
+
+template <typename T>
+inline
+bool
+operator<(const shared_ptr<T>& x, std::nullptr_t) {
+ return x.get() < nullptr;
+}
+
+template <typename T>
+inline
+bool
+operator<(std::nullptr_t, const shared_ptr<T>& y) {
+ return nullptr < y.get();
+}
+
+template <typename T, typename U>
+inline
+bool
+operator<=(const shared_ptr<T>& x, const shared_ptr<U>& y) {
+ return x.get() <= y.get();
+}
+
+template <typename T>
+inline
+bool
+operator<=(const shared_ptr<T>& x, std::nullptr_t) {
+ return x.get() <= nullptr;
+}
+
+template <typename T>
+inline
+bool
+operator<=(std::nullptr_t, const shared_ptr<T>& y) {
+ return nullptr <= y.get();
+}
+
+template <typename T, typename U>
+inline
+bool
+operator>(const shared_ptr<T>& x, const shared_ptr<U>& y) {
+ return x.get() > y.get();
+}
+
+template <typename T>
+inline
+bool
+operator>(const shared_ptr<T>& x, std::nullptr_t) {
+ return x.get() > nullptr;
+}
+
+template <typename T>
+inline
+bool
+operator>(std::nullptr_t, const shared_ptr<T>& y) {
+ return nullptr > y.get();
+}
+
+template <typename T, typename U>
+inline
+bool
+operator>=(const shared_ptr<T>& x, const shared_ptr<U>& y) {
+ return x.get() >= y.get();
+}
+
+template <typename T>
+inline
+bool
+operator>=(const shared_ptr<T>& x, std::nullptr_t) {
+ return x.get() >= nullptr;
+}
+
+template <typename T>
+inline
+bool
+operator>=(std::nullptr_t, const shared_ptr<T>& y) {
+ return nullptr >= y.get();
+}
+
+template <typename T>
+static inline
+std::ostream& operator<<(std::ostream& out, const shared_ptr<T>& p) {
+ if (!p) {
+ return out << "null";
+ }
+ return out << *p;
+}
+
+template<typename T>
+using shared_ptr_equal_by_value = indirect_equal_to<shared_ptr<T>>;
+
+template<typename T>
+using shared_ptr_value_hash = indirect_hash<shared_ptr<T>>;
+
+}
+
+namespace std {
+
+template <typename T>
+struct hash<seastar::lw_shared_ptr<T>> : private hash<T*> {
+ size_t operator()(const seastar::lw_shared_ptr<T>& p) const {
+ return hash<T*>::operator()(p.get());
+ }
+};
+
+template <typename T>
+struct hash<seastar::shared_ptr<T>> : private hash<T*> {
+ size_t operator()(const seastar::shared_ptr<T>& p) const {
+ return hash<T*>::operator()(p.get());
+ }
+};
+
+}
+
+namespace seastar {
+
+template<typename T>
+struct is_smart_ptr<shared_ptr<T>> : std::true_type {};
+
+template<typename T>
+struct is_smart_ptr<lw_shared_ptr<T>> : std::true_type {};
+
+}
diff --git a/src/seastar/include/seastar/core/shared_ptr_debug_helper.hh b/src/seastar/include/seastar/core/shared_ptr_debug_helper.hh
new file mode 100644
index 000000000..0cf0d7ac3
--- /dev/null
+++ b/src/seastar/include/seastar/core/shared_ptr_debug_helper.hh
@@ -0,0 +1,70 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#ifdef SEASTAR_DEBUG_SHARED_PTR
+
+#include <thread>
+#include <cassert>
+
+namespace seastar {
+
+// A counter that is only comfortable being incremented on the cpu
+// it was created on. Useful for verifying that a shared_ptr
+// or lw_shared_ptr isn't misused across cores.
+class debug_shared_ptr_counter_type {
+ long _counter = 0;
+ std::thread::id _cpu = std::this_thread::get_id();
+public:
+ debug_shared_ptr_counter_type(long x) : _counter(x) {}
+ operator long() const {
+ check();
+ return _counter;
+ }
+ debug_shared_ptr_counter_type& operator++() {
+ check();
+ ++_counter;
+ return *this;
+ }
+ long operator++(int) {
+ check();
+ return _counter++;
+ }
+ debug_shared_ptr_counter_type& operator--() {
+ check();
+ --_counter;
+ return *this;
+ }
+ long operator--(int) {
+ check();
+ return _counter--;
+ }
+private:
+ void check() const {
+ assert(_cpu == std::this_thread::get_id());
+ }
+};
+
+}
+
+#endif
+
diff --git a/src/seastar/include/seastar/core/shared_ptr_incomplete.hh b/src/seastar/include/seastar/core/shared_ptr_incomplete.hh
new file mode 100644
index 000000000..84f2b44a7
--- /dev/null
+++ b/src/seastar/include/seastar/core/shared_ptr_incomplete.hh
@@ -0,0 +1,44 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/core/shared_ptr.hh>
+
+
+/// \file
+/// \brief Include this header file when using \c lw_shared_ptr<some_incomplete_type>, at the point
+/// where \c some_incomplete_type is defined.
+
+namespace seastar {
+
+namespace internal {
+
+// Overload when lw_shared_ptr_deleter<T> specialized
+template <typename T>
+T*
+lw_shared_ptr_accessors<T, void_t<decltype(lw_shared_ptr_deleter<T>{})>>::to_value(lw_shared_ptr_counter_base* counter) {
+ return static_cast<T*>(counter);
+}
+
+}
+
+}
diff --git a/src/seastar/include/seastar/core/simple-stream.hh b/src/seastar/include/seastar/core/simple-stream.hh
new file mode 100644
index 000000000..3d02ec813
--- /dev/null
+++ b/src/seastar/include/seastar/core/simple-stream.hh
@@ -0,0 +1,639 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 Scylladb, Ltd.
+ */
+
+#pragma once
+#include <seastar/core/sstring.hh>
+#include <seastar/util/variant_utils.hh>
+
+namespace seastar {
+
+class measuring_output_stream {
+ size_t _size = 0;
+public:
+ void write(const char* data, size_t size) {
+ _size += size;
+ }
+
+ size_t size() const {
+ return _size;
+ }
+};
+
+template<typename>
+class memory_output_stream;
+
+class simple_memory_input_stream;
+
+template<typename Iterator>
+class fragmented_memory_input_stream;
+
+template<typename Iterator>
+class memory_input_stream;
+
+class simple_memory_output_stream {
+ char* _p = nullptr;
+ size_t _size = 0;
+public:
+ using has_with_stream = std::false_type;
+ simple_memory_output_stream() {}
+ simple_memory_output_stream(char* p, size_t size, size_t start = 0) : _p(p + start), _size(size) {}
+ char* begin() { return _p; }
+
+ [[gnu::always_inline]]
+ void skip(size_t size) {
+ if (size > _size) {
+ throw std::out_of_range("serialization buffer overflow");
+ }
+ _p += size;
+ _size -= size;
+ }
+
+ [[gnu::always_inline]]
+ simple_memory_output_stream write_substream(size_t size) {
+ if (size > _size) {
+ throw std::out_of_range("serialization buffer overflow");
+ }
+ simple_memory_output_stream substream(_p, size);
+ skip(size);
+ return substream;
+ }
+
+ [[gnu::always_inline]]
+ void write(const char* p, size_t size) {
+ if (size > _size) {
+ throw std::out_of_range("serialization buffer overflow");
+ }
+ std::copy_n(p, size, _p);
+ skip(size);
+ }
+
+ [[gnu::always_inline]]
+ void fill(char c, size_t size) {
+ if (size > _size) {
+ throw std::out_of_range("serialization buffer overflow");
+ }
+ std::fill_n(_p, size, c);
+ skip(size);
+ }
+
+ [[gnu::always_inline]]
+ const size_t size() const {
+ return _size;
+ }
+
+ // simple_memory_output_stream is a write cursor that keeps a mutable view of some
+ // underlying buffer and provides a write interface. to_input_stream() converts it
+ // to a read cursor that points to the same part of the buffer but provides a
+ // read interface.
+ simple_memory_input_stream to_input_stream() const;
+};
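+
+// Illustrative sketch (the buffer buf is hypothetical): writing through the
+// output cursor and reading the same region back via to_input_stream().
+//
+//   char buf[16];
+//   simple_memory_output_stream out(buf, sizeof(buf));
+//   auto in = out.to_input_stream();   // read cursor over the same 16 bytes
+//   out.write("hello", 5);             // advances out, fills buf[0..4]
+//   char tmp[5];
+//   in.read(tmp, 5);                   // reads back the bytes written above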
+
+template<typename Iterator>
+class fragmented_memory_output_stream {
+ using simple = simple_memory_output_stream;
+
+ Iterator _it;
+ simple _current;
+ size_t _size = 0;
+
+ friend class memory_input_stream<Iterator>;
+private:
+ template<typename Func>
+ //requires requires(Func f, view bv) { { f(bv) } -> void; }
+ void for_each_fragment(size_t size, Func&& func) {
+ if (size > _size) {
+ throw std::out_of_range("serialization buffer overflow");
+ }
+ _size -= size;
+ while (size) {
+ if (!_current.size()) {
+ _current = simple(reinterpret_cast<char*>((*_it).get_write()), (*_it).size());
+ _it++;
+ }
+ auto this_size = std::min(_current.size(), size);
+ func(_current.write_substream(this_size));
+ size -= this_size;
+ }
+ }
+ fragmented_memory_output_stream(Iterator it, simple_memory_output_stream bv, size_t size)
+ : _it(it), _current(bv), _size(size) { }
+public:
+ using has_with_stream = std::false_type;
+ using iterator_type = Iterator;
+
+ fragmented_memory_output_stream() = default;
+
+ fragmented_memory_output_stream(Iterator it, size_t size)
+ : _it(it), _size(size) {
+ }
+
+ void skip(size_t size) {
+ for_each_fragment(size, [] (auto) { });
+ }
+ memory_output_stream<Iterator> write_substream(size_t size) {
+ if (size > _size) {
+ throw std::out_of_range("serialization buffer overflow");
+ }
+ if (_current.size() >= size) {
+ _size -= size;
+ return _current.write_substream(size);
+ }
+ fragmented_memory_output_stream substream(_it, _current, size);
+ skip(size);
+ return substream;
+ }
+ void write(const char* p, size_t size) {
+ for_each_fragment(size, [&p] (auto bv) {
+ std::copy_n(p, bv.size(), bv.begin());
+ p += bv.size();
+ });
+ }
+ void fill(char c, size_t size) {
+ for_each_fragment(size, [c] (simple fragment) {
+ std::fill_n(fragment.begin(), fragment.size(), c);
+ });
+ }
+ const size_t size() const {
+ return _size;
+ }
+
+ // fragmented_memory_output_stream is a write cursor that keeps a mutable view of some
+ // underlying fragmented buffer and provides a write interface. to_input_stream() converts
+ // it to a read cursor that points to the same part of the buffer but provides a read interface.
+ fragmented_memory_input_stream<Iterator> to_input_stream() const;
+};
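+
+// Illustrative sketch, assuming fragments that expose get_write() and size()
+// (seastar::temporary_buffer<char> is one such type); the names below are
+// hypothetical:
+//
+//   std::vector<temporary_buffer<char>> frags;
+//   frags.emplace_back(8);
+//   frags.emplace_back(8);
+//   fragmented_memory_output_stream<decltype(frags.begin())> out(frags.begin(), 16);
+//   out.write("hello, world", 12);   // transparently crosses the fragment boundary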
+
+template<typename Iterator>
+class memory_output_stream {
+public:
+ using simple = simple_memory_output_stream;
+ using fragmented = fragmented_memory_output_stream<Iterator>;
+
+private:
+ const bool _is_simple;
+ using fragmented_type = fragmented;
+ union {
+ simple _simple;
+ fragmented_type _fragmented;
+ };
+public:
+ template<typename StreamVisitor>
+ [[gnu::always_inline]]
+ decltype(auto) with_stream(StreamVisitor&& visitor) {
+ if (__builtin_expect(_is_simple, true)) {
+ return visitor(_simple);
+ }
+ return visitor(_fragmented);
+ }
+
+ template<typename StreamVisitor>
+ [[gnu::always_inline]]
+ decltype(auto) with_stream(StreamVisitor&& visitor) const {
+ if (__builtin_expect(_is_simple, true)) {
+ return visitor(_simple);
+ }
+ return visitor(_fragmented);
+ }
+public:
+ using has_with_stream = std::true_type;
+ using iterator_type = Iterator;
+ memory_output_stream()
+ : _is_simple(true), _simple() {}
+ memory_output_stream(simple stream)
+ : _is_simple(true), _simple(std::move(stream)) {}
+ memory_output_stream(fragmented stream)
+ : _is_simple(false), _fragmented(std::move(stream)) {}
+
+ [[gnu::always_inline]]
+ memory_output_stream(const memory_output_stream& other) noexcept : _is_simple(other._is_simple) {
+ // Making this copy constructor noexcept makes copy assignment simpler.
+ // Besides, performance of memory_output_stream relies on the fact that both
+ // fragmented and simple input stream are PODs and the branch below
+ // is optimized away, so throwable copy constructors aren't something
+ // we want.
+ static_assert(std::is_nothrow_copy_constructible<fragmented>::value,
+ "seastar::memory_output_stream::fragmented should be copy constructible");
+ static_assert(std::is_nothrow_copy_constructible<simple>::value,
+ "seastar::memory_output_stream::simple should be copy constructible");
+ if (_is_simple) {
+ new (&_simple) simple(other._simple);
+ } else {
+ new (&_fragmented) fragmented_type(other._fragmented);
+ }
+ }
+
+ [[gnu::always_inline]]
+ memory_output_stream(memory_output_stream&& other) noexcept : _is_simple(other._is_simple) {
+ if (_is_simple) {
+ new (&_simple) simple(std::move(other._simple));
+ } else {
+ new (&_fragmented) fragmented_type(std::move(other._fragmented));
+ }
+ }
+
+ [[gnu::always_inline]]
+ memory_output_stream& operator=(const memory_output_stream& other) noexcept {
+ // Copy constructor being noexcept makes copy assignment simpler.
+ static_assert(std::is_nothrow_copy_constructible<memory_output_stream>::value,
+ "memory_output_stream copy constructor shouldn't throw");
+ if (this != &other) {
+ this->~memory_output_stream();
+ new (this) memory_output_stream(other);
+ }
+ return *this;
+ }
+
+ [[gnu::always_inline]]
+ memory_output_stream& operator=(memory_output_stream&& other) noexcept {
+ if (this != &other) {
+ this->~memory_output_stream();
+ new (this) memory_output_stream(std::move(other));
+ }
+ return *this;
+ }
+
+ [[gnu::always_inline]]
+ ~memory_output_stream() {
+ if (_is_simple) {
+ _simple.~simple();
+ } else {
+ _fragmented.~fragmented_type();
+ }
+ }
+
+ [[gnu::always_inline]]
+ void skip(size_t size) {
+ with_stream([size] (auto& stream) {
+ stream.skip(size);
+ });
+ }
+
+ [[gnu::always_inline]]
+ memory_output_stream write_substream(size_t size) {
+ return with_stream([size] (auto& stream) -> memory_output_stream {
+ return stream.write_substream(size);
+ });
+ }
+
+ [[gnu::always_inline]]
+ void write(const char* p, size_t size) {
+ with_stream([p, size] (auto& stream) {
+ stream.write(p, size);
+ });
+ }
+
+ [[gnu::always_inline]]
+ void fill(char c, size_t size) {
+ with_stream([c, size] (auto& stream) {
+ stream.fill(c, size);
+ });
+ }
+
+ [[gnu::always_inline]]
+ size_t size() const {
+ return with_stream([] (auto& stream) {
+ return stream.size();
+ });
+ }
+
+ memory_input_stream<Iterator> to_input_stream() const;
+};
+
+class simple_memory_input_stream {
+ using simple = simple_memory_input_stream;
+
+ const char* _p = nullptr;
+ size_t _size = 0;
+public:
+ using has_with_stream = std::false_type;
+ simple_memory_input_stream() = default;
+ simple_memory_input_stream(const char* p, size_t size) : _p(p), _size(size) {}
+
+ const char* begin() const { return _p; }
+
+ [[gnu::always_inline]]
+ void skip(size_t size) {
+ if (size > _size) {
+ throw std::out_of_range("deserialization buffer underflow");
+ }
+ _p += size;
+ _size -= size;
+ }
+
+ [[gnu::always_inline]]
+ simple read_substream(size_t size) {
+ if (size > _size) {
+ throw std::out_of_range("deserialization buffer underflow");
+ }
+ simple substream(_p, size);
+ skip(size);
+ return substream;
+ }
+
+ [[gnu::always_inline]]
+ void read(char* p, size_t size) {
+ if (size > _size) {
+ throw std::out_of_range("deserialization buffer underflow");
+ }
+ std::copy_n(_p, size, p);
+ skip(size);
+ }
+
+ template<typename Output>
+ [[gnu::always_inline]]
+ void copy_to(Output& out) const {
+ out.write(_p, _size);
+ }
+
+ [[gnu::always_inline]]
+ const size_t size() const {
+ return _size;
+ }
+};
+
+template<typename Iterator>
+class fragmented_memory_input_stream {
+ using simple = simple_memory_input_stream;
+ using fragmented = fragmented_memory_input_stream;
+
+ Iterator _it;
+ simple _current;
+ size_t _size;
+private:
+ template<typename Func>
+ //requires requires(Func f, view bv) { { f(bv) } -> void; }
+ void for_each_fragment(size_t size, Func&& func) {
+ if (size > _size) {
+ throw std::out_of_range("deserialization buffer underflow");
+ }
+ _size -= size;
+ while (size) {
+ if (!_current.size()) {
+ _current = simple(reinterpret_cast<const char*>((*_it).begin()), (*_it).size());
+ _it++;
+ }
+ auto this_size = std::min(_current.size(), size);
+ func(_current.read_substream(this_size));
+ size -= this_size;
+ }
+ }
+ fragmented_memory_input_stream(Iterator it, simple bv, size_t size)
+ : _it(it), _current(bv), _size(size) { }
+ friend class fragmented_memory_output_stream<Iterator>;
+public:
+ using has_with_stream = std::false_type;
+ using iterator_type = Iterator;
+ fragmented_memory_input_stream(Iterator it, size_t size)
+ : _it(it), _size(size) {
+ }
+
+ void skip(size_t size) {
+ for_each_fragment(size, [] (auto) { });
+ }
+ fragmented read_substream(size_t size) {
+ if (size > _size) {
+ throw std::out_of_range("deserialization buffer underflow");
+ }
+ fragmented substream(_it, _current, size);
+ skip(size);
+ return substream;
+ }
+ void read(char* p, size_t size) {
+ for_each_fragment(size, [&p] (auto bv) {
+ p = std::copy_n(bv.begin(), bv.size(), p);
+ });
+ }
+ template<typename Output>
+ void copy_to(Output& out) {
+ for_each_fragment(_size, [&out] (auto bv) {
+ bv.copy_to(out);
+ });
+ }
+ const size_t size() const {
+ return _size;
+ }
+
+ const char* first_fragment_data() const { return _current.begin(); }
+ size_t first_fragment_size() const { return _current.size(); }
+ Iterator fragment_iterator() const { return _it; }
+};
+
+/*
+template<typename Visitor>
+concept bool StreamVisitor() {
+ return requires(Visitor visitor, simple& simple, fragmented& fragmented) {
+ visitor(simple);
+ visitor(fragmented);
+ };
+}
+*/
+// memory_input_stream performs type erasure optimized for cases where
+// simple is used.
+// By using a lot of [[gnu::always_inline]] attributes this class attempts to
+// make the compiler generate code with simple functions inlined
+// directly into the code that uses the input_stream.
+template<typename Iterator>
+class memory_input_stream {
+public:
+ using simple = simple_memory_input_stream;
+ using fragmented = fragmented_memory_input_stream<Iterator>;
+private:
+ const bool _is_simple;
+ using fragmented_type = fragmented;
+ union {
+ simple _simple;
+ fragmented_type _fragmented;
+ };
+public:
+ template<typename StreamVisitor>
+ [[gnu::always_inline]]
+ decltype(auto) with_stream(StreamVisitor&& visitor) {
+ if (__builtin_expect(_is_simple, true)) {
+ return visitor(_simple);
+ }
+ return visitor(_fragmented);
+ }
+
+ template<typename StreamVisitor>
+ [[gnu::always_inline]]
+ decltype(auto) with_stream(StreamVisitor&& visitor) const {
+ if (__builtin_expect(_is_simple, true)) {
+ return visitor(_simple);
+ }
+ return visitor(_fragmented);
+ }
+public:
+ using has_with_stream = std::true_type;
+ using iterator_type = Iterator;
+ memory_input_stream(simple stream)
+ : _is_simple(true), _simple(std::move(stream)) {}
+ memory_input_stream(fragmented stream)
+ : _is_simple(false), _fragmented(std::move(stream)) {}
+
+ [[gnu::always_inline]]
+ memory_input_stream(const memory_input_stream& other) noexcept : _is_simple(other._is_simple) {
+ // Making this copy constructor noexcept makes copy assignment simpler.
+ // Besides, performance of memory_input_stream relies on the fact that both
+ // fragmented and simple input stream are PODs and the branch below
+ // is optimized away, so throwable copy constructors aren't something
+ // we want.
+ static_assert(std::is_nothrow_copy_constructible<fragmented>::value,
+ "seastar::memory_input_stream::fragmented should be copy constructible");
+ static_assert(std::is_nothrow_copy_constructible<simple>::value,
+ "seastar::memory_input_stream::simple should be copy constructible");
+ if (_is_simple) {
+ new (&_simple) simple(other._simple);
+ } else {
+ new (&_fragmented) fragmented_type(other._fragmented);
+ }
+ }
+
+ [[gnu::always_inline]]
+ memory_input_stream(memory_input_stream&& other) noexcept : _is_simple(other._is_simple) {
+ if (_is_simple) {
+ new (&_simple) simple(std::move(other._simple));
+ } else {
+ new (&_fragmented) fragmented_type(std::move(other._fragmented));
+ }
+ }
+
+ [[gnu::always_inline]]
+ memory_input_stream& operator=(const memory_input_stream& other) noexcept {
+ // Copy constructor being noexcept makes copy assignment simpler.
+ static_assert(std::is_nothrow_copy_constructible<memory_input_stream>::value,
+ "memory_input_stream copy constructor shouldn't throw");
+ if (this != &other) {
+ this->~memory_input_stream();
+ new (this) memory_input_stream(other);
+ }
+ return *this;
+ }
+
+ [[gnu::always_inline]]
+ memory_input_stream& operator=(memory_input_stream&& other) noexcept {
+ if (this != &other) {
+ this->~memory_input_stream();
+ new (this) memory_input_stream(std::move(other));
+ }
+ return *this;
+ }
+
+ [[gnu::always_inline]]
+ ~memory_input_stream() {
+ if (_is_simple) {
+ _simple.~simple_memory_input_stream();
+ } else {
+ _fragmented.~fragmented_type();
+ }
+ }
+
+ [[gnu::always_inline]]
+ void skip(size_t size) {
+ with_stream([size] (auto& stream) {
+ stream.skip(size);
+ });
+ }
+
+ [[gnu::always_inline]]
+ memory_input_stream read_substream(size_t size) {
+ return with_stream([size] (auto& stream) -> memory_input_stream {
+ return stream.read_substream(size);
+ });
+ }
+
+ [[gnu::always_inline]]
+ void read(char* p, size_t size) {
+ with_stream([p, size] (auto& stream) {
+ stream.read(p, size);
+ });
+ }
+
+ template<typename Output>
+ [[gnu::always_inline]]
+ void copy_to(Output& out) {
+ with_stream([&out] (auto& stream) {
+ stream.copy_to(out);
+ });
+ }
+
+ [[gnu::always_inline]]
+ size_t size() const {
+ return with_stream([] (auto& stream) {
+ return stream.size();
+ });
+ }
+
+ template<typename Stream, typename StreamVisitor>
+ friend decltype(auto) with_serialized_stream(Stream& stream, StreamVisitor&& visitor);
+};
+
+inline simple_memory_input_stream simple_memory_output_stream::to_input_stream() const {
+ return simple_memory_input_stream(_p, _size);
+}
+
+template<typename Iterator>
+inline fragmented_memory_input_stream<Iterator> fragmented_memory_output_stream<Iterator>::to_input_stream() const {
+ return fragmented_memory_input_stream<Iterator>(_it, _current.to_input_stream(), _size);
+}
+
+template<typename Iterator>
+inline memory_input_stream<Iterator> memory_output_stream<Iterator>::to_input_stream() const {
+ return with_stream(make_visitor(
+ [] (const simple_memory_output_stream& ostream) -> memory_input_stream<Iterator> {
+ return ostream.to_input_stream();
+ },
+ [] (const fragmented_memory_output_stream<Iterator>& ostream) -> memory_input_stream<Iterator> {
+ return ostream.to_input_stream();
+ }
+ ));
+}
+
+// The purpose of the with_serialized_stream() is to minimize number of dynamic
+// dispatches. For example, a lot of IDL-generated code looks like this:
+// auto some_value() const {
+// return seastar::with_serialized_stream(v, [] (auto& v) {
+// auto in = v;
+// ser::skip(in, boost::type<type1>());
+// ser::skip(in, boost::type<type2>());
+// return deserialize(in, boost::type<type3>());
+// });
+// }
+// Using with_stream() there is at most one dynamic dispatch per such
+// function, instead of one per each skip() and deserialize() call.
+
+template<typename Stream, typename StreamVisitor, typename = std::enable_if_t<Stream::has_with_stream::value>>
+[[gnu::always_inline]]
+static inline decltype(auto)
+with_serialized_stream(Stream& stream, StreamVisitor&& visitor) {
+ return stream.with_stream(std::forward<StreamVisitor>(visitor));
+}
+
+template<typename Stream, typename StreamVisitor, typename = std::enable_if_t<!Stream::has_with_stream::value>, typename = void>
+[[gnu::always_inline]]
+static inline decltype(auto)
+with_serialized_stream(Stream& stream, StreamVisitor&& visitor) {
+ return visitor(stream);
+}
+
+using simple_input_stream = simple_memory_input_stream;
+using simple_output_stream = simple_memory_output_stream;
+
+}
diff --git a/src/seastar/include/seastar/core/slab.hh b/src/seastar/include/seastar/core/slab.hh
new file mode 100644
index 000000000..7bafa74f4
--- /dev/null
+++ b/src/seastar/include/seastar/core/slab.hh
@@ -0,0 +1,568 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+#pragma once
+
+#include <boost/intrusive/unordered_set.hpp>
+#include <boost/intrusive/list.hpp>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <assert.h>
+#include <memory>
+#include <vector>
+#include <algorithm>
+#include <seastar/core/metrics.hh>
+#include <seastar/core/align.hh>
+#include <seastar/core/memory.hh>
+
+namespace seastar {
+
+static constexpr uint16_t SLAB_MAGIC_NUMBER = 0x51AB; // meant to be 'SLAB' :-)
+
+/*
+ * Item requirements
+ * - It must derive from slab_item_base.
+ * - First parameter of constructor must be uint32_t _slab_page_index.
+ * - Implement get_slab_page_index() to return _slab_page_index.
+ * - Implement is_unlocked() to check if Item can be evicted.
+ */
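+
+/*
+ * Illustrative sketch of a conforming Item (the name my_item and its fields
+ * are hypothetical):
+ *
+ *   class my_item : public slab_item_base {
+ *       uint32_t _slab_page_index;
+ *       bool _locked = false;
+ *   public:
+ *       my_item(uint32_t slab_page_index) : _slab_page_index(slab_page_index) {}
+ *       uint32_t get_slab_page_index() const { return _slab_page_index; }
+ *       bool is_unlocked() const { return !_locked; }
+ *   };
+ */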
+
+/*
+ * slab_page_desc is mapped 1:1 to a slab page.
+ * footprint: 80 bytes per slab page.
+ */
+struct slab_page_desc {
+private:
+ boost::intrusive::list_member_hook<> _lru_link;
+ boost::intrusive::list_member_hook<> _free_pages_link;
+ void *_slab_page;
+ std::vector<uintptr_t> _free_objects;
+ uint32_t _refcnt;
+ uint32_t _index; // index into slab page vector
+ uint16_t _magic;
+ uint8_t _slab_class_id;
+public:
+ slab_page_desc(void *slab_page, size_t objects, size_t object_size, uint8_t slab_class_id, uint32_t index)
+ : _slab_page(slab_page)
+ , _refcnt(0U)
+ , _index(index)
+ , _magic(SLAB_MAGIC_NUMBER)
+ , _slab_class_id(slab_class_id)
+ {
+ auto object = reinterpret_cast<uintptr_t>(slab_page);
+ _free_objects.reserve(objects - 1);
+ for (auto i = 1u; i < objects; i++) {
+ object += object_size;
+ _free_objects.push_back(object);
+ }
+ }
+
+ bool empty() const {
+ return _free_objects.empty();
+ }
+
+ size_t size() const {
+ return _free_objects.size();
+ }
+
+ uint32_t& refcnt() {
+ return _refcnt;
+ }
+
+ uint32_t index() const {
+ return _index;
+ }
+
+ uint16_t magic() const {
+ return _magic;
+ }
+
+ uint8_t slab_class_id() const {
+ return _slab_class_id;
+ }
+
+ void* slab_page() const {
+ return _slab_page;
+ }
+
+ std::vector<uintptr_t>& free_objects() {
+ return _free_objects;
+ }
+
+ void* allocate_object() {
+ assert(!_free_objects.empty());
+ auto object = reinterpret_cast<void*>(_free_objects.back());
+ _free_objects.pop_back();
+ return object;
+ }
+
+ void free_object(void *object) {
+ _free_objects.push_back(reinterpret_cast<uintptr_t>(object));
+ }
+
+ template<typename Item>
+ friend class slab_class;
+ template<typename Item>
+ friend class slab_allocator;
+};
+
+class slab_item_base {
+ boost::intrusive::list_member_hook<> _lru_link;
+
+ template<typename Item>
+ friend class slab_class;
+};
+
+template<typename Item>
+class slab_class {
+private:
+ boost::intrusive::list<slab_page_desc,
+ boost::intrusive::member_hook<slab_page_desc, boost::intrusive::list_member_hook<>,
+ &slab_page_desc::_free_pages_link>> _free_slab_pages;
+ boost::intrusive::list<slab_item_base,
+ boost::intrusive::member_hook<slab_item_base, boost::intrusive::list_member_hook<>,
+ &slab_item_base::_lru_link>> _lru;
+ size_t _size; // size of objects
+ uint8_t _slab_class_id;
+private:
+ template<typename... Args>
+ inline
+ Item* create_item(void *object, uint32_t slab_page_index, Args&&... args) {
+ Item *new_item = new(object) Item(slab_page_index, std::forward<Args>(args)...);
+ _lru.push_front(reinterpret_cast<slab_item_base&>(*new_item));
+ return new_item;
+ }
+
+ inline
+ std::pair<void *, uint32_t> evict_lru_item(std::function<void (Item& item_ref)>& erase_func) {
+ if (_lru.empty()) {
+ return { nullptr, 0U };
+ }
+
+ Item& victim = reinterpret_cast<Item&>(_lru.back());
+ uint32_t index = victim.get_slab_page_index();
+ assert(victim.is_unlocked());
+ _lru.erase(_lru.iterator_to(reinterpret_cast<slab_item_base&>(victim)));
+ // WARNING: You need to make sure that erase_func will not release victim back to slab.
+ erase_func(victim);
+
+ return { reinterpret_cast<void*>(&victim), index };
+ }
+public:
+ slab_class(size_t size, uint8_t slab_class_id)
+ : _size(size)
+ , _slab_class_id(slab_class_id)
+ {
+ }
+ slab_class(slab_class&&) = default;
+ ~slab_class() {
+ _free_slab_pages.clear();
+ _lru.clear();
+ }
+
+ size_t size() const {
+ return _size;
+ }
+
+ bool empty() const {
+ return _free_slab_pages.empty();
+ }
+
+ bool has_no_slab_pages() const {
+ return _lru.empty();
+ }
+
+ template<typename... Args>
+ Item *create(Args&&... args) {
+ assert(!_free_slab_pages.empty());
+ auto& desc = _free_slab_pages.back();
+ auto object = desc.allocate_object();
+ if (desc.empty()) {
+ // if empty, remove desc from the list of slab pages with free objects.
+ _free_slab_pages.erase(_free_slab_pages.iterator_to(desc));
+ }
+
+ return create_item(object, desc.index(), std::forward<Args>(args)...);
+ }
+
+ template<typename... Args>
+ Item *create_from_new_page(uint64_t max_object_size, uint32_t slab_page_index,
+ std::function<void (slab_page_desc& desc)> insert_slab_page_desc,
+ Args&&... args) {
+ // allocate slab page.
+ constexpr size_t alignment = std::alignment_of<Item>::value;
+ void *slab_page = aligned_alloc(alignment, max_object_size);
+ if (!slab_page) {
+ throw std::bad_alloc{};
+ }
+ // allocate descriptor to slab page.
+ slab_page_desc *desc = nullptr;
+ assert(_size % alignment == 0);
+ try {
+ auto objects = max_object_size / _size;
+ desc = new slab_page_desc(slab_page, objects, _size, _slab_class_id, slab_page_index);
+ } catch (const std::bad_alloc& e) {
+ ::free(slab_page);
+ throw std::bad_alloc{};
+ }
+
+ _free_slab_pages.push_front(*desc);
+ insert_slab_page_desc(*desc);
+
+ // first object from the allocated slab page is returned.
+ return create_item(slab_page, slab_page_index, std::forward<Args>(args)...);
+ }
+
+ template<typename... Args>
+ Item *create_from_lru(std::function<void (Item& item_ref)>& erase_func, Args&&... args) {
+ auto ret = evict_lru_item(erase_func);
+ if (!ret.first) {
+ throw std::bad_alloc{};
+ }
+ return create_item(ret.first, ret.second, std::forward<Args>(args)...);
+ }
+
+ void free_item(Item *item, slab_page_desc& desc) {
+ void *object = item;
+ _lru.erase(_lru.iterator_to(reinterpret_cast<slab_item_base&>(*item)));
+ desc.free_object(object);
+ if (desc.size() == 1) {
+ // push back desc into the list of slab pages with free objects.
+ _free_slab_pages.push_back(desc);
+ }
+ }
+
+ void touch_item(Item *item) {
+ auto& item_ref = reinterpret_cast<slab_item_base&>(*item);
+ _lru.erase(_lru.iterator_to(item_ref));
+ _lru.push_front(item_ref);
+ }
+
+ void remove_item_from_lru(Item *item) {
+ auto& item_ref = reinterpret_cast<slab_item_base&>(*item);
+ _lru.erase(_lru.iterator_to(item_ref));
+ }
+
+ void insert_item_into_lru(Item *item) {
+ auto& item_ref = reinterpret_cast<slab_item_base&>(*item);
+ _lru.push_front(item_ref);
+ }
+
+ void remove_desc_from_free_list(slab_page_desc& desc) {
+ assert(desc.slab_class_id() == _slab_class_id);
+ _free_slab_pages.erase(_free_slab_pages.iterator_to(desc));
+ }
+};
+
+template<typename Item>
+class slab_allocator {
+private:
+ std::vector<size_t> _slab_class_sizes;
+ std::vector<slab_class<Item>> _slab_classes;
+ seastar::metrics::metric_groups _metrics;
+ // erase_func() is used to remove the item from the cache using slab.
+ std::function<void (Item& item_ref)> _erase_func;
+ std::vector<slab_page_desc*> _slab_pages_vector;
+ boost::intrusive::list<slab_page_desc,
+ boost::intrusive::member_hook<slab_page_desc, boost::intrusive::list_member_hook<>,
+ &slab_page_desc::_lru_link>> _slab_page_desc_lru;
+ uint64_t _max_object_size;
+ uint64_t _available_slab_pages;
+ struct collectd_stats {
+ uint64_t allocs;
+ uint64_t frees;
+ } _stats;
+ memory::reclaimer *_reclaimer = nullptr;
+ bool _reclaimed = false;
+private:
+ memory::reclaiming_result evict_lru_slab_page() {
+ if (_slab_page_desc_lru.empty()) {
+ // NOTE: Nothing to evict. If this happens, it implies that all
+ // slab pages in the slab are being used at the same time.
+ // That being said, this event is very unlikely to happen.
+ return memory::reclaiming_result::reclaimed_nothing;
+ }
+ // get descriptor of the least-recently-used slab page and related info.
+ auto& desc = _slab_page_desc_lru.back();
+ assert(desc.refcnt() == 0);
+ uint8_t slab_class_id = desc.slab_class_id();
+ auto slab_class = get_slab_class(slab_class_id);
+ void *slab_page = desc.slab_page();
+
+ auto& free_objects = desc.free_objects();
+ if (!desc.empty()) {
+ // if not empty, remove desc from the list of slab pages with free objects.
+ slab_class->remove_desc_from_free_list(desc);
+ // and sort the array of free objects for binary search later on.
+ std::sort(free_objects.begin(), free_objects.end());
+ }
+ // remove desc from the list of slab page descriptors.
+ _slab_page_desc_lru.erase(_slab_page_desc_lru.iterator_to(desc));
+ // remove desc from the slab page vector.
+ _slab_pages_vector[desc.index()] = nullptr;
+
+ // Iterate through objects in the slab page and if the object is an allocated
+ // item, the item should be removed from LRU and then erased.
+ uintptr_t object = reinterpret_cast<uintptr_t>(slab_page);
+ auto object_size = slab_class->size();
+ auto objects = _max_object_size / object_size;
+ for (auto i = 0u; i < objects; i++, object += object_size) {
+ if (!desc.empty()) {
+ // if binary_search() finds the object in the free list, the object
+ // at the current offset is free rather than an allocated item.
+ if (std::binary_search(free_objects.begin(), free_objects.end(), object)) {
+ continue;
+ }
+ }
+ Item* item = reinterpret_cast<Item*>(object);
+ assert(item->is_unlocked());
+ slab_class->remove_item_from_lru(item);
+ _erase_func(*item);
+ _stats.frees++;
+ }
+#ifdef SEASTAR_DEBUG
+ printf("lru slab page eviction succeeded! desc_empty?=%d\n", desc.empty());
+#endif
+ ::free(slab_page); // free slab page object
+ delete &desc; // free its descriptor
+ return memory::reclaiming_result::reclaimed_something;
+ }
+
+ /*
+ * Reclaim the least recently used slab page that is unused.
+ */
+ memory::reclaiming_result reclaim() {
+ // once the reclaimer has run, slab pages should no longer be allocated, as
+ // the memory used by the slab is supposed to have been calibrated.
+ _reclaimed = true;
+ // FIXME: Should reclaim() only evict a single slab page at a time?
+ return evict_lru_slab_page();
+ }
+
+ void initialize_slab_allocator(double growth_factor, uint64_t limit) {
+ constexpr size_t alignment = std::alignment_of<Item>::value;
+ constexpr size_t initial_size = 96;
+ size_t size = initial_size; // initial object size
+ uint8_t slab_class_id = 0U;
+
+ while (_max_object_size / size > 1) {
+ size = align_up(size, alignment);
+ _slab_class_sizes.push_back(size);
+ _slab_classes.emplace_back(size, slab_class_id);
+ size *= growth_factor;
+ assert(slab_class_id < std::numeric_limits<uint8_t>::max());
+ slab_class_id++;
+ }
+ _slab_class_sizes.push_back(_max_object_size);
+ _slab_classes.emplace_back(_max_object_size, slab_class_id);
+
+ // If slab limit is zero, enable reclaimer.
+ if (!limit) {
+ _reclaimer = new memory::reclaimer([this] { return reclaim(); });
+ } else {
+ _slab_pages_vector.reserve(_available_slab_pages);
+ }
+ }
+
+ slab_class<Item>* get_slab_class(const size_t size) {
+ // given a size, find slab class with binary search.
+ auto i = std::lower_bound(_slab_class_sizes.begin(), _slab_class_sizes.end(), size);
+ if (i == _slab_class_sizes.end()) {
+ return nullptr;
+ }
+ auto dist = std::distance(_slab_class_sizes.begin(), i);
+ return &_slab_classes[dist];
+ }
+
+ slab_class<Item>* get_slab_class(const uint8_t slab_class_id) {
+ assert(slab_class_id < _slab_classes.size()); // slab_class_id is unsigned, so only the upper bound needs checking
+ return &_slab_classes[slab_class_id];
+ }
+
+ void register_metrics() {
+ namespace sm = seastar::metrics;
+ _metrics.add_group("slab", {
+ sm::make_derive("malloc_total_operations", sm::description("Total number of slab malloc operations"), _stats.allocs),
+ sm::make_derive("free_total_operations", sm::description("Total number of slab free operations"), _stats.frees),
+ sm::make_gauge("malloc_objects", sm::description("Number of slab created objects currently in memory"), [this] {
+ return _stats.allocs - _stats.frees;
+ })
+ });
+ }
+
+ inline slab_page_desc& get_slab_page_desc(Item *item)
+ {
+ auto desc = _slab_pages_vector[item->get_slab_page_index()];
+ assert(desc != nullptr);
+ assert(desc->magic() == SLAB_MAGIC_NUMBER);
+ return *desc;
+ }
+
+ inline bool can_allocate_page(slab_class<Item>& sc) {
+ return (_reclaimer && !_reclaimed) ||
+ (_available_slab_pages > 0 || sc.has_no_slab_pages());
+ }
+public:
+ slab_allocator(double growth_factor, uint64_t limit, uint64_t max_object_size)
+ : _max_object_size(max_object_size)
+ , _available_slab_pages(limit / max_object_size)
+ {
+ initialize_slab_allocator(growth_factor, limit);
+ register_metrics();
+ }
+
+ slab_allocator(double growth_factor, uint64_t limit, uint64_t max_object_size,
+ std::function<void (Item& item_ref)> erase_func)
+ : _erase_func(std::move(erase_func))
+ , _max_object_size(max_object_size)
+ , _available_slab_pages(limit / max_object_size)
+ {
+ initialize_slab_allocator(growth_factor, limit);
+ register_metrics();
+ }
+
+ ~slab_allocator()
+ {
+ _slab_classes.clear();
+ _slab_page_desc_lru.clear();
+ for (auto desc : _slab_pages_vector) {
+ if (!desc) {
+ continue;
+ }
+ ::free(desc->slab_page());
+ delete desc;
+ }
+ delete _reclaimer;
+ }
+
+ /**
+ * Create an item from a given slab class based on requested size.
+ */
+ template<typename... Args>
+ Item* create(const size_t size, Args&&... args) {
+ auto slab_class = get_slab_class(size);
+ if (!slab_class) {
+ throw std::bad_alloc{};
+ }
+
+ Item *item = nullptr;
+ if (!slab_class->empty()) {
+ item = slab_class->create(std::forward<Args>(args)...);
+ _stats.allocs++;
+ } else {
+ if (can_allocate_page(*slab_class)) {
+ auto index_to_insert = _slab_pages_vector.size();
+ item = slab_class->create_from_new_page(_max_object_size, index_to_insert,
+ [this](slab_page_desc& desc) {
+ if (_reclaimer) {
+ // insert desc into the LRU list of slab page descriptors.
+ _slab_page_desc_lru.push_front(desc);
+ }
+ // insert desc into the slab page vector.
+ _slab_pages_vector.push_back(&desc);
+ },
+ std::forward<Args>(args)...);
+ if (_available_slab_pages > 0) {
+ _available_slab_pages--;
+ }
+ _stats.allocs++;
+ } else if (_erase_func) {
+ item = slab_class->create_from_lru(_erase_func, std::forward<Args>(args)...);
+ }
+ }
+ return item;
+ }
+
+ void lock_item(Item *item) {
+ auto& desc = get_slab_page_desc(item);
+ if (_reclaimer) {
+ auto& refcnt = desc.refcnt();
+
+ if (++refcnt == 1) {
+ // remove slab page descriptor from list of slab page descriptors.
+ _slab_page_desc_lru.erase(_slab_page_desc_lru.iterator_to(desc));
+ }
+ }
+ // remove item from the lru of its slab class.
+ auto slab_class = get_slab_class(desc.slab_class_id());
+ slab_class->remove_item_from_lru(item);
+ }
+
+ void unlock_item(Item *item) {
+ auto& desc = get_slab_page_desc(item);
+ if (_reclaimer) {
+ auto& refcnt = desc.refcnt();
+
+ if (--refcnt == 0) {
+ // insert slab page descriptor back into list of slab page descriptors.
+ _slab_page_desc_lru.push_front(desc);
+ }
+ }
+ // insert item into the lru of its slab class.
+ auto slab_class = get_slab_class(desc.slab_class_id());
+ slab_class->insert_item_into_lru(item);
+ }
+
+ /**
+ * Free an item back to its original slab class.
+ */
+ void free(Item *item) {
+ if (item) {
+ auto& desc = get_slab_page_desc(item);
+ auto slab_class = get_slab_class(desc.slab_class_id());
+ slab_class->free_item(item, desc);
+ _stats.frees++;
+ }
+ }
+
+ /**
+ * Update item position in the LRU of its slab class.
+ */
+ void touch(Item *item) {
+ if (item) {
+ auto& desc = get_slab_page_desc(item);
+ auto slab_class = get_slab_class(desc.slab_class_id());
+ slab_class->touch_item(item);
+ }
+ }
+
+ /**
+ * Helper function: Print all available slab classes and their respective properties.
+ */
+ void print_slab_classes() {
+ auto class_id = 0;
+ for (auto& slab_class : _slab_classes) {
+ size_t size = slab_class.size();
+ printf("slab[%3d]\tsize: %10lu\tper-slab-page: %5lu\n", class_id, size, _max_object_size / size);
+ class_id++;
+ }
+ }
+
+ /**
+ * Helper function: Useful for getting a slab class' chunk size from a size parameter.
+ */
+ size_t class_size(const size_t size) {
+ auto slab_class = get_slab_class(size);
+ return (slab_class) ? slab_class->size() : 0;
+ }
+};
+
+}
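A minimal usage sketch for the allocator above. The names cache_put/cache_remove are illustrative, Item stands for any cache-entry type that satisfies the requirements stated earlier in this header (derives from slab_item_base and exposes get_slab_page_index()/is_unlocked()), and the include path assumes this header installs as <seastar/core/slab.hh>:

    #include <seastar/core/slab.hh>   // assumed install path of this header
    #include <cstddef>
    #include <utility>

    // Allocate an Item sized for payload_size bytes and mark it most recently
    // used. create() throws std::bad_alloc when no slab class is large enough
    // (or when LRU eviction fails), and may return nullptr when no new slab
    // page can be allocated and no erase callback was configured.
    template <typename Item, typename... Args>
    Item* cache_put(seastar::slab_allocator<Item>& slab, size_t payload_size, Args&&... args) {
        Item* it = slab.create(payload_size, std::forward<Args>(args)...);
        if (it) {
            slab.touch(it);   // move to the front of its slab class LRU
        }
        return it;
    }

    // Return an item to its slab class.
    template <typename Item>
    void cache_remove(seastar::slab_allocator<Item>& slab, Item* it) {
        slab.free(it);
    }
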
diff --git a/src/seastar/include/seastar/core/sleep.hh b/src/seastar/include/seastar/core/sleep.hh
new file mode 100644
index 000000000..4d4db0af6
--- /dev/null
+++ b/src/seastar/include/seastar/core/sleep.hh
@@ -0,0 +1,93 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <chrono>
+#include <functional>
+
+#include <seastar/core/abort_source.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/lowres_clock.hh>
+#include <seastar/core/timer.hh>
+
+namespace seastar {
+
+/// \file
+
+/// Returns a future which completes after a specified time has elapsed.
+///
+/// \param dur minimum amount of time before the returned future becomes
+/// ready.
+/// \return A \ref future which becomes ready when the sleep duration elapses.
+template <typename Clock = steady_clock_type, typename Rep, typename Period>
+future<> sleep(std::chrono::duration<Rep, Period> dur) {
+ struct sleeper {
+ promise<> done;
+ timer<Clock> tmr;
+ sleeper(std::chrono::duration<Rep, Period> dur)
+ : tmr([this] { done.set_value(); })
+ {
+ tmr.arm(dur);
+ }
+ };
+ sleeper *s = new sleeper(dur);
+ future<> fut = s->done.get_future();
+ return fut.then([s] { delete s; });
+}
+
+/// Exception that is thrown when the application is in the process of being stopped
+class sleep_aborted : public std::exception {
+public:
+ /// Reports the exception reason.
+ virtual const char* what() const noexcept {
+ return "Sleep is aborted";
+ }
+};
+
+/// Returns a future which completes after a specified time has elapsed
+/// or throws a \ref sleep_aborted exception if the application is aborted
+///
+/// \param dur minimum amount of time before the returned future becomes
+/// ready.
+/// \return A \ref future which becomes ready when the sleep duration elapses.
+template <typename Clock = steady_clock_type>
+future<> sleep_abortable(typename Clock::duration dur);
+
+extern template future<> sleep_abortable<steady_clock_type>(typename steady_clock_type::duration);
+extern template future<> sleep_abortable<lowres_clock>(typename lowres_clock::duration);
+
+/// Returns a future which completes after a specified time has elapsed
+/// or throws a \ref sleep_aborted exception if the sleep is aborted.
+///
+/// \param dur minimum amount of time before the returned future becomes
+/// ready.
+/// \param as the \ref abort_source that eventually notifies that the sleep
+/// should be aborted.
+/// \return A \ref future which becomes ready when the sleep duration elapses.
+template <typename Clock = steady_clock_type>
+future<> sleep_abortable(typename Clock::duration dur, abort_source& as);
+
+extern template future<> sleep_abortable<steady_clock_type>(typename steady_clock_type::duration, abort_source&);
+extern template future<> sleep_abortable<lowres_clock>(typename lowres_clock::duration, abort_source&);
+
+}
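A short usage sketch for the two primitives above (pause_then_work/abortable_pause are illustrative names; both assume they run inside a Seastar reactor):

    #include <seastar/core/sleep.hh>

    using namespace std::chrono_literals;

    // Resolves at least 100ms after being called, without blocking the reactor.
    seastar::future<> pause_then_work() {
        return seastar::sleep(100ms).then([] {
            // continue here once the timer fires
        });
    }

    // Resolves after 10s, or fails with sleep_aborted if as.request_abort()
    // is called first.
    seastar::future<> abortable_pause(seastar::abort_source& as) {
        return seastar::sleep_abortable(10s, as);
    }
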
diff --git a/src/seastar/include/seastar/core/smp.hh b/src/seastar/include/seastar/core/smp.hh
new file mode 100644
index 000000000..2da028770
--- /dev/null
+++ b/src/seastar/include/seastar/core/smp.hh
@@ -0,0 +1,444 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2019 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/loop.hh>
+#include <seastar/core/semaphore.hh>
+#include <seastar/core/metrics.hh>
+#include <seastar/core/posix.hh>
+#include <seastar/core/reactor_config.hh>
+#include <boost/lockfree/spsc_queue.hpp>
+#include <boost/thread/barrier.hpp>
+#include <boost/range/irange.hpp>
+#include <boost/program_options.hpp>
+#include <deque>
+#include <thread>
+
+/// \file
+
+namespace seastar {
+
+using shard_id = unsigned;
+
+class smp_service_group;
+class reactor_backend_selector;
+
+namespace internal {
+
+unsigned smp_service_group_id(smp_service_group ssg) noexcept;
+
+inline shard_id* this_shard_id_ptr() noexcept {
+ static thread_local shard_id g_this_shard_id;
+ return &g_this_shard_id;
+}
+
+}
+
+/// Returns the shard_id of the current shard.
+inline shard_id this_shard_id() noexcept {
+ return *internal::this_shard_id_ptr();
+}
+
+/// Configuration for smp_service_group objects.
+///
+/// \see create_smp_service_group()
+struct smp_service_group_config {
+ /// The maximum number of non-local requests that execute on a shard concurrently
+ ///
+ /// Will be adjusted upwards to allow at least one request per non-local shard.
+ unsigned max_nonlocal_requests = 0;
+ /// An optional name for this smp group
+ ///
+ /// If this optional is engaged, timeout exception messages of the group's
+ /// semaphores will indicate the group's name.
+ std::optional<sstring> group_name;
+};
+
+/// A resource controller for cross-shard calls.
+///
+/// An smp_service_group allows you to limit the concurrency of
+/// smp::submit_to() and similar calls. While it's easy to limit
+/// the caller's concurrency (for example, by using a semaphore),
+/// the concurrency at the remote end can be multiplied by a factor
+/// of smp::count-1, which can be large.
+///
+/// The class is called a service _group_ because it can be used
+/// to group similar calls that share resource usage characteristics,
+/// need not be isolated from each other, but do need to be isolated
+/// from other groups. Calls in a group should not nest; doing so
+/// can result in ABA deadlocks.
+///
+/// Nested submit_to() calls must form a directed acyclic graph
+/// when considering their smp_service_groups as nodes. For example,
+/// if a call using ssg1 then invokes another call using ssg2, the
+/// internal call may not call again via either ssg1 or ssg2, as that
+/// would form a cycle (risking an ABBA deadlock). Create a
+/// new smp_service_group instead.
+class smp_service_group {
+ unsigned _id;
+private:
+ explicit smp_service_group(unsigned id) noexcept : _id(id) {}
+
+ friend unsigned internal::smp_service_group_id(smp_service_group ssg) noexcept;
+ friend smp_service_group default_smp_service_group() noexcept;
+ friend future<smp_service_group> create_smp_service_group(smp_service_group_config ssgc) noexcept;
+};
+
+inline
+unsigned
+internal::smp_service_group_id(smp_service_group ssg) noexcept {
+ return ssg._id;
+}
+
+/// Returns the default smp_service_group. This smp_service_group
+/// does not impose any limits on concurrency in the target shard.
+/// This makes it deadlock-safe, but it can consume unbounded resources,
+/// and should therefore only be used when initiator concurrency is
+/// very low (e.g. administrative tasks).
+smp_service_group default_smp_service_group() noexcept;
+
+/// Creates an smp_service_group with the specified configuration.
+///
+/// The smp_service_group is global, and after this call completes,
+/// the returned value can be used on any shard.
+future<smp_service_group> create_smp_service_group(smp_service_group_config ssgc) noexcept;
+
+/// Destroy an smp_service_group.
+///
+/// Frees all resources used by an smp_service_group. It must not
+/// be used again once this function is called.
+future<> destroy_smp_service_group(smp_service_group ssg) noexcept;
+
+inline
+smp_service_group default_smp_service_group() noexcept {
+ return smp_service_group(0);
+}
+
+using smp_timeout_clock = lowres_clock;
+using smp_service_group_semaphore = basic_semaphore<named_semaphore_exception_factory, smp_timeout_clock>;
+using smp_service_group_semaphore_units = semaphore_units<named_semaphore_exception_factory, smp_timeout_clock>;
+
+static constexpr smp_timeout_clock::time_point smp_no_timeout = smp_timeout_clock::time_point::max();
+
+/// Options controlling the behaviour of \ref smp::submit_to().
+struct smp_submit_to_options {
+ /// Controls resource allocation.
+ smp_service_group service_group = default_smp_service_group();
+ /// The timeout is relevant only to the time the call spends waiting to be
+ /// processed by the remote shard, and *not* to the time it takes to be
+ /// executed there.
+ smp_timeout_clock::time_point timeout = smp_no_timeout;
+
+ smp_submit_to_options(smp_service_group service_group = default_smp_service_group(), smp_timeout_clock::time_point timeout = smp_no_timeout) noexcept
+ : service_group(service_group)
+ , timeout(timeout) {
+ }
+};
+
+void init_default_smp_service_group(shard_id cpu);
+
+smp_service_group_semaphore& get_smp_service_groups_semaphore(unsigned ssg_id, shard_id t) noexcept;
+
+class smp_message_queue {
+ static constexpr size_t queue_length = 128;
+ static constexpr size_t batch_size = 16;
+ static constexpr size_t prefetch_cnt = 2;
+ struct work_item;
+ struct lf_queue_remote {
+ reactor* remote;
+ };
+ using lf_queue_base = boost::lockfree::spsc_queue<work_item*,
+ boost::lockfree::capacity<queue_length>>;
+ // use inheritance to control placement order
+ struct lf_queue : lf_queue_remote, lf_queue_base {
+ lf_queue(reactor* remote) : lf_queue_remote{remote} {}
+ void maybe_wakeup();
+ ~lf_queue();
+ };
+ lf_queue _pending;
+ lf_queue _completed;
+ struct alignas(seastar::cache_line_size) {
+ size_t _sent = 0;
+ size_t _compl = 0;
+ size_t _last_snt_batch = 0;
+ size_t _last_cmpl_batch = 0;
+ size_t _current_queue_length = 0;
+ };
+ // Keep this member between the two statistics structures above and below;
+ // it guarantees at least one cache line between them, so the hardware
+ // prefetcher will not accidentally prefetch a cache line used by another cpu.
+ metrics::metric_groups _metrics;
+ struct alignas(seastar::cache_line_size) {
+ size_t _received = 0;
+ size_t _last_rcv_batch = 0;
+ };
+ struct work_item : public task {
+ explicit work_item(smp_service_group ssg) : task(current_scheduling_group()), ssg(ssg) {}
+ smp_service_group ssg;
+ virtual ~work_item() {}
+ virtual void fail_with(std::exception_ptr) = 0;
+ void process();
+ virtual void complete() = 0;
+ };
+ template <typename Func>
+ struct async_work_item : work_item {
+ smp_message_queue& _queue;
+ Func _func;
+ using futurator = futurize<std::result_of_t<Func()>>;
+ using future_type = typename futurator::type;
+ using value_type = typename future_type::value_type;
+ std::optional<value_type> _result;
+ std::exception_ptr _ex; // if !_result
+ typename futurator::promise_type _promise; // used on local side
+ async_work_item(smp_message_queue& queue, smp_service_group ssg, Func&& func) : work_item(ssg), _queue(queue), _func(std::move(func)) {}
+ virtual void fail_with(std::exception_ptr ex) override {
+ _promise.set_exception(std::move(ex));
+ }
+ virtual task* waiting_task() noexcept override {
+ // FIXME: waiting_task() across shards is not implemented. Unsynchronized task access is unsafe.
+ return nullptr;
+ }
+ virtual void run_and_dispose() noexcept override {
+ // _queue.respond() below forwards the continuation chain back to the
+ // calling shard.
+ (void)futurator::invoke(this->_func).then_wrapped([this] (auto f) {
+ if (f.failed()) {
+ _ex = f.get_exception();
+ } else {
+ _result = f.get();
+ }
+ _queue.respond(this);
+ });
+ // We don't delete the task here as the creator of the work item will
+ // delete it on the origin shard.
+ }
+ virtual void complete() override {
+ if (_result) {
+ _promise.set_value(std::move(*_result));
+ } else {
+ // FIXME: _ex was allocated on another cpu
+ _promise.set_exception(std::move(_ex));
+ }
+ }
+ future_type get_future() { return _promise.get_future(); }
+ };
+ union tx_side {
+ tx_side() {}
+ ~tx_side() {}
+ void init() { new (&a) aa; }
+ struct aa {
+ std::deque<work_item*> pending_fifo;
+ } a;
+ } _tx;
+ std::vector<work_item*> _completed_fifo;
+public:
+ smp_message_queue(reactor* from, reactor* to);
+ ~smp_message_queue();
+ template <typename Func>
+ futurize_t<std::result_of_t<Func()>> submit(shard_id t, smp_submit_to_options options, Func&& func) noexcept {
+ memory::scoped_critical_alloc_section _;
+ auto wi = std::make_unique<async_work_item<Func>>(*this, options.service_group, std::forward<Func>(func));
+ auto fut = wi->get_future();
+ submit_item(t, options.timeout, std::move(wi));
+ return fut;
+ }
+ void start(unsigned cpuid);
+ template<size_t PrefetchCnt, typename Func>
+ size_t process_queue(lf_queue& q, Func process);
+ size_t process_incoming();
+ size_t process_completions(shard_id t);
+ void stop();
+private:
+ void work();
+ void submit_item(shard_id t, smp_timeout_clock::time_point timeout, std::unique_ptr<work_item> wi);
+ void respond(work_item* wi);
+ void move_pending();
+ void flush_request_batch();
+ void flush_response_batch();
+ bool has_unflushed_responses() const;
+ bool pure_poll_rx() const;
+ bool pure_poll_tx() const;
+
+ friend class smp;
+};
+
+class smp {
+ static std::vector<posix_thread> _threads;
+ static std::vector<std::function<void ()>> _thread_loops; // for dpdk
+ static std::optional<boost::barrier> _all_event_loops_done;
+ static std::vector<reactor*> _reactors;
+ struct qs_deleter {
+ void operator()(smp_message_queue** qs) const;
+ };
+ static std::unique_ptr<smp_message_queue*[], qs_deleter> _qs;
+ static std::thread::id _tmain;
+ static bool _using_dpdk;
+
+ template <typename Func>
+ using returns_future = is_future<std::result_of_t<Func()>>;
+ template <typename Func>
+ using returns_void = std::is_same<std::result_of_t<Func()>, void>;
+public:
+ static boost::program_options::options_description get_options_description();
+ static void register_network_stacks();
+ static void configure(boost::program_options::variables_map vm, reactor_config cfg = {});
+ static void cleanup();
+ static void cleanup_cpu();
+ static void arrive_at_event_loop_end();
+ static void join_all();
+ static bool main_thread() { return std::this_thread::get_id() == _tmain; }
+
+ /// Runs a function on a remote core.
+ ///
+ /// \param t designates the core to run the function on (may be a remote
+ /// core or the local core).
+ /// \param options an \ref smp_submit_to_options that contains options for this call.
+ /// \param func a callable to run on core \c t.
+ /// If \c func is a temporary object, its lifetime will be
+ /// extended by moving. This movement and the eventual
+ /// destruction of func are both done in the _calling_ core.
+ /// If \c func is a reference, the caller must guarantee that
+ /// it will survive the call.
+ /// \return whatever \c func returns, as a future<> (if \c func does not return a future,
+ /// submit_to() will wrap it in a future<>).
+ template <typename Func>
+ static futurize_t<std::result_of_t<Func()>> submit_to(unsigned t, smp_submit_to_options options, Func&& func) noexcept {
+ using ret_type = std::result_of_t<Func()>;
+ if (t == this_shard_id()) {
+ try {
+ if (!is_future<ret_type>::value) {
+ // Non-deferring function, so don't worry about func lifetime
+ return futurize<ret_type>::invoke(std::forward<Func>(func));
+ } else if (std::is_lvalue_reference<Func>::value) {
+ // func is an lvalue, so caller worries about its lifetime
+ return futurize<ret_type>::invoke(func);
+ } else {
+ // Deferring call on rvalue function, make sure to preserve it across call
+ auto w = std::make_unique<std::decay_t<Func>>(std::move(func));
+ auto ret = futurize<ret_type>::invoke(*w);
+ return ret.finally([w = std::move(w)] {});
+ }
+ } catch (...) {
+ // Consistently return a failed future rather than throwing, to simplify callers
+ return futurize<std::result_of_t<Func()>>::make_exception_future(std::current_exception());
+ }
+ } else {
+ return _qs[t][this_shard_id()].submit(t, options, std::forward<Func>(func));
+ }
+ }
+ /// Runs a function on a remote core.
+ ///
+ /// Uses default_smp_service_group() to control resource allocation.
+ ///
+ /// \param t designates the core to run the function on (may be a remote
+ /// core or the local core).
+ /// \param func a callable to run on core \c t.
+ /// If \c func is a temporary object, its lifetime will be
+ /// extended by moving. This movement and the eventual
+ /// destruction of func are both done in the _calling_ core.
+ /// If \c func is a reference, the caller must guarantee that
+ /// it will survive the call.
+ /// \return whatever \c func returns, as a future<> (if \c func does not return a future,
+ /// submit_to() will wrap it in a future<>).
+ template <typename Func>
+ static futurize_t<std::result_of_t<Func()>> submit_to(unsigned t, Func&& func) noexcept {
+ return submit_to(t, default_smp_service_group(), std::forward<Func>(func));
+ }
+ static bool poll_queues();
+ static bool pure_poll_queues();
+ static boost::integer_range<unsigned> all_cpus() noexcept {
+ return boost::irange(0u, count);
+ }
+ /// Invokes func on all shards.
+ ///
+ /// \param options the options to forward to the \ref smp::submit_to()
+ /// called behind the scenes.
+ /// \param func the function to be invoked on each shard. May return void or
+ /// future<>. Each async invocation will work with a separate copy
+ /// of \c func.
+ /// \returns a future that resolves when all async invocations finish.
+ template<typename Func>
+ SEASTAR_CONCEPT( requires std::is_nothrow_move_constructible_v<Func> )
+ static future<> invoke_on_all(smp_submit_to_options options, Func&& func) noexcept {
+ static_assert(std::is_same<future<>, typename futurize<std::result_of_t<Func()>>::type>::value, "bad Func signature");
+ static_assert(std::is_nothrow_move_constructible_v<Func>);
+ return parallel_for_each(all_cpus(), [options, &func] (unsigned id) {
+ return smp::submit_to(id, options, Func(func));
+ });
+ }
+ /// Invokes func on all shards.
+ ///
+ /// \param func the function to be invoked on each shard. May return void or
+ /// future<>. Each async invocation will work with a separate copy
+ /// of \c func.
+ /// \returns a future that resolves when all async invocations finish.
+ ///
+ /// Passes the default \ref smp_submit_to_options to the
+ /// \ref smp::submit_to() called behind the scenes.
+ template<typename Func>
+ static future<> invoke_on_all(Func&& func) noexcept {
+ return invoke_on_all(smp_submit_to_options{}, std::forward<Func>(func));
+ }
+ /// Invokes func on all other shards.
+ ///
+ /// \param cpu_id the cpu on which **not** to run the function.
+ /// \param options the options to forward to the \ref smp::submit_to()
+ /// called behind the scenes.
+ /// \param func the function to be invoked on each shard. May return void or
+ /// future<>. Each async invocation will work with a separate copy
+ /// of \c func.
+ /// \returns a future that resolves when all async invocations finish.
+ template<typename Func>
+ SEASTAR_CONCEPT( requires std::is_nothrow_move_constructible_v<Func> )
+ static future<> invoke_on_others(unsigned cpu_id, smp_submit_to_options options, Func func) noexcept {
+ static_assert(std::is_same<future<>, typename futurize<std::result_of_t<Func()>>::type>::value, "bad Func signature");
+ static_assert(std::is_nothrow_move_constructible_v<Func>);
+ return parallel_for_each(all_cpus(), [cpu_id, options, func = std::move(func)] (unsigned id) {
+ return id != cpu_id ? smp::submit_to(id, options, func) : make_ready_future<>();
+ });
+ }
+ /// Invokes func on all other shards.
+ ///
+ /// \param cpu_id the cpu on which **not** to run the function.
+ /// \param func the function to be invoked on each shard. May return void or
+ /// future<>. Each async invocation will work with a separate copy
+ /// of \c func.
+ /// \returns a future that resolves when all async invocations finish.
+ ///
+ /// Passes the default \ref smp_submit_to_options to the
+ /// \ref smp::submit_to() called behind the scenes.
+ template<typename Func>
+ static future<> invoke_on_others(unsigned cpu_id, Func func) noexcept {
+ return invoke_on_others(cpu_id, smp_submit_to_options{}, std::move(func));
+ }
+private:
+ static void start_all_queues();
+ static void pin(unsigned cpu_id);
+ static void allocate_reactor(unsigned id, reactor_backend_selector rbs, reactor_config cfg);
+ static void create_thread(std::function<void ()> thread_loop);
+public:
+ static unsigned count;
+};
+
+}
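A usage sketch for the API above (illustrative function names; assumes a running reactor, and for the first function at least two shards):

    #include <seastar/core/smp.hh>

    // Runs the lambda on shard 1; the result is shipped back to the calling
    // shard wrapped in a future<int>.
    seastar::future<int> add_on_shard_one(int a, int b) {
        return seastar::smp::submit_to(1, [a, b] { return a + b; });
    }

    // Each shard receives and runs its own copy of the callable.
    seastar::future<> init_all_shards() {
        return seastar::smp::invoke_on_all([] {
            // per-shard initialization goes here
        });
    }

    // Bounds the concurrency of a class of cross-shard calls with a dedicated
    // smp_service_group, passed through smp_submit_to_options.
    seastar::future<> limited_call(unsigned target) {
        seastar::smp_service_group_config cfg;
        cfg.max_nonlocal_requests = 64;
        return seastar::create_smp_service_group(cfg).then([target] (seastar::smp_service_group ssg) {
            return seastar::smp::submit_to(target, seastar::smp_submit_to_options(ssg), [] {
                // remote work governed by ssg
            }).finally([ssg] {
                return seastar::destroy_smp_service_group(ssg);
            });
        });
    }
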
diff --git a/src/seastar/include/seastar/core/sstring.hh b/src/seastar/include/seastar/core/sstring.hh
new file mode 100644
index 000000000..781f30f57
--- /dev/null
+++ b/src/seastar/include/seastar/core/sstring.hh
@@ -0,0 +1,779 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2014 Cloudius Systems
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <algorithm>
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <cstring>
+#include <stdexcept>
+#include <initializer_list>
+#include <istream>
+#include <ostream>
+#include <functional>
+#include <cstdio>
+#include <type_traits>
+#include <seastar/util/std-compat.hh>
+#include <seastar/core/temporary_buffer.hh>
+
+namespace seastar {
+
+template <typename char_type, typename Size, Size max_size, bool NulTerminate = true>
+class basic_sstring;
+
+#ifdef SEASTAR_SSTRING
+using sstring = basic_sstring<char, uint32_t, 15>;
+#else
+using sstring = std::string;
+#endif
+
+namespace internal {
+[[noreturn]] void throw_bad_alloc();
+[[noreturn]] void throw_sstring_overflow();
+[[noreturn]] void throw_sstring_out_of_range();
+}
+
+template <typename char_type, typename Size, Size max_size, bool NulTerminate>
+class basic_sstring {
+ static_assert(
+ (std::is_same<char_type, char>::value
+ || std::is_same<char_type, signed char>::value
+ || std::is_same<char_type, unsigned char>::value),
+ "basic_sstring only supports single byte char types");
+ union contents {
+ struct external_type {
+ char_type* str;
+ Size size;
+ int8_t pad;
+ } external;
+ struct internal_type {
+ char_type str[max_size];
+ int8_t size;
+ } internal;
+ static_assert(sizeof(external_type) <= sizeof(internal_type), "max_size too small");
+ static_assert(max_size <= 127, "max_size too large");
+ } u;
+ bool is_internal() const noexcept {
+ return u.internal.size >= 0;
+ }
+ bool is_external() const noexcept {
+ return !is_internal();
+ }
+ const char_type* str() const noexcept {
+ return is_internal() ? u.internal.str : u.external.str;
+ }
+ char_type* str() noexcept {
+ return is_internal() ? u.internal.str : u.external.str;
+ }
+
+public:
+ using value_type = char_type;
+ using traits_type = std::char_traits<char_type>;
+ using allocator_type = std::allocator<char_type>;
+ using reference = char_type&;
+ using const_reference = const char_type&;
+ using pointer = char_type*;
+ using const_pointer = const char_type*;
+ using iterator = char_type*;
+ using const_iterator = const char_type*;
+ // FIXME: add reverse_iterator and friend
+ using difference_type = ssize_t; // std::make_signed_t<Size> can be too small
+ using size_type = Size;
+ static constexpr size_type npos = static_cast<size_type>(-1);
+ static constexpr unsigned padding() { return unsigned(NulTerminate); }
+public:
+ struct initialized_later {};
+
+ basic_sstring() noexcept {
+ u.internal.size = 0;
+ if (NulTerminate) {
+ u.internal.str[0] = '\0';
+ }
+ }
+ basic_sstring(const basic_sstring& x) {
+ if (x.is_internal()) {
+ u.internal = x.u.internal;
+ } else {
+ u.internal.size = -1;
+ u.external.str = reinterpret_cast<char_type*>(std::malloc(x.u.external.size + padding()));
+ if (!u.external.str) {
+ internal::throw_bad_alloc();
+ }
+ std::copy(x.u.external.str, x.u.external.str + x.u.external.size + padding(), u.external.str);
+ u.external.size = x.u.external.size;
+ }
+ }
+ basic_sstring(basic_sstring&& x) noexcept {
+#pragma GCC diagnostic push
+ // If a small-string construction is followed by this move constructor, the trailing bytes
+ // of x.u are not initialized, but are copied anyway. gcc complains, but it is both legitimate
+ // to copy these bytes and more efficient than a variable-size copy
+#pragma GCC diagnostic ignored "-Wuninitialized"
+ u = x.u;
+#pragma GCC diagnostic pop
+ x.u.internal.size = 0;
+ x.u.internal.str[0] = '\0';
+ }
+ basic_sstring(initialized_later, size_t size) {
+ if (size_type(size) != size) {
+ internal::throw_sstring_overflow();
+ }
+ if (size + padding() <= sizeof(u.internal.str)) {
+ if (NulTerminate) {
+ u.internal.str[size] = '\0';
+ }
+ u.internal.size = size;
+ } else {
+ u.internal.size = -1;
+ u.external.str = reinterpret_cast<char_type*>(std::malloc(size + padding()));
+ if (!u.external.str) {
+ internal::throw_bad_alloc();
+ }
+ u.external.size = size;
+ if (NulTerminate) {
+ u.external.str[size] = '\0';
+ }
+ }
+ }
+ basic_sstring(const char_type* x, size_t size) {
+ if (size_type(size) != size) {
+ internal::throw_sstring_overflow();
+ }
+ if (size + padding() <= sizeof(u.internal.str)) {
+ std::copy(x, x + size, u.internal.str);
+ if (NulTerminate) {
+ u.internal.str[size] = '\0';
+ }
+ u.internal.size = size;
+ } else {
+ u.internal.size = -1;
+ u.external.str = reinterpret_cast<char_type*>(std::malloc(size + padding()));
+ if (!u.external.str) {
+ internal::throw_bad_alloc();
+ }
+ u.external.size = size;
+ std::copy(x, x + size, u.external.str);
+ if (NulTerminate) {
+ u.external.str[size] = '\0';
+ }
+ }
+ }
+
+ basic_sstring(size_t size, char_type x) : basic_sstring(initialized_later(), size) {
+ memset(begin(), x, size);
+ }
+
+ basic_sstring(const char* x) : basic_sstring(reinterpret_cast<const char_type*>(x), std::strlen(x)) {}
+ basic_sstring(std::basic_string<char_type>& x) : basic_sstring(x.c_str(), x.size()) {}
+ basic_sstring(std::initializer_list<char_type> x) : basic_sstring(x.begin(), x.end() - x.begin()) {}
+ basic_sstring(const char_type* b, const char_type* e) : basic_sstring(b, e - b) {}
+ basic_sstring(const std::basic_string<char_type>& s)
+ : basic_sstring(s.data(), s.size()) {}
+ template <typename InputIterator>
+ basic_sstring(InputIterator first, InputIterator last)
+ : basic_sstring(initialized_later(), std::distance(first, last)) {
+ std::copy(first, last, begin());
+ }
+ explicit basic_sstring(std::basic_string_view<char_type, traits_type> v)
+ : basic_sstring(v.data(), v.size()) {
+ }
+ ~basic_sstring() noexcept {
+ if (is_external()) {
+ std::free(u.external.str);
+ }
+ }
+ basic_sstring& operator=(const basic_sstring& x) {
+ basic_sstring tmp(x);
+ swap(tmp);
+ return *this;
+ }
+ basic_sstring& operator=(basic_sstring&& x) noexcept {
+ if (this != &x) {
+ this->~basic_sstring();
+ new (this) basic_sstring(std::move(x));
+ }
+ return *this;
+ }
+ operator std::basic_string<char_type>() const {
+ return { str(), size() };
+ }
+
+ size_t size() const noexcept {
+ return is_internal() ? u.internal.size : u.external.size;
+ }
+
+ size_t length() const noexcept {
+ return size();
+ }
+
+ size_t find(char_type t, size_t pos = 0) const noexcept {
+ const char_type* it = str() + pos;
+ const char_type* end = str() + size();
+ while (it < end) {
+ if (*it == t) {
+ return it - str();
+ }
+ it++;
+ }
+ return npos;
+ }
+
+ size_t find(const basic_sstring& s, size_t pos = 0) const noexcept {
+ const char_type* it = str() + pos;
+ const char_type* end = str() + size();
+ const char_type* c_str = s.str();
+ const char_type* c_str_end = s.str() + s.size();
+
+ while (it < end) {
+ auto i = it;
+ auto j = c_str;
+ while ( i < end && j < c_str_end && *i == *j) {
+ i++;
+ j++;
+ }
+ if (j == c_str_end) {
+ return it - str();
+ }
+ it++;
+ }
+ return npos;
+ }
+
+ /**
+ * find_last_of finds the last occurrence of c in the string.
+ * When pos is specified, the search only includes characters
+ * at or before position pos.
+ *
+ */
+ size_t find_last_of (char_type c, size_t pos = npos) const noexcept {
+ const char_type* str_start = str();
+ if (size()) {
+ if (pos >= size()) {
+ pos = size() - 1;
+ }
+ const char_type* p = str_start + pos + 1;
+ do {
+ p--;
+ if (*p == c) {
+ return (p - str_start);
+ }
+ } while (p != str_start);
+ }
+ return npos;
+ }
+
+ /**
+ * Append a C substring.
+ * @param s The C string to append.
+ * @param n The number of characters to append.
+ * @return Reference to this string.
+ */
+ basic_sstring& append (const char_type* s, size_t n) {
+ basic_sstring ret(initialized_later(), size() + n);
+ std::copy(begin(), end(), ret.begin());
+ std::copy(s, s + n, ret.begin() + size());
+ *this = std::move(ret);
+ return *this;
+ }
+
+ /**
+ * Resize string.
+ * @param n new size.
+ * @param c if n is greater than the current size, the character to fill the newly allocated space with.
+ */
+ void resize(size_t n, const char_type c = '\0') {
+ if (n > size()) {
+ *this += basic_sstring(n - size(), c);
+ } else if (n < size()) {
+ if (is_internal()) {
+ u.internal.size = n;
+ if (NulTerminate) {
+ u.internal.str[n] = '\0';
+ }
+ } else if (n + padding() <= sizeof(u.internal.str)) {
+ *this = basic_sstring(u.external.str, n);
+ } else {
+ u.external.size = n;
+ if (NulTerminate) {
+ u.external.str[n] = '\0';
+ }
+ }
+ }
+ }
+
+ /**
+ * Replace characters with the contents of a C-style substring.
+ *
+ */
+ basic_sstring& replace(size_type pos, size_type n1, const char_type* s,
+ size_type n2) {
+ if (pos > size()) {
+ internal::throw_sstring_out_of_range();
+ }
+
+ if (n1 > size() - pos) {
+ n1 = size() - pos;
+ }
+
+ if (n1 == n2) {
+ if (n2) {
+ std::copy(s, s + n2, begin() + pos);
+ }
+ return *this;
+ }
+ basic_sstring ret(initialized_later(), size() + n2 - n1);
+ char_type* p = ret.begin();
+ std::copy(begin(), begin() + pos, p);
+ p += pos;
+ if (n2) {
+ std::copy(s, s + n2, p);
+ }
+ p += n2;
+ std::copy(begin() + pos + n1, end(), p);
+ *this = std::move(ret);
+ return *this;
+ }
+
+ template <class InputIterator>
+ basic_sstring& replace (const_iterator i1, const_iterator i2,
+ InputIterator first, InputIterator last) {
+ if (i1 < begin() || i1 > end() || i2 < begin()) {
+ internal::throw_sstring_out_of_range();
+ }
+ if (i2 > end()) {
+ i2 = end();
+ }
+
+ if (i2 - i1 == last - first) {
+ //in place replacement
+ std::copy(first, last, const_cast<char_type*>(i1));
+ return *this;
+ }
+ basic_sstring ret(initialized_later(), size() + (last - first) - (i2 - i1));
+ char_type* p = ret.begin();
+ p = std::copy(cbegin(), i1, p);
+ p = std::copy(first, last, p);
+ std::copy(i2, cend(), p);
+ *this = std::move(ret);
+ return *this;
+ }
+
+ iterator erase(iterator first, iterator last) {
+ size_t pos = first - begin();
+ replace(pos, last - first, nullptr, 0);
+ return begin() + pos;
+ }
+
+ /**
+ * Inserts additional characters into the string right before
+ * the character indicated by p.
+ */
+ template <class InputIterator>
+ void insert(const_iterator p, InputIterator beg, InputIterator end) {
+ replace(p, p, beg, end);
+ }
+
+ /**
+ * Returns a read/write reference to the data at the last
+ * element of the string.
+ * This function shall not be called on empty strings.
+ */
+ reference
+ back() noexcept {
+ return operator[](size() - 1);
+ }
+
+ /**
+ * Returns a read-only (constant) reference to the data at the last
+ * element of the string.
+ * This function shall not be called on empty strings.
+ */
+ const_reference
+ back() const noexcept {
+ return operator[](size() - 1);
+ }
+
+ basic_sstring substr(size_t from, size_t len = npos) const {
+ if (from > size()) {
+ internal::throw_sstring_out_of_range();
+ }
+ if (len > size() - from) {
+ len = size() - from;
+ }
+ if (len == 0) {
+ return "";
+ }
+ return { str() + from , len };
+ }
+
+ const char_type& at(size_t pos) const {
+ if (pos >= size()) {
+ internal::throw_sstring_out_of_range();
+ }
+ return *(str() + pos);
+ }
+
+ char_type& at(size_t pos) {
+ if (pos >= size()) {
+ internal::throw_sstring_out_of_range();
+ }
+ return *(str() + pos);
+ }
+
+ bool empty() const noexcept {
+ return u.internal.size == 0;
+ }
+
+ // Deprecated March 2020.
+ [[deprecated("Use = {}")]]
+ void reset() noexcept {
+ if (is_external()) {
+ std::free(u.external.str);
+ }
+ u.internal.size = 0;
+ if (NulTerminate) {
+ u.internal.str[0] = '\0';
+ }
+ }
+ temporary_buffer<char_type> release() && {
+ if (is_external()) {
+ auto ptr = u.external.str;
+ auto size = u.external.size;
+ u.external.str = nullptr;
+ u.external.size = 0;
+ return temporary_buffer<char_type>(ptr, size, make_free_deleter(ptr));
+ } else {
+ auto buf = temporary_buffer<char_type>(u.internal.size);
+ std::copy(u.internal.str, u.internal.str + u.internal.size, buf.get_write());
+ u.internal.size = 0;
+ if (NulTerminate) {
+ u.internal.str[0] = '\0';
+ }
+ return buf;
+ }
+ }
+ int compare(std::basic_string_view<char_type, traits_type> x) const noexcept {
+ auto n = traits_type::compare(begin(), x.begin(), std::min(size(), x.size()));
+ if (n != 0) {
+ return n;
+ }
+ if (size() < x.size()) {
+ return -1;
+ } else if (size() > x.size()) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+
+ int compare(size_t pos, size_t sz, std::basic_string_view<char_type, traits_type> x) const {
+ if (pos > size()) {
+ internal::throw_sstring_out_of_range();
+ }
+
+ sz = std::min(size() - pos, sz);
+ auto n = traits_type::compare(begin() + pos, x.begin(), std::min(sz, x.size()));
+ if (n != 0) {
+ return n;
+ }
+ if (sz < x.size()) {
+ return -1;
+ } else if (sz > x.size()) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+
+ void swap(basic_sstring& x) noexcept {
+ contents tmp;
+ tmp = x.u;
+ x.u = u;
+ u = tmp;
+ }
+ char_type* data() noexcept {
+ return str();
+ }
+ const char_type* data() const noexcept {
+ return str();
+ }
+ const char_type* c_str() const noexcept {
+ return str();
+ }
+ const char_type* begin() const noexcept { return str(); }
+ const char_type* end() const noexcept { return str() + size(); }
+ const char_type* cbegin() const noexcept { return str(); }
+ const char_type* cend() const noexcept { return str() + size(); }
+ char_type* begin() noexcept { return str(); }
+ char_type* end() noexcept { return str() + size(); }
+ bool operator==(const basic_sstring& x) const noexcept {
+ return size() == x.size() && std::equal(begin(), end(), x.begin());
+ }
+ bool operator!=(const basic_sstring& x) const noexcept {
+ return !operator==(x);
+ }
+ bool operator<(const basic_sstring& x) const noexcept {
+ return compare(x) < 0;
+ }
+ basic_sstring operator+(const basic_sstring& x) const {
+ basic_sstring ret(initialized_later(), size() + x.size());
+ std::copy(begin(), end(), ret.begin());
+ std::copy(x.begin(), x.end(), ret.begin() + size());
+ return ret;
+ }
+ basic_sstring& operator+=(const basic_sstring& x) {
+ return *this = *this + x;
+ }
+ char_type& operator[](size_type pos) noexcept {
+ return str()[pos];
+ }
+ const char_type& operator[](size_type pos) const noexcept {
+ return str()[pos];
+ }
+
+ operator std::basic_string_view<char_type>() const noexcept {
+ // we assume that std::basic_string_view<char_type>(str(), size())
+ // won't throw, although it is not specified as noexcept in
+ // https://en.cppreference.com/w/cpp/string/basic_string_view/basic_string_view
+ // at this time (C++20).
+ //
+ // This is similar to std::string operator std::basic_string_view:
+ // https://en.cppreference.com/w/cpp/string/basic_string/operator_basic_string_view
+ // that is specified as noexcept too.
+ static_assert(noexcept(std::basic_string_view<char_type>(str(), size())));
+ return std::basic_string_view<char_type>(str(), size());
+ }
+};
+template <typename char_type, typename Size, Size max_size, bool NulTerminate>
+constexpr Size basic_sstring<char_type, Size, max_size, NulTerminate>::npos;
+
+template <typename string_type = sstring>
+string_type uninitialized_string(size_t size) {
+ string_type ret;
+ // FIXME: use __resize_default_init if available
+ ret.resize(size);
+ return ret;
+}
+
+template <typename char_type, typename Size, Size max_size, bool NulTerminate>
+basic_sstring<char_type, Size, max_size, NulTerminate> uninitialized_string(size_t size) {
+ using sstring_type = basic_sstring<char_type, Size, max_size, NulTerminate>;
+ return sstring_type(sstring_type::initialized_later(), size);
+}
+
+template <typename char_type, typename size_type, size_type Max, size_type N, bool NulTerminate>
+inline
+basic_sstring<char_type, size_type, Max, NulTerminate>
+operator+(const char(&s)[N], const basic_sstring<char_type, size_type, Max, NulTerminate>& t) {
+ using sstring = basic_sstring<char_type, size_type, Max, NulTerminate>;
+ // don't copy the terminating NUL character
+ sstring ret(typename sstring::initialized_later(), N-1 + t.size());
+ auto p = std::copy(std::begin(s), std::end(s)-1, ret.begin());
+ std::copy(t.begin(), t.end(), p);
+ return ret;
+}
+
+static inline
+size_t str_len() {
+ return 0;
+}
+
+template <typename First, typename... Tail>
+static inline
+size_t str_len(const First& first, const Tail&... tail) {
+ return std::string_view(first).size() + str_len(tail...);
+}
+
+template <typename char_type, typename size_type, size_type max_size>
+inline
+void swap(basic_sstring<char_type, size_type, max_size>& x,
+ basic_sstring<char_type, size_type, max_size>& y) noexcept
+{
+ return x.swap(y);
+}
+
+template <typename char_type, typename size_type, size_type max_size, bool NulTerminate, typename char_traits>
+inline
+std::basic_ostream<char_type, char_traits>&
+operator<<(std::basic_ostream<char_type, char_traits>& os,
+ const basic_sstring<char_type, size_type, max_size, NulTerminate>& s) {
+ return os.write(s.begin(), s.size());
+}
+
+template <typename char_type, typename size_type, size_type max_size, bool NulTerminate, typename char_traits>
+inline
+std::basic_istream<char_type, char_traits>&
+operator>>(std::basic_istream<char_type, char_traits>& is,
+ basic_sstring<char_type, size_type, max_size, NulTerminate>& s) {
+ std::string tmp;
+ is >> tmp;
+ s = tmp;
+ return is;
+}
+
+}
+
+namespace std {
+
+template <typename char_type, typename size_type, size_type max_size, bool NulTerminate>
+struct hash<seastar::basic_sstring<char_type, size_type, max_size, NulTerminate>> {
+ size_t operator()(const seastar::basic_sstring<char_type, size_type, max_size, NulTerminate>& s) const {
+ return std::hash<std::basic_string_view<char_type>>()(s);
+ }
+};
+
+}
+
+namespace seastar {
+
+static inline
+char* copy_str_to(char* dst) {
+ return dst;
+}
+
+template <typename Head, typename... Tail>
+static inline
+char* copy_str_to(char* dst, const Head& head, const Tail&... tail) {
+ std::string_view v(head);
+ return copy_str_to(std::copy(v.begin(), v.end(), dst), tail...);
+}
+
+template <typename String = sstring, typename... Args>
+static String make_sstring(Args&&... args)
+{
+ String ret = uninitialized_string<String>(str_len(args...));
+ copy_str_to(ret.data(), args...);
+ return ret;
+}
+
+namespace internal {
+template <typename string_type, typename T>
+string_type to_sstring_sprintf(T value, const char* fmt) {
+ char tmp[sizeof(value) * 3 + 2];
+ auto len = std::sprintf(tmp, fmt, value);
+ using ch_type = typename string_type::value_type;
+ return string_type(reinterpret_cast<ch_type*>(tmp), len);
+}
+
+template <typename string_type>
+string_type to_sstring(int value) {
+ return to_sstring_sprintf<string_type>(value, "%d");
+}
+
+template <typename string_type>
+string_type to_sstring(unsigned value) {
+ return to_sstring_sprintf<string_type>(value, "%u");
+}
+
+template <typename string_type>
+string_type to_sstring(long value) {
+ return to_sstring_sprintf<string_type>(value, "%ld");
+}
+
+template <typename string_type>
+string_type to_sstring(unsigned long value) {
+ return to_sstring_sprintf<string_type>(value, "%lu");
+}
+
+template <typename string_type>
+string_type to_sstring(long long value) {
+ return to_sstring_sprintf<string_type>(value, "%lld");
+}
+
+template <typename string_type>
+string_type to_sstring(unsigned long long value) {
+ return to_sstring_sprintf<string_type>(value, "%llu");
+}
+
+template <typename string_type>
+string_type to_sstring(float value) {
+ return to_sstring_sprintf<string_type>(value, "%g");
+}
+
+template <typename string_type>
+string_type to_sstring(double value) {
+ return to_sstring_sprintf<string_type>(value, "%g");
+}
+
+template <typename string_type>
+string_type to_sstring(long double value) {
+ return to_sstring_sprintf<string_type>(value, "%Lg");
+}
+
+template <typename string_type>
+string_type to_sstring(const char* value) {
+ return string_type(value);
+}
+
+template <typename string_type>
+string_type to_sstring(sstring value) {
+ return value;
+}
+
+template <typename string_type>
+string_type to_sstring(const temporary_buffer<char>& buf) {
+ return string_type(buf.get(), buf.size());
+}
+}
+
+template <typename string_type = sstring, typename T>
+string_type to_sstring(T value) {
+ return internal::to_sstring<string_type>(value);
+}
+}
+
+namespace std {
+template <typename T>
+inline
+std::ostream& operator<<(std::ostream& os, const std::vector<T>& v) {
+ bool first = true;
+ os << "{";
+ for (auto&& elem : v) {
+ if (!first) {
+ os << ", ";
+ } else {
+ first = false;
+ }
+ os << elem;
+ }
+ os << "}";
+ return os;
+}
+
+template <typename Key, typename T, typename Hash, typename KeyEqual, typename Allocator>
+std::ostream& operator<<(std::ostream& os, const std::unordered_map<Key, T, Hash, KeyEqual, Allocator>& v) {
+ bool first = true;
+ os << "{";
+ for (auto&& elem : v) {
+ if (!first) {
+ os << ", ";
+ } else {
+ first = false;
+ }
+ os << "{ " << elem.first << " -> " << elem.second << "}";
+ }
+ os << "}";
+ return os;
+}
+}
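A small usage sketch for the helpers above (illustrative function names). Note that sstring aliases either basic_sstring or std::string depending on SEASTAR_SSTRING; every call below is valid for both:

    #include <seastar/core/sstring.hh>

    // make_sstring() computes the total length up front and copies each piece
    // exactly once into an uninitialized string.
    seastar::sstring build_path(const seastar::sstring& dir, const seastar::sstring& name) {
        return seastar::make_sstring(dir, "/", name);
    }

    // find_last_of() and substr() behave like their std::string counterparts.
    seastar::sstring file_extension(const seastar::sstring& path) {
        auto pos = path.find_last_of('.');
        return pos == seastar::sstring::npos ? seastar::sstring() : path.substr(pos + 1);
    }

    // to_sstring() formats integral and floating-point values via sprintf.
    seastar::sstring port_label(uint16_t port) {
        return seastar::to_sstring(port);
    }
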
diff --git a/src/seastar/include/seastar/core/stall_sampler.hh b/src/seastar/include/seastar/core/stall_sampler.hh
new file mode 100644
index 000000000..424c170fc
--- /dev/null
+++ b/src/seastar/include/seastar/core/stall_sampler.hh
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/util/noncopyable_function.hh>
+
+#include <chrono>
+#include <iosfwd>
+
+// Instrumentation to detect context switches during reactor execution
+// and associated stall time, intended for use in tests
+
+namespace seastar {
+
+namespace internal {
+
+struct stall_report {
+ uint64_t kernel_stalls;
+ std::chrono::steady_clock::duration run_wall_time; // excludes sleeps
+ std::chrono::steady_clock::duration stall_time;
+};
+
+/// Run the unit-under-test (uut) function until completion, and report on any
+/// reactor stalls it generated.
+future<stall_report> report_reactor_stalls(noncopyable_function<future<> ()> uut);
+
+std::ostream& operator<<(std::ostream& os, const stall_report& sr);
+
+}
+
+}
+
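A test-oriented sketch of the API above. report_reactor_stalls() lives in the internal namespace, so this is meant for Seastar's own tests rather than application code (check_for_stalls is an illustrative name):

    #include <seastar/core/stall_sampler.hh>
    #include <seastar/core/sleep.hh>
    #include <chrono>
    #include <iostream>

    seastar::future<> check_for_stalls() {
        // Run the unit under test and print the resulting stall_report using
        // the operator<< declared above.
        return seastar::internal::report_reactor_stalls([] {
            return seastar::sleep(std::chrono::milliseconds(10));
        }).then([] (seastar::internal::stall_report sr) {
            std::cout << sr << "\n";
        });
    }
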
diff --git a/src/seastar/include/seastar/core/std-coroutine.hh b/src/seastar/include/seastar/core/std-coroutine.hh
new file mode 100644
index 000000000..4d9299da4
--- /dev/null
+++ b/src/seastar/include/seastar/core/std-coroutine.hh
@@ -0,0 +1,94 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2019 ScyllaDB Ltd.
+ */
+
+#pragma once
+
+// Clang currently only supports the Coroutines TS (<experimental/coroutine>)
+#if __has_include(<coroutine>) && !defined(__clang__)
+#include <coroutine>
+#define SEASTAR_INTERNAL_COROUTINE_NAMESPACE std
+#elif __has_include(<experimental/coroutine>)
+#include <experimental/coroutine>
+#define SEASTAR_INTERNAL_COROUTINE_NAMESPACE std::experimental
+#else
+#define SEASTAR_INTERNAL_COROUTINE_NAMESPACE std::experimental
+
+// We are not exactly allowed to define anything in the std namespace, but this
+// makes coroutines work with libstdc++. All of this is experimental anyway.
+
+namespace std::experimental {
+
+template<typename Promise>
+class coroutine_handle {
+ void* _pointer = nullptr;
+public:
+ coroutine_handle() = default;
+
+ coroutine_handle &operator=(nullptr_t) noexcept {
+ _pointer = nullptr;
+ return *this;
+ }
+
+ explicit operator bool() const noexcept { return _pointer; }
+
+ static coroutine_handle from_address(void* ptr) noexcept {
+ coroutine_handle hndl;
+ hndl._pointer = ptr;
+ return hndl;
+ }
+ void* address() const noexcept { return _pointer; }
+
+ static coroutine_handle from_promise(Promise& promise) noexcept {
+ coroutine_handle hndl;
+ hndl._pointer = __builtin_coro_promise(&promise, alignof(Promise), true);
+ return hndl;
+ }
+ Promise& promise() const noexcept {
+ return *reinterpret_cast<Promise*>(__builtin_coro_promise(_pointer, alignof(Promise), false));
+ }
+
+ void operator()() noexcept { resume(); }
+
+ void resume() const noexcept { __builtin_coro_resume(_pointer); }
+ void destroy() const noexcept { __builtin_coro_destroy(_pointer); }
+ bool done() const noexcept { return __builtin_coro_done(_pointer); }
+};
+
+struct suspend_never {
+ constexpr bool await_ready() const noexcept { return true; }
+ template<typename T>
+ constexpr void await_suspend(coroutine_handle<T>) noexcept { }
+ constexpr void await_resume() noexcept { }
+};
+
+struct suspend_always {
+ constexpr bool await_ready() const noexcept { return false; }
+ template<typename T>
+ constexpr void await_suspend(coroutine_handle<T>) noexcept { }
+ constexpr void await_resume() noexcept { }
+};
+
+template<typename T, typename... Args>
+class coroutine_traits { };
+
+}
+
+#endif
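A brief sketch of how the macro above is meant to be used: code that must name the coroutine vocabulary types portably spells them through SEASTAR_INTERNAL_COROUTINE_NAMESPACE, so it builds against <coroutine>, <experimental/coroutine>, or the fallback shim alike (resume_if_not_done is an illustrative helper, not part of Seastar):

    #include <seastar/core/std-coroutine.hh>

    template <typename Promise>
    void resume_if_not_done(SEASTAR_INTERNAL_COROUTINE_NAMESPACE::coroutine_handle<Promise> h) {
        // resume the coroutine only if it holds a frame and has not finished
        if (h && !h.done()) {
            h.resume();
        }
    }
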
diff --git a/src/seastar/include/seastar/core/stream.hh b/src/seastar/include/seastar/core/stream.hh
new file mode 100644
index 000000000..b23106108
--- /dev/null
+++ b/src/seastar/include/seastar/core/stream.hh
@@ -0,0 +1,174 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <exception>
+#include <functional>
+#include <cassert>
+
+namespace seastar {
+
+// A stream/subscription pair is similar to a promise/future pair,
+// but apply to a sequence of values instead of a single value.
+//
+// A stream<> is the producer side. It may call produce() as long
+// as the future<> returned from the previous invocation is ready.
+// To signify no more data is available, call close().
+//
+// A subscription<> is the consumer side. It is created by a call
+// to stream::listen(). Calling subscription::start(),
+// which registers the data processing callback, starts processing
+// events. It may register for end-of-stream notifications by
+// chaining the when_done() future, which also delivers error
+// events (as exceptions).
+//
+// The consumer can pause generation of new data by returning
+// a non-ready future; when the future becomes ready, the producer
+// will resume processing.
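+//
+// Illustrative sketch (not part of the API; assumes a trivial consumer that
+// just acknowledges each value):
+//
+//   stream<int> s;
+//   subscription<int> sub = s.listen([] (int v) {
+//       // handle one value; returning a ready future lets the producer continue
+//       return make_ready_future<>();
+//   });
+//   (void)s.started().then([&s] {
+//       return s.produce(1).then([&s] { s.close(); });
+//   });
+//   // sub.done() resolves once close() is called or an error is signalled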
+
+template <typename... T>
+class stream;
+
+template <typename... T>
+class subscription;
+
+template <typename... T>
+class stream {
+public:
+ using next_fn = noncopyable_function<future<> (T...)>;
+
+private:
+ promise<> _done;
+ promise<> _ready;
+ next_fn _next;
+
+ /// \brief Start receiving events from the stream.
+ ///
+ /// \param next Callback to call for each event
+ void start(next_fn next) {
+ _next = std::move(next);
+ _ready.set_value();
+ }
+
+public:
+ stream() = default;
+ stream(const stream&) = delete;
+ stream(stream&&) = delete;
+ void operator=(const stream&) = delete;
+ void operator=(stream&&) = delete;
+
+ // Returns a subscription that reads values from this
+ // stream.
+ subscription<T...> listen() {
+ return subscription<T...>(this);
+ }
+
+ // Returns a subscription that reads values from this
+ // stream, and also sets up the listen function.
+ subscription<T...> listen(next_fn next) {
+ start(std::move(next));
+ return subscription<T...>(this);
+ }
+
+ // Becomes ready when the listener is ready to accept
+ // values. Call only once, when beginning to produce
+ // values.
+ future<> started() {
+ return _ready.get_future();
+ }
+
+ // Produce a value. Call only after started(), and after
+ // a previous produce() is ready.
+ future<> produce(T... data);
+
+ // End the stream. Call only after started(), and after
+ // a previous produce() is ready. No functions may be called
+ // after this.
+ void close() {
+ _done.set_value();
+ }
+
+ // Signal an error. Call only after started(), and after
+ // a previous produce() is ready. No functions may be called
+ // after this.
+ template <typename E>
+ void set_exception(E ex) {
+ _done.set_exception(ex);
+ }
+
+ friend class subscription<T...>;
+};
+
+template <typename... T>
+class subscription {
+ stream<T...>* _stream;
+ future<> _done;
+ explicit subscription(stream<T...>* s) : _stream(s), _done(s->_done.get_future()) {
+ }
+
+public:
+ using next_fn = typename stream<T...>::next_fn;
+ subscription(subscription&& x) : _stream(x._stream), _done(std::move(x._done)) {
+ x._stream = nullptr;
+ }
+
+ /// \brief Start receiving events from the stream.
+ ///
+ /// \param next Callback to call for each event
+ void start(next_fn next) {
+ return _stream->start(std::move(next));
+ }
+
+ // Becomes ready when the stream is empty, or when an error
+ // happens (in that case, an exception is held).
+ future<> done() {
+ return std::move(_done);
+ }
+
+ friend class stream<T...>;
+};
+
+template <typename... T>
+inline
+future<>
+stream<T...>::produce(T... data) {
+ auto ret = futurize_invoke(_next, std::move(data)...);
+ if (ret.available() && !ret.failed()) {
+ // Native network stack depends on stream::produce() returning
+ // a ready future to push packets along without dropping. As
+ // a temporary workaround, special case a ready, unfailed future
+ // and return it immediately, so that then_wrapped(), below,
+ // doesn't convert a ready future to an unready one.
+ return ret;
+ }
+ return ret.then_wrapped([this] (auto&& f) {
+ try {
+ f.get();
+ } catch (...) {
+ _done.set_exception(std::current_exception());
+ // FIXME: tell the producer to stop producing
+ throw;
+ }
+ });
+}
+}
diff --git a/src/seastar/include/seastar/core/systemwide_memory_barrier.hh b/src/seastar/include/seastar/core/systemwide_memory_barrier.hh
new file mode 100644
index 000000000..879d1bfc6
--- /dev/null
+++ b/src/seastar/include/seastar/core/systemwide_memory_barrier.hh
@@ -0,0 +1,37 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Scylla DB
+ */
+
+#pragma once
+
+namespace seastar {
+
+/// \cond internal
+
+// cause all threads to invoke a full memory barrier
+void systemwide_memory_barrier();
+// attempt to invoke a systemwide memory barrier; return false
+// if doing so would cause lock contention in the kernel
+bool try_systemwide_memory_barrier();
+
+/// \endcond
+
+}
+
diff --git a/src/seastar/include/seastar/core/task.hh b/src/seastar/include/seastar/core/task.hh
new file mode 100644
index 000000000..37ca56b96
--- /dev/null
+++ b/src/seastar/include/seastar/core/task.hh
@@ -0,0 +1,67 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <memory>
+#include <seastar/core/scheduling.hh>
+#include <seastar/util/backtrace.hh>
+
+namespace seastar {
+
+class task {
+ scheduling_group _sg;
+#ifdef SEASTAR_TASK_BACKTRACE
+ shared_backtrace _bt;
+#endif
+protected:
+ // Task destruction is performed by run_and_dispose() via a concrete type,
+ // so no need for a virtual destructor here. Derived classes that implement
+ // run_and_dispose() should be declared final to avoid losing concrete type
+ // information via inheritance.
+ ~task() = default;
+public:
+ explicit task(scheduling_group sg = current_scheduling_group()) noexcept : _sg(sg) {}
+ virtual void run_and_dispose() noexcept = 0;
+ /// Returns the next task which is waiting for this task to complete execution, or nullptr.
+ virtual task* waiting_task() noexcept = 0;
+ scheduling_group group() const { return _sg; }
+ shared_backtrace get_backtrace() const;
+#ifdef SEASTAR_TASK_BACKTRACE
+ void make_backtrace() noexcept;
+#else
+ void make_backtrace() noexcept {}
+#endif
+};
+
+inline
+shared_backtrace task::get_backtrace() const {
+#ifdef SEASTAR_TASK_BACKTRACE
+ return _bt;
+#else
+ return {};
+#endif
+}
+
+void schedule(task* t) noexcept;
+void schedule_urgent(task* t) noexcept;
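+
+// Illustrative sketch of a concrete task (hypothetical type, not part of this
+// header): run_and_dispose() both performs the work and frees the task, so
+// derived classes are typically declared final and delete themselves.
+//
+//   class print_task final : public task {
+//       int _value;
+//   public:
+//       explicit print_task(int value) noexcept : _value(value) {}
+//       virtual void run_and_dispose() noexcept override {
+//           // do the work here, then dispose of this task
+//           delete this;
+//       }
+//       virtual task* waiting_task() noexcept override { return nullptr; }
+//   };
+//
+//   // schedule(new print_task(42));  // queue it on the current reactor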
+
+}
diff --git a/src/seastar/include/seastar/core/temporary_buffer.hh b/src/seastar/include/seastar/core/temporary_buffer.hh
new file mode 100644
index 000000000..def29ecf3
--- /dev/null
+++ b/src/seastar/include/seastar/core/temporary_buffer.hh
@@ -0,0 +1,246 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/deleter.hh>
+#include <seastar/util/eclipse.hh>
+#include <seastar/util/std-compat.hh>
+#include <malloc.h>
+#include <algorithm>
+#include <cstddef>
+
+namespace seastar {
+
+/// \addtogroup memory-module
+/// @{
+
+/// Temporary, self-managed byte buffer.
+///
+/// A \c temporary_buffer is similar to an \c std::string or a \c std::unique_ptr<char[]>,
+/// but provides more flexible memory management. A \c temporary_buffer can own the memory
+/// it points to, or it can be shared with another \c temporary_buffer, or point at a substring
+/// of a buffer. It uses a \ref deleter to manage its memory.
+///
+/// A \c temporary_buffer should not be held indefinitely. It can be held while a request
+/// is processed, or for a similar duration, but not longer, as it can tie up more memory
+/// than its size indicates.
+///
+/// A buffer can be shared: two \c temporary_buffer objects will point to the same data,
+/// or a subset of it. See the \ref temporary_buffer::share() method.
+///
+/// Unless you created a \c temporary_buffer yourself, do not modify its contents, as they
+/// may be shared with another user that does not expect the data to change.
+///
+/// Use cases for a \c temporary_buffer include:
+/// - passing a substring of a tcp packet for the user to consume (zero-copy
+/// tcp input)
+/// - passing a refcounted blob held in memory to tcp, ensuring that when the TCP ACK
+/// is received, the blob is released (by decrementing its reference count) (zero-copy
+/// tcp output)
+///
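+/// Example (illustrative):
+/// \code
+/// temporary_buffer<char> buf(1024);                 // owning, uninitialized
+/// temporary_buffer<char> head = buf.share(0, 16);   // refers to the same bytes
+/// buf.trim_front(16);                               // drop the shared prefix from buf
+/// \endcode
+///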
+/// \tparam CharType underlying character type (must be a variant of \c char).
+template <typename CharType>
+class temporary_buffer {
+ static_assert(sizeof(CharType) == 1, "must buffer stream of bytes");
+ CharType* _buffer;
+ size_t _size;
+ deleter _deleter;
+public:
+ /// Creates a \c temporary_buffer of a specified size. The buffer is not shared
+ /// with anyone, and is not initialized.
+ ///
+ /// \param size buffer size, in bytes
+ explicit temporary_buffer(size_t size)
+ : _buffer(static_cast<CharType*>(malloc(size * sizeof(CharType)))), _size(size)
+ , _deleter(make_free_deleter(_buffer)) {
+ if (size && !_buffer) {
+ throw std::bad_alloc();
+ }
+ }
+ //explicit temporary_buffer(CharType* borrow, size_t size) : _buffer(borrow), _size(size) {}
+ /// Creates an empty \c temporary_buffer that does not point at anything.
+ temporary_buffer() noexcept
+ : _buffer(nullptr)
+ , _size(0) {}
+ temporary_buffer(const temporary_buffer&) = delete;
+
+ /// Moves a \c temporary_buffer.
+ temporary_buffer(temporary_buffer&& x) noexcept : _buffer(x._buffer), _size(x._size), _deleter(std::move(x._deleter)) {
+ x._buffer = nullptr;
+ x._size = 0;
+ }
+
+ /// Creates a \c temporary_buffer with a specific deleter.
+ ///
+ /// \param buf beginning of the buffer held by this \c temporary_buffer
+ /// \param size size of the buffer
+ /// \param d deleter controlling destruction of the buffer. The deleter
+ /// will be destroyed when there are no longer any users for the buffer.
+ temporary_buffer(CharType* buf, size_t size, deleter d)
+ : _buffer(buf), _size(size), _deleter(std::move(d)) {}
+ /// Creates a `temporary_buffer` containing a copy of the provided data
+ ///
+ /// \param src data buffer to be copied
+ /// \param size size of data buffer in `src`
+ temporary_buffer(const CharType* src, size_t size) : temporary_buffer(size) {
+ std::copy_n(src, size, _buffer);
+ }
+ void operator=(const temporary_buffer&) = delete;
+ /// Moves a \c temporary_buffer.
+ temporary_buffer& operator=(temporary_buffer&& x) noexcept {
+ if (this != &x) {
+ _buffer = x._buffer;
+ _size = x._size;
+ _deleter = std::move(x._deleter);
+ x._buffer = nullptr;
+ x._size = 0;
+ }
+ return *this;
+ }
+ /// Gets a pointer to the beginning of the buffer.
+ const CharType* get() const { return _buffer; }
+ /// Gets a writable pointer to the beginning of the buffer. Use only
+ /// when you are certain that no other user of the buffer expects its
+ /// contents to remain unchanged.
+ CharType* get_write() { return _buffer; }
+ /// Gets the buffer size.
+ size_t size() const { return _size; }
+ /// Gets a pointer to the beginning of the buffer.
+ const CharType* begin() const { return _buffer; }
+ /// Gets a pointer to the end of the buffer.
+ const CharType* end() const { return _buffer + _size; }
+ /// Returns the buffer, but with a reduced size. The original
+ /// buffer is consumed by this call and can no longer be used.
+ ///
+ /// \param size New size; must be smaller than current size.
+ /// \return the same buffer, truncated to its first \c size bytes.
+ temporary_buffer prefix(size_t size) && {
+ auto ret = std::move(*this);
+ ret._size = size;
+ return ret;
+ }
+ /// Reads a character from a specific position in the buffer.
+ ///
+ /// \param pos position to read character from; must be less than size.
+ CharType operator[](size_t pos) const {
+ return _buffer[pos];
+ }
+ /// Checks whether the buffer is empty.
+ bool empty() const { return !size(); }
+ /// Checks whether the buffer is not empty.
+ explicit operator bool() const { return size(); }
+ /// Create a new \c temporary_buffer object referring to the same
+ /// underlying data. The underlying \ref deleter will not be destroyed
+ /// until both the original and the clone have been destroyed.
+ ///
+ /// \return a clone of the buffer object.
+ temporary_buffer share() {
+ return temporary_buffer(_buffer, _size, _deleter.share());
+ }
+ /// Create a new \c temporary_buffer object referring to a substring of the
+ /// same underlying data. The underlying \ref deleter will not be destroyed
+ /// until both the original and the clone have been destroyed.
+ ///
+ /// \param pos Position of the first character to share.
+ /// \param len Length of substring to share.
+ /// \return a clone of the buffer object, referring to a substring.
+ temporary_buffer share(size_t pos, size_t len) {
+ auto ret = share();
+ ret._buffer += pos;
+ ret._size = len;
+ return ret;
+ }
+ /// Clone the current \c temporary_buffer object into a new one.
+ /// This creates a temporary buffer with the same length and data but not
+ /// pointing to the memory of the original object.
+ temporary_buffer clone() const {
+ return {_buffer, _size};
+ }
+ /// Remove a prefix from the buffer. The underlying data
+ /// is not modified.
+ ///
+ /// \param pos Position of first character to retain.
+ void trim_front(size_t pos) {
+ _buffer += pos;
+ _size -= pos;
+ }
+ /// Remove a suffix from the buffer. The underlying data
+ /// is not modified.
+ ///
+ /// \param pos Position of first character to drop.
+ void trim(size_t pos) {
+ _size = pos;
+ }
+ /// Stops automatic memory management. When the \c temporary_buffer
+ /// object is destroyed, the underlying \ref deleter will not be called.
+ /// Instead, it is the caller's responsibility to destroy the deleter object
+ /// when the data is no longer needed.
+ ///
+ /// \return \ref deleter object managing the data's lifetime.
+ deleter release() {
+ return std::move(_deleter);
+ }
+ /// Creates a \c temporary_buffer object with a specified size, with
+ /// memory aligned to a specific boundary.
+ ///
+ /// \param alignment Required alignment; must be a power of two and a multiple of sizeof(void *).
+ /// \param size Required size; must be a multiple of alignment.
+ /// \return a new \c temporary_buffer object.
+ static temporary_buffer aligned(size_t alignment, size_t size) {
+ void *ptr = nullptr;
+ auto ret = ::posix_memalign(&ptr, alignment, size * sizeof(CharType));
+ auto buf = static_cast<CharType*>(ptr);
+ if (ret) {
+ throw std::bad_alloc();
+ }
+ return temporary_buffer(buf, size, make_free_deleter(buf));
+ }
+
+ static temporary_buffer copy_of(std::string_view view) {
+ void* ptr = ::malloc(view.size());
+ if (!ptr) {
+ throw std::bad_alloc();
+ }
+ auto buf = static_cast<CharType*>(ptr);
+ memcpy(buf, view.data(), view.size());
+ return temporary_buffer(buf, view.size(), make_free_deleter(buf));
+ }
+
+ /// Compare contents of this buffer with another buffer for equality
+ ///
+ /// \param o buffer to compare with
+ /// \return true if and only if contents are the same
+ bool operator==(const temporary_buffer& o) const {
+ return size() == o.size() && std::equal(begin(), end(), o.begin());
+ }
+
+ /// Compare contents of this buffer with another buffer for inequality
+ ///
+ /// \param o buffer to compare with
+ /// \return true if and only if contents are not the same
+ bool operator!=(const temporary_buffer& o) const {
+ return !(*this == o);
+ }
+};
+
+/// @}
+
+}
diff --git a/src/seastar/include/seastar/core/thread.hh b/src/seastar/include/seastar/core/thread.hh
new file mode 100644
index 000000000..e5617b8dc
--- /dev/null
+++ b/src/seastar/include/seastar/core/thread.hh
@@ -0,0 +1,285 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/thread_impl.hh>
+#include <seastar/core/future.hh>
+#include <seastar/core/do_with.hh>
+#include <seastar/core/timer.hh>
+#include <seastar/core/scheduling.hh>
+#include <memory>
+#include <setjmp.h>
+#include <type_traits>
+#include <chrono>
+#include <seastar/util/std-compat.hh>
+#include <ucontext.h>
+#include <boost/intrusive/list.hpp>
+
+/// \defgroup thread-module Seastar threads
+///
+/// Seastar threads provide an execution environment where blocking
+/// is tolerated; you can issue I/O, and wait for it in the same function,
+/// rather than establishing a callback to be called with \ref future<>::then().
+///
+/// Seastar threads are not the same as operating system threads:
+/// - seastar threads are cooperative; they are never preempted except
+/// at blocking points (see below)
+/// - seastar threads always run on the same core they were launched on
+///
+/// Like other seastar code, seastar threads may not issue blocking system calls.
+///
+/// A seastar thread blocking point is any function that returns a \ref future.
+/// You block by calling \ref future<>::get(); this waits for the future to become
+/// available, and in the meanwhile, other seastar threads and seastar non-threaded
+/// code may execute.
+///
+/// Example:
+/// \code
+/// seastar::thread th([] {
+/// sleep(5s).get(); // blocking point
+/// });
+/// \endcode
+///
+/// An easy way to launch a thread and carry out some computation, and return a
+/// result from this execution is by using the \ref seastar::async() function.
+/// The result is returned as a future, so that non-threaded code can wait for
+/// the thread to terminate and yield a result.
+
+/// Seastar API namespace
+namespace seastar {
+
+/// \addtogroup thread-module
+/// @{
+
+class thread;
+class thread_attributes;
+
+/// Class that holds attributes controlling the behavior of a thread.
+class thread_attributes {
+public:
+ std::optional<seastar::scheduling_group> sched_group;
+ // For stack_size 0, a default value will be used (128KiB at the time of writing).
+ size_t stack_size = 0;
+};
+
+
+/// \cond internal
+extern thread_local jmp_buf_link g_unthreaded_context;
+
+// Internal class holding thread state. We can't hold this in
+// \c thread itself because \c thread is movable, and we want pointers
+// to this state to be captured.
+class thread_context final : private task {
+ struct stack_deleter {
+ void operator()(char *ptr) const noexcept;
+ int valgrind_id;
+ stack_deleter(int valgrind_id);
+ };
+ using stack_holder = std::unique_ptr<char[], stack_deleter>;
+
+ stack_holder _stack;
+ noncopyable_function<void ()> _func;
+ jmp_buf_link _context;
+ promise<> _done;
+ bool _joined = false;
+
+ boost::intrusive::list_member_hook<> _all_link;
+ using all_thread_list = boost::intrusive::list<thread_context,
+ boost::intrusive::member_hook<thread_context, boost::intrusive::list_member_hook<>,
+ &thread_context::_all_link>,
+ boost::intrusive::constant_time_size<false>>;
+
+ static thread_local all_thread_list _all_threads;
+private:
+ static void s_main(int lo, int hi); // all parameters MUST be 'int' for makecontext
+ void setup(size_t stack_size);
+ void main();
+ stack_holder make_stack(size_t stack_size);
+ virtual void run_and_dispose() noexcept override; // from task class
+public:
+ thread_context(thread_attributes attr, noncopyable_function<void ()> func);
+ ~thread_context();
+ void switch_in();
+ void switch_out();
+ bool should_yield() const;
+ void reschedule();
+ void yield();
+ task* waiting_task() noexcept override { return _done.waiting_task(); }
+ friend class thread;
+ friend void thread_impl::switch_in(thread_context*);
+ friend void thread_impl::switch_out(thread_context*);
+ friend scheduling_group thread_impl::sched_group(const thread_context*);
+};
+
+/// \endcond
+
+
+/// \brief thread - stateful thread of execution
+///
+/// Threads allow using seastar APIs in a blocking manner,
+/// by calling future::get() on a non-ready future. When
+/// this happens, the thread is put to sleep until the future
+/// becomes ready.
+class thread {
+ std::unique_ptr<thread_context> _context;
+ static thread_local thread* _current;
+public:
+ /// \brief Constructs a \c thread object that does not represent a thread
+ /// of execution.
+ thread() = default;
+ /// \brief Constructs a \c thread object that represents a thread of execution
+ ///
+ /// \param func Callable object to execute in thread. The callable is
+ /// called immediately.
+ template <typename Func>
+ thread(Func func);
+ /// \brief Constructs a \c thread object that represents a thread of execution
+ ///
+ /// \param attr Attributes describing the new thread.
+ /// \param func Callable object to execute in thread. The callable is
+ /// called immediately.
+ template <typename Func>
+ thread(thread_attributes attr, Func func);
+ /// \brief Moves a thread object.
+ thread(thread&& x) noexcept = default;
+ /// \brief Move-assigns a thread object.
+ thread& operator=(thread&& x) noexcept = default;
+ /// \brief Destroys a \c thread object.
+ ///
+ /// The thread must not represent a running thread of execution (see join()).
+ ~thread() { assert(!_context || _context->_joined); }
+ /// \brief Waits for thread execution to terminate.
+ ///
+ /// Waits for thread execution to terminate, and marks the thread object as not
+ /// representing a running thread of execution.
+ future<> join();
+ /// \brief Voluntarily defer execution of current thread.
+ ///
+ /// Gives other threads/fibers a chance to run on current CPU.
+ /// The current thread will resume execution promptly.
+ static void yield();
+ /// \brief Checks whether this thread ought to call yield() now.
+ ///
+ /// Useful where we cannot call yield() immediately because we
+ /// need to take some cleanup action first.
+ static bool should_yield();
+
+ /// \brief Yield if this thread ought to call yield() now.
+ ///
+ /// Useful where code performs a long-running computation and does
+ /// not want to hog the CPU for more than its share.
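+ ///
+ /// Example (illustrative; process() is a placeholder for CPU-bound work):
+ /// \code
+ /// for (auto& item : items) {
+ ///     process(item);
+ ///     thread::maybe_yield();  // let other tasks run if this thread exceeded its slice
+ /// }
+ /// \endcode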
+ static void maybe_yield();
+
+ static bool running_in_thread() {
+ return thread_impl::get() != nullptr;
+ }
+};
+
+template <typename Func>
+inline
+thread::thread(thread_attributes attr, Func func)
+ : _context(std::make_unique<thread_context>(std::move(attr), std::move(func))) {
+}
+
+template <typename Func>
+inline
+thread::thread(Func func)
+ : thread(thread_attributes(), std::move(func)) {
+}
+
+inline
+future<>
+thread::join() {
+ _context->_joined = true;
+ return _context->_done.get_future();
+}
+
+/// Executes a callable in a seastar thread.
+///
+/// Runs a block of code in a threaded context,
+/// which allows it to block (using \ref future::get()). The
+/// result of the callable is returned as a future.
+///
+/// \param attr a \ref thread_attributes instance
+/// \param func a callable to be executed in a thread
+/// \param args a parameter pack to be forwarded to \c func.
+/// \return whatever \c func returns, as a future.
+///
+/// Example:
+/// \code
+/// future<int> compute_sum(int a, int b) {
+/// thread_attributes attr = {};
+/// attr.sched_group = some_scheduling_group_ptr;
+/// return seastar::async(attr, [a, b] {
+/// // some blocking code:
+/// sleep(1s).get();
+/// return a + b;
+/// });
+/// }
+/// \endcode
+template <typename Func, typename... Args>
+inline
+futurize_t<std::result_of_t<std::decay_t<Func>(std::decay_t<Args>...)>>
+async(thread_attributes attr, Func&& func, Args&&... args) noexcept {
+ using return_type = std::result_of_t<std::decay_t<Func>(std::decay_t<Args>...)>;
+ struct work {
+ thread_attributes attr;
+ Func func;
+ std::tuple<Args...> args;
+ promise<return_type> pr;
+ thread th;
+ };
+
+ try {
+ auto wp = std::make_unique<work>(work{std::move(attr), std::forward<Func>(func), std::forward_as_tuple(std::forward<Args>(args)...)});
+ auto& w = *wp;
+ auto ret = w.pr.get_future();
+ w.th = thread(std::move(w.attr), [&w] {
+ futurize<return_type>::apply(std::move(w.func), std::move(w.args)).forward_to(std::move(w.pr));
+ });
+ return w.th.join().then([ret = std::move(ret)] () mutable {
+ return std::move(ret);
+ }).finally([wp = std::move(wp)] {});
+ } catch (...) {
+ return futurize<return_type>::make_exception_future(std::current_exception());
+ }
+}
+
+/// Executes a callable in a seastar thread.
+///
+/// Runs a block of code in a threaded context,
+/// which allows it to block (using \ref future::get()). The
+/// result of the callable is returned as a future.
+///
+/// \param func a callable to be executed in a thread
+/// \param args a parameter pack to be forwarded to \c func.
+/// \return whatever \c func returns, as a future.
+template <typename Func, typename... Args>
+inline
+futurize_t<std::result_of_t<std::decay_t<Func>(std::decay_t<Args>...)>>
+async(Func&& func, Args&&... args) noexcept {
+ return async(thread_attributes{}, std::forward<Func>(func), std::forward<Args>(args)...);
+}
+/// @}
+
+}
diff --git a/src/seastar/include/seastar/core/thread_cputime_clock.hh b/src/seastar/include/seastar/core/thread_cputime_clock.hh
new file mode 100644
index 000000000..9e905397b
--- /dev/null
+++ b/src/seastar/include/seastar/core/thread_cputime_clock.hh
@@ -0,0 +1,48 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+#pragma once
+
+#include <chrono>
+#include <time.h>
+#include <cassert>
+
+namespace seastar {
+
+class thread_cputime_clock {
+public:
+ using rep = int64_t;
+ using period = std::chrono::nanoseconds::period;
+ using duration = std::chrono::duration<rep, period>;
+ using time_point = std::chrono::time_point<thread_cputime_clock, duration>;
+public:
+ static time_point now() {
+ using namespace std::chrono_literals;
+
+ struct timespec tp;
+ [[gnu::unused]] auto ret = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp);
+ assert(ret == 0);
+ return time_point(tp.tv_nsec * 1ns + tp.tv_sec * 1s);
+ }
+};
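+
+// Illustrative sketch (do_work() is a placeholder): measuring the CPU time
+// consumed by the current thread across a block of work.
+//
+//   auto start = thread_cputime_clock::now();
+//   do_work();
+//   auto cpu_time = std::chrono::duration_cast<std::chrono::microseconds>(
+//           thread_cputime_clock::now() - start);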
+
+}
+
diff --git a/src/seastar/include/seastar/core/thread_impl.hh b/src/seastar/include/seastar/core/thread_impl.hh
new file mode 100644
index 000000000..214872406
--- /dev/null
+++ b/src/seastar/include/seastar/core/thread_impl.hh
@@ -0,0 +1,84 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (C) 2016 ScyllaDB.
+ */
+
+#pragma once
+#include <seastar/core/preempt.hh>
+#include <setjmp.h>
+#include <ucontext.h>
+#include <chrono>
+#include <seastar/util/std-compat.hh>
+
+namespace seastar {
+/// Clock used for scheduling threads
+using thread_clock = std::chrono::steady_clock;
+
+/// \cond internal
+class thread_context;
+class scheduling_group;
+
+struct jmp_buf_link {
+#ifdef SEASTAR_ASAN_ENABLED
+ ucontext_t context;
+ void* fake_stack = nullptr;
+ const void* stack_bottom;
+ size_t stack_size;
+#else
+ jmp_buf jmpbuf;
+#endif
+ jmp_buf_link* link;
+ thread_context* thread;
+public:
+ void initial_switch_in(ucontext_t* initial_context, const void* stack_bottom, size_t stack_size);
+ void switch_in();
+ void switch_out();
+ void initial_switch_in_completed();
+ void final_switch_out();
+};
+
+extern thread_local jmp_buf_link* g_current_context;
+
+namespace thread_impl {
+
+inline thread_context* get() {
+ return g_current_context->thread;
+}
+
+inline bool should_yield() {
+ return need_preempt();
+}
+
+scheduling_group sched_group(const thread_context*);
+
+void yield();
+void switch_in(thread_context* to);
+void switch_out(thread_context* from);
+void init();
+
+}
+}
+/// \endcond
+
+
diff --git a/src/seastar/include/seastar/core/timed_out_error.hh b/src/seastar/include/seastar/core/timed_out_error.hh
new file mode 100644
index 000000000..48d878b92
--- /dev/null
+++ b/src/seastar/include/seastar/core/timed_out_error.hh
@@ -0,0 +1,42 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright 2020 ScyllaDB
+ */
+
+#pragma once
+
+#include <exception>
+
+namespace seastar {
+
+class timed_out_error : public std::exception {
+public:
+ virtual const char* what() const noexcept {
+ return "timedout";
+ }
+};
+
+struct default_timeout_exception_factory {
+ static auto timeout() {
+ return timed_out_error();
+ }
+};
+
+} // namespace seastar
diff --git a/src/seastar/include/seastar/core/timer-set.hh b/src/seastar/include/seastar/core/timer-set.hh
new file mode 100644
index 000000000..5032eb686
--- /dev/null
+++ b/src/seastar/include/seastar/core/timer-set.hh
@@ -0,0 +1,253 @@
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+/*
+ * Imported from OSv:
+ *
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+#pragma once
+
+#include <chrono>
+#include <limits>
+#include <bitset>
+#include <array>
+#include <boost/intrusive/list.hpp>
+#include <seastar/core/bitset-iter.hh>
+
+namespace seastar {
+
+/**
+ * A data structure designed for holding and expiring timers. It's
+ * optimized for timer non-delivery by deferring sorting cost until
+ * expiry time. The optimization is based on the observation that in
+ * many workloads timers are cancelled or rescheduled before they
+ * expire. That's especially the case for TCP timers.
+ *
+ * The template type "Timer" should have a method named
+ * get_timeout() which returns a Timer::time_point denoting the
+ * timer's expiration time.
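+ *
+ * Illustrative sketch of a compatible Timer type (hypothetical names):
+ *
+ *   struct my_timer {
+ *       using clock = std::chrono::steady_clock;
+ *       using time_point = clock::time_point;
+ *       using duration = clock::duration;
+ *       boost::intrusive::list_member_hook<> link;
+ *       time_point timeout;
+ *       time_point get_timeout() const { return timeout; }
+ *       void cancel();  // expected to remove the timer from its timer_set
+ *   };
+ *   // timer_set<my_timer, &my_timer::link> active_timers;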
+ */
+template<typename Timer, boost::intrusive::list_member_hook<> Timer::*link>
+class timer_set {
+public:
+ using time_point = typename Timer::time_point;
+ using timer_list_t = boost::intrusive::list<Timer, boost::intrusive::member_hook<Timer, boost::intrusive::list_member_hook<>, link>>;
+private:
+ using duration = typename Timer::duration;
+ using timestamp_t = typename Timer::duration::rep;
+
+ static constexpr timestamp_t max_timestamp = std::numeric_limits<timestamp_t>::max();
+ static constexpr int timestamp_bits = std::numeric_limits<timestamp_t>::digits;
+
+ // The last bucket is reserved for active timers with timeout <= _last.
+ static constexpr int n_buckets = timestamp_bits + 1;
+
+ std::array<timer_list_t, n_buckets> _buckets;
+ timestamp_t _last;
+ timestamp_t _next;
+
+ std::bitset<n_buckets> _non_empty_buckets;
+private:
+ static timestamp_t get_timestamp(time_point _time_point) noexcept
+ {
+ return _time_point.time_since_epoch().count();
+ }
+
+ static timestamp_t get_timestamp(Timer& timer) noexcept
+ {
+ return get_timestamp(timer.get_timeout());
+ }
+
+ int get_index(timestamp_t timestamp) const noexcept
+ {
+ if (timestamp <= _last) {
+ return n_buckets - 1;
+ }
+
+ auto index = bitsets::count_leading_zeros(timestamp ^ _last);
+ assert(index < n_buckets - 1);
+ return index;
+ }
+
+ int get_index(Timer& timer) const noexcept
+ {
+ return get_index(get_timestamp(timer));
+ }
+
+ int get_last_non_empty_bucket() const noexcept
+ {
+ return bitsets::get_last_set(_non_empty_buckets);
+ }
+public:
+ timer_set() noexcept
+ : _last(0)
+ , _next(max_timestamp)
+ , _non_empty_buckets(0)
+ {
+ }
+
+ ~timer_set() {
+ for (auto&& list : _buckets) {
+ while (!list.empty()) {
+ auto& timer = *list.begin();
+ timer.cancel();
+ }
+ }
+ }
+
+ /**
+ * Adds timer to the active set.
+ *
+ * The value returned by timer.get_timeout() is used as timer's expiry. The result
+ * of timer.get_timeout() must not change while the timer is in the active set.
+ *
+ * Preconditions:
+ * - this timer must not be currently in the active set or in the expired set.
+ *
+ * Postconditions:
+ * - this timer will be added to the active set until it is expired
+ * by a call to expire() or removed by a call to remove().
+ *
+ * Returns true if and only if this timer's timeout is less than get_next_timeout().
+ * When this function returns true the caller should reschedule expire() to be
+ * called at timer.get_timeout() to ensure timers are expired in a timely manner.
+ */
+ bool insert(Timer& timer) noexcept
+ {
+ auto timestamp = get_timestamp(timer);
+ auto index = get_index(timestamp);
+
+ _buckets[index].push_back(timer);
+ _non_empty_buckets[index] = true;
+
+ if (timestamp < _next) {
+ _next = timestamp;
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Removes timer from the active set.
+ *
+ * Preconditions:
+ * - timer must be currently in the active set. Note: it must not be in
+ * the expired set.
+ *
+ * Postconditions:
+ * - timer is no longer in the active set.
+ * - this object will no longer hold any references to this timer.
+ */
+ void remove(Timer& timer) noexcept
+ {
+ auto index = get_index(timer);
+ auto& list = _buckets[index];
+ list.erase(list.iterator_to(timer));
+ if (list.empty()) {
+ _non_empty_buckets[index] = false;
+ }
+ }
+
+ /**
+ * Expires active timers.
+ *
+ * The time points passed to this function must be monotonically increasing.
+ * Use get_next_timeout() to query for the next time point.
+ *
+ * Preconditions:
+ * - the time_point passed to this function must not be less than
+ * the previous one passed to this function.
+ *
+ * Postconditions:
+ * - all timers from the active set with Timer::get_timeout() <= now are moved
+ * to the expired set.
+ */
+ timer_list_t expire(time_point now) noexcept
+ {
+ timer_list_t exp;
+ auto timestamp = get_timestamp(now);
+
+ if (timestamp < _last) {
+ abort();
+ }
+
+ auto index = get_index(timestamp);
+
+ for (int i : bitsets::for_each_set(_non_empty_buckets, index + 1)) {
+ exp.splice(exp.end(), _buckets[i]);
+ _non_empty_buckets[i] = false;
+ }
+
+ _last = timestamp;
+ _next = max_timestamp;
+
+ auto& list = _buckets[index];
+ while (!list.empty()) {
+ auto& timer = *list.begin();
+ list.pop_front();
+ if (timer.get_timeout() <= now) {
+ exp.push_back(timer);
+ } else {
+ insert(timer);
+ }
+ }
+
+ _non_empty_buckets[index] = !list.empty();
+
+ if (_next == max_timestamp && _non_empty_buckets.any()) {
+ for (auto& timer : _buckets[get_last_non_empty_bucket()]) {
+ _next = std::min(_next, get_timestamp(timer));
+ }
+ }
+ return exp;
+ }
+
+ /**
+ * Returns a time point at which expire() should be called
+ * in order to ensure timers are expired in a timely manner.
+ *
+ * Returned values are monotonically increasing.
+ */
+ time_point get_next_timeout() const noexcept
+ {
+ return time_point(duration(std::max(_last, _next)));
+ }
+
+ /**
+ * Clears both active and expired timer sets.
+ */
+ void clear() noexcept
+ {
+ for (int i : bitsets::for_each_set(_non_empty_buckets)) {
+ _buckets[i].clear();
+ }
+ }
+
+ size_t size() const noexcept
+ {
+ size_t res = 0;
+ for (int i : bitsets::for_each_set(_non_empty_buckets)) {
+ res += _buckets[i].size();
+ }
+ return res;
+ }
+
+ /**
+ * Returns true if and only if there are no timers in the active set.
+ */
+ bool empty() const noexcept
+ {
+ return _non_empty_buckets.none();
+ }
+
+ time_point now() noexcept {
+ return Timer::clock::now();
+ }
+};
+}
diff --git a/src/seastar/include/seastar/core/timer.hh b/src/seastar/include/seastar/core/timer.hh
new file mode 100644
index 000000000..39bf0c272
--- /dev/null
+++ b/src/seastar/include/seastar/core/timer.hh
@@ -0,0 +1,225 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+#pragma once
+
+#include <chrono>
+#include <seastar/util/std-compat.hh>
+#include <atomic>
+#include <functional>
+#include <seastar/core/future.hh>
+#include <seastar/core/timer-set.hh>
+#include <seastar/core/scheduling.hh>
+
+/// \file
+
+/// \defgroup timers Timers
+///
+/// Seastar provides timers that can be defined to run a callback at a certain
+/// time point in the future; timers are provided for \ref lowres_clock (10ms
+/// resolution, efficient), for std::chrono::steady_clock (accurate but less
+/// efficient) and for \ref manual_clock (for testing purposes).
+///
+/// Timers are optimized for cancellation; that is, adding a timer and cancelling
+/// it is very efficient. This means that attaching a timer per object for
+/// a timeout that rarely happens is reasonable; one does not have to maintain
+/// a single timer and a sorted list for this use case.
+///
+/// Timer callbacks should be short and execute quickly. If involved processing
+/// is required, a timer can launch a continuation.
+
+namespace seastar {
+
+using steady_clock_type = std::chrono::steady_clock;
+
+
+/// \addtogroup timers
+/// @{
+
+/// Timer - run a callback at a certain time point in the future.
+///
+/// Timer callbacks should execute quickly. If more involved computation
+/// is required, the timer should launch it as a fiber (or signal an
+/// existing fiber to continue execution). Fibers launched from a timer
+/// callback are executed under the scheduling group that was current
+/// when the timer was created (see current_scheduling_group()), or the
+/// scheduling group that was given explicitly by the caller when the callback
+/// was specified.
+///
+/// Expiration of a `timer<std::chrono::steady_clock>` is independent of
+/// task_quota, so it has relatively high accuracy, but as a result this
+/// is a relatively expensive timer. It is recommended to use `timer<lowres_clock>`
+/// instead, which has very coarse resolution (~10ms) but is quite efficient.
+/// It is suitable for most user timeouts.
+///
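+/// Example (illustrative; handle_timeout() is a placeholder):
+/// \code
+/// timer<> t([] { handle_timeout(); });
+/// t.arm(std::chrono::seconds(1));            // fire once, one second from now
+/// // t.arm_periodic(std::chrono::seconds(1)); // or: fire every second
+/// \endcode
+///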
+/// \tparam Clock type of clock used to denote time points; can be
+/// steady_clock_type (the default, an alias for std::chrono::steady_clock),
+/// lowres_clock (more efficient but with less resolution) and manual_clock_type
+/// (fine-grained control for testing).
+template <typename Clock = steady_clock_type>
+class timer {
+public:
+ typedef typename Clock::time_point time_point;
+ typedef typename Clock::duration duration;
+ typedef Clock clock;
+private:
+ using callback_t = noncopyable_function<void()>;
+ boost::intrusive::list_member_hook<> _link;
+ scheduling_group _sg;
+ callback_t _callback;
+ time_point _expiry;
+ std::optional<duration> _period;
+ bool _armed = false;
+ bool _queued = false;
+ bool _expired = false;
+ void readd_periodic() noexcept;
+ void arm_state(time_point until, std::optional<duration> period) noexcept {
+ assert(!_armed);
+ _period = period;
+ _armed = true;
+ _expired = false;
+ _expiry = until;
+ _queued = true;
+ }
+public:
+ /// Constructs a timer with no callback set and no expiration time.
+ timer() noexcept {}; // implementation is required (instead of = default) for noexcept due to a bug in gcc 9.3.1,
+ // since boost::intrusive::list_member_hook default constructor is not specified as noexcept.
+ /// Constructs a timer from another timer that is moved from.
+ ///
+ /// \note care should be taken when moving a timer whose callback captures `this`,
+ /// since the object pointed to by `this` may have been moved as well.
+ timer(timer&& t) noexcept : _sg(t._sg), _callback(std::move(t._callback)), _expiry(std::move(t._expiry)), _period(std::move(t._period)),
+ _armed(t._armed), _queued(t._queued), _expired(t._expired) {
+ _link.swap_nodes(t._link);
+ t._queued = false;
+ t._armed = false;
+ }
+ /// Constructs a timer with a callback. The timer is not armed.
+ ///
+ /// \param sg Scheduling group to run the callback under.
+ /// \param callback function (with signature `void ()`) to execute after the timer is armed and expired.
+ timer(scheduling_group sg, noncopyable_function<void ()>&& callback) noexcept : _sg(sg), _callback{std::move(callback)} {
+ }
+ /// Constructs a timer with a callback. The timer is not armed.
+ ///
+ /// \param callback function (with signature `void ()`) to execute after the timer is armed and expired.
+ explicit timer(noncopyable_function<void ()>&& callback) noexcept : timer(current_scheduling_group(), std::move(callback)) {
+ }
+ /// Destroys the timer. The timer is cancelled if armed.
+ ~timer();
+ /// Sets the callback function to be called when the timer expires.
+ ///
+ /// \param sg the scheduling group under which the callback will be executed.
+ /// \param callback the callback to be executed when the timer expires.
+ void set_callback(scheduling_group sg, noncopyable_function<void ()>&& callback) noexcept {
+ _sg = sg;
+ _callback = std::move(callback);
+ }
+ /// Sets the callback function to be called when the timer expires.
+ ///
+ /// \param callback the callback to be executed when the timer expires.
+ void set_callback(noncopyable_function<void ()>&& callback) noexcept {
+ set_callback(current_scheduling_group(), std::move(callback));
+ }
+ /// Sets the timer expiration time.
+ ///
+ /// It is illegal to arm a timer that has already been armed (and
+ /// not disarmed by expiration or cancel()). In the current
+ /// implementation, this will result in an assertion failure. See
+ /// rearm().
+ ///
+ /// \param until the time when the timer expires
+ /// \param period optional automatic rearm duration; if given the timer
+ /// will automatically rearm itself when it expires, using the period
+ /// to calculate the next expiration time.
+ void arm(time_point until, std::optional<duration> period = {}) noexcept;
+ /// Sets the timer expiration time. If the timer was already armed, it is
+ /// canceled first.
+ ///
+ /// \param until the time when the timer expires
+ /// \param period optional automatic rearm duration; if given the timer
+ /// will automatically rearm itself when it expires, using the period
+ /// to calculate the next expiration time.
+ void rearm(time_point until, std::optional<duration> period = {}) noexcept {
+ if (_armed) {
+ cancel();
+ }
+ arm(until, period);
+ }
+ /// Sets the timer expiration time.
+ ///
+ /// It is illegal to arm a timer that has already been armed (and
+ /// not disarmed by expiration or cancel()). In the current
+ /// implementation, this will result in an assertion failure. See
+ /// rearm().
+ ///
+ /// \param delta the time when the timer expires, relative to now
+ void arm(duration delta) noexcept {
+ return arm(Clock::now() + delta);
+ }
+ /// Sets the timer expiration time, with automatic rearming
+ ///
+ /// \param delta the time when the timer expires, relative to now. The timer
+ /// will also rearm automatically using the same delta time.
+ void arm_periodic(duration delta) noexcept {
+ arm(Clock::now() + delta, {delta});
+ }
+ /// Sets the timer expiration time, with automatic rearming.
+ /// If the timer was already armed, it is canceled first.
+ ///
+ /// \param delta the time when the timer expires, relative to now. The timer
+ /// will also rearm automatically using the same delta time.
+ void rearm_periodic(duration delta) noexcept {
+ if (_armed) {
+ cancel();
+ }
+ arm_periodic(delta);
+ }
+ /// Returns whether the timer is armed
+ ///
+ /// \return `true` if the timer is armed and has not expired yet.
+ bool armed() const noexcept { return _armed; }
+ /// Cancels an armed timer.
+ ///
+ /// If the timer was armed, it is disarmed. If the timer was not
+ /// armed, does nothing.
+ ///
+ /// \return `true` if the timer was armed before the call.
+ bool cancel() noexcept;
+ /// Gets the expiration time of an armed timer.
+ ///
+ /// \return the time at which the timer is scheduled to expire (undefined if the
+ /// timer is not armed).
+ time_point get_timeout() const noexcept {
+ return _expiry;
+ }
+ friend class reactor;
+ friend class timer_set<timer, &timer::_link>;
+};
+
+extern template class timer<steady_clock_type>;
+
+
+/// @}
+
+}
+
diff --git a/src/seastar/include/seastar/core/transfer.hh b/src/seastar/include/seastar/core/transfer.hh
new file mode 100644
index 000000000..14cd432e7
--- /dev/null
+++ b/src/seastar/include/seastar/core/transfer.hh
@@ -0,0 +1,75 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+// Helper functions for copying or moving multiple objects in an exception
+// safe manner, then destroying the sources.
+//
+// To transfer, call transfer_pass1(allocator, &from, &to) on all object pairs,
+// (this copies the object from @from to @to). If no exceptions are encountered,
+// call transfer_pass2(allocator, &from, &to). This destroys the object at the
+// origin. If exceptions were encountered, simply destroy all copied objects.
+//
+// As an optimization, if the objects are moveable without throwing (noexcept)
+// transfer_pass1() simply moves the objects and destroys the source, and
+// transfer_pass2() does nothing.
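+//
+// Illustrative sketch (Widget and the element pointers are placeholders):
+//
+//   std::allocator<Widget> a;
+//   transfer_pass1(a, &src, &dst);   // copy (or move, if noexcept-movable) src into dst
+//   // ... repeat for every element; on exception, destroy the copies made so far ...
+//   transfer_pass2(a, &src, &dst);   // destroy src (no-op if it was already moved from)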
+
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+namespace seastar {
+
+template <typename T, typename Alloc>
+inline
+void
+transfer_pass1(Alloc& a, T* from, T* to,
+ typename std::enable_if<std::is_nothrow_move_constructible<T>::value>::type* = nullptr) {
+ std::allocator_traits<Alloc>::construct(a, to, std::move(*from));
+ std::allocator_traits<Alloc>::destroy(a, from);
+}
+
+template <typename T, typename Alloc>
+inline
+void
+transfer_pass2(Alloc& a, T* from, T* to,
+ typename std::enable_if<std::is_nothrow_move_constructible<T>::value>::type* = nullptr) {
+}
+
+template <typename T, typename Alloc>
+inline
+void
+transfer_pass1(Alloc& a, T* from, T* to,
+ typename std::enable_if<!std::is_nothrow_move_constructible<T>::value>::type* = nullptr) {
+ std::allocator_traits<Alloc>::construct(a, to, *from);
+}
+
+template <typename T, typename Alloc>
+inline
+void
+transfer_pass2(Alloc& a, T* from, T* to,
+ typename std::enable_if<!std::is_nothrow_move_constructible<T>::value>::type* = nullptr) {
+ std::allocator_traits<Alloc>::destroy(a, from);
+}
+
+}
+
diff --git a/src/seastar/include/seastar/core/unaligned.hh b/src/seastar/include/seastar/core/unaligned.hh
new file mode 100644
index 000000000..78ce39bac
--- /dev/null
+++ b/src/seastar/include/seastar/core/unaligned.hh
@@ -0,0 +1,78 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+// The following unaligned_cast<T*>(p) is a portable replacement for
+// reinterpret_cast<T*>(p) which should be used every time address p
+// is not guaranteed to be properly aligned to alignof(T).
+//
+// On architectures like x86 and ARM, where unaligned access is allowed,
+// unaligned_cast will behave the same as reinterpret_cast and will generate
+// the same code.
+//
+// Certain architectures (e.g., MIPS) make it extremely slow or outright
+// forbidden to use ordinary machine instructions on a primitive type at an
+// unaligned addresses - e.g., access a uint32_t at an address which is not
+// a multiple of 4. Gcc's "undefined behavior sanitizer" (enabled in our debug
+// build) also catches such unaligned accesses and reports them as errors,
+// even when running on x86.
+//
+// Therefore, reinterpret_cast<int32_t*> on an address which is not guaranteed
+// to be a multiple of 4 may generate extremely slow code or runtime errors,
+// and must be avoided. The compiler needs to be told about the unaligned
+// access, so it can generate reasonably-efficient code for the access
+// (in MIPS, this means generating two instructions "lwl" and "lwr", instead
+// of the one instruction "lw" which faults on unaligned access). The way to
+// tell the compiler this is with __attribute__((packed)). This will also
+// cause the sanitizer not to generate runtime alignment checks for this
+// access.
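+//
+// Illustrative sketch: reading a uint32_t from an arbitrary byte offset of a
+// buffer (the offset need not be a multiple of 4):
+//
+//   uint32_t read_u32(const char* p) {
+//       return *unaligned_cast<uint32_t*>(p);
+//   }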
+
+#include <type_traits>
+
+namespace seastar {
+
+template <typename T>
+struct unaligned {
+ // This is made to support only simple types, so it is fine to
+ // require them to be trivially copy constructible.
+ static_assert(std::is_trivially_copy_constructible_v<T>);
+ T raw;
+ unaligned() noexcept = default;
+ unaligned(T x) noexcept : raw(x) {}
+ unaligned& operator=(const T& x) noexcept { raw = x; return *this; }
+ operator T() const noexcept { return raw; }
+} __attribute__((packed));
+
+
+// deprecated: violates strict aliasing rules
+template <typename T, typename F>
+inline auto unaligned_cast(F* p) noexcept {
+ return reinterpret_cast<unaligned<std::remove_pointer_t<T>>*>(p);
+}
+
+// deprecated: violates strict aliasing rules
+template <typename T, typename F>
+inline auto unaligned_cast(const F* p) noexcept {
+ return reinterpret_cast<const unaligned<std::remove_pointer_t<T>>*>(p);
+}
+
+}
diff --git a/src/seastar/include/seastar/core/units.hh b/src/seastar/include/seastar/core/units.hh
new file mode 100644
index 000000000..43dbffeb2
--- /dev/null
+++ b/src/seastar/include/seastar/core/units.hh
@@ -0,0 +1,30 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+namespace seastar {
+
+static constexpr size_t KB = 1 << 10;
+static constexpr size_t MB = 1 << 20;
+static constexpr size_t GB = 1 << 30;
+
+}
diff --git a/src/seastar/include/seastar/core/vector-data-sink.hh b/src/seastar/include/seastar/core/vector-data-sink.hh
new file mode 100644
index 000000000..4a39103ff
--- /dev/null
+++ b/src/seastar/include/seastar/core/vector-data-sink.hh
@@ -0,0 +1,47 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2014 Cloudius Systems
+ */
+
+#pragma once
+
+#include <seastar/core/iostream.hh>
+#include <vector>
+
+namespace seastar {
+
+class vector_data_sink final : public data_sink_impl {
+public:
+ using vector_type = std::vector<net::packet>;
+private:
+ vector_type& _v;
+public:
+ vector_data_sink(vector_type& v) : _v(v) {}
+
+ virtual future<> put(net::packet p) override {
+ _v.push_back(std::move(p));
+ return make_ready_future<>();
+ }
+
+ virtual future<> close() override {
+ // TODO: close on local side
+ return make_ready_future<>();
+ }
+};
+
+}
diff --git a/src/seastar/include/seastar/core/weak_ptr.hh b/src/seastar/include/seastar/core/weak_ptr.hh
new file mode 100644
index 000000000..884ddb13e
--- /dev/null
+++ b/src/seastar/include/seastar/core/weak_ptr.hh
@@ -0,0 +1,120 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 ScyllaDB
+ */
+
+#pragma once
+
+#include <boost/intrusive/list.hpp>
+
+namespace seastar {
+
+/// A non-owning reference to an object.
+///
+/// weak_ptr allows one to keep a non-owning reference to an object. When the
+/// object is destroyed, it notifies all weak_ptr instances pointing to it.
+/// A weak_ptr instance pointing to a destroyed object is equivalent to a
+/// `nullptr`.
+///
+/// The referenced object must inherit from weakly_referencable.
+/// weak_ptr instances can only be obtained by calling weak_from_this() on
+/// the to-be-referenced object.
+///
+/// \see weakly_referencable
+template<typename T>
+class weak_ptr {
+ template<typename U>
+ friend class weakly_referencable;
+private:
+ using hook_type = boost::intrusive::list_member_hook<boost::intrusive::link_mode<boost::intrusive::auto_unlink>>;
+ hook_type _hook;
+ T* _ptr = nullptr;
+ weak_ptr(T* p) noexcept : _ptr(p) {}
+public:
+ // Note: The default constructor's body is implemented as a no-op
+ // rather than `noexcept = default` due to a bug with gcc 9.3.1
+ // that deletes the constructor since boost::intrusive::list_member_hook
+ // is not default_nothrow_constructible.
+ weak_ptr() noexcept {}
+ weak_ptr(std::nullptr_t) noexcept : weak_ptr() {}
+ weak_ptr(weak_ptr&& o) noexcept
+ : _ptr(o._ptr)
+ {
+ _hook.swap_nodes(o._hook);
+ o._ptr = nullptr;
+ }
+ weak_ptr& operator=(weak_ptr&& o) noexcept {
+ if (this != &o) {
+ this->~weak_ptr();
+ new (this) weak_ptr(std::move(o));
+ }
+ return *this;
+ }
+ explicit operator bool() const noexcept { return _ptr != nullptr; }
+ T* operator->() const noexcept { return _ptr; }
+ T& operator*() const noexcept { return *_ptr; }
+ T* get() const noexcept { return _ptr; }
+ bool operator==(const weak_ptr& o) const noexcept { return _ptr == o._ptr; }
+ bool operator!=(const weak_ptr& o) const noexcept { return _ptr != o._ptr; }
+};
+
+/// Allows obtaining a non-owning reference (weak_ptr) to the object.
+///
+/// A live weak_ptr object doesn't prevent the referenced object from being destroyed.
+///
+/// The underlying pointer held by weak_ptr is valid as long as the referenced object is alive.
+/// When the object dies, all weak_ptr objects associated with it are emptied.
+///
+/// A weak reference is obtained like this:
+///
+/// class X : public weakly_referencable<X> {};
+/// auto x = std::make_unique<X>();
+/// weak_ptr<X> ptr = x->weak_from_this();
+///
+/// The user of weak_ptr can check if it still holds a valid pointer like this:
+///
+/// if (ptr) ptr->do_something();
+///
+template<typename T>
+class weakly_referencable {
+ boost::intrusive::list<weak_ptr<T>,
+ boost::intrusive::member_hook<weak_ptr<T>, typename weak_ptr<T>::hook_type, &weak_ptr<T>::_hook>,
+ boost::intrusive::constant_time_size<false>> _ptr_list;
+public:
+ // Note: The default constructor's body is implemented as a no-op
+ // rather than `noexcept = default` due to a bug with gcc 9.3.1
+ // that deletes the constructor since boost::intrusive::member_hook
+ // is not default_nothrow_constructible.
+ weakly_referencable() noexcept {}
+ weakly_referencable(weakly_referencable&&) = delete; // pointer to this is captured and passed to weak_ptr
+ weakly_referencable(const weakly_referencable&) = delete;
+ ~weakly_referencable() noexcept {
+ _ptr_list.clear_and_dispose([] (weak_ptr<T>* wp) noexcept {
+ wp->_ptr = nullptr;
+ });
+ }
+ weak_ptr<T> weak_from_this() noexcept {
+ weak_ptr<T> ptr(static_cast<T*>(this));
+ _ptr_list.push_back(ptr);
+ return ptr;
+ }
+};
+
+}
+
diff --git a/src/seastar/include/seastar/core/when_all.hh b/src/seastar/include/seastar/core/when_all.hh
new file mode 100644
index 000000000..03315d3eb
--- /dev/null
+++ b/src/seastar/include/seastar/core/when_all.hh
@@ -0,0 +1,562 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (C) 2020 ScyllaDB.
+ */
+
+#pragma once
+
+#include <tuple>
+#include <utility>
+#include <type_traits>
+
+#include <seastar/core/future.hh>
+#include <seastar/util/tuple_utils.hh>
+#include <seastar/util/critical_alloc_section.hh>
+
+namespace seastar {
+
+/// \addtogroup future-util
+/// @{
+
+namespace internal {
+
+template<typename... Futures>
+struct identity_futures_tuple {
+ using future_type = future<std::tuple<Futures...>>;
+ using promise_type = typename future_type::promise_type;
+
+ static void set_promise(promise_type& p, std::tuple<Futures...> futures) {
+ p.set_value(std::move(futures));
+ }
+
+ static future_type make_ready_future(std::tuple<Futures...> futures) noexcept {
+ return seastar::make_ready_future<std::tuple<Futures...>>(std::move(futures));
+ }
+
+ static future_type current_exception_as_future() noexcept {
+ return seastar::current_exception_as_future<std::tuple<Futures...>>();
+ }
+};
+
+class when_all_state_base;
+
+// If the future is ready, return true.
+// If the future is not ready, chain a continuation to it, and return false.
+using when_all_process_element_func = bool (*)(void* future, void* continuation, when_all_state_base* wasb) noexcept;
+
+struct when_all_process_element {
+ when_all_process_element_func func;
+ void* future;
+};
+
+class when_all_state_base {
+ size_t _nr_remain;
+ const when_all_process_element* _processors;
+ void* _continuation;
+public:
+ virtual ~when_all_state_base() {}
+ when_all_state_base(size_t nr_remain, const when_all_process_element* processors, void* continuation)
+ : _nr_remain(nr_remain), _processors(processors), _continuation(continuation) {
+ }
+ virtual task* waiting_task() = 0;
+ void complete_one() noexcept {
+ // We complete in reverse order; if the futures happen to complete
+ // in order, then waiting for the last one will find the rest ready
+ --_nr_remain;
+ while (_nr_remain) {
+ bool ready = process_one(_nr_remain - 1);
+ if (!ready) {
+ return;
+ }
+ --_nr_remain;
+ }
+ if (!_nr_remain) {
+ delete this;
+ }
+ }
+ void do_wait_all() noexcept {
+ ++_nr_remain; // fake pending completion for complete_one()
+ complete_one();
+ }
+ bool process_one(size_t idx) noexcept {
+ auto p = _processors[idx];
+ return p.func(p.future, _continuation, this);
+ }
+};
+
+template <typename Future>
+class when_all_state_component final : public continuation_base_from_future_t<Future> {
+ when_all_state_base* _base;
+ Future* _final_resting_place;
+public:
+ static bool process_element_func(void* future, void* continuation, when_all_state_base* wasb) noexcept {
+ auto f = reinterpret_cast<Future*>(future);
+ if (f->available()) {
+ return true;
+ } else {
+ auto c = new (continuation) when_all_state_component(wasb, f);
+ set_callback(*f, c);
+ return false;
+ }
+ }
+ when_all_state_component(when_all_state_base *base, Future* future) noexcept : _base(base), _final_resting_place(future) {}
+ task* waiting_task() noexcept override { return _base->waiting_task(); }
+ virtual void run_and_dispose() noexcept override {
+ using futurator = futurize<Future>;
+ if (__builtin_expect(this->_state.failed(), false)) {
+ *_final_resting_place = futurator::make_exception_future(std::move(this->_state).get_exception());
+ } else {
+ *_final_resting_place = futurator::from_tuple(std::move(this->_state).get_value());
+ }
+ auto base = _base;
+ this->~when_all_state_component();
+ base->complete_one();
+ }
+};
+
+template<typename ResolvedTupleTransform, typename... Futures>
+class when_all_state : public when_all_state_base {
+ static constexpr size_t nr = sizeof...(Futures);
+ using type = std::tuple<Futures...>;
+ type tuple;
+ // We only schedule one continuation at a time, and store it in _cont.
+ // This way, if some other futures also complete while the future we are
+ // waiting for completes, we won't need to schedule continuations for them.
+ std::aligned_union_t<1, when_all_state_component<Futures>...> _cont;
+ when_all_process_element _processors[nr];
+public:
+ typename ResolvedTupleTransform::promise_type p;
+ when_all_state(Futures&&... t) : when_all_state_base(nr, _processors, &_cont), tuple(std::make_tuple(std::move(t)...)) {
+ init_element_processors(std::make_index_sequence<nr>());
+ }
+ virtual ~when_all_state() {
+ ResolvedTupleTransform::set_promise(p, std::move(tuple));
+ }
+ task* waiting_task() noexcept override {
+ return p.waiting_task();
+ }
+private:
+ template <size_t... Idx>
+ void init_element_processors(std::index_sequence<Idx...>) {
+ auto ignore = {
+ 0,
+ (_processors[Idx] = when_all_process_element{
+ when_all_state_component<std::tuple_element_t<Idx, type>>::process_element_func,
+ &std::get<Idx>(tuple)
+ }, 0)...
+ };
+ (void)ignore;
+ }
+public:
+ static typename ResolvedTupleTransform::future_type wait_all(Futures&&... futures) noexcept {
+ if ((futures.available() && ...)) {
+ return ResolvedTupleTransform::make_ready_future(std::make_tuple(std::move(futures)...));
+ }
+ auto state = [&] () noexcept {
+ memory::scoped_critical_alloc_section _;
+ return new when_all_state(std::move(futures)...);
+ }();
+ auto ret = state->p.get_future();
+ state->do_wait_all();
+ return ret;
+ }
+};
+
+} // namespace internal
+
+/// \cond internal
+SEASTAR_CONCEPT(
+
+namespace impl {
+
+// Want: folds
+
+template <typename T>
+struct is_tuple_of_futures : std::false_type {
+};
+
+template <>
+struct is_tuple_of_futures<std::tuple<>> : std::true_type {
+};
+
+template <typename... T, typename... Rest>
+struct is_tuple_of_futures<std::tuple<future<T...>, Rest...>> : is_tuple_of_futures<std::tuple<Rest...>> {
+};
+
+}
+
+template <typename... Futs>
+concept AllAreFutures = impl::is_tuple_of_futures<std::tuple<Futs...>>::value;
+
+)
+
+template<typename Fut, std::enable_if_t<is_future<Fut>::value, int> = 0>
+auto futurize_invoke_if_func(Fut&& fut) noexcept {
+ return std::forward<Fut>(fut);
+}
+
+template<typename Func, std::enable_if_t<!is_future<Func>::value, int> = 0>
+auto futurize_invoke_if_func(Func&& func) noexcept {
+ return futurize_invoke(std::forward<Func>(func));
+}
+/// \endcond
+
+namespace internal {
+
+template <typename... Futs>
+SEASTAR_CONCEPT( requires seastar::AllAreFutures<Futs...> )
+inline
+future<std::tuple<Futs...>>
+when_all_impl(Futs&&... futs) noexcept {
+ using state = when_all_state<identity_futures_tuple<Futs...>, Futs...>;
+ return state::wait_all(std::forward<Futs>(futs)...);
+}
+
+} // namespace internal
+
+/// Wait for many futures to complete, capturing possible errors (variadic version).
+///
+/// Each future can be passed directly, or a function that returns a
+/// future can be given instead.
+///
+/// If any function throws, an exceptional future is created for it.
+///
+/// Returns a tuple of futures so individual values or exceptions can be
+/// examined.
+///
+/// \param fut_or_funcs futures or functions that return futures
+/// \return an \c std::tuple<> of all futures returned; when ready,
+/// all contained futures will be ready as well.
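+///
+/// Example (an illustrative sketch, not part of the original documentation;
+/// `slow_op()` and `consume()` are hypothetical helpers, with `slow_op()`
+/// returning `future<int>`):
+///
+///     return when_all(slow_op(), make_ready_future<>()).then(
+///             [] (std::tuple<future<int>, future<>> results) {
+///         auto& [f1, f2] = results;
+///         f2.ignore_ready_future();
+///         if (f1.failed()) {
+///             f1.ignore_ready_future();
+///         } else {
+///             consume(f1.get0());
+///         }
+///     });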
+template <typename... FutOrFuncs>
+inline auto when_all(FutOrFuncs&&... fut_or_funcs) noexcept {
+ return internal::when_all_impl(futurize_invoke_if_func(std::forward<FutOrFuncs>(fut_or_funcs))...);
+}
+
+namespace internal {
+
+template<typename Future>
+struct identity_futures_vector {
+ using future_type = future<std::vector<Future>>;
+ static future_type run(std::vector<Future> futures) noexcept {
+ return make_ready_future<std::vector<Future>>(std::move(futures));
+ }
+ static future_type current_exception_as_future() noexcept {
+ return seastar::current_exception_as_future<std::vector<Future>>();
+ }
+};
+
+// Internal function for when_all().
+template <typename ResolvedVectorTransform, typename Future>
+inline
+typename ResolvedVectorTransform::future_type
+complete_when_all(std::vector<Future>&& futures, typename std::vector<Future>::iterator pos) noexcept {
+ // If any futures are already ready, skip them.
+ while (pos != futures.end() && pos->available()) {
+ ++pos;
+ }
+ // Done?
+ if (pos == futures.end()) {
+ return ResolvedVectorTransform::run(std::move(futures));
+ }
+ // Wait for unready future, store, and continue.
+ return pos->then_wrapped([futures = std::move(futures), pos] (auto fut) mutable {
+ *pos++ = std::move(fut);
+ return complete_when_all<ResolvedVectorTransform>(std::move(futures), pos);
+ });
+}
+
+template<typename ResolvedVectorTransform, typename FutureIterator>
+inline auto
+do_when_all(FutureIterator begin, FutureIterator end) noexcept {
+ using itraits = std::iterator_traits<FutureIterator>;
+ auto make_values_vector = [] (size_t size) noexcept {
+ memory::scoped_critical_alloc_section _;
+ std::vector<typename itraits::value_type> ret;
+ ret.reserve(size);
+ return ret;
+ };
+ std::vector<typename itraits::value_type> ret =
+ make_values_vector(iterator_range_estimate_vector_capacity(begin, end, typename itraits::iterator_category()));
+ // Important to invoke the *begin here, in case it's a function iterator,
+ // so we launch all computation in parallel.
+ std::move(begin, end, std::back_inserter(ret));
+ return complete_when_all<ResolvedVectorTransform>(std::move(ret), ret.begin());
+}
+
+} // namespace internal
+
+/// Wait for many futures to complete, capturing possible errors (iterator version).
+///
+/// Given a range of futures as input, wait for all of them
+/// to resolve (either successfully or with an exception), and return
+/// them as a \c std::vector so individual values or exceptions can be examined.
+///
+/// \param begin an \c InputIterator designating the beginning of the range of futures
+/// \param end an \c InputIterator designating the end of the range of futures
+/// \return an \c std::vector<> of all the futures in the input; when
+/// ready, all contained futures will be ready as well.
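+///
+/// Example (an illustrative sketch; `make_requests()` and `consume()` are
+/// hypothetical helpers, with `make_requests()` returning a
+/// `std::vector<future<int>>`):
+///
+///     std::vector<future<int>> futs = make_requests();
+///     return when_all(futs.begin(), futs.end()).then(
+///             [] (std::vector<future<int>> results) {
+///         for (auto& f : results) {
+///             if (f.failed()) {
+///                 f.ignore_ready_future();
+///             } else {
+///                 consume(f.get0());
+///             }
+///         }
+///     });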
+template <typename FutureIterator>
+SEASTAR_CONCEPT( requires requires (FutureIterator i) { { *i++ }; requires is_future<std::remove_reference_t<decltype(*i)>>::value; } )
+inline
+future<std::vector<typename std::iterator_traits<FutureIterator>::value_type>>
+when_all(FutureIterator begin, FutureIterator end) noexcept {
+ namespace si = internal;
+ using itraits = std::iterator_traits<FutureIterator>;
+ using result_transform = si::identity_futures_vector<typename itraits::value_type>;
+ try {
+ return si::do_when_all<result_transform>(std::move(begin), std::move(end));
+ } catch (...) {
+ return result_transform::current_exception_as_future();
+ }
+}
+
+namespace internal {
+
+template<typename Future>
+struct future_has_value {
+ enum {
+ value = !std::is_same<std::decay_t<Future>, future<>>::value
+ };
+};
+
+template<typename Tuple>
+struct tuple_to_future;
+
+template<typename... Elements>
+struct tuple_to_future<std::tuple<Elements...>> {
+#if SEASTAR_API_LEVEL < 4
+ using value_type = when_all_succeed_tuple<Elements...>;
+#else
+ using value_type = std::tuple<Elements...>;
+#endif
+ using type = future<value_type>;
+ using promise_type = promise<value_type>;
+
+ // Elements... all come from futures, so we know they are nothrow move
+ // constructible. `future` also has a static assertion to that effect.
+
+ static auto make_ready(std::tuple<Elements...> t) noexcept {
+ return make_ready_future<value_type>(value_type(std::move(t)));
+ }
+
+ static auto make_failed(std::exception_ptr excp) noexcept {
+ return seastar::make_exception_future<value_type>(std::move(excp));
+ }
+};
+
+#if SEASTAR_API_LEVEL < 4
+
+template<typename Element>
+struct tuple_to_future<std::tuple<Element>> {
+ using type = future<Element>;
+ using promise_type = promise<Element>;
+
+ // Element comes from a future, so we know it is nothrow move
+ // constructible. `future` also has a static assertion to that effect.
+
+ static auto make_ready(std::tuple<Element> t) noexcept {
+ return make_ready_future<Element>(std::get<0>(std::move(t)));
+ }
+
+ static auto make_failed(std::exception_ptr excp) noexcept {
+ return seastar::make_exception_future<Element>(std::move(excp));
+ }
+};
+
+template<>
+struct tuple_to_future<std::tuple<>> {
+ using type = future<>;
+ using promise_type = promise<>;
+
+ static auto make_ready(std::tuple<> t) noexcept {
+ return make_ready_future<>();
+ }
+
+ static auto make_failed(std::exception_ptr excp) noexcept {
+ return seastar::make_exception_future<>(std::move(excp));
+ }
+};
+
+#endif
+
+template<typename... Futures>
+class extract_values_from_futures_tuple {
+ static auto transform(std::tuple<Futures...> futures) noexcept {
+ auto prepare_result = [] (auto futures) noexcept {
+ auto fs = tuple_filter_by_type<internal::future_has_value>(std::move(futures));
+ return tuple_map(std::move(fs), [] (auto&& e) {
+ return e.get0();
+ });
+ };
+
+ using tuple_futurizer = internal::tuple_to_future<decltype(prepare_result(std::move(futures)))>;
+
+ std::exception_ptr excp;
+ tuple_for_each(futures, [&excp] (auto& f) {
+ if (!excp) {
+ if (f.failed()) {
+ excp = f.get_exception();
+ }
+ } else {
+ f.ignore_ready_future();
+ }
+ });
+ if (excp) {
+ return tuple_futurizer::make_failed(std::move(excp));
+ }
+
+ return tuple_futurizer::make_ready(prepare_result(std::move(futures)));
+ }
+public:
+ using future_type = decltype(transform(std::declval<std::tuple<Futures...>>()));
+ using promise_type = typename future_type::promise_type;
+
+ static void set_promise(promise_type& p, std::tuple<Futures...> tuple) {
+ transform(std::move(tuple)).forward_to(std::move(p));
+ }
+
+ static future_type make_ready_future(std::tuple<Futures...> tuple) noexcept {
+ return transform(std::move(tuple));
+ }
+
+ static future_type current_exception_as_future() noexcept {
+ future_type (*type_deduct)() = current_exception_as_future;
+ return type_deduct();
+ }
+};
+
+template<typename Future>
+struct extract_values_from_futures_vector {
+ using value_type = decltype(untuple(std::declval<typename Future::tuple_type>()));
+
+ using future_type = future<std::vector<value_type>>;
+
+ static future_type run(std::vector<Future> futures) noexcept {
+ auto make_values_vector = [] (size_t size) noexcept {
+ memory::scoped_critical_alloc_section _;
+ std::vector<value_type> values;
+ values.reserve(size);
+ return values;
+ };
+ std::vector<value_type> values = make_values_vector(futures.size());
+
+ std::exception_ptr excp;
+ for (auto&& f : futures) {
+ if (!excp) {
+ if (f.failed()) {
+ excp = f.get_exception();
+ } else {
+ values.emplace_back(f.get0());
+ }
+ } else {
+ f.ignore_ready_future();
+ }
+ }
+ if (excp) {
+ return seastar::make_exception_future<std::vector<value_type>>(std::move(excp));
+ }
+ return make_ready_future<std::vector<value_type>>(std::move(values));
+ }
+
+ static future_type current_exception_as_future() noexcept {
+ return seastar::current_exception_as_future<std::vector<value_type>>();
+ }
+};
+
+template<>
+struct extract_values_from_futures_vector<future<>> {
+ using future_type = future<>;
+
+ static future_type run(std::vector<future<>> futures) noexcept {
+ std::exception_ptr excp;
+ for (auto&& f : futures) {
+ if (!excp) {
+ if (f.failed()) {
+ excp = f.get_exception();
+ }
+ } else {
+ f.ignore_ready_future();
+ }
+ }
+ if (excp) {
+ return seastar::make_exception_future<>(std::move(excp));
+ }
+ return make_ready_future<>();
+ }
+
+ static future_type current_exception_as_future() noexcept {
+ return seastar::current_exception_as_future<>();
+ }
+};
+
+template<typename... Futures>
+SEASTAR_CONCEPT( requires seastar::AllAreFutures<Futures...> )
+inline auto when_all_succeed_impl(Futures&&... futures) noexcept {
+ using state = when_all_state<extract_values_from_futures_tuple<Futures...>, Futures...>;
+ return state::wait_all(std::forward<Futures>(futures)...);
+}
+
+} // namespace internal
+
+/// Wait for many futures to complete (variadic version).
+///
+/// Each future can be passed directly, or a function that returns a
+/// future can be given instead.
+///
+/// If any function throws, or if the returned future fails, one of
+/// the exceptions is returned by this function as a failed future.
+///
+/// \param fut_or_funcs futures or functions that return futures
+/// \return future containing values of futures returned by funcs
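+///
+/// Example (an illustrative sketch; how the values are delivered to the
+/// continuation depends on SEASTAR_API_LEVEL, so this assumes a level where
+/// the result is a tuple that can be unpacked with then_unpack()):
+///
+///     return when_all_succeed(make_ready_future<int>(2),
+///                             make_ready_future<int>(3))
+///             .then_unpack([] (int a, int b) {
+///         return make_ready_future<int>(a * b);
+///     });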
+template <typename... FutOrFuncs>
+inline auto when_all_succeed(FutOrFuncs&&... fut_or_funcs) noexcept {
+ return internal::when_all_succeed_impl(futurize_invoke_if_func(std::forward<FutOrFuncs>(fut_or_funcs))...);
+}
+
+/// Wait for many futures to complete (iterator version).
+///
+/// Given a range of futures as input, wait for all of them
+/// to resolve, and return a future containing a vector of values of the
+/// original futures.
+/// In case any of the given futures fails, one of the exceptions is returned
+/// by this function as a failed future.
+/// \param begin an \c InputIterator designating the beginning of the range of futures
+/// \param end an \c InputIterator designating the end of the range of futures
+/// \return an \c std::vector<> of all the values in the input
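+///
+/// Example (an illustrative sketch; `make_requests()` is a hypothetical helper
+/// returning a `std::vector<future<int>>`):
+///
+///     std::vector<future<int>> futs = make_requests();
+///     return when_all_succeed(futs.begin(), futs.end()).then(
+///             [] (std::vector<int> values) {
+///         int sum = 0;
+///         for (int v : values) {
+///             sum += v;
+///         }
+///         return sum;
+///     });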
+template <typename FutureIterator, typename = typename std::iterator_traits<FutureIterator>::value_type>
+SEASTAR_CONCEPT( requires requires (FutureIterator i) {
+ *i++;
+ { i != i } -> std::convertible_to<bool>;
+ requires is_future<std::remove_reference_t<decltype(*i)>>::value;
+} )
+inline auto
+when_all_succeed(FutureIterator begin, FutureIterator end) noexcept {
+ using itraits = std::iterator_traits<FutureIterator>;
+ using result_transform = internal::extract_values_from_futures_vector<typename itraits::value_type>;
+ try {
+ return internal::do_when_all<result_transform>(std::move(begin), std::move(end));
+ } catch (...) {
+ return result_transform::current_exception_as_future();
+ }
+}
+
+/// @}
+
+} // namespace seastar
diff --git a/src/seastar/include/seastar/core/with_scheduling_group.hh b/src/seastar/include/seastar/core/with_scheduling_group.hh
new file mode 100644
index 000000000..6b652f41f
--- /dev/null
+++ b/src/seastar/include/seastar/core/with_scheduling_group.hh
@@ -0,0 +1,77 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (C) 2020 ScyllaDB.
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/make_task.hh>
+
+namespace seastar {
+
+/// \addtogroup future-util
+/// @{
+
+namespace internal {
+
+template <typename Func>
+SEASTAR_CONCEPT( requires std::is_nothrow_move_constructible_v<Func> )
+auto
+schedule_in_group(scheduling_group sg, Func func) noexcept {
+ static_assert(std::is_nothrow_move_constructible_v<Func>);
+ auto tsk = make_task(sg, std::move(func));
+ schedule(tsk);
+ return tsk->get_future();
+}
+
+
+}
+
+/// \brief run a callable (with some arbitrary arguments) in a scheduling group
+///
+/// If the conditions are suitable (see scheduling_group::may_run_immediately()),
+/// then the function is run immediately. Otherwise, the function is queued to run
+/// when its scheduling group next runs.
+///
+/// \param sg scheduling group that controls execution time for the function
+/// \param func function to run; must be movable or copyable
+/// \param args arguments to the function; may be copied or moved, so use \c std::ref()
+/// to force passing references
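+///
+/// Example (an illustrative sketch; `background_work()` is a hypothetical
+/// helper returning `future<>`, and `sg` a scheduling group created elsewhere):
+///
+///     future<> run_in_background_group(scheduling_group sg) {
+///         return with_scheduling_group(sg, [] {
+///             return background_work();
+///         });
+///     }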
+template <typename Func, typename... Args>
+SEASTAR_CONCEPT( requires std::is_nothrow_move_constructible_v<Func> )
+inline
+auto
+with_scheduling_group(scheduling_group sg, Func func, Args&&... args) noexcept {
+ static_assert(std::is_nothrow_move_constructible_v<Func>);
+ using return_type = decltype(func(std::forward<Args>(args)...));
+ using futurator = futurize<return_type>;
+ if (sg.active()) {
+ return futurator::invoke(func, std::forward<Args>(args)...);
+ } else {
+ return internal::schedule_in_group(sg, [func = std::move(func), args = std::make_tuple(std::forward<Args>(args)...)] () mutable {
+ return futurator::apply(func, std::move(args));
+ });
+ }
+}
+
+/// @}
+
+} // namespace seastar
diff --git a/src/seastar/include/seastar/core/with_timeout.hh b/src/seastar/include/seastar/core/with_timeout.hh
new file mode 100644
index 000000000..cbe4758dd
--- /dev/null
+++ b/src/seastar/include/seastar/core/with_timeout.hh
@@ -0,0 +1,73 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (C) 2020 ScyllaDB.
+ */
+
+
+#pragma once
+
+#include <chrono>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/timed_out_error.hh>
+#include <seastar/core/timer.hh>
+
+namespace seastar {
+
+/// \addtogroup future-util
+/// @{
+
+/// \brief Wait for either a future, or a timeout, whichever comes first
+///
+/// When the timeout is reached, the returned future resolves with an exception
+/// produced by ExceptionFactory::timeout(). By default this is a \ref timed_out_error exception.
+///
+/// Note that timing out doesn't cancel any tasks associated with the original future.
+/// It also doesn't cancel the callback registered on it.
+///
+/// \param f future to wait for
+/// \param timeout time point after which the returned future should be failed
+///
+/// \return a future which will be either resolved with f or a timeout exception
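+///
+/// Example (an illustrative sketch; `fetch_value()` is a hypothetical helper
+/// returning `future<int>`):
+///
+///     using namespace std::chrono_literals;
+///     future<int> f = with_timeout(std::chrono::steady_clock::now() + 5s,
+///                                  fetch_value());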
+template<typename ExceptionFactory = default_timeout_exception_factory, typename Clock, typename Duration, typename... T>
+future<T...> with_timeout(std::chrono::time_point<Clock, Duration> timeout, future<T...> f) {
+ if (f.available()) {
+ return f;
+ }
+ auto pr = std::make_unique<promise<T...>>();
+ auto result = pr->get_future();
+ timer<Clock> timer([&pr = *pr] {
+ pr.set_exception(std::make_exception_ptr(ExceptionFactory::timeout()));
+ });
+ timer.arm(timeout);
+ // Future is returned indirectly.
+ (void)f.then_wrapped([pr = std::move(pr), timer = std::move(timer)] (auto&& f) mutable {
+ if (timer.cancel()) {
+ f.forward_to(std::move(*pr));
+ } else {
+ f.ignore_ready_future();
+ }
+ });
+ return result;
+}
+
+/// @}
+
+} // namespace seastar
diff --git a/src/seastar/include/seastar/http/api_docs.hh b/src/seastar/include/seastar/http/api_docs.hh
new file mode 100644
index 000000000..a33b5b5d7
--- /dev/null
+++ b/src/seastar/include/seastar/http/api_docs.hh
@@ -0,0 +1,340 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+#pragma once
+#include <seastar/json/json_elements.hh>
+#include <seastar/json/formatter.hh>
+#include <seastar/http/routes.hh>
+#include <seastar/http/transformers.hh>
+#include <string>
+#include <seastar/util/noncopyable_function.hh>
+
+namespace seastar {
+
+namespace httpd {
+
+struct api_doc : public json::json_base {
+ json::json_element<std::string> path;
+ json::json_element<std::string> description;
+
+ void register_params() {
+ add(&path, "path");
+ add(&description, "description");
+
+ }
+ api_doc() {
+ register_params();
+ }
+ api_doc(const api_doc & e)
+ : json::json_base()
+ {
+ register_params();
+ path = e.path;
+ description = e.description;
+ }
+ template<class T>
+ api_doc& operator=(const T& e) {
+ path = e.path;
+ description = e.description;
+ return *this;
+ }
+ api_doc& operator=(const api_doc& e) {
+ path = e.path;
+ description = e.description;
+ return *this;
+ }
+};
+
+struct api_docs : public json::json_base {
+ json::json_element<std::string> apiVersion;
+ json::json_element<std::string> swaggerVersion;
+ json::json_list<api_doc> apis;
+
+ void register_params() {
+ add(&apiVersion, "apiVersion");
+ add(&swaggerVersion, "swaggerVersion");
+ add(&apis, "apis");
+
+ }
+ api_docs() {
+ apiVersion = "0.0.1";
+ swaggerVersion = "1.2";
+ register_params();
+ }
+ api_docs(const api_docs & e)
+ : json::json_base()
+ {
+ apiVersion = "0.0.1";
+ swaggerVersion = "1.2";
+ register_params();
+ }
+ template<class T>
+ api_docs& operator=(const T& e) {
+ apis = e.apis;
+ return *this;
+ }
+ api_docs& operator=(const api_docs& e) {
+ apis = e.apis;
+ return *this;
+ }
+};
+
+class api_registry_base : public handler_base {
+protected:
+ sstring _base_path;
+ sstring _file_directory;
+ routes& _routes;
+
+public:
+ api_registry_base(routes& routes, const sstring& file_directory,
+ const sstring& base_path)
+ : _base_path(base_path), _file_directory(file_directory), _routes(
+ routes) {
+ }
+
+ void set_route(handler_base* h) {
+ _routes.put(GET, _base_path, h);
+ }
+ virtual ~api_registry_base() = default;
+};
+
+class api_registry : public api_registry_base {
+ api_docs _docs;
+public:
+ api_registry(routes& routes, const sstring& file_directory,
+ const sstring& base_path)
+ : api_registry_base(routes, file_directory, base_path) {
+ set_route(this);
+ }
+
+ future<std::unique_ptr<reply>> handle(const sstring& path,
+ std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
+ rep->_content = json::formatter::to_json(_docs);
+ rep->done("json");
+ return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
+ }
+
+ void reg(const sstring& api, const sstring& description,
+ const sstring& alternative_path = "") {
+ api_doc doc;
+ doc.description = description;
+ doc.path = "/" + api;
+ _docs.apis.push(doc);
+ sstring path =
+ (alternative_path == "") ?
+ _file_directory + api + ".json" : alternative_path;
+ file_handler* index = new file_handler(path,
+ new content_replace("json"));
+ _routes.put(GET, _base_path + "/" + api, index);
+ }
+};
+
+class api_registry_builder_base {
+protected:
+ sstring _file_directory;
+ sstring _base_path;
+ static const sstring DEFAULT_DIR;
+ static const sstring DEFAULT_PATH;
+public:
+ api_registry_builder_base(const sstring& file_directory = DEFAULT_DIR,
+ const sstring& base_path = DEFAULT_PATH)
+ : _file_directory(file_directory), _base_path(base_path) {
+ }
+};
+
+class api_registry_builder : public api_registry_builder_base {
+public:
+ api_registry_builder(const sstring& file_directory = DEFAULT_DIR,
+ const sstring& base_path = DEFAULT_PATH)
+ : api_registry_builder_base(file_directory, base_path) {
+ }
+
+ void set_api_doc(routes& r) {
+ new api_registry(r, _file_directory, _base_path);
+ }
+
+ void register_function(routes& r, const sstring& api,
+ const sstring& description, const sstring& alternative_path = "") {
+ auto h = r.get_exact_match(GET, _base_path);
+ if (h) {
+ // if a handler is found, it was added there by the api_registry_builder
+ // with the set_api_doc method, so we know its type
+ static_cast<api_registry*>(h)->reg(api, description, alternative_path);
+ }
+ }
+};
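+
+/*
+ * Illustrative usage sketch (not part of the original header): register the
+ * api-doc handler on a routes object and then describe individual APIs.
+ * `r` is assumed to be an existing httpd::routes instance and the directory
+ * path is a hypothetical example.
+ *
+ *     api_registry_builder rb("/usr/share/my-app/api/");
+ *     rb.set_api_doc(r);
+ *     rb.register_function(r, "demo", "demo API");
+ */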
+
+using doc_entry = noncopyable_function<future<>(output_stream<char>&)>;
+
+/*!
+ * \brief a helper function that creates a reader from a file
+ */
+
+doc_entry get_file_reader(sstring file_name);
+
+/*!
+ * \brief An api doc that supports swagger version 2.0
+ *
+ * The result is a unified JSON file with the swagger definitions.
+ *
+ * The file content is a concatenation of the doc_entry objects, in the order
+ * in which they were registered.
+ *
+ * Definitions will be added under the definition section
+ *
+ * typical usage:
+ *
+ * First entry:
+ *
+ {
+ "swagger": "2.0",
+ "host": "localhost:10000",
+ "basePath": "/v2",
+ "paths": {
+
+ * entry:
+ "/config/{id}": {
+ "get": {
+ "description": "Return a config value",
+ "operationId": "findConfigId",
+ "produces": [
+ "application/json"
+ ],
+ }
+ }
+ *
+ * Closing the entries:
+ },
+
+ "definitions": {
+ .....
+
+ .....
+ }
+}
+ *
+ */
+class api_docs_20 {
+ std::vector<doc_entry> _apis;
+ content_replace _transform;
+ std::vector<doc_entry> _definitions;
+
+public:
+ future<> write(output_stream<char>&&, std::unique_ptr<request> req);
+
+ void add_api(doc_entry&& f) {
+ _apis.emplace_back(std::move(f));
+ }
+
+ void add_definition(doc_entry&& f) {
+ _definitions.emplace_back(std::move(f));
+ }
+};
+
+class api_registry_20 : public api_registry_base {
+ api_docs_20 _docs;
+public:
+ api_registry_20(routes& routes, const sstring& file_directory,
+ const sstring& base_path)
+ : api_registry_base(routes, file_directory, base_path) {
+ set_route(this);
+ }
+
+ future<std::unique_ptr<reply>> handle(const sstring& path,
+ std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
+ rep->write_body("json", [this, req = std::move(req)] (output_stream<char>&& os) mutable {
+ return _docs.write(std::move(os), std::move(req));
+ });
+ return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
+ }
+
+ virtual void reg(doc_entry&& f) {
+ _docs.add_api(std::move(f));
+ }
+
+ virtual void add_definition(doc_entry&& f) {
+ _docs.add_definition(std::move(f));
+ }
+};
+
+class api_registry_builder20 : public api_registry_builder_base {
+ api_registry_20* get_register_base(routes& r) {
+ auto h = r.get_exact_match(GET, _base_path);
+ if (h) {
+ // if a handler is found, it was added there by the api_registry_builder
+ // with the set_api_doc method, so we know its type
+ return static_cast<api_registry_20*>(h);
+ }
+ return nullptr;
+ }
+
+public:
+ api_registry_builder20(const sstring& file_directory = DEFAULT_DIR,
+ const sstring& base_path = DEFAULT_PATH)
+ : api_registry_builder_base(file_directory, base_path) {
+ }
+
+ void set_api_doc(routes& r) {
+ new api_registry_20(r, _file_directory, _base_path);
+ }
+
+ /*!
+ * \brief register a doc_entry
+ * This doc_entry can be used either to take the definitions from a file
+ * or to generate them dynamically.
+ */
+ void register_function(routes& r, doc_entry&& f) {
+ auto h = get_register_base(r);
+ if (h) {
+ h->reg(std::move(f));
+ }
+ }
+ /*!
+ * \brief register an API
+ */
+ void register_api_file(routes& r, const sstring& api) {
+ register_function(r, get_file_reader(_file_directory + "/" + api + ".json"));
+ }
+
+
+ /*!
+ * Add a footer doc_entry
+ */
+ void add_definition(routes& r, doc_entry&& f) {
+ auto h = get_register_base(r);
+ if (h) {
+ h->add_definition(std::move(f));
+ }
+
+ }
+
+ /*!
+ * Add a definition file
+ */
+ void add_definitions_file(routes& r, const sstring& file) {
+ add_definition(r, get_file_reader(_file_directory + file + ".def.json" ));
+ }
+
+};
+
+}
+
+}
diff --git a/src/seastar/include/seastar/http/common.hh b/src/seastar/include/seastar/http/common.hh
new file mode 100644
index 000000000..09dc0d8eb
--- /dev/null
+++ b/src/seastar/include/seastar/http/common.hh
@@ -0,0 +1,74 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+#pragma once
+
+#include <unordered_map>
+#include <seastar/core/sstring.hh>
+
+namespace seastar {
+
+namespace httpd {
+
+
+class parameters {
+ std::unordered_map<sstring, sstring> params;
+public:
+ const sstring& path(const sstring& key) const {
+ return params.at(key);
+ }
+
+ sstring operator[](const sstring& key) const {
+ return params.at(key).substr(1);
+ }
+
+ const sstring& at(const sstring& key) const {
+ return path(key);
+ }
+
+ bool exists(const sstring& key) const {
+ return params.find(key) != params.end();
+ }
+
+ void set(const sstring& key, const sstring& value) {
+ params[key] = value;
+ }
+
+ void clear() {
+ params.clear();
+ }
+
+};
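+
+/*
+ * Illustrative sketch (not part of the original header): path parameters are
+ * stored with their leading '/', so operator[] strips it while at()/path()
+ * return the raw stored value.
+ *
+ *     parameters p;
+ *     p.set("id", "/42");
+ *     // p["id"] == "42", p.at("id") == "/42"
+ */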
+
+enum operation_type {
+ GET, POST, PUT, DELETE, HEAD, OPTIONS, TRACE, CONNECT, NUM_OPERATION
+};
+
+/**
+ * Translate the string command to operation type
+ * @param type the string "GET" or "POST"
+ * @return the operation_type
+ */
+operation_type str2type(const sstring& type);
+
+}
+
+}
diff --git a/src/seastar/include/seastar/http/exception.hh b/src/seastar/include/seastar/http/exception.hh
new file mode 100644
index 000000000..eb5708308
--- /dev/null
+++ b/src/seastar/include/seastar/http/exception.hh
@@ -0,0 +1,142 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+#pragma once
+#include <seastar/util/log.hh>
+#include <seastar/http/reply.hh>
+#include <seastar/json/json_elements.hh>
+
+namespace seastar {
+
+namespace httpd {
+
+/**
+ * The base_exception is a base class for all http exceptions.
+ * It contains a message that will be returned as the message content
+ * and a status that will be returned as the status code.
+ */
+class base_exception : public std::exception {
+public:
+ base_exception(const std::string& msg, reply::status_type status)
+ : _msg(msg), _status(status) {
+ }
+
+ virtual const char* what() const throw () {
+ return _msg.c_str();
+ }
+
+ reply::status_type status() const {
+ return _status;
+ }
+
+ virtual const std::string& str() const {
+ return _msg;
+ }
+private:
+ std::string _msg;
+ reply::status_type _status;
+
+};
+
+/**
+ * Throwing this exception will result in a redirect to the given url
+ */
+class redirect_exception : public base_exception {
+public:
+ redirect_exception(const std::string& url)
+ : base_exception("", reply::status_type::moved_permanently), url(
+ url) {
+ }
+ std::string url;
+};
+
+/**
+ * Throwing this exception will result in a 404 not found result
+ */
+class not_found_exception : public base_exception {
+public:
+ not_found_exception(const std::string& msg = "Not found")
+ : base_exception(msg, reply::status_type::not_found) {
+ }
+};
+
+/**
+ * Throwing this exception will result in a 400 bad request result
+ */
+
+class bad_request_exception : public base_exception {
+public:
+ bad_request_exception(const std::string& msg)
+ : base_exception(msg, reply::status_type::bad_request) {
+ }
+};
+
+class bad_param_exception : public bad_request_exception {
+public:
+ bad_param_exception(const std::string& msg)
+ : bad_request_exception(msg) {
+ }
+};
+
+class missing_param_exception : public bad_request_exception {
+public:
+ missing_param_exception(const std::string& param)
+ : bad_request_exception(
+ std::string("Missing mandatory parameter '") + param + "'") {
+ }
+};
+
+class server_error_exception : public base_exception {
+public:
+ server_error_exception(const std::string& msg)
+ : base_exception(msg, reply::status_type::internal_server_error) {
+ }
+};
+
+class json_exception : public json::json_base {
+public:
+ json::json_element<std::string> _msg;
+ json::json_element<int> _code;
+ void register_params() {
+ add(&_msg, "message");
+ add(&_code, "code");
+ }
+
+ json_exception(const base_exception & e) {
+ set(e.str(), e.status());
+ }
+
+ json_exception(std::exception_ptr e) {
+ std::ostringstream exception_description;
+ exception_description << e;
+ set(exception_description.str(), reply::status_type::internal_server_error);
+ }
+private:
+ void set(const std::string& msg, reply::status_type code) {
+ register_params();
+ _msg = msg;
+ _code = (int) code;
+ }
+};
+
+}
+
+}
diff --git a/src/seastar/include/seastar/http/file_handler.hh b/src/seastar/include/seastar/http/file_handler.hh
new file mode 100644
index 000000000..4aff2cf9f
--- /dev/null
+++ b/src/seastar/include/seastar/http/file_handler.hh
@@ -0,0 +1,172 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+#pragma once
+
+#include <seastar/http/handlers.hh>
+#include <seastar/core/iostream.hh>
+
+namespace seastar {
+
+namespace httpd {
+
+/**
+ * This is a base class for file transformers.
+ *
+ * A file transformer adds the ability to modify a file's content before
+ * returning the result; it acts as a factory class for output_stream.
+ *
+ * The transformer decides, according to the file extension, whether a
+ * transformation is needed.
+ *
+ * If a transformation is needed, it creates a new output stream from the given stream.
+ */
+class file_transformer {
+public:
+ /**
+ * Any file transformer should implement this method.
+ * @param req the request
+ * @param extension the file extension originating the content
+ * @return a new output stream to be used when writing the file to the reply
+ */
+ virtual output_stream<char> transform(std::unique_ptr<request> req,
+ const sstring& extension, output_stream<char>&& s) = 0;
+
+ virtual ~file_transformer() = default;
+};
+
+/**
+ * A base class for handlers that interact with files.
+ * Directory and file handlers both share some common logic
+ * with regard to file handling:
+ * they both need to read a file from the disk, optionally transform it,
+ * and return the result, or a "page not found" error.
+ */
+class file_interaction_handler : public handler_base {
+public:
+ file_interaction_handler(file_transformer* p = nullptr)
+ : transformer(p) {
+
+ }
+
+ ~file_interaction_handler();
+
+ /**
+ * Allows setting a transformer to be used with the files returned.
+ * @param t the file transformer to use
+ * @return this
+ */
+ file_interaction_handler* set_transformer(file_transformer* t) {
+ transformer = t;
+ return this;
+ }
+
+ /**
+ * If the url ends without a slash, redirect.
+ * @param req the request
+ * @param rep the reply
+ * @return true on redirect
+ */
+ bool redirect_if_needed(const request& req, reply& rep) const;
+
+ /**
+ * A helper method that returns the file extension.
+ * @param file the file to check
+ * @return the file extension
+ */
+ static sstring get_extension(const sstring& file);
+
+protected:
+
+ /**
+ * Read a file from the disk and return it in the reply.
+ * @param file the full path to a file on the disk
+ * @param req the request
+ * @param rep the reply
+ */
+ future<std::unique_ptr<reply> > read(sstring file,
+ std::unique_ptr<request> req, std::unique_ptr<reply> rep);
+ file_transformer* transformer;
+
+ output_stream<char> get_stream(std::unique_ptr<request> req,
+ const sstring& extension, output_stream<char>&& s);
+};
+
+/**
+ * The directory handler gets a disk path in the
+ * constructor and expects a path parameter in the handle method.
+ * It concatenates the two and returns the file.
+ * E.g. if the path is /usr/mgmt/public and the path
+ * parameter is index.html,
+ * handle will return the content of /usr/mgmt/public/index.html
+ */
+class directory_handler : public file_interaction_handler {
+public:
+
+ /**
+ * The directory handler maps a base path and a path parameter to a file.
+ * @param doc_root the root directory to search the file from.
+ * @param transformer an optional file transformer
+ * For example, if the root is '/usr/mgmt/public' and the path parameter
+ * is '/css/style.css', the file will be '/usr/mgmt/public/css/style.css'.
+ */
+ explicit directory_handler(const sstring& doc_root,
+ file_transformer* transformer = nullptr);
+
+ future<std::unique_ptr<reply>> handle(const sstring& path,
+ std::unique_ptr<request> req, std::unique_ptr<reply> rep) override;
+
+private:
+ sstring doc_root;
+};
+
+/**
+ * The file handler gets a path to a file on the disk
+ * in the constructor.
+ * It will always return the content of that file.
+ */
+class file_handler : public file_interaction_handler {
+public:
+
+ /**
+ * The file handler maps a file to a url
+ * @param file the full path to the file on the disk
+ * @param transformer an optional file transformer
+ * @param force_path check if redirect is needed upon `handle`
+ */
+ explicit file_handler(const sstring& file, file_transformer* transformer =
+ nullptr, bool force_path = true)
+ : file_interaction_handler(transformer), file(file), force_path(
+ force_path) {
+ }
+
+ future<std::unique_ptr<reply>> handle(const sstring& path,
+ std::unique_ptr<request> req, std::unique_ptr<reply> rep) override;
+
+private:
+ sstring file;
+ bool force_path;
+};
+
+}
+
+}
diff --git a/src/seastar/include/seastar/http/function_handlers.hh b/src/seastar/include/seastar/http/function_handlers.hh
new file mode 100644
index 000000000..01fb5e147
--- /dev/null
+++ b/src/seastar/include/seastar/http/function_handlers.hh
@@ -0,0 +1,132 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+#pragma once
+
+#include <seastar/http/handlers.hh>
+#include <functional>
+#include <seastar/json/json_elements.hh>
+
+namespace seastar {
+
+namespace httpd {
+
+/**
+ * A request function is a lambda expression that gets only the request
+ * as its parameter
+ */
+typedef std::function<sstring(const_req req)> request_function;
+
+/**
+ * A handle function is a lambda expression that gets request and reply
+ */
+typedef std::function<sstring(const_req req, reply&)> handle_function;
+
+/**
+ * A json request function is a lambda expression that gets only the request
+ * as its parameter and returns a json response.
+ * Using the json response is done implicitly.
+ */
+typedef std::function<json::json_return_type(const_req req)> json_request_function;
+
+/**
+ * A future_json_function is a function that returns a future json response.
+ * Similar to the json_request_function, using the json response is done
+ * implicitly.
+ */
+typedef std::function<
+ future<json::json_return_type>(std::unique_ptr<request> req)> future_json_function;
+
+typedef std::function<
+ future<std::unique_ptr<reply>>(std::unique_ptr<request> req,
+ std::unique_ptr<reply> rep)> future_handler_function;
+/**
+ * The function handler gets a lambda expression in the constructor.
+ * It will call that expression to get the result.
+ * This is suited for very simple handlers.
+ *
+ */
+class function_handler : public handler_base {
+public:
+
+ function_handler(const handle_function & f_handle, const sstring& type)
+ : _f_handle(
+ [f_handle](std::unique_ptr<request> req, std::unique_ptr<reply> rep) {
+ rep->_content += f_handle(*req.get(), *rep.get());
+ return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
+ }), _type(type) {
+ }
+
+ function_handler(const future_handler_function& f_handle, const sstring& type)
+ : _f_handle(f_handle), _type(type) {
+ }
+
+ function_handler(const request_function & _handle, const sstring& type)
+ : _f_handle(
+ [_handle](std::unique_ptr<request> req, std::unique_ptr<reply> rep) {
+ rep->_content += _handle(*req.get());
+ return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
+ }), _type(type) {
+ }
+
+ function_handler(const json_request_function& _handle)
+ : _f_handle(
+ [_handle](std::unique_ptr<request> req, std::unique_ptr<reply> rep) {
+ json::json_return_type res = _handle(*req.get());
+ rep->_content += res._res;
+ return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
+ }), _type("json") {
+ }
+
+ function_handler(const future_json_function& _handle)
+ : _f_handle(
+ [_handle](std::unique_ptr<request> req, std::unique_ptr<reply> rep) {
+ return _handle(std::move(req)).then([rep = std::move(rep)](json::json_return_type&& res) mutable {
+ if (res._body_writer) {
+ rep->write_body("json", std::move(res._body_writer));
+ } else {
+ rep->_content += res._res;
+
+ }
+ return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
+ });
+ }), _type("json") {
+ }
+
+ function_handler(const function_handler&) = default;
+
+ future<std::unique_ptr<reply>> handle(const sstring& path,
+ std::unique_ptr<request> req, std::unique_ptr<reply> rep) override {
+ return _f_handle(std::move(req), std::move(rep)).then(
+ [this](std::unique_ptr<reply> rep) {
+ rep->done(_type);
+ return make_ready_future<std::unique_ptr<reply>>(std::move(rep));
+ });
+ }
+
+protected:
+ future_handler_function _f_handle;
+ sstring _type;
+};
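+
+/*
+ * Illustrative usage sketch (not part of the original header): registering a
+ * trivial text handler on a routes object `r` created elsewhere; the "/hello"
+ * path is a hypothetical example.
+ *
+ *     r.put(GET, "/hello", new function_handler(
+ *             [] (const_req req) { return sstring("hello world"); }, "txt"));
+ */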
+
+}
+
+}
diff --git a/src/seastar/include/seastar/http/handlers.hh b/src/seastar/include/seastar/http/handlers.hh
new file mode 100644
index 000000000..5465dde29
--- /dev/null
+++ b/src/seastar/include/seastar/http/handlers.hh
@@ -0,0 +1,73 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+#pragma once
+
+#include <seastar/http/request.hh>
+#include <seastar/http/common.hh>
+#include <seastar/http/reply.hh>
+
+#include <unordered_map>
+
+namespace seastar {
+
+namespace httpd {
+
+typedef const httpd::request& const_req;
+
+/**
+ * Handlers hold the logic for serving an incoming request.
+ * All handlers inherit from the handler_base class and
+ * implement the handle method.
+ *
+ */
+class handler_base {
+public:
+ /**
+ * All handlers should implement this method.
+ * It fills the reply according to the request.
+ * @param path the url path used in this call
+ * @param req the original request
+ * @param rep the reply
+ */
+ virtual future<std::unique_ptr<reply> > handle(const sstring& path,
+ std::unique_ptr<request> req, std::unique_ptr<reply> rep) = 0;
+
+ virtual ~handler_base() = default;
+
+ /**
+ * Add a mandatory parameter
+ * @param param a parameter name
+ * @return a reference to the handler
+ */
+ handler_base& mandatory(const sstring& param) {
+ _mandatory_param.push_back(param);
+ return *this;
+ }
+
+ std::vector<sstring> _mandatory_param;
+
+};
+
+}
+
+}
+
diff --git a/src/seastar/include/seastar/http/httpd.hh b/src/seastar/include/seastar/http/httpd.hh
new file mode 100644
index 000000000..3c4403036
--- /dev/null
+++ b/src/seastar/include/seastar/http/httpd.hh
@@ -0,0 +1,243 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+#pragma once
+
+#include <seastar/http/request_parser.hh>
+#include <seastar/http/request.hh>
+#include <seastar/core/seastar.hh>
+#include <seastar/core/sstring.hh>
+#include <seastar/core/app-template.hh>
+#include <seastar/core/circular_buffer.hh>
+#include <seastar/core/distributed.hh>
+#include <seastar/core/queue.hh>
+#include <seastar/core/gate.hh>
+#include <seastar/core/metrics_registration.hh>
+#include <seastar/util/std-compat.hh>
+#include <iostream>
+#include <algorithm>
+#include <unordered_map>
+#include <queue>
+#include <bitset>
+#include <limits>
+#include <cctype>
+#include <vector>
+#include <boost/intrusive/list.hpp>
+#include <seastar/http/routes.hh>
+#include <seastar/net/tls.hh>
+#include <seastar/core/shared_ptr.hh>
+
+namespace seastar {
+
+namespace httpd {
+
+class http_server;
+class http_stats;
+struct reply;
+
+using namespace std::chrono_literals;
+
+class http_stats {
+ metrics::metric_groups _metric_groups;
+public:
+ http_stats(http_server& server, const sstring& name);
+};
+
+class connection : public boost::intrusive::list_base_hook<> {
+ http_server& _server;
+ connected_socket _fd;
+ input_stream<char> _read_buf;
+ output_stream<char> _write_buf;
+ static constexpr size_t limit = 4096;
+ using tmp_buf = temporary_buffer<char>;
+ http_request_parser _parser;
+ std::unique_ptr<request> _req;
+ std::unique_ptr<reply> _resp;
+ // null element marks eof
+ queue<std::unique_ptr<reply>> _replies { 10 };
+ bool _done = false;
+public:
+ connection(http_server& server, connected_socket&& fd,
+ socket_address addr)
+ : _server(server), _fd(std::move(fd)), _read_buf(_fd.input()), _write_buf(
+ _fd.output()) {
+ on_new_connection();
+ }
+ ~connection();
+ void on_new_connection();
+
+ future<> process();
+ void shutdown();
+ future<> read();
+ future<> read_one();
+ future<> respond();
+ future<> do_response_loop();
+
+ void set_headers(reply& resp);
+
+ future<> start_response();
+ future<> write_reply_headers(std::unordered_map<sstring, sstring>::iterator hi);
+
+ static short hex_to_byte(char c);
+
+ /**
+ * Convert a hex-encoded 2-byte substring to a char
+ */
+ static char hexstr_to_char(const std::string_view& in, size_t from);
+
+ /**
+ * URL-decode a substring and place it in the given output sstring
+ */
+ static bool url_decode(const std::string_view& in, sstring& out);
+
+ /**
+ * Add a single query parameter to the parameter list
+ */
+ static void add_param(request& req, const std::string_view& param);
+
+ /**
+ * Set the query parameters in the request object.
+ * Query parameters appear after the question mark and are separated
+ * by the ampersand sign
+ */
+ static sstring set_query_param(request& req);
+
+ future<bool> generate_reply(std::unique_ptr<request> req);
+ void generate_error_reply_and_close(std::unique_ptr<request> req, reply::status_type status, const sstring& msg);
+
+ future<> write_body();
+
+ output_stream<char>& out();
+};
+
+class http_server_tester;
+
+class http_server {
+ std::vector<server_socket> _listeners;
+ http_stats _stats;
+ uint64_t _total_connections = 0;
+ uint64_t _current_connections = 0;
+ uint64_t _requests_served = 0;
+ uint64_t _read_errors = 0;
+ uint64_t _respond_errors = 0;
+ shared_ptr<seastar::tls::server_credentials> _credentials;
+ sstring _date = http_date();
+ timer<> _date_format_timer { [this] {_date = http_date();} };
+ size_t _content_length_limit = std::numeric_limits<size_t>::max();
+ gate _task_gate;
+public:
+ routes _routes;
+ using connection = seastar::httpd::connection;
+ explicit http_server(const sstring& name) : _stats(*this, name) {
+ _date_format_timer.arm_periodic(1s);
+ }
+ /*!
+ * \brief set tls credentials for the server
+ * Setting the tls credentials will make the http server work in https mode.
+ *
+ * To use https, create server credentials and pass them to the server before it starts.
+ *
+ * Use case example using seastar threads for clarity:
+
+ distributed<http_server> server; // typical server
+
+ seastar::shared_ptr<seastar::tls::credentials_builder> creds = seastar::make_shared<seastar::tls::credentials_builder>();
+ sstring ms_cert = "MyCertificate.crt";
+ sstring ms_key = "MyKey.key";
+
+ creds->set_dh_level(seastar::tls::dh_params::level::MEDIUM);
+
+ creds->set_x509_key_file(ms_cert, ms_key, seastar::tls::x509_crt_format::PEM).get();
+ creds->set_system_trust().get();
+
+
+ server.invoke_on_all([creds](http_server& server) {
+ server.set_tls_credentials(creds->build_server_credentials());
+ return make_ready_future<>();
+ }).get();
+ *
+ */
+ void set_tls_credentials(shared_ptr<seastar::tls::server_credentials> credentials);
+
+ size_t get_content_length_limit() const;
+
+ void set_content_length_limit(size_t limit);
+
+ future<> listen(socket_address addr, listen_options lo);
+ future<> listen(socket_address addr);
+ future<> stop();
+
+ future<> do_accepts(int which);
+
+ uint64_t total_connections() const;
+ uint64_t current_connections() const;
+ uint64_t requests_served() const;
+ uint64_t read_errors() const;
+ uint64_t reply_errors() const;
+ // Write the current date in the specific "preferred format" defined in
+ // RFC 7231, Section 7.1.1.1.
+ static sstring http_date();
+private:
+ future<> do_accept_one(int which);
+ boost::intrusive::list<connection> _connections;
+ friend class seastar::httpd::connection;
+ friend class http_server_tester;
+};
+
+class http_server_tester {
+public:
+ static std::vector<server_socket>& listeners(http_server& server) {
+ return server._listeners;
+ }
+};
+
+/*
+ * A helper class to start, set up, and listen on an http server.
+ * Typical use would be:
+ *
+ * auto server = new http_server_control();
+ * server->start().then([server] {
+ * server->set_routes(set_routes);
+ * }).then([server, port] {
+ * server->listen(port);
+ * }).then([port] {
+ * std::cout << "Seastar HTTP server listening on port " << port << " ...\n";
+ * });
+ */
+class http_server_control {
+ std::unique_ptr<distributed<http_server>> _server_dist;
+private:
+ static sstring generate_server_name();
+public:
+ http_server_control() : _server_dist(new distributed<http_server>) {
+ }
+
+ future<> start(const sstring& name = generate_server_name());
+ future<> stop();
+ future<> set_routes(std::function<void(routes& r)> fun);
+ future<> listen(socket_address addr);
+ future<> listen(socket_address addr, listen_options lo);
+ distributed<http_server>& server();
+};
+
+}
+
+}
diff --git a/src/seastar/include/seastar/http/json_path.hh b/src/seastar/include/seastar/http/json_path.hh
new file mode 100644
index 000000000..a9bc22ca5
--- /dev/null
+++ b/src/seastar/include/seastar/http/json_path.hh
@@ -0,0 +1,190 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+#pragma once
+
+#include <vector>
+#include <unordered_map>
+#include <tuple>
+#include <seastar/http/common.hh>
+#include <seastar/core/sstring.hh>
+#include <seastar/http/routes.hh>
+#include <seastar/http/function_handlers.hh>
+
+namespace seastar {
+
+namespace httpd {
+
+/**
+ * A json_operation contains a method and a nickname.
+ * Operations are associated with a path, which can
+ * have multiple methods
+ */
+struct json_operation {
+ /**
+ * default constructor
+ */
+ json_operation()
+ : method(GET) {
+ }
+
+ /**
+ * Construct with assignment
+ * @param method the http method type
+ * @param nickname the http nickname
+ */
+ json_operation(operation_type method, const sstring& nickname)
+ : method(method), nickname(nickname) {
+ }
+
+ operation_type method;
+ sstring nickname;
+
+};
+
+/**
+ * path description holds the path in the system.
+ * It maps a nickname to an operation, which allows
+ * defining the operation (path and method) by its
+ * nickname.
+ *
+ * A path_description has a type, a base path and a list of
+ * url components.
+ * Each component can be a regular path parameter, a path parameter that
+ * contains everything until the end of the path or a fixed string.
+ *
+ * The descriptions are taken from the json swagger
+ * definition file during automatic code generation at
+ * compile time.
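+ *
+ * A minimal wiring sketch (illustrative only; "get_file", my_routes and
+ * my_handler are hypothetical names):
+ *
+ * path_description desc("/file", GET, "get_file", {}, {});
+ * desc.pushparam("path", true);
+ * desc.set(my_routes, my_handler); // my_routes is a routes&, my_handler a handler_base*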
+ */
+struct path_description {
+ //
+ enum class url_component_type {
+ PARAM, // a normal path parameter (starts with / and ends with / or the end of the path)
+ PARAM_UNTIL_END_OF_PATH, // a parameter that contains all of the path until its end
+ FIXED_STRING, // a fixed string inside the path, must be a full match and does not count
+ // as a parameter
+ };
+
+ // path_part is either a parameter or a fixed string
+ struct path_part {
+ sstring name;
+ url_component_type type = url_component_type::PARAM;
+ };
+
+ /**
+ * default empty constructor
+ */
+ path_description() = default;
+
+ /**
+ * constructor for a path with parameters
+ * @param path the url path
+ * @param method the http method
+ * @param nickname the nickname
+ * @param path_parameters path parameters and url parts of the path
+ * @param mandatory_params the names of the mandatory query parameters
+ */
+ path_description(const sstring& path, operation_type method,
+ const sstring& nickname,
+ const std::vector<std::pair<sstring, bool>>& path_parameters,
+ const std::vector<sstring>& mandatory_params);
+
+ /**
+ * constructor for a path with parameters
+ * @param path the url path
+ * @param method the http method
+ * @param nickname the method nickname
+ * @param path_parameters path parameters and url parts of the path
+ * @param mandatory_params the names of the mandatory query parameters
+ */
+ path_description(const sstring& path, operation_type method,
+ const sstring& nickname,
+ const std::initializer_list<path_part>& path_parameters,
+ const std::vector<sstring>& mandatory_params);
+
+ /**
+ * Add a parameter to the path definition
+ * for example, if the url should match /file/{path}
+ * The constructor would be followed by a call to
+ * pushparam("path")
+ *
+ * @param param the name of the parameter; this name will
+ * be used by the handler to identify the parameter.
+ * A name can appear at most once in a description
+ * @param all_path when set to true the parameter is assumed to match
+ * until the end of the url.
+ * This is useful for situations like file paths with
+ * a rule like /file/{path} and a url /file/etc/hosts;
+ * path should be equal to /etc/hosts and not only /etc
+ * @return the current path description
+ */
+ path_description* pushparam(const sstring& param,
+ bool all_path = false) {
+ params.push_back( { param, (all_path) ? url_component_type::PARAM_UNTIL_END_OF_PATH : url_component_type::PARAM});
+ return this;
+ }
+
+ /*!
+ * \brief adds a fixed string as part of the path
+ * This allows combining fixed string URL parts with path parameters.
+ *
+ * For example to map a path like:
+ * /mypath/{param1}/morepath/{param2}
+ * path_description p("/mypath", GET, "nickname", {}, {});
+ * p.pushparam("param1")->push_static_path_part("morepath")->pushparam("param2");
+ */
+ path_description* push_static_path_part(const sstring& url) {
+ params.push_back( { url, url_component_type::FIXED_STRING});
+ return this;
+ }
+ /**
+ * adds a mandatory query parameter to the path
+ * this parameter will be checked before calling a handler
+ * @param param the parameter to add
+ * @return a pointer to the current path description
+ */
+ path_description* pushmandatory_param(const sstring& param) {
+ mandatory_queryparams.push_back(param);
+ return this;
+ }
+
+ std::vector<path_part> params;
+ sstring path;
+ json_operation operations;
+ mutable routes::rule_cookie _cookie;
+
+ std::vector<sstring> mandatory_queryparams;
+
+ void set(routes& _routes, handler_base* handler) const;
+
+ void set(routes& _routes, const json_request_function& f) const;
+
+ void set(routes& _routes, const future_json_function& f) const;
+
+ void unset(routes& _routes) const;
+};
+
+}
+
+}
diff --git a/src/seastar/include/seastar/http/matcher.hh b/src/seastar/include/seastar/http/matcher.hh
new file mode 100644
index 000000000..3921cab8e
--- /dev/null
+++ b/src/seastar/include/seastar/http/matcher.hh
@@ -0,0 +1,111 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+#pragma once
+
+#include <seastar/http/common.hh>
+
+#include <seastar/core/sstring.hh>
+
+namespace seastar {
+
+namespace httpd {
+
+/**
+ * A base class for url matching.
+ * Each implementation checks whether the given url matches its criteria
+ */
+class matcher {
+public:
+
+ virtual ~matcher() = default;
+
+ /**
+ * check if the given url matches the rule
+ * @param url the url to check
+ * @param ind the position to start from
+ * @param param fill the parameters hash
+ * @return the end of the matched part, or sstring::npos if not matched
+ */
+ virtual size_t match(const sstring& url, size_t ind, parameters& param) = 0;
+};
+
+/**
+ * Check if the url matches a parameter and fill the parameters object
+ *
+ * Note that a non-empty url will always match, with the parameters
+ * object filled
+ *
+ * Assume that the rule is /file/{path}/ and the param_matcher identifies
+ * the /{path} part
+ *
+ * For all non-empty values, match will succeed.
+ * If the entire url is /file/etc/hosts, and the part that is passed to
+ * param_matcher is /etc/hosts, then if entire_path is true the match will be
+ * '/etc/hosts'; if entire_path is false, the match will be '/etc'
+ */
+class param_matcher : public matcher {
+public:
+ /**
+ * Constructor
+ * @param name the name of the parameter, will be used as the key
+ * in the parameters object
+ * @param entire_path when set to true, the matched parameter will
+ * include all of the remaining url until its end.
+ * when set to false the match will terminate at the next slash
+ */
+ explicit param_matcher(const sstring& name, bool entire_path = false)
+ : _name(name), _entire_path(entire_path) {
+ }
+
+ virtual size_t match(const sstring& url, size_t ind, parameters& param)
+ override;
+private:
+ sstring _name;
+ bool _entire_path;
+};
+
+/**
+ * Check if the url matches a predefined string.
+ *
+ * When parsing a match rule such as '/file/{path}' the str_matcher would match
+ * the '/file' part
+ */
+class str_matcher : public matcher {
+public:
+ /**
+ * Constructor
+ * @param cmp the string to match
+ */
+ explicit str_matcher(const sstring& cmp)
+ : _cmp(cmp), _len(cmp.size()) {
+ }
+
+ virtual size_t match(const sstring& url, size_t ind, parameters& param)
+ override;
+private:
+ sstring _cmp;
+ unsigned _len;
+};
+
+}
+
+}
diff --git a/src/seastar/include/seastar/http/matchrules.hh b/src/seastar/include/seastar/http/matchrules.hh
new file mode 100644
index 000000000..032c60134
--- /dev/null
+++ b/src/seastar/include/seastar/http/matchrules.hh
@@ -0,0 +1,122 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+#pragma once
+
+#include <seastar/http/handlers.hh>
+#include <seastar/http/matcher.hh>
+#include <seastar/http/common.hh>
+
+#include <seastar/core/sstring.hh>
+#include <vector>
+
+namespace seastar {
+
+namespace httpd {
+
+/**
+ * match_rule checks if a url matches criteria that can contain
+ * parameters.
+ * The routes object calls the get method with a url and, if
+ * it matches, the method returns a handler.
+ * During the matching process, the method fills the parameters object.
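+ *
+ * A minimal sketch (illustrative only; my_handler is a hypothetical
+ * handler_base*):
+ *
+ * auto rule = new match_rule(my_handler);
+ * rule->add_str("/file").add_param("path", true);
+ * parameters params;
+ * handler_base* h = rule->get("/file/etc/hosts", params);
+ * // on a full match, h == my_handler and params holds the "path" value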
+ */
+class match_rule {
+public:
+ /**
+ * The destructor deletes matchers.
+ */
+ ~match_rule() {
+ for (auto m : _match_list) {
+ delete m;
+ }
+ delete _handler;
+ }
+
+ /**
+ * Constructor with a handler
+ * @param handler the handler to return when this match rule is met
+ */
+ explicit match_rule(handler_base* handler)
+ : _handler(handler) {
+ }
+
+ /**
+ * Check if the url matches the rule and return a handler if it does
+ * @param url a url to compare against the rule
+ * @param params the parameters object; matched parameters will fill
+ * the object during the matching process
+ * @return a handler if there is a full match or nullptr if not
+ */
+ handler_base* get(const sstring& url, parameters& params) {
+ size_t ind = 0;
+ if (_match_list.empty()) {
+ return _handler;
+ }
+ for (unsigned int i = 0; i < _match_list.size(); i++) {
+ ind = _match_list.at(i)->match(url, ind, params);
+ if (ind == sstring::npos) {
+ return nullptr;
+ }
+ }
+ return (ind + 1 >= url.length()) ? _handler : nullptr;
+ }
+
+ /**
+ * Add a matcher to the rule
+ * @param match the matcher to add
+ * @return this
+ */
+ match_rule& add_matcher(matcher* match) {
+ _match_list.push_back(match);
+ return *this;
+ }
+
+ /**
+ * Add a static url matcher
+ * @param str the string to search for
+ * @return this
+ */
+ match_rule& add_str(const sstring& str) {
+ add_matcher(new str_matcher(str));
+ return *this;
+ }
+
+ /**
+ * add a parameter matcher to the rule
+ * @param str the parameter name
+ * @param fullpath when set to true, the parameter will include all the
+ * remaining url until its end
+ * @return this
+ */
+ match_rule& add_param(const sstring& str, bool fullpath = false) {
+ add_matcher(new param_matcher(str, fullpath));
+ return *this;
+ }
+
+private:
+ std::vector<matcher*> _match_list;
+ handler_base* _handler;
+};
+
+}
+
+}
diff --git a/src/seastar/include/seastar/http/mime_types.hh b/src/seastar/include/seastar/http/mime_types.hh
new file mode 100644
index 000000000..c5530f5a0
--- /dev/null
+++ b/src/seastar/include/seastar/http/mime_types.hh
@@ -0,0 +1,33 @@
+//
+// mime_types.hpp
+// ~~~~~~~~~~~~~~
+//
+// Copyright (c) 2003-2013 Christopher M. Kohlhoff (chris at kohlhoff dot com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+
+#pragma once
+
+#include <seastar/core/sstring.hh>
+
+namespace seastar {
+
+namespace httpd {
+
+namespace mime_types {
+
+/**
+ * Convert a file extension into a MIME type.
+ *
+ * @param extension the file extension
+ * @return the mime type as a string
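+ *
+ * e.g. extension_to_type("html") is expected to return "text/html"
+ * (illustrative example)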
+ */
+const char* extension_to_type(const sstring& extension);
+
+} // namespace mime_types
+
+} // namespace httpd
+
+}
diff --git a/src/seastar/include/seastar/http/reply.hh b/src/seastar/include/seastar/http/reply.hh
new file mode 100644
index 000000000..5401eb451
--- /dev/null
+++ b/src/seastar/include/seastar/http/reply.hh
@@ -0,0 +1,182 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+// This file was modified from boost http example
+//
+// reply.hpp
+// ~~~~~~~~~
+//
+// Copyright (c) 2003-2013 Christopher M. Kohlhoff (chris at kohlhoff dot com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+#pragma once
+
+#include <seastar/core/sstring.hh>
+#include <unordered_map>
+#include <seastar/http/mime_types.hh>
+#include <seastar/core/iostream.hh>
+#include <seastar/util/noncopyable_function.hh>
+
+namespace seastar {
+
+namespace httpd {
+
+class connection;
+class routes;
+
+/**
+ * A reply to be sent to a client.
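+ *
+ * A handler typically fills it roughly like this (illustrative sketch):
+ *
+ * rep.set_status(reply::status_type::ok);
+ * rep.write_body("txt", "hello");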
+ */
+struct reply {
+ /**
+ * The status of the reply.
+ */
+ enum class status_type {
+ continue_ = 100, //!< continue
+ ok = 200, //!< ok
+ created = 201, //!< created
+ accepted = 202, //!< accepted
+ no_content = 204, //!< no_content
+ multiple_choices = 300, //!< multiple_choices
+ moved_permanently = 301, //!< moved_permanently
+ moved_temporarily = 302, //!< moved_temporarily
+ not_modified = 304, //!< not_modified
+ bad_request = 400, //!< bad_request
+ unauthorized = 401, //!< unauthorized
+ forbidden = 403, //!< forbidden
+ not_found = 404, //!< not_found
+ length_required = 411, //!< length_required
+ payload_too_large = 413, //!< payload_too_large
+ internal_server_error = 500, //!< internal_server_error
+ not_implemented = 501, //!< not_implemented
+ bad_gateway = 502, //!< bad_gateway
+ service_unavailable = 503 //!< service_unavailable
+ } _status;
+
+ /**
+ * The headers to be included in the reply.
+ */
+ std::unordered_map<sstring, sstring> _headers;
+
+ sstring _version;
+ /**
+ * The content to be sent in the reply.
+ */
+ sstring _content;
+
+ sstring _response_line;
+ reply()
+ : _status(status_type::ok) {
+ }
+
+ reply& add_header(const sstring& h, const sstring& value) {
+ _headers[h] = value;
+ return *this;
+ }
+
+ reply& set_version(const sstring& version) {
+ _version = version;
+ return *this;
+ }
+
+ reply& set_status(status_type status, const sstring& content = "") {
+ _status = status;
+ if (content != "") {
+ _content = content;
+ }
+ return *this;
+ }
+
+ /**
+ * Set the content type mime type.
+ * Used when the mime type is known.
+ * For most cases, use set_content_type instead
+ */
+ reply& set_mime_type(const sstring& mime) {
+ _headers["Content-Type"] = mime;
+ return *this;
+ }
+
+ /**
+ * Set the content type mime type according to the file extension
+ * that would have been used if it was a file: e.g. html, txt, json, etc.
+ */
+ reply& set_content_type(const sstring& content_type = "html") {
+ set_mime_type(httpd::mime_types::extension_to_type(content_type));
+ return *this;
+ }
+
+ reply& done(const sstring& content_type) {
+ return set_content_type(content_type).done();
+ }
+ /**
+ * done() should be called before sending the reply.
+ * It sets the response line
+ */
+ reply& done() {
+ _response_line = response_line();
+ return *this;
+ }
+ sstring response_line();
+
+ /*!
+ * \brief use an output stream to write the message body
+ *
+ * When a handler needs to use an output stream it should call this method
+ * with a function.
+ *
+ * \param content_type - is used to choose the content type of the body. Use the file extension
+ * you would have used for such content (e.g. "txt", "html", "json", etc.)
+ * \param body_writer - a function that accepts an output stream and uses that stream to write the body.
+ * The function should take ownership of the stream while using it and must close the stream when it
+ * is done.
+ *
+ * The message will use chunked transfer encoding in the reply.
+ *
+ */
+
+ void write_body(const sstring& content_type, noncopyable_function<future<>(output_stream<char>&&)>&& body_writer);
+
+ /*!
+ * \brief Write a string as the reply
+ *
+ * \param content_type - is used to choose the content type of the body. Use the file extension
+ * you would have used for such content (e.g. "txt", "html", "json", etc.)
+ * \param content - the message content.
+ * This sets the content and content type of the message along
+ * with any additional information that is needed to send the message.
+ */
+ void write_body(const sstring& content_type, const sstring& content);
+
+private:
+ future<> write_reply_to_connection(connection& con);
+ future<> write_reply_headers(connection& connection);
+
+ noncopyable_function<future<>(output_stream<char>&&)> _body_writer;
+ friend class routes;
+ friend class connection;
+};
+
+} // namespace httpd
+
+}
diff --git a/src/seastar/include/seastar/http/request.hh b/src/seastar/include/seastar/http/request.hh
new file mode 100644
index 000000000..e09a1c1b3
--- /dev/null
+++ b/src/seastar/include/seastar/http/request.hh
@@ -0,0 +1,134 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+//
+// request.hpp
+// ~~~~~~~~~~~
+//
+// Copyright (c) 2003-2013 Christopher M. Kohlhoff (chris at kohlhoff dot com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+#pragma once
+
+#include <seastar/core/sstring.hh>
+#include <string>
+#include <vector>
+#include <strings.h>
+#include <seastar/http/common.hh>
+
+namespace seastar {
+
+namespace httpd {
+class connection;
+
+/**
+ * A request received from a client.
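+ *
+ * A handler typically inspects it like this (illustrative sketch; "id" is
+ * a hypothetical query parameter):
+ *
+ * sstring host = req.get_header("Host");
+ * sstring id = req.get_query_param("id");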
+ */
+struct request {
+ enum class ctclass
+ : char {
+ other, multipart, app_x_www_urlencoded,
+ };
+
+ struct case_insensitive_cmp {
+ bool operator()(const sstring& s1, const sstring& s2) const {
+ return std::equal(s1.begin(), s1.end(), s2.begin(), s2.end(),
+ [](char a, char b) { return ::tolower(a) == ::tolower(b); });
+ }
+ };
+
+ struct case_insensitive_hash {
+ size_t operator()(sstring s) const {
+ std::transform(s.begin(), s.end(), s.begin(), ::tolower);
+ return std::hash<sstring>()(s);
+ }
+ };
+
+ sstring _method;
+ sstring _url;
+ sstring _version;
+ int http_version_major;
+ int http_version_minor;
+ ctclass content_type_class;
+ size_t content_length = 0;
+ std::unordered_map<sstring, sstring, case_insensitive_hash, case_insensitive_cmp> _headers;
+ std::unordered_map<sstring, sstring> query_parameters;
+ connection* connection_ptr;
+ parameters param;
+ sstring content;
+ sstring protocol_name = "http";
+
+ /**
+ * Search for the first header of a given name
+ * @param name the header name
+ * @return the header value if it exists, otherwise an empty string
+ */
+ sstring get_header(const sstring& name) const {
+ auto res = _headers.find(name);
+ if (res == _headers.end()) {
+ return "";
+ }
+ return res->second;
+ }
+
+ /**
+ * Search for the first query parameter of a given name
+ * @param name the query parameter name
+ * @return the parameter value if it exists, otherwise an empty string
+ */
+ sstring get_query_param(const sstring& name) const {
+ auto res = query_parameters.find(name);
+ if (res == query_parameters.end()) {
+ return "";
+ }
+ return res->second;
+ }
+
+ /**
+ * Get the request protocol name. Can be either "http" or "https".
+ */
+ sstring get_protocol_name() const {
+ return protocol_name;
+ }
+
+ /**
+ * Get the request url.
+ * @return the request url
+ */
+ sstring get_url() const {
+ return get_protocol_name() + "://" + get_header("Host") + _url;
+ }
+
+ bool is_multi_part() const {
+ return content_type_class == ctclass::multipart;
+ }
+
+ bool is_form_post() const {
+ return content_type_class == ctclass::app_x_www_urlencoded;
+ }
+
+};
+
+} // namespace httpd
+
+}
diff --git a/src/seastar/include/seastar/http/routes.hh b/src/seastar/include/seastar/http/routes.hh
new file mode 100644
index 000000000..30ab95c41
--- /dev/null
+++ b/src/seastar/include/seastar/http/routes.hh
@@ -0,0 +1,289 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+#pragma once
+
+#include <seastar/http/matchrules.hh>
+#include <seastar/http/handlers.hh>
+#include <seastar/http/common.hh>
+#include <seastar/http/reply.hh>
+
+#include <boost/program_options/variables_map.hpp>
+#include <unordered_map>
+
+namespace seastar {
+
+namespace httpd {
+
+/**
+ * The url class helps define a route.
+ */
+class url {
+public:
+ /**
+ * Move constructor
+ */
+ url(url&&) = default;
+
+ /**
+ * Construct with a url path as its parameter
+ * @param path the url path to be used
+ */
+ url(const sstring& path)
+ : _path(path) {
+ }
+
+ /**
+ * Adds a parameter that matches until the end of the URL.
+ * @param param the parameter name
+ * @return the current url
+ */
+ url& remainder(const sstring& param) {
+ this->_param = param;
+ return *this;
+ }
+
+ sstring _path;
+ sstring _param;
+};
+
+struct path_description;
+
+/**
+ * The routes object does the request dispatching according to the url.
+ * It uses two decision mechanisms: exact match and matching rules.
+ * If a url matches exactly (an optional trailing slash is permitted) the
+ * exact-match handler is chosen; if not, the matching rules are used.
+ * Matching rules are evaluated in their insertion order
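+ *
+ * A minimal registration sketch (illustrative only; my_handler and
+ * my_file_handler are hypothetical handler_base pointers):
+ *
+ * routes r;
+ * r.put(GET, "/hello", my_handler); // exact match on /hello
+ * r.add(GET, url("/file").remainder("path"), my_file_handler); // parameterized match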
+ */
+class routes {
+public:
+ /**
+ * The destructor deletes the match rules and handlers
+ */
+ ~routes();
+
+ /**
+ * Add a handler as an exact match
+ * @param url the url to match (note that url should start with /)
+ * @param handler the desired handler
+ * @return itself
+ */
+ routes& put(operation_type type, const sstring& url, handler_base* handler);
+
+ /**
+ * Remove a handler from the exact matches
+ * @param url the url to match (note that url should start with /)
+ * @return the current handler (to be removed by caller)
+ */
+ handler_base* drop(operation_type type, const sstring& url);
+
+ /**
+ * add a rule to be used.
+ * Rules are searched only if an exact match was not found.
+ * Rules are searched in the order they were added;
+ * rules added first have higher priority
+ * @param rule a rule to add
+ * @param type the operation type
+ * @return it self
+ */
+ routes& add(match_rule* rule, operation_type type = GET) {
+ _rules[type][_rover++] = rule;
+ return *this;
+ }
+
+ /**
+ * Add a url match to a handler:
+ * Example routes.add(GET, url("/api").remainder("path"), handler);
+ * @param type the operation type
+ * @param url the url object to match
+ * @param handler the handler to execute
+ * @return itself
+ */
+ routes& add(operation_type type, const url& url, handler_base* handler);
+
+ /**
+ * The main entry point.
+ * The general handler calls this method with the request.
+ * The method takes the headers from the request and finds the
+ * right handler.
+ * It then calls the handler with the parameters (if they exist) found in the url
+ * @param path the url path found
+ * @param req the http request
+ * @param rep the http reply
+ */
+ future<std::unique_ptr<reply> > handle(const sstring& path, std::unique_ptr<request> req, std::unique_ptr<reply> rep);
+
+ /**
+ * Search and return an exact match
+ * @param url the request url
+ * @return the handler if exists or nullptr if it does not
+ */
+ handler_base* get_exact_match(operation_type type, const sstring& url) const {
+ auto i = _map[type].find(url);
+ return (i == _map[type].end()) ? nullptr : i->second;
+ }
+
+ /**
+ * Search and return a handler by the operation type and url
+ * @param type the http operation type
+ * @param url the request url
+ * @param params a parameter object that will be filled during the match
+ * @return a handler based on the type/url match
+ */
+ handler_base* get_handler(operation_type type, const sstring& url,
+ parameters& params);
+
+private:
+ /**
+ * Normalize the url by removing the trailing / if it exists
+ * and extracting the parameter part
+ * @param url the full url path
+ * @param param_part will hold the string with the parameters
+ * @return the url from the request without the last /
+ */
+ sstring normalize_url(const sstring& url);
+
+ std::unordered_map<sstring, handler_base*> _map[NUM_OPERATION];
+public:
+ using rule_cookie = uint64_t;
+private:
+ rule_cookie _rover = 0;
+ std::map<rule_cookie, match_rule*> _rules[NUM_OPERATION];
+public:
+ using exception_handler_fun = std::function<std::unique_ptr<reply>(std::exception_ptr eptr)>;
+ using exception_handler_id = size_t;
+private:
+ std::map<exception_handler_id, exception_handler_fun> _exceptions;
+ exception_handler_id _exception_id = 0;
+ // for optimization reasons, the lambda function
+ // that calls the exception_reply of the current object
+ // is stored
+ exception_handler_fun _general_handler;
+public:
+ /**
+ * The exception_handler_fun is expected to call
+ * std::rethrow_exception(eptr);
+ * and catch only the exceptions it handles
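+ *
+ * For example (illustrative sketch; my_routes and my_error are hypothetical names):
+ *
+ * my_routes.register_exeption_handler([](std::exception_ptr eptr) {
+ *     try {
+ *         std::rethrow_exception(eptr);
+ *     } catch (const my_error& e) {
+ *         auto rep = std::make_unique<reply>();
+ *         rep->set_status(reply::status_type::bad_request, e.what());
+ *         return rep;
+ *     }
+ * });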
+ */
+ exception_handler_id register_exeption_handler(exception_handler_fun fun) {
+ auto current = _exception_id++;
+ _exceptions[current] = fun;
+ return current;
+ }
+
+ void remove_exception_handler(exception_handler_id id) {
+ _exceptions.erase(id);
+ }
+
+ std::unique_ptr<reply> exception_reply(std::exception_ptr eptr);
+
+ routes();
+
+ /*!
+ * \brief add an alias to an already registered path.
+ * After registering a handler to a path, use this method
+ * to add an alias to that handler.
+ *
+ */
+ void add_alias(const path_description& old_path, const path_description& new_path);
+
+ /**
+ * Add a rule to be used.
+ * @param rule a rule to add
+ * @param type the operation type
+ * @return a cookie using which the rule can be removed
+ */
+ rule_cookie add_cookie(match_rule* rule, operation_type type) {
+ auto pos = _rover++;
+ _rules[type][pos] = rule;
+ return pos;
+ }
+
+ /**
+ * Delete a rule by its cookie
+ * @param cookie a cookie returned previously by add_cookie
+ * @param type the operation type
+ * @return the pointer to the rule
+ */
+ match_rule* del_cookie(rule_cookie cookie, operation_type type);
+};
+
+/**
+ * A helper function that checks if a query parameter is present in the request.
+ * If it is not, the function throws an exception indicating that the parameter was not found.
+ * @param req the request to check
+ * @param param the parameter to look for
+ */
+void verify_param(const httpd::request& req, const sstring& param);
+
+/**
+ * The handler_registration object facilitates registration and auto
+ * unregistration of an exact-match handler_base into \ref routes "routes"
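+ *
+ * For example (illustrative; my_routes and my_handler are hypothetical):
+ *
+ * handler_registration reg(my_routes, my_handler, "/hello", GET);
+ * // my_handler (a handler_base&) serves GET /hello until reg goes out of scope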
+ */
+class handler_registration {
+ routes& _routes;
+ const sstring _url;
+ operation_type _op;
+
+public:
+ /**
+ * Registers the handler_base into routes with routes::put
+ * @param rs the routes object reference
+ * @param h the desired handler
+ * @param url the url to match
+ * @param op the operation type (`GET` by default)
+ */
+ handler_registration(routes& rs, handler_base& h, const sstring& url, operation_type op = GET);
+
+ /**
+ * Unregisters the handler from routes with routes::drop
+ */
+ ~handler_registration();
+};
+
+/**
+ * The rule_registration object facilitates registration and auto
+ * unregistration of a match_rule handler into \ref routes "routes"
+ */
+class rule_registration {
+ routes& _routes;
+ operation_type _op;
+ routes::rule_cookie _cookie;
+
+public:
+ /**
+ * Registers the match_rule into routes with routes::add_cookie
+ * @param rs the routes object reference
+ * @param rule a rule to add
+ * @param op the operation type (`GET` by default)
+ */
+ rule_registration(routes& rs, match_rule& rule, operation_type op = GET);
+
+ /**
+ * Unregisters the rule from routes with routes::del_cookie
+ */
+ ~rule_registration();
+};
+
+}
+
+}
diff --git a/src/seastar/include/seastar/http/transformers.hh b/src/seastar/include/seastar/http/transformers.hh
new file mode 100644
index 000000000..82732587e
--- /dev/null
+++ b/src/seastar/include/seastar/http/transformers.hh
@@ -0,0 +1,59 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+#pragma once
+
+#include <seastar/http/handlers.hh>
+#include <seastar/http/file_handler.hh>
+
+namespace seastar {
+
+namespace httpd {
+
+/**
+ * content_replace replaces a variable in a file with a dynamic value.
+ * It takes the host from the request and replaces the variable
+ * in the file.
+ *
+ * The replacement can be restricted to an extension.
+ *
+ * Currently only one file type is supported for replacement.
+ * It could be extended if the need arises.
+ *
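+ * For example (illustrative), content_replace("html") would apply the
+ * replacement only to files with the html extension.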
+ */
+class content_replace : public file_transformer {
+public:
+ virtual output_stream<char> transform(std::unique_ptr<request> req,
+ const sstring& extension, output_stream<char>&& s);
+ /**
+ * The constructor gets the file extension the replacement works on.
+ * @param extension the file extension; when not set, all file extensions are processed
+ */
+ explicit content_replace(const sstring& extension = "")
+ : extension(extension) {
+ }
+private:
+ sstring extension;
+};
+
+}
+
+}
diff --git a/src/seastar/include/seastar/json/formatter.hh b/src/seastar/include/seastar/json/formatter.hh
new file mode 100644
index 000000000..920ff50b5
--- /dev/null
+++ b/src/seastar/include/seastar/json/formatter.hh
@@ -0,0 +1,335 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <map>
+#include <time.h>
+#include <sstream>
+#include <seastar/core/loop.hh>
+#include <seastar/core/sstring.hh>
+#include <seastar/core/iostream.hh>
+
+namespace seastar {
+
+namespace json {
+
+class jsonable;
+
+typedef struct tm date_time;
+
+/**
+ * The formatter prints json values in a json format.
+ * It overloads the to_json method for each of the supported types.
+ * All to_json overloads take their parameter by value or by const reference.
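+ *
+ * For example (illustrative):
+ *
+ * std::vector<int> v{1, 2, 3};
+ * sstring json = formatter::to_json(v); // expected to yield "[1,2,3]"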
+ */
+class formatter {
+ enum class state {
+ none, array, map
+ };
+ static sstring begin(state);
+ static sstring end(state);
+
+ template<typename K, typename V>
+ static sstring to_json(state s, const std::pair<K, V>& p) {
+ return s == state::array ?
+ "{" + to_json(state::none, p) + "}" :
+ to_json(p.first) + ":" + to_json(p.second);
+ }
+
+ template<typename Iter>
+ static sstring to_json(state s, Iter i, Iter e) {
+ std::stringstream res;
+ res << begin(s);
+ size_t n = 0;
+ while (i != e) {
+ if (n++ != 0) {
+ res << ",";
+ }
+ res << to_json(s, *i++);
+ }
+ res << end(s);
+ return res.str();
+ }
+
+ // fallback template
+ template<typename T>
+ static sstring to_json(state, const T& t) {
+ return to_json(t);
+ }
+
+ template<typename K, typename V>
+ static future<> write(output_stream<char>& stream, state s, const std::pair<K, V>& p) {
+ if (s == state::array) {
+ return stream.write("{").then([&stream, &p] {
+ return write(stream, state::none, p).then([&stream] {
+ return stream.write("}");
+ });
+ });
+ } else {
+ return stream.write(to_json(p.first) + ":").then([&p, &stream] {
+ return write(stream, p.second);
+ });
+ }
+ }
+
+ template<typename Iter>
+ static future<> write(output_stream<char>& stream, state s, Iter i, Iter e) {
+ return do_with(true, [&stream, s, i, e] (bool& first) {
+ return stream.write(begin(s)).then([&first, &stream, s, i, e] {
+ return do_for_each(i, e, [&first, &stream] (auto& m) {
+ auto f = (first) ? make_ready_future<>() : stream.write(",");
+ first = false;
+ return f.then([&m, &stream] {
+ return write(stream, m);
+ });
+ }).then([&stream, s] {
+ // FIXME: future is discarded
+ (void)stream.write(end(s));
+ });
+ });
+ });
+ }
+
+ // fallback template
+ template<typename T>
+ static future<> write(output_stream<char>& stream, state, const T& t) {
+ return stream.write(to_json(t));
+ }
+
+public:
+
+ /**
+ * return a json formatted string
+ * @param str the string to format
+ * @return the given string in a json format
+ */
+ static sstring to_json(const sstring& str);
+
+ /**
+ * return a json formatted int
+ * @param n the int to format
+ * @return the given int in a json format
+ */
+ static sstring to_json(int n);
+
+ /**
+ * return a json formatted unsigned
+ * @param n the unsigned to format
+ * @return the given unsigned in a json format
+ */
+ static sstring to_json(unsigned n);
+
+ /**
+ * return a json formatted long
+ * @param n the long to format
+ * @return the given long in a json format
+ */
+ static sstring to_json(long n);
+
+ /**
+ * return a json formatted float
+ * @param f the float to format
+ * @return the given float in a json format
+ */
+ static sstring to_json(float f);
+
+ /**
+ * return a json formatted double
+ * @param d the double to format
+ * @return the given double in a json format
+ */
+ static sstring to_json(double d);
+
+ /**
+ * return a json formatted char* (treated as string)
+ * @param str the char* to format
+ * @return the given char* in a json format
+ */
+ static sstring to_json(const char* str);
+
+ /**
+ * return a json formatted bool
+ * @param d the bool to format
+ * @return the given bool in a json format
+ */
+ static sstring to_json(bool d);
+
+ /**
+ * return a json formatted list of a given vector of params
+ * @param vec the vector to format
+ * @return the given vector in a json format
+ */
+ template<typename... Args>
+ static sstring to_json(const std::vector<Args...>& vec) {
+ return to_json(state::array, vec.begin(), vec.end());
+ }
+
+ template<typename... Args>
+ static sstring to_json(const std::map<Args...>& map) {
+ return to_json(state::map, map.begin(), map.end());
+ }
+
+ template<typename... Args>
+ static sstring to_json(const std::unordered_map<Args...>& map) {
+ return to_json(state::map, map.begin(), map.end());
+ }
+
+ /**
+ * return a json formatted date_time
+ * @param d the date_time to format
+ * @return the given date_time in a json format
+ */
+ static sstring to_json(const date_time& d);
+
+ /**
+ * return a json formatted json object
+ * @param obj the json object to format
+ * @return the given json object in a json format
+ */
+ static sstring to_json(const jsonable& obj);
+
+ /**
+ * return a json formatted unsigned long
+ * @param l unsigned long to format
+ * @return the given unsigned long in a json format
+ */
+ static sstring to_json(unsigned long l);
+
+
+
+ /**
+ * write a json formatted string to the output stream
+ * @param str the string to write
+ * @return a future that resolves when the value has been written
+ */
+ static future<> write(output_stream<char>& s, const sstring& str) {
+ return s.write(to_json(str));
+ }
+
+ /**
+ * write a json formatted int to the output stream
+ * @param n the int to write
+ * @return a future that resolves when the value has been written
+ */
+ static future<> write(output_stream<char>& s, int n) {
+ return s.write(to_json(n));
+ }
+
+ /**
+ * write a json formatted long to the output stream
+ * @param n the long to write
+ * @return a future that resolves when the value has been written
+ */
+ static future<> write(output_stream<char>& s, long n) {
+ return s.write(to_json(n));
+ }
+
+ /**
+ * write a json formatted float to the output stream
+ * @param f the float to write
+ * @return a future that resolves when the value has been written
+ */
+ static future<> write(output_stream<char>& s, float f) {
+ return s.write(to_json(f));
+ }
+
+ /**
+ * write a json formatted double to the output stream
+ * @param d the double to write
+ * @return a future that resolves when the value has been written
+ */
+ static future<> write(output_stream<char>& s, double d) {
+ return s.write(to_json(d));
+ }
+
+ /**
+ * write a json formatted char* (treated as string) to the output stream
+ * @param str the char* to write
+ * @return a future that resolves when the value has been written
+ */
+ static future<> write(output_stream<char>& s, const char* str) {
+ return s.write(to_json(str));
+ }
+
+ /**
+ * write a json formatted bool to the output stream
+ * @param d the bool to write
+ * @return a future that resolves when the value has been written
+ */
+ static future<> write(output_stream<char>& s, bool d) {
+ return s.write(to_json(d));
+ }
+
+ /**
+ * write a json formatted list of a given vector of params to the output stream
+ * @param vec the vector to write
+ * @return a future that resolves when the vector has been written
+ */
+ template<typename... Args>
+ static future<> write(output_stream<char>& s, const std::vector<Args...>& vec) {
+ return write(s, state::array, vec.begin(), vec.end());
+ }
+
+ template<typename... Args>
+ static future<> write(output_stream<char>& s, const std::map<Args...>& map) {
+ return write(s, state::map, map.begin(), map.end());
+ }
+
+ template<typename... Args>
+ static future<> write(output_stream<char>& s, const std::unordered_map<Args...>& map) {
+ return write(s, state::map, map.begin(), map.end());
+ }
+
+ /**
+ * write a json formatted date_time to the output stream
+ * @param d the date_time to write
+ * @return a future that resolves when the value has been written
+ */
+ static future<> write(output_stream<char>& s, const date_time& d) {
+ return s.write(to_json(d));
+ }
+
+ /**
+ * write a json formatted json object to the output stream
+ * @param obj the json object to write
+ * @return a future that resolves when the object has been written
+ */
+ static future<> write(output_stream<char>& s, const jsonable& obj) {
+ return s.write(to_json(obj));
+ }
+
+ /**
+ * write a json formatted unsigned long to the output stream
+ * @param l the unsigned long to write
+ * @return a future that resolves when the value has been written
+ */
+ static future<> write(output_stream<char>& s, unsigned long l) {
+ return s.write(to_json(l));
+ }
+};
+
+}
+
+}
diff --git a/src/seastar/include/seastar/json/json_elements.hh b/src/seastar/include/seastar/json/json_elements.hh
new file mode 100644
index 000000000..929e1bdf2
--- /dev/null
+++ b/src/seastar/include/seastar/json/json_elements.hh
@@ -0,0 +1,355 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <time.h>
+#include <sstream>
+#include <seastar/core/do_with.hh>
+#include <seastar/core/loop.hh>
+#include <seastar/json/formatter.hh>
+#include <seastar/core/sstring.hh>
+#include <seastar/core/iostream.hh>
+
+namespace seastar {
+
+namespace json {
+
+/**
+ * The base class for all json elements.
+ * Every json element has a name,
+ * an indication of whether it was set,
+ * and a flag marking whether it is mandatory.
+ * When a mandatory element is not set,
+ * the containing object is not valid.
+ */
+class json_base_element {
+public:
+ /**
+ * The constructors
+ */
+ json_base_element()
+ : _mandatory(false), _set(false) {
+ }
+
+ virtual ~json_base_element() = default;
+
+ /**
+ * Check if it's a mandatory parameter
+ * and if it's set.
+ * @return true if this is not a mandatory parameter
+ * or if it is and its value is set
+ */
+ virtual bool is_verify() {
+ return !(_mandatory && !_set);
+ }
+
+ json_base_element& operator=(const json_base_element& o) {
+ // Names and mandatory are never changed after creation
+ _set = o._set;
+ return *this;
+ }
+
+ /**
+ * returns the internal value in a json format
+ * Each inheriting class must implement this method
+ * @return the formatted internal value
+ */
+ virtual std::string to_string() = 0;
+
+ virtual future<> write(output_stream<char>& s) const = 0;
+ std::string _name;
+ bool _mandatory;
+ bool _set;
+};
+
+/**
+ * Basic json element.
+ * It extends json_base_element,
+ * adding a value to the base definition
+ * and a to_string implementation using the formatter.
+ */
+template<class T>
+class json_element : public json_base_element {
+public:
+
+ /**
+ * the assignment operator also sets
+ * the set flag to true.
+ * @param new_value the new value
+ * @return the value itself
+ */
+ json_element &operator=(const T& new_value) {
+ _value = new_value;
+ _set = true;
+ return *this;
+ }
+ /**
+ * the assignment operator also sets
+ * the set flag to true.
+ * @param new_value the new value
+ * @return the value itself
+ */
+ template<class C>
+ json_element &operator=(const C& new_value) {
+ _value = new_value;
+ _set = true;
+ return *this;
+ }
+ /**
+ * The brackets operator
+ * @return the value
+ */
+ const T& operator()() const {
+ return _value;
+ }
+
+ /**
+ * to_string returns the value
+ * formatted as a json value
+ * @return the value formatted for json
+ */
+ virtual std::string to_string() override
+ {
+ return formatter::to_json(_value);
+ }
+
+ virtual future<> write(output_stream<char>& s) const override {
+ return formatter::write(s, _value);
+ }
+private:
+ T _value;
+};
+
+/**
+ * json_list is based on std vector implementation.
+ *
+ * When values are added with push, the "set" flag is set to true,
+ * hence the list will be included in the parsed object
+ */
+template<class T>
+class json_list : public json_base_element {
+public:
+
+ /**
+ * Add an element to the list.
+ * @param element a new element that will be added to the list
+ */
+ void push(const T& element) {
+ _set = true;
+ _elements.push_back(element);
+ }
+
+ virtual std::string to_string() override
+ {
+ return formatter::to_json(_elements);
+ }
+
+ /**
+ * Assignment can be done from any object that supports const range
+ * iteration and whose elements can be assigned to the list elements
+ */
+ template<class C>
+ json_list& operator=(const C& list) {
+ _elements.clear();
+ for (auto i : list) {
+ push(i);
+ }
+ return *this;
+ }
+ virtual future<> write(output_stream<char>& s) const override {
+ return formatter::write(s, _elements);
+ }
+ std::vector<T> _elements;
+};
+
+class jsonable {
+public:
+ virtual ~jsonable() = default;
+ /**
+ * create a formatted string of the object.
+ * @return the formatted object.
+ */
+ virtual std::string to_json() const = 0;
+
+ /*!
+ * \brief write an object to the output stream
+ *
+ * The default implementation uses to_json;
+ * object implementations may override it.
+ */
+ virtual future<> write(output_stream<char>& s) const {
+ return s.write(to_json());
+ }
+};
+
+/**
+ * The base class for all json objects.
+ * It holds a list of all the elements in it,
+ * allowing it to implement the to_json method.
+ *
+ * It also allows iterating over the elements
+ * in the object, even if not all the members
+ * are known in advance, which in practice mimics
+ * reflection.
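+ *
+ * A typical usage sketch (illustrative only; my_object is a hypothetical type):
+ *
+ * struct my_object : public json_base {
+ *     json_element<sstring> name;
+ *     json_element<int> value;
+ *     my_object() {
+ *         add(&name, "name", true); // mandatory
+ *         add(&value, "value");
+ *     }
+ * };
+ *
+ * my_object obj;
+ * obj.name = "key";
+ * obj.value = 42;
+ * std::string res = obj.to_json(); // roughly {"name": "key", "value": 42}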
+ */
+struct json_base : public jsonable {
+
+ virtual ~json_base() = default;
+
+ json_base() = default;
+
+ json_base(const json_base&) = delete;
+
+ json_base operator=(const json_base&) = delete;
+
+ /**
+ * create a formatted string of the object.
+ * @return the formatted object.
+ */
+ virtual std::string to_json() const;
+
+ /*!
+ * \brief write to an output stream
+ */
+ virtual future<> write(output_stream<char>&) const;
+
+ /**
+ * Check that all mandatory elements are set
+ * @return true if all mandatory parameters are set
+ */
+ virtual bool is_verify() const;
+
+ /**
+ * Register an element in an object
+ * @param element the element to be added
+ * @param name the element name
+ * @param mandatory is this element mandatory.
+ */
+ virtual void add(json_base_element* element, std::string name,
+ bool mandatory = false);
+
+ std::vector<json_base_element*> _elements;
+};
+
+/**
+ * There are cases where a json request needs to return a successful
+ * empty reply.
+ * The json_void class will be used to mark that the reply should be empty.
+ *
+ */
+struct json_void : public jsonable{
+ virtual std::string to_json() const {
+ return "";
+ }
+
+ /*!
+ * \brief write to an output stream
+ */
+ virtual future<> write(output_stream<char>& s) const {
+ return s.close();
+ }
+};
+
+
+/**
+ * The json return type is a helper class to return a json
+ * formatted string.
+ * It uses autoboxing in its constructor, so when a function's return
+ * type is json_return_type, it can return a value that will be converted,
+ * e.g.
+ * json_return_type foo() {
+ * return "hello";
+ * }
+ *
+ * would return a json formatted string: "hello" (rather than hello)
+ */
+struct json_return_type {
+ sstring _res;
+ std::function<future<>(output_stream<char>&&)> _body_writer;
+ json_return_type(std::function<future<>(output_stream<char>&&)>&& body_writer) : _body_writer(std::move(body_writer)) {
+ }
+ template<class T>
+ json_return_type(const T& res) {
+ _res = formatter::to_json(res);
+ }
+
+ json_return_type(json_return_type&& o) noexcept : _res(std::move(o._res)), _body_writer(std::move(o._body_writer)) {
+ }
+ json_return_type& operator=(json_return_type&& o) noexcept {
+ _res = std::move(o._res);
+ _body_writer = std::move(o._body_writer);
+ return *this;
+ }
+};
+
+/*!
+ * \brief capture a range and return a serialize function for it as a json array.
+ *
+ * To use it, pass a range and a mapping function.
+ * For example, if res is a map:
+ *
+ * return make_ready_future<json::json_return_type>(stream_range_as_array(res, [](const auto& i) {return i.first;}));
+ */
+template<typename Container, typename Func>
+SEASTAR_CONCEPT( requires requires (Container c, Func aa, output_stream<char> s) { { formatter::write(s, aa(*c.begin())) } -> std::same_as<future<>>; } )
+std::function<future<>(output_stream<char>&&)> stream_range_as_array(Container val, Func fun) {
+ return [val = std::move(val), fun = std::move(fun)](output_stream<char>&& s) {
+ return do_with(output_stream<char>(std::move(s)), Container(std::move(val)), Func(std::move(fun)), true, [](output_stream<char>& s, const Container& val, const Func& f, bool& first){
+ return s.write("[").then([&val, &s, &first, &f] () {
+ return do_for_each(val, [&s, &first, &f](const typename Container::value_type& v){
+ auto fut = first ? make_ready_future<>() : s.write(", ");
+ first = false;
+ return fut.then([&s, &f, &v]() {
+ return formatter::write(s, f(v));
+ });
+ });
+ }).then([&s](){
+ return s.write("]").then([&s] {
+ return s.close();
+ });
+ });
+ });
+ };
+}
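A minimal usage sketch, assuming a handler that owns a std::map<seastar::sstring, int>
named res and returns future<json::json_return_type>: the map keys are streamed as a
json array without building the whole reply string in memory.

    return seastar::make_ready_future<seastar::json::json_return_type>(
            seastar::json::stream_range_as_array(std::move(res),
                    [] (const auto& i) { return i.first; }));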
+
+/*!
+ * \brief capture an object and return a serializing function for it.
+ *
+ * To use it:
+ * return make_ready_future<json::json_return_type>(stream_object(res));
+ */
+template<class T>
+std::function<future<>(output_stream<char>&&)> stream_object(T val) {
+ return [val = std::move(val)](output_stream<char>&& s) {
+ return do_with(output_stream<char>(std::move(s)), T(std::move(val)), [](output_stream<char>& s, const T& val){
+ return formatter::write(s, val).then([&s] {
+ return s.close();
+ });
+ });
+ };
+}
+
+}
+
+}
diff --git a/src/seastar/include/seastar/net/api.hh b/src/seastar/include/seastar/net/api.hh
new file mode 100644
index 000000000..6c2048c90
--- /dev/null
+++ b/src/seastar/include/seastar/net/api.hh
@@ -0,0 +1,392 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <memory>
+#include <vector>
+#include <cstring>
+#include <seastar/core/future.hh>
+#include <seastar/net/byteorder.hh>
+#include <seastar/net/socket_defs.hh>
+#include <seastar/net/packet.hh>
+#include <seastar/core/temporary_buffer.hh>
+#include <seastar/core/iostream.hh>
+#include <seastar/util/std-compat.hh>
+#include "../core/internal/api-level.hh"
+#include <sys/types.h>
+
+namespace seastar {
+
+inline
+bool is_ip_unspecified(const ipv4_addr& addr) noexcept {
+ return addr.is_ip_unspecified();
+}
+
+inline
+bool is_port_unspecified(const ipv4_addr& addr) noexcept {
+ return addr.is_port_unspecified();
+}
+
+inline
+socket_address make_ipv4_address(const ipv4_addr& addr) noexcept {
+ return socket_address(addr);
+}
+
+inline
+socket_address make_ipv4_address(uint32_t ip, uint16_t port) noexcept {
+ return make_ipv4_address(ipv4_addr(ip, port));
+}
+
+namespace net {
+
+// see linux tcp(7) for parameter explanation
+struct tcp_keepalive_params {
+ std::chrono::seconds idle; // TCP_KEEPIDLE
+ std::chrono::seconds interval; // TCP_KEEPINTVL
+ unsigned count; // TCP_KEEPCNT
+};
+
+// see linux sctp(7) for parameter explanation
+struct sctp_keepalive_params {
+ std::chrono::seconds interval; // spp_hbinterval
+ unsigned count; // spp_pathmaxrt
+};
+
+using keepalive_params = std::variant<tcp_keepalive_params, sctp_keepalive_params>;
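A hedged sketch of applying these parameters to an established connection through
connected_socket (declared later in this header); the values are illustrative only.

    void enable_keepalive(seastar::connected_socket& cs) {
        cs.set_keepalive(true);
        cs.set_keepalive_parameters(seastar::net::tcp_keepalive_params{
            std::chrono::seconds(60),   // TCP_KEEPIDLE: idle time before probing
            std::chrono::seconds(10),   // TCP_KEEPINTVL: interval between probes
            5                           // TCP_KEEPCNT: probes before dropping
        });
    }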
+
+/// \cond internal
+class connected_socket_impl;
+class socket_impl;
+
+class server_socket_impl;
+class udp_channel_impl;
+class get_impl;
+/// \endcond
+
+class udp_datagram_impl {
+public:
+ virtual ~udp_datagram_impl() {};
+ virtual socket_address get_src() = 0;
+ virtual socket_address get_dst() = 0;
+ virtual uint16_t get_dst_port() = 0;
+ virtual packet& get_data() = 0;
+};
+
+class udp_datagram final {
+private:
+ std::unique_ptr<udp_datagram_impl> _impl;
+public:
+ udp_datagram(std::unique_ptr<udp_datagram_impl>&& impl) noexcept : _impl(std::move(impl)) {};
+ socket_address get_src() { return _impl->get_src(); }
+ socket_address get_dst() { return _impl->get_dst(); }
+ uint16_t get_dst_port() { return _impl->get_dst_port(); }
+ packet& get_data() { return _impl->get_data(); }
+};
+
+class udp_channel {
+private:
+ std::unique_ptr<udp_channel_impl> _impl;
+public:
+ udp_channel() noexcept;
+ udp_channel(std::unique_ptr<udp_channel_impl>) noexcept;
+ ~udp_channel();
+
+ udp_channel(udp_channel&&) noexcept;
+ udp_channel& operator=(udp_channel&&) noexcept;
+
+ socket_address local_address() const;
+
+ future<udp_datagram> receive();
+ future<> send(const socket_address& dst, const char* msg);
+ future<> send(const socket_address& dst, packet p);
+ bool is_closed() const;
+ /// Causes a pending receive() to complete (possibly with an exception)
+ void shutdown_input();
+ /// Causes a pending send() to complete (possibly with an exception)
+ void shutdown_output();
+ /// Closes the channel and releases all resources.
+ ///
+ /// Must be called only when there are no unfinished send() or receive() calls. You
+ /// can force pending calls to complete soon by calling shutdown_input() and
+ /// shutdown_output().
+ void close();
+};
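A minimal sketch (not from the diff) of a single echo round-trip over a udp_channel;
it assumes the channel was created via the owning network stack and that this runs on
the shard that owns the channel. Error handling is omitted.

    seastar::future<> echo_once(seastar::net::udp_channel& chan) {
        return chan.receive().then([&chan] (seastar::net::udp_datagram dgram) {
            auto src = dgram.get_src();                        // reply to the sender
            return chan.send(src, std::move(dgram.get_data()));
        });
    }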
+
+class network_interface_impl;
+
+} /* namespace net */
+
+/// \addtogroup networking-module
+/// @{
+
+/// Configuration for buffered connected_socket input operations
+///
+/// This structure allows tuning of buffered input operations done via
+/// connected_socket. It is a hint to the implementation and may be
+/// ignored (e.g. the zero-copy native stack does not allocate buffers,
+/// so it ignores buffer-size parameters).
+struct connected_socket_input_stream_config final {
+ /// Initial buffer size to use for input buffering
+ unsigned buffer_size = 8192;
+ /// Minimum buffer size to use for input buffering. The system will decrease
+ /// buffer sizes if it sees a tendency towards small requests, but will not go
+ /// below this buffer size.
+ unsigned min_buffer_size = 512;
+ /// Maximum buffer size to use for input buffering. The system will increase
+ /// buffer sizes if it sees a tendency towards large requests, but will not go
+ /// above this buffer size.
+ unsigned max_buffer_size = 128 * 1024;
+};
+
+/// A TCP (or other stream-based protocol) connection.
+///
+/// A \c connected_socket represents a full-duplex stream between
+/// two endpoints, a local endpoint and a remote endpoint.
+class connected_socket {
+ friend class net::get_impl;
+ std::unique_ptr<net::connected_socket_impl> _csi;
+public:
+ /// Constructs a \c connected_socket not corresponding to a connection
+ connected_socket() noexcept;
+ ~connected_socket();
+
+ /// \cond internal
+ explicit connected_socket(std::unique_ptr<net::connected_socket_impl> csi) noexcept;
+ /// \endcond
+ /// Moves a \c connected_socket object.
+ connected_socket(connected_socket&& cs) noexcept;
+ /// Move-assigns a \c connected_socket object.
+ connected_socket& operator=(connected_socket&& cs) noexcept;
+ /// Gets the input stream.
+ ///
+ /// \param csisc Configuration for the input_stream returned
+ ///
+ /// Gets an object returning data sent from the remote endpoint.
+ input_stream<char> input(connected_socket_input_stream_config csisc = {});
+ /// Gets the output stream.
+ ///
+ /// Gets an object that sends data to the remote endpoint.
+ /// \param buffer_size how much data to buffer
+ output_stream<char> output(size_t buffer_size = 8192);
+ /// Sets the TCP_NODELAY option (disabling Nagle's algorithm)
+ void set_nodelay(bool nodelay);
+ /// Gets the TCP_NODELAY option (Nagle's algorithm)
+ ///
+ /// \return whether the nodelay option is enabled or not
+ bool get_nodelay() const;
+ /// Sets SO_KEEPALIVE option (enable keepalive timer on a socket)
+ void set_keepalive(bool keepalive);
+ /// Gets the SO_KEEPALIVE option
+ /// \return whether the keepalive option is enabled or not
+ bool get_keepalive() const;
+ /// Sets TCP keepalive parameters
+ void set_keepalive_parameters(const net::keepalive_params& p);
+ /// Gets TCP keepalive parameters
+ net::keepalive_params get_keepalive_parameters() const;
+ /// Sets custom socket options. Based on setsockopt function.
+ /// Linux users should refer to protocol-specific manuals
+ /// to see available options, e.g. tcp(7), ip(7), etc.
+ void set_sockopt(int level, int optname, const void* data, size_t len);
+ /// Gets custom socket options. Based on getsockopt function.
+ /// Linux users should refer to protocol-specific manuals
+ /// to see available options, e.g. tcp(7), ip(7), etc.
+ int get_sockopt(int level, int optname, void* data, size_t len) const;
+
+ /// Disables output to the socket.
+ ///
+ /// Current or future writes that have not been successfully flushed
+ /// will immediately fail with an error. This is useful to abort
+ /// operations on a socket that is not making progress due to a
+ /// peer failure.
+ void shutdown_output();
+ /// Disables input from the socket.
+ ///
+ /// Current or future reads will immediately fail with an error.
+ /// This is useful to abort operations on a socket that is not making
+ /// progress due to a peer failure.
+ void shutdown_input();
+};
+/// @}
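A hedged sketch of driving a connected_socket with buffered streams: read one chunk
from the peer, echo it back, and close the output. The buffer size and the "protocol"
are illustrative only.

    seastar::future<> handle(seastar::connected_socket cs) {
        return seastar::do_with(std::move(cs), [] (seastar::connected_socket& cs) {
            seastar::connected_socket_input_stream_config cfg;
            cfg.buffer_size = 16 * 1024;          // hint: start with a larger input buffer
            return seastar::do_with(cs.input(cfg), cs.output(),
                    [] (seastar::input_stream<char>& in, seastar::output_stream<char>& out) {
                return in.read().then([&out] (seastar::temporary_buffer<char> buf) {
                    return out.write(std::move(buf));
                }).then([&out] {
                    return out.close();
                });
            });
        });
    }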
+
+/// \addtogroup networking-module
+/// @{
+
+/// The seastar socket.
+///
+/// A \c socket that allows a connection to be established between
+/// two endpoints.
+class socket {
+ std::unique_ptr<net::socket_impl> _si;
+public:
+ socket() noexcept = default;
+ ~socket();
+
+ /// \cond internal
+ explicit socket(std::unique_ptr<net::socket_impl> si) noexcept;
+ /// \endcond
+ /// Moves a \c seastar::socket object.
+ socket(socket&&) noexcept;
+ /// Move-assigns a \c seastar::socket object.
+ socket& operator=(socket&&) noexcept;
+
+ /// Attempts to establish the connection.
+ ///
+ /// \return a \ref connected_socket representing the connection.
+ future<connected_socket> connect(socket_address sa, socket_address local = {}, transport proto = transport::TCP);
+
+ /// Sets SO_REUSEADDR option (enable reuseaddr option on a socket)
+ void set_reuseaddr(bool reuseaddr);
+ /// Gets the SO_REUSEADDR option
+ /// \return whether the reuseaddr option is enabled or not
+ bool get_reuseaddr() const;
+ /// Stops any in-flight connection attempt.
+ ///
+ /// Cancels the connection attempt if it's still in progress, and
+ /// terminates the connection if it has already been established.
+ void shutdown();
+};
+
+/// @}
+
+/// \addtogroup networking-module
+/// @{
+
+/// The result of a server_socket::accept() call
+struct accept_result {
+ connected_socket connection; ///< The newly-accepted connection
+ socket_address remote_address; ///< The address of the peer that connected to us
+};
+
+/// A listening socket, waiting to accept incoming network connections.
+class server_socket {
+ std::unique_ptr<net::server_socket_impl> _ssi;
+ bool _aborted = false;
+public:
+ enum class load_balancing_algorithm {
+ // This algorithm tries to distribute all connections equally between all shards.
+ // It does this by sending new connections to the shard with the smallest number of connections.
+ connection_distribution,
+ // This algorithm distributes new connections based on the peer's TCP port. The destination
+ // shard is calculated as the port number modulo the number of shards. This allows a client
+ // to connect to a specific shard in a server, given it knows how many shards the server has,
+ // by choosing the source port number accordingly.
+ port,
+ // This algorithm distributes all new connections to the listen_options::fixed_cpu shard only.
+ fixed,
+ default_ = connection_distribution
+ };
+ /// Constructs a \c server_socket not corresponding to a connection
+ server_socket() noexcept;
+ /// \cond internal
+ explicit server_socket(std::unique_ptr<net::server_socket_impl> ssi) noexcept;
+ /// \endcond
+ /// Moves a \c server_socket object.
+ server_socket(server_socket&& ss) noexcept;
+ ~server_socket();
+ /// Move-assigns a \c server_socket object.
+ server_socket& operator=(server_socket&& cs) noexcept;
+
+ /// Accepts the next connection to successfully connect to this socket.
+ ///
+ /// \return an accept_result representing the connection and
+ /// the socket_address of the remote endpoint.
+ ///
+ /// \see listen(socket_address sa)
+ /// \see listen(socket_address sa, listen_options opts)
+ future<accept_result> accept();
+
+ /// Stops any \ref accept() in progress.
+ ///
+ /// Current and future \ref accept() calls will terminate immediately
+ /// with an error.
+ void abort_accept();
+
+ /// Local bound address
+ socket_address local_address() const noexcept;
+};
+
+/// @}
+
+struct listen_options {
+ bool reuse_address = false;
+ server_socket::load_balancing_algorithm lba = server_socket::load_balancing_algorithm::default_;
+ transport proto = transport::TCP;
+ int listen_backlog = 100;
+ unsigned fixed_cpu = 0u;
+ void set_fixed_cpu(unsigned cpu) {
+ lba = server_socket::load_balancing_algorithm::fixed;
+ fixed_cpu = cpu;
+ }
+};
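A hedged sketch of a listen/accept loop that pins all new connections to shard 0,
assuming the free function seastar::listen(socket_address, listen_options) from the
core API and that this runs inside a started seastar application.

    seastar::future<> serve() {
        seastar::listen_options lo;
        lo.reuse_address = true;
        lo.set_fixed_cpu(0);                      // load_balancing_algorithm::fixed
        return seastar::do_with(seastar::listen(seastar::make_ipv4_address(0, 8080), lo),
                [] (seastar::server_socket& ss) {
            return seastar::keep_doing([&ss] {
                return ss.accept().then([] (seastar::accept_result ar) {
                    // ar.connection and ar.remote_address describe the new client
                });
            });
        });
    }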
+
+class network_interface {
+private:
+ shared_ptr<net::network_interface_impl> _impl;
+public:
+ network_interface() = delete;
+ network_interface(shared_ptr<net::network_interface_impl>) noexcept;
+ network_interface(network_interface&&) noexcept;
+
+ network_interface& operator=(network_interface&&) noexcept;
+
+ uint32_t index() const;
+ uint32_t mtu() const;
+
+ const sstring& name() const;
+ const sstring& display_name() const;
+ const std::vector<net::inet_address>& addresses() const;
+ const std::vector<uint8_t> hardware_address() const;
+
+ bool is_loopback() const;
+ bool is_virtual() const;
+ bool is_up() const;
+ bool supports_ipv6() const;
+};
+
+class network_stack {
+public:
+ virtual ~network_stack() {}
+ virtual server_socket listen(socket_address sa, listen_options opts) = 0;
+ // FIXME: local parameter assumes ipv4 for now, fix when adding other AF
+ future<connected_socket> connect(socket_address sa, socket_address = {}, transport proto = transport::TCP);
+ virtual ::seastar::socket socket() = 0;
+ virtual net::udp_channel make_udp_channel(const socket_address& = {}) = 0;
+ virtual future<> initialize() {
+ return make_ready_future();
+ }
+ virtual bool has_per_core_namespace() = 0;
+ // NOTE: this is not a correct query approach.
+ // This question should be per NIC, but we have no such
+ // abstraction, so for now this is "stack-wide"
+ virtual bool supports_ipv6() const {
+ return false;
+ }
+
+ /**
+ * Returns available network interfaces. This represents a
+ * snapshot of interfaces available at call time, hence the
+ * return by value.
+ */
+ virtual std::vector<network_interface> network_interfaces();
+};
+
+}
diff --git a/src/seastar/include/seastar/net/arp.hh b/src/seastar/include/seastar/net/arp.hh
new file mode 100644
index 000000000..043e0b00b
--- /dev/null
+++ b/src/seastar/include/seastar/net/arp.hh
@@ -0,0 +1,296 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+
+#pragma once
+
+#include <seastar/net/net.hh>
+#include <seastar/core/byteorder.hh>
+#include <seastar/net/ethernet.hh>
+#include <unordered_map>
+
+namespace seastar {
+
+namespace net {
+
+class arp;
+class arp_for_protocol;
+template <typename L3>
+class arp_for;
+
+class arp_for_protocol {
+protected:
+ arp& _arp;
+ uint16_t _proto_num;
+public:
+ arp_for_protocol(arp& a, uint16_t proto_num);
+ virtual ~arp_for_protocol();
+ virtual future<> received(packet p) = 0;
+ virtual bool forward(forward_hash& out_hash_data, packet& p, size_t off) { return false; }
+};
+
+class arp {
+ interface* _netif;
+ l3_protocol _proto;
+ std::unordered_map<uint16_t, arp_for_protocol*> _arp_for_protocol;
+ circular_buffer<l3_protocol::l3packet> _packetq;
+private:
+ struct arp_hdr {
+ uint16_t htype;
+ uint16_t ptype;
+
+ static arp_hdr read(const char* p) {
+ arp_hdr ah;
+ ah.htype = consume_be<uint16_t>(p);
+ ah.ptype = consume_be<uint16_t>(p);
+ return ah;
+ }
+ static constexpr size_t size() { return 4; }
+ };
+public:
+ explicit arp(interface* netif);
+ void add(uint16_t proto_num, arp_for_protocol* afp);
+ void del(uint16_t proto_num);
+private:
+ ethernet_address l2self() { return _netif->hw_address(); }
+ future<> process_packet(packet p, ethernet_address from);
+ bool forward(forward_hash& out_hash_data, packet& p, size_t off);
+ std::optional<l3_protocol::l3packet> get_packet();
+ template <class l3_proto>
+ friend class arp_for;
+};
+
+template <typename L3>
+class arp_for : public arp_for_protocol {
+public:
+ using l2addr = ethernet_address;
+ using l3addr = typename L3::address_type;
+private:
+ static constexpr auto max_waiters = 512;
+ enum oper {
+ op_request = 1,
+ op_reply = 2,
+ };
+ struct arp_hdr {
+ uint16_t htype;
+ uint16_t ptype;
+ uint8_t hlen;
+ uint8_t plen;
+ uint16_t oper;
+ l2addr sender_hwaddr;
+ l3addr sender_paddr;
+ l2addr target_hwaddr;
+ l3addr target_paddr;
+
+ static arp_hdr read(const char* p) {
+ arp_hdr ah;
+ ah.htype = consume_be<uint16_t>(p);
+ ah.ptype = consume_be<uint16_t>(p);
+ ah.hlen = consume_be<uint8_t>(p);
+ ah.plen = consume_be<uint8_t>(p);
+ ah.oper = consume_be<uint16_t>(p);
+ ah.sender_hwaddr = l2addr::consume(p);
+ ah.sender_paddr = l3addr::consume(p);
+ ah.target_hwaddr = l2addr::consume(p);
+ ah.target_paddr = l3addr::consume(p);
+ return ah;
+ }
+ void write(char* p) const {
+ produce_be<uint16_t>(p, htype);
+ produce_be<uint16_t>(p, ptype);
+ produce_be<uint8_t>(p, hlen);
+ produce_be<uint8_t>(p, plen);
+ produce_be<uint16_t>(p, oper);
+ sender_hwaddr.produce(p);
+ sender_paddr.produce(p);
+ target_hwaddr.produce(p);
+ target_paddr.produce(p);
+ }
+ static constexpr size_t size() {
+ return 8 + 2 * (l2addr::size() + l3addr::size());
+ }
+ };
+ struct resolution {
+ std::vector<promise<l2addr>> _waiters;
+ timer<> _timeout_timer;
+ };
+private:
+ l3addr _l3self = L3::broadcast_address();
+ std::unordered_map<l3addr, l2addr> _table;
+ std::unordered_map<l3addr, resolution> _in_progress;
+private:
+ packet make_query_packet(l3addr paddr);
+ virtual future<> received(packet p) override;
+ future<> handle_request(arp_hdr* ah);
+ l2addr l2self() { return _arp.l2self(); }
+ void send(l2addr to, packet p);
+public:
+ future<> send_query(const l3addr& paddr);
+ explicit arp_for(arp& a) : arp_for_protocol(a, L3::arp_protocol_type()) {
+ _table[L3::broadcast_address()] = ethernet::broadcast_address();
+ }
+ future<ethernet_address> lookup(const l3addr& addr);
+ void learn(l2addr l2, l3addr l3);
+ void run();
+ void set_self_addr(l3addr addr) {
+ _table.erase(_l3self);
+ _table[addr] = l2self();
+ _l3self = addr;
+ }
+ friend class arp;
+};
+
+template <typename L3>
+packet
+arp_for<L3>::make_query_packet(l3addr paddr) {
+ arp_hdr hdr;
+ hdr.htype = ethernet::arp_hardware_type();
+ hdr.ptype = L3::arp_protocol_type();
+ hdr.hlen = sizeof(l2addr);
+ hdr.plen = sizeof(l3addr);
+ hdr.oper = op_request;
+ hdr.sender_hwaddr = l2self();
+ hdr.sender_paddr = _l3self;
+ hdr.target_hwaddr = ethernet::broadcast_address();
+ hdr.target_paddr = paddr;
+ auto p = packet();
+ p.prepend_uninitialized_header(hdr.size());
+ hdr.write(p.get_header(0, hdr.size()));
+ return p;
+}
+
+template <typename L3>
+void arp_for<L3>::send(l2addr to, packet p) {
+ _arp._packetq.push_back(l3_protocol::l3packet{eth_protocol_num::arp, to, std::move(p)});
+}
+
+template <typename L3>
+future<>
+arp_for<L3>::send_query(const l3addr& paddr) {
+ send(ethernet::broadcast_address(), make_query_packet(paddr));
+ return make_ready_future<>();
+}
+
+class arp_error : public std::runtime_error {
+public:
+ arp_error(const std::string& msg) : std::runtime_error(msg) {}
+};
+
+class arp_timeout_error : public arp_error {
+public:
+ arp_timeout_error() : arp_error("ARP timeout") {}
+};
+
+class arp_queue_full_error : public arp_error {
+public:
+ arp_queue_full_error() : arp_error("ARP waiter's queue is full") {}
+};
+
+template <typename L3>
+future<ethernet_address>
+arp_for<L3>::lookup(const l3addr& paddr) {
+ auto i = _table.find(paddr);
+ if (i != _table.end()) {
+ return make_ready_future<ethernet_address>(i->second);
+ }
+ auto j = _in_progress.find(paddr);
+ auto first_request = j == _in_progress.end();
+ auto& res = first_request ? _in_progress[paddr] : j->second;
+
+ if (first_request) {
+ res._timeout_timer.set_callback([paddr, this, &res] {
+ // FIXME: future is discarded
+ (void)send_query(paddr);
+ for (auto& w : res._waiters) {
+ w.set_exception(arp_timeout_error());
+ }
+ res._waiters.clear();
+ });
+ res._timeout_timer.arm_periodic(std::chrono::seconds(1));
+ // FIXME: future is discarded
+ (void)send_query(paddr);
+ }
+
+ if (res._waiters.size() >= max_waiters) {
+ return make_exception_future<ethernet_address>(arp_queue_full_error());
+ }
+
+ res._waiters.emplace_back();
+ return res._waiters.back().get_future();
+}
+
+template <typename L3>
+void
+arp_for<L3>::learn(l2addr hwaddr, l3addr paddr) {
+ _table[paddr] = hwaddr;
+ auto i = _in_progress.find(paddr);
+ if (i != _in_progress.end()) {
+ auto& res = i->second;
+ res._timeout_timer.cancel();
+ for (auto &&pr : res._waiters) {
+ pr.set_value(hwaddr);
+ }
+ _in_progress.erase(i);
+ }
+}
+
+template <typename L3>
+future<>
+arp_for<L3>::received(packet p) {
+ auto ah = p.get_header(0, arp_hdr::size());
+ if (!ah) {
+ return make_ready_future<>();
+ }
+ auto h = arp_hdr::read(ah);
+ if (h.hlen != sizeof(l2addr) || h.plen != sizeof(l3addr)) {
+ return make_ready_future<>();
+ }
+ switch (h.oper) {
+ case op_request:
+ return handle_request(&h);
+ case op_reply:
+ arp_learn(h.sender_hwaddr, h.sender_paddr);
+ return make_ready_future<>();
+ default:
+ return make_ready_future<>();
+ }
+}
+
+template <typename L3>
+future<>
+arp_for<L3>::handle_request(arp_hdr* ah) {
+ if (ah->target_paddr == _l3self
+ && _l3self != L3::broadcast_address()) {
+ ah->oper = op_reply;
+ ah->target_hwaddr = ah->sender_hwaddr;
+ ah->target_paddr = ah->sender_paddr;
+ ah->sender_hwaddr = l2self();
+ ah->sender_paddr = _l3self;
+ auto p = packet();
+ ah->write(p.prepend_uninitialized_header(ah->size()));
+ send(ah->target_hwaddr, std::move(p));
+ }
+ return make_ready_future<>();
+}
+
+}
+
+}
diff --git a/src/seastar/include/seastar/net/byteorder.hh b/src/seastar/include/seastar/net/byteorder.hh
new file mode 100644
index 000000000..20342c3d0
--- /dev/null
+++ b/src/seastar/include/seastar/net/byteorder.hh
@@ -0,0 +1,128 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <arpa/inet.h> // for ntohs() and friends
+#include <iosfwd>
+#include <utility>
+
+#include <seastar/core/unaligned.hh>
+
+namespace seastar {
+
+inline uint64_t ntohq(uint64_t v) {
+#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ // big endian, nothing to do
+ return v;
+#else
+ // little endian, reverse bytes
+ return __builtin_bswap64(v);
+#endif
+}
+inline uint64_t htonq(uint64_t v) {
+ // htonq and ntohq have identical implementations
+ return ntohq(v);
+}
+
+namespace net {
+
+inline void ntoh() {}
+inline void hton() {}
+
+inline uint8_t ntoh(uint8_t x) { return x; }
+inline uint8_t hton(uint8_t x) { return x; }
+inline uint16_t ntoh(uint16_t x) { return ntohs(x); }
+inline uint16_t hton(uint16_t x) { return htons(x); }
+inline uint32_t ntoh(uint32_t x) { return ntohl(x); }
+inline uint32_t hton(uint32_t x) { return htonl(x); }
+inline uint64_t ntoh(uint64_t x) { return ntohq(x); }
+inline uint64_t hton(uint64_t x) { return htonq(x); }
+
+inline int8_t ntoh(int8_t x) { return x; }
+inline int8_t hton(int8_t x) { return x; }
+inline int16_t ntoh(int16_t x) { return ntohs(x); }
+inline int16_t hton(int16_t x) { return htons(x); }
+inline int32_t ntoh(int32_t x) { return ntohl(x); }
+inline int32_t hton(int32_t x) { return htonl(x); }
+inline int64_t ntoh(int64_t x) { return ntohq(x); }
+inline int64_t hton(int64_t x) { return htonq(x); }
+
+// Deprecated alias net::packed<> for unaligned<> from unaligned.hh.
+// TODO: get rid of this alias.
+template <typename T> using packed = unaligned<T>;
+
+template <typename T>
+inline T ntoh(const packed<T>& x) {
+ T v = x;
+ return ntoh(v);
+}
+
+template <typename T>
+inline T hton(const packed<T>& x) {
+ T v = x;
+ return hton(v);
+}
+
+template <typename T>
+inline std::ostream& operator<<(std::ostream& os, const packed<T>& v) {
+ auto x = v.raw;
+ return os << x;
+}
+
+inline
+void ntoh_inplace() {}
+inline
+void hton_inplace() {};
+
+template <typename First, typename... Rest>
+inline
+void ntoh_inplace(First& first, Rest&... rest) {
+ first = ntoh(first);
+ ntoh_inplace(std::forward<Rest&>(rest)...);
+}
+
+template <typename First, typename... Rest>
+inline
+void hton_inplace(First& first, Rest&... rest) {
+ first = hton(first);
+ hton_inplace(std::forward<Rest&>(rest)...);
+}
+
+template <class T>
+inline
+T ntoh(const T& x) {
+ T tmp = x;
+ tmp.adjust_endianness([] (auto&&... what) { ntoh_inplace(std::forward<decltype(what)&>(what)...); });
+ return tmp;
+}
+
+template <class T>
+inline
+T hton(const T& x) {
+ T tmp = x;
+ tmp.adjust_endianness([] (auto&&... what) { hton_inplace(std::forward<decltype(what)&>(what)...); });
+ return tmp;
+}
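A minimal sketch of how the generic hton()/ntoh() overloads above are used: a
wire-format struct exposes adjust_endianness() over its multi-byte fields, and
hton() returns a copy with each listed field byte-swapped. The struct here is
illustrative; eth_hdr in ethernet.hh follows the same pattern.

    struct wire_hdr {
        seastar::net::packed<uint16_t> len;
        seastar::net::packed<uint32_t> seq;
        template <typename Adjuster>
        auto adjust_endianness(Adjuster a) { return a(len, seq); }
    } __attribute__((packed));

    // wire_hdr host_order = ...;
    // wire_hdr net_order = seastar::net::hton(host_order);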
+
+}
+
+}
diff --git a/src/seastar/include/seastar/net/config.hh b/src/seastar/include/seastar/net/config.hh
new file mode 100644
index 000000000..6432d3c45
--- /dev/null
+++ b/src/seastar/include/seastar/net/config.hh
@@ -0,0 +1,65 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2017 Marek Waszkiewicz ( marek.waszkiewicz77@gmail.com )
+ */
+
+#pragma once
+
+#include <seastar/util/std-compat.hh>
+#include <istream>
+#include <string>
+#include <unordered_map>
+
+namespace seastar {
+namespace net {
+
+ struct ipv4_config {
+ std::string ip;
+ std::string netmask;
+ std::string gateway;
+ bool dhcp{ false };
+ };
+
+ struct hw_config {
+ std::string pci_address;
+ std::optional<unsigned> port_index;
+ bool lro{ true };
+ bool tso{ true };
+ bool ufo{ true };
+ bool hw_fc{ true };
+ bool event_index{ true };
+ bool csum_offload{ true };
+ std::optional<unsigned> ring_size;
+ };
+
+ struct device_config {
+ ipv4_config ip_cfg;
+ hw_config hw_cfg;
+ };
+
+ std::unordered_map<std::string, device_config> parse_config(std::istream& input);
+
+ class config_exception : public std::runtime_error {
+ public:
+ config_exception(const std::string& msg)
+ : std::runtime_error(msg) {
+ }
+ };
+}
+}
diff --git a/src/seastar/include/seastar/net/const.hh b/src/seastar/include/seastar/net/const.hh
new file mode 100644
index 000000000..7a79b85d3
--- /dev/null
+++ b/src/seastar/include/seastar/net/const.hh
@@ -0,0 +1,44 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+namespace seastar {
+
+namespace net {
+
+enum class ip_protocol_num : uint8_t {
+ icmp = 1, tcp = 6, udp = 17, unused = 255
+};
+
+enum class eth_protocol_num : uint16_t {
+ ipv4 = 0x0800, arp = 0x0806, ipv6 = 0x86dd
+};
+
+const uint8_t eth_hdr_len = 14;
+const uint8_t tcp_hdr_len_min = 20;
+const uint8_t ipv4_hdr_len_min = 20;
+const uint8_t ipv6_hdr_len_min = 40;
+const uint16_t ip_packet_len_max = 65535;
+
+}
+
+}
diff --git a/src/seastar/include/seastar/net/dhcp.hh b/src/seastar/include/seastar/net/dhcp.hh
new file mode 100644
index 000000000..09d209e83
--- /dev/null
+++ b/src/seastar/include/seastar/net/dhcp.hh
@@ -0,0 +1,84 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2014 Cloudius Systems
+ */
+
+#pragma once
+
+#include <seastar/net/ip.hh>
+#include <seastar/util/std-compat.hh>
+
+namespace seastar {
+
+namespace net {
+
+/*
+ * Simplistic DHCP query class.
+ * Due to the nature of the native stack,
+ * it operates on an "ipv4" object instead of,
+ * for example, an interface.
+ */
+class dhcp {
+public:
+ dhcp(ipv4 &);
+ dhcp(dhcp &&) noexcept;
+ ~dhcp();
+
+ static const steady_clock_type::duration default_timeout;
+
+ struct lease {
+ ipv4_address ip;
+ ipv4_address netmask;
+ ipv4_address broadcast;
+
+ ipv4_address gateway;
+ ipv4_address dhcp_server;
+
+ std::vector<ipv4_address> name_servers;
+
+ std::chrono::seconds lease_time;
+ std::chrono::seconds renew_time;
+ std::chrono::seconds rebind_time;
+
+ uint16_t mtu = 0;
+ };
+
+ typedef future<std::optional<lease>> result_type;
+
+ /**
+ * Runs a discover/request sequence on the ipv4 "stack".
+ * During this execution the ipv4 will be "hijacked"
+ * more or less (through a packet filter), and while not
+ * inoperable, most likely noticeably less efficient.
+ *
+ * Please note that this does _not_ modify the bound ipv4 object.
+ * It only makes queries and records replies for the related NIC.
+ * It is up to the caller to use the returned information as they see fit.
+ */
+ result_type discover(const steady_clock_type::duration & = default_timeout);
+ result_type renew(const lease &, const steady_clock_type::duration & = default_timeout);
+ ip_packet_filter* get_ipv4_filter();
+private:
+ class impl;
+ std::unique_ptr<impl> _impl;
+};
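A hedged usage sketch: run a DHCP discover against a native-stack ipv4 object. In real
use the filter returned by get_ipv4_filter() has to be installed on the stack first
(see set_packet_filter() in ip.hh); error and timeout handling are omitted.

    seastar::future<> acquire(seastar::net::ipv4& stack) {
        return seastar::do_with(seastar::net::dhcp(stack), [&stack] (seastar::net::dhcp& d) {
            stack.set_packet_filter(d.get_ipv4_filter());
            return d.discover().then([&stack] (std::optional<seastar::net::dhcp::lease> l) {
                stack.set_packet_filter(nullptr);
                if (l) {
                    stack.set_host_address(l->ip);          // apply the offered address
                    stack.set_netmask_address(l->netmask);
                    stack.set_gw_address(l->gateway);
                }
            });
        });
    }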
+
+}
+
+}
diff --git a/src/seastar/include/seastar/net/dns.hh b/src/seastar/include/seastar/net/dns.hh
new file mode 100644
index 000000000..3ece937d2
--- /dev/null
+++ b/src/seastar/include/seastar/net/dns.hh
@@ -0,0 +1,156 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2016 Cloudius Systems
+ */
+
+#pragma once
+
+#include <vector>
+#include <unordered_map>
+#include <memory>
+#include <seastar/util/std-compat.hh>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/sstring.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/net/inet_address.hh>
+
+namespace seastar {
+
+struct ipv4_addr;
+
+class socket_address;
+class network_stack;
+
+/**
+ * C-ares based dns query support.
+ * Handles name- and ip-based resolution.
+ *
+ */
+
+namespace net {
+
+/**
+ * A c++-esque version of a hostent
+ */
+struct hostent {
+ // Primary name is always first
+ std::vector<sstring> names;
+ // Primary address is also always first.
+ std::vector<inet_address> addr_list;
+};
+
+typedef std::optional<inet_address::family> opt_family;
+
+struct srv_record {
+ unsigned short priority;
+ unsigned short weight;
+ unsigned short port;
+ sstring target;
+};
+
+/**
+ * A DNS resolver object.
+ * Wraps the query logic & networking.
+ * Can be instantiated with options and your network
+ * stack of choice, though for "normal" non-test
+ * querying, you are probably better off with the
+ * global calls further down.
+ */
+class dns_resolver {
+public:
+ struct options {
+ std::optional<bool>
+ use_tcp_query;
+ std::optional<std::vector<inet_address>>
+ servers;
+ std::optional<std::chrono::milliseconds>
+ timeout;
+ std::optional<uint16_t>
+ tcp_port, udp_port;
+ std::optional<std::vector<sstring>>
+ domains;
+ };
+
+ enum class srv_proto {
+ tcp, udp
+ };
+ using srv_records = std::vector<srv_record>;
+
+ dns_resolver();
+ dns_resolver(dns_resolver&&) noexcept;
+ explicit dns_resolver(const options&);
+ explicit dns_resolver(network_stack&, const options& = {});
+ ~dns_resolver();
+
+ dns_resolver& operator=(dns_resolver&&) noexcept;
+
+ /**
+ * Resolves a hostname to one or more addresses and aliases
+ */
+ future<hostent> get_host_by_name(const sstring&, opt_family = {});
+ /**
+ * Resolves an address to one or more addresses and aliases
+ */
+ future<hostent> get_host_by_addr(const inet_address&);
+
+ /**
+ * Resolves a hostname to one (primary) address
+ */
+ future<inet_address> resolve_name(const sstring&, opt_family = {});
+ /**
+ * Resolves an address to one (primary) name
+ */
+ future<sstring> resolve_addr(const inet_address&);
+
+ /**
+ * Resolves a service in a given domain to one or more SRV records
+ */
+ future<srv_records> get_srv_records(srv_proto proto,
+ const sstring& service,
+ const sstring& domain);
+
+ /**
+ * Shuts the object down. Great for tests.
+ */
+ future<> close();
+private:
+ class impl;
+ shared_ptr<impl> _impl;
+};
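A hedged sketch of resolving a host name to its primary address with a
default-constructed resolver and then shutting it down; for one-off queries the free
functions in the dns namespace below are usually simpler.

    seastar::future<> lookup(seastar::sstring host) {
        return seastar::do_with(seastar::net::dns_resolver(), [host = std::move(host)]
                (seastar::net::dns_resolver& r) {
            return r.resolve_name(host).then([] (seastar::net::inet_address a) {
                // use a, e.g. build a socket_address from it
            }).finally([&r] {
                return r.close();
            });
        });
    }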
+
+namespace dns {
+
+// See above. These functions simply query using a shard-local,
+// default-stack, default-options resolver.
+future<hostent> get_host_by_name(const sstring&, opt_family = {});
+future<hostent> get_host_by_addr(const inet_address&);
+
+future<inet_address> resolve_name(const sstring&, opt_family = {});
+future<sstring> resolve_addr(const inet_address&);
+
+future<std::vector<srv_record>> get_srv_records(dns_resolver::srv_proto proto,
+ const sstring& service,
+ const sstring& domain);
+
+}
+
+}
+
+}
diff --git a/src/seastar/include/seastar/net/dpdk.hh b/src/seastar/include/seastar/net/dpdk.hh
new file mode 100644
index 000000000..18929374f
--- /dev/null
+++ b/src/seastar/include/seastar/net/dpdk.hh
@@ -0,0 +1,54 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#ifdef SEASTAR_HAVE_DPDK
+
+#include <memory>
+#include <seastar/net/config.hh>
+#include <seastar/net/net.hh>
+#include <seastar/core/sstring.hh>
+
+namespace seastar {
+
+std::unique_ptr<net::device> create_dpdk_net_device(
+ uint16_t port_idx = 0,
+ uint16_t num_queues = 1,
+ bool use_lro = true,
+ bool enable_fc = true);
+
+std::unique_ptr<net::device> create_dpdk_net_device(
+ const net::hw_config& hw_cfg);
+
+
+boost::program_options::options_description get_dpdk_net_options_description();
+
+namespace dpdk {
+/**
+ * @return Number of bytes needed for mempool objects of each QP.
+ */
+uint32_t qp_mempool_obj_size(bool hugetlbfs_membackend);
+}
+
+}
+
+#endif // SEASTAR_HAVE_DPDK
diff --git a/src/seastar/include/seastar/net/ethernet.hh b/src/seastar/include/seastar/net/ethernet.hh
new file mode 100644
index 000000000..9181d4189
--- /dev/null
+++ b/src/seastar/include/seastar/net/ethernet.hh
@@ -0,0 +1,96 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <array>
+#include <assert.h>
+#include <algorithm>
+#include <seastar/net/byteorder.hh>
+
+namespace seastar {
+
+namespace net {
+
+struct ethernet_address {
+ ethernet_address()
+ : mac{} {}
+
+ ethernet_address(const uint8_t *eaddr) {
+ std::copy(eaddr, eaddr + 6, mac.begin());
+ }
+
+ ethernet_address(std::initializer_list<uint8_t> eaddr) {
+ assert(eaddr.size() == mac.size());
+ std::copy(eaddr.begin(), eaddr.end(), mac.begin());
+ }
+
+ std::array<uint8_t, 6> mac;
+
+ template <typename Adjuster>
+ void adjust_endianness(Adjuster a) {}
+
+ static ethernet_address read(const char* p) {
+ ethernet_address ea;
+ std::copy_n(p, size(), reinterpret_cast<char*>(ea.mac.data()));
+ return ea;
+ }
+ static ethernet_address consume(const char*& p) {
+ auto ea = read(p);
+ p += size();
+ return ea;
+ }
+ void write(char* p) const {
+ std::copy_n(reinterpret_cast<const char*>(mac.data()), size(), p);
+ }
+ void produce(char*& p) const {
+ write(p);
+ p += size();
+ }
+ static constexpr size_t size() {
+ return 6;
+ }
+} __attribute__((packed));
+
+std::ostream& operator<<(std::ostream& os, ethernet_address ea);
+
+struct ethernet {
+ using address = ethernet_address;
+ static address broadcast_address() {
+ return {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ }
+ static constexpr uint16_t arp_hardware_type() { return 1; }
+};
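A minimal sketch of the read()/write() wire helpers above: serialize an address into a
raw buffer and parse it back. The address bytes are illustrative.

    char buf[seastar::net::ethernet_address::size()];
    seastar::net::ethernet_address src{0x02, 0x00, 0x00, 0x00, 0x00, 0x01};
    src.write(buf);
    auto copy = seastar::net::ethernet_address::read(buf);
    // copy.mac now compares equal to src.mac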
+
+struct eth_hdr {
+ ethernet_address dst_mac;
+ ethernet_address src_mac;
+ packed<uint16_t> eth_proto;
+ template <typename Adjuster>
+ auto adjust_endianness(Adjuster a) {
+ return a(eth_proto);
+ }
+} __attribute__((packed));
+
+ethernet_address parse_ethernet_address(std::string addr);
+}
+
+}
diff --git a/src/seastar/include/seastar/net/inet_address.hh b/src/seastar/include/seastar/net/inet_address.hh
new file mode 100644
index 000000000..612caf866
--- /dev/null
+++ b/src/seastar/include/seastar/net/inet_address.hh
@@ -0,0 +1,128 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 ScyllaDB.
+ */
+
+#pragma once
+
+#include <iosfwd>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <stdexcept>
+#include <vector>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/sstring.hh>
+
+namespace seastar {
+namespace net {
+
+struct ipv4_address;
+struct ipv6_address;
+
+class unknown_host : public std::invalid_argument {
+public:
+ using invalid_argument::invalid_argument;
+};
+
+class inet_address {
+public:
+ enum class family : sa_family_t {
+ INET = AF_INET, INET6 = AF_INET6
+ };
+private:
+ family _in_family;
+
+ union {
+ ::in_addr _in;
+ ::in6_addr _in6;
+ };
+
+ uint32_t _scope = invalid_scope;
+public:
+ static constexpr uint32_t invalid_scope = std::numeric_limits<uint32_t>::max();
+
+ inet_address() noexcept;
+ inet_address(family) noexcept;
+ inet_address(::in_addr i) noexcept;
+ inet_address(::in6_addr i, uint32_t scope = invalid_scope) noexcept;
+ // NOTE: does _not_ resolve the address. Only parses
+ // ipv4/ipv6 numerical address
+ // throws std::invalid_argument if sstring is invalid
+ inet_address(const sstring&);
+ inet_address(inet_address&&) noexcept = default;
+ inet_address(const inet_address&) noexcept = default;
+
+ inet_address(const ipv4_address&) noexcept;
+ inet_address(const ipv6_address&, uint32_t scope = invalid_scope) noexcept;
+
+ // throws iff ipv6
+ ipv4_address as_ipv4_address() const;
+ ipv6_address as_ipv6_address() const noexcept;
+
+ inet_address& operator=(const inet_address&) noexcept = default;
+ bool operator==(const inet_address&) const noexcept;
+
+ family in_family() const noexcept {
+ return _in_family;
+ }
+
+ bool is_ipv6() const noexcept {
+ return _in_family == family::INET6;
+ }
+ bool is_ipv4() const noexcept {
+ return _in_family == family::INET;
+ }
+
+ size_t size() const noexcept;
+ const void * data() const noexcept;
+
+ uint32_t scope() const noexcept {
+ return _scope;
+ }
+
+ // throws iff ipv6
+ operator ::in_addr() const;
+ operator ::in6_addr() const noexcept;
+
+ operator ipv6_address() const noexcept;
+
+ future<sstring> hostname() const;
+ future<std::vector<sstring>> aliases() const;
+
+ static future<inet_address> find(const sstring&);
+ static future<inet_address> find(const sstring&, family);
+ static future<std::vector<inet_address>> find_all(const sstring&);
+ static future<std::vector<inet_address>> find_all(const sstring&, family);
+
+ static std::optional<inet_address> parse_numerical(const sstring&);
+};
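A hedged sketch of the purely numerical parsing paths (no DNS lookup is performed);
the literals are illustrative.

    seastar::net::inet_address a("192.168.1.10");   // throws std::invalid_argument on bad input
    if (a.is_ipv4()) {
        auto v4 = a.as_ipv4_address();               // only throws for ipv6 addresses
    }
    auto v6 = seastar::net::inet_address::parse_numerical("::1");  // std::nullopt on failure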
+
+std::ostream& operator<<(std::ostream&, const inet_address&);
+std::ostream& operator<<(std::ostream&, const inet_address::family&);
+
+}
+}
+
+namespace std {
+template<>
+struct hash<seastar::net::inet_address> {
+ size_t operator()(const seastar::net::inet_address&) const;
+};
+}
diff --git a/src/seastar/include/seastar/net/ip.hh b/src/seastar/include/seastar/net/ip.hh
new file mode 100644
index 000000000..68ba5a8c5
--- /dev/null
+++ b/src/seastar/include/seastar/net/ip.hh
@@ -0,0 +1,529 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+
+#pragma once
+
+#include <boost/asio/ip/address_v4.hpp>
+#include <arpa/inet.h>
+#include <unordered_map>
+#include <cstdint>
+#include <array>
+#include <map>
+#include <list>
+#include <chrono>
+#include <seastar/core/array_map.hh>
+#include <seastar/net/byteorder.hh>
+#include <seastar/core/byteorder.hh>
+#include <seastar/net/arp.hh>
+#include <seastar/net/ip_checksum.hh>
+#include <seastar/net/const.hh>
+#include <seastar/net/packet-util.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/net/toeplitz.hh>
+#include <seastar/net/udp.hh>
+#include <seastar/core/metrics_registration.hh>
+
+namespace seastar {
+
+struct ipv6_addr;
+
+namespace net {
+
+class ipv4;
+template <ip_protocol_num ProtoNum>
+class ipv4_l4;
+struct ipv4_address;
+
+template <typename InetTraits>
+class tcp;
+
+struct ipv4_address {
+ ipv4_address() noexcept : ip(0) {}
+ explicit ipv4_address(uint32_t ip) noexcept : ip(ip) {}
+ // throws if addr is not a valid ipv4 address
+ explicit ipv4_address(const std::string& addr);
+ ipv4_address(ipv4_addr addr) noexcept {
+ ip = addr.ip;
+ }
+
+ packed<uint32_t> ip;
+
+ template <typename Adjuster>
+ auto adjust_endianness(Adjuster a) { return a(ip); }
+
+ friend bool operator==(ipv4_address x, ipv4_address y) noexcept {
+ return x.ip == y.ip;
+ }
+ friend bool operator!=(ipv4_address x, ipv4_address y) noexcept {
+ return x.ip != y.ip;
+ }
+
+ static ipv4_address read(const char* p) noexcept {
+ ipv4_address ia;
+ ia.ip = read_be<uint32_t>(p);
+ return ia;
+ }
+ static ipv4_address consume(const char*& p) noexcept {
+ auto ia = read(p);
+ p += 4;
+ return ia;
+ }
+ void write(char* p) const noexcept {
+ write_be<uint32_t>(p, ip);
+ }
+ void produce(char*& p) const noexcept {
+ produce_be<uint32_t>(p, ip);
+ }
+ static constexpr size_t size() {
+ return 4;
+ }
+} __attribute__((packed));
+
+static inline bool is_unspecified(ipv4_address addr) noexcept { return addr.ip == 0; }
+
+std::ostream& operator<<(std::ostream& os, const ipv4_address& a);
+
+// IPv6
+struct ipv6_address {
+ using ipv6_bytes = std::array<uint8_t, 16>;
+
+ static_assert(alignof(ipv6_bytes) == 1, "ipv6_bytes should be byte-aligned");
+ static_assert(sizeof(ipv6_bytes) == 16, "ipv6_bytes should be 16 bytes");
+
+ ipv6_address() noexcept;
+ explicit ipv6_address(const ::in6_addr&) noexcept;
+ explicit ipv6_address(const ipv6_bytes&) noexcept;
+ // throws if addr is not a valid ipv6 address
+ explicit ipv6_address(const std::string&);
+ ipv6_address(const ipv6_addr& addr) noexcept;
+
+ // No need to use packed - we only store
+ // as byte array. If we want to read as
+ // uints or whatnot, we must copy
+ ipv6_bytes ip;
+
+ template <typename Adjuster>
+ auto adjust_endianness(Adjuster a) { return a(ip); }
+
+ bool operator==(const ipv6_address& y) const noexcept {
+ return bytes() == y.bytes();
+ }
+ bool operator!=(const ipv6_address& y) const noexcept {
+ return !(*this == y);
+ }
+
+ const ipv6_bytes& bytes() const noexcept {
+ return ip;
+ }
+
+ bool is_unspecified() const noexcept;
+
+ static ipv6_address read(const char*) noexcept;
+ static ipv6_address consume(const char*& p) noexcept;
+ void write(char* p) const noexcept;
+ void produce(char*& p) const noexcept;
+ static constexpr size_t size() {
+ return sizeof(ipv6_bytes);
+ }
+} __attribute__((packed));
+
+std::ostream& operator<<(std::ostream&, const ipv6_address&);
+
+}
+
+}
+
+namespace std {
+
+template <>
+struct hash<seastar::net::ipv4_address> {
+ size_t operator()(seastar::net::ipv4_address a) const { return a.ip; }
+};
+
+template <>
+struct hash<seastar::net::ipv6_address> {
+ size_t operator()(const seastar::net::ipv6_address&) const;
+};
+
+}
+
+namespace seastar {
+
+namespace net {
+
+struct ipv4_traits {
+ using address_type = ipv4_address;
+ using inet_type = ipv4_l4<ip_protocol_num::tcp>;
+ struct l4packet {
+ ipv4_address to;
+ packet p;
+ ethernet_address e_dst;
+ ip_protocol_num proto_num;
+ };
+ using packet_provider_type = std::function<std::optional<l4packet> ()>;
+ static void tcp_pseudo_header_checksum(checksummer& csum, ipv4_address src, ipv4_address dst, uint16_t len) {
+ csum.sum_many(src.ip.raw, dst.ip.raw, uint8_t(0), uint8_t(ip_protocol_num::tcp), len);
+ }
+ static void udp_pseudo_header_checksum(checksummer& csum, ipv4_address src, ipv4_address dst, uint16_t len) {
+ csum.sum_many(src.ip.raw, dst.ip.raw, uint8_t(0), uint8_t(ip_protocol_num::udp), len);
+ }
+ static constexpr uint8_t ip_hdr_len_min = ipv4_hdr_len_min;
+};
+
+template <ip_protocol_num ProtoNum>
+class ipv4_l4 {
+public:
+ ipv4& _inet;
+public:
+ ipv4_l4(ipv4& inet) : _inet(inet) {}
+ void register_packet_provider(ipv4_traits::packet_provider_type func);
+ future<ethernet_address> get_l2_dst_address(ipv4_address to);
+ const ipv4& inet() const {
+ return _inet;
+ }
+};
+
+class ip_protocol {
+public:
+ virtual ~ip_protocol() {}
+ virtual void received(packet p, ipv4_address from, ipv4_address to) = 0;
+ virtual bool forward(forward_hash& out_hash_data, packet& p, size_t off) { return true; }
+};
+
+template <typename InetTraits>
+struct l4connid {
+ using ipaddr = typename InetTraits::address_type;
+ using inet_type = typename InetTraits::inet_type;
+ struct connid_hash;
+
+ ipaddr local_ip;
+ ipaddr foreign_ip;
+ uint16_t local_port;
+ uint16_t foreign_port;
+
+ bool operator==(const l4connid& x) const {
+ return local_ip == x.local_ip
+ && foreign_ip == x.foreign_ip
+ && local_port == x.local_port
+ && foreign_port == x.foreign_port;
+ }
+
+ uint32_t hash(rss_key_type rss_key) {
+ forward_hash hash_data;
+ hash_data.push_back(hton(foreign_ip.ip));
+ hash_data.push_back(hton(local_ip.ip));
+ hash_data.push_back(hton(foreign_port));
+ hash_data.push_back(hton(local_port));
+ return toeplitz_hash(rss_key, hash_data);
+ }
+};
+
+class ipv4_tcp final : public ip_protocol {
+ ipv4_l4<ip_protocol_num::tcp> _inet_l4;
+ std::unique_ptr<tcp<ipv4_traits>> _tcp;
+public:
+ ipv4_tcp(ipv4& inet);
+ ~ipv4_tcp();
+ virtual void received(packet p, ipv4_address from, ipv4_address to) override;
+ virtual bool forward(forward_hash& out_hash_data, packet& p, size_t off) override;
+ friend class ipv4;
+};
+
+struct icmp_hdr {
+ enum class msg_type : uint8_t {
+ echo_reply = 0,
+ echo_request = 8,
+ };
+ msg_type type;
+ uint8_t code;
+ packed<uint16_t> csum;
+ packed<uint32_t> rest;
+ template <typename Adjuster>
+ auto adjust_endianness(Adjuster a) {
+ return a(csum);
+ }
+} __attribute__((packed));
+
+
+class icmp {
+public:
+ using ipaddr = ipv4_address;
+ using inet_type = ipv4_l4<ip_protocol_num::icmp>;
+ explicit icmp(inet_type& inet) : _inet(inet) {
+ _inet.register_packet_provider([this] {
+ std::optional<ipv4_traits::l4packet> l4p;
+ if (!_packetq.empty()) {
+ l4p = std::move(_packetq.front());
+ _packetq.pop_front();
+ _queue_space.signal(l4p.value().p.len());
+ }
+ return l4p;
+ });
+ }
+ void received(packet p, ipaddr from, ipaddr to);
+private:
+ inet_type& _inet;
+ circular_buffer<ipv4_traits::l4packet> _packetq;
+ semaphore _queue_space = {212992};
+};
+
+class ipv4_icmp final : public ip_protocol {
+ ipv4_l4<ip_protocol_num::icmp> _inet_l4;
+ icmp _icmp;
+public:
+ ipv4_icmp(ipv4& inet) : _inet_l4(inet), _icmp(_inet_l4) {}
+ virtual void received(packet p, ipv4_address from, ipv4_address to) {
+ _icmp.received(std::move(p), from, to);
+ }
+ friend class ipv4;
+};
+
+class ipv4_udp : public ip_protocol {
+ using connid = l4connid<ipv4_traits>;
+ using connid_hash = typename connid::connid_hash;
+
+public:
+ static const int default_queue_size;
+private:
+ static const uint16_t min_anonymous_port = 32768;
+ ipv4 &_inet;
+ std::unordered_map<uint16_t, lw_shared_ptr<udp_channel_state>> _channels;
+ int _queue_size = default_queue_size;
+ uint16_t _next_anonymous_port = min_anonymous_port;
+ circular_buffer<ipv4_traits::l4packet> _packetq;
+private:
+ uint16_t next_port(uint16_t port);
+public:
+ class registration {
+ private:
+ ipv4_udp &_proto;
+ uint16_t _port;
+ public:
+ registration(ipv4_udp &proto, uint16_t port) : _proto(proto), _port(port) {};
+
+ void unregister() {
+ _proto._channels.erase(_proto._channels.find(_port));
+ }
+
+ uint16_t port() const {
+ return _port;
+ }
+ };
+
+ ipv4_udp(ipv4& inet);
+ udp_channel make_channel(ipv4_addr addr);
+ virtual void received(packet p, ipv4_address from, ipv4_address to) override;
+ void send(uint16_t src_port, ipv4_addr dst, packet &&p);
+ bool forward(forward_hash& out_hash_data, packet& p, size_t off) override;
+ void set_queue_size(int size) { _queue_size = size; }
+
+ const ipv4& inet() const {
+ return _inet;
+ }
+};
+
+struct ip_hdr;
+
+struct ip_packet_filter {
+ virtual ~ip_packet_filter() {};
+ virtual future<> handle(packet& p, ip_hdr* iph, ethernet_address from, bool & handled) = 0;
+};
+
+struct ipv4_frag_id {
+ struct hash;
+ ipv4_address src_ip;
+ ipv4_address dst_ip;
+ uint16_t identification;
+ uint8_t protocol;
+ bool operator==(const ipv4_frag_id& x) const {
+ return src_ip == x.src_ip &&
+ dst_ip == x.dst_ip &&
+ identification == x.identification &&
+ protocol == x.protocol;
+ }
+};
+
+struct ipv4_frag_id::hash : private std::hash<ipv4_address>,
+ private std::hash<uint16_t>, private std::hash<uint8_t> {
+ size_t operator()(const ipv4_frag_id& id) const noexcept {
+ using h1 = std::hash<ipv4_address>;
+ using h2 = std::hash<uint16_t>;
+ using h3 = std::hash<uint8_t>;
+ return h1::operator()(id.src_ip) ^
+ h1::operator()(id.dst_ip) ^
+ h2::operator()(id.identification) ^
+ h3::operator()(id.protocol);
+ }
+};
+
+struct ipv4_tag {};
+using ipv4_packet_merger = packet_merger<uint32_t, ipv4_tag>;
+
+class ipv4 {
+public:
+ using clock_type = lowres_clock;
+ using address_type = ipv4_address;
+ using proto_type = uint16_t;
+ static address_type broadcast_address() { return ipv4_address(0xffffffff); }
+ static proto_type arp_protocol_type() { return proto_type(eth_protocol_num::ipv4); }
+private:
+ interface* _netif;
+ std::vector<ipv4_traits::packet_provider_type> _pkt_providers;
+ arp _global_arp;
+ arp_for<ipv4> _arp;
+ ipv4_address _host_address;
+ ipv4_address _gw_address;
+ ipv4_address _netmask;
+ l3_protocol _l3;
+ ipv4_tcp _tcp;
+ ipv4_icmp _icmp;
+ ipv4_udp _udp;
+ array_map<ip_protocol*, 256> _l4;
+ ip_packet_filter * _packet_filter = nullptr;
+ struct frag {
+ packet header;
+ ipv4_packet_merger data;
+ clock_type::time_point rx_time;
+ uint32_t mem_size = 0;
+ // fragment with MF == 0 indicates it is the last fragment
+ bool last_frag_received = false;
+
+ packet get_assembled_packet(ethernet_address from, ethernet_address to);
+ int32_t merge(ip_hdr &h, uint16_t offset, packet p);
+ bool is_complete();
+ };
+ std::unordered_map<ipv4_frag_id, frag, ipv4_frag_id::hash> _frags;
+ std::list<ipv4_frag_id> _frags_age;
+ static constexpr std::chrono::seconds _frag_timeout{30};
+ static constexpr uint32_t _frag_low_thresh{3 * 1024 * 1024};
+ static constexpr uint32_t _frag_high_thresh{4 * 1024 * 1024};
+ uint32_t _frag_mem{0};
+ timer<lowres_clock> _frag_timer;
+ circular_buffer<l3_protocol::l3packet> _packetq;
+ unsigned _pkt_provider_idx = 0;
+ metrics::metric_groups _metrics;
+private:
+ future<> handle_received_packet(packet p, ethernet_address from);
+ bool forward(forward_hash& out_hash_data, packet& p, size_t off);
+ std::optional<l3_protocol::l3packet> get_packet();
+ bool in_my_netmask(ipv4_address a) const;
+ void frag_limit_mem();
+ void frag_timeout();
+ void frag_drop(ipv4_frag_id frag_id, uint32_t dropped_size);
+ void frag_arm(clock_type::time_point now) {
+ auto tp = now + _frag_timeout;
+ _frag_timer.arm(tp);
+ }
+ void frag_arm() {
+ auto now = clock_type::now();
+ frag_arm(now);
+ }
+public:
+ explicit ipv4(interface* netif);
+ void set_host_address(ipv4_address ip);
+ ipv4_address host_address() const;
+ void set_gw_address(ipv4_address ip);
+ ipv4_address gw_address() const;
+ void set_netmask_address(ipv4_address ip);
+ ipv4_address netmask_address() const;
+ interface * netif() const {
+ return _netif;
+ }
+ // TODO or something. Should perhaps truly be a list
+ // of filters. With ordering. And blackjack. Etc.
+ // But for now, a simple single raw pointer suffices
+ void set_packet_filter(ip_packet_filter *);
+ ip_packet_filter * packet_filter() const;
+ void send(ipv4_address to, ip_protocol_num proto_num, packet p, ethernet_address e_dst);
+ tcp<ipv4_traits>& get_tcp() { return *_tcp._tcp; }
+ ipv4_udp& get_udp() { return _udp; }
+ void register_l4(proto_type id, ip_protocol* handler);
+ const net::hw_features& hw_features() const { return _netif->hw_features(); }
+ static bool needs_frag(packet& p, ip_protocol_num proto_num, net::hw_features hw_features);
+ void learn(ethernet_address l2, ipv4_address l3) {
+ _arp.learn(l2, l3);
+ }
+ void register_packet_provider(ipv4_traits::packet_provider_type&& func) {
+ _pkt_providers.push_back(std::move(func));
+ }
+ future<ethernet_address> get_l2_dst_address(ipv4_address to);
+};
+
+template <ip_protocol_num ProtoNum>
+inline
+void ipv4_l4<ProtoNum>::register_packet_provider(ipv4_traits::packet_provider_type func) {
+ _inet.register_packet_provider([func = std::move(func)] {
+ auto l4p = func();
+ if (l4p) {
+ l4p.value().proto_num = ProtoNum;
+ }
+ return l4p;
+ });
+}
+
+template <ip_protocol_num ProtoNum>
+inline
+future<ethernet_address> ipv4_l4<ProtoNum>::get_l2_dst_address(ipv4_address to) {
+ return _inet.get_l2_dst_address(to);
+}
+
+struct ip_hdr {
+ uint8_t ihl : 4;
+ uint8_t ver : 4;
+ uint8_t dscp : 6;
+ uint8_t ecn : 2;
+ packed<uint16_t> len;
+ packed<uint16_t> id;
+ packed<uint16_t> frag;
+ enum class frag_bits : uint8_t { mf = 13, df = 14, reserved = 15, offset_shift = 3 };
+ uint8_t ttl;
+ uint8_t ip_proto;
+ packed<uint16_t> csum;
+ ipv4_address src_ip;
+ ipv4_address dst_ip;
+ uint8_t options[0];
+ template <typename Adjuster>
+ auto adjust_endianness(Adjuster a) {
+ return a(len, id, frag, csum, src_ip, dst_ip);
+ }
+ bool mf() { return frag & (1 << uint8_t(frag_bits::mf)); }
+ bool df() { return frag & (1 << uint8_t(frag_bits::df)); }
+ uint16_t offset() { return frag << uint8_t(frag_bits::offset_shift); }
+} __attribute__((packed));
+
+template <typename InetTraits>
+struct l4connid<InetTraits>::connid_hash : private std::hash<ipaddr>, private std::hash<uint16_t> {
+ size_t operator()(const l4connid<InetTraits>& id) const noexcept {
+ using h1 = std::hash<ipaddr>;
+ using h2 = std::hash<uint16_t>;
+ return h1::operator()(id.local_ip)
+ ^ h1::operator()(id.foreign_ip)
+ ^ h2::operator()(id.local_port)
+ ^ h2::operator()(id.foreign_port);
+ }
+};
+
+void arp_learn(ethernet_address l2, ipv4_address l3);
+
+}
+
+}
diff --git a/src/seastar/include/seastar/net/ip_checksum.hh b/src/seastar/include/seastar/net/ip_checksum.hh
new file mode 100644
index 000000000..cb3539149
--- /dev/null
+++ b/src/seastar/include/seastar/net/ip_checksum.hh
@@ -0,0 +1,76 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/net/packet.hh>
+#include <cstdint>
+#include <cstddef>
+#include <arpa/inet.h>
+
+namespace seastar {
+
+namespace net {
+
+uint16_t ip_checksum(const void* data, size_t len);
+
+struct checksummer {
+ __int128 csum = 0;
+ bool odd = false;
+ void sum(const char* data, size_t len);
+ void sum(const packet& p);
+ void sum(uint8_t data) {
+ if (!odd) {
+ csum += data << 8;
+ } else {
+ csum += data;
+ }
+ odd = !odd;
+ }
+ void sum(uint16_t data) {
+ if (odd) {
+ sum(uint8_t(data >> 8));
+ sum(uint8_t(data));
+ } else {
+ csum += data;
+ }
+ }
+ void sum(uint32_t data) {
+ if (odd) {
+ sum(uint16_t(data));
+ sum(uint16_t(data >> 16));
+ } else {
+ csum += data;
+ }
+ }
+ void sum_many() {}
+ template <typename T0, typename... T>
+ void sum_many(T0 data, T... rest) {
+ sum(data);
+ sum_many(rest...);
+ }
+ uint16_t get() const;
+};
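The checksummer above keeps a wide running sum plus a byte-parity flag so that odd-length chunks fold correctly into the final 16-bit one's-complement value returned by get(). Below is a minimal usage sketch; the field values are made up and byte-order handling is deliberately glossed over, so treat it as an illustration of sum_many() rather than a correct pseudo-header routine.

// Illustration only; values are hypothetical and byte order is simplified.
#include <seastar/net/ip_checksum.hh>

uint16_t pseudo_header_csum_example(uint32_t src_ip, uint32_t dst_ip,
                                    uint8_t proto, uint16_t l4_len) {
    seastar::net::checksummer csum;
    // sum_many() forwards each argument to the matching sum() overload.
    csum.sum_many(src_ip, dst_ip, uint16_t(proto), l4_len);
    return csum.get();
}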
+
+}
+
+}
+
diff --git a/src/seastar/include/seastar/net/native-stack.hh b/src/seastar/include/seastar/net/native-stack.hh
new file mode 100644
index 000000000..f74c97596
--- /dev/null
+++ b/src/seastar/include/seastar/net/native-stack.hh
@@ -0,0 +1,36 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/net/net.hh>
+#include <boost/program_options.hpp>
+
+namespace seastar {
+
+namespace net {
+
+void create_native_stack(boost::program_options::variables_map opts, std::shared_ptr<device> dev);
+void register_native_stack();
+
+}
+
+}
diff --git a/src/seastar/include/seastar/net/net.hh b/src/seastar/include/seastar/net/net.hh
new file mode 100644
index 000000000..9d3fb1f82
--- /dev/null
+++ b/src/seastar/include/seastar/net/net.hh
@@ -0,0 +1,308 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/smp.hh>
+#include <seastar/core/deleter.hh>
+#include <seastar/core/queue.hh>
+#include <seastar/core/stream.hh>
+#include <seastar/core/metrics_registration.hh>
+#include <seastar/net/toeplitz.hh>
+#include <seastar/net/ethernet.hh>
+#include <seastar/net/packet.hh>
+#include <seastar/net/const.hh>
+#include <unordered_map>
+
+namespace seastar {
+
+namespace internal {
+
+class poller;
+
+}
+
+namespace net {
+
+class packet;
+class interface;
+class device;
+class qp;
+class l3_protocol;
+
+class forward_hash {
+ uint8_t data[64];
+ size_t end_idx = 0;
+public:
+ size_t size() const {
+ return end_idx;
+ }
+ void push_back(uint8_t b) {
+ assert(end_idx < sizeof(data));
+ data[end_idx++] = b;
+ }
+ void push_back(uint16_t b) {
+ push_back(uint8_t(b));
+ push_back(uint8_t(b >> 8));
+ }
+ void push_back(uint32_t b) {
+ push_back(uint16_t(b));
+ push_back(uint16_t(b >> 16));
+ }
+ const uint8_t& operator[](size_t idx) const {
+ return data[idx];
+ }
+};
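forward_hash above is a small fixed-size byte accumulator: the L3/L4 code pushes the fields of the connection tuple into it, and the result is later fed to the device's RSS hash to pick a shard. A hedged sketch of filling it (the addresses and ports are made-up values):

// Illustration only; the tuple values are hypothetical.
seastar::net::forward_hash make_tuple_key_example() {
    seastar::net::forward_hash key;
    key.push_back(uint32_t(0x0a000001));  // source IPv4 address
    key.push_back(uint32_t(0x0a000002));  // destination IPv4 address
    key.push_back(uint16_t(12345));       // source port
    key.push_back(uint16_t(80));          // destination port
    return key;                           // 12 bytes accumulated
}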
+
+struct hw_features {
+ // Enable tx ip header checksum offload
+ bool tx_csum_ip_offload = false;
+ // Enable tx l4 (TCP or UDP) checksum offload
+ bool tx_csum_l4_offload = false;
+ // Enable rx checksum offload
+ bool rx_csum_offload = false;
+ // LRO is enabled
+ bool rx_lro = false;
+ // Enable tx TCP segment offload
+ bool tx_tso = false;
+ // Enable tx UDP fragmentation offload
+ bool tx_ufo = false;
+ // Maximum Transmission Unit
+ uint16_t mtu = 1500;
+ // Maximum packet length when TCP/UDP offload is enabled
+ uint16_t max_packet_len = ip_packet_len_max - eth_hdr_len;
+};
+
+class l3_protocol {
+public:
+ struct l3packet {
+ eth_protocol_num proto_num;
+ ethernet_address to;
+ packet p;
+ };
+ using packet_provider_type = std::function<std::optional<l3packet> ()>;
+private:
+ interface* _netif;
+ eth_protocol_num _proto_num;
+public:
+ explicit l3_protocol(interface* netif, eth_protocol_num proto_num, packet_provider_type func);
+ future<> receive(
+ std::function<future<> (packet, ethernet_address)> rx_fn,
+ std::function<bool (forward_hash&, packet&, size_t)> forward);
+private:
+ friend class interface;
+};
+
+class interface {
+ struct l3_rx_stream {
+ stream<packet, ethernet_address> packet_stream;
+ future<> ready;
+ std::function<bool (forward_hash&, packet&, size_t)> forward;
+ l3_rx_stream(std::function<bool (forward_hash&, packet&, size_t)>&& fw) : ready(packet_stream.started()), forward(fw) {}
+ };
+ std::unordered_map<uint16_t, l3_rx_stream> _proto_map;
+ std::shared_ptr<device> _dev;
+ ethernet_address _hw_address;
+ net::hw_features _hw_features;
+ std::vector<l3_protocol::packet_provider_type> _pkt_providers;
+private:
+ future<> dispatch_packet(packet p);
+public:
+ explicit interface(std::shared_ptr<device> dev);
+ ethernet_address hw_address() { return _hw_address; }
+ const net::hw_features& hw_features() const { return _hw_features; }
+ future<> register_l3(eth_protocol_num proto_num,
+ std::function<future<> (packet p, ethernet_address from)> next,
+ std::function<bool (forward_hash&, packet&, size_t)> forward);
+ void forward(unsigned cpuid, packet p);
+ unsigned hash2cpu(uint32_t hash);
+ void register_packet_provider(l3_protocol::packet_provider_type func) {
+ _pkt_providers.push_back(std::move(func));
+ }
+ uint16_t hw_queues_count();
+ rss_key_type rss_key() const;
+ friend class l3_protocol;
+};
+
+struct qp_stats_good {
+ /**
+ * Update the statistics related to the last bunch of packets.
+ *
+ * Update the last packets bunch size and the total packets counter.
+ *
+ * @param count Number of packets in the last packets bunch.
+ */
+ void update_pkts_bunch(uint64_t count) {
+ last_bunch = count;
+ packets += count;
+ }
+
+ /**
+ * Increment the appropriate counters when fragments have been
+ * processed by copying.
+ *
+ * @param nr_frags Number of copied fragments
+ * @param bytes Number of copied bytes
+ */
+ void update_copy_stats(uint64_t nr_frags, uint64_t bytes) {
+ copy_frags += nr_frags;
+ copy_bytes += bytes;
+ }
+
+ /**
+ * Increment total fragments and bytes statistics
+ *
+ * @param nfrags Number of processed fragments
+ * @param nbytes Number of bytes in the processed fragments
+ */
+ void update_frags_stats(uint64_t nfrags, uint64_t nbytes) {
+ nr_frags += nfrags;
+ bytes += nbytes;
+ }
+
+ uint64_t bytes; // total number of bytes
+ uint64_t nr_frags; // total number of fragments
+ uint64_t copy_frags; // fragments that were copied on L2 level
+ uint64_t copy_bytes; // bytes that were copied on L2 level
+ uint64_t packets; // total number of packets
+ uint64_t last_bunch; // number of packets in the last sent/received bunch
+};
+
+struct qp_stats {
+ qp_stats() : rx{}, tx{} {}
+
+ struct {
+ struct qp_stats_good good;
+
+ struct {
+ void inc_csum_err() {
+ ++csum;
+ ++total;
+ }
+
+ void inc_no_mem() {
+ ++no_mem;
+ ++total;
+ }
+
+ uint64_t no_mem; // Packets dropped due to allocation failure
+ uint64_t total; // total number of erroneous packets
+ uint64_t csum; // packets with bad checksum
+ } bad;
+ } rx;
+
+ struct {
+ struct qp_stats_good good;
+ uint64_t linearized; // number of packets that were linearized
+ } tx;
+};
+
+class qp {
+ using packet_provider_type = std::function<std::optional<packet> ()>;
+ std::vector<packet_provider_type> _pkt_providers;
+ std::optional<std::array<uint8_t, 128>> _sw_reta;
+ circular_buffer<packet> _proxy_packetq;
+ stream<packet> _rx_stream;
+ std::unique_ptr<internal::poller> _tx_poller;
+ circular_buffer<packet> _tx_packetq;
+
+protected:
+ const std::string _stats_plugin_name;
+ const std::string _queue_name;
+ metrics::metric_groups _metrics;
+ qp_stats _stats;
+
+public:
+ qp(bool register_copy_stats = false,
+ const std::string stats_plugin_name = std::string("network"),
+ uint8_t qid = 0);
+ virtual ~qp();
+ virtual future<> send(packet p) = 0;
+ virtual uint32_t send(circular_buffer<packet>& p) {
+ uint32_t sent = 0;
+ while (!p.empty()) {
+ // FIXME: future is discarded
+ (void)send(std::move(p.front()));
+ p.pop_front();
+ sent++;
+ }
+ return sent;
+ }
+ virtual void rx_start() {};
+ void configure_proxies(const std::map<unsigned, float>& cpu_weights);
+ // build REdirection TAble for cpu_weights map: target cpu -> weight
+ void build_sw_reta(const std::map<unsigned, float>& cpu_weights);
+ void proxy_send(packet p) {
+ _proxy_packetq.push_back(std::move(p));
+ }
+ void register_packet_provider(packet_provider_type func) {
+ _pkt_providers.push_back(std::move(func));
+ }
+ bool poll_tx();
+ friend class device;
+};
+
+class device {
+protected:
+ std::unique_ptr<qp*[]> _queues;
+ size_t _rss_table_bits = 0;
+public:
+ device() {
+ _queues = std::make_unique<qp*[]>(smp::count);
+ }
+ virtual ~device() {};
+ qp& queue_for_cpu(unsigned cpu) { return *_queues[cpu]; }
+ qp& local_queue() { return queue_for_cpu(this_shard_id()); }
+ void l2receive(packet p) {
+ // FIXME: future is discarded
+ (void)_queues[this_shard_id()]->_rx_stream.produce(std::move(p));
+ }
+ future<> receive(std::function<future<> (packet)> next_packet);
+ virtual ethernet_address hw_address() = 0;
+ virtual net::hw_features hw_features() = 0;
+ virtual rss_key_type rss_key() const { return default_rsskey_40bytes; }
+ virtual uint16_t hw_queues_count() { return 1; }
+ virtual future<> link_ready() { return make_ready_future<>(); }
+ virtual std::unique_ptr<qp> init_local_queue(boost::program_options::variables_map opts, uint16_t qid) = 0;
+ virtual unsigned hash2qid(uint32_t hash) {
+ return hash % hw_queues_count();
+ }
+ void set_local_queue(std::unique_ptr<qp> dev);
+ template <typename Func>
+ unsigned forward_dst(unsigned src_cpuid, Func&& hashfn) {
+ auto& qp = queue_for_cpu(src_cpuid);
+ if (!qp._sw_reta) {
+ return src_cpuid;
+ }
+ auto hash = hashfn() >> _rss_table_bits;
+ auto& reta = *qp._sw_reta;
+ return reta[hash % reta.size()];
+ }
+ virtual unsigned hash2cpu(uint32_t hash) {
+ // there is an assumption here that qid == cpu_id, which will
+ // not necessarily be true in the future
+ return forward_dst(hash2qid(hash), [hash] { return hash; });
+ }
+};
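hash2cpu() above composes two steps: hash2qid() maps an RSS hash to a queue, and forward_dst() consults that queue's software redirection table (when one has been built with build_sw_reta) to pick the destination shard. A hedged usage sketch, where dev and rss stand in for a concrete device and an already-computed packet hash:

// Illustration only; 'dev' and 'rss' are hypothetical inputs.
unsigned pick_destination_shard(seastar::net::device& dev, uint32_t rss) {
    // Same flow as device::hash2cpu(): queue id first, then the software
    // RETA (if configured) redirects to the final shard.
    return dev.forward_dst(dev.hash2qid(rss), [rss] { return rss; });
}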
+
+}
+
+}
diff --git a/src/seastar/include/seastar/net/packet-data-source.hh b/src/seastar/include/seastar/net/packet-data-source.hh
new file mode 100644
index 000000000..5c12d5f5c
--- /dev/null
+++ b/src/seastar/include/seastar/net/packet-data-source.hh
@@ -0,0 +1,53 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#pragma once
+
+#include <seastar/net/packet.hh>
+#include <seastar/core/iostream.hh>
+
+namespace seastar {
+
+namespace net {
+
+class packet_data_source final : public data_source_impl {
+ size_t _cur_frag = 0;
+ packet _p;
+public:
+ explicit packet_data_source(net::packet&& p)
+ : _p(std::move(p))
+ {}
+
+ virtual future<temporary_buffer<char>> get() override {
+ if (_cur_frag != _p.nr_frags()) {
+ auto& f = _p.fragments()[_cur_frag++];
+ return make_ready_future<temporary_buffer<char>>(
+ temporary_buffer<char>(f.base, f.size,
+ make_deleter(deleter(), [p = _p.share()] () mutable {})));
+ }
+ return make_ready_future<temporary_buffer<char>>(temporary_buffer<char>());
+ }
+};
+
+static inline
+input_stream<char> as_input_stream(packet&& p) {
+ return input_stream<char>(data_source(std::make_unique<packet_data_source>(std::move(p))));
+}
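as_input_stream() above adapts a packet into a regular input_stream without copying: each get() hands out one fragment as a temporary_buffer that shares ownership of the packet. A minimal hedged sketch of consuming a packet this way (do_with keeps the stream alive across the continuation):

// Illustration only; assumes it runs inside a Seastar reactor task.
seastar::future<size_t> first_fragment_size_example(seastar::net::packet p) {
    return seastar::do_with(seastar::net::as_input_stream(std::move(p)),
            [] (seastar::input_stream<char>& in) {
        return in.read().then([] (seastar::temporary_buffer<char> buf) {
            return buf.size();   // size of the first zero-copy fragment
        });
    });
}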
+
+}
+
+}
diff --git a/src/seastar/include/seastar/net/packet-util.hh b/src/seastar/include/seastar/net/packet-util.hh
new file mode 100644
index 000000000..3e2718992
--- /dev/null
+++ b/src/seastar/include/seastar/net/packet-util.hh
@@ -0,0 +1,158 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/net/packet.hh>
+#include <map>
+#include <iostream>
+
+namespace seastar {
+
+namespace net {
+
+template <typename Offset, typename Tag>
+class packet_merger {
+private:
+ static uint64_t& linearizations_ref() {
+ static thread_local uint64_t linearization_count;
+ return linearization_count;
+ }
+public:
+ std::map<Offset, packet> map;
+
+ static uint64_t linearizations() {
+ return linearizations_ref();
+ }
+
+ void merge(Offset offset, packet p) {
+ bool insert = true;
+ auto beg = offset;
+ auto end = beg + p.len();
+ // First, try to merge the packet with an existing segment
+ for (auto it = map.begin(); it != map.end();) {
+ auto& seg_pkt = it->second;
+ auto seg_beg = it->first;
+ auto seg_end = seg_beg + seg_pkt.len();
+ // There are 6 cases:
+ if (seg_beg <= beg && end <= seg_end) {
+ // 1) seg_beg beg end seg_end
+ // We already have data in this packet
+ return;
+ } else if (beg <= seg_beg && seg_end <= end) {
+ // 2) beg seg_beg seg_end end
+ // The new segment contains more data than this old segment
+ // Delete the old one, insert the new one
+ it = map.erase(it);
+ insert = true;
+ break;
+ } else if (beg < seg_beg && seg_beg <= end && end <= seg_end) {
+ // 3) beg seg_beg end seg_end
+ // Merge two segments, trim front of old segment
+ auto trim = end - seg_beg;
+ seg_pkt.trim_front(trim);
+ p.append(std::move(seg_pkt));
+ // Delete the old one, insert the new one
+ it = map.erase(it);
+ insert = true;
+ break;
+ } else if (seg_beg <= beg && beg <= seg_end && seg_end < end) {
+ // 4) seg_beg beg seg_end end
+ // Merge two segments, trim front of new segment
+ auto trim = seg_end - beg;
+ p.trim_front(trim);
+ // Append new data to the old segment, keep the old segment
+ seg_pkt.append(std::move(p));
+ seg_pkt.linearize();
+ ++linearizations_ref();
+ insert = false;
+ break;
+ } else {
+ // 5) beg end < seg_beg seg_end
+ // or
+ // 6) seg_beg seg_end < beg end
+ // Can not merge with this segment, keep looking
+ it++;
+ insert = true;
+ }
+ }
+
+ if (insert) {
+ p.linearize();
+ ++linearizations_ref();
+ map.emplace(beg, std::move(p));
+ }
+
+ // Second, merge adjacent segments after this packet has been merged,
+ // because this packet might fill a "hole" and make two adjacent
+ // segments mergeable
+ for (auto it = map.begin(); it != map.end();) {
+ // The first segment
+ auto& seg_pkt = it->second;
+ auto seg_beg = it->first;
+ auto seg_end = seg_beg + seg_pkt.len();
+
+ // The second segment
+ auto it_next = it;
+ it_next++;
+ if (it_next == map.end()) {
+ break;
+ }
+ auto& p = it_next->second;
+ auto beg = it_next->first;
+ auto end = beg + p.len();
+
+ // Merge the second segment into the first segment if possible
+ if (seg_beg <= beg && beg <= seg_end && seg_end < end) {
+ // Merge two segments, trim front of second segment
+ auto trim = seg_end - beg;
+ p.trim_front(trim);
+ // Append new data to the first segment, keep the first segment
+ seg_pkt.append(std::move(p));
+
+ // Delete the second segment
+ map.erase(it_next);
+
+ // Keep merging this first segment with its new next packet
+ // So we do not update the iterator: it
+ continue;
+ } else if (end <= seg_end) {
+ // The first segment has all the data in the second segment
+ // Delete the second segment
+ map.erase(it_next);
+ continue;
+ } else if (seg_end < beg) {
+ // Can not merge first segment with second segment
+ it = it_next;
+ continue;
+ } else {
+ // If we reach here, we have a bug with merge.
+ std::cerr << "packet_merger: merge error\n";
+ abort();
+ break;
+ }
+ }
+ }
+};
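merge() above keeps at most one packet per contiguous byte range, trimming overlaps and concatenating adjacent data, then makes a second pass to coalesce neighbours. A small hedged sketch of feeding it out-of-order fragments (the tag type, offsets and payloads are illustrative):

// Illustration only; offsets and payload bytes are made up.
struct example_tag {};
using example_merger = seastar::net::packet_merger<uint32_t, example_tag>;

void reassembly_example() {
    example_merger m;
    m.merge(8, seastar::net::packet("45678", 5));    // covers bytes [8, 13)
    m.merge(0, seastar::net::packet("01234567", 8)); // covers bytes [0, 8)
    // The second call is adjacent to the first, so after it m.map holds a
    // single entry keyed by offset 0 spanning bytes [0, 13).
}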
+
+}
+
+}
diff --git a/src/seastar/include/seastar/net/packet.hh b/src/seastar/include/seastar/net/packet.hh
new file mode 100644
index 000000000..f7154e7d1
--- /dev/null
+++ b/src/seastar/include/seastar/net/packet.hh
@@ -0,0 +1,622 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/deleter.hh>
+#include <seastar/core/temporary_buffer.hh>
+#include <seastar/net/const.hh>
+#include <vector>
+#include <cassert>
+#include <algorithm>
+#include <iosfwd>
+#include <seastar/util/std-compat.hh>
+#include <functional>
+
+namespace seastar {
+
+namespace net {
+
+struct fragment {
+ char* base;
+ size_t size;
+};
+
+struct offload_info {
+ ip_protocol_num protocol = ip_protocol_num::unused;
+ bool needs_csum = false;
+ uint8_t ip_hdr_len = 20;
+ uint8_t tcp_hdr_len = 20;
+ uint8_t udp_hdr_len = 8;
+ bool needs_ip_csum = false;
+ bool reassembled = false;
+ uint16_t tso_seg_size = 0;
+ // HW stripped VLAN header (CPU order)
+ std::optional<uint16_t> vlan_tci;
+};
+
+// Zero-copy friendly packet class
+//
+// For implementing zero-copy, we need a flexible destructor that can
+// destroy packet data in different ways: decrementing a reference count,
+// or calling a free()-like function.
+//
+// Moreover, we need different destructors for each set of fragments within
+// a single packet. For example, a header and trailer might need delete[]
+// to be called, while the internal data needs a reference count to be
+// released. Matters are complicated in that fragments can be split
+// (due to virtual/physical translation).
+//
+// To implement this, we associate each packet with a single destructor,
+// but allow composing a packet from another packet plus a fragment to
+// be added, with its own destructor, causing the destructors to be chained.
+//
+// The downside is that the data needed for the destructor is duplicated,
+// if it is already available in the fragment itself.
+//
+// As an optimization, when we allocate small fragments, we allocate some
+// extra space, so prepending to the packet does not require extra
+// allocations. This is useful when adding headers.
+//
+class packet final {
+ // enough for lots of headers, not quite two cache lines:
+ static constexpr size_t internal_data_size = 128 - 16;
+ static constexpr size_t default_nr_frags = 4;
+
+ struct pseudo_vector {
+ fragment* _start;
+ fragment* _finish;
+ pseudo_vector(fragment* start, size_t nr)
+ : _start(start), _finish(_start + nr) {}
+ fragment* begin() { return _start; }
+ fragment* end() { return _finish; }
+ fragment& operator[](size_t idx) { return _start[idx]; }
+ };
+
+ struct impl {
+ // when destroyed, virtual destructor will reclaim resources
+ deleter _deleter;
+ unsigned _len = 0;
+ uint16_t _nr_frags = 0;
+ uint16_t _allocated_frags;
+ offload_info _offload_info;
+ std::optional<uint32_t> _rss_hash;
+ char _data[internal_data_size]; // only _frags[0] may use
+ unsigned _headroom = internal_data_size; // in _data
+ // FIXME: share _data/_frags space
+
+ fragment _frags[];
+
+ impl(size_t nr_frags = default_nr_frags) noexcept;
+ impl(const impl&) = delete;
+ impl(fragment frag, size_t nr_frags = default_nr_frags);
+
+ pseudo_vector fragments() { return { _frags, _nr_frags }; }
+
+ static std::unique_ptr<impl> allocate(size_t nr_frags) {
+ nr_frags = std::max(nr_frags, default_nr_frags);
+ return std::unique_ptr<impl>(new (nr_frags) impl(nr_frags));
+ }
+
+ static std::unique_ptr<impl> copy(impl* old, size_t nr) {
+ auto n = allocate(nr);
+ n->_deleter = std::move(old->_deleter);
+ n->_len = old->_len;
+ n->_nr_frags = old->_nr_frags;
+ n->_headroom = old->_headroom;
+ n->_offload_info = old->_offload_info;
+ n->_rss_hash = old->_rss_hash;
+ std::copy(old->_frags, old->_frags + old->_nr_frags, n->_frags);
+ old->copy_internal_fragment_to(n.get());
+ return n;
+ }
+
+ static std::unique_ptr<impl> copy(impl* old) {
+ return copy(old, old->_nr_frags);
+ }
+
+ static std::unique_ptr<impl> allocate_if_needed(std::unique_ptr<impl> old, size_t extra_frags) {
+ if (old->_allocated_frags >= old->_nr_frags + extra_frags) {
+ return old;
+ }
+ return copy(old.get(), std::max<size_t>(old->_nr_frags + extra_frags, 2 * old->_nr_frags));
+ }
+ void* operator new(size_t size, size_t nr_frags = default_nr_frags) {
+ assert(nr_frags == uint16_t(nr_frags));
+ return ::operator new(size + nr_frags * sizeof(fragment));
+ }
+ // Matching the operator new above
+ void operator delete(void* ptr, size_t nr_frags) {
+ return ::operator delete(ptr);
+ }
+ // Since the above "placement delete" hides the global one, expose it
+ void operator delete(void* ptr) {
+ return ::operator delete(ptr);
+ }
+
+ bool using_internal_data() const {
+ return _nr_frags
+ && _frags[0].base >= _data
+ && _frags[0].base < _data + internal_data_size;
+ }
+
+ void unuse_internal_data() {
+ if (!using_internal_data()) {
+ return;
+ }
+ auto buf = static_cast<char*>(::malloc(_frags[0].size));
+ if (!buf) {
+ throw std::bad_alloc();
+ }
+ deleter d = make_free_deleter(buf);
+ std::copy(_frags[0].base, _frags[0].base + _frags[0].size, buf);
+ _frags[0].base = buf;
+ d.append(std::move(_deleter));
+ _deleter = std::move(d);
+ _headroom = internal_data_size;
+ }
+ void copy_internal_fragment_to(impl* to) {
+ if (!using_internal_data()) {
+ return;
+ }
+ to->_frags[0].base = to->_data + _headroom;
+ std::copy(_frags[0].base, _frags[0].base + _frags[0].size,
+ to->_frags[0].base);
+ }
+ };
+ packet(std::unique_ptr<impl>&& impl) noexcept : _impl(std::move(impl)) {}
+ std::unique_ptr<impl> _impl;
+public:
+ static packet from_static_data(const char* data, size_t len) noexcept {
+ return {fragment{const_cast<char*>(data), len}, deleter()};
+ }
+
+ // build empty packet
+ packet();
+ // build empty packet with nr_frags allocated
+ packet(size_t nr_frags);
+ // move existing packet
+ packet(packet&& x) noexcept;
+ // copy data into packet
+ packet(const char* data, size_t len);
+ // copy data into packet
+ packet(fragment frag);
+ // zero-copy single fragment
+ packet(fragment frag, deleter del);
+ // zero-copy multiple fragments
+ packet(std::vector<fragment> frag, deleter del);
+ // build packet with iterator
+ template <typename Iterator>
+ packet(Iterator begin, Iterator end, deleter del);
+ // append fragment (copying new fragment)
+ packet(packet&& x, fragment frag);
+ // prepend fragment (copying new fragment, with header optimization)
+ packet(fragment frag, packet&& x);
+ // prepend fragment (zero-copy)
+ packet(fragment frag, deleter del, packet&& x);
+ // append fragment (zero-copy)
+ packet(packet&& x, fragment frag, deleter d);
+ // append temporary_buffer (zero-copy)
+ packet(packet&& x, temporary_buffer<char> buf);
+ // create from temporary_buffer (zero-copy)
+ packet(temporary_buffer<char> buf);
+ // append deleter
+ packet(packet&& x, deleter d);
+
+ packet& operator=(packet&& x) noexcept {
+ if (this != &x) {
+ this->~packet();
+ new (this) packet(std::move(x));
+ }
+ return *this;
+ }
+
+ unsigned len() const { return _impl->_len; }
+ unsigned memory() const { return len() + sizeof(packet::impl); }
+
+ fragment frag(unsigned idx) const { return _impl->_frags[idx]; }
+ fragment& frag(unsigned idx) { return _impl->_frags[idx]; }
+
+ unsigned nr_frags() const { return _impl->_nr_frags; }
+ pseudo_vector fragments() const { return { _impl->_frags, _impl->_nr_frags }; }
+ fragment* fragment_array() const { return _impl->_frags; }
+
+ // share packet data (reference counted, non COW)
+ packet share();
+ packet share(size_t offset, size_t len);
+
+ void append(packet&& p);
+
+ void trim_front(size_t how_much);
+ void trim_back(size_t how_much);
+
+ // get a header pointer, linearizing if necessary
+ template <typename Header>
+ Header* get_header(size_t offset = 0);
+
+ // get a header pointer, linearizing if necessary
+ char* get_header(size_t offset, size_t size);
+
+ // prepend a header (default-initializing it)
+ template <typename Header>
+ Header* prepend_header(size_t extra_size = 0);
+
+ // prepend a header (uninitialized!)
+ char* prepend_uninitialized_header(size_t size);
+
+ packet free_on_cpu(unsigned cpu, std::function<void()> cb = []{});
+
+ void linearize() { return linearize(0, len()); }
+
+ void reset() { _impl.reset(); }
+
+ void reserve(int n_frags) {
+ if (n_frags > _impl->_nr_frags) {
+ auto extra = n_frags - _impl->_nr_frags;
+ _impl = impl::allocate_if_needed(std::move(_impl), extra);
+ }
+ }
+ std::optional<uint32_t> rss_hash() {
+ return _impl->_rss_hash;
+ }
+ std::optional<uint32_t> set_rss_hash(uint32_t hash) {
+ return _impl->_rss_hash = hash;
+ }
+ // Call `func` for each fragment, avoiding data copies when possible
+ // `func` is called with a temporary_buffer<char> parameter
+ template <typename Func>
+ void release_into(Func&& func) {
+ unsigned idx = 0;
+ if (_impl->using_internal_data()) {
+ auto&& f = frag(idx++);
+ func(temporary_buffer<char>(f.base, f.size));
+ }
+ while (idx < nr_frags()) {
+ auto&& f = frag(idx++);
+ func(temporary_buffer<char>(f.base, f.size, _impl->_deleter.share()));
+ }
+ }
+ std::vector<temporary_buffer<char>> release() {
+ std::vector<temporary_buffer<char>> ret;
+ ret.reserve(_impl->_nr_frags);
+ release_into([&ret] (temporary_buffer<char>&& frag) {
+ ret.push_back(std::move(frag));
+ });
+ return ret;
+ }
+ explicit operator bool() {
+ return bool(_impl);
+ }
+ static packet make_null_packet() noexcept {
+ return net::packet(nullptr);
+ }
+private:
+ void linearize(size_t at_frag, size_t desired_size);
+ bool allocate_headroom(size_t size);
+public:
+ struct offload_info offload_info() const { return _impl->_offload_info; }
+ struct offload_info& offload_info_ref() { return _impl->_offload_info; }
+ void set_offload_info(struct offload_info oi) { _impl->_offload_info = oi; }
+};
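The comment block before the class describes the design; the sketch below shows the two most common construction patterns it enables, zero-copy wrapping of an existing buffer and header prepending into the reserved headroom. The header type and protocol value are illustrative, not part of the diff.

// Illustration only; eth_hdr_example is a made-up header type.
struct eth_hdr_example {
    uint8_t  dst[6];
    uint8_t  src[6];
    uint16_t eth_proto;
} __attribute__((packed));

seastar::net::packet build_frame_example(seastar::temporary_buffer<char> payload) {
    // Zero-copy: the packet takes ownership of the buffer through its deleter.
    seastar::net::packet p(std::move(payload));
    // Prepending normally lands in the internal headroom, so no extra
    // allocation or data copy is needed for small headers.
    auto* eh = p.prepend_header<eth_hdr_example>();
    eh->eth_proto = 0x0800;   // hypothetical protocol value
    return p;
}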
+
+std::ostream& operator<<(std::ostream& os, const packet& p);
+
+inline
+packet::packet(packet&& x) noexcept
+ : _impl(std::move(x._impl)) {
+}
+
+inline
+packet::impl::impl(size_t nr_frags) noexcept
+ : _len(0), _allocated_frags(nr_frags) {
+}
+
+inline
+packet::impl::impl(fragment frag, size_t nr_frags)
+ : _len(frag.size), _allocated_frags(nr_frags) {
+ assert(_allocated_frags > _nr_frags);
+ if (frag.size <= internal_data_size) {
+ _headroom -= frag.size;
+ _frags[0] = { _data + _headroom, frag.size };
+ } else {
+ auto buf = static_cast<char*>(::malloc(frag.size));
+ if (!buf) {
+ throw std::bad_alloc();
+ }
+ deleter d = make_free_deleter(buf);
+ _frags[0] = { buf, frag.size };
+ _deleter.append(std::move(d));
+ }
+ std::copy(frag.base, frag.base + frag.size, _frags[0].base);
+ ++_nr_frags;
+}
+
+inline
+packet::packet()
+ : _impl(impl::allocate(1)) {
+}
+
+inline
+packet::packet(size_t nr_frags)
+ : _impl(impl::allocate(nr_frags)) {
+}
+
+inline
+packet::packet(fragment frag) : _impl(new impl(frag)) {
+}
+
+inline
+packet::packet(const char* data, size_t size) : packet(fragment{const_cast<char*>(data), size}) {
+}
+
+inline
+packet::packet(fragment frag, deleter d)
+ : _impl(impl::allocate(1)) {
+ _impl->_deleter = std::move(d);
+ _impl->_frags[_impl->_nr_frags++] = frag;
+ _impl->_len = frag.size;
+}
+
+inline
+packet::packet(std::vector<fragment> frag, deleter d)
+ : _impl(impl::allocate(frag.size())) {
+ _impl->_deleter = std::move(d);
+ std::copy(frag.begin(), frag.end(), _impl->_frags);
+ _impl->_nr_frags = frag.size();
+ _impl->_len = 0;
+ for (auto&& f : _impl->fragments()) {
+ _impl->_len += f.size;
+ }
+}
+
+template <typename Iterator>
+inline
+packet::packet(Iterator begin, Iterator end, deleter del) {
+ unsigned nr_frags = 0, len = 0;
+ nr_frags = std::distance(begin, end);
+ std::for_each(begin, end, [&] (const fragment& frag) { len += frag.size; });
+ _impl = impl::allocate(nr_frags);
+ _impl->_deleter = std::move(del);
+ _impl->_len = len;
+ _impl->_nr_frags = nr_frags;
+ std::copy(begin, end, _impl->_frags);
+}
+
+inline
+packet::packet(packet&& x, fragment frag)
+ : _impl(impl::allocate_if_needed(std::move(x._impl), 1)) {
+ _impl->_len += frag.size;
+ std::unique_ptr<char[]> buf(new char[frag.size]);
+ std::copy(frag.base, frag.base + frag.size, buf.get());
+ _impl->_frags[_impl->_nr_frags++] = {buf.get(), frag.size};
+ _impl->_deleter = make_deleter(std::move(_impl->_deleter), [buf = buf.release()] {
+ delete[] buf;
+ });
+}
+
+inline
+bool
+packet::allocate_headroom(size_t size) {
+ if (_impl->_headroom >= size) {
+ _impl->_len += size;
+ if (!_impl->using_internal_data()) {
+ _impl = impl::allocate_if_needed(std::move(_impl), 1);
+ std::copy_backward(_impl->_frags, _impl->_frags + _impl->_nr_frags,
+ _impl->_frags + _impl->_nr_frags + 1);
+ _impl->_frags[0] = { _impl->_data + internal_data_size, 0 };
+ ++_impl->_nr_frags;
+ }
+ _impl->_headroom -= size;
+ _impl->_frags[0].base -= size;
+ _impl->_frags[0].size += size;
+ return true;
+ } else {
+ return false;
+ }
+}
+
+
+inline
+packet::packet(fragment frag, packet&& x)
+ : _impl(std::move(x._impl)) {
+ // try to prepend into existing internal fragment
+ if (allocate_headroom(frag.size)) {
+ std::copy(frag.base, frag.base + frag.size, _impl->_frags[0].base);
+ return;
+ } else {
+ // didn't work out, allocate and copy
+ _impl->unuse_internal_data();
+ _impl = impl::allocate_if_needed(std::move(_impl), 1);
+ _impl->_len += frag.size;
+ std::unique_ptr<char[]> buf(new char[frag.size]);
+ std::copy(frag.base, frag.base + frag.size, buf.get());
+ std::copy_backward(_impl->_frags, _impl->_frags + _impl->_nr_frags,
+ _impl->_frags + _impl->_nr_frags + 1);
+ ++_impl->_nr_frags;
+ _impl->_frags[0] = {buf.get(), frag.size};
+ _impl->_deleter = make_deleter(std::move(_impl->_deleter),
+ [buf = std::move(buf)] {});
+ }
+}
+
+inline
+packet::packet(packet&& x, fragment frag, deleter d)
+ : _impl(impl::allocate_if_needed(std::move(x._impl), 1)) {
+ _impl->_len += frag.size;
+ _impl->_frags[_impl->_nr_frags++] = frag;
+ d.append(std::move(_impl->_deleter));
+ _impl->_deleter = std::move(d);
+}
+
+inline
+packet::packet(packet&& x, deleter d)
+ : _impl(std::move(x._impl)) {
+ _impl->_deleter.append(std::move(d));
+}
+
+inline
+packet::packet(packet&& x, temporary_buffer<char> buf)
+ : packet(std::move(x), fragment{buf.get_write(), buf.size()}, buf.release()) {
+}
+
+inline
+packet::packet(temporary_buffer<char> buf)
+ : packet(fragment{buf.get_write(), buf.size()}, buf.release()) {}
+
+inline
+void packet::append(packet&& p) {
+ if (!_impl->_len) {
+ *this = std::move(p);
+ return;
+ }
+ _impl = impl::allocate_if_needed(std::move(_impl), p._impl->_nr_frags);
+ _impl->_len += p._impl->_len;
+ p._impl->unuse_internal_data();
+ std::copy(p._impl->_frags, p._impl->_frags + p._impl->_nr_frags,
+ _impl->_frags + _impl->_nr_frags);
+ _impl->_nr_frags += p._impl->_nr_frags;
+ p._impl->_deleter.append(std::move(_impl->_deleter));
+ _impl->_deleter = std::move(p._impl->_deleter);
+}
+
+inline
+char* packet::get_header(size_t offset, size_t size) {
+ if (offset + size > _impl->_len) {
+ return nullptr;
+ }
+ size_t i = 0;
+ while (i != _impl->_nr_frags && offset >= _impl->_frags[i].size) {
+ offset -= _impl->_frags[i++].size;
+ }
+ if (i == _impl->_nr_frags) {
+ return nullptr;
+ }
+ if (offset + size > _impl->_frags[i].size) {
+ linearize(i, offset + size);
+ }
+ return _impl->_frags[i].base + offset;
+}
+
+template <typename Header>
+inline
+Header* packet::get_header(size_t offset) {
+ return reinterpret_cast<Header*>(get_header(offset, sizeof(Header)));
+}
+
+inline
+void packet::trim_front(size_t how_much) {
+ assert(how_much <= _impl->_len);
+ _impl->_len -= how_much;
+ size_t i = 0;
+ while (how_much && how_much >= _impl->_frags[i].size) {
+ how_much -= _impl->_frags[i++].size;
+ }
+ std::copy(_impl->_frags + i, _impl->_frags + _impl->_nr_frags, _impl->_frags);
+ _impl->_nr_frags -= i;
+ if (!_impl->using_internal_data()) {
+ _impl->_headroom = internal_data_size;
+ }
+ if (how_much) {
+ if (_impl->using_internal_data()) {
+ _impl->_headroom += how_much;
+ }
+ _impl->_frags[0].base += how_much;
+ _impl->_frags[0].size -= how_much;
+ }
+}
+
+inline
+void packet::trim_back(size_t how_much) {
+ assert(how_much <= _impl->_len);
+ _impl->_len -= how_much;
+ size_t i = _impl->_nr_frags - 1;
+ while (how_much && how_much >= _impl->_frags[i].size) {
+ how_much -= _impl->_frags[i--].size;
+ }
+ _impl->_nr_frags = i + 1;
+ if (how_much) {
+ _impl->_frags[i].size -= how_much;
+ if (i == 0 && _impl->using_internal_data()) {
+ _impl->_headroom += how_much;
+ }
+ }
+}
+
+template <typename Header>
+Header*
+packet::prepend_header(size_t extra_size) {
+ auto h = prepend_uninitialized_header(sizeof(Header) + extra_size);
+ return new (h) Header{};
+}
+
+// prepend a header (uninitialized!)
+inline
+char* packet::prepend_uninitialized_header(size_t size) {
+ if (!allocate_headroom(size)) {
+ // didn't work out, allocate and copy
+ _impl->unuse_internal_data();
+ // try again, after unuse_internal_data we may have space after all
+ if (!allocate_headroom(size)) {
+ // failed
+ _impl->_len += size;
+ _impl = impl::allocate_if_needed(std::move(_impl), 1);
+ std::unique_ptr<char[]> buf(new char[size]);
+ std::copy_backward(_impl->_frags, _impl->_frags + _impl->_nr_frags,
+ _impl->_frags + _impl->_nr_frags + 1);
+ ++_impl->_nr_frags;
+ _impl->_frags[0] = {buf.get(), size};
+ _impl->_deleter = make_deleter(std::move(_impl->_deleter),
+ [buf = std::move(buf)] {});
+ }
+ }
+ return _impl->_frags[0].base;
+}
+
+inline
+packet packet::share() {
+ return share(0, _impl->_len);
+}
+
+inline
+packet packet::share(size_t offset, size_t len) {
+ _impl->unuse_internal_data(); // FIXME: eliminate?
+ packet n;
+ n._impl = impl::allocate_if_needed(std::move(n._impl), _impl->_nr_frags);
+ size_t idx = 0;
+ while (offset > 0 && offset >= _impl->_frags[idx].size) {
+ offset -= _impl->_frags[idx++].size;
+ }
+ while (n._impl->_len < len) {
+ auto& f = _impl->_frags[idx++];
+ auto fsize = std::min(len - n._impl->_len, f.size - offset);
+ n._impl->_frags[n._impl->_nr_frags++] = { f.base + offset, fsize };
+ n._impl->_len += fsize;
+ offset = 0;
+ }
+ n._impl->_offload_info = _impl->_offload_info;
+ assert(!n._impl->_deleter);
+ n._impl->_deleter = _impl->_deleter.share();
+ return n;
+}
+
+}
+
+}
diff --git a/src/seastar/include/seastar/net/posix-stack.hh b/src/seastar/include/seastar/net/posix-stack.hh
new file mode 100644
index 000000000..00fd15ba1
--- /dev/null
+++ b/src/seastar/include/seastar/net/posix-stack.hh
@@ -0,0 +1,223 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/sharded.hh>
+#include <seastar/core/internal/pollable_fd.hh>
+#include <seastar/net/stack.hh>
+#include <seastar/core/polymorphic_temporary_buffer.hh>
+#include <seastar/core/internal/buffer_allocator.hh>
+#include <boost/program_options.hpp>
+
+namespace seastar {
+
+namespace net {
+
+using namespace seastar;
+
+// We can't keep this in any of the socket servers as instance members, because a connection can
+// outlive the socket server. To avoid having the whole socket_server tracked as a shared pointer,
+// we will have a conntrack structure.
+//
+// Right now this class is used by the posix_server_socket_impl, but it could be used by any other.
+class conntrack {
+ class load_balancer {
+ std::vector<unsigned> _cpu_load;
+ public:
+ load_balancer() : _cpu_load(size_t(smp::count), 0) {}
+ void closed_cpu(shard_id cpu) {
+ _cpu_load[cpu]--;
+ }
+ shard_id next_cpu() {
+ // FIXME: The naive algorithm will just round robin the connections around the shards.
+ // A more complex version can keep track of the amount of activity in each connection,
+ // and use that information.
+ auto min_el = std::min_element(_cpu_load.begin(), _cpu_load.end());
+ auto cpu = shard_id(std::distance(_cpu_load.begin(), min_el));
+ _cpu_load[cpu]++;
+ return cpu;
+ }
+ shard_id force_cpu(shard_id cpu) {
+ _cpu_load[cpu]++;
+ return cpu;
+ }
+ };
+
+ lw_shared_ptr<load_balancer> _lb;
+ void closed_cpu(shard_id cpu) {
+ _lb->closed_cpu(cpu);
+ }
+public:
+ class handle {
+ shard_id _host_cpu;
+ shard_id _target_cpu;
+ foreign_ptr<lw_shared_ptr<load_balancer>> _lb;
+ public:
+ handle() : _lb(nullptr) {}
+ handle(shard_id cpu, lw_shared_ptr<load_balancer> lb)
+ : _host_cpu(this_shard_id())
+ , _target_cpu(cpu)
+ , _lb(make_foreign(std::move(lb))) {}
+
+ handle(const handle&) = delete;
+ handle(handle&&) = default;
+ ~handle() {
+ if (!_lb) {
+ return;
+ }
+ // FIXME: future is discarded
+ (void)smp::submit_to(_host_cpu, [cpu = _target_cpu, lb = std::move(_lb)] {
+ lb->closed_cpu(cpu);
+ });
+ }
+ shard_id cpu() {
+ return _target_cpu;
+ }
+ };
+ friend class handle;
+
+ conntrack() : _lb(make_lw_shared<load_balancer>()) {}
+ handle get_handle() {
+ return handle(_lb->next_cpu(), _lb);
+ }
+ handle get_handle(shard_id cpu) {
+ return handle(_lb->force_cpu(cpu), _lb);
+ }
+};
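conntrack above lets a connection outlive the server socket that accepted it: the handle remembers which shard's load counter it bumped and decrements it again on destruction, via submit_to if the handle dies on another shard. A hedged sketch of the accept-side flow (the function and its caller are hypothetical):

// Illustration only; 'tracker' would be owned by the listening socket impl.
seastar::shard_id pick_shard_example(seastar::net::conntrack& tracker) {
    // Picks the least-loaded shard and records the choice in the handle;
    // destroying the handle later tells the balancer the connection closed.
    // (Real code would move the handle into the per-connection state.)
    auto handle = tracker.get_handle();
    return handle.cpu();
}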
+
+class posix_data_source_impl final : public data_source_impl, private internal::buffer_allocator {
+ std::pmr::polymorphic_allocator<char>* _buffer_allocator;
+ pollable_fd _fd;
+ connected_socket_input_stream_config _config;
+private:
+ virtual temporary_buffer<char> allocate_buffer() override;
+public:
+ explicit posix_data_source_impl(pollable_fd fd, connected_socket_input_stream_config config,
+ std::pmr::polymorphic_allocator<char>* allocator=memory::malloc_allocator)
+ : _buffer_allocator(allocator), _fd(std::move(fd)), _config(config) {
+ }
+ future<temporary_buffer<char>> get() override;
+ future<> close() override;
+};
+
+class posix_data_sink_impl : public data_sink_impl {
+ pollable_fd _fd;
+ packet _p;
+public:
+ explicit posix_data_sink_impl(pollable_fd fd) : _fd(std::move(fd)) {}
+ using data_sink_impl::put;
+ future<> put(packet p) override;
+ future<> put(temporary_buffer<char> buf) override;
+ future<> close() override;
+};
+
+class posix_ap_server_socket_impl : public server_socket_impl {
+ using protocol_and_socket_address = std::tuple<int, socket_address>;
+ struct connection {
+ pollable_fd fd;
+ socket_address addr;
+ conntrack::handle connection_tracking_handle;
+ connection(pollable_fd xfd, socket_address xaddr, conntrack::handle cth) : fd(std::move(xfd)), addr(xaddr), connection_tracking_handle(std::move(cth)) {}
+ };
+ using sockets_map_t = std::unordered_map<protocol_and_socket_address, promise<accept_result>>;
+ using conn_map_t = std::unordered_multimap<protocol_and_socket_address, connection>;
+ static thread_local sockets_map_t sockets;
+ static thread_local conn_map_t conn_q;
+ int _protocol;
+ socket_address _sa;
+ std::pmr::polymorphic_allocator<char>* _allocator;
+public:
+ explicit posix_ap_server_socket_impl(int protocol, socket_address sa, std::pmr::polymorphic_allocator<char>* allocator = memory::malloc_allocator) : _protocol(protocol), _sa(sa), _allocator(allocator) {}
+ virtual future<accept_result> accept() override;
+ virtual void abort_accept() override;
+ socket_address local_address() const override {
+ return _sa;
+ }
+ static void move_connected_socket(int protocol, socket_address sa, pollable_fd fd, socket_address addr, conntrack::handle handle, std::pmr::polymorphic_allocator<char>* allocator);
+
+ template <typename T>
+ friend class std::hash;
+};
+
+class posix_server_socket_impl : public server_socket_impl {
+ socket_address _sa;
+ int _protocol;
+ pollable_fd _lfd;
+ conntrack _conntrack;
+ server_socket::load_balancing_algorithm _lba;
+ shard_id _fixed_cpu;
+ std::pmr::polymorphic_allocator<char>* _allocator;
+public:
+ explicit posix_server_socket_impl(int protocol, socket_address sa, pollable_fd lfd,
+ server_socket::load_balancing_algorithm lba, shard_id fixed_cpu,
+ std::pmr::polymorphic_allocator<char>* allocator=memory::malloc_allocator) : _sa(sa), _protocol(protocol), _lfd(std::move(lfd)), _lba(lba), _fixed_cpu(fixed_cpu), _allocator(allocator) {}
+ virtual future<accept_result> accept() override;
+ virtual void abort_accept() override;
+ virtual socket_address local_address() const override;
+};
+
+class posix_reuseport_server_socket_impl : public server_socket_impl {
+ socket_address _sa;
+ int _protocol;
+ pollable_fd _lfd;
+ std::pmr::polymorphic_allocator<char>* _allocator;
+public:
+ explicit posix_reuseport_server_socket_impl(int protocol, socket_address sa, pollable_fd lfd,
+ std::pmr::polymorphic_allocator<char>* allocator=memory::malloc_allocator) : _sa(sa), _protocol(protocol), _lfd(std::move(lfd)), _allocator(allocator) {}
+ virtual future<accept_result> accept() override;
+ virtual void abort_accept() override;
+ virtual socket_address local_address() const override;
+};
+
+class posix_network_stack : public network_stack {
+private:
+ const bool _reuseport;
+protected:
+ std::pmr::polymorphic_allocator<char>* _allocator;
+public:
+ explicit posix_network_stack(boost::program_options::variables_map opts, std::pmr::polymorphic_allocator<char>* allocator=memory::malloc_allocator);
+ virtual server_socket listen(socket_address sa, listen_options opts) override;
+ virtual ::seastar::socket socket() override;
+ virtual net::udp_channel make_udp_channel(const socket_address&) override;
+ static future<std::unique_ptr<network_stack>> create(boost::program_options::variables_map opts, std::pmr::polymorphic_allocator<char>* allocator=memory::malloc_allocator) {
+ return make_ready_future<std::unique_ptr<network_stack>>(std::unique_ptr<network_stack>(new posix_network_stack(opts, allocator)));
+ }
+ virtual bool has_per_core_namespace() override { return _reuseport; };
+ bool supports_ipv6() const override;
+ std::vector<network_interface> network_interfaces() override;
+};
+
+class posix_ap_network_stack : public posix_network_stack {
+private:
+ const bool _reuseport;
+public:
+ posix_ap_network_stack(boost::program_options::variables_map opts, std::pmr::polymorphic_allocator<char>* allocator=memory::malloc_allocator);
+ virtual server_socket listen(socket_address sa, listen_options opts) override;
+ static future<std::unique_ptr<network_stack>> create(boost::program_options::variables_map opts, std::pmr::polymorphic_allocator<char>* allocator=memory::malloc_allocator) {
+ return make_ready_future<std::unique_ptr<network_stack>>(std::unique_ptr<network_stack>(new posix_ap_network_stack(opts, allocator)));
+ }
+};
+
+void register_posix_stack();
+}
+
+}
diff --git a/src/seastar/include/seastar/net/proxy.hh b/src/seastar/include/seastar/net/proxy.hh
new file mode 100644
index 000000000..cd2632424
--- /dev/null
+++ b/src/seastar/include/seastar/net/proxy.hh
@@ -0,0 +1,32 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#pragma once
+
+#include <memory>
+#include <seastar/net/net.hh>
+#include <seastar/net/packet.hh>
+
+namespace seastar {
+
+namespace net {
+
+std::unique_ptr<qp> create_proxy_net_device(unsigned master_cpu, device* dev);
+
+}
+
+}
diff --git a/src/seastar/include/seastar/net/socket_defs.hh b/src/seastar/include/seastar/net/socket_defs.hh
new file mode 100644
index 000000000..3e771d8b6
--- /dev/null
+++ b/src/seastar/include/seastar/net/socket_defs.hh
@@ -0,0 +1,186 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 ScyllaDB.
+ */
+#pragma once
+
+#include <iosfwd>
+#include <array>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <netinet/ip.h>
+#include <seastar/net/byteorder.hh>
+#include <seastar/net/unix_address.hh>
+#include <cassert>
+
+namespace seastar {
+
+namespace net {
+class inet_address;
+}
+
+struct ipv4_addr;
+struct ipv6_addr;
+
+class socket_address {
+public:
+ socklen_t addr_length; ///!< actual size of the relevant 'u' member
+ union {
+ ::sockaddr_storage sas;
+ ::sockaddr sa;
+ ::sockaddr_in in;
+ ::sockaddr_in6 in6;
+ ::sockaddr_un un;
+ } u;
+ socket_address(const sockaddr_in& sa) noexcept : addr_length{sizeof(::sockaddr_in)} {
+ u.in = sa;
+ }
+ socket_address(const sockaddr_in6& sa) noexcept : addr_length{sizeof(::sockaddr_in6)} {
+ u.in6 = sa;
+ }
+ socket_address(uint16_t) noexcept;
+ socket_address(ipv4_addr) noexcept;
+ socket_address(const ipv6_addr&) noexcept;
+ socket_address(const ipv6_addr&, uint32_t scope) noexcept;
+ socket_address(const net::inet_address&, uint16_t p = 0) noexcept;
+ explicit socket_address(const unix_domain_addr&) noexcept;
+ /** creates an uninitialized socket_address. this can be written into, or used as
+ * "unspecified" for such addresses as bind(addr) or local address in socket::connect
+ * (i.e. system picks)
+ */
+ socket_address() noexcept;
+
+ ::sockaddr& as_posix_sockaddr() noexcept { return u.sa; }
+ ::sockaddr_in& as_posix_sockaddr_in() noexcept { return u.in; }
+ ::sockaddr_in6& as_posix_sockaddr_in6() noexcept { return u.in6; }
+ const ::sockaddr& as_posix_sockaddr() const noexcept { return u.sa; }
+ const ::sockaddr_in& as_posix_sockaddr_in() const noexcept { return u.in; }
+ const ::sockaddr_in6& as_posix_sockaddr_in6() const noexcept { return u.in6; }
+
+ socket_address(uint32_t, uint16_t p = 0) noexcept;
+
+ socklen_t length() const noexcept { return addr_length; };
+
+ bool is_af_unix() const noexcept {
+ return u.sa.sa_family == AF_UNIX;
+ }
+
+ bool is_unspecified() const noexcept;
+
+ sa_family_t family() const noexcept {
+ return u.sa.sa_family;
+ }
+
+ net::inet_address addr() const noexcept;
+ ::in_port_t port() const noexcept;
+ bool is_wildcard() const noexcept;
+
+ bool operator==(const socket_address&) const noexcept;
+ bool operator!=(const socket_address& a) const noexcept {
+ return !(*this == a);
+ }
+};
+
+std::ostream& operator<<(std::ostream&, const socket_address&);
+
+enum class transport {
+ TCP = IPPROTO_TCP,
+ SCTP = IPPROTO_SCTP
+};
+
+struct ipv4_addr {
+ uint32_t ip;
+ uint16_t port;
+
+ ipv4_addr() noexcept : ip(0), port(0) {}
+ ipv4_addr(uint32_t ip, uint16_t port) noexcept : ip(ip), port(port) {}
+ ipv4_addr(uint16_t port) noexcept : ip(0), port(port) {}
+ // throws if not a valid ipv4 addr
+ ipv4_addr(const std::string &addr);
+ ipv4_addr(const std::string &addr, uint16_t port);
+ // throws if not an ipv4 addr
+ ipv4_addr(const net::inet_address&, uint16_t);
+ ipv4_addr(const socket_address &) noexcept;
+ ipv4_addr(const ::in_addr&, uint16_t = 0) noexcept;
+
+ bool is_ip_unspecified() const noexcept {
+ return ip == 0;
+ }
+ bool is_port_unspecified() const noexcept {
+ return port == 0;
+ }
+};
+
+struct ipv6_addr {
+ using ipv6_bytes = std::array<uint8_t, 16>;
+
+ ipv6_bytes ip;
+ uint16_t port;
+
+ ipv6_addr(const ipv6_bytes&, uint16_t port = 0) noexcept;
+ ipv6_addr(uint16_t port = 0) noexcept;
+ // throws if not a valid ipv6 addr
+ ipv6_addr(const std::string&);
+ ipv6_addr(const std::string&, uint16_t port);
+ ipv6_addr(const net::inet_address&, uint16_t = 0) noexcept;
+ ipv6_addr(const ::in6_addr&, uint16_t = 0) noexcept;
+ ipv6_addr(const ::sockaddr_in6&) noexcept;
+ ipv6_addr(const socket_address&) noexcept;
+
+ bool is_ip_unspecified() const noexcept;
+ bool is_port_unspecified() const noexcept {
+ return port == 0;
+ }
+};
+
+std::ostream& operator<<(std::ostream&, const ipv4_addr&);
+std::ostream& operator<<(std::ostream&, const ipv6_addr&);
+
+inline bool operator==(const ipv4_addr &lhs, const ipv4_addr& rhs) noexcept {
+ return lhs.ip == rhs.ip && lhs.port == rhs.port;
+}
+
+}
+
+namespace std {
+template<>
+struct hash<seastar::socket_address> {
+ size_t operator()(const seastar::socket_address&) const;
+};
+template<>
+struct hash<seastar::ipv4_addr> {
+ size_t operator()(const seastar::ipv4_addr&) const;
+};
+template<>
+struct hash<seastar::unix_domain_addr> {
+ size_t operator()(const seastar::unix_domain_addr&) const;
+};
+template<>
+struct hash<::sockaddr_un> {
+ size_t operator()(const ::sockaddr_un&) const;
+};
+
+template <>
+struct hash<seastar::transport> {
+ size_t operator()(seastar::transport tr) const {
+ return static_cast<size_t>(tr);
+ }
+};
+
+}
diff --git a/src/seastar/include/seastar/net/stack.hh b/src/seastar/include/seastar/net/stack.hh
new file mode 100644
index 000000000..438298569
--- /dev/null
+++ b/src/seastar/include/seastar/net/stack.hh
@@ -0,0 +1,103 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+#pragma once
+
+#include <chrono>
+#include <seastar/net/api.hh>
+#include <seastar/core/memory.hh>
+#include "../core/internal/api-level.hh"
+
+namespace seastar {
+
+namespace net {
+
+/// \cond internal
+class connected_socket_impl {
+public:
+ virtual ~connected_socket_impl() {}
+ virtual data_source source() = 0;
+ virtual data_source source(connected_socket_input_stream_config csisc);
+ virtual data_sink sink() = 0;
+ virtual void shutdown_input() = 0;
+ virtual void shutdown_output() = 0;
+ virtual void set_nodelay(bool nodelay) = 0;
+ virtual bool get_nodelay() const = 0;
+ virtual void set_keepalive(bool keepalive) = 0;
+ virtual bool get_keepalive() const = 0;
+ virtual void set_keepalive_parameters(const keepalive_params&) = 0;
+ virtual keepalive_params get_keepalive_parameters() const = 0;
+ virtual void set_sockopt(int level, int optname, const void* data, size_t len) = 0;
+ virtual int get_sockopt(int level, int optname, void* data, size_t len) const = 0;
+};
+
+class socket_impl {
+public:
+ virtual ~socket_impl() {}
+ virtual future<connected_socket> connect(socket_address sa, socket_address local, transport proto = transport::TCP) = 0;
+ virtual void set_reuseaddr(bool reuseaddr) = 0;
+ virtual bool get_reuseaddr() const = 0;
+ virtual void shutdown() = 0;
+};
+
+
+class server_socket_impl {
+public:
+ virtual ~server_socket_impl() {}
+ virtual future<accept_result> accept() = 0;
+ virtual void abort_accept() = 0;
+ virtual socket_address local_address() const = 0;
+};
+
+class udp_channel_impl {
+public:
+ virtual ~udp_channel_impl() {}
+ virtual socket_address local_address() const = 0;
+ virtual future<udp_datagram> receive() = 0;
+ virtual future<> send(const socket_address& dst, const char* msg) = 0;
+ virtual future<> send(const socket_address& dst, packet p) = 0;
+ virtual void shutdown_input() = 0;
+ virtual void shutdown_output() = 0;
+ virtual bool is_closed() const = 0;
+ virtual void close() = 0;
+};
+
+class network_interface_impl {
+public:
+ virtual ~network_interface_impl() {}
+ virtual uint32_t index() const = 0;
+ virtual uint32_t mtu() const = 0;
+
+ virtual const sstring& name() const = 0;
+ virtual const sstring& display_name() const = 0;
+ virtual const std::vector<net::inet_address>& addresses() const = 0;
+ virtual const std::vector<uint8_t> hardware_address() const = 0;
+
+ virtual bool is_loopback() const = 0;
+ virtual bool is_virtual() const = 0;
+ virtual bool is_up() const = 0;
+ virtual bool supports_ipv6() const = 0;
+};
+
+/// \endcond
+
+}
+
+}
diff --git a/src/seastar/include/seastar/net/tcp-stack.hh b/src/seastar/include/seastar/net/tcp-stack.hh
new file mode 100644
index 000000000..cf797a2b4
--- /dev/null
+++ b/src/seastar/include/seastar/net/tcp-stack.hh
@@ -0,0 +1,50 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+// tcp/network-stack integration
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include "../core/internal/api-level.hh"
+
+namespace seastar {
+
+struct listen_options;
+
+class server_socket;
+class connected_socket;
+
+namespace net {
+
+struct ipv4_traits;
+template <typename InetTraits>
+class tcp;
+
+server_socket
+tcpv4_listen(tcp<ipv4_traits>& tcpv4, uint16_t port, listen_options opts);
+
+seastar::socket
+tcpv4_socket(tcp<ipv4_traits>& tcpv4);
+
+}
+
+}
diff --git a/src/seastar/include/seastar/net/tcp.hh b/src/seastar/include/seastar/net/tcp.hh
new file mode 100644
index 000000000..cde1942a1
--- /dev/null
+++ b/src/seastar/include/seastar/net/tcp.hh
@@ -0,0 +1,2135 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/queue.hh>
+#include <seastar/core/semaphore.hh>
+#include <seastar/core/byteorder.hh>
+#include <seastar/core/metrics.hh>
+#include <seastar/net/net.hh>
+#include <seastar/net/ip_checksum.hh>
+#include <seastar/net/ip.hh>
+#include <seastar/net/const.hh>
+#include <seastar/net/packet-util.hh>
+#include <seastar/util/std-compat.hh>
+#include <unordered_map>
+#include <map>
+#include <functional>
+#include <deque>
+#include <chrono>
+#include <random>
+#include <stdexcept>
+#include <system_error>
+
+#define CRYPTOPP_ENABLE_NAMESPACE_WEAK 1
+#include <cryptopp/md5.h>
+
+namespace seastar {
+
+using namespace std::chrono_literals;
+
+namespace net {
+
+struct tcp_hdr;
+
+inline auto tcp_error(int err) {
+ return std::system_error(err, std::system_category());
+}
+
+inline auto tcp_reset_error() {
+ return tcp_error(ECONNRESET);
+};
+
+inline auto tcp_connect_error() {
+ return tcp_error(ECONNABORTED);
+}
+
+inline auto tcp_refused_error() {
+ return tcp_error(ECONNREFUSED);
+};
+
+enum class tcp_state : uint16_t {
+ CLOSED = (1 << 0),
+ LISTEN = (1 << 1),
+ SYN_SENT = (1 << 2),
+ SYN_RECEIVED = (1 << 3),
+ ESTABLISHED = (1 << 4),
+ FIN_WAIT_1 = (1 << 5),
+ FIN_WAIT_2 = (1 << 6),
+ CLOSE_WAIT = (1 << 7),
+ CLOSING = (1 << 8),
+ LAST_ACK = (1 << 9),
+ TIME_WAIT = (1 << 10)
+};
+
+inline tcp_state operator|(tcp_state s1, tcp_state s2) {
+ return tcp_state(uint16_t(s1) | uint16_t(s2));
+}
+
+template <typename... Args>
+void tcp_debug(const char* fmt, Args&&... args) {
+#if TCP_DEBUG
+ print(fmt, std::forward<Args>(args)...);
+#endif
+}
+
+struct tcp_option {
+ // The kind and len field are fixed and defined in TCP protocol
+ enum class option_kind: uint8_t { mss = 2, win_scale = 3, sack = 4, timestamps = 8, nop = 1, eol = 0 };
+ enum class option_len: uint8_t { mss = 4, win_scale = 3, sack = 2, timestamps = 10, nop = 1, eol = 1 };
+ static void write(char* p, option_kind kind, option_len len) {
+ p[0] = static_cast<uint8_t>(kind);
+ if (static_cast<uint8_t>(len) > 1) {
+ p[1] = static_cast<uint8_t>(len);
+ }
+ }
+ struct mss {
+ static constexpr option_kind kind = option_kind::mss;
+ static constexpr option_len len = option_len::mss;
+ uint16_t mss;
+ static tcp_option::mss read(const char* p) {
+ tcp_option::mss x;
+ x.mss = read_be<uint16_t>(p + 2);
+ return x;
+ }
+ void write(char* p) const {
+ tcp_option::write(p, kind, len);
+ write_be<uint16_t>(p + 2, mss);
+ }
+ };
+ struct win_scale {
+ static constexpr option_kind kind = option_kind::win_scale;
+ static constexpr option_len len = option_len::win_scale;
+ uint8_t shift;
+ static tcp_option::win_scale read(const char* p) {
+ tcp_option::win_scale x;
+ x.shift = p[2];
+ return x;
+ }
+ void write(char* p) const {
+ tcp_option::write(p, kind, len);
+ p[2] = shift;
+ }
+ };
+ struct sack {
+ static constexpr option_kind kind = option_kind::sack;
+ static constexpr option_len len = option_len::sack;
+ static tcp_option::sack read(const char* p) {
+ return {};
+ }
+ void write(char* p) const {
+ tcp_option::write(p, kind, len);
+ }
+ };
+ struct timestamps {
+ static constexpr option_kind kind = option_kind::timestamps;
+ static constexpr option_len len = option_len::timestamps;
+ uint32_t t1;
+ uint32_t t2;
+ static tcp_option::timestamps read(const char* p) {
+ tcp_option::timestamps ts;
+ ts.t1 = read_be<uint32_t>(p + 2);
+ ts.t2 = read_be<uint32_t>(p + 6);
+ return ts;
+ }
+ void write(char* p) const {
+ tcp_option::write(p, kind, len);
+ write_be<uint32_t>(p + 2, t1);
+ write_be<uint32_t>(p + 6, t2);
+ }
+ };
+ struct nop {
+ static constexpr option_kind kind = option_kind::nop;
+ static constexpr option_len len = option_len::nop;
+ void write(char* p) const {
+ tcp_option::write(p, kind, len);
+ }
+ };
+ struct eol {
+ static constexpr option_kind kind = option_kind::eol;
+ static constexpr option_len len = option_len::eol;
+ void write(char* p) const {
+ tcp_option::write(p, kind, len);
+ }
+ };
+ static const uint8_t align = 4;
+
+ void parse(uint8_t* beg, uint8_t* end);
+ uint8_t fill(void* h, const tcp_hdr* th, uint8_t option_size);
+ uint8_t get_size(bool syn_on, bool ack_on);
+
+    // For option negotiation
+ bool _mss_received = false;
+ bool _win_scale_received = false;
+ bool _timestamps_received = false;
+ bool _sack_received = false;
+
+ // Option data
+ uint16_t _remote_mss = 536;
+ uint16_t _local_mss;
+ uint8_t _remote_win_scale = 0;
+ uint8_t _local_win_scale = 0;
+};
+inline char*& operator+=(char*& x, tcp_option::option_len len) { x += uint8_t(len); return x; }
+inline const char*& operator+=(const char*& x, tcp_option::option_len len) { x += uint8_t(len); return x; }
+inline uint8_t& operator+=(uint8_t& x, tcp_option::option_len len) { x += uint8_t(len); return x; }
+
+struct tcp_seq {
+ uint32_t raw;
+};
+
+inline tcp_seq ntoh(tcp_seq s) {
+ return tcp_seq { ntoh(s.raw) };
+}
+
+inline tcp_seq hton(tcp_seq s) {
+ return tcp_seq { hton(s.raw) };
+}
+
+inline
+std::ostream& operator<<(std::ostream& os, tcp_seq s) {
+ return os << s.raw;
+}
+
+inline tcp_seq make_seq(uint32_t raw) { return tcp_seq{raw}; }
+inline tcp_seq& operator+=(tcp_seq& s, int32_t n) { s.raw += n; return s; }
+inline tcp_seq& operator-=(tcp_seq& s, int32_t n) { s.raw -= n; return s; }
+inline tcp_seq operator+(tcp_seq s, int32_t n) { return s += n; }
+inline tcp_seq operator-(tcp_seq s, int32_t n) { return s -= n; }
+inline int32_t operator-(tcp_seq s, tcp_seq q) { return s.raw - q.raw; }
+inline bool operator==(tcp_seq s, tcp_seq q) { return s.raw == q.raw; }
+inline bool operator!=(tcp_seq s, tcp_seq q) { return !(s == q); }
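+// The ordering comparisons below are based on the signed 32-bit difference defined
+// above, so they remain correct across sequence-number wraparound (serial number arithmetic).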
+inline bool operator<(tcp_seq s, tcp_seq q) { return s - q < 0; }
+inline bool operator>(tcp_seq s, tcp_seq q) { return q < s; }
+inline bool operator<=(tcp_seq s, tcp_seq q) { return !(s > q); }
+inline bool operator>=(tcp_seq s, tcp_seq q) { return !(s < q); }
+
+struct tcp_hdr {
+ static constexpr size_t len = 20;
+ uint16_t src_port;
+ uint16_t dst_port;
+ tcp_seq seq;
+ tcp_seq ack;
+ uint8_t rsvd1 : 4;
+ uint8_t data_offset : 4;
+ uint8_t f_fin : 1;
+ uint8_t f_syn : 1;
+ uint8_t f_rst : 1;
+ uint8_t f_psh : 1;
+ uint8_t f_ack : 1;
+ uint8_t f_urg : 1;
+ uint8_t rsvd2 : 2;
+ uint16_t window;
+ uint16_t checksum;
+ uint16_t urgent;
+ static tcp_hdr read(const char* p) {
+ tcp_hdr h;
+ h.src_port = read_be<uint16_t>(p + 0);
+ h.dst_port = read_be<uint16_t>(p + 2);
+ h.seq = tcp_seq{read_be<uint32_t>(p + 4)};
+ h.ack = tcp_seq{read_be<uint32_t>(p + 8)};
+ h.rsvd1 = p[12] & 15;
+ h.data_offset = uint8_t(p[12]) >> 4;
+ h.f_fin = (uint8_t(p[13]) >> 0) & 1;
+ h.f_syn = (uint8_t(p[13]) >> 1) & 1;
+ h.f_rst = (uint8_t(p[13]) >> 2) & 1;
+ h.f_psh = (uint8_t(p[13]) >> 3) & 1;
+ h.f_ack = (uint8_t(p[13]) >> 4) & 1;
+ h.f_urg = (uint8_t(p[13]) >> 5) & 1;
+ h.rsvd2 = (uint8_t(p[13]) >> 6) & 3;
+ h.window = read_be<uint16_t>(p + 14);
+ h.checksum = read_be<uint16_t>(p + 16);
+ h.urgent = read_be<uint16_t>(p + 18);
+ return h;
+ }
+ void write(char* p) const {
+ write_be<uint16_t>(p + 0, src_port);
+ write_be<uint16_t>(p + 2, dst_port);
+ write_be<uint32_t>(p + 4, seq.raw);
+ write_be<uint32_t>(p + 8, ack.raw);
+ p[12] = rsvd1 | (data_offset << 4);
+ p[13] = (f_fin << 0)
+ | (f_syn << 1)
+ | (f_rst << 2)
+ | (f_psh << 3)
+ | (f_ack << 4)
+ | (f_urg << 5)
+ | (rsvd2 << 6);
+ write_be<uint16_t>(p + 14, window);
+ write_be<uint16_t>(p + 16, checksum);
+ write_be<uint16_t>(p + 18, urgent);
+ }
+ static void write_nbo_checksum(char* p, uint16_t checksum_in_network_byte_order) {
+ std::copy_n(reinterpret_cast<const char*>(&checksum_in_network_byte_order), 2, p + 16);
+ }
+};
+
+struct tcp_tag {};
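+// Used to merge out-of-order segments, keyed by their TCP sequence number.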
+using tcp_packet_merger = packet_merger<tcp_seq, tcp_tag>;
+
+template <typename InetTraits>
+class tcp {
+public:
+ using ipaddr = typename InetTraits::address_type;
+ using inet_type = typename InetTraits::inet_type;
+ using connid = l4connid<InetTraits>;
+ using connid_hash = typename connid::connid_hash;
+ class connection;
+ class listener;
+private:
+ class tcb;
+
+ class tcb : public enable_lw_shared_from_this<tcb> {
+ using clock_type = lowres_clock;
+ static constexpr tcp_state CLOSED = tcp_state::CLOSED;
+ static constexpr tcp_state LISTEN = tcp_state::LISTEN;
+ static constexpr tcp_state SYN_SENT = tcp_state::SYN_SENT;
+ static constexpr tcp_state SYN_RECEIVED = tcp_state::SYN_RECEIVED;
+ static constexpr tcp_state ESTABLISHED = tcp_state::ESTABLISHED;
+ static constexpr tcp_state FIN_WAIT_1 = tcp_state::FIN_WAIT_1;
+ static constexpr tcp_state FIN_WAIT_2 = tcp_state::FIN_WAIT_2;
+ static constexpr tcp_state CLOSE_WAIT = tcp_state::CLOSE_WAIT;
+ static constexpr tcp_state CLOSING = tcp_state::CLOSING;
+ static constexpr tcp_state LAST_ACK = tcp_state::LAST_ACK;
+ static constexpr tcp_state TIME_WAIT = tcp_state::TIME_WAIT;
+ tcp_state _state = CLOSED;
+ tcp& _tcp;
+ connection* _conn = nullptr;
+ promise<> _connect_done;
+ ipaddr _local_ip;
+ ipaddr _foreign_ip;
+ uint16_t _local_port;
+ uint16_t _foreign_port;
+ struct unacked_segment {
+ packet p;
+ uint16_t data_len;
+ unsigned nr_transmits;
+ clock_type::time_point tx_time;
+ };
+ struct send {
+ tcp_seq unacknowledged;
+ tcp_seq next;
+ uint32_t window;
+ uint8_t window_scale;
+ uint16_t mss;
+ tcp_seq urgent;
+ tcp_seq wl1;
+ tcp_seq wl2;
+ tcp_seq initial;
+ std::deque<unacked_segment> data;
+ std::deque<packet> unsent;
+ uint32_t unsent_len = 0;
+ bool closed = false;
+ promise<> _window_opened;
+        // Wait for all data to be acked
+ std::optional<promise<>> _all_data_acked_promise;
+ // Limit number of data queued into send queue
+ size_t max_queue_space = 212992;
+ size_t current_queue_space = 0;
+        // Wait until there is at least one byte of space available in the send queue
+ std::optional<promise<>> _send_available_promise;
+ // Round-trip time variation
+ std::chrono::milliseconds rttvar;
+ // Smoothed round-trip time
+ std::chrono::milliseconds srtt;
+ bool first_rto_sample = true;
+ clock_type::time_point syn_tx_time;
+ // Congestion window
+ uint32_t cwnd;
+ // Slow start threshold
+ uint32_t ssthresh;
+ // Duplicated ACKs
+ uint16_t dupacks = 0;
+ unsigned syn_retransmit = 0;
+ unsigned fin_retransmit = 0;
+ uint32_t limited_transfer = 0;
+ uint32_t partial_ack = 0;
+ tcp_seq recover;
+ bool window_probe = false;
+ uint8_t zero_window_probing_out = 0;
+ } _snd;
+ struct receive {
+ tcp_seq next;
+ uint32_t window;
+ uint8_t window_scale;
+ uint16_t mss;
+ tcp_seq urgent;
+ tcp_seq initial;
+ std::deque<packet> data;
+ // The total size of data stored in std::deque<packet> data
+ size_t data_size = 0;
+ tcp_packet_merger out_of_order;
+ std::optional<promise<>> _data_received_promise;
+        // The maximum memory buffer size allowed for receiving
+        // Currently, it is the same as the default receive window size when window scaling is enabled
+ size_t max_receive_buf_size = 3737600;
+ } _rcv;
+ tcp_option _option;
+ timer<lowres_clock> _delayed_ack;
+ // Retransmission timeout
+ std::chrono::milliseconds _rto{1000};
+ std::chrono::milliseconds _persist_time_out{1000};
+ static constexpr std::chrono::milliseconds _rto_min{1000};
+ static constexpr std::chrono::milliseconds _rto_max{60000};
+ // Clock granularity
+ static constexpr std::chrono::milliseconds _rto_clk_granularity{1};
+ static constexpr uint16_t _max_nr_retransmit{5};
+ timer<lowres_clock> _retransmit;
+ timer<lowres_clock> _persist;
+ uint16_t _nr_full_seg_received = 0;
+ struct isn_secret {
+        // 512-bit secret key for ISN generation
+ uint32_t key[16];
+ isn_secret () {
+ std::random_device rd;
+ std::default_random_engine e(rd());
+ std::uniform_int_distribution<uint32_t> dist{};
+ for (auto& k : key) {
+ k = dist(e);
+ }
+ }
+ };
+ static isn_secret _isn_secret;
+ tcp_seq get_isn();
+ circular_buffer<typename InetTraits::l4packet> _packetq;
+ bool _poll_active = false;
+ uint32_t get_default_receive_window_size() {
+ // Linux's default window size
+ constexpr uint32_t size = 29200;
+ return size << _rcv.window_scale;
+ }
+ // Returns the current receive window according to available receiving buffer size
+ uint32_t get_modified_receive_window_size() {
+ uint32_t left = _rcv.data_size > _rcv.max_receive_buf_size ? 0 : _rcv.max_receive_buf_size - _rcv.data_size;
+ return std::min(left, get_default_receive_window_size());
+ }
+ public:
+ tcb(tcp& t, connid id);
+ void input_handle_listen_state(tcp_hdr* th, packet p);
+ void input_handle_syn_sent_state(tcp_hdr* th, packet p);
+ void input_handle_other_state(tcp_hdr* th, packet p);
+ void output_one(bool data_retransmit = false);
+ future<> wait_for_data();
+ void abort_reader();
+ future<> wait_for_all_data_acked();
+ future<> wait_send_available();
+ future<> send(packet p);
+ void connect();
+ packet read();
+ void close();
+ void remove_from_tcbs() {
+ auto id = connid{_local_ip, _foreign_ip, _local_port, _foreign_port};
+ _tcp._tcbs.erase(id);
+ }
+ std::optional<typename InetTraits::l4packet> get_packet();
+ void output() {
+ if (!_poll_active) {
+ _poll_active = true;
+ // FIXME: future is discarded
+ (void)_tcp.poll_tcb(_foreign_ip, this->shared_from_this()).then_wrapped([this] (auto&& f) {
+ try {
+ f.get();
+ } catch(arp_queue_full_error& ex) {
+ // retry later
+ _poll_active = false;
+ this->start_retransmit_timer();
+ } catch(arp_timeout_error& ex) {
+ if (this->in_state(SYN_SENT)) {
+ _connect_done.set_exception(ex);
+ this->cleanup();
+ }
+ // in other states connection should time out
+ }
+ });
+ }
+ }
+ future<> connect_done() {
+ return _connect_done.get_future();
+ }
+ tcp_state& state() {
+ return _state;
+ }
+ private:
+ void respond_with_reset(tcp_hdr* th);
+ bool merge_out_of_order();
+ void insert_out_of_order(tcp_seq seq, packet p);
+ void trim_receive_data_after_window();
+ bool should_send_ack(uint16_t seg_len);
+ void clear_delayed_ack();
+ packet get_transmit_packet();
+ void retransmit_one() {
+ bool data_retransmit = true;
+ output_one(data_retransmit);
+ }
+ void start_retransmit_timer() {
+ auto now = clock_type::now();
+ start_retransmit_timer(now);
+ };
+ void start_retransmit_timer(clock_type::time_point now) {
+ auto tp = now + _rto;
+ _retransmit.rearm(tp);
+ };
+ void stop_retransmit_timer() {
+ _retransmit.cancel();
+ };
+ void start_persist_timer() {
+ auto now = clock_type::now();
+ start_persist_timer(now);
+ };
+ void start_persist_timer(clock_type::time_point now) {
+ auto tp = now + _persist_time_out;
+ _persist.rearm(tp);
+ };
+ void stop_persist_timer() {
+ _persist.cancel();
+ };
+ void persist();
+ void retransmit();
+ void fast_retransmit();
+ void update_rto(clock_type::time_point tx_time);
+ void update_cwnd(uint32_t acked_bytes);
+ void cleanup();
+ uint32_t can_send() {
+ if (_snd.window_probe) {
+ return 1;
+ }
+
+ // Can not send if send window is zero
+ if (_snd.window == 0) {
+ return 0;
+ }
+
+ // Can not send if send window is less than unacknowledged data size
+ auto window_used = uint32_t(_snd.next - _snd.unacknowledged);
+ if (window_used > _snd.window) {
+ return 0;
+ }
+
+ // Can not send more than advertised window allows or unsent data size
+ auto x = std::min(_snd.window - window_used, _snd.unsent_len);
+
+ // Can not send more than congestion window allows
+ x = std::min(_snd.cwnd, x);
+ if (_snd.dupacks == 1 || _snd.dupacks == 2) {
+ // RFC5681 Step 3.1
+ // Send cwnd + 2 * smss per RFC3042
+ auto flight = flight_size();
+ auto max = _snd.cwnd + 2 * _snd.mss;
+ x = flight <= max ? std::min(x, max - flight) : 0;
+ _snd.limited_transfer += x;
+ } else if (_snd.dupacks >= 3) {
+ // RFC5681 Step 3.5
+            // Send at most 1 full-sized segment
+ x = std::min(uint32_t(_snd.mss), x);
+ }
+ return x;
+ }
+ uint32_t flight_size() {
+ uint32_t size = 0;
+ std::for_each(_snd.data.begin(), _snd.data.end(), [&] (unacked_segment& seg) { size += seg.p.len(); });
+ return size;
+ }
+ uint16_t local_mss() {
+ return _tcp.hw_features().mtu - net::tcp_hdr_len_min - InetTraits::ip_hdr_len_min;
+ }
+ void queue_packet(packet p) {
+ _packetq.emplace_back(typename InetTraits::l4packet{_foreign_ip, std::move(p)});
+ }
+ void signal_data_received() {
+ if (_rcv._data_received_promise) {
+ _rcv._data_received_promise->set_value();
+ _rcv._data_received_promise = {};
+ }
+ }
+ void signal_all_data_acked() {
+ if (_snd._all_data_acked_promise && _snd.unsent_len == 0) {
+ _snd._all_data_acked_promise->set_value();
+ _snd._all_data_acked_promise = {};
+ }
+ }
+ void signal_send_available() {
+ if (_snd._send_available_promise && _snd.max_queue_space > _snd.current_queue_space) {
+ _snd._send_available_promise->set_value();
+ _snd._send_available_promise = {};
+ }
+ }
+ void do_syn_sent() {
+ _state = SYN_SENT;
+ _snd.syn_tx_time = clock_type::now();
+ // Send <SYN> to remote
+ output();
+ }
+ void do_syn_received() {
+ _state = SYN_RECEIVED;
+ _snd.syn_tx_time = clock_type::now();
+ // Send <SYN,ACK> to remote
+ output();
+ }
+ void do_established() {
+ _state = ESTABLISHED;
+ update_rto(_snd.syn_tx_time);
+ _connect_done.set_value();
+ }
+ void do_reset() {
+ _state = CLOSED;
+ cleanup();
+ if (_rcv._data_received_promise) {
+ _rcv._data_received_promise->set_exception(tcp_reset_error());
+ _rcv._data_received_promise = std::nullopt;
+ }
+ if (_snd._all_data_acked_promise) {
+ _snd._all_data_acked_promise->set_exception(tcp_reset_error());
+ _snd._all_data_acked_promise = std::nullopt;
+ }
+ if (_snd._send_available_promise) {
+ _snd._send_available_promise->set_exception(tcp_reset_error());
+ _snd._send_available_promise = std::nullopt;
+ }
+ }
+ void do_time_wait() {
+ // FIXME: Implement TIME_WAIT state timer
+ _state = TIME_WAIT;
+ cleanup();
+ }
+ void do_closed() {
+ _state = CLOSED;
+ cleanup();
+ }
+ void do_setup_isn() {
+ _snd.initial = get_isn();
+ _snd.unacknowledged = _snd.initial;
+ _snd.next = _snd.initial + 1;
+ _snd.recover = _snd.initial;
+ }
+ void do_local_fin_acked() {
+ _snd.unacknowledged += 1;
+ _snd.next += 1;
+ }
+ bool syn_needs_on() {
+ return in_state(SYN_SENT | SYN_RECEIVED);
+ }
+ bool fin_needs_on() {
+ return in_state(FIN_WAIT_1 | CLOSING | LAST_ACK) && _snd.closed &&
+ _snd.unsent_len == 0;
+ }
+ bool ack_needs_on() {
+ return !in_state(CLOSED | LISTEN | SYN_SENT);
+ }
+ bool foreign_will_not_send() {
+ return in_state(CLOSING | TIME_WAIT | CLOSE_WAIT | LAST_ACK | CLOSED);
+ }
+ bool in_state(tcp_state state) {
+ return uint16_t(_state) & uint16_t(state);
+ }
+ void exit_fast_recovery() {
+ _snd.dupacks = 0;
+ _snd.limited_transfer = 0;
+ _snd.partial_ack = 0;
+ }
+ uint32_t data_segment_acked(tcp_seq seg_ack);
+ bool segment_acceptable(tcp_seq seg_seq, unsigned seg_len);
+ void init_from_options(tcp_hdr* th, uint8_t* opt_start, uint8_t* opt_end);
+ friend class connection;
+ };
+ inet_type& _inet;
+ std::unordered_map<connid, lw_shared_ptr<tcb>, connid_hash> _tcbs;
+ std::unordered_map<uint16_t, listener*> _listening;
+ std::random_device _rd;
+ std::default_random_engine _e;
+ std::uniform_int_distribution<uint16_t> _port_dist{41952, 65535};
+ circular_buffer<std::pair<lw_shared_ptr<tcb>, ethernet_address>> _poll_tcbs;
+ // queue for packets that do not belong to any tcb
+ circular_buffer<ipv4_traits::l4packet> _packetq;
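+    // Limits the total number of bytes queued in _packetq above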
+ semaphore _queue_space = {212992};
+ metrics::metric_groups _metrics;
+public:
+ const inet_type& inet() const {
+ return _inet;
+ }
+ class connection {
+ lw_shared_ptr<tcb> _tcb;
+ public:
+ explicit connection(lw_shared_ptr<tcb> tcbp) : _tcb(std::move(tcbp)) { _tcb->_conn = this; }
+ connection(const connection&) = delete;
+ connection(connection&& x) noexcept : _tcb(std::move(x._tcb)) {
+ _tcb->_conn = this;
+ }
+ ~connection();
+ void operator=(const connection&) = delete;
+ connection& operator=(connection&& x) {
+ if (this != &x) {
+ this->~connection();
+ new (this) connection(std::move(x));
+ }
+ return *this;
+ }
+ future<> connected() {
+ return _tcb->connect_done();
+ }
+ future<> send(packet p) {
+ return _tcb->send(std::move(p));
+ }
+ future<> wait_for_data() {
+ return _tcb->wait_for_data();
+ }
+ packet read() {
+ return _tcb->read();
+ }
+ ipaddr foreign_ip() {
+ return _tcb->_foreign_ip;
+ }
+ uint16_t foreign_port() {
+ return _tcb->_foreign_port;
+ }
+ void shutdown_connect();
+ void close_read();
+ void close_write();
+ };
+ class listener {
+ tcp& _tcp;
+ uint16_t _port;
+ queue<connection> _q;
+ size_t _pending = 0;
+ private:
+ listener(tcp& t, uint16_t port, size_t queue_length)
+ : _tcp(t), _port(port), _q(queue_length) {
+ _tcp._listening.emplace(_port, this);
+ }
+ public:
+ listener(listener&& x)
+ : _tcp(x._tcp), _port(x._port), _q(std::move(x._q)) {
+ _tcp._listening[_port] = this;
+ x._port = 0;
+ }
+ ~listener() {
+ if (_port) {
+ _tcp._listening.erase(_port);
+ }
+ }
+ future<connection> accept() {
+ return _q.not_empty().then([this] {
+ return make_ready_future<connection>(_q.pop());
+ });
+ }
+ void abort_accept() {
+ _q.abort(std::make_exception_ptr(std::system_error(ECONNABORTED, std::system_category())));
+ }
+ bool full() { return _pending + _q.size() >= _q.max_size(); }
+ void inc_pending() { _pending++; }
+ void dec_pending() { _pending--; }
+
+ const tcp& get_tcp() const {
+ return _tcp;
+ }
+ uint16_t port() const {
+ return _port;
+ }
+ friend class tcp;
+ };
+public:
+ explicit tcp(inet_type& inet);
+ void received(packet p, ipaddr from, ipaddr to);
+ bool forward(forward_hash& out_hash_data, packet& p, size_t off);
+ listener listen(uint16_t port, size_t queue_length = 100);
+ connection connect(socket_address sa);
+ const net::hw_features& hw_features() const { return _inet._inet.hw_features(); }
+ future<> poll_tcb(ipaddr to, lw_shared_ptr<tcb> tcb);
+ void add_connected_tcb(lw_shared_ptr<tcb> tcbp, uint16_t local_port) {
+ auto it = _listening.find(local_port);
+ if (it != _listening.end()) {
+ it->second->_q.push(connection(tcbp));
+ it->second->dec_pending();
+ }
+ }
+private:
+ void send_packet_without_tcb(ipaddr from, ipaddr to, packet p);
+ void respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip);
+ friend class listener;
+};
+
+template <typename InetTraits>
+tcp<InetTraits>::tcp(inet_type& inet)
+ : _inet(inet)
+ , _e(_rd()) {
+ namespace sm = metrics;
+
+ _metrics.add_group("tcp", {
+ sm::make_derive("linearizations", [] { return tcp_packet_merger::linearizations(); },
+ sm::description("Counts a number of times a buffer linearization was invoked during the buffers merge process. "
+                            "Divide it by the total TCP receive packet rate to get an average number of linearizations per TCP packet."))
+ });
+
+ _inet.register_packet_provider([this, tcb_polled = 0u] () mutable {
+ std::optional<typename InetTraits::l4packet> l4p;
+ auto c = _poll_tcbs.size();
+ if (!_packetq.empty() && (!(tcb_polled % 128) || c == 0)) {
+ l4p = std::move(_packetq.front());
+ _packetq.pop_front();
+ _queue_space.signal(l4p.value().p.len());
+ } else {
+ while (c--) {
+ tcb_polled++;
+ lw_shared_ptr<tcb> tcb;
+ ethernet_address dst;
+ std::tie(tcb, dst) = std::move(_poll_tcbs.front());
+ _poll_tcbs.pop_front();
+ l4p = tcb->get_packet();
+ if (l4p) {
+ l4p.value().e_dst = dst;
+ break;
+ }
+ }
+ }
+ return l4p;
+ });
+}
+
+template <typename InetTraits>
+future<> tcp<InetTraits>::poll_tcb(ipaddr to, lw_shared_ptr<tcb> tcb) {
+ return _inet.get_l2_dst_address(to).then([this, tcb = std::move(tcb)] (ethernet_address dst) {
+ _poll_tcbs.emplace_back(std::move(tcb), dst);
+ });
+}
+
+template <typename InetTraits>
+auto tcp<InetTraits>::listen(uint16_t port, size_t queue_length) -> listener {
+ return listener(*this, port, queue_length);
+}
+
+template <typename InetTraits>
+auto tcp<InetTraits>::connect(socket_address sa) -> connection {
+ uint16_t src_port;
+ connid id;
+ auto src_ip = _inet._inet.host_address();
+ auto dst_ip = ipv4_address(sa);
+ auto dst_port = net::ntoh(sa.u.in.sin_port);
+
+ do {
+ src_port = _port_dist(_e);
+ id = connid{src_ip, dst_ip, src_port, dst_port};
+ } while (_inet._inet.netif()->hw_queues_count() > 1 &&
+ (_inet._inet.netif()->hash2cpu(id.hash(_inet._inet.netif()->rss_key())) != this_shard_id()
+ || _tcbs.find(id) != _tcbs.end()));
+
+ auto tcbp = make_lw_shared<tcb>(*this, id);
+ _tcbs.insert({id, tcbp});
+ tcbp->connect();
+ return connection(tcbp);
+}
+
+template <typename InetTraits>
+bool tcp<InetTraits>::forward(forward_hash& out_hash_data, packet& p, size_t off) {
+ auto th = p.get_header(off, tcp_hdr::len);
+ if (th) {
+ // src_port, dst_port in network byte order
+ out_hash_data.push_back(uint8_t(th[0]));
+ out_hash_data.push_back(uint8_t(th[1]));
+ out_hash_data.push_back(uint8_t(th[2]));
+ out_hash_data.push_back(uint8_t(th[3]));
+ }
+ return true;
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::received(packet p, ipaddr from, ipaddr to) {
+ auto th = p.get_header(0, tcp_hdr::len);
+ if (!th) {
+ return;
+ }
+ // data_offset is correct even before ntoh()
+ auto data_offset = uint8_t(th[12]) >> 4;
+ if (size_t(data_offset * 4) < tcp_hdr::len) {
+ return;
+ }
+
+ if (!hw_features().rx_csum_offload) {
+ checksummer csum;
+ InetTraits::tcp_pseudo_header_checksum(csum, from, to, p.len());
+ csum.sum(p);
+ if (csum.get() != 0) {
+ return;
+ }
+ }
+ auto h = tcp_hdr::read(th);
+ auto id = connid{to, from, h.dst_port, h.src_port};
+ auto tcbi = _tcbs.find(id);
+ lw_shared_ptr<tcb> tcbp;
+ if (tcbi == _tcbs.end()) {
+ auto listener = _listening.find(id.local_port);
+ if (listener == _listening.end() || listener->second->full()) {
+            // 1) In CLOSED state
+ // 1.1 all data in the incoming segment is discarded. An incoming
+ // segment containing a RST is discarded. An incoming segment not
+ // containing a RST causes a RST to be sent in response.
+ // FIXME:
+ // if ACK off: <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK>
+ // if ACK on: <SEQ=SEG.ACK><CTL=RST>
+ return respond_with_reset(&h, id.local_ip, id.foreign_ip);
+ } else {
+ // 2) In LISTEN state
+ // 2.1 first check for an RST
+ if (h.f_rst) {
+ // An incoming RST should be ignored
+ return;
+ }
+ // 2.2 second check for an ACK
+ if (h.f_ack) {
+ // Any acknowledgment is bad if it arrives on a connection
+ // still in the LISTEN state.
+ // <SEQ=SEG.ACK><CTL=RST>
+ return respond_with_reset(&h, id.local_ip, id.foreign_ip);
+ }
+ // 2.3 third check for a SYN
+ if (h.f_syn) {
+ // check the security
+ // NOTE: Ignored for now
+ tcbp = make_lw_shared<tcb>(*this, id);
+ _tcbs.insert({id, tcbp});
+ // TODO: we need to remove the tcb and decrease the pending if
+                // it stays in the SYN_RECEIVED state forever.
+ listener->second->inc_pending();
+
+ return tcbp->input_handle_listen_state(&h, std::move(p));
+ }
+ // 2.4 fourth other text or control
+ // So you are unlikely to get here, but if you do, drop the
+ // segment, and return.
+ return;
+ }
+ } else {
+ tcbp = tcbi->second;
+ if (tcbp->state() == tcp_state::SYN_SENT) {
+ // 3) In SYN_SENT State
+ return tcbp->input_handle_syn_sent_state(&h, std::move(p));
+ } else {
+ // 4) In other state, can be one of the following:
+ // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2
+ // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT
+ return tcbp->input_handle_other_state(&h, std::move(p));
+ }
+ }
+}
+
+// Send a packet that does not belong to any tcb
+template <typename InetTraits>
+void tcp<InetTraits>::send_packet_without_tcb(ipaddr from, ipaddr to, packet p) {
+ if (_queue_space.try_wait(p.len())) { // drop packets that do not fit the queue
+ // FIXME: future is discarded
+ (void)_inet.get_l2_dst_address(to).then([this, to, p = std::move(p)] (ethernet_address e_dst) mutable {
+ _packetq.emplace_back(ipv4_traits::l4packet{to, std::move(p), e_dst, ip_protocol_num::tcp});
+ });
+ }
+}
+
+template <typename InetTraits>
+tcp<InetTraits>::connection::~connection() {
+ if (_tcb) {
+ _tcb->_conn = nullptr;
+ close_read();
+ close_write();
+ }
+}
+
+template <typename InetTraits>
+tcp<InetTraits>::tcb::tcb(tcp& t, connid id)
+ : _tcp(t)
+ , _local_ip(id.local_ip)
+ , _foreign_ip(id.foreign_ip)
+ , _local_port(id.local_port)
+ , _foreign_port(id.foreign_port)
+ , _delayed_ack([this] { _nr_full_seg_received = 0; output(); })
+ , _retransmit([this] { retransmit(); })
+ , _persist([this] { persist(); }) {
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::respond_with_reset(tcp_hdr* rth) {
+ _tcp.respond_with_reset(rth, _local_ip, _foreign_ip);
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::respond_with_reset(tcp_hdr* rth, ipaddr local_ip, ipaddr foreign_ip) {
+ if (rth->f_rst) {
+ return;
+ }
+ packet p;
+ auto th = p.prepend_uninitialized_header(tcp_hdr::len);
+ auto h = tcp_hdr{};
+ h.src_port = rth->dst_port;
+ h.dst_port = rth->src_port;
+ if (rth->f_ack) {
+ h.seq = rth->ack;
+ }
+    // If this RST packet is in response to a SYN packet, we ACK the ISN.
+ if (rth->f_syn) {
+ h.ack = rth->seq + 1;
+ h.f_ack = true;
+ }
+ h.f_rst = true;
+ h.data_offset = tcp_hdr::len / 4;
+ h.checksum = 0;
+ h.write(th);
+
+ checksummer csum;
+ offload_info oi;
+ InetTraits::tcp_pseudo_header_checksum(csum, local_ip, foreign_ip, tcp_hdr::len);
+ uint16_t checksum;
+ if (hw_features().tx_csum_l4_offload) {
+ checksum = ~csum.get();
+ oi.needs_csum = true;
+ } else {
+ csum.sum(p);
+ checksum = csum.get();
+ oi.needs_csum = false;
+ }
+ tcp_hdr::write_nbo_checksum(th, checksum);
+
+ oi.protocol = ip_protocol_num::tcp;
+ oi.tcp_hdr_len = tcp_hdr::len;
+ p.set_offload_info(oi);
+
+ send_packet_without_tcb(local_ip, foreign_ip, std::move(p));
+}
+
+template <typename InetTraits>
+uint32_t tcp<InetTraits>::tcb::data_segment_acked(tcp_seq seg_ack) {
+ uint32_t total_acked_bytes = 0;
+ // Full ACK of segment
+ while (!_snd.data.empty()
+ && (_snd.unacknowledged + _snd.data.front().p.len() <= seg_ack)) {
+ auto acked_bytes = _snd.data.front().p.len();
+ _snd.unacknowledged += acked_bytes;
+ // Ignore retransmitted segments when setting the RTO
+ if (_snd.data.front().nr_transmits == 0) {
+ update_rto(_snd.data.front().tx_time);
+ }
+ update_cwnd(acked_bytes);
+ total_acked_bytes += acked_bytes;
+ _snd.current_queue_space -= _snd.data.front().data_len;
+ signal_send_available();
+ _snd.data.pop_front();
+ }
+ // Partial ACK of segment
+ if (_snd.unacknowledged < seg_ack) {
+ auto acked_bytes = seg_ack - _snd.unacknowledged;
+ if (!_snd.data.empty()) {
+ auto& unacked_seg = _snd.data.front();
+ unacked_seg.p.trim_front(acked_bytes);
+ }
+ _snd.unacknowledged = seg_ack;
+ update_cwnd(acked_bytes);
+ total_acked_bytes += acked_bytes;
+ }
+ return total_acked_bytes;
+}
+
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::segment_acceptable(tcp_seq seg_seq, unsigned seg_len) {
+ if (seg_len == 0 && _rcv.window == 0) {
+ // SEG.SEQ = RCV.NXT
+ return seg_seq == _rcv.next;
+ } else if (seg_len == 0 && _rcv.window > 0) {
+ // RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
+ return (_rcv.next <= seg_seq) && (seg_seq < _rcv.next + _rcv.window);
+ } else if (seg_len > 0 && _rcv.window > 0) {
+ // RCV.NXT =< SEG.SEQ < RCV.NXT+RCV.WND
+ // or
+ // RCV.NXT =< SEG.SEQ+SEG.LEN-1 < RCV.NXT+RCV.WND
+ bool x = (_rcv.next <= seg_seq) && seg_seq < (_rcv.next + _rcv.window);
+ bool y = (_rcv.next <= seg_seq + seg_len - 1) && (seg_seq + seg_len - 1 < _rcv.next + _rcv.window);
+ return x || y;
+ } else {
+ // SEG.LEN > 0 RCV.WND = 0, not acceptable
+ return false;
+ }
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::init_from_options(tcp_hdr* th, uint8_t* opt_start, uint8_t* opt_end) {
+ // Handle tcp options
+ _option.parse(opt_start, opt_end);
+
+ // Remote receive window scale factor
+ _snd.window_scale = _option._remote_win_scale;
+ // Local receive window scale factor
+ _rcv.window_scale = _option._local_win_scale;
+
+ // Maximum segment size remote can receive
+ _snd.mss = _option._remote_mss;
+ // Maximum segment size local can receive
+ _rcv.mss = _option._local_mss = local_mss();
+
+ _rcv.window = get_default_receive_window_size();
+ _snd.window = th->window << _snd.window_scale;
+
+ // Segment sequence number used for last window update
+ _snd.wl1 = th->seq;
+ // Segment acknowledgment number used for last window update
+ _snd.wl2 = th->ack;
+
+ // Setup initial congestion window
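+    // These MSS thresholds follow the initial window (IW) guidelines of
+    // RFC 5681 Section 3.1: IW = 2, 3, or 4 segments depending on the SMSS.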
+ if (2190 < _snd.mss) {
+ _snd.cwnd = 2 * _snd.mss;
+ } else if (1095 < _snd.mss && _snd.mss <= 2190) {
+ _snd.cwnd = 3 * _snd.mss;
+ } else {
+ _snd.cwnd = 4 * _snd.mss;
+ }
+
+ // Setup initial slow start threshold
+ _snd.ssthresh = th->window << _snd.window_scale;
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::input_handle_listen_state(tcp_hdr* th, packet p) {
+ auto opt_len = th->data_offset * 4 - tcp_hdr::len;
+ auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, th->data_offset * 4)) + tcp_hdr::len;
+ auto opt_end = opt_start + opt_len;
+ p.trim_front(th->data_offset * 4);
+ tcp_seq seg_seq = th->seq;
+
+ // Set RCV.NXT to SEG.SEQ+1, IRS is set to SEG.SEQ
+ _rcv.next = seg_seq + 1;
+ _rcv.initial = seg_seq;
+
+ // ISS should be selected and a SYN segment sent of the form:
+ // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK>
+ // SND.NXT is set to ISS+1 and SND.UNA to ISS
+ // NOTE: In previous code, _snd.next is set to ISS + 1 only when SYN is
+ // ACKed. Now, we set _snd.next to ISS + 1 here, so in output_one(): we
+ // have
+ // th->seq = syn_on ? _snd.initial : _snd.next
+ // to make sure retransmitted SYN has correct SEQ number.
+ do_setup_isn();
+
+ _rcv.urgent = _rcv.next;
+
+ tcp_debug("listen: LISTEN -> SYN_RECEIVED\n");
+ init_from_options(th, opt_start, opt_end);
+ do_syn_received();
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::input_handle_syn_sent_state(tcp_hdr* th, packet p) {
+ auto opt_len = th->data_offset * 4 - tcp_hdr::len;
+ auto opt_start = reinterpret_cast<uint8_t*>(p.get_header(0, th->data_offset * 4)) + tcp_hdr::len;
+ auto opt_end = opt_start + opt_len;
+ p.trim_front(th->data_offset * 4);
+ tcp_seq seg_seq = th->seq;
+ auto seg_ack = th->ack;
+
+ bool acceptable = false;
+ // 3.1 first check the ACK bit
+ if (th->f_ack) {
+ // If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send a reset (unless the
+ // RST bit is set, if so drop the segment and return)
+ if (seg_ack <= _snd.initial || seg_ack > _snd.next) {
+ return respond_with_reset(th);
+ }
+
+ // If SND.UNA =< SEG.ACK =< SND.NXT then the ACK is acceptable.
+ acceptable = _snd.unacknowledged <= seg_ack && seg_ack <= _snd.next;
+ }
+
+ // 3.2 second check the RST bit
+ if (th->f_rst) {
+ // If the ACK was acceptable then signal the user "error: connection
+ // reset", drop the segment, enter CLOSED state, delete TCB, and
+ // return. Otherwise (no ACK) drop the segment and return.
+ if (acceptable) {
+ return do_reset();
+ } else {
+ return;
+ }
+ }
+
+ // 3.3 third check the security and precedence
+ // NOTE: Ignored for now
+
+ // 3.4 fourth check the SYN bit
+ if (th->f_syn) {
+ // RCV.NXT is set to SEG.SEQ+1, IRS is set to SEG.SEQ. SND.UNA should
+ // be advanced to equal SEG.ACK (if there is an ACK), and any segments
+ // on the retransmission queue which are thereby acknowledged should be
+ // removed.
+ _rcv.next = seg_seq + 1;
+ _rcv.initial = seg_seq;
+ if (th->f_ack) {
+ // TODO: clean retransmission queue
+ _snd.unacknowledged = seg_ack;
+ }
+ if (_snd.unacknowledged > _snd.initial) {
+ // If SND.UNA > ISS (our SYN has been ACKed), change the connection
+ // state to ESTABLISHED, form an ACK segment
+ // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+ tcp_debug("syn: SYN_SENT -> ESTABLISHED\n");
+ init_from_options(th, opt_start, opt_end);
+ do_established();
+ output();
+ } else {
+ // Otherwise enter SYN_RECEIVED, form a SYN,ACK segment
+ // <SEQ=ISS><ACK=RCV.NXT><CTL=SYN,ACK>
+ tcp_debug("syn: SYN_SENT -> SYN_RECEIVED\n");
+ do_syn_received();
+ }
+ }
+
+ // 3.5 fifth, if neither of the SYN or RST bits is set then drop the
+ // segment and return.
+ return;
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::input_handle_other_state(tcp_hdr* th, packet p) {
+ p.trim_front(th->data_offset * 4);
+ bool do_output = false;
+ bool do_output_data = false;
+ tcp_seq seg_seq = th->seq;
+ auto seg_ack = th->ack;
+ auto seg_len = p.len();
+
+ // 4.1 first check sequence number
+ if (!segment_acceptable(seg_seq, seg_len)) {
+ //<SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+ return output();
+ }
+
+ // In the following it is assumed that the segment is the idealized
+ // segment that begins at RCV.NXT and does not exceed the window.
+ if (seg_seq < _rcv.next) {
+ // ignore already acknowledged data
+ auto dup = std::min(uint32_t(_rcv.next - seg_seq), seg_len);
+ p.trim_front(dup);
+ seg_len -= dup;
+ seg_seq += dup;
+ }
+ // FIXME: We should trim data outside the right edge of the receive window as well
+
+ if (seg_seq != _rcv.next) {
+ insert_out_of_order(seg_seq, std::move(p));
+ // A TCP receiver SHOULD send an immediate duplicate ACK
+ // when an out-of-order segment arrives.
+ return output();
+ }
+
+ // 4.2 second check the RST bit
+ if (th->f_rst) {
+ if (in_state(SYN_RECEIVED)) {
+ // If this connection was initiated with a passive OPEN (i.e.,
+ // came from the LISTEN state), then return this connection to
+ // LISTEN state and return. The user need not be informed. If
+ // this connection was initiated with an active OPEN (i.e., came
+ // from SYN_SENT state) then the connection was refused, signal
+ // the user "connection refused". In either case, all segments
+ // on the retransmission queue should be removed. And in the
+ // active OPEN case, enter the CLOSED state and delete the TCB,
+ // and return.
+ _connect_done.set_exception(tcp_refused_error());
+ return do_reset();
+ }
+ if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2 | CLOSE_WAIT)) {
+ // If the RST bit is set then, any outstanding RECEIVEs and SEND
+ // should receive "reset" responses. All segment queues should be
+ // flushed. Users should also receive an unsolicited general
+ // "connection reset" signal. Enter the CLOSED state, delete the
+ // TCB, and return.
+ return do_reset();
+ }
+ if (in_state(CLOSING | LAST_ACK | TIME_WAIT)) {
+ // If the RST bit is set then, enter the CLOSED state, delete the
+ // TCB, and return.
+ return do_closed();
+ }
+ }
+
+ // 4.3 third check security and precedence
+ // NOTE: Ignored for now
+
+ // 4.4 fourth, check the SYN bit
+ if (th->f_syn) {
+ // SYN_RECEIVED, ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2
+ // CLOSE_WAIT, CLOSING, LAST_ACK, TIME_WAIT
+
+ // If the SYN is in the window it is an error, send a reset, any
+ // outstanding RECEIVEs and SEND should receive "reset" responses,
+ // all segment queues should be flushed, the user should also
+ // receive an unsolicited general "connection reset" signal, enter
+ // the CLOSED state, delete the TCB, and return.
+ respond_with_reset(th);
+ return do_reset();
+
+ // If the SYN is not in the window this step would not be reached
+ // and an ack would have been sent in the first step (sequence
+ // number check).
+ }
+
+ // 4.5 fifth check the ACK field
+ if (!th->f_ack) {
+ // if the ACK bit is off drop the segment and return
+ return;
+ } else {
+ // SYN_RECEIVED STATE
+ if (in_state(SYN_RECEIVED)) {
+ // If SND.UNA =< SEG.ACK =< SND.NXT then enter ESTABLISHED state
+ // and continue processing.
+ if (_snd.unacknowledged <= seg_ack && seg_ack <= _snd.next) {
+ tcp_debug("SYN_RECEIVED -> ESTABLISHED\n");
+ do_established();
+ _tcp.add_connected_tcb(this->shared_from_this(), _local_port);
+ } else {
+ // <SEQ=SEG.ACK><CTL=RST>
+ return respond_with_reset(th);
+ }
+ }
+ auto update_window = [this, th, seg_seq, seg_ack] {
+ tcp_debug("window update seg_seq=%d, seg_ack=%d, old window=%d new window=%d\n",
+ seg_seq, seg_ack, _snd.window, th->window << _snd.window_scale);
+ _snd.window = th->window << _snd.window_scale;
+ _snd.wl1 = seg_seq;
+ _snd.wl2 = seg_ack;
+ _snd.zero_window_probing_out = 0;
+ if (_snd.window == 0) {
+ _persist_time_out = _rto;
+ start_persist_timer();
+ } else {
+ stop_persist_timer();
+ }
+ };
+ // ESTABLISHED STATE or
+ // CLOSE_WAIT STATE: Do the same processing as for the ESTABLISHED state.
+ if (in_state(ESTABLISHED | CLOSE_WAIT)){
+            // When we are in the zero window probing phase and packets_out == 0, we bypass the "duplicate ACK" check
+ auto packets_out = _snd.next - _snd.unacknowledged - _snd.zero_window_probing_out;
+ // If SND.UNA < SEG.ACK =< SND.NXT then, set SND.UNA <- SEG.ACK.
+ if (_snd.unacknowledged < seg_ack && seg_ack <= _snd.next) {
+ // Remote ACKed data we sent
+ auto acked_bytes = data_segment_acked(seg_ack);
+
+ // If SND.UNA < SEG.ACK =< SND.NXT, the send window should be updated.
+ if (_snd.wl1 < seg_seq || (_snd.wl1 == seg_seq && _snd.wl2 <= seg_ack)) {
+ update_window();
+ }
+
+ // some data is acked, try send more data
+ do_output_data = true;
+
+ auto set_retransmit_timer = [this] {
+ if (_snd.data.empty()) {
+ // All outstanding segments are acked, turn off the timer.
+ stop_retransmit_timer();
+ // Signal the waiter of this event
+ signal_all_data_acked();
+ } else {
+                        // Restart the timer because new data is acked.
+ start_retransmit_timer();
+ }
+ };
+
+ if (_snd.dupacks >= 3) {
+ // We are in fast retransmit / fast recovery phase
+ uint32_t smss = _snd.mss;
+ if (seg_ack > _snd.recover) {
+ tcp_debug("ack: full_ack\n");
+ // Set cwnd to min (ssthresh, max(FlightSize, SMSS) + SMSS)
+ _snd.cwnd = std::min(_snd.ssthresh, std::max(flight_size(), smss) + smss);
+ // Exit the fast recovery procedure
+ exit_fast_recovery();
+ set_retransmit_timer();
+ } else {
+ tcp_debug("ack: partial_ack\n");
+ // Retransmit the first unacknowledged segment
+ fast_retransmit();
+ // Deflate the congestion window by the amount of new data
+ // acknowledged by the Cumulative Acknowledgment field
+ _snd.cwnd -= acked_bytes;
+ // If the partial ACK acknowledges at least one SMSS of new
+ // data, then add back SMSS bytes to the congestion window
+ if (acked_bytes >= smss) {
+ _snd.cwnd += smss;
+ }
+ // Send a new segment if permitted by the new value of
+                        // cwnd. Do not exit the fast recovery procedure. For
+ // the first partial ACK that arrives during fast
+ // recovery, also reset the retransmit timer.
+ if (++_snd.partial_ack == 1) {
+ start_retransmit_timer();
+ }
+ }
+ } else {
+ // RFC5681: The fast retransmit algorithm uses the arrival
+ // of 3 duplicate ACKs (as defined in section 2, without
+ // any intervening ACKs which move SND.UNA) as an
+ // indication that a segment has been lost.
+ //
+                    // So, here we reset dupacks to zero because this ACK moves
+ // SND.UNA.
+ exit_fast_recovery();
+ set_retransmit_timer();
+ }
+ } else if ((packets_out > 0) && !_snd.data.empty() && seg_len == 0 &&
+ th->f_fin == 0 && th->f_syn == 0 &&
+ th->ack == _snd.unacknowledged &&
+ uint32_t(th->window << _snd.window_scale) == _snd.window) {
+ // Note:
+ // RFC793 states:
+ // If the ACK is a duplicate (SEG.ACK < SND.UNA), it can be ignored
+ // RFC5681 states:
+ // The TCP sender SHOULD use the "fast retransmit" algorithm to detect
+ // and repair loss, based on incoming duplicate ACKs.
+ // Here, We follow RFC5681.
+ _snd.dupacks++;
+ uint32_t smss = _snd.mss;
+ // 3 duplicated ACKs trigger a fast retransmit
+ if (_snd.dupacks == 1 || _snd.dupacks == 2) {
+ // RFC5681 Step 3.1
+ // Send cwnd + 2 * smss per RFC3042
+ do_output_data = true;
+ } else if (_snd.dupacks == 3) {
+ // RFC6582 Step 3.2
+ if (seg_ack - 1 > _snd.recover) {
+ _snd.recover = _snd.next - 1;
+ // RFC5681 Step 3.2
+ _snd.ssthresh = std::max((flight_size() - _snd.limited_transfer) / 2, 2 * smss);
+ fast_retransmit();
+ } else {
+ // Do not enter fast retransmit and do not reset ssthresh
+ }
+ // RFC5681 Step 3.3
+ _snd.cwnd = _snd.ssthresh + 3 * smss;
+ } else if (_snd.dupacks > 3) {
+ // RFC5681 Step 3.4
+ _snd.cwnd += smss;
+ // RFC5681 Step 3.5
+ do_output_data = true;
+ }
+ } else if (seg_ack > _snd.next) {
+ // If the ACK acks something not yet sent (SEG.ACK > SND.NXT)
+ // then send an ACK, drop the segment, and return
+ return output();
+ } else if (_snd.window == 0 && th->window > 0) {
+ update_window();
+ do_output_data = true;
+ }
+ }
+ // FIN_WAIT_1 STATE
+ if (in_state(FIN_WAIT_1)) {
+ // In addition to the processing for the ESTABLISHED state, if
+ // our FIN is now acknowledged then enter FIN-WAIT-2 and continue
+ // processing in that state.
+ if (seg_ack == _snd.next + 1) {
+ tcp_debug("ack: FIN_WAIT_1 -> FIN_WAIT_2\n");
+ _state = FIN_WAIT_2;
+ do_local_fin_acked();
+ }
+ }
+ // FIN_WAIT_2 STATE
+ if (in_state(FIN_WAIT_2)) {
+ // In addition to the processing for the ESTABLISHED state, if
+ // the retransmission queue is empty, the user’s CLOSE can be
+ // acknowledged ("ok") but do not delete the TCB.
+ // TODO
+ }
+ // CLOSING STATE
+ if (in_state(CLOSING)) {
+ if (seg_ack == _snd.next + 1) {
+ tcp_debug("ack: CLOSING -> TIME_WAIT\n");
+ do_local_fin_acked();
+ return do_time_wait();
+ } else {
+ return;
+ }
+ }
+ // LAST_ACK STATE
+ if (in_state(LAST_ACK)) {
+ if (seg_ack == _snd.next + 1) {
+ tcp_debug("ack: LAST_ACK -> CLOSED\n");
+ do_local_fin_acked();
+ return do_closed();
+ }
+ }
+ // TIME_WAIT STATE
+ if (in_state(TIME_WAIT)) {
+ // The only thing that can arrive in this state is a
+ // retransmission of the remote FIN. Acknowledge it, and restart
+ // the 2 MSL timeout.
+ // TODO
+ }
+ }
+
+ // 4.6 sixth, check the URG bit
+ if (th->f_urg) {
+ // TODO
+ }
+
+ // 4.7 seventh, process the segment text
+ if (in_state(ESTABLISHED | FIN_WAIT_1 | FIN_WAIT_2)) {
+ if (p.len()) {
+ // Once the TCP takes responsibility for the data it advances
+ // RCV.NXT over the data accepted, and adjusts RCV.WND as
+            // appropriate to the current buffer availability. The total of
+ // RCV.NXT and RCV.WND should not be reduced.
+ _rcv.data_size += p.len();
+ _rcv.data.push_back(std::move(p));
+ _rcv.next += seg_len;
+ auto merged = merge_out_of_order();
+ _rcv.window = get_modified_receive_window_size();
+ signal_data_received();
+ // Send an acknowledgment of the form:
+ // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+ // This acknowledgment should be piggybacked on a segment being
+ // transmitted if possible without incurring undue delay.
+ if (merged) {
+ // TCP receiver SHOULD send an immediate ACK when the
+ // incoming segment fills in all or part of a gap in the
+ // sequence space.
+ do_output = true;
+ } else {
+ do_output = should_send_ack(seg_len);
+ }
+ }
+ } else if (in_state(CLOSE_WAIT | CLOSING | LAST_ACK | TIME_WAIT)) {
+ // This should not occur, since a FIN has been received from the
+ // remote side. Ignore the segment text.
+ return;
+ }
+
+ // 4.8 eighth, check the FIN bit
+ if (th->f_fin) {
+ if (in_state(CLOSED | LISTEN | SYN_SENT)) {
+ // Do not process the FIN if the state is CLOSED, LISTEN or SYN-SENT
+ // since the SEG.SEQ cannot be validated; drop the segment and return.
+ return;
+ }
+ auto fin_seq = seg_seq + seg_len;
+ if (fin_seq == _rcv.next) {
+ _rcv.next = fin_seq + 1;
+ signal_data_received();
+
+ // If this <FIN> packet contains data as well, we can ACK both data
+            // and <FIN> in a single packet, so cancel the previous ACK.
+ clear_delayed_ack();
+ do_output = false;
+ // Send ACK for the FIN!
+ output();
+
+ if (in_state(SYN_RECEIVED | ESTABLISHED)) {
+ tcp_debug("fin: SYN_RECEIVED or ESTABLISHED -> CLOSE_WAIT\n");
+ _state = CLOSE_WAIT;
+ }
+ if (in_state(FIN_WAIT_1)) {
+ // If our FIN has been ACKed (perhaps in this segment), then
+ // enter TIME-WAIT, start the time-wait timer, turn off the other
+ // timers; otherwise enter the CLOSING state.
+ // Note: If our FIN has been ACKed, we should be in FIN_WAIT_2
+ // not FIN_WAIT_1 if we reach here.
+ tcp_debug("fin: FIN_WAIT_1 -> CLOSING\n");
+ _state = CLOSING;
+ }
+ if (in_state(FIN_WAIT_2)) {
+ tcp_debug("fin: FIN_WAIT_2 -> TIME_WAIT\n");
+ return do_time_wait();
+ }
+ }
+ }
+ if (do_output || (do_output_data && can_send())) {
+        // Since we will do output, we can cancel the scheduled delayed ACK.
+ clear_delayed_ack();
+ output();
+ }
+}
+
+template <typename InetTraits>
+packet tcp<InetTraits>::tcb::get_transmit_packet() {
+ // easy case: empty queue
+ if (_snd.unsent.empty()) {
+ return packet();
+ }
+ auto can_send = this->can_send();
+    // Max TCP payload size we can pass to the NIC
+ uint32_t len;
+ if (_tcp.hw_features().tx_tso) {
+        // FIXME: Inform the tap device of the size of the split packet
+ len = _tcp.hw_features().max_packet_len - net::tcp_hdr_len_min - InetTraits::ip_hdr_len_min;
+ } else {
+ len = std::min(uint16_t(_tcp.hw_features().mtu - net::tcp_hdr_len_min - InetTraits::ip_hdr_len_min), _snd.mss);
+ }
+ can_send = std::min(can_send, len);
+ // easy case: one small packet
+ if (_snd.unsent.size() == 1 && _snd.unsent.front().len() <= can_send) {
+ auto p = std::move(_snd.unsent.front());
+ _snd.unsent.pop_front();
+ _snd.unsent_len -= p.len();
+ return p;
+ }
+ // moderate case: need to split one packet
+ if (_snd.unsent.front().len() > can_send) {
+ auto p = _snd.unsent.front().share(0, can_send);
+ _snd.unsent.front().trim_front(can_send);
+ _snd.unsent_len -= p.len();
+ return p;
+ }
+ // hard case: merge some packets, possibly split last
+ auto p = std::move(_snd.unsent.front());
+ _snd.unsent.pop_front();
+ can_send -= p.len();
+ while (!_snd.unsent.empty()
+ && _snd.unsent.front().len() <= can_send) {
+ can_send -= _snd.unsent.front().len();
+ p.append(std::move(_snd.unsent.front()));
+ _snd.unsent.pop_front();
+ }
+ if (!_snd.unsent.empty() && can_send) {
+ auto& q = _snd.unsent.front();
+ p.append(q.share(0, can_send));
+ q.trim_front(can_send);
+ }
+ _snd.unsent_len -= p.len();
+ return p;
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::output_one(bool data_retransmit) {
+ if (in_state(CLOSED)) {
+ return;
+ }
+
+ packet p = data_retransmit ? _snd.data.front().p.share() : get_transmit_packet();
+ packet clone = p.share(); // early clone to prevent share() from calling packet::unuse_internal_data() on header.
+ uint16_t len = p.len();
+ bool syn_on = syn_needs_on();
+ bool ack_on = ack_needs_on();
+
+ auto options_size = _option.get_size(syn_on, ack_on);
+ auto th = p.prepend_uninitialized_header(tcp_hdr::len + options_size);
+ auto h = tcp_hdr{};
+
+ h.src_port = _local_port;
+ h.dst_port = _foreign_port;
+
+ h.f_syn = syn_on;
+ h.f_ack = ack_on;
+ if (ack_on) {
+ clear_delayed_ack();
+ }
+ h.f_urg = false;
+ h.f_psh = false;
+
+ tcp_seq seq;
+ if (data_retransmit) {
+ seq = _snd.unacknowledged;
+ } else {
+ seq = syn_on ? _snd.initial : _snd.next;
+ _snd.next += len;
+ }
+ h.seq = seq;
+ h.ack = _rcv.next;
+ h.data_offset = (tcp_hdr::len + options_size) / 4;
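+    // Advertise our receive window, scaled down by the negotiated window scale shift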
+ h.window = _rcv.window >> _rcv.window_scale;
+ h.checksum = 0;
+
+ // FIXME: does the FIN have to fit in the window?
+ bool fin_on = fin_needs_on();
+ h.f_fin = fin_on;
+
+ // Add tcp options
+ _option.fill(th, &h, options_size);
+ h.write(th);
+
+ offload_info oi;
+ checksummer csum;
+ uint16_t pseudo_hdr_seg_len = 0;
+
+ oi.tcp_hdr_len = tcp_hdr::len + options_size;
+
+ if (_tcp.hw_features().tx_csum_l4_offload) {
+ oi.needs_csum = true;
+
+ //
+ // tx checksum offloading: both virtio-net's VIRTIO_NET_F_CSUM and dpdk's
+ // PKT_TX_TCP_CKSUM require th->checksum to be initialized to the ones'
+ // complement sum of the pseudo header.
+ //
+ // For TSO the csum should be calculated for a pseudo header with
+ // segment length set to 0. All the rest is the same as for a TCP Tx
+ // CSUM offload case.
+ //
+ if (_tcp.hw_features().tx_tso && len > _snd.mss) {
+ oi.tso_seg_size = _snd.mss;
+ } else {
+ pseudo_hdr_seg_len = tcp_hdr::len + options_size + len;
+ }
+ } else {
+ pseudo_hdr_seg_len = tcp_hdr::len + options_size + len;
+ oi.needs_csum = false;
+ }
+
+ InetTraits::tcp_pseudo_header_checksum(csum, _local_ip, _foreign_ip,
+ pseudo_hdr_seg_len);
+
+ uint16_t checksum;
+ if (_tcp.hw_features().tx_csum_l4_offload) {
+ checksum = ~csum.get();
+ } else {
+ csum.sum(p);
+ checksum = csum.get();
+ }
+ tcp_hdr::write_nbo_checksum(th, checksum);
+
+ oi.protocol = ip_protocol_num::tcp;
+
+ p.set_offload_info(oi);
+
+ if (!data_retransmit && (len || syn_on || fin_on)) {
+ auto now = clock_type::now();
+ if (len) {
+ unsigned nr_transmits = 0;
+ _snd.data.emplace_back(unacked_segment{std::move(clone),
+ len, nr_transmits, now});
+ }
+ if (!_retransmit.armed()) {
+ start_retransmit_timer(now);
+ }
+ }
+
+
+ // If the advertised TCP receive window is 0 we may only transmit a zero window probing segment.
+ // The payload size of such a segment is 1. Queueing anything bigger when _snd.window == 0 is a bug
+ // and a violation of the RFC.
+ assert((_snd.window > 0) || ((_snd.window == 0) && (len <= 1)));
+ queue_packet(std::move(p));
+}
+
+template <typename InetTraits>
+future<> tcp<InetTraits>::tcb::wait_for_data() {
+ if (!_rcv.data.empty() || foreign_will_not_send()) {
+ return make_ready_future<>();
+ }
+ _rcv._data_received_promise = promise<>();
+ return _rcv._data_received_promise->get_future();
+}
+
+template <typename InetTraits>
+void
+tcp<InetTraits>::tcb::abort_reader() {
+ if (_rcv._data_received_promise) {
+ _rcv._data_received_promise->set_exception(
+ std::make_exception_ptr(std::system_error(ECONNABORTED, std::system_category())));
+ _rcv._data_received_promise = std::nullopt;
+ }
+}
+
+template <typename InetTraits>
+future<> tcp<InetTraits>::tcb::wait_for_all_data_acked() {
+ if (_snd.data.empty() && _snd.unsent_len == 0) {
+ return make_ready_future<>();
+ }
+ _snd._all_data_acked_promise = promise<>();
+ return _snd._all_data_acked_promise->get_future();
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::connect() {
+ // An initial send sequence number (ISS) is selected. A SYN segment of the
+ // form <SEQ=ISS><CTL=SYN> is sent. Set SND.UNA to ISS, SND.NXT to ISS+1,
+ // enter SYN-SENT state, and return.
+ do_setup_isn();
+
+ // Local receive window scale factor
+ _rcv.window_scale = _option._local_win_scale = 7;
+ // Maximum segment size local can receive
+ _rcv.mss = _option._local_mss = local_mss();
+ _rcv.window = get_default_receive_window_size();
+
+ do_syn_sent();
+}
+
+template <typename InetTraits>
+packet tcp<InetTraits>::tcb::read() {
+ packet p;
+ for (auto&& q : _rcv.data) {
+ p.append(std::move(q));
+ }
+ _rcv.data_size = 0;
+ _rcv.data.clear();
+ _rcv.window = get_default_receive_window_size();
+ return p;
+}
+
+template <typename InetTraits>
+future<> tcp<InetTraits>::tcb::wait_send_available() {
+ if (_snd.max_queue_space > _snd.current_queue_space) {
+ return make_ready_future<>();
+ }
+ _snd._send_available_promise = promise<>();
+ return _snd._send_available_promise->get_future();
+}
+
+template <typename InetTraits>
+future<> tcp<InetTraits>::tcb::send(packet p) {
+ // We can not send after the connection is closed
+ if (_snd.closed || in_state(CLOSED)) {
+ return make_exception_future<>(tcp_reset_error());
+ }
+
+ auto len = p.len();
+ _snd.current_queue_space += len;
+ _snd.unsent_len += len;
+ _snd.unsent.push_back(std::move(p));
+
+ if (can_send() > 0) {
+ output();
+ }
+
+ return wait_send_available();
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::close() {
+ if (in_state(CLOSED) || _snd.closed) {
+ return;
+ }
+ // TODO: We should return a future to upper layer
+ (void)wait_for_all_data_acked().then([this, zis = this->shared_from_this()] () mutable {
+ _snd.closed = true;
+ tcp_debug("close: unsent_len=%d\n", _snd.unsent_len);
+ if (in_state(CLOSE_WAIT)) {
+ tcp_debug("close: CLOSE_WAIT -> LAST_ACK\n");
+ _state = LAST_ACK;
+ } else if (in_state(ESTABLISHED)) {
+ tcp_debug("close: ESTABLISHED -> FIN_WAIT_1\n");
+ _state = FIN_WAIT_1;
+ }
+ // Send <FIN> to remote
+ // Note: we call output_one() to make sure a packet with FIN is actually
+ // sent out. If we only called output() while _packetq is not empty,
+ // tcp::tcb::get_packet() would drain _packetq first and the packet with
+ // FIN would not be generated.
+ output_one();
+ output();
+ });
+}
+
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::should_send_ack(uint16_t seg_len) {
+ // We've received a TSO packet, do ack immediately
+ if (seg_len > _rcv.mss) {
+ _nr_full_seg_received = 0;
+ _delayed_ack.cancel();
+ return true;
+ }
+
+ // We've received a full sized segment, ack for every second full sized segment
+ if (seg_len == _rcv.mss) {
+ if (_nr_full_seg_received++ >= 1) {
+ _nr_full_seg_received = 0;
+ _delayed_ack.cancel();
+ return true;
+ }
+ }
+
+ // If the timer is already armed and its callback hasn't run yet, keep waiting.
+ if (_delayed_ack.armed()) {
+ return false;
+ }
+
+ // If the timer is not armed, schedule a delayed ACK.
+ // The maximum delayed ACK timeout allowed by RFC1122 is 500ms; most
+ // implementations use 200ms.
+ _delayed_ack.arm(200ms);
+ return false;
+}
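+
+// Worked example (illustrative): with _rcv.mss == 1460, the first full-sized
+// 1460-byte segment returns false and arms the 200ms delayed ACK timer; the
+// immediately following full-sized segment returns true, so every second
+// full-sized segment is ACKed without delay, as RFC1122 permits.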
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::clear_delayed_ack() {
+ _delayed_ack.cancel();
+}
+
+template <typename InetTraits>
+bool tcp<InetTraits>::tcb::merge_out_of_order() {
+ bool merged = false;
+ if (_rcv.out_of_order.map.empty()) {
+ return merged;
+ }
+ for (auto it = _rcv.out_of_order.map.begin(); it != _rcv.out_of_order.map.end();) {
+ auto& p = it->second;
+ auto seg_beg = it->first;
+ auto seg_len = p.len();
+ auto seg_end = seg_beg + seg_len;
+ if (seg_beg <= _rcv.next && _rcv.next < seg_end) {
+ // This segment has been received out of order and its previous
+ // segment has been received now
+ auto trim = _rcv.next - seg_beg;
+ if (trim) {
+ p.trim_front(trim);
+ seg_len -= trim;
+ }
+ _rcv.next += seg_len;
+ _rcv.data_size += p.len();
+ _rcv.data.push_back(std::move(p));
+ // Since C++11, erase() returns an iterator to the element following the erased one
+ it = _rcv.out_of_order.map.erase(it);
+ merged = true;
+ } else if (_rcv.next >= seg_end) {
+ // This segment has already been received, drop it
+ it = _rcv.out_of_order.map.erase(it);
+ } else {
+ // seg_beg > _rcv.next, cannot merge. Note, seg_beg can only grow,
+ // so we can stop looking here.
+ it++;
+ break;
+ }
+ }
+ return merged;
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::insert_out_of_order(tcp_seq seg, packet p) {
+ _rcv.out_of_order.merge(seg, std::move(p));
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::trim_receive_data_after_window() {
+ abort();
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::persist() {
+ tcp_debug("persist timer fired\n");
+ // Send 1 byte packet to probe peer's window size
+ _snd.window_probe = true;
+ _snd.zero_window_probing_out++;
+ output_one();
+ _snd.window_probe = false;
+
+ output();
+ // Perform binary exponential back-off per RFC1122
+ _persist_time_out = std::min(_persist_time_out * 2, _rto_max);
+ start_persist_timer();
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::retransmit() {
+ auto output_update_rto = [this] {
+ output();
+ // According to RFC6298, Update RTO <- RTO * 2 to perform binary exponential back-off
+ this->_rto = std::min(this->_rto * 2, this->_rto_max);
+ start_retransmit_timer();
+ };
+
+ // Retransmit SYN
+ if (syn_needs_on()) {
+ if (_snd.syn_retransmit++ < _max_nr_retransmit) {
+ output_update_rto();
+ } else {
+ _connect_done.set_exception(tcp_connect_error());
+ cleanup();
+ return;
+ }
+ }
+
+ // Retransmit FIN
+ if (fin_needs_on()) {
+ if (_snd.fin_retransmit++ < _max_nr_retransmit) {
+ output_update_rto();
+ } else {
+ cleanup();
+ return;
+ }
+ }
+
+ // Retransmit Data
+ if (_snd.data.empty()) {
+ return;
+ }
+
+ // If there are unacked data, retransmit the earliest segment
+ auto& unacked_seg = _snd.data.front();
+
+ // According to RFC5681
+ // Update ssthresh only for the first retransmit
+ uint32_t smss = _snd.mss;
+ if (unacked_seg.nr_transmits == 0) {
+ _snd.ssthresh = std::max(flight_size() / 2, 2 * smss);
+ }
+ // RFC6582 Step 4
+ _snd.recover = _snd.next - 1;
+ // Start the slow start process
+ _snd.cwnd = smss;
+ // End fast recovery
+ exit_fast_recovery();
+
+ if (unacked_seg.nr_transmits < _max_nr_retransmit) {
+ unacked_seg.nr_transmits++;
+ } else {
+ // Delete connection when max num of retransmission is reached
+ do_reset();
+ return;
+ }
+ retransmit_one();
+
+ output_update_rto();
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::fast_retransmit() {
+ if (!_snd.data.empty()) {
+ auto& unacked_seg = _snd.data.front();
+ unacked_seg.nr_transmits++;
+ retransmit_one();
+ output();
+ }
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::update_rto(clock_type::time_point tx_time) {
+ // Update RTO according to RFC6298
+ auto R = std::chrono::duration_cast<std::chrono::milliseconds>(clock_type::now() - tx_time);
+ if (_snd.first_rto_sample) {
+ _snd.first_rto_sample = false;
+ // RTTVAR <- R/2
+ // SRTT <- R
+ _snd.rttvar = R / 2;
+ _snd.srtt = R;
+ } else {
+ // RTTVAR <- (1 - beta) * RTTVAR + beta * |SRTT - R'|
+ // SRTT <- (1 - alpha) * SRTT + alpha * R'
+ // where alpha = 1/8 and beta = 1/4
+ auto delta = _snd.srtt > R ? (_snd.srtt - R) : (R - _snd.srtt);
+ _snd.rttvar = _snd.rttvar * 3 / 4 + delta / 4;
+ _snd.srtt = _snd.srtt * 7 / 8 + R / 8;
+ }
+ // RTO <- SRTT + max(G, K * RTTVAR)
+ _rto = _snd.srtt + std::max(_rto_clk_granularity, 4 * _snd.rttvar);
+
+ // Clamp _rto to the [_rto_min, _rto_max] range (roughly 1 sec to 60 sec)
+ _rto = std::max(_rto, _rto_min);
+ _rto = std::min(_rto, _rto_max);
+}
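+
+// Worked example of the arithmetic above (illustrative, ignoring integer
+// truncation): a first sample R = 100ms gives SRTT = 100ms, RTTVAR = 50ms and
+// RTO = 100 + 4 * 50 = 300ms, which is then raised to _rto_min if smaller.
+// A second sample R' = 200ms gives
+//   RTTVAR = 50 * 3/4 + |100 - 200| / 4 = 62.5ms
+//   SRTT   = 100 * 7/8 + 200 / 8       = 112.5ms
+//   RTO    = 112.5 + 4 * 62.5          = 362.5ms, again clamped to [_rto_min, _rto_max].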
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::update_cwnd(uint32_t acked_bytes) {
+ uint32_t smss = _snd.mss;
+ if (_snd.cwnd < _snd.ssthresh) {
+ // In slow start phase
+ _snd.cwnd += std::min(acked_bytes, smss);
+ } else {
+ // In congestion avoidance phase
+ uint32_t round_up = 1;
+ _snd.cwnd += std::max(round_up, smss * smss / _snd.cwnd);
+ }
+}
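+
+// Worked example (illustrative): with smss == 1460, an ACK covering 2920 bytes
+// grows cwnd by min(2920, 1460) = 1460 while cwnd < ssthresh (slow start).
+// Once cwnd >= ssthresh, e.g. cwnd == 14600, the same ACK grows it by
+// max(1, 1460 * 1460 / 14600) = 146 bytes, i.e. roughly one smss per
+// round trip's worth of full-sized ACKed segments.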
+
+template <typename InetTraits>
+void tcp<InetTraits>::tcb::cleanup() {
+ _snd.unsent.clear();
+ _snd.data.clear();
+ _rcv.out_of_order.map.clear();
+ _rcv.data_size = 0;
+ _rcv.data.clear();
+ stop_retransmit_timer();
+ clear_delayed_ack();
+ remove_from_tcbs();
+}
+
+template <typename InetTraits>
+tcp_seq tcp<InetTraits>::tcb::get_isn() {
+ // Per RFC6528, TCP SHOULD generate its Initial Sequence Numbers
+ // with the expression:
+ // ISN = M + F(localip, localport, remoteip, remoteport, secretkey)
+ // where M is a timer that ticks every 4 microseconds
+ using namespace std::chrono;
+ uint32_t hash[4];
+ hash[0] = _local_ip.ip;
+ hash[1] = _foreign_ip.ip;
+ hash[2] = (_local_port << 16) + _foreign_port;
+ hash[3] = _isn_secret.key[15];
+ CryptoPP::Weak::MD5::Transform(hash, _isn_secret.key);
+ auto seq = hash[0];
+ auto m = duration_cast<microseconds>(clock_type::now().time_since_epoch());
+ seq += m.count() / 4;
+ return make_seq(seq);
+}
+
+template <typename InetTraits>
+std::optional<typename InetTraits::l4packet> tcp<InetTraits>::tcb::get_packet() {
+ _poll_active = false;
+ if (_packetq.empty()) {
+ output_one();
+ }
+
+ if (in_state(CLOSED)) {
+ return std::optional<typename InetTraits::l4packet>();
+ }
+
+ assert(!_packetq.empty());
+
+ auto p = std::move(_packetq.front());
+ _packetq.pop_front();
+ if (!_packetq.empty() || (_snd.dupacks < 3 && can_send() > 0 && (_snd.window > 0))) {
+ // If there are packets to send in the queue, or the tcb is allowed to send
+ // more, add tcp back to the polling set to keep sending. In addition, dupacks >= 3
+ // is an indication that a segment was lost, so stop sending more in that case.
+ // Finally, we can't send more until the window is opened again.
+ output();
+ }
+ return p;
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::connection::close_read() {
+ _tcb->abort_reader();
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::connection::close_write() {
+ _tcb->close();
+}
+
+template <typename InetTraits>
+void tcp<InetTraits>::connection::shutdown_connect() {
+ if (_tcb->syn_needs_on()) {
+ _tcb->_connect_done.set_exception(tcp_refused_error());
+ _tcb->cleanup();
+ } else {
+ close_read();
+ close_write();
+ }
+}
+
+template <typename InetTraits>
+constexpr uint16_t tcp<InetTraits>::tcb::_max_nr_retransmit;
+
+template <typename InetTraits>
+constexpr std::chrono::milliseconds tcp<InetTraits>::tcb::_rto_min;
+
+template <typename InetTraits>
+constexpr std::chrono::milliseconds tcp<InetTraits>::tcb::_rto_max;
+
+template <typename InetTraits>
+constexpr std::chrono::milliseconds tcp<InetTraits>::tcb::_rto_clk_granularity;
+
+template <typename InetTraits>
+typename tcp<InetTraits>::tcb::isn_secret tcp<InetTraits>::tcb::_isn_secret;
+
+}
+
+}
diff --git a/src/seastar/include/seastar/net/tls.hh b/src/seastar/include/seastar/net/tls.hh
new file mode 100644
index 000000000..4f55b14e3
--- /dev/null
+++ b/src/seastar/include/seastar/net/tls.hh
@@ -0,0 +1,346 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2015 Cloudius Systems
+ */
+#pragma once
+
+#include <functional>
+#include <unordered_set>
+
+#include <boost/any.hpp>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/sstring.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/net/socket_defs.hh>
+#include <seastar/util/std-compat.hh>
+#include <seastar/net/api.hh>
+#include "../core/internal/api-level.hh"
+
+namespace seastar {
+
+class socket;
+
+class server_socket;
+class connected_socket;
+class socket_address;
+
+/**
+ * Relatively thin SSL wrapper for socket IO.
+ * (Can be expanded to other IO forms).
+ *
+ * The current underlying mechanism is
+ * gnutls, however, all interfaces are kept
+ * agnostic, so in theory it could be replaced
+ * with OpenSSL or similar.
+ *
+ */
+namespace tls {
+ enum class x509_crt_format {
+ DER,
+ PEM,
+ };
+
+ typedef std::basic_string_view<char> blob;
+
+ class session;
+ class server_session;
+ class server_credentials;
+ class certificate_credentials;
+ class credentials_builder;
+
+ /**
+ * Diffie-Hellman parameters for
+ * wire encryption.
+ */
+ class dh_params {
+ public:
+ // Key strength
+ enum class level {
+ LEGACY = 2,
+ MEDIUM = 3,
+ HIGH = 4,
+ ULTRA = 5
+ };
+ dh_params(level = level::LEGACY);
+ // loads a key from data
+ dh_params(const blob&, x509_crt_format);
+ ~dh_params();
+
+ dh_params(dh_params&&) noexcept;
+ dh_params& operator=(dh_params&&) noexcept;
+
+ dh_params(const dh_params&) = delete;
+ dh_params& operator=(const dh_params&) = delete;
+
+ /** loads a key from file */
+ static future<dh_params> from_file(const sstring&, x509_crt_format);
+ private:
+ class impl;
+ friend class server_credentials;
+ friend class certificate_credentials;
+ std::unique_ptr<impl> _impl;
+ };
+
+ class x509_cert {
+ x509_cert(const blob&, x509_crt_format);
+
+ static future<x509_cert> from_file(const sstring&, x509_crt_format);
+ private:
+ class impl;
+ x509_cert(shared_ptr<impl>);
+ shared_ptr<impl> _impl;
+ };
+
+ class abstract_credentials {
+ public:
+ virtual ~abstract_credentials() {};
+
+ virtual void set_x509_trust(const blob&, x509_crt_format) = 0;
+ virtual void set_x509_crl(const blob&, x509_crt_format) = 0;
+ virtual void set_x509_key(const blob& cert, const blob& key, x509_crt_format) = 0;
+
+ virtual void set_simple_pkcs12(const blob&, x509_crt_format, const sstring& password) = 0;
+
+ virtual future<> set_x509_trust_file(const sstring& cafile, x509_crt_format);
+ virtual future<> set_x509_crl_file(const sstring& crlfile, x509_crt_format);
+ virtual future<> set_x509_key_file(const sstring& cf, const sstring& kf, x509_crt_format);
+
+ virtual future<> set_simple_pkcs12_file(const sstring& pkcs12file, x509_crt_format, const sstring& password);
+ };
+
+ template<typename Base>
+ class reloadable_credentials;
+
+ /**
+ * Enum like tls::session::type but independent of gnutls headers
+ *
+ * \warning Uses a different internal encoding than tls::session::type
+ */
+ enum class session_type {
+ CLIENT, SERVER,
+ };
+
+ /**
+ * Callback prototype for receiving Distinguished Name (DN) information
+ *
+ * \param type Our own role in the TLS handshake (client vs. server)
+ * \param subject The subject DN string
+ * \param issuer The issuer DN string
+ */
+ using dn_callback = noncopyable_function<void(session_type type, sstring subject, sstring issuer)>;
+
+ /**
+ * Holds certificates and keys.
+ *
+ * Typically, credentials are shared for multiple client/server
+ * sessions. Changes to the credentials object will affect all
+ * sessions instantiated with it.
+ * You should probably set it up once, before starting client/server
+ * connections.
+ */
+ class certificate_credentials : public abstract_credentials {
+ public:
+ certificate_credentials();
+ ~certificate_credentials();
+
+ certificate_credentials(certificate_credentials&&) noexcept;
+ certificate_credentials& operator=(certificate_credentials&&) noexcept;
+
+ certificate_credentials(const certificate_credentials&) = delete;
+ certificate_credentials& operator=(const certificate_credentials&) = delete;
+
+ void set_x509_trust(const blob&, x509_crt_format) override;
+ void set_x509_crl(const blob&, x509_crt_format) override;
+ void set_x509_key(const blob& cert, const blob& key, x509_crt_format) override;
+ void set_simple_pkcs12(const blob&, x509_crt_format, const sstring& password) override;
+
+ /**
+ * Loads default system cert trust file
+ * into this object.
+ */
+ future<> set_system_trust();
+
+ // TODO add methods for certificate verification
+
+ /**
+ * TLS handshake priority string. See gnutls docs and syntax at
+ * https://gnutls.org/manual/html_node/Priority-Strings.html
+ *
+ * Allows specifying order and allowance for handshake alg.
+ */
+ void set_priority_string(const sstring&);
+
+ /**
+ * Register a callback for receiving Distinguished Name (DN) information
+ * during the TLS handshake, extracted from the certificate as sent by the peer.
+ *
+ * The callback is not invoked in case the peer did not send a certificate.
+ * (This could e.g. happen when we are the server, and a client connects while
+ * client_auth is not set to REQUIRE.)
+ *
+ * If, based upon the extracted DN information, you want to abort the handshake,
+ * then simply throw an exception from the callback (e.g. a verification_error).
+ *
+ * Registering this callback does not bypass the 'standard' certificate verification
+ * procedure; instead it merely extracts the DN information from the peer certificate
+ * (i.e., the 'leaf' certificate from the chain of certificates sent by the peer)
+ * and allows for extra checks.
+ *
+ * To keep the API simple, you can unregister the callback by means of registering
+ * an empty callback, i.e. dn_callback{}
+ *
+ * The callback prototype is documented in the dn_callback typedef.
+ */
+ void set_dn_verification_callback(dn_callback);
+
+ private:
+ class impl;
+ friend class session;
+ friend class server_session;
+ friend class server_credentials;
+ friend class credentials_builder;
+ template<typename Base>
+ friend class reloadable_credentials;
+ shared_ptr<impl> _impl;
+ };
+
+ /** Exception thrown on certificate validation error */
+ class verification_error : public std::runtime_error {
+ public:
+ using runtime_error::runtime_error;
+ };
+
+ enum class client_auth {
+ NONE, REQUEST, REQUIRE
+ };
+
+ /**
+ * Extending certificates and keys for server usage.
+ * More will probably go in here...
+ */
+ class server_credentials : public certificate_credentials {
+ public:
+ server_credentials();
+ server_credentials(shared_ptr<dh_params>);
+ server_credentials(const dh_params&);
+
+ server_credentials(server_credentials&&) noexcept;
+ server_credentials& operator=(server_credentials&&) noexcept;
+
+ server_credentials(const server_credentials&) = delete;
+ server_credentials& operator=(const server_credentials&) = delete;
+
+ void set_client_auth(client_auth);
+ };
+
+ class reloadable_credentials_base;
+
+ using reload_callback = std::function<void(const std::unordered_set<sstring>&, std::exception_ptr)>;
+
+ /**
+ * Intentionally "primitive", and more importantly, copyable
+ * container for certificate credentials options.
+ * The intended use case is to be usable across shards,
+ * at, say, initialization of tls objects.
+ *
+ * Note that loading invalid objects (malformed certs etc) will
+ * _not_ generate exceptions until, earliest, the build functions
+ * are called.
+ */
+ class credentials_builder : public abstract_credentials {
+ public:
+ void set_dh_level(dh_params::level = dh_params::level::LEGACY);
+
+ void set_x509_trust(const blob&, x509_crt_format) override;
+ void set_x509_crl(const blob&, x509_crt_format) override;
+ void set_x509_key(const blob& cert, const blob& key, x509_crt_format) override;
+ void set_simple_pkcs12(const blob&, x509_crt_format, const sstring& password) override;
+
+ future<> set_x509_trust_file(const sstring& cafile, x509_crt_format) override;
+ future<> set_x509_crl_file(const sstring& crlfile, x509_crt_format) override;
+ future<> set_x509_key_file(const sstring& cf, const sstring& kf, x509_crt_format) override;
+ future<> set_simple_pkcs12_file(const sstring& pkcs12file, x509_crt_format, const sstring& password) override;
+
+ future<> set_system_trust();
+ void set_client_auth(client_auth);
+ void set_priority_string(const sstring&);
+
+ void apply_to(certificate_credentials&) const;
+
+ shared_ptr<certificate_credentials> build_certificate_credentials() const;
+ shared_ptr<server_credentials> build_server_credentials() const;
+
+ // same as above, but any files used for certs/keys etc will be watched
+ // for modification and reloaded if changed
+ future<shared_ptr<certificate_credentials>> build_reloadable_certificate_credentials(reload_callback = {}) const;
+ future<shared_ptr<server_credentials>> build_reloadable_server_credentials(reload_callback = {}) const;
+ private:
+ friend class reloadable_credentials_base;
+
+ std::multimap<sstring, boost::any> _blobs;
+ client_auth _client_auth = client_auth::NONE;
+ sstring _priority;
+ };
+
+ /**
+ * Creates a TLS client connection using the default network stack and the
+ * supplied credentials.
+ * Typically these should contain enough information
+ * to validate the remote certificate (i.e. trust info).
+ *
+ * \param name An optional expected server name for the remote end point
+ */
+ /// @{
+ future<connected_socket> connect(shared_ptr<certificate_credentials>, socket_address, sstring name = {});
+ future<connected_socket> connect(shared_ptr<certificate_credentials>, socket_address, socket_address local, sstring name = {});
+ /// @}
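+
+ /**
+ * A minimal client-side usage sketch (illustrative; "ca.pem", addr and the
+ * expected server name are placeholders):
+ *
+ *   tls::credentials_builder builder;
+ *   co_await builder.set_x509_trust_file("ca.pem", tls::x509_crt_format::PEM);
+ *   auto creds = builder.build_certificate_credentials();
+ *   connected_socket cs = co_await tls::connect(creds, addr, "example.org");
+ */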
+
+ /**
+ * Creates a socket through which a TLS client connection can be created,
+ * using the default network stack and the supplied credentials.
+ * Typically these should contain enough information
+ * to validate the remote certificate (i.e. trust info).
+ *
+ * \param name An optional expected server name for the remote end point
+ */
+ /// @{
+ ::seastar::socket socket(shared_ptr<certificate_credentials>, sstring name = {});
+ /// @}
+
+ /** Wraps an existing connection in SSL/TLS. */
+ /// @{
+ future<connected_socket> wrap_client(shared_ptr<certificate_credentials>, connected_socket&&, sstring name = {});
+ future<connected_socket> wrap_server(shared_ptr<server_credentials>, connected_socket&&);
+ /// @}
+
+ /**
+ * Creates a server socket that accepts SSL/TLS clients using default network stack
+ * and the supplied credentials.
+ * The credentials object should contain certificate info
+ * for the server and optionally trust/crl data.
+ */
+ /// @{
+ server_socket listen(shared_ptr<server_credentials>, socket_address sa, listen_options opts = listen_options());
+ // Wraps an existing server socket in SSL
+ server_socket listen(shared_ptr<server_credentials>, server_socket);
+ /// @}
+}
+}
+
diff --git a/src/seastar/include/seastar/net/toeplitz.hh b/src/seastar/include/seastar/net/toeplitz.hh
new file mode 100644
index 000000000..01e1e8370
--- /dev/null
+++ b/src/seastar/include/seastar/net/toeplitz.hh
@@ -0,0 +1,99 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*-
+ * Copyright (c) 2010 David Malone <dwmalone@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <vector>
+
+namespace seastar {
+
+using rss_key_type = std::basic_string_view<uint8_t>;
+
+// Mellanox Linux's driver key
+static constexpr uint8_t default_rsskey_40bytes_v[] = {
+ 0xd1, 0x81, 0xc6, 0x2c, 0xf7, 0xf4, 0xdb, 0x5b,
+ 0x19, 0x83, 0xa2, 0xfc, 0x94, 0x3e, 0x1a, 0xdb,
+ 0xd9, 0x38, 0x9e, 0x6b, 0xd1, 0x03, 0x9c, 0x2c,
+ 0xa7, 0x44, 0x99, 0xad, 0x59, 0x3d, 0x56, 0xd9,
+ 0xf3, 0x25, 0x3c, 0x06, 0x2a, 0xdc, 0x1f, 0xfc
+};
+
+static constexpr rss_key_type default_rsskey_40bytes{default_rsskey_40bytes_v, sizeof(default_rsskey_40bytes_v)};
+
+// Intel's i40e PMD default RSS key
+static constexpr uint8_t default_rsskey_52bytes_v[] = {
+ 0x44, 0x39, 0x79, 0x6b, 0xb5, 0x4c, 0x50, 0x23,
+ 0xb6, 0x75, 0xea, 0x5b, 0x12, 0x4f, 0x9f, 0x30,
+ 0xb8, 0xa2, 0xc0, 0x3d, 0xdf, 0xdc, 0x4d, 0x02,
+ 0xa0, 0x8c, 0x9b, 0x33, 0x4a, 0xf6, 0x4a, 0x4c,
+ 0x05, 0xc6, 0xfa, 0x34, 0x39, 0x58, 0xd8, 0x55,
+ 0x7d, 0x99, 0x58, 0x3a, 0xe1, 0x38, 0xc9, 0x2e,
+ 0x81, 0x15, 0x03, 0x66
+};
+
+static constexpr rss_key_type default_rsskey_52bytes{default_rsskey_52bytes_v, sizeof(default_rsskey_52bytes_v)};
+
+template<typename T>
+static inline uint32_t
+toeplitz_hash(rss_key_type key, const T& data)
+{
+ uint32_t hash = 0, v;
+ u_int i, b;
+
+ /* XXXRW: Perhaps an assertion about key length vs. data length? */
+
+ v = (key[0]<<24) + (key[1]<<16) + (key[2] <<8) + key[3];
+ for (i = 0; i < data.size(); i++) {
+ for (b = 0; b < 8; b++) {
+ if (data[i] & (1<<(7-b)))
+ hash ^= v;
+ v <<= 1;
+ if ((i + 4) < key.size() &&
+ (key[i+4] & (1<<(7-b))))
+ v |= 1;
+ }
+ }
+ return (hash);
+}
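+
+// A minimal usage sketch (illustrative; addresses and ports are placeholders):
+// hash the 12-byte IPv4 RSS input (src addr, dst addr, src port, dst port, in
+// network byte order) with the Mellanox key:
+//
+//   std::vector<uint8_t> tuple = {
+//       192, 168, 0, 1,    // source IP
+//       192, 168, 0, 2,    // destination IP
+//       0x1f, 0x90,        // source port 8080
+//       0x00, 0x50,        // destination port 80
+//   };
+//   uint32_t h = toeplitz_hash(default_rsskey_40bytes, tuple);
+//   // h can then be used to pick an RSS queue, e.g. h % nr_queues.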
+
+}
diff --git a/src/seastar/include/seastar/net/udp.hh b/src/seastar/include/seastar/net/udp.hh
new file mode 100644
index 000000000..28f447970
--- /dev/null
+++ b/src/seastar/include/seastar/net/udp.hh
@@ -0,0 +1,59 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+
+#pragma once
+
+#include <unordered_map>
+#include <assert.h>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/net/api.hh>
+#include <seastar/net/const.hh>
+#include <seastar/net/net.hh>
+
+namespace seastar {
+
+namespace net {
+
+struct udp_hdr {
+ packed<uint16_t> src_port;
+ packed<uint16_t> dst_port;
+ packed<uint16_t> len;
+ packed<uint16_t> cksum;
+
+ template<typename Adjuster>
+ auto adjust_endianness(Adjuster a) {
+ return a(src_port, dst_port, len, cksum);
+ }
+} __attribute__((packed));
+
+struct udp_channel_state {
+ queue<udp_datagram> _queue;
+ // Limit the amount of data queued into the send queue
+ semaphore _user_queue_space = {212992};
+ udp_channel_state(size_t queue_size) : _queue(queue_size) {}
+ future<> wait_for_send_buffer(size_t len) { return _user_queue_space.wait(len); }
+ void complete_send(size_t len) { _user_queue_space.signal(len); }
+};
+
+}
+
+}
diff --git a/src/seastar/include/seastar/net/unix_address.hh b/src/seastar/include/seastar/net/unix_address.hh
new file mode 100644
index 000000000..b177cfbfc
--- /dev/null
+++ b/src/seastar/include/seastar/net/unix_address.hh
@@ -0,0 +1,75 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2019 Red Hat, Inc.
+ */
+#pragma once
+
+#include <iosfwd>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <string>
+
+namespace seastar {
+
+/*!
+ A helper struct for creating/manipulating UNIX-domain sockets.
+
+ A UNIX-domain socket is either named or unnamed. If named, the name is either
+ a path in the filesystem namespace, or an abstract-domain identifier. Abstract-domain
+ names start with a null byte, and may contain non-printable characters.
+
+ std::string can hold a sequence of arbitrary bytes, and has a length() attribute
+ that does not rely on using strlen(). Thus it is used here to hold the address.
+ */
+struct unix_domain_addr {
+ const std::string name;
+ const int path_count; // either name.length() or name.length()+1. See path_length_aux() below.
+
+ explicit unix_domain_addr(const std::string& fn) : name{fn}, path_count{path_length_aux()} {}
+
+ explicit unix_domain_addr(const char* fn) : name{fn}, path_count{path_length_aux()} {}
+
+ int path_length() const { return path_count; }
+
+ // the following holds:
+ // for abstract name: name.length() == number of meaningful bytes, including the null in name[0].
+ // for filesystem path: name.length() does not count the implicit terminating null.
+ // Here we tweak the outside-visible length of the address.
+ int path_length_aux() const {
+ auto pl = (int)name.length();
+ if (!pl || (name[0] == '\0')) {
+ // unnamed, or abstract-namespace
+ return pl;
+ }
+ return 1 + pl;
+ }
+
+ const char* path_bytes() const { return name.c_str(); }
+
+ bool operator==(const unix_domain_addr& a) const {
+ return name == a.name;
+ }
+ bool operator!=(const unix_domain_addr& a) const {
+ return !(*this == a);
+ }
+};
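+
+// Illustrative examples of the length convention above:
+//   unix_domain_addr{"/tmp/sock"}              -> path_length() == 10 (9 chars + implicit '\0')
+//   unix_domain_addr{std::string("\0abc", 4)}  -> path_length() == 4  (abstract-namespace name)
+//   unix_domain_addr{""}                       -> path_length() == 0  (unnamed socket)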
+
+std::ostream& operator<<(std::ostream&, const unix_domain_addr&);
+
+} // namespace seastar
diff --git a/src/seastar/include/seastar/net/virtio-interface.hh b/src/seastar/include/seastar/net/virtio-interface.hh
new file mode 100644
index 000000000..367806f01
--- /dev/null
+++ b/src/seastar/include/seastar/net/virtio-interface.hh
@@ -0,0 +1,131 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+
+#pragma once
+
+#ifndef _LINUX_VIRTIO_RING_H
+#define _LINUX_VIRTIO_RING_H
+
+/* An interface for efficient virtio implementation, currently for use by KVM
+ * and lguest, but hopefully others soon. Do NOT change this since it will
+ * break existing servers and clients.
+ *
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Copyright Rusty Russell IBM Corporation 2007. */
+
+/* This marks a buffer as continuing via the next field. */
+#define VRING_DESC_F_NEXT 1
+/* This marks a buffer as write-only (otherwise read-only). */
+#define VRING_DESC_F_WRITE 2
+/* This means the buffer contains a list of buffer descriptors. */
+#define VRING_DESC_F_INDIRECT 4
+
+/* The Host uses this in used->flags to advise the Guest: don't kick me when
+ * you add a buffer. It's unreliable, so it's simply an optimization. Guest
+ * will still kick if it's out of buffers. */
+#define VRING_USED_F_NO_NOTIFY 1
+/* The Guest uses this in avail->flags to advise the Host: don't interrupt me
+ * when you consume a buffer. It's unreliable, so it's simply an
+ * optimization. */
+#define VRING_AVAIL_F_NO_INTERRUPT 1
+
+/* We support indirect buffer descriptors */
+#define VIRTIO_RING_F_INDIRECT_DESC (1 << 28)
+
+/* The Guest publishes the used index for which it expects an interrupt
+ * at the end of the avail ring. Host should ignore the avail->flags field. */
+/* The Host publishes the avail index for which it expects a kick
+ * at the end of the used ring. Guest should ignore the used->flags field. */
+#define VIRTIO_RING_F_EVENT_IDX (1 << 29)
+
+/* The standard layout for the ring is a continuous chunk of memory which looks
+ * like this. We assume num is a power of 2.
+ *
+ * struct vring
+ * {
+ * // The actual descriptors (16 bytes each)
+ * struct vring_desc desc[num];
+ *
+ * // A ring of available descriptor heads with free-running index.
+ * uint16_t avail_flags;
+ * uint16_t avail_idx;
+ * uint16_t available[num];
+ * uint16_t used_event_idx;
+ *
+ * // Padding to the next align boundary.
+ * char pad[];
+ *
+ * // A ring of used descriptor heads with free-running index.
+ * uint16_t used_flags;
+ * uint16_t used_idx;
+ * struct vring_used_elem used[num];
+ * uint16_t avail_event_idx;
+ * };
+ */
+
+#endif
+
+#define VIRTIO_NET_F_CSUM (1 << 0)
+#define VIRTIO_NET_F_GUEST_CSUM (1 << 1)
+#define VIRTIO_NET_F_CTRL_GUEST_OFFLOADS (1 << 2)
+#define VIRTIO_NET_F_MAC (1 << 5)
+#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7)
+#define VIRTIO_NET_F_GUEST_TSO6 (1 << 8)
+#define VIRTIO_NET_F_GUEST_ECN (1 << 9)
+#define VIRTIO_NET_F_GUEST_UFO (1 << 10)
+#define VIRTIO_NET_F_HOST_TSO4 (1 << 11)
+#define VIRTIO_NET_F_HOST_TSO6 (1 << 12)
+#define VIRTIO_NET_F_HOST_ECN (1 << 13)
+#define VIRTIO_NET_F_HOST_UFO (1 << 14)
+#define VIRTIO_NET_F_MRG_RXBUF (1 << 15)
+#define VIRTIO_NET_F_STATUS (1 << 16)
+#define VIRTIO_NET_F_CTRL_VQ (1 << 17)
+#define VIRTIO_NET_F_CTRL_RX (1 << 18)
+#define VIRTIO_NET_F_CTRL_VLAN (1 << 19)
+#define VIRTIO_NET_F_GUEST_ANNOUNCE (1 << 21)
+#define VIRTIO_NET_F_MQ (1 << 22)
+#define VIRTIO_NET_F_CTRL_MAC_ADDR (1 << 23)
diff --git a/src/seastar/include/seastar/net/virtio.hh b/src/seastar/include/seastar/net/virtio.hh
new file mode 100644
index 000000000..4e530ef6e
--- /dev/null
+++ b/src/seastar/include/seastar/net/virtio.hh
@@ -0,0 +1,33 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <memory>
+#include <seastar/net/net.hh>
+#include <seastar/core/sstring.hh>
+
+namespace seastar {
+
+std::unique_ptr<net::device> create_virtio_net_device(boost::program_options::variables_map opts = boost::program_options::variables_map());
+boost::program_options::options_description get_virtio_net_options_description();
+
+}
diff --git a/src/seastar/include/seastar/rpc/lz4_compressor.hh b/src/seastar/include/seastar/rpc/lz4_compressor.hh
new file mode 100644
index 000000000..3a4523623
--- /dev/null
+++ b/src/seastar/include/seastar/rpc/lz4_compressor.hh
@@ -0,0 +1,48 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 Scylladb, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/sstring.hh>
+#include <seastar/rpc/rpc_types.hh>
+#include <lz4.h>
+
+namespace seastar {
+
+namespace rpc {
+ class lz4_compressor : public compressor {
+ public:
+ class factory: public rpc::compressor::factory {
+ public:
+ virtual const sstring& supported() const override;
+ virtual std::unique_ptr<rpc::compressor> negotiate(sstring feature, bool is_server) const override;
+ };
+ public:
+ ~lz4_compressor() {}
+ // compress data, leaving head_space empty in returned buffer
+ snd_buf compress(size_t head_space, snd_buf data) override;
+ // decompress data
+ rcv_buf decompress(rcv_buf data) override;
+ sstring name() const override;
+ };
+}
+
+}
diff --git a/src/seastar/include/seastar/rpc/lz4_fragmented_compressor.hh b/src/seastar/include/seastar/rpc/lz4_fragmented_compressor.hh
new file mode 100644
index 000000000..a3ad29705
--- /dev/null
+++ b/src/seastar/include/seastar/rpc/lz4_fragmented_compressor.hh
@@ -0,0 +1,44 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2019 Scylladb, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/sstring.hh>
+#include <seastar/rpc/rpc_types.hh>
+
+namespace seastar {
+namespace rpc {
+
+class lz4_fragmented_compressor final : public compressor {
+public:
+ class factory final : public rpc::compressor::factory {
+ public:
+ virtual const sstring& supported() const override;
+ virtual std::unique_ptr<rpc::compressor> negotiate(sstring feature, bool is_server) const override;
+ };
+public:
+ virtual snd_buf compress(size_t head_space, snd_buf data) override;
+ virtual rcv_buf decompress(rcv_buf data) override;
+ sstring name() const override;
+};
+
+}
+}
diff --git a/src/seastar/include/seastar/rpc/multi_algo_compressor_factory.hh b/src/seastar/include/seastar/rpc/multi_algo_compressor_factory.hh
new file mode 100644
index 000000000..7a4380dc5
--- /dev/null
+++ b/src/seastar/include/seastar/rpc/multi_algo_compressor_factory.hh
@@ -0,0 +1,80 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 Scylladb, Ltd.
+ */
+
+#pragma once
+
+#include <boost/range/adaptor/transformed.hpp>
+#include <boost/algorithm/string.hpp>
+#include <seastar/core/sstring.hh>
+#include <seastar/rpc/rpc_types.hh>
+
+namespace seastar {
+
+namespace rpc {
+
+// This is a meta compressor factory. It takes an array of regular factories, each
+// supporting one compression algorithm, and negotiates a common compression algorithm
+// that is supported by both the client and the server. The order of algorithm preference
+// is the order in which they appear in the client's list.
+class multi_algo_compressor_factory : public rpc::compressor::factory {
+ std::vector<const rpc::compressor::factory*> _factories;
+ sstring _features;
+
+public:
+ multi_algo_compressor_factory(std::vector<const rpc::compressor::factory*> factories) : _factories(std::move(factories)) {
+ _features = boost::algorithm::join(_factories | boost::adaptors::transformed(std::mem_fn(&rpc::compressor::factory::supported)), sstring(","));
+ }
+ multi_algo_compressor_factory(std::initializer_list<const rpc::compressor::factory*> factories) :
+ multi_algo_compressor_factory(std::vector<const rpc::compressor::factory*>(std::move(factories))) {}
+ multi_algo_compressor_factory(const rpc::compressor::factory* factory) : multi_algo_compressor_factory({factory}) {}
+ // return feature string that will be sent as part of protocol negotiation
+ virtual const sstring& supported() const {
+ return _features;
+ }
+ // negotiate compress algorithm
+ virtual std::unique_ptr<compressor> negotiate(sstring feature, bool is_server) const {
+ std::vector<sstring> names;
+ boost::split(names, feature, boost::is_any_of(","));
+ std::unique_ptr<compressor> c;
+ if (is_server) {
+ for (auto&& n : names) {
+ for (auto&& f : _factories) {
+ if ((c = f->negotiate(n, is_server))) {
+ return c;
+ }
+ }
+ }
+ } else {
+ for (auto&& f : _factories) {
+ for (auto&& n : names) {
+ if ((c = f->negotiate(n, is_server))) {
+ return c;
+ }
+ }
+ }
+ }
+ return nullptr;
+ }
+};
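+
+// A minimal usage sketch (illustrative), assuming the factories from
+// lz4_compressor.hh and lz4_fragmented_compressor.hh are available: prefer the
+// fragmented LZ4 algorithm and fall back to plain LZ4 if that is all the peer
+// supports:
+//
+//   static rpc::lz4_fragmented_compressor::factory lz4_frag_factory;
+//   static rpc::lz4_compressor::factory lz4_factory;
+//   static rpc::multi_algo_compressor_factory comp_factory({&lz4_frag_factory, &lz4_factory});
+//   // then point client_options::compressor_factory and
+//   // server_options::compressor_factory at &comp_factory.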
+
+}
+
+}
diff --git a/src/seastar/include/seastar/rpc/rpc.hh b/src/seastar/include/seastar/rpc/rpc.hh
new file mode 100644
index 000000000..1dfc6c12b
--- /dev/null
+++ b/src/seastar/include/seastar/rpc/rpc.hh
@@ -0,0 +1,870 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <unordered_map>
+#include <unordered_set>
+#include <list>
+#include <seastar/core/future.hh>
+#include <seastar/core/seastar.hh>
+#include <seastar/net/api.hh>
+#include <seastar/core/iostream.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/condition-variable.hh>
+#include <seastar/core/gate.hh>
+#include <seastar/rpc/rpc_types.hh>
+#include <seastar/core/byteorder.hh>
+#include <seastar/core/shared_future.hh>
+#include <seastar/core/queue.hh>
+#include <seastar/core/weak_ptr.hh>
+#include <seastar/core/scheduling.hh>
+#include <seastar/util/backtrace.hh>
+#include <seastar/util/log.hh>
+
+namespace seastar {
+
+namespace rpc {
+
+/// \defgroup rpc rpc - remote procedure call framework
+///
+/// \brief
+/// rpc is a framework that can be used to define client-server communication
+/// protocols.
+/// For a high-level description of the RPC features see
+/// [doc/rpc.md](./md_rpc.html),
+/// [doc/rpc-streaming.md](./md_rpc-streaming.html) and
+/// [doc/rpc-compression.md](./md_rpc-compression.html)
+///
+/// The entry point for setting up an rpc protocol is
+/// seastar::rpc::protocol.
+
+using id_type = int64_t;
+
+using rpc_semaphore = basic_semaphore<semaphore_default_exception_factory, rpc_clock_type>;
+using resource_permit = semaphore_units<semaphore_default_exception_factory, rpc_clock_type>;
+
+static constexpr char rpc_magic[] = "SSTARRPC";
+
+/// \addtogroup rpc
+/// @{
+
+/// Specifies resource isolation for a connection.
+struct isolation_config {
+ /// Specifies a scheduling group under which the connection (and all its
+ /// verb handlers) will execute.
+ scheduling_group sched_group = current_scheduling_group();
+};
+
+/// Default isolation configuration - run everything in the default scheduling group.
+///
+/// That is, in the scheduling_group that the protocol::server was created in.
+isolation_config default_isolate_connection(sstring isolation_cookie);
+
+/// \brief Resource limits for an RPC server
+///
+/// A request's memory use will be estimated as
+///
+/// req_mem = basic_request_size + sizeof(serialized_request) * bloat_factor
+///
+/// Concurrent requests will be limited so that
+///
+/// sum(req_mem) <= max_memory
+///
+/// \see server
+struct resource_limits {
+ size_t basic_request_size = 0; ///< Minimum request footprint in memory
+ unsigned bloat_factor = 1; ///< Serialized size multiplied by this to estimate memory used by request
+ size_t max_memory = rpc_semaphore::max_counter(); ///< Maximum amount of memory that may be consumed by all requests
+ /// Configures isolation for a connection based on its isolation cookie. May throw,
+ /// in which case the connection will be terminated.
+ std::function<isolation_config (sstring isolation_cookie)> isolate_connection = default_isolate_connection;
+};
+
+struct client_options {
+ std::optional<net::tcp_keepalive_params> keepalive;
+ bool tcp_nodelay = true;
+ bool reuseaddr = false;
+ compressor::factory* compressor_factory = nullptr;
+ bool send_timeout_data = true;
+ connection_id stream_parent = invalid_connection_id;
+ /// Configures how this connection is isolated from other connections on the same server.
+ ///
+ /// \see resource_limits::isolate_connection
+ sstring isolation_cookie;
+};
+
+/// @}
+
+// An RPC call that passes a stream connection id as a parameter
+// may arrive at a different shard from the one where the stream connection
+// was opened, so the connection id is not known to the server that handles
+// the RPC call. The shard that the stream connection belongs to is known,
+// since it is a part of the connection id, but this is not enough to locate
+// the server instance the connection belongs to if there is more than one
+// server on the shard. The streaming domain parameter is here to help with that.
+// Different servers on all shards logically belonging to the same service should
+// belong to the same streaming domain. Only one server on each shard can belong to
+// a particular streaming domain.
+class streaming_domain_type {
+ uint64_t _id;
+public:
+ explicit streaming_domain_type(uint64_t id) : _id(id) {}
+ bool operator==(const streaming_domain_type& o) const {
+ return _id == o._id;
+ }
+ friend struct std::hash<streaming_domain_type>;
+ friend std::ostream& operator<<(std::ostream&, const streaming_domain_type&);
+};
+
+/// \addtogroup rpc
+/// @{
+
+struct server_options {
+ compressor::factory* compressor_factory = nullptr;
+ bool tcp_nodelay = true;
+ std::optional<streaming_domain_type> streaming_domain;
+ server_socket::load_balancing_algorithm load_balancing_algorithm = server_socket::load_balancing_algorithm::default_;
+};
+
+/// @}
+
+inline
+size_t
+estimate_request_size(const resource_limits& lim, size_t serialized_size) {
+ return lim.basic_request_size + serialized_size * lim.bloat_factor;
+}
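+
+// Worked example (illustrative): with basic_request_size == 1000, bloat_factor == 2
+// and a 500-byte serialized request, estimate_request_size() yields
+// 1000 + 500 * 2 = 2000 bytes; concurrent requests are admitted as long as the
+// running sum of such estimates stays within resource_limits::max_memory.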
+
+struct negotiation_frame {
+ char magic[sizeof(rpc_magic) - 1];
+ uint32_t len; // additional negotiation data length; multiple negotiation_frame_feature_record structs
+};
+
+enum class protocol_features : uint32_t {
+ COMPRESS = 0,
+ TIMEOUT = 1,
+ CONNECTION_ID = 2,
+ STREAM_PARENT = 3,
+ ISOLATION = 4,
+};
+
+// internal representation of feature data
+using feature_map = std::map<protocol_features, sstring>;
+
+// An rpc signature, in the form signature<Ret (In0, In1, In2)>.
+template <typename Function>
+struct signature;
+
+class logger {
+ std::function<void(const sstring&)> _logger;
+ ::seastar::logger* _seastar_logger = nullptr;
+
+ // _seastar_logger will always be used first if it's available
+ void log(const sstring& str) const {
+ if (_seastar_logger) {
+ // default level for log messages is `info`
+ _seastar_logger->info("{}", str);
+ } else if (_logger) {
+ _logger(str);
+ }
+ }
+
+ // _seastar_logger will always be used first if it's available
+ template <typename... Args>
+ void log(log_level level, const char* fmt, Args&&... args) const {
+ if (_seastar_logger) {
+ _seastar_logger->log(level, fmt, std::forward<Args>(args)...);
+ // If the log level is at least `info`, fall back to legacy logging without explicit level.
+ // Ignore less severe levels in order not to spam user's log with messages during transition,
+ // i.e. when the user still only defines a level-less logger.
+ } else if (_logger && level <= log_level::info) {
+ _logger(format(fmt, std::forward<Args>(args)...));
+ }
+ }
+
+public:
+ void set(std::function<void(const sstring&)> l) {
+ _logger = std::move(l);
+ }
+
+ void set(::seastar::logger* logger) {
+ _seastar_logger = logger;
+ }
+
+ void operator()(const client_info& info, id_type msg_id, const sstring& str) const;
+ void operator()(const client_info& info, id_type msg_id, log_level level, std::string_view str) const;
+
+ void operator()(const client_info& info, const sstring& str) const;
+ void operator()(const client_info& info, log_level level, std::string_view str) const;
+
+ void operator()(const socket_address& addr, const sstring& str) const;
+ void operator()(const socket_address& addr, log_level level, std::string_view str) const;
+};
+
+class connection {
+protected:
+ connected_socket _fd;
+ input_stream<char> _read_buf;
+ output_stream<char> _write_buf;
+ bool _error = false;
+ bool _connected = false;
+ promise<> _stopped;
+ stats _stats;
+ const logger& _logger;
+ // The owner of the pointer below is an instance of the rpc::protocol<Serializer> class.
+ // The type of the pointer is erased here, but the original type is Serializer.
+ void* _serializer;
+ struct outgoing_entry {
+ timer<rpc_clock_type> t;
+ snd_buf buf;
+ std::optional<promise<>> p = promise<>();
+ cancellable* pcancel = nullptr;
+ outgoing_entry(snd_buf b) : buf(std::move(b)) {}
+ outgoing_entry(outgoing_entry&& o) noexcept : t(std::move(o.t)), buf(std::move(o.buf)), p(std::move(o.p)), pcancel(o.pcancel) {
+ o.p = std::nullopt;
+ }
+ ~outgoing_entry() {
+ if (p) {
+ if (pcancel) {
+ pcancel->cancel_send = std::function<void()>();
+ pcancel->send_back_pointer = nullptr;
+ }
+ p->set_value();
+ }
+ }
+ };
+ friend outgoing_entry;
+ std::list<outgoing_entry> _outgoing_queue;
+ condition_variable _outgoing_queue_cond;
+ future<> _send_loop_stopped = make_ready_future<>();
+ std::unique_ptr<compressor> _compressor;
+ bool _timeout_negotiated = false;
+ // stream related fields
+ bool _is_stream = false;
+ connection_id _id = invalid_connection_id;
+
+ std::unordered_map<connection_id, xshard_connection_ptr> _streams;
+ queue<rcv_buf> _stream_queue = queue<rcv_buf>(max_queued_stream_buffers);
+ semaphore _stream_sem = semaphore(max_stream_buffers_memory);
+ bool _sink_closed = true;
+ bool _source_closed = true;
+ // The future holds whether the sink is already closed;
+ // if it is not ready, it means the sink is currently being closed.
+ future<bool> _sink_closed_future = make_ready_future<bool>(false);
+
+ bool is_stream() {
+ return _is_stream;
+ }
+
+ snd_buf compress(snd_buf buf);
+ future<> send_buffer(snd_buf buf);
+
+ enum class outgoing_queue_type {
+ request,
+ response,
+ stream = response
+ };
+
+ template<outgoing_queue_type QueueType> void send_loop();
+ future<> stop_send_loop();
+ future<std::optional<rcv_buf>> read_stream_frame_compressed(input_stream<char>& in);
+ bool stream_check_twoway_closed() {
+ return _sink_closed && _source_closed;
+ }
+ future<> stream_close();
+ future<> stream_process_incoming(rcv_buf&&);
+ future<> handle_stream_frame();
+
+public:
+ connection(connected_socket&& fd, const logger& l, void* s, connection_id id = invalid_connection_id) : connection(l, s, id) {
+ set_socket(std::move(fd));
+ }
+ connection(const logger& l, void* s, connection_id id = invalid_connection_id) : _logger(l), _serializer(s), _id(id) {}
+ virtual ~connection() {}
+ void set_socket(connected_socket&& fd);
+ future<> send_negotiation_frame(feature_map features);
+ // functions below are public because they are used by external heavily templated functions
+ // and I am not smart enough to know how to define them as friends
+ future<> send(snd_buf buf, std::optional<rpc_clock_type::time_point> timeout = {}, cancellable* cancel = nullptr);
+ bool error() { return _error; }
+ void abort();
+ future<> stop();
+ future<> stream_receive(circular_buffer<foreign_ptr<std::unique_ptr<rcv_buf>>>& bufs);
+ future<> close_sink() {
+ _sink_closed = true;
+ if (stream_check_twoway_closed()) {
+ return stream_close();
+ }
+ return make_ready_future();
+ }
+ bool sink_closed() {
+ return _sink_closed;
+ }
+ future<> close_source() {
+ _source_closed = true;
+ if (stream_check_twoway_closed()) {
+ return stream_close();
+ }
+ return make_ready_future();
+ }
+ connection_id get_connection_id() const {
+ return _id;
+ }
+ xshard_connection_ptr get_stream(connection_id id) const;
+ void register_stream(connection_id id, xshard_connection_ptr c);
+ virtual socket_address peer_address() const = 0;
+
+ const logger& get_logger() const {
+ return _logger;
+ }
+
+ template<typename Serializer>
+ Serializer& serializer() {
+ return *static_cast<Serializer*>(_serializer);
+ }
+
+ template <typename FrameType>
+ typename FrameType::return_type read_frame(socket_address info, input_stream<char>& in);
+
+ template <typename FrameType>
+ typename FrameType::return_type read_frame_compressed(socket_address info, std::unique_ptr<compressor>& compressor, input_stream<char>& in);
+ friend class client;
+ template<typename Serializer, typename... Out>
+ friend class sink_impl;
+ template<typename Serializer, typename... In>
+ friend class source_impl;
+};
+
+// send data Out...
+template<typename Serializer, typename... Out>
+class sink_impl : public sink<Out...>::impl {
+public:
+ sink_impl(xshard_connection_ptr con) : sink<Out...>::impl(std::move(con)) { this->_con->get()->_sink_closed = false; }
+ future<> operator()(const Out&... args) override;
+ future<> close() override;
+ future<> flush() override;
+ ~sink_impl() override;
+};
+
+// receive data In...
+template<typename Serializer, typename... In>
+class source_impl : public source<In...>::impl {
+public:
+ source_impl(xshard_connection_ptr con) : source<In...>::impl(std::move(con)) { this->_con->get()->_source_closed = false; }
+ future<std::optional<std::tuple<In...>>> operator()() override;
+};
+
+class client : public rpc::connection, public weakly_referencable<client> {
+ socket _socket;
+ id_type _message_id = 1;
+ struct reply_handler_base {
+ timer<rpc_clock_type> t;
+ cancellable* pcancel = nullptr;
+ virtual void operator()(client&, id_type, rcv_buf data) = 0;
+ virtual void timeout() {}
+ virtual void cancel() {}
+ virtual ~reply_handler_base() {
+ if (pcancel) {
+ pcancel->cancel_wait = std::function<void()>();
+ pcancel->wait_back_pointer = nullptr;
+ }
+ };
+ };
+public:
+ template<typename Reply, typename Func>
+ struct reply_handler final : reply_handler_base {
+ Func func;
+ Reply reply;
+ reply_handler(Func&& f) : func(std::move(f)) {}
+ virtual void operator()(client& client, id_type msg_id, rcv_buf data) override {
+ return func(reply, client, msg_id, std::move(data));
+ }
+ virtual void timeout() override {
+ reply.done = true;
+ reply.p.set_exception(timeout_error());
+ }
+ virtual void cancel() override {
+ reply.done = true;
+ reply.p.set_exception(canceled_error());
+ }
+ virtual ~reply_handler() {}
+ };
+private:
+ std::unordered_map<id_type, std::unique_ptr<reply_handler_base>> _outstanding;
+ socket_address _server_addr;
+ client_options _options;
+ std::optional<shared_promise<>> _client_negotiated = shared_promise<>();
+ weak_ptr<client> _parent; // for stream clients
+
+private:
+ future<> negotiate_protocol(input_stream<char>& in);
+ void negotiate(feature_map server_features);
+ future<std::tuple<int64_t, std::optional<rcv_buf>>>
+ read_response_frame(input_stream<char>& in);
+ future<std::tuple<int64_t, std::optional<rcv_buf>>>
+ read_response_frame_compressed(input_stream<char>& in);
+ void send_loop() {
+ if (is_stream()) {
+ rpc::connection::send_loop<rpc::connection::outgoing_queue_type::stream>();
+ } else {
+ rpc::connection::send_loop<rpc::connection::outgoing_queue_type::request>();
+ }
+ }
+public:
+ /**
+ * Create client object which will attempt to connect to the remote address.
+ *
+ * @param l \ref seastar::logger to use for logging error messages
+ * @param s an optional connection serializer
+ * @param addr the remote address identifying this client
+ * @param local the local address of this client
+ */
+ client(const logger& l, void* s, const socket_address& addr, const socket_address& local = {});
+ client(const logger& l, void* s, client_options options, const socket_address& addr, const socket_address& local = {});
+
+ /**
+ * Create client object which will attempt to connect to the remote address using the
+ * specified seastar::socket.
+ *
+ * @param l \ref seastar::logger to use for logging error messages
+ * @param s an optional connection serializer
+ * @param addr the remote address identifying this client
+ * @param local the local address of this client
+ * @param socket the socket object used to connect to the remote address
+ */
+ client(const logger& l, void* s, socket socket, const socket_address& addr, const socket_address& local = {});
+ client(const logger& l, void* s, client_options options, socket socket, const socket_address& addr, const socket_address& local = {});
+
+ stats get_stats() const;
+ stats& get_stats_internal() {
+ return _stats;
+ }
+ auto next_message_id() { return _message_id++; }
+ void wait_for_reply(id_type id, std::unique_ptr<reply_handler_base>&& h, std::optional<rpc_clock_type::time_point> timeout, cancellable* cancel);
+ void wait_timed_out(id_type id);
+ future<> stop();
+ void abort_all_streams();
+ void deregister_this_stream();
+ socket_address peer_address() const override {
+ return _server_addr;
+ }
+ future<> await_connection() {
+ if (!_client_negotiated) {
+ return make_ready_future<>();
+ } else {
+ return _client_negotiated->get_shared_future();
+ }
+ }
+ template<typename Serializer, typename... Out>
+ future<sink<Out...>> make_stream_sink(socket socket) {
+ return await_connection().then([this, socket = std::move(socket)] () mutable {
+ if (!this->get_connection_id()) {
+ return make_exception_future<sink<Out...>>(std::runtime_error("Streaming is not supported by the server"));
+ }
+ client_options o = _options;
+ o.stream_parent = this->get_connection_id();
+ o.send_timeout_data = false;
+ auto c = make_shared<client>(_logger, _serializer, o, std::move(socket), _server_addr);
+ c->_parent = this->weak_from_this();
+ c->_is_stream = true;
+ return c->await_connection().then([c, this] {
+ xshard_connection_ptr s = make_lw_shared(make_foreign(static_pointer_cast<rpc::connection>(c)));
+ this->register_stream(c->get_connection_id(), s);
+ return sink<Out...>(make_shared<sink_impl<Serializer, Out...>>(std::move(s)));
+ });
+ });
+ }
+ template<typename Serializer, typename... Out>
+ future<sink<Out...>> make_stream_sink() {
+ return make_stream_sink<Serializer, Out...>(make_socket());
+ }
+};
+
+class protocol_base;
+
+class server {
+private:
+ static thread_local std::unordered_map<streaming_domain_type, server*> _servers;
+
+public:
+ class connection : public rpc::connection, public enable_shared_from_this<connection> {
+ server& _server;
+ client_info _info;
+ connection_id _parent_id = invalid_connection_id;
+ std::optional<isolation_config> _isolation_config;
+ private:
+ future<> negotiate_protocol(input_stream<char>& in);
+ future<std::tuple<std::optional<uint64_t>, uint64_t, int64_t, std::optional<rcv_buf>>>
+ read_request_frame_compressed(input_stream<char>& in);
+ future<feature_map> negotiate(feature_map requested);
+ void send_loop() {
+ if (is_stream()) {
+ rpc::connection::send_loop<rpc::connection::outgoing_queue_type::stream>();
+ } else {
+ rpc::connection::send_loop<rpc::connection::outgoing_queue_type::response>();
+ }
+ }
+ future<> send_unknown_verb_reply(std::optional<rpc_clock_type::time_point> timeout, int64_t msg_id, uint64_t type);
+ public:
+ connection(server& s, connected_socket&& fd, socket_address&& addr, const logger& l, void* serializer, connection_id id);
+ future<> process();
+ future<> respond(int64_t msg_id, snd_buf&& data, std::optional<rpc_clock_type::time_point> timeout);
+ client_info& info() { return _info; }
+ const client_info& info() const { return _info; }
+ stats get_stats() const {
+ stats res = _stats;
+ res.pending = _outgoing_queue.size();
+ return res;
+ }
+
+ stats& get_stats_internal() {
+ return _stats;
+ }
+ socket_address peer_address() const override {
+ return _info.addr;
+ }
+ // Resources will be released when the returned permit goes out of scope
+ future<resource_permit> wait_for_resources(size_t memory_consumed, std::optional<rpc_clock_type::time_point> timeout) {
+ if (timeout) {
+ return get_units(_server._resources_available, memory_consumed, *timeout);
+ } else {
+ return get_units(_server._resources_available, memory_consumed);
+ }
+ }
+ size_t estimate_request_size(size_t serialized_size) {
+ return rpc::estimate_request_size(_server._limits, serialized_size);
+ }
+ size_t max_request_size() const {
+ return _server._limits.max_memory;
+ }
+ server& get_server() {
+ return _server;
+ }
+ future<> deregister_this_stream();
+ };
+private:
+ protocol_base* _proto;
+ server_socket _ss;
+ resource_limits _limits;
+ rpc_semaphore _resources_available;
+ std::unordered_map<connection_id, shared_ptr<connection>> _conns;
+ promise<> _ss_stopped;
+ gate _reply_gate;
+ server_options _options;
+ uint64_t _next_client_id = 1;
+
+public:
+ server(protocol_base* proto, const socket_address& addr, resource_limits memory_limit = resource_limits());
+ server(protocol_base* proto, server_options opts, const socket_address& addr, resource_limits memory_limit = resource_limits());
+ server(protocol_base* proto, server_socket, resource_limits memory_limit = resource_limits(), server_options opts = server_options{});
+ server(protocol_base* proto, server_options opts, server_socket, resource_limits memory_limit = resource_limits());
+ void accept();
+ future<> stop();
+ template<typename Func>
+ void foreach_connection(Func&& f) {
+ for (auto c : _conns) {
+ f(*c.second);
+ }
+ }
+ gate& reply_gate() {
+ return _reply_gate;
+ }
+ friend connection;
+ friend client;
+};
+
+using rpc_handler_func = std::function<future<> (shared_ptr<server::connection>, std::optional<rpc_clock_type::time_point> timeout, int64_t msgid,
+ rcv_buf data)>;
+
+struct rpc_handler {
+ scheduling_group sg;
+ rpc_handler_func func;
+ gate use_gate;
+};
+
+class protocol_base {
+public:
+ virtual ~protocol_base() {};
+ virtual shared_ptr<server::connection> make_server_connection(rpc::server& server, connected_socket fd, socket_address addr, connection_id id) = 0;
+protected:
+ friend class server;
+
+ virtual rpc_handler* get_handler(uint64_t msg_id) = 0;
+ virtual void put_handler(rpc_handler*) = 0;
+};
+
+/// \addtogroup rpc
+/// @{
+
+/// Defines a protocol for communication between a server and a client.
+///
+/// A protocol is defined by a `Serializer` and a `MsgType`. The `Serializer` is
+/// responsible for serializing and deserializing all types used as arguments and
+/// return types in the protocol. The `Serializer` is expected to define a
+/// `read()` and `write()` method for each such type `T` as follows:
+///
+/// template <typename Output>
+/// void write(const serializer&, Output& output, const T& data);
+///
+/// template <typename Input>
+/// T read(const serializer&, Input& input, type<T> type_tag); // type_tag used to disambiguate
+///
+/// Where `Input` and `Output` provide `void read(char*, size_t)` and
+/// `void write(const char*, size_t)` methods, respectively.
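+///
+/// A minimal serializer sketch for a single trivially-copyable type. The
+/// `my_serializer` name and the raw fixed-width encoding are illustrative
+/// assumptions, not something this header mandates:
+///
+///     struct my_serializer {};
+///
+///     template <typename Output>
+///     void write(const my_serializer&, Output& out, const uint32_t& v) {
+///         // write the value as raw bytes through the Output concept
+///         out.write(reinterpret_cast<const char*>(&v), sizeof(v));
+///     }
+///
+///     template <typename Input>
+///     uint32_t read(const my_serializer&, Input& in, rpc::type<uint32_t>) {
+///         uint32_t v;
+///         in.read(reinterpret_cast<char*>(&v), sizeof(v));
+///         return v;
+///     }
+///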
+/// `MsgType` defines the type to be used as the message id, the id which is
+/// used to identify different messages used in the protocol. These are also
+/// often referred to as "verbs". The client uses the message id to specify
+/// the remote method (verb) to invoke on the server. The server uses the
+/// message id to dispatch the incoming call to the right handler.
+/// `MsgType` should be hashable and serializable. It is preferable to use an
+/// enum for message types, but do not forget to provide a hash function for it.
+///
+/// Use register_handler() on the server to define the available verbs and the
+/// code to be executed when they are invoked by clients. Use make_client() on
+/// the client to create a matching callable that can be used to invoke the
+/// verb on the server and wait for its result. Note that register_handler()
+/// also returns a client, that can be used to invoke the registered verb on
+/// another node (given that the other node has the same verb). This is useful
+/// for symmetric protocols, where two or more nodes all have servers as well as
+/// connect to the other nodes as clients.
+///
+/// Use protocol::server to listen for and accept incoming connections on the
+/// server and protocol::client to establish connections to the server.
+/// Note that registering the available verbs can be done before or after
+/// listening for connections, but it is best to ensure that all verbs are
+/// set up by the time incoming requests are to be expected.
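+///
+/// A brief usage sketch tying the pieces together, assuming the illustrative
+/// `my_serializer` above; the verb id and the addresses are placeholders as well:
+///
+///     rpc::protocol<my_serializer> proto(my_serializer{});
+///
+///     // server side: register verb 1 and listen for connections
+///     proto.register_handler(1, [] (uint32_t a, uint32_t b) { return a + b; });
+///     rpc::protocol<my_serializer>::server srv(proto, ipv4_addr{9001});
+///
+///     // client side: make a callable for verb 1 and invoke it remotely
+///     auto add = proto.make_client<uint32_t (uint32_t, uint32_t)>(1);
+///     rpc::protocol<my_serializer>::client cl(proto, ipv4_addr{"10.0.0.1", 9001});
+///     future<uint32_t> sum = add(cl, 2, 3);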
+///
+/// ## Configuration
+///
+/// TODO
+///
+/// ## Isolation
+///
+/// RPC supports isolating verb handlers from each other. There are two ways to
+/// achieve this: per-handler isolation (the old way) and per-connection
+/// isolation (the new way). If no isolation is configured, all handlers will be
+/// executed in the context of the scheduling_group in which the
+/// protocol::server was created.
+///
+/// Per-handler isolation (the old way) can be configured by using the
+/// register_handler() overload which takes a scheduling_group. When invoked,
+/// the body of the handler will be executed from the context of the configured
+/// scheduling_group.
+///
+/// Per-connection isolation (the new way) is a more flexible mechanism that
+/// requires user application provided logic to determine how connections are
+/// isolated. This mechanism has two parts, the server and the client part.
+/// The client configures isolation by setting client_options::isolation_cookie.
+/// This cookie is an opaque (to the RPC layer) string that is to be interpreted
+/// on the server using user application provided logic. The application
+/// provides this logic to the server by setting
+/// resource_limits::isolate_connection to an appropriate handler function, that
+/// interprets the opaque cookie and resolves it to an isolation_config. The
+/// scheduling_group in the resulting isolation_config will be used not just to
+/// execute all verb handlers, but also the connection loop itself, hence providing better
+/// isolation.
+///
+/// There are a few gotchas related to mixing the two isolation mechanisms. This can
+/// happen when the application is updated and one of the client/server is
+/// still using the old/new mechanism. In general per-connection isolation
+/// overrides the per-handler one. If both are set up, the former will determine
+/// the scheduling_group context for the handlers. If the client is not
+/// configured to send an isolation cookie, the server's
+/// resource_limits::isolate_connection will not be invoked and the server will
+/// fall back to per-handler isolation if configured. If the client is
+/// configured to send an isolation cookie but the server doesn't have a
+/// resource_limits::isolate_connection configured, it will use
+/// default_isolate_connection() to interpret the cookie. Note that this still
+/// overrides the per-handler isolation if any is configured. If the server is
+/// so old that it doesn't have the per-connection isolation feature at all, it
+/// will of course just use the per-handler one, if configured.
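+///
+/// A sketch of wiring up per-connection isolation; the isolation_config member
+/// name (`sched_group`), the handler signature, and the pre-created scheduling
+/// groups (`tenant_a_sg`, `default_sg`) are assumptions for illustration:
+///
+///     // client: tag the connection with an opaque cookie
+///     rpc::client_options copts;
+///     copts.isolation_cookie = "tenant-a";
+///
+///     // server: resolve the cookie to an isolation_config
+///     rpc::resource_limits limits;
+///     limits.isolate_connection = [] (sstring cookie) {
+///         rpc::isolation_config cfg;
+///         cfg.sched_group = cookie == "tenant-a" ? tenant_a_sg : default_sg;
+///         return cfg;
+///     };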
+///
+/// ## Compatibility
+///
+/// TODO
+///
+/// \tparam Serializer the serializer for the protocol.
+/// \tparam MsgType the type to be used as the message id or verb id.
+template<typename Serializer, typename MsgType = uint32_t>
+class protocol : public protocol_base {
+public:
+ /// Represents the listening port and all accepted connections.
+ class server : public rpc::server {
+ public:
+ server(protocol& proto, const socket_address& addr, resource_limits memory_limit = resource_limits()) :
+ rpc::server(&proto, addr, memory_limit) {}
+ server(protocol& proto, server_options opts, const socket_address& addr, resource_limits memory_limit = resource_limits()) :
+ rpc::server(&proto, opts, addr, memory_limit) {}
+ server(protocol& proto, server_socket socket, resource_limits memory_limit = resource_limits(), server_options opts = server_options{}) :
+ rpc::server(&proto, std::move(socket), memory_limit) {}
+ server(protocol& proto, server_options opts, server_socket socket, resource_limits memory_limit = resource_limits()) :
+ rpc::server(&proto, opts, std::move(socket), memory_limit) {}
+ };
+ /// Represents a client side connection.
+ class client : public rpc::client {
+ public:
+ /*
+ * Create client object which will attempt to connect to the remote address.
+ *
+ * @param addr the remote address identifying this client
+ * @param local the local address of this client
+ */
+ client(protocol& p, const socket_address& addr, const socket_address& local = {}) :
+ rpc::client(p.get_logger(), &p._serializer, addr, local) {}
+ client(protocol& p, client_options options, const socket_address& addr, const socket_address& local = {}) :
+ rpc::client(p.get_logger(), &p._serializer, options, addr, local) {}
+
+ /**
+ * Create client object which will attempt to connect to the remote address using the
+ * specified seastar::socket.
+ *
+ * @param addr the remote address identifying this client
+ * @param local the local address of this client
+ * @param socket the socket object used to connect to the remote address
+ */
+ client(protocol& p, socket socket, const socket_address& addr, const socket_address& local = {}) :
+ rpc::client(p.get_logger(), &p._serializer, std::move(socket), addr, local) {}
+ client(protocol& p, client_options options, socket socket, const socket_address& addr, const socket_address& local = {}) :
+ rpc::client(p.get_logger(), &p._serializer, options, std::move(socket), addr, local) {}
+ };
+
+ friend server;
+private:
+ std::unordered_map<MsgType, rpc_handler> _handlers;
+ Serializer _serializer;
+ logger _logger;
+
+public:
+ protocol(Serializer&& serializer) : _serializer(std::forward<Serializer>(serializer)) {}
+
+ /// Creates a callable that can be used to invoke the verb on the remote.
+ ///
+ /// \tparam Func The signature of the verb. Has to be either the same or
+ /// compatible with the one passed to register_handler on the server.
+ /// \param t the verb to invoke on the remote.
+ ///
+ /// \returns a callable whose signature is derived from Func as follows:
+ /// given `Func == Ret(Args...)` the returned callable has the following
+ /// signature: `future<Ret>(protocol::client&, Args...)`.
+ template<typename Func>
+ auto make_client(MsgType t);
+
+ /// Register a handler to be called when this verb is invoked.
+ ///
+ /// \tparam Func the type of the handler for the verb. This determines the
+ /// signature of the verb.
+ /// \param t the verb to register the handler for.
+ /// \param func the callable to be called when the verb is invoked by the
+ /// remote.
+ ///
+ /// \returns a client, a callable that can be used to invoke the verb. See
+ /// make_client(). The client can be discarded, in fact this is what
+ /// most callers will do as real clients will live on a remote node, not
+ /// on the one where handlers are registered.
+ template<typename Func>
+ auto register_handler(MsgType t, Func&& func);
+
+ /// Register a handler to be called when this verb is invoked.
+ ///
+ /// \tparam Func the type of the handler for the verb. This determines the
+ /// signature of the verb.
+ /// \param t the verb to register the handler for.
+ /// \param sg the scheduling group that will be used to invoke the handler
+ /// in. This can be used to execute different verbs in different
+ /// scheduling groups. Note that there is a newer mechanism to determine
+ /// the scheduling group a handler will run in per invocation, see
+ /// isolation_config.
+ /// \param func the callable to be called when the verb is invoked by the
+ /// remote.
+ ///
+ /// \returns a client, a callable that can be used to invoke the verb. See
+ /// make_client(). The client can be discarded, in fact this is what
+ /// most callers will do as real clients will live on a remote node, not
+ /// on the one where handlers are registered.
+ template <typename Func>
+ auto register_handler(MsgType t, scheduling_group sg, Func&& func);
+
+ /// Unregister the handler for the verb.
+ ///
+ /// Waits for all currently running handlers to finish, then unregisters the
+ /// handler. New attempts to invoke the verb will fail immediately after this
+ /// function is called, even before the returned future resolves.
+ ///
+ /// \param t the verb to unregister the handler for.
+ ///
+ /// \returns a future that becomes available once all currently running
+ /// handlers finished.
+ future<> unregister_handler(MsgType t);
+
+ /// Set a logger function to be used to log messages.
+ ///
+ /// \deprecated use the logger overload set_logger(::seastar::logger*)
+ /// instead.
+ [[deprecated("Use set_logger(::seastar::logger*) instead")]]
+ void set_logger(std::function<void(const sstring&)> logger) {
+ _logger.set(std::move(logger));
+ }
+
+ /// Set a logger to be used to log messages.
+ void set_logger(::seastar::logger* logger) {
+ _logger.set(logger);
+ }
+
+ const logger& get_logger() const {
+ return _logger;
+ }
+
+ shared_ptr<rpc::server::connection> make_server_connection(rpc::server& server, connected_socket fd, socket_address addr, connection_id id) override {
+ return make_shared<rpc::server::connection>(server, std::move(fd), std::move(addr), _logger, &_serializer, id);
+ }
+
+ bool has_handler(MsgType msg_id);
+
+ /// Checks if there are any handlers registered.
+ /// Debugging helper, should only be used for debugging and not relied on.
+ ///
+ /// \returns true if there are, false if there are no registered handlers.
+ bool has_handlers() const noexcept {
+ return !_handlers.empty();
+ }
+
+private:
+ rpc_handler* get_handler(uint64_t msg_id) override;
+ void put_handler(rpc_handler*) override;
+
+ template<typename Ret, typename... In>
+ auto make_client(signature<Ret(In...)> sig, MsgType t);
+
+ void register_receiver(MsgType t, rpc_handler&& handler) {
+ auto r = _handlers.emplace(t, std::move(handler));
+ if (!r.second) {
+ throw_with_backtrace<std::runtime_error>("registered handler already exists");
+ }
+ }
+};
+
+/// @}
+
+}
+
+}
+
+#include "rpc_impl.hh"
diff --git a/src/seastar/include/seastar/rpc/rpc_impl.hh b/src/seastar/include/seastar/rpc/rpc_impl.hh
new file mode 100644
index 000000000..7e1d3680e
--- /dev/null
+++ b/src/seastar/include/seastar/rpc/rpc_impl.hh
@@ -0,0 +1,881 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+#pragma once
+
+#include <seastar/core/function_traits.hh>
+#include <seastar/core/shared_ptr.hh>
+#include <seastar/core/sstring.hh>
+#include <seastar/core/when_all.hh>
+#include <seastar/util/is_smart_ptr.hh>
+#include <seastar/core/simple-stream.hh>
+#include <boost/range/numeric.hpp>
+#include <boost/range/adaptor/transformed.hpp>
+#include <seastar/net/packet-data-source.hh>
+#include <seastar/core/print.hh>
+
+namespace seastar {
+
+namespace rpc {
+
+enum class exception_type : uint32_t {
+ USER = 0,
+ UNKNOWN_VERB = 1,
+};
+
+template<typename T>
+struct remove_optional {
+ using type = T;
+};
+
+template<typename T>
+struct remove_optional<optional<T>> {
+ using type = T;
+};
+
+struct wait_type {}; // opposite of no_wait_type
+
+// tags to tell whether we want a const client_info& parameter
+struct do_want_client_info {};
+struct dont_want_client_info {};
+
+// tags to tell whether we want a opt_time_point parameter
+struct do_want_time_point {};
+struct dont_want_time_point {};
+
+// General case
+template <typename Ret, typename... In>
+struct signature<Ret (In...)> {
+ using ret_type = Ret;
+ using arg_types = std::tuple<In...>;
+ using clean = signature;
+ using want_client_info = dont_want_client_info;
+ using want_time_point = dont_want_time_point;
+};
+
+// Specialize 'clean' for handlers that receive client_info
+template <typename Ret, typename... In>
+struct signature<Ret (const client_info&, In...)> {
+ using ret_type = Ret;
+ using arg_types = std::tuple<In...>;
+ using clean = signature<Ret (In...)>;
+ using want_client_info = do_want_client_info;
+ using want_time_point = dont_want_time_point;
+};
+
+template <typename Ret, typename... In>
+struct signature<Ret (client_info&, In...)> {
+ using ret_type = Ret;
+ using arg_types = std::tuple<In...>;
+ using clean = signature<Ret (In...)>;
+ using want_client_info = do_want_client_info;
+ using want_time_point = dont_want_time_point;
+};
+
+// Specialize 'clean' for handlers that receive client_info and opt_time_point
+template <typename Ret, typename... In>
+struct signature<Ret (const client_info&, opt_time_point, In...)> {
+ using ret_type = Ret;
+ using arg_types = std::tuple<In...>;
+ using clean = signature<Ret (In...)>;
+ using want_client_info = do_want_client_info;
+ using want_time_point = do_want_time_point;
+};
+
+template <typename Ret, typename... In>
+struct signature<Ret (client_info&, opt_time_point, In...)> {
+ using ret_type = Ret;
+ using arg_types = std::tuple<In...>;
+ using clean = signature<Ret (In...)>;
+ using want_client_info = do_want_client_info;
+ using want_time_point = do_want_time_point;
+};
+
+// Specialize 'clean' for handlers that receive opt_time_point
+template <typename Ret, typename... In>
+struct signature<Ret (opt_time_point, In...)> {
+ using ret_type = Ret;
+ using arg_types = std::tuple<In...>;
+ using clean = signature<Ret (In...)>;
+ using want_client_info = dont_want_client_info;
+ using want_time_point = do_want_time_point;
+};
+
+template <typename T>
+struct wait_signature {
+ using type = wait_type;
+ using cleaned_type = T;
+};
+
+template <typename... T>
+struct wait_signature<future<T...>> {
+ using type = wait_type;
+ using cleaned_type = future<T...>;
+};
+
+template <>
+struct wait_signature<no_wait_type> {
+ using type = no_wait_type;
+ using cleaned_type = void;
+};
+
+template <>
+struct wait_signature<future<no_wait_type>> {
+ using type = no_wait_type;
+ using cleaned_type = future<>;
+};
+
+template <typename T>
+using wait_signature_t = typename wait_signature<T>::type;
+
+template <typename... In>
+inline
+std::tuple<In...>
+maybe_add_client_info(dont_want_client_info, client_info& ci, std::tuple<In...>&& args) {
+ return std::move(args);
+}
+
+template <typename... In>
+inline
+std::tuple<std::reference_wrapper<client_info>, In...>
+maybe_add_client_info(do_want_client_info, client_info& ci, std::tuple<In...>&& args) {
+ return std::tuple_cat(std::make_tuple(std::ref(ci)), std::move(args));
+}
+
+template <typename... In>
+inline
+std::tuple<In...>
+maybe_add_time_point(dont_want_time_point, opt_time_point& otp, std::tuple<In...>&& args) {
+ return std::move(args);
+}
+
+template <typename... In>
+inline
+std::tuple<opt_time_point, In...>
+maybe_add_time_point(do_want_time_point, opt_time_point& otp, std::tuple<In...>&& args) {
+ return std::tuple_cat(std::make_tuple(otp), std::move(args));
+}
+
+inline sstring serialize_connection_id(const connection_id& id) {
+ sstring p = uninitialized_string(sizeof(id));
+ auto c = p.data();
+ write_le(c, id.id);
+ return p;
+}
+
+inline connection_id deserialize_connection_id(const sstring& s) {
+ connection_id id;
+ auto p = s.c_str();
+ id.id = read_le<decltype(id.id)>(p);
+ return id;
+}
+
+template <bool IsSmartPtr>
+struct serialize_helper;
+
+template <>
+struct serialize_helper<false> {
+ template <typename Serializer, typename Output, typename T>
+ static inline void serialize(Serializer& serializer, Output& out, const T& t) {
+ return write(serializer, out, t);
+ }
+};
+
+template <>
+struct serialize_helper<true> {
+ template <typename Serializer, typename Output, typename T>
+ static inline void serialize(Serializer& serializer, Output& out, const T& t) {
+ return write(serializer, out, *t);
+ }
+};
+
+template <typename Serializer, typename Output, typename... T>
+inline void do_marshall(Serializer& serializer, Output& out, const T&... args);
+
+template <typename Serializer, typename Output>
+struct marshall_one {
+ template <typename T> struct helper {
+ static void doit(Serializer& serializer, Output& out, const T& arg) {
+ using serialize_helper_type = serialize_helper<is_smart_ptr<typename std::remove_reference<T>::type>::value>;
+ serialize_helper_type::serialize(serializer, out, arg);
+ }
+ };
+ template<typename T> struct helper<std::reference_wrapper<const T>> {
+ static void doit(Serializer& serializer, Output& out, const std::reference_wrapper<const T>& arg) {
+ helper<T>::doit(serializer, out, arg.get());
+ }
+ };
+ static void put_connection_id(const connection_id& cid, Output& out) {
+ sstring id = serialize_connection_id(cid);
+ out.write(id.c_str(), id.size());
+ }
+ template <typename... T> struct helper<sink<T...>> {
+ static void doit(Serializer& serializer, Output& out, const sink<T...>& arg) {
+ put_connection_id(arg.get_id(), out);
+ }
+ };
+ template <typename... T> struct helper<source<T...>> {
+ static void doit(Serializer& serializer, Output& out, const source<T...>& arg) {
+ put_connection_id(arg.get_id(), out);
+ }
+ };
+ template <typename... T> struct helper<tuple<T...>> {
+ static void doit(Serializer& serializer, Output& out, const tuple<T...>& arg) {
+ auto do_do_marshall = [&serializer, &out] (const auto&... args) {
+ do_marshall(serializer, out, args...);
+ };
+ std::apply(do_do_marshall, arg);
+ }
+ };
+};
+
+template <typename Serializer, typename Output, typename... T>
+inline void do_marshall(Serializer& serializer, Output& out, const T&... args) {
+ // C++ guarantees that brace-initialization expressions are evaluated in order
+ (void)std::initializer_list<int>{(marshall_one<Serializer, Output>::template helper<T>::doit(serializer, out, args), 1)...};
+}
+
+static inline memory_output_stream<snd_buf::iterator> make_serializer_stream(snd_buf& output) {
+ auto* b = std::get_if<temporary_buffer<char>>(&output.bufs);
+ if (b) {
+ return memory_output_stream<snd_buf::iterator>(memory_output_stream<snd_buf::iterator>::simple(b->get_write(), b->size()));
+ } else {
+ auto& ar = std::get<std::vector<temporary_buffer<char>>>(output.bufs);
+ return memory_output_stream<snd_buf::iterator>(memory_output_stream<snd_buf::iterator>::fragmented(ar.begin(), output.size));
+ }
+}
+
+template <typename Serializer, typename... T>
+inline snd_buf marshall(Serializer& serializer, size_t head_space, const T&... args) {
+ measuring_output_stream measure;
+ do_marshall(serializer, measure, args...);
+ snd_buf ret(measure.size() + head_space);
+ auto out = make_serializer_stream(ret);
+ out.skip(head_space);
+ do_marshall(serializer, out, args...);
+ return ret;
+}
+
+template <typename Serializer, typename Input>
+inline std::tuple<> do_unmarshall(connection& c, Input& in) {
+ return std::make_tuple();
+}
+
+template<typename Serializer, typename Input>
+struct unmarshal_one {
+ template<typename T> struct helper {
+ static T doit(connection& c, Input& in) {
+ return read(c.serializer<Serializer>(), in, type<T>());
+ }
+ };
+ template<typename T> struct helper<optional<T>> {
+ static optional<T> doit(connection& c, Input& in) {
+ if (in.size()) {
+ return optional<T>(read(c.serializer<Serializer>(), in, type<typename remove_optional<T>::type>()));
+ } else {
+ return optional<T>();
+ }
+ }
+ };
+ template<typename T> struct helper<std::reference_wrapper<const T>> {
+ static T doit(connection& c, Input& in) {
+ return helper<T>::doit(c, in);
+ }
+ };
+ static connection_id get_connection_id(Input& in) {
+ sstring id = uninitialized_string(sizeof(connection_id));
+ in.read(id.data(), sizeof(connection_id));
+ return deserialize_connection_id(id);
+ }
+ template<typename... T> struct helper<sink<T...>> {
+ static sink<T...> doit(connection& c, Input& in) {
+ return sink<T...>(make_shared<sink_impl<Serializer, T...>>(c.get_stream(get_connection_id(in))));
+ }
+ };
+ template<typename... T> struct helper<source<T...>> {
+ static source<T...> doit(connection& c, Input& in) {
+ return source<T...>(make_shared<source_impl<Serializer, T...>>(c.get_stream(get_connection_id(in))));
+ }
+ };
+ template <typename... T> struct helper<tuple<T...>> {
+ static tuple<T...> doit(connection& c, Input& in) {
+ return do_unmarshall<Serializer, Input, T...>(c, in);
+ }
+ };
+};
+
+template <typename Serializer, typename Input, typename T0, typename... Trest>
+inline std::tuple<T0, Trest...> do_unmarshall(connection& c, Input& in) {
+ // FIXME: something less recursive
+ auto first = std::make_tuple(unmarshal_one<Serializer, Input>::template helper<T0>::doit(c, in));
+ auto rest = do_unmarshall<Serializer, Input, Trest...>(c, in);
+ return std::tuple_cat(std::move(first), std::move(rest));
+}
+
+template <typename Serializer, typename... T>
+inline std::tuple<T...> unmarshall(connection& c, rcv_buf input) {
+ auto in = make_deserializer_stream(input);
+ return do_unmarshall<Serializer, decltype(in), T...>(c, in);
+}
+
+inline std::exception_ptr unmarshal_exception(rcv_buf& d) {
+ std::exception_ptr ex;
+ auto data = make_deserializer_stream(d);
+
+ uint32_t v32;
+ data.read(reinterpret_cast<char*>(&v32), 4);
+ exception_type ex_type = exception_type(le_to_cpu(v32));
+ data.read(reinterpret_cast<char*>(&v32), 4);
+ uint32_t ex_len = le_to_cpu(v32);
+
+ switch (ex_type) {
+ case exception_type::USER: {
+ std::string s(ex_len, '\0');
+ data.read(&*s.begin(), ex_len);
+ ex = std::make_exception_ptr(std::runtime_error(std::move(s)));
+ break;
+ }
+ case exception_type::UNKNOWN_VERB: {
+ uint64_t v64;
+ data.read(reinterpret_cast<char*>(&v64), 8);
+ ex = std::make_exception_ptr(unknown_verb_error(le_to_cpu(v64)));
+ break;
+ }
+ default:
+ ex = std::make_exception_ptr(unknown_exception_error());
+ break;
+ }
+ return ex;
+}
+
+template <typename Payload, typename... T>
+struct rcv_reply_base {
+ bool done = false;
+ promise<T...> p;
+ template<typename... V>
+ void set_value(V&&... v) {
+ done = true;
+ p.set_value(internal::untuple(std::forward<V>(v))...);
+ }
+ ~rcv_reply_base() {
+ if (!done) {
+ p.set_exception(closed_error());
+ }
+ }
+};
+
+template<typename Serializer, typename T>
+struct rcv_reply : rcv_reply_base<T, T> {
+ inline void get_reply(rpc::client& dst, rcv_buf input) {
+ this->set_value(unmarshall<Serializer, T>(dst, std::move(input)));
+ }
+};
+
+template<typename Serializer, typename... T>
+struct rcv_reply<Serializer, future<T...>> : rcv_reply_base<std::tuple<T...>, T...> {
+ inline void get_reply(rpc::client& dst, rcv_buf input) {
+ this->set_value(unmarshall<Serializer, T...>(dst, std::move(input)));
+ }
+};
+
+template<typename Serializer>
+struct rcv_reply<Serializer, void> : rcv_reply_base<void, void> {
+ inline void get_reply(rpc::client& dst, rcv_buf input) {
+ this->set_value();
+ }
+};
+
+template<typename Serializer>
+struct rcv_reply<Serializer, future<>> : rcv_reply<Serializer, void> {};
+
+template <typename Serializer, typename Ret, typename... InArgs>
+inline auto wait_for_reply(wait_type, std::optional<rpc_clock_type::time_point> timeout, cancellable* cancel, rpc::client& dst, id_type msg_id,
+ signature<Ret (InArgs...)> sig) {
+ using reply_type = rcv_reply<Serializer, Ret>;
+ auto lambda = [] (reply_type& r, rpc::client& dst, id_type msg_id, rcv_buf data) mutable {
+ if (msg_id >= 0) {
+ dst.get_stats_internal().replied++;
+ return r.get_reply(dst, std::move(data));
+ } else {
+ dst.get_stats_internal().exception_received++;
+ r.done = true;
+ r.p.set_exception(unmarshal_exception(data));
+ }
+ };
+ using handler_type = typename rpc::client::template reply_handler<reply_type, decltype(lambda)>;
+ auto r = std::make_unique<handler_type>(std::move(lambda));
+ auto fut = r->reply.p.get_future();
+ dst.wait_for_reply(msg_id, std::move(r), timeout, cancel);
+ return fut;
+}
+
+template<typename Serializer, typename... InArgs>
+inline auto wait_for_reply(no_wait_type, std::optional<rpc_clock_type::time_point>, cancellable* cancel, rpc::client& dst, id_type msg_id,
+ signature<no_wait_type (InArgs...)> sig) { // no_wait overload
+ return make_ready_future<>();
+}
+
+template<typename Serializer, typename... InArgs>
+inline auto wait_for_reply(no_wait_type, std::optional<rpc_clock_type::time_point>, cancellable* cancel, rpc::client& dst, id_type msg_id,
+ signature<future<no_wait_type> (InArgs...)> sig) { // future<no_wait> overload
+ return make_ready_future<>();
+}
+
+// Convert a relative timeout (a duration) to an absolute one (time_point).
+// Do the calculation safely so that a very large duration will be capped by
+// time_point::max, instead of wrapping around to ancient history.
+inline rpc_clock_type::time_point
+relative_timeout_to_absolute(rpc_clock_type::duration relative) {
+ rpc_clock_type::time_point now = rpc_clock_type::now();
+ return now + std::min(relative, rpc_clock_type::time_point::max() - now);
+}
+
+// Returns a lambda that can be used to send rpc messages.
+// The lambda gets a client connection and rpc parameters as arguments, marshalls them,
+// sends them to the server and waits for a reply. After receiving the reply it
+// unmarshalls it and signals completion to the caller.
+template<typename Serializer, typename MsgType, typename Ret, typename... InArgs>
+auto send_helper(MsgType xt, signature<Ret (InArgs...)> xsig) {
+ struct shelper {
+ MsgType t;
+ signature<Ret (InArgs...)> sig;
+ auto send(rpc::client& dst, std::optional<rpc_clock_type::time_point> timeout, cancellable* cancel, const InArgs&... args) {
+ if (dst.error()) {
+ using cleaned_ret_type = typename wait_signature<Ret>::cleaned_type;
+ return futurize<cleaned_ret_type>::make_exception_future(closed_error());
+ }
+
+ // send message
+ auto msg_id = dst.next_message_id();
+ snd_buf data = marshall(dst.template serializer<Serializer>(), 28, args...);
+ static_assert(snd_buf::chunk_size >= 28, "send buffer chunk size is too small");
+ auto p = data.front().get_write() + 8; // 8 extra bytes for expiration timer
+ write_le<uint64_t>(p, uint64_t(t));
+ write_le<int64_t>(p + 8, msg_id);
+ write_le<uint32_t>(p + 16, data.size - 28);
+
+ // prepare reply handler, if return type is no_wait_type this does nothing, since no reply will be sent
+ using wait = wait_signature_t<Ret>;
+ return when_all(dst.send(std::move(data), timeout, cancel), wait_for_reply<Serializer>(wait(), timeout, cancel, dst, msg_id, sig)).then([] (auto r) {
+ return std::move(std::get<1>(r)); // return future of wait_for_reply
+ });
+ }
+ auto operator()(rpc::client& dst, const InArgs&... args) {
+ return send(dst, {}, nullptr, args...);
+ }
+ auto operator()(rpc::client& dst, rpc_clock_type::time_point timeout, const InArgs&... args) {
+ return send(dst, timeout, nullptr, args...);
+ }
+ auto operator()(rpc::client& dst, rpc_clock_type::duration timeout, const InArgs&... args) {
+ return send(dst, relative_timeout_to_absolute(timeout), nullptr, args...);
+ }
+ auto operator()(rpc::client& dst, cancellable& cancel, const InArgs&... args) {
+ return send(dst, {}, &cancel, args...);
+ }
+
+ };
+ return shelper{xt, xsig};
+}
+
+template<typename Serializer, typename SEASTAR_ELLIPSIS RetTypes>
+inline future<> reply(wait_type, future<RetTypes SEASTAR_ELLIPSIS>&& ret, int64_t msg_id, shared_ptr<server::connection> client,
+ std::optional<rpc_clock_type::time_point> timeout) {
+ if (!client->error()) {
+ snd_buf data;
+ try {
+#if SEASTAR_API_LEVEL < 6
+ if constexpr (sizeof...(RetTypes) == 0) {
+#else
+ if constexpr (std::is_void_v<RetTypes>) {
+#endif
+ ret.get();
+ data = std::invoke(marshall<Serializer>, std::ref(client->template serializer<Serializer>()), 12);
+ } else {
+ data = std::invoke(marshall<Serializer, const RetTypes& SEASTAR_ELLIPSIS>, std::ref(client->template serializer<Serializer>()), 12, std::move(ret.get0()));
+ }
+ } catch (std::exception& ex) {
+ uint32_t len = std::strlen(ex.what());
+ data = snd_buf(20 + len);
+ auto os = make_serializer_stream(data);
+ os.skip(12);
+ uint32_t v32 = cpu_to_le(uint32_t(exception_type::USER));
+ os.write(reinterpret_cast<char*>(&v32), sizeof(v32));
+ v32 = cpu_to_le(len);
+ os.write(reinterpret_cast<char*>(&v32), sizeof(v32));
+ os.write(ex.what(), len);
+ msg_id = -msg_id;
+ }
+
+ return client->respond(msg_id, std::move(data), timeout);
+ } else {
+ ret.ignore_ready_future();
+ return make_ready_future<>();
+ }
+}
+
+// specialization for no_wait_type which does not send a reply
+template<typename Serializer>
+inline future<> reply(no_wait_type, future<no_wait_type>&& r, int64_t msgid, shared_ptr<server::connection> client, std::optional<rpc_clock_type::time_point> timeout) {
+ try {
+ r.get();
+ } catch (std::exception& ex) {
+ client->get_logger()(client->info(), msgid, to_sstring("exception \"") + ex.what() + "\" in no_wait handler ignored");
+ }
+ return make_ready_future<>();
+}
+
+template<typename Ret, typename... InArgs, typename WantClientInfo, typename WantTimePoint, typename Func, typename ArgsTuple>
+inline futurize_t<Ret> apply(Func& func, client_info& info, opt_time_point time_point, WantClientInfo wci, WantTimePoint wtp, signature<Ret (InArgs...)> sig, ArgsTuple&& args) {
+ using futurator = futurize<Ret>;
+ try {
+ return futurator::apply(func, maybe_add_client_info(wci, info, maybe_add_time_point(wtp, time_point, std::forward<ArgsTuple>(args))));
+ } catch (std::runtime_error& ex) {
+ return futurator::make_exception_future(std::current_exception());
+ }
+}
+
+// lref_to_cref is a helper that wraps an lvalue reference in std::ref() and passes rvalues through unchanged
+template<typename T>
+auto lref_to_cref(T&& x) {
+ return std::move(x);
+}
+
+template<typename T>
+auto lref_to_cref(T& x) {
+ return std::ref(x);
+}
+
+// Creates a lambda to handle an RPC message on a server.
+// The lambda unmarshalls all parameters, calls the handler, marshalls the return values and sends them back to the client.
+template <typename Serializer, typename Func, typename Ret, typename... InArgs, typename WantClientInfo, typename WantTimePoint>
+auto recv_helper(signature<Ret (InArgs...)> sig, Func&& func, WantClientInfo wci, WantTimePoint wtp) {
+ using signature = decltype(sig);
+ using wait_style = wait_signature_t<Ret>;
+ return [func = lref_to_cref(std::forward<Func>(func))](shared_ptr<server::connection> client,
+ std::optional<rpc_clock_type::time_point> timeout,
+ int64_t msg_id,
+ rcv_buf data) mutable {
+ auto memory_consumed = client->estimate_request_size(data.size);
+ if (memory_consumed > client->max_request_size()) {
+ auto err = format("request size {:d} larger than memory limit {:d}", memory_consumed, client->max_request_size());
+ client->get_logger()(client->peer_address(), err);
+ // FIXME: future is discarded
+ (void)try_with_gate(client->get_server().reply_gate(), [client, timeout, msg_id, err = std::move(err)] {
+ return reply<Serializer>(wait_style(), futurize<Ret>::make_exception_future(std::runtime_error(err.c_str())), msg_id, client, timeout).handle_exception([client, msg_id] (std::exception_ptr eptr) {
+ client->get_logger()(client->info(), msg_id, format("got exception while processing an oversized message: {}", eptr));
+ });
+ }).handle_exception_type([] (gate_closed_exception&) {/* ignore */});
+ return make_ready_future();
+ }
+ // note: apply is executed asynchronously with regards to networking so we cannot chain futures here by doing "return apply()"
+ auto f = client->wait_for_resources(memory_consumed, timeout).then([client, timeout, msg_id, data = std::move(data), &func] (auto permit) mutable {
+ // FIXME: future is discarded
+ (void)try_with_gate(client->get_server().reply_gate(), [client, timeout, msg_id, data = std::move(data), permit = std::move(permit), &func] () mutable {
+ try {
+ auto args = unmarshall<Serializer, InArgs...>(*client, std::move(data));
+ return apply(func, client->info(), timeout, WantClientInfo(), WantTimePoint(), signature(), std::move(args)).then_wrapped([client, timeout, msg_id, permit = std::move(permit)] (futurize_t<Ret> ret) mutable {
+ return reply<Serializer>(wait_style(), std::move(ret), msg_id, client, timeout).handle_exception([permit = std::move(permit), client, msg_id] (std::exception_ptr eptr) {
+ client->get_logger()(client->info(), msg_id, format("got exception while processing a message: {}", eptr));
+ });
+ });
+ } catch (...) {
+ client->get_logger()(client->info(), msg_id, format("got exception while processing a message: {}", std::current_exception()));
+ return make_ready_future();
+ }
+ }).handle_exception_type([] (gate_closed_exception&) {/* ignore */});
+ });
+
+ if (timeout) {
+ f = f.handle_exception_type([] (semaphore_timed_out&) { /* ignore */ });
+ }
+
+ return f;
+ };
+}
+
+// helper to create a copy-constructible lambda from a non-copy-constructible one. std::function<> works only with the former kind.
+template<typename Func>
+auto make_copyable_function(Func&& func, std::enable_if_t<!std::is_copy_constructible<std::decay_t<Func>>::value, void*> = nullptr) {
+ auto p = make_lw_shared<typename std::decay_t<Func>>(std::forward<Func>(func));
+ return [p] (auto&&... args) { return (*p)( std::forward<decltype(args)>(args)... ); };
+}
+
+template<typename Func>
+auto make_copyable_function(Func&& func, std::enable_if_t<std::is_copy_constructible<std::decay_t<Func>>::value, void*> = nullptr) {
+ return std::forward<Func>(func);
+}
+
+// This class is used to calculate the client-side rpc function signature.
+// The return type is converted from a smart pointer to the type it points to.
+// rpc::optional parameters are converted to the non-optional type.
+//
+// Examples:
+// std::unique_ptr<int>(int, rpc::optional<long>) -> int(int, long)
+// double(float) -> double(float)
+template<typename Ret, typename... In>
+class client_function_type {
+ template<typename T, bool IsSmartPtr>
+ struct drop_smart_ptr_impl;
+ template<typename T>
+ struct drop_smart_ptr_impl<T, true> {
+ using type = typename T::element_type;
+ };
+ template<typename T>
+ struct drop_smart_ptr_impl<T, false> {
+ using type = T;
+ };
+ template<typename T>
+ using drop_smart_ptr = drop_smart_ptr_impl<T, is_smart_ptr<T>::value>;
+
+ // if return type is smart ptr take a type it points to instead
+ using return_type = typename drop_smart_ptr<Ret>::type;
+public:
+ using type = return_type(typename remove_optional<In>::type...);
+};
+
+template<typename Serializer, typename MsgType>
+template<typename Ret, typename... In>
+auto protocol<Serializer, MsgType>::make_client(signature<Ret(In...)> clear_sig, MsgType t) {
+ using sig_type = signature<typename client_function_type<Ret, In...>::type>;
+ return send_helper<Serializer>(t, sig_type());
+}
+
+template<typename Serializer, typename MsgType>
+template<typename Func>
+auto protocol<Serializer, MsgType>::make_client(MsgType t) {
+ return make_client(typename signature<typename function_traits<Func>::signature>::clean(), t);
+}
+
+template<typename Serializer, typename MsgType>
+template<typename Func>
+auto protocol<Serializer, MsgType>::register_handler(MsgType t, scheduling_group sg, Func&& func) {
+ using sig_type = signature<typename function_traits<Func>::signature>;
+ using clean_sig_type = typename sig_type::clean;
+ using want_client_info = typename sig_type::want_client_info;
+ using want_time_point = typename sig_type::want_time_point;
+ auto recv = recv_helper<Serializer>(clean_sig_type(), std::forward<Func>(func),
+ want_client_info(), want_time_point());
+ register_receiver(t, rpc_handler{sg, make_copyable_function(std::move(recv))});
+ return make_client(clean_sig_type(), t);
+}
+
+template<typename Serializer, typename MsgType>
+template<typename Func>
+auto protocol<Serializer, MsgType>::register_handler(MsgType t, Func&& func) {
+ return register_handler(t, scheduling_group(), std::forward<Func>(func));
+}
+
+template<typename Serializer, typename MsgType>
+future<> protocol<Serializer, MsgType>::unregister_handler(MsgType t) {
+ auto it = _handlers.find(t);
+ if (it != _handlers.end()) {
+ return it->second.use_gate.close().finally([this, t] {
+ _handlers.erase(t);
+ });
+ }
+ return make_ready_future<>();
+}
+
+template<typename Serializer, typename MsgType>
+bool protocol<Serializer, MsgType>::has_handler(MsgType msg_id) {
+ auto it = _handlers.find(msg_id);
+ if (it == _handlers.end()) {
+ return false;
+ }
+ return !it->second.use_gate.is_closed();
+}
+
+template<typename Serializer, typename MsgType>
+rpc_handler* protocol<Serializer, MsgType>::get_handler(uint64_t msg_id) {
+ rpc_handler* h = nullptr;
+ auto it = _handlers.find(MsgType(msg_id));
+ if (it != _handlers.end()) {
+ try {
+ it->second.use_gate.enter();
+ h = &it->second;
+ } catch (gate_closed_exception&) {
+ // unregistered, just ignore
+ }
+ }
+ return h;
+}
+
+template<typename Serializer, typename MsgType>
+void protocol<Serializer, MsgType>::put_handler(rpc_handler* h) {
+ h->use_gate.leave();
+}
+
+template<typename T> T make_shard_local_buffer_copy(foreign_ptr<std::unique_ptr<T>> org);
+
+template<typename Serializer, typename... Out>
+future<> sink_impl<Serializer, Out...>::operator()(const Out&... args) {
+ // note that we use the remote serializer pointer, so if the serializer needs
+ // state it should have a per-cpu one
+ snd_buf data = marshall(this->_con->get()->template serializer<Serializer>(), 4, args...);
+ static_assert(snd_buf::chunk_size >= 4, "send buffer chunk size is too small");
+ auto p = data.front().get_write();
+ write_le<uint32_t>(p, data.size - 4);
+ // we do not want to deadlock on huge packets, so let them in,
+ // but only one at a time
+ auto size = std::min(size_t(data.size), max_stream_buffers_memory);
+ return get_units(this->_sem, size).then([this, data = make_foreign(std::make_unique<snd_buf>(std::move(data)))] (semaphore_units<> su) mutable {
+ if (this->_ex) {
+ return make_exception_future(this->_ex);
+ }
+ // It is OK to discard this future. The user is required to
+ // wait for it when closing.
+ (void)smp::submit_to(this->_con->get_owner_shard(), [this, data = std::move(data)] () mutable {
+ connection* con = this->_con->get();
+ if (con->error()) {
+ return make_exception_future(closed_error());
+ }
+ if(con->sink_closed()) {
+ return make_exception_future(stream_closed());
+ }
+ return con->send(make_shard_local_buffer_copy(std::move(data)), {}, nullptr);
+ }).then_wrapped([su = std::move(su), this] (future<> f) {
+ if (f.failed() && !this->_ex) { // first error is the interesting one
+ this->_ex = f.get_exception();
+ } else {
+ f.ignore_ready_future();
+ }
+ });
+ return make_ready_future<>();
+ });
+}
+
+template<typename Serializer, typename... Out>
+future<> sink_impl<Serializer, Out...>::flush() {
+ // wait until everything is sent out before returning.
+ return with_semaphore(this->_sem, max_stream_buffers_memory, [this] {
+ if (this->_ex) {
+ return make_exception_future(this->_ex);
+ }
+ return make_ready_future();
+ });
+}
+
+template<typename Serializer, typename... Out>
+future<> sink_impl<Serializer, Out...>::close() {
+ return with_semaphore(this->_sem, max_stream_buffers_memory, [this] {
+ return smp::submit_to(this->_con->get_owner_shard(), [this] {
+ connection* con = this->_con->get();
+ if (con->sink_closed()) { // double close, should not happen!
+ return make_exception_future(stream_closed());
+ }
+ future<> f = make_ready_future<>();
+ if (!con->error() && !this->_ex) {
+ snd_buf data = marshall(con->template serializer<Serializer>(), 4);
+ static_assert(snd_buf::chunk_size >= 4, "send buffer chunk size is too small");
+ auto p = data.front().get_write();
+ write_le<uint32_t>(p, -1U); // max len fragment marks an end of a stream
+ f = con->send(std::move(data), {}, nullptr);
+ } else {
+ f = this->_ex ? make_exception_future(this->_ex) : make_exception_future(closed_error());
+ }
+ return f.finally([con] { return con->close_sink(); });
+ });
+ });
+}
+
+template<typename Serializer, typename... Out>
+sink_impl<Serializer, Out...>::~sink_impl() {
+ // A failure to close might leave some continuations running after
+ // this is destroyed, leading to use-after-free bugs.
+ assert(this->_con->get()->sink_closed());
+}
+
+template<typename Serializer, typename... In>
+future<std::optional<std::tuple<In...>>> source_impl<Serializer, In...>::operator()() {
+ auto process_one_buffer = [this] {
+ foreign_ptr<std::unique_ptr<rcv_buf>> buf = std::move(this->_bufs.front());
+ this->_bufs.pop_front();
+ return std::apply([] (In&&... args) {
+ auto ret = std::make_optional(std::make_tuple(std::move(args)...));
+ return make_ready_future<std::optional<std::tuple<In...>>>(std::move(ret));
+ }, unmarshall<Serializer, In...>(*this->_con->get(), make_shard_local_buffer_copy(std::move(buf))));
+ };
+
+ if (!this->_bufs.empty()) {
+ return process_one_buffer();
+ }
+
+ // refill buffers from remote cpu
+ return smp::submit_to(this->_con->get_owner_shard(), [this] () -> future<> {
+ connection* con = this->_con->get();
+ if (con->_source_closed) {
+ return make_exception_future<>(stream_closed());
+ }
+ return con->stream_receive(this->_bufs).then_wrapped([this, con] (future<>&& f) {
+ if (f.failed()) {
+ return con->close_source().then_wrapped([ex = f.get_exception()] (future<> f){
+ f.ignore_ready_future();
+ return make_exception_future<>(ex);
+ });
+ }
+ if (this->_bufs.empty()) { // nothing to read -> eof
+ return con->close_source().then_wrapped([] (future<> f) {
+ f.ignore_ready_future();
+ return make_ready_future<>();
+ });
+ }
+ return make_ready_future<>();
+ });
+ }).then([this, process_one_buffer] () {
+ if (this->_bufs.empty()) {
+ return make_ready_future<std::optional<std::tuple<In...>>>(std::nullopt);
+ } else {
+ return process_one_buffer();
+ }
+ });
+}
+
+template<typename... Out>
+connection_id sink<Out...>::get_id() const {
+ return _impl->_con->get()->get_connection_id();
+}
+
+template<typename... In>
+connection_id source<In...>::get_id() const {
+ return _impl->_con->get()->get_connection_id();
+}
+
+template<typename... In>
+template<typename Serializer, typename... Out>
+sink<Out...> source<In...>::make_sink() {
+ return sink<Out...>(make_shared<sink_impl<Serializer, Out...>>(_impl->_con));
+}
+
+}
+
+}
+
+namespace std {
+template<>
+struct hash<seastar::rpc::streaming_domain_type> {
+ size_t operator()(const seastar::rpc::streaming_domain_type& domain) const {
+ size_t h = 0;
+ boost::hash_combine(h, std::hash<uint64_t>{}(domain._id));
+ return h;
+ }
+};
+}
+
+
diff --git a/src/seastar/include/seastar/rpc/rpc_types.hh b/src/seastar/include/seastar/rpc/rpc_types.hh
new file mode 100644
index 000000000..b411bbd37
--- /dev/null
+++ b/src/seastar/include/seastar/rpc/rpc_types.hh
@@ -0,0 +1,386 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <seastar/net/api.hh>
+#include <stdexcept>
+#include <string>
+#include <boost/any.hpp>
+#include <boost/type.hpp>
+#include <seastar/util/std-compat.hh>
+#include <seastar/util/variant_utils.hh>
+#include <seastar/core/timer.hh>
+#include <seastar/core/circular_buffer.hh>
+#include <seastar/core/simple-stream.hh>
+#include <seastar/core/lowres_clock.hh>
+#include <boost/functional/hash.hpp>
+#include <seastar/core/sharded.hh>
+
+namespace seastar {
+
+namespace rpc {
+
+using rpc_clock_type = lowres_clock;
+
+// used to tag a type for serializers
+template<typename T>
+using type = boost::type<T>;
+
+struct stats {
+ using counter_type = uint64_t;
+ counter_type replied = 0;
+ counter_type pending = 0;
+ counter_type exception_received = 0;
+ counter_type sent_messages = 0;
+ counter_type wait_reply = 0;
+ counter_type timeout = 0;
+};
+
+
+struct client_info {
+ socket_address addr;
+ std::unordered_map<sstring, boost::any> user_data;
+ template <typename T>
+ void attach_auxiliary(const sstring& key, T&& object) {
+ user_data.emplace(key, boost::any(std::forward<T>(object)));
+ }
+ template <typename T>
+ T& retrieve_auxiliary(const sstring& key) {
+ auto it = user_data.find(key);
+ assert(it != user_data.end());
+ return boost::any_cast<T&>(it->second);
+ }
+ template <typename T>
+ typename std::add_const<T>::type& retrieve_auxiliary(const sstring& key) const {
+ return const_cast<client_info*>(this)->retrieve_auxiliary<typename std::add_const<T>::type>(key);
+ }
+};
+
+class error : public std::runtime_error {
+public:
+ error(const std::string& msg) : std::runtime_error(msg) {}
+};
+
+class closed_error : public error {
+public:
+ closed_error() : error("connection is closed") {}
+};
+
+class timeout_error : public error {
+public:
+ timeout_error() : error("rpc call timed out") {}
+};
+
+class unknown_verb_error : public error {
+public:
+ uint64_t type;
+ unknown_verb_error(uint64_t type_) : error("unknown verb"), type(type_) {}
+};
+
+class unknown_exception_error : public error {
+public:
+ unknown_exception_error() : error("unknown exception") {}
+};
+
+class rpc_protocol_error : public error {
+public:
+ rpc_protocol_error() : error("rpc protocol exception") {}
+};
+
+class canceled_error : public error {
+public:
+ canceled_error() : error("rpc call was canceled") {}
+};
+
+class stream_closed : public error {
+public:
+ stream_closed() : error("rpc stream was closed by peer") {}
+};
+
+struct no_wait_type {};
+
+// return this from a callback if the client does not want to wait for a reply
+extern no_wait_type no_wait;
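+
+// For example, a handler for a one-way message might look like this (a sketch;
+// `proto`, the verb id and the handler body are illustrative):
+//
+//   proto.register_handler(1, [] (int arg) {
+//       handle(arg);      // illustrative work
+//       return no_wait;   // tell rpc not to send a reply
+//   });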
+
+/// \addtogroup rpc
+/// @{
+
+template <typename T>
+class optional : public std::optional<T> {
+public:
+ using std::optional<T>::optional;
+};
+
+class opt_time_point : public std::optional<rpc_clock_type::time_point> {
+public:
+ using std::optional<rpc_clock_type::time_point>::optional;
+ opt_time_point(std::optional<rpc_clock_type::time_point> time_point) {
+ static_cast<std::optional<rpc_clock_type::time_point>&>(*this) = time_point;
+ }
+};
+
+/// @}
+
+struct cancellable {
+ std::function<void()> cancel_send;
+ std::function<void()> cancel_wait;
+ cancellable** send_back_pointer = nullptr;
+ cancellable** wait_back_pointer = nullptr;
+ cancellable() = default;
+ cancellable(cancellable&& x) : cancel_send(std::move(x.cancel_send)), cancel_wait(std::move(x.cancel_wait)), send_back_pointer(x.send_back_pointer), wait_back_pointer(x.wait_back_pointer) {
+ if (send_back_pointer) {
+ *send_back_pointer = this;
+ x.send_back_pointer = nullptr;
+ }
+ if (wait_back_pointer) {
+ *wait_back_pointer = this;
+ x.wait_back_pointer = nullptr;
+ }
+ }
+ cancellable& operator=(cancellable&& x) {
+ if (&x != this) {
+ this->~cancellable();
+ new (this) cancellable(std::move(x));
+ }
+ return *this;
+ }
+ void cancel() {
+ if (cancel_send) {
+ cancel_send();
+ }
+ if (cancel_wait) {
+ cancel_wait();
+ }
+ }
+ ~cancellable() {
+ cancel();
+ }
+};
+
+struct rcv_buf {
+ uint32_t size = 0;
+ std::optional<semaphore_units<>> su;
+ std::variant<std::vector<temporary_buffer<char>>, temporary_buffer<char>> bufs;
+ using iterator = std::vector<temporary_buffer<char>>::iterator;
+ rcv_buf() {}
+ explicit rcv_buf(size_t size_) : size(size_) {}
+ explicit rcv_buf(temporary_buffer<char> b) : size(b.size()), bufs(std::move(b)) {};
+ explicit rcv_buf(std::vector<temporary_buffer<char>> bufs, size_t size)
+ : size(size), bufs(std::move(bufs)) {};
+};
+
+struct snd_buf {
+ // Preferred, but not required, chunk size.
+ static constexpr size_t chunk_size = 128*1024;
+ uint32_t size = 0;
+ std::variant<std::vector<temporary_buffer<char>>, temporary_buffer<char>> bufs;
+ using iterator = std::vector<temporary_buffer<char>>::iterator;
+ snd_buf() {}
+ snd_buf(snd_buf&&) noexcept;
+ snd_buf& operator=(snd_buf&&) noexcept;
+ explicit snd_buf(size_t size_);
+ explicit snd_buf(temporary_buffer<char> b) : size(b.size()), bufs(std::move(b)) {};
+
+ explicit snd_buf(std::vector<temporary_buffer<char>> bufs, size_t size)
+ : size(size), bufs(std::move(bufs)) {};
+
+ temporary_buffer<char>& front();
+};
+
+static inline memory_input_stream<rcv_buf::iterator> make_deserializer_stream(rcv_buf& input) {
+ auto* b = std::get_if<temporary_buffer<char>>(&input.bufs);
+ if (b) {
+ return memory_input_stream<rcv_buf::iterator>(memory_input_stream<rcv_buf::iterator>::simple(b->begin(), b->size()));
+ } else {
+ auto& ar = std::get<std::vector<temporary_buffer<char>>>(input.bufs);
+ return memory_input_stream<rcv_buf::iterator>(memory_input_stream<rcv_buf::iterator>::fragmented(ar.begin(), input.size));
+ }
+}
+
+class compressor {
+public:
+ virtual ~compressor() {}
+ // compress data and leave head_space bytes at the beginning of returned buffer
+ virtual snd_buf compress(size_t head_space, snd_buf data) = 0;
+ // decompress data
+ virtual rcv_buf decompress(rcv_buf data) = 0;
+ virtual sstring name() const = 0;
+
+ // factory to create compressor for a connection
+ class factory {
+ public:
+ virtual ~factory() {}
+ // return feature string that will be sent as part of protocol negotiation
+ virtual const sstring& supported() const = 0;
+ // negotiate compress algorithm
+ virtual std::unique_ptr<compressor> negotiate(sstring feature, bool is_server) const = 0;
+ };
+};
+
+class connection;
+
+struct connection_id {
+ uint64_t id;
+ bool operator==(const connection_id& o) const {
+ return id == o.id;
+ }
+ operator bool() const {
+ return shard() != 0xffff;
+ }
+ size_t shard() const {
+ return size_t(id & 0xffff);
+ }
+ constexpr static connection_id make_invalid_id(uint64_t id = 0) {
+ return make_id(id, 0xffff);
+ }
+ constexpr static connection_id make_id(uint64_t id, uint16_t shard) {
+ return {id << 16 | shard};
+ }
+};
+
+constexpr connection_id invalid_connection_id = connection_id::make_invalid_id();
+
+std::ostream& operator<<(std::ostream&, const connection_id&);
+
+using xshard_connection_ptr = lw_shared_ptr<foreign_ptr<shared_ptr<connection>>>;
+constexpr size_t max_queued_stream_buffers = 50;
+constexpr size_t max_stream_buffers_memory = 100 * 1024;
+
+/// \addtogroup rpc
+/// @{
+
+// send data Out...
+template<typename... Out>
+class sink {
+public:
+ class impl {
+ protected:
+ xshard_connection_ptr _con;
+ semaphore _sem;
+ std::exception_ptr _ex;
+ impl(xshard_connection_ptr con) : _con(std::move(con)), _sem(max_stream_buffers_memory) {}
+ public:
+ virtual ~impl() {};
+ virtual future<> operator()(const Out&... args) = 0;
+ virtual future<> close() = 0;
+ virtual future<> flush() = 0;
+ friend sink;
+ };
+
+private:
+ shared_ptr<impl> _impl;
+
+public:
+ sink(shared_ptr<impl> impl) : _impl(std::move(impl)) {}
+ future<> operator()(const Out&... args) {
+ return _impl->operator()(args...);
+ }
+ future<> close() {
+ return _impl->close();
+ }
+ // Calling this function makes sure that any data buffered
+ // by the stream sink will be flushed to the network.
+ // It does not mean the data was received by the corresponding
+ // source.
+ future<> flush() {
+ return _impl->flush();
+ }
+ connection_id get_id() const;
+};
+
+// receive data In...
+template<typename... In>
+class source {
+public:
+ class impl {
+ protected:
+ xshard_connection_ptr _con;
+ circular_buffer<foreign_ptr<std::unique_ptr<rcv_buf>>> _bufs;
+ impl(xshard_connection_ptr con) : _con(std::move(con)) {
+ _bufs.reserve(max_queued_stream_buffers);
+ }
+ public:
+ virtual ~impl() {}
+ virtual future<std::optional<std::tuple<In...>>> operator()() = 0;
+ friend source;
+ };
+private:
+ shared_ptr<impl> _impl;
+
+public:
+ source(shared_ptr<impl> impl) : _impl(std::move(impl)) {}
+ future<std::optional<std::tuple<In...>>> operator()() {
+ return _impl->operator()();
+ };
+ connection_id get_id() const;
+ template<typename Serializer, typename... Out> sink<Out...> make_sink();
+};
+
+/// Used to return multiple values in rpc without variadic futures
+///
+/// If you wish to return multiple values from an rpc procedure, use a
+/// signature `future<rpc::tuple<return type list>> (argument list)`. This
+/// will be marshalled by rpc, so you do not need to have your Serializer
+/// serialize/deserialize this tuple type. The serialization format is
+/// compatible with the deprecated variadic future support, and is compatible
+/// with adding new return types in a backwards compatible way provided new
+/// parameters are appended only, and wrapped with rpc::optional:
+/// `future<rpc::tuple<existing return type list, rpc::optional<new_return_type>>> (argument list)`
+///
+/// You may also use another tuple type, such as std::tuple. In this case,
+/// your Serializer type must recognize your tuple type and provide serialization
+/// and deserialization for it.
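+///
+/// A sketch of a handler returning two values (assuming an rpc::protocol
+/// instance `proto`; the verb id and the body are illustrative):
+/// \code
+/// proto.register_handler(7, [] (int x) {
+///     return make_ready_future<rpc::tuple<int, long>>(rpc::tuple<int, long>(x, 2L));
+/// });
+/// \endcode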
+template <typename... T>
+class tuple : public std::tuple<T...> {
+public:
+ using std::tuple<T...>::tuple;
+ tuple(std::tuple<T...>&& x) : std::tuple<T...>(std::move(x)) {}
+};
+
+/// @}
+
+template <typename... T>
+tuple(T&&...) -> tuple<T...>;
+
+} // namespace rpc
+
+}
+
+namespace std {
+template<>
+struct hash<seastar::rpc::connection_id> {
+ size_t operator()(const seastar::rpc::connection_id& id) const {
+ size_t h = 0;
+ boost::hash_combine(h, std::hash<uint64_t>{}(id.id));
+ return h;
+ }
+};
+
+template <typename... T>
+struct tuple_size<seastar::rpc::tuple<T...>> : tuple_size<tuple<T...>> {
+};
+
+template <size_t I, typename... T>
+struct tuple_element<I, seastar::rpc::tuple<T...>> : tuple_element<I, tuple<T...>> {
+};
+
+}
diff --git a/src/seastar/include/seastar/testing/entry_point.hh b/src/seastar/include/seastar/testing/entry_point.hh
new file mode 100644
index 000000000..f352db11d
--- /dev/null
+++ b/src/seastar/include/seastar/testing/entry_point.hh
@@ -0,0 +1,33 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (C) 2018 ScyllaDB Ltd.
+ */
+
+#pragma once
+
+namespace seastar {
+
+namespace testing {
+
+int entry_point(int argc, char **argv);
+
+}
+
+} \ No newline at end of file
diff --git a/src/seastar/include/seastar/testing/exchanger.hh b/src/seastar/include/seastar/testing/exchanger.hh
new file mode 100644
index 000000000..6b91170ee
--- /dev/null
+++ b/src/seastar/include/seastar/testing/exchanger.hh
@@ -0,0 +1,88 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <mutex>
+#include <condition_variable>
+#include <seastar/util/std-compat.hh>
+
+namespace seastar {
+
+namespace testing {
+
+class exchanger_base {
+protected:
+ exchanger_base();
+ ~exchanger_base();
+ std::mutex _mutex;
+ std::condition_variable _cv;
+ std::exception_ptr _exception;
+ void interrupt_ptr(std::exception_ptr e) {
+ std::unique_lock<std::mutex> lock(_mutex);
+ if (!_exception) {
+ _exception = e;
+ _cv.notify_all();
+ }
+ // FIXME: log if already interrupted
+ }
+};
+
+// Single-element blocking queue
+template <typename T>
+class exchanger : public exchanger_base {
+private:
+ std::optional<T> _element;
+
+public:
+ template <typename Exception>
+ void interrupt(Exception e) {
+ try {
+ throw e;
+ } catch (...) {
+ interrupt_ptr(std::current_exception());
+ }
+ }
+ void give(T value) {
+ std::unique_lock<std::mutex> lock(_mutex);
+ _cv.wait(lock, [this] { return !_element || _exception; });
+ if (_exception) {
+ std::rethrow_exception(_exception);
+ }
+ _element = value;
+ _cv.notify_one();
+ }
+ T take() {
+ std::unique_lock<std::mutex> lock(_mutex);
+ _cv.wait(lock, [this] { return bool(_element) || _exception; });
+ if (_exception) {
+ std::rethrow_exception(_exception);
+ }
+ auto v = *_element;
+ _element = {};
+ _cv.notify_one();
+ return v;
+ }
+};
+
+}
+
+}
diff --git a/src/seastar/include/seastar/testing/on_internal_error.hh b/src/seastar/include/seastar/testing/on_internal_error.hh
new file mode 100644
index 000000000..f40943235
--- /dev/null
+++ b/src/seastar/include/seastar/testing/on_internal_error.hh
@@ -0,0 +1,40 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2020 ScyllaDB
+ */
+
+#pragma once
+
+namespace seastar {
+namespace testing {
+
+// Disables aborting in on_internal_error() for a scope.
+//
+// Intended for tests, which want to test error paths that invoke
+// on_internal_error() without aborting, at the same time, having it enabled
+// for other, indirectly affected code paths, that are not a direct target of
+// the test.
+class scoped_no_abort_on_internal_error {
+public:
+ scoped_no_abort_on_internal_error();
+ ~scoped_no_abort_on_internal_error();
+};
+
+}
+}
diff --git a/src/seastar/include/seastar/testing/perf_tests.hh b/src/seastar/include/seastar/testing/perf_tests.hh
new file mode 100644
index 000000000..d225310ba
--- /dev/null
+++ b/src/seastar/include/seastar/testing/perf_tests.hh
@@ -0,0 +1,259 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2018 ScyllaDB Ltd.
+ */
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+
+#include <fmt/format.h>
+
+#include <seastar/core/future.hh>
+#include <seastar/core/loop.hh>
+
+using namespace seastar;
+
+namespace perf_tests {
+namespace internal {
+
+struct config;
+
+using clock_type = std::chrono::steady_clock;
+
+class performance_test {
+ std::string _test_case;
+ std::string _test_group;
+
+ uint64_t _single_run_iterations = 0;
+ std::atomic<uint64_t> _max_single_run_iterations;
+private:
+ void do_run(const config&);
+protected:
+ [[gnu::always_inline]] [[gnu::hot]]
+ bool stop_iteration() const {
+ return _single_run_iterations >= _max_single_run_iterations.load(std::memory_order_relaxed);
+ }
+
+ [[gnu::always_inline]] [[gnu::hot]]
+ void next_iteration(size_t n) {
+ _single_run_iterations += n;
+ }
+
+ virtual void set_up() = 0;
+ virtual void tear_down() noexcept = 0;
+ virtual future<clock_type::duration> do_single_run() = 0;
+public:
+ performance_test(const std::string& test_case, const std::string& test_group)
+ : _test_case(test_case)
+ , _test_group(test_group)
+ { }
+
+ virtual ~performance_test() = default;
+
+ const std::string& test_case() const { return _test_case; }
+ const std::string& test_group() const { return _test_group; }
+ std::string name() const { return fmt::format("{}.{}", test_group(), test_case()); }
+
+ void run(const config&);
+public:
+ static void register_test(std::unique_ptr<performance_test>);
+};
+
+// Helper for measuring time.
+// Each microbenchmark can either use the default behaviour, which measures
+// only the start and stop time of the whole run, or manually invoke
+// start_measuring_time() and stop_measuring_time() in order to measure
+// only parts of each iteration.
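+//
+// For example, to time only the operation of interest in each iteration
+// (a sketch; make_input() and parse() are illustrative stand-ins):
+//
+//   PERF_TEST(example, parse_only)
+//   {
+//       auto input = make_input();
+//       perf_tests::start_measuring_time();
+//       parse(input);
+//       perf_tests::stop_measuring_time();
+//   }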
+class time_measurement {
+ clock_type::time_point _run_start_time;
+ clock_type::time_point _start_time;
+ clock_type::duration _total_time;
+public:
+ [[gnu::always_inline]] [[gnu::hot]]
+ void start_run() {
+ _total_time = { };
+ auto t = clock_type::now();
+ _run_start_time = t;
+ _start_time = t;
+ }
+
+ [[gnu::always_inline]] [[gnu::hot]]
+ clock_type::duration stop_run() {
+ auto t = clock_type::now();
+ if (_start_time == _run_start_time) {
+ return t - _start_time;
+ }
+ return _total_time;
+ }
+
+ [[gnu::always_inline]] [[gnu::hot]]
+ void start_iteration() {
+ _start_time = clock_type::now();
+ }
+
+ [[gnu::always_inline]] [[gnu::hot]]
+ void stop_iteration() {
+ auto t = clock_type::now();
+ _total_time += t - _start_time;
+ }
+};
+
+extern time_measurement measure_time;
+
+namespace {
+
+template<bool Condition, typename TrueFn, typename FalseFn>
+struct do_if_constexpr_ : FalseFn {
+ do_if_constexpr_(TrueFn, FalseFn false_fn) : FalseFn(std::move(false_fn)) { }
+ decltype(auto) operator()() const {
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64095
+ return FalseFn::operator()(0);
+ }
+};
+template<typename TrueFn, typename FalseFn>
+struct do_if_constexpr_<true, TrueFn, FalseFn> : TrueFn {
+ do_if_constexpr_(TrueFn true_fn, FalseFn) : TrueFn(std::move(true_fn)) { }
+ decltype(auto) operator()() const { return TrueFn::operator()(0); }
+};
+
+template<bool Condition, typename TrueFn, typename FalseFn>
+do_if_constexpr_<Condition, TrueFn, FalseFn> if_constexpr_(TrueFn&& true_fn, FalseFn&& false_fn)
+{
+ return do_if_constexpr_<Condition, TrueFn, FalseFn>(std::forward<TrueFn>(true_fn),
+ std::forward<FalseFn>(false_fn));
+}
+
+}
+
+template<typename Test>
+class concrete_performance_test final : public performance_test {
+ std::optional<Test> _test;
+private:
+ template<typename... Args>
+ auto run_test(Args&&...) {
+ return _test->run();
+ }
+
+protected:
+ virtual void set_up() override {
+ _test.emplace();
+ }
+
+ virtual void tear_down() noexcept override {
+ _test = std::nullopt;
+ }
+
+ [[gnu::hot]]
+ virtual future<clock_type::duration> do_single_run() override {
+ // Redundant 'this->'s courtesy of https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61636
+ return if_constexpr_<is_future<decltype(_test->run())>::value>([&] (auto&&...) {
+ measure_time.start_run();
+ return do_until([this] { return this->stop_iteration(); }, [this] {
+ return if_constexpr_<std::is_same<decltype(_test->run()), future<>>::value>([&] (auto&&...) {
+ this->next_iteration(1);
+ return _test->run();
+ }, [&] (auto&&... dependency) {
+ // We need `dependency` to make sure the compiler won't be able to instantiate anything
+ // (and notice that the code does not compile) if this part of if_constexpr_ is not active.
+ return run_test(dependency...).then([&] (size_t n) {
+ this->next_iteration(n);
+ });
+ })();
+ }).then([] {
+ return measure_time.stop_run();
+ });
+ }, [&] (auto&&...) {
+ measure_time.start_run();
+ while (!stop_iteration()) {
+ if_constexpr_<std::is_void<decltype(_test->run())>::value>([&] (auto&&...) {
+ (void)_test->run();
+ this->next_iteration(1);
+ }, [&] (auto&&... dependency) {
+ // We need `dependency` to make sure the compiler won't be able to instantiate anything
+ // (and notice that the code does not compile) if this part of if_constexpr_ is not active.
+ this->next_iteration(run_test(dependency...));
+ })();
+ }
+ return make_ready_future<clock_type::duration>(measure_time.stop_run());
+ })();
+ }
+public:
+ using performance_test::performance_test;
+};
+
+void register_test(std::unique_ptr<performance_test>);
+
+template<typename Test>
+struct test_registrar {
+ test_registrar(const std::string& test_group, const std::string& test_case) {
+ auto test = std::make_unique<concrete_performance_test<Test>>(test_case, test_group);
+ performance_test::register_test(std::move(test));
+ }
+};
+
+}
+
+[[gnu::always_inline]]
+inline void start_measuring_time()
+{
+ internal::measure_time.start_iteration();
+}
+
+[[gnu::always_inline]]
+inline void stop_measuring_time()
+{
+ internal::measure_time.stop_iteration();
+}
+
+
+template<typename T>
+void do_not_optimize(const T& v)
+{
+ asm volatile("" : : "r,m" (v));
+}
+
+}
+
+// PERF_TEST and PERF_TEST_F support both synchronous and asynchronous functions.
+// The former should return `void`, the latter `future<>`.
+//
+// Test cases may perform multiple operations in a single run; this may be desirable
+// if the cost of an individual operation is very small. This allows measuring either
+// the latency or the throughput, depending on how the test is written. In such cases,
+// the test function shall return either size_t or future<size_t> for the synchronous and
+// asynchronous cases respectively. The returned value shall be the number of iterations
+// done in a single test run (see the sketch after the macro definitions below).
+
+#define PERF_TEST_F(test_group, test_case) \
+ struct test_##test_group##_##test_case : test_group { \
+ [[gnu::always_inline]] inline auto run(); \
+ }; \
+ static ::perf_tests::internal::test_registrar<test_##test_group##_##test_case> \
+ test_##test_group##_##test_case##_registrar(#test_group, #test_case); \
+ [[gnu::always_inline]] auto test_##test_group##_##test_case::run()
+
+#define PERF_TEST(test_group, test_case) \
+ struct test_##test_group##_##test_case { \
+ [[gnu::always_inline]] inline auto run(); \
+ }; \
+ static ::perf_tests::internal::test_registrar<test_##test_group##_##test_case> \
+ test_##test_group##_##test_case##_registrar(#test_group, #test_case); \
+ [[gnu::always_inline]] auto test_##test_group##_##test_case::run()
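+
+// A minimal sketch of a synchronous test that performs several operations per
+// run and reports the iteration count (group/case names and the body are
+// illustrative):
+//
+//   PERF_TEST(example, string_append)
+//   {
+//       std::string s;
+//       for (int i = 0; i < 100; ++i) {
+//           s += 'x';
+//       }
+//       perf_tests::do_not_optimize(s);
+//       return size_t(100);
+//   }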
diff --git a/src/seastar/include/seastar/testing/seastar_test.hh b/src/seastar/include/seastar/testing/seastar_test.hh
new file mode 100644
index 000000000..4bb848f88
--- /dev/null
+++ b/src/seastar/include/seastar/testing/seastar_test.hh
@@ -0,0 +1,68 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <vector>
+
+#include <boost/test/unit_test.hpp>
+
+#include <seastar/core/future.hh>
+#include <seastar/util/std-compat.hh>
+#include <seastar/testing/entry_point.hh>
+
+namespace seastar {
+
+namespace testing {
+
+class seastar_test {
+public:
+ seastar_test();
+ virtual ~seastar_test() {}
+ virtual const char* get_test_file() = 0;
+ virtual const char* get_name() = 0;
+ virtual int get_expected_failures() { return 0; }
+ virtual future<> run_test_case() = 0;
+ void run();
+};
+
+const std::vector<seastar_test*>& known_tests();
+
+// BOOST_REQUIRE_EXCEPTION predicates
+namespace exception_predicate {
+
+std::function<bool(const std::exception&)> message_equals(std::string_view expected_message);
+std::function<bool(const std::exception&)> message_contains(std::string_view expected_message);
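+
+// A sketch of use with BOOST_REQUIRE_EXCEPTION (parse_config() is an
+// illustrative stand-in for the expression under test):
+//
+//   BOOST_REQUIRE_EXCEPTION(parse_config("bad"), std::runtime_error,
+//       exception_predicate::message_contains("bad"));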
+
+} // exception_predicate
+
+}
+
+}
+
+#ifdef SEASTAR_TESTING_MAIN
+
+int main(int argc, char** argv) {
+ return seastar::testing::entry_point(argc, argv);
+}
+
+#endif // SEASTAR_TESTING_MAIN \ No newline at end of file
diff --git a/src/seastar/include/seastar/testing/test_case.hh b/src/seastar/include/seastar/testing/test_case.hh
new file mode 100644
index 000000000..65bf48341
--- /dev/null
+++ b/src/seastar/include/seastar/testing/test_case.hh
@@ -0,0 +1,36 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (C) 2018 ScyllaDB Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+
+#include <seastar/testing/seastar_test.hh>
+
+#define SEASTAR_TEST_CASE(name) \
+ struct name : public seastar::testing::seastar_test { \
+ const char* get_test_file() override { return __FILE__; } \
+ const char* get_name() override { return #name; } \
+ seastar::future<> run_test_case() override; \
+ }; \
+ static name name ## _instance; \
+ seastar::future<> name::run_test_case()
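+
+// A minimal sketch of a test defined with this macro (the body is illustrative):
+//
+//   SEASTAR_TEST_CASE(smoke_test) {
+//       BOOST_REQUIRE_EQUAL(1 + 1, 2);
+//       return seastar::make_ready_future<>();
+//   }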
diff --git a/src/seastar/include/seastar/testing/test_runner.hh b/src/seastar/include/seastar/testing/test_runner.hh
new file mode 100644
index 000000000..17d032977
--- /dev/null
+++ b/src/seastar/include/seastar/testing/test_runner.hh
@@ -0,0 +1,59 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <memory>
+#include <functional>
+#include <atomic>
+#include <random>
+#include <seastar/core/future.hh>
+#include <seastar/core/posix.hh>
+#include <seastar/testing/exchanger.hh>
+
+namespace seastar {
+
+namespace testing {
+
+extern thread_local std::default_random_engine local_random_engine;
+
+class test_runner {
+private:
+ std::unique_ptr<posix_thread> _thread;
+ std::atomic<bool> _started{false};
+ exchanger<std::function<future<>()>> _task;
+ bool _done = false;
+ int _exit_code{0};
+public:
+ // Returns whether initialization was successful.
+ // Returns as soon as the seastar::app has started.
+ bool start(int argc, char** argv);
+ ~test_runner();
+ void run_sync(std::function<future<>()> task);
+ // Returns the return value of the underlying `seastar::app::run()`.
+ int finalize();
+};
+
+test_runner& global_test_runner();
+
+}
+
+}
diff --git a/src/seastar/include/seastar/testing/thread_test_case.hh b/src/seastar/include/seastar/testing/thread_test_case.hh
new file mode 100644
index 000000000..3d94a19bb
--- /dev/null
+++ b/src/seastar/include/seastar/testing/thread_test_case.hh
@@ -0,0 +1,46 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (C) 2018 ScyllaDB Ltd.
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/thread.hh>
+
+#include <seastar/testing/seastar_test.hh>
+
+#define SEASTAR_THREAD_TEST_CASE_EXPECTED_FAILURES(name, failures) \
+ struct name : public seastar::testing::seastar_test { \
+ const char* get_test_file() override { return __FILE__; } \
+ const char* get_name() override { return #name; } \
+ int get_expected_failures() override { return failures; } \
+ seastar::future<> run_test_case() override { \
+ return seastar::async([this] { \
+ do_run_test_case(); \
+ }); \
+ } \
+ void do_run_test_case(); \
+ }; \
+ static name name ## _instance; \
+ void name::do_run_test_case()
+
+#define SEASTAR_THREAD_TEST_CASE(name) \
+ SEASTAR_THREAD_TEST_CASE_EXPECTED_FAILURES(name, 0)
diff --git a/src/seastar/include/seastar/util/alloc_failure_injector.hh b/src/seastar/include/seastar/util/alloc_failure_injector.hh
new file mode 100644
index 000000000..fc43b4dc3
--- /dev/null
+++ b/src/seastar/include/seastar/util/alloc_failure_injector.hh
@@ -0,0 +1,132 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2017 ScyllaDB
+ */
+
+#pragma once
+
+#include <limits>
+#include <cstdint>
+#include <functional>
+#include <seastar/util/noncopyable_function.hh>
+#include <seastar/util/critical_alloc_section.hh>
+
+namespace seastar {
+namespace memory {
+
+///
+/// Allocation failure injection framework. Allows testing for exception safety.
+///
+/// To exhaustively inject failure at every allocation point:
+///
+/// uint64_t i = 0;
+/// while (true) {
+/// try {
+/// local_failure_injector().fail_after(i++);
+/// code_under_test();
+/// local_failure_injector().cancel();
+/// break;
+/// } catch (const std::bad_alloc&) {
+/// // expected
+/// }
+/// }
+class alloc_failure_injector {
+ uint64_t _alloc_count;
+ uint64_t _fail_at = std::numeric_limits<uint64_t>::max();
+ noncopyable_function<void()> _on_alloc_failure = [] { throw std::bad_alloc(); };
+ bool _failed;
+private:
+ void fail();
+public:
+ /// \brief Marks a point in code which should be considered for failure injection.
+ void on_alloc_point() {
+ if (is_critical_alloc_section()) {
+ return;
+ }
+ if (_alloc_count >= _fail_at) {
+ fail();
+ }
+ ++_alloc_count;
+ }
+
+ /// Counts encountered allocation points which didn't fail and didn't have failure suppressed.
+ uint64_t alloc_count() const {
+ return _alloc_count;
+ }
+
+ /// Will cause the count-th allocation point from now on to fail, counting from 0.
+ void fail_after(uint64_t count) {
+ _fail_at = _alloc_count + count;
+ _failed = false;
+ }
+
+ /// Cancels the failure scheduled by fail_after().
+ void cancel() {
+ _fail_at = std::numeric_limits<uint64_t>::max();
+ }
+
+ /// Returns true iff an allocation has failed since the last call to fail_after().
+ bool failed() const {
+ return _failed;
+ }
+
+ /// Runs given function with a custom failure action instead of the default std::bad_alloc throw.
+ void run_with_callback(noncopyable_function<void()> callback, noncopyable_function<void()> to_run);
+};
+
+/// \cond internal
+extern thread_local alloc_failure_injector the_alloc_failure_injector;
+/// \endcond
+
+/// \brief Return the shard-local \ref alloc_failure_injector instance.
+inline
+alloc_failure_injector& local_failure_injector() {
+ return the_alloc_failure_injector;
+}
+
+#ifdef SEASTAR_ENABLE_ALLOC_FAILURE_INJECTION
+
+#ifdef SEASTAR_DEFAULT_ALLOCATOR
+#error SEASTAR_ENABLE_ALLOC_FAILURE_INJECTION is not supported when using SEASTAR_DEFAULT_ALLOCATOR
+#endif
+
+#endif
+
+
+struct [[deprecated("Use scoped_critical_section instead")]] disable_failure_guard {
+ scoped_critical_alloc_section cs;
+};
+
+/// \brief Marks a point in code which should be considered for failure injection.
+inline
+void on_alloc_point() {
+#ifdef SEASTAR_ENABLE_ALLOC_FAILURE_INJECTION
+ local_failure_injector().on_alloc_point();
+#endif
+}
+
+/// Repeatedly run func with allocation failures
+///
+/// Initially, allocations start to fail immediately. In each
+/// subsequent run the failures start one allocation later. This
+/// returns when func is run and no allocation failures are detected.
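+///
+/// A sketch of exercising an allocating snippet under failure injection (the
+/// body is illustrative):
+///
+///   with_allocation_failures([] {
+///       std::vector<int> v;
+///       v.push_back(1);
+///   });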
+void with_allocation_failures(noncopyable_function<void()> func);
+
+}
+}
diff --git a/src/seastar/include/seastar/util/attribute-compat.hh b/src/seastar/include/seastar/util/attribute-compat.hh
new file mode 100644
index 000000000..243d9b870
--- /dev/null
+++ b/src/seastar/include/seastar/util/attribute-compat.hh
@@ -0,0 +1,27 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2019 ScyllaDB
+ */
+#pragma once
+
+#if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard)
+ #define SEASTAR_NODISCARD [[nodiscard]]
+#else
+ #define SEASTAR_NODISCARD
+#endif
diff --git a/src/seastar/include/seastar/util/backtrace.hh b/src/seastar/include/seastar/util/backtrace.hh
new file mode 100644
index 000000000..885c85d68
--- /dev/null
+++ b/src/seastar/include/seastar/util/backtrace.hh
@@ -0,0 +1,230 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2016 ScyllaDB
+ */
+
+#pragma once
+
+#include <execinfo.h>
+#include <iosfwd>
+#include <variant>
+#include <boost/container/static_vector.hpp>
+
+#include <seastar/core/sstring.hh>
+#include <seastar/core/print.hh>
+#include <seastar/core/scheduling.hh>
+#include <seastar/core/shared_ptr.hh>
+
+namespace seastar {
+
+struct shared_object {
+ sstring name;
+ uintptr_t begin;
+ uintptr_t end; // C++-style, last addr + 1
+};
+
+struct frame {
+ const shared_object* so;
+ uintptr_t addr;
+};
+
+bool operator==(const frame& a, const frame& b);
+
+
+// If addr doesn't seem to belong to any of the provided shared objects, it
+// will be considered as part of the executable.
+frame decorate(uintptr_t addr);
+
+// Invokes func for each frame passing it as argument.
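+//
+// For example, to collect the raw frame addresses (a sketch):
+//
+//   std::vector<uintptr_t> addrs;
+//   backtrace([&] (frame f) { addrs.push_back(f.addr); });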
+template<typename Func>
+void backtrace(Func&& func) noexcept(noexcept(func(frame()))) {
+ constexpr size_t max_backtrace = 100;
+ void* buffer[max_backtrace];
+ int n = ::backtrace(buffer, max_backtrace);
+ for (int i = 0; i < n; ++i) {
+ auto ip = reinterpret_cast<uintptr_t>(buffer[i]);
+ func(decorate(ip - 1));
+ }
+}
+
+// Represents a call stack of a single thread.
+class simple_backtrace {
+public:
+ using vector_type = boost::container::static_vector<frame, 64>;
+private:
+ vector_type _frames;
+ size_t _hash;
+private:
+ size_t calculate_hash() const;
+public:
+ simple_backtrace() = default;
+ simple_backtrace(vector_type f) : _frames(std::move(f)), _hash(calculate_hash()) {}
+ size_t hash() const { return _hash; }
+
+ friend std::ostream& operator<<(std::ostream& out, const simple_backtrace&);
+
+ bool operator==(const simple_backtrace& o) const {
+ return _hash == o._hash && _frames == o._frames;
+ }
+
+ bool operator!=(const simple_backtrace& o) const {
+ return !(*this == o);
+ }
+};
+
+using shared_backtrace = seastar::lw_shared_ptr<simple_backtrace>;
+
+// Represents a task object inside a tasktrace.
+class task_entry {
+ const std::type_info* _task_type;
+public:
+ task_entry(const std::type_info& ti)
+ : _task_type(&ti)
+ { }
+
+ friend std::ostream& operator<<(std::ostream& out, const task_entry&);
+
+ bool operator==(const task_entry& o) const {
+ return *_task_type == *o._task_type;
+ }
+
+ bool operator!=(const task_entry& o) const {
+ return !(*this == o);
+ }
+
+ size_t hash() const { return _task_type->hash_code(); }
+};
+
+// Extended backtrace which consists of a backtrace of the currently running task
+// and information about the chain of tasks waiting for the current operation to complete.
+class tasktrace {
+public:
+ using entry = std::variant<shared_backtrace, task_entry>;
+ using vector_type = boost::container::static_vector<entry, 16>;
+private:
+ simple_backtrace _main;
+ vector_type _prev;
+ scheduling_group _sg;
+ size_t _hash;
+public:
+ tasktrace() = default;
+ tasktrace(simple_backtrace main, vector_type prev, size_t prev_hash, scheduling_group sg);
+ ~tasktrace();
+
+ size_t hash() const { return _hash; }
+
+ friend std::ostream& operator<<(std::ostream& out, const tasktrace&);
+
+ bool operator==(const tasktrace& o) const;
+
+ bool operator!=(const tasktrace& o) const {
+ return !(*this == o);
+ }
+};
+
+}
+
+namespace std {
+
+template<>
+struct hash<seastar::simple_backtrace> {
+ size_t operator()(const seastar::simple_backtrace& b) const {
+ return b.hash();
+ }
+};
+
+template<>
+struct hash<seastar::tasktrace> {
+ size_t operator()(const seastar::tasktrace& b) const {
+ return b.hash();
+ }
+};
+
+}
+
+namespace seastar {
+
+using saved_backtrace = tasktrace;
+
+saved_backtrace current_backtrace() noexcept;
+
+tasktrace current_tasktrace() noexcept;
+
+// Collects backtrace only within the currently executing task.
+simple_backtrace current_backtrace_tasklocal() noexcept;
+
+std::ostream& operator<<(std::ostream& out, const tasktrace& b);
+
+namespace internal {
+
+template<class Exc>
+class backtraced : public Exc {
+ std::shared_ptr<sstring> _backtrace;
+public:
+ template<typename... Args>
+ backtraced(Args&&... args)
+ : Exc(std::forward<Args>(args)...)
+ , _backtrace(std::make_shared<sstring>(format("{} Backtrace: {}", Exc::what(), current_backtrace()))) {}
+
+ /**
+ * Returns the original exception message with a backtrace appended to it
+ *
+ * @return original exception message followed by a backtrace
+ */
+ virtual const char* what() const noexcept override {
+ assert(_backtrace);
+ return _backtrace->c_str();
+ }
+};
+
+}
+
+
+/// Create an exception pointer of unspecified type that is derived from Exc type
+/// with a backtrace attached to its message.
+///
+/// \tparam Exc exception type to be caught at the receiving side
+/// \tparam Args types of arguments forwarded to the constructor of Exc
+/// \param args arguments forwarded to the constructor of Exc
+/// \return std::exception_ptr containing the exception with the backtrace.
+template <class Exc, typename... Args>
+std::exception_ptr make_backtraced_exception_ptr(Args&&... args) {
+ using exc_type = std::decay_t<Exc>;
+ static_assert(std::is_base_of<std::exception, exc_type>::value,
+ "throw_with_backtrace only works with exception types");
+ return std::make_exception_ptr<internal::backtraced<exc_type>>(Exc(std::forward<Args>(args)...));
+}
+
+ /**
+ * Throws an exception of unspecified type that is derived from the Exc type
+ * with a backtrace attached to its message
+ *
+ * @tparam Exc exception type to be caught at the receiving side
+ * @tparam Args types of arguments forwarded to the constructor of Exc
+ * @param args arguments forwarded to the constructor of Exc
+ * @return never returns (throws an exception)
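+ *
+ * A sketch (the condition and message are illustrative):
+ *
+ *   if (bytes_left == 0) {
+ *       throw_with_backtrace<std::runtime_error>("unexpected end of stream");
+ *   }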
+ */
+template <class Exc, typename... Args>
+[[noreturn]]
+void
+throw_with_backtrace(Args&&... args) {
+ std::rethrow_exception(make_backtraced_exception_ptr<Exc>(std::forward<Args>(args)...));
+};
+
+}
diff --git a/src/seastar/include/seastar/util/bool_class.hh b/src/seastar/include/seastar/util/bool_class.hh
new file mode 100644
index 000000000..6acadb0fd
--- /dev/null
+++ b/src/seastar/include/seastar/util/bool_class.hh
@@ -0,0 +1,110 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 ScyllaDB.
+ */
+
+#pragma once
+
+#include <ostream>
+
+namespace seastar {
+
+/// \addtogroup utilities
+/// @{
+
+/// \brief Type-safe boolean
+///
+/// bool_class objects are type-safe boolean values that cannot be implicitly
+/// cast to untyped bools, integers or different bool_class types, while still
+/// providing all relevant logical and comparison operators.
+///
+/// bool_class template parameter is a tag type that is going to be used to
+/// distinguish booleans of different types.
+///
+/// Usage examples:
+/// \code
+/// struct foo_tag { };
+/// using foo = bool_class<foo_tag>;
+///
+/// struct bar_tag { };
+/// using bar = bool_class<bar_tag>;
+///
+/// foo v1 = foo::yes; // OK
+/// bar v2 = foo::yes; // ERROR, no implicit cast
+/// foo v4 = v1 || foo::no; // OK
+/// bar v5 = bar::yes && bar(true); // OK
+/// bool v6 = v5; // ERROR, no implicit cast
+/// \endcode
+///
+/// \tparam Tag type used as a tag
+template<typename Tag>
+class bool_class {
+ bool _value;
+public:
+ static const bool_class yes;
+ static const bool_class no;
+
+ /// Constructs a bool_class object initialised to \c false.
+ constexpr bool_class() noexcept : _value(false) { }
+
+ /// Constructs a bool_class object initialised to \c v.
+ constexpr explicit bool_class(bool v) noexcept : _value(v) { }
+
+ /// Casts a bool_class object to an untyped \c bool.
+ explicit operator bool() const noexcept { return _value; }
+
+ /// Logical OR.
+ friend bool_class operator||(bool_class x, bool_class y) noexcept {
+ return bool_class(x._value || y._value);
+ }
+
+ /// Logical AND.
+ friend bool_class operator&&(bool_class x, bool_class y) noexcept {
+ return bool_class(x._value && y._value);
+ }
+
+ /// Logical NOT.
+ friend bool_class operator!(bool_class x) noexcept {
+ return bool_class(!x._value);
+ }
+
+ /// Equal-to operator.
+ friend bool operator==(bool_class x, bool_class y) noexcept {
+ return x._value == y._value;
+ }
+
+ /// Not-equal-to operator.
+ friend bool operator!=(bool_class x, bool_class y) noexcept {
+ return x._value != y._value;
+ }
+
+ /// Prints bool_class value to an output stream.
+ friend std::ostream& operator<<(std::ostream& os, bool_class v) {
+ return os << (v._value ? "true" : "false");
+ }
+};
+
+template<typename Tag>
+const bool_class<Tag> bool_class<Tag>::yes { true };
+template<typename Tag>
+const bool_class<Tag> bool_class<Tag>::no { false };
+
+/// @}
+
+}
diff --git a/src/seastar/include/seastar/util/concepts.hh b/src/seastar/include/seastar/util/concepts.hh
new file mode 100644
index 000000000..26620fe50
--- /dev/null
+++ b/src/seastar/include/seastar/util/concepts.hh
@@ -0,0 +1,34 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2020 ScyllaDB
+ */
+#pragma once
+
+#if defined(__cpp_concepts) && __cpp_concepts >= 201907
+
+#define SEASTAR_CONCEPT(x...) x
+#define SEASTAR_NO_CONCEPT(x...)
+
+#else
+
+#define SEASTAR_CONCEPT(x...)
+#define SEASTAR_NO_CONCEPT(x...) x
+
+#endif
+
diff --git a/src/seastar/include/seastar/util/conversions.hh b/src/seastar/include/seastar/util/conversions.hh
new file mode 100644
index 000000000..4e005badd
--- /dev/null
+++ b/src/seastar/include/seastar/util/conversions.hh
@@ -0,0 +1,48 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <cstdlib>
+#include <string>
+#include <vector>
+
+namespace seastar {
+
+// Convert a string to a memory size, allowing binary SI
+// suffixes (intentionally, even though SI suffixes are
+// decimal, to follow existing usage).
+//
+// "5" -> 5
+// "4k" -> (4 << 10)
+// "8M" -> (8 << 20)
+// "7G" -> (7 << 30)
+// "1T" -> (1 << 40)
+// anything else: exception
+size_t parse_memory_size(std::string s);
+
+static inline std::vector<char> string2vector(std::string str) {
+ auto v = std::vector<char>(str.begin(), str.end());
+ v.push_back('\0');
+ return v;
+}
+
+}
diff --git a/src/seastar/include/seastar/util/critical_alloc_section.hh b/src/seastar/include/seastar/util/critical_alloc_section.hh
new file mode 100644
index 000000000..0657600a7
--- /dev/null
+++ b/src/seastar/include/seastar/util/critical_alloc_section.hh
@@ -0,0 +1,70 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2020 ScyllaDB
+ */
+
+#pragma once
+
+namespace seastar {
+namespace memory {
+
+/// \cond internal
+namespace internal {
+
+// This variable is used in hot paths, so we want to avoid the compiler
+// generating TLS init guards for it. In C++20 we have constinit to tell the
+// compiler that it can be initialized at compile time (although gcc still doesn't
+// completely drop the init guards - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97848).
+// Before C++20 we use `__thread`, which results in no TLS init guards being generated.
+#ifdef __cpp_constinit
+extern thread_local constinit int critical_alloc_section;
+#else
+extern __thread int critical_alloc_section;
+#endif
+
+} // namespace internal
+/// \endcond
+
+/// \brief Marks scopes that contain critical allocations.
+///
+/// Critical allocations are those, whose failure the application cannot
+/// tolerate. In a perfect world, there should be no such allocation, but we
+/// don't live in a perfect world.
+/// This information is used by other parts of the memory subsystem:
+/// * \ref alloc_failure_injector will not inject errors into these scopes.
+/// * A memory diagnostics report will be dumped if an allocation fails in these
+/// scopes when the memory diagnostics subsystem is configured to dump reports
+/// for \ref alloc_failure_kind \ref alloc_failure_kind::critical or above.
+/// See \ref set_dump_memory_diagnostics_on_alloc_failure_kind().
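+///
+/// A sketch of guarding a must-succeed allocation (the function and its
+/// arguments are illustrative):
+/// \code
+/// void append_log(std::vector<int>& log, int entry) {
+///     scoped_critical_alloc_section _;
+///     log.push_back(entry); // treated as a critical allocation
+/// }
+/// \endcode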
+class scoped_critical_alloc_section {
+public:
+ scoped_critical_alloc_section() { ++internal::critical_alloc_section; }
+ ~scoped_critical_alloc_section() { --internal::critical_alloc_section; }
+};
+
+/// \brief Is the current context inside a critical alloc section?
+///
+/// Will return true if there is at least one \ref scoped_critical_alloc_section
+/// alive in the current scope or the scope of any of the caller functions.
+inline bool is_critical_alloc_section() {
+ return bool(internal::critical_alloc_section);
+}
+
+} // namespace memory
+} // namespace seastar
diff --git a/src/seastar/include/seastar/util/defer.hh b/src/seastar/include/seastar/util/defer.hh
new file mode 100644
index 000000000..5c359d4b5
--- /dev/null
+++ b/src/seastar/include/seastar/util/defer.hh
@@ -0,0 +1,58 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <type_traits>
+#include <utility>
+
+namespace seastar {
+
+template <typename Func>
+class deferred_action {
+ Func _func;
+ bool _cancelled = false;
+public:
+ static_assert(std::is_nothrow_move_constructible<Func>::value, "Func(Func&&) must be noexcept");
+ deferred_action(Func&& func) noexcept : _func(std::move(func)) {}
+ deferred_action(deferred_action&& o) noexcept : _func(std::move(o._func)), _cancelled(o._cancelled) {
+ o._cancelled = true;
+ }
+ deferred_action& operator=(deferred_action&& o) noexcept {
+ if (this != &o) {
+ this->~deferred_action();
+ new (this) deferred_action(std::move(o));
+ }
+ return *this;
+ }
+ deferred_action(const deferred_action&) = delete;
+ ~deferred_action() { if (!_cancelled) { _func(); }; }
+ void cancel() { _cancelled = true; }
+};
+
+template <typename Func>
+inline
+deferred_action<Func>
+defer(Func&& func) {
+ return deferred_action<Func>(std::forward<Func>(func));
+}
+
+}
diff --git a/src/seastar/include/seastar/util/eclipse.hh b/src/seastar/include/seastar/util/eclipse.hh
new file mode 100644
index 000000000..25ed6158c
--- /dev/null
+++ b/src/seastar/include/seastar/util/eclipse.hh
@@ -0,0 +1,36 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ */
+
+#pragma once
+
+// Workarounds for deficiencies in Eclipse's C++ parser
+//
+// Tell Eclipse that IN_ECLIPSE is defined so it will ignore all the unknown syntax.
+
+#ifndef IN_ECLIPSE
+
+#else
+
+// Eclipse doesn't grok alignof
+#define alignof sizeof
+
+#endif
diff --git a/src/seastar/include/seastar/util/exceptions.hh b/src/seastar/include/seastar/util/exceptions.hh
new file mode 100644
index 000000000..9b58e987e
--- /dev/null
+++ b/src/seastar/include/seastar/util/exceptions.hh
@@ -0,0 +1,45 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright 2020 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/util/std-compat.hh>
+
+namespace seastar {
+
+/// \brief make a filesystem_error for system calls with a single file operand.
+///
+/// \param what - describes the action that failed
+/// \param path - path of the file that hit the error
+/// \param error - the system error number (see errno(3))
+///
+std::filesystem::filesystem_error make_filesystem_error(const std::string& what, std::filesystem::path path, int error);
+
+/// \brief make a filesystem_error for system calls with two file operands.
+///
+/// \param what - describes the action that failed
+/// \param path1, path2 - paths of the files that hit the error
+/// \param error - the system error number (see errno(3))
+///
+std::filesystem::filesystem_error make_filesystem_error(const std::string& what, std::filesystem::path path1, std::filesystem::path path2, int error);
+
+} // namespace seastar
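A brief sketch of how these helpers might be used to surface a failed syscall as an exceptional future. The paths, the errno value and the use of make_exception_future (from seastar/core/future.hh) are illustrative assumptions:

#include <seastar/core/future.hh>
#include <seastar/util/exceptions.hh>
#include <cerrno>

seastar::future<> report_rename_failure() {
    auto ex = seastar::make_filesystem_error("rename failed",
            std::filesystem::path("/tmp/a"), std::filesystem::path("/tmp/b"), EEXIST);
    return seastar::make_exception_future<>(std::move(ex));
}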
diff --git a/src/seastar/include/seastar/util/file.hh b/src/seastar/include/seastar/util/file.hh
new file mode 100644
index 000000000..9383e8fa6
--- /dev/null
+++ b/src/seastar/include/seastar/util/file.hh
@@ -0,0 +1,45 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright 2020 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/util/std-compat.hh>
+
+namespace seastar {
+
+/// Recursively removes a directory and all of its contents.
+///
+/// \param path path of the directory to recursively remove
+///
+/// \note
+/// Unlike `rm -rf`, the given path has to be a directory and may not refer to a regular file.
+///
+/// The function flushes the parent directory of the removed path, guaranteeing that
+/// the removal is stable on disk.
+///
+/// The function bails out on the first error. In that case, some files and/or sub-directories
+/// (and their contents) may be left behind at the level at which the error was detected.
+///
+future<> recursive_remove_directory(std::filesystem::path path) noexcept;
+
+} // namespace seastar
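A minimal sketch, assuming it runs inside a seastar application; the directory path is illustrative:

#include <seastar/util/file.hh>

seastar::future<> cleanup_scratch_dir() {
    // Removes /var/lib/myapp/scratch and everything below it.
    return seastar::recursive_remove_directory(std::filesystem::path("/var/lib/myapp/scratch"));
}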
diff --git a/src/seastar/include/seastar/util/function_input_iterator.hh b/src/seastar/include/seastar/util/function_input_iterator.hh
new file mode 100644
index 000000000..1155db505
--- /dev/null
+++ b/src/seastar/include/seastar/util/function_input_iterator.hh
@@ -0,0 +1,72 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+namespace seastar {
+
+template <typename Function, typename State>
+struct function_input_iterator {
+ Function _func;
+ State _state;
+public:
+ function_input_iterator(Function func, State state)
+ : _func(func), _state(state) {
+ }
+ function_input_iterator(const function_input_iterator&) = default;
+ function_input_iterator(function_input_iterator&&) = default;
+ function_input_iterator& operator=(const function_input_iterator&) = default;
+ function_input_iterator& operator=(function_input_iterator&&) = default;
+ auto operator*() const {
+ return _func();
+ }
+ function_input_iterator& operator++() {
+ ++_state;
+ return *this;
+ }
+ function_input_iterator operator++(int) {
+ function_input_iterator ret{*this};
+ ++_state;
+ return ret;
+ }
+ bool operator==(const function_input_iterator& x) const {
+ return _state == x._state;
+ }
+ bool operator!=(const function_input_iterator& x) const {
+ return !operator==(x);
+ }
+};
+
+template <typename Function, typename State>
+inline
+function_input_iterator<Function, State>
+make_function_input_iterator(Function func, State state) {
+ return function_input_iterator<Function, State>(func, state);
+}
+
+template <typename Function, typename State>
+inline
+function_input_iterator<Function, State>
+make_function_input_iterator(Function&& func) {
+ return function_input_iterator<Function, State>(func, State{});
+}
+
+}
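A small usage sketch: the iterator pair below yields five values, one generator call per dereference. Since operator*() is const, the generator must be callable as const; here it mutates state through a captured reference instead. The generator and output are illustrative:

#include <seastar/util/function_input_iterator.hh>
#include <iostream>

void print_first_five() {
    int counter = 0;
    auto gen = [&counter] { return counter++; };   // const-callable; state lives outside the iterator copies
    auto first = seastar::make_function_input_iterator(gen, 0u);
    auto last = seastar::make_function_input_iterator(gen, 5u);
    for (auto it = first; it != last; ++it) {
        std::cout << *it << ' ';                   // prints: 0 1 2 3 4
    }
}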
diff --git a/src/seastar/include/seastar/util/gcc6-concepts.hh b/src/seastar/include/seastar/util/gcc6-concepts.hh
new file mode 100644
index 000000000..1f4ac8565
--- /dev/null
+++ b/src/seastar/include/seastar/util/gcc6-concepts.hh
@@ -0,0 +1,25 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 ScyllaDB
+ */
+#pragma once
+
+#define GCC6_CONCEPT(x...)
+#define GCC6_NO_CONCEPT(x...) x
+
diff --git a/src/seastar/include/seastar/util/indirect.hh b/src/seastar/include/seastar/util/indirect.hh
new file mode 100644
index 000000000..e544a5f1a
--- /dev/null
+++ b/src/seastar/include/seastar/util/indirect.hh
@@ -0,0 +1,74 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 ScyllaDB
+ */
+
+#pragma once
+
+#include <memory>
+
+namespace seastar {
+
+// This header defines functors for comparing and hashing pointers by pointed-to values instead of pointer addresses.
+//
+// Examples:
+//
+// std::multiset<shared_ptr<sstring>, indirect_less<shared_ptr<sstring>>> _multiset;
+//
+// std::unordered_map<shared_ptr<sstring>, bool,
+// indirect_hash<shared_ptr<sstring>>, indirect_equal_to<shared_ptr<sstring>>> _unordered_map;
+//
+
+template<typename Pointer, typename Equal = std::equal_to<typename std::pointer_traits<Pointer>::element_type>>
+struct indirect_equal_to {
+ Equal _eq;
+ indirect_equal_to(Equal eq = Equal()) : _eq(std::move(eq)) {}
+ bool operator()(const Pointer& i1, const Pointer& i2) const {
+ if (bool(i1) ^ bool(i2)) {
+ return false;
+ }
+ return !i1 || _eq(*i1, *i2);
+ }
+};
+
+template<typename Pointer, typename Less = std::less<typename std::pointer_traits<Pointer>::element_type>>
+struct indirect_less {
+ Less _cmp;
+ indirect_less(Less cmp = Less()) : _cmp(std::move(cmp)) {}
+ bool operator()(const Pointer& i1, const Pointer& i2) const {
+ if (i1 && i2) {
+ return _cmp(*i1, *i2);
+ }
+ return !i1 && i2;
+ }
+};
+
+template<typename Pointer, typename Hash = std::hash<typename std::pointer_traits<Pointer>::element_type>>
+struct indirect_hash {
+ Hash _h;
+ indirect_hash(Hash h = Hash()) : _h(std::move(h)) {}
+ size_t operator()(const Pointer& p) const {
+ if (p) {
+ return _h(*p);
+ }
+ return 0;
+ }
+};
+
+}
diff --git a/src/seastar/include/seastar/util/is_smart_ptr.hh b/src/seastar/include/seastar/util/is_smart_ptr.hh
new file mode 100644
index 000000000..68fb9b961
--- /dev/null
+++ b/src/seastar/include/seastar/util/is_smart_ptr.hh
@@ -0,0 +1,34 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <memory> // for std::unique_ptr
+
+namespace seastar {
+
+template<typename T>
+struct is_smart_ptr : std::false_type {};
+
+template<typename T>
+struct is_smart_ptr<std::unique_ptr<T>> : std::true_type {};
+
+}
diff --git a/src/seastar/include/seastar/util/later.hh b/src/seastar/include/seastar/util/later.hh
new file mode 100644
index 000000000..cacce1440
--- /dev/null
+++ b/src/seastar/include/seastar/util/later.hh
@@ -0,0 +1,44 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright (C) 2020 ScyllaDB.
+ */
+
+
+#pragma once
+
+#include <seastar/core/future.hh>
+
+namespace seastar {
+
+/// \addtogroup future-util
+/// @{
+
+/// \brief Returns a ready future.
+inline
+future<> now() {
+ return make_ready_future<>();
+}
+
+/// \brief Returns a future which is not ready but is scheduled to resolve soon.
+future<> later() noexcept;
+
+/// @}
+
+} // namespace seastar
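A brief sketch of using later() to yield to the reactor between two pieces of work; the helper functions are hypothetical:

#include <seastar/util/later.hh>

seastar::future<> process_in_two_steps() {
    do_first_half();                   // hypothetical synchronous helper
    return seastar::later().then([] {
        do_second_half();              // hypothetical synchronous helper
    });
}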
diff --git a/src/seastar/include/seastar/util/lazy.hh b/src/seastar/include/seastar/util/lazy.hh
new file mode 100644
index 000000000..4684efb32
--- /dev/null
+++ b/src/seastar/include/seastar/util/lazy.hh
@@ -0,0 +1,153 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2016 Cloudius Systems, Ltd.
+ */
+#pragma once
+
+#include <ostream>
+
+/// \addtogroup logging
+/// @{
+
+namespace seastar {
+
+/// \brief This class is a wrapper for a lazy evaluation of a value.
+///
+/// The value is evaluated by a functor that takes no parameters and is
+/// provided to the lazy_eval constructor.
+///
+/// An instance may be created only via the seastar::value_of helper function.
+///
+/// The evaluation is triggered by operator().
+template<typename Func>
+class lazy_eval {
+private:
+ Func _func;
+
+private:
+ lazy_eval(Func&& f) : _func(std::forward<Func>(f)) {}
+
+public:
+ /// \brief Evaluate a value.
+ ///
+ /// \return the evaluated value
+ auto operator()() {
+ return _func();
+ }
+
+ /// \brief Evaluate a value (const version).
+ ///
+ /// \return the evaluated value
+ auto operator()() const {
+ return _func();
+ }
+
+ template <typename F>
+ friend lazy_eval<F> value_of(F&& func);
+};
+
+
+/// Create a seastar::lazy_eval object that will use a given functor for
+/// evaluating a value when the evaluation is triggered.
+///
+/// The actual evaluation is triggered by applying a () operator on a
+/// returned object.
+///
+/// \param func a functor to evaluate the value
+///
+/// \return a lazy_eval object that may be used for evaluating a value
+template <typename Func>
+inline lazy_eval<Func> value_of(Func&& func) {
+ return lazy_eval<Func>(std::forward<Func>(func));
+}
+
+/// \brief This struct is a wrapper for lazy dereferencing a pointer.
+///
+/// In particular this is to be used in situations where the value of a
+/// pointer has to be converted to a string in a lazy manner. Since
+/// pointers can be null, adding a check at the point of calling, for example,
+/// a log function would introduce an unnecessary branch in code that may
+/// never run. With lazy_deref this check can be
+/// deferred to the point where the code is actually evaluated.
+template <typename T>
+struct lazy_deref_wrapper {
+ const T& p;
+
+ constexpr lazy_deref_wrapper(const T& p) : p(p) {
+ }
+};
+
+/// Create a seastar::lazy_deref_wrapper object.
+///
+/// The actual dereferencing will happen when the object is inserted
+/// into a stream. The pointer is not copied, only a reference is saved
+/// to it. Smart pointers are supported as well.
+///
+/// \param p a raw pointer or a smart pointer
+///
+/// \return a lazy_deref_wrapper object
+template <typename T>
+lazy_deref_wrapper<T>
+lazy_deref(const T& p) {
+ return lazy_deref_wrapper<T>(p);
+}
+
+}
+
+namespace std {
+/// Output operator for a seastar::lazy_eval<Func>
+/// This would allow printing a seastar::lazy_eval<Func> as if it's a regular
+/// value.
+///
+/// For example:
+///
+/// `logger.debug("heavy eval result:{}", seastar::value_of([&] { return <heavy evaluation>; }));`
+///
+/// (If a logging level is lower than "debug" the evaluation will not take place.)
+///
+/// \tparam Func a functor type
+/// \param os ostream to print to
+/// \param lf a reference to a lazy_eval<Func> to be printed
+///
+/// \return os
+template <typename Func>
+ostream& operator<<(ostream& os, const seastar::lazy_eval<Func>& lf) {
+ return os << lf();
+}
+
+template <typename Func>
+ostream& operator<<(ostream& os, seastar::lazy_eval<Func>& lf) {
+ return os << lf();
+}
+
+template <typename Func>
+ostream& operator<<(ostream& os, seastar::lazy_eval<Func>&& lf) {
+ return os << lf();
+}
+
+template <typename T>
+ostream& operator<<(ostream& os, seastar::lazy_deref_wrapper<T> ld) {
+ if (ld.p) {
+ return os << *ld.p;
+ }
+
+ return os << "null";
+}
+}
+/// @}
diff --git a/src/seastar/include/seastar/util/log-cli.hh b/src/seastar/include/seastar/util/log-cli.hh
new file mode 100644
index 000000000..83536698b
--- /dev/null
+++ b/src/seastar/include/seastar/util/log-cli.hh
@@ -0,0 +1,80 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/util/log.hh>
+#include <seastar/util/program-options.hh>
+
+#include <seastar/core/sstring.hh>
+
+#include <boost/program_options.hpp>
+
+#include <algorithm>
+#include <unordered_map>
+
+/// \addtogroup logging
+/// @{
+namespace seastar {
+
+///
+/// \brief Configure application logging at run-time with program options.
+///
+namespace log_cli {
+
+///
+/// \brief Options for controlling logging at run-time.
+///
+boost::program_options::options_description get_options_description();
+
+///
+/// \brief Print a human-friendly list of the available loggers.
+///
+void print_available_loggers(std::ostream& os);
+
+///
+/// \brief Parse a log-level ({error, warn, info, debug, trace}) string, throwing \c std::runtime_error for an invalid
+/// level.
+///
+log_level parse_log_level(const sstring&);
+
+//
+// \brief Parse associations from loggers to log-levels and write the resulting pairs to the output iterator.
+//
+// \throws \c std::runtime_error for an invalid log-level.
+//
+template <class OutputIter>
+void parse_logger_levels(const program_options::string_map& levels, OutputIter out) {
+ std::for_each(levels.begin(), levels.end(), [&out](auto&& pair) {
+ *out++ = std::make_pair(pair.first, parse_log_level(pair.second));
+ });
+}
+
+///
+/// \brief Extract CLI options into a logging configuration.
+///
+logging_settings extract_settings(const boost::program_options::variables_map&);
+
+}
+
+}
+
+/// @}
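A small sketch of feeding parsed associations through parse_logger_levels(); the map contents are illustrative:

#include <seastar/util/log-cli.hh>
#include <iterator>
#include <utility>
#include <vector>

void collect_levels() {
    seastar::program_options::string_map levels;   // e.g. populated from "--logger-log-level"
    levels["my_module"] = "debug";
    std::vector<std::pair<seastar::sstring, seastar::log_level>> parsed;
    seastar::log_cli::parse_logger_levels(levels, std::back_inserter(parsed));
}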
diff --git a/src/seastar/include/seastar/util/log-impl.hh b/src/seastar/include/seastar/util/log-impl.hh
new file mode 100644
index 000000000..d7875a23b
--- /dev/null
+++ b/src/seastar/include/seastar/util/log-impl.hh
@@ -0,0 +1,117 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2020 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+#include <iterator>
+
+/// \addtogroup logging
+/// @{
+
+namespace seastar {
+
+/// \cond internal
+namespace internal {
+
+/// A buffer to format log messages into.
+///
+/// It is designed so that the entire message can be formatted into it without any
+/// intermediary buffers. To minimize reallocations it can start out with an
+/// external buffer; once that is full it switches to buffers it allocates
+/// itself.
+/// To accommodate the most widely used way of formatting messages -- fmt --,
+/// it provides an output iterator interface for writing into it.
+class log_buf {
+ char* _begin;
+ char* _end;
+ char* _current;
+ bool _own_buf;
+
+private:
+ void free_buffer() noexcept;
+ void realloc_buffer();
+
+public:
+ class inserter_iterator {
+ public:
+ using iterator_category = std::output_iterator_tag;
+ using difference_type = std::ptrdiff_t;
+ using value_type = char;
+ using pointer = char*;
+ using reference = char&;
+
+ private:
+ log_buf* _buf;
+ char* _current;
+
+ public:
+ explicit inserter_iterator(log_buf& buf) noexcept : _buf(&buf), _current(_buf->_current) { }
+ inserter_iterator(const inserter_iterator& o) noexcept : _buf(o._buf), _current(o._current) { }
+
+ reference operator*() {
+ if (__builtin_expect(_current == _buf->_end, false)) {
+ _buf->realloc_buffer();
+ _current = _buf->_current;
+ }
+ return *_current;
+ }
+ inserter_iterator& operator++() noexcept {
+ if (__builtin_expect(_current == _buf->_current, true)) {
+ ++_buf->_current;
+ }
+ ++_current;
+ return *this;
+ }
+ inserter_iterator operator++(int) noexcept {
+ inserter_iterator o(*this);
+ ++(*this);
+ return o;
+ }
+ };
+
+ /// Default ctor.
+ ///
+ /// Allocates an internal buffer of 512 bytes.
+ log_buf();
+ /// External buffer ctor.
+ ///
+ /// Use the external buffer until it is full, then switch to an internally
+ /// allocated buffer. log_buf doesn't take ownership of the external buffer.
+ log_buf(char* external_buf, size_t size) noexcept;
+ ~log_buf();
+ /// Create an output iterator which allows writing into the buffer.
+ inserter_iterator back_insert_begin() noexcept { return inserter_iterator(*this); }
+ /// The amount of data written so far.
+ size_t size() const noexcept { return _current - _begin; }
+ /// The size of the buffer.
+ size_t capacity() const noexcept { return _end - _begin; }
+ /// Read only pointer to the buffer.
+ /// Note that the buffer is not guaranteed to be null terminated. The writer
+ /// has to ensure that, should it wish to.
+ const char* data() const noexcept { return _begin; }
+ /// A view of the buffer content.
+ std::string_view view() const noexcept { return std::string_view(_begin, size()); }
+};
+
+} // namespace internal
+/// \endcond
+
+} // namespace seastar
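A brief sketch of the buffer in use. This is an internal API shown for illustration only; the message content is arbitrary:

#include <seastar/util/log-impl.hh>
#include <fmt/format.h>
#include <string_view>

void format_message() {
    char stack_buf[128];
    seastar::internal::log_buf buf(stack_buf, sizeof(stack_buf));
    auto it = buf.back_insert_begin();
    it = fmt::format_to(it, "processed {} items in {} ms", 42, 7);
    std::string_view msg = buf.view();   // not null-terminated; valid while buf is alive
    (void)msg;
}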
diff --git a/src/seastar/include/seastar/util/log.hh b/src/seastar/include/seastar/util/log.hh
new file mode 100644
index 000000000..d1cb6eb32
--- /dev/null
+++ b/src/seastar/include/seastar/util/log.hh
@@ -0,0 +1,464 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2015 Cloudius Systems, Ltd.
+ */
+#pragma once
+
+#include <seastar/core/sstring.hh>
+#include <seastar/util/concepts.hh>
+#include <seastar/util/log-impl.hh>
+#include <seastar/core/lowres_clock.hh>
+#include <unordered_map>
+#include <exception>
+#include <iosfwd>
+#include <atomic>
+#include <mutex>
+#include <boost/lexical_cast.hpp>
+#include <fmt/format.h>
+
+
+/// \addtogroup logging
+/// @{
+
+namespace seastar {
+
+/// \brief Log level used with \ref logger.
+/// Used with the logger::log methods.
+/// Levels are in increasing order: if you want to see debug(3) logs you
+/// will also see error(0), warn(1) and info(2) messages.
+///
+enum class log_level {
+ error,
+ warn,
+ info,
+ debug,
+ trace,
+};
+
+std::ostream& operator<<(std::ostream& out, log_level level);
+std::istream& operator>>(std::istream& in, log_level& level);
+}
+
+// Boost doesn't auto-deduce the existence of the streaming operators for some reason
+
+namespace boost {
+template<>
+seastar::log_level lexical_cast(const std::string& source);
+
+}
+
+namespace seastar {
+
+class logger;
+class logger_registry;
+
+/// \brief Logger class for ostream or syslog.
+///
+/// Java-style API for logging.
+/// \code {.cpp}
+/// static seastar::logger logger("lsa-api");
+/// logger.info("Triggering compaction");
+/// \endcode
+/// The output format is: (depending on level)
+/// DEBUG %Y-%m-%d %T,%03d [shard 0] - "your msg" \n
+///
+/// It is possible to rate-limit log messages, see \ref logger::rate_limit.
+class logger {
+ sstring _name;
+ std::atomic<log_level> _level = { log_level::info };
+ static std::ostream* _out;
+ static std::atomic<bool> _ostream;
+ static std::atomic<bool> _syslog;
+
+public:
+ class log_writer {
+ public:
+ virtual ~log_writer() = default;
+ virtual internal::log_buf::inserter_iterator operator()(internal::log_buf::inserter_iterator) = 0;
+ };
+ template <typename Func>
+ SEASTAR_CONCEPT(requires requires (Func fn, internal::log_buf::inserter_iterator it) {
+ it = fn(it);
+ })
+ class lambda_log_writer : public log_writer {
+ Func _func;
+ public:
+ lambda_log_writer(Func&& func) : _func(std::forward<Func>(func)) { }
+ virtual ~lambda_log_writer() override = default;
+ virtual internal::log_buf::inserter_iterator operator()(internal::log_buf::inserter_iterator it) override { return _func(it); }
+ };
+
+private:
+
+ // We can't use an std::function<> as it potentially allocates.
+ void do_log(log_level level, log_writer& writer);
+ void failed_to_log(std::exception_ptr ex) noexcept;
+public:
+ /// Apply a rate limit to log message(s)
+ ///
+ /// Pass this to \ref logger::log() to apply a rate limit to the message.
+ /// The rate limit is applied to all \ref logger::log() calls this rate
+ /// limit is passed to. Example:
+ ///
+ /// void handle_request() {
+ /// static thread_local logger::rate_limit my_rl(std::chrono::seconds(10));
+ /// // ...
+ /// my_log.log(log_level::info, my_rl, "a message we don't want to log on every request, only at most once each 10 seconds");
+ /// // ...
+ /// }
+ ///
+ /// The rate limit ensures that at most one message per interval will be
+ /// logged. If there were messages dropped due to rate-limiting the
+ /// following snippet will be prepended to the first non-dropped log
+ /// messages:
+ ///
+ /// (rate limiting dropped $N similar messages)
+ ///
+ /// Where $N is the number of messages dropped.
+ class rate_limit {
+ friend class logger;
+
+ using clock = lowres_clock;
+
+ private:
+ clock::duration _interval;
+ clock::time_point _next;
+ uint64_t _dropped_messages = 0;
+
+ private:
+ bool check();
+ bool has_dropped_messages() const { return bool(_dropped_messages); }
+ uint64_t get_and_reset_dropped_messages() {
+ return std::exchange(_dropped_messages, 0);
+ }
+
+ public:
+ explicit rate_limit(std::chrono::milliseconds interval);
+ };
+
+public:
+ explicit logger(sstring name);
+ logger(logger&& x);
+ ~logger();
+
+ bool is_shard_zero() noexcept;
+
+ /// Test if desired log level is enabled
+ ///
+ /// \param level - enum level value (info|error...)
+ /// \return true if the log level has been enabled.
+ bool is_enabled(log_level level) const noexcept {
+ return __builtin_expect(level <= _level.load(std::memory_order_relaxed), false);
+ }
+
+ /// logs to desired level if enabled, otherwise we ignore the log line
+ ///
+ /// \param fmt - {fmt} style format
+ /// \param args - args to print string
+ ///
+ template <typename... Args>
+ void log(log_level level, const char* fmt, Args&&... args) noexcept {
+ if (is_enabled(level)) {
+ try {
+ lambda_log_writer writer([&] (internal::log_buf::inserter_iterator it) {
+ return fmt::format_to(it, fmt, std::forward<Args>(args)...);
+ });
+ do_log(level, writer);
+ } catch (...) {
+ failed_to_log(std::current_exception());
+ }
+ }
+ }
+
+ /// logs with a rate limit to desired level if enabled, otherwise we ignore the log line
+ ///
+ /// If there were messages dropped due to rate-limiting the following snippet
+ /// will be prepended to the first non-dropped log messages:
+ ///
+ /// (rate limiting dropped $N similar messages)
+ ///
+ /// Where $N is the number of messages dropped.
+ ///
+ /// \param rl - the \ref rate_limit to apply to this log
+ /// \param fmt - {fmt} style format
+ /// \param args - args to print string
+ ///
+ template <typename... Args>
+ void log(log_level level, rate_limit& rl, const char* fmt, Args&&... args) noexcept {
+ if (is_enabled(level) && rl.check()) {
+ try {
+ lambda_log_writer writer([&] (internal::log_buf::inserter_iterator it) {
+ if (rl.has_dropped_messages()) {
+ it = fmt::format_to(it, "(rate limiting dropped {} similar messages) ", rl.get_and_reset_dropped_messages());
+ }
+ return fmt::format_to(it, fmt, std::forward<Args>(args)...);
+ });
+ do_log(level, writer);
+ } catch (...) {
+ failed_to_log(std::current_exception());
+ }
+ }
+ }
+
+ /// \cond internal
+ /// logs to desired level if enabled, otherwise we ignore the log line
+ ///
+ /// \param writer a function which writes directly to the underlying log buffer
+ ///
+ /// This is a low level method for use cases where it is very important to
+ /// avoid any allocations. The \arg writer will be passed an
+ /// internal::log_buf::inserter_iterator that allows it to write into the log
+ /// buffer directly, avoiding the use of any intermediary buffers.
+ void log(log_level level, log_writer& writer) noexcept {
+ if (is_enabled(level)) {
+ try {
+ do_log(level, writer);
+ } catch (...) {
+ failed_to_log(std::current_exception());
+ }
+ }
+ }
+ /// logs to desired level if enabled, otherwise we ignore the log line
+ ///
+ /// \param writer a function which writes directly to the underlying log buffer
+ ///
+ /// This is a low level method for use cases where it is very important to
+ /// avoid any allocations. The \arg writer will be passed an
+ /// internal::log_buf::inserter_iterator that allows it to write into the log
+ /// buffer directly, avoiding the use of any intermediary buffers.
+ /// This is rate-limited version, see \ref rate_limit.
+ void log(log_level level, rate_limit& rl, log_writer& writer) noexcept {
+ if (is_enabled(level) && rl.check()) {
+ try {
+ lambda_log_writer writer_wrapper([&] (internal::log_buf::inserter_iterator it) {
+ if (rl.has_dropped_messages()) {
+ it = fmt::format_to(it, "(rate limiting dropped {} similar messages) ", rl.get_and_reset_dropped_messages());
+ }
+ return writer(it);
+ });
+ do_log(level, writer_wrapper);
+ } catch (...) {
+ failed_to_log(std::current_exception());
+ }
+ }
+ }
+ /// \endcond
+
+ /// Log with error tag:
+ /// ERROR %Y-%m-%d %T,%03d [shard 0] - "your msg" \n
+ ///
+ /// \param fmt - {fmt} style format
+ /// \param args - args to print string
+ ///
+ template <typename... Args>
+ void error(const char* fmt, Args&&... args) noexcept {
+ log(log_level::error, fmt, std::forward<Args>(args)...);
+ }
+ /// Log with warning tag:
+ /// WARN %Y-%m-%d %T,%03d [shard 0] - "your msg" \n
+ ///
+ /// \param fmt - {fmt} style format
+ /// \param args - args to print string
+ ///
+ template <typename... Args>
+ void warn(const char* fmt, Args&&... args) noexcept {
+ log(log_level::warn, fmt, std::forward<Args>(args)...);
+ }
+ /// Log with info tag:
+ /// INFO %Y-%m-%d %T,%03d [shard 0] - "your msg" \n
+ ///
+ /// \param fmt - {fmt} style format
+ /// \param args - args to print string
+ ///
+ template <typename... Args>
+ void info(const char* fmt, Args&&... args) noexcept {
+ log(log_level::info, fmt, std::forward<Args>(args)...);
+ }
+ /// Log with info tag on shard zero only:
+ /// INFO %Y-%m-%d %T,%03d [shard 0] - "your msg" \n
+ ///
+ /// \param fmt - {fmt} style format
+ /// \param args - args to print string
+ ///
+ template <typename... Args>
+ void info0(const char* fmt, Args&&... args) noexcept {
+ if (is_shard_zero()) {
+ log(log_level::info, fmt, std::forward<Args>(args)...);
+ }
+ }
+ /// Log with debug tag:
+ /// DEBUG %Y-%m-%d %T,%03d [shard 0] - "your msg" \n
+ ///
+ /// \param fmt - {fmt} style format
+ /// \param args - args to print string
+ ///
+ template <typename... Args>
+ void debug(const char* fmt, Args&&... args) noexcept {
+ log(log_level::debug, fmt, std::forward<Args>(args)...);
+ }
+ /// Log with trace tag:
+ /// TRACE %Y-%m-%d %T,%03d [shard 0] - "your msg" \n
+ ///
+ /// \param fmt - {fmt} style format
+ /// \param args - args to print string
+ ///
+ template <typename... Args>
+ void trace(const char* fmt, Args&&... args) noexcept {
+ log(log_level::trace, fmt, std::forward<Args>(args)...);
+ }
+
+ /// \return name of the logger. Usually one logger per module
+ ///
+ const sstring& name() const noexcept {
+ return _name;
+ }
+
+ /// \return current log level for this logger
+ ///
+ log_level level() const noexcept {
+ return _level.load(std::memory_order_relaxed);
+ }
+
+ /// \param level - set the log level
+ ///
+ void set_level(log_level level) noexcept {
+ _level.store(level, std::memory_order_relaxed);
+ }
+
+ /// Set output stream, default is std::cerr
+ static void set_ostream(std::ostream& out) noexcept;
+
+ /// Also output to ostream. default is true
+ static void set_ostream_enabled(bool enabled) noexcept;
+
+ /// Also output to stdout. default is true
+ [[deprecated("Use set_ostream_enabled instead")]]
+ static void set_stdout_enabled(bool enabled) noexcept;
+
+ /// Also output to syslog. default is false
+ ///
+ /// NOTE: syslog() can block, which will stall the reactor thread.
+ /// This should be rare (the pipe buffer has to fill up
+ /// before syslogd can clear it) but it can happen.
+ static void set_syslog_enabled(bool enabled) noexcept;
+};
+
+/// \brief Keeps a static registry of loggers.
+/// Since the typical use case is to do:
+/// \code {.cpp}
+/// static seastar::logger my_log("my_module");
+/// \endcode
+/// this class wraps the static map that holds
+/// pointers to all registered loggers.
+///
+class logger_registry {
+ mutable std::mutex _mutex;
+ std::unordered_map<sstring, logger*> _loggers;
+public:
+ /// loops through all registered loggers and sets the log level
+ /// Note: this method locks
+ ///
+ /// \param level - desired level: error,info,...
+ void set_all_loggers_level(log_level level);
+
+ /// Given a name for a logger returns the log_level enum
+ /// Note: this method locks
+ ///
+ /// \return log_level for the given logger name
+ log_level get_logger_level(sstring name) const;
+
+ /// Sets the log level for a given logger
+ /// Note: this method locks
+ ///
+ /// \param name - name of logger
+ /// \param level - desired level of logging
+ void set_logger_level(sstring name, log_level level);
+
+ /// Returns a list of registered loggers
+ /// Note: this method locks
+ ///
+ /// \return all registered loggers
+ std::vector<sstring> get_all_logger_names();
+
+ /// Registers a logger with the static map
+ /// Note: this method locks
+ ///
+ void register_logger(logger* l);
+ /// Unregisters a logger with the static map
+ /// Note: this method locks
+ ///
+ void unregister_logger(logger* l);
+ /// Swaps the logger given the from->name() in the static map
+ /// Note: this method locks
+ ///
+ void moved(logger* from, logger* to);
+};
+
+logger_registry& global_logger_registry();
+
+enum class logger_timestamp_style {
+ none,
+ boot,
+ real,
+};
+
+enum class logger_ostream_type {
+ none,
+ stdout,
+ stderr,
+};
+
+struct logging_settings final {
+ std::unordered_map<sstring, log_level> logger_levels;
+ log_level default_level;
+ bool stdout_enabled;
+ bool syslog_enabled;
+ logger_timestamp_style stdout_timestamp_style = logger_timestamp_style::real;
+ logger_ostream_type logger_ostream = logger_ostream_type::stderr;
+};
+
+/// Shortcut for configuring the logging system all at once.
+///
+void apply_logging_settings(const logging_settings&);
+
+/// \cond internal
+
+extern thread_local uint64_t logging_failures;
+
+sstring pretty_type_name(const std::type_info&);
+
+sstring level_name(log_level level);
+
+template <typename T>
+class logger_for : public logger {
+public:
+ logger_for() : logger(pretty_type_name(typeid(T))) {}
+};
+
+/// \endcond
+} // end seastar namespace
+
+// Pretty-printer for exceptions to be logged, e.g., std::current_exception().
+namespace std {
+std::ostream& operator<<(std::ostream&, const std::exception_ptr&);
+std::ostream& operator<<(std::ostream&, const std::exception&);
+std::ostream& operator<<(std::ostream&, const std::system_error&);
+}
+
+/// @}
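A short sketch of configuring the logging system in one shot with apply_logging_settings(). logging_settings is an aggregate, so the braced initializer below relies on the field order declared above; the logger name and levels are illustrative:

#include <seastar/util/log.hh>

void configure_logging() {
    seastar::logging_settings settings{
        {{"my_module", seastar::log_level::debug}},   // per-logger overrides
        seastar::log_level::info,                     // default level for all other loggers
        true,                                         // keep ostream output enabled
        false,                                        // no syslog
    };
    seastar::apply_logging_settings(settings);
}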
diff --git a/src/seastar/include/seastar/util/memory_diagnostics.hh b/src/seastar/include/seastar/util/memory_diagnostics.hh
new file mode 100644
index 000000000..48bce7329
--- /dev/null
+++ b/src/seastar/include/seastar/util/memory_diagnostics.hh
@@ -0,0 +1,85 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2020 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/core/sstring.hh>
+#include <seastar/util/noncopyable_function.hh>
+
+namespace seastar {
+namespace memory {
+
+/// \brief The kind of allocation failures to dump diagnostics report for.
+///
+/// Note that if the seastar_memory logger is set to level debug, there will
+/// be a report dumped for any allocation failure, regardless of this
+/// configuration.
+enum class alloc_failure_kind {
+ /// Dump diagnostic error report for none of the allocation failures.
+ none,
+ /// Dump diagnostic error report for critical allocation failures, see
+ /// \ref scoped_critical_alloc_section.
+ critical,
+ /// Dump diagnostic error report for all the allocation failures.
+ all,
+};
+
+/// \brief Configure when memory diagnostics are dumped.
+///
+/// See \ref alloc_failure_kind on available options.
+/// Applies configuration on all shards.
+void set_dump_memory_diagnostics_on_alloc_failure_kind(alloc_failure_kind);
+
+/// \brief Configure when memory diagnostics are dumped.
+///
+/// String version. See \ref alloc_failure_kind on available options.
+/// Applies configuration on all shards.
+void set_dump_memory_diagnostics_on_alloc_failure_kind(std::string_view);
+
+/// \brief A functor which writes its argument into the diagnostics report.
+using memory_diagnostics_writer = noncopyable_function<void(std::string_view)>;
+
+/// \brief Set a producer of additional diagnostic information.
+///
+/// This allows the application running on top of seastar to add its own part to
+/// the diagnostics dump. The application can supply higher level diagnostics
+/// information that might help explain how the memory was consumed.
+///
+/// The application specific part will be added just below the main stats
+/// (free/used/total memory).
+///
+/// \param producer - the functor to produce the additional diagnostics, specific
+/// to the application, to be added to the generated report. The producer is
+/// passed a writer functor, which it can use to add its parts to the report.
+///
+/// \note As the report is generated at a time when allocations are failing, the
+/// producer should try as hard as possible to not allocate while producing
+/// the output.
+void set_additional_diagnostics_producer(noncopyable_function<void(memory_diagnostics_writer)> producer);
+
+/// Manually generate a diagnostics report
+///
+/// Note that contrary to the automated report generation (triggered by
+/// allocation failure), this method does allocate memory and can fail in
+/// low-memory conditions.
+sstring generate_memory_diagnostics_report();
+
+} // namespace memory
+} // namespace seastar
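A brief sketch of registering an application-specific producer; the reported text is illustrative:

#include <seastar/util/memory_diagnostics.hh>

void register_cache_diagnostics() {
    seastar::memory::set_additional_diagnostics_producer(
        [] (seastar::memory::memory_diagnostics_writer write) {
            // The report is generated while allocations are failing, so avoid allocating here.
            write("row cache: 12345 entries\n");
        });
}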
diff --git a/src/seastar/include/seastar/util/noncopyable_function.hh b/src/seastar/include/seastar/util/noncopyable_function.hh
new file mode 100644
index 000000000..24d1e121d
--- /dev/null
+++ b/src/seastar/include/seastar/util/noncopyable_function.hh
@@ -0,0 +1,225 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2017 ScyllaDB Ltd.
+ */
+
+#pragma once
+
+#include <seastar/util/used_size.hh>
+
+#include <utility>
+#include <type_traits>
+#include <functional>
+
+namespace seastar {
+
+template <typename Signature>
+class noncopyable_function;
+
+namespace internal {
+
+class noncopyable_function_base {
+private:
+ noncopyable_function_base() = default;
+ static constexpr size_t nr_direct = 32;
+ union [[gnu::may_alias]] storage {
+ char direct[nr_direct];
+ void* indirect;
+ };
+ using move_type = void (*)(noncopyable_function_base* from, noncopyable_function_base* to);
+ using destroy_type = void (*)(noncopyable_function_base* func);
+
+ static void empty_move(noncopyable_function_base* from, noncopyable_function_base* to) {}
+ static void empty_destroy(noncopyable_function_base* func) {}
+
+ static void indirect_move(noncopyable_function_base* from, noncopyable_function_base* to) {
+ using void_ptr = void*;
+ new (&to->_storage.indirect) void_ptr(from->_storage.indirect);
+ }
+
+ template <size_t N>
+ static void trivial_direct_move(noncopyable_function_base* from, noncopyable_function_base* to) {
+ // Avoid including <algorithm> just for this
+ for (unsigned i = 0; i != N; ++i) {
+ to->_storage.direct[i] = from->_storage.direct[i];
+ }
+ }
+
+ static void trivial_direct_destroy(noncopyable_function_base* func) {
+ }
+
+private:
+ storage _storage;
+
+ template <typename Signature>
+ friend class seastar::noncopyable_function;
+};
+
+template<typename FirstArg = void, typename... RemainingArgs>
+struct is_nothrow_if_object {
+ static constexpr bool value = is_nothrow_if_object<FirstArg>::value && is_nothrow_if_object<RemainingArgs...>::value;
+};
+
+template<typename Arg>
+struct is_nothrow_if_object<Arg> {
+ static constexpr bool value = !std::is_object<Arg>::value || std::is_nothrow_move_constructible<Arg>::value;
+};
+
+template<>
+struct is_nothrow_if_object<> {
+ static constexpr bool value = true;
+};
+
+}
+
+/// A clone of \c std::function, but only invokes the move constructor
+/// of the contained function.
+template <typename Ret, typename... Args, bool Noexcept>
+class noncopyable_function<Ret (Args...) noexcept(Noexcept)> : private internal::noncopyable_function_base {
+ using call_type = Ret (*)(const noncopyable_function* func, Args...);
+ struct vtable {
+ const call_type call;
+ const move_type move;
+ const destroy_type destroy;
+ };
+private:
+ const vtable* _vtable;
+private:
+ static Ret empty_call(const noncopyable_function* func, Args... args) {
+ throw std::bad_function_call();
+ }
+
+ static constexpr vtable _s_empty_vtable = {empty_call, empty_move, empty_destroy};
+
+ template <typename Func>
+ struct direct_vtable_for {
+ static Func* access(noncopyable_function* func) { return reinterpret_cast<Func*>(func->_storage.direct); }
+ static const Func* access(const noncopyable_function* func) { return reinterpret_cast<const Func*>(func->_storage.direct); }
+ static Func* access(noncopyable_function_base* func) { return access(static_cast<noncopyable_function*>(func)); }
+ static Ret call(const noncopyable_function* func, Args... args) noexcept(Noexcept) {
+ return (*access(const_cast<noncopyable_function*>(func)))(std::forward<Args>(args)...);
+ }
+ static void move(noncopyable_function_base* from, noncopyable_function_base* to) {
+ new (access(to)) Func(std::move(*access(from)));
+ destroy(from);
+ }
+ static constexpr move_type select_move_thunk() {
+ bool can_trivially_move = std::is_trivially_move_constructible<Func>::value
+ && std::is_trivially_destructible<Func>::value;
+ return can_trivially_move ? trivial_direct_move<internal::used_size<Func>::value> : move;
+ }
+ static void destroy(noncopyable_function_base* func) {
+ access(func)->~Func();
+ }
+ static constexpr destroy_type select_destroy_thunk() {
+ return std::is_trivially_destructible<Func>::value ? trivial_direct_destroy : destroy;
+ }
+ static void initialize(Func&& from, noncopyable_function* to) {
+ new (access(to)) Func(std::move(from));
+ }
+ static constexpr vtable make_vtable() { return { call, select_move_thunk(), select_destroy_thunk() }; }
+ static const vtable s_vtable;
+ };
+ template <typename Func>
+ struct indirect_vtable_for {
+ static Func* access(noncopyable_function* func) { return reinterpret_cast<Func*>(func->_storage.indirect); }
+ static const Func* access(const noncopyable_function* func) { return reinterpret_cast<const Func*>(func->_storage.indirect); }
+ static Func* access(noncopyable_function_base* func) { return access(static_cast<noncopyable_function*>(func)); }
+ static Ret call(const noncopyable_function* func, Args... args) noexcept(Noexcept) {
+ return (*access(const_cast<noncopyable_function*>(func)))(std::forward<Args>(args)...);
+ }
+ static void destroy(noncopyable_function_base* func) {
+ delete access(func);
+ }
+ static void initialize(Func&& from, noncopyable_function* to) {
+ to->_storage.indirect = new Func(std::move(from));
+ }
+ static constexpr vtable make_vtable() { return { call, indirect_move, destroy }; }
+ static const vtable s_vtable;
+ };
+ template <typename Func, bool Direct = true>
+ struct select_vtable_for : direct_vtable_for<Func> {};
+ template <typename Func>
+ struct select_vtable_for<Func, false> : indirect_vtable_for<Func> {};
+ template <typename Func>
+ static constexpr bool is_direct() {
+ return sizeof(Func) <= nr_direct && alignof(Func) <= alignof(storage)
+ && std::is_nothrow_move_constructible<Func>::value;
+ }
+ template <typename Func>
+ struct vtable_for : select_vtable_for<Func, is_direct<Func>()> {};
+public:
+ noncopyable_function() noexcept : _vtable(&_s_empty_vtable) {}
+ template <typename Func>
+ noncopyable_function(Func func) {
+ static_assert(!Noexcept || noexcept(std::declval<Func>()(std::declval<Args>()...)));
+ vtable_for<Func>::initialize(std::move(func), this);
+ _vtable = &vtable_for<Func>::s_vtable;
+ }
+ template <typename Object, typename... AllButFirstArg>
+ noncopyable_function(Ret (Object::*member)(AllButFirstArg...) noexcept(Noexcept)) : noncopyable_function(std::mem_fn(member)) {}
+ template <typename Object, typename... AllButFirstArg>
+ noncopyable_function(Ret (Object::*member)(AllButFirstArg...) const noexcept(Noexcept)) : noncopyable_function(std::mem_fn(member)) {}
+
+ ~noncopyable_function() {
+ _vtable->destroy(this);
+ }
+
+ noncopyable_function(const noncopyable_function&) = delete;
+ noncopyable_function& operator=(const noncopyable_function&) = delete;
+
+ noncopyable_function(noncopyable_function&& x) noexcept : _vtable(std::exchange(x._vtable, &_s_empty_vtable)) {
+ _vtable->move(&x, this);
+ }
+
+ noncopyable_function& operator=(noncopyable_function&& x) noexcept {
+ if (this != &x) {
+ this->~noncopyable_function();
+ new (this) noncopyable_function(std::move(x));
+ }
+ return *this;
+ }
+
+ Ret operator()(Args... args) const noexcept(Noexcept) {
+ static_assert(!Noexcept || internal::is_nothrow_if_object<Args...>::value);
+ return _vtable->call(this, std::forward<Args>(args)...);
+ }
+
+ explicit operator bool() const {
+ return _vtable != &_s_empty_vtable;
+ }
+};
+
+
+template <typename Ret, typename... Args, bool Noexcept>
+constexpr typename noncopyable_function<Ret (Args...) noexcept(Noexcept)>::vtable noncopyable_function<Ret (Args...) noexcept(Noexcept)>::_s_empty_vtable;
+
+template <typename Ret, typename... Args, bool Noexcept>
+template <typename Func>
+const typename noncopyable_function<Ret (Args...) noexcept(Noexcept)>::vtable noncopyable_function<Ret (Args...) noexcept(Noexcept)>::direct_vtable_for<Func>::s_vtable
+ = noncopyable_function<Ret (Args...) noexcept(Noexcept)>::direct_vtable_for<Func>::make_vtable();
+
+
+template <typename Ret, typename... Args, bool Noexcept>
+template <typename Func>
+const typename noncopyable_function<Ret (Args...) noexcept(Noexcept)>::vtable noncopyable_function<Ret (Args...) noexcept(Noexcept)>::indirect_vtable_for<Func>::s_vtable
+ = noncopyable_function<Ret (Args...) noexcept(Noexcept)>::indirect_vtable_for<Func>::make_vtable();
+
+}
+
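A short sketch of the main use case: storing a move-only callable that std::function cannot hold; the values are arbitrary:

#include <seastar/util/noncopyable_function.hh>
#include <memory>
#include <utility>

void store_move_only_callable() {
    auto p = std::make_unique<int>(42);
    seastar::noncopyable_function<int ()> f = [p = std::move(p)] { return *p; };
    int v = f();               // v == 42
    auto g = std::move(f);     // moves the contained lambda; f is now empty
    (void)v; (void)g;
}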
diff --git a/src/seastar/include/seastar/util/optimized_optional.hh b/src/seastar/include/seastar/util/optimized_optional.hh
new file mode 100644
index 000000000..60febceae
--- /dev/null
+++ b/src/seastar/include/seastar/util/optimized_optional.hh
@@ -0,0 +1,101 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/util/concepts.hh>
+#include <seastar/util/std-compat.hh>
+
+#include <type_traits>
+#include <iostream>
+
+namespace seastar {
+
+SEASTAR_CONCEPT(
+
+template<typename T>
+concept OptimizableOptional =
+ std::is_default_constructible<T>::value
+ && std::is_nothrow_move_assignable<T>::value
+ && requires(const T& obj) {
+ { bool(obj) } noexcept;
+ };
+
+)
+
+/// \c optimized_optional<> is intended mainly for use with classes that store
+/// their data externally and expect the pointer to that data to always be non-null.
+/// In such a case there is no real need for another flag signifying whether
+/// the optional is engaged.
+template<typename T>
+class optimized_optional {
+ T _object;
+public:
+ optimized_optional() = default;
+ optimized_optional(std::nullopt_t) noexcept { }
+ optimized_optional(const T& obj) : _object(obj) { }
+ optimized_optional(T&& obj) noexcept : _object(std::move(obj)) { }
+ optimized_optional(std::optional<T>&& obj) noexcept {
+ if (obj) {
+ _object = std::move(*obj);
+ }
+ }
+ optimized_optional(const optimized_optional&) = default;
+ optimized_optional(optimized_optional&&) = default;
+
+ optimized_optional& operator=(std::nullopt_t) noexcept {
+ _object = T();
+ return *this;
+ }
+ template<typename U>
+ std::enable_if_t<std::is_same<std::decay_t<U>, T>::value, optimized_optional&>
+ operator=(U&& obj) noexcept {
+ _object = std::forward<U>(obj);
+ return *this;
+ }
+ optimized_optional& operator=(const optimized_optional&) = default;
+ optimized_optional& operator=(optimized_optional&&) = default;
+
+ explicit operator bool() const noexcept {
+ return bool(_object);
+ }
+
+ T* operator->() noexcept { return &_object; }
+ const T* operator->() const noexcept { return &_object; }
+
+ T& operator*() noexcept { return _object; }
+ const T& operator*() const noexcept { return _object; }
+
+ bool operator==(const optimized_optional& other) const {
+ return _object == other._object;
+ }
+ bool operator!=(const optimized_optional& other) const {
+ return _object != other._object;
+ }
+ friend std::ostream& operator<<(std::ostream& out, const optimized_optional& opt) {
+ if (!opt) {
+ return out << "null";
+ }
+ return out << *opt;
+ }
+};
+
+}
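A brief sketch with a hypothetical handle type that satisfies the requirements (default-constructible, nothrow move-assignable, noexcept conversion to bool); the optional adds no extra flag on top of the handle:

#include <seastar/util/optimized_optional.hh>

struct row_handle {
    int* ptr = nullptr;
    row_handle() = default;
    explicit row_handle(int* p) noexcept : ptr(p) {}
    explicit operator bool() const noexcept { return ptr != nullptr; }
};

void use_handle() {
    static int value = 7;
    seastar::optimized_optional<row_handle> opt;   // disengaged: row_handle{} converts to false
    opt = row_handle(&value);                      // engaged; sizeof(opt) == sizeof(row_handle)
}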
diff --git a/src/seastar/include/seastar/util/print_safe.hh b/src/seastar/include/seastar/util/print_safe.hh
new file mode 100644
index 000000000..488180c81
--- /dev/null
+++ b/src/seastar/include/seastar/util/print_safe.hh
@@ -0,0 +1,113 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2016 ScyllaDB
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <type_traits>
+
+namespace seastar {
+
+//
+// Collection of async-signal safe printing functions.
+//
+
+// Outputs string to stderr.
+// Async-signal safe.
+inline
+void print_safe(const char *str, size_t len) noexcept {
+ while (len) {
+ auto result = write(STDERR_FILENO, str, len);
+ if (result > 0) {
+ len -= result;
+ str += result;
+ } else if (result == 0) {
+ break;
+ } else {
+ if (errno == EINTR) {
+ // retry
+ } else {
+ break; // what can we do?
+ }
+ }
+ }
+}
+
+// Outputs string to stderr.
+// Async-signal safe.
+inline
+void print_safe(const char *str) noexcept {
+ print_safe(str, strlen(str));
+}
+
+// Fills a buffer with a zero-padded hexadecimal representation of an integer.
+// For example, convert_zero_padded_hex_safe(buf, 4, uint16_t(12)) fills the buffer with "000c".
+template<typename Integral>
+void convert_zero_padded_hex_safe(char *buf, size_t bufsz, Integral n) noexcept {
+ const char *digits = "0123456789abcdef";
+ memset(buf, '0', bufsz);
+ unsigned i = bufsz;
+ while (n) {
+ buf[--i] = digits[n & 0xf];
+ n >>= 4;
+ }
+}
+
+// Prints zero-padded hexadecimal representation of an integer to stderr.
+// For example, print_zero_padded_hex_safe(uint16_t(12)) prints "000c".
+// Async-signal safe.
+template<typename Integral>
+void print_zero_padded_hex_safe(Integral n) noexcept {
+ static_assert(std::is_integral<Integral>::value && !std::is_signed<Integral>::value, "Requires unsigned integrals");
+
+ char buf[sizeof(n) * 2];
+ convert_zero_padded_hex_safe(buf, sizeof(buf), n);
+ print_safe(buf, sizeof(buf));
+}
+
+// Fills a buffer with a decimal representation of an integer and returns the
+// number of characters written. The argument bufsz is the size of the buffer.
+// For example, convert_decimal_safe(buf, 16, 12u) writes "12" into buf and returns 2.
+template<typename Integral>
+size_t convert_decimal_safe(char *buf, size_t bufsz, Integral n) noexcept {
+ static_assert(std::is_integral<Integral>::value && !std::is_signed<Integral>::value, "Requires unsigned integrals");
+
+ char tmp[sizeof(n) * 3];
+ unsigned i = sizeof(tmp); // fill tmp from its end; the copy below assumes this starting point
+ do {
+ tmp[--i] = '0' + n % 10;
+ n /= 10;
+ } while (n);
+ memcpy(buf, tmp + i, sizeof(tmp) - i);
+ return sizeof(tmp) - i;
+}
+
+// Prints decimal representation of an integer to stderr.
+// For example, print_decimal_safe(12) prints "12".
+// Async-signal safe.
+template<typename Integral>
+void print_decimal_safe(Integral n) noexcept {
+ char buf[sizeof(n) * 3];
+ unsigned i = sizeof(buf);
+ auto len = convert_decimal_safe(buf, i, n);
+ print_safe(buf, len);
+}
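+
+// Minimal illustrative sketch of the intended use: these helpers are meant for
+// places where stdio/iostreams are off-limits, such as a signal handler. The
+// handler below is hypothetical and only shows how the calls combine.
+//
+//   void crash_handler(int signo) {
+//       print_safe("caught signal ");
+//       print_decimal_safe(static_cast<unsigned>(signo));
+//       print_safe("\n");
+//   }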
+
+}
diff --git a/src/seastar/include/seastar/util/program-options.hh b/src/seastar/include/seastar/util/program-options.hh
new file mode 100644
index 000000000..c487a3250
--- /dev/null
+++ b/src/seastar/include/seastar/util/program-options.hh
@@ -0,0 +1,99 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2017 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/core/sstring.hh>
+
+#include <boost/any.hpp>
+#include <boost/program_options.hpp>
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace seastar {
+
+namespace program_options {
+
+///
+/// \brief Wrapper for command-line options with arbitrary string associations.
+///
+/// This type, to be used with Boost.Program_options, will result in an option that stores an arbitrary number of
+/// string associations.
+///
+/// Values are specified in the form "key0=value0:[key1=value1:...]". Options of this type can be specified multiple
+/// times, and the values will be merged (with the last-provided value for a key taking precedence).
+///
+/// \note We need a distinct type (rather than a simple type alias) for overload resolution in the implementation, but
+/// advertising our inheritance of \c std::unordered_map would introduce the possibility of memory leaks since STL
+/// containers do not declare virtual destructors.
+///
+class string_map final : private std::unordered_map<sstring, sstring> {
+private:
+ using base = std::unordered_map<sstring, sstring>;
+public:
+ using base::value_type;
+ using base::key_type;
+ using base::mapped_type;
+
+ using base::base;
+ using base::at;
+ using base::find;
+ using base::count;
+ using base::emplace;
+ using base::clear;
+ using base::operator[];
+ using base::begin;
+ using base::end;
+
+ friend bool operator==(const string_map&, const string_map&);
+ friend bool operator!=(const string_map&, const string_map&);
+};
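+
+// Minimal illustrative sketch: registering a string_map option with
+// Boost.Program_options. The option name "io-props" is hypothetical; on the
+// command line it would be passed as, e.g., --io-props "k0=v0:k1=v1".
+//
+//   namespace bpo = boost::program_options;
+//   bpo::options_description opts("Options");
+//   opts.add_options()
+//       ("io-props", bpo::value<string_map>(), "colon-separated key=value pairs");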
+
+inline bool operator==(const string_map& lhs, const string_map& rhs) {
+ return static_cast<const string_map::base&>(lhs) == static_cast<const string_map::base&>(rhs);
+}
+
+inline bool operator!=(const string_map& lhs, const string_map& rhs) {
+ return !(lhs == rhs);
+}
+
+///
+/// \brief Query the value of a key in a \c string_map, or a default value if the key doesn't exist.
+///
+sstring get_or_default(const string_map&, const sstring& key, const sstring& def = sstring());
+
+std::istream& operator>>(std::istream& is, string_map&);
+std::ostream& operator<<(std::ostream& os, const string_map&);
+
+/// \cond internal
+
+//
+// Required implementation hook for Boost.Program_options.
+//
+void validate(boost::any& out, const std::vector<std::string>& in, string_map*, int);
+
+/// \endcond
+
+}
+
+}
diff --git a/src/seastar/include/seastar/util/read_first_line.hh b/src/seastar/include/seastar/util/read_first_line.hh
new file mode 100644
index 000000000..bf77d9e39
--- /dev/null
+++ b/src/seastar/include/seastar/util/read_first_line.hh
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <seastar/util/std-compat.hh>
+#include <seastar/core/sstring.hh>
+#include <boost/lexical_cast.hpp>
+
+namespace seastar {
+
+sstring read_first_line(std::filesystem::path sys_file);
+
+template <typename Type>
+Type read_first_line_as(std::filesystem::path sys_file) {
+ return boost::lexical_cast<Type>(read_first_line(sys_file));
+}
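+
+// Minimal illustrative sketch (the sysfs path below is only an example):
+//
+//   auto requests = read_first_line_as<unsigned>("/sys/class/block/sda/queue/nr_requests");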
+
+}
diff --git a/src/seastar/include/seastar/util/reference_wrapper.hh b/src/seastar/include/seastar/util/reference_wrapper.hh
new file mode 100644
index 000000000..f9c540373
--- /dev/null
+++ b/src/seastar/include/seastar/util/reference_wrapper.hh
@@ -0,0 +1,74 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2017 ScyllaDB Ltd.
+ */
+
+#pragma once
+
+namespace seastar {
+
+/// \addtogroup utilities
+/// @{
+
+/// Wrapper for lvalue references
+///
+/// reference_wrapper wraps an lvalue reference into a copyable and assignable
+/// object. It is very similar to std::reference_wrapper, except that it doesn't
+/// allow implicit construction from a reference: the only way to construct it
+/// is with ref() or cref(). The reason for that discrepancy (and also the
+/// reason why seastar::reference_wrapper was introduced) is that it serves a
+/// different purpose than std::reference_wrapper. The latter protects
+/// references from decaying and allows copying and assigning them.
+/// seastar::reference_wrapper is used mainly to force the user to explicitly
+/// state that an object is passed by reference, reducing the chance that the
+/// referred-to object is prematurely destroyed when execution is deferred
+/// to a continuation.
+template<typename T>
+class reference_wrapper {
+ T* _pointer;
+
+ explicit reference_wrapper(T& object) noexcept : _pointer(&object) { }
+
+ template<typename U>
+ friend reference_wrapper<U> ref(U&) noexcept;
+ template<typename U>
+ friend reference_wrapper<const U> cref(const U&) noexcept;
+public:
+ using type = T;
+
+ operator T&() const noexcept { return *_pointer; }
+ T& get() const noexcept { return *_pointer; }
+
+};
+
+/// Wraps a reference in a reference_wrapper
+template<typename T>
+inline reference_wrapper<T> ref(T& object) noexcept {
+ return reference_wrapper<T>(object);
+}
+
+/// Wraps a constant reference in a reference_wrapper
+template<typename T>
+inline reference_wrapper<const T> cref(const T& object) noexcept {
+ return reference_wrapper<const T>(object);
+}
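+
+// Minimal illustrative sketch of the intended use: the aliasing is explicit at
+// the call site, so a reader can see that `x` must outlive the wrapper.
+//
+//   int x = 0;
+//   auto r = seastar::ref(x);   // explicitly passes x by reference
+//   r.get() = 42;               // x is now 42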
+
+/// @}
+
+}
diff --git a/src/seastar/include/seastar/util/spinlock.hh b/src/seastar/include/seastar/util/spinlock.hh
new file mode 100644
index 000000000..12774e650
--- /dev/null
+++ b/src/seastar/include/seastar/util/spinlock.hh
@@ -0,0 +1,104 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright 2016 ScyllaDB
+ */
+
+#pragma once
+
+#include <atomic>
+#include <cassert>
+
+#if defined(__x86_64__) || defined(__i386__)
+#include <xmmintrin.h>
+#endif
+
+namespace seastar {
+
+namespace internal {
+#if defined(__x86_64__) || defined(__i386__)
+
+/// \brief Puts the current CPU thread into a "relaxed" state.
+///
+/// This function can significantly improve performance in situations such as spinlocks, where a thread
+/// spins in a tight loop waiting for a lock. The actual implementation differs between platforms. For details,
+/// see the "PAUSE" intrinsic documentation for x86 and the "yield" instruction documentation for the POWER platform.
+[[gnu::always_inline]]
+inline void cpu_relax() {
+ _mm_pause();
+}
+
+#elif defined(__PPC__)
+
+[[gnu::always_inline]]
+inline void cpu_relax() {
+ __asm__ volatile("yield");
+}
+
+#elif defined(__s390x__) || defined(__zarch__)
+
+// FIXME: there must be a better way
+[[gnu::always_inline]]
+inline void cpu_relax() {}
+
+#elif defined(__aarch64__)
+
+[[gnu::always_inline]]
+inline void cpu_relax() {
+ __asm__ volatile("yield");
+}
+
+#else
+
+[[gnu::always_inline]]
+inline void cpu_relax() {}
+#warn "Using an empty cpu_relax() for this architecture"
+
+#endif
+
+
+}
+
+namespace util {
+
+// Spin lock implementation.
+// BasicLockable.
+// Async-signal safe.
+// unlock() "synchronizes with" lock().
+class spinlock {
+ std::atomic<bool> _busy = { false };
+public:
+ spinlock() = default;
+ spinlock(const spinlock&) = delete;
+ ~spinlock() { assert(!_busy.load(std::memory_order_relaxed)); }
+ bool try_lock() noexcept {
+ return !_busy.exchange(true, std::memory_order_acquire);
+ }
+ void lock() noexcept {
+ while (_busy.exchange(true, std::memory_order_acquire)) {
+ internal::cpu_relax();
+ }
+ }
+ void unlock() noexcept {
+ _busy.store(false, std::memory_order_release);
+ }
+};
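+
+// Minimal illustrative sketch: spinlock models BasicLockable, so it works with
+// std::lock_guard (requires <mutex>); the counter below is hypothetical.
+//
+//   seastar::util::spinlock lock;
+//   unsigned counter = 0;
+//
+//   void bump() {
+//       std::lock_guard<seastar::util::spinlock> guard(lock);
+//       ++counter;
+//   }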
+
+}
+
+}
diff --git a/src/seastar/include/seastar/util/std-compat.hh b/src/seastar/include/seastar/util/std-compat.hh
new file mode 100644
index 000000000..388c6a28d
--- /dev/null
+++ b/src/seastar/include/seastar/util/std-compat.hh
@@ -0,0 +1,54 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2018 ScyllaDB
+ */
+
+#pragma once
+
+#include <optional>
+#include <string_view>
+#include <variant>
+
+#include <filesystem>
+
+#if __has_include(<memory_resource>)
+#include <memory_resource>
+#else
+#include <experimental/memory_resource>
+namespace std::pmr {
+ using namespace std::experimental::pmr;
+}
+#endif
+
+#if defined(__cpp_impl_coroutine) || defined(__cpp_coroutines)
+#define SEASTAR_COROUTINES_ENABLED
+#endif
+
+// Defining SEASTAR_ASAN_ENABLED in here is a bit of a hack, but
+// convenient since it is build system independent and in practice
+// everything includes this header.
+
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
+// clang uses __has_feature, gcc defines __SANITIZE_ADDRESS__
+#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
+#define SEASTAR_ASAN_ENABLED
+#endif
diff --git a/src/seastar/include/seastar/util/tmp_file.hh b/src/seastar/include/seastar/util/tmp_file.hh
new file mode 100644
index 000000000..e21ceefe3
--- /dev/null
+++ b/src/seastar/include/seastar/util/tmp_file.hh
@@ -0,0 +1,200 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ * Copyright 2020 ScyllaDB
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+#include <seastar/core/file.hh>
+#include <seastar/core/thread.hh>
+#include <seastar/util/std-compat.hh>
+#include <seastar/util/defer.hh>
+
+namespace seastar {
+
+const std::filesystem::path& default_tmpdir();
+void set_default_tmpdir(std::filesystem::path);
+
+class tmp_file {
+ std::filesystem::path _path;
+ file _file;
+ bool _is_open = false;
+
+ static_assert(std::is_nothrow_constructible<std::filesystem::path>::value,
+ "filesystem::path's constructor must not throw");
+ static_assert(std::is_nothrow_move_constructible<std::filesystem::path>::value,
+ "filesystem::path's move constructor must not throw");
+public:
+ tmp_file() noexcept = default;
+ tmp_file(const tmp_file&) = delete;
+ tmp_file(tmp_file&& x) noexcept;
+
+ tmp_file& operator=(tmp_file&&) noexcept = default;
+
+ ~tmp_file();
+
+ future<> open(std::filesystem::path path_template = default_tmpdir(),
+ open_flags oflags = open_flags::rw,
+ file_open_options options = {}) noexcept;
+ future<> close() noexcept;
+ future<> remove() noexcept;
+
+ template <typename Func>
+ static future<> do_with(std::filesystem::path path_template, Func&& func,
+ open_flags oflags = open_flags::rw,
+ file_open_options options = {}) noexcept {
+ static_assert(std::is_nothrow_move_constructible<Func>::value,
+ "Func's move constructor must not throw");
+ return seastar::do_with(tmp_file(), [func = std::move(func), path_template = std::move(path_template), oflags, options = std::move(options)] (tmp_file& t) mutable {
+ return t.open(std::move(path_template), oflags, std::move(options)).then([&t, func = std::move(func)] () mutable {
+ return func(t);
+ }).finally([&t] {
+ return t.close().finally([&t] {
+ return t.remove();
+ });
+ });
+ });
+ }
+
+ template <typename Func>
+ static future<> do_with(Func&& func) noexcept {
+ return do_with(default_tmpdir(), std::move(func));
+ }
+
+ bool has_path() const {
+ return !_path.empty();
+ }
+
+ bool is_open() const {
+ return _is_open;
+ }
+
+ const std::filesystem::path& get_path() const {
+ return _path;
+ }
+
+ file& get_file() {
+ return _file;
+ }
+};
+
+/// Returns a future for an opened tmp_file exclusively created by the function.
+///
+/// \param path_template - path where the file is to be created,
+/// optionally including a template for the file name.
+/// \param oflags - optional \ref open_flags (open_flags::create | open_flags::exclusive are added to those by default)
+/// \param options - additional \ref file_open_options, e.g. for setting the created file permission.
+///
+/// \note
+/// path_template may optionally include a filename template in the last component of the path.
+/// The template is indicated by two or more consecutive XX's.
+/// Those will be replaced in the result path by a unique string.
+///
+/// If no filename template is found, then path_template is assumed to refer to the directory where
+/// the temporary file is to be created at (a.k.a. the parent directory) and `default_tmp_name_template`
+/// is appended to the path as the filename template.
+///
+/// The parent directory must exist and be writable by the current process.
+///
+future<tmp_file> make_tmp_file(std::filesystem::path path_template = default_tmpdir(),
+ open_flags oflags = open_flags::rw, file_open_options options = {}) noexcept;
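+
+// Minimal illustrative sketch: create a temporary file, use it, and let do_with
+// close and remove it. The "XXXX" part of the template is replaced with a
+// unique string; the "/tmp/demo-XXXX" template is only an example.
+//
+//   future<> f = tmp_file::do_with("/tmp/demo-XXXX", [] (tmp_file& tf) {
+//       return tf.get_file().size().then([] (uint64_t size) {
+//           // a freshly created file has size 0
+//           return make_ready_future<>();
+//       });
+//   });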
+
+class tmp_dir {
+ std::filesystem::path _path;
+
+public:
+ tmp_dir() = default;
+ tmp_dir(const tmp_dir&) = delete;
+ tmp_dir(tmp_dir&& x) = default;
+
+ tmp_dir& operator=(tmp_dir&&) noexcept = default;
+
+ ~tmp_dir();
+
+ future<> create(std::filesystem::path path_template = default_tmpdir(),
+ file_permissions create_permissions = file_permissions::default_dir_permissions) noexcept;
+ future<> remove() noexcept;
+
+ template <typename Func>
+ SEASTAR_CONCEPT( requires std::is_nothrow_move_constructible_v<Func> )
+ static future<> do_with(std::filesystem::path path_template, Func&& func,
+ file_permissions create_permissions = file_permissions::default_dir_permissions) noexcept {
+ static_assert(std::is_nothrow_move_constructible_v<Func>,
+ "Func's move constructor must not throw");
+ return seastar::do_with(tmp_dir(), [func = std::move(func), path_template = std::move(path_template), create_permissions] (tmp_dir& t) mutable {
+ return t.create(std::move(path_template), create_permissions).then([&t, func = std::move(func)] () mutable {
+ return func(t);
+ }).finally([&t] {
+ return t.remove();
+ });
+ });
+ }
+
+ template <typename Func>
+ static future<> do_with(Func&& func) noexcept {
+ return do_with(default_tmpdir(), std::move(func));
+ }
+
+ template <typename Func>
+ SEASTAR_CONCEPT( requires std::is_nothrow_move_constructible_v<Func> )
+ static future<> do_with_thread(Func&& func) noexcept {
+ static_assert(std::is_nothrow_move_constructible_v<Func>,
+ "Func's move constructor must not throw");
+ return async([func = std::move(func)] () mutable {
+ auto t = tmp_dir();
+ t.create().get();
+ futurize_invoke(func, t).finally([&t] {
+ return t.remove();
+ }).get();
+ });
+ }
+
+ bool has_path() const {
+ return !_path.empty();
+ }
+
+ const std::filesystem::path& get_path() const {
+ return _path;
+ }
+};
+
+/// Returns a future for a tmp_dir exclusively created by the function.
+///
+/// \param path_template - path where the directory is to be created,
+/// optionally including a template for the directory name.
+/// \param create_permissions - optional permissions for the newly created directory.
+///
+/// \note
+/// path_template may optionally include a name template in the last component of the path.
+/// The template is indicated by two or more consecutive XX's.
+/// Those will be replaced in the result path by a unique string.
+///
+/// If no name template is found, then path_template is assumed to refer to the directory where
+/// the temporary dir is to be created at (a.k.a. the parent directory) and `default_tmp_name_template`
+/// is appended to the path as the name template for the to-be-created directory.
+///
+/// The parent directory must exist and be writable by the current process.
+///
+future<tmp_dir> make_tmp_dir(std::filesystem::path path_template = default_tmpdir(),
+ file_permissions create_permissions = file_permissions::default_dir_permissions) noexcept;
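+
+// Minimal illustrative sketch: create a temporary directory, build a path under
+// it, and let do_with remove the directory afterwards. The file name "data" is
+// only an example.
+//
+//   future<> f = tmp_dir::do_with([] (tmp_dir& td) {
+//       auto p = td.get_path() / "data";
+//       // ... create files under p, run the test, etc. ...
+//       return make_ready_future<>();
+//   });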
+
+} // namespace seastar
diff --git a/src/seastar/include/seastar/util/transform_iterator.hh b/src/seastar/include/seastar/util/transform_iterator.hh
new file mode 100644
index 000000000..95ea1ce51
--- /dev/null
+++ b/src/seastar/include/seastar/util/transform_iterator.hh
@@ -0,0 +1,57 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ */
+
+#pragma once
+
+namespace seastar {
+
+template <typename Iterator, typename Func>
+class transform_iterator {
+ Iterator _i;
+ Func _f;
+public:
+ transform_iterator(Iterator i, Func f) : _i(i), _f(f) {}
+ auto operator*() { return _f(*_i); }
+ transform_iterator& operator++() {
+ ++_i;
+ return *this;
+ }
+ transform_iterator operator++(int) {
+ transform_iterator ret(*this);
+ _i++;
+ return ret;
+ }
+ bool operator==(const transform_iterator& x) const {
+ return _i == x._i;
+ }
+ bool operator!=(const transform_iterator& x) const {
+ return !operator==(x);
+ }
+};
+
+template <typename Iterator, typename Func>
+inline
+transform_iterator<Iterator, Func>
+make_transform_iterator(Iterator i, Func f) {
+ return transform_iterator<Iterator, Func>(i, f);
+}
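+
+// Minimal illustrative sketch: adapt an iterator so that dereferencing applies
+// a function to the underlying element.
+//
+//   std::vector<int> v = {1, 2, 3};
+//   auto it = make_transform_iterator(v.begin(), [] (int x) { return x * 2; });
+//   // *it == 2; *++it == 4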
+
+}
diff --git a/src/seastar/include/seastar/util/tuple_utils.hh b/src/seastar/include/seastar/util/tuple_utils.hh
new file mode 100644
index 000000000..477f339f9
--- /dev/null
+++ b/src/seastar/include/seastar/util/tuple_utils.hh
@@ -0,0 +1,174 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2017 ScyllaDB.
+ */
+
+#pragma once
+
+#include <tuple>
+#include <utility>
+#include <stddef.h>
+
+namespace seastar {
+
+/// \cond internal
+namespace internal {
+
+template<typename Tuple>
+Tuple untuple(Tuple t) {
+ return t;
+}
+
+template<typename T>
+T untuple(std::tuple<T> t) {
+ return std::get<0>(std::move(t));
+}
+
+template<typename Tuple, typename Function, size_t... I>
+void tuple_for_each_helper(Tuple&& t, Function&& f, std::index_sequence<I...>&&) {
+ auto ignore_me = { (f(std::get<I>(std::forward<Tuple>(t))), 1)... };
+ (void)ignore_me;
+}
+
+template<typename Tuple, typename MapFunction, size_t... I>
+auto tuple_map_helper(Tuple&& t, MapFunction&& f, std::index_sequence<I...>&&) {
+ return std::make_tuple(f(std::get<I>(std::forward<Tuple>(t)))...);
+}
+
+template<size_t I, typename IndexSequence>
+struct prepend;
+
+template<size_t I, size_t... Is>
+struct prepend<I, std::index_sequence<Is...>> {
+ using type = std::index_sequence<I, Is...>;
+};
+
+template<template<typename> class Filter, typename Tuple, typename IndexSequence>
+struct tuple_filter;
+
+template<template<typename> class Filter, typename T, typename... Ts, size_t I, size_t... Is>
+struct tuple_filter<Filter, std::tuple<T, Ts...>, std::index_sequence<I, Is...>> {
+ using tail = typename tuple_filter<Filter, std::tuple<Ts...>, std::index_sequence<Is...>>::type;
+ using type = std::conditional_t<Filter<T>::value, typename prepend<I, tail>::type, tail>;
+};
+
+template<template<typename> class Filter>
+struct tuple_filter<Filter, std::tuple<>, std::index_sequence<>> {
+ using type = std::index_sequence<>;
+};
+
+template<typename Tuple, size_t... I>
+auto tuple_filter_helper(Tuple&& t, std::index_sequence<I...>&&) {
+ return std::make_tuple(std::get<I>(std::forward<Tuple>(t))...);
+}
+
+}
+/// \endcond
+
+/// \addtogroup utilities
+/// @{
+
+/// Applies type transformation to all types in tuple
+///
+/// Member type `type` is set to a tuple type which is a result of applying
+/// transformation `MapClass<T>::type` to each element `T` of the input tuple
+/// type.
+///
+/// \tparam MapClass class template defining type transformation
+/// \tparam Tuple input tuple type
+template<template<typename> class MapClass, typename Tuple>
+struct tuple_map_types;
+
+/// @}
+
+template<template<typename> class MapClass, typename... Elements>
+struct tuple_map_types<MapClass, std::tuple<Elements...>> {
+ using type = std::tuple<typename MapClass<Elements>::type...>;
+};
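+
+// Minimal illustrative sketch of the type transformation (with <type_traits>):
+//
+//   using pointers = tuple_map_types<std::add_pointer, std::tuple<int, char>>::type;
+//   static_assert(std::is_same<pointers, std::tuple<int*, char*>>::value, "");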
+
+/// \addtogroup utilities
+/// @{
+
+/// Filters elements in tuple by their type
+///
+/// Returns a tuple containing only those elements whose type `T` makes the
+/// expression FilterClass<T>::value true.
+///
+/// \tparam FilterClass class template whose member `value` is true for element types that
+///  should be present in the result
+/// \param t tuple to filter
+/// \return a tuple containing the elements whose type passed the test
+template<template<typename> class FilterClass, typename... Elements>
+auto tuple_filter_by_type(const std::tuple<Elements...>& t) {
+ using sequence = typename internal::tuple_filter<FilterClass, std::tuple<Elements...>,
+ std::index_sequence_for<Elements...>>::type;
+ return internal::tuple_filter_helper(t, sequence());
+}
+template<template<typename> class FilterClass, typename... Elements>
+auto tuple_filter_by_type(std::tuple<Elements...>&& t) {
+ using sequence = typename internal::tuple_filter<FilterClass, std::tuple<Elements...>,
+ std::index_sequence_for<Elements...>>::type;
+ return internal::tuple_filter_helper(std::move(t), sequence());
+}
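+
+// Minimal illustrative sketch: keep only the integral elements of a tuple.
+//
+//   auto t = std::make_tuple(1, 2.5, 3u);
+//   auto ints = tuple_filter_by_type<std::is_integral>(t);
+//   // ints is std::tuple<int, unsigned>{1, 3u}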
+
+/// Applies function to all elements in tuple
+///
+/// Applies given function to all elements in the tuple and returns a tuple
+/// of results.
+///
+/// \param t original tuple
+/// \param f function to apply
+/// \return tuple of results returned by f for each element in t
+template<typename Function, typename... Elements>
+auto tuple_map(const std::tuple<Elements...>& t, Function&& f) {
+ return internal::tuple_map_helper(t, std::forward<Function>(f),
+ std::index_sequence_for<Elements...>());
+}
+template<typename Function, typename... Elements>
+auto tuple_map(std::tuple<Elements...>&& t, Function&& f) {
+ return internal::tuple_map_helper(std::move(t), std::forward<Function>(f),
+ std::index_sequence_for<Elements...>());
+}
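+
+// Minimal illustrative sketch: apply a function to every element of a tuple.
+//
+//   auto doubled = tuple_map(std::make_tuple(1, 2.5), [] (auto x) { return x * 2; });
+//   // doubled == std::make_tuple(2, 5.0)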
+
+/// Iterate over all elements in tuple
+///
+/// Iterates over the given tuple and calls the specified function for each of
+/// its elements.
+///
+/// \param t a tuple to iterate over
+/// \param f function to call for each tuple element
+template<typename Function, typename... Elements>
+void tuple_for_each(const std::tuple<Elements...>& t, Function&& f) {
+ return internal::tuple_for_each_helper(t, std::forward<Function>(f),
+ std::index_sequence_for<Elements...>());
+}
+template<typename Function, typename... Elements>
+void tuple_for_each(std::tuple<Elements...>& t, Function&& f) {
+ return internal::tuple_for_each_helper(t, std::forward<Function>(f),
+ std::index_sequence_for<Elements...>());
+}
+template<typename Function, typename... Elements>
+void tuple_for_each(std::tuple<Elements...>&& t, Function&& f) {
+ return internal::tuple_for_each_helper(std::move(t), std::forward<Function>(f),
+ std::index_sequence_for<Elements...>());
+}
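+
+// Minimal illustrative sketch: visit every element of a tuple in order
+// (with <iostream>).
+//
+//   tuple_for_each(std::make_tuple(1, 2.5), [] (auto x) { std::cout << x << ' '; });
+//   // prints "1 2.5 "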
+
+/// @}
+
+}
diff --git a/src/seastar/include/seastar/util/used_size.hh b/src/seastar/include/seastar/util/used_size.hh
new file mode 100644
index 000000000..7b4212d7e
--- /dev/null
+++ b/src/seastar/include/seastar/util/used_size.hh
@@ -0,0 +1,36 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2020 ScyllaDB Ltd.
+ */
+
+#pragma once
+
+#include <stddef.h>
+#include <type_traits>
+
+namespace seastar {
+namespace internal {
+// Empty types have a size of 1, but that byte is not actually
+// used. This helper is used to avoid accessing that byte.
+template<typename T>
+struct used_size {
+ static constexpr size_t value = std::is_empty<T>::value ? 0 : sizeof(T);
+};
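+
+// Minimal illustrative sketch:
+//
+//   struct empty {};
+//   static_assert(used_size<empty>::value == 0, "");
+//   static_assert(used_size<int>::value == sizeof(int), "");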
+}
+}
diff --git a/src/seastar/include/seastar/util/variant_utils.hh b/src/seastar/include/seastar/util/variant_utils.hh
new file mode 100644
index 000000000..ddb2192d2
--- /dev/null
+++ b/src/seastar/include/seastar/util/variant_utils.hh
@@ -0,0 +1,102 @@
+/*
+ * This file is open source software, licensed to you under the terms
+ * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
+ * distributed with this work for additional information regarding copyright
+ * ownership. You may not use this file except in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Copyright (C) 2017 ScyllaDB.
+ */
+
+#pragma once
+
+#include <seastar/util/std-compat.hh>
+
+namespace seastar {
+
+/// \cond internal
+namespace internal {
+
+template<typename... Args>
+struct variant_visitor : Args... {
+ variant_visitor(Args&&... a) : Args(std::move(a))... {}
+ using Args::operator()...;
+};
+
+template<typename... Args> variant_visitor(Args&&...) -> variant_visitor<Args...>;
+
+}
+/// \endcond
+
+/// \addtogroup utilities
+/// @{
+
+/// Creates a visitor from function objects.
+///
+/// Returns a visitor object comprised of the provided function objects. Can be
+/// used with std::variant or any other custom variant implementation.
+///
+/// \param args function objects each accepting one or some types stored in the variant as input
+template <typename... Args>
+auto make_visitor(Args&&... args)
+{
+ return internal::variant_visitor<Args...>(std::forward<Args>(args)...);
+}
+
+/// Applies a static visitor comprised of supplied lambdas to a variant.
+/// Note that the lambdas should cover all the types that the variant can possibly hold.
+///
+/// Returns the common type of return types of all lambdas.
+///
+/// \tparam Variant the type of a variant
+/// \tparam Args types of lambda objects
+/// \param variant the variant object
+/// \param args lambda objects each accepting one or some types stored in the variant as input
+/// \return the result of invoking the matching lambda on the value currently held by the variant
+template <typename Variant, typename... Args>
+inline auto visit(Variant&& variant, Args&&... args)
+{
+ static_assert(sizeof...(Args) > 0, "At least one lambda must be provided for visitation");
+ return std::visit(
+ make_visitor(std::forward<Args>(args)...),
+ variant);
+}
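+
+// Minimal illustrative sketch: the lambdas together cover every alternative of
+// the variant, and visit() dispatches to the matching one.
+//
+//   std::variant<int, double> v = 3;
+//   auto name = seastar::visit(v,
+//       [] (int)    { return "int"; },
+//       [] (double) { return "double"; });
+//   // name == "int"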
+
+namespace internal {
+template<typename... Args>
+struct castable_variant {
+ std::variant<Args...> var;
+
+ template<typename... SuperArgs>
+ operator std::variant<SuperArgs...>() && {
+ return std::visit([] (auto&& x) {
+ return std::variant<SuperArgs...>(std::move(x));
+ }, var);
+ }
+};
+}
+
+template<typename... Args>
+internal::castable_variant<Args...> variant_cast(std::variant<Args...>&& var) {
+ return {std::move(var)};
+}
+
+template<typename... Args>
+internal::castable_variant<Args...> variant_cast(const std::variant<Args...>& var) {
+ return {var};
+}
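+
+// Minimal illustrative sketch: widen a variant to one with a superset of
+// alternatives.
+//
+//   std::variant<int> narrow = 1;
+//   std::variant<int, double> wide = variant_cast(std::move(narrow));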
+
+/// @}
+
+}