From 19fcec84d8d7d21e796c7624e521b60d28ee21ed Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 20:45:59 +0200 Subject: Adding upstream version 16.2.11+ds. Signed-off-by: Daniel Baumann --- src/seastar/include/seastar/core/abort_on_ebadf.hh | 36 + src/seastar/include/seastar/core/abort_source.hh | 139 ++ src/seastar/include/seastar/core/alien.hh | 184 ++ src/seastar/include/seastar/core/align.hh | 55 + src/seastar/include/seastar/core/aligned_buffer.hh | 45 + src/seastar/include/seastar/core/app-template.hh | 100 + src/seastar/include/seastar/core/apply.hh | 64 + src/seastar/include/seastar/core/array_map.hh | 50 + src/seastar/include/seastar/core/bitops.hh | 73 + src/seastar/include/seastar/core/bitset-iter.hh | 198 ++ src/seastar/include/seastar/core/byteorder.hh | 127 ++ src/seastar/include/seastar/core/cacheline.hh | 42 + src/seastar/include/seastar/core/checked_ptr.hh | 199 ++ src/seastar/include/seastar/core/chunked_fifo.hh | 626 ++++++ .../include/seastar/core/circular_buffer.hh | 511 +++++ .../seastar/core/circular_buffer_fixed_capacity.hh | 378 ++++ .../include/seastar/core/condition-variable.hh | 171 ++ src/seastar/include/seastar/core/coroutine.hh | 196 ++ src/seastar/include/seastar/core/deleter.hh | 281 +++ src/seastar/include/seastar/core/distributed.hh | 32 + src/seastar/include/seastar/core/do_with.hh | 153 ++ src/seastar/include/seastar/core/dpdk_rte.hh | 64 + src/seastar/include/seastar/core/enum.hh | 46 + .../include/seastar/core/exception_hacks.hh | 26 + .../include/seastar/core/execution_stage.hh | 543 +++++ src/seastar/include/seastar/core/expiring_fifo.hh | 217 ++ src/seastar/include/seastar/core/fair_queue.hh | 247 +++ src/seastar/include/seastar/core/file-types.hh | 140 ++ src/seastar/include/seastar/core/file.hh | 586 ++++++ src/seastar/include/seastar/core/fsqual.hh | 30 + src/seastar/include/seastar/core/fstream.hh | 151 ++ .../include/seastar/core/function_traits.hh | 68 + 
src/seastar/include/seastar/core/future-util.hh | 31 + src/seastar/include/seastar/core/future.hh | 2196 ++++++++++++++++++++ src/seastar/include/seastar/core/gate.hh | 170 ++ .../include/seastar/core/idle_cpu_handler.hh | 59 + .../include/seastar/core/internal/api-level.hh | 82 + .../seastar/core/internal/buffer_allocator.hh | 43 + .../include/seastar/core/internal/io_desc.hh | 35 + .../include/seastar/core/internal/io_request.hh | 266 +++ src/seastar/include/seastar/core/internal/poll.hh | 59 + .../include/seastar/core/internal/pollable_fd.hh | 219 ++ src/seastar/include/seastar/core/io_queue.hh | 170 ++ src/seastar/include/seastar/core/iostream-impl.hh | 535 +++++ src/seastar/include/seastar/core/iostream.hh | 360 ++++ src/seastar/include/seastar/core/layered_file.hh | 67 + src/seastar/include/seastar/core/linux-aio.hh | 234 +++ src/seastar/include/seastar/core/loop.hh | 715 +++++++ src/seastar/include/seastar/core/lowres_clock.hh | 160 ++ src/seastar/include/seastar/core/make_task.hh | 62 + src/seastar/include/seastar/core/manual_clock.hh | 51 + src/seastar/include/seastar/core/map_reduce.hh | 254 +++ src/seastar/include/seastar/core/memory.hh | 370 ++++ src/seastar/include/seastar/core/metrics.hh | 587 ++++++ src/seastar/include/seastar/core/metrics_api.hh | 386 ++++ .../include/seastar/core/metrics_registration.hh | 173 ++ src/seastar/include/seastar/core/metrics_types.hh | 83 + .../include/seastar/core/on_internal_error.hh | 56 + src/seastar/include/seastar/core/pipe.hh | 267 +++ .../seastar/core/polymorphic_temporary_buffer.hh | 43 + src/seastar/include/seastar/core/posix.hh | 492 +++++ src/seastar/include/seastar/core/preempt.hh | 58 + src/seastar/include/seastar/core/prefetch.hh | 115 + src/seastar/include/seastar/core/print.hh | 148 ++ src/seastar/include/seastar/core/prometheus.hh | 51 + src/seastar/include/seastar/core/queue.hh | 279 +++ src/seastar/include/seastar/core/ragel.hh | 140 ++ src/seastar/include/seastar/core/reactor.hh | 755 +++++++ 
src/seastar/include/seastar/core/reactor_config.hh | 47 + .../include/seastar/core/report_exception.hh | 31 + src/seastar/include/seastar/core/resource.hh | 94 + src/seastar/include/seastar/core/rwlock.hh | 180 ++ .../include/seastar/core/scattered_message.hh | 112 + src/seastar/include/seastar/core/scheduling.hh | 366 ++++ .../include/seastar/core/scheduling_specific.hh | 189 ++ src/seastar/include/seastar/core/scollectd.hh | 848 ++++++++ src/seastar/include/seastar/core/scollectd_api.hh | 35 + src/seastar/include/seastar/core/seastar.hh | 386 ++++ src/seastar/include/seastar/core/semaphore.hh | 572 +++++ src/seastar/include/seastar/core/sharded.hh | 909 ++++++++ src/seastar/include/seastar/core/shared_future.hh | 299 +++ src/seastar/include/seastar/core/shared_mutex.hh | 183 ++ src/seastar/include/seastar/core/shared_ptr.hh | 868 ++++++++ .../seastar/core/shared_ptr_debug_helper.hh | 70 + .../include/seastar/core/shared_ptr_incomplete.hh | 44 + src/seastar/include/seastar/core/simple-stream.hh | 639 ++++++ src/seastar/include/seastar/core/slab.hh | 568 +++++ src/seastar/include/seastar/core/sleep.hh | 93 + src/seastar/include/seastar/core/smp.hh | 444 ++++ src/seastar/include/seastar/core/sstring.hh | 779 +++++++ src/seastar/include/seastar/core/stall_sampler.hh | 53 + src/seastar/include/seastar/core/std-coroutine.hh | 94 + src/seastar/include/seastar/core/stream.hh | 174 ++ .../seastar/core/systemwide_memory_barrier.hh | 37 + src/seastar/include/seastar/core/task.hh | 67 + .../include/seastar/core/temporary_buffer.hh | 246 +++ src/seastar/include/seastar/core/thread.hh | 285 +++ .../include/seastar/core/thread_cputime_clock.hh | 48 + src/seastar/include/seastar/core/thread_impl.hh | 84 + .../include/seastar/core/timed_out_error.hh | 42 + src/seastar/include/seastar/core/timer-set.hh | 253 +++ src/seastar/include/seastar/core/timer.hh | 225 ++ src/seastar/include/seastar/core/transfer.hh | 75 + src/seastar/include/seastar/core/unaligned.hh | 78 + 
src/seastar/include/seastar/core/units.hh | 30 + .../include/seastar/core/vector-data-sink.hh | 47 + src/seastar/include/seastar/core/weak_ptr.hh | 120 ++ src/seastar/include/seastar/core/when_all.hh | 562 +++++ .../include/seastar/core/with_scheduling_group.hh | 77 + src/seastar/include/seastar/core/with_timeout.hh | 73 + src/seastar/include/seastar/http/api_docs.hh | 340 +++ src/seastar/include/seastar/http/common.hh | 74 + src/seastar/include/seastar/http/exception.hh | 142 ++ src/seastar/include/seastar/http/file_handler.hh | 172 ++ .../include/seastar/http/function_handlers.hh | 132 ++ src/seastar/include/seastar/http/handlers.hh | 73 + src/seastar/include/seastar/http/httpd.hh | 243 +++ src/seastar/include/seastar/http/json_path.hh | 190 ++ src/seastar/include/seastar/http/matcher.hh | 111 + src/seastar/include/seastar/http/matchrules.hh | 122 ++ src/seastar/include/seastar/http/mime_types.hh | 33 + src/seastar/include/seastar/http/reply.hh | 182 ++ src/seastar/include/seastar/http/request.hh | 134 ++ src/seastar/include/seastar/http/routes.hh | 289 +++ src/seastar/include/seastar/http/transformers.hh | 59 + src/seastar/include/seastar/json/formatter.hh | 335 +++ src/seastar/include/seastar/json/json_elements.hh | 355 ++++ src/seastar/include/seastar/net/api.hh | 392 ++++ src/seastar/include/seastar/net/arp.hh | 296 +++ src/seastar/include/seastar/net/byteorder.hh | 128 ++ src/seastar/include/seastar/net/config.hh | 65 + src/seastar/include/seastar/net/const.hh | 44 + src/seastar/include/seastar/net/dhcp.hh | 84 + src/seastar/include/seastar/net/dns.hh | 156 ++ src/seastar/include/seastar/net/dpdk.hh | 54 + src/seastar/include/seastar/net/ethernet.hh | 96 + src/seastar/include/seastar/net/inet_address.hh | 128 ++ src/seastar/include/seastar/net/ip.hh | 529 +++++ src/seastar/include/seastar/net/ip_checksum.hh | 76 + src/seastar/include/seastar/net/native-stack.hh | 36 + src/seastar/include/seastar/net/net.hh | 308 +++ 
.../include/seastar/net/packet-data-source.hh | 53 + src/seastar/include/seastar/net/packet-util.hh | 158 ++ src/seastar/include/seastar/net/packet.hh | 622 ++++++ src/seastar/include/seastar/net/posix-stack.hh | 223 ++ src/seastar/include/seastar/net/proxy.hh | 32 + src/seastar/include/seastar/net/socket_defs.hh | 186 ++ src/seastar/include/seastar/net/stack.hh | 103 + src/seastar/include/seastar/net/tcp-stack.hh | 50 + src/seastar/include/seastar/net/tcp.hh | 2135 +++++++++++++++++++ src/seastar/include/seastar/net/tls.hh | 346 +++ src/seastar/include/seastar/net/toeplitz.hh | 99 + src/seastar/include/seastar/net/udp.hh | 59 + src/seastar/include/seastar/net/unix_address.hh | 75 + .../include/seastar/net/virtio-interface.hh | 131 ++ src/seastar/include/seastar/net/virtio.hh | 33 + src/seastar/include/seastar/rpc/lz4_compressor.hh | 48 + .../seastar/rpc/lz4_fragmented_compressor.hh | 44 + .../seastar/rpc/multi_algo_compressor_factory.hh | 80 + src/seastar/include/seastar/rpc/rpc.hh | 870 ++++++++ src/seastar/include/seastar/rpc/rpc_impl.hh | 881 ++++++++ src/seastar/include/seastar/rpc/rpc_types.hh | 386 ++++ src/seastar/include/seastar/testing/entry_point.hh | 33 + src/seastar/include/seastar/testing/exchanger.hh | 88 + .../include/seastar/testing/on_internal_error.hh | 40 + src/seastar/include/seastar/testing/perf_tests.hh | 259 +++ .../include/seastar/testing/seastar_test.hh | 68 + src/seastar/include/seastar/testing/test_case.hh | 36 + src/seastar/include/seastar/testing/test_runner.hh | 59 + .../include/seastar/testing/thread_test_case.hh | 46 + .../include/seastar/util/alloc_failure_injector.hh | 132 ++ .../include/seastar/util/attribute-compat.hh | 27 + src/seastar/include/seastar/util/backtrace.hh | 230 ++ src/seastar/include/seastar/util/bool_class.hh | 110 + src/seastar/include/seastar/util/concepts.hh | 34 + src/seastar/include/seastar/util/conversions.hh | 48 + .../include/seastar/util/critical_alloc_section.hh | 70 + 
src/seastar/include/seastar/util/defer.hh | 58 + src/seastar/include/seastar/util/eclipse.hh | 36 + src/seastar/include/seastar/util/exceptions.hh | 45 + src/seastar/include/seastar/util/file.hh | 45 + .../seastar/util/function_input_iterator.hh | 72 + src/seastar/include/seastar/util/gcc6-concepts.hh | 25 + src/seastar/include/seastar/util/indirect.hh | 74 + src/seastar/include/seastar/util/is_smart_ptr.hh | 34 + src/seastar/include/seastar/util/later.hh | 44 + src/seastar/include/seastar/util/lazy.hh | 153 ++ src/seastar/include/seastar/util/log-cli.hh | 80 + src/seastar/include/seastar/util/log-impl.hh | 117 ++ src/seastar/include/seastar/util/log.hh | 464 +++++ .../include/seastar/util/memory_diagnostics.hh | 85 + .../include/seastar/util/noncopyable_function.hh | 225 ++ .../include/seastar/util/optimized_optional.hh | 101 + src/seastar/include/seastar/util/print_safe.hh | 113 + .../include/seastar/util/program-options.hh | 99 + .../include/seastar/util/read_first_line.hh | 14 + .../include/seastar/util/reference_wrapper.hh | 74 + src/seastar/include/seastar/util/spinlock.hh | 104 + src/seastar/include/seastar/util/std-compat.hh | 54 + src/seastar/include/seastar/util/tmp_file.hh | 200 ++ .../include/seastar/util/transform_iterator.hh | 57 + src/seastar/include/seastar/util/tuple_utils.hh | 174 ++ src/seastar/include/seastar/util/used_size.hh | 36 + src/seastar/include/seastar/util/variant_utils.hh | 102 + 204 files changed, 41932 insertions(+) create mode 100644 src/seastar/include/seastar/core/abort_on_ebadf.hh create mode 100644 src/seastar/include/seastar/core/abort_source.hh create mode 100644 src/seastar/include/seastar/core/alien.hh create mode 100644 src/seastar/include/seastar/core/align.hh create mode 100644 src/seastar/include/seastar/core/aligned_buffer.hh create mode 100644 src/seastar/include/seastar/core/app-template.hh create mode 100644 src/seastar/include/seastar/core/apply.hh create mode 100644 src/seastar/include/seastar/core/array_map.hh 
create mode 100644 src/seastar/include/seastar/core/bitops.hh create mode 100644 src/seastar/include/seastar/core/bitset-iter.hh create mode 100644 src/seastar/include/seastar/core/byteorder.hh create mode 100644 src/seastar/include/seastar/core/cacheline.hh create mode 100644 src/seastar/include/seastar/core/checked_ptr.hh create mode 100644 src/seastar/include/seastar/core/chunked_fifo.hh create mode 100644 src/seastar/include/seastar/core/circular_buffer.hh create mode 100644 src/seastar/include/seastar/core/circular_buffer_fixed_capacity.hh create mode 100644 src/seastar/include/seastar/core/condition-variable.hh create mode 100644 src/seastar/include/seastar/core/coroutine.hh create mode 100644 src/seastar/include/seastar/core/deleter.hh create mode 100644 src/seastar/include/seastar/core/distributed.hh create mode 100644 src/seastar/include/seastar/core/do_with.hh create mode 100644 src/seastar/include/seastar/core/dpdk_rte.hh create mode 100644 src/seastar/include/seastar/core/enum.hh create mode 100644 src/seastar/include/seastar/core/exception_hacks.hh create mode 100644 src/seastar/include/seastar/core/execution_stage.hh create mode 100644 src/seastar/include/seastar/core/expiring_fifo.hh create mode 100644 src/seastar/include/seastar/core/fair_queue.hh create mode 100644 src/seastar/include/seastar/core/file-types.hh create mode 100644 src/seastar/include/seastar/core/file.hh create mode 100644 src/seastar/include/seastar/core/fsqual.hh create mode 100644 src/seastar/include/seastar/core/fstream.hh create mode 100644 src/seastar/include/seastar/core/function_traits.hh create mode 100644 src/seastar/include/seastar/core/future-util.hh create mode 100644 src/seastar/include/seastar/core/future.hh create mode 100644 src/seastar/include/seastar/core/gate.hh create mode 100644 src/seastar/include/seastar/core/idle_cpu_handler.hh create mode 100644 src/seastar/include/seastar/core/internal/api-level.hh create mode 100644 
src/seastar/include/seastar/core/internal/buffer_allocator.hh create mode 100644 src/seastar/include/seastar/core/internal/io_desc.hh create mode 100644 src/seastar/include/seastar/core/internal/io_request.hh create mode 100644 src/seastar/include/seastar/core/internal/poll.hh create mode 100644 src/seastar/include/seastar/core/internal/pollable_fd.hh create mode 100644 src/seastar/include/seastar/core/io_queue.hh create mode 100644 src/seastar/include/seastar/core/iostream-impl.hh create mode 100644 src/seastar/include/seastar/core/iostream.hh create mode 100644 src/seastar/include/seastar/core/layered_file.hh create mode 100644 src/seastar/include/seastar/core/linux-aio.hh create mode 100644 src/seastar/include/seastar/core/loop.hh create mode 100644 src/seastar/include/seastar/core/lowres_clock.hh create mode 100644 src/seastar/include/seastar/core/make_task.hh create mode 100644 src/seastar/include/seastar/core/manual_clock.hh create mode 100644 src/seastar/include/seastar/core/map_reduce.hh create mode 100644 src/seastar/include/seastar/core/memory.hh create mode 100644 src/seastar/include/seastar/core/metrics.hh create mode 100644 src/seastar/include/seastar/core/metrics_api.hh create mode 100644 src/seastar/include/seastar/core/metrics_registration.hh create mode 100644 src/seastar/include/seastar/core/metrics_types.hh create mode 100644 src/seastar/include/seastar/core/on_internal_error.hh create mode 100644 src/seastar/include/seastar/core/pipe.hh create mode 100644 src/seastar/include/seastar/core/polymorphic_temporary_buffer.hh create mode 100644 src/seastar/include/seastar/core/posix.hh create mode 100644 src/seastar/include/seastar/core/preempt.hh create mode 100644 src/seastar/include/seastar/core/prefetch.hh create mode 100644 src/seastar/include/seastar/core/print.hh create mode 100644 src/seastar/include/seastar/core/prometheus.hh create mode 100644 src/seastar/include/seastar/core/queue.hh create mode 100644 
src/seastar/include/seastar/core/ragel.hh create mode 100644 src/seastar/include/seastar/core/reactor.hh create mode 100644 src/seastar/include/seastar/core/reactor_config.hh create mode 100644 src/seastar/include/seastar/core/report_exception.hh create mode 100644 src/seastar/include/seastar/core/resource.hh create mode 100644 src/seastar/include/seastar/core/rwlock.hh create mode 100644 src/seastar/include/seastar/core/scattered_message.hh create mode 100644 src/seastar/include/seastar/core/scheduling.hh create mode 100644 src/seastar/include/seastar/core/scheduling_specific.hh create mode 100644 src/seastar/include/seastar/core/scollectd.hh create mode 100644 src/seastar/include/seastar/core/scollectd_api.hh create mode 100644 src/seastar/include/seastar/core/seastar.hh create mode 100644 src/seastar/include/seastar/core/semaphore.hh create mode 100644 src/seastar/include/seastar/core/sharded.hh create mode 100644 src/seastar/include/seastar/core/shared_future.hh create mode 100644 src/seastar/include/seastar/core/shared_mutex.hh create mode 100644 src/seastar/include/seastar/core/shared_ptr.hh create mode 100644 src/seastar/include/seastar/core/shared_ptr_debug_helper.hh create mode 100644 src/seastar/include/seastar/core/shared_ptr_incomplete.hh create mode 100644 src/seastar/include/seastar/core/simple-stream.hh create mode 100644 src/seastar/include/seastar/core/slab.hh create mode 100644 src/seastar/include/seastar/core/sleep.hh create mode 100644 src/seastar/include/seastar/core/smp.hh create mode 100644 src/seastar/include/seastar/core/sstring.hh create mode 100644 src/seastar/include/seastar/core/stall_sampler.hh create mode 100644 src/seastar/include/seastar/core/std-coroutine.hh create mode 100644 src/seastar/include/seastar/core/stream.hh create mode 100644 src/seastar/include/seastar/core/systemwide_memory_barrier.hh create mode 100644 src/seastar/include/seastar/core/task.hh create mode 100644 src/seastar/include/seastar/core/temporary_buffer.hh 
create mode 100644 src/seastar/include/seastar/core/thread.hh create mode 100644 src/seastar/include/seastar/core/thread_cputime_clock.hh create mode 100644 src/seastar/include/seastar/core/thread_impl.hh create mode 100644 src/seastar/include/seastar/core/timed_out_error.hh create mode 100644 src/seastar/include/seastar/core/timer-set.hh create mode 100644 src/seastar/include/seastar/core/timer.hh create mode 100644 src/seastar/include/seastar/core/transfer.hh create mode 100644 src/seastar/include/seastar/core/unaligned.hh create mode 100644 src/seastar/include/seastar/core/units.hh create mode 100644 src/seastar/include/seastar/core/vector-data-sink.hh create mode 100644 src/seastar/include/seastar/core/weak_ptr.hh create mode 100644 src/seastar/include/seastar/core/when_all.hh create mode 100644 src/seastar/include/seastar/core/with_scheduling_group.hh create mode 100644 src/seastar/include/seastar/core/with_timeout.hh create mode 100644 src/seastar/include/seastar/http/api_docs.hh create mode 100644 src/seastar/include/seastar/http/common.hh create mode 100644 src/seastar/include/seastar/http/exception.hh create mode 100644 src/seastar/include/seastar/http/file_handler.hh create mode 100644 src/seastar/include/seastar/http/function_handlers.hh create mode 100644 src/seastar/include/seastar/http/handlers.hh create mode 100644 src/seastar/include/seastar/http/httpd.hh create mode 100644 src/seastar/include/seastar/http/json_path.hh create mode 100644 src/seastar/include/seastar/http/matcher.hh create mode 100644 src/seastar/include/seastar/http/matchrules.hh create mode 100644 src/seastar/include/seastar/http/mime_types.hh create mode 100644 src/seastar/include/seastar/http/reply.hh create mode 100644 src/seastar/include/seastar/http/request.hh create mode 100644 src/seastar/include/seastar/http/routes.hh create mode 100644 src/seastar/include/seastar/http/transformers.hh create mode 100644 src/seastar/include/seastar/json/formatter.hh create mode 100644 
src/seastar/include/seastar/json/json_elements.hh create mode 100644 src/seastar/include/seastar/net/api.hh create mode 100644 src/seastar/include/seastar/net/arp.hh create mode 100644 src/seastar/include/seastar/net/byteorder.hh create mode 100644 src/seastar/include/seastar/net/config.hh create mode 100644 src/seastar/include/seastar/net/const.hh create mode 100644 src/seastar/include/seastar/net/dhcp.hh create mode 100644 src/seastar/include/seastar/net/dns.hh create mode 100644 src/seastar/include/seastar/net/dpdk.hh create mode 100644 src/seastar/include/seastar/net/ethernet.hh create mode 100644 src/seastar/include/seastar/net/inet_address.hh create mode 100644 src/seastar/include/seastar/net/ip.hh create mode 100644 src/seastar/include/seastar/net/ip_checksum.hh create mode 100644 src/seastar/include/seastar/net/native-stack.hh create mode 100644 src/seastar/include/seastar/net/net.hh create mode 100644 src/seastar/include/seastar/net/packet-data-source.hh create mode 100644 src/seastar/include/seastar/net/packet-util.hh create mode 100644 src/seastar/include/seastar/net/packet.hh create mode 100644 src/seastar/include/seastar/net/posix-stack.hh create mode 100644 src/seastar/include/seastar/net/proxy.hh create mode 100644 src/seastar/include/seastar/net/socket_defs.hh create mode 100644 src/seastar/include/seastar/net/stack.hh create mode 100644 src/seastar/include/seastar/net/tcp-stack.hh create mode 100644 src/seastar/include/seastar/net/tcp.hh create mode 100644 src/seastar/include/seastar/net/tls.hh create mode 100644 src/seastar/include/seastar/net/toeplitz.hh create mode 100644 src/seastar/include/seastar/net/udp.hh create mode 100644 src/seastar/include/seastar/net/unix_address.hh create mode 100644 src/seastar/include/seastar/net/virtio-interface.hh create mode 100644 src/seastar/include/seastar/net/virtio.hh create mode 100644 src/seastar/include/seastar/rpc/lz4_compressor.hh create mode 100644 
src/seastar/include/seastar/rpc/lz4_fragmented_compressor.hh create mode 100644 src/seastar/include/seastar/rpc/multi_algo_compressor_factory.hh create mode 100644 src/seastar/include/seastar/rpc/rpc.hh create mode 100644 src/seastar/include/seastar/rpc/rpc_impl.hh create mode 100644 src/seastar/include/seastar/rpc/rpc_types.hh create mode 100644 src/seastar/include/seastar/testing/entry_point.hh create mode 100644 src/seastar/include/seastar/testing/exchanger.hh create mode 100644 src/seastar/include/seastar/testing/on_internal_error.hh create mode 100644 src/seastar/include/seastar/testing/perf_tests.hh create mode 100644 src/seastar/include/seastar/testing/seastar_test.hh create mode 100644 src/seastar/include/seastar/testing/test_case.hh create mode 100644 src/seastar/include/seastar/testing/test_runner.hh create mode 100644 src/seastar/include/seastar/testing/thread_test_case.hh create mode 100644 src/seastar/include/seastar/util/alloc_failure_injector.hh create mode 100644 src/seastar/include/seastar/util/attribute-compat.hh create mode 100644 src/seastar/include/seastar/util/backtrace.hh create mode 100644 src/seastar/include/seastar/util/bool_class.hh create mode 100644 src/seastar/include/seastar/util/concepts.hh create mode 100644 src/seastar/include/seastar/util/conversions.hh create mode 100644 src/seastar/include/seastar/util/critical_alloc_section.hh create mode 100644 src/seastar/include/seastar/util/defer.hh create mode 100644 src/seastar/include/seastar/util/eclipse.hh create mode 100644 src/seastar/include/seastar/util/exceptions.hh create mode 100644 src/seastar/include/seastar/util/file.hh create mode 100644 src/seastar/include/seastar/util/function_input_iterator.hh create mode 100644 src/seastar/include/seastar/util/gcc6-concepts.hh create mode 100644 src/seastar/include/seastar/util/indirect.hh create mode 100644 src/seastar/include/seastar/util/is_smart_ptr.hh create mode 100644 src/seastar/include/seastar/util/later.hh create mode 100644 
src/seastar/include/seastar/util/lazy.hh create mode 100644 src/seastar/include/seastar/util/log-cli.hh create mode 100644 src/seastar/include/seastar/util/log-impl.hh create mode 100644 src/seastar/include/seastar/util/log.hh create mode 100644 src/seastar/include/seastar/util/memory_diagnostics.hh create mode 100644 src/seastar/include/seastar/util/noncopyable_function.hh create mode 100644 src/seastar/include/seastar/util/optimized_optional.hh create mode 100644 src/seastar/include/seastar/util/print_safe.hh create mode 100644 src/seastar/include/seastar/util/program-options.hh create mode 100644 src/seastar/include/seastar/util/read_first_line.hh create mode 100644 src/seastar/include/seastar/util/reference_wrapper.hh create mode 100644 src/seastar/include/seastar/util/spinlock.hh create mode 100644 src/seastar/include/seastar/util/std-compat.hh create mode 100644 src/seastar/include/seastar/util/tmp_file.hh create mode 100644 src/seastar/include/seastar/util/transform_iterator.hh create mode 100644 src/seastar/include/seastar/util/tuple_utils.hh create mode 100644 src/seastar/include/seastar/util/used_size.hh create mode 100644 src/seastar/include/seastar/util/variant_utils.hh (limited to 'src/seastar/include') diff --git a/src/seastar/include/seastar/core/abort_on_ebadf.hh b/src/seastar/include/seastar/core/abort_on_ebadf.hh new file mode 100644 index 000000000..7cb8c05b1 --- /dev/null +++ b/src/seastar/include/seastar/core/abort_on_ebadf.hh @@ -0,0 +1,36 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. 
+ * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright 2019 ScyllaDB + */ + +#pragma once + +namespace seastar { + +/// Determines whether seastar should throw or abort when operation made by +/// seastar fails because the target file descriptor is not valid. This is +/// detected when underlying system calls return EBADF or ENOTSOCK. +/// The default behavior is to throw std::system_error. +void set_abort_on_ebadf(bool do_abort); + +/// Queries the current setting for seastar's behavior on invalid file descriptor access. +/// See set_abort_on_ebadf(). +bool is_abort_on_ebadf_enabled(); + +} diff --git a/src/seastar/include/seastar/core/abort_source.hh b/src/seastar/include/seastar/core/abort_source.hh new file mode 100644 index 000000000..e2a4f66bf --- /dev/null +++ b/src/seastar/include/seastar/core/abort_source.hh @@ -0,0 +1,139 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +/* + * Copyright (C) 2017 ScyllaDB. + */ + +#pragma once + +#include +#include +#include + +#include + +#include + +namespace bi = boost::intrusive; + +namespace seastar { + +/// \addtogroup fiber-module +/// @{ + +/// Exception thrown when an \ref abort_source object has been +/// notified by the \ref abort_source::request_abort() method. +class abort_requested_exception : public std::exception { +public: + virtual const char* what() const noexcept override { + return "abort requested"; + } +}; + +/// Facility to communicate a cancellation request to a fiber. +/// Callbacks can be registered with the \c abort_source, which are called +/// atomically with a call to request_abort(). +class abort_source { + using subscription_callback_type = noncopyable_function; + +public: + /// Represents a handle to the callback registered by a given fiber. Ending the + /// lifetime of the \c subscription will unregister the callback, if it hasn't + /// been invoked yet. + class subscription : public bi::list_base_hook> { + friend class abort_source; + + abort_source* _as = nullptr; + subscription_callback_type _target; + + explicit subscription(abort_source& as, subscription_callback_type target) + : _as(&as) + , _target(std::move(target)) { + as._subscriptions->push_back(*this); + } + + void on_abort() { + _target(); + } + + public: + subscription() = default; + + subscription(subscription&& other) noexcept(std::is_nothrow_move_constructible::value) + : _as(other._as) + , _target(std::move(other._target)) { + subscription_list_type::node_algorithms::swap_nodes(other.this_ptr(), this_ptr()); + } + + subscription& operator=(subscription&& other) noexcept(std::is_nothrow_move_assignable::value) { + if (this != &other) { + _target = std::move(other._target); + _as = other._as; + if (is_linked()) { + subscription_list_type::node_algorithms::unlink(this_ptr()); + } + subscription_list_type::node_algorithms::swap_nodes(other.this_ptr(), this_ptr()); + } + return *this; + } + + 
explicit operator bool() const noexcept { + return _as != nullptr; + } + }; + +private: + using subscription_list_type = bi::list>; + std::optional _subscriptions = subscription_list_type(); + +public: + /// Delays the invocation of the callback \c f until \ref request_abort() is called. + /// \returns an engaged \ref optimized_optional containing a \ref subscription that can be used to control + /// the lifetime of the callback \c f, if \ref abort_requested() is \c false. Otherwise, + /// returns a disengaged \ref optimized_optional. + optimized_optional subscribe(subscription_callback_type f) noexcept(std::is_nothrow_move_constructible::value) { + if (abort_requested()) { + return { }; + } + return { subscription(*this, std::move(f)) }; + } + + /// Requests that the target operation be aborted. Current subscriptions + /// are invoked inline with this call, and no new ones can be registered. + void request_abort() { + _subscriptions->clear_and_dispose([] (subscription* s) { s->on_abort(); }); + _subscriptions = { }; + } + + /// Returns whether an abort has been requested. + bool abort_requested() const noexcept { + return !_subscriptions; + } + + + /// Throws a \ref abort_requested_exception if cancellation has been requested. + void check() const { + if (abort_requested()) { + throw abort_requested_exception(); + } + } +}; + +/// @} + +} diff --git a/src/seastar/include/seastar/core/alien.hh b/src/seastar/include/seastar/core/alien.hh new file mode 100644 index 000000000..43510d4ea --- /dev/null +++ b/src/seastar/include/seastar/core/alien.hh @@ -0,0 +1,184 @@ +// -*- mode:C++; tab-width:4; c-basic-offset:4; indent-tabs-mode:nil -*- +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. 
+ * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2018 Red Hat + */ + +#pragma once + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +/// \file + +namespace seastar { + +class reactor; + +/// \brief Integration with non-seastar applications. +namespace alien { + +class message_queue { + static constexpr size_t batch_size = 128; + static constexpr size_t prefetch_cnt = 2; + struct work_item; + struct lf_queue_remote { + reactor* remote; + }; + using lf_queue_base = boost::lockfree::queue; + // use inheritence to control placement order + struct lf_queue : lf_queue_remote, lf_queue_base { + lf_queue(reactor* remote) + : lf_queue_remote{remote}, lf_queue_base{batch_size} {} + void maybe_wakeup(); + } _pending; + struct alignas(seastar::cache_line_size) { + std::atomic value{0}; + } _sent; + // keep this between two structures with statistics + // this makes sure that they have at least one cache line + // between them, so hw prefetcher will not accidentally prefetch + // cache line used by another cpu. 
+ metrics::metric_groups _metrics; + struct alignas(seastar::cache_line_size) { + size_t _received = 0; + size_t _last_rcv_batch = 0; + }; + struct work_item { + virtual ~work_item() = default; + virtual void process() = 0; + }; + template + struct async_work_item : work_item { + Func _func; + async_work_item(Func&& func) : _func(std::move(func)) {} + void process() override { + _func(); + } + }; + template + size_t process_queue(lf_queue& q, Func process); + void submit_item(std::unique_ptr wi); +public: + message_queue(reactor *to); + void start(); + void stop(); + template + void submit(Func&& func) { + auto wi = std::make_unique>(std::forward(func)); + submit_item(std::move(wi)); + } + size_t process_incoming(); + bool pure_poll_rx() const; +}; + +class smp { + struct qs_deleter { + unsigned count; + qs_deleter(unsigned n = 0) : count(n) {} + qs_deleter(const qs_deleter& d) : count(d.count) {} + void operator()(message_queue* qs) const; + }; + using qs = std::unique_ptr; +public: + static qs create_qs(const std::vector& reactors); + static qs _qs; + static bool poll_queues(); + static bool pure_poll_queues(); +}; + +/// Runs a function on a remote shard from an alien thread where engine() is not available. +/// +/// \param shard designates the shard to run the function on +/// \param func a callable to run on shard \c t. If \c func is a temporary object, +/// its lifetime will be extended by moving it. If \c func is a reference, +/// the caller must guarantee that it will survive the call. +/// \note the func must not throw and should return \c void. as we cannot identify the +/// alien thread, hence we are not able to post the fulfilled promise to the +/// message queue managed by the shard executing the alien thread which is +/// interested to the return value. Please use \c submit_to() instead, if +/// \c func throws. 
+template +void run_on(unsigned shard, Func func) { + smp::_qs[shard].submit(std::move(func)); +} + +namespace internal { +template +using return_value_t = typename futurize>::value_type; + +template>> +struct return_type_of { + using type = void; + static void set(std::promise& p, return_value_t&&) { + p.set_value(); + } +}; +template +struct return_type_of { + using return_tuple_t = typename futurize>::tuple_type; + using type = std::tuple_element_t<0, return_tuple_t>; + static void set(std::promise& p, return_value_t&& t) { +#if SEASTAR_API_LEVEL < 5 + p.set_value(std::get<0>(std::move(t))); +#else + p.set_value(std::move(t)); +#endif + } +}; +template using return_type_t = typename return_type_of::type; +} + +/// Runs a function on a remote shard from an alien thread where engine() is not available. +/// +/// \param shard designates the shard to run the function on +/// \param func a callable to run on \c shard. If \c func is a temporary object, +/// its lifetime will be extended by moving it. If \c func is a reference, +/// the caller must guarantee that it will survive the call. +/// \return whatever \c func returns, as a \c std::future<> +/// \note the caller must keep the returned future alive until \c func returns +template> +std::future submit_to(unsigned shard, Func func) { + std::promise pr; + auto fut = pr.get_future(); + run_on(shard, [pr = std::move(pr), func = std::move(func)] () mutable { + // std::future returned via std::promise above. + (void)func().then_wrapped([pr = std::move(pr)] (auto&& result) mutable { + try { + internal::return_type_of::set(pr, result.get()); + } catch (...) 
{ + pr.set_exception(std::current_exception()); + } + }); + }); + return fut; +} + +} +} diff --git a/src/seastar/include/seastar/core/align.hh b/src/seastar/include/seastar/core/align.hh new file mode 100644 index 000000000..5dd07c279 --- /dev/null +++ b/src/seastar/include/seastar/core/align.hh @@ -0,0 +1,55 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
+ */ + +#pragma once + +#include +#include + +namespace seastar { + +template +inline constexpr +T align_up(T v, T align) { + return (v + align - 1) & ~(align - 1); +} + +template +inline constexpr +T* align_up(T* v, size_t align) { + static_assert(sizeof(T) == 1, "align byte pointers only"); + return reinterpret_cast(align_up(reinterpret_cast(v), align)); +} + +template +inline constexpr +T align_down(T v, T align) { + return v & ~(align - 1); +} + +template +inline constexpr +T* align_down(T* v, size_t align) { + static_assert(sizeof(T) == 1, "align byte pointers only"); + return reinterpret_cast(align_down(reinterpret_cast(v), align)); +} + +} diff --git a/src/seastar/include/seastar/core/aligned_buffer.hh b/src/seastar/include/seastar/core/aligned_buffer.hh new file mode 100644 index 000000000..2cc0f5a74 --- /dev/null +++ b/src/seastar/include/seastar/core/aligned_buffer.hh @@ -0,0 +1,45 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2016 ScyllaDB. 
+ */ +#pragma once +#include +#include +#include + +namespace seastar { + +namespace internal { +void* allocate_aligned_buffer_impl(size_t size, size_t align); +} + +struct free_deleter { + void operator()(void* p) { ::free(p); } +}; + +template +inline +std::unique_ptr allocate_aligned_buffer(size_t size, size_t align) { + static_assert(sizeof(CharType) == 1, "must allocate byte type"); + void* ret = internal::allocate_aligned_buffer_impl(size, align); + return std::unique_ptr(reinterpret_cast(ret)); +} + + +} diff --git a/src/seastar/include/seastar/core/app-template.hh b/src/seastar/include/seastar/core/app-template.hh new file mode 100644 index 000000000..97fb65c38 --- /dev/null +++ b/src/seastar/include/seastar/core/app-template.hh @@ -0,0 +1,100 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ +#pragma once + +#include +#include +#include +#include +#include + +namespace seastar { + +class app_template { +public: + struct config { + /// The name of the application. + /// + /// Will be used in the --help output to distinguish command line args + /// registered by the application, as opposed to those registered by + /// seastar and its subsystems. + sstring name = "App"; + /// The description of the application. 
+ /// + /// Will be printed on the top of the --help output. Lines should be + /// hard-wrapped for 80 chars. + sstring description = ""; + std::chrono::duration default_task_quota = std::chrono::microseconds(500); + /// \brief Handle SIGINT/SIGTERM by calling reactor::stop() + /// + /// When true, Seastar will set up signal handlers for SIGINT/SIGTERM that call + /// reactor::stop(). The reactor will then execute callbacks installed by + /// reactor::at_exit(). + /// + /// When false, Seastar will not set up signal handlers for SIGINT/SIGTERM + /// automatically. The default behavior (terminate the program) will be kept. + /// You can adjust the behavior of SIGINT/SIGTERM by installing signal handlers + /// via reactor::handle_signal(). + bool auto_handle_sigint_sigterm = true; + config() {} + }; + + using configuration_reader = std::function; +private: + config _cfg; + boost::program_options::options_description _opts; + boost::program_options::options_description _opts_conf_file; + boost::program_options::positional_options_description _pos_opts; + std::optional _configuration; + configuration_reader _conf_reader; + + configuration_reader get_default_configuration_reader(); +public: + struct positional_option { + const char* name; + const boost::program_options::value_semantic* value_semantic; + const char* help; + int max_count; + }; +public: + explicit app_template(config cfg = config()); + + boost::program_options::options_description& get_options_description(); + boost::program_options::options_description& get_conf_file_options_description(); + boost::program_options::options_description_easy_init add_options(); + void add_positional_options(std::initializer_list options); + boost::program_options::variables_map& configuration(); + int run_deprecated(int ac, char ** av, std::function&& func); + + void set_configuration_reader(configuration_reader conf_reader); + + // Runs given function and terminates the application when the future it + // returns 
resolves. The value with which the future resolves will be + // returned by this function. + int run(int ac, char ** av, std::function ()>&& func); + + // Like run() which takes std::function()>, but returns + // with exit code 0 when the future returned by func resolves + // successfully. + int run(int ac, char ** av, std::function ()>&& func); +}; + +} diff --git a/src/seastar/include/seastar/core/apply.hh b/src/seastar/include/seastar/core/apply.hh new file mode 100644 index 000000000..0a6ae6e87 --- /dev/null +++ b/src/seastar/include/seastar/core/apply.hh @@ -0,0 +1,64 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
+ */ + +#pragma once + +#include +#include +#include + +namespace seastar { + +template +struct apply_helper; + +template +struct apply_helper> { + static auto apply(Func&& func, Tuple args) { + return func(std::get(std::forward(args))...); + } +}; + +template +[[deprecated("use std::apply() instead")]] +inline +auto apply(Func&& func, std::tuple&& args) { + using helper = apply_helper&&, std::index_sequence_for>; + return helper::apply(std::forward(func), std::move(args)); +} + +template +[[deprecated("use std::apply() instead")]] +inline +auto apply(Func&& func, std::tuple& args) { + using helper = apply_helper&, std::index_sequence_for>; + return helper::apply(std::forward(func), args); +} + +template +[[deprecated("use std::apply() instead")]] +inline +auto apply(Func&& func, const std::tuple& args) { + using helper = apply_helper&, std::index_sequence_for>; + return helper::apply(std::forward(func), args); +} + +} diff --git a/src/seastar/include/seastar/core/array_map.hh b/src/seastar/include/seastar/core/array_map.hh new file mode 100644 index 000000000..14ec4118a --- /dev/null +++ b/src/seastar/include/seastar/core/array_map.hh @@ -0,0 +1,50 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
+ */ + +#pragma once + +#include + +namespace seastar { + +// unordered_map implemented as a simple array + +template +class array_map { + std::array _a {}; +public: + array_map(std::initializer_list> i) { + for (auto kv : i) { + _a[kv.first] = kv.second; + } + } + Value& operator[](size_t key) { return _a[key]; } + const Value& operator[](size_t key) const { return _a[key]; } + + Value& at(size_t key) { + if (key >= Max) { + throw std::out_of_range(std::to_string(key) + " >= " + std::to_string(Max)); + } + return _a[key]; + } +}; + +} diff --git a/src/seastar/include/seastar/core/bitops.hh b/src/seastar/include/seastar/core/bitops.hh new file mode 100644 index 000000000..3ea178536 --- /dev/null +++ b/src/seastar/include/seastar/core/bitops.hh @@ -0,0 +1,73 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
+ */ + +#pragma once + +#include + +namespace seastar { + +inline +constexpr unsigned count_leading_zeros(unsigned x) { + return __builtin_clz(x); +} + +inline +constexpr unsigned count_leading_zeros(unsigned long x) { + return __builtin_clzl(x); +} + +inline +constexpr unsigned count_leading_zeros(unsigned long long x) { + return __builtin_clzll(x); +} + +inline +constexpr unsigned count_trailing_zeros(unsigned x) { + return __builtin_ctz(x); +} + +inline +constexpr unsigned count_trailing_zeros(unsigned long x) { + return __builtin_ctzl(x); +} + +inline +constexpr unsigned count_trailing_zeros(unsigned long long x) { + return __builtin_ctzll(x); +} + +template +//requires stdx::is_integral_v +inline constexpr unsigned log2ceil(T n) { + if (n == 1) { + return 0; + } + return std::numeric_limits::digits - count_leading_zeros(n - 1); +} + +template +//requires stdx::is_integral_v +inline constexpr unsigned log2floor(T n) { + return std::numeric_limits::digits - count_leading_zeros(n) - 1; +} + +} diff --git a/src/seastar/include/seastar/core/bitset-iter.hh b/src/seastar/include/seastar/core/bitset-iter.hh new file mode 100644 index 000000000..d00744dd0 --- /dev/null +++ b/src/seastar/include/seastar/core/bitset-iter.hh @@ -0,0 +1,198 @@ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +/* + * Imported from OSv: + * + * Copyright (C) 2014 Cloudius Systems, Ltd. + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ + +#pragma once + +#include +#include + +namespace seastar { + +namespace bitsets { + +static constexpr int ulong_bits = std::numeric_limits::digits; + +/** + * Returns the number of leading zeros in value's binary representation. + * + * If value == 0 the result is undefied. If T is signed and value is negative + * the result is undefined. 
+ * + * The highest value that can be returned is std::numeric_limits::digits - 1, + * which is returned when value == 1. + */ +template +inline size_t count_leading_zeros(T value) noexcept; + +/** + * Returns the number of trailing zeros in value's binary representation. + * + * If value == 0 the result is undefied. If T is signed and value is negative + * the result is undefined. + * + * The highest value that can be returned is std::numeric_limits::digits - 1. + */ +template +static inline size_t count_trailing_zeros(T value) noexcept; + +template<> +inline size_t count_leading_zeros(unsigned long value) noexcept +{ + return __builtin_clzl(value); +} + +template<> +inline size_t count_leading_zeros(long value) noexcept +{ + return __builtin_clzl((unsigned long)value) - 1; +} + +template<> +inline size_t count_leading_zeros(unsigned long long value) noexcept +{ + return __builtin_clzll(value); +} + +template<> +inline size_t count_leading_zeros(long long value) noexcept +{ + return __builtin_clzll((unsigned long long)value) - 1; +} + +template<> +inline +size_t count_trailing_zeros(unsigned long value) noexcept +{ + return __builtin_ctzl(value); +} + +template<> +inline +size_t count_trailing_zeros(long value) noexcept +{ + return __builtin_ctzl((unsigned long)value); +} + +template<> +inline +size_t count_trailing_zeros(unsigned long long value) noexcept +{ + return __builtin_ctzll(value); +} + +template<> +inline +size_t count_trailing_zeros(long long value) noexcept +{ + return __builtin_ctzll((unsigned long long)value); +} + +/** + * Returns the index of the first set bit. + * Result is undefined if bitset.any() == false. + */ +template +static inline size_t get_first_set(const std::bitset& bitset) noexcept +{ + static_assert(N <= ulong_bits, "bitset too large"); + return count_trailing_zeros(bitset.to_ulong()); +} + +/** + * Returns the index of the last set bit in the bitset. + * Result is undefined if bitset.any() == false. 
+ */ +template +static inline size_t get_last_set(const std::bitset& bitset) noexcept +{ + static_assert(N <= ulong_bits, "bitset too large"); + return ulong_bits - 1 - count_leading_zeros(bitset.to_ulong()); +} + +template +class set_iterator : public std::iterator +{ +private: + void advance() noexcept + { + if (_bitset.none()) { + _index = -1; + } else { + auto shift = get_first_set(_bitset) + 1; + _index += shift; + _bitset >>= shift; + } + } +public: + set_iterator(std::bitset bitset, int offset = 0) noexcept + : _bitset(bitset) + , _index(offset - 1) + { + static_assert(N <= ulong_bits, "This implementation is inefficient for large bitsets"); + _bitset >>= offset; + advance(); + } + + void operator++() noexcept + { + advance(); + } + + int operator*() const noexcept + { + return _index; + } + + bool operator==(const set_iterator& other) const noexcept + { + return _index == other._index; + } + + bool operator!=(const set_iterator& other) const noexcept + { + return !(*this == other); + } +private: + std::bitset _bitset; + int _index; +}; + +template +class set_range +{ +public: + using iterator = set_iterator; + using value_type = int; + + set_range(std::bitset bitset, int offset = 0) noexcept + : _bitset(bitset) + , _offset(offset) + { + } + + iterator begin() const noexcept { return iterator(_bitset, _offset); } + iterator end() const noexcept { return iterator(0); } +private: + std::bitset _bitset; + int _offset; +}; + +template +static inline set_range for_each_set(std::bitset bitset, int offset = 0) noexcept +{ + return set_range(bitset, offset); +} + +} + +} diff --git a/src/seastar/include/seastar/core/byteorder.hh b/src/seastar/include/seastar/core/byteorder.hh new file mode 100644 index 000000000..8b5bc9276 --- /dev/null +++ b/src/seastar/include/seastar/core/byteorder.hh @@ -0,0 +1,127 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). 
See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2015 Scylladb, Ltd. + */ + +#pragma once + +#include +#include +#include + +namespace seastar { + +inline uint8_t cpu_to_le(uint8_t x) noexcept { return x; } +inline uint8_t le_to_cpu(uint8_t x) noexcept { return x; } +inline uint16_t cpu_to_le(uint16_t x) noexcept { return htole16(x); } +inline uint16_t le_to_cpu(uint16_t x) noexcept { return le16toh(x); } +inline uint32_t cpu_to_le(uint32_t x) noexcept { return htole32(x); } +inline uint32_t le_to_cpu(uint32_t x) noexcept { return le32toh(x); } +inline uint64_t cpu_to_le(uint64_t x) noexcept { return htole64(x); } +inline uint64_t le_to_cpu(uint64_t x) noexcept { return le64toh(x); } + +inline int8_t cpu_to_le(int8_t x) noexcept { return x; } +inline int8_t le_to_cpu(int8_t x) noexcept { return x; } +inline int16_t cpu_to_le(int16_t x) noexcept { return htole16(x); } +inline int16_t le_to_cpu(int16_t x) noexcept { return le16toh(x); } +inline int32_t cpu_to_le(int32_t x) noexcept { return htole32(x); } +inline int32_t le_to_cpu(int32_t x) noexcept { return le32toh(x); } +inline int64_t cpu_to_le(int64_t x) noexcept { return htole64(x); } +inline int64_t le_to_cpu(int64_t x) noexcept { return le64toh(x); } + +inline uint8_t cpu_to_be(uint8_t x) noexcept { return x; } +inline uint8_t be_to_cpu(uint8_t x) noexcept { return x; } +inline uint16_t cpu_to_be(uint16_t x) noexcept { return 
htobe16(x); } +inline uint16_t be_to_cpu(uint16_t x) noexcept { return be16toh(x); } +inline uint32_t cpu_to_be(uint32_t x) noexcept { return htobe32(x); } +inline uint32_t be_to_cpu(uint32_t x) noexcept { return be32toh(x); } +inline uint64_t cpu_to_be(uint64_t x) noexcept { return htobe64(x); } +inline uint64_t be_to_cpu(uint64_t x) noexcept { return be64toh(x); } + +inline int8_t cpu_to_be(int8_t x) noexcept { return x; } +inline int8_t be_to_cpu(int8_t x) noexcept { return x; } +inline int16_t cpu_to_be(int16_t x) noexcept { return htobe16(x); } +inline int16_t be_to_cpu(int16_t x) noexcept { return be16toh(x); } +inline int32_t cpu_to_be(int32_t x) noexcept { return htobe32(x); } +inline int32_t be_to_cpu(int32_t x) noexcept { return be32toh(x); } +inline int64_t cpu_to_be(int64_t x) noexcept { return htobe64(x); } +inline int64_t be_to_cpu(int64_t x) noexcept { return be64toh(x); } + +template +inline T cpu_to_le(const unaligned& v) noexcept { + return cpu_to_le(T(v)); +} + +template +inline T le_to_cpu(const unaligned& v) noexcept { + return le_to_cpu(T(v)); +} + +template +inline +T +read_le(const char* p) noexcept { + T datum; + std::copy_n(p, sizeof(T), reinterpret_cast(&datum)); + return le_to_cpu(datum); +} + +template +inline +void +write_le(char* p, T datum) noexcept { + datum = cpu_to_le(datum); + std::copy_n(reinterpret_cast(&datum), sizeof(T), p); +} + +template +inline +T +read_be(const char* p) noexcept { + T datum; + std::copy_n(p, sizeof(T), reinterpret_cast(&datum)); + return be_to_cpu(datum); +} + +template +inline +void +write_be(char* p, T datum) noexcept { + datum = cpu_to_be(datum); + std::copy_n(reinterpret_cast(&datum), sizeof(T), p); +} + +template +inline +T +consume_be(const char*& p) noexcept { + auto ret = read_be(p); + p += sizeof(T); + return ret; +} + +template +inline +void +produce_be(char*& p, T datum) noexcept { + write_be(p, datum); + p += sizeof(T); +} + +} diff --git a/src/seastar/include/seastar/core/cacheline.hh 
b/src/seastar/include/seastar/core/cacheline.hh new file mode 100644 index 000000000..89bb3846c --- /dev/null +++ b/src/seastar/include/seastar/core/cacheline.hh @@ -0,0 +1,42 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2017 IBM. + */ + +#pragma once + +#include + +namespace seastar { + +// Platform-dependent cache line size for alignment and padding purposes. +static constexpr size_t cache_line_size = +#if defined(__x86_64__) || defined(__i386__) + 64; +#elif defined(__s390x__) || defined(__zarch__) + 256; +#elif defined(__PPC64__) + 128; +#elif defined(__aarch64__) + 128; // from Linux, may vary among different microarchitetures? +#else +#error "cache_line_size not defined for this architecture" +#endif + +} diff --git a/src/seastar/include/seastar/core/checked_ptr.hh b/src/seastar/include/seastar/core/checked_ptr.hh new file mode 100644 index 000000000..a4d10d26d --- /dev/null +++ b/src/seastar/include/seastar/core/checked_ptr.hh @@ -0,0 +1,199 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. 
You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2017 ScyllaDB + */ + +#pragma once + +/// \file +/// \brief Contains a seastar::checked_ptr class implementation. + +#include +#include + +/// \namespace seastar +namespace seastar { + +/// The exception thrown by a default_null_deref_action. +class checked_ptr_is_null_exception : public std::exception {}; + +/// \brief +/// Default not engaged seastar::checked_ptr dereferencing action (functor). +/// +/// Throws a seastar::checked_ptr_is_null_exception. +/// +struct default_null_deref_action { + /// \throw seastar::checked_ptr_is_null_exception + void operator()() const { + throw checked_ptr_is_null_exception(); + } +}; + +/// \cond internal +/// \namespace seastar::internal +namespace internal { + +/// \name seastar::checked_ptr::get() helpers +/// Helper functions that simplify the seastar::checked_ptr::get() implementation. +/// @{ + +/// Invokes the get() method of a smart pointer object. +/// \param ptr A smart pointer object +/// \return A pointer to the underlying object +template +/// cond SEASTAR_CONCEPT_DOC - nested '\ cond' doesn't seem to work (bug 736553), so working it around +SEASTAR_CONCEPT( requires requires (T ptr) { + ptr.get(); +}) +/// endcond +inline typename std::pointer_traits>::element_type* checked_ptr_do_get(T& ptr) { + return ptr.get(); +} + +/// Return a pointer itself for a naked pointer argument. 
+/// \param ptr A naked pointer object +/// \return An input naked pointer object +template +inline T* checked_ptr_do_get(T* ptr) noexcept { + return ptr; +} +/// @} +} +/// \endcond + +/// \class seastar::checked_ptr +/// \brief +/// seastar::checked_ptr class is a wrapper class that may be used with any pointer type +/// (smart like std::unique_ptr or raw pointers like int*). +/// +/// The seastar::checked_ptr object will invoke the NullDerefAction functor if +/// it is dereferenced when the underlying pointer is not engaged. +/// +/// It may still be assigned, compared to other seastar::checked_ptr objects or +/// moved without limitations. +/// +/// The default NullDerefAction will throw a seastar::default_null_deref_action exception. +/// +/// \tparam NullDerefAction a functor that is invoked when a user tries to dereference a not engaged pointer. +/// +template +/// \cond SEASTAR_CONCEPT_DOC +SEASTAR_CONCEPT( requires std::is_default_constructible::value && requires (NullDerefAction action) { + NullDerefAction(); +}) +/// \endcond +class checked_ptr { +public: + /// Underlying element type + using element_type = typename std::pointer_traits::element_type; + + /// Type of the pointer to the underlying element + using pointer = element_type*; + +private: + Ptr _ptr = nullptr; + +private: + /// Invokes a NullDerefAction functor if the underlying pointer is not engaged. + void check() const { + if (!_ptr) { + NullDerefAction()(); + } + } + +public: + checked_ptr() noexcept(noexcept(Ptr(nullptr))) = default; + checked_ptr(std::nullptr_t) noexcept(std::is_nothrow_default_constructible>::value) : checked_ptr() {} + checked_ptr(Ptr&& ptr) noexcept(std::is_nothrow_move_constructible::value) : _ptr(std::move(ptr)) {} + checked_ptr(const Ptr& p) noexcept(std::is_nothrow_copy_constructible::value) : _ptr(p) {} + + /// \name Checked Methods + /// These methods start with invoking a NullDerefAction functor if the underlying pointer is not engaged. 
+ /// @{ + + /// Invokes the get() method of the underlying smart pointer or returns the pointer itself for a raw pointer (const variant). + /// \return The pointer to the underlying object + pointer get() const { + check(); + return internal::checked_ptr_do_get(_ptr); + } + + /// Gets a reference to the underlying pointer object. + /// \return The underlying pointer object + const Ptr& operator->() const { + check(); + return _ptr; + } + + /// Gets a reference to the underlying pointer object (const variant). + /// \return The underlying pointer object + Ptr& operator->() { + check(); + return _ptr; + } + + /// Gets the reference to the underlying object (const variant). + /// \return The reference to the underlying object + const element_type& operator*() const { + check(); + return *_ptr; + } + + /// Gets the reference to the underlying object. + /// \return The reference to the underlying object + element_type& operator*() { + check(); + return *_ptr; + } + /// @} + + /// \name Unchecked methods + /// These methods may be invoked when the underlying pointer is not engaged. + /// @{ + + /// Checks if the underlying pointer is engaged. + /// \return TRUE if the underlying pointer is engaged + explicit operator bool() const { return bool(_ptr); } + + bool operator==(const checked_ptr& other) const { return _ptr == other._ptr; } + bool operator!=(const checked_ptr& other) const { return _ptr != other._ptr; } + + /// Gets the hash value for the underlying pointer object. + /// \return The hash value for the underlying pointer object + size_t hash() const { + return std::hash()(_ptr); + } + ///@} +}; + +} + +namespace std { +/// std::hash specialization for seastar::checked_ptr class +template +struct hash> { + /// Get the hash value for the given seastar::checked_ptr object. + /// The hash will calculated using the seastar::checked_ptr::hash method. 
+ /// \param p object for hash value calculation + /// \return The hash value for the given object + size_t operator()(const seastar::checked_ptr& p) const { + return p.hash(); + } +}; +} diff --git a/src/seastar/include/seastar/core/chunked_fifo.hh b/src/seastar/include/seastar/core/chunked_fifo.hh new file mode 100644 index 000000000..cfe4a7eaf --- /dev/null +++ b/src/seastar/include/seastar/core/chunked_fifo.hh @@ -0,0 +1,626 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2016 ScyllaDB Ltd. + */ + +#pragma once + +#include +#include + +namespace seastar { + +// An unbounded FIFO queue of objects of type T. +// +// It provides operations to push items in one end of the queue, and pop them +// from the other end of the queue - both operations are guaranteed O(1) +// (not just amortized O(1)). The size() operation is also O(1). +// chunked_fifo also guarantees that the largest contiguous memory allocation +// it does is O(1). The total memory used is, of course, O(N). +// +// How does chunked_fifo differ from std::list<>, circular_buffer<> and +// std::deque()? +// +// std::list<> can also make all the above guarantees, but is inefficient - +// both at run speed (every operation requires an allocation), and in memory +// use. 
Much more efficient than std::list<> is our circular_buffer<>, which +// allocates a contiguous array to hold the items and only reallocates it, +// exponentially, when the queue grows. On one test of several different +// push/pop scenarios, circular_buffer<> was between 5 and 20 times faster +// than std::list, and also used considerably less memory. +// The problem with circular_buffer<> is that gives up on the last guarantee +// we made above: circular_buffer<> allocates all the items in one large +// contiguous allocation - that might not be possible when the memory is +// highly fragmented. +// std::deque<> aims to solve the contiguous allocation problem by allocating +// smaller chunks of the queue, and keeping a list of them in an array. This +// array is necessary to allow for O(1) random access to any element, a +// feature which we do not need; But this array is itself contiguous so +// std::deque<> attempts larger contiguous allocations the larger the queue +// gets: std::deque<>'s contiguous allocation is still O(N) and in fact +// exactly 1/64 of the size of circular_buffer<>'s contiguous allocation. +// So it's an improvement over circular_buffer<>, but not a full solution. +// +// chunked_fifo<> is such a solution: it also allocates the queue in fixed- +// size chunks (just like std::deque) but holds them in a linked list, not +// a contiguous array, so there are no large contiguous allocations. +// +// Unlike std::deque<> or circular_buffer<>, chunked_fifo only provides the +// operations needed by std::queue, i.e.,: empty(), size(), front(), back(), +// push_back() and pop_front(). For simplicity, we do *not* implement other +// possible operations, like inserting or deleting elements from the "wrong" +// side of the queue or from the middle, nor random-access to items in the +// middle of the queue. However, chunked_fifo does allow iterating over all +// of the queue's elements without popping them, a feature which std::queue +// is missing. 
+// +// Another feature of chunked_fifo which std::deque is missing is the ability +// to control the chunk size, as a template parameter. In std::deque the +// chunk size is undocumented and fixed - in gcc, it is always 512 bytes. +// chunked_fifo, on the other hand, makes the chunk size (in number of items +// instead of bytes) a template parameter; In situations where the queue is +// expected to become very long, using a larger chunk size might make sense +// because it will result in fewer allocations. +// +// chunked_fifo uses uninitialized storage for unoccupied elements, and thus +// uses move/copy constructors instead of move/copy assignments, which are +// less efficient. + +template +class chunked_fifo { + static_assert((items_per_chunk & (items_per_chunk - 1)) == 0, + "chunked_fifo chunk size must be power of two"); + union maybe_item { + maybe_item() noexcept {} + ~maybe_item() {} + T data; + }; + struct chunk { + maybe_item items[items_per_chunk]; + struct chunk* next; + // begin and end interpreted mod items_per_chunk + unsigned begin; + unsigned end; + }; + // We pop from the chunk at _front_chunk. This chunk is then linked to + // the following chunks via the "next" link. _back_chunk points to the + // last chunk in this list, and it is where we push. + chunk* _front_chunk = nullptr; // where we pop + chunk* _back_chunk = nullptr; // where we push + // We want an O(1) size but don't want to maintain a size() counter + // because this will slow down every push and pop operation just for + // the rare size() call. Instead, we just keep a count of chunks (which + // doesn't change on every push or pop), from which we can calculate + // size() when needed, and still be O(1). + // This assumes the invariant that all middle chunks (except the front + // and back) are always full. + size_t _nchunks = 0; + // A list of freed chunks, to support reserve() and to improve + // performance of repeated push and pop, especially on an empty queue. 
+ // It is a performance/memory tradeoff how many freed chunks to keep + // here (see save_free_chunks constant below). + chunk* _free_chunks = nullptr; + size_t _nfree_chunks = 0; +public: + using value_type = T; + using size_type = size_t; + using reference = T&; + using pointer = T*; + using const_reference = const T&; + using const_pointer = const T*; + +private: + template + class basic_iterator { + friend class chunked_fifo; + + public: + using iterator_category = std::forward_iterator_tag; + using difference_type = std::ptrdiff_t; + using value_type = U; + using pointer = U*; + using reference = U&; + + protected: + chunk* _chunk = nullptr; + size_t _item_index = 0; + + protected: + inline explicit basic_iterator(chunk* c) noexcept; + inline basic_iterator(chunk* c, size_t item_index) noexcept; + + public: + inline bool operator==(const basic_iterator& o) const noexcept; + inline bool operator!=(const basic_iterator& o) const noexcept; + inline pointer operator->() const noexcept; + inline reference operator*() const noexcept; + inline basic_iterator operator++(int) noexcept; + basic_iterator& operator++() noexcept; + }; + +public: + class iterator : public basic_iterator { + using basic_iterator::basic_iterator; + public: + iterator() noexcept = default; + }; + class const_iterator : public basic_iterator { + using basic_iterator::basic_iterator; + public: + const_iterator() noexcept = default; + inline const_iterator(iterator o) noexcept; + }; + +public: + chunked_fifo() noexcept = default; + chunked_fifo(chunked_fifo&& x) noexcept; + chunked_fifo(const chunked_fifo& X) = delete; + ~chunked_fifo(); + chunked_fifo& operator=(const chunked_fifo&) = delete; + chunked_fifo& operator=(chunked_fifo&&) noexcept; + inline void push_back(const T& data); + inline void push_back(T&& data); + T& back() noexcept; + const T& back() const noexcept; + template + inline void emplace_back(A&&... 
args); + inline T& front() const noexcept; + inline void pop_front() noexcept; + inline bool empty() const noexcept; + inline size_t size() const noexcept; + void clear() noexcept; + // reserve(n) ensures that at least (n - size()) further push() calls can + // be served without needing new memory allocation. + // Calling pop()s between these push()es is also allowed and does not + // alter this guarantee. + // Note that reserve() does not reduce the amount of memory already + // reserved - use shrink_to_fit() for that. + void reserve(size_t n); + // shrink_to_fit() frees memory held, but unused, by the queue. Such + // unused memory might exist after pops, or because of reserve(). + void shrink_to_fit() noexcept; + inline iterator begin() noexcept; + inline iterator end() noexcept; + inline const_iterator begin() const noexcept; + inline const_iterator end() const noexcept; + inline const_iterator cbegin() const noexcept; + inline const_iterator cend() const noexcept; +private: + void back_chunk_new(); + void front_chunk_delete() noexcept; + inline void ensure_room_back(); + void undo_room_back() noexcept; + static inline size_t mask(size_t idx) noexcept; + +}; + +template +template +inline +chunked_fifo::basic_iterator::basic_iterator(chunk* c) noexcept : _chunk(c), _item_index(_chunk ? 
_chunk->begin : 0) { +} + +template +template +inline +chunked_fifo::basic_iterator::basic_iterator(chunk* c, size_t item_index) noexcept : _chunk(c), _item_index(item_index) { +} + +template +template +inline bool +chunked_fifo::basic_iterator::operator==(const basic_iterator& o) const noexcept { + return _chunk == o._chunk && _item_index == o._item_index; +} + +template +template +inline bool +chunked_fifo::basic_iterator::operator!=(const basic_iterator& o) const noexcept { + return !(*this == o); +} + +template +template +inline typename chunked_fifo::template basic_iterator::pointer +chunked_fifo::basic_iterator::operator->() const noexcept { + return &_chunk->items[chunked_fifo::mask(_item_index)].data; +} + +template +template +inline typename chunked_fifo::template basic_iterator::reference +chunked_fifo::basic_iterator::operator*() const noexcept { + return _chunk->items[chunked_fifo::mask(_item_index)].data; +} + +template +template +inline typename chunked_fifo::template basic_iterator +chunked_fifo::basic_iterator::operator++(int) noexcept { + auto it = *this; + ++(*this); + return it; +} + +template +template +typename chunked_fifo::template basic_iterator& +chunked_fifo::basic_iterator::operator++() noexcept { + ++_item_index; + if (_item_index == _chunk->end) { + _chunk = _chunk->next; + _item_index = _chunk ? 
_chunk->begin : 0; + } + return *this; +} + +template +inline +chunked_fifo::const_iterator::const_iterator(chunked_fifo::iterator o) noexcept + : basic_iterator(o._chunk, o._item_index) { +} + +template +inline +chunked_fifo::chunked_fifo(chunked_fifo&& x) noexcept + : _front_chunk(x._front_chunk) + , _back_chunk(x._back_chunk) + , _nchunks(x._nchunks) + , _free_chunks(x._free_chunks) + , _nfree_chunks(x._nfree_chunks) { + x._front_chunk = nullptr; + x._back_chunk = nullptr; + x._nchunks = 0; + x._free_chunks = nullptr; + x._nfree_chunks = 0; +} + +template +inline +chunked_fifo& +chunked_fifo::operator=(chunked_fifo&& x) noexcept { + if (&x != this) { + this->~chunked_fifo(); + new (this) chunked_fifo(std::move(x)); + } + return *this; +} + +template +inline size_t +chunked_fifo::mask(size_t idx) noexcept { + return idx & (items_per_chunk - 1); +} + +template +inline bool +chunked_fifo::empty() const noexcept { + return _front_chunk == nullptr; +} + +template +inline size_t +chunked_fifo::size() const noexcept{ + if (_front_chunk == nullptr) { + return 0; + } else if (_back_chunk == _front_chunk) { + // Single chunk. + return _front_chunk->end - _front_chunk->begin; + } else { + return _front_chunk->end - _front_chunk->begin + +_back_chunk->end - _back_chunk->begin + + (_nchunks - 2) * items_per_chunk; + } +} + +template +void chunked_fifo::clear() noexcept { +#if 1 + while (!empty()) { + pop_front(); + } +#else + // This is specialized code to free the contents of all the chunks and the + // chunks themselves. but since destroying a very full queue is not an + // important use case to optimize, the simple loop above is preferable. 
+ if (!_front_chunk) { + // Empty, nothing to do + return; + } + // Delete front chunk (partially filled) + for (auto i = _front_chunk->begin; i != _front_chunk->end; ++i) { + _front_chunk->items[mask(i)].data.~T(); + } + chunk *p = _front_chunk->next; + delete _front_chunk; + // Delete all the middle chunks (all completely filled) + if (p) { + while (p != _back_chunk) { + // These are full chunks + chunk *nextp = p->next; + for (auto i = 0; i != items_per_chunk; ++i) { + // Note we delete out of order (we don't start with p->begin). + // That should be fine.. + p->items[i].data.~T(); + } + delete p; + p = nextp; + } + // Finally delete back chunk (partially filled) + for (auto i = _back_chunk->begin; i != _back_chunk->end; ++i) { + _back_chunk->items[mask(i)].data.~T(); + } + delete _back_chunk; + } + _front_chunk = nullptr; + _back_chunk = nullptr; + _nchunks = 0; +#endif +} + +template void +chunked_fifo::shrink_to_fit() noexcept { + while (_free_chunks) { + auto next = _free_chunks->next; + delete _free_chunks; + _free_chunks = next; + } + _nfree_chunks = 0; +} + +template +chunked_fifo::~chunked_fifo() { + clear(); + shrink_to_fit(); +} + +template +void +chunked_fifo::back_chunk_new() { + chunk *old = _back_chunk; + if (_free_chunks) { + _back_chunk = _free_chunks; + _free_chunks = _free_chunks->next; + --_nfree_chunks; + } else { + _back_chunk = new chunk; + } + _back_chunk->next = nullptr; + _back_chunk->begin = 0; + _back_chunk->end = 0; + if (old) { + old->next = _back_chunk; + } + if (_front_chunk == nullptr) { + _front_chunk = _back_chunk; + } + _nchunks++; +} + + +template +inline void +chunked_fifo::ensure_room_back() { + // If we don't have a back chunk or it's full, we need to create a new one + if (_back_chunk == nullptr || + (_back_chunk->end - _back_chunk->begin) == items_per_chunk) { + back_chunk_new(); + } +} + +template +void +chunked_fifo::undo_room_back() noexcept { + // If we failed creating a new item after ensure_room_back() created a + 
// new empty chunk, we must remove it, or empty() will be incorrect + // (either immediately, if the fifo was empty, or when all the items are + // popped, if it already had items). + if (_back_chunk->begin == _back_chunk->end) { + delete _back_chunk; + --_nchunks; + if (_nchunks == 0) { + _back_chunk = nullptr; + _front_chunk = nullptr; + } else { + // Because we don't usually pop from the back, we don't have a "prev" + // pointer so we need to find the previous chunk the hard and slow + // way. B + chunk *old = _back_chunk; + _back_chunk = _front_chunk; + while (_back_chunk->next != old) { + _back_chunk = _back_chunk->next; + } + _back_chunk->next = nullptr; + } + } + +} + +template +template +inline void +chunked_fifo::emplace_back(Args&&... args) { + ensure_room_back(); + auto p = &_back_chunk->items[mask(_back_chunk->end)].data; + try { + new(p) T(std::forward(args)...); + } catch(...) { + undo_room_back(); + throw; + } + ++_back_chunk->end; +} + +template +inline void +chunked_fifo::push_back(const T& data) { + ensure_room_back(); + auto p = &_back_chunk->items[mask(_back_chunk->end)].data; + try { + new(p) T(data); + } catch(...) { + undo_room_back(); + throw; + } + ++_back_chunk->end; +} + +template +inline void +chunked_fifo::push_back(T&& data) { + ensure_room_back(); + auto p = &_back_chunk->items[mask(_back_chunk->end)].data; + try { + new(p) T(std::move(data)); + } catch(...) 
{ + undo_room_back(); + throw; + } + ++_back_chunk->end; +} + +template +inline +T& +chunked_fifo::back() noexcept { + return _back_chunk->items[mask(_back_chunk->end - 1)].data; +} + +template +inline +const T& +chunked_fifo::back() const noexcept { + return _back_chunk->items[mask(_back_chunk->end - 1)].data; +} + +template +inline T& +chunked_fifo::front() const noexcept { + return _front_chunk->items[mask(_front_chunk->begin)].data; +} + +template +inline void +chunked_fifo::front_chunk_delete() noexcept { + chunk *next = _front_chunk->next; + // Certain use cases may need to repeatedly allocate and free a chunk - + // an obvious example is an empty queue to which we push, and then pop, + // repeatedly. Another example is pushing and popping to a non-empty queue + // we push and pop at different chunks so we need to free and allocate a + // chunk every items_per_chunk operations. + // The solution is to keep a list of freed chunks instead of freeing them + // immediately. There is a performance/memory tradeoff of how many freed + // chunks to save: If we save them all, the queue can never shrink from + // its maximum memory use (this is how circular_buffer behaves). + // The ad-hoc choice made here is to limit the number of saved chunks to 1, + // but this could easily be made a configuration option. + static constexpr int save_free_chunks = 1; + if (_nfree_chunks < save_free_chunks) { + _front_chunk->next = _free_chunks; + _free_chunks = _front_chunk; + ++_nfree_chunks; + } else { + delete _front_chunk; + } + // If we only had one chunk, _back_chunk is gone too. + if (_back_chunk == _front_chunk) { + _back_chunk = nullptr; + } + _front_chunk = next; + --_nchunks; +} + +template +inline void +chunked_fifo::pop_front() noexcept { + front().~T(); + // If the front chunk has become empty, we need to free remove it and use + // the next one. 
+ if (++_front_chunk->begin == _front_chunk->end) { + front_chunk_delete(); + } +} + +template +void chunked_fifo::reserve(size_t n) { + // reserve() guarantees that (n - size()) additional push()es will + // succeed without reallocation: + if (n <= size()) { + return; + } + size_t need = n - size(); + // If we already have a back chunk, it might have room for some pushes + // before filling up, so decrease "need": + if (_back_chunk) { + size_t back_chunk_n = items_per_chunk - (_back_chunk->end - _back_chunk->begin); + need -= std::min(back_chunk_n, need); + } + size_t needed_chunks = (need + items_per_chunk - 1) / items_per_chunk; + // If we already have some freed chunks saved, we need to allocate fewer + // additional chunks, or none at all + if (needed_chunks <= _nfree_chunks) { + return; + } + needed_chunks -= _nfree_chunks; + while (needed_chunks--) { + chunk *c = new chunk; + c->next = _free_chunks; + _free_chunks = c; + ++_nfree_chunks; + } +} + +template +inline typename chunked_fifo::iterator +chunked_fifo::begin() noexcept { + return iterator(_front_chunk); +} + +template +inline typename chunked_fifo::iterator +chunked_fifo::end() noexcept { + return iterator(nullptr); +} + +template +inline typename chunked_fifo::const_iterator +chunked_fifo::begin() const noexcept { + return const_iterator(_front_chunk); +} + +template +inline typename chunked_fifo::const_iterator +chunked_fifo::end() const noexcept { + return const_iterator(nullptr); +} + +template +inline typename chunked_fifo::const_iterator +chunked_fifo::cbegin() const noexcept { + return const_iterator(_front_chunk); +} + +template +inline typename chunked_fifo::const_iterator +chunked_fifo::cend() const noexcept { + return const_iterator(nullptr); +} + +} diff --git a/src/seastar/include/seastar/core/circular_buffer.hh b/src/seastar/include/seastar/core/circular_buffer.hh new file mode 100644 index 000000000..d1444e4c2 --- /dev/null +++ b/src/seastar/include/seastar/core/circular_buffer.hh @@ 
-0,0 +1,511 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace seastar { + +/// A growable double-ended queue container that can be efficiently +/// extended (and shrunk) from both ends. Implementation is a single +/// storage vector. +/// +/// Similar to libstdc++'s std::deque, except that it uses a single +/// level store, and so is more efficient for simple stored items. +/// Similar to boost::circular_buffer_space_optimized, except it uses +/// uninitialized storage for unoccupied elements (and thus move/copy +/// constructors instead of move/copy assignments, which are less +/// efficient). +/// +/// The storage of the circular_buffer is expanded automatically in +/// exponential increments. +/// When adding new elements: +/// * if size + 1 > capacity: all iterators and references are +/// invalidated, +/// * otherwise only the begin() or end() iterator is invalidated: +/// * push_front() and emplace_front() will invalidate begin() and +/// * push_back() and emplace_back() will invalidate end(). 
+/// +/// Removing elements never invalidates any references and only +/// invalidates begin() or end() iterators: +/// * pop_front() will invalidate begin() and +/// * pop_back() will invalidate end(). +/// +/// reserve() may also invalidate all iterators and references. +template > +class circular_buffer { + struct impl : Alloc { + T* storage = nullptr; + // begin, end interpreted (mod capacity) + size_t begin = 0; + size_t end = 0; + size_t capacity = 0; + + impl(Alloc a) noexcept : Alloc(std::move(a)) { } + void reset() { + storage = {}; + begin = 0; + end = 0; + capacity = 0; + } + }; + static_assert(!std::is_default_constructible_v + || std::is_nothrow_default_constructible_v); + static_assert(std::is_nothrow_move_constructible_v); + impl _impl; +public: + using value_type = T; + using size_type = size_t; + using reference = T&; + using pointer = T*; + using const_reference = const T&; + using const_pointer = const T*; +public: + circular_buffer() noexcept SEASTAR_CONCEPT(requires std::default_initializable) : circular_buffer(Alloc()) {} + circular_buffer(Alloc alloc) noexcept; + circular_buffer(circular_buffer&& X) noexcept; + circular_buffer(const circular_buffer& X) = delete; + ~circular_buffer(); + circular_buffer& operator=(const circular_buffer&) = delete; + circular_buffer& operator=(circular_buffer&& b) noexcept; + void push_front(const T& data); + void push_front(T&& data); + template + void emplace_front(A&&... args); + void push_back(const T& data); + void push_back(T&& data); + template + void emplace_back(A&&... 
args); + T& front() noexcept; + const T& front() const noexcept; + T& back() noexcept; + const T& back() const noexcept; + void pop_front() noexcept; + void pop_back() noexcept; + bool empty() const; + size_t size() const; + size_t capacity() const; + void reserve(size_t); + void clear(); + T& operator[](size_t idx) noexcept; + const T& operator[](size_t idx) const noexcept; + template + void for_each(Func func); + // access an element, may return wrong or destroyed element + // only useful if you do not rely on data accuracy (e.g. prefetch) + T& access_element_unsafe(size_t idx) noexcept; +private: + void expand(); + void expand(size_t); + void maybe_expand(size_t nr = 1); + size_t mask(size_t idx) const; + + template + struct cbiterator : std::iterator { + typedef std::iterator super_t; + + ValueType& operator*() const noexcept { return cb->_impl.storage[cb->mask(idx)]; } + ValueType* operator->() const noexcept { return &cb->_impl.storage[cb->mask(idx)]; } + // prefix + cbiterator& operator++() noexcept { + idx++; + return *this; + } + // postfix + cbiterator operator++(int unused) noexcept { + auto v = *this; + idx++; + return v; + } + // prefix + cbiterator& operator--() noexcept { + idx--; + return *this; + } + // postfix + cbiterator operator--(int unused) noexcept { + auto v = *this; + idx--; + return v; + } + cbiterator operator+(typename super_t::difference_type n) const noexcept { + return cbiterator(cb, idx + n); + } + cbiterator operator-(typename super_t::difference_type n) const noexcept { + return cbiterator(cb, idx - n); + } + cbiterator& operator+=(typename super_t::difference_type n) noexcept { + idx += n; + return *this; + } + cbiterator& operator-=(typename super_t::difference_type n) noexcept { + idx -= n; + return *this; + } + bool operator==(const cbiterator& rhs) const noexcept { + return idx == rhs.idx; + } + bool operator!=(const cbiterator& rhs) const noexcept { + return idx != rhs.idx; + } + bool operator<(const cbiterator& rhs) const 
noexcept { + return idx < rhs.idx; + } + bool operator>(const cbiterator& rhs) const noexcept { + return idx > rhs.idx; + } + bool operator>=(const cbiterator& rhs) const noexcept { + return idx >= rhs.idx; + } + bool operator<=(const cbiterator& rhs) const noexcept { + return idx <= rhs.idx; + } + typename super_t::difference_type operator-(const cbiterator& rhs) const noexcept { + return idx - rhs.idx; + } + private: + CB* cb; + size_t idx; + cbiterator(CB* b, size_t i) noexcept : cb(b), idx(i) {} + friend class circular_buffer; + }; + friend class iterator; + +public: + typedef cbiterator iterator; + typedef cbiterator const_iterator; + + iterator begin() noexcept { + return iterator(this, _impl.begin); + } + const_iterator begin() const noexcept { + return const_iterator(this, _impl.begin); + } + iterator end() noexcept { + return iterator(this, _impl.end); + } + const_iterator end() const noexcept { + return const_iterator(this, _impl.end); + } + const_iterator cbegin() const noexcept { + return const_iterator(this, _impl.begin); + } + const_iterator cend() const noexcept { + return const_iterator(this, _impl.end); + } + iterator erase(iterator first, iterator last) noexcept; +}; + +template +inline +size_t +circular_buffer::mask(size_t idx) const { + return idx & (_impl.capacity - 1); +} + +template +inline +bool +circular_buffer::empty() const { + return _impl.begin == _impl.end; +} + +template +inline +size_t +circular_buffer::size() const { + return _impl.end - _impl.begin; +} + +template +inline +size_t +circular_buffer::capacity() const { + return _impl.capacity; +} + +template +inline +void +circular_buffer::reserve(size_t size) { + if (capacity() < size) { + // Make sure that the new capacity is a power of two. 
+ expand(size_t(1) << log2ceil(size)); + } +} + +template +inline +void +circular_buffer::clear() { + erase(begin(), end()); +} + +template +inline +circular_buffer::circular_buffer(Alloc alloc) noexcept + : _impl(std::move(alloc)) { +} + +template +inline +circular_buffer::circular_buffer(circular_buffer&& x) noexcept + : _impl(std::move(x._impl)) { + x._impl.reset(); +} + +template +inline +circular_buffer& circular_buffer::operator=(circular_buffer&& x) noexcept { + if (this != &x) { + this->~circular_buffer(); + new (this) circular_buffer(std::move(x)); + } + return *this; +} + +template +template +inline +void +circular_buffer::for_each(Func func) { + auto s = _impl.storage; + auto m = _impl.capacity - 1; + for (auto i = _impl.begin; i != _impl.end; ++i) { + func(s[i & m]); + } +} + +template +inline +circular_buffer::~circular_buffer() { + for_each([this] (T& obj) { + std::allocator_traits::destroy(_impl, &obj); + }); + _impl.deallocate(_impl.storage, _impl.capacity); +} + +template +void +circular_buffer::expand() { + expand(std::max(_impl.capacity * 2, 1)); +} + +template +void +circular_buffer::expand(size_t new_cap) { + auto new_storage = _impl.allocate(new_cap); + auto p = new_storage; + try { + for_each([this, &p] (T& obj) { + transfer_pass1(_impl, &obj, p); + p++; + }); + } catch (...) 
{ + while (p != new_storage) { + std::allocator_traits::destroy(_impl, --p); + } + _impl.deallocate(new_storage, new_cap); + throw; + } + p = new_storage; + for_each([this, &p] (T& obj) { + transfer_pass2(_impl, &obj, p++); + }); + std::swap(_impl.storage, new_storage); + std::swap(_impl.capacity, new_cap); + _impl.begin = 0; + _impl.end = p - _impl.storage; + _impl.deallocate(new_storage, new_cap); +} + +template +inline +void +circular_buffer::maybe_expand(size_t nr) { + if (_impl.end - _impl.begin + nr > _impl.capacity) { + expand(); + } +} + +template +inline +void +circular_buffer::push_front(const T& data) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.begin - 1)]; + std::allocator_traits::construct(_impl, p, data); + --_impl.begin; +} + +template +inline +void +circular_buffer::push_front(T&& data) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.begin - 1)]; + std::allocator_traits::construct(_impl, p, std::move(data)); + --_impl.begin; +} + +template +template +inline +void +circular_buffer::emplace_front(Args&&... args) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.begin - 1)]; + std::allocator_traits::construct(_impl, p, std::forward(args)...); + --_impl.begin; +} + +template +inline +void +circular_buffer::push_back(const T& data) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.end)]; + std::allocator_traits::construct(_impl, p, data); + ++_impl.end; +} + +template +inline +void +circular_buffer::push_back(T&& data) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.end)]; + std::allocator_traits::construct(_impl, p, std::move(data)); + ++_impl.end; +} + +template +template +inline +void +circular_buffer::emplace_back(Args&&... 
args) { + maybe_expand(); + auto p = &_impl.storage[mask(_impl.end)]; + std::allocator_traits::construct(_impl, p, std::forward(args)...); + ++_impl.end; +} + +template +inline +T& +circular_buffer::front() noexcept { + return _impl.storage[mask(_impl.begin)]; +} + +template +inline +const T& +circular_buffer::front() const noexcept { + return _impl.storage[mask(_impl.begin)]; +} + +template +inline +T& +circular_buffer::back() noexcept { + return _impl.storage[mask(_impl.end - 1)]; +} + +template +inline +const T& +circular_buffer::back() const noexcept { + return _impl.storage[mask(_impl.end - 1)]; +} + +template +inline +void +circular_buffer::pop_front() noexcept { + std::allocator_traits::destroy(_impl, &front()); + ++_impl.begin; +} + +template +inline +void +circular_buffer::pop_back() noexcept { + std::allocator_traits::destroy(_impl, &back()); + --_impl.end; +} + +template +inline +T& +circular_buffer::operator[](size_t idx) noexcept { + return _impl.storage[mask(_impl.begin + idx)]; +} + +template +inline +const T& +circular_buffer::operator[](size_t idx) const noexcept { + return _impl.storage[mask(_impl.begin + idx)]; +} + +template +inline +T& +circular_buffer::access_element_unsafe(size_t idx) noexcept { + return _impl.storage[mask(_impl.begin + idx)]; +} + +template +inline +typename circular_buffer::iterator +circular_buffer::erase(iterator first, iterator last) noexcept { + static_assert(std::is_nothrow_move_assignable::value, "erase() assumes move assignment does not throw"); + if (first == last) { + return last; + } + // Move to the left or right depending on which would result in least amount of moves. + // This also guarantees that iterators will be stable when removing from either front or back. 
+ if (std::distance(begin(), first) < std::distance(last, end())) { + auto new_start = std::move_backward(begin(), first, last); + auto i = begin(); + while (i < new_start) { + std::allocator_traits::destroy(_impl, &*i++); + } + _impl.begin = new_start.idx; + return last; + } else { + auto new_end = std::move(last, end(), first); + auto i = new_end; + auto e = end(); + while (i < e) { + std::allocator_traits::destroy(_impl, &*i++); + } + _impl.end = new_end.idx; + return first; + } +} + +} diff --git a/src/seastar/include/seastar/core/circular_buffer_fixed_capacity.hh b/src/seastar/include/seastar/core/circular_buffer_fixed_capacity.hh new file mode 100644 index 000000000..7e7c093e2 --- /dev/null +++ b/src/seastar/include/seastar/core/circular_buffer_fixed_capacity.hh @@ -0,0 +1,378 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2017 ScyllaDB + */ + +#pragma once + +// A fixed capacity double-ended queue container that can be efficiently +// extended (and shrunk) from both ends. Implementation is a single +// storage vector. +// +// Similar to libstdc++'s std::deque, except that it uses a single level +// store, and so is more efficient for simple stored items. 
+ +#include +#include +#include +#include + + +/// \file + +namespace seastar { + +/// A fixed-capacity container (like boost::static_vector) that can insert +/// and remove at both ends (like std::deque). Does not allocate. +/// +/// Does not perform overflow checking when size exceeds capacity. +/// +/// \tparam T type of objects stored in the container; must be noexcept move enabled +/// \tparam Capacity maximum number of objects that can be stored in the container; must be a power of 2 +template +class circular_buffer_fixed_capacity { + size_t _begin = 0; + size_t _end = 0; + union maybe_storage { + T data; + maybe_storage() noexcept {} + ~maybe_storage() {} + }; + maybe_storage _storage[Capacity]; +private: + static size_t mask(size_t idx) { return idx % Capacity; } + T* obj(size_t idx) { return &_storage[mask(idx)].data; } + const T* obj(size_t idx) const { return &_storage[mask(idx)].data; } +public: + static_assert((Capacity & (Capacity - 1)) == 0, "capacity must be a power of two"); + static_assert(std::is_nothrow_move_constructible::value && std::is_nothrow_move_assignable::value, + "circular_buffer_fixed_capacity only supports nothrow-move value types"); + using value_type = T; + using size_type = size_t; + using reference = T&; + using pointer = T*; + using const_reference = const T&; + using const_pointer = const T*; + using difference_type = ssize_t; +public: + template + class cbiterator { + using holder = std::conditional_t::value, const maybe_storage, maybe_storage>; + holder* _start; + size_t _idx; + private: + cbiterator(holder* start, size_t idx) noexcept : _start(start), _idx(idx) {} + public: + using iterator_category = std::random_access_iterator_tag; + using value_type = ValueType; + using difference_type = ssize_t; + using pointer = ValueType*; + using reference = ValueType&; + public: + cbiterator(); + ValueType& operator*() const { return _start[mask(_idx)].data; } + ValueType* operator->() const { return &operator*(); } + // prefix + 
cbiterator& operator++() { + ++_idx; + return *this; + } + // postfix + cbiterator operator++(int) { + auto v = *this; + ++_idx; + return v; + } + // prefix + cbiterator& operator--() { + --_idx; + return *this; + } + // postfix + cbiterator operator--(int) { + auto v = *this; + --_idx; + return v; + } + cbiterator operator+(difference_type n) const { + return cbiterator{_start, _idx + n}; + } + friend cbiterator operator+(difference_type n, cbiterator i) { + return i + n; + } + cbiterator operator-(difference_type n) const { + return cbiterator{_start, _idx - n}; + } + cbiterator& operator+=(difference_type n) { + _idx += n; + return *this; + } + cbiterator& operator-=(difference_type n) { + _idx -= n; + return *this; + } + bool operator==(const cbiterator& rhs) const { + return _idx == rhs._idx; + } + bool operator!=(const cbiterator& rhs) const { + return _idx != rhs._idx; + } + bool operator<(const cbiterator& rhs) const { + return ssize_t(_idx - rhs._idx) < 0; + } + bool operator>(const cbiterator& rhs) const { + return ssize_t(_idx - rhs._idx) > 0; + } + bool operator<=(const cbiterator& rhs) const { + return ssize_t(_idx - rhs._idx) <= 0; + } + bool operator>=(const cbiterator& rhs) const { + return ssize_t(_idx - rhs._idx) >= 0; + } + difference_type operator-(const cbiterator& rhs) const { + return _idx - rhs._idx; + } + friend class circular_buffer_fixed_capacity; + }; +public: + using iterator = cbiterator; + using const_iterator = cbiterator; +public: + circular_buffer_fixed_capacity() = default; + circular_buffer_fixed_capacity(circular_buffer_fixed_capacity&& x) noexcept; + ~circular_buffer_fixed_capacity(); + circular_buffer_fixed_capacity& operator=(circular_buffer_fixed_capacity&& x) noexcept; + void push_front(const T& data); + void push_front(T&& data); + template + T& emplace_front(A&&... args); + void push_back(const T& data); + void push_back(T&& data); + template + T& emplace_back(A&&... 
args); + T& front(); + T& back(); + void pop_front(); + void pop_back(); + bool empty() const; + size_t size() const; + size_t capacity() const; + T& operator[](size_t idx); + void clear(); + iterator begin() { + return iterator(_storage, _begin); + } + const_iterator begin() const { + return const_iterator(_storage, _begin); + } + iterator end() { + return iterator(_storage, _end); + } + const_iterator end() const { + return const_iterator(_storage, _end); + } + const_iterator cbegin() const { + return const_iterator(_storage, _begin); + } + const_iterator cend() const { + return const_iterator(_storage, _end); + } + iterator erase(iterator first, iterator last); +}; + +template +inline +bool +circular_buffer_fixed_capacity::empty() const { + return _begin == _end; +} + +template +inline +size_t +circular_buffer_fixed_capacity::size() const { + return _end - _begin; +} + +template +inline +size_t +circular_buffer_fixed_capacity::capacity() const { + return Capacity; +} + +template +inline +circular_buffer_fixed_capacity::circular_buffer_fixed_capacity(circular_buffer_fixed_capacity&& x) noexcept + : _begin(x._begin), _end(x._end) { + // This is std::uninitialized_move, but that is c++17 only + auto dest = begin(); + for (auto& obj : x) { + new (&*dest++) T(std::move(obj)); + } +} + +template +inline +circular_buffer_fixed_capacity& +circular_buffer_fixed_capacity::operator=(circular_buffer_fixed_capacity&& x) noexcept { + if (this != &x) { + this->~circular_buffer_fixed_capacity(); + new (this) circular_buffer_fixed_capacity(std::move(x)); + } + return *this; +} + +template +inline +circular_buffer_fixed_capacity::~circular_buffer_fixed_capacity() { + clear(); +} + +template +inline +void +circular_buffer_fixed_capacity::push_front(const T& data) { + new (obj(_begin - 1)) T(data); + --_begin; +} + +template +inline +void +circular_buffer_fixed_capacity::push_front(T&& data) { + new (obj(_begin - 1)) T(std::move(data)); + --_begin; +} + +template +template +inline 
+T& +circular_buffer_fixed_capacity::emplace_front(Args&&... args) { + auto p = new (obj(_begin - 1)) T(std::forward(args)...); + --_begin; + return *p; +} + +template +inline +void +circular_buffer_fixed_capacity::push_back(const T& data) { + new (obj(_end)) T(data); + ++_end; +} + +template +inline +void +circular_buffer_fixed_capacity::push_back(T&& data) { + new (obj(_end)) T(std::move(data)); + ++_end; +} + +template +template +inline +T& +circular_buffer_fixed_capacity::emplace_back(Args&&... args) { + auto p = new (obj(_end)) T(std::forward(args)...); + ++_end; + return *p; +} + +template +inline +T& +circular_buffer_fixed_capacity::front() { + return *obj(_begin); +} + +template +inline +T& +circular_buffer_fixed_capacity::back() { + return *obj(_end - 1); +} + +template +inline +void +circular_buffer_fixed_capacity::pop_front() { + obj(_begin)->~T(); + ++_begin; +} + +template +inline +void +circular_buffer_fixed_capacity::pop_back() { + obj(_end - 1)->~T(); + --_end; +} + +template +inline +T& +circular_buffer_fixed_capacity::operator[](size_t idx) { + return *obj(_begin + idx); +} + +template +inline +typename circular_buffer_fixed_capacity::iterator +circular_buffer_fixed_capacity::erase(iterator first, iterator last) { + static_assert(std::is_nothrow_move_assignable::value, "erase() assumes move assignment does not throw"); + if (first == last) { + return last; + } + // Move to the left or right depending on which would result in least amount of moves. + // This also guarantees that iterators will be stable when removing from either front or back. 
+ if (std::distance(begin(), first) < std::distance(last, end())) { + auto new_start = std::move_backward(begin(), first, last); + auto i = begin(); + while (i < new_start) { + *i++.~T(); + } + _begin = new_start.idx; + return last; + } else { + auto new_end = std::move(last, end(), first); + auto i = new_end; + auto e = end(); + while (i < e) { + *i++.~T(); + } + _end = new_end.idx; + return first; + } +} + +template +inline +void +circular_buffer_fixed_capacity::clear() { + for (auto& obj : *this) { + obj.~T(); + } + _begin = _end = 0; +} + +} + diff --git a/src/seastar/include/seastar/core/condition-variable.hh b/src/seastar/include/seastar/core/condition-variable.hh new file mode 100644 index 000000000..fec28f285 --- /dev/null +++ b/src/seastar/include/seastar/core/condition-variable.hh @@ -0,0 +1,171 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2016 ScyllaDB, Ltd. + */ + +#pragma once + +#include +#include + +namespace seastar { + +/// \addtogroup fiber-module +/// @{ + +/// Exception thrown when a condition variable is broken by +/// \ref condition_variable::broken(). +class broken_condition_variable : public std::exception { +public: + /// Reports the exception reason. 
+ virtual const char* what() const noexcept; +}; + +/// Exception thrown when wait() operation times out +/// \ref condition_variable::wait(time_point timeout). +class condition_variable_timed_out : public std::exception { +public: + /// Reports the exception reason. + virtual const char* what() const noexcept; +}; + +/// \brief Conditional variable. +/// +/// This is a standard computer science condition variable sans locking, +/// since in seastar access to variables is atomic anyway, adapted +/// for futures. You can wait for variable to be notified. +/// +/// To support exceptional conditions, a \ref broken() method +/// is provided, which causes all current waiters to stop waiting, +/// with an exceptional future returned. This allows causing all +/// fibers that are blocked on a condition variable to continue. +/// This is similar to POSIX's `pthread_cancel()`, with \ref wait() +/// acting as a cancellation point. + +class condition_variable { + using duration = semaphore::duration; + using clock = semaphore::clock; + using time_point = semaphore::time_point; + struct condition_variable_exception_factory { + static condition_variable_timed_out timeout() noexcept; + static broken_condition_variable broken() noexcept; + }; + basic_semaphore _sem; +public: + /// Constructs a condition_variable object. + /// Initialize the semaphore with a default value of 0 to ensure + /// the first call to wait() before signal() won't be woken up immediately. + condition_variable() noexcept : _sem(0) {} + + /// Waits until condition variable is signaled, may wake up without the condition being met + /// + /// \return a future that becomes ready when \ref signal() is called + /// If the condition variable was \ref broken() will return \ref broken_condition_variable + /// exception.
+ future<> wait() noexcept { + return _sem.wait(); + } + + /// Waits until condition variable is signaled or timeout is reached + /// + /// \param timeout time point at which wait will exit with a timeout + /// + /// \return a future that becomes ready when \ref signal() is called + /// If the condition variable was \ref broken() will return \ref broken_condition_variable + /// exception. If timepoint is reached will return \ref condition_variable_timed_out exception. + future<> wait(time_point timeout) noexcept { + return _sem.wait(timeout); + } + + /// Waits until condition variable is signaled or timeout is reached + /// + /// \param timeout duration after which wait will exit with a timeout + /// + /// \return a future that becomes ready when \ref signal() is called + /// If the condition variable was \ref broken() will return \ref broken_condition_variable + /// exception. If timepoint is passed will return \ref condition_variable_timed_out exception. + future<> wait(duration timeout) noexcept { + return _sem.wait(timeout); + } + + /// Waits until condition variable is notified and pred() == true, otherwise + /// wait again. + /// + /// \param pred predicate that checks that awaited condition is true + /// + /// \return a future that becomes ready when \ref signal() is called + /// If the condition variable was \ref broken(), may contain an exception. + template + future<> wait(Pred&& pred) noexcept { + return do_until(std::forward(pred), [this] { + return wait(); + }); + } + + /// Waits until condition variable is notified and pred() == true or timeout is reached, otherwise + /// wait again. + /// + /// \param timeout time point at which wait will exit with a timeout + /// \param pred predicate that checks that awaited condition is true + /// + /// \return a future that becomes ready when \ref signal() is called + /// If the condition variable was \ref broken() will return \ref broken_condition_variable + /// exception. 
If timepoint is reached will return \ref condition_variable_timed_out exception. + template + future<> wait(time_point timeout, Pred&& pred) noexcept { + return do_until(std::forward(pred), [this, timeout] () mutable { + return wait(timeout); + }); + } + + /// Waits until condition variable is notified and pred() == true or timeout is reached, otherwise + /// wait again. + /// + /// \param timeout duration after which wait will exit with a timeout + /// \param pred predicate that checks that awaited condition is true + /// + /// \return a future that becomes ready when \ref signal() is called + /// If the condition variable was \ref broken() will return \ref broken_condition_variable + /// exception. If timepoint is passed will return \ref condition_variable_timed_out exception. + template + future<> wait(duration timeout, Pred&& pred) noexcept { + return wait(clock::now() + timeout, std::forward(pred)); + } + /// Notify variable and wake up a waiter if there is one + void signal() noexcept { + if (_sem.waiters()) { + _sem.signal(); + } + } + /// Notify variable and wake up all waiter + void broadcast() noexcept { + _sem.signal(_sem.waiters()); + } + + /// Signal to waiters that an error occurred. \ref wait() will see + /// an exceptional future<> containing the provided exception parameter. + /// The future is made available immediately. + void broken() noexcept { + _sem.broken(); + } +}; + +/// @} + +} diff --git a/src/seastar/include/seastar/core/coroutine.hh b/src/seastar/include/seastar/core/coroutine.hh new file mode 100644 index 000000000..98e6e6794 --- /dev/null +++ b/src/seastar/include/seastar/core/coroutine.hh @@ -0,0 +1,196 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. 
+ * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2019 ScyllaDB Ltd. + */ + +#pragma once + +#include + +#ifndef SEASTAR_COROUTINES_ENABLED +#error Coroutines support disabled. +#endif + +#include + +namespace seastar { + +namespace internal { + +template +class coroutine_traits_base { +public: + class promise_type final : public seastar::task { + seastar::promise _promise; + public: + promise_type() = default; + promise_type(promise_type&&) = delete; + promise_type(const promise_type&) = delete; + + template + void return_value(U&&... value) { + _promise.set_value(std::forward(value)...); + } + + void return_value(future&& fut) noexcept { + fut.forward_to(std::move(_promise)); + } + + void unhandled_exception() noexcept { + _promise.set_exception(std::current_exception()); + } + + seastar::future get_return_object() noexcept { + return _promise.get_future(); + } + + SEASTAR_INTERNAL_COROUTINE_NAMESPACE::suspend_never initial_suspend() noexcept { return { }; } + SEASTAR_INTERNAL_COROUTINE_NAMESPACE::suspend_never final_suspend() noexcept { return { }; } + + virtual void run_and_dispose() noexcept override { + auto handle = SEASTAR_INTERNAL_COROUTINE_NAMESPACE::coroutine_handle::from_promise(*this); + handle.resume(); + } + + task* waiting_task() noexcept override { return _promise.waiting_task(); } + }; +}; + +template <> +class coroutine_traits_base<> { +public: + class promise_type final : public seastar::task { + seastar::promise<> _promise; + public: + promise_type() = default; + promise_type(promise_type&&) = delete; + promise_type(const promise_type&) = 
delete; + + void return_void() noexcept { + _promise.set_value(); + } + +// Clang complains if both return_value and return_void are defined +#if !defined(__clang__) + void return_value(future<>&& fut) noexcept { + fut.forward_to(std::move(_promise)); + } +#endif + + void unhandled_exception() noexcept { + _promise.set_exception(std::current_exception()); + } + + seastar::future<> get_return_object() noexcept { + return _promise.get_future(); + } + + SEASTAR_INTERNAL_COROUTINE_NAMESPACE::suspend_never initial_suspend() noexcept { return { }; } + SEASTAR_INTERNAL_COROUTINE_NAMESPACE::suspend_never final_suspend() noexcept { return { }; } + + virtual void run_and_dispose() noexcept override { + auto handle = SEASTAR_INTERNAL_COROUTINE_NAMESPACE::coroutine_handle::from_promise(*this); + handle.resume(); + } + + task* waiting_task() noexcept override { return _promise.waiting_task(); } + }; +}; + +template +struct awaiter { + seastar::future _future; +public: + explicit awaiter(seastar::future&& f) noexcept : _future(std::move(f)) { } + + awaiter(const awaiter&) = delete; + awaiter(awaiter&&) = delete; + + bool await_ready() const noexcept { + return _future.available(); + } + + template + void await_suspend(SEASTAR_INTERNAL_COROUTINE_NAMESPACE::coroutine_handle hndl) noexcept { + _future.set_coroutine(hndl.promise()); + } + + std::tuple await_resume() { return _future.get(); } +}; + +template +struct awaiter { + seastar::future _future; +public: + explicit awaiter(seastar::future&& f) noexcept : _future(std::move(f)) { } + + awaiter(const awaiter&) = delete; + awaiter(awaiter&&) = delete; + + bool await_ready() const noexcept { + return _future.available(); + } + + template + void await_suspend(SEASTAR_INTERNAL_COROUTINE_NAMESPACE::coroutine_handle hndl) noexcept { + _future.set_coroutine(hndl.promise()); + } + + T await_resume() { return _future.get0(); } +}; + +template<> +struct awaiter<> { + seastar::future<> _future; +public: + explicit 
awaiter(seastar::future<>&& f) noexcept : _future(std::move(f)) { } + + awaiter(const awaiter&) = delete; + awaiter(awaiter&&) = delete; + + bool await_ready() const noexcept { + return _future.available(); + } + + template + void await_suspend(SEASTAR_INTERNAL_COROUTINE_NAMESPACE::coroutine_handle hndl) noexcept { + _future.set_coroutine(hndl.promise()); + } + + void await_resume() { _future.get(); } +}; + +} // seastar::internal + +template +auto operator co_await(future f) noexcept { + return internal::awaiter(std::move(f)); +} + +} // seastar + + +namespace SEASTAR_INTERNAL_COROUTINE_NAMESPACE { + +template +class coroutine_traits, Args...> : public seastar::internal::coroutine_traits_base { +}; + +} // SEASTAR_INTERNAL_COROUTINE_NAMESPACE + diff --git a/src/seastar/include/seastar/core/deleter.hh b/src/seastar/include/seastar/core/deleter.hh new file mode 100644 index 000000000..b637336e1 --- /dev/null +++ b/src/seastar/include/seastar/core/deleter.hh @@ -0,0 +1,281 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#pragma once + +#include +#include +#include +#include + +namespace seastar { + +/// \addtogroup memory-module +/// @{ + +/// Provides a mechanism for managing the lifetime of a buffer. 
+/// +/// A \c deleter is an object that is used to inform the consumer +/// of some buffer (not referenced by the deleter itself) how to +/// delete the buffer. This can be by calling an arbitrary function +/// or destroying an object carried by the deleter. Examples of +/// a deleter's encapsulated actions are: +/// +/// - calling \c std::free(p) on some captured pointer, p +/// - calling \c delete \c p on some captured pointer, p +/// - decrementing a reference count somewhere +/// +/// A deleter performs its action from its destructor. +class deleter final { +public: + /// \cond internal + struct impl; + struct raw_object_tag {}; + /// \endcond +private: + // if bit 0 set, point to object to be freed directly. + impl* _impl = nullptr; +public: + /// Constructs an empty deleter that does nothing in its destructor. + deleter() noexcept = default; + deleter(const deleter&) = delete; + /// Moves a deleter. + deleter(deleter&& x) noexcept : _impl(x._impl) { x._impl = nullptr; } + /// \cond internal + explicit deleter(impl* i) noexcept : _impl(i) {} + deleter(raw_object_tag tag, void* object) noexcept + : _impl(from_raw_object(object)) {} + /// \endcond + /// Destroys the deleter and carries out the encapsulated action. + ~deleter(); + deleter& operator=(deleter&& x) noexcept; + deleter& operator=(deleter&) = delete; + /// Performs a sharing operation. The encapsulated action will only + /// be carried out after both the original deleter and the returned + /// deleter are both destroyed. + /// + /// \return a deleter with the same encapsulated action as this one. + deleter share(); + /// Checks whether the deleter has an associated action. + explicit operator bool() const noexcept { return bool(_impl); } + /// \cond internal + void reset(impl* i) { + this->~deleter(); + new (this) deleter(i); + } + /// \endcond + /// Appends another deleter to this deleter. When this deleter is + /// destroyed, both encapsulated actions will be carried out. 
+ void append(deleter d); +private: + static bool is_raw_object(impl* i) noexcept { + auto x = reinterpret_cast(i); + return x & 1; + } + bool is_raw_object() const noexcept { + return is_raw_object(_impl); + } + static void* to_raw_object(impl* i) noexcept { + auto x = reinterpret_cast(i); + return reinterpret_cast(x & ~uintptr_t(1)); + } + void* to_raw_object() const noexcept { + return to_raw_object(_impl); + } + impl* from_raw_object(void* object) noexcept { + auto x = reinterpret_cast(object); + return reinterpret_cast(x | 1); + } +}; + +/// \cond internal +struct deleter::impl { + unsigned refs = 1; + deleter next; + impl(deleter next) : next(std::move(next)) {} + virtual ~impl() {} +}; +/// \endcond + +inline +deleter::~deleter() { + if (is_raw_object()) { + std::free(to_raw_object()); + return; + } + if (_impl && --_impl->refs == 0) { + delete _impl; + } +} + +inline +deleter& deleter::operator=(deleter&& x) noexcept { + if (this != &x) { + this->~deleter(); + new (this) deleter(std::move(x)); + } + return *this; +} + +/// \cond internal +template +struct lambda_deleter_impl final : deleter::impl { + Deleter del; + lambda_deleter_impl(deleter next, Deleter&& del) + : impl(std::move(next)), del(std::move(del)) {} + virtual ~lambda_deleter_impl() override { del(); } +}; + +template +struct object_deleter_impl final : deleter::impl { + Object obj; + object_deleter_impl(deleter next, Object&& obj) + : impl(std::move(next)), obj(std::move(obj)) {} +}; + +template +inline +object_deleter_impl* make_object_deleter_impl(deleter next, Object obj) { + return new object_deleter_impl(std::move(next), std::move(obj)); +} +/// \endcond + +/// Makes a \ref deleter that encapsulates the action of +/// destroying an object, as well as running another deleter. The input +/// object is moved to the deleter, and destroyed when the deleter is destroyed. 
+/// +/// \param next deleter that will become part of the new deleter's encapsulated action +/// \param o object whose destructor becomes part of the new deleter's encapsulated action +/// \related deleter +template +deleter +make_deleter(deleter next, Object o) { + return deleter(new lambda_deleter_impl(std::move(next), std::move(o))); +} + +/// Makes a \ref deleter that encapsulates the action of destroying an object. The input +/// object is moved to the deleter, and destroyed when the deleter is destroyed. +/// +/// \param o object whose destructor becomes the new deleter's encapsulated action +/// \related deleter +template +deleter +make_deleter(Object o) { + return make_deleter(deleter(), std::move(o)); +} + +/// \cond internal +struct free_deleter_impl final : deleter::impl { + void* obj; + free_deleter_impl(void* obj) : impl(deleter()), obj(obj) {} + virtual ~free_deleter_impl() override { std::free(obj); } +}; +/// \endcond + +inline +deleter +deleter::share() { + if (!_impl) { + return deleter(); + } + if (is_raw_object()) { + _impl = new free_deleter_impl(to_raw_object()); + } + ++_impl->refs; + return deleter(_impl); +} + +// Appends 'd' to the chain of deleters. Avoids allocation if possible. For +// performance reasons the current chain should be shorter and 'd' should be +// longer. +inline +void deleter::append(deleter d) { + if (!d._impl) { + return; + } + impl* next_impl = _impl; + deleter* next_d = this; + while (next_impl) { + if (next_impl == d._impl) { + return; // Already appended + } + if (is_raw_object(next_impl)) { + next_d->_impl = next_impl = new free_deleter_impl(to_raw_object(next_impl)); + } + + if (next_impl->refs != 1) { + next_d->_impl = next_impl = make_object_deleter_impl(deleter(next_impl), std::move(d)); + return; + } + + next_d = &next_impl->next; + next_impl = next_d->_impl; + } + next_d->_impl = d._impl; + d._impl = nullptr; +} + +/// Makes a deleter that calls \c std::free() when it is destroyed. 
+/// +/// \param obj object to free. +/// \related deleter +inline +deleter +make_free_deleter(void* obj) { + if (!obj) { + return deleter(); + } + return deleter(deleter::raw_object_tag(), obj); +} + +/// Makes a deleter that calls \c std::free() when it is destroyed, as well +/// as invoking the encapsulated action of another deleter. +/// +/// \param next deleter to invoke. +/// \param obj object to free. +/// \related deleter +inline +deleter +make_free_deleter(deleter next, void* obj) { + return make_deleter(std::move(next), [obj] () mutable { std::free(obj); }); +} + +/// \see make_deleter(Object) +/// \related deleter +template +inline +deleter +make_object_deleter(T&& obj) { + return deleter{make_object_deleter_impl(deleter(), std::move(obj))}; +} + +/// \see make_deleter(deleter, Object) +/// \related deleter +template +inline +deleter +make_object_deleter(deleter d, T&& obj) { + return deleter{make_object_deleter_impl(std::move(d), std::move(obj))}; +} + +/// @} + +} diff --git a/src/seastar/include/seastar/core/distributed.hh b/src/seastar/include/seastar/core/distributed.hh new file mode 100644 index 000000000..c2641f612 --- /dev/null +++ b/src/seastar/include/seastar/core/distributed.hh @@ -0,0 +1,32 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +/* + * Copyright (C) 2015 Cloudius Systems, Ltd. + */ + +#pragma once + +#include + +namespace seastar { + + +template +using distributed = sharded; + +} diff --git a/src/seastar/include/seastar/core/do_with.hh b/src/seastar/include/seastar/core/do_with.hh new file mode 100644 index 000000000..758d75388 --- /dev/null +++ b/src/seastar/include/seastar/core/do_with.hh @@ -0,0 +1,153 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2015 Cloudius Systems, Ltd. + */ + +#pragma once + +#include +#include +#include +#include + +namespace seastar { + + +/// \cond internal + +namespace internal { + +template +class do_with_state final : public continuation_base_from_future::type { + HeldState _held; + typename Future::promise_type _pr; +public: + template + explicit do_with_state(T&&... args) : _held(std::forward(args)...) 
{} + virtual void run_and_dispose() noexcept override { + _pr.set_urgent_state(std::move(this->_state)); + delete this; + } + task* waiting_task() noexcept override { + return _pr.waiting_task(); + } + HeldState& data() { + return _held; + } + Future get_future() { + return _pr.get_future(); + } +}; + +} +/// \endcond + +namespace internal { +template +inline +auto +cherry_pick_tuple(std::index_sequence, Tuple&& tuple) { + return std::forward_as_tuple(std::get(std::forward(tuple))...); +} + +template +struct subtuple; + +template +struct subtuple> { + using type = std::tuple>...>; +}; + +template +inline +auto +do_with_impl(T1&& rv1, T2&& rv2, More&&... more) { + auto all = std::forward_as_tuple( + std::forward(rv1), + std::forward(rv2), + std::forward(more)...); + constexpr size_t nr = std::tuple_size::value - 1; + using idx = std::make_index_sequence; + auto&& just_values = cherry_pick_tuple(idx(), std::move(all)); + auto&& just_func = std::move(std::get(std::move(all))); + using value_tuple = typename subtuple::type; + using ret_type = decltype(std::apply(just_func, std::declval())); + auto task = std::apply( + [](auto&&... x) { + return std::make_unique>(std::forward(x)...); + }, + std::move(just_values)); + auto fut = std::apply(just_func, task->data()); + if (fut.available()) { + return fut; + } + auto ret = task->get_future(); + internal::set_callback(fut, task.release()); + return ret; +} +} + +/// \addtogroup future-util +/// @{ + +/// do_with() holds a objects alive until a future completes, and +/// allow the code involved in making the future complete to have easy +/// access to this object. +/// +/// do_with() takes multiple arguments: The last is a function +/// returning a future. The other are temporary objects (rvalue). The +/// function is given (a moved copy of) these temporary object, by +/// reference, and it is ensured that the objects will not be +/// destructed until the completion of the future returned by the +/// function. 
+/// +/// do_with() returns a future which resolves to whatever value the given future +/// (returned by the given function) resolves to. This returned value must not +/// contain references to the temporary object, as at that point the temporary +/// is destructed. +/// +/// \return whatever the function returns +template +inline +auto +do_with(T1&& rv1, T2&& rv2, More&&... more) noexcept { + auto func = internal::do_with_impl; + return futurize_invoke(func, std::forward(rv1), std::forward(rv2), std::forward(more)...); +} + +/// Executes the function \c func making sure the lock \c lock is taken, +/// and later on properly released. +/// +/// \param lock the lock, which is any object having providing a lock() / unlock() semantics. +/// Caller must make sure that it outlives \c func. +/// \param func function to be executed +/// \returns whatever \c func returns +template +inline +auto with_lock(Lock& lock, Func&& func) { + return lock.lock().then([&lock, func = std::forward(func)] () mutable { + return futurize_invoke(func).finally([&lock] { + lock.unlock(); + }); + }); +} + +/// @} + +} diff --git a/src/seastar/include/seastar/core/dpdk_rte.hh b/src/seastar/include/seastar/core/dpdk_rte.hh new file mode 100644 index 000000000..24c3d97a5 --- /dev/null +++ b/src/seastar/include/seastar/core/dpdk_rte.hh @@ -0,0 +1,64 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#pragma once + +#ifdef SEASTAR_HAVE_DPDK + +#include +#include +#include +#include +#include + +/*********************** Compat section ***************************************/ +// We currently support only versions 2.0 and above. +#if (RTE_VERSION < RTE_VERSION_NUM(2,0,0,0)) +#error "DPDK version above 2.0.0 is required" +#endif + +#if defined(RTE_MBUF_REFCNT_ATOMIC) +#warning "CONFIG_RTE_MBUF_REFCNT_ATOMIC should be disabled in DPDK's " \ + "config/common_linuxapp" +#endif +/******************************************************************************/ + +namespace seastar { + +namespace dpdk { + +// DPDK Environment Abstraction Layer +class eal { +public: + using cpuset = std::bitset; + + static void init(cpuset cpus, boost::program_options::variables_map opts); + /** + * Returns the amount of memory needed for DPDK + * @param num_cpus Number of CPUs the application is going to use + * + * @return + */ + static size_t mem_size(int num_cpus, bool hugetlbfs_membackend = true); + static bool initialized; +}; + +} // namespace dpdk + +} + +#endif // SEASTAR_HAVE_DPDK diff --git a/src/seastar/include/seastar/core/enum.hh b/src/seastar/include/seastar/core/enum.hh new file mode 100644 index 000000000..1ea342322 --- /dev/null +++ b/src/seastar/include/seastar/core/enum.hh @@ -0,0 +1,46 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. 
+ * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2015 Cloudius Systems, Ltd. + */ + +#pragma once + +/* + * This header file defines a hash function for enum types, using the + * standard hash function of the underlying type (such as int). This makes + * it possible to inherit from this type to + */ + +#include +#include +#include + +namespace seastar { + +template +class enum_hash { + static_assert(std::is_enum::value, "must be an enum"); +public: + std::size_t operator()(const T& e) const { + using utype = typename std::underlying_type::type; + return std::hash()(static_cast(e)); + } +}; + +} diff --git a/src/seastar/include/seastar/core/exception_hacks.hh b/src/seastar/include/seastar/core/exception_hacks.hh new file mode 100644 index 000000000..5a8d54035 --- /dev/null +++ b/src/seastar/include/seastar/core/exception_hacks.hh @@ -0,0 +1,26 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2017 ScyllaDB + */ + +#pragma once + +namespace seastar { +void init_phdr_cache(); +} diff --git a/src/seastar/include/seastar/core/execution_stage.hh b/src/seastar/include/seastar/core/execution_stage.hh new file mode 100644 index 000000000..f3e4a7310 --- /dev/null +++ b/src/seastar/include/seastar/core/execution_stage.hh @@ -0,0 +1,543 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2017 ScyllaDB Ltd. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace seastar { + +/// \defgroup execution-stages Execution Stages +/// +/// \brief +/// Execution stages provide an infrastructure for processing function calls in +/// batches in order to improve instruction cache locality. +/// +/// When the application logic becomes more and more complex and the length +/// of the data processing pipeline grows it may happen that the most +/// significant bottleneck are instruction cache misses. 
The solution for that +/// problem may be processing similar operations in batches so that instruction +/// cache locality is improved at the cost of potentially higher latencies and +/// worse data cache locality. +/// +/// Execution stages allow batching calls to the specified function object. +/// Every time concrete_execution_stage::operator()() is used the function call +/// is added to the queue and a future is returned. Once the number of queued +/// calls reaches certain threshold the stage is flushed and a task is which +/// would execute these function calls is scheduled. Execution stages are also +/// flushed when the reactor polls for events. +/// +/// When calling a function that is wrapped inside execution stage it is +/// important to remember that the actual function call will happen at some +/// later time and it has to be guaranteed the objects passed by lvalue +/// reference are still alive. In order to avoid accidental passing of a +/// temporary object by lvalue reference the interface of execution stages +/// accepts only lvalue references wrapped in reference_wrapper. It is safe to +/// pass rvalue references, they are decayed and the objects are moved. See +/// concrete_execution_stage::operator()() for more details. + +/// \addtogroup execution-stages +/// @{ + +/// \cond internal +namespace internal { + +// Execution wraps lreferences in reference_wrapper so that the caller is forced +// to use seastar::ref(). Then when the function is actually called the +// reference is unwrapped. However, we need to distinguish between functions +// which argument is lvalue reference and functions that take +// reference_wrapper<> as an argument and not unwrap the latter. To solve this +// issue reference_wrapper_for_es type is used for wrappings done automatically +// by execution stage. 
+template +struct reference_wrapper_for_es : reference_wrapper { + reference_wrapper_for_es(reference_wrapper rw) noexcept + : reference_wrapper(std::move(rw)) {} +}; + +template +struct wrap_for_es { + using type = T; +}; + +template +struct wrap_for_es { + using type = reference_wrapper_for_es; +}; + +template +struct wrap_for_es { + using type = T; +}; + +template +decltype(auto) unwrap_for_es(T&& object) { + return std::forward(object); +} + +template +std::reference_wrapper unwrap_for_es(reference_wrapper_for_es ref) { + return std::reference_wrapper(ref.get()); +} + +} +/// \endcond + +/// Base execution stage class +class execution_stage { +public: + struct stats { + uint64_t tasks_scheduled = 0; + uint64_t tasks_preempted = 0; + uint64_t function_calls_enqueued = 0; + uint64_t function_calls_executed = 0; + }; +protected: + bool _empty = true; + bool _flush_scheduled = false; + scheduling_group _sg; + stats _stats; + sstring _name; + metrics::metric_group _metric_group; +protected: + virtual void do_flush() noexcept = 0; +public: + explicit execution_stage(const sstring& name, scheduling_group sg = {}); + virtual ~execution_stage(); + + execution_stage(const execution_stage&) = delete; + + /// Move constructor + /// + /// \warning It is illegal to move execution_stage after any operation has + /// been pushed to it. The only reason why the move constructor is not + /// deleted is the fact that C++14 does not guarantee return value + /// optimisation which is required by make_execution_stage(). + execution_stage(execution_stage&&); + + /// Returns execution stage name + const sstring& name() const noexcept { return _name; } + + /// Returns execution stage usage statistics + const stats& get_stats() const noexcept { return _stats; } + + /// Flushes execution stage + /// + /// Ensures that a task which would execute all queued operations is + /// scheduled. Does not schedule a new task if there is one already pending + /// or the queue is empty. 
+ /// + /// \return true if a new task has been scheduled + bool flush() noexcept; + + /// Checks whether there are pending operations. + /// + /// \return true if there is at least one queued operation + bool poll() const noexcept { + return !_empty; + } +}; + +/// \cond internal +namespace internal { + +class execution_stage_manager { + std::vector _execution_stages; + std::unordered_map _stages_by_name; +private: + execution_stage_manager() = default; + execution_stage_manager(const execution_stage_manager&) = delete; + execution_stage_manager(execution_stage_manager&&) = delete; +public: + void register_execution_stage(execution_stage& stage); + void unregister_execution_stage(execution_stage& stage) noexcept; + void update_execution_stage_registration(execution_stage& old_es, execution_stage& new_es) noexcept; + execution_stage* get_stage(const sstring& name); + bool flush() noexcept; + bool poll() const noexcept; +public: + static execution_stage_manager& get() noexcept; +}; + +} +/// \endcond + +/// \brief Concrete execution stage class +/// +/// \note The recommended way of creating execution stages is to use +/// make_execution_stage(). 
+/// +/// \tparam ReturnType return type of the function object +/// \tparam Args argument pack containing arguments to the function object, needs +/// to have move constructor that doesn't throw +template +SEASTAR_CONCEPT(requires std::is_nothrow_move_constructible>::value) +class concrete_execution_stage final : public execution_stage { + using args_tuple = std::tuple; + static_assert(std::is_nothrow_move_constructible::value, + "Function arguments need to be nothrow move constructible"); + + static constexpr size_t flush_threshold = 128; + static constexpr size_t max_queue_length = 1024; + + using return_type = futurize_t; + using promise_type = typename return_type::promise_type; + using input_type = typename tuple_map_types::type; + + struct work_item { + input_type _in; + promise_type _ready; + + work_item(typename internal::wrap_for_es::type... args) : _in(std::move(args)...) { } + + work_item(work_item&& other) = delete; + work_item(const work_item&) = delete; + work_item(work_item&) = delete; + }; + chunked_fifo _queue; + + noncopyable_function _function; +private: + auto unwrap(input_type&& in) { + return tuple_map(std::move(in), [] (auto&& obj) { + return internal::unwrap_for_es(std::forward(obj)); + }); + } + + virtual void do_flush() noexcept override { + while (!_queue.empty()) { + auto& wi = _queue.front(); + auto wi_in = std::move(wi._in); + auto wi_ready = std::move(wi._ready); + _queue.pop_front(); + futurize::apply(_function, unwrap(std::move(wi_in))).forward_to(std::move(wi_ready)); + _stats.function_calls_executed++; + + if (need_preempt()) { + _stats.tasks_preempted++; + break; + } + } + _empty = _queue.empty(); + } +public: + explicit concrete_execution_stage(const sstring& name, scheduling_group sg, noncopyable_function f) + : execution_stage(name, sg) + , _function(std::move(f)) + { + _queue.reserve(flush_threshold); + } + explicit concrete_execution_stage(const sstring& name, noncopyable_function f) + : concrete_execution_stage(name, 
scheduling_group(), std::move(f)) { + } + + /// Enqueues a call to the stage's function + /// + /// Adds a function call to the queue. Objects passed by value are moved, + /// rvalue references are decayed and the objects are moved, lvalue + /// references need to be explicitly wrapped using seastar::ref(). + /// + /// Usage example: + /// ``` + /// void do_something(int&, int, std::vector&&); + /// thread_local auto stage = seastar::make_execution_stage("execution-stage", do_something); + /// + /// int global_value; + /// + /// future<> func(std::vector vec) { + /// //return stage(global_value, 42, std::move(vec)); // fail: use seastar::ref to pass references + /// return stage(seastar::ref(global_value), 42, std::move(vec)); // ok + /// } + /// ``` + /// + /// \param args arguments passed to the stage's function + /// \return future containing the result of the call to the stage's function + return_type operator()(typename internal::wrap_for_es::type... args) { + if (_queue.size() >= max_queue_length) { + do_flush(); + } + _queue.emplace_back(std::move(args)...); + _empty = false; + _stats.function_calls_enqueued++; + auto f = _queue.back()._ready.get_future(); + flush(); + return f; + } +}; + +/// \brief Base class for execution stages with support for automatic \ref scheduling_group inheritance +class inheriting_execution_stage { +public: + struct per_scheduling_group_stats { + scheduling_group sg; + execution_stage::stats stats; + }; + using stats = boost::container::static_vector; +}; + +/// \brief Concrete execution stage class, with support for automatic \ref scheduling_group inheritance +/// +/// A variation of \ref concrete_execution_stage that inherits the \ref scheduling_group +/// from the caller. Each call (of `operator()`) can be in its own scheduling group. 
+/// +/// \tparam ReturnType return type of the function object +/// \tparam Args argument pack containing arguments to the function object, needs +/// to have move constructor that doesn't throw +template +SEASTAR_CONCEPT(requires std::is_nothrow_move_constructible>::value) +class inheriting_concrete_execution_stage final : public inheriting_execution_stage { + using return_type = futurize_t; + using args_tuple = std::tuple; + using per_group_stage_type = concrete_execution_stage; + + static_assert(std::is_nothrow_move_constructible::value, + "Function arguments need to be nothrow move constructible"); + + sstring _name; + noncopyable_function _function; + std::vector> _stage_for_group{max_scheduling_groups()}; +private: + per_group_stage_type make_stage_for_group(scheduling_group sg) { + // We can't use std::ref(function), because reference_wrapper decays to noncopyable_function& and + // that selects the noncopyable_function copy constructor. Use a lambda instead. + auto wrapped_function = [&_function = _function] (Args... args) { + return _function(std::forward(args)...); + }; + auto name = fmt::format("{}.{}", _name, sg.name()); + return per_group_stage_type(name, sg, wrapped_function); + } +public: + /// Construct an inheriting concrete execution stage. + /// + /// \param name A name for the execution stage; must be unique + /// \param f Function to be called in response to operator(). The function + /// call will be deferred and batched with similar calls to increase + /// instruction cache hit rate. + inheriting_concrete_execution_stage(const sstring& name, noncopyable_function f) + : _name(std::move(name)),_function(std::move(f)) { + } + + /// Enqueues a call to the stage's function + /// + /// Adds a function call to the queue. Objects passed by value are moved, + /// rvalue references are decayed and the objects are moved, lvalue + /// references need to be explicitly wrapped using seastar::ref(). 
+ /// + /// The caller's \ref scheduling_group will be preserved across the call. + /// + /// Usage example: + /// ``` + /// void do_something(int); + /// thread_local auto stage = seastar::inheriting_concrete_execution_stage("execution-stage", do_something); + /// + /// future<> func(int x) { + /// return stage(x); + /// } + /// ``` + /// + /// \param args arguments passed to the stage's function + /// \return future containing the result of the call to the stage's function + return_type operator()(typename internal::wrap_for_es::type... args) { + auto sg = current_scheduling_group(); + auto sg_id = internal::scheduling_group_index(sg); + auto& slot = _stage_for_group[sg_id]; + if (!slot) { + slot.emplace(make_stage_for_group(sg)); + } + return (*slot)(std::move(args)...); + } + + /// Returns summary of individual execution stage usage statistics + /// + /// \returns a vector of the stats of the individual per-scheduling group + /// executation stages. Each element in the vector is a pair composed of + /// the scheduling group and the stats for the respective execution + /// stage. Scheduling groups that have had no respective calls enqueued + /// yet are omitted. + inheriting_execution_stage::stats get_stats() const noexcept { + inheriting_execution_stage::stats summary; + for (unsigned sg_id = 0; sg_id != _stage_for_group.size(); ++sg_id) { + auto sg = internal::scheduling_group_from_index(sg_id); + if (_stage_for_group[sg_id]) { + summary.push_back({sg, _stage_for_group[sg_id]->get_stats()}); + } + } + return summary; + } +}; + + +/// \cond internal +namespace internal { + +template +struct concrete_execution_stage_helper; + +template +struct concrete_execution_stage_helper> { + using type = concrete_execution_stage; +}; + +} +/// \endcond + +/// Creates a new execution stage +/// +/// Wraps given function object in a concrete_execution_stage. All arguments +/// of the function object are required to have move constructors that do not +/// throw. 
Function object may return a future or an immediate object or void. +/// +/// Moving execution stages is discouraged and illegal after first function +/// call is enqueued. +/// +/// Usage example: +/// ``` +/// double do_something(int); +/// thread_local auto stage1 = seastar::make_execution_stage("execution-stage1", do_something); +/// +/// future func1(int val) { +/// return stage1(val); +/// } +/// +/// future do_some_io(int); +/// thread_local auto stage2 = seastar::make_execution_stage("execution-stage2", do_some_io); +/// +/// future func2(int val) { +/// return stage2(val); +/// } +/// ``` +/// +/// \param name unique name of the execution stage +/// \param sg scheduling group to run under +/// \param fn function to be executed by the stage +/// \return concrete_execution_stage +/// +template +auto make_execution_stage(const sstring& name, scheduling_group sg, Function&& fn) { + using traits = function_traits; + using ret_type = typename traits::return_type; + using args_as_tuple = typename traits::args_as_tuple; + using concrete_execution_stage = typename internal::concrete_execution_stage_helper::type; + return concrete_execution_stage(name, sg, std::forward(fn)); +} + +/// Creates a new execution stage (variant taking \ref scheduling_group) +/// +/// Wraps given function object in a concrete_execution_stage. All arguments +/// of the function object are required to have move constructors that do not +/// throw. Function object may return a future or an immediate object or void. +/// +/// Moving execution stages is discouraged and illegal after first function +/// call is enqueued. 
+/// +/// Usage example: +/// ``` +/// double do_something(int); +/// thread_local auto stage1 = seastar::make_execution_stage("execution-stage1", do_something); +/// +/// future func1(int val) { +/// return stage1(val); +/// } +/// +/// future do_some_io(int); +/// thread_local auto stage2 = seastar::make_execution_stage("execution-stage2", do_some_io); +/// +/// future func2(int val) { +/// return stage2(val); +/// } +/// ``` +/// +/// \param name unique name of the execution stage (variant not taking \ref scheduling_group) +/// \param fn function to be executed by the stage +/// \return concrete_execution_stage +/// +template +auto make_execution_stage(const sstring& name, Function&& fn) { + return make_execution_stage(name, scheduling_group(), std::forward(fn)); +} + +/// Creates a new execution stage from a member function +/// +/// Wraps a pointer to member function in a concrete_execution_stage. When +/// a function call is pushed to the stage the first argument should be a +/// pointer to the object the function is a member of. +/// +/// Usage example: +/// ``` +/// struct foo { +/// void do_something(int); +/// }; +/// +/// thread_local auto stage = seastar::make_execution_stage("execution-stage", &foo::do_something); +/// +/// future<> func(foo& obj, int val) { +/// return stage(&obj, val); +/// } +/// ``` +/// +/// \see make_execution_stage(const sstring&, Function&&) +/// \param name unique name of the execution stage +/// \param fn member function to be executed by the stage +/// \return concrete_execution_stage +template +concrete_execution_stage +make_execution_stage(const sstring& name, scheduling_group sg, Ret (Object::*fn)(Args...)) { + return concrete_execution_stage(name, sg, std::mem_fn(fn)); +} + +template +concrete_execution_stage +make_execution_stage(const sstring& name, scheduling_group sg, Ret (Object::*fn)(Args...) 
const) { + return concrete_execution_stage(name, sg, std::mem_fn(fn)); +} + +template +concrete_execution_stage +make_execution_stage(const sstring& name, Ret (Object::*fn)(Args...)) { + return make_execution_stage(name, scheduling_group(), fn); +} + +template +concrete_execution_stage +make_execution_stage(const sstring& name, Ret (Object::*fn)(Args...) const) { + return make_execution_stage(name, scheduling_group(), fn); +} + +/// @} + +} diff --git a/src/seastar/include/seastar/core/expiring_fifo.hh b/src/seastar/include/seastar/core/expiring_fifo.hh new file mode 100644 index 000000000..acc747c29 --- /dev/null +++ b/src/seastar/include/seastar/core/expiring_fifo.hh @@ -0,0 +1,217 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2016 ScyllaDB + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace seastar { + +template +struct dummy_expiry { + void operator()(T&) noexcept {}; +}; + +template +struct promise_expiry { + void operator()(promise& pr) noexcept { + pr.set_exception(std::make_exception_ptr(timed_out_error())); + }; +}; + +/// Container for elements with support for expiration of entries. 
+/// +/// OnExpiry is a functor which will be called with a reference to T right before it expires. +/// T is removed and destroyed from the container immediately after OnExpiry returns. +/// OnExpiry callback must not modify the container, it can only modify its argument. +/// +/// The container can only be moved before any elements are pushed. +/// +template , typename Clock = lowres_clock> +class expiring_fifo { +public: + using clock = Clock; + using time_point = typename Clock::time_point; +private: + struct entry { + std::optional payload; // disengaged means that it's expired + timer tr; + entry(T&& payload_) : payload(std::move(payload_)) {} + entry(const T& payload_) : payload(payload_) {} + entry(T payload_, expiring_fifo& ef, time_point timeout) + : payload(std::move(payload_)) + , tr([this, &ef] { + ef._on_expiry(*payload); + payload = std::nullopt; + --ef._size; + ef.drop_expired_front(); + }) + { + tr.arm(timeout); + } + entry(entry&& x) = delete; + entry(const entry& x) = delete; + }; + + // If engaged, represents the first element. + // This is to avoid large allocations done by chunked_fifo for single-element cases. + // expiring_fifo is used to implement wait lists in synchronization primitives + // and in some uses it's common to have at most one waiter. + std::unique_ptr _front; + + // There is an invariant that the front element is never expired. + chunked_fifo _list; + OnExpiry _on_expiry; + size_t _size = 0; + + // Ensures that front() is not expired by dropping expired elements from the front. 
+ void drop_expired_front() noexcept { + while (!_list.empty() && !_list.front().payload) { + _list.pop_front(); + } + if (_front && !_front->payload) { + _front.reset(); + } + } +public: + expiring_fifo() noexcept = default; + expiring_fifo(OnExpiry on_expiry) noexcept(std::is_nothrow_move_constructible_v) : _on_expiry(std::move(on_expiry)) {} + + expiring_fifo(expiring_fifo&& o) noexcept + : expiring_fifo(std::move(o._on_expiry)) { + // entry objects hold a reference to this so non-empty containers cannot be moved. + assert(o._size == 0); + } + + expiring_fifo& operator=(expiring_fifo&& o) noexcept { + if (this != &o) { + this->~expiring_fifo(); + new (this) expiring_fifo(std::move(o)); + } + return *this; + } + + /// Checks if container contains any elements + /// + /// \note Inside OnExpiry callback, the expired element is still contained. + /// + /// \return true if and only if there are any elements contained. + bool empty() const noexcept { + return _size == 0; + } + + /// Equivalent to !empty() + explicit operator bool() const noexcept { + return !empty(); + } + + /// Returns a reference to the element in the front. + /// Valid only when !empty(). + T& front() noexcept { + if (_front) { + return *_front->payload; + } + return *_list.front().payload; + } + + /// Returns a reference to the element in the front. + /// Valid only when !empty(). + const T& front() const noexcept { + if (_front) { + return *_front->payload; + } + return *_list.front().payload; + } + + /// Returns the number of elements contained. + /// + /// \note Expired elements are not contained. Expiring element is still contained when OnExpiry is called. + size_t size() const noexcept { + return _size; + } + + /// Reserves storage in the container for at least 'size' elements. + /// Note that expired elements may also take space when they are not in the front of the queue. + /// + /// Doesn't give any guarantees about exception safety of subsequent push_back(). 
+ void reserve(size_t size) { + return _list.reserve(size); + } + + /// Adds element to the back of the queue. + /// The element will never expire. + void push_back(const T& payload) { + if (_size == 0) { + _front = std::make_unique(payload); + } else { + _list.emplace_back(payload); + } + ++_size; + } + + /// Adds element to the back of the queue. + /// The element will never expire. + void push_back(T&& payload) { + if (_size == 0) { + _front = std::make_unique(std::move(payload)); + } else { + _list.emplace_back(std::move(payload)); + } + ++_size; + } + + /// Adds element to the back of the queue. + /// The element will expire when timeout is reached, unless it is time_point::max(), in which + /// case it never expires. + void push_back(T&& payload, time_point timeout) { + if (timeout == time_point::max()) { + push_back(std::move(payload)); + return; + } + if (_size == 0) { + _front = std::make_unique(std::move(payload), *this, timeout); + } else { + _list.emplace_back(std::move(payload), *this, timeout); + } + ++_size; + } + + /// Removes the element at the front. + /// Can be called only if !empty(). + void pop_front() noexcept { + if (_front) { + _front.reset(); + } else { + _list.pop_front(); + } + --_size; + drop_expired_front(); + } +}; + +} diff --git a/src/seastar/include/seastar/core/fair_queue.hh b/src/seastar/include/seastar/core/fair_queue.hh new file mode 100644 index 000000000..85e47b2aa --- /dev/null +++ b/src/seastar/include/seastar/core/fair_queue.hh @@ -0,0 +1,247 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. 
+ * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (C) 2016 ScyllaDB + */ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace seastar { + +/// \brief describes a request that passes through the \ref fair_queue. +/// +/// A ticket is specified by a \c weight and a \c size. For example, one can specify a request of \c weight +/// 1 and \c size 16kB. If the \ref fair_queue accepts one such request per second, it will sustain 1 IOPS +/// at 16kB/s bandwidth. +/// +/// \related fair_queue +class fair_queue_ticket { + uint32_t _weight = 0; ///< the total weight of these requests for capacity purposes (IOPS). 
+ uint32_t _size = 0; ///< the total effective size of these requests +public: + /// Constructs a fair_queue_ticket with a given \c weight and a given \c size + /// + /// \param weight the weight of the request + /// \param size the size of the request + fair_queue_ticket(uint32_t weight, uint32_t size); + fair_queue_ticket() {} + fair_queue_ticket operator+(fair_queue_ticket desc) const; + fair_queue_ticket operator-(fair_queue_ticket desc) const; + /// Increase the quantity represented in this ticket by the amount represented by \c desc + /// \param desc another \ref fair_queue_ticket whose \c weight and \c size will be added to this one + fair_queue_ticket& operator+=(fair_queue_ticket desc); + /// Decreases the quantity represented in this ticket by the amount represented by \c desc + /// \param desc another \ref fair_queue_ticket whose \c weight and \c size will be decremented from this one + fair_queue_ticket& operator-=(fair_queue_ticket desc); + + /// \returns true if this fair_queue_ticket is strictly less than \c rhs. + /// + /// For a fair_queue_ticket to be considered strictly less than another, both its quantities need to be + /// less than the other. Note that there is no total ordering between two fair_queue_tickets + /// + /// \param rhs another \ref fair_queue_ticket to be compared to this one. + bool strictly_less(fair_queue_ticket rhs) const; + + /// \returns true if the fair_queue_ticket represents a non-zero quantity.
+ /// + /// For a fair_queue ticket to be non-zero, at least one of its represented quantities needs to + /// be non-zero + explicit operator bool() const; + + friend std::ostream& operator<<(std::ostream& os, fair_queue_ticket t); + + /// \returns the normalized value of this \ref fair_queue_ticket along a base axis + /// + /// The normalization function itself is an implementation detail, but one can expect either weight or + /// size to have more or less relative importance depending on which of the dimensions in the + /// denominator is relatively higher. For example, given this request a, and two other requests + /// b and c, such that c has the same \c weight but a higher \c size than b, one can expect + /// the \c size component of this request to play a larger role. + /// + /// It is legal for the numerator to have one of the quantities set to zero, in which case only + /// the other quantity is taken into consideration. + /// + /// It is however not legal for the axis to have any quantity set to zero. + /// \param axis another \ref fair_queue_ticket to be used as a base vector against which to normalize this fair_queue_ticket.
+ float normalize(fair_queue_ticket axis) const; +}; + +/// \addtogroup io-module +/// @{ + +/// \cond internal +class priority_class { + struct request { + noncopyable_function func; + fair_queue_ticket desc; + }; + friend class fair_queue; + uint32_t _shares = 0; + float _accumulated = 0; + circular_buffer _queue; + bool _queued = false; + + friend struct shared_ptr_no_esft; + explicit priority_class(uint32_t shares) noexcept : _shares(std::max(shares, 1u)) {} + +public: + /// \brief return the current amount of shares for this priority class + uint32_t shares() const noexcept { + return _shares; + } + + void update_shares(uint32_t shares) noexcept { + _shares = (std::max(shares, 1u)); + } +}; +/// \endcond + +/// \brief Priority class, to be used with a given \ref fair_queue +/// +/// An instance of this class is associated with a given \ref fair_queue. When registering +/// a class, the caller will receive a \ref lw_shared_ptr to an object of this class. All its methods +/// are private, so the only thing the caller is expected to do with it is to pass it later +/// to the \ref fair_queue to identify a given class. +/// +/// \related fair_queue +using priority_class_ptr = lw_shared_ptr; + +/// \brief Fair queuing class +/// +/// This is a fair queue, allowing multiple request producers to queue requests +/// that will then be served proportionally to their classes' shares. +/// +/// To each request, a weight can also be associated. A request of weight 1 will consume +/// 1 share. Higher weights for a request will consume a proportionally higher amount of +/// shares. +/// +/// The user of this interface is expected to register multiple `priority_class` +/// objects, which will each have a shares attribute. +/// +/// Internally, each priority class may keep a separate queue of requests. +/// Requests pertaining to a class can go through even if they are over its +/// share limit, provided that the other classes have empty queues. 
+/// +/// When the classes that lag behind start seeing requests, the fair queue will serve +/// them first, until balance is restored. This balancing is expected to happen within +/// a certain time window that obeys an exponential decay. +class fair_queue { +public: + /// \brief Fair Queue configuration structure. + /// + /// \sets the operation parameters of a \ref fair_queue + /// \related fair_queue + struct config { + std::chrono::microseconds tau = std::chrono::milliseconds(100); + unsigned max_req_count = std::numeric_limits::max(); + unsigned max_bytes_count = std::numeric_limits::max(); + }; +private: + friend priority_class; + + struct class_compare { + bool operator() (const priority_class_ptr& lhs, const priority_class_ptr& rhs) const { + return lhs->_accumulated > rhs->_accumulated; + } + }; + + config _config; + fair_queue_ticket _maximum_capacity; + fair_queue_ticket _current_capacity; + fair_queue_ticket _resources_executing; + fair_queue_ticket _resources_queued; + unsigned _requests_executing = 0; + unsigned _requests_queued = 0; + using clock_type = std::chrono::steady_clock::time_point; + clock_type _base; + using prioq = std::priority_queue, class_compare>; + prioq _handles; + std::unordered_set _all_classes; + + void push_priority_class(priority_class_ptr pc); + + priority_class_ptr pop_priority_class(); + + float normalize_factor() const; + + void normalize_stats(); + + bool can_dispatch() const; +public: + /// Constructs a fair queue with configuration parameters \c cfg. + /// + /// \param cfg an instance of the class \ref config + explicit fair_queue(config cfg); + + /// Constructs a fair queue with a given \c capacity, expressed in IOPS. + /// + /// \param capacity how many concurrent requests are allowed in this queue. 
+ /// \param tau the queue exponential decay parameter, as in exp(-1/tau * t) + explicit fair_queue(unsigned capacity, std::chrono::microseconds tau = std::chrono::milliseconds(100)) + : fair_queue(config{tau, capacity}) {} + + /// Registers a priority class against this fair queue. + /// + /// \param shares how many shares to create this class with + priority_class_ptr register_priority_class(uint32_t shares); + + /// Unregister a priority class. + /// + /// It is illegal to unregister a priority class that still has pending requests. + void unregister_priority_class(priority_class_ptr pclass); + + /// \return how many waiters are currently queued for all classes. + [[deprecated("fair_queue users should not track individual requests, but resources (weight, size) passing through the queue")]] + size_t waiters() const; + + /// \return the number of requests currently executing + [[deprecated("fair_queue users should not track individual requests, but resources (weight, size) passing through the queue")]] + size_t requests_currently_executing() const; + + /// \return how much resources (weight, size) are currently queued for all classes. + fair_queue_ticket resources_currently_waiting() const; + + /// \return the amount of resources (weight, size) currently executing + fair_queue_ticket resources_currently_executing() const; + + /// Queue the function \c func through this class' \ref fair_queue, with weight \c weight + /// + /// It is expected that \c func doesn't throw. If it does throw, it will be just removed from + /// the queue and discarded. + /// + /// The user of this interface is supposed to call \ref notify_requests_finished when the + /// request finishes executing - regardless of success or failure. + void queue(priority_class_ptr pc, fair_queue_ticket desc, noncopyable_function func); + + /// Notifies that one request finished + /// \param desc an instance of \c fair_queue_ticket structure describing the request that just finished.
+ void notify_requests_finished(fair_queue_ticket desc, unsigned nr = 1) noexcept; + + /// Try to execute new requests if there is capacity left in the queue. + void dispatch_requests(); +}; +/// @} + +} diff --git a/src/seastar/include/seastar/core/file-types.hh b/src/seastar/include/seastar/core/file-types.hh new file mode 100644 index 000000000..38052c897 --- /dev/null +++ b/src/seastar/include/seastar/core/file-types.hh @@ -0,0 +1,140 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright 2015 Cloudius Systems + */ + +#pragma once + +#include +#include +#include + +namespace seastar { + +/// \addtogroup fileio-module +/// @{ + +/// Enumeration describing how a file is to be opened. 
+/// +/// \see file::open_file_dma() +enum class open_flags { + rw = O_RDWR, + ro = O_RDONLY, + wo = O_WRONLY, + create = O_CREAT, + truncate = O_TRUNC, + exclusive = O_EXCL, + dsync = O_DSYNC, +}; + +inline open_flags operator|(open_flags a, open_flags b) { + return open_flags(std::underlying_type_t(a) | std::underlying_type_t(b)); +} + +inline void operator|=(open_flags& a, open_flags b) { + a = (a | b); +} + +inline open_flags operator&(open_flags a, open_flags b) { + return open_flags(std::underlying_type_t(a) & std::underlying_type_t(b)); +} + +inline void operator&=(open_flags& a, open_flags b) { + a = (a & b); +} + +/// Enumeration describing the type of a directory entry being listed. +/// +/// \see file::list_directory() +enum class directory_entry_type { + unknown, + block_device, + char_device, + directory, + fifo, + link, + regular, + socket, +}; + +/// Enumeration describing the type of a particular filesystem +enum class fs_type { + other, + xfs, + ext2, + ext3, + ext4, + btrfs, + hfs, + tmpfs, +}; + +// Access flags for files/directories +enum class access_flags { + exists = F_OK, + read = R_OK, + write = W_OK, + execute = X_OK, + + // alias for directory access + lookup = execute, +}; + +inline access_flags operator|(access_flags a, access_flags b) { + return access_flags(std::underlying_type_t(a) | std::underlying_type_t(b)); +} + +inline access_flags operator&(access_flags a, access_flags b) { + return access_flags(std::underlying_type_t(a) & std::underlying_type_t(b)); +} + +// Permissions for files/directories +enum class file_permissions { + user_read = S_IRUSR, // Read by owner + user_write = S_IWUSR, // Write by owner + user_execute = S_IXUSR, // Execute by owner + + group_read = S_IRGRP, // Read by group + group_write = S_IWGRP, // Write by group + group_execute = S_IXGRP, // Execute by group + + others_read = S_IROTH, // Read by others + others_write = S_IWOTH, // Write by others + others_execute = S_IXOTH, // Execute by others + + 
user_permissions = user_read | user_write | user_execute, + group_permissions = group_read | group_write | group_execute, + others_permissions = others_read | others_write | others_execute, + all_permissions = user_permissions | group_permissions | others_permissions, + + default_file_permissions = user_read | user_write | group_read | group_write | others_read | others_write, // 0666 + default_dir_permissions = all_permissions, // 0777 +}; + +inline constexpr file_permissions operator|(file_permissions a, file_permissions b) { + return file_permissions(std::underlying_type_t(a) | std::underlying_type_t(b)); +} + +inline constexpr file_permissions operator&(file_permissions a, file_permissions b) { + return file_permissions(std::underlying_type_t(a) & std::underlying_type_t(b)); +} + +/// @} + +} // namespace seastar diff --git a/src/seastar/include/seastar/core/file.hh b/src/seastar/include/seastar/core/file.hh new file mode 100644 index 000000000..74a56cb1d --- /dev/null +++ b/src/seastar/include/seastar/core/file.hh @@ -0,0 +1,586 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +/* + * Copyright 2015 Cloudius Systems + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace seastar { + +/// \addtogroup fileio-module +/// @{ + +/// A directory entry being listed. +struct directory_entry { + /// Name of the file in a directory entry. Will never be "." or "..". Only the last component is included. + sstring name; + /// Type of the directory entry, if known. + std::optional type; +}; + +/// Filesystem object stat information +struct stat_data { + uint64_t device_id; // ID of device containing file + uint64_t inode_number; // Inode number + uint64_t mode; // File type and mode + directory_entry_type type; + uint64_t number_of_links;// Number of hard links + uint64_t uid; // User ID of owner + uint64_t gid; // Group ID of owner + uint64_t rdev; // Device ID (if special file) + uint64_t size; // Total size, in bytes + uint64_t block_size; // Block size for filesystem I/O + uint64_t allocated_size; // Total size of allocated storage, in bytes + + std::chrono::system_clock::time_point time_accessed; // Time of last content access + std::chrono::system_clock::time_point time_modified; // Time of last content modification + std::chrono::system_clock::time_point time_changed; // Time of last status change (either content or attributes) +}; + +/// File open options +/// +/// Options used to configure an open file. 
+/// +/// \ref file +struct file_open_options { + uint64_t extent_allocation_size_hint = 1 << 20; ///< Allocate this much disk space when extending the file + bool sloppy_size = false; ///< Allow the file size not to track the amount of data written until a flush + uint64_t sloppy_size_hint = 1 << 20; ///< Hint as to what the eventual file size will be + file_permissions create_permissions = file_permissions::default_file_permissions; ///< File permissions to use when creating a file +}; + +/// \cond internal +class io_queue; +using io_priority_class_id = unsigned; +class io_priority_class { + io_priority_class_id _id; + friend io_queue; + + io_priority_class() = delete; + explicit io_priority_class(io_priority_class_id id) noexcept + : _id(id) + { } + +public: + io_priority_class_id id() const { + return _id; + } +}; + +const io_priority_class& default_priority_class(); + +class file; +class file_impl; + +class file_handle; + +// A handle that can be transported across shards and used to +// create a dup(2)-like `file` object referring to the same underlying file +class file_handle_impl { +public: + virtual ~file_handle_impl() = default; + virtual std::unique_ptr clone() const = 0; + virtual shared_ptr to_file() && = 0; +}; + +class file_impl { +protected: + static file_impl* get_file_impl(file& f); +public: + unsigned _memory_dma_alignment = 4096; + unsigned _disk_read_dma_alignment = 4096; + unsigned _disk_write_dma_alignment = 4096; +public: + virtual ~file_impl() {} + + virtual future write_dma(uint64_t pos, const void* buffer, size_t len, const io_priority_class& pc) = 0; + virtual future write_dma(uint64_t pos, std::vector iov, const io_priority_class& pc) = 0; + virtual future read_dma(uint64_t pos, void* buffer, size_t len, const io_priority_class& pc) = 0; + virtual future read_dma(uint64_t pos, std::vector iov, const io_priority_class& pc) = 0; + virtual future<> flush(void) = 0; + virtual future stat(void) = 0; + virtual future<> truncate(uint64_t 
length) = 0; + virtual future<> discard(uint64_t offset, uint64_t length) = 0; + virtual future<> allocate(uint64_t position, uint64_t length) = 0; + virtual future size(void) = 0; + virtual future<> close() = 0; + virtual std::unique_ptr dup(); + virtual subscription list_directory(std::function (directory_entry de)> next) = 0; + virtual future> dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc) = 0; + + friend class reactor; +}; + +future> make_file_impl(int fd, file_open_options options, int oflags) noexcept; + +/// \endcond + +/// A data file on persistent storage. +/// +/// File objects represent uncached, unbuffered files. As such great care +/// must be taken to cache data at the application layer; neither seastar +/// nor the OS will cache these file. +/// +/// Data is transferred using direct memory access (DMA). This imposes +/// restrictions on file offsets and data pointers. The former must be aligned +/// on a 4096 byte boundary, while a 512 byte boundary suffices for the latter. +class file { + shared_ptr _file_impl; +public: + /// Default constructor constructs an uninitialized file object. + /// + /// A default constructor is useful for the common practice of declaring + /// a variable, and only assigning to it later. The uninitialized file + /// must not be used, or undefined behavior will result (currently, a null + /// pointer dereference). + /// + /// One can check whether a file object is in uninitialized state with + /// \ref operator bool(); One can reset a file back to uninitialized state + /// by assigning file() to it. + file() noexcept : _file_impl(nullptr) {} + + file(shared_ptr impl) noexcept + : _file_impl(std::move(impl)) {} + + /// Constructs a file object from a \ref file_handle obtained from another shard + explicit file(file_handle&& handle) noexcept; + + /// Checks whether the file object was initialized. 
+ /// + /// \return false if the file object is uninitialized (default + /// constructed), true if the file object refers to an actual file. + explicit operator bool() const noexcept { return bool(_file_impl); } + + /// Copies a file object. The new and old objects refer to the + /// same underlying file. + /// + /// \param x file object to be copied + file(const file& x) = default; + /// Moves a file object. + file(file&& x) noexcept : _file_impl(std::move(x._file_impl)) {} + /// Assigns a file object. After assignent, the destination and source refer + /// to the same underlying file. + /// + /// \param x file object to assign to `this`. + file& operator=(const file& x) noexcept = default; + /// Moves assigns a file object. + file& operator=(file&& x) noexcept = default; + + // O_DIRECT reading requires that buffer, offset, and read length, are + // all aligned. Alignment of 4096 was necessary in the past, but no longer + // is - 512 is usually enough; But we'll need to use BLKSSZGET ioctl to + // be sure it is really enough on this filesystem. 4096 is always safe. + // In addition, if we start reading in things outside page boundaries, + // we will end up with various pages around, some of them with + // overlapping ranges. Those would be very challenging to cache. + + /// Alignment requirement for file offsets (for reads) + uint64_t disk_read_dma_alignment() const noexcept { + return _file_impl->_disk_read_dma_alignment; + } + + /// Alignment requirement for file offsets (for writes) + uint64_t disk_write_dma_alignment() const noexcept { + return _file_impl->_disk_write_dma_alignment; + } + + /// Alignment requirement for data buffers + uint64_t memory_dma_alignment() const noexcept { + return _file_impl->_memory_dma_alignment; + } + + + /** + * Perform a single DMA read operation. 
+ * + * @param aligned_pos offset to begin reading at (should be aligned) + * @param aligned_buffer output buffer (should be aligned) + * @param aligned_len number of bytes to read (should be aligned) + * @param pc the IO priority class under which to queue this operation + * + * Alignment is HW dependent but use 4KB alignment to be on the safe side as + * explained above. + * + * @return number of bytes actually read + * or exceptional future in case of I/O error + */ + template + future + dma_read(uint64_t aligned_pos, CharType* aligned_buffer, size_t aligned_len, const io_priority_class& pc = default_priority_class()) noexcept { + return dma_read_impl(aligned_pos, reinterpret_cast(aligned_buffer), aligned_len, pc); + } + + /** + * Read the requested amount of bytes starting from the given offset. + * + * @param pos offset to begin reading from + * @param len number of bytes to read + * @param pc the IO priority class under which to queue this operation + * + * @return temporary buffer containing the requested data. + * or exceptional future in case of I/O error + * + * This function doesn't require any alignment for both "pos" and "len" + * + * @note size of the returned buffer may be smaller than "len" if EOF is + * reached or in case of I/O error. + */ + template + future> dma_read(uint64_t pos, size_t len, const io_priority_class& pc = default_priority_class()) noexcept { + return dma_read_impl(pos, len, pc).then([] (temporary_buffer t) { + return temporary_buffer(reinterpret_cast(t.get_write()), t.size(), t.release()); + }); + } + + /// Error thrown when attempting to read past end-of-file + /// with \ref dma_read_exactly(). + class eof_error : public std::exception {}; + + /** + * Read the exact amount of bytes. 
+ * + * @param pos offset in a file to begin reading from + * @param len number of bytes to read + * @param pc the IO priority class under which to queue this operation + * + * @return temporary buffer containing the read data + * or exceptional future in case an error, holding: + * end_of_file_error if EOF is reached, file_io_error or + * std::system_error in case of I/O error. + */ + template + future> + dma_read_exactly(uint64_t pos, size_t len, const io_priority_class& pc = default_priority_class()) noexcept { + return dma_read_exactly_impl(pos, len, pc).then([] (temporary_buffer t) { + return temporary_buffer(reinterpret_cast(t.get_write()), t.size(), t.release()); + }); + } + + /// Performs a DMA read into the specified iovec. + /// + /// \param pos offset to read from. Must be aligned to \ref disk_read_dma_alignment. + /// \param iov vector of address/size pairs to read into. Addresses must be + /// aligned. + /// \param pc the IO priority class under which to queue this operation + /// + /// \return a future representing the number of bytes actually read. A short + /// read may happen due to end-of-file or an I/O error. + future dma_read(uint64_t pos, std::vector iov, const io_priority_class& pc = default_priority_class()) noexcept; + + /// Performs a DMA write from the specified buffer. + /// + /// \param pos offset to write into. Must be aligned to \ref disk_write_dma_alignment. + /// \param buffer aligned address of buffer to read from. Buffer must exists + /// until the future is made ready. + /// \param len number of bytes to write. Must be aligned. + /// \param pc the IO priority class under which to queue this operation + /// + /// \return a future representing the number of bytes actually written. A short + /// write may happen due to an I/O error. 
+ template + future dma_write(uint64_t pos, const CharType* buffer, size_t len, const io_priority_class& pc = default_priority_class()) noexcept { + return dma_write_impl(pos, reinterpret_cast(buffer), len, pc); + } + + /// Performs a DMA write to the specified iovec. + /// + /// \param pos offset to write into. Must be aligned to \ref disk_write_dma_alignment. + /// \param iov vector of address/size pairs to write from. Addresses must be + /// aligned. + /// \param pc the IO priority class under which to queue this operation + /// + /// \return a future representing the number of bytes actually written. A short + /// write may happen due to an I/O error. + future dma_write(uint64_t pos, std::vector iov, const io_priority_class& pc = default_priority_class()) noexcept; + + /// Causes any previously written data to be made stable on persistent storage. + /// + /// Prior to a flush, written data may or may not survive a power failure. After + /// a flush, data is guaranteed to be on disk. + future<> flush() noexcept; + + /// Returns \c stat information about the file. + future stat() noexcept; + + /// Truncates the file to a specified length. + future<> truncate(uint64_t length) noexcept; + + /// Preallocate disk blocks for a specified byte range. + /// + /// Requests the file system to allocate disk blocks to + /// back the specified range (\c length bytes starting at + /// \c position). The range may be outside the current file + /// size; the blocks can then be used when appending to the + /// file. + /// + /// \param position beginning of the range at which to allocate + /// blocks. + /// \param length length of range to allocate. + /// \return future that becomes ready when the operation completes. + future<> allocate(uint64_t position, uint64_t length) noexcept; + + /// Discard unneeded data from the file. + /// + /// The discard operation tells the file system that a range of offsets + /// (which be aligned) is no longer needed and can be reused. 
+ future<> discard(uint64_t offset, uint64_t length) noexcept; + + /// Gets the file size. + future size() const noexcept; + + /// Closes the file. + /// + /// Flushes any pending operations and release any resources associated with + /// the file (except for stable storage). + /// + /// \note + /// to ensure file data reaches stable storage, you must call \ref flush() + /// before calling \c close(). + future<> close() noexcept; + + /// Returns a directory listing, given that this file object is a directory. + subscription list_directory(std::function (directory_entry de)> next); + + /** + * Read a data bulk containing the provided addresses range that starts at + * the given offset and ends at either the address aligned to + * dma_alignment (4KB) or at the file end. + * + * @param offset starting address of the range the read bulk should contain + * @param range_size size of the addresses range + * @param pc the IO priority class under which to queue this operation + * + * @return temporary buffer containing the read data bulk. + * or exceptional future holding: + * system_error exception in case of I/O error or eof_error when + * "offset" is beyond EOF. + */ + template + future> + dma_read_bulk(uint64_t offset, size_t range_size, const io_priority_class& pc = default_priority_class()) noexcept { + return dma_read_bulk_impl(offset, range_size, pc).then([] (temporary_buffer t) { + return temporary_buffer(reinterpret_cast(t.get_write()), t.size(), t.release()); + }); + } + + /// \brief Creates a handle that can be transported across shards. + /// + /// Creates a handle that can be transported across shards, and then + /// used to create a new shard-local \ref file object that refers to + /// the same on-disk file. + /// + /// \note Use on read-only files. 
+ /// + file_handle dup(); + + template + struct read_state; +private: + future> + dma_read_bulk_impl(uint64_t offset, size_t range_size, const io_priority_class& pc) noexcept; + + future + dma_write_impl(uint64_t pos, const uint8_t* buffer, size_t len, const io_priority_class& pc) noexcept; + + future> + dma_read_impl(uint64_t pos, size_t len, const io_priority_class& pc) noexcept; + + future + dma_read_impl(uint64_t aligned_pos, uint8_t* aligned_buffer, size_t aligned_len, const io_priority_class& pc) noexcept; + + future> + dma_read_exactly_impl(uint64_t pos, size_t len, const io_priority_class& pc) noexcept; + + friend class reactor; + friend class file_impl; +}; + +/// \brief Helper for ensuring a file is closed after \c func is called. +/// +/// The file provided by the \c file_fut future is passed to \c func. +/// +/// \param file_fut A future that produces a file +/// \param func A function that uses a file +/// \returns the future returned by \c func, or an exceptional future if either \c file_fut or closing the file failed. +template +SEASTAR_CONCEPT( requires std::invocable && std::is_nothrow_move_constructible_v ) +auto with_file(future file_fut, Func func) noexcept { + static_assert(std::is_nothrow_move_constructible_v, "Func's move constructor must not throw"); + return file_fut.then([func = std::move(func)] (file f) mutable { + return do_with(std::move(f), [func = std::move(func)] (file& f) mutable { + return futurize_invoke(func, f).finally([&f] { + return f.close(); + }); + }); + }); +} + +/// \brief Helper for ensuring a file is closed if \c func fails. +/// +/// The file provided by the \c file_fut future is passed to \c func. +/// * If func throws an exception E, the file is closed and we return +/// a failed future with E. +/// * If func returns a value V, the file is not closed and we return +/// a future with V. +/// Note that when an exception is not thrown, it is the +/// responsibility of func to make sure the file will be closed. 
It +/// can close the file itself, return it, or store it somewhere. +/// +/// \param file_fut A future that produces a file +/// \param func A function that uses a file +/// \returns the future returned by \c func, or an exceptional future if \c file_fut failed or a nested exception if closing the file failed. +template +SEASTAR_CONCEPT( requires std::invocable && std::is_nothrow_move_constructible_v ) +auto with_file_close_on_failure(future file_fut, Func func) noexcept { + static_assert(std::is_nothrow_move_constructible_v, "Func's move constructor must not throw"); + return file_fut.then([func = std::move(func)] (file f) mutable { + return do_with(std::move(f), [func = std::move(func)] (file& f) mutable { + return futurize_invoke(std::move(func), f).then_wrapped([&f] (auto ret) mutable { + if (!ret.failed()) { + return ret; + } + return ret.finally([&f] { + // If f.close() fails, return that as nested exception. + return f.close(); + }); + }); + }); + }); +} + +/// \example file_demo.cc +/// A program demonstrating the use of \ref seastar::with_file +/// and \ref seastar::with_file_close_on_failure + +/// \brief A shard-transportable handle to a file +/// +/// If you need to access a file (for reads only) across multiple shards, +/// you can use the file::dup() method to create a `file_handle`, transport +/// this file handle to another shard, and use the handle to create \ref file +/// object on that shard. This is more efficient than calling open_file_dma() +/// again. 
+class file_handle { + std::unique_ptr _impl; +private: + explicit file_handle(std::unique_ptr impl) : _impl(std::move(impl)) {} +public: + /// Copies a file handle object + file_handle(const file_handle&); + /// Moves a file handle object + file_handle(file_handle&&) noexcept; + /// Assigns a file handle object + file_handle& operator=(const file_handle&); + /// Move-assigns a file handle object + file_handle& operator=(file_handle&&) noexcept; + /// Converts the file handle object to a \ref file. + file to_file() const &; + /// Converts the file handle object to a \ref file. + file to_file() &&; + + friend class file; +}; + +/// \cond internal + +template +struct file::read_state { + typedef temporary_buffer tmp_buf_type; + + read_state(uint64_t offset, uint64_t front, size_t to_read, + size_t memory_alignment, size_t disk_alignment) + : buf(tmp_buf_type::aligned(memory_alignment, + align_up(to_read, disk_alignment))) + , _offset(offset) + , _to_read(to_read) + , _front(front) {} + + bool done() const { + return eof || pos >= _to_read; + } + + /** + * Trim the buffer to the actual number of read bytes and cut the + * bytes from offset 0 till "_front". + * + * @note this function has to be called only if we read bytes beyond + * "_front". 
+ */ + void trim_buf_before_ret() { + if (have_good_bytes()) { + buf.trim(pos); + buf.trim_front(_front); + } else { + buf.trim(0); + } + } + + uint64_t cur_offset() const { + return _offset + pos; + } + + size_t left_space() const { + return buf.size() - pos; + } + + size_t left_to_read() const { + // positive as long as (done() == false) + return _to_read - pos; + } + + void append_new_data(tmp_buf_type& new_data) { + auto to_copy = std::min(left_space(), new_data.size()); + + std::memcpy(buf.get_write() + pos, new_data.get(), to_copy); + pos += to_copy; + } + + bool have_good_bytes() const { + return pos > _front; + } + +public: + bool eof = false; + tmp_buf_type buf; + size_t pos = 0; +private: + uint64_t _offset; + size_t _to_read; + uint64_t _front; +}; + +/// \endcond + +/// @} + +} diff --git a/src/seastar/include/seastar/core/fsqual.hh b/src/seastar/include/seastar/core/fsqual.hh new file mode 100644 index 000000000..24a3d1d72 --- /dev/null +++ b/src/seastar/include/seastar/core/fsqual.hh @@ -0,0 +1,30 @@ +/* + * Copyright 2017 ScyllaDB + */ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include + +namespace seastar { + +bool filesystem_has_good_aio_support(sstring directory, bool verbose = false); + +} diff --git a/src/seastar/include/seastar/core/fstream.hh b/src/seastar/include/seastar/core/fstream.hh new file mode 100644 index 000000000..67d59abfd --- /dev/null +++ b/src/seastar/include/seastar/core/fstream.hh @@ -0,0 +1,151 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2015 Cloudius Systems, Ltd. + */ + +#pragma once + +/// \file + +// File <-> streams adapters +// +// Seastar files are block-based due to the reliance on DMA - you must read +// on sector boundaries. The adapters in this file provide a byte stream +// interface to files, while retaining the zero-copy characteristics of +// seastar files. 
+ +#include +#include +#include +#include + +namespace seastar { + +class file_input_stream_history { + static constexpr uint64_t window_size = 4 * 1024 * 1024; + struct window { + uint64_t total_read = 0; + uint64_t unused_read = 0; + }; + window current_window; + window previous_window; + unsigned read_ahead = 1; + + friend class file_data_source_impl; +}; + +/// Data structure describing options for opening a file input stream +struct file_input_stream_options { + size_t buffer_size = 8192; ///< I/O buffer size + unsigned read_ahead = 0; ///< Maximum number of extra read-ahead operations + ::seastar::io_priority_class io_priority_class = default_priority_class(); + lw_shared_ptr dynamic_adjustments = { }; ///< Input stream history, if null dynamic adjustments are disabled +}; + +/// \brief Creates an input_stream to read a portion of a file. +/// +/// \param file File to read; multiple streams for the same file may coexist +/// \param offset Starting offset to read from (no alignment restrictions) +/// \param len Maximum number of bytes to read; the stream will stop at end-of-file +/// even if `offset + len` is beyond end-of-file. +/// \param options A set of options controlling the stream. +/// +/// \note Multiple input streams may exist concurrently for the same file. +input_stream make_file_input_stream( + file file, uint64_t offset, uint64_t len, file_input_stream_options options = {}); + +// Create an input_stream for a given file, with the specified options. +// Multiple fibers of execution (continuations) may safely open +// multiple input streams concurrently for the same file. +input_stream make_file_input_stream( + file file, uint64_t offset, file_input_stream_options = {}); + +// Create an input_stream for reading starting at a given position of the +// given file. Multiple fibers of execution (continuations) may safely open +// multiple input streams concurrently for the same file. 
+input_stream make_file_input_stream( + file file, file_input_stream_options = {}); + +struct file_output_stream_options { + // For small files, setting preallocation_size can make it impossible for XFS to find + // an aligned extent. On the other hand, without it, XFS will divide the file into + // file_size/buffer_size extents. To avoid fragmentation, we set the default buffer_size + // to 64k (so each extent will be a minimum of 64k) and preallocation_size to 0 (to avoid + // extent allocation problems). + // + // Large files should increase both buffer_size and preallocation_size. + unsigned buffer_size = 65536; + unsigned preallocation_size = 0; ///< Preallocate extents. For large files, set to a large number (a few megabytes) to reduce fragmentation + unsigned write_behind = 1; ///< Number of buffers to write in parallel + ::seastar::io_priority_class io_priority_class = default_priority_class(); +}; + +SEASTAR_INCLUDE_API_V2 namespace api_v2 { + +/// Create an output_stream for writing starting at the position zero of a +/// newly created file. +/// NOTE: flush() should be the last thing to be called on a file output stream. +[[deprecated("use Seastar_API_LEVEL=3 instead")]] +output_stream make_file_output_stream( + file file, + uint64_t buffer_size = 8192); + +/// Create an output_stream for writing starting at the position zero of a +/// newly created file. +/// NOTE: flush() should be the last thing to be called on a file output stream. +[[deprecated("use Seastar_API_LEVEL=3 instead")]] +output_stream make_file_output_stream( + file file, + file_output_stream_options options); + +/// Create a data_sink for writing starting at the position zero of a +/// newly created file. 
+[[deprecated("use Seastar_API_LEVEL=3 instead")]] +data_sink make_file_data_sink(file, file_output_stream_options); + +} + +SEASTAR_INCLUDE_API_V3 namespace api_v3 { +inline namespace and_newer { + +/// Create an output_stream for writing starting at the position zero of a +/// newly created file. +/// NOTE: flush() should be the last thing to be called on a file output stream. +/// Closes the file if the stream creation fails. +future> make_file_output_stream( + file file, + uint64_t buffer_size = 8192) noexcept; + +/// Create an output_stream for writing starting at the position zero of a +/// newly created file. +/// NOTE: flush() should be the last thing to be called on a file output stream. +/// Closes the file if the stream creation fails. +future> make_file_output_stream( + file file, + file_output_stream_options options) noexcept; + +/// Create a data_sink for writing starting at the position zero of a +/// newly created file. +/// Closes the file if the sink creation fails. +future make_file_data_sink(file, file_output_stream_options) noexcept; + +} +} + +} diff --git a/src/seastar/include/seastar/core/function_traits.hh b/src/seastar/include/seastar/core/function_traits.hh new file mode 100644 index 000000000..a3b9b9d31 --- /dev/null +++ b/src/seastar/include/seastar/core/function_traits.hh @@ -0,0 +1,68 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright 2015 Cloudius Systems + */ + +#pragma once + +#include + +namespace seastar { + +template +struct function_traits; + +template +struct function_traits +{ + using return_type = Ret; + using args_as_tuple = std::tuple; + using signature = Ret (Args...); + + static constexpr std::size_t arity = sizeof...(Args); + + template + struct arg + { + static_assert(N < arity, "no such parameter index."); + using type = typename std::tuple_element>::type; + }; +}; + +template +struct function_traits : public function_traits +{}; + +template +struct function_traits : public function_traits +{}; + +template +struct function_traits : public function_traits +{}; + +template +struct function_traits : public function_traits +{}; + +template +struct function_traits : public function_traits> +{}; + +} diff --git a/src/seastar/include/seastar/core/future-util.hh b/src/seastar/include/seastar/core/future-util.hh new file mode 100644 index 000000000..3252accf5 --- /dev/null +++ b/src/seastar/include/seastar/core/future-util.hh @@ -0,0 +1,31 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include diff --git a/src/seastar/include/seastar/core/future.hh b/src/seastar/include/seastar/core/future.hh new file mode 100644 index 000000000..81ee18528 --- /dev/null +++ b/src/seastar/include/seastar/core/future.hh @@ -0,0 +1,2196 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2015 Cloudius Systems, Ltd. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if __cplusplus > 201703L +#include +#endif + +namespace seastar { + +struct nested_exception : public std::exception { + std::exception_ptr inner; + std::exception_ptr outer; + nested_exception(std::exception_ptr inner, std::exception_ptr outer) noexcept; + nested_exception(nested_exception&&) noexcept; + nested_exception(const nested_exception&) noexcept; + [[noreturn]] void rethrow_nested() const; + virtual const char* what() const noexcept override; +}; + +/// \defgroup future-module Futures and Promises +/// +/// \brief +/// Futures and promises are the basic tools for asynchronous +/// programming in seastar. 
A future represents a result that +/// may not have been computed yet, for example a buffer that +/// is being read from the disk, or the result of a function +/// that is executed on another cpu. A promise object allows +/// the future to be eventually resolved by assigning it a value. +/// +/// \brief +/// Another way to look at futures and promises is as the reader +/// and writer sides, respectively, of a single-item, single use +/// queue. You read from the future, and write to the promise, +/// and the system takes care that it works no matter what the +/// order of operations is. +/// +/// \brief +/// The normal way of working with futures is to chain continuations +/// to them. A continuation is a block of code (usually a lambda) +/// that is called when the future is assigned a value (the future +/// is resolved); the continuation can then access the actual value. +/// + +/// \defgroup future-module-impl Implementation overview +/// \ingroup future-module +/// +/// A future has a stored value. Semantically, the value is a +/// std::optional>. The actual +/// type of the value in the implementation is future_state. +/// +/// A future without an initial value can be created by first creating +/// a promise and then calling promise::get_future. The promise also +/// stores a future_state in case promise::set_value is called +/// before get_future. +/// +/// In addition to the future_state, the promise and the future +/// point to each other and the pointers are updated when either is +/// moved. +/// +/// If a future is consumed by future::then before the future is +/// ready, a continuation is dynamically allocated. The continuation +/// also has a future_state, but unlike a future it is never +/// moved. +/// +/// After a future creates a continuation, the corresponding promise +/// points to the newly allocated continuation. When +/// promise::set_value is called, the continuation is ready and is +/// scheduled.
+/// +/// A promise then consists of +/// * A future_state for use when there is no corresponding future +/// or continuation (_local_state). +/// * A pointer to a future to allow updates when the promise is moved +/// (_future). +/// * A pointer to the continuation (_task). +/// * A pointer to future_state (_state) that can point to +/// 1. The future_state in the promise itself +/// 2. The future_state in the future +/// 3. The future_state in the continuation +/// +/// A special case is when a future blocks inside a thread. In that +/// case we still need a continuation, but that continuation doesn't +/// need a future_state since the original future still exists on +/// the stack. +/// +/// So the valid states for a promise are: +/// +/// 1. A newly created promise. _state points to _local_state and +/// _task and _future are null. +/// 2. After get_future is called. _state points to the state in the +/// future, _future points to the future and _task is null. +/// 3. The future has been consumed by future::then. Now the _state +/// points to the state in the continuation, _future is null and +/// _task points to the continuation. +/// 4. A call to future::get is blocked in a thread. This is a mix of +/// cases 2 and 3. Like 2, there is a valid future and _future and +/// _state point to the future and its state. Like 3, there is a +/// valid continuation and _task points to it, but that +/// continuation has no state of its own. + +/// \defgroup future-util Future Utilities +/// \ingroup future-module +/// +/// \brief +/// These utilities are provided to help perform operations on futures. + + +/// \addtogroup future-module +/// @{ + +#if SEASTAR_API_LEVEL < 6 +template +#else +template +#endif +class promise; + +template +class future; + +template +class shared_future; + +struct future_state_base; + +/// \brief Creates a \ref future in an available, value state. +/// +/// Creates a \ref future object that is already resolved.
This +/// is useful when it is determined that no I/O needs to be performed +/// to perform a computation (for example, because the data is cached +/// in some buffer). +template +future make_ready_future(A&&... value) noexcept; + +/// \brief Creates a \ref future in an available, failed state. +/// +/// Creates a \ref future object that is already resolved in a failed +/// state. This is useful when no I/O needs to be performed to perform +/// a computation (for example, because the connection is closed and +/// we cannot read from it). +template +future make_exception_future(std::exception_ptr&& value) noexcept; + +template +future make_exception_future(const std::exception_ptr& ex) noexcept { + return make_exception_future(std::exception_ptr(ex)); +} + +template +future make_exception_future(std::exception_ptr& ex) noexcept { + return make_exception_future(static_cast(ex)); +} + +/// \cond internal +void engine_exit(std::exception_ptr eptr = {}); + +void report_failed_future(const std::exception_ptr& ex) noexcept; + +void report_failed_future(const future_state_base& state) noexcept; + +void with_allow_abandoned_failed_futures(unsigned count, noncopyable_function func); + +/// \endcond + +/// \brief Exception type for broken promises +/// +/// When a promise is broken, i.e. a promise object with an attached +/// continuation is destroyed before setting any value or exception, an +/// exception of `broken_promise` type is propagated to that abandoned +/// continuation. +struct broken_promise : std::logic_error { + broken_promise(); +}; + +/// \brief Returns std::current_exception() wrapped in a future +/// +/// This is equivalent to +/// make_exception_future(std::current_exception()), but expands to +/// less code. 
+template +future current_exception_as_future() noexcept; + +extern template +future<> current_exception_as_future() noexcept; + +namespace internal { +#if SEASTAR_API_LEVEL < 6 +template +#else +template +#endif +class promise_base_with_type; +class promise_base; + +struct monostate {}; + +template +struct future_stored_type; + +template <> +struct future_stored_type<> { +#if SEASTAR_API_LEVEL < 5 + using type = std::tuple<>; +#else + using type = monostate; +#endif +}; + +template +struct future_stored_type { +#if SEASTAR_API_LEVEL < 5 + using type = std::tuple; +#else + using type = std::conditional_t, internal::monostate, T>; +#endif +}; + +template +using future_stored_type_t = typename future_stored_type::type; + +template +#if SEASTAR_API_LEVEL < 5 +using future_tuple_type_t = T; +#else +using future_tuple_type_t = std::conditional_t, std::tuple<>, std::tuple>; +#endif + +// It doesn't seem to be possible to use std::tuple_element_t with an empty tuple. There is an static_assert in it that +// fails the build even if it is in the non enabled side of std::conditional. +template +struct get0_return_type; + +template <> +struct get0_return_type> { + using type = void; + static type get0(std::tuple<> v) { } +}; + +template +struct get0_return_type> { + using type = T0; + static type get0(std::tuple v) { return std::get<0>(std::move(v)); } +}; + +template +using maybe_wrap_ref = std::conditional_t, std::reference_wrapper>, T>; + +/// \brief Wrapper for keeping uninitialized values of non default constructible types. +/// +/// This is similar to a std::optional, but it doesn't know if it is holding a value or not, so the user is +/// responsible for calling constructors and destructors. +/// +/// The advantage over just using a union directly is that this uses inheritance when possible and so benefits from the +/// empty base optimization. 
+template +struct uninitialized_wrapper_base; + +template +struct uninitialized_wrapper_base { + using tuple_type = future_tuple_type_t; + union any { + any() noexcept {} + ~any() {} + // T can be a reference, so wrap it. + maybe_wrap_ref value; + } _v; + +public: + uninitialized_wrapper_base() noexcept = default; + template + std::enable_if_t...>, std::tuple>, void> + uninitialized_set(U&&... vs) { + new (&_v.value) maybe_wrap_ref{T(std::forward(vs)...)}; + } + void uninitialized_set(tuple_type&& v) { + uninitialized_set(std::move(std::get<0>(v))); + } + void uninitialized_set(const tuple_type& v) { + uninitialized_set(std::get<0>(v)); + } + maybe_wrap_ref& uninitialized_get() { + return _v.value; + } + const maybe_wrap_ref& uninitialized_get() const { + return _v.value; + } +}; + +template struct uninitialized_wrapper_base : private T { + using tuple_type = future_tuple_type_t; + uninitialized_wrapper_base() noexcept = default; + template + std::enable_if_t...>, std::tuple>, void> + uninitialized_set(U&&... vs) { + new (this) T(std::forward(vs)...); + } + void uninitialized_set(tuple_type&& v) { + if constexpr (std::tuple_size_v != 0) { + uninitialized_set(std::move(std::get<0>(v))); + } + } + void uninitialized_set(const tuple_type& v) { + if constexpr (std::tuple_size_v != 0) { + uninitialized_set(std::get<0>(v)); + } + } + T& uninitialized_get() { + return *this; + } + const T& uninitialized_get() const { + return *this; + } +}; + +template +constexpr bool can_inherit = +#ifdef _LIBCPP_VERSION +// We expect std::tuple<> to be trivially constructible and +// destructible. That is not the case with libc++ +// (https://bugs.llvm.org/show_bug.cgi?id=41714). We could avoid this +// optimization when using libc++ and relax the asserts, but +// inspection suggests that std::tuple<> is trivial, it is just not +// marked as such. 
+ std::is_same, T>::value || +#endif + (std::is_trivially_destructible::value && std::is_trivially_constructible::value && + std::is_class::value && !std::is_final::value); + +// The objective is to avoid extra space for empty types like std::tuple<>. We could use std::is_empty_v, but it is +// better to check that both the constructor and destructor can be skipped. +template +struct uninitialized_wrapper + : public uninitialized_wrapper_base> {}; + +template +struct is_trivially_move_constructible_and_destructible { + static constexpr bool value = std::is_trivially_move_constructible::value && std::is_trivially_destructible::value; +}; + +template +struct all_true : std::false_type {}; + +template <> +struct all_true<> : std::true_type {}; + +template +struct all_true : public all_true {}; + +template +struct is_tuple_effectively_trivially_move_constructible_and_destructible_helper; + +template +struct is_tuple_effectively_trivially_move_constructible_and_destructible_helper> { + static constexpr bool value = all_true::value...>::value; +}; + +template +static constexpr bool is_tuple_effectively_trivially_move_constructible_and_destructible = + is_tuple_effectively_trivially_move_constructible_and_destructible_helper::value; + +} + +// +// A future/promise pair maintain one logical value (a future_state). +// There are up to three places that can store it, but only one is +// active at any time. +// +// - in the promise _local_state member variable +// +// This is necessary because a promise is created first and there +// would be nowhere else to put the value. +// +// - in the future _state variable +// +// This is used anytime a future exists and then has not been called +// yet. This guarantees a simple access to the value for any code +// that already has a future. 
+// +// - in the task associated with the .then() clause (after .then() is called, +// if a value was not set) +// +// +// The promise maintains a pointer to the state, which is modified as +// the state moves to a new location due to events (such as .then() or +// get_future being called) or due to the promise or future being +// moved around. +// + +// non templated base class to reduce code duplication +struct future_state_base { + static_assert(sizeof(std::exception_ptr) == sizeof(void*), "exception_ptr not a pointer"); + enum class state : uintptr_t { + invalid = 0, + future = 1, + // the substate is intended to decouple the run-time prevention + // for duplicative result extraction (calling e.g. then() twice + // ends up in abandoned()) from the wrapped object's destruction + // handling which is orchestrated by future_state. Instead of + // creating a temporary future_state just for the sake of setting + // the "invalid" in the source instance, result_unavailable can + // be set to ensure future_state_base::available() returns false. 
+ result_unavailable = 2, + result = 3, + exception_min = 4, // or anything greater + }; + union any { + any() noexcept { st = state::future; } + any(state s) noexcept { st = s; } + void set_exception(std::exception_ptr&& e) noexcept { + new (&ex) std::exception_ptr(std::move(e)); + assert(st >= state::exception_min); + } + any(std::exception_ptr&& e) noexcept { + set_exception(std::move(e)); + } + // From a users' perspective, a result_unavailable is not valid + bool valid() const noexcept { return st != state::invalid && st != state::result_unavailable; } + bool available() const noexcept { return st == state::result || st >= state::exception_min; } + bool failed() const noexcept { return __builtin_expect(st >= state::exception_min, false); } + void check_failure() noexcept; + ~any() noexcept { } + std::exception_ptr take_exception() noexcept { + std::exception_ptr ret(std::move(ex)); + // Unfortunately in libstdc++ ~exception_ptr is defined out of line. We know that it does nothing for + // moved out values, so we omit calling it. This is critical for the code quality produced for this + // function. Without the out of line call, gcc can figure out that both sides of the if produce + // identical code and merges them. + // We don't make any assumptions about other c++ libraries. + // There is a request with gcc to define it inline: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90295 +#ifndef __GLIBCXX__ + ex.~exception_ptr(); +#endif + st = state::invalid; + return ret; + } + void move_it(any&& x) noexcept { +#ifdef __GLIBCXX__ + // Unfortunately gcc cannot fully optimize the regular + // implementation: + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95014 + // Given what we know about the libstdc++ implementation + // (see the comment in take_exception), we can just + // memmove and zero x. We use memmove to guarantee + // valid results if &x == this.
+ memmove(static_cast(this), &x, sizeof(any)); + x.st = state::invalid; +#else + if (x.st < state::exception_min) { + st = x.st; + x.st = state::invalid; + } else { + new (&ex) std::exception_ptr(x.take_exception()); + } +#endif + } + any(any&& x) noexcept { + move_it(std::move(x)); + } + any& operator=(any&& x) noexcept { + check_failure(); + // If this is a self move assignment, check_failure + // guarantees that we don't have an exception and calling + // move_it is safe. + move_it(std::move(x)); + return *this; + } + bool has_result() const noexcept { + return st == state::result || st == state::result_unavailable; + } + state st; + std::exception_ptr ex; + } _u; + + future_state_base() noexcept = default; + future_state_base(state st) noexcept : _u(st) { } + future_state_base(std::exception_ptr&& ex) noexcept : _u(std::move(ex)) { } + future_state_base(future_state_base&& x) noexcept : _u(std::move(x._u)) { } + + // We never need to destruct this polymorphicly, so we can make it + // protected instead of virtual. 
+protected: + struct current_exception_future_marker {}; + future_state_base(current_exception_future_marker) noexcept; + struct nested_exception_marker {}; + future_state_base(nested_exception_marker, future_state_base&& old) noexcept; + future_state_base(nested_exception_marker, future_state_base&& n, future_state_base&& old) noexcept; + ~future_state_base() noexcept = default; + + void rethrow_exception() &&; + void rethrow_exception() const&; + +public: + + bool valid() const noexcept { return _u.valid(); } + bool available() const noexcept { return _u.available(); } + bool failed() const noexcept { return _u.failed(); } + + void ignore() noexcept; + + void set_exception(std::exception_ptr&& ex) noexcept { + assert(_u.st == state::future); + _u.set_exception(std::move(ex)); + } + future_state_base& operator=(future_state_base&& x) noexcept = default; + void set_exception(future_state_base&& state) noexcept { + assert(_u.st == state::future); + *this = std::move(state); + } + std::exception_ptr get_exception() && noexcept { + assert(_u.st >= state::exception_min); + // Move ex out so future::~future() knows we've handled it + return _u.take_exception(); + } + const std::exception_ptr& get_exception() const& noexcept { + assert(_u.st >= state::exception_min); + return _u.ex; + } + template + friend struct future_state; + template + friend future current_exception_as_future() noexcept; + template + friend class future; + template + friend struct futurize; +}; + +void report_failed_future(future_state_base::any&& state) noexcept; + +inline void future_state_base::any::check_failure() noexcept { + if (failed()) { + report_failed_future(std::move(*this)); + } +} + +struct ready_future_marker {}; +struct exception_future_marker {}; +struct future_for_get_promise_marker {}; + +/// \cond internal +template +struct future_state : public future_state_base, private internal::uninitialized_wrapper { + static constexpr bool copy_noexcept = 
std::is_nothrow_copy_constructible::value; +#if SEASTAR_API_LEVEL < 5 + static constexpr bool has_trivial_move_and_destroy = internal::is_tuple_effectively_trivially_move_constructible_and_destructible; +#else + static constexpr bool has_trivial_move_and_destroy = internal::is_trivially_move_constructible_and_destructible::value; +#endif + static_assert(std::is_nothrow_move_constructible::value, + "Types must be no-throw move constructible"); + static_assert(std::is_nothrow_destructible::value, + "Types must be no-throw destructible"); + future_state() noexcept = default; + void move_it(future_state&& x) noexcept { + if constexpr (has_trivial_move_and_destroy) { + memmove(reinterpret_cast(&this->uninitialized_get()), + &x.uninitialized_get(), + internal::used_size>::value); + } else if (_u.has_result()) { + this->uninitialized_set(std::move(x.uninitialized_get())); + std::destroy_at(&x.uninitialized_get()); + } + } + + [[gnu::always_inline]] + future_state(future_state&& x) noexcept : future_state_base(std::move(x)) { + move_it(std::move(x)); + } + + void clear() noexcept { + if (_u.has_result()) { + std::destroy_at(&this->uninitialized_get()); + } else { + _u.check_failure(); + } + } + __attribute__((always_inline)) + ~future_state() noexcept { + clear(); + } + future_state& operator=(future_state&& x) noexcept { + clear(); + future_state_base::operator=(std::move(x)); + // If &x == this, _u.st is now state::invalid and so it is + // safe to call move_it. + move_it(std::move(x)); + return *this; + } + template + future_state(ready_future_marker, A&&... a) noexcept : future_state_base(state::result) { + try { + this->uninitialized_set(std::forward(a)...); + } catch (...) { + new (this) future_state(current_exception_future_marker()); + } + } + template + void set(A&&... 
a) noexcept { + assert(_u.st == state::future); + new (this) future_state(ready_future_marker(), std::forward(a)...); + } + future_state(exception_future_marker m, std::exception_ptr&& ex) noexcept : future_state_base(std::move(ex)) { } + future_state(exception_future_marker m, future_state_base&& state) noexcept : future_state_base(std::move(state)) { } + future_state(current_exception_future_marker m) noexcept : future_state_base(m) { } + future_state(nested_exception_marker m, future_state_base&& old) noexcept : future_state_base(m, std::move(old)) { } + future_state(nested_exception_marker m, future_state_base&& n, future_state_base&& old) noexcept : future_state_base(m, std::move(n), std::move(old)) { } + T&& get_value() && noexcept { + assert(_u.st == state::result); + return static_cast(this->uninitialized_get()); + } + T&& take_value() && noexcept { + assert(_u.st == state::result); + _u.st = state::result_unavailable; + return static_cast(this->uninitialized_get()); + } + template + const std::enable_if_t::value, U>& get_value() const& noexcept(copy_noexcept) { + assert(_u.st == state::result); + return this->uninitialized_get(); + } + T&& take() && { + assert(available()); + if (_u.st >= state::exception_min) { + std::move(*this).rethrow_exception(); + } + _u.st = state::result_unavailable; + return static_cast(this->uninitialized_get()); + } + T&& get() && { + assert(available()); + if (_u.st >= state::exception_min) { + std::move(*this).rethrow_exception(); + } + return static_cast(this->uninitialized_get()); + } + const T& get() const& { + assert(available()); + if (_u.st >= state::exception_min) { + rethrow_exception(); + } + return this->uninitialized_get(); + } + using get0_return_type = typename internal::get0_return_type>::type; + static get0_return_type get0(T&& x) { + return internal::get0_return_type::get0(std::move(x)); + } + + get0_return_type get0() { +#if SEASTAR_API_LEVEL < 5 + return get0(std::move(*this).get()); +#else + return 
std::move(*this).get(); +#endif + } +}; + +#if SEASTAR_API_LEVEL < 6 +template +#else +template +#endif +class continuation_base : public task { +protected: + using future_state = seastar::future_state>; + future_state _state; + using future_type = future; + using promise_type = promise; +public: + continuation_base() noexcept = default; + void set_state(future_state&& state) noexcept { + _state = std::move(state); + } + // This override of waiting_task() is needed here because there are cases + // when backtrace is obtained from the destructor of this class and objects + // of derived classes are already destroyed at that time. If we didn't + // have this override we would get a "pure virtual function call" exception. + virtual task* waiting_task() noexcept override { return nullptr; } + friend class internal::promise_base_with_type; + friend class promise; + friend class future; +}; + +// Given a future type, find the corresponding continuation_base. +template +struct continuation_base_from_future; + +template +struct continuation_base_from_future> { + using type = continuation_base; +}; + +template +using continuation_base_from_future_t = typename continuation_base_from_future::type; + +#if SEASTAR_API_LEVEL < 6 +template +#else +template +#endif +class continuation_base_with_promise : public continuation_base { + friend class internal::promise_base_with_type; +protected: + continuation_base_with_promise(Promise&& pr) noexcept : _pr(std::move(pr)) { + task::make_backtrace(); + } + virtual task* waiting_task() noexcept override; + Promise _pr; +}; + +#if SEASTAR_API_LEVEL < 6 +template +#else +template +#endif +struct continuation final : continuation_base_with_promise { + // Func is the original function passed to then/then_wrapped. The + // Wrapper is a helper function that implements the specific logic + // needed by then/then_wrapped. We call the wrapper passing it the + // original function, promise and state. 
+ // Note that if Func's move constructor throws, this will call + // std::unexpected. We could try to require Func to be nothrow + // move constructible, but that will cause a lot of churn. Since + // we can't support a failure to create a continuation, calling + // std::unexpected as close to the failure as possible is the best + // we can do. + continuation(Promise&& pr, Func&& func, Wrapper&& wrapper) noexcept + : continuation_base_with_promise(std::move(pr)) + , _func(std::move(func)) + , _wrapper(std::move(wrapper)) {} + virtual void run_and_dispose() noexcept override { + try { + _wrapper(std::move(this->_pr), _func, std::move(this->_state)); + } catch (...) { + this->_pr.set_to_current_exception(); + } + delete this; + } + Func _func; + [[no_unique_address]] Wrapper _wrapper; +}; + +#if SEASTAR_API_LEVEL < 4 + +// This is an internal future<> payload for seastar::when_all_succeed(). It is used +// to return a variadic future (when two or more of its input futures were non-void), +// but with variadic futures deprecated and soon gone this is no longer possible. +// +// Instead, we use this tuple type, and future::then() knows to unpack it. +// +// The whole thing is temporary for a transition period. +template +struct when_all_succeed_tuple : std::tuple { + using std::tuple::tuple; + when_all_succeed_tuple(std::tuple&& t) + noexcept(std::is_nothrow_move_constructible>::value) + : std::tuple(std::move(t)) {} +}; + +#endif + +namespace internal { + +template +future make_exception_future(future_state_base&& state) noexcept; + +template +void set_callback(future& fut, U* callback) noexcept; + +class future_base; + +class promise_base { +protected: + enum class urgent { no, yes }; + future_base* _future = nullptr; + + // This points to the future_state that is currently being + // used. See comment above the future_state struct definition for + // details. 
+ future_state_base* _state; + + task* _task = nullptr; + + promise_base(const promise_base&) = delete; + promise_base(future_state_base* state) noexcept : _state(state) {} + promise_base(future_base* future, future_state_base* state) noexcept; + void move_it(promise_base&& x) noexcept; + promise_base(promise_base&& x) noexcept; + + void clear() noexcept; + + // We never need to destruct this polymorphicly, so we can make it + // protected instead of virtual + ~promise_base() noexcept { + clear(); + } + + void operator=(const promise_base&) = delete; + promise_base& operator=(promise_base&& x) noexcept; + + template + void make_ready() noexcept; + + template + void set_exception_impl(T&& val) noexcept { + if (_state) { + _state->set_exception(std::move(val)); + make_ready(); + } else { + // We get here if promise::get_future is called and the + // returned future is destroyed without creating a + // continuation. + // In older versions of seastar we would store a local + // copy of ex and warn in the promise destructor. + // Since there isn't any way for the user to clear + // the exception, we issue the warning from here. + report_failed_future(val); + } + } + + void set_exception(future_state_base&& state) noexcept { + set_exception_impl(std::move(state)); + } + + void set_exception(std::exception_ptr&& ex) noexcept { + set_exception_impl(std::move(ex)); + } + + void set_exception(const std::exception_ptr& ex) noexcept { + set_exception(std::exception_ptr(ex)); + } + + template + std::enable_if_t, std::exception_ptr>::value, void> set_exception(Exception&& e) noexcept { + set_exception(make_exception_ptr(std::forward(e))); + } + + friend class future_base; + template friend class seastar::future; + +public: + /// Set this promise to the current exception. + /// + /// This is equivalent to set_exception(std::current_exception()), + /// but expands to less code. 
+ void set_to_current_exception() noexcept; + + /// Returns the task which is waiting for this promise to resolve, or nullptr. + task* waiting_task() const noexcept { return _task; } +}; + +/// \brief A promise with type but no local data. +/// +/// This is a promise without any local data. We use this for when the +/// future is created first, so we know the promise always has an +/// external place to point to. We cannot just use promise_base +/// because we need to know the type that is being stored. +template +class promise_base_with_type : protected internal::promise_base { +protected: + using future_state = seastar::future_state>; + future_state* get_state() noexcept { + return static_cast(_state); + } + static constexpr bool copy_noexcept = future_state::copy_noexcept; +public: + promise_base_with_type(future_state_base* state) noexcept : promise_base(state) { } + promise_base_with_type(future* future) noexcept : promise_base(future, &future->_state) { } + promise_base_with_type(promise_base_with_type&& x) noexcept = default; + promise_base_with_type(const promise_base_with_type&) = delete; + promise_base_with_type& operator=(promise_base_with_type&& x) noexcept = default; + void operator=(const promise_base_with_type&) = delete; + + void set_urgent_state(future_state&& state) noexcept { + auto* ptr = get_state(); + // The state can be null if the corresponding future has been + // destroyed without producing a continuation. + if (ptr) { + // FIXME: This is a fairly expensive assert. It would be a + // good candidate for being disabled in release builds if + // we had such an assert. + assert(ptr->_u.st == future_state_base::state::future); + new (ptr) future_state(std::move(state)); + make_ready(); + } + } + + template + void set_value(A&&... a) noexcept { + if (auto *s = get_state()) { + s->set(std::forward(a)...); + make_ready(); + } + } + + /// Set this promise to the current exception. 
+ /// + /// This is equivalent to set_exception(std::current_exception()), + /// but expands to less code. + void set_to_current_exception() noexcept { + internal::promise_base::set_to_current_exception(); + } + + /// Returns the task which is waiting for this promise to resolve, or nullptr. + using internal::promise_base::waiting_task; + +private: + + template + friend class seastar::future; + + friend future_state; +}; +} +/// \endcond + +/// \brief promise - allows a future value to be made available at a later time. +/// +/// \tparam T A list of types to be carried as the result of the associated future. +/// A list with two or more types is deprecated; use +/// \c promise> instead. +template +class promise : private internal::promise_base_with_type { + using future_state = typename internal::promise_base_with_type::future_state; + future_state _local_state; + +public: + /// \brief Constructs an empty \c promise. + /// + /// Creates promise with no associated future yet (see get_future()). + promise() noexcept : internal::promise_base_with_type(&_local_state) {} + + /// \brief Moves a \c promise object. + void move_it(promise&& x) noexcept; + promise(promise&& x) noexcept : internal::promise_base_with_type(std::move(x)) { + move_it(std::move(x)); + } + promise(const promise&) = delete; + promise& operator=(promise&& x) noexcept { + internal::promise_base_with_type::operator=(std::move(x)); + // If this is a self-move, _state is now nullptr and it is + // safe to call move_it. + move_it(std::move(x)); + return *this; + } + void operator=(const promise&) = delete; + + /// Set this promise to the current exception. + /// + /// This is equivalent to set_exception(std::current_exception()), + /// but expands to less code. + void set_to_current_exception() noexcept { + internal::promise_base::set_to_current_exception(); + } + + /// Returns the task which is waiting for this promise to resolve, or nullptr. 
+ using internal::promise_base::waiting_task; + + /// \brief Gets the promise's associated future. + /// + /// The future and promise will be remember each other, even if either or + /// both are moved. When \c set_value() or \c set_exception() are called + /// on the promise, the future will be become ready, and if a continuation + /// was attached to the future, it will run. + future get_future() noexcept; + + /// \brief Sets the promises value + /// + /// Forwards the arguments and makes them available to the associated + /// future. May be called either before or after \c get_future(). + /// + /// The arguments can have either the types the promise is + /// templated with, or a corresponding std::tuple. That is, given + /// a promise, both calls are valid: + /// + /// pr.set_value(42, 43.0); + /// pr.set_value(std::tuple(42, 43.0)) + template + void set_value(A&&... a) noexcept { + internal::promise_base_with_type::set_value(std::forward(a)...); + } + + /// \brief Marks the promise as failed + /// + /// Forwards the exception argument to the future and makes it + /// available. May be called either before or after \c get_future(). + void set_exception(std::exception_ptr&& ex) noexcept { + internal::promise_base::set_exception(std::move(ex)); + } + + void set_exception(const std::exception_ptr& ex) noexcept { + internal::promise_base::set_exception(ex); + } + + /// \brief Marks the promise as failed + /// + /// Forwards the exception argument to the future and makes it + /// available. May be called either before or after \c get_future(). + template + std::enable_if_t, std::exception_ptr>::value, void> set_exception(Exception&& e) noexcept { + internal::promise_base::set_exception(std::forward(e)); + } + + using internal::promise_base_with_type::set_urgent_state; + + template + friend class future; +}; + +#if SEASTAR_API_LEVEL < 6 +/// \brief Specialization of \c promise +/// +/// This is an alias for \c promise<>, for generic programming purposes. 
+/// For example, You may have a \c promise where \c T can legally be +/// \c void. +template<> +class promise : public promise<> {}; +#endif + +/// @} + +/// \addtogroup future-util +/// @{ + + +/// \brief Check whether a type is a future +/// +/// This is a type trait evaluating to \c true if the given type is a +/// future. +/// +template struct is_future : std::false_type {}; + +/// \cond internal +/// \addtogroup future-util +template struct is_future> : std::true_type {}; + +/// \endcond + + +/// \brief Converts a type to a future type, if it isn't already. +/// +/// \return Result in member type 'type'. +template +struct futurize; + +SEASTAR_CONCEPT( + +template +concept Future = is_future::value; + +template +concept CanInvoke = std::invocable; + +// Deprecated alias +template +concept CanApply = CanInvoke; + +template +concept CanApplyTuple + = sizeof...(T) == 1 + && requires (Func func, std::tuple wrapped_val) { + { std::apply(func, std::get<0>(std::move(wrapped_val))) }; + }; + +template +concept InvokeReturns = requires (Func f, T... args) { + { f(std::forward(args)...) } -> std::same_as; +}; + +// Deprecated alias +template +concept ApplyReturns = InvokeReturns; + +template +concept InvokeReturnsAnyFuture = requires (Func f, T... args) { + requires is_future(args)...))>::value; +}; + +// Deprecated alias +template +concept ApplyReturnsAnyFuture = InvokeReturnsAnyFuture; + +) + +/// \endcond + +// Converts a type to a future type, if it isn't already. +template +using futurize_t = typename futurize::type; + +/// @} + +template +auto futurize_invoke(Func&& func, Args&&... 
args) noexcept; + +template +auto futurize_apply(Func&& func, std::tuple&& args) noexcept; + +/// \addtogroup future-module +/// @{ +namespace internal { +class future_base { +protected: + promise_base* _promise; + future_base() noexcept : _promise(nullptr) {} + future_base(promise_base* promise, future_state_base* state) noexcept : _promise(promise) { + _promise->_future = this; + _promise->_state = state; + } + + void move_it(future_base&& x, future_state_base* state) noexcept { + _promise = x._promise; + if (auto* p = _promise) { + x.detach_promise(); + p->_future = this; + p->_state = state; + } + } + + future_base(future_base&& x, future_state_base* state) noexcept { + move_it(std::move(x), state); + } + + void clear() noexcept { + if (_promise) { + detach_promise(); + } + } + + ~future_base() noexcept { + clear(); + } + + promise_base* detach_promise() noexcept { + _promise->_state = nullptr; + _promise->_future = nullptr; + return std::exchange(_promise, nullptr); + } + + void schedule(task* tws, future_state_base* state) noexcept { + promise_base* p = detach_promise(); + p->_state = state; + p->_task = tws; + } + + void do_wait() noexcept; + +#ifdef SEASTAR_COROUTINES_ENABLED + void set_coroutine(task& coroutine) noexcept; +#endif + + friend class promise_base; +}; + +template +struct future_result { + using type = std::invoke_result_t; + using future_type = futurize_t; + using func_type = future_type (T&&...); +}; + +template +struct future_result { + using type = std::invoke_result_t; + using future_type = futurize_t; + using func_type = future_type (); +}; + +template +using future_result_t = typename future_result::type; + +template +auto future_invoke(Func&& func, T&& v) { + if constexpr (std::is_same_v) { + return std::invoke(std::forward(func)); + } else { + return std::invoke(std::forward(func), std::forward(v)); + } +} + +// This is a customization point for future::then()'s implementation. 
+// It behaves differently when the future value type is a when_all_succeed_tuple +// instantiation, indicating we need to unpack the tuple into multiple lambda +// arguments. +template +struct call_then_impl; + +// Generic case - the input is not a future>, so +// we just forward everything to future::then_impl. +template +struct call_then_impl> { + template + using result_type = typename future_result::future_type; + + template + using func_type = typename future_result::func_type; + + template + static result_type run(future& fut, Func&& func) noexcept { + return fut.then_impl(std::forward(func)); + } +}; + +#if SEASTAR_API_LEVEL < 4 + +// Special case: we unpack the tuple before calling the function +template +struct call_then_impl>> { + template + using result_type = futurize_t>; + + template + using func_type = result_type (T&&...); + + using was_tuple = when_all_succeed_tuple; + using std_tuple = std::tuple; + + template + static auto run(future& fut, Func&& func) noexcept { + // constructing func in the lambda can throw, but there's nothing we can do + // about it, similar to #84. + return fut.then_impl([func = std::forward(func)] (was_tuple&& t) mutable { + return std::apply(func, static_cast(std::move(t))); + }); + } +}; + +#endif + +template +using call_then_impl_result_type = typename call_then_impl>::template result_type; + +SEASTAR_CONCEPT( +template +concept CanInvokeWhenAllSucceed = requires { + typename call_then_impl_result_type; +}; +) + +template +struct result_of_apply { + // no "type" member if not a function call signature or not a tuple +}; + +template +struct result_of_apply> : std::invoke_result { + // Let std::invoke_result_t determine the result if the input is a tuple +}; + +template +using result_of_apply_t = typename result_of_apply::type; + +} + +template +task* continuation_base_with_promise::waiting_task() noexcept { + return _pr.waiting_task(); +} + +/// \brief A representation of a possibly not-yet-computed value. 
+/// +/// A \c future represents a value that has not yet been computed +/// (an asynchronous computation). It can be in one of several +/// states: +/// - unavailable: the computation has not been completed yet +/// - value: the computation has been completed successfully and a +/// value is available. +/// - failed: the computation completed with an exception. +/// +/// methods in \c future allow querying the state and, most importantly, +/// scheduling a \c continuation to be executed when the future becomes +/// available. Only one such continuation may be scheduled. +/// +/// A \ref future should not be discarded before it is waited upon and +/// its result is extracted. Discarding a \ref future means that the +/// computed value becomes inaccessible, but more importantly, any +/// exceptions raised from the computation will disappear unchecked as +/// well. Another very important consequence is potentially unbounded +/// resource consumption due to the launcher of the deserted +/// continuation not being able track the amount of in-progress +/// continuations, nor their individual resource consumption. +/// To prevent accidental discarding of futures, \ref future is +/// declared `[[nodiscard]]` if the compiler supports it. Also, when a +/// discarded \ref future resolves with an error a warning is logged +/// (at runtime). +/// That said there can be legitimate cases where a \ref future is +/// discarded. The most prominent example is launching a new +/// [fiber](\ref fiber-module), or in other words, moving a continuation +/// chain to the background (off the current [fiber](\ref fiber-module)). +/// Even if a \ref future is discarded purposefully, it is still strongly +/// advisable to wait on it indirectly (via a \ref gate or +/// \ref semaphore), control their concurrency, their resource consumption +/// and handle any errors raised from them. +/// +/// \tparam T A list of types to be carried as the result of the future, +/// similar to \c std::tuple. 
An empty list (\c future<>) +/// means that there is no result, and an available future only +/// contains a success/failure indication (and in the case of a +/// failure, an exception). +/// A list with two or more types is deprecated; use +/// \c future> instead. +template +class SEASTAR_NODISCARD future : private internal::future_base { + using future_state = seastar::future_state>; + future_state _state; + static constexpr bool copy_noexcept = future_state::copy_noexcept; + using call_then_impl = internal::call_then_impl; + +private: + // This constructor creates a future that is not ready but has no + // associated promise yet. The use case is to have a less flexible + // but more efficient future/promise pair where we know that + // promise::set_value cannot possibly be called without a matching + // future and so that promise doesn't need to store a + // future_state. + future(future_for_get_promise_marker m) noexcept { } + + future(promise* pr) noexcept : future_base(pr, &_state), _state(std::move(pr->_local_state)) { } + template + future(ready_future_marker m, A&&... a) noexcept : _state(m, std::forward(a)...) 
{ } + future(future_state_base::current_exception_future_marker m) noexcept : _state(m) {} + future(future_state_base::nested_exception_marker m, future_state_base&& old) noexcept : _state(m, std::move(old)) {} + future(future_state_base::nested_exception_marker m, future_state_base&& n, future_state_base&& old) noexcept : _state(m, std::move(n), std::move(old)) {} + future(exception_future_marker m, std::exception_ptr&& ex) noexcept : _state(m, std::move(ex)) { } + future(exception_future_marker m, future_state_base&& state) noexcept : _state(m, std::move(state)) { } + [[gnu::always_inline]] + explicit future(future_state&& state) noexcept + : _state(std::move(state)) { + } + internal::promise_base_with_type get_promise() noexcept { + assert(!_promise); + return internal::promise_base_with_type(this); + } + internal::promise_base_with_type* detach_promise() noexcept { + return static_cast*>(future_base::detach_promise()); + } + void schedule(continuation_base* tws) noexcept { + future_base::schedule(tws, &tws->_state); + } + template + void schedule(Pr&& pr, Func&& func, Wrapper&& wrapper) noexcept { + // If this new throws a std::bad_alloc there is nothing that + // can be done about it. The corresponding future is not ready + // and we cannot break the chain. Since this function is + // noexcept, it will call std::terminate if new throws. + memory::scoped_critical_alloc_section _; + auto tws = new continuation(std::move(pr), std::move(func), std::move(wrapper)); + // In a debug build we schedule ready futures, but not in + // other build modes. 
+#ifdef SEASTAR_DEBUG + if (_state.available()) { + tws->set_state(std::move(_state)); + ::seastar::schedule(tws); + return; + } +#endif + schedule(tws); + _state._u.st = future_state_base::state::invalid; + } + + [[gnu::always_inline]] + future_state&& get_available_state_ref() noexcept { + if (_promise) { + detach_promise(); + } + return std::move(_state); + } + + future rethrow_with_nested(future_state_base&& n) noexcept { + return future(future_state_base::nested_exception_marker(), std::move(n), std::move(_state)); + } + + future rethrow_with_nested() noexcept { + return future(future_state_base::nested_exception_marker(), std::move(_state)); + } + + template + friend class shared_future; +public: + /// \brief The data type carried by the future. + using value_type = internal::future_stored_type_t; + using tuple_type = internal::future_tuple_type_t; + /// \brief The data type carried by the future. + using promise_type = promise; + /// \brief Moves the future into a new object. + [[gnu::always_inline]] + future(future&& x) noexcept : future_base(std::move(x), &_state), _state(std::move(x._state)) { } + future(const future&) = delete; + future& operator=(future&& x) noexcept { + clear(); + move_it(std::move(x), &_state); + _state = std::move(x._state); + return *this; + } + void operator=(const future&) = delete; + /// \brief gets the value returned by the computation + /// + /// Requires that the future be available. If the value + /// was computed successfully, it is returned (as an + /// \c std::tuple). Otherwise, an exception is thrown. + /// + /// If get() is called in a \ref seastar::thread context, + /// then it need not be available; instead, the thread will + /// be paused until the future becomes available. 
+ [[gnu::always_inline]] + value_type&& get() { + wait(); + return get_available_state_ref().take(); + } + + [[gnu::always_inline]] + std::exception_ptr get_exception() noexcept { + return get_available_state_ref().get_exception(); + } + + /// Gets the value returned by the computation. + /// + /// Similar to \ref get(), but instead of returning a + /// tuple, returns the first value of the tuple. This is + /// useful for the common case of a \c future with exactly + /// one type parameter. + /// + /// Equivalent to: \c std::get<0>(f.get()). + using get0_return_type = typename future_state::get0_return_type; + get0_return_type get0() { +#if SEASTAR_API_LEVEL < 5 + return future_state::get0(get()); +#else + return (get0_return_type)get(); +#endif + } + + /// Wait for the future to be available (in a seastar::thread) + /// + /// When called from a seastar::thread, this function blocks the + /// thread until the future is availble. Other threads and + /// continuations continue to execute; only the thread is blocked. + void wait() noexcept { + if (_state.available()) { + return; + } + do_wait(); + } + + /// \brief Checks whether the future is available. + /// + /// \return \c true if the future has a value, or has failed. + [[gnu::always_inline]] + bool available() const noexcept { + return _state.available(); + } + + /// \brief Checks whether the future has failed. + /// + /// \return \c true if the future is availble and has failed. + [[gnu::always_inline]] + bool failed() const noexcept { + return _state.failed(); + } + + /// \brief Schedule a block of code to run when the future is ready. + /// + /// Schedules a function (often a lambda) to run when the future becomes + /// available. The function is called with the result of this future's + /// computation as parameters. The return value of the function becomes + /// the return value of then(), itself as a future; this allows then() + /// calls to be chained. 
+ /// + /// If the future failed, the function is not called, and the exception + /// is propagated into the return value of then(). + /// + /// \param func - function to be called when the future becomes available, + /// unless it has failed. + /// \return a \c future representing the return value of \c func, applied + /// to the eventual value of this future. + template >> + SEASTAR_CONCEPT( requires std::invocable || internal::CanInvokeWhenAllSucceed) + Result + then(Func&& func) noexcept { + // The implementation of then() is customized via the call_then_impl helper + // template, in order to special case the results of when_all_succeed(). + // when_all_succeed() used to return a variadic future, which is deprecated, so + // now it returns a when_all_succeed_tuple, which we intercept in call_then_impl, + // and treat it as a variadic future. +#ifndef SEASTAR_TYPE_ERASE_MORE + return call_then_impl::run(*this, std::move(func)); +#else + using func_type = typename call_then_impl::template func_type; + noncopyable_function ncf; + { + memory::scoped_critical_alloc_section _; + ncf = noncopyable_function([func = std::forward(func)](auto&&... args) mutable { + return futurize_invoke(func, std::forward(args)...); + }); + } + return call_then_impl::run(*this, std::move(ncf)); +#endif + } + + /// \brief Schedule a block of code to run when the future is ready, unpacking tuples. + /// + /// Schedules a function (often a lambda) to run when the future becomes + /// available. The function is called with the result of this future's + /// computation as parameters. The return value of the function becomes + /// the return value of then(), itself as a future; this allows then() + /// calls to be chained. + /// + /// This member function is only available is the payload is std::tuple; + /// The tuple elements are passed as individual arguments to `func`, which + /// must have the same arity as the tuple. 
+ /// + /// If the future failed, the function is not called, and the exception + /// is propagated into the return value of then(). + /// + /// \param func - function to be called when the future becomes available, + /// unless it has failed. + /// \return a \c future representing the return value of \c func, applied + /// to the eventual value of this future. + template >> + SEASTAR_CONCEPT( requires ::seastar::CanApplyTuple) + Result + then_unpack(Func&& func) noexcept { + return then([func = std::forward(func)] (T&& SEASTAR_ELLIPSIS tuple) mutable { + // sizeof...(tuple) is required to be 1 + return std::apply(func, std::move(tuple) SEASTAR_ELLIPSIS); + }); + } + +private: + + // Keep this simple so that Named Return Value Optimization is used. + template + Result then_impl_nrvo(Func&& func) noexcept { + using futurator = futurize>; + typename futurator::type fut(future_for_get_promise_marker{}); + using pr_type = decltype(fut.get_promise()); + schedule(fut.get_promise(), std::move(func), [](pr_type&& pr, Func& func, future_state&& state) { + if (state.failed()) { + pr.set_exception(static_cast(std::move(state))); + } else { + futurator::satisfy_with_result_of(std::move(pr), [&func, &state] { +#if SEASTAR_API_LEVEL < 5 + return std::apply(func, std::move(state).get_value()); +#else + // clang thinks that "state" is not used, below, for future<>. + // Make it think it is used to avoid an unused-lambda-capture warning. 
+ (void)state; + return internal::future_invoke(func, std::move(state).get_value()); +#endif + }); + } + }); + return fut; + } + + template >> + Result + then_impl(Func&& func) noexcept { +#ifndef SEASTAR_DEBUG + using futurator = futurize>; + if (failed()) { + return futurator::make_exception_future(static_cast(get_available_state_ref())); + } else if (available()) { +#if SEASTAR_API_LEVEL < 5 + return futurator::apply(std::forward(func), get_available_state_ref().take_value()); +#else + return futurator::invoke(std::forward(func), get_available_state_ref().take_value()); +#endif + } +#endif + return then_impl_nrvo(std::forward(func)); + } + +public: + /// \brief Schedule a block of code to run when the future is ready, allowing + /// for exception handling. + /// + /// Schedules a function (often a lambda) to run when the future becomes + /// available. The function is called with the this future as a parameter; + /// it will be in an available state. The return value of the function becomes + /// the return value of then_wrapped(), itself as a future; this allows + /// then_wrapped() calls to be chained. + /// + /// Unlike then(), the function will be called for both value and exceptional + /// futures. + /// + /// \param func - function to be called when the future becomes available, + /// \return a \c future representing the return value of \c func, applied + /// to the eventual value of this future. 
+ template > + SEASTAR_CONCEPT( requires std::invocable ) + futurize_t + then_wrapped(Func&& func) & noexcept { + return then_wrapped_maybe_erase(std::forward(func)); + } + + template > + SEASTAR_CONCEPT( requires std::invocable ) + futurize_t + then_wrapped(Func&& func) && noexcept { + return then_wrapped_maybe_erase(std::forward(func)); + } + +private: + + template + futurize_t + then_wrapped_maybe_erase(Func&& func) noexcept { +#ifndef SEASTAR_TYPE_ERASE_MORE + return then_wrapped_common(std::forward(func)); +#else + using futurator = futurize; + using WrapFuncResult = typename futurator::type; + noncopyable_function ncf; + { + memory::scoped_critical_alloc_section _; + ncf = noncopyable_function([func = std::forward(func)](future&& f) mutable { + return futurator::invoke(func, std::move(f)); + }); + } + return then_wrapped_common(std::move(ncf)); +#endif + } + + // Keep this simple so that Named Return Value Optimization is used. + template + futurize_t + then_wrapped_nrvo(Func&& func) noexcept { + using futurator = futurize; + typename futurator::type fut(future_for_get_promise_marker{}); + using pr_type = decltype(fut.get_promise()); + schedule(fut.get_promise(), std::move(func), [](pr_type&& pr, Func& func, future_state&& state) { + futurator::satisfy_with_result_of(std::move(pr), [&func, &state] { + return func(future(std::move(state))); + }); + }); + return fut; + } + + + template + futurize_t + then_wrapped_common(Func&& func) noexcept { +#ifndef SEASTAR_DEBUG + using futurator = futurize; + if (available()) { + if constexpr (AsSelf) { + if (_promise) { + detach_promise(); + } + return futurator::invoke(std::forward(func), std::move(*this)); + } else { + return futurator::invoke(std::forward(func), future(get_available_state_ref())); + } + } +#endif + return then_wrapped_nrvo(std::forward(func)); + } + + void forward_to(internal::promise_base_with_type&& pr) noexcept { + if (_state.available()) { + pr.set_urgent_state(std::move(_state)); + } else { + 
*detach_promise() = std::move(pr); + } + } + +public: + /// \brief Satisfy some \ref promise object with this future as a result. + /// + /// Arranges so that when this future is resolve, it will be used to + /// satisfy an unrelated promise. This is similar to scheduling a + /// continuation that moves the result of this future into the promise + /// (using promise::set_value() or promise::set_exception(), except + /// that it is more efficient. + /// + /// \param pr a promise that will be fulfilled with the results of this + /// future. + void forward_to(promise&& pr) noexcept { + if (_state.available()) { + pr.set_urgent_state(std::move(_state)); + } else if (&pr._local_state != pr._state) { + // The only case when _state points to _local_state is + // when get_future was never called. Given that pr will + // soon be destroyed, we know get_future will never be + // called and we can just ignore this request. + *detach_promise() = std::move(pr); + } + } + + + + /** + * Finally continuation for statements that require waiting for the result. + * I.e. you need to "finally" call a function that returns a possibly + * unavailable future. The returned future will be "waited for", any + * exception generated will be propagated, but the return value is ignored. + * I.e. the original return value (the future upon which you are making this + * call) will be preserved. + * + * If the original return value or the callback return value is an + * exceptional future it will be propagated. + * + * If both of them are exceptional - the std::nested_exception exception + * with the callback exception on top and the original future exception + * nested will be propagated. 
+ */ + template + SEASTAR_CONCEPT( requires std::invocable ) + future finally(Func&& func) noexcept { + return then_wrapped(finally_body>::value>(std::forward(func))); + } + + + template + struct finally_body; + + template + struct finally_body { + Func _func; + + finally_body(Func&& func) noexcept : _func(std::forward(func)) + { } + + future operator()(future&& result) noexcept { + return futurize_invoke(_func).then_wrapped([result = std::move(result)](auto&& f_res) mutable { + if (!f_res.failed()) { + return std::move(result); + } else { + return result.rethrow_with_nested(std::move(f_res._state)); + } + }); + } + }; + + template + struct finally_body { + Func _func; + + finally_body(Func&& func) noexcept : _func(std::forward(func)) + { } + + future operator()(future&& result) noexcept { + try { + _func(); + return std::move(result); + } catch (...) { + return result.rethrow_with_nested(); + } + }; + }; + + /// \brief Terminate the program if this future fails. + /// + /// Terminates the entire program is this future resolves + /// to an exception. Use with caution. + future<> or_terminate() noexcept { + return then_wrapped([] (auto&& f) { + try { + f.get(); + } catch (...) { + engine_exit(std::current_exception()); + } + }); + } + + /// \brief Discards the value carried by this future. + /// + /// Converts the future into a no-value \c future<>, by + /// ignoring any result. Exceptions are propagated unchanged. + future<> discard_result() noexcept { + // We need the generic variadic lambda, below, because then() behaves differently + // when value_type is when_all_succeed_tuple + return then([] (auto&&...) {}); + } + + /// \brief Handle the exception carried by this future. + /// + /// When the future resolves, if it resolves with an exception, + /// handle_exception(func) replaces the exception with the value + /// returned by func. 
The exception is passed (as a std::exception_ptr) + /// as a parameter to func; func may return the replacement value + /// immediately (T or std::tuple) or in the future (future) + /// and is even allowed to return (or throw) its own exception. + /// + /// The idiom fut.discard_result().handle_exception(...) can be used + /// to handle an exception (if there is one) without caring about the + /// successful value; Because handle_exception() is used here on a + /// future<>, the handler function does not need to return anything. + template + /* Broken? + SEASTAR_CONCEPT( requires ::seastar::InvokeReturns, std::exception_ptr> + || (sizeof...(T) == 0 && ::seastar::InvokeReturns) + || (sizeof...(T) == 1 && ::seastar::InvokeReturns) + ) */ + future handle_exception(Func&& func) noexcept { + return then_wrapped([func = std::forward(func)] + (auto&& fut) mutable -> future { + if (!fut.failed()) { + return make_ready_future(fut.get()); + } else { + return futurize_invoke(func, fut.get_exception()); + } + }); + } + + /// \brief Handle the exception of a certain type carried by this future. + /// + /// When the future resolves, if it resolves with an exception of a type that + /// provided callback receives as a parameter, handle_exception(func) replaces + /// the exception with the value returned by func. The exception is passed (by + /// reference) as a parameter to func; func may return the replacement value + /// immediately (T or std::tuple) or in the future (future) + /// and is even allowed to return (or throw) its own exception. + /// If exception, that future holds, does not match func parameter type + /// it is propagated as is. 
+ template + future handle_exception_type(Func&& func) noexcept { + using trait = function_traits; + static_assert(trait::arity == 1, "func can take only one parameter"); + using ex_type = typename trait::template arg<0>::type; + return then_wrapped([func = std::forward(func)] + (auto&& fut) mutable -> future { + try { + return make_ready_future(fut.get()); + } catch(ex_type& ex) { + return futurize_invoke(func, ex); + } + }); + } + + /// \brief Ignore any result hold by this future + /// + /// Ignore any result (value or exception) hold by this future. + /// Use with caution since usually ignoring exception is not what + /// you want + void ignore_ready_future() noexcept { + _state.ignore(); + } + +#ifdef SEASTAR_COROUTINES_ENABLED + using future_base::set_coroutine; +#endif +private: + void set_callback(continuation_base* callback) noexcept { + if (_state.available()) { + callback->set_state(get_available_state_ref()); + ::seastar::schedule(callback); + } else { + assert(_promise); + schedule(callback); + } + + } + + /// \cond internal + template + friend class future; + template + friend class promise; + template + friend struct futurize; + template + friend class internal::promise_base_with_type; + template + friend future make_ready_future(A&&... value) noexcept; + template + friend future make_exception_future(std::exception_ptr&& ex) noexcept; + template + friend future make_exception_future(Exception&& ex) noexcept; + template + friend future internal::make_exception_future(future_state_base&& state) noexcept; + template + friend future current_exception_as_future() noexcept; + template + friend void internal::set_callback(future&, V*) noexcept; + template + friend struct internal::call_then_impl; + /// \endcond +}; + + +namespace internal { +template +struct futurize_base { + /// If \c T is a future, \c T; otherwise \c future + using type = future; + /// The promise type associated with \c type. 
+ using promise_type = promise; + using promise_base_with_type = internal::promise_base_with_type; + + /// Convert a value or a future to a future + static inline type convert(T&& value) { return make_ready_future(std::move(value)); } + static inline type convert(type&& value) { return std::move(value); } + + /// Makes an exceptional future of type \ref type. + template + static inline type make_exception_future(Arg&& arg) noexcept; +}; + +template <> +struct futurize_base { + using type = future<>; + using promise_type = promise<>; + using promise_base_with_type = internal::promise_base_with_type<>; + + static inline type convert(type&& value) { + return std::move(value); + } + template + static inline type make_exception_future(Arg&& arg) noexcept; +}; + +template +struct futurize_base> : public futurize_base {}; + +template <> +struct futurize_base> : public futurize_base {}; +} + +template +struct futurize : public internal::futurize_base { + using base = internal::futurize_base; + using type = typename base::type; + using promise_type = typename base::promise_type; + using promise_base_with_type = typename base::promise_base_with_type; + /// The value tuple type associated with \c type + using value_type = typename type::value_type; + using tuple_type = typename type::tuple_type; + using base::convert; + using base::make_exception_future; + + /// Apply a function to an argument list (expressed as a tuple) + /// and return the result, as a future (if it wasn't already). + template + static inline type apply(Func&& func, std::tuple&& args) noexcept; + + /// Invoke a function to an argument list + /// and return the result, as a future (if it wasn't already). + template + static inline type invoke(Func&& func, FuncArgs&&... 
args) noexcept; + + template + static inline type invoke(Func&& func, internal::monostate) noexcept { + return invoke(std::forward(func)); + } + + /// Deprecated alias of invoke + template + [[deprecated("Use invoke for varargs")]] + static inline type apply(Func&& func, FuncArgs&&... args) noexcept { + return invoke(std::forward(func), std::forward(args)...); + } + + static type current_exception_as_future() noexcept { + return type(future_state_base::current_exception_future_marker()); + } + + /// Convert the tuple representation into a future + static type from_tuple(tuple_type&& value) { + return type(ready_future_marker(), std::move(value)); + } + /// Convert the tuple representation into a future + static type from_tuple(const tuple_type& value) { + return type(ready_future_marker(), value); + } + +#if SEASTAR_API_LEVEL >= 5 + /// Convert the tuple representation into a future + static type from_tuple(value_type&& value) { + return type(ready_future_marker(), std::move(value)); + } + /// Convert the tuple representation into a future + static type from_tuple(const value_type& value) { + return type(ready_future_marker(), value); + } +#endif +private: + /// Forwards the result of, or exception thrown by, func() to the + /// promise. This avoids creating a future if func() doesn't + /// return one. 
+ template + SEASTAR_CONCEPT( requires std::invocable ) + static void satisfy_with_result_of(promise_base_with_type&&, Func&& func); + + template + friend class future; +}; + +inline internal::promise_base::promise_base(future_base* future, future_state_base* state) noexcept + : _future(future), _state(state) { + _future->_promise = this; +} + +template +inline +future +promise::get_future() noexcept { + assert(!this->_future && this->_state && !this->_task); + return future(this); +} + +template +inline +void promise::move_it(promise&& x) noexcept { + if (this->_state == &x._local_state) { + this->_state = &_local_state; + new (&_local_state) future_state(std::move(x._local_state)); + } +} + +template +inline +future make_ready_future(A&&... value) noexcept { + return future(ready_future_marker(), std::forward(value)...); +} + +template +inline +future make_exception_future(std::exception_ptr&& ex) noexcept { + return future(exception_future_marker(), std::move(ex)); +} + +template +inline +future internal::make_exception_future(future_state_base&& state) noexcept { + return future(exception_future_marker(), std::move(state)); +} + +template +future current_exception_as_future() noexcept { + return future(future_state_base::current_exception_future_marker()); +} + +void log_exception_trace() noexcept; + +/// \brief Creates a \ref future in an available, failed state. +/// +/// Creates a \ref future object that is already resolved in a failed +/// state. This no I/O needs to be performed to perform a computation +/// (for example, because the connection is closed and we cannot read +/// from it). 
+template +inline +future make_exception_future(Exception&& ex) noexcept { + log_exception_trace(); + return make_exception_future(std::make_exception_ptr(std::forward(ex))); +} + +template +future make_exception_future_with_backtrace(Exception&& ex) noexcept { + return make_exception_future(make_backtraced_exception_ptr(std::forward(ex))); +} + +/// @} + +/// \cond internal + +template +template +typename futurize::type futurize::apply(Func&& func, std::tuple&& args) noexcept { + try { + using ret_t = decltype(std::apply(std::forward(func), std::move(args))); + if constexpr (std::is_void_v) { + std::apply(std::forward(func), std::move(args)); + return make_ready_future<>(); + } else if constexpr (is_future::value){ + return std::apply(std::forward(func), std::move(args)); + } else { + return convert(std::apply(std::forward(func), std::move(args))); + } + } catch (...) { + return current_exception_as_future(); + } +} + +template +template +SEASTAR_CONCEPT( requires std::invocable ) +void futurize::satisfy_with_result_of(promise_base_with_type&& pr, Func&& func) { + using ret_t = decltype(func()); + if constexpr (std::is_void_v) { + func(); + pr.set_value(); + } else if constexpr (is_future::value) { + func().forward_to(std::move(pr)); + } else { + pr.set_value(func()); + } +} + +template +template +typename futurize::type futurize::invoke(Func&& func, FuncArgs&&... args) noexcept { + try { + using ret_t = decltype(func(std::forward(args)...)); + if constexpr (std::is_void_v) { + func(std::forward(args)...); + return make_ready_future<>(); + } else if constexpr (is_future::value) { + return func(std::forward(args)...); + } else { + return convert(func(std::forward(args)...)); + } + } catch (...) 
{ + return current_exception_as_future(); + } +} + +template +template +inline +future +internal::futurize_base::make_exception_future(Arg&& arg) noexcept { + using ::seastar::make_exception_future; + using ::seastar::internal::make_exception_future; + return make_exception_future(std::forward(arg)); +} + +template +inline +future<> +internal::futurize_base::make_exception_future(Arg&& arg) noexcept { + using ::seastar::make_exception_future; + using ::seastar::internal::make_exception_future; + return make_exception_future<>(std::forward(arg)); +} + +template +auto futurize_invoke(Func&& func, Args&&... args) noexcept { + using futurator = futurize>; + return futurator::invoke(std::forward(func), std::forward(args)...); +} + +template +[[deprecated("Use futurize_invoke for varargs")]] +auto futurize_apply(Func&& func, Args&&... args) noexcept { + return futurize_invoke(std::forward(func), std::forward(args)...); +} + +template +auto futurize_apply(Func&& func, std::tuple&& args) noexcept { + using futurator = futurize>; + return futurator::apply(std::forward(func), std::move(args)); +} + +namespace internal { + +template +inline +void set_callback(future& fut, U* callback) noexcept { + // It would be better to use continuation_base for U, but + // then a derived class of continuation_base won't be matched + return fut.set_callback(callback); +} + +} + + +/// \endcond + +} diff --git a/src/seastar/include/seastar/core/gate.hh b/src/seastar/include/seastar/core/gate.hh new file mode 100644 index 000000000..5d440c07f --- /dev/null +++ b/src/seastar/include/seastar/core/gate.hh @@ -0,0 +1,170 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. 
+ * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright 2014 Cloudius Systems + */ + +#pragma once + +#include +#include +#include + +namespace seastar { + +/// \addtogroup fiber-module +/// @{ + +/// Exception thrown when a \ref gate object has been closed +/// by the \ref gate::close() method. +class gate_closed_exception : public std::exception { +public: + virtual const char* what() const noexcept override { + return "gate closed"; + } +}; + +/// Facility to stop new requests, and to tell when existing requests are done. +/// +/// When stopping a service that serves asynchronous requests, we are faced with +/// two problems: preventing new requests from coming in, and knowing when existing +/// requests have completed. The \c gate class provides a solution. +class gate { + size_t _count = 0; + std::optional> _stopped; +public: + /// Tries to register an in-progress request. + /// + /// If the gate is not closed, the request is registered and the function returns `true`, + /// Otherwise the function just returns `false` and has no other effect. + bool try_enter() noexcept { + bool opened = !_stopped; + if (opened) { + ++_count; + } + return opened; + } + /// Registers an in-progress request. + /// + /// If the gate is not closed, the request is registered. Otherwise, + /// a \ref gate_closed_exception is thrown. + void enter() { + if (!try_enter()) { + throw gate_closed_exception(); + } + } + /// Unregisters an in-progress request. + /// + /// If the gate is closed, and there are no more in-progress requests, + /// the `_stopped` promise will be fulfilled. 
+ void leave() noexcept { + --_count; + if (!_count && _stopped) { + _stopped->set_value(); + } + } + /// Potentially stop an in-progress request. + /// + /// If the gate is already closed, a \ref gate_closed_exception is thrown. + /// By using \ref enter() and \ref leave(), the program can ensure that + /// no further requests are serviced. However, long-running requests may + /// continue to run. The check() method allows such a long operation to + /// voluntarily stop itself after the gate is closed, by making calls to + /// check() in appropriate places. check() with throw an exception and + /// bail out of the long-running code if the gate is closed. + void check() { + if (_stopped) { + throw gate_closed_exception(); + } + } + /// Closes the gate. + /// + /// Future calls to \ref enter() will fail with an exception, and when + /// all current requests call \ref leave(), the returned future will be + /// made ready. + future<> close() noexcept { + assert(!_stopped && "seastar::gate::close() cannot be called more than once"); + _stopped = std::make_optional(promise<>()); + if (!_count) { + _stopped->set_value(); + } + return _stopped->get_future(); + } + + /// Returns a current number of registered in-progress requests. + size_t get_count() const noexcept { + return _count; + } + + /// Returns whether the gate is closed. + bool is_closed() const noexcept { + return bool(_stopped); + } +}; + +namespace internal { + +template +inline +auto +invoke_func_with_gate(gate& g, Func&& func) noexcept { + return futurize_invoke(std::forward(func)).finally([&g] { g.leave(); }); +} + +} // namespace intgernal + +/// Executes the function \c func making sure the gate \c g is properly entered +/// and later on, properly left. +/// +/// \param func function to be executed +/// \param g the gate. Caller must make sure that it outlives this function. 
+/// \returns whatever \c func returns +/// +/// \relates gate +template +inline +auto +with_gate(gate& g, Func&& func) { + g.enter(); + return internal::invoke_func_with_gate(g, std::forward(func)); +} + +/// Executes the function \c func if the gate \c g can be entered +/// and later on, properly left. +/// +/// \param func function to be executed +/// \param g the gate. Caller must make sure that it outlives this function. +/// +/// If the gate is already closed, an exception future holding +/// \ref gate_closed_exception is returned, otherwise +/// \returns whatever \c func returns. +/// +/// \relates gate +template +inline +auto +try_with_gate(gate& g, Func&& func) noexcept { + if (!g.try_enter()) { + using futurator = futurize>; + return futurator::make_exception_future(gate_closed_exception()); + } + return internal::invoke_func_with_gate(g, std::forward(func)); +} +/// @} + +} diff --git a/src/seastar/include/seastar/core/idle_cpu_handler.hh b/src/seastar/include/seastar/core/idle_cpu_handler.hh new file mode 100644 index 000000000..793fc422a --- /dev/null +++ b/src/seastar/include/seastar/core/idle_cpu_handler.hh @@ -0,0 +1,59 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +/* + * Copyright 2020 ScyllaDB + */ + +#pragma once + +#include + +/// \file + +namespace seastar { + +/// Indicates the outcome of a user callback installed to take advantage of +/// idle CPU cycles. +enum class idle_cpu_handler_result { + no_more_work, //!< The user callback has no more work to perform + interrupted_by_higher_priority_task //!< A call to the work_waiting_on_reactor parameter to idle_cpu_handler returned `true` +}; + +/// Signature of a callback provided by the reactor to a user callback installed to take +/// advantage of idle cpu cycles, used to periodically check if the CPU is still idle. +/// +/// \return true if the reactor has new work to do +using work_waiting_on_reactor = const noncopyable_function&; + +/// Signature of a callback provided by the user, that the reactor calls when it has idle cycles. +/// +/// The `poll` parameter is a work_waiting_on_reactor function that should be periodically called +/// to check if the idle callback should return with idle_cpu_handler_result::interrupted_by_higher_priority_task +using idle_cpu_handler = noncopyable_function; + +/// Set a handler that will be called when there is no task to execute on cpu. +/// Handler should do a low priority work. +/// +/// Handler's return value determines whether handler did any actual work. If no work was done then reactor will go +/// into sleep. +/// +/// Handler's argument is a function that returns true if a task which should be executed on cpu appears or false +/// otherwise. This function should be used by a handler to return early if a task appears. 
+void set_idle_cpu_handler(idle_cpu_handler&& handler); + +} diff --git a/src/seastar/include/seastar/core/internal/api-level.hh b/src/seastar/include/seastar/core/internal/api-level.hh new file mode 100644 index 000000000..c18e03f82 --- /dev/null +++ b/src/seastar/include/seastar/core/internal/api-level.hh @@ -0,0 +1,82 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +/* + * Copyright (C) 2019 ScyllaDB + */ + +#pragma once + +// For IDEs that don't see SEASTAR_API_LEVEL, generate a nice default +#ifndef SEASTAR_API_LEVEL +#define SEASTAR_API_LEVEL 3 +#endif + +#if SEASTAR_API_LEVEL == 6 +#define SEASTAR_INCLUDE_API_V6 inline +#else +#define SEASTAR_INCLUDE_API_V6 +#endif + +#if SEASTAR_API_LEVEL == 5 +#define SEASTAR_INCLUDE_API_V5 inline +#else +#define SEASTAR_INCLUDE_API_V5 +#endif + +#if SEASTAR_API_LEVEL == 4 +#define SEASTAR_INCLUDE_API_V4 inline +#else +#define SEASTAR_INCLUDE_API_V4 +#endif + +#if SEASTAR_API_LEVEL == 3 +#define SEASTAR_INCLUDE_API_V3 inline +#else +#define SEASTAR_INCLUDE_API_V3 +#endif + +#if SEASTAR_API_LEVEL == 2 +#define SEASTAR_INCLUDE_API_V2 inline +#else +#define SEASTAR_INCLUDE_API_V2 +#endif + +// Declare them here so we don't have to use the macros everywhere +namespace seastar { + SEASTAR_INCLUDE_API_V2 namespace api_v2 { + } + SEASTAR_INCLUDE_API_V3 namespace api_v3 { + inline namespace and_newer { + } + } + SEASTAR_INCLUDE_API_V4 namespace api_v4 { + inline namespace and_newer { + using namespace api_v3::and_newer; + } + } + SEASTAR_INCLUDE_API_V5 namespace api_v5 { + inline namespace and_newer { + using namespace api_v4::and_newer; + } + } + SEASTAR_INCLUDE_API_V6 namespace api_v6 { + inline namespace and_newer { + using namespace api_v5::and_newer; + } + } +} diff --git a/src/seastar/include/seastar/core/internal/buffer_allocator.hh b/src/seastar/include/seastar/core/internal/buffer_allocator.hh new file mode 100644 index 000000000..0cdd39e75 --- /dev/null +++ b/src/seastar/include/seastar/core/internal/buffer_allocator.hh @@ -0,0 +1,43 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. 
+ * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2020 ScyllaDB + */ + +#pragma once + +namespace seastar { + +template +class temporary_buffer; + +namespace internal { + +// Internal interface for allocating buffers for reads. Used to decouple +// allocation strategies (where to allocate from, and what sizes) from the +// point where allocation happens, to make it as late as possible. +class buffer_allocator { +public: + virtual ~buffer_allocator() = default; + virtual temporary_buffer allocate_buffer() = 0; +}; + + +} + +} diff --git a/src/seastar/include/seastar/core/internal/io_desc.hh b/src/seastar/include/seastar/core/internal/io_desc.hh new file mode 100644 index 000000000..74d1cf90f --- /dev/null +++ b/src/seastar/include/seastar/core/internal/io_desc.hh @@ -0,0 +1,35 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2019 ScyllaDB Ltd. 
+ */ + +#pragma once + +#include +#include + +namespace seastar { + +class kernel_completion { +protected: + ~kernel_completion() = default; +public: + virtual void complete_with(ssize_t res) = 0; +}; +} diff --git a/src/seastar/include/seastar/core/internal/io_request.hh b/src/seastar/include/seastar/core/internal/io_request.hh new file mode 100644 index 000000000..f8d3c9af8 --- /dev/null +++ b/src/seastar/include/seastar/core/internal/io_request.hh @@ -0,0 +1,266 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright 2020 ScyllaDB + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace seastar { +namespace internal { + +class io_request { +public: + enum class operation { read, readv, write, writev, fdatasync, recv, recvmsg, send, sendmsg, accept, connect, poll_add, poll_remove, cancel }; +private: + operation _op; + int _fd; + union { + uint64_t pos; + int flags; + int events; + } _attr; + // the upper layers give us void pointers, but storing void pointers here is just + // dangerous. The constructors seem to be happy to convert other pointers to void*, + // even if they are marked as explicit, and then you end up losing approximately 3 hours + // and 15 minutes (hypothetically, of course), trying to chase the weirdest bug. 
+ // Let's store a char* for safety, and cast it back to void* in the accessor. + union { + char* addr; + ::iovec* iovec; + ::msghdr* msghdr; + ::sockaddr* sockaddr; + } _ptr; + + // accept wants a socklen_t*, connect wants a socklen_t + union { + size_t len; + socklen_t* socklen_ptr; + socklen_t socklen; + } _size; + kernel_completion* _kernel_completion; + + explicit io_request(operation op, int fd, int flags, ::msghdr* msg) + : _op(op) + , _fd(fd) + { + _attr.flags = flags; + _ptr.msghdr = msg; + } + + explicit io_request(operation op, int fd, sockaddr* sa, socklen_t sl) + : _op(op) + , _fd(fd) + { + _ptr.sockaddr = sa; + _size.socklen = sl; + } + + explicit io_request(operation op, int fd, int flags, sockaddr* sa, socklen_t* sl) + : _op(op) + , _fd(fd) + { + _attr.flags = flags; + _ptr.sockaddr = sa; + _size.socklen_ptr = sl; + } + explicit io_request(operation op, int fd, uint64_t pos, char* ptr, size_t size) + : _op(op) + , _fd(fd) + { + _attr.pos = pos; + _ptr.addr = ptr; + _size.len = size; + } + + explicit io_request(operation op, int fd, uint64_t pos, iovec* ptr, size_t size) + : _op(op) + , _fd(fd) + { + _attr.pos = pos; + _ptr.iovec = ptr; + _size.len = size; + } + + explicit io_request(operation op, int fd) + : _op(op) + , _fd(fd) + {} + explicit io_request(operation op, int fd, int events) + : _op(op) + , _fd(fd) + { + _attr.events = events; + } + + explicit io_request(operation op, int fd, char *ptr) + : _op(op) + , _fd(fd) + { + _ptr.addr = ptr; + } +public: + bool is_read() const { + switch (_op) { + case operation::read: + case operation::readv: + case operation::recvmsg: + case operation::recv: + return true; + default: + return false; + } + } + + bool is_write() const { + switch (_op) { + case operation::write: + case operation::writev: + case operation::send: + case operation::sendmsg: + return true; + default: + return false; + } + } + + sstring opname() const; + + operation opcode() const { + return _op; + } + + int fd() const { + return _fd; 
+ } + + uint64_t pos() const { + return _attr.pos; + } + + int flags() const { + return _attr.flags; + } + + int events() const { + return _attr.events; + } + + void* address() const { + return reinterpret_cast(_ptr.addr); + } + + iovec* iov() const { + return _ptr.iovec; + } + + ::sockaddr* posix_sockaddr() const { + return _ptr.sockaddr; + } + + ::msghdr* msghdr() const { + return _ptr.msghdr; + } + + size_t size() const { + return _size.len; + } + + size_t iov_len() const { + return _size.len; + } + + socklen_t socklen() const { + return _size.socklen; + } + + socklen_t* socklen_ptr() const { + return _size.socklen_ptr; + } + + void attach_kernel_completion(kernel_completion* kc) { + _kernel_completion = kc; + } + + kernel_completion* get_kernel_completion() const { + return _kernel_completion; + } + + static io_request make_read(int fd, uint64_t pos, void* address, size_t size) { + return io_request(operation::read, fd, pos, reinterpret_cast(address), size); + } + + static io_request make_readv(int fd, uint64_t pos, std::vector& iov) { + return io_request(operation::readv, fd, pos, iov.data(), iov.size()); + } + + static io_request make_recv(int fd, void* address, size_t size, int flags) { + return io_request(operation::recv, fd, flags, reinterpret_cast(address), size); + } + + static io_request make_recvmsg(int fd, ::msghdr* msg, int flags) { + return io_request(operation::recvmsg, fd, flags, msg); + } + + static io_request make_send(int fd, const void* address, size_t size, int flags) { + return io_request(operation::send, fd, flags, const_cast(reinterpret_cast(address)), size); + } + + static io_request make_sendmsg(int fd, ::msghdr* msg, int flags) { + return io_request(operation::sendmsg, fd, flags, msg); + } + + static io_request make_write(int fd, uint64_t pos, const void* address, size_t size) { + return io_request(operation::write, fd, pos, const_cast(reinterpret_cast(address)), size); + } + + static io_request make_writev(int fd, uint64_t pos, 
std::vector& iov) { + return io_request(operation::writev, fd, pos, iov.data(), iov.size()); + } + + static io_request make_fdatasync(int fd) { + return io_request(operation::fdatasync, fd); + } + + static io_request make_accept(int fd, struct sockaddr* addr, socklen_t* addrlen, int flags) { + return io_request(operation::accept, fd, flags, addr, addrlen); + } + + static io_request make_connect(int fd, struct sockaddr* addr, socklen_t addrlen) { + return io_request(operation::connect, fd, addr, addrlen); + } + + static io_request make_poll_add(int fd, int events) { + return io_request(operation::poll_add, fd, events); + } + + static io_request make_poll_remove(int fd, void *addr) { + return io_request(operation::poll_remove, fd, reinterpret_cast(addr)); + } + static io_request make_cancel(int fd, void *addr) { + return io_request(operation::cancel, fd, reinterpret_cast(addr)); + } +}; +} +} diff --git a/src/seastar/include/seastar/core/internal/poll.hh b/src/seastar/include/seastar/core/internal/poll.hh new file mode 100644 index 000000000..1816a60de --- /dev/null +++ b/src/seastar/include/seastar/core/internal/poll.hh @@ -0,0 +1,59 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +/* + * Copyright 2019 ScyllaDB + */ + +#pragma once + +namespace seastar { + +struct pollfn { + virtual ~pollfn() {} + // Returns true if work was done (false = idle) + virtual bool poll() = 0; + // Checks if work needs to be done, but without actually doing any + // returns true if works needs to be done (false = idle) + virtual bool pure_poll() = 0; + // Tries to enter interrupt mode. + // + // If it returns true, then events from this poller will wake + // a sleeping idle loop, and exit_interrupt_mode() must be called + // to return to normal polling. + // + // If it returns false, the sleeping idle loop may not be entered. + virtual bool try_enter_interrupt_mode() = 0; + virtual void exit_interrupt_mode() = 0; +}; + +// The common case for poller -- do not make any difference between +// poll() and pure_poll(), always/never agree to go to sleep and do +// nothing on wakeup. +template +struct simple_pollfn : public pollfn { + virtual bool pure_poll() override final { + return poll(); + } + virtual bool try_enter_interrupt_mode() override final { + return Passive; + } + virtual void exit_interrupt_mode() override final { + } +}; + +} diff --git a/src/seastar/include/seastar/core/internal/pollable_fd.hh b/src/seastar/include/seastar/core/internal/pollable_fd.hh new file mode 100644 index 000000000..c978877c2 --- /dev/null +++ b/src/seastar/include/seastar/core/internal/pollable_fd.hh @@ -0,0 +1,219 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. 
+ * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright 2019 ScyllaDB + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace seastar { + +class reactor; +class pollable_fd; +class pollable_fd_state; +class socket_address; + +namespace internal { + +class buffer_allocator; + +} + +namespace net { + +class packet; + +} + +class pollable_fd_state; + +using pollable_fd_state_ptr = boost::intrusive_ptr; + +class pollable_fd_state { + unsigned _refs = 0; +public: + virtual ~pollable_fd_state() {} + struct speculation { + int events = 0; + explicit speculation(int epoll_events_guessed = 0) : events(epoll_events_guessed) {} + }; + pollable_fd_state(const pollable_fd_state&) = delete; + void operator=(const pollable_fd_state&) = delete; + void speculate_epoll(int events) { events_known |= events; } + file_desc fd; + bool events_rw = false; // single consumer for both read and write (accept()) + bool no_more_recv = false; // For udp, there is no shutdown indication from the kernel + bool no_more_send = false; // For udp, there is no shutdown indication from the kernel + int events_requested = 0; // wanted by pollin/pollout promises + int events_epoll = 0; // installed in epoll + int events_known = 0; // returned from epoll + + friend class reactor; + friend class pollable_fd; + + future read_some(char* buffer, size_t size); + future read_some(uint8_t* buffer, size_t size); + future read_some(const std::vector& iov); + future> read_some(internal::buffer_allocator* ba); + future<> write_all(const char* buffer, size_t size); + future<> 
write_all(const uint8_t* buffer, size_t size); + future write_some(net::packet& p); + future<> write_all(net::packet& p); + future<> readable(); + future<> writeable(); + future<> readable_or_writeable(); + void abort_reader(); + void abort_writer(); + future> accept(); + future<> connect(socket_address& sa); + future sendmsg(struct msghdr *msg); + future recvmsg(struct msghdr *msg); + future sendto(socket_address addr, const void* buf, size_t len); + +protected: + explicit pollable_fd_state(file_desc fd, speculation speculate = speculation()) + : fd(std::move(fd)), events_known(speculate.events) {} +private: + void maybe_no_more_recv(); + void maybe_no_more_send(); + void forget(); // called on end-of-life + + friend void intrusive_ptr_add_ref(pollable_fd_state* fd) { + ++fd->_refs; + } + friend void intrusive_ptr_release(pollable_fd_state* fd); +}; + +class pollable_fd { +public: + using speculation = pollable_fd_state::speculation; + pollable_fd() = default; + pollable_fd(file_desc fd, speculation speculate = speculation()); +public: + future read_some(char* buffer, size_t size) { + return _s->read_some(buffer, size); + } + future read_some(uint8_t* buffer, size_t size) { + return _s->read_some(buffer, size); + } + future read_some(const std::vector& iov) { + return _s->read_some(iov); + } + future> read_some(internal::buffer_allocator* ba) { + return _s->read_some(ba); + } + future<> write_all(const char* buffer, size_t size) { + return _s->write_all(buffer, size); + } + future<> write_all(const uint8_t* buffer, size_t size) { + return _s->write_all(buffer, size); + } + future write_some(net::packet& p) { + return _s->write_some(p); + } + future<> write_all(net::packet& p) { + return _s->write_all(p); + } + future<> readable() { + return _s->readable(); + } + future<> writeable() { + return _s->writeable(); + } + future<> readable_or_writeable() { + return _s->readable_or_writeable(); + } + void abort_reader() { + return _s->abort_reader(); + } + void 
abort_writer() { + return _s->abort_writer(); + } + future> accept() { + return _s->accept(); + } + future<> connect(socket_address& sa) { + return _s->connect(sa); + } + future sendmsg(struct msghdr *msg) { + return _s->sendmsg(msg); + } + future recvmsg(struct msghdr *msg) { + return _s->recvmsg(msg); + } + future sendto(socket_address addr, const void* buf, size_t len) { + return _s->sendto(addr, buf, len); + } + file_desc& get_file_desc() const { return _s->fd; } + void shutdown(int how); + void close() { _s.reset(); } + explicit operator bool() const noexcept { + return bool(_s); + } +protected: + int get_fd() const { return _s->fd.get(); } + void maybe_no_more_recv() { return _s->maybe_no_more_recv(); } + void maybe_no_more_send() { return _s->maybe_no_more_send(); } + friend class reactor; + friend class readable_eventfd; + friend class writeable_eventfd; + friend class aio_storage_context; +private: + pollable_fd_state_ptr _s; +}; + +class writeable_eventfd; + +class readable_eventfd { + pollable_fd _fd; +public: + explicit readable_eventfd(size_t initial = 0) : _fd(try_create_eventfd(initial)) {} + readable_eventfd(readable_eventfd&&) = default; + writeable_eventfd write_side(); + future wait(); + int get_write_fd() { return _fd.get_fd(); } +private: + explicit readable_eventfd(file_desc&& fd) : _fd(std::move(fd)) {} + static file_desc try_create_eventfd(size_t initial); + + friend class writeable_eventfd; +}; + +class writeable_eventfd { + file_desc _fd; +public: + explicit writeable_eventfd(size_t initial = 0) : _fd(try_create_eventfd(initial)) {} + writeable_eventfd(writeable_eventfd&&) = default; + readable_eventfd read_side(); + void signal(size_t nr); + int get_read_fd() { return _fd.get(); } +private: + explicit writeable_eventfd(file_desc&& fd) : _fd(std::move(fd)) {} + static file_desc try_create_eventfd(size_t initial); + + friend class readable_eventfd; +}; + +} diff --git a/src/seastar/include/seastar/core/io_queue.hh 
b/src/seastar/include/seastar/core/io_queue.hh new file mode 100644 index 000000000..dd7c1b1ab --- /dev/null +++ b/src/seastar/include/seastar/core/io_queue.hh @@ -0,0 +1,170 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright 2019 ScyllaDB + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace seastar { + +class io_priority_class; + +/// Renames an io priority class +/// +/// Renames an `io_priority_class` previously created with register_one_priority_class(). +/// +/// The operation is global and affects all shards. +/// The operation affects the exported statistics labels. 
+/// +/// \param pc The io priority class to be renamed +/// \param new_name The new name for the io priority class +/// \return a future that is ready when the io priority class have been renamed +future<> +rename_priority_class(io_priority_class pc, sstring new_name); + +namespace internal { +namespace linux_abi { + +struct io_event; +struct iocb; + +} +} + +using shard_id = unsigned; + +class io_priority_class; + +class io_queue { +private: + struct priority_class_data { + priority_class_ptr ptr; + size_t bytes; + uint64_t ops; + uint32_t nr_queued; + std::chrono::duration queue_time; + metrics::metric_groups _metric_groups; + priority_class_data(sstring name, sstring mountpoint, priority_class_ptr ptr, shard_id owner); + void rename(sstring new_name, sstring mountpoint, shard_id owner); + private: + void register_stats(sstring name, sstring mountpoint, shard_id owner); + }; + + std::vector>> _priority_classes; + fair_queue _fq; + + static constexpr unsigned _max_classes = 2048; + static std::mutex _register_lock; + static std::array _registered_shares; + static std::array _registered_names; + +public: + static io_priority_class register_one_priority_class(sstring name, uint32_t shares); + static bool rename_one_priority_class(io_priority_class pc, sstring name); + +private: + priority_class_data& find_or_create_class(const io_priority_class& pc, shard_id owner); + + fair_queue_ticket request_fq_ticket(const internal::io_request& req, size_t len) const; + + // The fields below are going away, they are just here so we can implement deprecated + // functions that used to be provided by the fair_queue and are going away (from both + // the fair_queue and the io_queue). Double-accounting for now will allow for easier + // decoupling and is temporary + size_t _queued_requests = 0; + size_t _requests_executing = 0; +public: + // We want to represent the fact that write requests are (maybe) more expensive + // than read requests. 
To avoid dealing with floating point math we will scale one + // read request to be counted by this amount. + // + // A write request that is 30% more expensive than a read will be accounted as + // (read_request_base_count * 130) / 100. + // It is also technically possible for reads to be the expensive ones, in which case + // writes will have an integer value lower than read_request_base_count. + static constexpr unsigned read_request_base_count = 128; + + struct config { + dev_t devid; + shard_id coordinator; + unsigned capacity = std::numeric_limits::max(); + unsigned max_req_count = std::numeric_limits::max(); + unsigned max_bytes_count = std::numeric_limits::max(); + unsigned disk_req_write_to_read_multiplier = read_request_base_count; + unsigned disk_bytes_write_to_read_multiplier = read_request_base_count; + sstring mountpoint = "undefined"; + }; + + io_queue(config cfg); + ~io_queue(); + + future + queue_request(const io_priority_class& pc, size_t len, internal::io_request req) noexcept; + + [[deprecated("modern I/O queues should use a property file")]] size_t capacity() const { + return _config.capacity; + } + + [[deprecated("I/O queue users should not track individual requests, but resources (weight, size) passing through the queue")]] + size_t queued_requests() const { + return _queued_requests; + } + + // How many requests are sent to disk but not yet returned. 
+ [[deprecated("I/O queue users should not track individual requests, but resources (weight, size) passing through the queue")]] + size_t requests_currently_executing() const { + return _requests_executing; + } + + void notify_requests_finished(fair_queue_ticket& desc) noexcept; + + // Dispatch requests that are pending in the I/O queue + void poll_io_queue() { + _fq.dispatch_requests(); + } + + sstring mountpoint() const { + return _config.mountpoint; + } + + shard_id coordinator() const { + return _config.coordinator; + } + + dev_t dev_id() const noexcept { + return _config.devid; + } + + future<> update_shares_for_class(io_priority_class pc, size_t new_shares); + void rename_priority_class(io_priority_class pc, sstring new_name); + +private: + config _config; + static fair_queue::config make_fair_queue_config(config cfg); +}; + +} diff --git a/src/seastar/include/seastar/core/iostream-impl.hh b/src/seastar/include/seastar/core/iostream-impl.hh new file mode 100644 index 000000000..2b3354742 --- /dev/null +++ b/src/seastar/include/seastar/core/iostream-impl.hh @@ -0,0 +1,535 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (C) 2015 Cloudius Systems, Ltd. 
+ */ + + +#pragma once + +#include +#include +#include +#include + +namespace seastar { + +inline future> data_source_impl::skip(uint64_t n) +{ + return do_with(uint64_t(n), [this] (uint64_t& n) { + return repeat_until_value([&] { + return get().then([&] (temporary_buffer buffer) -> std::optional> { + if (buffer.size() >= n) { + buffer.trim_front(n); + return buffer; + } + n -= buffer.size(); + return { }; + }); + }); + }); +} + +template +inline +future<> output_stream::write(const char_type* buf) { + return write(buf, strlen(buf)); +} + +template +template +inline +future<> output_stream::write(const basic_sstring& s) { + return write(reinterpret_cast(s.c_str()), s.size()); +} + +template +inline +future<> output_stream::write(const std::basic_string& s) { + return write(s.c_str(), s.size()); +} + +template +future<> output_stream::write(scattered_message msg) { + return write(std::move(msg).release()); +} + +template +future<> +output_stream::zero_copy_put(net::packet p) { + // if flush is scheduled, disable it, so it will not try to write in parallel + _flush = false; + if (_flushing) { + // flush in progress, wait for it to end before continuing + return _in_batch.value().get_future().then([this, p = std::move(p)] () mutable { + return _fd.put(std::move(p)); + }); + } else { + return _fd.put(std::move(p)); + } +} + +// Writes @p in chunks of _size length. The last chunk is buffered if smaller. 
+template +future<> +output_stream::zero_copy_split_and_put(net::packet p) { + return repeat([this, p = std::move(p)] () mutable { + if (p.len() < _size) { + if (p.len()) { + _zc_bufs = std::move(p); + } else { + _zc_bufs = net::packet::make_null_packet(); + } + return make_ready_future(stop_iteration::yes); + } + auto chunk = p.share(0, _size); + p.trim_front(_size); + return zero_copy_put(std::move(chunk)).then([] { + return stop_iteration::no; + }); + }); +} + +template +future<> output_stream::write(net::packet p) { + static_assert(std::is_same::value, "packet works on char"); + + if (p.len() != 0) { + assert(!_end && "Mixing buffered writes and zero-copy writes not supported yet"); + + if (_zc_bufs) { + _zc_bufs.append(std::move(p)); + } else { + _zc_bufs = std::move(p); + } + + if (_zc_bufs.len() >= _size) { + if (_trim_to_size) { + return zero_copy_split_and_put(std::move(_zc_bufs)); + } else { + return zero_copy_put(std::move(_zc_bufs)); + } + } + } + return make_ready_future<>(); +} + +template +future<> output_stream::write(temporary_buffer p) { + if (p.empty()) { + return make_ready_future<>(); + } + assert(!_end && "Mixing buffered writes and zero-copy writes not supported yet"); + + return write(net::packet(std::move(p))); +} + +template +future> +input_stream::read_exactly_part(size_t n, tmp_buf out, size_t completed) { + if (available()) { + auto now = std::min(n - completed, available()); + std::copy(_buf.get(), _buf.get() + now, out.get_write() + completed); + _buf.trim_front(now); + completed += now; + } + if (completed == n) { + return make_ready_future(std::move(out)); + } + + // _buf is now empty + return _fd.get().then([this, n, out = std::move(out), completed] (auto buf) mutable { + if (buf.size() == 0) { + _eof = true; + return make_ready_future(std::move(buf)); + } + _buf = std::move(buf); + return this->read_exactly_part(n, std::move(out), completed); + }); +} + +template +future> +input_stream::read_exactly(size_t n) { + if (_buf.size() 
== n) { + // easy case: steal buffer, return to caller + return make_ready_future(std::move(_buf)); + } else if (_buf.size() > n) { + // buffer large enough, share it with caller + auto front = _buf.share(0, n); + _buf.trim_front(n); + return make_ready_future(std::move(front)); + } else if (_buf.size() == 0) { + // buffer is empty: grab one and retry + return _fd.get().then([this, n] (auto buf) mutable { + if (buf.size() == 0) { + _eof = true; + return make_ready_future(std::move(buf)); + } + _buf = std::move(buf); + return this->read_exactly(n); + }); + } else { + // buffer too small: start copy/read loop + tmp_buf b(n); + return read_exactly_part(n, std::move(b), 0); + } +} + +template +template +SEASTAR_CONCEPT(requires InputStreamConsumer || ObsoleteInputStreamConsumer) +future<> +input_stream::consume(Consumer&& consumer) { + return repeat([consumer = std::move(consumer), this] () mutable { + if (_buf.empty() && !_eof) { + return _fd.get().then([this] (tmp_buf buf) { + _buf = std::move(buf); + _eof = _buf.empty(); + return make_ready_future(stop_iteration::no); + }); + } + return consumer(std::move(_buf)).then([this] (consumption_result_type result) { + return seastar::visit(result.get(), [this] (const continue_consuming&) { + // If we're here, consumer consumed entire buffer and is ready for + // more now. So we do not return, and rather continue the loop. + // + // If we're at eof, we should stop. 
+ return make_ready_future(stop_iteration(this->_eof)); + }, [this] (stop_consuming& stop) { + // consumer is done + this->_buf = std::move(stop.get_buffer()); + return make_ready_future(stop_iteration::yes); + }, [this] (const skip_bytes& skip) { + return this->_fd.skip(skip.get_value()).then([this](tmp_buf buf) { + if (!buf.empty()) { + this->_buf = std::move(buf); + } + return make_ready_future(stop_iteration::no); + }); + }); + }); + }); +} + +template +template +SEASTAR_CONCEPT(requires InputStreamConsumer || ObsoleteInputStreamConsumer) +future<> +input_stream::consume(Consumer& consumer) { + return consume(std::ref(consumer)); +} + +template +future> +input_stream::read_up_to(size_t n) { + using tmp_buf = temporary_buffer; + if (_buf.empty()) { + if (_eof) { + return make_ready_future(); + } else { + return _fd.get().then([this, n] (tmp_buf buf) { + _eof = buf.empty(); + _buf = std::move(buf); + return read_up_to(n); + }); + } + } else if (_buf.size() <= n) { + // easy case: steal buffer, return to caller + return make_ready_future(std::move(_buf)); + } else { + // buffer is larger than n, so share its head with a caller + auto front = _buf.share(0, n); + _buf.trim_front(n); + return make_ready_future(std::move(front)); + } +} + +template +future> +input_stream::read() { + using tmp_buf = temporary_buffer; + if (_eof) { + return make_ready_future(); + } + if (_buf.empty()) { + return _fd.get().then([this] (tmp_buf buf) { + _eof = buf.empty(); + return make_ready_future(std::move(buf)); + }); + } else { + return make_ready_future(std::move(_buf)); + } +} + +template +future<> +input_stream::skip(uint64_t n) { + auto skip_buf = std::min(n, _buf.size()); + _buf.trim_front(skip_buf); + n -= skip_buf; + if (!n) { + return make_ready_future<>(); + } + return _fd.skip(n).then([this] (temporary_buffer buffer) { + _buf = std::move(buffer); + }); +} + +template +data_source +input_stream::detach() && { + if (_buf) { + throw std::logic_error("detach() called on a used 
input_stream"); + } + + return std::move(_fd); +} + +// Writes @buf in chunks of _size length. The last chunk is buffered if smaller. +template +future<> +output_stream::split_and_put(temporary_buffer buf) { + assert(_end == 0); + + return repeat([this, buf = std::move(buf)] () mutable { + if (buf.size() < _size) { + if (!_buf) { + _buf = _fd.allocate_buffer(_size); + } + std::copy(buf.get(), buf.get() + buf.size(), _buf.get_write()); + _end = buf.size(); + return make_ready_future(stop_iteration::yes); + } + auto chunk = buf.share(0, _size); + buf.trim_front(_size); + return put(std::move(chunk)).then([] { + return stop_iteration::no; + }); + }); +} + +template +future<> +output_stream::write(const char_type* buf, size_t n) { + if (__builtin_expect(!_buf || n > _size - _end, false)) { + return slow_write(buf, n); + } + std::copy_n(buf, n, _buf.get_write() + _end); + _end += n; + return make_ready_future<>(); +} + +template +future<> +output_stream::slow_write(const char_type* buf, size_t n) { + assert(!_zc_bufs && "Mixing buffered writes and zero-copy writes not supported yet"); + auto bulk_threshold = _end ? 
(2 * _size - _end) : _size; + if (n >= bulk_threshold) { + if (_end) { + auto now = _size - _end; + std::copy(buf, buf + now, _buf.get_write() + _end); + _end = _size; + temporary_buffer tmp = _fd.allocate_buffer(n - now); + std::copy(buf + now, buf + n, tmp.get_write()); + _buf.trim(_end); + _end = 0; + return put(std::move(_buf)).then([this, tmp = std::move(tmp)]() mutable { + if (_trim_to_size) { + return split_and_put(std::move(tmp)); + } else { + return put(std::move(tmp)); + } + }); + } else { + temporary_buffer tmp = _fd.allocate_buffer(n); + std::copy(buf, buf + n, tmp.get_write()); + if (_trim_to_size) { + return split_and_put(std::move(tmp)); + } else { + return put(std::move(tmp)); + } + } + } + + if (!_buf) { + _buf = _fd.allocate_buffer(_size); + } + + auto now = std::min(n, _size - _end); + std::copy(buf, buf + now, _buf.get_write() + _end); + _end += now; + if (now == n) { + return make_ready_future<>(); + } else { + temporary_buffer next = _fd.allocate_buffer(_size); + std::copy(buf + now, buf + n, next.get_write()); + _end = n - now; + std::swap(next, _buf); + return put(std::move(next)); + } +} + +template +future<> +output_stream::flush() { + if (!_batch_flushes) { + if (_end) { + _buf.trim(_end); + _end = 0; + return put(std::move(_buf)).then([this] { + return _fd.flush(); + }); + } else if (_zc_bufs) { + return zero_copy_put(std::move(_zc_bufs)).then([this] { + return _fd.flush(); + }); + } + } else { + if (_ex) { + // flush is a good time to deliver outstanding errors + return make_exception_future<>(std::move(_ex)); + } else { + _flush = true; + if (!_in_batch) { + add_to_flush_poller(this); + _in_batch = promise<>(); + } + } + } + return make_ready_future<>(); +} + +void add_to_flush_poller(output_stream* x); + +template +future<> +output_stream::put(temporary_buffer buf) { + // if flush is scheduled, disable it, so it will not try to write in parallel + _flush = false; + if (_flushing) { + // flush in progress, wait for it to end before 
continuing + return _in_batch.value().get_future().then([this, buf = std::move(buf)] () mutable { + return _fd.put(std::move(buf)); + }); + } else { + return _fd.put(std::move(buf)); + } +} + +template +void +output_stream::poll_flush() { + if (!_flush) { + // flush was canceled, do nothing + _flushing = false; + _in_batch.value().set_value(); + _in_batch = std::nullopt; + return; + } + + auto f = make_ready_future(); + _flush = false; + _flushing = true; // make whoever wants to write into the fd to wait for flush to complete + + if (_end) { + // send whatever is in the buffer right now + _buf.trim(_end); + _end = 0; + f = _fd.put(std::move(_buf)); + } else if(_zc_bufs) { + f = _fd.put(std::move(_zc_bufs)); + } + + // FIXME: future is discarded + (void)f.then([this] { + return _fd.flush(); + }).then_wrapped([this] (future<> f) { + try { + f.get(); + } catch (...) { + _ex = std::current_exception(); + } + // if flush() was called while flushing flush once more + poll_flush(); + }); +} + +template +future<> +output_stream::close() { + return flush().finally([this] { + if (_in_batch) { + return _in_batch.value().get_future(); + } else { + return make_ready_future(); + } + }).then([this] { + // report final exception as close error + if (_ex) { + std::rethrow_exception(_ex); + } + }).finally([this] { + return _fd.close(); + }); +} + +template +data_sink +output_stream::detach() && { + if (_buf) { + throw std::logic_error("detach() called on a used output_stream"); + } + + return std::move(_fd); +} + +namespace internal { + +/// \cond internal +template +struct stream_copy_consumer { +private: + output_stream& _os; + using unconsumed_remainder = std::optional>; +public: + stream_copy_consumer(output_stream& os) : _os(os) { + } + future operator()(temporary_buffer data) { + if (data.empty()) { + return make_ready_future(std::move(data)); + } + return _os.write(data.get(), data.size()).then([] () { + return make_ready_future(); + }); + } +}; +/// \endcond + +} + +extern 
template struct internal::stream_copy_consumer; + +template +future<> copy(input_stream& in, output_stream& out) { + return in.consume(internal::stream_copy_consumer(out)); +} + +extern template future<> copy(input_stream&, output_stream&); +} diff --git a/src/seastar/include/seastar/core/iostream.hh b/src/seastar/include/seastar/core/iostream.hh new file mode 100644 index 000000000..ae81ecd1d --- /dev/null +++ b/src/seastar/include/seastar/core/iostream.hh @@ -0,0 +1,360 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (C) 2015 Cloudius Systems, Ltd. + */ + +// +// Buffered input and output streams +// +// Two abstract classes (data_source and data_sink) provide means +// to acquire bulk data from, or push bulk data to, some provider. +// These could be tied to a TCP connection, a disk file, or a memory +// buffer. +// +// Two concrete classes (input_stream and output_stream) buffer data +// from data_source and data_sink and provide easier means to process +// it. 
+// + +#pragma once + +#include +#include +#include +#include + +namespace seastar { + +namespace net { class packet; } + +class data_source_impl { +public: + virtual ~data_source_impl() {} + virtual future> get() = 0; + virtual future> skip(uint64_t n); + virtual future<> close() { return make_ready_future<>(); } +}; + +class data_source { + std::unique_ptr _dsi; +protected: + data_source_impl* impl() const { return _dsi.get(); } +public: + data_source() noexcept = default; + explicit data_source(std::unique_ptr dsi) noexcept : _dsi(std::move(dsi)) {} + data_source(data_source&& x) noexcept = default; + data_source& operator=(data_source&& x) noexcept = default; + future> get() { return _dsi->get(); } + future> skip(uint64_t n) { return _dsi->skip(n); } + future<> close() { return _dsi->close(); } +}; + +class data_sink_impl { +public: + virtual ~data_sink_impl() {} + virtual temporary_buffer allocate_buffer(size_t size) { + return temporary_buffer(size); + } + virtual future<> put(net::packet data) = 0; + virtual future<> put(std::vector> data) { + net::packet p; + p.reserve(data.size()); + for (auto& buf : data) { + p = net::packet(std::move(p), net::fragment{buf.get_write(), buf.size()}, buf.release()); + } + return put(std::move(p)); + } + virtual future<> put(temporary_buffer buf) { + return put(net::packet(net::fragment{buf.get_write(), buf.size()}, buf.release())); + } + virtual future<> flush() { + return make_ready_future<>(); + } + virtual future<> close() = 0; +}; + +class data_sink { + std::unique_ptr _dsi; +public: + data_sink() noexcept = default; + explicit data_sink(std::unique_ptr dsi) noexcept : _dsi(std::move(dsi)) {} + data_sink(data_sink&& x) noexcept = default; + data_sink& operator=(data_sink&& x) noexcept = default; + temporary_buffer allocate_buffer(size_t size) { + return _dsi->allocate_buffer(size); + } + future<> put(std::vector> data) { + return _dsi->put(std::move(data)); + } + future<> put(temporary_buffer data) { + return 
_dsi->put(std::move(data)); + } + future<> put(net::packet p) { + return _dsi->put(std::move(p)); + } + future<> flush() { + return _dsi->flush(); + } + future<> close() { return _dsi->close(); } +}; + +struct continue_consuming {}; + +template +class stop_consuming { +public: + using tmp_buf = temporary_buffer; + explicit stop_consuming(tmp_buf buf) : _buf(std::move(buf)) {} + + tmp_buf& get_buffer() { return _buf; } + const tmp_buf& get_buffer() const { return _buf; } +private: + tmp_buf _buf; +}; + +class skip_bytes { +public: + explicit skip_bytes(uint64_t v) : _value(v) {} + uint64_t get_value() const { return _value; } +private: + uint64_t _value; +}; + +template +class consumption_result { +public: + using stop_consuming_type = stop_consuming; + using consumption_variant = std::variant; + using tmp_buf = typename stop_consuming_type::tmp_buf; + + /*[[deprecated]]*/ consumption_result(std::optional opt_buf) { + if (opt_buf) { + _result = stop_consuming_type{std::move(opt_buf.value())}; + } + } + + consumption_result(const continue_consuming&) {} + consumption_result(stop_consuming_type&& stop) : _result(std::move(stop)) {} + consumption_result(skip_bytes&& skip) : _result(std::move(skip)) {} + + consumption_variant& get() { return _result; } + const consumption_variant& get() const { return _result; } + +private: + consumption_variant _result; +}; + +// Consumer concept, for consume() method +SEASTAR_CONCEPT( +// The consumer should operate on the data given to it, and +// return a future "consumption result", which can be +// - continue_consuming, if the consumer has consumed all the input given +// to it and is ready for more +// - stop_consuming, when the consumer is done (and in that case +// the contained buffer is the unconsumed part of the last data buffer - this +// can also happen to be empty). 
+// - skip_bytes, when the consumer has consumed all the input given to it +// and wants to skip before processing the next chunk +// +// For backward compatibility reasons, we also support the deprecated return value +// of type "unconsumed remainder" which can be +// - empty optional, if the consumer consumed all the input given to it +// and is ready for more +// - non-empty optional, when the consumer is done (and in that case +// the value is the unconsumed part of the last data buffer - this +// can also happen to be empty). + +template +concept InputStreamConsumer = requires (Consumer c) { + { c(temporary_buffer{}) } -> std::same_as>>; +}; + +template +concept ObsoleteInputStreamConsumer = requires (Consumer c) { + { c(temporary_buffer{}) } -> std::same_as>>>; +}; +) + +/// Buffers data from a data_source and provides a stream interface to the user. +/// +/// \note All methods must be called sequentially. That is, no method may be +/// invoked before the previous method's returned future is resolved. +template +class input_stream final { + static_assert(sizeof(CharType) == 1, "must buffer stream of bytes"); + data_source _fd; + temporary_buffer _buf; + bool _eof = false; +private: + using tmp_buf = temporary_buffer; + size_t available() const { return _buf.size(); } +protected: + void reset() { _buf = {}; } + data_source* fd() { return &_fd; } +public: + using consumption_result_type = consumption_result; + // unconsumed_remainder is mapped for compatibility only; new code should use consumption_result_type + using unconsumed_remainder = std::optional; + using char_type = CharType; + input_stream() noexcept = default; + explicit input_stream(data_source fd) noexcept : _fd(std::move(fd)), _buf() {} + input_stream(input_stream&&) = default; + input_stream& operator=(input_stream&&) = default; + /// Reads n bytes from the stream, or fewer if reached the end of stream. 
+ /// + /// \returns a future that waits until n bytes are available in the + /// stream and returns them. If the end of stream is reached before n + /// bytes were read, fewer than n bytes will be returned - so despite + /// the method's name, the caller must not assume the returned buffer + /// will always contain exactly n bytes. + /// + /// \throws if an I/O error occurs during the read. As explained above, + /// prematurely reaching the end of stream is *not* an I/O error. + future> read_exactly(size_t n); + template + SEASTAR_CONCEPT(requires InputStreamConsumer || ObsoleteInputStreamConsumer) + future<> consume(Consumer&& c); + template + SEASTAR_CONCEPT(requires InputStreamConsumer || ObsoleteInputStreamConsumer) + future<> consume(Consumer& c); + bool eof() const { return _eof; } + /// Returns some data from the stream, or an empty buffer on end of + /// stream. + future read(); + /// Returns up to n bytes from the stream, or an empty buffer on end of + /// stream. + future read_up_to(size_t n); + /// Detaches the \c input_stream from the underlying data source. + /// + /// Waits for any background operations (for example, read-ahead) to + /// complete, so that the any resources the stream is using can be + /// safely destroyed. An example is a \ref file resource used by + /// the stream returned by make_file_input_stream(). + /// + /// \return a future that becomes ready when this stream no longer + /// needs the data source. + future<> close() { + return _fd.close(); + } + /// Ignores n next bytes from the stream. + future<> skip(uint64_t n); + + /// Detaches the underlying \c data_source from the \c input_stream. + /// + /// The intended usage is custom \c data_source_impl implementations + /// wrapping an existing \c input_stream, therefore it shouldn't be + /// called on an \c input_stream that was already used. + /// After calling \c detach() the \c input_stream is in an unusable, + /// moved-from state. 
+ /// + /// \throws std::logic_error if called on a used stream + /// + /// \returns the data_source + data_source detach() &&; +private: + future> read_exactly_part(size_t n, tmp_buf buf, size_t completed); +}; + +/// Facilitates data buffering before it's handed over to data_sink. +/// +/// When trim_to_size is true it's guaranteed that data sink will not receive +/// chunks larger than the configured size, which could be the case when a +/// single write call is made with data larger than the configured size. +/// +/// The data sink will not receive empty chunks. +/// +/// \note All methods must be called sequentially. That is, no method +/// may be invoked before the previous method's returned future is +/// resolved. +template +class output_stream final { + static_assert(sizeof(CharType) == 1, "must buffer stream of bytes"); + data_sink _fd; + temporary_buffer _buf; + net::packet _zc_bufs = net::packet::make_null_packet(); //zero copy buffers + size_t _size = 0; + size_t _begin = 0; + size_t _end = 0; + bool _trim_to_size = false; + bool _batch_flushes = false; + std::optional> _in_batch; + bool _flush = false; + bool _flushing = false; + std::exception_ptr _ex; +private: + size_t available() const { return _end - _begin; } + size_t possibly_available() const { return _size - _begin; } + future<> split_and_put(temporary_buffer buf); + future<> put(temporary_buffer buf); + void poll_flush(); + future<> zero_copy_put(net::packet p); + future<> zero_copy_split_and_put(net::packet p); + [[gnu::noinline]] + future<> slow_write(const CharType* buf, size_t n); +public: + using char_type = CharType; + output_stream() noexcept = default; + output_stream(data_sink fd, size_t size, bool trim_to_size = false, bool batch_flushes = false) noexcept + : _fd(std::move(fd)), _size(size), _trim_to_size(trim_to_size), _batch_flushes(batch_flushes) {} + output_stream(output_stream&&) noexcept = default; + output_stream& operator=(output_stream&&) noexcept = default; + 
~output_stream() { assert(!_in_batch && "Was this stream properly closed?"); } + future<> write(const char_type* buf, size_t n); + future<> write(const char_type* buf); + + template + future<> write(const basic_sstring& s); + future<> write(const std::basic_string& s); + + future<> write(net::packet p); + future<> write(scattered_message msg); + future<> write(temporary_buffer); + future<> flush(); + + /// Flushes the stream before closing it (and the underlying data sink) to + /// any further writes. The resulting future must be waited on before + /// destroying this object. + future<> close(); + + /// Detaches the underlying \c data_sink from the \c output_stream. + /// + /// The intended usage is custom \c data_sink_impl implementations + /// wrapping an existing \c output_stream, therefore it shouldn't be + /// called on an \c output_stream that was already used. + /// After calling \c detach() the \c output_stream is in an unusable, + /// moved-from state. + /// + /// \throws std::logic_error if called on a used stream + /// + /// \returns the data_sink + data_sink detach() &&; +private: + friend class reactor; +}; + +/*! + * \brief copy all the content from the input stream to the output stream + */ +template +future<> copy(input_stream&, output_stream&); + +} + +#include "iostream-impl.hh" diff --git a/src/seastar/include/seastar/core/layered_file.hh b/src/seastar/include/seastar/core/layered_file.hh new file mode 100644 index 000000000..aa150e37e --- /dev/null +++ b/src/seastar/include/seastar/core/layered_file.hh @@ -0,0 +1,67 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. 
+ * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright 2020 ScyllaDB + */ + +#pragma once + +#include + +namespace seastar { + +/// \addtogroup fileio-module +/// @{ + +/// Base class for layered file implementations. +/// +/// A layered file implementation implements `file_impl` virtual +/// functions such as dma_read() by forwarding them to another, existing +/// file called the underlying file. This base class simplifies construction +/// of layered files by performing standard tasks such as setting up the +/// file alignment. Actual implementation of the I/O methods is left for the +/// derived class. +class layered_file_impl : public file_impl { +protected: + file _underlying_file; +public: + /// Constructs a layered file. This sets up the underlying_file() method + /// and initializes alignment constants to be the same as the underlying file. + explicit layered_file_impl(file underlying_file) noexcept + : _underlying_file(std::move(underlying_file)) { + _memory_dma_alignment = _underlying_file.memory_dma_alignment(); + _disk_read_dma_alignment = _underlying_file.disk_read_dma_alignment(); + _disk_write_dma_alignment = _underlying_file.disk_write_dma_alignment(); + } + + /// The underlying file which can be used to back I/O methods. + file& underlying_file() noexcept { + return _underlying_file; + } + + /// The underlying file which can be used to back I/O methods. 
+ const file& underlying_file() const noexcept { + return _underlying_file; + } +}; + + +/// @} + + +} diff --git a/src/seastar/include/seastar/core/linux-aio.hh b/src/seastar/include/seastar/core/linux-aio.hh new file mode 100644 index 000000000..1e4eef211 --- /dev/null +++ b/src/seastar/include/seastar/core/linux-aio.hh @@ -0,0 +1,234 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +/* + * Copyright (C) 2017 ScyllaDB + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace seastar { + +namespace internal { + +namespace linux_abi { + +using aio_context_t = unsigned long; + +enum class iocb_cmd : uint16_t { + PREAD = 0, + PWRITE = 1, + FSYNC = 2, + FDSYNC = 3, + POLL = 5, + NOOP = 6, + PREADV = 7, + PWRITEV = 8, +}; + +struct io_event { + uint64_t data; + uint64_t obj; + int64_t res; + int64_t res2; +}; + +constexpr int IOCB_FLAG_RESFD = 1; + +struct iocb { + uint64_t aio_data; + +#if __BYTE_ORDER == __LITTLE_ENDIAN + uint32_t aio_key; + int32_t aio_rw_flags; +#elif __BYTE_ORDER == __BIG_ENDIAN + int32_t aio_rw_flags; + uint32_t aio_key; +#else +#error bad byteorder +#endif + + iocb_cmd aio_lio_opcode; + int16_t aio_reqprio; + uint32_t aio_fildes; + + uint64_t aio_buf; + uint64_t aio_nbytes; + int64_t aio_offset; + + uint64_t aio_reserved2; + + uint32_t aio_flags; + + uint32_t aio_resfd; +}; + +struct aio_sigset { + const sigset_t *sigmask; + size_t sigsetsize; +}; + +} + +linux_abi::iocb make_read_iocb(int fd, uint64_t offset, void* buffer, size_t len); +linux_abi::iocb make_write_iocb(int fd, uint64_t offset, const void* buffer, size_t len); +linux_abi::iocb make_readv_iocb(int fd, uint64_t offset, const ::iovec* iov, size_t niov); +linux_abi::iocb make_writev_iocb(int fd, uint64_t offset, const ::iovec* iov, size_t niov); +linux_abi::iocb make_poll_iocb(int fd, uint32_t events); + +void set_user_data(linux_abi::iocb& iocb, void* data); +void* get_user_data(const linux_abi::iocb& iocb); +void set_nowait(linux_abi::iocb& iocb, bool nowait); + +void set_eventfd_notification(linux_abi::iocb& iocb, int eventfd); + +linux_abi::iocb* get_iocb(const linux_abi::io_event& ioev); + +int io_setup(int nr_events, linux_abi::aio_context_t* io_context); +int io_destroy(linux_abi::aio_context_t io_context); +int io_submit(linux_abi::aio_context_t io_context, long nr, linux_abi::iocb** iocbs); +int 
io_cancel(linux_abi::aio_context_t io_context, linux_abi::iocb* iocb, linux_abi::io_event* result); +int io_getevents(linux_abi::aio_context_t io_context, long min_nr, long nr, linux_abi::io_event* events, const ::timespec* timeout, + bool force_syscall = false); +int io_pgetevents(linux_abi::aio_context_t io_context, long min_nr, long nr, linux_abi::io_event* events, const ::timespec* timeout, const sigset_t* sigmask, + bool force_syscall = false); + +void setup_aio_context(size_t nr, linux_abi::aio_context_t* io_context); + +} + +extern bool aio_nowait_supported; + +namespace internal { + +inline +linux_abi::iocb +make_read_iocb(int fd, uint64_t offset, void* buffer, size_t len) { + linux_abi::iocb iocb{}; + iocb.aio_lio_opcode = linux_abi::iocb_cmd::PREAD; + iocb.aio_fildes = fd; + iocb.aio_offset = offset; + iocb.aio_buf = reinterpret_cast(buffer); + iocb.aio_nbytes = len; + return iocb; +} + +inline +linux_abi::iocb +make_write_iocb(int fd, uint64_t offset, const void* buffer, size_t len) { + linux_abi::iocb iocb{}; + iocb.aio_lio_opcode = linux_abi::iocb_cmd::PWRITE; + iocb.aio_fildes = fd; + iocb.aio_offset = offset; + iocb.aio_buf = reinterpret_cast(buffer); + iocb.aio_nbytes = len; + return iocb; +} + +inline +linux_abi::iocb +make_readv_iocb(int fd, uint64_t offset, const ::iovec* iov, size_t niov) { + linux_abi::iocb iocb{}; + iocb.aio_lio_opcode = linux_abi::iocb_cmd::PREADV; + iocb.aio_fildes = fd; + iocb.aio_offset = offset; + iocb.aio_buf = reinterpret_cast(iov); + iocb.aio_nbytes = niov; + return iocb; +} + +inline +linux_abi::iocb +make_writev_iocb(int fd, uint64_t offset, const ::iovec* iov, size_t niov) { + linux_abi::iocb iocb{}; + iocb.aio_lio_opcode = linux_abi::iocb_cmd::PWRITEV; + iocb.aio_fildes = fd; + iocb.aio_offset = offset; + iocb.aio_buf = reinterpret_cast(iov); + iocb.aio_nbytes = niov; + return iocb; +} + +inline +linux_abi::iocb +make_poll_iocb(int fd, uint32_t events) { + linux_abi::iocb iocb{}; + iocb.aio_lio_opcode = 
linux_abi::iocb_cmd::POLL; + iocb.aio_fildes = fd; + iocb.aio_buf = events; + return iocb; +} + +inline +linux_abi::iocb +make_fdsync_iocb(int fd) { + linux_abi::iocb iocb{}; + iocb.aio_lio_opcode = linux_abi::iocb_cmd::FDSYNC; + iocb.aio_fildes = fd; + return iocb; +} + +inline +void +set_user_data(linux_abi::iocb& iocb, void* data) { + iocb.aio_data = reinterpret_cast(data); +} + +inline +void* +get_user_data(const linux_abi::iocb& iocb) { + return reinterpret_cast(uintptr_t(iocb.aio_data)); +} + +inline +void +set_eventfd_notification(linux_abi::iocb& iocb, int eventfd) { + iocb.aio_flags |= linux_abi::IOCB_FLAG_RESFD; + iocb.aio_resfd = eventfd; +} + +inline +linux_abi::iocb* +get_iocb(const linux_abi::io_event& ev) { + return reinterpret_cast(uintptr_t(ev.obj)); +} + +inline +void +set_nowait(linux_abi::iocb& iocb, bool nowait) { +#ifdef RWF_NOWAIT + if (aio_nowait_supported) { + if (nowait) { + iocb.aio_rw_flags |= RWF_NOWAIT; + } else { + iocb.aio_rw_flags &= ~RWF_NOWAIT; + } + } +#endif +} + +} + + +} + diff --git a/src/seastar/include/seastar/core/loop.hh b/src/seastar/include/seastar/core/loop.hh new file mode 100644 index 000000000..a7a0145a8 --- /dev/null +++ b/src/seastar/include/seastar/core/loop.hh @@ -0,0 +1,715 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (C) 2020 ScyllaDB. + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +namespace seastar { + +/// \addtogroup future-util +/// @{ + +// The AsyncAction concept represents an action which can complete later than +// the actual function invocation. It is represented by a function which +// returns a future which resolves when the action is done. + +struct stop_iteration_tag { }; +using stop_iteration = bool_class; + +namespace internal { + +template +class repeater final : public continuation_base { + promise<> _promise; + AsyncAction _action; +public: + explicit repeater(AsyncAction&& action) : _action(std::move(action)) {} + future<> get_future() { return _promise.get_future(); } + task* waiting_task() noexcept override { return _promise.waiting_task(); } + virtual void run_and_dispose() noexcept override { + if (_state.failed()) { + _promise.set_exception(std::move(_state).get_exception()); + delete this; + return; + } else { + if (_state.get0() == stop_iteration::yes) { + _promise.set_value(); + delete this; + return; + } + _state = {}; + } + try { + do { + auto f = futurize_invoke(_action); + if (!f.available()) { + internal::set_callback(f, this); + return; + } + if (f.get0() == stop_iteration::yes) { + _promise.set_value(); + delete this; + return; + } + } while (!need_preempt()); + } catch (...) { + _promise.set_exception(std::current_exception()); + delete this; + return; + } + _state.set(stop_iteration::no); + schedule(this); + } +}; + +} // namespace internal + +// Delete these overloads so that the actual implementation can use a +// universal reference but still reject lvalue references. +template +future<> repeat(const AsyncAction& action) noexcept = delete; +template +future<> repeat(AsyncAction& action) noexcept = delete; + +/// Invokes given action until it fails or the function requests iteration to stop by returning +/// \c stop_iteration::yes. 
+/// +/// \param action a callable taking no arguments, returning a future. Will +/// be called again as soon as the future resolves, unless the +/// future fails, action throws, or it resolves with \c stop_iteration::yes. +/// If \c action is an r-value it can be moved in the middle of iteration. +/// \return a ready future if we stopped successfully, or a failed future if +/// a call to to \c action failed. +template +SEASTAR_CONCEPT( requires seastar::InvokeReturns || seastar::InvokeReturns> ) +inline +future<> repeat(AsyncAction&& action) noexcept { + using futurator = futurize>; + static_assert(std::is_same, typename futurator::type>::value, "bad AsyncAction signature"); + for (;;) { + // Do not type-erase here in case this is a short repeat() + auto f = futurator::invoke(action); + + if (!f.available() || f.failed() || need_preempt()) { + return [&] () noexcept { + memory::scoped_critical_alloc_section _; + auto repeater = new internal::repeater(std::move(action)); + auto ret = repeater->get_future(); + internal::set_callback(f, repeater); + return ret; + }(); + } + + if (f.get0() == stop_iteration::yes) { + return make_ready_future<>(); + } + } +} + +/// \cond internal + +template +struct repeat_until_value_type_helper; + +/// Type helper for repeat_until_value() +template +struct repeat_until_value_type_helper>> { + /// The type of the value we are computing + using value_type = T; + /// Type used by \c AsyncAction while looping + using optional_type = std::optional; + /// Return type of repeat_until_value() + using future_type = future; +}; + +/// Return value of repeat_until_value() +template +using repeat_until_value_return_type + = typename repeat_until_value_type_helper>::type>::future_type; + +/// \endcond + +namespace internal { + +template +class repeat_until_value_state final : public continuation_base> { + promise _promise; + AsyncAction _action; +public: + explicit repeat_until_value_state(AsyncAction action) : _action(std::move(action)) {} + 
repeat_until_value_state(std::optional st, AsyncAction action) : repeat_until_value_state(std::move(action)) { + this->_state.set(std::move(st)); + } + future get_future() { return _promise.get_future(); } + task* waiting_task() noexcept override { return _promise.waiting_task(); } + virtual void run_and_dispose() noexcept override { + if (this->_state.failed()) { + _promise.set_exception(std::move(this->_state).get_exception()); + delete this; + return; + } else { + auto v = std::move(this->_state).get0(); + if (v) { + _promise.set_value(std::move(*v)); + delete this; + return; + } + this->_state = {}; + } + try { + do { + auto f = futurize_invoke(_action); + if (!f.available()) { + internal::set_callback(f, this); + return; + } + auto ret = f.get0(); + if (ret) { + _promise.set_value(std::move(*ret)); + delete this; + return; + } + } while (!need_preempt()); + } catch (...) { + _promise.set_exception(std::current_exception()); + delete this; + return; + } + this->_state.set(std::nullopt); + schedule(this); + } +}; + +} // namespace internal + +/// Invokes given action until it fails or the function requests iteration to stop by returning +/// an engaged \c future> or std::optional. The value is extracted +/// from the \c optional, and returned, as a future, from repeat_until_value(). +/// +/// \param action a callable taking no arguments, returning a future> +/// or std::optional. Will be called again as soon as the future +/// resolves, unless the future fails, action throws, or it resolves with +/// an engaged \c optional. If \c action is an r-value it can be moved +/// in the middle of iteration. +/// \return a ready future if we stopped successfully, or a failed future if +/// a call to to \c action failed. The \c optional's value is returned. 
+template +SEASTAR_CONCEPT( requires requires (AsyncAction aa) { + bool(futurize_invoke(aa).get0()); + futurize_invoke(aa).get0().value(); +} ) +repeat_until_value_return_type +repeat_until_value(AsyncAction action) noexcept { + using futurator = futurize>; + using type_helper = repeat_until_value_type_helper; + // the "T" in the documentation + using value_type = typename type_helper::value_type; + using optional_type = typename type_helper::optional_type; + do { + auto f = futurator::invoke(action); + + if (!f.available()) { + return [&] () noexcept { + memory::scoped_critical_alloc_section _; + auto state = new internal::repeat_until_value_state(std::move(action)); + auto ret = state->get_future(); + internal::set_callback(f, state); + return ret; + }(); + } + + if (f.failed()) { + return make_exception_future(f.get_exception()); + } + + optional_type&& optional = std::move(f).get0(); + if (optional) { + return make_ready_future(std::move(optional.value())); + } + } while (!need_preempt()); + + try { + auto state = new internal::repeat_until_value_state(std::nullopt, std::move(action)); + auto f = state->get_future(); + schedule(state); + return f; + } catch (...) 
{ + return make_exception_future(std::current_exception()); + } +} + +namespace internal { + +template +class do_until_state final : public continuation_base<> { + promise<> _promise; + StopCondition _stop; + AsyncAction _action; +public: + explicit do_until_state(StopCondition stop, AsyncAction action) : _stop(std::move(stop)), _action(std::move(action)) {} + future<> get_future() { return _promise.get_future(); } + task* waiting_task() noexcept override { return _promise.waiting_task(); } + virtual void run_and_dispose() noexcept override { + if (_state.available()) { + if (_state.failed()) { + _promise.set_urgent_state(std::move(_state)); + delete this; + return; + } + _state = {}; // allow next cycle to overrun state + } + try { + do { + if (_stop()) { + _promise.set_value(); + delete this; + return; + } + auto f = _action(); + if (!f.available()) { + internal::set_callback(f, this); + return; + } + if (f.failed()) { + f.forward_to(std::move(_promise)); + delete this; + return; + } + } while (!need_preempt()); + } catch (...) { + _promise.set_exception(std::current_exception()); + delete this; + return; + } + schedule(this); + } +}; + +} // namespace internal + +/// Invokes given action until it fails or given condition evaluates to true. +/// +/// \param stop_cond a callable taking no arguments, returning a boolean that +/// evalutes to true when you don't want to call \c action +/// any longer +/// \param action a callable taking no arguments, returning a future<>. Will +/// be called again as soon as the future resolves, unless the +/// future fails, or \c stop_cond returns \c true. +/// \return a ready future if we stopped successfully, or a failed future if +/// a call to to \c action failed. 
+template +SEASTAR_CONCEPT( requires seastar::InvokeReturns && seastar::InvokeReturns> ) +inline +future<> do_until(StopCondition stop_cond, AsyncAction action) noexcept { + using namespace internal; + for (;;) { + if (stop_cond()) { + return make_ready_future<>(); + } + auto f = futurize_invoke(action); + if (f.failed()) { + return f; + } + if (!f.available() || need_preempt()) { + return [&] () noexcept { + memory::scoped_critical_alloc_section _; + auto task = new do_until_state(std::move(stop_cond), std::move(action)); + auto ret = task->get_future(); + internal::set_callback(f, task); + return ret; + }(); + } + } +} + +/// Invoke given action until it fails. +/// +/// Calls \c action repeatedly until it returns a failed future. +/// +/// \param action a callable taking no arguments, returning a \c future<> +/// that becomes ready when you wish it to be called again. +/// \return a future<> that will resolve to the first failure of \c action +template +SEASTAR_CONCEPT( requires seastar::InvokeReturns> ) +inline +future<> keep_doing(AsyncAction action) noexcept { + return repeat([action = std::move(action)] () mutable { + return action().then([] { + return stop_iteration::no; + }); + }); +} + +namespace internal { +template +class do_for_each_state final : public continuation_base<> { + Iterator _begin; + Iterator _end; + AsyncAction _action; + promise<> _pr; + +public: + do_for_each_state(Iterator begin, Iterator end, AsyncAction action, future<> first_unavailable) + : _begin(std::move(begin)), _end(std::move(end)), _action(std::move(action)) { + internal::set_callback(first_unavailable, this); + } + virtual void run_and_dispose() noexcept override { + std::unique_ptr zis(this); + if (_state.failed()) { + _pr.set_urgent_state(std::move(_state)); + return; + } + while (_begin != _end) { + auto f = futurize_invoke(_action, *_begin++); + if (f.failed()) { + f.forward_to(std::move(_pr)); + return; + } + if (!f.available() || need_preempt()) { + _state = {}; + 
internal::set_callback(f, this); + zis.release(); + return; + } + } + _pr.set_value(); + } + task* waiting_task() noexcept override { + return _pr.waiting_task(); + } + future<> get_future() { + return _pr.get_future(); + } +}; + +template +inline +future<> do_for_each_impl(Iterator begin, Iterator end, AsyncAction action) { + while (begin != end) { + auto f = futurize_invoke(action, *begin++); + if (f.failed()) { + return f; + } + if (!f.available() || need_preempt()) { + auto* s = new internal::do_for_each_state{ + std::move(begin), std::move(end), std::move(action), std::move(f)}; + return s->get_future(); + } + } + return make_ready_future<>(); +} +} // namespace internal + +/// \addtogroup future-util + +/// \brief Call a function for each item in a range, sequentially (iterator version). +/// +/// For each item in a range, call a function, waiting for the previous +/// invocation to complete before calling the next one. +/// +/// \param begin an \c InputIterator designating the beginning of the range +/// \param end an \c InputIterator designating the end of the range +/// \param action a callable, taking a reference to objects from the range +/// as a parameter, and returning a \c future<> that resolves +/// when it is acceptable to process the next item. +/// \return a ready future on success, or the first failed future if +/// \c action failed. +template +SEASTAR_CONCEPT( requires requires (Iterator i, AsyncAction aa) { + { futurize_invoke(aa, *i) } -> std::same_as>; +} ) +inline +future<> do_for_each(Iterator begin, Iterator end, AsyncAction action) noexcept { + try { + return internal::do_for_each_impl(std::move(begin), std::move(end), std::move(action)); + } catch (...) { + return current_exception_as_future(); + } +} + +/// \brief Call a function for each item in a range, sequentially (range version). +/// +/// For each item in a range, call a function, waiting for the previous +/// invocation to complete before calling the next one.
+/// +/// \param c an \c Container object designating input range +/// \param action a callable, taking a reference to objects from the range +/// as a parameter, and returning a \c future<> that resolves +/// when it is acceptable to process the next item. +/// \return a ready future on success, or the first failed future if +/// \c action failed. +template +SEASTAR_CONCEPT( requires requires (Container c, AsyncAction aa) { + { futurize_invoke(aa, *c.begin()) } -> std::same_as>; +} ) +inline +future<> do_for_each(Container& c, AsyncAction action) noexcept { + try { + return internal::do_for_each_impl(std::begin(c), std::end(c), std::move(action)); + } catch (...) { + return current_exception_as_future(); + } +} + +namespace internal { + +template +inline +size_t +iterator_range_estimate_vector_capacity(Iterator begin, Iterator end, IteratorCategory category) { + // For InputIterators we can't estimate needed capacity + return 0; +} + +template +inline +size_t +iterator_range_estimate_vector_capacity(Iterator begin, Iterator end, std::forward_iterator_tag category) { + // May be linear time below random_access_iterator_tag, but still better than reallocation + return std::distance(begin, end); +} + +} // namespace internal + +/// \cond internal + +class parallel_for_each_state final : private continuation_base<> { + std::vector> _incomplete; + promise<> _result; + std::exception_ptr _ex; +private: + // Wait for one of the futures in _incomplete to complete, and then + // decide what to do: wait for another one, or deliver _result if all + // are complete. + void wait_for_one() noexcept; + virtual void run_and_dispose() noexcept override; + task* waiting_task() noexcept override { return _result.waiting_task(); } +public: + parallel_for_each_state(size_t n); + void add_future(future<>&& f); + future<> get_future(); +}; + +/// \endcond + +/// \brief Run tasks in parallel (iterator version). 
+/// +/// Given a range [\c begin, \c end) of objects, run \c func on each \c *i in +/// the range, and return a future<> that resolves when all the functions +/// complete. \c func should return a future<> that indicates when it is +/// complete. All invocations are performed in parallel. This allows the range +/// to refer to stack objects, but means that unlike other loops this cannot +/// check need_preempt and can only be used with small ranges. +/// +/// \param begin an \c InputIterator designating the beginning of the range +/// \param end an \c InputIterator designating the end of the range +/// \param func Function to invoke with each element in the range (returning +/// a \c future<>) +/// \return a \c future<> that resolves when all the function invocations +/// complete. If one or more return an exception, the return value +/// contains one of the exceptions. +template +SEASTAR_CONCEPT( requires requires (Func f, Iterator i) { { f(*i++) } -> std::same_as>; } ) +inline +future<> +parallel_for_each(Iterator begin, Iterator end, Func&& func) noexcept { + parallel_for_each_state* s = nullptr; + // Process all elements, giving each future the following treatment: + // - available, not failed: do nothing + // - available, failed: collect exception in ex + // - not available: collect in s (allocating it if needed) + while (begin != end) { + auto f = futurize_invoke(std::forward(func), *begin++); + if (!f.available() || f.failed()) { + if (!s) { + memory::scoped_critical_alloc_section _; + using itraits = std::iterator_traits; + auto n = (internal::iterator_range_estimate_vector_capacity(begin, end, typename itraits::iterator_category()) + 1); + s = new parallel_for_each_state(n); + } + s->add_future(std::move(f)); + } + } + // If any futures were not available, hand off to parallel_for_each_state::start(). + // Otherwise we can return a result immediately. 
+ if (s) { + // s->get_future() takes ownership of s (and chains it to one of the futures it contains) + // so this isn't a leak + return s->get_future(); + } + return make_ready_future<>(); +} + +/// \brief Run tasks in parallel (range version). +/// +/// Given a \c range of objects, invoke \c func with each object +/// in the range, and return a future<> that resolves when all +/// the functions complete. \c func should return a future<> that indicates +/// when it is complete. All invocations are performed in parallel. This allows +/// the range to refer to stack objects, but means that unlike other loops this +/// cannot check need_preempt and can only be used with small ranges. +/// +/// \param range A range of objects to iterate run \c func on +/// \param func A callable, accepting reference to the range's +/// \c value_type, and returning a \c future<>. +/// \return a \c future<> that becomes ready when the entire range +/// was processed. If one or more of the invocations of +/// \c func returned an exceptional future, then the return +/// value will contain one of those exceptions. + +namespace internal { + +template +inline +future<> +parallel_for_each_impl(Range&& range, Func&& func) { + return parallel_for_each(std::begin(range), std::end(range), + std::forward(func)); +} + +} // namespace internal + +template +SEASTAR_CONCEPT( requires requires (Func f, Range r) { { f(*r.begin()) } -> std::same_as>; } ) +inline +future<> +parallel_for_each(Range&& range, Func&& func) noexcept { + auto impl = internal::parallel_for_each_impl; + return futurize_invoke(impl, std::forward(range), std::forward(func)); +} + +/// Run a maximum of \c max_concurrent tasks in parallel (iterator version). +/// +/// Given a range [\c begin, \c end) of objects, run \c func on each \c *i in +/// the range, and return a future<> that resolves when all the functions +/// complete. \c func should return a future<> that indicates when it is +/// complete. 
Up to \c max_concurrent invocations are performed in parallel. +/// This does not allow the range to refer to stack objects. The caller +/// must ensure that the range outlives the call to max_concurrent_for_each +/// so it can be iterated in the background. +/// +/// \param begin an \c InputIterator designating the beginning of the range +/// \param end an \c InputIterator designating the end of the range +/// \param max_concurrent maximum number of concurrent invocations of \c func, must be greater than zero. +/// \param func Function to invoke with each element in the range (returning +/// a \c future<>) +/// \return a \c future<> that resolves when all the function invocations +/// complete. If one or more return an exception, the return value +/// contains one of the exceptions. +template +SEASTAR_CONCEPT( requires requires (Func f, Iterator i) { { f(*i++) } -> std::same_as>; } ) +inline +future<> +max_concurrent_for_each(Iterator begin, Iterator end, size_t max_concurrent, Func&& func) noexcept { + struct state { + Iterator begin; + Iterator end; + Func func; + size_t max_concurrent; + semaphore sem; + std::exception_ptr err; + + state(Iterator begin_, Iterator end_, size_t max_concurrent_, Func func_) + : begin(std::move(begin_)) + , end(std::move(end_)) + , func(std::move(func_)) + , max_concurrent(max_concurrent_) + , sem(max_concurrent_) + , err() + { } + }; + + assert(max_concurrent > 0); + + try { + return do_with(state(std::move(begin), std::move(end), max_concurrent, std::forward(func)), [] (state& s) { + return do_until([&s] { return s.begin == s.end; }, [&s] { + return s.sem.wait().then([&s] () mutable noexcept { + // Possibly run in background and signal _sem when the task is done. + // The background tasks are waited on using _sem. 
+ (void)futurize_invoke(s.func, *s.begin++).then_wrapped([&s] (future<> fut) { + if (fut.failed()) { + auto e = fut.get_exception();; + if (!s.err) { + s.err = std::move(e); + } + } + s.sem.signal(); + }); + }); + }).then([&s] { + // Wait for any background task to finish + // and signal and semaphore + return s.sem.wait(s.max_concurrent); + }).then([&s] { + if (!s.err) { + return make_ready_future<>(); + } + return seastar::make_exception_future<>(std::move(s.err)); + }); + }); + } catch (...) { + return current_exception_as_future(); + } +} + +/// Run a maximum of \c max_concurrent tasks in parallel (range version). +/// +/// Given a range [\c begin, \c end) of objects, run \c func on each \c *i in +/// the range, and return a future<> that resolves when all the functions +/// complete. \c func should return a future<> that indicates when it is +/// complete. Up to \c max_concurrent invocations are performed in parallel. +/// This does not allow the range to refer to stack objects. The caller +/// must ensure that the range outlives the call to max_concurrent_for_each +/// so it can be iterated in the background. +/// +/// \param begin an \c InputIterator designating the beginning of the range +/// \param end an \c InputIterator designating the end of the range +/// \param max_concurrent maximum number of concurrent invocations of \c func, must be greater than zero. +/// \param func Function to invoke with each element in the range (returning +/// a \c future<>) +/// \return a \c future<> that resolves when all the function invocations +/// complete. If one or more return an exception, the return value +/// contains one of the exceptions. 
+template +SEASTAR_CONCEPT( requires std::ranges::range && requires (Func f, Range r) { { f(*r.begin()) } -> std::same_as>; } ) +inline +future<> +max_concurrent_for_each(Range&& range, size_t max_concurrent, Func&& func) noexcept { + try { + return max_concurrent_for_each(std::begin(range), std::end(range), max_concurrent, std::forward(func)); + } catch (...) { + return current_exception_as_future(); + } +} + +/// @} + +} // namespace seastar diff --git a/src/seastar/include/seastar/core/lowres_clock.hh b/src/seastar/include/seastar/core/lowres_clock.hh new file mode 100644 index 000000000..2d683a87a --- /dev/null +++ b/src/seastar/include/seastar/core/lowres_clock.hh @@ -0,0 +1,160 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2016 ScyllaDB + */ + +#pragma once + +#include +#include + +#include + +#include +#include + +namespace seastar { + +// +// Forward declarations. +// + +class lowres_clock; +class lowres_system_clock; + +/// \cond internal + +class lowres_clock_impl final { +public: + using base_steady_clock = std::chrono::steady_clock; + using base_system_clock = std::chrono::system_clock; + + // The clocks' resolutions are 10 ms. 
However, to make it is easier to do calculations with + // `std::chrono::milliseconds`, we make the clock period 1 ms instead of 10 ms. + using period = std::ratio<1, 1000>; + + using steady_rep = base_steady_clock::rep; + using steady_duration = std::chrono::duration; + using steady_time_point = std::chrono::time_point; + + using system_rep = base_system_clock::rep; + using system_duration = std::chrono::duration; + using system_time_point = std::chrono::time_point; + + static steady_time_point steady_now() noexcept { + auto const nr = counters::_steady_now.load(std::memory_order_relaxed); + return steady_time_point(steady_duration(nr)); + } + + static system_time_point system_now() noexcept { + auto const nr = counters::_system_now.load(std::memory_order_relaxed); + return system_time_point(system_duration(nr)); + } + + // For construction. + friend class smp; +private: + // Both counters are updated by cpu0 and read by other cpus. Place them on their own cache line to avoid false + // sharing. + struct alignas(seastar::cache_line_size) counters final { + static std::atomic _steady_now; + static std::atomic _system_now; + }; + + // The timer expires every 10 ms. + static constexpr std::chrono::milliseconds _granularity{10}; + + // High-resolution timer to drive these low-resolution clocks. + timer<> _timer{}; + + static void update() noexcept; + + // Private to ensure that static variables are only initialized once. + // might throw when arming timer. + lowres_clock_impl(); +}; + +/// \endcond + +// +/// \brief Low-resolution and efficient steady clock. +/// +/// This is a monotonic clock with a granularity of 10 ms. Time points from this clock do not correspond to system +/// time. +/// +/// The primary benefit of this clock is that invoking \c now() is inexpensive compared to +/// \c std::chrono::steady_clock::now(). +/// +/// \see \c lowres_system_clock for a low-resolution clock which produces time points corresponding to system time. 
+/// +class lowres_clock final { +public: + using rep = lowres_clock_impl::steady_rep; + using period = lowres_clock_impl::period; + using duration = lowres_clock_impl::steady_duration; + using time_point = lowres_clock_impl::steady_time_point; + + static constexpr bool is_steady = true; + + /// + /// \note Outside of a Seastar application, the result is undefined. + /// + static time_point now() noexcept { + return lowres_clock_impl::steady_now(); + } +}; + +/// +/// \brief Low-resolution and efficient system clock. +/// +/// This clock has the same granularity as \c lowres_clock, but it is not required to be monotonic and its time points +/// correspond to system time. +/// +/// The primary benefit of this clock is that invoking \c now() is inexpensive compared to +/// \c std::chrono::system_clock::now(). +/// +class lowres_system_clock final { +public: + using rep = lowres_clock_impl::system_rep; + using period = lowres_clock_impl::period; + using duration = lowres_clock_impl::system_duration; + using time_point = lowres_clock_impl::system_time_point; + + static constexpr bool is_steady = lowres_clock_impl::base_system_clock::is_steady; + + /// + /// \note Outside of a Seastar application, the result is undefined. 
+ /// + static time_point now() noexcept { + return lowres_clock_impl::system_now(); + } + + static std::time_t to_time_t(time_point t) noexcept { + return std::chrono::duration_cast(t.time_since_epoch()).count(); + } + + static time_point from_time_t(std::time_t t) noexcept { + return time_point(std::chrono::duration_cast(std::chrono::seconds(t))); + } +}; + +extern template class timer; + +} + diff --git a/src/seastar/include/seastar/core/make_task.hh b/src/seastar/include/seastar/core/make_task.hh new file mode 100644 index 000000000..74c521b07 --- /dev/null +++ b/src/seastar/include/seastar/core/make_task.hh @@ -0,0 +1,62 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2015 Cloudius Systems, Ltd. 
+ */ + +#pragma once + +#include +#include +#include + +namespace seastar { + +template +class lambda_task final : public task { + Func _func; + using futurator = futurize>; + typename futurator::promise_type _result; +public: + lambda_task(scheduling_group sg, const Func& func) : task(sg), _func(func) {} + lambda_task(scheduling_group sg, Func&& func) : task(sg), _func(std::move(func)) {} + typename futurator::type get_future() noexcept { return _result.get_future(); } + virtual void run_and_dispose() noexcept override { + futurator::invoke(_func).forward_to(std::move(_result)); + delete this; + } + virtual task* waiting_task() noexcept override { + return _result.waiting_task(); + } +}; + +template +inline +lambda_task* +make_task(Func&& func) noexcept { + return new lambda_task(current_scheduling_group(), std::forward(func)); +} + +template +inline +lambda_task* +make_task(scheduling_group sg, Func&& func) noexcept { + return new lambda_task(sg, std::forward(func)); +} + +} diff --git a/src/seastar/include/seastar/core/manual_clock.hh b/src/seastar/include/seastar/core/manual_clock.hh new file mode 100644 index 000000000..34bc5bec1 --- /dev/null +++ b/src/seastar/include/seastar/core/manual_clock.hh @@ -0,0 +1,51 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +/* + * Copyright (C) 2016 ScyllaDB + */ + +#pragma once + +#include + +#include +#include + +namespace seastar { + +class manual_clock { +public: + using rep = int64_t; + using period = std::chrono::nanoseconds::period; + using duration = std::chrono::duration; + using time_point = std::chrono::time_point; +private: + static std::atomic _now; + static void expire_timers() noexcept; +public: + manual_clock() noexcept; + static time_point now() noexcept { + return time_point(duration(_now.load(std::memory_order_relaxed))); + } + static void advance(duration d) noexcept; +}; + +extern template class timer; + +} + diff --git a/src/seastar/include/seastar/core/map_reduce.hh b/src/seastar/include/seastar/core/map_reduce.hh new file mode 100644 index 000000000..f9e1ad075 --- /dev/null +++ b/src/seastar/include/seastar/core/map_reduce.hh @@ -0,0 +1,254 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (C) 2020 ScyllaDB. 
+ */ + +#pragma once + +#include + +#include +#include + +namespace seastar { + +/// \addtogroup future-util +/// @{ + +/// \cond internal + +template +struct reducer_with_get_traits; + +template +struct reducer_with_get_traits { + using result_type = decltype(std::declval().get()); + using future_type = future; + static future_type maybe_call_get(future<> f, lw_shared_ptr r) { + return f.then([r = std::move(r)] () mutable { + return make_ready_future(std::move(*r).get()); + }); + } +}; + +template +struct reducer_with_get_traits { + using future_type = decltype(std::declval().get()); + static future_type maybe_call_get(future<> f, lw_shared_ptr r) { + return f.then([r = std::move(r)] () mutable { + return r->get(); + }).then_wrapped([r] (future_type f) { + return f; + }); + } +}; + +template +struct reducer_traits { + using future_type = future<>; + static future_type maybe_call_get(future<> f, lw_shared_ptr r) { + return f.then([r = std::move(r)] {}); + } +}; + +template +struct reducer_traits().get(), void())> : public reducer_with_get_traits>::value> {}; + +/// \endcond + +/// Map a function over a range and reduce the result. +/// +/// \param begin an \c InputIterator designating the beginning of the range +/// \param end an \c InputIterator designating the end of the range +/// \param mapper is a callable which transforms values from the iterator range into a future +/// \param r is an object which can be called with T as parameter and yields a future<> +/// It may have a get() method which returns a value of type U which holds the result of reduction. +/// \return The reduced value wrapped in a future. +/// If the reducer has no get() method then this function returns future<>.
+ +// TODO: specialize for non-deferring reducer +template +SEASTAR_CONCEPT( requires requires (Iterator i, Mapper mapper, Reducer reduce) { + *i++; + { i != i } -> std::convertible_to; + mapper(*i); + reduce(futurize_invoke(mapper, *i).get0()); +} ) +inline +auto +map_reduce(Iterator begin, Iterator end, Mapper&& mapper, Reducer&& r) + -> typename reducer_traits::future_type +{ + auto r_ptr = make_lw_shared(std::forward(r)); + future<> ret = make_ready_future<>(); + while (begin != end) { + ret = futurize_invoke(mapper, *begin++).then_wrapped([ret = std::move(ret), r_ptr] (auto f) mutable { + return ret.then_wrapped([f = std::move(f), r_ptr] (auto rf) mutable { + if (rf.failed()) { + f.ignore_ready_future(); + return std::move(rf); + } else { + return futurize_invoke(*r_ptr, std::move(f.get0())); + } + }); + }); + } + return reducer_traits::maybe_call_get(std::move(ret), r_ptr); +} + +/// Asynchronous map/reduce transformation. +/// +/// Given a range of objects, an asynchronous unary function +/// operating on these objects, an initial value, and a +/// binary function for reducing, map_reduce() will +/// transform each object in the range, then invoke +/// the reducing function with the result. +/// +/// Example: +/// +/// Calculate the total size of several files: +/// +/// \code +/// map_reduce(files.begin(), files.end(), +/// std::mem_fn(file::size), +/// size_t(0), +/// std::plus()) +/// \endcode +/// +/// Requirements: +/// - Iterator: an InputIterator. +/// - Mapper: unary function taking Iterator::value_type and producing a future<...>.
+/// - Initial: any value type +/// - Reduce: a binary function taking two Initial values and returning an Initial +/// +/// Return type: +/// - future +/// +/// \param begin beginning of object range to operate on +/// \param end end of object range to operate on +/// \param mapper map function to call on each object, returning a future +/// \param initial initial input value to reduce function +/// \param reduce binary function for merging two result values from \c mapper +/// +/// \return equivalent to \c reduce(reduce(initial, mapper(obj0)), mapper(obj1)) ... +template +SEASTAR_CONCEPT( requires requires (Iterator i, Mapper mapper, Initial initial, Reduce reduce) { + *i++; + { i != i} -> std::convertible_to; + mapper(*i); + requires is_future::value; + { reduce(std::move(initial), mapper(*i).get0()) } -> std::convertible_to; +} ) +inline +future +map_reduce(Iterator begin, Iterator end, Mapper&& mapper, Initial initial, Reduce reduce) { + struct state { + Initial result; + Reduce reduce; + }; + auto s = make_lw_shared(state{std::move(initial), std::move(reduce)}); + future<> ret = make_ready_future<>(); + while (begin != end) { + ret = futurize_invoke(mapper, *begin++).then_wrapped([s = s.get(), ret = std::move(ret)] (auto f) mutable { + try { + s->result = s->reduce(std::move(s->result), std::move(f.get0())); + return std::move(ret); + } catch (...) { + return std::move(ret).then_wrapped([ex = std::current_exception()] (auto f) { + f.ignore_ready_future(); + return make_exception_future<>(ex); + }); + } + }); + } + return ret.then([s] { + return make_ready_future(std::move(s->result)); + }); +} + +/// Asynchronous map/reduce transformation (range version). +/// +/// Given a range of objects, an asynchronous unary function +/// operating on these objects, an initial value, and a +/// binary function for reducing, map_reduce() will +/// transform each object in the range, then invoke +/// the reducing function with the result.
+/// +/// Example: +/// +/// Calculate the total size of several files: +/// +/// \code +/// std::vector files = ...; +/// map_reduce(files, +/// std::mem_fn(file::size), +/// size_t(0), +/// std::plus()) +/// \endcode +/// +/// Requirements: +/// - Iterator: an InputIterator. +/// - Mapper: unary function taking Iterator::value_type and producing a future<...>. +/// - Initial: any value type +/// - Reduce: a binary function taking two Initial values and returning an Initial +/// +/// Return type: +/// - future +/// +/// \param range object range to operate on +/// \param mapper map function to call on each object, returning a future +/// \param initial initial input value to reduce function +/// \param reduce binary function for merging two result values from \c mapper +/// +/// \return equivalent to \c reduce(reduce(initial, mapper(obj0)), mapper(obj1)) ... +template +SEASTAR_CONCEPT( requires requires (Range range, Mapper mapper, Initial initial, Reduce reduce) { + std::begin(range); + std::end(range); + mapper(*std::begin(range)); + requires is_future>::value; + { reduce(std::move(initial), mapper(*std::begin(range)).get0()) } -> std::convertible_to; +} ) +inline +future +map_reduce(Range&& range, Mapper&& mapper, Initial initial, Reduce reduce) { + return map_reduce(std::begin(range), std::end(range), std::forward(mapper), + std::move(initial), std::move(reduce)); +} + +/// Implements @Reducer concept. Calculates the result by +/// adding elements to the accumulator. 
+template +class adder { +private: + Result _result; +public: + future<> operator()(const Addend& value) { + _result += value; + return make_ready_future<>(); + } + Result get() && { + return std::move(_result); + } +}; + +/// @} + +} // namespace seastar diff --git a/src/seastar/include/seastar/core/memory.hh b/src/seastar/include/seastar/core/memory.hh new file mode 100644 index 000000000..92b63cd2b --- /dev/null +++ b/src/seastar/include/seastar/core/memory.hh @@ -0,0 +1,370 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace seastar { + +/// \defgroup memory-module Memory management +/// +/// Functions and classes for managing memory. +/// +/// Memory management in seastar consists of the following: +/// +/// - Low-level memory management in the \ref memory namespace. +/// - Various smart pointers: \ref shared_ptr, \ref lw_shared_ptr, +/// and \ref foreign_ptr. +/// - zero-copy support: \ref temporary_buffer and \ref deleter. + +/// Low-level memory management support +/// +/// The \c memory namespace provides functions and classes for interfacing +/// with the seastar memory allocator. 
+/// +/// The seastar memory allocator splits system memory into a pool per +/// logical core (lcore). Memory allocated on an lcore should be freed +/// on the same lcore; failing to do so carries a severe performance +/// penalty. It is possible to share memory with another core, but this +/// should be limited to avoid cache coherency traffic. +/// You can obtain the memory layout of the current shard with +/// \ref get_memory_layout(). +/// +/// ## Critical allocation scopes +/// +/// Seastar supports marking scopes as critical allocation scopes for the purpose +/// of special treatment from various memory related utilities. +/// See \ref scoped_critical_alloc_section. +/// +/// ## Diagnostics and debugging features +/// +/// ### Allocation failure injector +/// +/// Allows injecting allocation failures for testing resiliency against +/// allocation failures, or exceptions in general. See: +/// * \ref alloc_failure_injector +/// * \ref with_allocation_failures() +/// +/// ### Large allocation warning +/// +/// Large allocations put great pressure on the allocator which might be unable +/// to serve them even if there is enough memory available, due to memory +/// fragmentation. This is especially relevant for long-running applications, +/// the kind of applications that are typically built with seastar. This feature +/// allows finding these large allocations by logging a warning on large allocations, with +/// the stacktrace of the allocation. See: +/// * \ref set_large_allocation_warning_threshold() +/// * \ref get_large_allocation_warning_threshold() +/// * \ref scoped_large_allocation_warning_threshold +/// * \ref scoped_large_allocation_warning_disable +/// +/// ### Heap profiling +/// +/// Heap profiling allows finding out how memory is used by your application, by +/// recording the stacktrace of all allocations.
See: +/// * \ref set_heap_profiling_enabled() +/// * \ref scoped_heap_profiling +/// +/// ### Abort on allocation failure +/// +/// Often, the best way to debug an allocation failure is a coredump. This +/// feature allows dumping core on allocation failures, containing the stack of +/// the failed allocation, by means of aborting. To enable set the +/// `abort_on_seastar_bad_alloc` configuration option or the respective command +/// line flag. +/// +/// ### Dump diagnostics report +/// +/// Dump a diagnostic report of the state of the seastar allocator upon allocation +/// failure. The report is dumped with the `seastar_memory` logger, with debug +/// level. +/// You can configure a report to be dumped with error level on certain allocation +/// kinds, see: +/// * set_dump_memory_diagnostics_on_alloc_failure_kind() +/// * set_additional_diagnostics_producer() +/// * generate_memory_diagnostics_report() +/// +/// The diagnostics report dump can be configured with the command +/// line/configuration file via the \p dump-memory-diagnostics-on-alloc-failure-kind +/// command-line flag/configuration item. 
+namespace memory { + +/// \cond internal + +#ifdef SEASTAR_OVERRIDE_ALLOCATOR_PAGE_SIZE +#define SEASTAR_INTERNAL_ALLOCATOR_PAGE_SIZE (SEASTAR_OVERRIDE_ALLOCATOR_PAGE_SIZE) +#else +#define SEASTAR_INTERNAL_ALLOCATOR_PAGE_SIZE 4096 +#endif + +static constexpr size_t page_size = SEASTAR_INTERNAL_ALLOCATOR_PAGE_SIZE; +static constexpr size_t page_bits = log2ceil(page_size); +static constexpr size_t huge_page_size = +#if defined(__x86_64__) || defined(__i386__) || defined(__s390x__) || defined(__zarch__) + 1 << 21; // 2M +#elif defined(__aarch64__) + 1 << 21; // 2M +#elif defined(__PPC__) + 1 << 24; // 16M +#else +#error "Huge page size is not defined for this architecture" +#endif + +void configure(std::vector m, bool mbind, + std::optional hugetlbfs_path = {}); + +void enable_abort_on_allocation_failure(); + +class disable_abort_on_alloc_failure_temporarily { +public: + disable_abort_on_alloc_failure_temporarily(); + ~disable_abort_on_alloc_failure_temporarily() noexcept; +}; + +// Disables heap profiling as long as this object is alive. +// Can be nested, in which case the profiling is re-enabled when all +// the objects go out of scope. +class disable_backtrace_temporarily { + bool _old; +public: + disable_backtrace_temporarily(); + ~disable_backtrace_temporarily(); +}; + +enum class reclaiming_result { + reclaimed_nothing, + reclaimed_something +}; + +// Determines when reclaimer can be invoked +enum class reclaimer_scope { + // + // Reclaimer is only invoked in its own fiber. That fiber will be + // given higher priority than regular application fibers. + // + async, + + // + // Reclaimer may be invoked synchronously with allocation. + // It may also be invoked in async scope. + // + // Reclaimer may invoke allocation, though it is discouraged because + // the system may be low on memory and such allocations may fail. + // Reclaimers which allocate should be prepared for re-entry. 
+ // + sync +}; + +class reclaimer { +public: + struct request { + // The number of bytes which is needed to be released. + // The reclaimer can release a different amount. + // If less is released then the reclaimer may be invoked again. + size_t bytes_to_reclaim; + }; + using reclaim_fn = std::function; +private: + std::function _reclaim; + reclaimer_scope _scope; +public: + // Installs new reclaimer which will be invoked when system is falling + // low on memory. 'scope' determines when reclaimer can be executed. + reclaimer(std::function reclaim, reclaimer_scope scope = reclaimer_scope::async); + reclaimer(std::function reclaim, reclaimer_scope scope = reclaimer_scope::async); + ~reclaimer(); + reclaiming_result do_reclaim(size_t bytes_to_reclaim) { return _reclaim(request{bytes_to_reclaim}); } + reclaimer_scope scope() const { return _scope; } +}; + +extern std::pmr::polymorphic_allocator* malloc_allocator; + +// Call periodically to recycle objects that were freed +// on cpu other than the one they were allocated on. +// +// Returns @true if any work was actually performed. +bool drain_cross_cpu_freelist(); + + +// We don't want the memory code calling back into the rest of +// the system, so allow the rest of the system to tell the memory +// code how to initiate reclaim. +// +// When memory is low, calling \c hook(fn) will result in fn being called +// in a safe place wrt. allocations. +void set_reclaim_hook( + std::function)> hook); + +/// \endcond + +class statistics; + +/// Capture a snapshot of memory allocation statistics for this lcore. +statistics stats(); + +/// Memory allocation statistics. 
+class statistics { + uint64_t _mallocs; + uint64_t _frees; + uint64_t _cross_cpu_frees; + size_t _total_memory; + size_t _free_memory; + uint64_t _reclaims; + uint64_t _large_allocs; + + uint64_t _foreign_mallocs; + uint64_t _foreign_frees; + uint64_t _foreign_cross_frees; +private: + statistics(uint64_t mallocs, uint64_t frees, uint64_t cross_cpu_frees, + uint64_t total_memory, uint64_t free_memory, uint64_t reclaims, uint64_t large_allocs, + uint64_t foreign_mallocs, uint64_t foreign_frees, uint64_t foreign_cross_frees) + : _mallocs(mallocs), _frees(frees), _cross_cpu_frees(cross_cpu_frees) + , _total_memory(total_memory), _free_memory(free_memory), _reclaims(reclaims), _large_allocs(large_allocs) + , _foreign_mallocs(foreign_mallocs), _foreign_frees(foreign_frees) + , _foreign_cross_frees(foreign_cross_frees) {} +public: + /// Total number of memory allocations calls since the system was started. + uint64_t mallocs() const { return _mallocs; } + /// Total number of memory deallocations calls since the system was started. + uint64_t frees() const { return _frees; } + /// Total number of memory deallocations that occured on a different lcore + /// than the one on which they were allocated. + uint64_t cross_cpu_frees() const { return _cross_cpu_frees; } + /// Total number of objects which were allocated but not freed. 
+ size_t live_objects() const { return mallocs() - frees(); } + /// Total free memory (in bytes) + size_t free_memory() const { return _free_memory; } + /// Total allocated memory (in bytes) + size_t allocated_memory() const { return _total_memory - _free_memory; } + /// Total memory (in bytes) + size_t total_memory() const { return _total_memory; } + /// Number of reclaims performed due to low memory + uint64_t reclaims() const { return _reclaims; } + /// Number of allocations which violated the large allocation threshold + uint64_t large_allocations() const { return _large_allocs; } + /// Number of foreign allocations + uint64_t foreign_mallocs() const { return _foreign_mallocs; } + /// Number of foreign frees + uint64_t foreign_frees() const { return _foreign_frees; } + /// Number of foreign frees on reactor threads + uint64_t foreign_cross_frees() const { return _foreign_cross_frees; } + friend statistics stats(); +}; + +struct memory_layout { + uintptr_t start; + uintptr_t end; +}; + +// Discover virtual address range used by the allocator on current shard. +// Supported only when seastar allocator is enabled. +memory::memory_layout get_memory_layout(); + +/// Returns the value of free memory low water mark in bytes. +/// When free memory is below this value, reclaimers are invoked until it goes above again. +size_t min_free_memory(); + +/// Sets the value of free memory low water mark in memory::page_size units. +void set_min_free_pages(size_t pages); + +/// Enable the large allocation warning threshold. +/// +/// Warn when allocation above a given threshold are performed. +/// +/// \param threshold size (in bytes) above which an allocation will be logged +void set_large_allocation_warning_threshold(size_t threshold); + +/// Gets the current large allocation warning threshold. +size_t get_large_allocation_warning_threshold(); + +/// Disable large allocation warnings. 
+void disable_large_allocation_warning(); + +/// Set a different large allocation warning threshold for a scope. +class scoped_large_allocation_warning_threshold { + size_t _old_threshold; +public: + explicit scoped_large_allocation_warning_threshold(size_t threshold) + : _old_threshold(get_large_allocation_warning_threshold()) { + set_large_allocation_warning_threshold(threshold); + } + scoped_large_allocation_warning_threshold(const scoped_large_allocation_warning_threshold&) = delete; + scoped_large_allocation_warning_threshold(scoped_large_allocation_warning_threshold&& x) = delete; + ~scoped_large_allocation_warning_threshold() { + if (_old_threshold) { + set_large_allocation_warning_threshold(_old_threshold); + } + } + void operator=(const scoped_large_allocation_warning_threshold&) const = delete; + void operator=(scoped_large_allocation_warning_threshold&&) = delete; +}; + +/// Disable large allocation warnings for a scope. +class scoped_large_allocation_warning_disable { + size_t _old_threshold; +public: + scoped_large_allocation_warning_disable() + : _old_threshold(get_large_allocation_warning_threshold()) { + disable_large_allocation_warning(); + } + scoped_large_allocation_warning_disable(const scoped_large_allocation_warning_disable&) = delete; + scoped_large_allocation_warning_disable(scoped_large_allocation_warning_disable&& x) = delete; + ~scoped_large_allocation_warning_disable() { + if (_old_threshold) { + set_large_allocation_warning_threshold(_old_threshold); + } + } + void operator=(const scoped_large_allocation_warning_disable&) const = delete; + void operator=(scoped_large_allocation_warning_disable&&) = delete; +}; + +/// Enable/disable heap profiling. +/// +/// In order to use heap profiling you have to define +/// `SEASTAR_HEAPPROF`. +/// Heap profiling data is not currently exposed via an API for +/// inspection, instead it was designed to be inspected from a +/// debugger. 
+/// For an example script that makes use of the heap profiling data +/// see [scylla-gdb.py] (https://github.com/scylladb/scylla/blob/e1b22b6a4c56b4f1d0adf65d1a11db4bcb51fe7d/scylla-gdb.py#L1439) +/// This script can generate either textual representation of the data, +/// or a zoomable flame graph ([flame graph generation instructions](https://github.com/scylladb/scylla/wiki/Seastar-heap-profiler), +/// [example flame graph](https://user-images.githubusercontent.com/1389273/72920437-f0cf8a80-3d51-11ea-92f0-f3dbeb698871.png)). +void set_heap_profiling_enabled(bool); + +/// Enable heap profiling for the duration of the scope. +/// +/// For more information about heap profiling see +/// \ref set_heap_profiling_enabled(). +class scoped_heap_profiling { +public: + scoped_heap_profiling() noexcept; + ~scoped_heap_profiling(); +}; + +} +} diff --git a/src/seastar/include/seastar/core/metrics.hh b/src/seastar/include/seastar/core/metrics.hh new file mode 100644 index 000000000..88d1ad6a0 --- /dev/null +++ b/src/seastar/include/seastar/core/metrics.hh @@ -0,0 +1,587 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2016 ScyllaDB. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +/*! 
\file metrics.hh
+ * \brief header for metrics creation.
+ *
+ * This header file contains the metrics creation methods with their helper functions.
+ * Include this file when you need to create metrics.
+ * Typically this will be in your source file.
+ *
+ * Code that is under the impl namespace should not be used directly.
+ *
+ */
+
+namespace seastar {
+
+/*!
+ * \addtogroup metrics
+ * @{
+ *
+ * \namespace seastar::metrics
+ * \brief metrics creation and registration
+ *
+ * the metrics namespace holds the relevant methods and classes to generate metrics.
+ *
+ * The metrics layer supports registering metrics, that later will be
+ * exported via different API protocols.
+ *
+ * To be able to support multiple protocols the following simplifications were made:
+ * 1. The id of the metrics is based on the collectd id
+ * 2. A metric could be a single value, either a reference or a function
+ *
+ * To add metrics definition to class A do the following:
+ * * Add a metrics_group member to A
+ * * Add a set_metrics() method that would be called in the constructor.
+ *
+ *
+ * In A header file
+ * \code
+ * #include "core/metrics_registration.hh"
+ * class A {
+ *   metric_groups _metrics
+ *
+ *   void setup_metrics();
+ *
+ * };
+ * \endcode
+ *
+ * In A source file:
+ *
+ * \code
+ * include "core/metrics.hh"
+ *
+ * void A::setup_metrics() {
+ *   namespace sm = seastar::metrics;
+ *   _metrics = sm::create_metric_group();
+ *   _metrics->add_group("cache", {sm::make_gauge("bytes", "used", [this] { return _region.occupancy().used_space(); })});
+ * }
+ * \endcode
+ */
+
+namespace metrics {
+
+class double_registration : public std::runtime_error {
+public:
+    double_registration(std::string what);
+};
+
+/*!
+ * \defgroup metrics_types metrics type definitions + * The following are for the metric layer use, do not use them directly + * Instead use the make_counter, make_gauge, make_absolute and make_derived + * + */ +using metric_type_def = sstring; /*!< Used to hold an inherit type (like bytes)*/ +using metric_name_type = sstring; /*!< The metric name'*/ +using instance_id_type = sstring; /*!< typically used for the shard id*/ + +/*! + * \brief Human-readable description of a metric/group. + * + * + * Uses a separate class to deal with type resolution + * + * Add this to metric creation: + * + * \code + * _metrics->add_group("groupname", { + * sm::make_gauge("metric_name", value, description("A documentation about the return value")) + * }); + * \endcode + * + */ +class description { +public: + description(sstring s = sstring()) : _s(std::move(s)) + {} + const sstring& str() const { + return _s; + } +private: + sstring _s; +}; + +/*! + * \brief Label a metrics + * + * Label are useful for adding information about a metric that + * later you would need to aggregate by. + * For example, if you have multiple queues on a shard. + * Adding the queue id as a Label will allow you to use the same name + * of the metrics with multiple id instances. + * + * label_instance holds an instance of label consist of a key and value. + * + * Typically you will not generate a label_instance yourself, but use a label + * object for that. + * @see label for more information + * + * + */ +class label_instance { + sstring _key; + sstring _value; +public: + /*! + * \brief create a label_instance + * label instance consists of key and value. + * The key is an sstring. + * T - the value type can be any type that can be lexical_cast to string + * (ie. if it support the redirection operator for stringstream). 
+ * + * All primitive types are supported so all the following examples are valid: + * label_instance a("smp_queue", 1) + * label_instance a("my_key", "my_value") + * label_instance a("internal_id", -1) + */ + template + label_instance(const sstring& key, T v) : _key(key), _value(boost::lexical_cast(v)){} + + /*! + * \brief returns the label key + */ + const sstring key() const { + return _key; + } + + /*! + * \brief returns the label value + */ + const sstring value() const { + return _value; + } + bool operator<(const label_instance&) const; + bool operator==(const label_instance&) const; + bool operator!=(const label_instance&) const; +}; + + +/*! + * \brief Class that creates label instances + * + * A factory class to create label instance + * Typically, the same Label name is used in multiple places. + * label is a label factory, you create it once, and use it to create the label_instance. + * + * In the example we would like to label the smp_queue with with the queue owner + * + * seastar::metrics::label smp_owner("smp_owner"); + * + * now, when creating a new smp metric we can add a label to it: + * + * sm::make_queue_length("send_batch_queue_length", _last_snt_batch, {smp_owner(cpuid)}) + * + * where cpuid in this case is unsiged. + */ +class label { + sstring key; +public: + using instance = label_instance; + /*! + * \brief creating a label + * key is the label name, it will be the key for all label_instance + * that will be created from this label. + */ + explicit label(const sstring& key) : key(key) { + } + + /*! + * \brief creating a label instance + * + * Use the function operator to create a new label instance. + * T - the value type can be any type that can be lexical_cast to string + * (ie. if it support the redirection operator for stringstream). 
+ * + * All primitive types are supported so if lab is a label, all the following examples are valid: + * lab(1) + * lab("my_value") + * lab(-1) + */ + template + instance operator()(T value) const { + return label_instance(key, std::forward(value)); + } + + /*! + * \brief returns the label name + */ + const sstring& name() const { + return key; + } +}; + +/*! + * \namespace impl + * \brief holds the implementation parts of the metrics layer, do not use directly. + * + * The metrics layer define a thin API for adding metrics. + * Some of the implementation details need to be in the header file, they should not be use directly. + */ +namespace impl { + +// The value binding data types +enum class data_type : uint8_t { + COUNTER, // unsigned int 64 + GAUGE, // double + DERIVE, // signed int 64 + ABSOLUTE, // unsigned int 64 + HISTOGRAM, +}; + +/*! + * \brief A helper class that used to return metrics value. + * + * Do not use directly @see metrics_creation + */ +struct metric_value { + std::variant u; + data_type _type; + data_type type() const { + return _type; + } + + double d() const { + return std::get(u); + } + + uint64_t ui() const { + return std::get(u); + } + + int64_t i() const { + return std::get(u); + } + + metric_value() + : _type(data_type::GAUGE) { + } + + metric_value(histogram&& h, data_type t = data_type::HISTOGRAM) : + u(std::move(h)), _type(t) { + } + metric_value(const histogram& h, data_type t = data_type::HISTOGRAM) : + u(h), _type(t) { + } + + metric_value(double d, data_type t) + : u(d), _type(t) { + } + + metric_value& operator=(const metric_value& c) = default; + + metric_value& operator+=(const metric_value& c) { + *this = *this + c; + return *this; + } + + metric_value operator+(const metric_value& c); + const histogram& get_histogram() const { + return std::get(u); + } +}; + +using metric_function = std::function; + +struct metric_type { + data_type base_type; + metric_type_def type_name; +}; + +struct metric_definition_impl { + 
metric_name_type name; + metric_type type; + metric_function f; + description d; + bool enabled = true; + std::map labels; + metric_definition_impl& operator ()(bool enabled); + metric_definition_impl& operator ()(const label_instance& label); + metric_definition_impl& set_type(const sstring& type_name); + metric_definition_impl( + metric_name_type name, + metric_type type, + metric_function f, + description d, + std::vector labels); +}; + +class metric_groups_def { +public: + metric_groups_def() = default; + virtual ~metric_groups_def() = default; + metric_groups_def(const metric_groups_def&) = delete; + metric_groups_def(metric_groups_def&&) = default; + virtual metric_groups_def& add_metric(group_name_type name, const metric_definition& md) = 0; + virtual metric_groups_def& add_group(group_name_type name, const std::initializer_list& l) = 0; + virtual metric_groups_def& add_group(group_name_type name, const std::vector& l) = 0; +}; + +instance_id_type shard(); + +template +struct is_callable; + +template +struct is_callable::type>::value>::type> : public std::true_type { +}; + +template +struct is_callable::value, std::true_type>::type> : public std::false_type { +}; + +template::value>> +metric_function make_function(T val, data_type dt) { + return [dt, val] { + return metric_value(val(), dt); + }; +} + +template::value>> +metric_function make_function(T& val, data_type dt) { + return [dt, &val] { + return metric_value(val, dt); + }; +} +} + +extern const bool metric_disabled; + +extern label shard_label; + +/* + * The metrics definition are defined to be compatible with collectd metrics defintion. + * Typically you should used gauge or derived. + */ + + +/*! + * \brief Gauge are a general purpose metric. 
+ * + * They can support floating point and can increase or decrease + */ +template +impl::metric_definition_impl make_gauge(metric_name_type name, + T&& val, description d=description(), std::vector labels = {}) { + return {name, {impl::data_type::GAUGE, "gauge"}, make_function(std::forward(val), impl::data_type::GAUGE), d, labels}; +} + +/*! + * \brief Gauge are a general purpose metric. + * + * They can support floating point and can increase or decrease + */ +template +impl::metric_definition_impl make_gauge(metric_name_type name, + description d, T&& val) { + return {name, {impl::data_type::GAUGE, "gauge"}, make_function(std::forward(val), impl::data_type::GAUGE), d, {}}; +} + +/*! + * \brief Gauge are a general purpose metric. + * + * They can support floating point and can increase or decrease + */ +template +impl::metric_definition_impl make_gauge(metric_name_type name, + description d, std::vector labels, T&& val) { + return {name, {impl::data_type::GAUGE, "gauge"}, make_function(std::forward(val), impl::data_type::GAUGE), d, labels}; +} + + +/*! + * \brief Derive are used when a rate is more interesting than the value. + * + * Derive is an integer value that can increase or decrease, typically it is used when looking at the + * derivation of the value. + * + * It is OK to use it when counting things and if no wrap-around is expected (it shouldn't) it's prefer over counter metric. + */ +template +impl::metric_definition_impl make_derive(metric_name_type name, + T&& val, description d=description(), std::vector labels = {}) { + return {name, {impl::data_type::DERIVE, "derive"}, make_function(std::forward(val), impl::data_type::DERIVE), d, labels}; +} + + +/*! + * \brief Derive are used when a rate is more interesting than the value. + * + * Derive is an integer value that can increase or decrease, typically it is used when looking at the + * derivation of the value. 
+ * + * It is OK to use it when counting things and if no wrap-around is expected (it shouldn't) it's prefer over counter metric. + */ +template +impl::metric_definition_impl make_derive(metric_name_type name, description d, + T&& val) { + return {name, {impl::data_type::DERIVE, "derive"}, make_function(std::forward(val), impl::data_type::DERIVE), d, {}}; +} + + +/*! + * \brief Derive are used when a rate is more interesting than the value. + * + * Derive is an integer value that can increase or decrease, typically it is used when looking at the + * derivation of the value. + * + * It is OK to use it when counting things and if no wrap-around is expected (it shouldn't) it's prefer over counter metric. + */ +template +impl::metric_definition_impl make_derive(metric_name_type name, description d, std::vector labels, + T&& val) { + return {name, {impl::data_type::DERIVE, "derive"}, make_function(std::forward(val), impl::data_type::DERIVE), d, labels}; +} + + +/*! + * \brief create a counter metric + * + * Counters are similar to derived, but they assume monotony, so if a counter value decrease in a series it is count as a wrap-around. + * It is better to use large enough data value than to use counter. + * + */ +template +impl::metric_definition_impl make_counter(metric_name_type name, + T&& val, description d=description(), std::vector labels = {}) { + return {name, {impl::data_type::COUNTER, "counter"}, make_function(std::forward(val), impl::data_type::COUNTER), d, labels}; +} + +/*! + * \brief create an absolute metric. + * + * Absolute are used for metric that are being erased after each time they are read. + * They are here for compatibility reasons and should general be avoided in most applications. 
+ */ +template +impl::metric_definition_impl make_absolute(metric_name_type name, + T&& val, description d=description(), std::vector labels = {}) { + return {name, {impl::data_type::ABSOLUTE, "absolute"}, make_function(std::forward(val), impl::data_type::ABSOLUTE), d, labels}; +} + +/*! + * \brief create a histogram metric. + * + * Histograms are a list o buckets with upper values and counter for the number + * of entries in each bucket. + */ +template +impl::metric_definition_impl make_histogram(metric_name_type name, + T&& val, description d=description(), std::vector labels = {}) { + return {name, {impl::data_type::HISTOGRAM, "histogram"}, make_function(std::forward(val), impl::data_type::HISTOGRAM), d, labels}; +} + +/*! + * \brief create a histogram metric. + * + * Histograms are a list o buckets with upper values and counter for the number + * of entries in each bucket. + */ +template +impl::metric_definition_impl make_histogram(metric_name_type name, + description d, std::vector labels, T&& val) { + return {name, {impl::data_type::HISTOGRAM, "histogram"}, make_function(std::forward(val), impl::data_type::HISTOGRAM), d, labels}; +} + + +/*! + * \brief create a histogram metric. + * + * Histograms are a list o buckets with upper values and counter for the number + * of entries in each bucket. + */ +template +impl::metric_definition_impl make_histogram(metric_name_type name, + description d, T&& val) { + return {name, {impl::data_type::HISTOGRAM, "histogram"}, make_function(std::forward(val), impl::data_type::HISTOGRAM), d, {}}; +} + + +/*! + * \brief create a total_bytes metric. + * + * total_bytes are used for an ever growing counters, like the total bytes + * passed on a network. 
+ */ + +template +impl::metric_definition_impl make_total_bytes(metric_name_type name, + T&& val, description d=description(), std::vector labels = {}, + instance_id_type instance = impl::shard()) { + return make_derive(name, std::forward(val), d, labels).set_type("total_bytes"); +} + +/*! + * \brief create a current_bytes metric. + * + * current_bytes are used to report on current status in bytes. + * For example the current free memory. + */ + +template +impl::metric_definition_impl make_current_bytes(metric_name_type name, + T&& val, description d=description(), std::vector labels = {}, + instance_id_type instance = impl::shard()) { + return make_derive(name, std::forward(val), d, labels).set_type("bytes"); +} + + +/*! + * \brief create a queue_length metric. + * + * queue_length are used to report on queue length + */ + +template +impl::metric_definition_impl make_queue_length(metric_name_type name, + T&& val, description d=description(), std::vector labels = {}, + instance_id_type instance = impl::shard()) { + return make_gauge(name, std::forward(val), d, labels).set_type("queue_length"); +} + + +/*! + * \brief create a total operation metric. + * + * total_operations are used for ever growing operation counter. + */ + +template +impl::metric_definition_impl make_total_operations(metric_name_type name, + T&& val, description d=description(), std::vector labels = {}, + instance_id_type instance = impl::shard()) { + return make_derive(name, std::forward(val), d, labels).set_type("total_operations"); +} + +/*! @} */ +} +} diff --git a/src/seastar/include/seastar/core/metrics_api.hh b/src/seastar/include/seastar/core/metrics_api.hh new file mode 100644 index 000000000..1343045f6 --- /dev/null +++ b/src/seastar/include/seastar/core/metrics_api.hh @@ -0,0 +1,386 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). 
See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2016 ScyllaDB. + */ + +#pragma once + +#include +#include +#include +#include +/*! + * \file metrics_api.hh + * \brief header file for metric API layer (like promehteus or collectd) + * + * + * + */ +namespace seastar { +namespace metrics { +namespace impl { + +using labels_type = std::map; +} +} +} + +namespace std { + +template<> +struct hash { + using argument_type = seastar::metrics::impl::labels_type; + using result_type = ::std::size_t; + result_type operator()(argument_type const& s) const { + result_type h = 0; + for (auto&& i : s) { + boost::hash_combine(h, std::hash{}(i.second)); + } + return h; + } +}; + +} + +namespace seastar { +namespace metrics { +namespace impl { + +/** + * Metrics are collected in groups that belongs to some logical entity. + * For example, different measurements of the cpu, will belong to group "cpu". + * + * Name is the metric name like used_objects or used_bytes + * + * Inherit type allows customizing one of the basic types (gauge, counter, derive). + * + * Instance_id is used to differentiate multiple instance of the metrics. + * In the seastar environment it is typical to have a metric per shard. 
+ * + */ + +class metric_id { +public: + metric_id() = default; + metric_id(group_name_type group, metric_name_type name, + labels_type labels = {}) + : _group(std::move(group)), _name( + std::move(name)), _labels(labels) { + } + metric_id(metric_id &&) = default; + metric_id(const metric_id &) = default; + + metric_id & operator=(metric_id &&) = default; + metric_id & operator=(const metric_id &) = default; + + const group_name_type & group_name() const { + return _group; + } + void group_name(const group_name_type & name) { + _group = name; + } + const instance_id_type & instance_id() const { + return _labels.at(shard_label.name()); + } + const metric_name_type & name() const { + return _name; + } + const labels_type& labels() const { + return _labels; + } + sstring full_name() const; + + bool operator<(const metric_id&) const; + bool operator==(const metric_id&) const; +private: + auto as_tuple() const { + return std::tie(group_name(), instance_id(), name(), labels()); + } + group_name_type _group; + metric_name_type _name; + labels_type _labels; +}; +} +} +} + +namespace std { + +template<> +struct hash +{ + typedef seastar::metrics::impl::metric_id argument_type; + typedef ::std::size_t result_type; + result_type operator()(argument_type const& s) const + { + result_type const h1 ( std::hash{}(s.group_name()) ); + result_type const h2 ( std::hash{}(s.instance_id()) ); + return h1 ^ (h2 << 1); // or use boost::hash_combine + } +}; + +} + +namespace seastar { +namespace metrics { +namespace impl { + +/*! + * \brief holds metadata information of a metric family + * + * Holds the information that is shared between all metrics + * that belongs to the same metric_family + */ +struct metric_family_info { + data_type type; + metric_type_def inherit_type; + description d; + sstring name; +}; + + +/*! 
+ * \brief holds metric metadata + */ +struct metric_info { + metric_id id; + bool enabled; +}; + + +using metrics_registration = std::vector; + +class metric_groups_impl : public metric_groups_def { + metrics_registration _registration; +public: + metric_groups_impl() = default; + ~metric_groups_impl(); + metric_groups_impl(const metric_groups_impl&) = delete; + metric_groups_impl(metric_groups_impl&&) = default; + metric_groups_impl& add_metric(group_name_type name, const metric_definition& md); + metric_groups_impl& add_group(group_name_type name, const std::initializer_list& l); + metric_groups_impl& add_group(group_name_type name, const std::vector& l); +}; + +class impl; + +class registered_metric { + metric_info _info; + metric_function _f; + shared_ptr _impl; +public: + registered_metric(metric_id id, metric_function f, bool enabled=true); + virtual ~registered_metric() {} + virtual metric_value operator()() const { + return _f(); + } + + bool is_enabled() const { + return _info.enabled; + } + + void set_enabled(bool b) { + _info.enabled = b; + } + + const metric_id& get_id() const { + return _info.id; + } + + const metric_info& info() const { + return _info; + } + metric_function& get_function() { + return _f; + } +}; + +using register_ref = shared_ptr; +using metric_instances = std::map; + +class metric_family { + metric_instances _instances; + metric_family_info _info; +public: + using iterator = metric_instances::iterator; + using const_iterator = metric_instances::const_iterator; + + metric_family() = default; + metric_family(const metric_family&) = default; + metric_family(const metric_instances& instances) : _instances(instances) { + } + metric_family(const metric_instances& instances, const metric_family_info& info) : _instances(instances), _info(info) { + } + metric_family(metric_instances&& instances, metric_family_info&& info) : _instances(std::move(instances)), _info(std::move(info)) { + } + metric_family(metric_instances&& instances) : 
_instances(std::move(instances)) { + } + + register_ref& operator[](const labels_type& l) { + return _instances[l]; + } + + const register_ref& at(const labels_type& l) const { + return _instances.at(l); + } + + metric_family_info& info() { + return _info; + } + + const metric_family_info& info() const { + return _info; + } + + iterator find(const labels_type& l) { + return _instances.find(l); + } + + const_iterator find(const labels_type& l) const { + return _instances.find(l); + } + + iterator begin() { + return _instances.begin(); + } + + const_iterator begin() const { + return _instances.cbegin(); + } + + iterator end() { + return _instances.end(); + } + + bool empty() const { + return _instances.empty(); + } + + iterator erase(const_iterator position) { + return _instances.erase(position); + } + + const_iterator end() const { + return _instances.cend(); + } + + uint32_t size() const { + return _instances.size(); + } + +}; + +using value_map = std::map; + +using metric_metadata_vector = std::vector; + +/*! + * \brief holds a metric family metadata + * + * The meta data of a metric family compose of the + * metadata of the family, and a vector of the metadata for + * each of the metric. 
+ */ +struct metric_family_metadata { + metric_family_info mf; + metric_metadata_vector metrics; +}; + +using value_vector = std::vector; +using metric_metadata = std::vector; +using metric_values = std::vector; + +struct values_copy { + shared_ptr metadata; + metric_values values; +}; + +struct config { + sstring hostname; +}; + +class impl { + value_map _value_map; + config _config; + bool _dirty = true; + shared_ptr _metadata; + std::vector> _current_metrics; +public: + value_map& get_value_map() { + return _value_map; + } + + const value_map& get_value_map() const { + return _value_map; + } + + void add_registration(const metric_id& id, const metric_type& type, metric_function f, const description& d, bool enabled); + void remove_registration(const metric_id& id); + future<> stop() { + return make_ready_future<>(); + } + const config& get_config() const { + return _config; + } + void set_config(const config& c) { + _config = c; + } + + shared_ptr metadata(); + + std::vector>& functions(); + + void update_metrics_if_needed(); + + void dirty() { + _dirty = true; + } +}; + +const value_map& get_value_map(); +using values_reference = shared_ptr; + +foreign_ptr get_values(); + +shared_ptr get_local_impl(); + +void unregister_metric(const metric_id & id); + +/*! + * \brief initialize metric group + * + * Create a metric_group_def. + * No need to use it directly. + */ +std::unique_ptr create_metric_groups(); + +} +/*! + * \brief set the metrics configuration + */ +future<> configure(const boost::program_options::variables_map & opts); + +/*! 
+ * \brief get the metrics configuration description + */ + +boost::program_options::options_description get_options_description(); + +} +} diff --git a/src/seastar/include/seastar/core/metrics_registration.hh b/src/seastar/include/seastar/core/metrics_registration.hh new file mode 100644 index 000000000..6f57b708b --- /dev/null +++ b/src/seastar/include/seastar/core/metrics_registration.hh @@ -0,0 +1,173 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2016 ScyllaDB. + */ + +#pragma once + +#include +#include + +#include + +/*! + * \file metrics_registration.hh + * \brief holds the metric_groups definition needed by class that reports metrics + * + * If class A needs to report metrics, + * typically you include metrics_registration.hh, in A header file and add to A: + * * metric_groups _metrics as a member + * * set_metrics() method that would be called in the constructor. + * \code + * class A { + * metric_groups _metrics + * + * void setup_metrics(); + * + * }; + * \endcode + * To define the metrics, include in your source file metrics.hh + * @see metrics.hh for the definition for adding a metric. 
+ */ + +namespace seastar { + +namespace metrics { + +namespace impl { +class metric_groups_def; +struct metric_definition_impl; +class metric_groups_impl; +} + +using group_name_type = sstring; /*!< A group of logically related metrics */ +class metric_groups; + +class metric_definition { + std::unique_ptr _impl; +public: + metric_definition(const impl::metric_definition_impl& impl) noexcept; + metric_definition(metric_definition&& m) noexcept; + ~metric_definition(); + friend metric_groups; + friend impl::metric_groups_impl; +}; + +class metric_group_definition { +public: + group_name_type name; + std::initializer_list metrics; + metric_group_definition(const group_name_type& name, std::initializer_list l); + metric_group_definition(const metric_group_definition&) = delete; + ~metric_group_definition(); +}; + +/*! + * metric_groups + * \brief holds the metric definition. + * + * Add multiple metric groups definitions. + * Initialization can be done in the constructor or with a call to add_group + * @see metrics.hh for example and supported metrics + */ +class metric_groups { + std::unique_ptr _impl; +public: + metric_groups() noexcept; + metric_groups(metric_groups&&) = default; + virtual ~metric_groups(); + metric_groups& operator=(metric_groups&&) = default; + /*! + * \brief add metrics belong to the same group in the constructor. + * + * combine the constructor with the add_group functionality. + */ + metric_groups(std::initializer_list mg); + + /*! + * \brief Add metrics belonging to the same group. + * + * Use the metrics creation functions to add metrics. + * + * For example: + * _metrics.add_group("my_group", { + * make_counter("my_counter_name1", counter, description("my counter description")), + * make_counter("my_counter_name2", counter, description("my second counter description")), + * make_gauge("my_gauge_name1", gauge, description("my gauge description")), + * }); + * + * Metric name should be unique inside the group. 
+ * You can chain add_group calls like: + * _metrics.add_group("my group1", {...}).add_group("my group2", {...}); + * + * This overload (with initializer_list) is needed because metric_definition + * has no copy constructor, so the other overload (with vector) cannot be + * invoked on a braced-init-list. + */ + metric_groups& add_group(const group_name_type& name, const std::initializer_list& l); + + /*! + * \brief Add metrics belonging to the same group. + * + * Use the metrics creation functions to add metrics. + * + * For example: + * vector v; + * v.push_back(make_counter("my_counter_name1", counter, description("my counter description"))); + * v.push_back(make_counter("my_counter_name2", counter, description("my second counter description"))); + * v.push_back(make_gauge("my_gauge_name1", gauge, description("my gauge description"))); + * _metrics.add_group("my_group", v); + * + * Metric name should be unique inside the group. + * You can chain add_group calls like: + * _metrics.add_group("my group1", vec1).add_group("my group2", vec2); + */ + metric_groups& add_group(const group_name_type& name, const std::vector& l); + + /*! + * \brief clear all metrics groups registrations. + */ + void clear(); +}; + + +/*! + * \brief hold a single metric group + * Initialization is done in the constructor or + * with a call to add_group + */ +class metric_group : public metric_groups { +public: + metric_group() noexcept; + metric_group(const metric_group&) = delete; + metric_group(metric_group&&) = default; + virtual ~metric_group(); + metric_group& operator=(metric_group&&) = default; + + /*! + * \brief add metrics belong to the same group in the constructor. 
+ * + * + */ + metric_group(const group_name_type& name, std::initializer_list l); +}; + + +} +} diff --git a/src/seastar/include/seastar/core/metrics_types.hh b/src/seastar/include/seastar/core/metrics_types.hh new file mode 100644 index 000000000..13d79cb21 --- /dev/null +++ b/src/seastar/include/seastar/core/metrics_types.hh @@ -0,0 +1,83 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2016 ScyllaDB + */ + +#pragma once +#include + +namespace seastar { +namespace metrics { + + +/*! + * \brief Histogram bucket type + * + * A histogram bucket contains an upper bound and the number + * of events in the buckets. + */ +struct histogram_bucket { + uint64_t count = 0; // number of events. + double upper_bound = 0; // Inclusive. +}; + + +/*! + * \brief Histogram data type + * + * The histogram struct is a container for histogram values. + * It is not a histogram implementation but it will be used by histogram + * implementation to return its internal values. + */ +struct histogram { + uint64_t sample_count = 0; + double sample_sum = 0; + std::vector buckets; // Ordered in increasing order of upper_bound, +Inf bucket is optional. + + /*! 
+ * \brief Addition assigning a histogram + * + * The histogram must match the buckets upper bounds + * or an exception will be thrown + */ + histogram& operator+=(const histogram& h); + + /*! + * \brief Addition of histograms + * + * Add two histograms and return the result as a new histogram + * The histogram must match the buckets upper bounds + * or an exception will be thrown + */ + histogram operator+(const histogram& h) const; + + /*! + * \brief Addition of histograms + * + * Add two histograms and return the result as a new histogram + * The histogram must match the buckets upper bounds + * or an exception will be thrown + */ + histogram operator+(histogram&& h) const; + +}; + +} + +} diff --git a/src/seastar/include/seastar/core/on_internal_error.hh b/src/seastar/include/seastar/core/on_internal_error.hh new file mode 100644 index 000000000..5c56a9c45 --- /dev/null +++ b/src/seastar/include/seastar/core/on_internal_error.hh @@ -0,0 +1,56 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright 2020 ScyllaDB + */ + +#pragma once + +#include + +namespace seastar { + +class logger; + +/// Controls whether on_internal_error() aborts or throws. The default +/// is to throw. 
+void set_abort_on_internal_error(bool do_abort); + +/// Report an internal error +/// +/// Depending on the value passed to set_abort_on_internal_error, this +/// will either log to \p logger and abort or throw a std::runtime_error. +[[noreturn]] void on_internal_error(logger& logger, std::string_view reason); + +/// Report an internal error +/// +/// Depending on the value passed to set_abort_on_internal_error, this +/// will either log to \p logger and abort or throw the passed-in +/// \p ex. +/// This overload cannot attach a backtrace to the exception, so if the +/// caller wishes to have one attached they have to do it themselves. +[[noreturn]] void on_internal_error(logger& logger, std::exception_ptr ex); + +/// Report an internal error in a noexcept context +/// +/// The error will be logged to \p logger and if set_abort_on_internal_error +/// was set to true, the program will be aborted. This overload can be used +/// in noexcept contexts like destructors or noexcept functions. +void on_internal_error_noexcept(logger& logger, std::string_view reason) noexcept; + +} diff --git a/src/seastar/include/seastar/core/pipe.hh b/src/seastar/include/seastar/core/pipe.hh new file mode 100644 index 000000000..d69484384 --- /dev/null +++ b/src/seastar/include/seastar/core/pipe.hh @@ -0,0 +1,267 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2015 Cloudius Systems, Ltd. + */ + +#pragma once + +#include +#include + +#include + +/// \defgroup fiber-module Fibers +/// +/// \brief Fibers of execution +/// +/// Seastar continuations are normally short, but often chained to one +/// another, so that one continuation does a bit of work and then schedules +/// another continuation for later. Such chains can be long, and often even +/// involve loopings - see for example \ref repeat. We call such chains +/// "fibers" of execution. +/// +/// These fibers are not threads - each is just a string of continuations - +/// but they share some common requirements with traditional threads. +/// For example, we want to avoid one fiber getting starved while a second +/// fiber continuously runs its continuations one after another. +/// As another example, fibers may want to communicate - e.g., one fiber +/// produces data that a second fiber consumes, and we wish to ensure that +/// both fibers get a chance to run, and that if one stops prematurely, +/// the other doesn't hang forever. 
+/// +/// Consult the following table to see which APIs are useful for fiber tasks: +/// +/// Task | APIs +/// -----------------------------------------------|------------------- +/// Repeat a blocking task indefinitely | \ref keep_doing() +/// Repeat a blocking task, then exit | \ref repeat(), \ref do_until() +/// Provide mutual exclusion between two tasks | \ref semaphore, \ref shared_mutex +/// Pass a stream of data between two fibers | \ref seastar::pipe +/// Safely shut down a resource | \ref seastar::gate +/// Hold on to an object while a fiber is running | \ref do_with() +/// + +/// Seastar API namespace +namespace seastar { + +/// \addtogroup fiber-module +/// @{ + +class broken_pipe_exception : public std::exception { +public: + virtual const char* what() const noexcept { + return "Broken pipe"; + } +}; + +class unread_overflow_exception : public std::exception { +public: + virtual const char* what() const noexcept { + return "pipe_reader::unread() overflow"; + } +}; + +/// \cond internal +namespace internal { +template +class pipe_buffer { +private: + queue> _buf; + bool _read_open = true; + bool _write_open = true; +public: + pipe_buffer(size_t size) : _buf(size) {} + future> read() { + return _buf.pop_eventually(); + } + future<> write(T&& data) { + return _buf.push_eventually(std::move(data)); + } + bool readable() const { + return _write_open || !_buf.empty(); + } + bool writeable() const { + return _read_open; + } + bool close_read() { + // If a writer blocking (on a full queue), need to stop it. + if (_buf.full()) { + _buf.abort(std::make_exception_ptr(broken_pipe_exception())); + } + _read_open = false; + return !_write_open; + } + bool close_write() { + // If the queue is empty, write the EOF (disengaged optional) to the + // queue to wake a blocked reader. If the queue is not empty, there is + // no need to write the EOF to the queue - the reader will return an + // EOF when it sees that _write_open == false. 
+ if (_buf.empty()) { + _buf.push({}); + } + _write_open = false; + return !_read_open; + } +}; +} // namespace internal +/// \endcond + +template +class pipe; + +/// \brief Read side of a \ref seastar::pipe +/// +/// The read side of a pipe, which allows only reading from the pipe. +/// A pipe_reader object cannot be created separately, but only as part of a +/// reader/writer pair through \ref seastar::pipe. +template +class pipe_reader { +private: + internal::pipe_buffer *_bufp; + std::optional _unread; + pipe_reader(internal::pipe_buffer *bufp) : _bufp(bufp) { } + friend class pipe; +public: + /// \brief Read next item from the pipe + /// + /// Returns a future value, which is fulfilled when the pipe's buffer + /// becomes non-empty, or the write side is closed. The value returned + /// is an optional, which is disengaged to mark and end of file + /// (i.e., the write side was closed, and we've read everything it sent). + future> read() { + if (_unread) { + auto ret = std::move(*_unread); + _unread = {}; + return make_ready_future>(std::move(ret)); + } + if (_bufp->readable()) { + return _bufp->read(); + } else { + return make_ready_future>(); + } + } + /// \brief Return an item to the front of the pipe + /// + /// Pushes the given item to the front of the pipe, so it will be + /// returned by the next read() call. The typical use case is to + /// unread() the last item returned by read(). + /// More generally, it is legal to unread() any item, not just one + /// previously returned by read(), but note that the unread() is limited + /// to just one item - two calls to unread() without an intervening call + /// to read() will cause an exception. 
+ void unread(T&& item) { + if (_unread) { + throw unread_overflow_exception(); + } + _unread = std::move(item); + } + ~pipe_reader() { + if (_bufp && _bufp->close_read()) { + delete _bufp; + } + } + // Allow move, but not copy, of pipe_reader + pipe_reader(pipe_reader&& other) : _bufp(other._bufp) { + other._bufp = nullptr; + } + pipe_reader& operator=(pipe_reader&& other) { + std::swap(_bufp, other._bufp); + } +}; + +/// \brief Write side of a \ref seastar::pipe +/// +/// The write side of a pipe, which allows only writing to the pipe. +/// A pipe_writer object cannot be created separately, but only as part of a +/// reader/writer pair through \ref seastar::pipe. +template +class pipe_writer { +private: + internal::pipe_buffer *_bufp; + pipe_writer(internal::pipe_buffer *bufp) : _bufp(bufp) { } + friend class pipe; +public: + /// \brief Write an item to the pipe + /// + /// Returns a future value, which is fulfilled when the data was written + /// to the buffer (when it become non-full). If the data could not be + /// written because the read side was closed, an exception + /// \ref broken_pipe_exception is returned in the future. + future<> write(T&& data) { + if (_bufp->writeable()) { + return _bufp->write(std::move(data)); + } else { + return make_exception_future<>(broken_pipe_exception()); + } + } + ~pipe_writer() { + if (_bufp && _bufp->close_write()) { + delete _bufp; + } + } + // Allow move, but not copy, of pipe_writer + pipe_writer(pipe_writer&& other) : _bufp(other._bufp) { + other._bufp = nullptr; + } + pipe_writer& operator=(pipe_writer&& other) { + std::swap(_bufp, other._bufp); + } +}; + +/// \brief A fixed-size pipe for communicating between two fibers. +/// +/// A pipe is a mechanism to transfer data between two fibers, one +/// producing data, and the other consuming it. 
The fixed-size buffer also +/// ensures a balanced execution of the two fibers, because the producer +/// fiber blocks when it writes to a full pipe, until the consumer fiber gets +/// to run and read from the pipe. +/// +/// A pipe resembles a Unix pipe, in that it has a read side, a write side, +/// and a fixed-sized buffer between them, and supports either end to be closed +/// independently (and EOF or broken pipe when using the other side). +/// A pipe object holds the reader and write sides of the pipe as two +/// separate objects. These objects can be moved into two different fibers. +/// Importantly, if one of the pipe ends is destroyed (i.e., the continuations +/// capturing it end), the other end of the pipe will stop blocking, so the +/// other fiber will not hang. +/// +/// The pipe's read and write interfaces are future-based blocking. I.e., the +/// write() and read() methods return a future which is fulfilled when the +/// operation is complete. The pipe is single-reader single-writer, meaning +/// that until the future returned by read() is fulfilled, read() must not be +/// called again (and same for write). +/// +/// Note: The pipe reader and writer are movable, but *not* copyable. It is +/// often convenient to wrap each end in a shared pointer, so it can be +/// copied (e.g., used in an std::function which needs to be copyable) or +/// easily captured into multiple continuations. 
+template +class pipe { +public: + pipe_reader reader; + pipe_writer writer; + explicit pipe(size_t size) : pipe(new internal::pipe_buffer(size)) { } +private: + pipe(internal::pipe_buffer *bufp) : reader(bufp), writer(bufp) { } +}; + + +/// @} + +} // namespace seastar diff --git a/src/seastar/include/seastar/core/polymorphic_temporary_buffer.hh b/src/seastar/include/seastar/core/polymorphic_temporary_buffer.hh new file mode 100644 index 000000000..437bba47f --- /dev/null +++ b/src/seastar/include/seastar/core/polymorphic_temporary_buffer.hh @@ -0,0 +1,43 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +/* + * Copyright (C) 2019 Elazar Leibovich + */ + +#pragma once + +#include +#include + +namespace seastar { + +/// Creates a `temporary_buffer` allocated by a custom allocator +/// +/// \param allocator allocator to use when allocating the temporary_buffer +/// \param size size of the temporary buffer +template +temporary_buffer make_temporary_buffer(std::pmr::polymorphic_allocator* allocator, std::size_t size) { + if (allocator == memory::malloc_allocator) { + return temporary_buffer(size); + } + CharType *buffer = allocator->allocate(size); + return temporary_buffer(buffer, size, + make_deleter(deleter(), [allocator, buffer, size] () mutable { allocator->deallocate(buffer, size); })); +} + +} diff --git a/src/seastar/include/seastar/core/posix.hh b/src/seastar/include/seastar/core/posix.hh new file mode 100644 index 000000000..f8dece37c --- /dev/null +++ b/src/seastar/include/seastar/core/posix.hh @@ -0,0 +1,492 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +/* + * Copyright (C) 2016 ScyllaDB + */ + +#pragma once + +#include +#include "abort_on_ebadf.hh" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace seastar { + +/// \file +/// \defgroup posix-support POSIX Support +/// +/// Mostly-internal APIs to provide C++ glue for the underlying POSIX platform; +/// but can be used by the application when they don't block. +/// +/// \addtogroup posix-support +/// @{ + +inline void throw_system_error_on(bool condition, const char* what_arg = ""); + +template +inline void throw_kernel_error(T r); + +struct mmap_deleter { + size_t _size; + void operator()(void* ptr) const; +}; + +using mmap_area = std::unique_ptr; + +mmap_area mmap_anonymous(void* addr, size_t length, int prot, int flags); + +class file_desc { + int _fd; +public: + file_desc() = delete; + file_desc(const file_desc&) = delete; + file_desc(file_desc&& x) noexcept : _fd(x._fd) { x._fd = -1; } + ~file_desc() { if (_fd != -1) { ::close(_fd); } } + void operator=(const file_desc&) = delete; + file_desc& operator=(file_desc&& x) { + if (this != &x) { + std::swap(_fd, x._fd); + if (x._fd != -1) { + x.close(); + } + } + return *this; + } + void close() { + assert(_fd != -1); + auto r = ::close(_fd); + throw_system_error_on(r == -1, "close"); + _fd = -1; + } + int get() const { return _fd; } + + static file_desc from_fd(int fd) { + return file_desc(fd); + } + + static file_desc open(sstring name, int flags, mode_t mode = 0) { + int fd = ::open(name.c_str(), flags, mode); + throw_system_error_on(fd == -1, "open"); + return file_desc(fd); + } + static file_desc socket(int family, int type, int protocol = 0) { + int fd = ::socket(family, type, protocol); + throw_system_error_on(fd == -1, "socket"); + return file_desc(fd); + } + static file_desc eventfd(unsigned initval, int flags) { + int fd = 
::eventfd(initval, flags); + throw_system_error_on(fd == -1, "eventfd"); + return file_desc(fd); + } + static file_desc epoll_create(int flags = 0) { + int fd = ::epoll_create1(flags); + throw_system_error_on(fd == -1, "epoll_create1"); + return file_desc(fd); + } + static file_desc timerfd_create(int clockid, int flags) { + int fd = ::timerfd_create(clockid, flags); + throw_system_error_on(fd == -1, "timerfd_create"); + return file_desc(fd); + } + static file_desc temporary(sstring directory); + file_desc dup() const { + int fd = ::dup(get()); + throw_system_error_on(fd == -1, "dup"); + return file_desc(fd); + } + file_desc accept(socket_address& sa, int flags = 0) { + auto ret = ::accept4(_fd, &sa.as_posix_sockaddr(), &sa.addr_length, flags); + throw_system_error_on(ret == -1, "accept4"); + return file_desc(ret); + } + static file_desc inotify_init(int flags); + // return nullopt if no connection is availbale to be accepted + std::optional try_accept(socket_address& sa, int flags = 0) { + auto ret = ::accept4(_fd, &sa.as_posix_sockaddr(), &sa.addr_length, flags); + if (ret == -1 && errno == EAGAIN) { + return {}; + } + throw_system_error_on(ret == -1, "accept4"); + return file_desc(ret); + } + void shutdown(int how) { + auto ret = ::shutdown(_fd, how); + if (ret == -1 && errno != ENOTCONN) { + throw_system_error_on(ret == -1, "shutdown"); + } + } + void truncate(size_t size) { + auto ret = ::ftruncate(_fd, size); + throw_system_error_on(ret, "ftruncate"); + } + int ioctl(int request) { + return ioctl(request, 0); + } + int ioctl(int request, int value) { + int r = ::ioctl(_fd, request, value); + throw_system_error_on(r == -1, "ioctl"); + return r; + } + int ioctl(int request, unsigned int value) { + int r = ::ioctl(_fd, request, value); + throw_system_error_on(r == -1, "ioctl"); + return r; + } + template + int ioctl(int request, X& data) { + int r = ::ioctl(_fd, request, &data); + throw_system_error_on(r == -1, "ioctl"); + return r; + } + template + int 
ioctl(int request, X&& data) { + int r = ::ioctl(_fd, request, &data); + throw_system_error_on(r == -1, "ioctl"); + return r; + } + template + int setsockopt(int level, int optname, X&& data) { + int r = ::setsockopt(_fd, level, optname, &data, sizeof(data)); + throw_system_error_on(r == -1, "setsockopt"); + return r; + } + int setsockopt(int level, int optname, const char* data) { + int r = ::setsockopt(_fd, level, optname, data, strlen(data) + 1); + throw_system_error_on(r == -1, "setsockopt"); + return r; + } + int setsockopt(int level, int optname, const void* data, socklen_t len) { + int r = ::setsockopt(_fd, level, optname, data, len); + throw_system_error_on(r == -1, "setsockopt"); + return r; + } + template + Data getsockopt(int level, int optname) { + Data data; + socklen_t len = sizeof(data); + memset(&data, 0, len); + int r = ::getsockopt(_fd, level, optname, &data, &len); + throw_system_error_on(r == -1, "getsockopt"); + return data; + } + int getsockopt(int level, int optname, char* data, socklen_t len) { + int r = ::getsockopt(_fd, level, optname, data, &len); + throw_system_error_on(r == -1, "getsockopt"); + return r; + } + size_t size() { + struct stat buf; + auto r = ::fstat(_fd, &buf); + throw_system_error_on(r == -1, "fstat"); + return buf.st_size; + } + std::optional read(void* buffer, size_t len) { + auto r = ::read(_fd, buffer, len); + if (r == -1 && errno == EAGAIN) { + return {}; + } + throw_system_error_on(r == -1, "read"); + return { size_t(r) }; + } + std::optional recv(void* buffer, size_t len, int flags) { + auto r = ::recv(_fd, buffer, len, flags); + if (r == -1 && errno == EAGAIN) { + return {}; + } + throw_system_error_on(r == -1, "recv"); + return { ssize_t(r) }; + } + std::optional recvmsg(msghdr* mh, int flags) { + auto r = ::recvmsg(_fd, mh, flags); + if (r == -1 && errno == EAGAIN) { + return {}; + } + throw_system_error_on(r == -1, "recvmsg"); + return { size_t(r) }; + } + std::optional send(const void* buffer, size_t len, int 
flags) { + auto r = ::send(_fd, buffer, len, flags); + if (r == -1 && errno == EAGAIN) { + return {}; + } + throw_system_error_on(r == -1, "send"); + return { size_t(r) }; + } + std::optional sendto(socket_address& addr, const void* buf, size_t len, int flags) { + auto r = ::sendto(_fd, buf, len, flags, &addr.u.sa, addr.length()); + if (r == -1 && errno == EAGAIN) { + return {}; + } + throw_system_error_on(r == -1, "sendto"); + return { size_t(r) }; + } + std::optional sendmsg(const msghdr* msg, int flags) { + auto r = ::sendmsg(_fd, msg, flags); + if (r == -1 && errno == EAGAIN) { + return {}; + } + throw_system_error_on(r == -1, "sendmsg"); + return { size_t(r) }; + } + void bind(sockaddr& sa, socklen_t sl) { + auto r = ::bind(_fd, &sa, sl); + throw_system_error_on(r == -1, "bind"); + } + void connect(sockaddr& sa, socklen_t sl) { + auto r = ::connect(_fd, &sa, sl); + if (r == -1 && errno == EINPROGRESS) { + return; + } + throw_system_error_on(r == -1, "connect"); + } + socket_address get_address() { + socket_address addr; + auto r = ::getsockname(_fd, &addr.u.sa, &addr.addr_length); + throw_system_error_on(r == -1, "getsockname"); + return addr; + } + void listen(int backlog) { + auto fd = ::listen(_fd, backlog); + throw_system_error_on(fd == -1, "listen"); + } + std::optional write(const void* buf, size_t len) { + auto r = ::write(_fd, buf, len); + if (r == -1 && errno == EAGAIN) { + return {}; + } + throw_system_error_on(r == -1, "write"); + return { size_t(r) }; + } + std::optional writev(const iovec *iov, int iovcnt) { + auto r = ::writev(_fd, iov, iovcnt); + if (r == -1 && errno == EAGAIN) { + return {}; + } + throw_system_error_on(r == -1, "writev"); + return { size_t(r) }; + } + size_t pread(void* buf, size_t len, off_t off) { + auto r = ::pread(_fd, buf, len, off); + throw_system_error_on(r == -1, "pread"); + return size_t(r); + } + void timerfd_settime(int flags, const itimerspec& its) { + auto r = ::timerfd_settime(_fd, flags, &its, NULL); + 
throw_system_error_on(r == -1, "timerfd_settime"); + } + + mmap_area map(size_t size, unsigned prot, unsigned flags, size_t offset, + void* addr = nullptr) { + void *x = mmap(addr, size, prot, flags, _fd, offset); + throw_system_error_on(x == MAP_FAILED, "mmap"); + return mmap_area(static_cast(x), mmap_deleter{size}); + } + + mmap_area map_shared_rw(size_t size, size_t offset) { + return map(size, PROT_READ | PROT_WRITE, MAP_SHARED, offset); + } + + mmap_area map_shared_ro(size_t size, size_t offset) { + return map(size, PROT_READ, MAP_SHARED, offset); + } + + mmap_area map_private_rw(size_t size, size_t offset) { + return map(size, PROT_READ | PROT_WRITE, MAP_PRIVATE, offset); + } + + mmap_area map_private_ro(size_t size, size_t offset) { + return map(size, PROT_READ, MAP_PRIVATE, offset); + } + +private: + file_desc(int fd) : _fd(fd) {} + }; + + +namespace posix { + +/// Converts a duration value to a `timespec` +/// +/// \param d a duration value to convert to the POSIX `timespec` format +/// \return `d` as a `timespec` value +template +struct timespec +to_timespec(std::chrono::duration d) { + auto ns = std::chrono::duration_cast(d).count(); + struct timespec ts {}; + ts.tv_sec = ns / 1000000000; + ts.tv_nsec = ns % 1000000000; + return ts; +} + +/// Converts a relative start time and an interval to an `itimerspec` +/// +/// \param base First expiration of the timer, relative to the current time +/// \param interval period for re-arming the timer +/// \return `base` and `interval` converted to an `itimerspec` +template +struct itimerspec +to_relative_itimerspec(std::chrono::duration base, std::chrono::duration interval) { + struct itimerspec its {}; + its.it_interval = to_timespec(interval); + its.it_value = to_timespec(base); + return its; +} + + +/// Converts a time_point and a duration to an `itimerspec` +/// +/// \param base base time for the timer; must use the same clock as the timer +/// \param interval period for re-arming the timer +/// \return `base` 
and `interval` converted to an `itimerspec` +template +struct itimerspec +to_absolute_itimerspec(std::chrono::time_point base, std::chrono::duration interval) { + return to_relative_itimerspec(base.time_since_epoch(), interval); +} + +} + +class posix_thread { +public: + class attr; +private: + // must allocate, since this class is moveable + std::unique_ptr> _func; + pthread_t _pthread; + bool _valid = true; + mmap_area _stack; +private: + static void* start_routine(void* arg) noexcept; +public: + posix_thread(std::function func); + posix_thread(attr a, std::function func); + posix_thread(posix_thread&& x); + ~posix_thread(); + void join(); +public: + class attr { + public: + struct stack_size { size_t size = 0; }; + attr() = default; + template + attr(A... a) { + set(std::forward(a)...); + } + void set() {} + template + void set(A a, Rest... rest) { + set(std::forward(a)); + set(std::forward(rest)...); + } + void set(stack_size ss) { _stack_size = ss; } + private: + stack_size _stack_size; + friend class posix_thread; + }; +}; + + +inline +void throw_system_error_on(bool condition, const char* what_arg) { + if (condition) { + if ((errno == EBADF || errno == ENOTSOCK) && is_abort_on_ebadf_enabled()) { + abort(); + } + throw std::system_error(errno, std::system_category(), what_arg); + } +} + +template +inline +void throw_kernel_error(T r) { + static_assert(std::is_signed::value, "kernel error variables must be signed"); + if (r < 0) { + auto ec = -r; + if ((ec == EBADF || ec == ENOTSOCK) && is_abort_on_ebadf_enabled()) { + abort(); + } + throw std::system_error(-r, std::system_category()); + } +} + +template +inline +void throw_pthread_error(T r) { + if (r != 0) { + throw std::system_error(r, std::system_category()); + } +} + +inline +sigset_t make_sigset_mask(int signo) { + sigset_t set; + sigemptyset(&set); + sigaddset(&set, signo); + return set; +} + +inline +sigset_t make_full_sigset_mask() { + sigset_t set; + sigfillset(&set); + return set; +} + +inline 
+sigset_t make_empty_sigset_mask() { + sigset_t set; + sigemptyset(&set); + return set; +} + +inline +void pin_this_thread(unsigned cpu_id) { + cpu_set_t cs; + CPU_ZERO(&cs); + CPU_SET(cpu_id, &cs); + auto r = pthread_setaffinity_np(pthread_self(), sizeof(cs), &cs); + assert(r == 0); + (void)r; +} + +/// @} + +} diff --git a/src/seastar/include/seastar/core/preempt.hh b/src/seastar/include/seastar/core/preempt.hh new file mode 100644 index 000000000..722c7831e --- /dev/null +++ b/src/seastar/include/seastar/core/preempt.hh @@ -0,0 +1,58 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2016 ScyllaDB. + */ + +#pragma once +#include + +namespace seastar { + +namespace internal { + +struct preemption_monitor { + // We preempt when head != tail + // This happens to match the Linux aio completion ring, so we can have the + // kernel preempt a task by queuing a completion event to an io_context. 
+ std::atomic head; + std::atomic tail; +}; + +} + +extern __thread const internal::preemption_monitor* g_need_preempt; + +inline bool need_preempt() noexcept { +#ifndef SEASTAR_DEBUG + // prevent compiler from eliminating loads in a loop + std::atomic_signal_fence(std::memory_order_seq_cst); + auto np = g_need_preempt; + // We aren't reading anything from the ring, so we don't need + // any barriers. + auto head = np->head.load(std::memory_order_relaxed); + auto tail = np->tail.load(std::memory_order_relaxed); + // Possible optimization: read head and tail in a single 64-bit load, + // and find a funky way to compare the two 32-bit halves. + return __builtin_expect(head != tail, false); +#else + return true; +#endif +} + +} diff --git a/src/seastar/include/seastar/core/prefetch.hh b/src/seastar/include/seastar/core/prefetch.hh new file mode 100644 index 000000000..73f9abd3f --- /dev/null +++ b/src/seastar/include/seastar/core/prefetch.hh @@ -0,0 +1,115 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace seastar { + +template +struct prefetcher; + +template +struct prefetcher<0, RW, LOC> { + prefetcher(uintptr_t ptr) {} +}; + +template +struct prefetcher { + prefetcher(uintptr_t ptr) { + __builtin_prefetch(reinterpret_cast(ptr), RW, LOC); + std::atomic_signal_fence(std::memory_order_seq_cst); + prefetcher(ptr + cache_line_size); + } +}; + +// LOC is a locality from __buitin_prefetch() gcc documentation: +// "The value locality must be a compile-time constant integer between zero and three. A value of +// zero means that the data has no temporal locality, so it need not be left in the cache after +// the access. A value of three means that the data has a high degree of temporal locality and +// should be left in all levels of cache possible. Values of one and two mean, respectively, a +// low or moderate degree of temporal locality. The default is three." +template +void prefetch(T* ptr) { + prefetcher(reinterpret_cast(ptr)); +} + +template +void prefetch(Iterator begin, Iterator end) { + std::for_each(begin, end, [] (auto v) { prefetch(v); }); +} + +template +void prefetch_n(T** pptr) { + boost::mpl::for_each< boost::mpl::range_c >( [pptr] (size_t x) { prefetch(*(pptr + x)); } ); +} + +template +void prefetch(void* ptr) { + prefetcher(reinterpret_cast(ptr)); +} + +template +void prefetch_n(Iterator begin, Iterator end) { + std::for_each(begin, end, [] (auto v) { prefetch(v); }); +} + +template +void prefetch_n(T** pptr) { + boost::mpl::for_each< boost::mpl::range_c >( [pptr] (size_t x) { prefetch(*(pptr + x)); } ); +} + +template +void prefetchw(T* ptr) { + prefetcher(reinterpret_cast(ptr)); +} + +template +void prefetchw_n(Iterator begin, Iterator end) { + std::for_each(begin, end, [] (auto v) { prefetchw(v); }); +} + +template +void prefetchw_n(T** pptr) { + boost::mpl::for_each< boost::mpl::range_c >( [pptr] (size_t x) { prefetchw(*(pptr + x)); } ); +} + +template +void 
prefetchw(void* ptr) { + prefetcher(reinterpret_cast(ptr)); +} + +template +void prefetchw_n(Iterator begin, Iterator end) { + std::for_each(begin, end, [] (auto v) { prefetchw(v); }); +} + +template +void prefetchw_n(T** pptr) { + boost::mpl::for_each< boost::mpl::range_c >( [pptr] (size_t x) { prefetchw(*(pptr + x)); } ); +} + +} diff --git a/src/seastar/include/seastar/core/print.hh b/src/seastar/include/seastar/core/print.hh new file mode 100644 index 000000000..72e3934db --- /dev/null +++ b/src/seastar/include/seastar/core/print.hh @@ -0,0 +1,148 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#if 0 +inline +std::ostream& +operator<<(std::ostream& os, const void* ptr) { + auto flags = os.flags(); + os << "0x" << std::hex << reinterpret_cast(ptr); + os.flags(flags); + return os; +} +#endif + +inline +std::ostream& +operator<<(std::ostream&& os, const void* ptr) { + return os << ptr; // selects non-rvalue version +} + +namespace seastar { + +template +std::ostream& +fprint(std::ostream& os, const char* fmt, A&&... a) { + ::fmt::fprintf(os, fmt, std::forward(a)...); + return os; +} + +template +void +print(const char* fmt, A&&... 
a) { + ::fmt::printf(fmt, std::forward(a)...); +} + +template +std::string +sprint(const char* fmt, A&&... a) { + std::ostringstream os; + ::fmt::fprintf(os, fmt, std::forward(a)...); + return os.str(); +} + +template +std::string +sprint(const sstring& fmt, A&&... a) { + std::ostringstream os; + ::fmt::fprintf(os, fmt.c_str(), std::forward(a)...); + return os.str(); +} + +template +std::string +format_separated(Iterator b, Iterator e, const char* sep = ", ") { + std::string ret; + if (b == e) { + return ret; + } + ret += *b++; + while (b != e) { + ret += sep; + ret += *b++; + } + return ret; +} + +template +struct usecfmt_wrapper { + TimePoint val; +}; + +template +inline +usecfmt_wrapper +usecfmt(TimePoint tp) { + return { tp }; +}; + +template +std::ostream& +operator<<(std::ostream& os, usecfmt_wrapper>> tp) { + auto usec = std::chrono::duration_cast(tp.val.time_since_epoch()).count(); + std::ostream tmp(os.rdbuf()); + tmp << std::setw(12) << (usec / 1000000) << "." << std::setw(6) << std::setfill('0') << (usec % 1000000); + return os; +} + +template +void +log(A&&... a) { + std::cout << usecfmt(std::chrono::high_resolution_clock::now()) << " "; + print(std::forward(a)...); +} + +/** + * Evaluate the formatted string in a native fmt library format + * + * @param fmt format string with the native fmt library syntax + * @param a positional parameters + * + * @return sstring object with the result of applying the given positional + * parameters on a given format string. + */ +template +sstring +format(const char* fmt, A&&... a) { + fmt::memory_buffer out; + fmt::format_to(out, fmt, std::forward(a)...); + return sstring{out.data(), out.size()}; +} + +// temporary, use fmt::print() instead +template +std::ostream& +fmt_print(std::ostream& os, const char* format, A&&... 
a) { + fmt::print(os, format, std::forward(a)...); + return os; +} + +} diff --git a/src/seastar/include/seastar/core/prometheus.hh b/src/seastar/include/seastar/core/prometheus.hh new file mode 100644 index 000000000..70c18894f --- /dev/null +++ b/src/seastar/include/seastar/core/prometheus.hh @@ -0,0 +1,51 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2016 ScyllaDB + */ + +#pragma once + +#include +#include +#include + +namespace seastar { + +namespace prometheus { + +/*! 
+ * Holds prometheus related configuration + */ +struct config { + sstring metric_help; //!< Default help message for the returned metrics + sstring hostname; //!< hostname is deprecated, use label instead + std::optional label; //!< A label that will be added to all metrics, we advice not to use it and set it on the prometheus server + sstring prefix = "seastar"; //!< a prefix that will be added to metric names +}; + +future<> start(httpd::http_server_control& http_server, config ctx); + +/// \defgroup add_prometheus_routes adds a /metrics endpoint that returns prometheus metrics +/// both in txt format and in protobuf according to the prometheus spec +/// @{ +future<> add_prometheus_routes(distributed& server, config ctx); +future<> add_prometheus_routes(http_server& server, config ctx); +/// @} +} +} diff --git a/src/seastar/include/seastar/core/queue.hh b/src/seastar/include/seastar/core/queue.hh new file mode 100644 index 000000000..b5bf8e049 --- /dev/null +++ b/src/seastar/include/seastar/core/queue.hh @@ -0,0 +1,279 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace seastar { + +/// Asynchronous single-producer single-consumer queue with limited capacity. +/// There can be at most one producer-side and at most one consumer-side operation active at any time. +/// Operations returning a future are considered to be active until the future resolves. +template +class queue { + std::queue> _q; + size_t _max; + std::optional> _not_empty; + std::optional> _not_full; + std::exception_ptr _ex = nullptr; +private: + void notify_not_empty(); + void notify_not_full(); +public: + explicit queue(size_t size); + + /// \brief Push an item. + /// + /// Returns false if the queue was full and the item was not pushed. + bool push(T&& a); + + /// \brief Pop an item. + /// + /// Popping from an empty queue will result in undefined behavior. + T pop(); + + /// Consumes items from the queue, passing them to \c func, until \c func + /// returns false or the queue it empty + /// + /// Returns false if func returned false. + template + bool consume(Func&& func); + + /// Returns true when the queue is empty. + bool empty() const; + + /// Returns true when the queue is full. + bool full() const; + + /// Returns a future<> that becomes available when pop() or consume() + /// can be called. + /// A consumer-side operation. Cannot be called concurrently with other consumer-side operations. + future<> not_empty(); + + /// Returns a future<> that becomes available when push() can be called. + /// A producer-side operation. Cannot be called concurrently with other producer-side operations. + future<> not_full(); + + /// Pops element now or when there is some. Returns a future that becomes + /// available when some element is available. + /// If the queue is, or already was, abort()ed, the future resolves with + /// the exception provided to abort(). + /// A consumer-side operation. Cannot be called concurrently with other consumer-side operations. 
+ future pop_eventually(); + + /// Pushes the element now or when there is room. Returns a future<> which + /// resolves when data was pushed. + /// If the queue is, or already was, abort()ed, the future resolves with + /// the exception provided to abort(). + /// A producer-side operation. Cannot be called concurrently with other producer-side operations. + future<> push_eventually(T&& data); + + /// Returns the number of items currently in the queue. + size_t size() const { return _q.size(); } + + /// Returns the size limit imposed on the queue during its construction + /// or by a call to set_max_size(). If the queue contains max_size() + /// items (or more), further items cannot be pushed until some are popped. + size_t max_size() const { return _max; } + + /// Set the maximum size to a new value. If the queue's max size is reduced, + /// items already in the queue will not be expunged and the queue will be temporarily + /// bigger than its max_size. + void set_max_size(size_t max) { + _max = max; + if (!full()) { + notify_not_full(); + } + } + + /// Destroy any items in the queue, and pass the provided exception to any + /// waiting readers or writers - or to any later read or write attempts. 
+ void abort(std::exception_ptr ex) { + while (!_q.empty()) { + _q.pop(); + } + _ex = ex; + if (_not_full) { + _not_full->set_exception(ex); + _not_full= std::nullopt; + } + if (_not_empty) { + _not_empty->set_exception(std::move(ex)); + _not_empty = std::nullopt; + } + } + + /// \brief Check if there is an active consumer + /// + /// Returns true if another fiber waits for an item to be pushed into the queue + bool has_blocked_consumer() const { + return bool(_not_empty); + } +}; + +template +inline +queue::queue(size_t size) + : _max(size) { +} + +template +inline +void queue::notify_not_empty() { + if (_not_empty) { + _not_empty->set_value(); + _not_empty = std::optional>(); + } +} + +template +inline +void queue::notify_not_full() { + if (_not_full) { + _not_full->set_value(); + _not_full = std::optional>(); + } +} + +template +inline +bool queue::push(T&& data) { + if (_q.size() < _max) { + _q.push(std::move(data)); + notify_not_empty(); + return true; + } else { + return false; + } +} + +template +inline +T queue::pop() { + if (_q.size() == _max) { + notify_not_full(); + } + T data = std::move(_q.front()); + _q.pop(); + return data; +} + +template +inline +future queue::pop_eventually() { + if (_ex) { + return make_exception_future(_ex); + } + if (empty()) { + return not_empty().then([this] { + if (_ex) { + return make_exception_future(_ex); + } else { + return make_ready_future(pop()); + } + }); + } else { + return make_ready_future(pop()); + } +} + +template +inline +future<> queue::push_eventually(T&& data) { + if (_ex) { + return make_exception_future<>(_ex); + } + if (full()) { + return not_full().then([this, data = std::move(data)] () mutable { + _q.push(std::move(data)); + notify_not_empty(); + }); + } else { + _q.push(std::move(data)); + notify_not_empty(); + return make_ready_future<>(); + } +} + +template +template +inline +bool queue::consume(Func&& func) { + if (_ex) { + std::rethrow_exception(_ex); + } + bool running = true; + while (!_q.empty() 
&& running) { + running = func(std::move(_q.front())); + _q.pop(); + } + if (!full()) { + notify_not_full(); + } + return running; +} + +template +inline +bool queue::empty() const { + return _q.empty(); +} + +template +inline +bool queue::full() const { + return _q.size() >= _max; +} + +template +inline +future<> queue::not_empty() { + if (_ex) { + return make_exception_future<>(_ex); + } + if (!empty()) { + return make_ready_future<>(); + } else { + _not_empty = promise<>(); + return _not_empty->get_future(); + } +} + +template +inline +future<> queue::not_full() { + if (_ex) { + return make_exception_future<>(_ex); + } + if (!full()) { + return make_ready_future<>(); + } else { + _not_full = promise<>(); + return _not_full->get_future(); + } +} + +} + diff --git a/src/seastar/include/seastar/core/ragel.hh b/src/seastar/include/seastar/core/ragel.hh new file mode 100644 index 000000000..de14035e9 --- /dev/null +++ b/src/seastar/include/seastar/core/ragel.hh @@ -0,0 +1,140 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace seastar { + +// Support classes for Ragel parsers + +// Builds an sstring that can be scattered across multiple packets. +// +// Use a sstring_build::guard variable to designate each scattered +// char array, and call mark_start() and mark_end() at the start +// and end points, respectively. sstring_builder will collect data +// from intervening segments, if needed. +// +// After mark_end() has been called, use the get() method to obtain +// the built string. +// +// FIXME: switch to string_view. +// +class sstring_builder { + sstring _value; + const char* _start = nullptr; +public: + class guard; +public: + sstring get() && { + return std::move(_value); + } + void reset() { + _value = {}; + _start = nullptr; + } + friend class guard; +}; + +class sstring_builder::guard { + sstring_builder& _builder; + const char* _block_end; +public: + guard(sstring_builder& builder, const char* block_start, const char* block_end) + : _builder(builder), _block_end(block_end) { + if (!_builder._value.empty()) { + mark_start(block_start); + } + } + ~guard() { + if (_builder._start) { + mark_end(_block_end); + } + } + void mark_start(const char* p) { + _builder._start = p; + } + void mark_end(const char* p) { + if (_builder._value.empty()) { + // avoid an allocation in the common case + _builder._value = sstring(_builder._start, p); + } else { + _builder._value += sstring(_builder._start, p); + } + _builder._start = nullptr; + } +}; + + +// CRTP +template +class ragel_parser_base { +protected: + int _fsm_cs; + std::unique_ptr _fsm_stack = nullptr; + int _fsm_stack_size = 0; + int _fsm_top; + int _fsm_act; + char* _fsm_ts; + char* _fsm_te; + sstring_builder _builder; +protected: + void init_base() { + _builder.reset(); + } + void prepush() { + if (_fsm_top == _fsm_stack_size) { + auto old = _fsm_stack_size; + _fsm_stack_size = std::max(_fsm_stack_size * 2, 16); + 
assert(_fsm_stack_size > old); + std::unique_ptr new_stack{new int[_fsm_stack_size]}; + std::copy(_fsm_stack.get(), _fsm_stack.get() + _fsm_top, new_stack.get()); + std::swap(_fsm_stack, new_stack); + } + } + void postpop() {} + sstring get_str() { + return std::move(_builder).get(); + } +public: + using unconsumed_remainder = std::optional>; + future operator()(temporary_buffer buf) { + char* p = buf.get_write(); + char* pe = p + buf.size(); + char* eof = buf.empty() ? pe : nullptr; + char* parsed = static_cast(this)->parse(p, pe, eof); + if (parsed) { + buf.trim_front(parsed - p); + return make_ready_future(std::move(buf)); + } + return make_ready_future(); + } +}; + +} diff --git a/src/seastar/include/seastar/core/reactor.hh b/src/seastar/include/seastar/core/reactor.hh new file mode 100644 index 000000000..224193247 --- /dev/null +++ b/src/seastar/include/seastar/core/reactor.hh @@ -0,0 +1,755 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +/* + * Copyright 2014 Cloudius Systems + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal/pollable_fd.hh" +#include "internal/poll.hh" + +#ifdef HAVE_OSV +#include +#include +#include +#include +#endif + +struct _Unwind_Exception; + +namespace seastar { + +using shard_id = unsigned; + +namespace alien { +class message_queue; +} +class reactor; +inline +size_t iovec_len(const std::vector& iov) +{ + size_t ret = 0; + for (auto&& e : iov) { + ret += e.iov_len; + } + return ret; +} + +} + +namespace std { + +template <> +struct hash<::sockaddr_in> { + size_t operator()(::sockaddr_in a) const { + return a.sin_port ^ a.sin_addr.s_addr; + } +}; + +} + +bool operator==(const ::sockaddr_in a, const ::sockaddr_in b); + +namespace seastar { + +void register_network_stack(sstring name, boost::program_options::options_description opts, + noncopyable_function>(boost::program_options::variables_map opts)> create, + bool make_default = false); + +class thread_pool; +class smp; + +class reactor_backend_selector; + +class reactor_backend; + +namespace internal { + +class reactor_stall_sampler; +class cpu_stall_detector; +class buffer_allocator; + +template // signature: bool () +std::unique_ptr make_pollfn(Func&& func); + +class poller { + std::unique_ptr _pollfn; + class registration_task; + class deregistration_task; + registration_task* _registration_task = nullptr; +public: + 
template // signature: bool () + static poller simple(Func&& poll) { + return poller(make_pollfn(std::forward(poll))); + } + poller(std::unique_ptr fn) + : _pollfn(std::move(fn)) { + do_register(); + } + ~poller(); + poller(poller&& x) noexcept; + poller& operator=(poller&& x) noexcept; + void do_register() noexcept; + friend class reactor; +}; + +} + +class kernel_completion; +class io_queue; +class disk_config_params; + +class io_completion : public kernel_completion { +public: + virtual void complete_with(ssize_t res) final override; + + virtual void complete(size_t res) noexcept = 0; + virtual void set_exception(std::exception_ptr eptr) noexcept = 0; +}; + +class reactor { + using sched_clock = std::chrono::steady_clock; +private: + struct task_queue; + using task_queue_list = circular_buffer_fixed_capacity; + using pollfn = seastar::pollfn; + + class signal_pollfn; + class batch_flush_pollfn; + class smp_pollfn; + class drain_cross_cpu_freelist_pollfn; + class lowres_timer_pollfn; + class manual_timer_pollfn; + class epoll_pollfn; + class reap_kernel_completions_pollfn; + class kernel_submit_work_pollfn; + class io_queue_submission_pollfn; + class syscall_pollfn; + class execution_stage_pollfn; + friend class manual_clock; + friend class file_data_source_impl; // for fstream statistics + friend class internal::reactor_stall_sampler; + friend class preempt_io_context; + friend struct hrtimer_aio_completion; + friend struct task_quota_aio_completion; + friend class reactor_backend_epoll; + friend class reactor_backend_aio; + friend class reactor_backend_selector; + friend class aio_storage_context; +public: + using poller = internal::poller; + using idle_cpu_handler_result = seastar::idle_cpu_handler_result; + using work_waiting_on_reactor = seastar::work_waiting_on_reactor; + using idle_cpu_handler = seastar::idle_cpu_handler; + + struct io_stats { + uint64_t aio_reads = 0; + uint64_t aio_read_bytes = 0; + uint64_t aio_writes = 0; + uint64_t aio_write_bytes = 
0; + uint64_t aio_errors = 0; + uint64_t fstream_reads = 0; + uint64_t fstream_read_bytes = 0; + uint64_t fstream_reads_blocked = 0; + uint64_t fstream_read_bytes_blocked = 0; + uint64_t fstream_read_aheads_discarded = 0; + uint64_t fstream_read_ahead_discarded_bytes = 0; + }; + friend void io_completion::complete_with(ssize_t); + +private: + reactor_config _cfg; + file_desc _notify_eventfd; + file_desc _task_quota_timer; +#ifdef HAVE_OSV + reactor_backend_osv _backend; + sched::thread _timer_thread; + sched::thread *_engine_thread; + mutable mutex _timer_mutex; + condvar _timer_cond; + s64 _timer_due = 0; +#else + std::unique_ptr _backend; +#endif + sigset_t _active_sigmask; // holds sigmask while sleeping with sig disabled + std::vector _pollers; + + static constexpr unsigned max_aio_per_queue = 128; + static constexpr unsigned max_queues = 8; + static constexpr unsigned max_aio = max_aio_per_queue * max_queues; + friend disk_config_params; + + // Not all reactors have IO queues. If the number of IO queues is less than the number of shards, + // some reactors will talk to foreign io_queues. If this reactor holds a valid IO queue, it will + // be stored here. 
+ std::vector> my_io_queues; + std::unordered_map _io_queues; + + std::vector ()>> _exit_funcs; + unsigned _id = 0; + bool _stopping = false; + bool _stopped = false; + bool _finished_running_tasks = false; + condition_variable _stop_requested; + bool _handle_sigint = true; + std::optional>> _network_stack_ready; + int _return = 0; + promise<> _start_promise; + semaphore _cpu_started; + internal::preemption_monitor _preemption_monitor{}; + uint64_t _global_tasks_processed = 0; + uint64_t _polls = 0; + std::unique_ptr _cpu_stall_detector; + + unsigned _max_task_backlog = 1000; + timer_set, &timer<>::_link> _timers; + timer_set, &timer<>::_link>::timer_list_t _expired_timers; + timer_set, &timer::_link> _lowres_timers; + timer_set, &timer::_link>::timer_list_t _expired_lowres_timers; + timer_set, &timer::_link> _manual_timers; + timer_set, &timer::_link>::timer_list_t _expired_manual_timers; + io_stats _io_stats; + uint64_t _fsyncs = 0; + uint64_t _cxx_exceptions = 0; + uint64_t _abandoned_failed_futures = 0; + struct task_queue { + explicit task_queue(unsigned id, sstring name, float shares); + int64_t _vruntime = 0; + float _shares; + int64_t _reciprocal_shares_times_2_power_32; + bool _current = false; + bool _active = false; + uint8_t _id; + sched_clock::time_point _ts; // to help calculating wait/starve-times + sched_clock::duration _runtime = {}; + sched_clock::duration _waittime = {}; + sched_clock::duration _starvetime = {}; + uint64_t _tasks_processed = 0; + circular_buffer _q; + sstring _name; + int64_t to_vruntime(sched_clock::duration runtime) const; + void set_shares(float shares) noexcept; + struct indirect_compare; + sched_clock::duration _time_spent_on_task_quota_violations = {}; + seastar::metrics::metric_groups _metrics; + void rename(sstring new_name); + private: + void register_stats(); + }; + + circular_buffer _pending_io; + boost::container::static_vector, max_scheduling_groups()> _task_queues; + 
internal::scheduling_group_specific_thread_local_data _scheduling_group_specific_data; + int64_t _last_vruntime = 0; + task_queue_list _active_task_queues; + task_queue_list _activating_task_queues; + task_queue* _at_destroy_tasks; + sched_clock::duration _task_quota; + task* _current_task = nullptr; + /// Handler that will be called when there is no task to execute on cpu. + /// It represents a low priority work. + /// + /// Handler's return value determines whether handler did any actual work. If no work was done then reactor will go + /// into sleep. + /// + /// Handler's argument is a function that returns true if a task which should be executed on cpu appears or false + /// otherwise. This function should be used by a handler to return early if a task appears. + idle_cpu_handler _idle_cpu_handler{ [] (work_waiting_on_reactor) {return idle_cpu_handler_result::no_more_work;} }; + std::unique_ptr _network_stack; + // _lowres_clock_impl will only be created on cpu 0 + std::unique_ptr _lowres_clock_impl; + lowres_clock::time_point _lowres_next_timeout; + std::optional _epoll_poller; + std::optional _aio_eventfd; + const bool _reuseport; + circular_buffer _loads; + double _load = 0; + sched_clock::duration _total_idle{0}; + sched_clock::duration _total_sleep; + sched_clock::time_point _start_time = sched_clock::now(); + std::chrono::nanoseconds _max_poll_time = calculate_poll_time(); + circular_buffer* > _flush_batching; + std::atomic _sleeping alignas(seastar::cache_line_size){0}; + pthread_t _thread_id alignas(seastar::cache_line_size) = pthread_self(); + bool _strict_o_direct = true; + bool _force_io_getevents_syscall = false; + bool _bypass_fsync = false; + bool _have_aio_fsync = false; + std::atomic _dying{false}; +private: + static std::chrono::nanoseconds calculate_poll_time(); + static void block_notifier(int); + void wakeup(); + size_t handle_aio_error(internal::linux_abi::iocb* iocb, int ec); + bool flush_pending_aio(); + bool reap_kernel_completions(); + 
bool flush_tcp_batches(); + bool do_expire_lowres_timers() noexcept; + bool do_check_lowres_timers() const noexcept; + void expire_manual_timers() noexcept; + void start_aio_eventfd_loop(); + void stop_aio_eventfd_loop(); + template + void complete_timers(T&, E&, EnableFunc&& enable_fn) noexcept(noexcept(enable_fn())); + + /** + * Returns TRUE if all pollers allow blocking. + * + * @return FALSE if at least one of the blockers requires a non-blocking + * execution. + */ + bool poll_once(); + bool pure_poll_once(); +public: + /// Register a user-defined signal handler + void handle_signal(int signo, noncopyable_function&& handler); + +private: + class signals { + public: + signals(); + ~signals(); + + bool poll_signal(); + bool pure_poll_signal() const; + void handle_signal(int signo, noncopyable_function&& handler); + void handle_signal_once(int signo, noncopyable_function&& handler); + static void action(int signo, siginfo_t* siginfo, void* ignore); + static void failed_to_handle(int signo); + private: + struct signal_handler { + signal_handler(int signo, noncopyable_function&& handler); + noncopyable_function _handler; + }; + std::atomic _pending_signals; + std::unordered_map _signal_handlers; + + friend void reactor::handle_signal(int, noncopyable_function&&); + }; + + signals _signals; + std::unique_ptr _thread_pool; + friend class thread_pool; + friend class thread_context; + friend class internal::cpu_stall_detector; + + uint64_t pending_task_count() const; + void run_tasks(task_queue& tq); + bool have_more_tasks() const; + bool posix_reuseport_detect(); + void task_quota_timer_thread_fn(); + void run_some_tasks(); + void activate(task_queue& tq); + void insert_active_task_queue(task_queue* tq); + task_queue* pop_active_task_queue(sched_clock::time_point now); + void insert_activating_task_queues(); + void account_runtime(task_queue& tq, sched_clock::duration runtime); + void account_idle(sched_clock::duration idletime); + void 
allocate_scheduling_group_specific_data(scheduling_group sg, scheduling_group_key key); + future<> init_scheduling_group(scheduling_group sg, sstring name, float shares); + future<> init_new_scheduling_group_key(scheduling_group_key key, scheduling_group_key_config cfg); + future<> destroy_scheduling_group(scheduling_group sg); + uint64_t tasks_processed() const; + uint64_t min_vruntime() const; + void request_preemption(); + void start_handling_signal(); + void reset_preemption_monitor(); + void service_highres_timer() noexcept; + + future> + do_accept(pollable_fd_state& listen_fd); + future<> do_connect(pollable_fd_state& pfd, socket_address& sa); + + future + do_read_some(pollable_fd_state& fd, void* buffer, size_t size); + future + do_read_some(pollable_fd_state& fd, const std::vector& iov); + future> + do_read_some(pollable_fd_state& fd, internal::buffer_allocator* ba); + + future + do_write_some(pollable_fd_state& fd, const void* buffer, size_t size); + future + do_write_some(pollable_fd_state& fd, net::packet& p); +public: + static boost::program_options::options_description get_options_description(reactor_config cfg); + explicit reactor(unsigned id, reactor_backend_selector rbs, reactor_config cfg); + reactor(const reactor&) = delete; + ~reactor(); + void operator=(const reactor&) = delete; + + sched_clock::duration uptime() { + return sched_clock::now() - _start_time; + } + + io_queue& get_io_queue(dev_t devid = 0) { + auto queue = _io_queues.find(devid); + if (queue == _io_queues.end()) { + return *_io_queues[0]; + } else { + return *(queue->second); + } + } + + io_priority_class register_one_priority_class(sstring name, uint32_t shares); + + /// \brief Updates the current amount of shares for a given priority class + /// + /// This can involve a cross-shard call if the I/O Queue that is responsible for + /// this class lives in a foreign shard. 
+ /// + /// \param pc the priority class handle + /// \param shares the new shares value + /// \return a future that is ready when the share update is applied + future<> update_shares_for_class(io_priority_class pc, uint32_t shares); + static future<> rename_priority_class(io_priority_class pc, sstring new_name) noexcept; + + void configure(boost::program_options::variables_map config); + + server_socket listen(socket_address sa, listen_options opts = {}); + + future connect(socket_address sa); + future connect(socket_address, socket_address, transport proto = transport::TCP); + + pollable_fd posix_listen(socket_address sa, listen_options opts = {}); + + bool posix_reuseport_available() const { return _reuseport; } + + pollable_fd make_pollable_fd(socket_address sa, int proto); + + future<> posix_connect(pollable_fd pfd, socket_address sa, socket_address local); + + future<> write_all(pollable_fd_state& fd, const void* buffer, size_t size); + + future open_file_dma(std::string_view name, open_flags flags, file_open_options options = {}) noexcept; + future open_directory(std::string_view name) noexcept; + future<> make_directory(std::string_view name, file_permissions permissions = file_permissions::default_dir_permissions) noexcept; + future<> touch_directory(std::string_view name, file_permissions permissions = file_permissions::default_dir_permissions) noexcept; + future> file_type(std::string_view name, follow_symlink = follow_symlink::yes) noexcept; + future file_stat(std::string_view pathname, follow_symlink) noexcept; + future file_size(std::string_view pathname) noexcept; + future file_accessible(std::string_view pathname, access_flags flags) noexcept; + future file_exists(std::string_view pathname) noexcept { + return file_accessible(pathname, access_flags::exists); + } + future file_system_at(std::string_view pathname) noexcept; + future statvfs(std::string_view pathname) noexcept; + future<> remove_file(std::string_view pathname) noexcept; + future<> 
rename_file(std::string_view old_pathname, std::string_view new_pathname) noexcept; + future<> link_file(std::string_view oldpath, std::string_view newpath) noexcept; + future<> chmod(std::string_view name, file_permissions permissions) noexcept; + + future inotify_add_watch(int fd, std::string_view path, uint32_t flags); + + // In the following three methods, prepare_io is not guaranteed to execute in the same processor + // in which it was generated. Therefore, care must be taken to avoid the use of objects that could + // be destroyed within or at exit of prepare_io. + void submit_io(io_completion* desc, internal::io_request req) noexcept; + future submit_io_read(io_queue* ioq, + const io_priority_class& priority_class, + size_t len, + internal::io_request req) noexcept; + future submit_io_write(io_queue* ioq, + const io_priority_class& priority_class, + size_t len, + internal::io_request req) noexcept; + + int run(); + void exit(int ret); + future<> when_started() { return _start_promise.get_future(); } + // The function waits for timeout period for reactor stop notification + // which happens on termination signals or call for exit(). 
+ template + future<> wait_for_stop(std::chrono::duration timeout) { + return _stop_requested.wait(timeout, [this] { return _stopping; }); + } + + void at_exit(noncopyable_function ()> func); + + template + void at_destroy(Func&& func) { + _at_destroy_tasks->_q.push_back(make_task(default_scheduling_group(), std::forward(func))); + } + +#ifdef SEASTAR_SHUFFLE_TASK_QUEUE + void shuffle(task*&, task_queue&); +#endif + task* current_task() const { return _current_task; } + + void add_task(task* t) noexcept { + auto sg = t->group(); + auto* q = _task_queues[sg._id].get(); + bool was_empty = q->_q.empty(); + q->_q.push_back(std::move(t)); +#ifdef SEASTAR_SHUFFLE_TASK_QUEUE + shuffle(q->_q.back(), *q); +#endif + if (was_empty) { + activate(*q); + } + } + void add_urgent_task(task* t) noexcept { + memory::scoped_critical_alloc_section _; + auto sg = t->group(); + auto* q = _task_queues[sg._id].get(); + bool was_empty = q->_q.empty(); + q->_q.push_front(std::move(t)); +#ifdef SEASTAR_SHUFFLE_TASK_QUEUE + shuffle(q->_q.front(), *q); +#endif + if (was_empty) { + activate(*q); + } + } + + /// Set a handler that will be called when there is no task to execute on cpu. + /// Handler should do a low priority work. + /// + /// Handler's return value determines whether handler did any actual work. If no work was done then reactor will go + /// into sleep. + /// + /// Handler's argument is a function that returns true if a task which should be executed on cpu appears or false + /// otherwise. This function should be used by a handler to return early if a task appears. 
+ void set_idle_cpu_handler(idle_cpu_handler&& handler) { + _idle_cpu_handler = std::move(handler); + } + void force_poll(); + + void add_high_priority_task(task*) noexcept; + + network_stack& net() { return *_network_stack; } + + [[deprecated("Use this_shard_id")]] + shard_id cpu_id() const; + + void sleep(); + + steady_clock_type::duration total_idle_time(); + steady_clock_type::duration total_busy_time(); + std::chrono::nanoseconds total_steal_time(); + + const io_stats& get_io_stats() const { return _io_stats; } + uint64_t abandoned_failed_futures() const { return _abandoned_failed_futures; } +#ifdef HAVE_OSV + void timer_thread_func(); + void set_timer(sched::timer &tmr, s64 t); +#endif +private: + /** + * Add a new "poller" - a non-blocking function returning a boolean, that + * will be called every iteration of a main loop. + * If it returns FALSE then reactor's main loop is forbidden to block in the + * current iteration. + * + * @param fn a new "poller" function to register + */ + void register_poller(pollfn* p); + void unregister_poller(pollfn* p); + void replace_poller(pollfn* old, pollfn* neww); + void register_metrics(); + future<> write_all_part(pollable_fd_state& fd, const void* buffer, size_t size, size_t completed); + + future<> fdatasync(int fd) noexcept; + + void add_timer(timer*) noexcept; + bool queue_timer(timer*) noexcept; + void del_timer(timer*) noexcept; + void add_timer(timer*) noexcept; + bool queue_timer(timer*) noexcept; + void del_timer(timer*) noexcept; + void add_timer(timer*) noexcept; + bool queue_timer(timer*) noexcept; + void del_timer(timer*) noexcept; + + future<> run_exit_tasks(); + void stop(); + friend class alien::message_queue; + friend class pollable_fd; + friend class pollable_fd_state; + friend struct pollable_fd_state_deleter; + friend class posix_file_impl; + friend class blockdev_file_impl; + friend class readable_eventfd; + friend class timer<>; + friend class timer; + friend class timer; + friend class smp; + 
friend class smp_message_queue; + friend class internal::poller; + friend class scheduling_group; + friend void add_to_flush_poller(output_stream* os); + friend void seastar::log_exception_trace() noexcept; + friend void report_failed_future(const std::exception_ptr& eptr) noexcept; + friend void with_allow_abandoned_failed_futures(unsigned count, noncopyable_function func); + metrics::metric_groups _metric_groups; + friend future create_scheduling_group(sstring name, float shares) noexcept; + friend future<> seastar::destroy_scheduling_group(scheduling_group) noexcept; + friend future<> seastar::rename_scheduling_group(scheduling_group sg, sstring new_name) noexcept; + friend future scheduling_group_key_create(scheduling_group_key_config cfg) noexcept; + + template + friend T* internal::scheduling_group_get_specific_ptr(scheduling_group sg, scheduling_group_key key) noexcept; + template + SEASTAR_CONCEPT( requires requires(SpecificValType specific_val, Mapper mapper, Reducer reducer, Initial initial) { + {reducer(initial, mapper(specific_val))} -> std::convertible_to; + }) + friend future::return_type> + map_reduce_scheduling_group_specific(Mapper mapper, Reducer reducer, Initial initial_val, scheduling_group_key key); + template + SEASTAR_CONCEPT( requires requires(SpecificValType specific_val, Reducer reducer, Initial initial) { + {reducer(initial, specific_val)} -> std::convertible_to; + }) + friend future::return_type> + reduce_scheduling_group_specific(Reducer reducer, Initial initial_val, scheduling_group_key key); + + future fstat(int fd) noexcept; + future fstatfs(int fd) noexcept; + friend future> make_file_impl(int fd, file_open_options options, int flags) noexcept; +public: + future<> readable(pollable_fd_state& fd); + future<> writeable(pollable_fd_state& fd); + future<> readable_or_writeable(pollable_fd_state& fd); + void abort_reader(pollable_fd_state& fd); + void abort_writer(pollable_fd_state& fd); + void enable_timer(steady_clock_type::time_point 
when) noexcept; + /// Sets the "Strict DMA" flag. + /// + /// When true (default), file I/O operations must use DMA. This is + /// the most performant option, but does not work on some file systems + /// such as tmpfs or aufs (used in some Docker setups). + /// + /// When false, file I/O operations can fall back to buffered I/O if + /// DMA is not available. This can result in dramatic reducation in + /// performance and an increase in memory consumption. + void set_strict_dma(bool value); + void set_bypass_fsync(bool value); + void update_blocked_reactor_notify_ms(std::chrono::milliseconds ms); + std::chrono::milliseconds get_blocked_reactor_notify_ms() const; + // For testing: + void set_stall_detector_report_function(std::function report); + std::function get_stall_detector_report_function() const; +}; + +template // signature: bool () +inline +std::unique_ptr +internal::make_pollfn(Func&& func) { + struct the_pollfn : simple_pollfn { + the_pollfn(Func&& func) : func(std::forward(func)) {} + Func func; + virtual bool poll() override final { + return func(); + } + }; + return std::make_unique(std::forward(func)); +} + +extern __thread reactor* local_engine; +extern __thread size_t task_quota; + +inline reactor& engine() { + return *local_engine; +} + +inline bool engine_is_ready() { + return local_engine != nullptr; +} + +inline +size_t iovec_len(const iovec* begin, size_t len) +{ + size_t ret = 0; + auto end = begin + len; + while (begin != end) { + ret += begin++->iov_len; + } + return ret; +} + +inline int hrtimer_signal() { + // We don't want to use SIGALRM, because the boost unit test library + // also plays with it. 
+ return SIGRTMIN; +} + + +extern logger seastar_logger; + +} diff --git a/src/seastar/include/seastar/core/reactor_config.hh b/src/seastar/include/seastar/core/reactor_config.hh new file mode 100644 index 000000000..d7cc5f0a9 --- /dev/null +++ b/src/seastar/include/seastar/core/reactor_config.hh @@ -0,0 +1,47 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2019 ScyllaDB + */ + +#pragma once + +#include + +namespace seastar { + +/// Configuration structure for reactor +/// +/// This structure provides configuration items for the reactor. It is typically +/// provided by \ref app_template, not the user. +struct reactor_config { + std::chrono::duration task_quota{0.5e-3}; ///< default time between polls + /// \brief Handle SIGINT/SIGTERM by calling reactor::stop() + /// + /// When true, Seastar will set up signal handlers for SIGINT/SIGTERM that call + /// reactor::stop(). The reactor will then execute callbacks installed by + /// reactor::at_exit(). + /// + /// When false, Seastar will not set up signal handlers for SIGINT/SIGTERM + /// automatically. The default behavior (terminate the program) will be kept. + /// You can adjust the behavior of SIGINT/SIGTERM by installing signal handlers + /// via reactor::handle_signal(). 
+ bool auto_handle_sigint_sigterm = true; ///< automatically terminate on SIGINT/SIGTERM +}; + +} diff --git a/src/seastar/include/seastar/core/report_exception.hh b/src/seastar/include/seastar/core/report_exception.hh new file mode 100644 index 000000000..7738d9bae --- /dev/null +++ b/src/seastar/include/seastar/core/report_exception.hh @@ -0,0 +1,31 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright 2014 Cloudius Systems + */ + +#pragma once + +#include + +namespace seastar { + +void report_exception(std::string_view message, std::exception_ptr) noexcept; + +} + diff --git a/src/seastar/include/seastar/core/resource.hh b/src/seastar/include/seastar/core/resource.hh new file mode 100644 index 000000000..5120c2aa4 --- /dev/null +++ b/src/seastar/include/seastar/core/resource.hh @@ -0,0 +1,94 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. 
+ * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace seastar { + +cpu_set_t cpuid_to_cpuset(unsigned cpuid); + +namespace resource { + +using std::optional; + +using cpuset = std::set; + +struct configuration { + optional total_memory; + optional reserve_memory; // if total_memory not specified + optional cpus; + optional cpu_set; + std::unordered_map num_io_queues; + bool assign_orphan_cpus = false; +}; + +struct memory { + size_t bytes; + unsigned nodeid; + +}; + +// Since this is static information, we will keep a copy at each CPU. +// This will allow us to easily find who is the IO coordinator for a given +// node without a trip to a remote CPU. +struct io_queue_topology { + std::vector shard_to_coordinator; + std::vector coordinator_to_idx; + std::vector coordinator_to_idx_valid; // for validity asserts + unsigned nr_coordinators; +}; + +struct cpu { + unsigned cpu_id; + std::vector mem; +}; + +struct resources { + std::vector cpus; + std::unordered_map ioq_topology; +}; + +resources allocate(configuration c); +unsigned nr_processing_units(); +} + +// We need a wrapper class, because boost::program_options wants validate() +// (below) to be in the same namespace as the type it is validating. 
+struct cpuset_bpo_wrapper { + resource::cpuset value; +}; + +// Overload for boost program options parsing/validation +extern +void validate(boost::any& v, + const std::vector& values, + cpuset_bpo_wrapper* target_type, int); + +} diff --git a/src/seastar/include/seastar/core/rwlock.hh b/src/seastar/include/seastar/core/rwlock.hh new file mode 100644 index 000000000..58b046a2f --- /dev/null +++ b/src/seastar/include/seastar/core/rwlock.hh @@ -0,0 +1,180 @@ +/* +* This file is open source software, licensed to you under the terms +* of the Apache License, Version 2.0 (the "License"). See the NOTICE file +* distributed with this work for additional information regarding copyright +* ownership. You may not use this file except in compliance with the License. +* +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an +* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +* KIND, either express or implied. See the License for the +* specific language governing permissions and limitations +* under the License. +*/ +/* + * Copyright (C) 2015 Cloudius Systems, Ltd. + */ + +#pragma once + +#include + +namespace seastar { + +/// \cond internal +// lock / unlock semantics for rwlock, so it can be used with with_lock() +template +class basic_rwlock; + +template::clock> +class rwlock_for_read { +public: + future<> lock() { + return static_cast*>(this)->read_lock(); + } + void unlock() { + static_cast*>(this)->read_unlock(); + } + friend class basic_rwlock; +}; + +template::clock> +class rwlock_for_write { +public: + future<> lock() { + return static_cast*>(this)->write_lock(); + } + void unlock() { + static_cast*>(this)->write_unlock(); + } + friend class basic_rwlock; +}; +/// \endcond + + +/// \addtogroup fiber-module +/// @{ + +/// Implements a read-write lock mechanism. 
Beware: this is not a cross-CPU +/// lock, due to seastar's sharded architecture. +/// Instead, it can be used to achieve rwlock semantics between two (or more) +/// fibers running in the same CPU that may use the same resource. +/// Acquiring the write lock will effectively cause all readers not to be executed +/// until the write part is done. +template::clock> +class basic_rwlock : private rwlock_for_read, rwlock_for_write { + using semaphore_type = basic_semaphore; + + static constexpr size_t max_ops = semaphore_type::max_counter(); + + semaphore_type _sem; +public: + basic_rwlock() + : _sem(max_ops) { + } + + /// Cast this rwlock into read lock object with lock semantics appropriate to be used + /// by "with_lock". The resulting object will have lock / unlock calls that, when called, + /// will acquire / release the lock in read mode. + rwlock_for_read& for_read() { + return *this; + } + + /// Cast this rwlock into write lock object with lock semantics appropriate to be used + /// by "with_lock". The resulting object will have lock / unlock calls that, when called, + /// will acquire / release the lock in write mode. + rwlock_for_write& for_write() { + return *this; + } + + /// Acquires this lock in read mode. Many readers are allowed, but when + /// this future returns, and until \ref read_unlock is called, all fibers + /// waiting on \ref write_lock are guaranteed not to execute. + future<> read_lock(typename semaphore_type::time_point timeout = semaphore_type::time_point::max()) { + return _sem.wait(timeout); + } + + /// Releases the lock, which must have been taken in read mode. After this + /// is called, one of the fibers waiting on \ref write_lock will be allowed + /// to proceed. + void read_unlock() { + assert(_sem.current() < max_ops); + _sem.signal(); + } + + /// Acquires this lock in write mode. Only one writer is allowed. 
When + /// this future returns, and until \ref write_unlock is called, all other + /// fibers waiting on either \ref read_lock or \ref write_lock are guaranteed + /// not to execute. + future<> write_lock(typename semaphore_type::time_point timeout = semaphore_type::time_point::max()) { + return _sem.wait(timeout, max_ops); + } + + /// Releases the lock, which must have been taken in write mode. After this + /// is called, one of the other fibers waiting on \ref write_lock or the fibers + /// waiting on \ref read_lock will be allowed to proceed. + void write_unlock() { + assert(_sem.current() == 0); + _sem.signal(max_ops); + } + + /// Tries to acquire the lock in read mode iff this can be done without waiting. + bool try_read_lock() { + return _sem.try_wait(); + } + + /// Tries to acquire the lock in write mode iff this can be done without waiting. + bool try_write_lock() { + return _sem.try_wait(max_ops); + } + + using holder = semaphore_units; + + /// hold_read_lock() waits for a read lock and returns an object which, + /// when destroyed, releases the lock. This makes it easy to ensure that + /// the lock is eventually undone, at any circumstance (even including + /// exceptions). The release() method can be used on the returned object + /// to release its ownership of the lock and avoid the automatic unlock. + /// Note that both hold_read_lock() and hold_write_lock() return an object + /// of the same type, rwlock::holder. + /// + /// hold_read_lock() may throw an exception (or, in other implementations, + /// return an exceptional future) when it failed to obtain the lock - + /// e.g., on allocation failure. + future hold_read_lock(typename semaphore_type::time_point timeout = semaphore_type::time_point::max()) { + return get_units(_sem, 1); + } + + /// hold_write_lock() waits for a write lock and returns an object which, + /// when destroyed, releases the lock. 
This makes it easy to ensure that + /// the lock is eventually undone, at any circumstance (even including + /// exceptions). The release() method can be used on the returned object + /// to release its ownership of the lock and avoid the automatic unlock. + /// Note that both hold_read_lock() and hold_write_lock() return an object + /// of the same type, rwlock::holder. + /// + /// hold_write_lock() may throw an exception (or, in other implementations, + /// return an exceptional future) when it failed to obtain the lock - + /// e.g., on allocation failure. + future hold_write_lock(typename semaphore_type::time_point timeout = semaphore_type::time_point::max()) { + // Forward the caller-supplied timeout; it was previously dropped, making + // hold_write_lock() wait forever regardless of the argument. + return get_units(_sem, max_ops, timeout); + } + + /// Checks if any read or write locks are currently held. + bool locked() const { + return _sem.available_units() != max_ops; + } + + friend class rwlock_for_read; + friend class rwlock_for_write; +}; + +using rwlock = basic_rwlock<>; + +/// @} + +} diff --git a/src/seastar/include/seastar/core/scattered_message.hh b/src/seastar/include/seastar/core/scattered_message.hh new file mode 100644 index 000000000..83655249f --- /dev/null +++ b/src/seastar/include/seastar/core/scattered_message.hh @@ -0,0 +1,112 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License.
+ */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace seastar { + +template +class scattered_message { +private: + using fragment = net::fragment; + using packet = net::packet; + using char_type = CharType; + packet _p; +public: + scattered_message() {} + scattered_message(scattered_message&&) = default; + scattered_message(const scattered_message&) = delete; + + void append_static(const char_type* buf, size_t size) { + if (size) { + _p = packet(std::move(_p), fragment{(char_type*)buf, size}, deleter()); + } + } + + template + void append_static(const char_type(&s)[N]) { + append_static(s, N - 1); + } + + void append_static(const char_type* s) { + append_static(s, strlen(s)); + } + + template + void append_static(const basic_sstring& s) { + append_static(s.begin(), s.size()); + } + + void append_static(const std::string_view& s) { + append_static(s.data(), s.size()); + } + + void append(std::string_view v) { + if (v.size()) { + _p = packet(std::move(_p), temporary_buffer::copy_of(v)); + } + } + + template + void append(basic_sstring s) { + if (s.size()) { + _p = packet(std::move(_p), std::move(s).release()); + } + } + + template + void append(const basic_sstring& s, Callback callback) { + if (s.size()) { + _p = packet(std::move(_p), fragment{s.begin(), s.size()}, make_deleter(std::move(callback))); + } + } + + void reserve(int n_frags) { + _p.reserve(n_frags); + } + + packet release() && { + return std::move(_p); + } + + template + void on_delete(Callback callback) { + _p = packet(std::move(_p), make_deleter(std::move(callback))); + } + + operator bool() const { + return _p.len(); + } + + size_t size() { + return _p.len(); + } +}; + +} diff --git a/src/seastar/include/seastar/core/scheduling.hh b/src/seastar/include/seastar/core/scheduling.hh new file mode 100644 index 000000000..fb4c59e5a --- /dev/null +++ b/src/seastar/include/seastar/core/scheduling.hh @@ 
-0,0 +1,366 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2016 Scylla DB Ltd + */ + +#pragma once + +#include +#include +#include +#include + +/// \file + +namespace seastar { + +constexpr unsigned max_scheduling_groups() { return 16; } + +#if SEASTAR_API_LEVEL < 6 +#define SEASTAR_ELLIPSIS ... +template +#else +#define SEASTAR_ELLIPSIS +template +#endif +class future; + +class reactor; + +class scheduling_group; +class scheduling_group_key; + +namespace internal { + +// Returns an index between 0 and max_scheduling_groups() +unsigned scheduling_group_index(scheduling_group sg) noexcept; +scheduling_group scheduling_group_from_index(unsigned index) noexcept; + +unsigned long scheduling_group_key_id(scheduling_group_key) noexcept; + +template +T* scheduling_group_get_specific_ptr(scheduling_group sg, scheduling_group_key key) noexcept; + +} + + +/// Creates a scheduling group with a specified number of shares. +/// +/// The operation is global and affects all shards. The returned scheduling +/// group can then be used in any shard. 
+/// +/// \param name A name that identifiers the group; will be used as a label +/// in the group's metrics +/// \param shares number of shares of the CPU time allotted to the group; +/// Use numbers in the 1-1000 range (but can go above). +/// \return a scheduling group that can be used on any shard +future create_scheduling_group(sstring name, float shares) noexcept; + +/// Destroys a scheduling group. +/// +/// Destroys a \ref scheduling_group previously created with create_scheduling_group(). +/// The destroyed group must not be currently in use and must not be used later. +/// +/// The operation is global and affects all shards. +/// +/// \param sg The scheduling group to be destroyed +/// \return a future that is ready when the scheduling group has been torn down +future<> destroy_scheduling_group(scheduling_group sg) noexcept; + +/// Rename scheduling group. +/// +/// Renames a \ref scheduling_group previously created with create_scheduling_group(). +/// +/// The operation is global and affects all shards. +/// The operation affects the exported statistics labels. +/// +/// \param sg The scheduling group to be renamed +/// \param new_name The new name for the scheduling group. +/// \return a future that is ready when the scheduling group has been renamed +future<> rename_scheduling_group(scheduling_group sg, sstring new_name) noexcept; + + +/** + * Represents a configuration for a specific scheduling group value, + * it contains all that is needed to maintain a scheduling group specific + * value when it needs to be created, due to, for example, a new + * \ref scheduling_group being created. + * + * @note is is recomended to use @ref make_scheduling_group_key_config in order to + * create and configure this syructure. The only reason that one might want to not use + * this method is because of a need for specific intervention in the construction or + * destruction of the value. 
Even then, it is recommended to first create the configuration + * with @ref make_scheduling_group_key_config and only then change it. + * + */ +struct scheduling_group_key_config { + /** + * Constructs a default configuration + */ + scheduling_group_key_config() : + scheduling_group_key_config(typeid(void)) {} + /** + * Creates a configuration that is made for a specific type. + * It does not contain the right alignment and allocation sizes + * nor the correct construction or destruction logic, but only + * the indication for the intended type which is used in debug mode + * to make sure that the correct type is referred to when accessing + * the value. + * @param type_info - the type information class (create with typeid(T)). + */ + scheduling_group_key_config(const std::type_info& type_info) : + type_index(type_info) {} + /// The allocation size for the value (usually: sizeof(T)) + size_t allocation_size; + /// The required alignment of the value (usually: alignof(T)) + size_t alignment; + /// Holds the type information for debug mode runtime validation + std::type_index type_index; + /// A function that will be called for each newly allocated value + std::function constructor; + /// A function that will be called for each element that is about + /// to be deallocated. + std::function destructor; + +}; + + +/** + * A class that is intended to encapsulate the scheduling group specific + * key and "hide" its implementation concerns and details. + * + * @note this object can be copied across shards and scheduling groups. + */ +class scheduling_group_key { +public: + /// The only user allowed operation on a key is copying.
+ scheduling_group_key(const scheduling_group_key&) noexcept = default; + scheduling_group_key(scheduling_group_key&&) noexcept = default; +private: + scheduling_group_key(unsigned long id) noexcept : + _id(id) {} + unsigned long _id; + unsigned long id() const noexcept { + return _id; + } + friend class reactor; + friend future scheduling_group_key_create(scheduling_group_key_config cfg) noexcept; + template + friend T* internal::scheduling_group_get_specific_ptr(scheduling_group sg, scheduling_group_key key) noexcept; + template + friend T& scheduling_group_get_specific(scheduling_group_key key) noexcept; + + friend unsigned long internal::scheduling_group_key_id(scheduling_group_key key) noexcept; +}; + +namespace internal { + +inline unsigned long scheduling_group_key_id(scheduling_group_key key) noexcept { + return key.id(); +} + +/** + * @brief A function in the spirit of Cpp17 apply, but specifically for constructors. + * This function is used in order to preserve support in Cpp14. + + * @tparam ConstructorType - the constructor type or in other words the type to be constructed + * @tparam Tuple - T params tuple type (should be deduced) + * @tparam size_t...Idx - a sequence of indexes in order to access the typpels members in compile time. + * (should be deduced) + * + * @param pre_alocated_mem - a pointer to the pre allocated memory chunk that will hold the + * the initialized object. + * @param args - A tupple that holds the prarameters for the constructor + * @param idx_seq - An index sequence that will be used to access the members of the tuple in compile + * time. 
+ * + * @note this function was not intended to be called by users and it is only a utility function + * for supporting \ref make_scheduling_group_key_config + */ +template +void apply_constructor(void* pre_alocated_mem, Tuple args, std::index_sequence idx_seq) { + new (pre_alocated_mem) ConstructorType(std::get(args)...); +} +} + +/** + * A template function that builds a scheduling group specific value configuration. + * This configuration is used by the infrastructure to allocate memory for the values + * and initialize or deinitialize them when they are created or destroyed. + * + * @tparam T - the type for the newly created value. + * @tparam ...ConstructorArgs - the types for the constructor parameters (should be deduced) + * @param args - The parameters for the constructor. + * @return a fully initialized \ref scheduling_group_key_config object. + */ +template +scheduling_group_key_config +make_scheduling_group_key_config(ConstructorArgs... args) { + scheduling_group_key_config sgkc(typeid(T)); + sgkc.allocation_size = sizeof(T); + sgkc.alignment = alignof(T); + sgkc.constructor = [args = std::make_tuple(args...)] (void* p) { + internal::apply_constructor(p, args, std::make_index_sequence()); + }; + sgkc.destructor = [] (void* p) { + static_cast(p)->~T(); + }; + return sgkc; +} + +/** + * Returns a future that holds a scheduling key and resolves when this key can be used + * to access the scheduling group specific value it represents. + * @param cfg - A \ref scheduling_group_key_config object (by recommendation: initialized with + * \ref make_scheduling_group_key_config ) + * @return A future containing \ref scheduling_group_key for the newly created specific value.
+ */ +future scheduling_group_key_create(scheduling_group_key_config cfg) noexcept; + +/** + * Returnes a reference to the given scheduling group specific value + * @tparam T - the type of the scheduling specific type (cannot be deduced) + * @param sg - the scheduling group which it's specific value to retrieve + * @param key - the key of the value to retrieve. + * @return A reference to the scheduling specific value. + */ +template +T& scheduling_group_get_specific(scheduling_group sg, scheduling_group_key key); + + +/// \brief Identifies function calls that are accounted as a group +/// +/// A `scheduling_group` is a tag that can be used to mark a function call. +/// Executions of such tagged calls are accounted as a group. +class scheduling_group { + unsigned _id; +private: + explicit scheduling_group(unsigned id) noexcept : _id(id) {} +public: + /// Creates a `scheduling_group` object denoting the default group + constexpr scheduling_group() noexcept : _id(0) {} // must be constexpr for current_scheduling_group_holder + bool active() const noexcept; + const sstring& name() const noexcept; + bool operator==(scheduling_group x) const noexcept { return _id == x._id; } + bool operator!=(scheduling_group x) const noexcept { return _id != x._id; } + bool is_main() const noexcept { return _id == 0; } + template + /** + * Returnes a reference to this scheduling group specific value + * @tparam T - the type of the scheduling specific type (cannot be deduced) + * @param key - the key of the value to retrieve. + * @return A reference to this scheduling specific value. + */ + T& get_specific(scheduling_group_key key) noexcept { + return *internal::scheduling_group_get_specific_ptr(*this, key); + } + /// Adjusts the number of shares allotted to the group. + /// + /// Dynamically adjust the number of shares allotted to the group, increasing or + /// decreasing the amount of CPU bandwidth it gets. The adjustment is local to + /// the shard. 
+ /// + /// This can be used to reduce a background job's interference with a foreground + /// load: the shares can be started at a low value, increased when the background + /// job's backlog increases, and reduced again when the backlog decreases. + /// + /// \param shares number of shares allotted to the group. Use numbers + /// in the 1-1000 range. + void set_shares(float shares) noexcept; + friend future create_scheduling_group(sstring name, float shares) noexcept; + friend future<> destroy_scheduling_group(scheduling_group sg) noexcept; + friend future<> rename_scheduling_group(scheduling_group sg, sstring new_name) noexcept; + friend class reactor; + friend unsigned internal::scheduling_group_index(scheduling_group sg) noexcept; + friend scheduling_group internal::scheduling_group_from_index(unsigned index) noexcept; + + template + SEASTAR_CONCEPT( requires requires(SpecificValType specific_val, Mapper mapper, Reducer reducer, Initial initial) { + {reducer(initial, mapper(specific_val))} -> std::convertible_to; + }) + friend future::return_type> + map_reduce_scheduling_group_specific(Mapper mapper, Reducer reducer, Initial initial_val, scheduling_group_key key); + + template + SEASTAR_CONCEPT( requires requires(SpecificValType specific_val, Reducer reducer, Initial initial) { + {reducer(initial, specific_val)} -> std::convertible_to; + }) + friend future::return_type> + reduce_scheduling_group_specific(Reducer reducer, Initial initial_val, scheduling_group_key key); + + +}; + +/// \cond internal +namespace internal { + +inline +unsigned +scheduling_group_index(scheduling_group sg) noexcept { + return sg._id; +} + +inline +scheduling_group +scheduling_group_from_index(unsigned index) noexcept { + return scheduling_group(index); +} + +inline +scheduling_group* +current_scheduling_group_ptr() noexcept { + // Slow unless constructor is constexpr + static thread_local scheduling_group sg; + return &sg; +} + +} +/// \endcond + +/// Returns the current scheduling 
group +inline +scheduling_group +current_scheduling_group() noexcept { + return *internal::current_scheduling_group_ptr(); +} + +inline +scheduling_group +default_scheduling_group() noexcept { + return scheduling_group(); +} + +inline +bool +scheduling_group::active() const noexcept { + return *this == current_scheduling_group(); +} + +} + +namespace std { + +template <> +struct hash { + size_t operator()(seastar::scheduling_group sg) const noexcept { + return seastar::internal::scheduling_group_index(sg); + } +}; + +} diff --git a/src/seastar/include/seastar/core/scheduling_specific.hh b/src/seastar/include/seastar/core/scheduling_specific.hh new file mode 100644 index 000000000..d2e3158f5 --- /dev/null +++ b/src/seastar/include/seastar/core/scheduling_specific.hh @@ -0,0 +1,189 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2019 Scylla DB Ltd + */ + +#include +#include +#include +#include +#include + +#pragma once + +namespace seastar { + +namespace internal { + +struct scheduling_group_specific_thread_local_data { + struct per_scheduling_group { + bool queue_is_initialized = false; + /** + * This array holds pointers to the scheduling group specific + * data. 
The pointer is not use as is but is cast to a reference + * to the appropriate type that is actually pointed to. + */ + std::vector specific_vals; + }; + std::array per_scheduling_group_data; + std::vector scheduling_group_key_configs; +}; + +inline +scheduling_group_specific_thread_local_data** get_scheduling_group_specific_thread_local_data_ptr() noexcept { + static thread_local scheduling_group_specific_thread_local_data* data; + return &data; +} +inline +scheduling_group_specific_thread_local_data& get_scheduling_group_specific_thread_local_data() noexcept { + return **get_scheduling_group_specific_thread_local_data_ptr(); +} + +[[noreturn]] void no_such_scheduling_group(scheduling_group sg); + +/** + * Returns a pointer to the given scheduling group specific data. + * @param sg - The scheduling group which it's data needs to be accessed + * @param key - The scheduling group key that for the data to access + * @return A pointer of type T* to the data, if sg is valid initialized. + * + * @note The parameter T has to be given since there is no way to deduce it. + */ +template +T* scheduling_group_get_specific_ptr(scheduling_group sg, scheduling_group_key key) noexcept { + auto& data = internal::get_scheduling_group_specific_thread_local_data(); +#ifdef SEASTAR_DEBUG + assert(std::type_index(typeid(T)) == data.scheduling_group_key_configs[key.id()].type_index); +#endif + auto sg_id = internal::scheduling_group_index(sg); + if (__builtin_expect(sg_id < data.per_scheduling_group_data.size() && + data.per_scheduling_group_data[sg_id].queue_is_initialized, true)) { + return reinterpret_cast(data.per_scheduling_group_data[sg_id].specific_vals[key.id()]); + } + return nullptr; +} + +} + +/** + * Returns a reference to the given scheduling group specific data. + * @param sg - The scheduling group which it's data needs to be accessed + * @param key - The scheduling group key that for the data to access + * @return A reference of type T& to the data. 
+ * + * @note The parameter T has to be given since there is no way to deduce it. + * May throw std::invalid_argument if sg does not exist or is uninitialized. + */ +template +T& scheduling_group_get_specific(scheduling_group sg, scheduling_group_key key) { + T* p = internal::scheduling_group_get_specific_ptr(sg, std::move(key)); + if (!p) { + internal::no_such_scheduling_group(sg); + } + return *p; +} + +/** + * Returns a reference to the current specific data. + * @param key - The scheduling group key that for the data to access + * @return A reference of type T& to the data. + * + * @note The parameter T has to be given since there is no way to deduce it. + */ +template +T& scheduling_group_get_specific(scheduling_group_key key) noexcept { + // Unlike internal::scheduling_group_get_specific_ptr, this can + // return a reference to an element whose queue_is_initialized is + // false. + auto& data = internal::get_scheduling_group_specific_thread_local_data(); + assert(std::type_index(typeid(T)) == data.scheduling_group_key_configs[key.id()].type_index); + auto sg_id = internal::scheduling_group_index(current_scheduling_group()); + return *reinterpret_cast(data.per_scheduling_group_data[sg_id].specific_vals[key.id()]); +} + +/** + * A map reduce over all values of a specific scheduling group data. + * @param mapper - A functor SomeType(SpecificValType&) or SomeType(SpecificValType) that maps + * the specific data to a value of any type. + * @param reducer - A functor of of type ConvetibleToInitial(Initial, MapperReurnType) that reduces + * a value of type Initial and of the mapper return type to a value of type convertible to Initial. + * @param initial_val - the initial value to pass in the first call to the reducer. + * @param key - the key to the specific data that the mapper should act upon. + * @return A future that resolves when the result of the map reduce is ready. 
+ * @note The type of SpecificValType must be given because there is no way to deduce it in a *consistent* + * manner. + * @note Theoretically the parameter type of Mapper can be deduced to be the type (function_traits::arg<0>) + * but then there is a danger when the Mapper accepts a parameter type T where SpecificValType is convertible to + * SpecificValType. + */ +template +SEASTAR_CONCEPT( requires requires(SpecificValType specific_val, Mapper mapper, Reducer reducer, Initial initial) { + {reducer(initial, mapper(specific_val))} -> std::convertible_to; +}) +future::return_type> +map_reduce_scheduling_group_specific(Mapper mapper, Reducer reducer, + Initial initial_val, scheduling_group_key key) { + using per_scheduling_group = internal::scheduling_group_specific_thread_local_data::per_scheduling_group; + auto& data = internal::get_scheduling_group_specific_thread_local_data(); + auto wrapped_mapper = [key, mapper] (per_scheduling_group& psg) { + auto id = internal::scheduling_group_key_id(key); + return make_ready_future::return_type> + (mapper(*reinterpret_cast(psg.specific_vals[id]))); + }; + + return map_reduce( + data.per_scheduling_group_data + | boost::adaptors::filtered(std::mem_fn(&per_scheduling_group::queue_is_initialized)), + wrapped_mapper, std::move(initial_val), reducer); +} + +/** + * A reduce over all values of a specific scheduling group data. + * @param reducer - A functor of type ConvertibleToInitial(Initial, SpecificValType) that reduces + * a value of type Initial and of the sg specific data type to a value of type convertible to Initial. + * @param initial_val - the initial value to pass in the first call to the reducer. + * @param key - the key to the specific data that the mapper should act upon. + * @return A future that resolves when the result of the reduce is ready. + * @note The type of SpecificValType must be given because there is no way to deduce it in a *consistent* + * manner.
+ * @note Theoretically the parameter type of Reducer can be deduced to be the type (function_traits::arg<0>) + * but then there is a danger when the Reducer accepts a parameter type T where SpecificValType is convertible to + * SpecificValType. + */ +template +SEASTAR_CONCEPT( requires requires(SpecificValType specific_val, Reducer reducer, Initial initial) { + {reducer(initial, specific_val)} -> std::convertible_to; +}) +future::return_type> +reduce_scheduling_group_specific(Reducer reducer, Initial initial_val, scheduling_group_key key) { + using per_scheduling_group = internal::scheduling_group_specific_thread_local_data::per_scheduling_group; + auto& data = internal::get_scheduling_group_specific_thread_local_data(); + + auto mapper = [key] (per_scheduling_group& psg) { + auto id = internal::scheduling_group_key_id(key); + return make_ready_future(*reinterpret_cast(psg.specific_vals[id])); + }; + + return map_reduce( + data.per_scheduling_group_data + | boost::adaptors::filtered(std::mem_fn(&per_scheduling_group::queue_is_initialized)), + mapper, std::move(initial_val), reducer); +} + +} diff --git a/src/seastar/include/seastar/core/scollectd.hh b/src/seastar/include/seastar/core/scollectd.hh new file mode 100644 index 000000000..7909295bb --- /dev/null +++ b/src/seastar/include/seastar/core/scollectd.hh @@ -0,0 +1,848 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +namespace seastar { + +/** + * Implementation of rudimentary collectd data gathering. + * + * Usage is hopefully straight forward. Though, feel free to read + * https://collectd.org/wiki/index.php/Naming_schema + * for an explanation on the naming model. + * + * Typically, you'll add values something like: + * + * scollectd::type_instance_id typ("", "", "", ""); + * scollectd::add_polled_metric(typ, [ | scollectd::make_typed(, ) [, ...]); + * + * Where + * `` would be the overall 'module', e.g. "cpu" + * `` -> optional distinguisher between plugin instances. For cpu, the built-in + * scollectd::per_cpu_plugin_instance constant is a good choice, i.e. 0->N cpu. + * If there are no instances (e.g. only one), empty constant is appropriate (none) + * `` is the 'type' of metric collected, for ex. "usage" (cpu/0/usage) + * `` is a distinguisher for metric parts of the type, e.g. "idle", "user", "kernel" + * -> cpu/0/usage/idle | cpu/0/usage/user | cpu/0/usage/kernel + * + * Each type instance can bind an arbitrary number of values, ech representing some aspect in turn of the instance. + * The structure and interpretation is up to the producer/consumer + * + * There is a single "scollectd" instance per cpu, and values should be bound locally + * to this cpu. 
Polling is done at a frequency set in the seastar config (def once per s), + * and all registered values will be sent via UDP packages to the destination host(s) + * + * Note that the tuple { plugin, plugin_instance, type, type_instance } is considered a + * unique ID for a value registration, so using the same tuple twice will remove the previously + * registered values. + * + * Values can be unregistered at any time, though they must be so on the same thread/cpu + * as they we're registered. The "registration" achor type provides RAII style value unregistration + * semantics. + * + */ + +namespace scollectd { + +extern seastar::logger logger; + +using data_type = seastar::metrics::impl::data_type; + +enum class known_type { + // from types.db. Defined collectd types (type_id) selection. + // This enum omits the very application specific types, such + // as mysql_* etc, since if you really are re-writing mysql + // in seastar, you probably know how to look the type up manually... + + absolute, + backends, + bitrate, + blocked_clients, + bytes, + cache_eviction, + cache_operation, + cache_ratio, + cache_result, + cache_size, + capacity, + changes_since_last_save, + charge, + clock_last_meas, + clock_last_update, + clock_mode, + clock_reachability, + clock_skew_ppm, + clock_state, + clock_stratum, + compression, + compression_ratio, + connections, + conntrack, + contextswitch, + count, + counter, + cpu, + cpufreq, + current, + current_connections, + current_sessions, + delay, + derive, + df, + df_complex, + df_inodes, + disk_io_time, + disk_latency, + disk_merged, + disk_octets, + disk_ops, + disk_ops_complex, + disk_time, + dns_answer, + dns_notify, + dns_octets, + dns_opcode, + dns_qtype, + dns_qtype_cached, + dns_query, + dns_question, + dns_rcode, + dns_reject, + dns_request, + dns_resolver, + dns_response, + dns_transfer, + dns_update, + dns_zops, + drbd_resource, + duration, + email_check, + email_count, + email_size, + entropy, + evicted_keys, + expired_keys, 
+ fanspeed, + file_handles, + file_size, + files, + flow, + fork_rate, + frequency, + frequency_error, + frequency_offset, + fscache_stat, + gauge, + hash_collisions, + http_request_methods, + http_requests, + http_response_codes, + humidity, + if_collisions, + if_dropped, + if_errors, + if_multicast, + if_octets, + if_packets, + if_rx_errors, + if_rx_octets, + if_tx_errors, + if_tx_octets, + invocations, + io_octets, + io_packets, + ipt_bytes, + ipt_packets, + irq, + latency, + links, + load, + md_disks, + memory, + memory_lua, + memory_throttle_count, + multimeter, + mutex_operations, + objects, + operations, + packets, + pending_operations, + percent, + percent_bytes, + percent_inodes, + ping, + ping_droprate, + ping_stddev, + players, + power, + pressure, + protocol_counter, + pubsub, + queue_length, + records, + requests, + response_code, + response_time, + root_delay, + root_dispersion, + route_etx, + route_metric, + routes, + segments, + serial_octets, + signal_noise, + signal_power, + signal_quality, + snr, + spl, + swap, + swap_io, + tcp_connections, + temperature, + threads, + time_dispersion, + time_offset, + time_offset_ntp, + time_offset_rms, + time_ref, + timeleft, + total_bytes, + total_connections, + total_objects, + total_operations, + total_requests, + total_sessions, + total_threads, + total_time_in_ms, + total_values, + uptime, + users, + vcl, + vcpu, + virt_cpu_total, + virt_vcpu, + vmpage_action, + vmpage_faults, + vmpage_io, + vmpage_number, + volatile_changes, + voltage, + voltage_threshold, + vs_memory, + vs_processes, + vs_threads, +}; + +// don't use directly. use make_typed. 
+template +struct typed { + typed(data_type t, T && v) + : type(t), value(std::forward(v)) { + } + data_type type; + T value; +}; + +template +static inline typed make_typed(data_type type, T&& t) { + return typed(type, std::forward(t)); +} + +using plugin_id = seastar::metrics::group_name_type; +using plugin_instance_id = seastar::metrics::instance_id_type; +using type_id = seastar::metrics::metric_type_def; +using type_instance = seastar::metrics::metric_name_type; + +type_id type_id_for(known_type); + +using description = seastar::metrics::description; + +static constexpr unsigned max_collectd_field_text_len = 63; + +class type_instance_id { + static thread_local unsigned _next_truncated_idx; + + /// truncate a given field to the maximum allowed length + void truncate(sstring& field, const char* field_desc); +public: + type_instance_id() = default; + type_instance_id(plugin_id p, plugin_instance_id pi, type_id t, + scollectd::type_instance ti = std::string()) + : _plugin(std::move(p)), _plugin_instance(std::move(pi)), _type( + std::move(t)), _type_instance(std::move(ti)) { + // truncate strings to the maximum allowed length + truncate(_plugin, "plugin"); + truncate(_plugin_instance, "plugin_instance"); + truncate(_type, "type"); + truncate(_type_instance, "type_instance"); + } + type_instance_id(const seastar::metrics::impl::metric_id &id, const type_id& inherit_type) : _plugin(id.group_name()), + _plugin_instance(id.instance_id()), _type(inherit_type), + _type_instance(id.name()) { + } + type_instance_id(type_instance_id &&) = default; + type_instance_id(const type_instance_id &) = default; + + type_instance_id & operator=(type_instance_id &&) = default; + type_instance_id & operator=(const type_instance_id &) = default; + + const plugin_id & plugin() const { + return _plugin; + } + const plugin_instance_id & plugin_instance() const { + return _plugin_instance; + } + const type_id & type() const { + return _type; + } + const scollectd::type_instance & 
type_instance() const { + return _type_instance; + } + bool operator<(const type_instance_id&) const; + bool operator==(const type_instance_id&) const; +private: + plugin_id _plugin; + plugin_instance_id _plugin_instance; + type_id _type; + scollectd::type_instance _type_instance; +}; + +extern const plugin_instance_id per_cpu_plugin_instance; + +void configure(const boost::program_options::variables_map&); +boost::program_options::options_description get_options_description(); +void remove_polled_metric(const type_instance_id &); + +class plugin_instance_metrics; + +/** + * Anchor for polled registration. + * Iff the registered type is in some way non-persistent, + * use this as receiver of the reg and ensure it dies before the + * added value(s). + * + * Use: + * uint64_t v = 0; + * registration r = add_polled_metric(v); + * ++r; + * + */ +struct registration { + registration() = default; + registration(const type_instance_id& id); + registration(type_instance_id&& id); + registration(const registration&) = delete; + registration(registration&&) = default; + ~registration(); + registration & operator=(const registration&) = delete; + registration & operator=(registration&&) = default; + + void unregister() { + remove_polled_metric(_id); + _id = type_instance_id(); + } +private: + friend class plugin_instance_metrics; + type_instance_id _id; + shared_ptr _impl; +}; + +/** + * Helper type to make generating vectors of registration objects + * easier, since it constructs from an initializer list of + * type_instance_id:s, avoiding early conversion to registration objs, + * which in case of init lists, are copy semantics, not move...
+ */ +class registrations + : public std::vector +{ +public: + typedef std::vector vector_type; + + registrations() + {} + registrations(vector_type&& v) : vector_type(std::move(v)) + {} + registrations(const std::initializer_list& l) + : vector_type(l.begin(),l.end()) + {} + registrations& operator=(vector_type&& v) { + vector_type::operator=(std::move(v)); + return *this; + } + registrations& operator=(const std::initializer_list& l) { + return registrations::operator=(registrations(l)); + } +}; + +class value_list; + +struct typed_value { + /** + * Wraps N values of a given type (type_id). + * Used to group types into a plugin_instance_metrics + */ + template + typed_value(const type_id& tid, const scollectd::type_instance& ti, description, Args&&... args); + + template + typed_value(const type_id& tid, const scollectd::type_instance& ti, Args&&... args) + : typed_value(tid, ti, description(), std::forward(args)...) + {} + + const scollectd::type_instance& type_instance() const { + return _type_instance; + } + const shared_ptr& values() const { + return _values; + } + const type_id & type() const { + return _type_id; + } +private: + type_id _type_id; + scollectd::type_instance _type_instance; + shared_ptr _values; +}; + +class plugin_instance_metrics { +public: + template + plugin_instance_metrics(const plugin_id& p, const plugin_instance_id& pi, TypedValues&&... values) + : _plugin_id(p) + , _plugin_instance(pi) + , _registrations({ add_impl(values)... }) + {} + std::vector bound_ids() const; + void add(const typed_value&); +private: + type_instance_id add_impl(const typed_value&); + + plugin_id _plugin_id; + plugin_instance_id _plugin_instance; + registrations _registrations; +}; + +/** + * Simplified wrapper for the common case of per-cpu plugin instances + * (i.e. distributed objects) + */ +class percpu_plugin_instance_metrics : public plugin_instance_metrics { +public: + template + percpu_plugin_instance_metrics(const plugin_id& p, TypedValues&&... 
values) + : plugin_instance_metrics(p, per_cpu_plugin_instance, std::forward(values)...) + {} +}; + +/** + * Template wrapper for type_id values, deriving type_id string + * from the known_types enum, for auto-completion joy. + */ +template +struct typed_value_impl: public typed_value { + template + typed_value_impl(const scollectd::type_instance& ti, Args&& ... args) + : typed_value(type_id_for(Type), ti, std::forward(args)...) + {} + + template + typed_value_impl(scollectd::type_instance ti, description d, Args&& ... args) + : typed_value(type_id_for(Type), std::move(ti), std::move(d), std::forward(args)...) + {} + template + typed_value_impl(description d, Args&& ... args) + : typed_value(type_id_for(Type), scollectd::type_instance(), std::move(d), std::forward(args)...) + {} +}; + +/*! + * \deprecated metrics registration should be done using the metrics layer + * + * Some typedefs for commonly used types. Feel free to add. + */ +typedef typed_value_impl total_bytes; +typedef typed_value_impl total_connections; +typedef typed_value_impl total_objects; +typedef typed_value_impl total_operations; +typedef typed_value_impl total_requests; +typedef typed_value_impl total_sessions; +typedef typed_value_impl total_threads; +typedef typed_value_impl total_time_in_ms; +typedef typed_value_impl total_values; +typedef typed_value_impl queue_length; +typedef typed_value_impl counter; +typedef typed_value_impl count; +typedef typed_value_impl gauge; + +// lots of template junk to build typed value list tuples +// for registered values.
+template +struct data_type_for; + +template +struct is_callable; + +template +struct is_callable::type>::value, +void>::type> : public std::true_type { +}; + +template +struct is_callable::value, void>::type> : public std::false_type { +}; + +template +struct data_type_for::value && std::is_unsigned::value, +void>::type> : public std::integral_constant { +}; +template +struct data_type_for::value && std::is_signed::value, void>::type> : public std::integral_constant< +data_type, data_type::DERIVE> { +}; +template +struct data_type_for::value, void>::type> : public std::integral_constant< +data_type, data_type::GAUGE> { +}; +template +struct data_type_for::value, void>::type> : public data_type_for< +typename std::result_of::type> { +}; +template +struct data_type_for> : public data_type_for { +}; + +template +class value { +public: + template + struct wrap { + wrap(const W & v) + : _v(v) { + } + const W & operator()() const { + return _v; + } + const W & _v; + }; + + typedef typename std::remove_reference::type value_type; + typedef typename std::conditional< + is_callable::type>::value, + value_type, wrap >::type stored_type; + + value(const value_type & t) + : value(data_type_for::value, t) { + } + value(data_type type, const value_type & t) + : _type(type), _t(t) { + } + uint64_t operator()() const { + auto v = _t(); + if (_type == data_type::GAUGE) { + return convert(double(v)); + } else { + uint64_t u = v; + return convert(u); + } + } + operator uint64_t() const { + return (*this)(); + } + operator data_type() const { + return _type; + } + data_type type() const { + return _type; + } +private: + // not super quick value -> protocol endian 64-bit values. 
+ template + void bpack(_Iter s, _Iter e, uint64_t v) const { + while (s != e) { + *s++ = (v & 0xff); + v >>= 8; + } + } + template + typename std::enable_if::value, uint64_t>::type convert( + V v) const { + uint64_t i = v; + // network byte order + return ntohq(i); + } + template + typename std::enable_if::value, uint64_t>::type convert( + V t) const { + union { + uint64_t i; + double v; + } v; + union { + uint64_t i; + uint8_t b[8]; + } u; + v.v = t; + // intel byte order. could also obviously be faster. + // could be ignored if we just assume we're le (for now), + // but this is ok me thinks. + bpack(std::begin(u.b), std::end(u.b), v.i); + return u.i; + } + ; + + const data_type _type; + const stored_type _t; +}; + +template +class value> : public value { +public: + value(const typed & args) +: value(args.type, args.value) { + } +}; + +class value_list { + bool _enabled = true; +public: + value_list(description d) : _description(std::move(d)) + {} + value_list(value_list&&) = default; + virtual ~value_list() {} + + virtual size_t size() const = 0; + + virtual void types(data_type *) const = 0; + virtual void values(net::packed *) const = 0; + + const description& desc() const { + return _description; + } + + bool empty() const { + return size() == 0; + } + + bool is_enabled() const { + return _enabled; + } + + void set_enabled(bool b) { + _enabled = b; + } +private: + description _description; +}; + +template +class values_impl: public value_list { +public: + static const size_t num_values = sizeof...(Args); + + values_impl(description d, Args&& ...args) + : value_list(std::move(d)) + , _values(std::forward(args)...) + {} + + values_impl(values_impl&& a) = default; + values_impl(const values_impl& a) = default; + + size_t size() const override { + return num_values; + } + void types(data_type * p) const override { + unpack(_values, [p](Args... args) { + std::initializer_list tmp = { args... 
}; + std::copy(tmp.begin(), tmp.end(), p); + }); + } + void values(net::packed * p) const override { + unpack(_values, [p](Args... args) { + std::initializer_list tmp = { args... }; + std::copy(tmp.begin(), tmp.end(), p); + }); + } +private: + template + void unpack(const std::tuple& t, _Op&& op) const { + do_unpack(t, std::index_sequence_for {}, std::forward<_Op>(op)); + } + + template + void do_unpack(const std::tuple& t, const std::index_sequence &, _Op&& op) const { + op(std::get(t)...); + } + + std::tuple < Args... > _values; +}; + +void add_polled(const type_instance_id &, const shared_ptr &, bool enabled = true); + +typedef std::function notify_function; +template +static auto make_type_instance(description d, _Args && ... args) -> values_impl < decltype(value<_Args>(std::forward<_Args>(args)))... > +{ + return values_impl(std::forward<_Args>(args)))...>( + std::move(d), value<_Args>(std::forward<_Args>(args))...); +} +/*! + * \deprecated metrics registration should be done using the metrics layer + * + */ +template +[[deprecated("Use the metrics layer")]] static type_instance_id add_polled_metric(const plugin_id & plugin, + const plugin_instance_id & plugin_instance, const type_id & type, + const scollectd::type_instance & type_instance, _Args&& ... args) { + return add_polled_metric(plugin, plugin_instance, type, type_instance, description(), + std::forward<_Args>(args)...); +} +/*! + * \deprecated metrics registration should be done using the metrics layer + * + */ +template +[[deprecated("Use the metrics layer")]] static type_instance_id add_polled_metric(const plugin_id & plugin, + const plugin_instance_id & plugin_instance, const type_id & type, + const scollectd::type_instance & type_instance, description d, _Args&& ... 
args) { + return add_polled_metric( + type_instance_id(plugin, plugin_instance, type, type_instance), std::move(d), + std::forward<_Args>(args)...); +} +template +static future<> send_explicit_metric(const plugin_id & plugin, + const plugin_instance_id & plugin_instance, const type_id & type, + const scollectd::type_instance & type_instance, _Args&& ... args) { + return send_explicit_metric( + type_instance_id(plugin, plugin_instance, type, type_instance), + std::forward<_Args>(args)...); +} +template +static notify_function create_explicit_metric(const plugin_id & plugin, + const plugin_instance_id & plugin_instance, const type_id & type, + const scollectd::type_instance & type_instance, _Args&& ... args) { + return create_explicit_metric( + type_instance_id(plugin, plugin_instance, type, type_instance), + std::forward<_Args>(args)...); +} + +seastar::metrics::impl::metric_id to_metrics_id(const type_instance_id & id); +/*! + * \deprecated metrics registration should be done using the metrics layer + * + */ +template +[[deprecated("Use the metrics layer")]] static type_instance_id add_polled_metric(const type_instance_id & id, description d, + Arg&& arg, bool enabled = true) { + seastar::metrics::impl::get_local_impl()->add_registration(to_metrics_id(id), arg.type, seastar::metrics::impl::make_function(arg.value, arg.type), d, enabled); + return id; +} +/*! + * \deprecated metrics registration should be done using the metrics layer + * + */ +template +[[deprecated("Use the metrics layer")]] static type_instance_id add_polled_metric(const type_instance_id & id, + Arg&& arg) { + return std::move(add_polled_metric(id, description(), std::forward(arg))); +} + +/*! 
+ * \deprecated metrics registration should be done using the metrics layer + * + */ +template +[[deprecated("Use the metrics layer")]] static type_instance_id add_disabled_polled_metric(const type_instance_id & id, description d, + Args&& arg) { + return add_polled_metric(id, d, std::forward(arg), false); +} + +template +static type_instance_id add_disabled_polled_metric(const type_instance_id & id, + Args&& args) { + return add_disabled_polled_metric(id, description(), std::forward(args)); +} + +template +static type_instance_id add_disabled_polled_metric(const type_instance_id & id, + Args&& ... args) { + return add_disabled_polled_metric(id, description(), std::forward(args)...); +} + +// "Explicit" metric sends. Sends a single value list as a message. +// Obviously not super efficient either. But maybe someone needs it sometime. +template +static future<> send_explicit_metric(const type_instance_id & id, + _Args&& ... args) { + return send_metric(id, make_type_instance(std::forward<_Args>(args)...)); +} +template +static notify_function create_explicit_metric(const type_instance_id & id, + _Args&& ... args) { + auto list = make_type_instance(std::forward<_Args>(args)...); + return [id, list=std::move(list)]() { + send_metric(id, list); + }; +} + +template +typed_value::typed_value(const type_id& tid, const scollectd::type_instance& ti, description d, Args&&... 
args) + : _type_id(tid) + , _type_instance(ti) + , _values(::seastar::make_shared(args)...))>(make_type_instance(std::move(d), std::forward(args)...))) +{} + +// Send a message packet (string) +future<> send_notification(const type_instance_id & id, const sstring & msg); +}; + +} diff --git a/src/seastar/include/seastar/core/scollectd_api.hh b/src/seastar/include/seastar/core/scollectd_api.hh new file mode 100644 index 000000000..b4ca7eae9 --- /dev/null +++ b/src/seastar/include/seastar/core/scollectd_api.hh @@ -0,0 +1,35 @@ +/* + * Copyright 2015 Cloudius Systems + */ + +#pragma once + +#include +#include + +namespace seastar { + +namespace scollectd { + +using collectd_value = seastar::metrics::impl::metric_value; + +std::vector get_collectd_value( + const scollectd::type_instance_id& id); + +std::vector get_collectd_ids(); + +sstring get_collectd_description_str(const scollectd::type_instance_id&); + +bool is_enabled(const scollectd::type_instance_id& id); +/** + * Enable or disable collectd metrics on local instance + * @param id - the metric to enable or disable + * @param enable - should the collectd metrics be enable or disable + */ +void enable(const scollectd::type_instance_id& id, bool enable); + + +metrics::impl::value_map get_value_map(); +} + +} diff --git a/src/seastar/include/seastar/core/seastar.hh b/src/seastar/include/seastar/core/seastar.hh new file mode 100644 index 000000000..18130ae92 --- /dev/null +++ b/src/seastar/include/seastar/core/seastar.hh @@ -0,0 +1,386 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. 
+ * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (C) 2015 Cloudius Systems, Ltd. + */ + +#pragma once + +/// \mainpage +/// +/// Seastar is a high performance C++ application framework for high +/// concurrency server applications. +/// +/// A good place to start is the [Tutorial](tutorial.html) or [Multi-page version](split/). +/// +/// Please see: +/// - \ref future-module Documentation on futures and promises, which are +/// the seastar building blocks. +/// - \ref future-util Utility functions for working with futures +/// - \ref memory-module Memory management +/// - \ref networking-module TCP/IP networking +/// - \ref fileio-module File Input/Output +/// - \ref smp-module Multicore support +/// - \ref fiber-module Utilities for managing loosely coupled chains of +/// continuations, also known as fibers +/// - \ref thread-module Support for traditional threaded execution +/// - \ref rpc Build high-level communication protocols +/// +/// View the [Seastar compatibility statement](./md_compatibility.html) for +/// information about library evolution.
+ +#include +#include +#include +#include +#include +#include "./internal/api-level.hh" + +namespace seastar { + +// iostream.hh +template class input_stream; +template class output_stream; + +class server_socket; +class socket; +class connected_socket; +class socket_address; +struct listen_options; +enum class transport; + +// file.hh +class file; +struct file_open_options; +struct stat_data; + +namespace net { + +class udp_channel; + +} + +// Networking API + +/// \defgroup networking-module Networking +/// +/// Seastar provides a simple networking API, backed by two +/// TCP/IP stacks: the POSIX stack, utilizing the kernel's +/// BSD socket APIs, and the native stack, implement fully +/// within seastar and able to drive network cards directly. +/// The native stack supports zero-copy on both transmit +/// and receive, and is implemented using seastar's high +/// performance, lockless sharded design. The network stack +/// can be selected with the \c \--network-stack command-line +/// parameter. + +/// \addtogroup networking-module +/// @{ + +/// Listen for connections on a given port +/// +/// Starts listening on a given address for incoming connections. +/// +/// \param sa socket address to listen on +/// +/// \return \ref server_socket object ready to accept connections. +/// +/// \see listen(socket_address sa, listen_options opts) +server_socket listen(socket_address sa); + +/// Listen for connections on a given port +/// +/// Starts listening on a given address for incoming connections. +/// +/// \param sa socket address to listen on +/// \param opts options controlling the listen operation +/// +/// \return \ref server_socket object ready to accept connections. +/// +/// \see listen(socket_address sa) +server_socket listen(socket_address sa, listen_options opts); + +/// Establishes a connection to a given address +/// +/// Attempts to connect to the given address. 
+/// +/// \param sa socket address to connect to +/// +/// \return a \ref connected_socket object, or an exception +future connect(socket_address sa); + +/// Establishes a connection to a given address +/// +/// Attempts to connect to the given address with a defined local endpoint +/// +/// \param sa socket address to connect to +/// \param local socket address for local endpoint +/// \param proto transport protocol (TCP or SCTP) +/// +/// \return a \ref connected_socket object, or an exception +future connect(socket_address sa, socket_address local, transport proto); + + +/// Creates a socket object suitable for establishing stream-oriented connections +/// +/// \return a \ref socket object that can be used for establishing connections +socket make_socket(); + +/// Creates a udp_channel object suitable for sending UDP packets +/// +/// The channel is not bound to a local address, and thus can only be used +/// for sending. +/// +/// \return a \ref net::udp_channel object that can be used for UDP transfers. +net::udp_channel make_udp_channel(); + + +/// Creates a udp_channel object suitable for sending and receiving UDP packets +/// +/// \param local local address to bind to +/// +/// \return a \ref net::udp_channel object that can be used for UDP transfers. +net::udp_channel make_udp_channel(const socket_address& local); + +/// @} + +/// \defgroup fileio-module File Input/Output +/// +/// Seastar provides a file API to deal with persistent storage. +/// Unlike most file APIs, seastar offers unbuffered file I/O +/// (similar to, and based on, \c O_DIRECT). Unbuffered I/O means +/// that the application is required to do its own caching, but +/// delivers better performance if this caching is done correctly. +/// +/// For random I/O or sequential unbuffered I/O, the \ref file +/// class provides a set of methods for reading, writing, discarding, +/// or otherwise manipulating a file. 
For buffered sequential I/O, +/// see \ref make_file_input_stream() and \ref make_file_output_stream(). + +/// \addtogroup fileio-module +/// @{ + +/// Opens or creates a file. The "dma" in the name refers to the fact +/// that data transfers are unbuffered and uncached. +/// +/// \param name the name of the file to open or create +/// \param flags various flags controlling the open process +/// \return a \ref file object, as a future +/// +/// \note +/// The file name is not guaranteed to be stable on disk, unless the +/// containing directory is sync'ed. +/// +/// \relates file +future open_file_dma(std::string_view name, open_flags flags) noexcept; + +/// Opens or creates a file. The "dma" in the name refers to the fact +/// that data transfers are unbuffered and uncached. +/// +/// \param name the name of the file to open or create +/// \param flags various flags controlling the open process +/// \param options options for opening the file +/// \return a \ref file object, as a future +/// +/// \note +/// The file name is not guaranteed to be stable on disk, unless the +/// containing directory is sync'ed. +/// +/// \relates file +future open_file_dma(std::string_view name, open_flags flags, file_open_options options) noexcept; + +/// Checks if a given directory supports direct io +/// +/// Seastar bypasses the Operating System caches and issues direct io to the +/// underlying block devices. Projects using seastar should check if the directory +/// lies in a filesystem that support such operations. This function can be used +/// to do that. +/// +/// It will return if direct io can be used, or throw an std::system_error +/// exception, with the EINVAL error code. +/// +/// A std::system_error with the respective error code is also thrown if \c path is +/// not a directory. +/// +/// \param path the directory we need to verify. +future<> check_direct_io_support(std::string_view path) noexcept; + +/// Opens a directory. 
+/// +/// \param name name of the directory to open +/// +/// \return a \ref file object representing a directory. The only +/// legal operations are \ref file::list_directory(), +/// \ref file::flush(), and \ref file::close(). +/// +/// \relates file +future open_directory(std::string_view name) noexcept; + +/// Creates a new directory. +/// +/// \param name name of the directory to create +/// \param permissions optional file permissions of the directory to create. +/// +/// \note +/// The directory is not guaranteed to be stable on disk, unless the +/// containing directory is sync'ed. +future<> make_directory(std::string_view name, file_permissions permissions = file_permissions::default_dir_permissions) noexcept; + +/// Ensures a directory exists +/// +/// Checks whether a directory exists, and if not, creates it. Only +/// the last component of the directory name is created. +/// +/// \param name name of the directory to potentially create +/// \param permissions optional file permissions of the directory to create. +/// +/// \note +/// The directory is not guaranteed to be stable on disk, unless the +/// containing directory is sync'ed. +/// If the directory exists, the provided permissions are not applied. +future<> touch_directory(std::string_view name, file_permissions permissions = file_permissions::default_dir_permissions) noexcept; + +/// Recursively ensures a directory exists +/// +/// Checks whether each component of a directory exists, and if not, creates it. +/// +/// \param name name of the directory to potentially create +/// \param permissions optional file permissions of the directory to create. +/// +/// \note +/// This function fsyncs each component created, and is therefore guaranteed to be stable on disk. +/// The provided permissions are applied only on the last component in the path, if it needs to be created, +/// if intermediate directories do not exist, they are created with the default_dir_permissions. 
+/// If any directory exists, the provided permissions are not applied. +future<> recursive_touch_directory(std::string_view name, file_permissions permissions = file_permissions::default_dir_permissions) noexcept; + +/// Synchronizes a directory to disk +/// +/// Makes sure the modifications in a directory are synchronized on disk. +/// This is useful, for instance, after creating or removing a file inside the +/// directory. +/// +/// \param name name of the directory to synchronize +future<> sync_directory(std::string_view name) noexcept; + + +/// Removes (unlinks) a file or an empty directory +/// +/// \param name name of the file or the directory to remove +/// +/// \note +/// The removal is not guaranteed to be stable on disk, unless the +/// containing directory is sync'ed. +future<> remove_file(std::string_view name) noexcept; + +/// Renames (moves) a file. +/// +/// \param old_name existing file name +/// \param new_name new file name +/// +/// \note +/// The rename is not guaranteed to be stable on disk, unless +/// both containing directories are sync'ed. +future<> rename_file(std::string_view old_name, std::string_view new_name) noexcept; + +struct follow_symlink_tag { }; +using follow_symlink = bool_class; + +/// Return stat information about a file. +/// +/// \param name name of the file to return its stat information +/// \param fs a follow_symlink flag to follow symbolic links. +/// +/// \return stat_data of the file identified by name. +/// If name identifies a symbolic link then stat_data is returned either for the target of the link, +/// with follow_symlink::yes, or for the link itself, with follow_symlink::no. +future file_stat(std::string_view name, follow_symlink fs = follow_symlink::yes) noexcept; + +/// Return the size of a file.
+/// +/// \param name name of the file to return the size +/// +/// Note that file_size of a symlink is NOT the size of the symlink - +/// which is the length of the pathname it contains - +/// but rather the size of the file to which it points. +future file_size(std::string_view name) noexcept; + +/// Check file access. +/// +/// \param name name of the file to check +/// \param flags bit pattern containing type of access to check (read/write/execute or exists). +/// +/// If only access_flags::exists is queried, returns true if the file exists, or false otherwise. +/// Throws a std::filesystem::filesystem_error exception if any error other than ENOENT is encountered. +/// +/// If any of the access_flags (read/write/execute) is set, returns true if the file exists and is +/// accessible with the requested flags, or false if the file exists and is not accessible +/// as queried. +/// Throws a std::filesystem::filesystem_error exception if any error other than EACCES is encountered. +/// Note that if any path component leading to the file is not searchable, the file is considered inaccessible +/// with the requested mode and false will be returned. +future file_accessible(std::string_view name, access_flags flags) noexcept; + +/// check if a file exists. +/// +/// \param name name of the file to check +future file_exists(std::string_view name) noexcept; + +/// Determine the type of a file (regular file, directory, etc.) +/// +/// \param name name of the file for which type information is requested +/// \param follow a follow_symlink flag that determines whether a trailing symbolic link should be followed or not +/// +/// \return a engaged optional with the file type if lookup was successful; a disengaged optional +/// if the file (or one of its parent directories) does not exist; an exceptional future on +/// other errors. 
+future> file_type(std::string_view name, follow_symlink follow = follow_symlink::yes) noexcept; + + +/// Creates a hard link for a file +/// +/// \param oldpath existing file name +/// \param newpath name of link +/// +future<> link_file(std::string_view oldpath, std::string_view newpath) noexcept; + +/// Changes the permissions mode of a file or directory +/// +/// \param name name of the file or directory to change +/// \param permissions permissions to set +/// +future<> chmod(std::string_view name, file_permissions permissions) noexcept; + +/// Return information about the filesystem where a file is located. +/// +/// \param name name of the file to inspect +future file_system_at(std::string_view name) noexcept; + +/// Return space available to unprivileged users in filesystem where a file is located, in bytes. +/// +/// \param name name of the file to inspect +future fs_avail(std::string_view name) noexcept; + +/// Return free space in filesystem where a file is located, in bytes. +/// +/// \param name name of the file to inspect +future fs_free(std::string_view name) noexcept; +/// @} + +} diff --git a/src/seastar/include/seastar/core/semaphore.hh b/src/seastar/include/seastar/core/semaphore.hh new file mode 100644 index 000000000..a4837da0d --- /dev/null +++ b/src/seastar/include/seastar/core/semaphore.hh @@ -0,0 +1,572 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied.
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2014 Cloudius Systems, Ltd. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace seastar { + +/// \addtogroup fiber-module +/// @{ + +/// Exception thrown when a semaphore is broken by +/// \ref semaphore::broken(). +class broken_semaphore : public std::exception { +public: + /// Reports the exception reason. + virtual const char* what() const noexcept; +}; + +/// Exception thrown when a semaphore wait operation +/// times out. +/// +/// \see semaphore::wait(typename timer<>::duration timeout, size_t nr) +class semaphore_timed_out : public std::exception { +public: + /// Reports the exception reason. + virtual const char* what() const noexcept; +}; + +/// Exception Factory for standard semaphore +/// +/// constructs standard semaphore exceptions +/// \see semaphore_timed_out and broken_semaphore +struct semaphore_default_exception_factory { + static semaphore_timed_out timeout() noexcept; + static broken_semaphore broken() noexcept; +}; + +class named_semaphore_timed_out : public semaphore_timed_out { + sstring _msg; +public: + named_semaphore_timed_out(std::string_view msg) noexcept; + virtual const char* what() const noexcept; +}; + +class broken_named_semaphore : public broken_semaphore { + sstring _msg; +public: + broken_named_semaphore(std::string_view msg) noexcept; + virtual const char* what() const noexcept; +}; + +// A factory of semaphore exceptions that contain additional context: the semaphore name +// auto sem = named_semaphore(0, named_semaphore_exception_factory{"file_opening_limit_semaphore"}); +struct named_semaphore_exception_factory { + sstring name; + named_semaphore_timed_out timeout() const noexcept; + broken_named_semaphore broken() const noexcept; +}; + +/// \brief Counted resource guard. 
+/// +/// This is a standard computer science semaphore, adapted +/// for futures. You can deposit units into a counter, +/// or take them away. Taking units from the counter may wait +/// if not enough units are available. +/// +/// To support exceptional conditions, a \ref broken() method +/// is provided, which causes all current waiters to stop waiting, +/// with an exceptional future returned. This allows causing all +/// fibers that are blocked on a semaphore to continue. This is +/// similar to POSIX's `pthread_cancel()`, with \ref wait() acting +/// as a cancellation point. +/// +/// \tparam ExceptionFactory template parameter allows modifying a semaphore to throw +/// customized exceptions on timeout/broken(). It has to provide two functions +/// ExceptionFactory::timeout() and ExceptionFactory::broken() which return corresponding +/// exception object. +template::clock> +class basic_semaphore : private ExceptionFactory { +public: + using duration = typename timer::duration; + using clock = typename timer::clock; + using time_point = typename timer::time_point; + using exception_factory = ExceptionFactory; +private: + ssize_t _count; + std::exception_ptr _ex; + struct entry { + promise<> pr; + size_t nr; + entry(promise<>&& pr_, size_t nr_) noexcept : pr(std::move(pr_)), nr(nr_) {} + }; + using expiry_handler = std::function; + expiring_fifo _wait_list; + expiry_handler make_expiry_handler() noexcept { + return [this] (entry& e) noexcept { + try { + e.pr.set_exception(this->timeout()); + } catch (...) 
{ + e.pr.set_exception(semaphore_timed_out()); + } + }; + } + bool has_available_units(size_t nr) const noexcept { + return _count >= 0 && (static_cast(_count) >= nr); + } + bool may_proceed(size_t nr) const noexcept { + return has_available_units(nr) && _wait_list.empty(); + } +public: + /// Returns the maximum number of units the semaphore counter can hold + static constexpr size_t max_counter() noexcept { + return std::numeric_limits::max(); + } + + /// Constructs a semaphore object with a specific number of units + /// in its internal counter. E.g., starting it at 1 is suitable for use as + /// an unlocked mutex. + /// + /// \param count number of initial units present in the counter. + basic_semaphore(size_t count) noexcept(std::is_nothrow_default_constructible_v) + : exception_factory() + , _count(count), + _wait_list(make_expiry_handler()) + {} + basic_semaphore(size_t count, exception_factory&& factory) noexcept(std::is_nothrow_move_constructible_v) + : exception_factory(std::move(factory)) + , _count(count) + , _wait_list(make_expiry_handler()) + { + static_assert(std::is_nothrow_move_constructible_v); + } + /// Waits until at least a specific number of units are available in the + /// counter, and reduces the counter by that amount of units. + /// + /// \note Waits are serviced in FIFO order, though if several are awakened + /// at once, they may be reordered by the scheduler. + /// + /// \param nr Amount of units to wait for (default 1). + /// \return a future that becomes ready when sufficient units are available + /// to satisfy the request. If the semaphore was \ref broken(), may + /// contain an exception. + future<> wait(size_t nr = 1) noexcept { + return wait(time_point::max(), nr); + } + /// Waits until at least a specific number of units are available in the + /// counter, and reduces the counter by that amount of units. If the request + /// cannot be satisfied in time, the request is aborted. 
+ /// + /// \note Waits are serviced in FIFO order, though if several are awakened + /// at once, they may be reordered by the scheduler. + /// + /// \param timeout expiration time. + /// \param nr Amount of units to wait for (default 1). + /// \return a future that becomes ready when sufficient units are available + /// to satisfy the request. On timeout, the future contains a + /// \ref semaphore_timed_out exception. If the semaphore was + /// \ref broken(), may contain an exception. + future<> wait(time_point timeout, size_t nr = 1) noexcept { + if (may_proceed(nr)) { + _count -= nr; + return make_ready_future<>(); + } + if (_ex) { + return make_exception_future(_ex); + } + entry e(promise<>(), nr); + auto fut = e.pr.get_future(); + try { + _wait_list.push_back(std::move(e), timeout); + } catch (...) { + e.pr.set_exception(std::current_exception()); + } + return fut; + } + + /// Waits until at least a specific number of units are available in the + /// counter, and reduces the counter by that amount of units. If the request + /// cannot be satisfied in time, the request is aborted. + /// + /// \note Waits are serviced in FIFO order, though if several are awakened + /// at once, they may be reordered by the scheduler. + /// + /// \param timeout how long to wait. + /// \param nr Amount of units to wait for (default 1). + /// \return a future that becomes ready when sufficient units are available + /// to satisfy the request. On timeout, the future contains a + /// \ref semaphore_timed_out exception. If the semaphore was + /// \ref broken(), may contain an exception. + future<> wait(duration timeout, size_t nr = 1) noexcept { + return wait(clock::now() + timeout, nr); + } + /// Deposits a specified number of units into the counter. + /// + /// The counter is incremented by the specified number of units. 
+ /// If the new counter value is sufficient to satisfy the request + /// of one or more waiters, their futures (in FIFO order) become + /// ready, and the value of the counter is reduced according to + /// the amount requested. + /// + /// \param nr Number of units to deposit (default 1). + void signal(size_t nr = 1) noexcept { + if (_ex) { + return; + } + _count += nr; + while (!_wait_list.empty() && has_available_units(_wait_list.front().nr)) { + auto& x = _wait_list.front(); + _count -= x.nr; + x.pr.set_value(); + _wait_list.pop_front(); + } + } + + /// Consume the specific number of units without blocking + // + /// Consume the specific number of units now, regardless of how many units are available + /// in the counter, and reduces the counter by that amount of units. This operation may + /// cause the counter to go negative. + /// + /// \param nr Amount of units to consume (default 1). + void consume(size_t nr = 1) noexcept { + if (_ex) { + return; + } + _count -= nr; + } + + /// Attempts to reduce the counter value by a specified number of units. + /// + /// If sufficient units are available in the counter, and if no + /// other fiber is waiting, then the counter is reduced. Otherwise, + /// nothing happens. This is useful for "opportunistic" waits where + /// useful work can happen if the counter happens to be ready, but + /// when it is not worthwhile to wait. + /// + /// \param nr number of units to reduce the counter by (default 1). + /// \return `true` if the counter had sufficient units, and was decremented. + bool try_wait(size_t nr = 1) noexcept { + if (may_proceed(nr)) { + _count -= nr; + return true; + } else { + return false; + } + } + /// Returns the number of units available in the counter. + /// + /// Does not take into account any waiters. + size_t current() const noexcept { return std::max(_count, ssize_t(0)); } + + /// Returns the number of available units. 
+ /// + /// Takes into account units consumed using \ref consume() and therefore + /// may return a negative value. + ssize_t available_units() const noexcept { return _count; } + + /// Returns the current number of waiters + size_t waiters() const noexcept { return _wait_list.size(); } + + /// Signal to waiters that an error occurred. \ref wait() will see + /// an exceptional future<> containing a \ref broken_semaphore exception. + /// The future is made available immediately. + void broken() noexcept { + std::exception_ptr ep; + try { + ep = std::make_exception_ptr(exception_factory::broken()); + } catch (...) { + ep = std::make_exception_ptr(broken_semaphore()); + } + broken(std::move(ep)); + } + + /// Signal to waiters that an error occurred. \ref wait() will see + /// an exceptional future<> containing the provided exception parameter. + /// The future is made available immediately. + template + void broken(const Exception& ex) noexcept { + broken(std::make_exception_ptr(ex)); + } + + /// Signal to waiters that an error occurred. \ref wait() will see + /// an exceptional future<> containing the provided exception parameter. + /// The future is made available immediately. + void broken(std::exception_ptr ex) noexcept; + + /// Reserve memory for waiters so that wait() will not throw. 
+ void ensure_space_for_waiters(size_t n) { + _wait_list.reserve(n); + } +}; + +template +inline +void +basic_semaphore::broken(std::exception_ptr xp) noexcept { + static_assert(std::is_nothrow_copy_constructible_v); + _ex = xp; + _count = 0; + while (!_wait_list.empty()) { + auto& x = _wait_list.front(); + x.pr.set_exception(xp); + _wait_list.pop_front(); + } +} + +template::clock> +class semaphore_units { + basic_semaphore* _sem; + size_t _n; + + semaphore_units(basic_semaphore* sem, size_t n) noexcept : _sem(sem), _n(n) {} +public: + semaphore_units() noexcept : semaphore_units(nullptr, 0) {} + semaphore_units(basic_semaphore& sem, size_t n) noexcept : semaphore_units(&sem, n) {} + semaphore_units(semaphore_units&& o) noexcept : _sem(o._sem), _n(std::exchange(o._n, 0)) { + } + semaphore_units& operator=(semaphore_units&& o) noexcept { + _sem = o._sem; + _n = std::exchange(o._n, 0); + return *this; + } + semaphore_units(const semaphore_units&) = delete; + ~semaphore_units() noexcept { + return_all(); + } + /// Return ownership of some units to the semaphore. The semaphore will be signaled by the number of units returned. + /// + /// \param units number of units to subtract. + /// + /// \note throws exception if \c units is more than those protected by the semaphore + /// + /// \return the number of remaining units + size_t return_units(size_t units) { + if (units > _n) { + throw std::invalid_argument("Cannot take more units than those protected by the semaphore"); + } + _n -= units; + _sem->signal(units); + return _n; + } + /// Return ownership of all units. The semaphore will be signaled by the number of units returned. + void return_all() noexcept { + if (_n) { + _sem->signal(_n); + _n = 0; + } + } + /// Releases ownership of the units. The semaphore will not be signalled. 
+ /// + /// \return the number of units held + size_t release() noexcept { + return std::exchange(_n, 0); + } + /// Splits this instance into a \ref semaphore_units object holding the specified amount of units. + /// This object will continue holding the remaining units. + /// + /// \param units number of units to subtract. + /// + /// \note throws exception if \c units is more than those protected by the semaphore + /// + /// \return semaphore_units holding the specified number of units + semaphore_units split(size_t units) { + if (units > _n) { + throw std::invalid_argument("Cannot take more units than those protected by the semaphore"); + } + _n -= units; + return semaphore_units(_sem, units); + } + /// The inverse of split(), in which the units held by the specified \ref semaphore_units + /// object are merged into the current one. The function assumes (and asserts) that both + /// are associated with the same \ref semaphore. + /// + /// \return the updated semaphore_units object + void adopt(semaphore_units&& other) noexcept { + assert(other._sem == _sem); + _n += other.release(); + } + + /// Returns the number of units held + size_t count() const noexcept { + return _n; + } +}; + +/// \brief Take units from semaphore temporarily +/// +/// Takes units from the semaphore and returns them when the \ref semaphore_units object goes out of scope. +/// This provides a safe way to temporarily take units from a semaphore and ensure +/// that they are eventually returned under all circumstances (exceptions, premature scope exits, etc). +/// +/// Unlike with_semaphore(), the scope of unit holding is not limited to the scope of a single async lambda. +/// +/// \param sem The semaphore to take units from +/// \param units Number of units to take +/// \return a \ref future<> holding \ref semaphore_units object. When the object goes out of scope +/// the units are returned to the semaphore. 
+/// +/// \note The caller must guarantee that \c sem is valid as long as +/// \ref seaphore_units object is alive. +/// +/// \related semaphore +template::clock> +future> +get_units(basic_semaphore& sem, size_t units) noexcept { + return sem.wait(units).then([&sem, units] { + return semaphore_units{ sem, units }; + }); +} + +/// \brief Take units from semaphore temporarily with time bound on wait +/// +/// Like \ref get_units(basic_semaphore&, size_t) but when +/// timeout is reached before units are granted returns an exceptional future holding semaphore_timed_out. +/// +/// \param sem The semaphore to take units from +/// \param units Number of units to take +/// \return a \ref future<> holding \ref semaphore_units object. When the object goes out of scope +/// the units are returned to the semaphore. +/// +/// \note The caller must guarantee that \c sem is valid as long as +/// \ref seaphore_units object is alive. +/// +/// \related semaphore +template::clock> +future> +get_units(basic_semaphore& sem, size_t units, typename basic_semaphore::time_point timeout) noexcept { + return sem.wait(timeout, units).then([&sem, units] { + return semaphore_units{ sem, units }; + }); +} + +/// \brief Take units from semaphore temporarily with time bound on wait +/// +/// Like \ref get_units(basic_semaphore&, size_t, basic_semaphore::time_point) but +/// allow the timeout to be specified as a duration. +/// +/// \param sem The semaphore to take units from +/// \param units Number of units to take +/// \param timeout a duration specifying when to timeout the current request +/// \return a \ref future<> holding \ref semaphore_units object. When the object goes out of scope +/// the units are returned to the semaphore. +/// +/// \note The caller must guarantee that \c sem is valid as long as +/// \ref seaphore_units object is alive. 
+/// +/// \related semaphore +template +future> +get_units(basic_semaphore& sem, size_t units, typename basic_semaphore::duration timeout) noexcept { + return sem.wait(timeout, units).then([&sem, units] { + return semaphore_units{ sem, units }; + }); +} + + +/// \brief Consume units from semaphore temporarily +/// +/// Consume units from the semaphore and returns them when the \ref semaphore_units object goes out of scope. +/// This provides a safe way to temporarily take units from a semaphore and ensure +/// that they are eventually returned under all circumstances (exceptions, premature scope exits, etc). +/// +/// Unlike get_units(), this calls the non-blocking consume() API. +/// +/// Unlike with_semaphore(), the scope of unit holding is not limited to the scope of a single async lambda. +/// +/// \param sem The semaphore to take units from +/// \param units Number of units to consume +template::clock> +semaphore_units +consume_units(basic_semaphore& sem, size_t units) noexcept { + sem.consume(units); + return semaphore_units{ sem, units }; +} + +/// \brief Runs a function protected by a semaphore +/// +/// Acquires a \ref semaphore, runs a function, and releases +/// the semaphore, returning the the return value of the function, +/// as a \ref future. +/// +/// \param sem The semaphore to be held while the \c func is +/// running. +/// \param units Number of units to acquire from \c sem (as +/// with semaphore::wait()) +/// \param func The function to run; signature \c void() or +/// \c future<>(). +/// \return a \ref future<> holding the function's return value +/// or exception thrown; or a \ref future<> containing +/// an exception from one of the semaphore::broken() +/// variants. +/// +/// \note The caller must guarantee that \c sem is valid until +/// the future returned by with_semaphore() resolves. 
+/// +/// \related semaphore +template ::clock> +inline +futurize_t> +with_semaphore(basic_semaphore& sem, size_t units, Func&& func) noexcept { + return get_units(sem, units).then([func = std::forward(func)] (auto units) mutable { + return futurize_invoke(std::forward(func)).finally([units = std::move(units)] {}); + }); +} + +/// \brief Runs a function protected by a semaphore with time bound on wait +/// +/// If possible, acquires a \ref semaphore, runs a function, and releases +/// the semaphore, returning the the return value of the function, +/// as a \ref future. +/// +/// If the semaphore can't be acquired within the specified timeout, returns +/// a semaphore_timed_out exception +/// +/// \param sem The semaphore to be held while the \c func is +/// running. +/// \param units Number of units to acquire from \c sem (as +/// with semaphore::wait()) +/// \param timeout a duration specifying when to timeout the current request +/// \param func The function to run; signature \c void() or +/// \c future<>(). +/// \return a \ref future<> holding the function's return value +/// or exception thrown; or a \ref future<> containing +/// an exception from one of the semaphore::broken() +/// variants. +/// +/// \note The caller must guarantee that \c sem is valid until +/// the future returned by with_semaphore() resolves. +/// +/// \related semaphore +template +inline +futurize_t> +with_semaphore(basic_semaphore& sem, size_t units, typename basic_semaphore::duration timeout, Func&& func) noexcept { + return get_units(sem, units, timeout).then([func = std::forward(func)] (auto units) mutable { + return futurize_invoke(std::forward(func)).finally([units = std::move(units)] {}); + }); +} + +/// default basic_semaphore specialization that throws semaphore specific exceptions +/// on error conditions. 
+using semaphore = basic_semaphore; +using named_semaphore = basic_semaphore; + +/// @} + +} diff --git a/src/seastar/include/seastar/core/sharded.hh b/src/seastar/include/seastar/core/sharded.hh new file mode 100644 index 000000000..c002476e1 --- /dev/null +++ b/src/seastar/include/seastar/core/sharded.hh @@ -0,0 +1,909 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Copyright (C) 2015 Cloudius Systems, Ltd. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if __has_include() +#include +#endif + +/// \defgroup smp-module Multicore +/// +/// \brief Support for exploiting multiple cores on a server. +/// +/// Seastar supports multicore servers by using *sharding*. Each logical +/// core (lcore) runs a separate event loop, with its own memory allocator, +/// TCP/IP stack, and other services. Shards communicate by explicit message +/// passing, rather than using locks and condition variables as with traditional +/// threaded programming. 
+ +namespace seastar { + +template +class sharded_parameter; + +namespace internal { + +template +auto unwrap_sharded_arg(sharded_parameter sp); + +using on_each_shard_func = std::function (unsigned shard)>; + +future<> sharded_parallel_for_each(unsigned nr_shards, on_each_shard_func on_each_shard) noexcept(std::is_nothrow_move_constructible_v); + +} + +/// \addtogroup smp-module +/// @{ + +template +class sharded; + +/// If sharded service inherits from this class sharded::stop() will wait +/// until all references to a service on each shard will disappear before +/// returning. It is still service's own responsibility to track its references +/// in asynchronous code by calling shared_from_this() and keeping returned smart +/// pointer as long as object is in use. +template +class async_sharded_service : public enable_shared_from_this { +protected: + std::function _delete_cb; + async_sharded_service() noexcept = default; + virtual ~async_sharded_service() { + if (_delete_cb) { + _delete_cb(); + } + } + template friend class sharded; +}; + + +/// \brief Provide a sharded service with access to its peers +/// +/// If a service class inherits from this, it will gain a \code container() +/// \endcode method that provides access to the \ref sharded object, with which +/// it can call its peers. 
+template +class peering_sharded_service { + sharded* _container = nullptr; +private: + template friend class sharded; + void set_container(sharded* container) noexcept { _container = container; } +public: + peering_sharded_service() noexcept = default; + peering_sharded_service(peering_sharded_service&&) noexcept = default; + peering_sharded_service(const peering_sharded_service&) = delete; + peering_sharded_service& operator=(const peering_sharded_service&) = delete; + sharded& container() noexcept { return *_container; } + const sharded& container() const noexcept { return *_container; } +}; + + +/// Exception thrown when a \ref sharded object does not exist +class no_sharded_instance_exception : public std::exception { +public: + virtual const char* what() const noexcept override { + return "sharded instance does not exist"; + } +}; + +/// Template helper to distribute a service across all logical cores. +/// +/// The \c sharded template manages a sharded service, by creating +/// a copy of the service on each logical core, providing mechanisms to communicate +/// with each shard's copy, and a way to stop the service. +/// +/// \tparam Service a class to be instantiated on each core. Must expose +/// a \c stop() method that returns a \c future<>, to be called when +/// the service is stopped. 
+template +class sharded { + struct entry { + shared_ptr service; + promise<> freed; + }; + std::vector _instances; +private: + using invoke_on_all_func_type = std::function (Service&)>; +private: + void service_deleted() noexcept { + _instances[this_shard_id()].freed.set_value(); + } + template + friend struct shared_ptr_make_helper; + + template + std::enable_if_t, T>::value> + set_container(T& service) noexcept { + service.set_container(this); + } + + template + std::enable_if_t, T>::value> + set_container(T& service) noexcept { + } + + future<> + sharded_parallel_for_each(internal::on_each_shard_func func) noexcept(std::is_nothrow_move_constructible_v) { + return internal::sharded_parallel_for_each(_instances.size(), std::move(func)); + } +public: + /// Constructs an empty \c sharded object. No instances of the service are + /// created. + sharded() noexcept {} + sharded(const sharded& other) = delete; + sharded& operator=(const sharded& other) = delete; + /// Sharded object with T that inherits from peering_sharded_service + /// cannot be moved safely, so disable move operations. + sharded(sharded&& other) = delete; + sharded& operator=(sharded&& other) = delete; + /// Destroyes a \c sharded object. Must not be in a started state. + ~sharded(); + + /// Starts \c Service by constructing an instance on every logical core + /// with a copy of \c args passed to the constructor. + /// + /// \param args Arguments to be forwarded to \c Service constructor + /// \return a \ref seastar::future<> that becomes ready when all instances have been + /// constructed. + template + future<> start(Args&&... args) noexcept; + + /// Starts \c Service by constructing an instance on a single logical core + /// with a copy of \c args passed to the constructor. + /// + /// \param args Arguments to be forwarded to \c Service constructor + /// \return a \ref seastar::future<> that becomes ready when the instance has been + /// constructed. + template + future<> start_single(Args&&... 
args) noexcept; + + /// Stops all started instances and destroys them. + /// + /// For every started instance, its \c stop() method is called, and then + /// it is destroyed. + future<> stop() noexcept; + + /// Invoke a type-erased function on all instances of `Service`. + /// The return value becomes ready when all instances have processed + /// the message. + /// + /// \param options the options to forward to the \ref smp::submit_to() + /// called behind the scenes. + /// \param func Function to be invoked on all shards + /// \return Future that becomes ready once all calls have completed + future<> invoke_on_all(smp_submit_to_options options, std::function (Service&)> func) noexcept; + + /// Invoke a type-erased function on all instances of `Service`. + /// The return value becomes ready when all instances have processed + /// the message. + /// Passes the default \ref smp_submit_to_options to the + /// \ref smp::submit_to() called behind the scenes. + future<> invoke_on_all(std::function (Service&)> func) noexcept { + try { + return invoke_on_all(smp_submit_to_options{}, std::move(func)); + } catch (...) { + return current_exception_as_future(); + } + } + + /// Invoke a function on all instances of `Service`. + /// The return value becomes ready when all instances have processed + /// the message. The function can be a member pointer to function, + /// a free function, or a functor. The first argument of the function + /// will be a reference to the local service on the shard. + /// + /// For a non-static pointer-to-member-function, the first argument + /// becomes `this`, not the first declared parameter. + /// + /// \param options the options to forward to the \ref smp::submit_to() + /// called behind the scenes. 
+ /// \param func invocable accepting a `Service&` as the first parameter + /// to be invoked on all shards + /// \return Future that becomes ready once all calls have completed + template + SEASTAR_CONCEPT(requires std::invocable) + future<> invoke_on_all(smp_submit_to_options options, Func func, Args... args) noexcept; + + /// Invoke a function on all instances of `Service`. + /// The return value becomes ready when all instances have processed + /// the message. + /// Passes the default \ref smp_submit_to_options to the + /// \ref smp::submit_to() called behind the scenes. + template + SEASTAR_CONCEPT(requires std::invocable) + future<> invoke_on_all(Func func, Args... args) noexcept { + try { + return invoke_on_all(smp_submit_to_options{}, std::move(func), std::move(args)...); + } catch (...) { + return current_exception_as_future(); + } + } + + /// Invoke a callable on all instances of \c Service except the instance + /// which is allocated on current shard. + /// + /// \param options the options to forward to the \ref smp::submit_to() + /// called behind the scenes. + /// \param func a callable with the signature `void (Service&)` + /// or `future<> (Service&)`, to be called on each core + /// with the local instance as an argument. + /// \return a `future<>` that becomes ready when all cores but the current one have + /// processed the message. + template + SEASTAR_CONCEPT(requires std::invocable) + future<> invoke_on_others(smp_submit_to_options options, Func func, Args... args) noexcept; + + /// Invoke a callable on all instances of \c Service except the instance + /// which is allocated on current shard. + /// + /// \param func a callable with the signature `void (Service&)` + /// or `future<> (Service&)`, to be called on each core + /// with the local instance as an argument. + /// \return a `future<>` that becomes ready when all cores but the current one have + /// processed the message. 
+ /// + /// Passes the default \ref smp_submit_to_options to the + /// \ref smp::submit_to() called behind the scenes. + template + SEASTAR_CONCEPT(requires std::invocable) + future<> invoke_on_others(Func func, Args... args) noexcept { + try { + return invoke_on_others(smp_submit_to_options{}, std::move(func), std::move(args)...); + } catch (...) { + return current_exception_as_future(); + } + } + + /// Invoke a method on all instances of `Service` and reduce the results using + /// `Reducer`. + /// + /// \see map_reduce(Iterator begin, Iterator end, Mapper&& mapper, Reducer&& r) + template + inline + auto + map_reduce(Reducer&& r, Ret (Service::*func)(FuncArgs...), Args&&... args) + -> typename reducer_traits::future_type + { + return ::seastar::map_reduce(boost::make_counting_iterator(0), + boost::make_counting_iterator(_instances.size()), + [this, func, args = std::make_tuple(std::forward(args)...)] (unsigned c) mutable { + return smp::submit_to(c, [this, func, args] () mutable { + return std::apply([this, func] (Args&&... args) mutable { + auto inst = _instances[this_shard_id()].service; + if (inst) { + return ((*inst).*func)(std::forward(args)...); + } else { + throw no_sharded_instance_exception(); + } + }, std::move(args)); + }); + }, std::forward(r)); + } + + /// Invoke a callable on all instances of `Service` and reduce the results using + /// `Reducer`. + /// + /// \see map_reduce(Iterator begin, Iterator end, Mapper&& mapper, Reducer&& r) + template + inline + auto map_reduce(Reducer&& r, Func&& func) -> typename reducer_traits::future_type + { + return ::seastar::map_reduce(boost::make_counting_iterator(0), + boost::make_counting_iterator(_instances.size()), + [this, &func] (unsigned c) mutable { + return smp::submit_to(c, [this, func] () mutable { + auto inst = get_local_service(); + return func(*inst); + }); + }, std::forward(r)); + } + + /// Applies a map function to all shards, then reduces the output by calling a reducer function. 
+ /// + /// \param map callable with the signature `Value (Service&)` or + /// `future (Service&)` (for some `Value` type). + /// used as the second input to \c reduce + /// \param initial initial value used as the first input to \c reduce. + /// \param reduce binary function used to left-fold the return values of \c map + /// into \c initial . + /// + /// Each \c map invocation runs on the shard associated with the service. + /// + /// \tparam Mapper unary function taking `Service&` and producing some result. + /// \tparam Initial any value type + /// \tparam Reduce a binary function taking two Initial values and returning an Initial + /// \return Result of invoking `map` with each instance in parallel, reduced by calling + /// `reduce()` on each adjacent pair of results. + template + inline + future + map_reduce0(Mapper map, Initial initial, Reduce reduce) { + auto wrapped_map = [this, map] (unsigned c) { + return smp::submit_to(c, [this, map] { + auto inst = get_local_service(); + return map(*inst); + }); + }; + return ::seastar::map_reduce(smp::all_cpus().begin(), smp::all_cpus().end(), + std::move(wrapped_map), + std::move(initial), + std::move(reduce)); + } + + /// Applies a map function to all shards, and return a vector of the result. + /// + /// \param mapper callable with the signature `Value (Service&)` or + /// `future (Service&)` (for some `Value` type). + /// + /// Each \c map invocation runs on the shard associated with the service. + /// + /// \tparam Mapper unary function taking `Service&` and producing some result. 
+ /// \return Result vector of invoking `map` with each instance in parallel + template >, typename return_type = decltype(internal::untuple(std::declval()))> + inline future> map(Mapper mapper) { + return do_with(std::vector(), + [&mapper, this] (std::vector& vec) mutable { + vec.resize(smp::count); + return parallel_for_each(boost::irange(0, _instances.size()), [this, &vec, mapper] (unsigned c) { + return smp::submit_to(c, [this, mapper] { + auto inst = get_local_service(); + return mapper(*inst); + }).then([&vec, c] (auto res) { + vec[c] = res; + }); + }).then([&vec] { + return make_ready_future>(std::move(vec)); + }); + }); + } + + /// Invoke a callable on a specific instance of `Service`. + /// + /// \param id shard id to call + /// \param options the options to forward to the \ref smp::submit_to() + /// called behind the scenes. + /// \param func a callable with signature `Value (Service&, Args...)` or + /// `future (Service&, Args...)` (for some `Value` type), or a pointer + /// to a member function of Service + /// \param args parameters to the callable; will be copied or moved. To pass by reference, + /// use std::ref(). + /// + /// \return result of calling `func(instance)` on the designated instance + template >> + SEASTAR_CONCEPT(requires std::invocable) + Ret + invoke_on(unsigned id, smp_submit_to_options options, Func&& func, Args&&... args) { + return smp::submit_to(id, options, [this, func = std::forward(func), args = std::tuple(std::move(args)...)] () mutable { + auto inst = get_local_service(); + return std::apply(std::forward(func), std::tuple_cat(std::forward_as_tuple(*inst), std::move(args))); + }); + } + + /// Invoke a callable on a specific instance of `Service`. 
+ /// + /// \param id shard id to call + /// \param func a callable with signature `Value (Service&)` or + /// `future (Service&)` (for some `Value` type), or a pointer + /// to a member function of Service + /// \param args parameters to the callable + /// \return result of calling `func(instance)` on the designated instance + template >> + SEASTAR_CONCEPT(requires std::invocable) + Ret + invoke_on(unsigned id, Func&& func, Args&&... args) { + return invoke_on(id, smp_submit_to_options(), std::forward(func), std::forward(args)...); + } + + /// Gets a reference to the local instance. + const Service& local() const noexcept; + + /// Gets a reference to the local instance. + Service& local() noexcept; + + /// Gets a shared pointer to the local instance. + shared_ptr local_shared() noexcept; + + /// Checks whether the local instance has been initialized. + bool local_is_initialized() const noexcept; + +private: + void track_deletion(shared_ptr& s, std::false_type) noexcept { + // do not wait for instance to be deleted since it is not going to notify us + service_deleted(); + } + + void track_deletion(shared_ptr& s, std::true_type) { + s->_delete_cb = std::bind(std::mem_fn(&sharded::service_deleted), this); + } + + template + shared_ptr create_local_service(Args&&... args) { + auto s = ::seastar::make_shared(std::forward(args)...); + set_container(*s); + track_deletion(s, std::is_base_of, Service>()); + return s; + } + + shared_ptr get_local_service() { + auto inst = _instances[this_shard_id()].service; + if (!inst) { + throw no_sharded_instance_exception(); + } + return inst; + } +}; + +namespace internal { + +template +struct sharded_unwrap { + using type = T; +}; + +template +struct sharded_unwrap>> { + using type = T&; +}; + +template +using sharded_unwrap_t = typename sharded_unwrap::type; + +} // internal + + +/// \brief Helper to pass a parameter to a `sharded<>` object that depends +/// on the shard. 
It is evaluated on the shard, just before being +/// passed to the local instance. It is useful when passing +/// parameters to sharded::start(). +template +class sharded_parameter { + Func _func; + std::tuple _params; +public: + /// Creates a sharded parameter, which evaluates differently based on + /// the shard it is executed on. + /// + /// \param func Function to be executed + /// \param params optional parameters to be passed to the function. Can + /// be std::ref(sharded), in which case the local + /// instance will be passed. Anything else + /// will be passed by value unchanged. + explicit sharded_parameter(Func func, Params... params) + SEASTAR_CONCEPT(requires std::invocable...>) + : _func(std::move(func)), _params(std::make_tuple(std::move(params)...)) { + } +private: + auto evaluate() const; + + template + friend auto internal::unwrap_sharded_arg(sharded_parameter sp); +}; + +/// \example sharded_parameter_demo.cc +/// +/// Example use of \ref sharded_parameter. + +/// @} + +template +sharded::~sharded() { + assert(_instances.empty()); +} + +namespace internal { + +template +class either_sharded_or_local { + sharded& _sharded; +public: + either_sharded_or_local(sharded& s) : _sharded(s) {} + operator sharded& () { return _sharded; } + operator Service& () { return _sharded.local(); } +}; + +template +inline +T&& +unwrap_sharded_arg(T&& arg) { + return std::forward(arg); +} + +template +either_sharded_or_local +unwrap_sharded_arg(std::reference_wrapper> arg) { + return either_sharded_or_local(arg); +} + +template +auto +unwrap_sharded_arg(sharded_parameter sp) { + return sp.evaluate(); +} + +} + +template +auto +sharded_parameter::evaluate() const { + auto unwrap_params_and_invoke = [this] (const auto&... params) { + return std::invoke(_func, internal::unwrap_sharded_arg(params)...); + }; + return std::apply(unwrap_params_and_invoke, _params); +} + +template +template +future<> +sharded::start(Args&&... 
args) noexcept { + try { + _instances.resize(smp::count); + return sharded_parallel_for_each( + [this, args = std::make_tuple(std::forward(args)...)] (unsigned c) mutable { + return smp::submit_to(c, [this, args] () mutable { + _instances[this_shard_id()].service = std::apply([this] (Args... args) { + return create_local_service(internal::unwrap_sharded_arg(std::forward(args))...); + }, args); + }); + }).then_wrapped([this] (future<> f) { + try { + f.get(); + return make_ready_future<>(); + } catch (...) { + return this->stop().then([e = std::current_exception()] () mutable { + std::rethrow_exception(e); + }); + } + }); + } catch (...) { + return current_exception_as_future(); + } +} + +template +template +future<> +sharded::start_single(Args&&... args) noexcept { + try { + assert(_instances.empty()); + _instances.resize(1); + return smp::submit_to(0, [this, args = std::make_tuple(std::forward(args)...)] () mutable { + _instances[0].service = std::apply([this] (Args... args) { + return create_local_service(internal::unwrap_sharded_arg(std::forward(args))...); + }, args); + }).then_wrapped([this] (future<> f) { + try { + f.get(); + return make_ready_future<>(); + } catch (...) { + return this->stop().then([e = std::current_exception()] () mutable { + std::rethrow_exception(e); + }); + } + }); + } catch (...) { + return current_exception_as_future(); + } +} + +namespace internal { + +// Helper check if Service::stop exists + +struct sharded_has_stop { + // If a member names "stop" exists, try to call it, even if it doesn't + // have the correct signature. This is so that we don't ignore a function + // named stop() just because the signature is incorrect, and instead + // force the user to resolve the ambiguity. + template + constexpr static auto check(int) -> std::enable_if_t<(sizeof(&Service::stop) >= 0), bool> { + return true; + } + + // Fallback in case Service::stop doesn't exist. + template + static constexpr auto check(...) 
-> bool { + return false; + } +}; + +template +struct sharded_call_stop { + template + static future<> call(Service& instance); +}; + +template <> +template +inline +future<> sharded_call_stop::call(Service& instance) { + return instance.stop(); +} + +template <> +template +inline +future<> sharded_call_stop::call(Service& instance) { + return make_ready_future<>(); +} + +template +inline +future<> +stop_sharded_instance(Service& instance) { + constexpr bool has_stop = internal::sharded_has_stop::check(0); + return internal::sharded_call_stop::call(instance); +} + +} + +template +future<> +sharded::stop() noexcept { + try { + return sharded_parallel_for_each([this] (unsigned c) mutable { + return smp::submit_to(c, [this] () mutable { + auto inst = _instances[this_shard_id()].service; + if (!inst) { + return make_ready_future<>(); + } + return internal::stop_sharded_instance(*inst); + }); + }).then_wrapped([this] (future<> fut) { + return sharded_parallel_for_each([this] (unsigned c) { + return smp::submit_to(c, [this] { + if (_instances[this_shard_id()].service == nullptr) { + return make_ready_future<>(); + } + _instances[this_shard_id()].service = nullptr; + return _instances[this_shard_id()].freed.get_future(); + }); + }).finally([this, fut = std::move(fut)] () mutable { + _instances.clear(); + _instances = std::vector::entry>(); + return std::move(fut); + }); + }); + } catch (...) { + return current_exception_as_future(); + } +} + +template +future<> +sharded::invoke_on_all(smp_submit_to_options options, std::function (Service&)> func) noexcept { + try { + return sharded_parallel_for_each([this, options, func = std::move(func)] (unsigned c) { + return smp::submit_to(c, options, [this, func] { + return func(*get_local_service()); + }); + }); + } catch (...) { + return current_exception_as_future(); + } +} + +template +template +SEASTAR_CONCEPT(requires std::invocable) +inline +future<> +sharded::invoke_on_all(smp_submit_to_options options, Func func, Args... 
args) noexcept { + static_assert(std::is_same_v>, future<>>, + "invoke_on_all()'s func must return void or future<>"); + try { + return invoke_on_all(options, invoke_on_all_func_type([func, args = std::tuple(std::move(args)...)] (Service& service) mutable { + return futurize_apply(func, std::tuple_cat(std::forward_as_tuple(service), args)); + })); + } catch (...) { + return current_exception_as_future(); + } +} + +template +template +SEASTAR_CONCEPT(requires std::invocable) +inline +future<> +sharded::invoke_on_others(smp_submit_to_options options, Func func, Args... args) noexcept { + static_assert(std::is_same_v>, future<>>, + "invoke_on_others()'s func must return void or future<>"); + try { + return invoke_on_all(options, [orig = this_shard_id(), func = std::move(func), args = std::tuple(std::move(args)...)] (Service& s) -> future<> { + return this_shard_id() == orig ? make_ready_future<>() : futurize_apply(func, std::tuple_cat(std::forward_as_tuple(s), args));; + }); + } catch (...) { + return current_exception_as_future(); + } +} + +template +const Service& sharded::local() const noexcept { + assert(local_is_initialized()); + return *_instances[this_shard_id()].service; +} + +template +Service& sharded::local() noexcept { + assert(local_is_initialized()); + return *_instances[this_shard_id()].service; +} + +template +shared_ptr sharded::local_shared() noexcept { + assert(local_is_initialized()); + return _instances[this_shard_id()].service; +} + +template +inline bool sharded::local_is_initialized() const noexcept { + return _instances.size() > this_shard_id() && + _instances[this_shard_id()].service; +} + +/// \addtogroup smp-module +/// @{ + +/// Smart pointer wrapper which makes it safe to move across CPUs. +/// +/// \c foreign_ptr<> is a smart pointer wrapper which, unlike +/// \ref shared_ptr and \ref lw_shared_ptr, is safe to move to a +/// different core. 
+/// +/// As seastar avoids locking, any but the most trivial objects must +/// be destroyed on the same core they were created on, so that, +/// for example, their destructors can unlink references to the +/// object from various containers. In addition, for performance +/// reasons, the shared pointer types do not use atomic operations +/// to manage their reference counts. As a result they cannot be +/// used on multiple cores in parallel. +/// +/// \c foreign_ptr<> provides a solution to that problem. +/// \c foreign_ptr<> wraps any pointer type -- raw pointer, +/// \ref seastar::shared_ptr<>, or similar, and remembers on what core this +/// happened. When the \c foreign_ptr<> object is destroyed, it +/// sends a message to the original core so that the wrapped object +/// can be safely destroyed. +/// +/// \c foreign_ptr<> is a move-only object; it cannot be copied. +/// +template +class foreign_ptr { +private: + PtrType _value; + unsigned _cpu; +private: + void destroy(PtrType p, unsigned cpu) noexcept { + if (p && this_shard_id() != cpu) { + // `destroy()` is called from the destructor and other + // synchronous methods (like `reset()`), that have no way to + // wait for this future. + (void)smp::submit_to(cpu, [v = std::move(p)] () mutable { + // Destroy the contained pointer. We do this explicitly + // in the current shard, because the lambda is destroyed + // in the shard that submitted the task. + v = {}; + }); + } + } +public: + using element_type = typename std::pointer_traits::element_type; + using pointer = element_type*; + + /// Constructs a null \c foreign_ptr<>. + foreign_ptr() noexcept(std::is_nothrow_default_constructible_v) + : _value(PtrType()) + , _cpu(this_shard_id()) { + } + /// Constructs a null \c foreign_ptr<>. + foreign_ptr(std::nullptr_t) noexcept(std::is_nothrow_default_constructible_v) : foreign_ptr() {} + /// Wraps a pointer object and remembers the current core. 
+ foreign_ptr(PtrType value) noexcept(std::is_nothrow_move_constructible_v) + : _value(std::move(value)) + , _cpu(this_shard_id()) { + } + // The type is intentionally non-copyable because copies + // are expensive because each copy requires across-CPU call. + foreign_ptr(const foreign_ptr&) = delete; + /// Moves a \c foreign_ptr<> to another object. + foreign_ptr(foreign_ptr&& other) noexcept(std::is_nothrow_move_constructible_v) = default; + /// Destroys the wrapped object on its original cpu. + ~foreign_ptr() { + destroy(std::move(_value), _cpu); + } + /// Creates a copy of this foreign ptr. Only works if the stored ptr is copyable. + future copy() const noexcept { + return smp::submit_to(_cpu, [this] () mutable { + auto v = _value; + return make_foreign(std::move(v)); + }); + } + /// Accesses the wrapped object. + element_type& operator*() const noexcept(noexcept(*_value)) { return *_value; } + /// Accesses the wrapped object. + element_type* operator->() const noexcept(noexcept(&*_value)) { return &*_value; } + /// Access the raw pointer to the wrapped object. + pointer get() const noexcept(noexcept(&*_value)) { return &*_value; } + /// Return the owner-shard of this pointer. + /// + /// The owner shard of the pointer can change as a result of + /// move-assigment or a call to reset(). + unsigned get_owner_shard() const noexcept { return _cpu; } + /// Checks whether the wrapped pointer is non-null. + operator bool() const noexcept(noexcept(static_cast(_value))) { return static_cast(_value); } + /// Move-assigns a \c foreign_ptr<>. + foreign_ptr& operator=(foreign_ptr&& other) noexcept(std::is_nothrow_move_constructible::value) { + destroy(std::move(_value), _cpu); + _value = std::move(other._value); + _cpu = other._cpu; + return *this; + } + /// Releases the owned pointer + /// + /// Warning: the caller is now responsible for destroying the + /// pointer on its owner shard. This method is best called on the + /// owner shard to avoid accidents. 
+ PtrType release() noexcept(std::is_nothrow_default_constructible_v) { + return std::exchange(_value, {}); + } + /// Replace the managed pointer with new_ptr. + /// + /// The previous managed pointer is destroyed on its owner shard. + void reset(PtrType new_ptr) noexcept(std::is_nothrow_move_constructible_v) { + auto old_ptr = std::move(_value); + auto old_cpu = _cpu; + + _value = std::move(new_ptr); + _cpu = this_shard_id(); + + destroy(std::move(old_ptr), old_cpu); + } + /// Replace the managed pointer with a null value. + /// + /// The previous managed pointer is destroyed on its owner shard. + void reset(std::nullptr_t = nullptr) noexcept(std::is_nothrow_default_constructible_v) { + reset(PtrType()); + } +}; + +/// Wraps a raw or smart pointer object in a \ref foreign_ptr<>. +/// +/// \relates foreign_ptr +template +foreign_ptr make_foreign(T ptr) { + return foreign_ptr(std::move(ptr)); +} + +/// @} + +template +struct is_smart_ptr> : std::true_type {}; + +} diff --git a/src/seastar/include/seastar/core/shared_future.hh b/src/seastar/include/seastar/core/shared_future.hh new file mode 100644 index 000000000..6d8a8045e --- /dev/null +++ b/src/seastar/include/seastar/core/shared_future.hh @@ -0,0 +1,299 @@ +/* + * This file is open source software, licensed to you under the terms + * of the Apache License, Version 2.0 (the "License"). See the NOTICE file + * distributed with this work for additional information regarding copyright + * ownership. You may not use this file except in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (C) 2015 ScyllaDB + */ + +#pragma once + +#include +#include + +namespace seastar { + +/// \addtogroup future-module +/// @{ + +/// Changes the clock used by shared_future<> and shared_promise<> when passed as the first template parameter. +template +struct with_clock {}; + +/// \cond internal + +template +struct future_option_traits; + +template +struct future_option_traits, T...> { + using clock_type = Clock; + + template