From e6918187568dbd01842d8d1d2c808ce16a894239 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 21 Apr 2024 13:54:28 +0200 Subject: Adding upstream version 18.2.2. Signed-off-by: Daniel Baumann --- src/rocksdb/util/aligned_buffer.h | 234 + src/rocksdb/util/async_file_reader.cc | 73 + src/rocksdb/util/async_file_reader.h | 144 + src/rocksdb/util/autovector.h | 406 ++ src/rocksdb/util/autovector_test.cc | 331 ++ src/rocksdb/util/bloom_impl.h | 489 ++ src/rocksdb/util/bloom_test.cc | 1175 +++++ src/rocksdb/util/build_version.cc.in | 81 + src/rocksdb/util/cast_util.h | 42 + src/rocksdb/util/channel.h | 69 + src/rocksdb/util/cleanable.cc | 181 + src/rocksdb/util/coding.cc | 90 + src/rocksdb/util/coding.h | 389 ++ src/rocksdb/util/coding_lean.h | 101 + src/rocksdb/util/coding_test.cc | 217 + src/rocksdb/util/compaction_job_stats_impl.cc | 100 + src/rocksdb/util/comparator.cc | 391 ++ src/rocksdb/util/compression.cc | 122 + src/rocksdb/util/compression.h | 1786 ++++++++ src/rocksdb/util/compression_context_cache.cc | 106 + src/rocksdb/util/compression_context_cache.h | 47 + src/rocksdb/util/concurrent_task_limiter_impl.cc | 64 + src/rocksdb/util/concurrent_task_limiter_impl.h | 67 + src/rocksdb/util/core_local.h | 83 + src/rocksdb/util/coro_utils.h | 112 + src/rocksdb/util/crc32c.cc | 1351 ++++++ src/rocksdb/util/crc32c.h | 56 + src/rocksdb/util/crc32c_arm64.cc | 215 + src/rocksdb/util/crc32c_arm64.h | 52 + src/rocksdb/util/crc32c_ppc.c | 94 + src/rocksdb/util/crc32c_ppc.h | 22 + src/rocksdb/util/crc32c_ppc_asm.S | 756 +++ src/rocksdb/util/crc32c_ppc_constants.h | 900 ++++ src/rocksdb/util/crc32c_test.cc | 213 + src/rocksdb/util/defer.h | 82 + src/rocksdb/util/defer_test.cc | 51 + src/rocksdb/util/distributed_mutex.h | 48 + src/rocksdb/util/duplicate_detector.h | 71 + src/rocksdb/util/dynamic_bloom.cc | 70 + src/rocksdb/util/dynamic_bloom.h | 214 + src/rocksdb/util/dynamic_bloom_test.cc | 325 ++ src/rocksdb/util/fastrange.h | 114 + src/rocksdb/util/file_checksum_helper.cc | 172 + src/rocksdb/util/file_checksum_helper.h | 100 + src/rocksdb/util/file_reader_writer_test.cc | 1066 +++++ src/rocksdb/util/filelock_test.cc | 148 + src/rocksdb/util/filter_bench.cc | 840 ++++ src/rocksdb/util/gflags_compat.h | 30 + src/rocksdb/util/hash.cc | 201 + src/rocksdb/util/hash.h | 137 + src/rocksdb/util/hash128.h | 26 + src/rocksdb/util/hash_containers.h | 51 + src/rocksdb/util/hash_map.h | 67 + src/rocksdb/util/hash_test.cc | 853 ++++ src/rocksdb/util/heap.h | 174 + src/rocksdb/util/heap_test.cc | 131 + src/rocksdb/util/kv_map.h | 33 + src/rocksdb/util/log_write_bench.cc | 88 + src/rocksdb/util/math.h | 294 ++ src/rocksdb/util/math128.h | 316 ++ src/rocksdb/util/murmurhash.cc | 196 + src/rocksdb/util/murmurhash.h | 43 + src/rocksdb/util/mutexlock.h | 180 + src/rocksdb/util/ppc-opcode.h | 27 + src/rocksdb/util/random.cc | 62 + src/rocksdb/util/random.h | 190 + src/rocksdb/util/random_test.cc | 107 + src/rocksdb/util/rate_limiter.cc | 378 ++ src/rocksdb/util/rate_limiter.h | 146 + src/rocksdb/util/rate_limiter_test.cc | 476 ++ src/rocksdb/util/repeatable_thread.h | 149 + src/rocksdb/util/repeatable_thread_test.cc | 111 + src/rocksdb/util/ribbon_alg.h | 1225 +++++ src/rocksdb/util/ribbon_config.cc | 506 ++ src/rocksdb/util/ribbon_config.h | 182 + src/rocksdb/util/ribbon_impl.h | 1137 +++++ src/rocksdb/util/ribbon_test.cc | 1308 ++++++ src/rocksdb/util/set_comparator.h | 24 + src/rocksdb/util/single_thread_executor.h | 56 + src/rocksdb/util/slice.cc | 405 ++ src/rocksdb/util/slice_test.cc | 191 + 
src/rocksdb/util/slice_transform_test.cc | 154 + src/rocksdb/util/status.cc | 154 + src/rocksdb/util/stderr_logger.cc | 30 + src/rocksdb/util/stderr_logger.h | 31 + src/rocksdb/util/stop_watch.h | 118 + src/rocksdb/util/string_util.cc | 504 ++ src/rocksdb/util/string_util.h | 177 + src/rocksdb/util/thread_guard.h | 41 + src/rocksdb/util/thread_list_test.cc | 360 ++ src/rocksdb/util/thread_local.cc | 521 +++ src/rocksdb/util/thread_local.h | 100 + src/rocksdb/util/thread_local_test.cc | 582 +++ src/rocksdb/util/thread_operation.h | 112 + src/rocksdb/util/threadpool_imp.cc | 551 +++ src/rocksdb/util/threadpool_imp.h | 120 + src/rocksdb/util/timer.h | 340 ++ src/rocksdb/util/timer_queue.h | 231 + src/rocksdb/util/timer_queue_test.cc | 73 + src/rocksdb/util/timer_test.cc | 402 ++ src/rocksdb/util/user_comparator_wrapper.h | 64 + src/rocksdb/util/vector_iterator.h | 118 + src/rocksdb/util/work_queue.h | 150 + src/rocksdb/util/work_queue_test.cc | 272 ++ src/rocksdb/util/xxhash.cc | 48 + src/rocksdb/util/xxhash.h | 5346 ++++++++++++++++++++++ src/rocksdb/util/xxph3.h | 1764 +++++++ 107 files changed, 35723 insertions(+) create mode 100644 src/rocksdb/util/aligned_buffer.h create mode 100644 src/rocksdb/util/async_file_reader.cc create mode 100644 src/rocksdb/util/async_file_reader.h create mode 100644 src/rocksdb/util/autovector.h create mode 100644 src/rocksdb/util/autovector_test.cc create mode 100644 src/rocksdb/util/bloom_impl.h create mode 100644 src/rocksdb/util/bloom_test.cc create mode 100644 src/rocksdb/util/build_version.cc.in create mode 100644 src/rocksdb/util/cast_util.h create mode 100644 src/rocksdb/util/channel.h create mode 100644 src/rocksdb/util/cleanable.cc create mode 100644 src/rocksdb/util/coding.cc create mode 100644 src/rocksdb/util/coding.h create mode 100644 src/rocksdb/util/coding_lean.h create mode 100644 src/rocksdb/util/coding_test.cc create mode 100644 src/rocksdb/util/compaction_job_stats_impl.cc create mode 100644 src/rocksdb/util/comparator.cc create mode 100644 src/rocksdb/util/compression.cc create mode 100644 src/rocksdb/util/compression.h create mode 100644 src/rocksdb/util/compression_context_cache.cc create mode 100644 src/rocksdb/util/compression_context_cache.h create mode 100644 src/rocksdb/util/concurrent_task_limiter_impl.cc create mode 100644 src/rocksdb/util/concurrent_task_limiter_impl.h create mode 100644 src/rocksdb/util/core_local.h create mode 100644 src/rocksdb/util/coro_utils.h create mode 100644 src/rocksdb/util/crc32c.cc create mode 100644 src/rocksdb/util/crc32c.h create mode 100644 src/rocksdb/util/crc32c_arm64.cc create mode 100644 src/rocksdb/util/crc32c_arm64.h create mode 100644 src/rocksdb/util/crc32c_ppc.c create mode 100644 src/rocksdb/util/crc32c_ppc.h create mode 100644 src/rocksdb/util/crc32c_ppc_asm.S create mode 100644 src/rocksdb/util/crc32c_ppc_constants.h create mode 100644 src/rocksdb/util/crc32c_test.cc create mode 100644 src/rocksdb/util/defer.h create mode 100644 src/rocksdb/util/defer_test.cc create mode 100644 src/rocksdb/util/distributed_mutex.h create mode 100644 src/rocksdb/util/duplicate_detector.h create mode 100644 src/rocksdb/util/dynamic_bloom.cc create mode 100644 src/rocksdb/util/dynamic_bloom.h create mode 100644 src/rocksdb/util/dynamic_bloom_test.cc create mode 100644 src/rocksdb/util/fastrange.h create mode 100644 src/rocksdb/util/file_checksum_helper.cc create mode 100644 src/rocksdb/util/file_checksum_helper.h create mode 100644 src/rocksdb/util/file_reader_writer_test.cc create mode 100644 
src/rocksdb/util/filelock_test.cc create mode 100644 src/rocksdb/util/filter_bench.cc create mode 100644 src/rocksdb/util/gflags_compat.h create mode 100644 src/rocksdb/util/hash.cc create mode 100644 src/rocksdb/util/hash.h create mode 100644 src/rocksdb/util/hash128.h create mode 100644 src/rocksdb/util/hash_containers.h create mode 100644 src/rocksdb/util/hash_map.h create mode 100644 src/rocksdb/util/hash_test.cc create mode 100644 src/rocksdb/util/heap.h create mode 100644 src/rocksdb/util/heap_test.cc create mode 100644 src/rocksdb/util/kv_map.h create mode 100644 src/rocksdb/util/log_write_bench.cc create mode 100644 src/rocksdb/util/math.h create mode 100644 src/rocksdb/util/math128.h create mode 100644 src/rocksdb/util/murmurhash.cc create mode 100644 src/rocksdb/util/murmurhash.h create mode 100644 src/rocksdb/util/mutexlock.h create mode 100644 src/rocksdb/util/ppc-opcode.h create mode 100644 src/rocksdb/util/random.cc create mode 100644 src/rocksdb/util/random.h create mode 100644 src/rocksdb/util/random_test.cc create mode 100644 src/rocksdb/util/rate_limiter.cc create mode 100644 src/rocksdb/util/rate_limiter.h create mode 100644 src/rocksdb/util/rate_limiter_test.cc create mode 100644 src/rocksdb/util/repeatable_thread.h create mode 100644 src/rocksdb/util/repeatable_thread_test.cc create mode 100644 src/rocksdb/util/ribbon_alg.h create mode 100644 src/rocksdb/util/ribbon_config.cc create mode 100644 src/rocksdb/util/ribbon_config.h create mode 100644 src/rocksdb/util/ribbon_impl.h create mode 100644 src/rocksdb/util/ribbon_test.cc create mode 100644 src/rocksdb/util/set_comparator.h create mode 100644 src/rocksdb/util/single_thread_executor.h create mode 100644 src/rocksdb/util/slice.cc create mode 100644 src/rocksdb/util/slice_test.cc create mode 100644 src/rocksdb/util/slice_transform_test.cc create mode 100644 src/rocksdb/util/status.cc create mode 100644 src/rocksdb/util/stderr_logger.cc create mode 100644 src/rocksdb/util/stderr_logger.h create mode 100644 src/rocksdb/util/stop_watch.h create mode 100644 src/rocksdb/util/string_util.cc create mode 100644 src/rocksdb/util/string_util.h create mode 100644 src/rocksdb/util/thread_guard.h create mode 100644 src/rocksdb/util/thread_list_test.cc create mode 100644 src/rocksdb/util/thread_local.cc create mode 100644 src/rocksdb/util/thread_local.h create mode 100644 src/rocksdb/util/thread_local_test.cc create mode 100644 src/rocksdb/util/thread_operation.h create mode 100644 src/rocksdb/util/threadpool_imp.cc create mode 100644 src/rocksdb/util/threadpool_imp.h create mode 100644 src/rocksdb/util/timer.h create mode 100644 src/rocksdb/util/timer_queue.h create mode 100644 src/rocksdb/util/timer_queue_test.cc create mode 100644 src/rocksdb/util/timer_test.cc create mode 100644 src/rocksdb/util/user_comparator_wrapper.h create mode 100644 src/rocksdb/util/vector_iterator.h create mode 100644 src/rocksdb/util/work_queue.h create mode 100644 src/rocksdb/util/work_queue_test.cc create mode 100644 src/rocksdb/util/xxhash.cc create mode 100644 src/rocksdb/util/xxhash.h create mode 100644 src/rocksdb/util/xxph3.h (limited to 'src/rocksdb/util') diff --git a/src/rocksdb/util/aligned_buffer.h b/src/rocksdb/util/aligned_buffer.h new file mode 100644 index 000000000..95ee5dfe8 --- /dev/null +++ b/src/rocksdb/util/aligned_buffer.h @@ -0,0 +1,234 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include + +#include "port/port.h" + +namespace ROCKSDB_NAMESPACE { + +// This file contains utilities to handle the alignment of pages and buffers. + +// Truncate to a multiple of page_size, which is also a page boundary. This +// helps to figuring out the right alignment. +// Example: +// TruncateToPageBoundary(4096, 5000) => 4096 +// TruncateToPageBoundary((4096, 10000) => 8192 +inline size_t TruncateToPageBoundary(size_t page_size, size_t s) { + s -= (s & (page_size - 1)); + assert((s % page_size) == 0); + return s; +} + +// Round up x to a multiple of y. +// Example: +// Roundup(13, 5) => 15 +// Roundup(201, 16) => 208 +inline size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; } + +// Round down x to a multiple of y. +// Example: +// Rounddown(13, 5) => 10 +// Rounddown(201, 16) => 192 +inline size_t Rounddown(size_t x, size_t y) { return (x / y) * y; } + +// AlignedBuffer manages a buffer by taking alignment into consideration, and +// aligns the buffer start and end positions. It is mainly used for direct I/O, +// though it can be used other purposes as well. +// It also supports expanding the managed buffer, and copying whole or part of +// the data from old buffer into the new expanded buffer. Such a copy especially +// helps in cases avoiding an IO to re-fetch the data from disk. +// +// Example: +// AlignedBuffer buf; +// buf.Alignment(alignment); +// buf.AllocateNewBuffer(user_requested_buf_size); +// ... +// buf.AllocateNewBuffer(2*user_requested_buf_size, /*copy_data*/ true, +// copy_offset, copy_len); +class AlignedBuffer { + size_t alignment_; + std::unique_ptr buf_; + size_t capacity_; + size_t cursize_; + char* bufstart_; + + public: + AlignedBuffer() + : alignment_(), capacity_(0), cursize_(0), bufstart_(nullptr) {} + + AlignedBuffer(AlignedBuffer&& o) noexcept { *this = std::move(o); } + + AlignedBuffer& operator=(AlignedBuffer&& o) noexcept { + alignment_ = std::move(o.alignment_); + buf_ = std::move(o.buf_); + capacity_ = std::move(o.capacity_); + cursize_ = std::move(o.cursize_); + bufstart_ = std::move(o.bufstart_); + return *this; + } + + AlignedBuffer(const AlignedBuffer&) = delete; + + AlignedBuffer& operator=(const AlignedBuffer&) = delete; + + static bool isAligned(const void* ptr, size_t alignment) { + return reinterpret_cast(ptr) % alignment == 0; + } + + static bool isAligned(size_t n, size_t alignment) { + return n % alignment == 0; + } + + size_t Alignment() const { return alignment_; } + + size_t Capacity() const { return capacity_; } + + size_t CurrentSize() const { return cursize_; } + + const char* BufferStart() const { return bufstart_; } + + char* BufferStart() { return bufstart_; } + + void Clear() { cursize_ = 0; } + + char* Release() { + cursize_ = 0; + capacity_ = 0; + bufstart_ = nullptr; + return buf_.release(); + } + + void Alignment(size_t alignment) { + assert(alignment > 0); + assert((alignment & (alignment - 1)) == 0); + alignment_ = alignment; + } + + // Allocates a new buffer and sets the start position to the first aligned + // byte. + // + // requested_capacity: requested new buffer capacity. 
This capacity will be + // rounded up based on alignment. + // copy_data: Copy data from old buffer to new buffer. If copy_offset and + // copy_len are not passed in and the new requested capacity is bigger + // than the existing buffer's capacity, the data in the exising buffer is + // fully copied over to the new buffer. + // copy_offset: Copy data from this offset in old buffer. + // copy_len: Number of bytes to copy. + // + // The function does nothing if the new requested_capacity is smaller than + // the current buffer capacity and copy_data is true i.e. the old buffer is + // retained as is. + void AllocateNewBuffer(size_t requested_capacity, bool copy_data = false, + uint64_t copy_offset = 0, size_t copy_len = 0) { + assert(alignment_ > 0); + assert((alignment_ & (alignment_ - 1)) == 0); + + copy_len = copy_len > 0 ? copy_len : cursize_; + if (copy_data && requested_capacity < copy_len) { + // If we are downsizing to a capacity that is smaller than the current + // data in the buffer -- Ignore the request. + return; + } + + size_t new_capacity = Roundup(requested_capacity, alignment_); + char* new_buf = new char[new_capacity + alignment_]; + char* new_bufstart = reinterpret_cast( + (reinterpret_cast(new_buf) + (alignment_ - 1)) & + ~static_cast(alignment_ - 1)); + + if (copy_data) { + assert(bufstart_ + copy_offset + copy_len <= bufstart_ + cursize_); + memcpy(new_bufstart, bufstart_ + copy_offset, copy_len); + cursize_ = copy_len; + } else { + cursize_ = 0; + } + + bufstart_ = new_bufstart; + capacity_ = new_capacity; + buf_.reset(new_buf); + } + + // Append to the buffer. + // + // src : source to copy the data from. + // append_size : number of bytes to copy from src. + // Returns the number of bytes appended. + // + // If append_size is more than the remaining buffer size only the + // remaining-size worth of bytes are copied. + size_t Append(const char* src, size_t append_size) { + size_t buffer_remaining = capacity_ - cursize_; + size_t to_copy = std::min(append_size, buffer_remaining); + + if (to_copy > 0) { + memcpy(bufstart_ + cursize_, src, to_copy); + cursize_ += to_copy; + } + return to_copy; + } + + // Read from the buffer. + // + // dest : destination buffer to copy the data to. + // offset : the buffer offset to start reading from. + // read_size : the number of bytes to copy from the buffer to dest. + // Returns the number of bytes read/copied to dest. + size_t Read(char* dest, size_t offset, size_t read_size) const { + assert(offset < cursize_); + + size_t to_read = 0; + if (offset < cursize_) { + to_read = std::min(cursize_ - offset, read_size); + } + if (to_read > 0) { + memcpy(dest, bufstart_ + offset, to_read); + } + return to_read; + } + + // Pad to the end of alignment with "padding" + void PadToAlignmentWith(int padding) { + size_t total_size = Roundup(cursize_, alignment_); + size_t pad_size = total_size - cursize_; + + if (pad_size > 0) { + assert((pad_size + cursize_) <= capacity_); + memset(bufstart_ + cursize_, padding, pad_size); + cursize_ += pad_size; + } + } + + void PadWith(size_t pad_size, int padding) { + assert((pad_size + cursize_) <= capacity_); + memset(bufstart_ + cursize_, padding, pad_size); + cursize_ += pad_size; + } + + // After a partial flush move the tail to the beginning of the buffer. + void RefitTail(size_t tail_offset, size_t tail_size) { + if (tail_size > 0) { + memmove(bufstart_, bufstart_ + tail_offset, tail_size); + } + cursize_ = tail_size; + } + + // Returns a place to start appending. 
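// Editorial sketch, not part of the upstream patch: a self-contained
// illustration of the alignment arithmetic AlignedBuffer relies on --
// Roundup() to size the allocation and the (ptr + align - 1) & ~(align - 1)
// trick AllocateNewBuffer uses to find the first aligned byte. All names in
// this sketch (aligned_buffer_sketch, AlignUp, Demo) are invented for
// illustration only.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <memory>

namespace aligned_buffer_sketch {

// Round x up to the next multiple of y (same formula as Roundup above).
inline size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; }

// Round a pointer up to the next multiple of a power-of-two alignment.
inline char* AlignUp(char* p, size_t alignment) {
  return reinterpret_cast<char*>(
      (reinterpret_cast<uintptr_t>(p) + (alignment - 1)) &
      ~static_cast<uintptr_t>(alignment - 1));
}

inline void Demo() {
  constexpr size_t kAlignment = 4096;  // e.g. a direct-I/O page size
  // Requesting 5000 usable bytes rounds the capacity up to 8192.
  const size_t capacity = Roundup(5000, kAlignment);
  // Over-allocate by one alignment unit so an aligned start always fits.
  std::unique_ptr<char[]> raw(new char[capacity + kAlignment]);
  char* start = AlignUp(raw.get(), kAlignment);
  assert(reinterpret_cast<uintptr_t>(start) % kAlignment == 0);
  (void)start;
}

}  // namespace aligned_buffer_sketch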
+ // WARNING: Note that it is possible to write past the end of the buffer if + // the buffer is modified without using the write APIs or encapsulation + // offered by AlignedBuffer. It is up to the user to guard against such + // errors. + char* Destination() { return bufstart_ + cursize_; } + + void Size(size_t cursize) { cursize_ = cursize; } +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/async_file_reader.cc b/src/rocksdb/util/async_file_reader.cc new file mode 100644 index 000000000..8401a6b44 --- /dev/null +++ b/src/rocksdb/util/async_file_reader.cc @@ -0,0 +1,73 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#if USE_COROUTINES +#include "util/async_file_reader.h" + +namespace ROCKSDB_NAMESPACE { +bool AsyncFileReader::MultiReadAsyncImpl(ReadAwaiter* awaiter) { + if (tail_) { + tail_->next_ = awaiter; + } + tail_ = awaiter; + if (!head_) { + head_ = awaiter; + } + num_reqs_ += awaiter->num_reqs_; + awaiter->io_handle_.resize(awaiter->num_reqs_); + awaiter->del_fn_.resize(awaiter->num_reqs_); + for (size_t i = 0; i < awaiter->num_reqs_; ++i) { + awaiter->file_ + ->ReadAsync( + awaiter->read_reqs_[i], awaiter->opts_, + [](const FSReadRequest& req, void* cb_arg) { + FSReadRequest* read_req = static_cast(cb_arg); + read_req->status = req.status; + read_req->result = req.result; + }, + &awaiter->read_reqs_[i], &awaiter->io_handle_[i], + &awaiter->del_fn_[i], /*aligned_buf=*/nullptr) + .PermitUncheckedError(); + } + return true; +} + +void AsyncFileReader::Wait() { + if (!head_) { + return; + } + ReadAwaiter* waiter; + std::vector io_handles; + io_handles.reserve(num_reqs_); + waiter = head_; + do { + for (size_t i = 0; i < waiter->num_reqs_; ++i) { + if (waiter->io_handle_[i]) { + io_handles.push_back(waiter->io_handle_[i]); + } + } + } while (waiter != tail_ && (waiter = waiter->next_)); + if (io_handles.size() > 0) { + StopWatch sw(SystemClock::Default().get(), stats_, POLL_WAIT_MICROS); + fs_->Poll(io_handles, io_handles.size()).PermitUncheckedError(); + } + do { + waiter = head_; + head_ = waiter->next_; + + for (size_t i = 0; i < waiter->num_reqs_; ++i) { + if (waiter->io_handle_[i] && waiter->del_fn_[i]) { + waiter->del_fn_[i](waiter->io_handle_[i]); + } + } + waiter->awaiting_coro_.resume(); + } while (waiter != tail_); + head_ = tail_ = nullptr; + RecordInHistogram(stats_, MULTIGET_IO_BATCH_SIZE, num_reqs_); + num_reqs_ = 0; +} +} // namespace ROCKSDB_NAMESPACE +#endif // USE_COROUTINES diff --git a/src/rocksdb/util/async_file_reader.h b/src/rocksdb/util/async_file_reader.h new file mode 100644 index 000000000..df69a840e --- /dev/null +++ b/src/rocksdb/util/async_file_reader.h @@ -0,0 +1,144 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. 
+// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory).#pragma once +#pragma once + +#if USE_COROUTINES +#include "file/random_access_file_reader.h" +#include "folly/experimental/coro/ViaIfAsync.h" +#include "port/port.h" +#include "rocksdb/file_system.h" +#include "rocksdb/statistics.h" +#include "util/autovector.h" +#include "util/stop_watch.h" + +namespace ROCKSDB_NAMESPACE { +class SingleThreadExecutor; + +// AsyncFileReader implements the Awaitable concept, which allows calling +// coroutines to co_await it. When the AsyncFileReader Awaitable is +// resumed, it initiates the fie reads requested by the awaiting caller +// by calling RandomAccessFileReader's ReadAsync. It then suspends the +// awaiting coroutine. The suspended awaiter is later resumed by Wait(). +class AsyncFileReader { + class ReadAwaiter; + template + class ReadOperation; + + public: + AsyncFileReader(FileSystem* fs, Statistics* stats) : fs_(fs), stats_(stats) {} + + ~AsyncFileReader() {} + + ReadOperation MultiReadAsync(RandomAccessFileReader* file, + const IOOptions& opts, + FSReadRequest* read_reqs, + size_t num_reqs, + AlignedBuf* aligned_buf) noexcept { + return ReadOperation{*this, file, opts, + read_reqs, num_reqs, aligned_buf}; + } + + private: + friend SingleThreadExecutor; + + // Implementation of the Awaitable concept + class ReadAwaiter { + public: + explicit ReadAwaiter(AsyncFileReader& reader, RandomAccessFileReader* file, + const IOOptions& opts, FSReadRequest* read_reqs, + size_t num_reqs, AlignedBuf* /*aligned_buf*/) noexcept + : reader_(reader), + file_(file), + opts_(opts), + read_reqs_(read_reqs), + num_reqs_(num_reqs), + next_(nullptr) {} + + bool await_ready() noexcept { return false; } + + // A return value of true means suspend the awaiter (calling coroutine). The + // awaiting_coro parameter is the handle of the awaiter. The handle can be + // resumed later, so we cache it here. + bool await_suspend( + folly::coro::impl::coroutine_handle<> awaiting_coro) noexcept { + awaiting_coro_ = awaiting_coro; + // MultiReadAsyncImpl always returns true, so caller will be suspended + return reader_.MultiReadAsyncImpl(this); + } + + void await_resume() noexcept {} + + private: + friend AsyncFileReader; + + // The parameters passed to MultiReadAsync are cached here when the caller + // calls MultiReadAsync. Later, when the execution of this awaitable is + // started, these are used to do the actual IO + AsyncFileReader& reader_; + RandomAccessFileReader* file_; + const IOOptions& opts_; + FSReadRequest* read_reqs_; + size_t num_reqs_; + autovector io_handle_; + autovector del_fn_; + folly::coro::impl::coroutine_handle<> awaiting_coro_; + // Use this to link to the next ReadAwaiter in the suspended coroutine + // list. The head and tail of the list are tracked by AsyncFileReader. + // We use this approach rather than an STL container in order to avoid + // extra memory allocations. The coroutine call already allocates a + // ReadAwaiter object. + ReadAwaiter* next_; + }; + + // An instance of ReadOperation is returned to the caller of MultiGetAsync. + // This represents an awaitable that can be started later. 
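// Editorial sketch, not part of the upstream patch: the intrusive
// singly-linked queue idea behind ReadAwaiter::next_ and the head_/tail_
// pointers in AsyncFileReader. Each suspended awaiter carries its own link,
// so enqueueing needs no extra allocation. Node and Queue are invented names
// for illustration only; the drain loop mirrors Wait() above.

namespace awaiter_queue_sketch {

struct Node {
  Node* next_ = nullptr;  // link to the next suspended awaiter, if any
};

class Queue {
 public:
  // Mirrors MultiReadAsyncImpl's enqueue: append at the tail, fix up head.
  void Append(Node* n) {
    if (tail_) {
      tail_->next_ = n;
    }
    tail_ = n;
    if (!head_) {
      head_ = n;
    }
  }

  // Mirrors Wait()'s drain: visit every queued node once, then reset.
  template <typename Fn>
  void Drain(Fn&& fn) {
    if (!head_) {
      return;
    }
    Node* n;
    do {
      n = head_;
      head_ = n->next_;
      fn(n);  // e.g. release IO handles and resume the awaiting coroutine
    } while (n != tail_);
    head_ = tail_ = nullptr;
  }

 private:
  Node* head_ = nullptr;
  Node* tail_ = nullptr;
};

}  // namespace awaiter_queue_sketch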
+ template + class ReadOperation { + public: + explicit ReadOperation(AsyncFileReader& reader, + RandomAccessFileReader* file, const IOOptions& opts, + FSReadRequest* read_reqs, size_t num_reqs, + AlignedBuf* aligned_buf) noexcept + : reader_(reader), + file_(file), + opts_(opts), + read_reqs_(read_reqs), + num_reqs_(num_reqs), + aligned_buf_(aligned_buf) {} + + auto viaIfAsync(folly::Executor::KeepAlive<> executor) const { + return folly::coro::co_viaIfAsync( + std::move(executor), + Awaiter{reader_, file_, opts_, read_reqs_, num_reqs_, aligned_buf_}); + } + + private: + AsyncFileReader& reader_; + RandomAccessFileReader* file_; + const IOOptions& opts_; + FSReadRequest* read_reqs_; + size_t num_reqs_; + AlignedBuf* aligned_buf_; + }; + + // This function does the actual work when this awaitable starts execution + bool MultiReadAsyncImpl(ReadAwaiter* awaiter); + + // Called by the SingleThreadExecutor to poll for async IO completion. + // This also resumes the awaiting coroutines. + void Wait(); + + // Head of the queue of awaiters waiting for async IO completion + ReadAwaiter* head_ = nullptr; + // Tail of the awaiter queue + ReadAwaiter* tail_ = nullptr; + // Total number of pending async IOs + size_t num_reqs_ = 0; + FileSystem* fs_; + Statistics* stats_; +}; +} // namespace ROCKSDB_NAMESPACE +#endif // USE_COROUTINES diff --git a/src/rocksdb/util/autovector.h b/src/rocksdb/util/autovector.h new file mode 100644 index 000000000..f758473b7 --- /dev/null +++ b/src/rocksdb/util/autovector.h @@ -0,0 +1,406 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "port/lang.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +#ifdef ROCKSDB_LITE +template +class autovector : public std::vector { + using std::vector::vector; + + public: + autovector() { + // Make sure the initial vector has space for kSize elements + std::vector::reserve(kSize); + } +}; +#else +// A vector that leverages pre-allocated stack-based array to achieve better +// performance for array with small amount of items. +// +// The interface resembles that of vector, but with less features since we aim +// to solve the problem that we have in hand, rather than implementing a +// full-fledged generic container. +// +// Currently we don't support: +// * shrink_to_fit() +// If used correctly, in most cases, people should not touch the +// underlying vector at all. +// * random insert()/erase(), please only use push_back()/pop_back(). +// * No move/swap operations. Each autovector instance has a +// stack-allocated array and if we want support move/swap operations, we +// need to copy the arrays other than just swapping the pointers. In this +// case we'll just explicitly forbid these operations since they may +// lead users to make false assumption by thinking they are inexpensive +// operations. +// +// Naming style of public methods almost follows that of the STL's. +template +class autovector { + public: + // General STL-style container member types. 
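// Editorial sketch, not part of the upstream patch: the core small-buffer
// idea of autovector in miniature -- keep the first kSize elements in an
// in-object array and spill any extras into a std::vector. This toy version
// (SmallVector, an invented name) requires a default-constructible, copyable
// T and omits iterators, resize() and the other autovector features.

#include <cstddef>
#include <vector>

namespace small_vector_sketch {

template <class T, size_t kSize = 8>
class SmallVector {
 public:
  void push_back(const T& v) {
    if (num_stack_items_ < kSize) {
      stack_[num_stack_items_++] = v;  // fast path: no heap allocation
    } else {
      heap_.push_back(v);              // slow path: spill to the heap vector
    }
  }

  size_t size() const { return num_stack_items_ + heap_.size(); }

  const T& operator[](size_t n) const {
    return n < kSize ? stack_[n] : heap_[n - kSize];
  }

  // True while nothing has ever spilled to the heap (cf. only_in_stack()).
  bool only_in_stack() const { return heap_.capacity() == 0; }

 private:
  size_t num_stack_items_ = 0;
  T stack_[kSize];       // first kSize items live inside the object
  std::vector<T> heap_;  // used only once kSize is exceeded
};

}  // namespace small_vector_sketch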
+ using value_type = T; + using difference_type = typename std::vector::difference_type; + using size_type = typename std::vector::size_type; + using reference = value_type&; + using const_reference = const value_type&; + using pointer = value_type*; + using const_pointer = const value_type*; + + // This class is the base for regular/const iterator + template + class iterator_impl { + public: + // -- iterator traits + using self_type = iterator_impl; + using value_type = TValueType; + using reference = TValueType&; + using pointer = TValueType*; + using difference_type = typename TAutoVector::difference_type; + using iterator_category = std::random_access_iterator_tag; + + iterator_impl(TAutoVector* vect, size_t index) + : vect_(vect), index_(index){}; + iterator_impl(const iterator_impl&) = default; + ~iterator_impl() {} + iterator_impl& operator=(const iterator_impl&) = default; + + // -- Advancement + // ++iterator + self_type& operator++() { + ++index_; + return *this; + } + + // iterator++ + self_type operator++(int) { + auto old = *this; + ++index_; + return old; + } + + // --iterator + self_type& operator--() { + --index_; + return *this; + } + + // iterator-- + self_type operator--(int) { + auto old = *this; + --index_; + return old; + } + + self_type operator-(difference_type len) const { + return self_type(vect_, index_ - len); + } + + difference_type operator-(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ - other.index_; + } + + self_type operator+(difference_type len) const { + return self_type(vect_, index_ + len); + } + + self_type& operator+=(difference_type len) { + index_ += len; + return *this; + } + + self_type& operator-=(difference_type len) { + index_ -= len; + return *this; + } + + // -- Reference + reference operator*() const { + assert(vect_->size() >= index_); + return (*vect_)[index_]; + } + + pointer operator->() const { + assert(vect_->size() >= index_); + return &(*vect_)[index_]; + } + + reference operator[](difference_type len) const { return *(*this + len); } + + // -- Logical Operators + bool operator==(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ == other.index_; + } + + bool operator!=(const self_type& other) const { return !(*this == other); } + + bool operator>(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ > other.index_; + } + + bool operator<(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ < other.index_; + } + + bool operator>=(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ >= other.index_; + } + + bool operator<=(const self_type& other) const { + assert(vect_ == other.vect_); + return index_ <= other.index_; + } + + private: + TAutoVector* vect_ = nullptr; + size_t index_ = 0; + }; + + using iterator = iterator_impl; + using const_iterator = iterator_impl; + using reverse_iterator = std::reverse_iterator; + using const_reverse_iterator = std::reverse_iterator; + + autovector() : values_(reinterpret_cast(buf_)) {} + + autovector(std::initializer_list init_list) + : values_(reinterpret_cast(buf_)) { + for (const T& item : init_list) { + push_back(item); + } + } + + ~autovector() { clear(); } + + // -- Immutable operations + // Indicate if all data resides in in-stack data structure. + bool only_in_stack() const { + // If no element was inserted at all, the vector's capacity will be `0`. 
+ return vect_.capacity() == 0; + } + + size_type size() const { return num_stack_items_ + vect_.size(); } + + // resize does not guarantee anything about the contents of the newly + // available elements + void resize(size_type n) { + if (n > kSize) { + vect_.resize(n - kSize); + while (num_stack_items_ < kSize) { + new ((void*)(&values_[num_stack_items_++])) value_type(); + } + num_stack_items_ = kSize; + } else { + vect_.clear(); + while (num_stack_items_ < n) { + new ((void*)(&values_[num_stack_items_++])) value_type(); + } + while (num_stack_items_ > n) { + values_[--num_stack_items_].~value_type(); + } + } + } + + bool empty() const { return size() == 0; } + + size_type capacity() const { return kSize + vect_.capacity(); } + + void reserve(size_t cap) { + if (cap > kSize) { + vect_.reserve(cap - kSize); + } + + assert(cap <= capacity()); + } + + const_reference operator[](size_type n) const { + assert(n < size()); + if (n < kSize) { + return values_[n]; + } + return vect_[n - kSize]; + } + + reference operator[](size_type n) { + assert(n < size()); + if (n < kSize) { + return values_[n]; + } + return vect_[n - kSize]; + } + + const_reference at(size_type n) const { + assert(n < size()); + return (*this)[n]; + } + + reference at(size_type n) { + assert(n < size()); + return (*this)[n]; + } + + reference front() { + assert(!empty()); + return *begin(); + } + + const_reference front() const { + assert(!empty()); + return *begin(); + } + + reference back() { + assert(!empty()); + return *(end() - 1); + } + + const_reference back() const { + assert(!empty()); + return *(end() - 1); + } + + // -- Mutable Operations + void push_back(T&& item) { + if (num_stack_items_ < kSize) { + new ((void*)(&values_[num_stack_items_])) value_type(); + values_[num_stack_items_++] = std::move(item); + } else { + vect_.push_back(item); + } + } + + void push_back(const T& item) { + if (num_stack_items_ < kSize) { + new ((void*)(&values_[num_stack_items_])) value_type(); + values_[num_stack_items_++] = item; + } else { + vect_.push_back(item); + } + } + + template +#if _LIBCPP_STD_VER > 14 + reference emplace_back(Args&&... args) { + if (num_stack_items_ < kSize) { + return *(new ((void*)(&values_[num_stack_items_++])) + value_type(std::forward(args)...)); + } else { + return vect_.emplace_back(std::forward(args)...); + } + } +#else + void emplace_back(Args&&... 
args) { + if (num_stack_items_ < kSize) { + new ((void*)(&values_[num_stack_items_++])) + value_type(std::forward(args)...); + } else { + vect_.emplace_back(std::forward(args)...); + } + } +#endif + + void pop_back() { + assert(!empty()); + if (!vect_.empty()) { + vect_.pop_back(); + } else { + values_[--num_stack_items_].~value_type(); + } + } + + void clear() { + while (num_stack_items_ > 0) { + values_[--num_stack_items_].~value_type(); + } + vect_.clear(); + } + + // -- Copy and Assignment + autovector& assign(const autovector& other); + + autovector(const autovector& other) { assign(other); } + + autovector& operator=(const autovector& other) { return assign(other); } + + autovector(autovector&& other) noexcept { *this = std::move(other); } + autovector& operator=(autovector&& other); + + // -- Iterator Operations + iterator begin() { return iterator(this, 0); } + + const_iterator begin() const { return const_iterator(this, 0); } + + iterator end() { return iterator(this, this->size()); } + + const_iterator end() const { return const_iterator(this, this->size()); } + + reverse_iterator rbegin() { return reverse_iterator(end()); } + + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + + reverse_iterator rend() { return reverse_iterator(begin()); } + + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + private: + size_type num_stack_items_ = 0; // current number of items + alignas(alignof( + value_type)) char buf_[kSize * + sizeof(value_type)]; // the first `kSize` items + pointer values_; + // used only if there are more than `kSize` items. + std::vector vect_; +}; + +template +autovector& autovector::assign( + const autovector& other) { + values_ = reinterpret_cast(buf_); + // copy the internal vector + vect_.assign(other.vect_.begin(), other.vect_.end()); + + // copy array + num_stack_items_ = other.num_stack_items_; + std::copy(other.values_, other.values_ + num_stack_items_, values_); + + return *this; +} + +template +autovector& autovector::operator=( + autovector&& other) { + values_ = reinterpret_cast(buf_); + vect_ = std::move(other.vect_); + size_t n = other.num_stack_items_; + num_stack_items_ = n; + other.num_stack_items_ = 0; + for (size_t i = 0; i < n; ++i) { + values_[i] = std::move(other.values_[i]); + } + return *this; +} + +#endif // ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/autovector_test.cc b/src/rocksdb/util/autovector_test.cc new file mode 100644 index 000000000..8c7c39ce6 --- /dev/null +++ b/src/rocksdb/util/autovector_test.cc @@ -0,0 +1,331 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
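// Editorial sketch, not part of the upstream patch: the placement-new
// lifetime pattern that autovector's push_back/pop_back/clear use above --
// elements live in raw, suitably aligned in-object storage, are constructed
// with placement new, and must be destroyed explicitly. StackStorage and
// slot() are invented names for illustration only.

#include <cassert>
#include <cstddef>
#include <new>
#include <utility>

namespace placement_new_sketch {

template <class T, size_t kSize = 8>
class StackStorage {
 public:
  ~StackStorage() { clear(); }

  void push_back(T v) {
    assert(count_ < kSize);
    // Construct T in place inside the raw buffer; no heap allocation.
    new (static_cast<void*>(&slot(count_))) T(std::move(v));
    ++count_;
  }

  void pop_back() {
    assert(count_ > 0);
    // Raw storage never runs destructors on its own; call one explicitly.
    slot(--count_).~T();
  }

  void clear() {
    while (count_ > 0) {
      pop_back();
    }
  }

  size_t size() const { return count_; }

 private:
  T& slot(size_t i) { return *reinterpret_cast<T*>(buf_ + i * sizeof(T)); }

  size_t count_ = 0;
  alignas(alignof(T)) char buf_[kSize * sizeof(T)];  // raw, aligned storage
};

}  // namespace placement_new_sketch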
+ +#include "util/autovector.h" + +#include +#include +#include +#include + +#include "rocksdb/env.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/string_util.h" + +using std::cout; +using std::endl; + +namespace ROCKSDB_NAMESPACE { + +class AutoVectorTest : public testing::Test {}; +const unsigned long kSize = 8; + +namespace { +template +void AssertAutoVectorOnlyInStack(autovector* vec, bool result) { +#ifndef ROCKSDB_LITE + ASSERT_EQ(vec->only_in_stack(), result); +#else + (void)vec; + (void)result; +#endif // !ROCKSDB_LITE +} +} // namespace + +TEST_F(AutoVectorTest, PushBackAndPopBack) { + autovector vec; + ASSERT_TRUE(vec.empty()); + ASSERT_EQ(0ul, vec.size()); + + for (size_t i = 0; i < 1000 * kSize; ++i) { + vec.push_back(i); + ASSERT_TRUE(!vec.empty()); + if (i < kSize) { + AssertAutoVectorOnlyInStack(&vec, true); + } else { + AssertAutoVectorOnlyInStack(&vec, false); + } + ASSERT_EQ(i + 1, vec.size()); + ASSERT_EQ(i, vec[i]); + ASSERT_EQ(i, vec.at(i)); + } + + size_t size = vec.size(); + while (size != 0) { + vec.pop_back(); + // will always be in heap + AssertAutoVectorOnlyInStack(&vec, false); + ASSERT_EQ(--size, vec.size()); + } + + ASSERT_TRUE(vec.empty()); +} + +TEST_F(AutoVectorTest, EmplaceBack) { + using ValType = std::pair; + autovector vec; + + for (size_t i = 0; i < 1000 * kSize; ++i) { + vec.emplace_back(i, std::to_string(i + 123)); + ASSERT_TRUE(!vec.empty()); + if (i < kSize) { + AssertAutoVectorOnlyInStack(&vec, true); + } else { + AssertAutoVectorOnlyInStack(&vec, false); + } + + ASSERT_EQ(i + 1, vec.size()); + ASSERT_EQ(i, vec[i].first); + ASSERT_EQ(std::to_string(i + 123), vec[i].second); + } + + vec.clear(); + ASSERT_TRUE(vec.empty()); + AssertAutoVectorOnlyInStack(&vec, false); +} + +TEST_F(AutoVectorTest, Resize) { + autovector vec; + + vec.resize(kSize); + AssertAutoVectorOnlyInStack(&vec, true); + for (size_t i = 0; i < kSize; ++i) { + vec[i] = i; + } + + vec.resize(kSize * 2); + AssertAutoVectorOnlyInStack(&vec, false); + for (size_t i = 0; i < kSize; ++i) { + ASSERT_EQ(vec[i], i); + } + for (size_t i = 0; i < kSize; ++i) { + vec[i + kSize] = i; + } + + vec.resize(1); + ASSERT_EQ(1U, vec.size()); +} + +namespace { +void AssertEqual(const autovector& a, + const autovector& b) { + ASSERT_EQ(a.size(), b.size()); + ASSERT_EQ(a.empty(), b.empty()); +#ifndef ROCKSDB_LITE + ASSERT_EQ(a.only_in_stack(), b.only_in_stack()); +#endif // !ROCKSDB_LITE + for (size_t i = 0; i < a.size(); ++i) { + ASSERT_EQ(a[i], b[i]); + } +} +} // namespace + +TEST_F(AutoVectorTest, CopyAndAssignment) { + // Test both heap-allocated and stack-allocated cases. 
+ for (auto size : {kSize / 2, kSize * 1000}) { + autovector vec; + for (size_t i = 0; i < size; ++i) { + vec.push_back(i); + } + + { + autovector other; + other = vec; + AssertEqual(other, vec); + } + + { + autovector other(vec); + AssertEqual(other, vec); + } + } +} + +TEST_F(AutoVectorTest, Iterators) { + autovector vec; + for (size_t i = 0; i < kSize * 1000; ++i) { + vec.push_back(std::to_string(i)); + } + + // basic operator test + ASSERT_EQ(vec.front(), *vec.begin()); + ASSERT_EQ(vec.back(), *(vec.end() - 1)); + ASSERT_TRUE(vec.begin() < vec.end()); + + // non-const iterator + size_t index = 0; + for (const auto& item : vec) { + ASSERT_EQ(vec[index++], item); + } + + index = vec.size() - 1; + for (auto pos = vec.rbegin(); pos != vec.rend(); ++pos) { + ASSERT_EQ(vec[index--], *pos); + } + + // const iterator + const auto& cvec = vec; + index = 0; + for (const auto& item : cvec) { + ASSERT_EQ(cvec[index++], item); + } + + index = vec.size() - 1; + for (auto pos = cvec.rbegin(); pos != cvec.rend(); ++pos) { + ASSERT_EQ(cvec[index--], *pos); + } + + // forward and backward + auto pos = vec.begin(); + while (pos != vec.end()) { + auto old_val = *pos; + auto old = pos++; + // HACK: make sure -> works + ASSERT_TRUE(!old->empty()); + ASSERT_EQ(old_val, *old); + ASSERT_TRUE(pos == vec.end() || old_val != *pos); + } + + pos = vec.begin(); + for (size_t i = 0; i < vec.size(); i += 2) { + // Cannot use ASSERT_EQ since that macro depends on iostream serialization + ASSERT_TRUE(pos + 2 - 2 == pos); + pos += 2; + ASSERT_TRUE(pos >= vec.begin()); + ASSERT_TRUE(pos <= vec.end()); + + size_t diff = static_cast(pos - vec.begin()); + ASSERT_EQ(i + 2, diff); + } +} + +namespace { +std::vector GetTestKeys(size_t size) { + std::vector keys; + keys.resize(size); + + int index = 0; + for (auto& key : keys) { + key = "item-" + std::to_string(index++); + } + return keys; +} +} // namespace + +template +void BenchmarkVectorCreationAndInsertion( + std::string name, size_t ops, size_t item_size, + const std::vector& items) { + auto env = Env::Default(); + + int index = 0; + auto start_time = env->NowNanos(); + auto ops_remaining = ops; + while (ops_remaining--) { + TVector v; + for (size_t i = 0; i < item_size; ++i) { + v.push_back(items[index++]); + } + } + auto elapsed = env->NowNanos() - start_time; + cout << "created " << ops << " " << name << " instances:\n\t" + << "each was inserted with " << item_size << " elements\n\t" + << "total time elapsed: " << elapsed << " (ns)" << endl; +} + +template +size_t BenchmarkSequenceAccess(std::string name, size_t ops, size_t elem_size) { + TVector v; + for (const auto& item : GetTestKeys(elem_size)) { + v.push_back(item); + } + auto env = Env::Default(); + + auto ops_remaining = ops; + auto start_time = env->NowNanos(); + size_t total = 0; + while (ops_remaining--) { + auto end = v.end(); + for (auto pos = v.begin(); pos != end; ++pos) { + total += pos->size(); + } + } + auto elapsed = env->NowNanos() - start_time; + cout << "performed " << ops << " sequence access against " << name << "\n\t" + << "size: " << elem_size << "\n\t" + << "total time elapsed: " << elapsed << " (ns)" << endl; + // HACK avoid compiler's optimization to ignore total + return total; +} + +// This test case only reports the performance between std::vector +// and autovector. We chose string for comparison because in most +// of our use cases we used std::vector. +TEST_F(AutoVectorTest, PerfBench) { + // We run same operations for kOps times in order to get a more fair result. 
+ size_t kOps = 100000; + + // Creation and insertion test + // Test the case when there is: + // * no element inserted: internal array of std::vector may not really get + // initialize. + // * one element inserted: internal array of std::vector must have + // initialized. + // * kSize elements inserted. This shows the most time we'll spend if we + // keep everything in stack. + // * 2 * kSize elements inserted. The internal vector of + // autovector must have been initialized. + cout << "=====================================================" << endl; + cout << "Creation and Insertion Test (value type: std::string)" << endl; + cout << "=====================================================" << endl; + + // pre-generated unique keys + auto string_keys = GetTestKeys(kOps * 2 * kSize); + for (auto insertions : {0ul, 1ul, kSize / 2, kSize, 2 * kSize}) { + BenchmarkVectorCreationAndInsertion>( + "std::vector", kOps, insertions, string_keys); + BenchmarkVectorCreationAndInsertion>( + "autovector", kOps, insertions, string_keys); + cout << "-----------------------------------" << endl; + } + + cout << "=====================================================" << endl; + cout << "Creation and Insertion Test (value type: uint64_t)" << endl; + cout << "=====================================================" << endl; + + // pre-generated unique keys + std::vector int_keys(kOps * 2 * kSize); + for (size_t i = 0; i < kOps * 2 * kSize; ++i) { + int_keys[i] = i; + } + for (auto insertions : {0ul, 1ul, kSize / 2, kSize, 2 * kSize}) { + BenchmarkVectorCreationAndInsertion>( + "std::vector", kOps, insertions, int_keys); + BenchmarkVectorCreationAndInsertion>( + "autovector", kOps, insertions, int_keys); + cout << "-----------------------------------" << endl; + } + + // Sequence Access Test + cout << "=====================================================" << endl; + cout << "Sequence Access Test" << endl; + cout << "=====================================================" << endl; + for (auto elem_size : {kSize / 2, kSize, 2 * kSize}) { + BenchmarkSequenceAccess>("std::vector", kOps, + elem_size); + BenchmarkSequenceAccess>("autovector", kOps, + elem_size); + cout << "-----------------------------------" << endl; + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/bloom_impl.h b/src/rocksdb/util/bloom_impl.h new file mode 100644 index 000000000..fadd012d3 --- /dev/null +++ b/src/rocksdb/util/bloom_impl.h @@ -0,0 +1,489 @@ +// Copyright (c) 2019-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Implementation details of various Bloom filter implementations used in +// RocksDB. (DynamicBloom is in a separate file for now because it +// supports concurrent write.) + +#pragma once +#include +#include + +#include + +#include "port/port.h" // for PREFETCH +#include "rocksdb/slice.h" +#include "util/hash.h" + +#ifdef HAVE_AVX2 +#include +#endif + +namespace ROCKSDB_NAMESPACE { + +class BloomMath { + public: + // False positive rate of a standard Bloom filter, for given ratio of + // filter memory bits to added keys, and number of probes per operation. 
+ // (The false positive rate is effectively independent of scale, assuming + // the implementation scales OK.) + static double StandardFpRate(double bits_per_key, int num_probes) { + // Standard very-good-estimate formula. See + // https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives + return std::pow(1.0 - std::exp(-num_probes / bits_per_key), num_probes); + } + + // False positive rate of a "blocked"/"shareded"/"cache-local" Bloom filter, + // for given ratio of filter memory bits to added keys, number of probes per + // operation (all within the given block or cache line size), and block or + // cache line size. + static double CacheLocalFpRate(double bits_per_key, int num_probes, + int cache_line_bits) { + if (bits_per_key <= 0.0) { + // Fix a discontinuity + return 1.0; + } + double keys_per_cache_line = cache_line_bits / bits_per_key; + // A reasonable estimate is the average of the FP rates for one standard + // deviation above and below the mean bucket occupancy. See + // https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter#the-math + double keys_stddev = std::sqrt(keys_per_cache_line); + double crowded_fp = StandardFpRate( + cache_line_bits / (keys_per_cache_line + keys_stddev), num_probes); + double uncrowded_fp = StandardFpRate( + cache_line_bits / (keys_per_cache_line - keys_stddev), num_probes); + return (crowded_fp + uncrowded_fp) / 2; + } + + // False positive rate of querying a new item against `num_keys` items, all + // hashed to `fingerprint_bits` bits. (This assumes the fingerprint hashes + // themselves are stored losslessly. See Section 4 of + // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf) + static double FingerprintFpRate(size_t num_keys, int fingerprint_bits) { + double inv_fingerprint_space = std::pow(0.5, fingerprint_bits); + // Base estimate assumes each key maps to a unique fingerprint. + // Could be > 1 in extreme cases. + double base_estimate = num_keys * inv_fingerprint_space; + // To account for potential overlap, we choose between two formulas + if (base_estimate > 0.0001) { + // A very good formula assuming we don't construct a floating point + // number extremely close to 1. Always produces a probability < 1. + return 1.0 - std::exp(-base_estimate); + } else { + // A very good formula when base_estimate is far below 1. (Subtract + // away the integral-approximated sum that some key has same hash as + // one coming before it in a list.) + return base_estimate - (base_estimate * base_estimate * 0.5); + } + } + + // Returns the probably of either of two independent(-ish) events + // happening, given their probabilities. (This is useful for combining + // results from StandardFpRate or CacheLocalFpRate with FingerprintFpRate + // for a hash-efficient Bloom filter's FP rate. See Section 4 of + // http://www.ccs.neu.edu/home/pete/pub/bloom-filters-verification.pdf) + static double IndependentProbabilitySum(double rate1, double rate2) { + // Use formula that avoids floating point extremely close to 1 if + // rates are extremely small. + return rate1 + rate2 - (rate1 * rate2); + } +}; + +// A fast, flexible, and accurate cache-local Bloom implementation with +// SIMD-optimized query performance (currently using AVX2 on Intel). Write +// performance and non-SIMD read are very good, benefiting from FastRange32 +// used in place of % and single-cycle multiplication on recent processors. 
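// Editorial sketch, not part of the upstream patch: a numeric check of
// BloomMath::StandardFpRate above, p = (1 - e^(-k/b))^k for b bits per key
// and k probes. For the classic b = 10, k = 6 configuration this evaluates
// to roughly 0.0084 (about 0.84%) for an idealized unblocked Bloom filter;
// the cache-local (512-bit block) estimate comes out somewhat higher,
// consistent with the ~0.95% figures quoted for FastLocalBloomImpl below.
// Names in this sketch are local to it.

#include <cmath>
#include <cstdio>

namespace bloom_math_sketch {

// Same formula as BloomMath::StandardFpRate.
inline double StandardFpRate(double bits_per_key, int num_probes) {
  return std::pow(1.0 - std::exp(-num_probes / bits_per_key), num_probes);
}

inline void Demo() {
  // Prints approximately 0.0084 for 10 bits/key and 6 probes.
  std::printf("standard fp rate ~= %.4f\n", StandardFpRate(10.0, 6));
}

}  // namespace bloom_math_sketch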
+// +// Most other SIMD Bloom implementations sacrifice flexibility and/or +// accuracy by requiring num_probes to be a power of two and restricting +// where each probe can occur in a cache line. This implementation sacrifices +// SIMD-optimization for add (might still be possible, especially with AVX512) +// in favor of allowing any num_probes, not crossing cache line boundary, +// and accuracy close to theoretical best accuracy for a cache-local Bloom. +// E.g. theoretical best for 10 bits/key, num_probes=6, and 512-bit bucket +// (Intel cache line size) is 0.9535% FP rate. This implementation yields +// about 0.957%. (Compare to LegacyLocalityBloomImpl at 1.138%, or +// about 0.951% for 1024-bit buckets, cache line size for some ARM CPUs.) +// +// This implementation can use a 32-bit hash (let h2 be h1 * 0x9e3779b9) or +// a 64-bit hash (split into two uint32s). With many millions of keys, the +// false positive rate associated with using a 32-bit hash can dominate the +// false positive rate of the underlying filter. At 10 bits/key setting, the +// inflection point is about 40 million keys, so 32-bit hash is a bad idea +// with 10s of millions of keys or more. +// +// Despite accepting a 64-bit hash, this implementation uses 32-bit fastrange +// to pick a cache line, which can be faster than 64-bit in some cases. +// This only hurts accuracy as you get into 10s of GB for a single filter, +// and accuracy abruptly breaks down at 256GB (2^32 cache lines). Switch to +// 64-bit fastrange if you need filters so big. ;) +// +// Using only a 32-bit input hash within each cache line has negligible +// impact for any reasonable cache line / bucket size, for arbitrary filter +// size, and potentially saves intermediate data size in some cases vs. +// tracking full 64 bits. (Even in an implementation using 64-bit arithmetic +// to generate indices, I might do the same, as a single multiplication +// suffices to generate a sufficiently mixed 64 bits from 32 bits.) +// +// This implementation is currently tied to Intel cache line size, 64 bytes == +// 512 bits. If there's sufficient demand for other cache line sizes, this is +// a pretty good implementation to extend, but slight performance enhancements +// are possible with an alternate implementation (probably not very compatible +// with SIMD): +// (1) Use rotation in addition to multiplication for remixing +// (like murmur hash). (Using multiplication alone *slightly* hurts accuracy +// because lower bits never depend on original upper bits.) +// (2) Extract more than one bit index from each re-mix. (Only if rotation +// or similar is part of remix, because otherwise you're making the +// multiplication-only problem worse.) +// (3) Re-mix full 64 bit hash, to get maximum number of bit indices per +// re-mix. +// +class FastLocalBloomImpl { + public: + // NOTE: this has only been validated to enough accuracy for producing + // reasonable warnings / user feedback, not for making functional decisions. + static double EstimatedFpRate(size_t keys, size_t bytes, int num_probes, + int hash_bits) { + return BloomMath::IndependentProbabilitySum( + BloomMath::CacheLocalFpRate(8.0 * bytes / keys, num_probes, + /*cache line bits*/ 512), + BloomMath::FingerprintFpRate(keys, hash_bits)); + } + + static inline int ChooseNumProbes(int millibits_per_key) { + // Since this implementation can (with AVX2) make up to 8 probes + // for the same cost, we pick the most accurate num_probes, based + // on actual tests of the implementation. 
Note that for higher + // bits/key, the best choice for cache-local Bloom can be notably + // smaller than standard bloom, e.g. 9 instead of 11 @ 16 b/k. + if (millibits_per_key <= 2080) { + return 1; + } else if (millibits_per_key <= 3580) { + return 2; + } else if (millibits_per_key <= 5100) { + return 3; + } else if (millibits_per_key <= 6640) { + return 4; + } else if (millibits_per_key <= 8300) { + return 5; + } else if (millibits_per_key <= 10070) { + return 6; + } else if (millibits_per_key <= 11720) { + return 7; + } else if (millibits_per_key <= 14001) { + // Would be something like <= 13800 but sacrificing *slightly* for + // more settings using <= 8 probes. + return 8; + } else if (millibits_per_key <= 16050) { + return 9; + } else if (millibits_per_key <= 18300) { + return 10; + } else if (millibits_per_key <= 22001) { + return 11; + } else if (millibits_per_key <= 25501) { + return 12; + } else if (millibits_per_key > 50000) { + // Top out at 24 probes (three sets of 8) + return 24; + } else { + // Roughly optimal choices for remaining range + // e.g. + // 28000 -> 12, 28001 -> 13 + // 50000 -> 23, 50001 -> 24 + return (millibits_per_key - 1) / 2000 - 1; + } + } + + static inline void AddHash(uint32_t h1, uint32_t h2, uint32_t len_bytes, + int num_probes, char *data) { + uint32_t bytes_to_cache_line = FastRange32(len_bytes >> 6, h1) << 6; + AddHashPrepared(h2, num_probes, data + bytes_to_cache_line); + } + + static inline void AddHashPrepared(uint32_t h2, int num_probes, + char *data_at_cache_line) { + uint32_t h = h2; + for (int i = 0; i < num_probes; ++i, h *= uint32_t{0x9e3779b9}) { + // 9-bit address within 512 bit cache line + int bitpos = h >> (32 - 9); + data_at_cache_line[bitpos >> 3] |= (uint8_t{1} << (bitpos & 7)); + } + } + + static inline void PrepareHash(uint32_t h1, uint32_t len_bytes, + const char *data, + uint32_t /*out*/ *byte_offset) { + uint32_t bytes_to_cache_line = FastRange32(len_bytes >> 6, h1) << 6; + PREFETCH(data + bytes_to_cache_line, 0 /* rw */, 1 /* locality */); + PREFETCH(data + bytes_to_cache_line + 63, 0 /* rw */, 1 /* locality */); + *byte_offset = bytes_to_cache_line; + } + + static inline bool HashMayMatch(uint32_t h1, uint32_t h2, uint32_t len_bytes, + int num_probes, const char *data) { + uint32_t bytes_to_cache_line = FastRange32(len_bytes >> 6, h1) << 6; + return HashMayMatchPrepared(h2, num_probes, data + bytes_to_cache_line); + } + + static inline bool HashMayMatchPrepared(uint32_t h2, int num_probes, + const char *data_at_cache_line) { + uint32_t h = h2; +#ifdef HAVE_AVX2 + int rem_probes = num_probes; + + // NOTE: For better performance for num_probes in {1, 2, 9, 10, 17, 18, + // etc.} one can insert specialized code for rem_probes <= 2, bypassing + // the SIMD code in those cases. There is a detectable but minor overhead + // applied to other values of num_probes (when not statically determined), + // but smoother performance curve vs. num_probes. But for now, when + // in doubt, don't add unnecessary code. + + // Powers of 32-bit golden ratio, mod 2**32. + const __m256i multipliers = + _mm256_setr_epi32(0x00000001, 0x9e3779b9, 0xe35e67b1, 0x734297e9, + 0x35fbe861, 0xdeb7c719, 0x448b211, 0x3459b749); + + for (;;) { + // Eight copies of hash + __m256i hash_vector = _mm256_set1_epi32(h); + + // Same effect as repeated multiplication by 0x9e3779b9 thanks to + // associativity of multiplication. 
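// Editorial sketch, not part of the upstream patch: how AddHashPrepared
// above derives its probe positions -- repeatedly multiply the 32-bit hash
// by the golden-ratio constant 0x9e3779b9 and take the top 9 bits as a bit
// index within the 512-bit cache line. The AVX2 path that follows computes
// the same eight products at once using a vector of precomputed powers of
// that constant. ShowProbes is an invented name for illustration only.

#include <cstdint>
#include <cstdio>

namespace probe_sketch {

// Print the bit positions (0..511) that num_probes probes set for a given
// 32-bit remixed hash h2, mirroring FastLocalBloomImpl::AddHashPrepared.
inline void ShowProbes(uint32_t h2, int num_probes) {
  uint32_t h = h2;
  for (int i = 0; i < num_probes; ++i, h *= uint32_t{0x9e3779b9}) {
    int bitpos = static_cast<int>(h >> (32 - 9));  // top 9 bits -> 0..511
    std::printf("probe %d -> bit %d (byte %d, bit %d)\n", i, bitpos,
                bitpos >> 3, bitpos & 7);
  }
}

}  // namespace probe_sketch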
+ hash_vector = _mm256_mullo_epi32(hash_vector, multipliers); + + // Now the top 9 bits of each of the eight 32-bit values in + // hash_vector are bit addresses for probes within the cache line. + // While the platform-independent code uses byte addressing (6 bits + // to pick a byte + 3 bits to pick a bit within a byte), here we work + // with 32-bit words (4 bits to pick a word + 5 bits to pick a bit + // within a word) because that works well with AVX2 and is equivalent + // under little-endian. + + // Shift each right by 28 bits to get 4-bit word addresses. + const __m256i word_addresses = _mm256_srli_epi32(hash_vector, 28); + + // Gather 32-bit values spread over 512 bits by 4-bit address. In + // essence, we are dereferencing eight pointers within the cache + // line. + // + // Option 1: AVX2 gather (seems to be a little slow - understandable) + // const __m256i value_vector = + // _mm256_i32gather_epi32(static_cast(data_at_cache_line), + // word_addresses, + // /*bytes / i32*/ 4); + // END Option 1 + // Potentially unaligned as we're not *always* cache-aligned -> loadu + const __m256i *mm_data = + reinterpret_cast(data_at_cache_line); + __m256i lower = _mm256_loadu_si256(mm_data); + __m256i upper = _mm256_loadu_si256(mm_data + 1); + // Option 2: AVX512VL permute hack + // Only negligibly faster than Option 3, so not yet worth supporting + // const __m256i value_vector = + // _mm256_permutex2var_epi32(lower, word_addresses, upper); + // END Option 2 + // Option 3: AVX2 permute+blend hack + // Use lowest three bits to order probing values, as if all from same + // 256 bit piece. + lower = _mm256_permutevar8x32_epi32(lower, word_addresses); + upper = _mm256_permutevar8x32_epi32(upper, word_addresses); + // Just top 1 bit of address, to select between lower and upper. + const __m256i upper_lower_selector = _mm256_srai_epi32(hash_vector, 31); + // Finally: the next 8 probed 32-bit values, in probing sequence order. + const __m256i value_vector = + _mm256_blendv_epi8(lower, upper, upper_lower_selector); + // END Option 3 + + // We might not need to probe all 8, so build a mask for selecting only + // what we need. (The k_selector(s) could be pre-computed but that + // doesn't seem to make a noticeable performance difference.) + const __m256i zero_to_seven = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + // Subtract rem_probes from each of those constants + __m256i k_selector = + _mm256_sub_epi32(zero_to_seven, _mm256_set1_epi32(rem_probes)); + // Negative after subtract -> use/select + // Keep only high bit (logical shift right each by 31). + k_selector = _mm256_srli_epi32(k_selector, 31); + + // Strip off the 4 bit word address (shift left) + __m256i bit_addresses = _mm256_slli_epi32(hash_vector, 4); + // And keep only 5-bit (32 - 27) bit-within-32-bit-word addresses. + bit_addresses = _mm256_srli_epi32(bit_addresses, 27); + // Build a bit mask + const __m256i bit_mask = _mm256_sllv_epi32(k_selector, bit_addresses); + + // Like ((~value_vector) & bit_mask) == 0) + bool match = _mm256_testc_si256(value_vector, bit_mask) != 0; + + // This check first so that it's easy for branch predictor to optimize + // num_probes <= 8 case, making it free of unpredictable branches. + if (rem_probes <= 8) { + return match; + } else if (!match) { + return false; + } + // otherwise + // Need another iteration. 
0xab25f4c1 == golden ratio to the 8th power + h *= 0xab25f4c1; + rem_probes -= 8; + } +#else + for (int i = 0; i < num_probes; ++i, h *= uint32_t{0x9e3779b9}) { + // 9-bit address within 512 bit cache line + int bitpos = h >> (32 - 9); + if ((data_at_cache_line[bitpos >> 3] & (char(1) << (bitpos & 7))) == 0) { + return false; + } + } + return true; +#endif + } +}; + +// A legacy Bloom filter implementation with no locality of probes (slow). +// It uses double hashing to generate a sequence of hash values. +// Asymptotic analysis is in [Kirsch,Mitzenmacher 2006], but known to have +// subtle accuracy flaws for practical sizes [Dillinger,Manolios 2004]. +// +// DO NOT REUSE +// +class LegacyNoLocalityBloomImpl { + public: + static inline int ChooseNumProbes(int bits_per_key) { + // We intentionally round down to reduce probing cost a little bit + int num_probes = static_cast(bits_per_key * 0.69); // 0.69 =~ ln(2) + if (num_probes < 1) num_probes = 1; + if (num_probes > 30) num_probes = 30; + return num_probes; + } + + static inline void AddHash(uint32_t h, uint32_t total_bits, int num_probes, + char *data) { + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (int i = 0; i < num_probes; i++) { + const uint32_t bitpos = h % total_bits; + data[bitpos / 8] |= (1 << (bitpos % 8)); + h += delta; + } + } + + static inline bool HashMayMatch(uint32_t h, uint32_t total_bits, + int num_probes, const char *data) { + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + for (int i = 0; i < num_probes; i++) { + const uint32_t bitpos = h % total_bits; + if ((data[bitpos / 8] & (1 << (bitpos % 8))) == 0) { + return false; + } + h += delta; + } + return true; + } +}; + +// A legacy Bloom filter implementation with probes local to a single +// cache line (fast). Because SST files might be transported between +// platforms, the cache line size is a parameter rather than hard coded. +// (But if specified as a constant parameter, an optimizing compiler +// should take advantage of that.) +// +// When ExtraRotates is false, this implementation is notably deficient in +// accuracy. Specifically, it uses double hashing with a 1/512 chance of the +// increment being zero (when cache line size is 512 bits). Thus, there's a +// 1/512 chance of probing only one index, which we'd expect to incur about +// a 1/2 * 1/512 or absolute 0.1% FP rate penalty. More detail at +// https://github.com/facebook/rocksdb/issues/4120 +// +// DO NOT REUSE +// +template +class LegacyLocalityBloomImpl { + private: + static inline uint32_t GetLine(uint32_t h, uint32_t num_lines) { + uint32_t offset_h = ExtraRotates ? (h >> 11) | (h << 21) : h; + return offset_h % num_lines; + } + + public: + // NOTE: this has only been validated to enough accuracy for producing + // reasonable warnings / user feedback, not for making functional decisions. + static double EstimatedFpRate(size_t keys, size_t bytes, int num_probes) { + double bits_per_key = 8.0 * bytes / keys; + double filter_rate = BloomMath::CacheLocalFpRate(bits_per_key, num_probes, + /*cache line bits*/ 512); + if (!ExtraRotates) { + // Good estimate of impact of flaw in index computation. + // Adds roughly 0.002 around 50 bits/key and 0.001 around 100 bits/key. + // The + 22 shifts it nicely to fit for lower bits/key. 
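+      // For example: 0.1 / (50 * 0.75 + 22) ~= 0.0017 and
+      // 0.1 / (100 * 0.75 + 22) ~= 0.001.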
+ filter_rate += 0.1 / (bits_per_key * 0.75 + 22); + } else { + // Not yet validated + assert(false); + } + // Always uses 32-bit hash + double fingerprint_rate = BloomMath::FingerprintFpRate(keys, 32); + return BloomMath::IndependentProbabilitySum(filter_rate, fingerprint_rate); + } + + static inline void AddHash(uint32_t h, uint32_t num_lines, int num_probes, + char *data, int log2_cache_line_bytes) { + const int log2_cache_line_bits = log2_cache_line_bytes + 3; + + char *data_at_offset = + data + (GetLine(h, num_lines) << log2_cache_line_bytes); + const uint32_t delta = (h >> 17) | (h << 15); + for (int i = 0; i < num_probes; ++i) { + // Mask to bit-within-cache-line address + const uint32_t bitpos = h & ((1 << log2_cache_line_bits) - 1); + data_at_offset[bitpos / 8] |= (1 << (bitpos % 8)); + if (ExtraRotates) { + h = (h >> log2_cache_line_bits) | (h << (32 - log2_cache_line_bits)); + } + h += delta; + } + } + + static inline void PrepareHashMayMatch(uint32_t h, uint32_t num_lines, + const char *data, + uint32_t /*out*/ *byte_offset, + int log2_cache_line_bytes) { + uint32_t b = GetLine(h, num_lines) << log2_cache_line_bytes; + PREFETCH(data + b, 0 /* rw */, 1 /* locality */); + PREFETCH(data + b + ((1 << log2_cache_line_bytes) - 1), 0 /* rw */, + 1 /* locality */); + *byte_offset = b; + } + + static inline bool HashMayMatch(uint32_t h, uint32_t num_lines, + int num_probes, const char *data, + int log2_cache_line_bytes) { + uint32_t b = GetLine(h, num_lines) << log2_cache_line_bytes; + return HashMayMatchPrepared(h, num_probes, data + b, log2_cache_line_bytes); + } + + static inline bool HashMayMatchPrepared(uint32_t h, int num_probes, + const char *data_at_offset, + int log2_cache_line_bytes) { + const int log2_cache_line_bits = log2_cache_line_bytes + 3; + + const uint32_t delta = (h >> 17) | (h << 15); + for (int i = 0; i < num_probes; ++i) { + // Mask to bit-within-cache-line address + const uint32_t bitpos = h & ((1 << log2_cache_line_bits) - 1); + if (((data_at_offset[bitpos / 8]) & (1 << (bitpos % 8))) == 0) { + return false; + } + if (ExtraRotates) { + h = (h >> log2_cache_line_bits) | (h << (32 - log2_cache_line_bits)); + } + h += delta; + } + return true; + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/bloom_test.cc b/src/rocksdb/util/bloom_test.cc new file mode 100644 index 000000000..9d509ac3d --- /dev/null +++ b/src/rocksdb/util/bloom_test.cc @@ -0,0 +1,1175 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef GFLAGS +#include +int main() { + fprintf(stderr, "Please install gflags to run this test... 
Skipping...\n"); + return 0; +} +#else + +#include +#include +#include + +#include "cache/cache_entry_roles.h" +#include "cache/cache_reservation_manager.h" +#include "memory/arena.h" +#include "port/jemalloc_helper.h" +#include "rocksdb/filter_policy.h" +#include "table/block_based/filter_policy_internal.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/gflags_compat.h" +#include "util/hash.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +// The test is not fully designed for bits_per_key other than 10, but with +// this parameter you can easily explore the behavior of other bits_per_key. +// See also filter_bench. +DEFINE_int32(bits_per_key, 10, ""); + +namespace ROCKSDB_NAMESPACE { + +namespace { +const std::string kLegacyBloom = test::LegacyBloomFilterPolicy::kClassName(); +const std::string kFastLocalBloom = + test::FastLocalBloomFilterPolicy::kClassName(); +const std::string kStandard128Ribbon = + test::Standard128RibbonFilterPolicy::kClassName(); +} // namespace + +static const int kVerbose = 1; + +static Slice Key(int i, char* buffer) { + std::string s; + PutFixed32(&s, static_cast(i)); + memcpy(buffer, s.c_str(), sizeof(i)); + return Slice(buffer, sizeof(i)); +} + +static int NextLength(int length) { + if (length < 10) { + length += 1; + } else if (length < 100) { + length += 10; + } else if (length < 1000) { + length += 100; + } else { + length += 1000; + } + return length; +} + +class FullBloomTest : public testing::TestWithParam { + protected: + BlockBasedTableOptions table_options_; + + private: + std::shared_ptr& policy_; + std::unique_ptr bits_builder_; + std::unique_ptr bits_reader_; + std::unique_ptr buf_; + size_t filter_size_; + + public: + FullBloomTest() : policy_(table_options_.filter_policy), filter_size_(0) { + ResetPolicy(); + } + + BuiltinFilterBitsBuilder* GetBuiltinFilterBitsBuilder() { + // Throws on bad cast + return dynamic_cast(bits_builder_.get()); + } + + const BloomLikeFilterPolicy* GetBloomLikeFilterPolicy() { + // Throws on bad cast + return &dynamic_cast(*policy_); + } + + void Reset() { + bits_builder_.reset(BloomFilterPolicy::GetBuilderFromContext( + FilterBuildingContext(table_options_))); + bits_reader_.reset(nullptr); + buf_.reset(nullptr); + filter_size_ = 0; + } + + void ResetPolicy(double bits_per_key) { + policy_ = BloomLikeFilterPolicy::Create(GetParam(), bits_per_key); + Reset(); + } + + void ResetPolicy() { ResetPolicy(FLAGS_bits_per_key); } + + void Add(const Slice& s) { bits_builder_->AddKey(s); } + + void OpenRaw(const Slice& s) { + bits_reader_.reset(policy_->GetFilterBitsReader(s)); + } + + void Build() { + Slice filter = bits_builder_->Finish(&buf_); + bits_reader_.reset(policy_->GetFilterBitsReader(filter)); + filter_size_ = filter.size(); + } + + size_t FilterSize() const { return filter_size_; } + + Slice FilterData() { return Slice(buf_.get(), filter_size_); } + + int GetNumProbesFromFilterData() { + assert(filter_size_ >= 5); + int8_t raw_num_probes = static_cast(buf_.get()[filter_size_ - 5]); + if (raw_num_probes == -1) { // New bloom filter marker + return static_cast(buf_.get()[filter_size_ - 3]); + } else { + return raw_num_probes; + } + } + + int GetRibbonSeedFromFilterData() { + assert(filter_size_ >= 5); + // Check for ribbon marker + assert(-2 == static_cast(buf_.get()[filter_size_ - 5])); + return static_cast(buf_.get()[filter_size_ - 4]); + } + + bool Matches(const Slice& s) { + if (bits_reader_ == nullptr) { + Build(); + } + return bits_reader_->MayMatch(s); + } + + // 
Provides a kind of fingerprint on the Bloom filter's + // behavior, for reasonbly high FP rates. + uint64_t PackedMatches() { + char buffer[sizeof(int)]; + uint64_t result = 0; + for (int i = 0; i < 64; i++) { + if (Matches(Key(i + 12345, buffer))) { + result |= uint64_t{1} << i; + } + } + return result; + } + + // Provides a kind of fingerprint on the Bloom filter's + // behavior, for lower FP rates. + std::string FirstFPs(int count) { + char buffer[sizeof(int)]; + std::string rv; + int fp_count = 0; + for (int i = 0; i < 1000000; i++) { + // Pack four match booleans into each hexadecimal digit + if (Matches(Key(i + 1000000, buffer))) { + ++fp_count; + rv += std::to_string(i); + if (fp_count == count) { + break; + } + rv += ','; + } + } + return rv; + } + + double FalsePositiveRate() { + char buffer[sizeof(int)]; + int result = 0; + for (int i = 0; i < 10000; i++) { + if (Matches(Key(i + 1000000000, buffer))) { + result++; + } + } + return result / 10000.0; + } +}; + +TEST_P(FullBloomTest, FilterSize) { + // In addition to checking the consistency of space computation, we are + // checking that denoted and computed doubles are interpreted as expected + // as bits_per_key values. + bool some_computed_less_than_denoted = false; + // Note: to avoid unproductive configurations, bits_per_key < 0.5 is rounded + // down to 0 (no filter), and 0.5 <= bits_per_key < 1.0 is rounded up to 1 + // bit per key (1000 millibits). Also, enforced maximum is 100 bits per key + // (100000 millibits). + for (auto bpk : std::vector >{{-HUGE_VAL, 0}, + {-INFINITY, 0}, + {0.0, 0}, + {0.499, 0}, + {0.5, 1000}, + {1.234, 1234}, + {3.456, 3456}, + {9.5, 9500}, + {10.0, 10000}, + {10.499, 10499}, + {21.345, 21345}, + {99.999, 99999}, + {1234.0, 100000}, + {HUGE_VAL, 100000}, + {INFINITY, 100000}, + {NAN, 100000}}) { + ResetPolicy(bpk.first); + auto bfp = GetBloomLikeFilterPolicy(); + EXPECT_EQ(bpk.second, bfp->GetMillibitsPerKey()); + EXPECT_EQ((bpk.second + 500) / 1000, bfp->GetWholeBitsPerKey()); + + double computed = bpk.first; + // This transforms e.g. 9.5 -> 9.499999999999998, which we still + // round to 10 for whole bits per key. + computed += 0.5; + computed /= 1234567.0; + computed *= 1234567.0; + computed -= 0.5; + some_computed_less_than_denoted |= (computed < bpk.first); + ResetPolicy(computed); + bfp = GetBloomLikeFilterPolicy(); + EXPECT_EQ(bpk.second, bfp->GetMillibitsPerKey()); + EXPECT_EQ((bpk.second + 500) / 1000, bfp->GetWholeBitsPerKey()); + + auto bits_builder = GetBuiltinFilterBitsBuilder(); + if (bpk.second == 0) { + ASSERT_EQ(bits_builder, nullptr); + continue; + } + + size_t n = 1; + size_t space = 0; + for (; n < 1000000; n += 1 + n / 1000) { + // Ensure consistency between CalculateSpace and ApproximateNumEntries + space = bits_builder->CalculateSpace(n); + size_t n2 = bits_builder->ApproximateNumEntries(space); + EXPECT_GE(n2, n); + size_t space2 = bits_builder->CalculateSpace(n2); + if (n > 12000 && GetParam() == kStandard128Ribbon) { + // TODO(peterd): better approximation? 
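+        // For now, only require the round-tripped size to be within about
+        // 0.2% above the original (space2 * 0.998 <= space), rather than
+        // exactly equal as in the non-Ribbon case below.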
+ EXPECT_GE(space2, space); + EXPECT_LE(space2 * 0.998, space * 1.0); + } else { + EXPECT_EQ(space2, space); + } + } + // Until size_t overflow + for (; n < (n + n / 3); n += n / 3) { + // Ensure space computation is not overflowing; capped is OK + size_t space2 = bits_builder->CalculateSpace(n); + EXPECT_GE(space2, space); + space = space2; + } + } + // Check that the compiler hasn't optimized our computation into nothing + EXPECT_TRUE(some_computed_less_than_denoted); + ResetPolicy(); +} + +TEST_P(FullBloomTest, FullEmptyFilter) { + // Empty filter is not match, at this level + ASSERT_TRUE(!Matches("hello")); + ASSERT_TRUE(!Matches("world")); +} + +TEST_P(FullBloomTest, FullSmall) { + Add("hello"); + Add("world"); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + ASSERT_TRUE(!Matches("x")); + ASSERT_TRUE(!Matches("foo")); +} + +TEST_P(FullBloomTest, FullVaryingLengths) { + char buffer[sizeof(int)]; + + // Count number of filters that significantly exceed the false positive rate + int mediocre_filters = 0; + int good_filters = 0; + + for (int length = 1; length <= 10000; length = NextLength(length)) { + Reset(); + for (int i = 0; i < length; i++) { + Add(Key(i, buffer)); + } + Build(); + + EXPECT_LE(FilterSize(), (size_t)((length * FLAGS_bits_per_key / 8) + + CACHE_LINE_SIZE * 2 + 5)); + + // All added keys must match + for (int i = 0; i < length; i++) { + ASSERT_TRUE(Matches(Key(i, buffer))) + << "Length " << length << "; key " << i; + } + + // Check false positive rate + double rate = FalsePositiveRate(); + if (kVerbose >= 1) { + fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n", + rate * 100.0, length, static_cast(FilterSize())); + } + if (FLAGS_bits_per_key == 10) { + EXPECT_LE(rate, 0.02); // Must not be over 2% + if (rate > 0.0125) { + mediocre_filters++; // Allowed, but not too often + } else { + good_filters++; + } + } + } + if (kVerbose >= 1) { + fprintf(stderr, "Filters: %d good, %d mediocre\n", good_filters, + mediocre_filters); + } + EXPECT_LE(mediocre_filters, good_filters / 5); +} + +TEST_P(FullBloomTest, OptimizeForMemory) { + char buffer[sizeof(int)]; + for (bool offm : {true, false}) { + table_options_.optimize_filters_for_memory = offm; + ResetPolicy(); + Random32 rnd(12345); + uint64_t total_size = 0; + uint64_t total_mem = 0; + int64_t total_keys = 0; + double total_fp_rate = 0; + constexpr int nfilters = 100; + for (int i = 0; i < nfilters; ++i) { + int nkeys = static_cast(rnd.Uniformish(10000)) + 100; + Reset(); + for (int j = 0; j < nkeys; ++j) { + Add(Key(j, buffer)); + } + Build(); + size_t size = FilterData().size(); + total_size += size; + // optimize_filters_for_memory currently depends on malloc_usable_size + // but we run the rest of the test to ensure no bad behavior without it. +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + size = malloc_usable_size(const_cast(FilterData().data())); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + total_mem += size; + total_keys += nkeys; + total_fp_rate += FalsePositiveRate(); + } + if (FLAGS_bits_per_key == 10) { + EXPECT_LE(total_fp_rate / double{nfilters}, 0.011); + EXPECT_GE(total_fp_rate / double{nfilters}, + CACHE_LINE_SIZE >= 256 ? 0.007 : 0.008); + } + + int64_t ex_min_total_size = int64_t{FLAGS_bits_per_key} * total_keys / 8; + if (GetParam() == kStandard128Ribbon) { + // ~ 30% savings vs. 
Bloom filter + ex_min_total_size = 7 * ex_min_total_size / 10; + } + EXPECT_GE(static_cast(total_size), ex_min_total_size); + + int64_t blocked_bloom_overhead = nfilters * (CACHE_LINE_SIZE + 5); + if (GetParam() == kLegacyBloom) { + // this config can add extra cache line to make odd number + blocked_bloom_overhead += nfilters * CACHE_LINE_SIZE; + } + + EXPECT_GE(total_mem, total_size); + + // optimize_filters_for_memory not implemented with legacy Bloom + if (offm && GetParam() != kLegacyBloom) { + // This value can include a small extra penalty for kExtraPadding + fprintf(stderr, "Internal fragmentation (optimized): %g%%\n", + (total_mem - total_size) * 100.0 / total_size); + // Less than 1% internal fragmentation + EXPECT_LE(total_mem, total_size * 101 / 100); + // Up to 2% storage penalty + EXPECT_LE(static_cast(total_size), + ex_min_total_size * 102 / 100 + blocked_bloom_overhead); + } else { + fprintf(stderr, "Internal fragmentation (not optimized): %g%%\n", + (total_mem - total_size) * 100.0 / total_size); + // TODO: add control checks for more allocators? +#ifdef ROCKSDB_JEMALLOC + fprintf(stderr, "Jemalloc detected? %d\n", HasJemalloc()); + if (HasJemalloc()) { +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + // More than 5% internal fragmentation + EXPECT_GE(total_mem, total_size * 105 / 100); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + } +#endif // ROCKSDB_JEMALLOC + // No storage penalty, just usual overhead + EXPECT_LE(static_cast(total_size), + ex_min_total_size + blocked_bloom_overhead); + } + } +} + +class ChargeFilterConstructionTest : public testing::Test {}; +TEST_F(ChargeFilterConstructionTest, RibbonFilterFallBackOnLargeBanding) { + constexpr std::size_t kCacheCapacity = + 8 * CacheReservationManagerImpl< + CacheEntryRole::kFilterConstruction>::GetDummyEntrySize(); + constexpr std::size_t num_entries_for_cache_full = kCacheCapacity / 8; + + for (CacheEntryRoleOptions::Decision charge_filter_construction_mem : + {CacheEntryRoleOptions::Decision::kEnabled, + CacheEntryRoleOptions::Decision::kDisabled}) { + bool will_fall_back = charge_filter_construction_mem == + CacheEntryRoleOptions::Decision::kEnabled; + + BlockBasedTableOptions table_options; + table_options.cache_usage_options.options_overrides.insert( + {CacheEntryRole::kFilterConstruction, + {/*.charged = */ charge_filter_construction_mem}}); + LRUCacheOptions lo; + lo.capacity = kCacheCapacity; + lo.num_shard_bits = 0; // 2^0 shard + lo.strict_capacity_limit = true; + std::shared_ptr cache(NewLRUCache(lo)); + table_options.block_cache = cache; + table_options.filter_policy = + BloomLikeFilterPolicy::Create(kStandard128Ribbon, FLAGS_bits_per_key); + FilterBuildingContext ctx(table_options); + std::unique_ptr filter_bits_builder( + table_options.filter_policy->GetBuilderWithContext(ctx)); + + char key_buffer[sizeof(int)]; + for (std::size_t i = 0; i < num_entries_for_cache_full; ++i) { + filter_bits_builder->AddKey(Key(static_cast(i), key_buffer)); + } + + std::unique_ptr buf; + Slice filter = filter_bits_builder->Finish(&buf); + + // To verify Ribbon Filter fallbacks to Bloom Filter properly + // based on cache charging result + // See BloomFilterPolicy::GetBloomBitsReader re: metadata + // -1 = Marker for newer Bloom implementations + // -2 = Marker for Standard128 Ribbon + if (will_fall_back) { + EXPECT_EQ(filter.data()[filter.size() - 5], static_cast(-1)); + } else { + EXPECT_EQ(filter.data()[filter.size() - 5], static_cast(-2)); + } + + if (charge_filter_construction_mem == + CacheEntryRoleOptions::Decision::kEnabled) { + 
const size_t dummy_entry_num = static_cast(std::ceil( + filter.size() * 1.0 / + CacheReservationManagerImpl< + CacheEntryRole::kFilterConstruction>::GetDummyEntrySize())); + EXPECT_GE( + cache->GetPinnedUsage(), + dummy_entry_num * + CacheReservationManagerImpl< + CacheEntryRole::kFilterConstruction>::GetDummyEntrySize()); + EXPECT_LT( + cache->GetPinnedUsage(), + (dummy_entry_num + 1) * + CacheReservationManagerImpl< + CacheEntryRole::kFilterConstruction>::GetDummyEntrySize()); + } else { + EXPECT_EQ(cache->GetPinnedUsage(), 0); + } + } +} + +namespace { +inline uint32_t SelectByCacheLineSize(uint32_t for64, uint32_t for128, + uint32_t for256) { + (void)for64; + (void)for128; + (void)for256; +#if CACHE_LINE_SIZE == 64 + return for64; +#elif CACHE_LINE_SIZE == 128 + return for128; +#elif CACHE_LINE_SIZE == 256 + return for256; +#else +#error "CACHE_LINE_SIZE unknown or unrecognized" +#endif +} +} // namespace + +// Ensure the implementation doesn't accidentally change in an +// incompatible way. This test doesn't check the reading side +// (FirstFPs/PackedMatches) for LegacyBloom because it requires the +// ability to read filters generated using other cache line sizes. +// See RawSchema. +TEST_P(FullBloomTest, Schema) { +#define EXPECT_EQ_Bloom(a, b) \ + { \ + if (GetParam() != kStandard128Ribbon) { \ + EXPECT_EQ(a, b); \ + } \ + } +#define EXPECT_EQ_Ribbon(a, b) \ + { \ + if (GetParam() == kStandard128Ribbon) { \ + EXPECT_EQ(a, b); \ + } \ + } +#define EXPECT_EQ_FastBloom(a, b) \ + { \ + if (GetParam() == kFastLocalBloom) { \ + EXPECT_EQ(a, b); \ + } \ + } +#define EXPECT_EQ_LegacyBloom(a, b) \ + { \ + if (GetParam() == kLegacyBloom) { \ + EXPECT_EQ(a, b); \ + } \ + } +#define EXPECT_EQ_NotLegacy(a, b) \ + { \ + if (GetParam() != kLegacyBloom) { \ + EXPECT_EQ(a, b); \ + } \ + } + + char buffer[sizeof(int)]; + + // First do a small number of keys, where Ribbon config will fall back on + // fast Bloom filter and generate the same data + ResetPolicy(5); // num_probes = 3 + for (int key = 0; key < 87; key++) { + Add(Key(key, buffer)); + } + Build(); + EXPECT_EQ(GetNumProbesFromFilterData(), 3); + + EXPECT_EQ_NotLegacy(BloomHash(FilterData()), 4130687756U); + + EXPECT_EQ_NotLegacy("31,38,40,43,61,83,86,112,125,131", FirstFPs(10)); + + // Now use enough keys so that changing bits / key by 1 is guaranteed to + // change number of allocated cache lines. So keys > max cache line bits. 
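+  // (Here that means 2087 keys, more than the 2048 bits in even a 256-byte
+  // cache line.)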
+ + // Note that the first attempted Ribbon seed is determined by the hash + // of the first key added (for pseudorandomness in practice, determinism in + // testing) + + ResetPolicy(2); // num_probes = 1 + for (int key = 0; key < 2087; key++) { + Add(Key(key, buffer)); + } + Build(); + EXPECT_EQ_Bloom(GetNumProbesFromFilterData(), 1); + EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 61); + + EXPECT_EQ_LegacyBloom( + BloomHash(FilterData()), + SelectByCacheLineSize(1567096579, 1964771444, 2659542661U)); + EXPECT_EQ_FastBloom(BloomHash(FilterData()), 3817481309U); + EXPECT_EQ_Ribbon(BloomHash(FilterData()), 1705851228U); + + EXPECT_EQ_FastBloom("11,13,17,25,29,30,35,37,45,53", FirstFPs(10)); + EXPECT_EQ_Ribbon("3,8,10,17,19,20,23,28,31,32", FirstFPs(10)); + + ResetPolicy(3); // num_probes = 2 + for (int key = 0; key < 2087; key++) { + Add(Key(key, buffer)); + } + Build(); + EXPECT_EQ_Bloom(GetNumProbesFromFilterData(), 2); + EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 61); + + EXPECT_EQ_LegacyBloom( + BloomHash(FilterData()), + SelectByCacheLineSize(2707206547U, 2571983456U, 218344685)); + EXPECT_EQ_FastBloom(BloomHash(FilterData()), 2807269961U); + EXPECT_EQ_Ribbon(BloomHash(FilterData()), 1095342358U); + + EXPECT_EQ_FastBloom("4,15,17,24,27,28,29,53,63,70", FirstFPs(10)); + EXPECT_EQ_Ribbon("3,17,20,28,32,33,36,43,49,54", FirstFPs(10)); + + ResetPolicy(5); // num_probes = 3 + for (int key = 0; key < 2087; key++) { + Add(Key(key, buffer)); + } + Build(); + EXPECT_EQ_Bloom(GetNumProbesFromFilterData(), 3); + EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 61); + + EXPECT_EQ_LegacyBloom( + BloomHash(FilterData()), + SelectByCacheLineSize(515748486, 94611728, 2436112214U)); + EXPECT_EQ_FastBloom(BloomHash(FilterData()), 204628445U); + EXPECT_EQ_Ribbon(BloomHash(FilterData()), 3971337699U); + + EXPECT_EQ_FastBloom("15,24,29,39,53,87,89,100,103,104", FirstFPs(10)); + EXPECT_EQ_Ribbon("3,33,36,43,67,70,76,78,84,102", FirstFPs(10)); + + ResetPolicy(8); // num_probes = 5 + for (int key = 0; key < 2087; key++) { + Add(Key(key, buffer)); + } + Build(); + EXPECT_EQ_Bloom(GetNumProbesFromFilterData(), 5); + EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 61); + + EXPECT_EQ_LegacyBloom( + BloomHash(FilterData()), + SelectByCacheLineSize(1302145999, 2811644657U, 756553699)); + EXPECT_EQ_FastBloom(BloomHash(FilterData()), 355564975U); + EXPECT_EQ_Ribbon(BloomHash(FilterData()), 3651449053U); + + EXPECT_EQ_FastBloom("16,60,66,126,220,238,244,256,265,287", FirstFPs(10)); + EXPECT_EQ_Ribbon("33,187,203,296,300,322,411,419,547,582", FirstFPs(10)); + + ResetPolicy(9); // num_probes = 6 + for (int key = 0; key < 2087; key++) { + Add(Key(key, buffer)); + } + Build(); + EXPECT_EQ_Bloom(GetNumProbesFromFilterData(), 6); + EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 61); + + EXPECT_EQ_LegacyBloom( + BloomHash(FilterData()), + SelectByCacheLineSize(2092755149, 661139132, 1182970461)); + EXPECT_EQ_FastBloom(BloomHash(FilterData()), 2137566013U); + EXPECT_EQ_Ribbon(BloomHash(FilterData()), 1005676675U); + + EXPECT_EQ_FastBloom("156,367,791,872,945,1015,1139,1159,1265", FirstFPs(9)); + EXPECT_EQ_Ribbon("33,187,203,296,411,419,604,612,615,619", FirstFPs(10)); + + ResetPolicy(11); // num_probes = 7 + for (int key = 0; key < 2087; key++) { + Add(Key(key, buffer)); + } + Build(); + EXPECT_EQ_Bloom(GetNumProbesFromFilterData(), 7); + EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 61); + + EXPECT_EQ_LegacyBloom( + BloomHash(FilterData()), + SelectByCacheLineSize(3755609649U, 1812694762, 1449142939)); + 
EXPECT_EQ_FastBloom(BloomHash(FilterData()), 2561502687U); + EXPECT_EQ_Ribbon(BloomHash(FilterData()), 3129900846U); + + EXPECT_EQ_FastBloom("34,74,130,236,643,882,962,1015,1035,1110", FirstFPs(10)); + EXPECT_EQ_Ribbon("411,419,623,665,727,794,955,1052,1323,1330", FirstFPs(10)); + + // This used to be 9 probes, but 8 is a better choice for speed, + // especially with SIMD groups of 8 probes, with essentially no + // change in FP rate. + // FP rate @ 9 probes, old Bloom: 0.4321% + // FP rate @ 9 probes, new Bloom: 0.1846% + // FP rate @ 8 probes, new Bloom: 0.1843% + ResetPolicy(14); // num_probes = 8 (new), 9 (old) + for (int key = 0; key < 2087; key++) { + Add(Key(key, buffer)); + } + Build(); + EXPECT_EQ_LegacyBloom(GetNumProbesFromFilterData(), 9); + EXPECT_EQ_FastBloom(GetNumProbesFromFilterData(), 8); + EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 61); + + EXPECT_EQ_LegacyBloom( + BloomHash(FilterData()), + SelectByCacheLineSize(178861123, 379087593, 2574136516U)); + EXPECT_EQ_FastBloom(BloomHash(FilterData()), 3709876890U); + EXPECT_EQ_Ribbon(BloomHash(FilterData()), 1855638875U); + + EXPECT_EQ_FastBloom("130,240,522,565,989,2002,2526,3147,3543", FirstFPs(9)); + EXPECT_EQ_Ribbon("665,727,1323,1755,3866,4232,4442,4492,4736", FirstFPs(9)); + + // This used to be 11 probes, but 9 is a better choice for speed + // AND accuracy. + // FP rate @ 11 probes, old Bloom: 0.3571% + // FP rate @ 11 probes, new Bloom: 0.0884% + // FP rate @ 9 probes, new Bloom: 0.0843% + ResetPolicy(16); // num_probes = 9 (new), 11 (old) + for (int key = 0; key < 2087; key++) { + Add(Key(key, buffer)); + } + Build(); + EXPECT_EQ_LegacyBloom(GetNumProbesFromFilterData(), 11); + EXPECT_EQ_FastBloom(GetNumProbesFromFilterData(), 9); + EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 61); + + EXPECT_EQ_LegacyBloom( + BloomHash(FilterData()), + SelectByCacheLineSize(1129406313, 3049154394U, 1727750964)); + EXPECT_EQ_FastBloom(BloomHash(FilterData()), 1087138490U); + EXPECT_EQ_Ribbon(BloomHash(FilterData()), 459379967U); + + EXPECT_EQ_FastBloom("3299,3611,3916,6620,7822,8079,8482,8942", FirstFPs(8)); + EXPECT_EQ_Ribbon("727,1323,1755,4442,4736,5386,6974,7154,8222", FirstFPs(9)); + + ResetPolicy(10); // num_probes = 6, but different memory ratio vs. 
9 + for (int key = 0; key < 2087; key++) { + Add(Key(key, buffer)); + } + Build(); + EXPECT_EQ_Bloom(GetNumProbesFromFilterData(), 6); + EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 61); + + EXPECT_EQ_LegacyBloom( + BloomHash(FilterData()), + SelectByCacheLineSize(1478976371, 2910591341U, 1182970461)); + EXPECT_EQ_FastBloom(BloomHash(FilterData()), 2498541272U); + EXPECT_EQ_Ribbon(BloomHash(FilterData()), 1273231667U); + + EXPECT_EQ_FastBloom("16,126,133,422,466,472,813,1002,1035", FirstFPs(9)); + EXPECT_EQ_Ribbon("296,411,419,612,619,623,630,665,686,727", FirstFPs(10)); + + ResetPolicy(10); + for (int key = /*CHANGED*/ 1; key < 2087; key++) { + Add(Key(key, buffer)); + } + Build(); + EXPECT_EQ_Bloom(GetNumProbesFromFilterData(), 6); + EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), /*CHANGED*/ 184); + + EXPECT_EQ_LegacyBloom( + BloomHash(FilterData()), + SelectByCacheLineSize(4205696321U, 1132081253U, 2385981855U)); + EXPECT_EQ_FastBloom(BloomHash(FilterData()), 2058382345U); + EXPECT_EQ_Ribbon(BloomHash(FilterData()), 3007790572U); + + EXPECT_EQ_FastBloom("16,126,133,422,466,472,813,1002,1035", FirstFPs(9)); + EXPECT_EQ_Ribbon("33,152,383,497,589,633,737,781,911,990", FirstFPs(10)); + + ResetPolicy(10); + for (int key = 1; key < /*CHANGED*/ 2088; key++) { + Add(Key(key, buffer)); + } + Build(); + EXPECT_EQ_Bloom(GetNumProbesFromFilterData(), 6); + EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 184); + + EXPECT_EQ_LegacyBloom( + BloomHash(FilterData()), + SelectByCacheLineSize(2885052954U, 769447944, 4175124908U)); + EXPECT_EQ_FastBloom(BloomHash(FilterData()), 23699164U); + EXPECT_EQ_Ribbon(BloomHash(FilterData()), 1942323379U); + + EXPECT_EQ_FastBloom("16,126,133,422,466,472,813,1002,1035", FirstFPs(9)); + EXPECT_EQ_Ribbon("33,95,360,589,737,911,990,1048,1081,1414", FirstFPs(10)); + + // With new fractional bits_per_key, check that we are rounding to + // whole bits per key for old Bloom filters but fractional for + // new Bloom filter. + ResetPolicy(9.5); + for (int key = 1; key < 2088; key++) { + Add(Key(key, buffer)); + } + Build(); + EXPECT_EQ_Bloom(GetNumProbesFromFilterData(), 6); + EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 184); + + EXPECT_EQ_LegacyBloom( + BloomHash(FilterData()), + /*SAME*/ SelectByCacheLineSize(2885052954U, 769447944, 4175124908U)); + EXPECT_EQ_FastBloom(BloomHash(FilterData()), 3166884174U); + EXPECT_EQ_Ribbon(BloomHash(FilterData()), 1148258663U); + + EXPECT_EQ_FastBloom("126,156,367,444,458,791,813,976,1015", FirstFPs(9)); + EXPECT_EQ_Ribbon("33,54,95,360,589,693,737,911,990,1048", FirstFPs(10)); + + ResetPolicy(10.499); + for (int key = 1; key < 2088; key++) { + Add(Key(key, buffer)); + } + Build(); + EXPECT_EQ_LegacyBloom(GetNumProbesFromFilterData(), 6); + EXPECT_EQ_FastBloom(GetNumProbesFromFilterData(), 7); + EXPECT_EQ_Ribbon(GetRibbonSeedFromFilterData(), 184); + + EXPECT_EQ_LegacyBloom( + BloomHash(FilterData()), + /*SAME*/ SelectByCacheLineSize(2885052954U, 769447944, 4175124908U)); + EXPECT_EQ_FastBloom(BloomHash(FilterData()), 4098502778U); + EXPECT_EQ_Ribbon(BloomHash(FilterData()), 792138188U); + + EXPECT_EQ_FastBloom("16,236,240,472,1015,1045,1111,1409,1465", FirstFPs(9)); + EXPECT_EQ_Ribbon("33,95,360,589,737,990,1048,1081,1414,1643", FirstFPs(10)); + + ResetPolicy(); +} + +// A helper class for testing custom or corrupt filter bits as read by +// built-in FilterBitsReaders. +struct RawFilterTester { + // Buffer, from which we always return a tail Slice, so the + // last five bytes are always the metadata bytes. 
+ std::array data_; + // Points five bytes from the end + char* metadata_ptr_; + + RawFilterTester() : metadata_ptr_(&*(data_.end() - 5)) {} + + Slice ResetNoFill(uint32_t len_without_metadata, uint32_t num_lines, + uint32_t num_probes) { + metadata_ptr_[0] = static_cast(num_probes); + EncodeFixed32(metadata_ptr_ + 1, num_lines); + uint32_t len = len_without_metadata + /*metadata*/ 5; + assert(len <= data_.size()); + return Slice(metadata_ptr_ - len_without_metadata, len); + } + + Slice Reset(uint32_t len_without_metadata, uint32_t num_lines, + uint32_t num_probes, bool fill_ones) { + data_.fill(fill_ones ? 0xff : 0); + return ResetNoFill(len_without_metadata, num_lines, num_probes); + } + + Slice ResetWeirdFill(uint32_t len_without_metadata, uint32_t num_lines, + uint32_t num_probes) { + for (uint32_t i = 0; i < data_.size(); ++i) { + data_[i] = static_cast(0x7b7b >> (i % 7)); + } + return ResetNoFill(len_without_metadata, num_lines, num_probes); + } +}; + +TEST_P(FullBloomTest, RawSchema) { + RawFilterTester cft; + // Legacy Bloom configurations + // Two probes, about 3/4 bits set: ~50% "FP" rate + // One 256-byte cache line. + OpenRaw(cft.ResetWeirdFill(256, 1, 2)); + EXPECT_EQ(uint64_t{11384799501900898790U}, PackedMatches()); + + // Two 128-byte cache lines. + OpenRaw(cft.ResetWeirdFill(256, 2, 2)); + EXPECT_EQ(uint64_t{10157853359773492589U}, PackedMatches()); + + // Four 64-byte cache lines. + OpenRaw(cft.ResetWeirdFill(256, 4, 2)); + EXPECT_EQ(uint64_t{7123594913907464682U}, PackedMatches()); + + // Fast local Bloom configurations (marker 255 -> -1) + // Two probes, about 3/4 bits set: ~50% "FP" rate + // Four 64-byte cache lines. + OpenRaw(cft.ResetWeirdFill(256, 2U << 8, 255)); + EXPECT_EQ(uint64_t{9957045189927952471U}, PackedMatches()); + + // Ribbon configurations (marker 254 -> -2) + + // Even though the builder never builds configurations this + // small (preferring Bloom), we can test that the configuration + // can be read, for possible future-proofing. 
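+  // As in CorruptFilters below, the num_lines word for a Ribbon filter
+  // encodes (num_blocks << 8) | seed, and each block covers 128 slots, so
+  // 2 blocks give the 256 slots used here.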
+ + // 256 slots, one result column = 32 bytes (2 blocks, seed 0) + // ~50% FP rate: + // 0b0101010111110101010000110000011011011111100100001110010011101010 + OpenRaw(cft.ResetWeirdFill(32, 2U << 8, 254)); + EXPECT_EQ(uint64_t{6193930559317665002U}, PackedMatches()); + + // 256 slots, three-to-four result columns = 112 bytes + // ~ 1 in 10 FP rate: + // 0b0000000000100000000000000000000001000001000000010000101000000000 + OpenRaw(cft.ResetWeirdFill(112, 2U << 8, 254)); + EXPECT_EQ(uint64_t{9007200345328128U}, PackedMatches()); +} + +TEST_P(FullBloomTest, CorruptFilters) { + RawFilterTester cft; + + for (bool fill : {false, true}) { + // Legacy Bloom configurations + // Good filter bits - returns same as fill + OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 6, fill)); + ASSERT_EQ(fill, Matches("hello")); + ASSERT_EQ(fill, Matches("world")); + + // Good filter bits - returns same as fill + OpenRaw(cft.Reset(CACHE_LINE_SIZE * 3, 3, 6, fill)); + ASSERT_EQ(fill, Matches("hello")); + ASSERT_EQ(fill, Matches("world")); + + // Good filter bits - returns same as fill + // 256 is unusual but legal cache line size + OpenRaw(cft.Reset(256 * 3, 3, 6, fill)); + ASSERT_EQ(fill, Matches("hello")); + ASSERT_EQ(fill, Matches("world")); + + // Good filter bits - returns same as fill + // 30 should be max num_probes + OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 30, fill)); + ASSERT_EQ(fill, Matches("hello")); + ASSERT_EQ(fill, Matches("world")); + + // Good filter bits - returns same as fill + // 1 should be min num_probes + OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 1, fill)); + ASSERT_EQ(fill, Matches("hello")); + ASSERT_EQ(fill, Matches("world")); + + // Type 1 trivial filter bits - returns true as if FP by zero probes + OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 0, fill)); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + + // Type 2 trivial filter bits - returns false as if built from zero keys + OpenRaw(cft.Reset(0, 0, 6, fill)); + ASSERT_FALSE(Matches("hello")); + ASSERT_FALSE(Matches("world")); + + // Type 2 trivial filter bits - returns false as if built from zero keys + OpenRaw(cft.Reset(0, 37, 6, fill)); + ASSERT_FALSE(Matches("hello")); + ASSERT_FALSE(Matches("world")); + + // Type 2 trivial filter bits - returns false as 0 size trumps 0 probes + OpenRaw(cft.Reset(0, 0, 0, fill)); + ASSERT_FALSE(Matches("hello")); + ASSERT_FALSE(Matches("world")); + + // Bad filter bits - returns true for safety + // No solution to 0 * x == CACHE_LINE_SIZE + OpenRaw(cft.Reset(CACHE_LINE_SIZE, 0, 6, fill)); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + + // Bad filter bits - returns true for safety + // Can't have 3 * x == 4 for integer x + OpenRaw(cft.Reset(4, 3, 6, fill)); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + + // Bad filter bits - returns true for safety + // 97 bytes is not a power of two, so not a legal cache line size + OpenRaw(cft.Reset(97 * 3, 3, 6, fill)); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + + // Bad filter bits - returns true for safety + // 65 bytes is not a power of two, so not a legal cache line size + OpenRaw(cft.Reset(65 * 3, 3, 6, fill)); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + + // Bad filter bits - returns false as if built from zero keys + // < 5 bytes overall means missing even metadata + OpenRaw(cft.Reset(static_cast(-1), 3, 6, fill)); + ASSERT_FALSE(Matches("hello")); + ASSERT_FALSE(Matches("world")); + + OpenRaw(cft.Reset(static_cast(-5), 3, 6, fill)); + ASSERT_FALSE(Matches("hello")); + 
ASSERT_FALSE(Matches("world")); + + // Dubious filter bits - returns same as fill (for now) + // 31 is not a useful num_probes, nor generated by RocksDB unless directly + // using filter bits API without BloomFilterPolicy. + OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 31, fill)); + ASSERT_EQ(fill, Matches("hello")); + ASSERT_EQ(fill, Matches("world")); + + // Dubious filter bits - returns same as fill (for now) + // Similar, with 127, largest positive char + OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 127, fill)); + ASSERT_EQ(fill, Matches("hello")); + ASSERT_EQ(fill, Matches("world")); + + // Dubious filter bits - returns true (for now) + // num_probes set to 128 / -128, lowest negative char + // NB: Bug in implementation interprets this as negative and has same + // effect as zero probes, but effectively reserves negative char values + // for future use. + OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 128, fill)); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + + // Dubious filter bits - returns true (for now) + // Similar, with 253 / -3 + OpenRaw(cft.Reset(CACHE_LINE_SIZE, 1, 253, fill)); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + + // ######################################################### + // Fast local Bloom configurations (marker 255 -> -1) + // Good config with six probes + OpenRaw(cft.Reset(CACHE_LINE_SIZE, 6U << 8, 255, fill)); + ASSERT_EQ(fill, Matches("hello")); + ASSERT_EQ(fill, Matches("world")); + + // Becomes bad/reserved config (always true) if any other byte set + OpenRaw(cft.Reset(CACHE_LINE_SIZE, (6U << 8) | 1U, 255, fill)); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + + OpenRaw(cft.Reset(CACHE_LINE_SIZE, (6U << 8) | (1U << 16), 255, fill)); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + + OpenRaw(cft.Reset(CACHE_LINE_SIZE, (6U << 8) | (1U << 24), 255, fill)); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + + // Good config, max 30 probes + OpenRaw(cft.Reset(CACHE_LINE_SIZE, 30U << 8, 255, fill)); + ASSERT_EQ(fill, Matches("hello")); + ASSERT_EQ(fill, Matches("world")); + + // Bad/reserved config (always true) if more than 30 + OpenRaw(cft.Reset(CACHE_LINE_SIZE, 31U << 8, 255, fill)); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + + OpenRaw(cft.Reset(CACHE_LINE_SIZE, 33U << 8, 255, fill)); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + + OpenRaw(cft.Reset(CACHE_LINE_SIZE, 66U << 8, 255, fill)); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + + OpenRaw(cft.Reset(CACHE_LINE_SIZE, 130U << 8, 255, fill)); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + } + + // ######################################################### + // Ribbon configurations (marker 254 -> -2) + // ("fill" doesn't work to detect good configurations, we just + // have to rely on TN probability) + + // Good: 2 blocks * 16 bytes / segment * 4 columns = 128 bytes + // seed = 123 + OpenRaw(cft.Reset(128, (2U << 8) + 123U, 254, false)); + ASSERT_FALSE(Matches("hello")); + ASSERT_FALSE(Matches("world")); + + // Good: 2 blocks * 16 bytes / segment * 8 columns = 256 bytes + OpenRaw(cft.Reset(256, (2U << 8) + 123U, 254, false)); + ASSERT_FALSE(Matches("hello")); + ASSERT_FALSE(Matches("world")); + + // Surprisingly OK: 5000 blocks (640,000 slots) in only 1024 bits + // -> average close to 0 columns + OpenRaw(cft.Reset(128, (5000U << 8) + 123U, 254, false)); + // *Almost* all FPs + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); 
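+  // (1024 bits spread over 640,000 slots is ~0.0016 result columns per slot,
+  // so a query is a false positive with probability roughly 2^-0.0016, i.e.
+  // about 99.9% of the time.)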
+ // Need many queries to find a "true negative" + for (int i = 0; Matches(std::to_string(i)); ++i) { + ASSERT_LT(i, 1000); + } + + // Bad: 1 block not allowed (for implementation detail reasons) + OpenRaw(cft.Reset(128, (1U << 8) + 123U, 254, false)); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + + // Bad: 0 blocks not allowed + OpenRaw(cft.Reset(128, (0U << 8) + 123U, 254, false)); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); +} + +INSTANTIATE_TEST_CASE_P(Full, FullBloomTest, + testing::Values(kLegacyBloom, kFastLocalBloom, + kStandard128Ribbon)); + +static double GetEffectiveBitsPerKey(FilterBitsBuilder* builder) { + union { + uint64_t key_value = 0; + char key_bytes[8]; + }; + + const unsigned kNumKeys = 1000; + + Slice key_slice{key_bytes, 8}; + for (key_value = 0; key_value < kNumKeys; ++key_value) { + builder->AddKey(key_slice); + } + + std::unique_ptr buf; + auto filter = builder->Finish(&buf); + return filter.size() * /*bits per byte*/ 8 / (1.0 * kNumKeys); +} + +static void SetTestingLevel(int levelish, FilterBuildingContext* ctx) { + if (levelish == -1) { + // Flush is treated as level -1 for this option but actually level 0 + ctx->level_at_creation = 0; + ctx->reason = TableFileCreationReason::kFlush; + } else { + ctx->level_at_creation = levelish; + ctx->reason = TableFileCreationReason::kCompaction; + } +} + +TEST(RibbonTest, RibbonTestLevelThreshold) { + BlockBasedTableOptions opts; + FilterBuildingContext ctx(opts); + // A few settings + for (CompactionStyle cs : {kCompactionStyleLevel, kCompactionStyleUniversal, + kCompactionStyleFIFO, kCompactionStyleNone}) { + ctx.compaction_style = cs; + for (int bloom_before_level : {-1, 0, 1, 10}) { + std::vector > policies; + policies.emplace_back(NewRibbonFilterPolicy(10, bloom_before_level)); + + if (bloom_before_level == 0) { + // Also test new API default + policies.emplace_back(NewRibbonFilterPolicy(10)); + } + + for (std::unique_ptr& policy : policies) { + // Claim to be generating filter for this level + SetTestingLevel(bloom_before_level, &ctx); + + std::unique_ptr builder{ + policy->GetBuilderWithContext(ctx)}; + + // Must be Ribbon (more space efficient than 10 bits per key) + ASSERT_LT(GetEffectiveBitsPerKey(builder.get()), 8); + + if (bloom_before_level >= 0) { + // Claim to be generating filter for previous level + SetTestingLevel(bloom_before_level - 1, &ctx); + + builder.reset(policy->GetBuilderWithContext(ctx)); + + if (cs == kCompactionStyleLevel || cs == kCompactionStyleUniversal) { + // Level is considered. + // Must be Bloom (~ 10 bits per key) + ASSERT_GT(GetEffectiveBitsPerKey(builder.get()), 9); + } else { + // Level is ignored under non-traditional compaction styles. 
+ // Must be Ribbon (more space efficient than 10 bits per key) + ASSERT_LT(GetEffectiveBitsPerKey(builder.get()), 8); + } + } + + // Like SST file writer + ctx.level_at_creation = -1; + ctx.reason = TableFileCreationReason::kMisc; + + builder.reset(policy->GetBuilderWithContext(ctx)); + + // Must be Ribbon (more space efficient than 10 bits per key) + ASSERT_LT(GetEffectiveBitsPerKey(builder.get()), 8); + } + } + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + ParseCommandLineFlags(&argc, &argv, true); + + return RUN_ALL_TESTS(); +} + +#endif // GFLAGS diff --git a/src/rocksdb/util/build_version.cc.in b/src/rocksdb/util/build_version.cc.in new file mode 100644 index 000000000..c1706dc1f --- /dev/null +++ b/src/rocksdb/util/build_version.cc.in @@ -0,0 +1,81 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +#include + +#include "rocksdb/version.h" +#include "rocksdb/utilities/object_registry.h" +#include "util/string_util.h" + +// The build script may replace these values with real values based +// on whether or not GIT is available and the platform settings +static const std::string rocksdb_build_git_sha = "rocksdb_build_git_sha:@GIT_SHA@"; +static const std::string rocksdb_build_git_tag = "rocksdb_build_git_tag:@GIT_TAG@"; +#define HAS_GIT_CHANGES @GIT_MOD@ +#if HAS_GIT_CHANGES == 0 +// If HAS_GIT_CHANGES is 0, the GIT date is used. +// Use the time the branch/tag was last modified +static const std::string rocksdb_build_date = "rocksdb_build_date:@GIT_DATE@"; +#else +// If HAS_GIT_CHANGES is > 0, the branch/tag has modifications. +// Use the time the build was created. +static const std::string rocksdb_build_date = "rocksdb_build_date:@BUILD_DATE@"; +#endif + +#ifndef ROCKSDB_LITE +extern "C" { +@ROCKSDB_PLUGIN_EXTERNS@ +} // extern "C" + +std::unordered_map ROCKSDB_NAMESPACE::ObjectRegistry::builtins_ = { + @ROCKSDB_PLUGIN_BUILTINS@ +}; +#endif //ROCKSDB_LITE + +namespace ROCKSDB_NAMESPACE { +static void AddProperty(std::unordered_map *props, const std::string& name) { + size_t colon = name.find(":"); + if (colon != std::string::npos && colon > 0 && colon < name.length() - 1) { + // If we found a "@:", then this property was a build-time substitution that failed. Skip it + size_t at = name.find("@", colon); + if (at != colon + 1) { + // Everything before the colon is the name, after is the value + (*props)[name.substr(0, colon)] = name.substr(colon + 1); + } + } +} + +static std::unordered_map* LoadPropertiesSet() { + auto * properties = new std::unordered_map(); + AddProperty(properties, rocksdb_build_git_sha); + AddProperty(properties, rocksdb_build_git_tag); + AddProperty(properties, rocksdb_build_date); + return properties; +} + +const std::unordered_map& GetRocksBuildProperties() { + static std::unique_ptr> props(LoadPropertiesSet()); + return *props; +} + +std::string GetRocksVersionAsString(bool with_patch) { + std::string version = std::to_string(ROCKSDB_MAJOR) + "." + std::to_string(ROCKSDB_MINOR); + if (with_patch) { + return version + "." 
+ std::to_string(ROCKSDB_PATCH);
+  } else {
+    return version;
+  }
+}
+
+std::string GetRocksBuildInfoAsString(const std::string& program, bool verbose) {
+  std::string info = program + " (RocksDB) " + GetRocksVersionAsString(true);
+  if (verbose) {
+    for (const auto& it : GetRocksBuildProperties()) {
+      info.append("\n ");
+      info.append(it.first);
+      info.append(": ");
+      info.append(it.second);
+    }
+  }
+  return info;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/cast_util.h b/src/rocksdb/util/cast_util.h
new file mode 100644
index 000000000..c91b6ff1e
--- /dev/null
+++ b/src/rocksdb/util/cast_util.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <type_traits>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The helper function to assert the move from dynamic_cast<> to
+// static_cast<> is correct. This function is to deal with legacy code.
+// It is not recommended for new code to rely on class casting. The preferred
+// solution is to implement the functionality without a need for casting.
+template <class DestClass, class SrcClass>
+inline DestClass* static_cast_with_check(SrcClass* x) {
+  DestClass* ret = static_cast<DestClass*>(x);
+#ifdef ROCKSDB_USE_RTTI
+  assert(ret == dynamic_cast<DestClass*>(x));
+#endif
+  return ret;
+}
+
+// A wrapper around static_cast for lossless conversion between integral
+// types, including enum types. For example, this can be used for converting
+// between signed/unsigned or enum type and underlying type without fear of
+// stripping away data, now or in the future.
+template <typename To, typename From>
+inline To lossless_cast(From x) {
+  using FromValue = typename std::remove_reference<From>::type;
+  static_assert(
+      std::is_integral<FromValue>::value || std::is_enum<FromValue>::value,
+      "Only works on integral types");
+  static_assert(std::is_integral<To>::value || std::is_enum<To>::value,
+                "Only works on integral types");
+  static_assert(sizeof(To) >= sizeof(FromValue), "Must be lossless");
+  return static_cast<To>(x);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/channel.h b/src/rocksdb/util/channel.h
new file mode 100644
index 000000000..19b956297
--- /dev/null
+++ b/src/rocksdb/util/channel.h
@@ -0,0 +1,69 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <condition_variable>
+#include <mutex>
+#include <queue>
+#include <utility>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <class T>
+class channel {
+ public:
+  explicit channel() : eof_(false) {}
+
+  channel(const channel&) = delete;
+  void operator=(const channel&) = delete;
+
+  void sendEof() {
+    std::lock_guard<std::mutex> lk(lock_);
+    eof_ = true;
+    cv_.notify_all();
+  }
+
+  bool eof() {
+    std::lock_guard<std::mutex> lk(lock_);
+    return buffer_.empty() && eof_;
+  }
+
+  size_t size() const {
+    std::lock_guard<std::mutex> lk(lock_);
+    return buffer_.size();
+  }
+
+  // writes elem to the queue
+  void write(T&& elem) {
+    std::unique_lock<std::mutex> lk(lock_);
+    buffer_.emplace(std::forward<T>(elem));
+    cv_.notify_one();
+  }
+
+  /// Moves a dequeued element onto elem, blocking until an element
+  /// is available.
+  // returns false if EOF
+  bool read(T& elem) {
+    std::unique_lock<std::mutex> lk(lock_);
+    cv_.wait(lk, [&] { return eof_ || !buffer_.empty(); });
+    if (eof_ && buffer_.empty()) {
+      return false;
+    }
+    elem = std::move(buffer_.front());
+    buffer_.pop();
+    cv_.notify_one();
+    return true;
+  }
+
+ private:
+  std::condition_variable cv_;
+  mutable std::mutex lock_;
+  std::queue<T> buffer_;
+  bool eof_;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/cleanable.cc b/src/rocksdb/util/cleanable.cc
new file mode 100644
index 000000000..89a7ab9be
--- /dev/null
+++ b/src/rocksdb/util/cleanable.cc
@@ -0,0 +1,181 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/cleanable.h"
+
+#include <atomic>
+#include <cassert>
+#include <utility>
+
+namespace ROCKSDB_NAMESPACE {
+
+Cleanable::Cleanable() {
+  cleanup_.function = nullptr;
+  cleanup_.next = nullptr;
+}
+
+Cleanable::~Cleanable() { DoCleanup(); }
+
+Cleanable::Cleanable(Cleanable&& other) noexcept { *this = std::move(other); }
+
+Cleanable& Cleanable::operator=(Cleanable&& other) noexcept {
+  assert(this != &other);  // https://stackoverflow.com/a/9322542/454544
+  cleanup_ = other.cleanup_;
+  other.cleanup_.function = nullptr;
+  other.cleanup_.next = nullptr;
+  return *this;
+}
+
+// If the entire linked list was on the heap we could have simply attached one
+// linked list to another. However, the head is an embedded object, to avoid
+// the cost of creating objects for most of the use cases when the Cleanable
+// has only one Cleanup to do. We could put everything on the heap if
+// benchmarks show no negative impact on performance.
+// Also we need to iterate on the linked list since there is no pointer to the
+// tail. We can add a tail pointer, but maintaining it might negatively impact
+// the performance for the common case of one cleanup where the tail pointer is
+// not needed. Again, benchmarks could clarify that.
+// Even without a tail pointer we could iterate on the list, find the tail, and
+// have only that node updated without the need to insert the Cleanups one by
+// one. This however would be redundant when the source Cleanable has one or a
+// few Cleanups, which is the case most of the time.
+// TODO(myabandeh): if the list is too long we should maintain a tail pointer
+// and have the entire list (minus the head that has to be inserted separately)
+// merged with the target linked list at once.
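+// Usage sketch (illustrative only; DeleteCharBuf is a hypothetical helper
+// matching the CleanupFunction signature): a Cleanable can own a deferred
+// deallocation and later hand that obligation to another Cleanable:
+//
+//   static void DeleteCharBuf(void* arg1, void* /*arg2*/) {
+//     delete[] static_cast<char*>(arg1);
+//   }
+//
+//   Cleanable a;
+//   a.RegisterCleanup(&DeleteCharBuf, new char[4096], nullptr);
+//   Cleanable b;
+//   a.DelegateCleanupsTo(&b);  // the buffer is now freed when b cleans up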
+void Cleanable::DelegateCleanupsTo(Cleanable* other) { + assert(other != nullptr); + if (cleanup_.function == nullptr) { + return; + } + Cleanup* c = &cleanup_; + other->RegisterCleanup(c->function, c->arg1, c->arg2); + c = c->next; + while (c != nullptr) { + Cleanup* next = c->next; + other->RegisterCleanup(c); + c = next; + } + cleanup_.function = nullptr; + cleanup_.next = nullptr; +} + +void Cleanable::RegisterCleanup(Cleanable::Cleanup* c) { + assert(c != nullptr); + if (cleanup_.function == nullptr) { + cleanup_.function = c->function; + cleanup_.arg1 = c->arg1; + cleanup_.arg2 = c->arg2; + delete c; + } else { + c->next = cleanup_.next; + cleanup_.next = c; + } +} + +void Cleanable::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) { + assert(func != nullptr); + Cleanup* c; + if (cleanup_.function == nullptr) { + c = &cleanup_; + } else { + c = new Cleanup; + c->next = cleanup_.next; + cleanup_.next = c; + } + c->function = func; + c->arg1 = arg1; + c->arg2 = arg2; +} + +struct SharedCleanablePtr::Impl : public Cleanable { + std::atomic ref_count{1}; // Start with 1 ref + void Ref() { ref_count.fetch_add(1, std::memory_order_relaxed); } + void Unref() { + if (ref_count.fetch_sub(1, std::memory_order_relaxed) == 1) { + // Last ref + delete this; + } + } + static void UnrefWrapper(void* arg1, void* /*arg2*/) { + static_cast(arg1)->Unref(); + } +}; + +void SharedCleanablePtr::Reset() { + if (ptr_) { + ptr_->Unref(); + ptr_ = nullptr; + } +} + +void SharedCleanablePtr::Allocate() { + Reset(); + ptr_ = new Impl(); +} + +SharedCleanablePtr::SharedCleanablePtr(const SharedCleanablePtr& from) { + *this = from; +} + +SharedCleanablePtr::SharedCleanablePtr(SharedCleanablePtr&& from) noexcept { + *this = std::move(from); +} + +SharedCleanablePtr& SharedCleanablePtr::operator=( + const SharedCleanablePtr& from) { + if (this != &from) { + Reset(); + ptr_ = from.ptr_; + if (ptr_) { + ptr_->Ref(); + } + } + return *this; +} + +SharedCleanablePtr& SharedCleanablePtr::operator=( + SharedCleanablePtr&& from) noexcept { + assert(this != &from); // https://stackoverflow.com/a/9322542/454544 + Reset(); + ptr_ = from.ptr_; + from.ptr_ = nullptr; + return *this; +} + +SharedCleanablePtr::~SharedCleanablePtr() { Reset(); } + +Cleanable& SharedCleanablePtr::operator*() { + return *ptr_; // implicit upcast +} + +Cleanable* SharedCleanablePtr::operator->() { + return ptr_; // implicit upcast +} + +Cleanable* SharedCleanablePtr::get() { + return ptr_; // implicit upcast +} + +void SharedCleanablePtr::RegisterCopyWith(Cleanable* target) { + if (ptr_) { + // "Virtual" copy of the pointer + ptr_->Ref(); + target->RegisterCleanup(&Impl::UnrefWrapper, ptr_, nullptr); + } +} + +void SharedCleanablePtr::MoveAsCleanupTo(Cleanable* target) { + if (ptr_) { + // "Virtual" move of the pointer + target->RegisterCleanup(&Impl::UnrefWrapper, ptr_, nullptr); + ptr_ = nullptr; + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/coding.cc b/src/rocksdb/util/coding.cc new file mode 100644 index 000000000..3da8afaa2 --- /dev/null +++ b/src/rocksdb/util/coding.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/coding.h" + +#include + +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" + +namespace ROCKSDB_NAMESPACE { + +// conversion' conversion from 'type1' to 'type2', possible loss of data +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4244) +#endif +char* EncodeVarint32(char* dst, uint32_t v) { + // Operate on characters as unsigneds + unsigned char* ptr = reinterpret_cast(dst); + static const int B = 128; + if (v < (1 << 7)) { + *(ptr++) = v; + } else if (v < (1 << 14)) { + *(ptr++) = v | B; + *(ptr++) = v >> 7; + } else if (v < (1 << 21)) { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = v >> 14; + } else if (v < (1 << 28)) { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = v >> 21; + } else { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = (v >> 21) | B; + *(ptr++) = v >> 28; + } + return reinterpret_cast(ptr); +} +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +const char* GetVarint32PtrFallback(const char* p, const char* limit, + uint32_t* value) { + uint32_t result = 0; + for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) { + uint32_t byte = *(reinterpret_cast(p)); + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return reinterpret_cast(p); + } + } + return nullptr; +} + +const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) { + uint64_t result = 0; + for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) { + uint64_t byte = *(reinterpret_cast(p)); + p++; + if (byte & 128) { + // More bytes are present + result |= ((byte & 127) << shift); + } else { + result |= (byte << shift); + *value = result; + return reinterpret_cast(p); + } + } + return nullptr; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/coding.h b/src/rocksdb/util/coding.h new file mode 100644 index 000000000..3168fd2fd --- /dev/null +++ b/src/rocksdb/util/coding.h @@ -0,0 +1,389 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Encoding independent of machine byte order: +// * Fixed-length numbers are encoded with least-significant byte first +// (little endian, native order on Intel and others) +// * In addition we support variable length "varint" encoding +// * Strings are encoded prefixed by their length in varint format +// +// Some related functions are provided in coding_lean.h + +#pragma once +#include +#include + +#include "port/port.h" +#include "rocksdb/slice.h" +#include "util/coding_lean.h" + +// Some processors does not allow unaligned access to memory +#if defined(__sparc) +#define PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED +#endif + +namespace ROCKSDB_NAMESPACE { + +// The maximum length of a varint in bytes for 64-bit. +const uint32_t kMaxVarint64Length = 10; + +// Standard Put... 
routines append to a string +extern void PutFixed16(std::string* dst, uint16_t value); +extern void PutFixed32(std::string* dst, uint32_t value); +extern void PutFixed64(std::string* dst, uint64_t value); +extern void PutVarint32(std::string* dst, uint32_t value); +extern void PutVarint32Varint32(std::string* dst, uint32_t value1, + uint32_t value2); +extern void PutVarint32Varint32Varint32(std::string* dst, uint32_t value1, + uint32_t value2, uint32_t value3); +extern void PutVarint64(std::string* dst, uint64_t value); +extern void PutVarint64Varint64(std::string* dst, uint64_t value1, + uint64_t value2); +extern void PutVarint32Varint64(std::string* dst, uint32_t value1, + uint64_t value2); +extern void PutVarint32Varint32Varint64(std::string* dst, uint32_t value1, + uint32_t value2, uint64_t value3); +extern void PutLengthPrefixedSlice(std::string* dst, const Slice& value); +extern void PutLengthPrefixedSliceParts(std::string* dst, + const SliceParts& slice_parts); +extern void PutLengthPrefixedSlicePartsWithPadding( + std::string* dst, const SliceParts& slice_parts, size_t pad_sz); + +// Standard Get... routines parse a value from the beginning of a Slice +// and advance the slice past the parsed value. +extern bool GetFixed64(Slice* input, uint64_t* value); +extern bool GetFixed32(Slice* input, uint32_t* value); +extern bool GetFixed16(Slice* input, uint16_t* value); +extern bool GetVarint32(Slice* input, uint32_t* value); +extern bool GetVarint64(Slice* input, uint64_t* value); +extern bool GetVarsignedint64(Slice* input, int64_t* value); +extern bool GetLengthPrefixedSlice(Slice* input, Slice* result); +// This function assumes data is well-formed. +extern Slice GetLengthPrefixedSlice(const char* data); + +extern Slice GetSliceUntil(Slice* slice, char delimiter); + +// Borrowed from +// https://github.com/facebook/fbthrift/blob/449a5f77f9f9bae72c9eb5e78093247eef185c04/thrift/lib/cpp/util/VarintUtils-inl.h#L202-L208 +constexpr inline uint64_t i64ToZigzag(const int64_t l) { + return (static_cast(l) << 1) ^ static_cast(l >> 63); +} +inline int64_t zigzagToI64(uint64_t n) { + return (n >> 1) ^ -static_cast(n & 1); +} + +// Pointer-based variants of GetVarint... These either store a value +// in *v and return a pointer just past the parsed value, or return +// nullptr on error. These routines only look at bytes in the range +// [p..limit-1] +extern const char* GetVarint32Ptr(const char* p, const char* limit, + uint32_t* v); +extern const char* GetVarint64Ptr(const char* p, const char* limit, + uint64_t* v); +inline const char* GetVarsignedint64Ptr(const char* p, const char* limit, + int64_t* value) { + uint64_t u = 0; + const char* ret = GetVarint64Ptr(p, limit, &u); + *value = zigzagToI64(u); + return ret; +} + +// Returns the length of the varint32 or varint64 encoding of "v" +extern int VarintLength(uint64_t v); + +// Lower-level versions of Put... that write directly into a character buffer +// and return a pointer just past the last byte written. 
+// REQUIRES: dst has enough space for the value being written +extern char* EncodeVarint32(char* dst, uint32_t value); +extern char* EncodeVarint64(char* dst, uint64_t value); + +// Internal routine for use by fallback path of GetVarint32Ptr +extern const char* GetVarint32PtrFallback(const char* p, const char* limit, + uint32_t* value); +inline const char* GetVarint32Ptr(const char* p, const char* limit, + uint32_t* value) { + if (p < limit) { + uint32_t result = *(reinterpret_cast(p)); + if ((result & 128) == 0) { + *value = result; + return p + 1; + } + } + return GetVarint32PtrFallback(p, limit, value); +} + +// Pull the last 8 bits and cast it to a character +inline void PutFixed16(std::string* dst, uint16_t value) { + if (port::kLittleEndian) { + dst->append(const_cast(reinterpret_cast(&value)), + sizeof(value)); + } else { + char buf[sizeof(value)]; + EncodeFixed16(buf, value); + dst->append(buf, sizeof(buf)); + } +} + +inline void PutFixed32(std::string* dst, uint32_t value) { + if (port::kLittleEndian) { + dst->append(const_cast(reinterpret_cast(&value)), + sizeof(value)); + } else { + char buf[sizeof(value)]; + EncodeFixed32(buf, value); + dst->append(buf, sizeof(buf)); + } +} + +inline void PutFixed64(std::string* dst, uint64_t value) { + if (port::kLittleEndian) { + dst->append(const_cast(reinterpret_cast(&value)), + sizeof(value)); + } else { + char buf[sizeof(value)]; + EncodeFixed64(buf, value); + dst->append(buf, sizeof(buf)); + } +} + +inline void PutVarint32(std::string* dst, uint32_t v) { + char buf[5]; + char* ptr = EncodeVarint32(buf, v); + dst->append(buf, static_cast(ptr - buf)); +} + +inline void PutVarint32Varint32(std::string* dst, uint32_t v1, uint32_t v2) { + char buf[10]; + char* ptr = EncodeVarint32(buf, v1); + ptr = EncodeVarint32(ptr, v2); + dst->append(buf, static_cast(ptr - buf)); +} + +inline void PutVarint32Varint32Varint32(std::string* dst, uint32_t v1, + uint32_t v2, uint32_t v3) { + char buf[15]; + char* ptr = EncodeVarint32(buf, v1); + ptr = EncodeVarint32(ptr, v2); + ptr = EncodeVarint32(ptr, v3); + dst->append(buf, static_cast(ptr - buf)); +} + +inline char* EncodeVarint64(char* dst, uint64_t v) { + static const unsigned int B = 128; + unsigned char* ptr = reinterpret_cast(dst); + while (v >= B) { + *(ptr++) = (v & (B - 1)) | B; + v >>= 7; + } + *(ptr++) = static_cast(v); + return reinterpret_cast(ptr); +} + +inline void PutVarint64(std::string* dst, uint64_t v) { + char buf[kMaxVarint64Length]; + char* ptr = EncodeVarint64(buf, v); + dst->append(buf, static_cast(ptr - buf)); +} + +inline void PutVarsignedint64(std::string* dst, int64_t v) { + char buf[kMaxVarint64Length]; + // Using Zigzag format to convert signed to unsigned + char* ptr = EncodeVarint64(buf, i64ToZigzag(v)); + dst->append(buf, static_cast(ptr - buf)); +} + +inline void PutVarint64Varint64(std::string* dst, uint64_t v1, uint64_t v2) { + char buf[20]; + char* ptr = EncodeVarint64(buf, v1); + ptr = EncodeVarint64(ptr, v2); + dst->append(buf, static_cast(ptr - buf)); +} + +inline void PutVarint32Varint64(std::string* dst, uint32_t v1, uint64_t v2) { + char buf[15]; + char* ptr = EncodeVarint32(buf, v1); + ptr = EncodeVarint64(ptr, v2); + dst->append(buf, static_cast(ptr - buf)); +} + +inline void PutVarint32Varint32Varint64(std::string* dst, uint32_t v1, + uint32_t v2, uint64_t v3) { + char buf[20]; + char* ptr = EncodeVarint32(buf, v1); + ptr = EncodeVarint32(ptr, v2); + ptr = EncodeVarint64(ptr, v3); + dst->append(buf, static_cast(ptr - buf)); +} + +inline void 
PutLengthPrefixedSlice(std::string* dst, const Slice& value) { + PutVarint32(dst, static_cast(value.size())); + dst->append(value.data(), value.size()); +} + +inline void PutLengthPrefixedSliceParts(std::string* dst, size_t total_bytes, + const SliceParts& slice_parts) { + for (int i = 0; i < slice_parts.num_parts; ++i) { + total_bytes += slice_parts.parts[i].size(); + } + PutVarint32(dst, static_cast(total_bytes)); + for (int i = 0; i < slice_parts.num_parts; ++i) { + dst->append(slice_parts.parts[i].data(), slice_parts.parts[i].size()); + } +} + +inline void PutLengthPrefixedSliceParts(std::string* dst, + const SliceParts& slice_parts) { + PutLengthPrefixedSliceParts(dst, /*total_bytes=*/0, slice_parts); +} + +inline void PutLengthPrefixedSlicePartsWithPadding( + std::string* dst, const SliceParts& slice_parts, size_t pad_sz) { + PutLengthPrefixedSliceParts(dst, /*total_bytes=*/pad_sz, slice_parts); + dst->append(pad_sz, '\0'); +} + +inline int VarintLength(uint64_t v) { + int len = 1; + while (v >= 128) { + v >>= 7; + len++; + } + return len; +} + +inline bool GetFixed64(Slice* input, uint64_t* value) { + if (input->size() < sizeof(uint64_t)) { + return false; + } + *value = DecodeFixed64(input->data()); + input->remove_prefix(sizeof(uint64_t)); + return true; +} + +inline bool GetFixed32(Slice* input, uint32_t* value) { + if (input->size() < sizeof(uint32_t)) { + return false; + } + *value = DecodeFixed32(input->data()); + input->remove_prefix(sizeof(uint32_t)); + return true; +} + +inline bool GetFixed16(Slice* input, uint16_t* value) { + if (input->size() < sizeof(uint16_t)) { + return false; + } + *value = DecodeFixed16(input->data()); + input->remove_prefix(sizeof(uint16_t)); + return true; +} + +inline bool GetVarint32(Slice* input, uint32_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint32Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, static_cast(limit - q)); + return true; + } +} + +inline bool GetVarint64(Slice* input, uint64_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarint64Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, static_cast(limit - q)); + return true; + } +} + +inline bool GetVarsignedint64(Slice* input, int64_t* value) { + const char* p = input->data(); + const char* limit = p + input->size(); + const char* q = GetVarsignedint64Ptr(p, limit, value); + if (q == nullptr) { + return false; + } else { + *input = Slice(q, static_cast(limit - q)); + return true; + } +} + +inline bool GetLengthPrefixedSlice(Slice* input, Slice* result) { + uint32_t len = 0; + if (GetVarint32(input, &len) && input->size() >= len) { + *result = Slice(input->data(), len); + input->remove_prefix(len); + return true; + } else { + return false; + } +} + +inline Slice GetLengthPrefixedSlice(const char* data) { + uint32_t len = 0; + // +5: we assume "data" is not corrupted + // unsigned char is 7 bits, uint32_t is 32 bits, need 5 unsigned char + auto p = GetVarint32Ptr(data, data + 5 /* limit */, &len); + return Slice(p, len); +} + +inline Slice GetSliceUntil(Slice* slice, char delimiter) { + uint32_t len = 0; + for (len = 0; len < slice->size() && slice->data()[len] != delimiter; ++len) { + // nothing + } + + Slice ret(slice->data(), len); + slice->remove_prefix(len + ((len < slice->size()) ? 
1 : 0)); + return ret; +} + +template +#ifdef ROCKSDB_UBSAN_RUN +#if defined(__clang__) +__attribute__((__no_sanitize__("alignment"))) +#elif defined(__GNUC__) +__attribute__((__no_sanitize_undefined__)) +#endif +#endif +inline void +PutUnaligned(T* memory, const T& value) { +#if defined(PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED) + char* nonAlignedMemory = reinterpret_cast(memory); + memcpy(nonAlignedMemory, reinterpret_cast(&value), sizeof(T)); +#else + *memory = value; +#endif +} + +template +#ifdef ROCKSDB_UBSAN_RUN +#if defined(__clang__) +__attribute__((__no_sanitize__("alignment"))) +#elif defined(__GNUC__) +__attribute__((__no_sanitize_undefined__)) +#endif +#endif +inline void +GetUnaligned(const T* memory, T* value) { +#if defined(PLATFORM_UNALIGNED_ACCESS_NOT_ALLOWED) + char* nonAlignedMemory = reinterpret_cast(value); + memcpy(nonAlignedMemory, reinterpret_cast(memory), sizeof(T)); +#else + *value = *memory; +#endif +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/coding_lean.h b/src/rocksdb/util/coding_lean.h new file mode 100644 index 000000000..6966f7a66 --- /dev/null +++ b/src/rocksdb/util/coding_lean.h @@ -0,0 +1,101 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// Encoding independent of machine byte order: +// * Fixed-length numbers are encoded with least-significant byte first +// (little endian, native order on Intel and others) +// +// More functions in coding.h + +#pragma once + +#include +#include + +#include "port/port.h" // for port::kLittleEndian + +namespace ROCKSDB_NAMESPACE { + +// Lower-level versions of Put... that write directly into a character buffer +// REQUIRES: dst has enough space for the value being written +// -- Implementation of the functions declared above +inline void EncodeFixed16(char* buf, uint16_t value) { + if (port::kLittleEndian) { + memcpy(buf, &value, sizeof(value)); + } else { + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + } +} + +inline void EncodeFixed32(char* buf, uint32_t value) { + if (port::kLittleEndian) { + memcpy(buf, &value, sizeof(value)); + } else { + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; + } +} + +inline void EncodeFixed64(char* buf, uint64_t value) { + if (port::kLittleEndian) { + memcpy(buf, &value, sizeof(value)); + } else { + buf[0] = value & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; + buf[3] = (value >> 24) & 0xff; + buf[4] = (value >> 32) & 0xff; + buf[5] = (value >> 40) & 0xff; + buf[6] = (value >> 48) & 0xff; + buf[7] = (value >> 56) & 0xff; + } +} + +// Lower-level versions of Get... that read directly from a character buffer +// without any bounds checking. 
+ +inline uint16_t DecodeFixed16(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint16_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + return ((static_cast(static_cast(ptr[0]))) | + (static_cast(static_cast(ptr[1])) << 8)); + } +} + +inline uint32_t DecodeFixed32(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint32_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + return ((static_cast(static_cast(ptr[0]))) | + (static_cast(static_cast(ptr[1])) << 8) | + (static_cast(static_cast(ptr[2])) << 16) | + (static_cast(static_cast(ptr[3])) << 24)); + } +} + +inline uint64_t DecodeFixed64(const char* ptr) { + if (port::kLittleEndian) { + // Load the raw bytes + uint64_t result; + memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load + return result; + } else { + uint64_t lo = DecodeFixed32(ptr); + uint64_t hi = DecodeFixed32(ptr + 4); + return (hi << 32) | lo; + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/coding_test.cc b/src/rocksdb/util/coding_test.cc new file mode 100644 index 000000000..79dd7b82e --- /dev/null +++ b/src/rocksdb/util/coding_test.cc @@ -0,0 +1,217 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
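// Editorial sketch, not part of the patch: a compact round trip through the
// helpers declared in util/coding.h, foreshadowing what the tests below check
// in more depth. The byte values follow from EncodeVarint32 and EncodeFixed32
// above; the function name is illustrative only.
#include <cassert>
#include <cstdint>
#include <string>

#include "rocksdb/slice.h"
#include "util/coding.h"

static void CodingRoundTripSketch() {
  using namespace ROCKSDB_NAMESPACE;
  std::string buf;
  PutVarint32(&buf, 300);        // 300 = 0b10'0101100 -> bytes 0xAC 0x02
  PutVarsignedint64(&buf, -2);   // zigzag maps -2 to 3 before varint encoding
  PutFixed32(&buf, 0x04030201);  // little endian: bytes 01 02 03 04
  Slice in(buf);
  uint32_t u32 = 0;
  int64_t s64 = 0;
  uint32_t f32 = 0;
  assert(GetVarint32(&in, &u32) && u32 == 300);
  assert(GetVarsignedint64(&in, &s64) && s64 == -2);
  assert(GetFixed32(&in, &f32) && f32 == 0x04030201);
  assert(in.empty());  // every byte was consumed
}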
+ +#include "util/coding.h" + +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +class Coding {}; +TEST(Coding, Fixed16) { + std::string s; + for (uint16_t v = 0; v < 0xFFFF; v++) { + PutFixed16(&s, v); + } + + const char* p = s.data(); + for (uint16_t v = 0; v < 0xFFFF; v++) { + uint16_t actual = DecodeFixed16(p); + ASSERT_EQ(v, actual); + p += sizeof(uint16_t); + } +} + +TEST(Coding, Fixed32) { + std::string s; + for (uint32_t v = 0; v < 100000; v++) { + PutFixed32(&s, v); + } + + const char* p = s.data(); + for (uint32_t v = 0; v < 100000; v++) { + uint32_t actual = DecodeFixed32(p); + ASSERT_EQ(v, actual); + p += sizeof(uint32_t); + } +} + +TEST(Coding, Fixed64) { + std::string s; + for (int power = 0; power <= 63; power++) { + uint64_t v = static_cast(1) << power; + PutFixed64(&s, v - 1); + PutFixed64(&s, v + 0); + PutFixed64(&s, v + 1); + } + + const char* p = s.data(); + for (int power = 0; power <= 63; power++) { + uint64_t v = static_cast(1) << power; + uint64_t actual = 0; + actual = DecodeFixed64(p); + ASSERT_EQ(v - 1, actual); + p += sizeof(uint64_t); + + actual = DecodeFixed64(p); + ASSERT_EQ(v + 0, actual); + p += sizeof(uint64_t); + + actual = DecodeFixed64(p); + ASSERT_EQ(v + 1, actual); + p += sizeof(uint64_t); + } +} + +// Test that encoding routines generate little-endian encodings +TEST(Coding, EncodingOutput) { + std::string dst; + PutFixed32(&dst, 0x04030201); + ASSERT_EQ(4U, dst.size()); + ASSERT_EQ(0x01, static_cast(dst[0])); + ASSERT_EQ(0x02, static_cast(dst[1])); + ASSERT_EQ(0x03, static_cast(dst[2])); + ASSERT_EQ(0x04, static_cast(dst[3])); + + dst.clear(); + PutFixed64(&dst, 0x0807060504030201ull); + ASSERT_EQ(8U, dst.size()); + ASSERT_EQ(0x01, static_cast(dst[0])); + ASSERT_EQ(0x02, static_cast(dst[1])); + ASSERT_EQ(0x03, static_cast(dst[2])); + ASSERT_EQ(0x04, static_cast(dst[3])); + ASSERT_EQ(0x05, static_cast(dst[4])); + ASSERT_EQ(0x06, static_cast(dst[5])); + ASSERT_EQ(0x07, static_cast(dst[6])); + ASSERT_EQ(0x08, static_cast(dst[7])); +} + +TEST(Coding, Varint32) { + std::string s; + for (uint32_t i = 0; i < (32 * 32); i++) { + uint32_t v = (i / 32) << (i % 32); + PutVarint32(&s, v); + } + + const char* p = s.data(); + const char* limit = p + s.size(); + for (uint32_t i = 0; i < (32 * 32); i++) { + uint32_t expected = (i / 32) << (i % 32); + uint32_t actual = 0; + const char* start = p; + p = GetVarint32Ptr(p, limit, &actual); + ASSERT_TRUE(p != nullptr); + ASSERT_EQ(expected, actual); + ASSERT_EQ(VarintLength(actual), p - start); + } + ASSERT_EQ(p, s.data() + s.size()); +} + +TEST(Coding, Varint64) { + // Construct the list of values to check + std::vector values; + // Some special values + values.push_back(0); + values.push_back(100); + values.push_back(~static_cast(0)); + values.push_back(~static_cast(0) - 1); + for (uint32_t k = 0; k < 64; k++) { + // Test values near powers of two + const uint64_t power = 1ull << k; + values.push_back(power); + values.push_back(power - 1); + values.push_back(power + 1); + }; + + std::string s; + for (unsigned int i = 0; i < values.size(); i++) { + PutVarint64(&s, values[i]); + } + + const char* p = s.data(); + const char* limit = p + s.size(); + for (unsigned int i = 0; i < values.size(); i++) { + ASSERT_TRUE(p < limit); + uint64_t actual = 0; + const char* start = p; + p = GetVarint64Ptr(p, limit, &actual); + ASSERT_TRUE(p != nullptr); + ASSERT_EQ(values[i], actual); + ASSERT_EQ(VarintLength(actual), p - start); + } + ASSERT_EQ(p, limit); +} + +TEST(Coding, Varint32Overflow) { + uint32_t result; 
+ std::string input("\x81\x82\x83\x84\x85\x11"); + ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(), + &result) == nullptr); +} + +TEST(Coding, Varint32Truncation) { + uint32_t large_value = (1u << 31) + 100; + std::string s; + PutVarint32(&s, large_value); + uint32_t result; + for (unsigned int len = 0; len + 1 < s.size(); len++) { + ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == nullptr); + } + ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + s.size(), &result) != + nullptr); + ASSERT_EQ(large_value, result); +} + +TEST(Coding, Varint64Overflow) { + uint64_t result; + std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11"); + ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(), + &result) == nullptr); +} + +TEST(Coding, Varint64Truncation) { + uint64_t large_value = (1ull << 63) + 100ull; + std::string s; + PutVarint64(&s, large_value); + uint64_t result; + for (unsigned int len = 0; len + 1 < s.size(); len++) { + ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == nullptr); + } + ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + s.size(), &result) != + nullptr); + ASSERT_EQ(large_value, result); +} + +TEST(Coding, Strings) { + std::string s; + PutLengthPrefixedSlice(&s, Slice("")); + PutLengthPrefixedSlice(&s, Slice("foo")); + PutLengthPrefixedSlice(&s, Slice("bar")); + PutLengthPrefixedSlice(&s, Slice(std::string(200, 'x'))); + + Slice input(s); + Slice v; + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("foo", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ("bar", v.ToString()); + ASSERT_TRUE(GetLengthPrefixedSlice(&input, &v)); + ASSERT_EQ(std::string(200, 'x'), v.ToString()); + ASSERT_EQ("", input.ToString()); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/compaction_job_stats_impl.cc b/src/rocksdb/util/compaction_job_stats_impl.cc new file mode 100644 index 000000000..cfab2a4fe --- /dev/null +++ b/src/rocksdb/util/compaction_job_stats_impl.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "rocksdb/compaction_job_stats.h" + +namespace ROCKSDB_NAMESPACE { + +#ifndef ROCKSDB_LITE + +void CompactionJobStats::Reset() { + elapsed_micros = 0; + cpu_micros = 0; + + num_input_records = 0; + num_blobs_read = 0; + num_input_files = 0; + num_input_files_at_output_level = 0; + + num_output_records = 0; + num_output_files = 0; + num_output_files_blob = 0; + + is_full_compaction = false; + is_manual_compaction = false; + + total_input_bytes = 0; + total_blob_bytes_read = 0; + total_output_bytes = 0; + total_output_bytes_blob = 0; + + num_records_replaced = 0; + + total_input_raw_key_bytes = 0; + total_input_raw_value_bytes = 0; + + num_input_deletion_records = 0; + num_expired_deletion_records = 0; + + num_corrupt_keys = 0; + + file_write_nanos = 0; + file_range_sync_nanos = 0; + file_fsync_nanos = 0; + file_prepare_write_nanos = 0; + + smallest_output_key_prefix.clear(); + largest_output_key_prefix.clear(); + + num_single_del_fallthru = 0; + num_single_del_mismatch = 0; +} + +void CompactionJobStats::Add(const CompactionJobStats& stats) { + elapsed_micros += stats.elapsed_micros; + cpu_micros += stats.cpu_micros; + + num_input_records += stats.num_input_records; + num_blobs_read += stats.num_blobs_read; + num_input_files += stats.num_input_files; + num_input_files_at_output_level += stats.num_input_files_at_output_level; + + num_output_records += stats.num_output_records; + num_output_files += stats.num_output_files; + num_output_files_blob += stats.num_output_files_blob; + + total_input_bytes += stats.total_input_bytes; + total_blob_bytes_read += stats.total_blob_bytes_read; + total_output_bytes += stats.total_output_bytes; + total_output_bytes_blob += stats.total_output_bytes_blob; + + num_records_replaced += stats.num_records_replaced; + + total_input_raw_key_bytes += stats.total_input_raw_key_bytes; + total_input_raw_value_bytes += stats.total_input_raw_value_bytes; + + num_input_deletion_records += stats.num_input_deletion_records; + num_expired_deletion_records += stats.num_expired_deletion_records; + + num_corrupt_keys += stats.num_corrupt_keys; + + file_write_nanos += stats.file_write_nanos; + file_range_sync_nanos += stats.file_range_sync_nanos; + file_fsync_nanos += stats.file_fsync_nanos; + file_prepare_write_nanos += stats.file_prepare_write_nanos; + + num_single_del_fallthru += stats.num_single_del_fallthru; + num_single_del_mismatch += stats.num_single_del_mismatch; +} + +#else + +void CompactionJobStats::Reset() {} + +void CompactionJobStats::Add(const CompactionJobStats& /*stats*/) {} + +#endif // !ROCKSDB_LITE + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/comparator.cc b/src/rocksdb/util/comparator.cc new file mode 100644 index 000000000..f85ed69ee --- /dev/null +++ b/src/rocksdb/util/comparator.cc @@ -0,0 +1,391 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "rocksdb/comparator.h" + +#include + +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "port/lang.h" +#include "port/port.h" +#include "rocksdb/convenience.h" +#include "rocksdb/slice.h" +#include "rocksdb/utilities/customizable_util.h" +#include "rocksdb/utilities/object_registry.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +class BytewiseComparatorImpl : public Comparator { + public: + BytewiseComparatorImpl() {} + static const char* kClassName() { return "leveldb.BytewiseComparator"; } + const char* Name() const override { return kClassName(); } + + int Compare(const Slice& a, const Slice& b) const override { + return a.compare(b); + } + + bool Equal(const Slice& a, const Slice& b) const override { return a == b; } + + void FindShortestSeparator(std::string* start, + const Slice& limit) const override { + // Find length of common prefix + size_t min_length = std::min(start->size(), limit.size()); + size_t diff_index = 0; + while ((diff_index < min_length) && + ((*start)[diff_index] == limit[diff_index])) { + diff_index++; + } + + if (diff_index >= min_length) { + // Do not shorten if one string is a prefix of the other + } else { + uint8_t start_byte = static_cast((*start)[diff_index]); + uint8_t limit_byte = static_cast(limit[diff_index]); + if (start_byte >= limit_byte) { + // Cannot shorten since limit is smaller than start or start is + // already the shortest possible. + return; + } + assert(start_byte < limit_byte); + + if (diff_index < limit.size() - 1 || start_byte + 1 < limit_byte) { + (*start)[diff_index]++; + start->resize(diff_index + 1); + } else { + // v + // A A 1 A A A + // A A 2 + // + // Incrementing the current byte will make start bigger than limit, we + // will skip this byte, and find the first non 0xFF byte in start and + // increment it. + diff_index++; + + while (diff_index < start->size()) { + // Keep moving until we find the first non 0xFF byte to + // increment it + if (static_cast((*start)[diff_index]) < + static_cast(0xff)) { + (*start)[diff_index]++; + start->resize(diff_index + 1); + break; + } + diff_index++; + } + } + assert(Compare(*start, limit) < 0); + } + } + + void FindShortSuccessor(std::string* key) const override { + // Find first character that can be incremented + size_t n = key->size(); + for (size_t i = 0; i < n; i++) { + const uint8_t byte = (*key)[i]; + if (byte != static_cast(0xff)) { + (*key)[i] = byte + 1; + key->resize(i + 1); + return; + } + } + // *key is a run of 0xffs. Leave it alone. 
+ } + + bool IsSameLengthImmediateSuccessor(const Slice& s, + const Slice& t) const override { + if (s.size() != t.size() || s.size() == 0) { + return false; + } + size_t diff_ind = s.difference_offset(t); + // same slice + if (diff_ind >= s.size()) return false; + uint8_t byte_s = static_cast(s[diff_ind]); + uint8_t byte_t = static_cast(t[diff_ind]); + // first different byte must be consecutive, and remaining bytes must be + // 0xff for s and 0x00 for t + if (byte_s != uint8_t{0xff} && byte_s + 1 == byte_t) { + for (size_t i = diff_ind + 1; i < s.size(); ++i) { + byte_s = static_cast(s[i]); + byte_t = static_cast(t[i]); + if (byte_s != uint8_t{0xff} || byte_t != uint8_t{0x00}) { + return false; + } + } + return true; + } else { + return false; + } + } + + bool CanKeysWithDifferentByteContentsBeEqual() const override { + return false; + } + + using Comparator::CompareWithoutTimestamp; + int CompareWithoutTimestamp(const Slice& a, bool /*a_has_ts*/, const Slice& b, + bool /*b_has_ts*/) const override { + return a.compare(b); + } + + bool EqualWithoutTimestamp(const Slice& a, const Slice& b) const override { + return a == b; + } +}; + +class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl { + public: + ReverseBytewiseComparatorImpl() {} + + static const char* kClassName() { + return "rocksdb.ReverseBytewiseComparator"; + } + const char* Name() const override { return kClassName(); } + + int Compare(const Slice& a, const Slice& b) const override { + return -a.compare(b); + } + + void FindShortestSeparator(std::string* start, + const Slice& limit) const override { + // Find length of common prefix + size_t min_length = std::min(start->size(), limit.size()); + size_t diff_index = 0; + while ((diff_index < min_length) && + ((*start)[diff_index] == limit[diff_index])) { + diff_index++; + } + + assert(diff_index <= min_length); + if (diff_index == min_length) { + // Do not shorten if one string is a prefix of the other + // + // We could handle cases like: + // V + // A A 2 X Y + // A A 2 + // in a similar way as BytewiseComparator::FindShortestSeparator(). + // We keep it simple by not implementing it. We can come back to it + // later when needed. + } else { + uint8_t start_byte = static_cast((*start)[diff_index]); + uint8_t limit_byte = static_cast(limit[diff_index]); + if (start_byte > limit_byte && diff_index < start->size() - 1) { + // Case like + // V + // A A 3 A A + // A A 1 B B + // + // or + // v + // A A 2 A A + // A A 1 B B + // In this case "AA2" will be good. +#ifndef NDEBUG + std::string old_start = *start; +#endif + start->resize(diff_index + 1); +#ifndef NDEBUG + assert(old_start >= *start); +#endif + assert(Slice(*start).compare(limit) > 0); + } + } + } + + void FindShortSuccessor(std::string* /*key*/) const override { + // Don't do anything for simplicity. + } + + bool IsSameLengthImmediateSuccessor(const Slice& s, + const Slice& t) const override { + // Always returning false to prevent surfacing design flaws in + // auto_prefix_mode + (void)s, (void)t; + return false; + // "Correct" implementation: + // return BytewiseComparatorImpl::IsSameLengthImmediateSuccessor(t, s); + } + + bool CanKeysWithDifferentByteContentsBeEqual() const override { + return false; + } + + using Comparator::CompareWithoutTimestamp; + int CompareWithoutTimestamp(const Slice& a, bool /*a_has_ts*/, const Slice& b, + bool /*b_has_ts*/) const override { + return -a.compare(b); + } +}; + +// EXPERIMENTAL +// Comparator with 64-bit integer timestamp. +// We did not performance test this yet. 
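// (Editorial note, not part of the original comment: ordering under this
// wrapper is user-key-first, then newest-timestamp-first. For example, with
// the bytewise base comparator the entries ("k1", ts=7), ("k1", ts=5) and
// ("k2", ts=9) sort as ("k1", ts=7) < ("k1", ts=5) < ("k2", ts=9), because
// Compare() below negates CompareTimestamp() so that newer versions of the
// same user key come first.)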
+template +class ComparatorWithU64TsImpl : public Comparator { + static_assert(std::is_base_of::value, + "template type must be a inherited type of comparator"); + + public: + explicit ComparatorWithU64TsImpl() : Comparator(/*ts_sz=*/sizeof(uint64_t)) { + assert(cmp_without_ts_.timestamp_size() == 0); + } + + static const char* kClassName() { + static std::string class_name = kClassNameInternal(); + return class_name.c_str(); + } + + const char* Name() const override { return kClassName(); } + + void FindShortSuccessor(std::string*) const override {} + void FindShortestSeparator(std::string*, const Slice&) const override {} + int Compare(const Slice& a, const Slice& b) const override { + int ret = CompareWithoutTimestamp(a, b); + size_t ts_sz = timestamp_size(); + if (ret != 0) { + return ret; + } + // Compare timestamp. + // For the same user key with different timestamps, larger (newer) timestamp + // comes first. + return -CompareTimestamp(ExtractTimestampFromUserKey(a, ts_sz), + ExtractTimestampFromUserKey(b, ts_sz)); + } + using Comparator::CompareWithoutTimestamp; + int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b, + bool b_has_ts) const override { + const size_t ts_sz = timestamp_size(); + assert(!a_has_ts || a.size() >= ts_sz); + assert(!b_has_ts || b.size() >= ts_sz); + Slice lhs = a_has_ts ? StripTimestampFromUserKey(a, ts_sz) : a; + Slice rhs = b_has_ts ? StripTimestampFromUserKey(b, ts_sz) : b; + return cmp_without_ts_.Compare(lhs, rhs); + } + int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override { + assert(ts1.size() == sizeof(uint64_t)); + assert(ts2.size() == sizeof(uint64_t)); + uint64_t lhs = DecodeFixed64(ts1.data()); + uint64_t rhs = DecodeFixed64(ts2.data()); + if (lhs < rhs) { + return -1; + } else if (lhs > rhs) { + return 1; + } else { + return 0; + } + } + + private: + static std::string kClassNameInternal() { + std::stringstream ss; + ss << TComparator::kClassName() << ".u64ts"; + return ss.str(); + } + + TComparator cmp_without_ts_; +}; + +} // namespace + +const Comparator* BytewiseComparator() { + STATIC_AVOID_DESTRUCTION(BytewiseComparatorImpl, bytewise); + return &bytewise; +} + +const Comparator* ReverseBytewiseComparator() { + STATIC_AVOID_DESTRUCTION(ReverseBytewiseComparatorImpl, rbytewise); + return &rbytewise; +} + +const Comparator* BytewiseComparatorWithU64Ts() { + STATIC_AVOID_DESTRUCTION(ComparatorWithU64TsImpl, + comp_with_u64_ts); + return &comp_with_u64_ts; +} + +#ifndef ROCKSDB_LITE +static int RegisterBuiltinComparators(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + BytewiseComparatorImpl::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* /*guard */, + std::string* /* errmsg */) { return BytewiseComparator(); }); + library.AddFactory( + ReverseBytewiseComparatorImpl::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* /*guard */, + std::string* /* errmsg */) { return ReverseBytewiseComparator(); }); + library.AddFactory( + ComparatorWithU64TsImpl::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* /*guard */, + std::string* /* errmsg */) { return BytewiseComparatorWithU64Ts(); }); + return 3; +} +#endif // ROCKSDB_LITE + +Status Comparator::CreateFromString(const ConfigOptions& config_options, + const std::string& value, + const Comparator** result) { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinComparators(*(ObjectLibrary::Default().get()), ""); + }); +#endif // 
ROCKSDB_LITE + std::string id; + std::unordered_map opt_map; + Status status = Customizable::GetOptionsMap(config_options, *result, value, + &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } + if (id == BytewiseComparatorImpl::kClassName()) { + *result = BytewiseComparator(); + } else if (id == ReverseBytewiseComparatorImpl::kClassName()) { + *result = ReverseBytewiseComparator(); + } else if (id == + ComparatorWithU64TsImpl::kClassName()) { + *result = BytewiseComparatorWithU64Ts(); + } else if (value.empty()) { + // No Id and no options. Clear the object + *result = nullptr; + return Status::OK(); + } else if (id.empty()) { // We have no Id but have options. Not good + return Status::NotSupported("Cannot reset object ", id); + } else { +#ifndef ROCKSDB_LITE + status = config_options.registry->NewStaticObject(id, result); +#else + status = Status::NotSupported("Cannot load object in LITE mode ", id); +#endif // ROCKSDB_LITE + if (!status.ok()) { + if (config_options.ignore_unsupported_options && + status.IsNotSupported()) { + return Status::OK(); + } else { + return status; + } + } else { + Comparator* comparator = const_cast(*result); + status = + Customizable::ConfigureNewObject(config_options, comparator, opt_map); + } + } + return status; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/compression.cc b/src/rocksdb/util/compression.cc new file mode 100644 index 000000000..8e2f01b12 --- /dev/null +++ b/src/rocksdb/util/compression.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2022-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/compression.h" + +namespace ROCKSDB_NAMESPACE { + +StreamingCompress* StreamingCompress::Create(CompressionType compression_type, + const CompressionOptions& opts, + uint32_t compress_format_version, + size_t max_output_len) { + switch (compression_type) { + case kZSTD: { + if (!ZSTD_Streaming_Supported()) { + return nullptr; + } + return new ZSTDStreamingCompress(opts, compress_format_version, + max_output_len); + } + default: + return nullptr; + } +} + +StreamingUncompress* StreamingUncompress::Create( + CompressionType compression_type, uint32_t compress_format_version, + size_t max_output_len) { + switch (compression_type) { + case kZSTD: { + if (!ZSTD_Streaming_Supported()) { + return nullptr; + } + return new ZSTDStreamingUncompress(compress_format_version, + max_output_len); + } + default: + return nullptr; + } +} + +int ZSTDStreamingCompress::Compress(const char* input, size_t input_size, + char* output, size_t* output_pos) { + assert(input != nullptr && output != nullptr && output_pos != nullptr); + *output_pos = 0; + // Don't need to compress an empty input + if (input_size == 0) { + return 0; + } +#ifndef ZSTD_STREAMING + (void)input; + (void)input_size; + (void)output; + return -1; +#else + if (input_buffer_.src == nullptr || input_buffer_.src != input) { + // New input + // Catch errors where the previous input was not fully decompressed. + assert(input_buffer_.pos == input_buffer_.size); + input_buffer_ = {input, input_size, /*pos=*/0}; + } else if (input_buffer_.src == input) { + // Same input, not fully compressed. 
+ } + ZSTD_outBuffer output_buffer = {output, max_output_len_, /*pos=*/0}; + const size_t remaining = + ZSTD_compressStream2(cctx_, &output_buffer, &input_buffer_, ZSTD_e_end); + if (ZSTD_isError(remaining)) { + // Failure + Reset(); + return -1; + } + // Success + *output_pos = output_buffer.pos; + return (int)remaining; +#endif +} + +void ZSTDStreamingCompress::Reset() { +#ifdef ZSTD_STREAMING + ZSTD_CCtx_reset(cctx_, ZSTD_ResetDirective::ZSTD_reset_session_only); + input_buffer_ = {/*src=*/nullptr, /*size=*/0, /*pos=*/0}; +#endif +} + +int ZSTDStreamingUncompress::Uncompress(const char* input, size_t input_size, + char* output, size_t* output_pos) { + assert(input != nullptr && output != nullptr && output_pos != nullptr); + *output_pos = 0; + // Don't need to uncompress an empty input + if (input_size == 0) { + return 0; + } +#ifdef ZSTD_STREAMING + if (input_buffer_.src != input) { + // New input + input_buffer_ = {input, input_size, /*pos=*/0}; + } + ZSTD_outBuffer output_buffer = {output, max_output_len_, /*pos=*/0}; + size_t ret = ZSTD_decompressStream(dctx_, &output_buffer, &input_buffer_); + if (ZSTD_isError(ret)) { + Reset(); + return -1; + } + *output_pos = output_buffer.pos; + return (int)(input_buffer_.size - input_buffer_.pos); +#else + (void)input; + (void)input_size; + (void)output; + return -1; +#endif +} + +void ZSTDStreamingUncompress::Reset() { +#ifdef ZSTD_STREAMING + ZSTD_DCtx_reset(dctx_, ZSTD_ResetDirective::ZSTD_reset_session_only); + input_buffer_ = {/*src=*/nullptr, /*size=*/0, /*pos=*/0}; +#endif +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/compression.h b/src/rocksdb/util/compression.h new file mode 100644 index 000000000..0d4febcfb --- /dev/null +++ b/src/rocksdb/util/compression.h @@ -0,0 +1,1786 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
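// Editorial sketch, not part of the patch: the calling pattern implied by the
// streaming wrappers defined in compression.cc above. A positive return from
// Compress() means more output is pending for the same input, 0 means done,
// and -1 means failure. The base-class Compress() signature and ownership
// semantics are assumed to match the ZSTDStreamingCompress override shown
// above; names and buffer sizes are illustrative.
#include <cstddef>
#include <memory>
#include <string>

#include "util/compression.h"

static bool StreamingCompressSketch(const char* data, size_t len,
                                    std::string* out) {
  using namespace ROCKSDB_NAMESPACE;
  CompressionOptions opts;
  constexpr size_t kChunk = 16 * 1024;  // illustrative output chunk size
  std::unique_ptr<StreamingCompress> compress(StreamingCompress::Create(
      kZSTD, opts, /*compress_format_version=*/2, /*max_output_len=*/kChunk));
  if (compress == nullptr) {
    return false;  // streaming ZSTD not compiled in
  }
  char buf[kChunk];
  int pending = 0;
  do {
    size_t pos = 0;
    pending = compress->Compress(data, len, buf, &pos);
    if (pending < 0) {
      return false;  // compression error
    }
    out->append(buf, pos);
  } while (pending > 0);
  return true;
}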
+// +#pragma once + +#include +#include +#ifdef ROCKSDB_MALLOC_USABLE_SIZE +#ifdef OS_FREEBSD +#include +#else // OS_FREEBSD +#include +#endif // OS_FREEBSD +#endif // ROCKSDB_MALLOC_USABLE_SIZE +#include + +#include "memory/memory_allocator.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/compression_context_cache.h" +#include "util/string_util.h" + +#ifdef SNAPPY +#include +#endif + +#ifdef ZLIB +#include +#endif + +#ifdef BZIP2 +#include +#endif + +#if defined(LZ4) +#include +#include +#endif + +#if defined(ZSTD) +#include +// v1.1.3+ +#if ZSTD_VERSION_NUMBER >= 10103 +#include +#endif // ZSTD_VERSION_NUMBER >= 10103 +// v1.4.0+ +#if ZSTD_VERSION_NUMBER >= 10400 +#define ZSTD_STREAMING +#endif // ZSTD_VERSION_NUMBER >= 10400 +namespace ROCKSDB_NAMESPACE { +// Need this for the context allocation override +// On windows we need to do this explicitly +#if (ZSTD_VERSION_NUMBER >= 500) +#if defined(ROCKSDB_JEMALLOC) && defined(OS_WIN) && \ + defined(ZSTD_STATIC_LINKING_ONLY) +#define ROCKSDB_ZSTD_CUSTOM_MEM +namespace port { +ZSTD_customMem GetJeZstdAllocationOverrides(); +} // namespace port +#endif // defined(ROCKSDB_JEMALLOC) && defined(OS_WIN) && + // defined(ZSTD_STATIC_LINKING_ONLY) + +// We require `ZSTD_sizeof_DDict` and `ZSTD_createDDict_byReference` to use +// `ZSTD_DDict`. The former was introduced in v1.0.0 and the latter was +// introduced in v1.1.3. But an important bug fix for `ZSTD_sizeof_DDict` came +// in v1.1.4, so that is the version we require. As of today's latest version +// (v1.3.8), they are both still in the experimental API, which means they are +// only exported when the compiler flag `ZSTD_STATIC_LINKING_ONLY` is set. +#if defined(ZSTD_STATIC_LINKING_ONLY) && ZSTD_VERSION_NUMBER >= 10104 +#define ROCKSDB_ZSTD_DDICT +#endif // defined(ZSTD_STATIC_LINKING_ONLY) && ZSTD_VERSION_NUMBER >= 10104 + +// Cached data represents a portion that can be re-used +// If, in the future we have more than one native context to +// cache we can arrange this as a tuple +class ZSTDUncompressCachedData { + public: + using ZSTDNativeContext = ZSTD_DCtx*; + ZSTDUncompressCachedData() {} + // Init from cache + ZSTDUncompressCachedData(const ZSTDUncompressCachedData& o) = delete; + ZSTDUncompressCachedData& operator=(const ZSTDUncompressCachedData&) = delete; + ZSTDUncompressCachedData(ZSTDUncompressCachedData&& o) noexcept + : ZSTDUncompressCachedData() { + *this = std::move(o); + } + ZSTDUncompressCachedData& operator=(ZSTDUncompressCachedData&& o) noexcept { + assert(zstd_ctx_ == nullptr); + std::swap(zstd_ctx_, o.zstd_ctx_); + std::swap(cache_idx_, o.cache_idx_); + return *this; + } + ZSTDNativeContext Get() const { return zstd_ctx_; } + int64_t GetCacheIndex() const { return cache_idx_; } + void CreateIfNeeded() { + if (zstd_ctx_ == nullptr) { +#ifdef ROCKSDB_ZSTD_CUSTOM_MEM + zstd_ctx_ = + ZSTD_createDCtx_advanced(port::GetJeZstdAllocationOverrides()); +#else // ROCKSDB_ZSTD_CUSTOM_MEM + zstd_ctx_ = ZSTD_createDCtx(); +#endif // ROCKSDB_ZSTD_CUSTOM_MEM + cache_idx_ = -1; + } + } + void InitFromCache(const ZSTDUncompressCachedData& o, int64_t idx) { + zstd_ctx_ = o.zstd_ctx_; + cache_idx_ = idx; + } + ~ZSTDUncompressCachedData() { + if (zstd_ctx_ != nullptr && cache_idx_ == -1) { + ZSTD_freeDCtx(zstd_ctx_); + } + } + + private: + ZSTDNativeContext zstd_ctx_ = nullptr; + int64_t cache_idx_ = -1; // -1 means this instance owns the context +}; +#endif // (ZSTD_VERSION_NUMBER >= 500) +} // namespace 
ROCKSDB_NAMESPACE +#endif // ZSTD + +#if !(defined ZSTD) || !(ZSTD_VERSION_NUMBER >= 500) +namespace ROCKSDB_NAMESPACE { +class ZSTDUncompressCachedData { + void* padding; // unused + public: + using ZSTDNativeContext = void*; + ZSTDUncompressCachedData() {} + ZSTDUncompressCachedData(const ZSTDUncompressCachedData&) {} + ZSTDUncompressCachedData& operator=(const ZSTDUncompressCachedData&) = delete; + ZSTDUncompressCachedData(ZSTDUncompressCachedData&&) noexcept = default; + ZSTDUncompressCachedData& operator=(ZSTDUncompressCachedData&&) noexcept = + default; + ZSTDNativeContext Get() const { return nullptr; } + int64_t GetCacheIndex() const { return -1; } + void CreateIfNeeded() {} + void InitFromCache(const ZSTDUncompressCachedData&, int64_t) {} + + private: + void ignore_padding__() { padding = nullptr; } +}; +} // namespace ROCKSDB_NAMESPACE +#endif + +#if defined(XPRESS) +#include "port/xpress.h" +#endif + +namespace ROCKSDB_NAMESPACE { + +// Holds dictionary and related data, like ZSTD's digested compression +// dictionary. +struct CompressionDict { +#if ZSTD_VERSION_NUMBER >= 700 + ZSTD_CDict* zstd_cdict_ = nullptr; +#endif // ZSTD_VERSION_NUMBER >= 700 + std::string dict_; + + public: +#if ZSTD_VERSION_NUMBER >= 700 + CompressionDict(std::string dict, CompressionType type, int level) { +#else // ZSTD_VERSION_NUMBER >= 700 + CompressionDict(std::string dict, CompressionType /*type*/, int /*level*/) { +#endif // ZSTD_VERSION_NUMBER >= 700 + dict_ = std::move(dict); +#if ZSTD_VERSION_NUMBER >= 700 + zstd_cdict_ = nullptr; + if (!dict_.empty() && (type == kZSTD || type == kZSTDNotFinalCompression)) { + if (level == CompressionOptions::kDefaultCompressionLevel) { + // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see + // https://github.com/facebook/zstd/issues/1148 + level = 3; + } + // Should be safe (but slower) if below call fails as we'll use the + // raw dictionary to compress. + zstd_cdict_ = ZSTD_createCDict(dict_.data(), dict_.size(), level); + assert(zstd_cdict_ != nullptr); + } +#endif // ZSTD_VERSION_NUMBER >= 700 + } + + ~CompressionDict() { +#if ZSTD_VERSION_NUMBER >= 700 + size_t res = 0; + if (zstd_cdict_ != nullptr) { + res = ZSTD_freeCDict(zstd_cdict_); + } + assert(res == 0); // Last I checked they can't fail + (void)res; // prevent unused var warning +#endif // ZSTD_VERSION_NUMBER >= 700 + } + +#if ZSTD_VERSION_NUMBER >= 700 + const ZSTD_CDict* GetDigestedZstdCDict() const { return zstd_cdict_; } +#endif // ZSTD_VERSION_NUMBER >= 700 + + Slice GetRawDict() const { return dict_; } + + static const CompressionDict& GetEmptyDict() { + static CompressionDict empty_dict{}; + return empty_dict; + } + + CompressionDict() = default; + // Disable copy/move + CompressionDict(const CompressionDict&) = delete; + CompressionDict& operator=(const CompressionDict&) = delete; + CompressionDict(CompressionDict&&) = delete; + CompressionDict& operator=(CompressionDict&&) = delete; +}; + +// Holds dictionary and related data, like ZSTD's digested uncompression +// dictionary. +struct UncompressionDict { + // Block containing the data for the compression dictionary in case the + // constructor that takes a string parameter is used. + std::string dict_; + + // Block containing the data for the compression dictionary in case the + // constructor that takes a Slice parameter is used and the passed in + // CacheAllocationPtr is not nullptr. + CacheAllocationPtr allocation_; + + // Slice pointing to the compression dictionary data. 
Can point to + // dict_, allocation_, or some other memory location, depending on how + // the object was constructed. + Slice slice_; + +#ifdef ROCKSDB_ZSTD_DDICT + // Processed version of the contents of slice_ for ZSTD compression. + ZSTD_DDict* zstd_ddict_ = nullptr; +#endif // ROCKSDB_ZSTD_DDICT + +#ifdef ROCKSDB_ZSTD_DDICT + UncompressionDict(std::string dict, bool using_zstd) +#else // ROCKSDB_ZSTD_DDICT + UncompressionDict(std::string dict, bool /* using_zstd */) +#endif // ROCKSDB_ZSTD_DDICT + : dict_(std::move(dict)), slice_(dict_) { +#ifdef ROCKSDB_ZSTD_DDICT + if (!slice_.empty() && using_zstd) { + zstd_ddict_ = ZSTD_createDDict_byReference(slice_.data(), slice_.size()); + assert(zstd_ddict_ != nullptr); + } +#endif // ROCKSDB_ZSTD_DDICT + } + +#ifdef ROCKSDB_ZSTD_DDICT + UncompressionDict(Slice slice, CacheAllocationPtr&& allocation, + bool using_zstd) +#else // ROCKSDB_ZSTD_DDICT + UncompressionDict(Slice slice, CacheAllocationPtr&& allocation, + bool /* using_zstd */) +#endif // ROCKSDB_ZSTD_DDICT + : allocation_(std::move(allocation)), slice_(std::move(slice)) { +#ifdef ROCKSDB_ZSTD_DDICT + if (!slice_.empty() && using_zstd) { + zstd_ddict_ = ZSTD_createDDict_byReference(slice_.data(), slice_.size()); + assert(zstd_ddict_ != nullptr); + } +#endif // ROCKSDB_ZSTD_DDICT + } + + UncompressionDict(UncompressionDict&& rhs) + : dict_(std::move(rhs.dict_)), + allocation_(std::move(rhs.allocation_)), + slice_(std::move(rhs.slice_)) +#ifdef ROCKSDB_ZSTD_DDICT + , + zstd_ddict_(rhs.zstd_ddict_) +#endif + { +#ifdef ROCKSDB_ZSTD_DDICT + rhs.zstd_ddict_ = nullptr; +#endif + } + + ~UncompressionDict() { +#ifdef ROCKSDB_ZSTD_DDICT + size_t res = 0; + if (zstd_ddict_ != nullptr) { + res = ZSTD_freeDDict(zstd_ddict_); + } + assert(res == 0); // Last I checked they can't fail + (void)res; // prevent unused var warning +#endif // ROCKSDB_ZSTD_DDICT + } + + UncompressionDict& operator=(UncompressionDict&& rhs) { + if (this == &rhs) { + return *this; + } + + dict_ = std::move(rhs.dict_); + allocation_ = std::move(rhs.allocation_); + slice_ = std::move(rhs.slice_); + +#ifdef ROCKSDB_ZSTD_DDICT + zstd_ddict_ = rhs.zstd_ddict_; + rhs.zstd_ddict_ = nullptr; +#endif + + return *this; + } + + // The object is self-contained if the string constructor is used, or the + // Slice constructor is invoked with a non-null allocation. Otherwise, it + // is the caller's responsibility to ensure that the underlying storage + // outlives this object. 
+ bool own_bytes() const { return !dict_.empty() || allocation_; } + + const Slice& GetRawDict() const { return slice_; } + +#ifdef ROCKSDB_ZSTD_DDICT + const ZSTD_DDict* GetDigestedZstdDDict() const { return zstd_ddict_; } +#endif // ROCKSDB_ZSTD_DDICT + + static const UncompressionDict& GetEmptyDict() { + static UncompressionDict empty_dict{}; + return empty_dict; + } + + size_t ApproximateMemoryUsage() const { + size_t usage = sizeof(struct UncompressionDict); + usage += dict_.size(); + if (allocation_) { + auto allocator = allocation_.get_deleter().allocator; + if (allocator) { + usage += allocator->UsableSize(allocation_.get(), slice_.size()); + } else { + usage += slice_.size(); + } + } +#ifdef ROCKSDB_ZSTD_DDICT + usage += ZSTD_sizeof_DDict(zstd_ddict_); +#endif // ROCKSDB_ZSTD_DDICT + return usage; + } + + UncompressionDict() = default; + // Disable copy + UncompressionDict(const CompressionDict&) = delete; + UncompressionDict& operator=(const CompressionDict&) = delete; +}; + +class CompressionContext { + private: +#if defined(ZSTD) && (ZSTD_VERSION_NUMBER >= 500) + ZSTD_CCtx* zstd_ctx_ = nullptr; + void CreateNativeContext(CompressionType type) { + if (type == kZSTD || type == kZSTDNotFinalCompression) { +#ifdef ROCKSDB_ZSTD_CUSTOM_MEM + zstd_ctx_ = + ZSTD_createCCtx_advanced(port::GetJeZstdAllocationOverrides()); +#else // ROCKSDB_ZSTD_CUSTOM_MEM + zstd_ctx_ = ZSTD_createCCtx(); +#endif // ROCKSDB_ZSTD_CUSTOM_MEM + } + } + void DestroyNativeContext() { + if (zstd_ctx_ != nullptr) { + ZSTD_freeCCtx(zstd_ctx_); + } + } + + public: + // callable inside ZSTD_Compress + ZSTD_CCtx* ZSTDPreallocCtx() const { + assert(zstd_ctx_ != nullptr); + return zstd_ctx_; + } + +#else // ZSTD && (ZSTD_VERSION_NUMBER >= 500) + private: + void CreateNativeContext(CompressionType /* type */) {} + void DestroyNativeContext() {} +#endif // ZSTD && (ZSTD_VERSION_NUMBER >= 500) + public: + explicit CompressionContext(CompressionType type) { + CreateNativeContext(type); + } + ~CompressionContext() { DestroyNativeContext(); } + CompressionContext(const CompressionContext&) = delete; + CompressionContext& operator=(const CompressionContext&) = delete; +}; + +class CompressionInfo { + const CompressionOptions& opts_; + const CompressionContext& context_; + const CompressionDict& dict_; + const CompressionType type_; + const uint64_t sample_for_compression_; + + public: + CompressionInfo(const CompressionOptions& _opts, + const CompressionContext& _context, + const CompressionDict& _dict, CompressionType _type, + uint64_t _sample_for_compression) + : opts_(_opts), + context_(_context), + dict_(_dict), + type_(_type), + sample_for_compression_(_sample_for_compression) {} + + const CompressionOptions& options() const { return opts_; } + const CompressionContext& context() const { return context_; } + const CompressionDict& dict() const { return dict_; } + CompressionType type() const { return type_; } + uint64_t SampleForCompression() const { return sample_for_compression_; } +}; + +class UncompressionContext { + private: + CompressionContextCache* ctx_cache_ = nullptr; + ZSTDUncompressCachedData uncomp_cached_data_; + + public: + explicit UncompressionContext(CompressionType type) { + if (type == kZSTD || type == kZSTDNotFinalCompression) { + ctx_cache_ = CompressionContextCache::Instance(); + uncomp_cached_data_ = ctx_cache_->GetCachedZSTDUncompressData(); + } + } + ~UncompressionContext() { + if (uncomp_cached_data_.GetCacheIndex() != -1) { + assert(ctx_cache_ != nullptr); + 
ctx_cache_->ReturnCachedZSTDUncompressData( + uncomp_cached_data_.GetCacheIndex()); + } + } + UncompressionContext(const UncompressionContext&) = delete; + UncompressionContext& operator=(const UncompressionContext&) = delete; + + ZSTDUncompressCachedData::ZSTDNativeContext GetZSTDContext() const { + return uncomp_cached_data_.Get(); + } +}; + +class UncompressionInfo { + const UncompressionContext& context_; + const UncompressionDict& dict_; + const CompressionType type_; + + public: + UncompressionInfo(const UncompressionContext& _context, + const UncompressionDict& _dict, CompressionType _type) + : context_(_context), dict_(_dict), type_(_type) {} + + const UncompressionContext& context() const { return context_; } + const UncompressionDict& dict() const { return dict_; } + CompressionType type() const { return type_; } +}; + +inline bool Snappy_Supported() { +#ifdef SNAPPY + return true; +#else + return false; +#endif +} + +inline bool Zlib_Supported() { +#ifdef ZLIB + return true; +#else + return false; +#endif +} + +inline bool BZip2_Supported() { +#ifdef BZIP2 + return true; +#else + return false; +#endif +} + +inline bool LZ4_Supported() { +#ifdef LZ4 + return true; +#else + return false; +#endif +} + +inline bool XPRESS_Supported() { +#ifdef XPRESS + return true; +#else + return false; +#endif +} + +inline bool ZSTD_Supported() { +#ifdef ZSTD + // ZSTD format is finalized since version 0.8.0. + return (ZSTD_versionNumber() >= 800); +#else + return false; +#endif +} + +inline bool ZSTDNotFinal_Supported() { +#ifdef ZSTD + return true; +#else + return false; +#endif +} + +inline bool ZSTD_Streaming_Supported() { +#if defined(ZSTD) && defined(ZSTD_STREAMING) + return true; +#else + return false; +#endif +} + +inline bool StreamingCompressionTypeSupported( + CompressionType compression_type) { + switch (compression_type) { + case kNoCompression: + return true; + case kZSTD: + return ZSTD_Streaming_Supported(); + default: + return false; + } +} + +inline bool CompressionTypeSupported(CompressionType compression_type) { + switch (compression_type) { + case kNoCompression: + return true; + case kSnappyCompression: + return Snappy_Supported(); + case kZlibCompression: + return Zlib_Supported(); + case kBZip2Compression: + return BZip2_Supported(); + case kLZ4Compression: + return LZ4_Supported(); + case kLZ4HCCompression: + return LZ4_Supported(); + case kXpressCompression: + return XPRESS_Supported(); + case kZSTDNotFinalCompression: + return ZSTDNotFinal_Supported(); + case kZSTD: + return ZSTD_Supported(); + default: + assert(false); + return false; + } +} + +inline bool DictCompressionTypeSupported(CompressionType compression_type) { + switch (compression_type) { + case kNoCompression: + return false; + case kSnappyCompression: + return false; + case kZlibCompression: + return Zlib_Supported(); + case kBZip2Compression: + return false; + case kLZ4Compression: + case kLZ4HCCompression: +#if LZ4_VERSION_NUMBER >= 10400 // r124+ + return LZ4_Supported(); +#else + return false; +#endif + case kXpressCompression: + return false; + case kZSTDNotFinalCompression: +#if ZSTD_VERSION_NUMBER >= 500 // v0.5.0+ + return ZSTDNotFinal_Supported(); +#else + return false; +#endif + case kZSTD: +#if ZSTD_VERSION_NUMBER >= 500 // v0.5.0+ + return ZSTD_Supported(); +#else + return false; +#endif + default: + assert(false); + return false; + } +} + +inline std::string CompressionTypeToString(CompressionType compression_type) { + switch (compression_type) { + case kNoCompression: + return "NoCompression"; 
+ case kSnappyCompression: + return "Snappy"; + case kZlibCompression: + return "Zlib"; + case kBZip2Compression: + return "BZip2"; + case kLZ4Compression: + return "LZ4"; + case kLZ4HCCompression: + return "LZ4HC"; + case kXpressCompression: + return "Xpress"; + case kZSTD: + return "ZSTD"; + case kZSTDNotFinalCompression: + return "ZSTDNotFinal"; + case kDisableCompressionOption: + return "DisableOption"; + default: + assert(false); + return ""; + } +} + +inline std::string CompressionOptionsToString( + CompressionOptions& compression_options) { + std::string result; + result.reserve(512); + result.append("window_bits=") + .append(std::to_string(compression_options.window_bits)) + .append("; "); + result.append("level=") + .append(std::to_string(compression_options.level)) + .append("; "); + result.append("strategy=") + .append(std::to_string(compression_options.strategy)) + .append("; "); + result.append("max_dict_bytes=") + .append(std::to_string(compression_options.max_dict_bytes)) + .append("; "); + result.append("zstd_max_train_bytes=") + .append(std::to_string(compression_options.zstd_max_train_bytes)) + .append("; "); + result.append("enabled=") + .append(std::to_string(compression_options.enabled)) + .append("; "); + result.append("max_dict_buffer_bytes=") + .append(std::to_string(compression_options.max_dict_buffer_bytes)) + .append("; "); + result.append("use_zstd_dict_trainer=") + .append(std::to_string(compression_options.use_zstd_dict_trainer)) + .append("; "); + return result; +} + +// compress_format_version can have two values: +// 1 -- decompressed sizes for BZip2 and Zlib are not included in the compressed +// block. Also, decompressed sizes for LZ4 are encoded in platform-dependent +// way. +// 2 -- Zlib, BZip2 and LZ4 encode decompressed size as Varint32 just before the +// start of compressed block. Snappy format is the same as version 1. 
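// Editorial sketch, not part of the patch: what the two block layouts
// described above look like on the wire. Under compress_format_version == 2 a
// compressed block is framed as [varint32 decompressed size][compressed
// payload]; version 1 omits this prefix (LZ4 in version 1 encodes the size in
// a platform-dependent way instead). The helpers in the compression namespace
// just below, PutDecompressedSizeInfo and GetDecompressedSizeInfo, read and
// write exactly this prefix; the function names here are illustrative.
#include <cstddef>
#include <cstdint>
#include <string>

#include "util/coding.h"

static void FrameV2BlockSketch(const std::string& compressed_payload,
                               uint32_t decompressed_size, std::string* block) {
  ROCKSDB_NAMESPACE::PutVarint32(block, decompressed_size);  // the v2 prefix
  block->append(compressed_payload);                         // payload bytes
}

static bool ParseV2BlockHeaderSketch(const char* data, size_t size,
                                     uint32_t* decompressed_size,
                                     const char** payload) {
  const char* p = ROCKSDB_NAMESPACE::GetVarint32Ptr(data, data + size,
                                                    decompressed_size);
  if (p == nullptr) {
    return false;  // truncated or malformed size prefix
  }
  *payload = p;  // compressed payload starts right after the varint
  return true;
}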
+ +inline bool Snappy_Compress(const CompressionInfo& /*info*/, const char* input, + size_t length, ::std::string* output) { +#ifdef SNAPPY + output->resize(snappy::MaxCompressedLength(length)); + size_t outlen; + snappy::RawCompress(input, length, &(*output)[0], &outlen); + output->resize(outlen); + return true; +#else + (void)input; + (void)length; + (void)output; + return false; +#endif +} + +inline CacheAllocationPtr Snappy_Uncompress( + const char* input, size_t length, size_t* uncompressed_size, + MemoryAllocator* allocator = nullptr) { +#ifdef SNAPPY + size_t uncompressed_length = 0; + if (!snappy::GetUncompressedLength(input, length, &uncompressed_length)) { + return nullptr; + } + + CacheAllocationPtr output = AllocateBlock(uncompressed_length, allocator); + + if (!snappy::RawUncompress(input, length, output.get())) { + return nullptr; + } + + *uncompressed_size = uncompressed_length; + + return output; +#else + (void)input; + (void)length; + (void)uncompressed_size; + (void)allocator; + return nullptr; +#endif +} + +namespace compression { +// returns size +inline size_t PutDecompressedSizeInfo(std::string* output, uint32_t length) { + PutVarint32(output, length); + return output->size(); +} + +inline bool GetDecompressedSizeInfo(const char** input_data, + size_t* input_length, + uint32_t* output_len) { + auto new_input_data = + GetVarint32Ptr(*input_data, *input_data + *input_length, output_len); + if (new_input_data == nullptr) { + return false; + } + *input_length -= (new_input_data - *input_data); + *input_data = new_input_data; + return true; +} +} // namespace compression + +// compress_format_version == 1 -- decompressed size is not included in the +// block header +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +// @param compression_dict Data for presetting the compression library's +// dictionary. +inline bool Zlib_Compress(const CompressionInfo& info, + uint32_t compress_format_version, const char* input, + size_t length, ::std::string* output) { +#ifdef ZLIB + if (length > std::numeric_limits::max()) { + // Can't compress more than 4GB + return false; + } + + size_t output_header_len = 0; + if (compress_format_version == 2) { + output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast(length)); + } + + // The memLevel parameter specifies how much memory should be allocated for + // the internal compression state. + // memLevel=1 uses minimum memory but is slow and reduces compression ratio. + // memLevel=9 uses maximum memory for optimal speed. + // The default value is 8. See zconf.h for more details. + static const int memLevel = 8; + int level; + if (info.options().level == CompressionOptions::kDefaultCompressionLevel) { + level = Z_DEFAULT_COMPRESSION; + } else { + level = info.options().level; + } + z_stream _stream; + memset(&_stream, 0, sizeof(z_stream)); + int st = deflateInit2(&_stream, level, Z_DEFLATED, info.options().window_bits, + memLevel, info.options().strategy); + if (st != Z_OK) { + return false; + } + + Slice compression_dict = info.dict().GetRawDict(); + if (compression_dict.size()) { + // Initialize the compression library's dictionary + st = deflateSetDictionary( + &_stream, reinterpret_cast(compression_dict.data()), + static_cast(compression_dict.size())); + if (st != Z_OK) { + deflateEnd(&_stream); + return false; + } + } + + // Get an upper bound on the compressed size. 
+ size_t upper_bound = + deflateBound(&_stream, static_cast(length)); + output->resize(output_header_len + upper_bound); + + // Compress the input, and put compressed data in output. + _stream.next_in = (Bytef*)input; + _stream.avail_in = static_cast(length); + + // Initialize the output size. + _stream.avail_out = static_cast(upper_bound); + _stream.next_out = reinterpret_cast(&(*output)[output_header_len]); + + bool compressed = false; + st = deflate(&_stream, Z_FINISH); + if (st == Z_STREAM_END) { + compressed = true; + output->resize(output->size() - _stream.avail_out); + } + // The only return value we really care about is Z_STREAM_END. + // Z_OK means insufficient output space. This means the compression is + // bigger than decompressed size. Just fail the compression in that case. + + deflateEnd(&_stream); + return compressed; +#else + (void)info; + (void)compress_format_version; + (void)input; + (void)length; + (void)output; + return false; +#endif +} + +// compress_format_version == 1 -- decompressed size is not included in the +// block header +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +// @param compression_dict Data for presetting the compression library's +// dictionary. +inline CacheAllocationPtr Zlib_Uncompress( + const UncompressionInfo& info, const char* input_data, size_t input_length, + size_t* uncompressed_size, uint32_t compress_format_version, + MemoryAllocator* allocator = nullptr, int windowBits = -14) { +#ifdef ZLIB + uint32_t output_len = 0; + if (compress_format_version == 2) { + if (!compression::GetDecompressedSizeInfo(&input_data, &input_length, + &output_len)) { + return nullptr; + } + } else { + // Assume the decompressed data size will 5x of compressed size, but round + // to the page size + size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096; + output_len = static_cast( + std::min(proposed_output_len, + static_cast(std::numeric_limits::max()))); + } + + z_stream _stream; + memset(&_stream, 0, sizeof(z_stream)); + + // For raw inflate, the windowBits should be -8..-15. + // If windowBits is bigger than zero, it will use either zlib + // header or gzip header. Adding 32 to it will do automatic detection. + int st = + inflateInit2(&_stream, windowBits > 0 ? windowBits + 32 : windowBits); + if (st != Z_OK) { + return nullptr; + } + + const Slice& compression_dict = info.dict().GetRawDict(); + if (compression_dict.size()) { + // Initialize the compression library's dictionary + st = inflateSetDictionary( + &_stream, reinterpret_cast(compression_dict.data()), + static_cast(compression_dict.size())); + if (st != Z_OK) { + return nullptr; + } + } + + _stream.next_in = (Bytef*)input_data; + _stream.avail_in = static_cast(input_length); + + auto output = AllocateBlock(output_len, allocator); + + _stream.next_out = (Bytef*)output.get(); + _stream.avail_out = static_cast(output_len); + + bool done = false; + while (!done) { + st = inflate(&_stream, Z_SYNC_FLUSH); + switch (st) { + case Z_STREAM_END: + done = true; + break; + case Z_OK: { + // No output space. Increase the output space by 20%. + // We should never run out of output space if + // compress_format_version == 2 + assert(compress_format_version != 2); + size_t old_sz = output_len; + uint32_t output_len_delta = output_len / 5; + output_len += output_len_delta < 10 ? 
10 : output_len_delta; + auto tmp = AllocateBlock(output_len, allocator); + memcpy(tmp.get(), output.get(), old_sz); + output = std::move(tmp); + + // Set more output. + _stream.next_out = (Bytef*)(output.get() + old_sz); + _stream.avail_out = static_cast(output_len - old_sz); + break; + } + case Z_BUF_ERROR: + default: + inflateEnd(&_stream); + return nullptr; + } + } + + // If we encoded decompressed block size, we should have no bytes left + assert(compress_format_version != 2 || _stream.avail_out == 0); + assert(output_len >= _stream.avail_out); + *uncompressed_size = output_len - _stream.avail_out; + inflateEnd(&_stream); + return output; +#else + (void)info; + (void)input_data; + (void)input_length; + (void)uncompressed_size; + (void)compress_format_version; + (void)allocator; + (void)windowBits; + return nullptr; +#endif +} + +// compress_format_version == 1 -- decompressed size is not included in the +// block header +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +inline bool BZip2_Compress(const CompressionInfo& /*info*/, + uint32_t compress_format_version, const char* input, + size_t length, ::std::string* output) { +#ifdef BZIP2 + if (length > std::numeric_limits::max()) { + // Can't compress more than 4GB + return false; + } + size_t output_header_len = 0; + if (compress_format_version == 2) { + output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast(length)); + } + // Resize output to be the plain data length. + // This may not be big enough if the compression actually expands data. + output->resize(output_header_len + length); + + bz_stream _stream; + memset(&_stream, 0, sizeof(bz_stream)); + + // Block size 1 is 100K. + // 0 is for silent. + // 30 is the default workFactor + int st = BZ2_bzCompressInit(&_stream, 1, 0, 30); + if (st != BZ_OK) { + return false; + } + + // Compress the input, and put compressed data in output. + _stream.next_in = (char*)input; + _stream.avail_in = static_cast(length); + + // Initialize the output size. + _stream.avail_out = static_cast(length); + _stream.next_out = reinterpret_cast(&(*output)[output_header_len]); + + bool compressed = false; + st = BZ2_bzCompress(&_stream, BZ_FINISH); + if (st == BZ_STREAM_END) { + compressed = true; + output->resize(output->size() - _stream.avail_out); + } + // The only return value we really care about is BZ_STREAM_END. + // BZ_FINISH_OK means insufficient output space. This means the compression + // is bigger than decompressed size. Just fail the compression in that case. 
+ + BZ2_bzCompressEnd(&_stream); + return compressed; +#else + (void)compress_format_version; + (void)input; + (void)length; + (void)output; + return false; +#endif +} + +// compress_format_version == 1 -- decompressed size is not included in the +// block header +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +inline CacheAllocationPtr BZip2_Uncompress( + const char* input_data, size_t input_length, size_t* uncompressed_size, + uint32_t compress_format_version, MemoryAllocator* allocator = nullptr) { +#ifdef BZIP2 + uint32_t output_len = 0; + if (compress_format_version == 2) { + if (!compression::GetDecompressedSizeInfo(&input_data, &input_length, + &output_len)) { + return nullptr; + } + } else { + // Assume the decompressed data size will 5x of compressed size, but round + // to the next page size + size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096; + output_len = static_cast( + std::min(proposed_output_len, + static_cast(std::numeric_limits::max()))); + } + + bz_stream _stream; + memset(&_stream, 0, sizeof(bz_stream)); + + int st = BZ2_bzDecompressInit(&_stream, 0, 0); + if (st != BZ_OK) { + return nullptr; + } + + _stream.next_in = (char*)input_data; + _stream.avail_in = static_cast(input_length); + + auto output = AllocateBlock(output_len, allocator); + + _stream.next_out = (char*)output.get(); + _stream.avail_out = static_cast(output_len); + + bool done = false; + while (!done) { + st = BZ2_bzDecompress(&_stream); + switch (st) { + case BZ_STREAM_END: + done = true; + break; + case BZ_OK: { + // No output space. Increase the output space by 20%. + // We should never run out of output space if + // compress_format_version == 2 + assert(compress_format_version != 2); + uint32_t old_sz = output_len; + output_len = output_len * 1.2; + auto tmp = AllocateBlock(output_len, allocator); + memcpy(tmp.get(), output.get(), old_sz); + output = std::move(tmp); + + // Set more output. + _stream.next_out = (char*)(output.get() + old_sz); + _stream.avail_out = static_cast(output_len - old_sz); + break; + } + default: + BZ2_bzDecompressEnd(&_stream); + return nullptr; + } + } + + // If we encoded decompressed block size, we should have no bytes left + assert(compress_format_version != 2 || _stream.avail_out == 0); + assert(output_len >= _stream.avail_out); + *uncompressed_size = output_len - _stream.avail_out; + BZ2_bzDecompressEnd(&_stream); + return output; +#else + (void)input_data; + (void)input_length; + (void)uncompressed_size; + (void)compress_format_version; + (void)allocator; + return nullptr; +#endif +} + +// compress_format_version == 1 -- decompressed size is included in the +// block header using memcpy, which makes database non-portable) +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +// @param compression_dict Data for presetting the compression library's +// dictionary. 
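// The comments above and below describe the two block-header layouts. This
// stand-alone sketch builds both headers for a given decompressed length so
// the difference is concrete; the varint writer is a local re-implementation
// of the wire format used by PutVarint32 in util/coding.h, and both function
// names are illustrative only.
#include <cstdint>
#include <cstring>
#include <string>

// 7 bits per byte, least-significant group first, MSB set on continuation.
void AppendVarint32Sketch(std::string* dst, uint32_t v) {
  while (v >= 0x80) {
    dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

std::string MakeSizeHeaderSketch(size_t decompressed_size,
                                 uint32_t compress_format_version) {
  std::string header;
  if (compress_format_version == 2) {
    // Version 2: varint32 of the decompressed size, read back by
    // compression::GetDecompressedSizeInfo().
    AppendVarint32Sketch(&header, static_cast<uint32_t>(decompressed_size));
  } else {
    // Version 1 (LZ4 legacy path): raw memcpy of a size_t, so the header is
    // 8 bytes on LP64 and depends on endianness -- the "non-portable" case.
    header.resize(sizeof(decompressed_size));
    memcpy(&header[0], &decompressed_size, sizeof(decompressed_size));
  }
  return header;
}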
+inline bool LZ4_Compress(const CompressionInfo& info, + uint32_t compress_format_version, const char* input, + size_t length, ::std::string* output) { +#ifdef LZ4 + if (length > std::numeric_limits::max()) { + // Can't compress more than 4GB + return false; + } + + size_t output_header_len = 0; + if (compress_format_version == 2) { + // new encoding, using varint32 to store size information + output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast(length)); + } else { + // legacy encoding, which is not really portable (depends on big/little + // endianness) + output_header_len = 8; + output->resize(output_header_len); + char* p = const_cast(output->c_str()); + memcpy(p, &length, sizeof(length)); + } + int compress_bound = LZ4_compressBound(static_cast(length)); + output->resize(static_cast(output_header_len + compress_bound)); + + int outlen; +#if LZ4_VERSION_NUMBER >= 10400 // r124+ + LZ4_stream_t* stream = LZ4_createStream(); + Slice compression_dict = info.dict().GetRawDict(); + if (compression_dict.size()) { + LZ4_loadDict(stream, compression_dict.data(), + static_cast(compression_dict.size())); + } +#if LZ4_VERSION_NUMBER >= 10700 // r129+ + outlen = + LZ4_compress_fast_continue(stream, input, &(*output)[output_header_len], + static_cast(length), compress_bound, 1); +#else // up to r128 + outlen = LZ4_compress_limitedOutput_continue( + stream, input, &(*output)[output_header_len], static_cast(length), + compress_bound); +#endif + LZ4_freeStream(stream); +#else // up to r123 + outlen = LZ4_compress_limitedOutput(input, &(*output)[output_header_len], + static_cast(length), compress_bound); +#endif // LZ4_VERSION_NUMBER >= 10400 + + if (outlen == 0) { + return false; + } + output->resize(static_cast(output_header_len + outlen)); + return true; +#else // LZ4 + (void)info; + (void)compress_format_version; + (void)input; + (void)length; + (void)output; + return false; +#endif +} + +// compress_format_version == 1 -- decompressed size is included in the +// block header using memcpy, which makes database non-portable) +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +// @param compression_dict Data for presetting the compression library's +// dictionary. 
+inline CacheAllocationPtr LZ4_Uncompress(const UncompressionInfo& info, + const char* input_data, + size_t input_length, + size_t* uncompressed_size, + uint32_t compress_format_version, + MemoryAllocator* allocator = nullptr) { +#ifdef LZ4 + uint32_t output_len = 0; + if (compress_format_version == 2) { + // new encoding, using varint32 to store size information + if (!compression::GetDecompressedSizeInfo(&input_data, &input_length, + &output_len)) { + return nullptr; + } + } else { + // legacy encoding, which is not really portable (depends on big/little + // endianness) + if (input_length < 8) { + return nullptr; + } + if (port::kLittleEndian) { + memcpy(&output_len, input_data, sizeof(output_len)); + } else { + memcpy(&output_len, input_data + 4, sizeof(output_len)); + } + input_length -= 8; + input_data += 8; + } + + auto output = AllocateBlock(output_len, allocator); + + int decompress_bytes = 0; + +#if LZ4_VERSION_NUMBER >= 10400 // r124+ + LZ4_streamDecode_t* stream = LZ4_createStreamDecode(); + const Slice& compression_dict = info.dict().GetRawDict(); + if (compression_dict.size()) { + LZ4_setStreamDecode(stream, compression_dict.data(), + static_cast(compression_dict.size())); + } + decompress_bytes = LZ4_decompress_safe_continue( + stream, input_data, output.get(), static_cast(input_length), + static_cast(output_len)); + LZ4_freeStreamDecode(stream); +#else // up to r123 + decompress_bytes = LZ4_decompress_safe(input_data, output.get(), + static_cast(input_length), + static_cast(output_len)); +#endif // LZ4_VERSION_NUMBER >= 10400 + + if (decompress_bytes < 0) { + return nullptr; + } + assert(decompress_bytes == static_cast(output_len)); + *uncompressed_size = decompress_bytes; + return output; +#else // LZ4 + (void)info; + (void)input_data; + (void)input_length; + (void)uncompressed_size; + (void)compress_format_version; + (void)allocator; + return nullptr; +#endif +} + +// compress_format_version == 1 -- decompressed size is included in the +// block header using memcpy, which makes database non-portable) +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +// @param compression_dict Data for presetting the compression library's +// dictionary. 
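// For reference before the LZ4HC variant below: a stand-alone sketch of the
// liblz4 stream-dictionary round trip that LZ4_Compress()/LZ4_Uncompress()
// above drive, using the r129+ API names. It assumes liblz4 headers and
// library are available; the function name and buffer handling are
// illustrative only, not the RocksDB code path.
#include <lz4.h>

#include <string>
#include <vector>

bool Lz4DictRoundTripSketch(const std::string& dict, const std::string& input,
                            std::string* restored) {
  // Compress with the dictionary preloaded into an LZ4_stream_t.
  std::vector<char> compressed(
      LZ4_compressBound(static_cast<int>(input.size())));
  LZ4_stream_t* cstream = LZ4_createStream();
  if (!dict.empty()) {
    LZ4_loadDict(cstream, dict.data(), static_cast<int>(dict.size()));
  }
  int clen = LZ4_compress_fast_continue(
      cstream, input.data(), compressed.data(), static_cast<int>(input.size()),
      static_cast<int>(compressed.size()), /*acceleration=*/1);
  LZ4_freeStream(cstream);
  if (clen <= 0) {
    return false;
  }

  // Decompress with the same dictionary preset on the decode stream.
  restored->resize(input.size());
  LZ4_streamDecode_t* dstream = LZ4_createStreamDecode();
  if (!dict.empty()) {
    LZ4_setStreamDecode(dstream, dict.data(), static_cast<int>(dict.size()));
  }
  int dlen = LZ4_decompress_safe_continue(dstream, compressed.data(),
                                          &(*restored)[0], clen,
                                          static_cast<int>(restored->size()));
  LZ4_freeStreamDecode(dstream);
  return dlen == static_cast<int>(input.size());
}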
+inline bool LZ4HC_Compress(const CompressionInfo& info, + uint32_t compress_format_version, const char* input, + size_t length, ::std::string* output) { +#ifdef LZ4 + if (length > std::numeric_limits::max()) { + // Can't compress more than 4GB + return false; + } + + size_t output_header_len = 0; + if (compress_format_version == 2) { + // new encoding, using varint32 to store size information + output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast(length)); + } else { + // legacy encoding, which is not really portable (depends on big/little + // endianness) + output_header_len = 8; + output->resize(output_header_len); + char* p = const_cast(output->c_str()); + memcpy(p, &length, sizeof(length)); + } + int compress_bound = LZ4_compressBound(static_cast(length)); + output->resize(static_cast(output_header_len + compress_bound)); + + int outlen; + int level; + if (info.options().level == CompressionOptions::kDefaultCompressionLevel) { + level = 0; // lz4hc.h says any value < 1 will be sanitized to default + } else { + level = info.options().level; + } +#if LZ4_VERSION_NUMBER >= 10400 // r124+ + LZ4_streamHC_t* stream = LZ4_createStreamHC(); + LZ4_resetStreamHC(stream, level); + Slice compression_dict = info.dict().GetRawDict(); + const char* compression_dict_data = + compression_dict.size() > 0 ? compression_dict.data() : nullptr; + size_t compression_dict_size = compression_dict.size(); + if (compression_dict_data != nullptr) { + LZ4_loadDictHC(stream, compression_dict_data, + static_cast(compression_dict_size)); + } + +#if LZ4_VERSION_NUMBER >= 10700 // r129+ + outlen = + LZ4_compress_HC_continue(stream, input, &(*output)[output_header_len], + static_cast(length), compress_bound); +#else // r124-r128 + outlen = LZ4_compressHC_limitedOutput_continue( + stream, input, &(*output)[output_header_len], static_cast(length), + compress_bound); +#endif // LZ4_VERSION_NUMBER >= 10700 + LZ4_freeStreamHC(stream); + +#elif LZ4_VERSION_MAJOR // r113-r123 + outlen = LZ4_compressHC2_limitedOutput(input, &(*output)[output_header_len], + static_cast(length), + compress_bound, level); +#else // up to r112 + outlen = + LZ4_compressHC_limitedOutput(input, &(*output)[output_header_len], + static_cast(length), compress_bound); +#endif // LZ4_VERSION_NUMBER >= 10400 + + if (outlen == 0) { + return false; + } + output->resize(static_cast(output_header_len + outlen)); + return true; +#else // LZ4 + (void)info; + (void)compress_format_version; + (void)input; + (void)length; + (void)output; + return false; +#endif +} + +#ifdef XPRESS +inline bool XPRESS_Compress(const char* input, size_t length, + std::string* output) { + return port::xpress::Compress(input, length, output); +} +#else +inline bool XPRESS_Compress(const char* /*input*/, size_t /*length*/, + std::string* /*output*/) { + return false; +} +#endif + +#ifdef XPRESS +inline char* XPRESS_Uncompress(const char* input_data, size_t input_length, + size_t* uncompressed_size) { + return port::xpress::Decompress(input_data, input_length, uncompressed_size); +} +#else +inline char* XPRESS_Uncompress(const char* /*input_data*/, + size_t /*input_length*/, + size_t* /*uncompressed_size*/) { + return nullptr; +} +#endif + +inline bool ZSTD_Compress(const CompressionInfo& info, const char* input, + size_t length, ::std::string* output) { +#ifdef ZSTD + if (length > std::numeric_limits::max()) { + // Can't compress more than 4GB + return false; + } + + size_t output_header_len = compression::PutDecompressedSizeInfo( + output, 
static_cast(length)); + + size_t compressBound = ZSTD_compressBound(length); + output->resize(static_cast(output_header_len + compressBound)); + size_t outlen = 0; + int level; + if (info.options().level == CompressionOptions::kDefaultCompressionLevel) { + // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see + // https://github.com/facebook/zstd/issues/1148 + level = 3; + } else { + level = info.options().level; + } +#if ZSTD_VERSION_NUMBER >= 500 // v0.5.0+ + ZSTD_CCtx* context = info.context().ZSTDPreallocCtx(); + assert(context != nullptr); +#if ZSTD_VERSION_NUMBER >= 700 // v0.7.0+ + if (info.dict().GetDigestedZstdCDict() != nullptr) { + outlen = ZSTD_compress_usingCDict(context, &(*output)[output_header_len], + compressBound, input, length, + info.dict().GetDigestedZstdCDict()); + } +#endif // ZSTD_VERSION_NUMBER >= 700 + if (outlen == 0) { + outlen = ZSTD_compress_usingDict(context, &(*output)[output_header_len], + compressBound, input, length, + info.dict().GetRawDict().data(), + info.dict().GetRawDict().size(), level); + } +#else // up to v0.4.x + outlen = ZSTD_compress(&(*output)[output_header_len], compressBound, input, + length, level); +#endif // ZSTD_VERSION_NUMBER >= 500 + if (outlen == 0) { + return false; + } + output->resize(output_header_len + outlen); + return true; +#else // ZSTD + (void)info; + (void)input; + (void)length; + (void)output; + return false; +#endif +} + +// @param compression_dict Data for presetting the compression library's +// dictionary. +inline CacheAllocationPtr ZSTD_Uncompress( + const UncompressionInfo& info, const char* input_data, size_t input_length, + size_t* uncompressed_size, MemoryAllocator* allocator = nullptr) { +#ifdef ZSTD + uint32_t output_len = 0; + if (!compression::GetDecompressedSizeInfo(&input_data, &input_length, + &output_len)) { + return nullptr; + } + + auto output = AllocateBlock(output_len, allocator); + size_t actual_output_length = 0; +#if ZSTD_VERSION_NUMBER >= 500 // v0.5.0+ + ZSTD_DCtx* context = info.context().GetZSTDContext(); + assert(context != nullptr); +#ifdef ROCKSDB_ZSTD_DDICT + if (info.dict().GetDigestedZstdDDict() != nullptr) { + actual_output_length = ZSTD_decompress_usingDDict( + context, output.get(), output_len, input_data, input_length, + info.dict().GetDigestedZstdDDict()); + } +#endif // ROCKSDB_ZSTD_DDICT + if (actual_output_length == 0) { + actual_output_length = ZSTD_decompress_usingDict( + context, output.get(), output_len, input_data, input_length, + info.dict().GetRawDict().data(), info.dict().GetRawDict().size()); + } +#else // up to v0.4.x + (void)info; + actual_output_length = + ZSTD_decompress(output.get(), output_len, input_data, input_length); +#endif // ZSTD_VERSION_NUMBER >= 500 + assert(actual_output_length == output_len); + *uncompressed_size = actual_output_length; + return output; +#else // ZSTD + (void)info; + (void)input_data; + (void)input_length; + (void)uncompressed_size; + (void)allocator; + return nullptr; +#endif +} + +inline bool ZSTD_TrainDictionarySupported() { +#ifdef ZSTD + // Dictionary trainer is available since v0.6.1 for static linking, but not + // available for dynamic linking until v1.1.3. For now we enable the feature + // in v1.1.3+ only. 
+ return (ZSTD_versionNumber() >= 10103); +#else + return false; +#endif +} + +inline std::string ZSTD_TrainDictionary(const std::string& samples, + const std::vector& sample_lens, + size_t max_dict_bytes) { + // Dictionary trainer is available since v0.6.1 for static linking, but not + // available for dynamic linking until v1.1.3. For now we enable the feature + // in v1.1.3+ only. +#if ZSTD_VERSION_NUMBER >= 10103 // v1.1.3+ + assert(samples.empty() == sample_lens.empty()); + if (samples.empty()) { + return ""; + } + std::string dict_data(max_dict_bytes, '\0'); + size_t dict_len = ZDICT_trainFromBuffer( + &dict_data[0], max_dict_bytes, &samples[0], &sample_lens[0], + static_cast(sample_lens.size())); + if (ZDICT_isError(dict_len)) { + return ""; + } + assert(dict_len <= max_dict_bytes); + dict_data.resize(dict_len); + return dict_data; +#else // up to v1.1.2 + assert(false); + (void)samples; + (void)sample_lens; + (void)max_dict_bytes; + return ""; +#endif // ZSTD_VERSION_NUMBER >= 10103 +} + +inline std::string ZSTD_TrainDictionary(const std::string& samples, + size_t sample_len_shift, + size_t max_dict_bytes) { + // Dictionary trainer is available since v0.6.1, but ZSTD was marked stable + // only since v0.8.0. For now we enable the feature in stable versions only. +#if ZSTD_VERSION_NUMBER >= 10103 // v1.1.3+ + // skips potential partial sample at the end of "samples" + size_t num_samples = samples.size() >> sample_len_shift; + std::vector sample_lens(num_samples, size_t(1) << sample_len_shift); + return ZSTD_TrainDictionary(samples, sample_lens, max_dict_bytes); +#else // up to v1.1.2 + assert(false); + (void)samples; + (void)sample_len_shift; + (void)max_dict_bytes; + return ""; +#endif // ZSTD_VERSION_NUMBER >= 10103 +} + +inline bool ZSTD_FinalizeDictionarySupported() { +#ifdef ZSTD + // ZDICT_finalizeDictionary API is stable since v1.4.5 + return (ZSTD_versionNumber() >= 10405); +#else + return false; +#endif +} + +inline std::string ZSTD_FinalizeDictionary( + const std::string& samples, const std::vector& sample_lens, + size_t max_dict_bytes, int level) { + // ZDICT_finalizeDictionary is stable since version v1.4.5 +#if ZSTD_VERSION_NUMBER >= 10405 // v1.4.5+ + assert(samples.empty() == sample_lens.empty()); + if (samples.empty()) { + return ""; + } + if (level == CompressionOptions::kDefaultCompressionLevel) { + // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see + // https://github.com/facebook/zstd/issues/1148 + level = 3; + } + std::string dict_data(max_dict_bytes, '\0'); + size_t dict_len = ZDICT_finalizeDictionary( + dict_data.data(), max_dict_bytes, samples.data(), + std::min(static_cast(samples.size()), max_dict_bytes), + samples.data(), sample_lens.data(), + static_cast(sample_lens.size()), + {level, 0 /* notificationLevel */, 0 /* dictID */}); + if (ZDICT_isError(dict_len)) { + return ""; + } else { + assert(dict_len <= max_dict_bytes); + dict_data.resize(dict_len); + return dict_data; + } +#else // up to v1.4.4 + (void)samples; + (void)sample_lens; + (void)max_dict_bytes; + (void)level; + return ""; +#endif // ZSTD_VERSION_NUMBER >= 10405 +} + +inline bool CompressData(const Slice& raw, + const CompressionInfo& compression_info, + uint32_t compress_format_version, + std::string* compressed_output) { + bool ret = false; + + // Will return compressed block contents if (1) the compression method is + // supported in this platform and (2) the compression rate is "good enough". 
+ switch (compression_info.type()) { + case kSnappyCompression: + ret = Snappy_Compress(compression_info, raw.data(), raw.size(), + compressed_output); + break; + case kZlibCompression: + ret = Zlib_Compress(compression_info, compress_format_version, raw.data(), + raw.size(), compressed_output); + break; + case kBZip2Compression: + ret = BZip2_Compress(compression_info, compress_format_version, + raw.data(), raw.size(), compressed_output); + break; + case kLZ4Compression: + ret = LZ4_Compress(compression_info, compress_format_version, raw.data(), + raw.size(), compressed_output); + break; + case kLZ4HCCompression: + ret = LZ4HC_Compress(compression_info, compress_format_version, + raw.data(), raw.size(), compressed_output); + break; + case kXpressCompression: + ret = XPRESS_Compress(raw.data(), raw.size(), compressed_output); + break; + case kZSTD: + case kZSTDNotFinalCompression: + ret = ZSTD_Compress(compression_info, raw.data(), raw.size(), + compressed_output); + break; + default: + // Do not recognize this compression type + break; + } + + TEST_SYNC_POINT_CALLBACK("CompressData:TamperWithReturnValue", + static_cast(&ret)); + + return ret; +} + +inline CacheAllocationPtr UncompressData( + const UncompressionInfo& uncompression_info, const char* data, size_t n, + size_t* uncompressed_size, uint32_t compress_format_version, + MemoryAllocator* allocator = nullptr) { + switch (uncompression_info.type()) { + case kSnappyCompression: + return Snappy_Uncompress(data, n, uncompressed_size, allocator); + case kZlibCompression: + return Zlib_Uncompress(uncompression_info, data, n, uncompressed_size, + compress_format_version, allocator); + case kBZip2Compression: + return BZip2_Uncompress(data, n, uncompressed_size, + compress_format_version, allocator); + case kLZ4Compression: + case kLZ4HCCompression: + return LZ4_Uncompress(uncompression_info, data, n, uncompressed_size, + compress_format_version, allocator); + case kXpressCompression: + // XPRESS allocates memory internally, thus no support for custom + // allocator. + return CacheAllocationPtr(XPRESS_Uncompress(data, n, uncompressed_size)); + case kZSTD: + case kZSTDNotFinalCompression: + return ZSTD_Uncompress(uncompression_info, data, n, uncompressed_size, + allocator); + default: + return CacheAllocationPtr(); + } +} + +// Records the compression type for subsequent WAL records. +class CompressionTypeRecord { + public: + explicit CompressionTypeRecord(CompressionType compression_type) + : compression_type_(compression_type) {} + + CompressionType GetCompressionType() const { return compression_type_; } + + inline void EncodeTo(std::string* dst) const { + assert(dst != nullptr); + PutFixed32(dst, compression_type_); + } + + inline Status DecodeFrom(Slice* src) { + constexpr char class_name[] = "CompressionTypeRecord"; + + uint32_t val; + if (!GetFixed32(src, &val)) { + return Status::Corruption(class_name, + "Error decoding WAL compression type"); + } + CompressionType compression_type = static_cast(val); + if (!StreamingCompressionTypeSupported(compression_type)) { + return Status::Corruption(class_name, + "WAL compression type not supported"); + } + compression_type_ = compression_type; + return Status::OK(); + } + + inline std::string DebugString() const { + return "compression_type: " + CompressionTypeToString(compression_type_); + } + + private: + CompressionType compression_type_; +}; + +// Base class to implement compression for a stream of buffers. 
+// Instantiate an implementation of the class using Create() with the +// compression type and use Compress() repeatedly. +// The output buffer needs to be at least max_output_len. +// Call Reset() in between frame boundaries or in case of an error. +// NOTE: This class is not thread safe. +class StreamingCompress { + public: + StreamingCompress(CompressionType compression_type, + const CompressionOptions& opts, + uint32_t compress_format_version, size_t max_output_len) + : compression_type_(compression_type), + opts_(opts), + compress_format_version_(compress_format_version), + max_output_len_(max_output_len) {} + virtual ~StreamingCompress() = default; + // compress should be called repeatedly with the same input till the method + // returns 0 + // Parameters: + // input - buffer to compress + // input_size - size of input buffer + // output - compressed buffer allocated by caller, should be at least + // max_output_len + // output_size - size of the output buffer + // Returns -1 for errors, the remaining size of the input buffer that needs to + // be compressed + virtual int Compress(const char* input, size_t input_size, char* output, + size_t* output_pos) = 0; + // static method to create object of a class inherited from StreamingCompress + // based on the actual compression type. + static StreamingCompress* Create(CompressionType compression_type, + const CompressionOptions& opts, + uint32_t compress_format_version, + size_t max_output_len); + virtual void Reset() = 0; + + protected: + const CompressionType compression_type_; + const CompressionOptions opts_; + const uint32_t compress_format_version_; + const size_t max_output_len_; +}; + +// Base class to uncompress a stream of compressed buffers. +// Instantiate an implementation of the class using Create() with the +// compression type and use Uncompress() repeatedly. +// The output buffer needs to be at least max_output_len. +// Call Reset() in between frame boundaries or in case of an error. +// NOTE: This class is not thread safe. +class StreamingUncompress { + public: + StreamingUncompress(CompressionType compression_type, + uint32_t compress_format_version, size_t max_output_len) + : compression_type_(compression_type), + compress_format_version_(compress_format_version), + max_output_len_(max_output_len) {} + virtual ~StreamingUncompress() = default; + // uncompress should be called again with the same input if output_size is + // equal to max_output_len or with the next input fragment. + // Parameters: + // input - buffer to uncompress + // input_size - size of input buffer + // output - uncompressed buffer allocated by caller, should be at least + // max_output_len + // output_size - size of the output buffer + // Returns -1 for errors, remaining input to be processed otherwise. 
+ virtual int Uncompress(const char* input, size_t input_size, char* output, + size_t* output_pos) = 0; + static StreamingUncompress* Create(CompressionType compression_type, + uint32_t compress_format_version, + size_t max_output_len); + virtual void Reset() = 0; + + protected: + CompressionType compression_type_; + uint32_t compress_format_version_; + size_t max_output_len_; +}; + +class ZSTDStreamingCompress final : public StreamingCompress { + public: + explicit ZSTDStreamingCompress(const CompressionOptions& opts, + uint32_t compress_format_version, + size_t max_output_len) + : StreamingCompress(kZSTD, opts, compress_format_version, + max_output_len) { +#ifdef ZSTD_STREAMING + cctx_ = ZSTD_createCCtx(); + // Each compressed frame will have a checksum + ZSTD_CCtx_setParameter(cctx_, ZSTD_c_checksumFlag, 1); + assert(cctx_ != nullptr); + input_buffer_ = {/*src=*/nullptr, /*size=*/0, /*pos=*/0}; +#endif + } + ~ZSTDStreamingCompress() override { +#ifdef ZSTD_STREAMING + ZSTD_freeCCtx(cctx_); +#endif + } + int Compress(const char* input, size_t input_size, char* output, + size_t* output_pos) override; + void Reset() override; +#ifdef ZSTD_STREAMING + ZSTD_CCtx* cctx_; + ZSTD_inBuffer input_buffer_; +#endif +}; + +class ZSTDStreamingUncompress final : public StreamingUncompress { + public: + explicit ZSTDStreamingUncompress(uint32_t compress_format_version, + size_t max_output_len) + : StreamingUncompress(kZSTD, compress_format_version, max_output_len) { +#ifdef ZSTD_STREAMING + dctx_ = ZSTD_createDCtx(); + assert(dctx_ != nullptr); + input_buffer_ = {/*src=*/nullptr, /*size=*/0, /*pos=*/0}; +#endif + } + ~ZSTDStreamingUncompress() override { +#ifdef ZSTD_STREAMING + ZSTD_freeDCtx(dctx_); +#endif + } + int Uncompress(const char* input, size_t input_size, char* output, + size_t* output_size) override; + void Reset() override; + + private: +#ifdef ZSTD_STREAMING + ZSTD_DCtx* dctx_; + ZSTD_inBuffer input_buffer_; +#endif +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/compression_context_cache.cc b/src/rocksdb/util/compression_context_cache.cc new file mode 100644 index 000000000..52c3fac72 --- /dev/null +++ b/src/rocksdb/util/compression_context_cache.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// + +#include "util/compression_context_cache.h" + +#include + +#include "util/compression.h" +#include "util/core_local.h" + +namespace ROCKSDB_NAMESPACE { +namespace compression_cache { + +void* const SentinelValue = nullptr; +// Cache ZSTD uncompression contexts for reads +// if needed we can add ZSTD compression context caching +// which is currently is not done since BlockBasedTableBuilder +// simply creates one compression context per new SST file. 
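// The struct below parks the cached ZSTD context behind an atomic pointer and
// swaps in a sentinel while the context is lent out. This is a minimal
// stand-alone sketch of that borrow/return protocol, with a plain int standing
// in for the context; all names here are illustrative, not the RocksDB types.
#include <atomic>
#include <cassert>

struct CachedSlotSketch {
  int payload_ = 0;                     // stands in for the cached ZSTD context
  std::atomic<void*> slot_{&payload_};  // == &payload_ when available

  // Winner of the race gets the cached payload; losers get nullptr and are
  // expected to create a one-time-use instance instead, which is what
  // GetUncompressData() below does.
  int* TryBorrow() {
    void* expected = &payload_;
    if (slot_.compare_exchange_strong(expected, nullptr /* sentinel */)) {
      return &payload_;
    }
    return nullptr;
  }

  // Put the payload back into circulation; only valid after a successful
  // TryBorrow(), which is what the assert checks.
  void Return() {
    void* prev = slot_.exchange(&payload_);
    assert(prev == nullptr);
    (void)prev;
  }
};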
+struct ZSTDCachedData { + // We choose to cache the below structure instead of a ptr + // because we want to avoid a) native types leak b) make + // cache use transparent for the user + ZSTDUncompressCachedData uncomp_cached_data_; + std::atomic zstd_uncomp_sentinel_; + + char + padding[(CACHE_LINE_SIZE - + (sizeof(ZSTDUncompressCachedData) + sizeof(std::atomic)) % + CACHE_LINE_SIZE)]; // unused padding field + + ZSTDCachedData() : zstd_uncomp_sentinel_(&uncomp_cached_data_) {} + ZSTDCachedData(const ZSTDCachedData&) = delete; + ZSTDCachedData& operator=(const ZSTDCachedData&) = delete; + + ZSTDUncompressCachedData GetUncompressData(int64_t idx) { + ZSTDUncompressCachedData result; + void* expected = &uncomp_cached_data_; + if (zstd_uncomp_sentinel_.compare_exchange_strong(expected, + SentinelValue)) { + uncomp_cached_data_.CreateIfNeeded(); + result.InitFromCache(uncomp_cached_data_, idx); + } else { + // Creates one time use data + result.CreateIfNeeded(); + } + return result; + } + // Return the entry back into circulation + // This is executed only when we successfully obtained + // in the first place + void ReturnUncompressData() { + if (zstd_uncomp_sentinel_.exchange(&uncomp_cached_data_) != SentinelValue) { + // Means we are returning while not having it acquired. + assert(false); + } + } +}; +static_assert(sizeof(ZSTDCachedData) % CACHE_LINE_SIZE == 0, + "Expected CACHE_LINE_SIZE alignment"); +} // namespace compression_cache + +class CompressionContextCache::Rep { + public: + Rep() {} + ZSTDUncompressCachedData GetZSTDUncompressData() { + auto p = per_core_uncompr_.AccessElementAndIndex(); + int64_t idx = static_cast(p.second); + return p.first->GetUncompressData(idx); + } + void ReturnZSTDUncompressData(int64_t idx) { + assert(idx >= 0); + auto* cn = per_core_uncompr_.AccessAtCore(static_cast(idx)); + cn->ReturnUncompressData(); + } + + private: + CoreLocalArray per_core_uncompr_; +}; + +CompressionContextCache::CompressionContextCache() : rep_(new Rep()) {} + +CompressionContextCache* CompressionContextCache::Instance() { + static CompressionContextCache instance; + return &instance; +} + +void CompressionContextCache::InitSingleton() { Instance(); } + +ZSTDUncompressCachedData +CompressionContextCache::GetCachedZSTDUncompressData() { + return rep_->GetZSTDUncompressData(); +} + +void CompressionContextCache::ReturnCachedZSTDUncompressData(int64_t idx) { + rep_->ReturnZSTDUncompressData(idx); +} + +CompressionContextCache::~CompressionContextCache() { delete rep_; } + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/compression_context_cache.h b/src/rocksdb/util/compression_context_cache.h new file mode 100644 index 000000000..7b7b2d507 --- /dev/null +++ b/src/rocksdb/util/compression_context_cache.h @@ -0,0 +1,47 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// + +// Compression context cache allows to cache compression/uncompression contexts +// This helps with Random Read latencies and reduces CPU utilization +// Caching is implemented using CoreLocal facility. 
Compression/Uncompression +// instances are cached on a per core basis using CoreLocalArray. A borrowed +// instance is atomically replaced with a sentinel value for the time of being +// used. If it turns out that another thread is already makes use of the +// instance we still create one on the heap which is later is destroyed. + +#pragma once + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +class ZSTDUncompressCachedData; + +class CompressionContextCache { + public: + // Singleton + static CompressionContextCache* Instance(); + static void InitSingleton(); + CompressionContextCache(const CompressionContextCache&) = delete; + CompressionContextCache& operator=(const CompressionContextCache&) = delete; + + ZSTDUncompressCachedData GetCachedZSTDUncompressData(); + void ReturnCachedZSTDUncompressData(int64_t idx); + + private: + // Singleton + CompressionContextCache(); + ~CompressionContextCache(); + + class Rep; + Rep* rep_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/concurrent_task_limiter_impl.cc b/src/rocksdb/util/concurrent_task_limiter_impl.cc new file mode 100644 index 000000000..a0fc7331f --- /dev/null +++ b/src/rocksdb/util/concurrent_task_limiter_impl.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/concurrent_task_limiter_impl.h" + +#include "rocksdb/concurrent_task_limiter.h" + +namespace ROCKSDB_NAMESPACE { + +ConcurrentTaskLimiterImpl::ConcurrentTaskLimiterImpl( + const std::string& name, int32_t max_outstanding_task) + : name_(name), + max_outstanding_tasks_{max_outstanding_task}, + outstanding_tasks_{0} {} + +ConcurrentTaskLimiterImpl::~ConcurrentTaskLimiterImpl() { + assert(outstanding_tasks_ == 0); +} + +const std::string& ConcurrentTaskLimiterImpl::GetName() const { return name_; } + +void ConcurrentTaskLimiterImpl::SetMaxOutstandingTask(int32_t limit) { + max_outstanding_tasks_.store(limit, std::memory_order_relaxed); +} + +void ConcurrentTaskLimiterImpl::ResetMaxOutstandingTask() { + max_outstanding_tasks_.store(-1, std::memory_order_relaxed); +} + +int32_t ConcurrentTaskLimiterImpl::GetOutstandingTask() const { + return outstanding_tasks_.load(std::memory_order_relaxed); +} + +std::unique_ptr ConcurrentTaskLimiterImpl::GetToken( + bool force) { + int32_t limit = max_outstanding_tasks_.load(std::memory_order_relaxed); + int32_t tasks = outstanding_tasks_.load(std::memory_order_relaxed); + // force = true, bypass the throttle. + // limit < 0 means unlimited tasks. 
+ while (force || limit < 0 || tasks < limit) { + if (outstanding_tasks_.compare_exchange_weak(tasks, tasks + 1)) { + return std::unique_ptr(new TaskLimiterToken(this)); + } + } + return nullptr; +} + +ConcurrentTaskLimiter* NewConcurrentTaskLimiter(const std::string& name, + int32_t limit) { + return new ConcurrentTaskLimiterImpl(name, limit); +} + +TaskLimiterToken::~TaskLimiterToken() { + --limiter_->outstanding_tasks_; + assert(limiter_->outstanding_tasks_ >= 0); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/concurrent_task_limiter_impl.h b/src/rocksdb/util/concurrent_task_limiter_impl.h new file mode 100644 index 000000000..4952ae23a --- /dev/null +++ b/src/rocksdb/util/concurrent_task_limiter_impl.h @@ -0,0 +1,67 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include + +#include "rocksdb/concurrent_task_limiter.h" +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { + +class TaskLimiterToken; + +class ConcurrentTaskLimiterImpl : public ConcurrentTaskLimiter { + public: + explicit ConcurrentTaskLimiterImpl(const std::string& name, + int32_t max_outstanding_task); + // No copying allowed + ConcurrentTaskLimiterImpl(const ConcurrentTaskLimiterImpl&) = delete; + ConcurrentTaskLimiterImpl& operator=(const ConcurrentTaskLimiterImpl&) = + delete; + + virtual ~ConcurrentTaskLimiterImpl(); + + virtual const std::string& GetName() const override; + + virtual void SetMaxOutstandingTask(int32_t limit) override; + + virtual void ResetMaxOutstandingTask() override; + + virtual int32_t GetOutstandingTask() const override; + + // Request token for adding a new task. + // If force == true, it requests a token bypassing throttle. + // Returns nullptr if it got throttled. + virtual std::unique_ptr GetToken(bool force); + + private: + friend class TaskLimiterToken; + + std::string name_; + std::atomic max_outstanding_tasks_; + std::atomic outstanding_tasks_; +}; + +class TaskLimiterToken { + public: + explicit TaskLimiterToken(ConcurrentTaskLimiterImpl* limiter) + : limiter_(limiter) {} + ~TaskLimiterToken(); + + private: + ConcurrentTaskLimiterImpl* limiter_; + + // no copying allowed + TaskLimiterToken(const TaskLimiterToken&) = delete; + void operator=(const TaskLimiterToken&) = delete; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/core_local.h b/src/rocksdb/util/core_local.h new file mode 100644 index 000000000..b444a1152 --- /dev/null +++ b/src/rocksdb/util/core_local.h @@ -0,0 +1,83 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include + +#include "port/likely.h" +#include "port/port.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +// An array of core-local values. Ideally the value type, T, is cache aligned to +// prevent false sharing. 
+template +class CoreLocalArray { + public: + CoreLocalArray(); + + size_t Size() const; + // returns pointer to the element corresponding to the core that the thread + // currently runs on. + T* Access() const; + // same as above, but also returns the core index, which the client can cache + // to reduce how often core ID needs to be retrieved. Only do this if some + // inaccuracy is tolerable, as the thread may migrate to a different core. + std::pair AccessElementAndIndex() const; + // returns pointer to element for the specified core index. This can be used, + // e.g., for aggregation, or if the client caches core index. + T* AccessAtCore(size_t core_idx) const; + + private: + std::unique_ptr data_; + int size_shift_; +}; + +template +CoreLocalArray::CoreLocalArray() { + int num_cpus = static_cast(std::thread::hardware_concurrency()); + // find a power of two >= num_cpus and >= 8 + size_shift_ = 3; + while (1 << size_shift_ < num_cpus) { + ++size_shift_; + } + data_.reset(new T[static_cast(1) << size_shift_]); +} + +template +size_t CoreLocalArray::Size() const { + return static_cast(1) << size_shift_; +} + +template +T* CoreLocalArray::Access() const { + return AccessElementAndIndex().first; +} + +template +std::pair CoreLocalArray::AccessElementAndIndex() const { + int cpuid = port::PhysicalCoreID(); + size_t core_idx; + if (UNLIKELY(cpuid < 0)) { + // cpu id unavailable, just pick randomly + core_idx = Random::GetTLSInstance()->Uniform(1 << size_shift_); + } else { + core_idx = static_cast(cpuid & ((1 << size_shift_) - 1)); + } + return {AccessAtCore(core_idx), core_idx}; +} + +template +T* CoreLocalArray::AccessAtCore(size_t core_idx) const { + assert(core_idx < static_cast(1) << size_shift_); + return &data_[core_idx]; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/coro_utils.h b/src/rocksdb/util/coro_utils.h new file mode 100644 index 000000000..5b4211135 --- /dev/null +++ b/src/rocksdb/util/coro_utils.h @@ -0,0 +1,112 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#if defined(USE_COROUTINES) +#include "folly/experimental/coro/Coroutine.h" +#include "folly/experimental/coro/Task.h" +#endif +#include "rocksdb/rocksdb_namespace.h" + +// This file has two sctions. The first section applies to all instances of +// header file inclusion and has an include guard. The second section is +// meant for multiple inclusions in the same source file, and is idempotent. +namespace ROCKSDB_NAMESPACE { + +#ifndef UTIL_CORO_UTILS_H_ +#define UTIL_CORO_UTILS_H_ + +#if defined(USE_COROUTINES) + +// The follwoing macros expand to regular and coroutine function +// declarations for a given function +#define DECLARE_SYNC_AND_ASYNC(__ret_type__, __func_name__, ...) \ + __ret_type__ __func_name__(__VA_ARGS__); \ + folly::coro::Task<__ret_type__> __func_name__##Coroutine(__VA_ARGS__); + +#define DECLARE_SYNC_AND_ASYNC_OVERRIDE(__ret_type__, __func_name__, ...) \ + __ret_type__ __func_name__(__VA_ARGS__) override; \ + folly::coro::Task<__ret_type__> __func_name__##Coroutine(__VA_ARGS__) \ + override; + +#define DECLARE_SYNC_AND_ASYNC_CONST(__ret_type__, __func_name__, ...) 
\ + __ret_type__ __func_name__(__VA_ARGS__) const; \ + folly::coro::Task<__ret_type__> __func_name__##Coroutine(__VA_ARGS__) const; + +constexpr bool using_coroutines() { return true; } +#else // !USE_COROUTINES + +// The follwoing macros expand to a regular function declaration for a given +// function +#define DECLARE_SYNC_AND_ASYNC(__ret_type__, __func_name__, ...) \ + __ret_type__ __func_name__(__VA_ARGS__); + +#define DECLARE_SYNC_AND_ASYNC_OVERRIDE(__ret_type__, __func_name__, ...) \ + __ret_type__ __func_name__(__VA_ARGS__) override; + +#define DECLARE_SYNC_AND_ASYNC_CONST(__ret_type__, __func_name__, ...) \ + __ret_type__ __func_name__(__VA_ARGS__) const; + +constexpr bool using_coroutines() { return false; } +#endif // USE_COROUTINES +#endif // UTIL_CORO_UTILS_H_ + +// The following section of the file is meant to be included twice in a +// source file - once defining WITH_COROUTINES and once defining +// WITHOUT_COROUTINES +#undef DEFINE_SYNC_AND_ASYNC +#undef CO_AWAIT +#undef CO_RETURN + +#if defined(WITH_COROUTINES) && defined(USE_COROUTINES) + +// This macro should be used in the beginning of the function +// definition. The declaration should have been done using one of the +// DECLARE_SYNC_AND_ASYNC* macros. It expands to the return type and +// the function name with the Coroutine suffix. For example - +// DEFINE_SYNC_AND_ASYNC(int, foo)(bool bar) {} +// would expand to - +// folly::coro::Task fooCoroutine(bool bar) {} +#define DEFINE_SYNC_AND_ASYNC(__ret_type__, __func_name__) \ + folly::coro::Task<__ret_type__> __func_name__##Coroutine + +// This macro should be used to call a function that might be a +// coroutine. It expands to the correct function name and prefixes +// the co_await operator if necessary. For example - +// s = CO_AWAIT(foo)(true); +// if the code is compiled WITH_COROUTINES, would expand to +// s = co_await fooCoroutine(true); +// if compiled WITHOUT_COROUTINES, would expand to +// s = foo(true); +#define CO_AWAIT(__func_name__) co_await __func_name__##Coroutine + +#define CO_RETURN co_return + +#elif defined(WITHOUT_COROUTINES) + +// This macro should be used in the beginning of the function +// definition. The declaration should have been done using one of the +// DECLARE_SYNC_AND_ASYNC* macros. It expands to the return type and +// the function name without the Coroutine suffix. For example - +// DEFINE_SYNC_AND_ASYNC(int, foo)(bool bar) {} +// would expand to - +// int foo(bool bar) {} +#define DEFINE_SYNC_AND_ASYNC(__ret_type__, __func_name__) \ + __ret_type__ __func_name__ + +// This macro should be used to call a function that might be a +// coroutine. It expands to the correct function name and prefixes +// the co_await operator if necessary. For example - +// s = CO_AWAIT(foo)(true); +// if the code is compiled WITH_COROUTINES, would expand to +// s = co_await fooCoroutine(true); +// if compiled WITHOUT_COROUTINES, would expand to +// s = foo(true); +#define CO_AWAIT(__func_name__) __func_name__ + +#define CO_RETURN return + +#endif // DO_NOT_USE_COROUTINES +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/crc32c.cc b/src/rocksdb/util/crc32c.cc new file mode 100644 index 000000000..d71c71c2e --- /dev/null +++ b/src/rocksdb/util/crc32c.cc @@ -0,0 +1,1351 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A portable implementation of crc32c, optimized to handle +// four bytes at a time. +#include "util/crc32c.h" + +#include + +#include +#include +#ifdef HAVE_SSE42 +#include +#include +#endif + +#include "port/lang.h" +#include "util/coding.h" +#include "util/crc32c_arm64.h" +#include "util/math.h" + +#ifdef __powerpc64__ +#include "util/crc32c_ppc.h" +#include "util/crc32c_ppc_constants.h" + +#if __linux__ +#ifdef ROCKSDB_AUXV_GETAUXVAL_PRESENT +#include +#endif + +#ifndef PPC_FEATURE2_VEC_CRYPTO +#define PPC_FEATURE2_VEC_CRYPTO 0x02000000 +#endif + +#ifndef AT_HWCAP2 +#define AT_HWCAP2 26 +#endif + +#elif __FreeBSD__ +#include +#include +#include +#endif /* __linux__ */ + +#endif + +#if defined(HAVE_ARM64_CRC) +bool pmull_runtime_flag = false; +#endif + +namespace ROCKSDB_NAMESPACE { +namespace crc32c { + +#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC) +#ifdef __powerpc64__ +static int arch_ppc_crc32 = 0; +#endif /* __powerpc64__ */ +#endif + +static const uint32_t table0_[256] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, + 0x26a1e7e8, 0xd4ca64eb, 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, 0x105ec76f, 0xe235446c, + 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, + 0xbc267848, 0x4e4dfb4b, 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, 0xaa64d611, 0x580f5512, + 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad, + 0x1642ae59, 0xe4292d5a, 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, 0x417b1dbc, 0xb3109ebf, + 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f, + 0xed03a29b, 0x1f682198, 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, 0xdbfc821c, 0x2997011f, + 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, + 0x4767748a, 0xb50cf789, 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, 0x7198540d, 0x83f3d70e, + 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, + 0xdde0eb2a, 0x2f8b6829, 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, 0x082f63b7, 0xfa44e0b4, + 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, + 0xb4091bff, 0x466298fc, 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, 0xa24bb5a6, 0x502036a5, + 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, + 0x0e330a81, 0xfc588982, 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, 0x38cc2a06, 0xcaa7a905, + 0xd9f75af1, 0x2b9cd9f2, 
0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8, + 0xe52cc12c, 0x1747422f, 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, 0xd3d3e1ab, 0x21b862a8, + 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, + 0x7fab5e8c, 0x8dc0dd8f, 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, 0x69e9f0d5, 0x9b8273d6, + 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, + 0xd5cf889d, 0x27a40b9e, 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351}; +static const uint32_t table1_[256] = { + 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, 0x4e8a61dc, 0x5d28f9ab, + 0x69cf5132, 0x7a6dc945, 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, + 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, 0x3fc5f181, 0x2c6769f6, + 0x1880c16f, 0x0b225918, 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, + 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, 0xec5b53e5, 0xfff9cb92, + 0xcb1e630b, 0xd8bcfb7c, 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, + 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, 0xe29f20ba, 0xf13db8cd, + 0xc5da1054, 0xd6788823, 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, + 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, 0x0ec4735f, 0x1d66eb28, + 0x298143b1, 0x3a23dbc6, 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, + 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, 0xff17c604, 0xecb55e73, + 0xd852f6ea, 0xcbf06e9d, 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, + 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, 0x2c896460, 0x3f2bfc17, + 0x0bcc548e, 0x186eccf9, 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, + 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, 0x5dc6f43d, 0x4e646c4a, + 0x7a83c4d3, 0x69215ca4, 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, + 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, 0xce1644da, 0xddb4dcad, + 0xe9537434, 0xfaf1ec43, 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, + 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, 0xbf59d487, 0xacfb4cf0, + 0x981ce469, 0x8bbe7c1e, 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, + 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, 0x6cc776e3, 0x7f65ee94, + 0x4b82460d, 0x5820de7a, 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, + 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, 0x66d73941, 0x7575a136, + 0x419209af, 0x523091d8, 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, + 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, 0x8a8c6aa4, 0x992ef2d3, + 0xadc95a4a, 0xbe6bc23d, 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, + 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, 0x844819fb, 0x97ea818c, + 0xa30d2915, 0xb0afb162, 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, + 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, 0x57d6bb9f, 0x447423e8, + 0x70938b71, 0x63311306, 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, + 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, 0x26992bc2, 0x353bb3b5, + 0x01dc1b2c, 0x127e835b, 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, + 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, 0x4a5e5d21, 0x59fcc556, + 0x6d1b6dcf, 0x7eb9f5b8, 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, + 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, 0x3b11cd7c, 0x28b3550b, + 0x1c54fd92, 0x0ff665e5, 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, + 
0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, 0xe88f6f18, 0xfb2df76f, + 0xcfca5ff6, 0xdc68c781, 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, + 0x35d5be23, 0x26772654, 0x12908ecd, 0x013216ba, 0xe64b1c47, 0xf5e98430, + 0xc10e2ca9, 0xd2acb4de, 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, + 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, 0x0a104fa2, 0x19b2d7d5, + 0x2d557f4c, 0x3ef7e73b, 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, + 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483}; +static const uint32_t table2_[256] = { + 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, 0x9edea41a, 0x3b9f3664, + 0xd1b1f617, 0x74f06469, 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, + 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, 0x70a27d8a, 0xd5e3eff4, + 0x3fcd2f87, 0x9a8cbdf9, 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, + 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, 0xd62de755, 0x736c752b, + 0x9942b558, 0x3c032726, 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, + 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, 0xd915c5d1, 0x7c5457af, + 0x967a97dc, 0x333b05a2, 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, + 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, 0x0f382284, 0xaa79b0fa, + 0x40577089, 0xe516e2f7, 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, + 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, 0xc76580d9, 0x622412a7, + 0x880ad2d4, 0x2d4b40aa, 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, + 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, 0x61ea1a06, 0xc4ab8878, + 0x2e85480b, 0x8bc4da75, 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, + 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, 0x8f96c396, 0x2ad751e8, + 0xc0f9919b, 0x65b803e5, 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, + 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, 0xb8ffdfd7, 0x1dbe4da9, + 0xf7908dda, 0x52d11fa4, 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, + 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, 0x56830647, 0xf3c29439, + 0x19ec544a, 0xbcadc634, 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, + 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, 0xf00c9c98, 0x554d0ee6, + 0xbf63ce95, 0x1a225ceb, 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, + 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, 0xb3764986, 0x1637dbf8, + 0xfc191b8b, 0x595889f5, 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, + 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, 0x655baed3, 0xc01a3cad, + 0x2a34fcde, 0x8f756ea0, 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, + 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, 0x6a638c57, 0xcf221e29, + 0x250cde5a, 0x804d4c24, 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, + 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, 0xccec1688, 0x69ad84f6, + 0x83834485, 0x26c2d6fb, 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, + 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, 0x2290cf18, 0x87d15d66, + 0x6dff9d15, 0xc8be0f6b, 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, + 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, 0xd29c5380, 0x77ddc1fe, + 0x9df3018d, 0x38b293f3, 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, + 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, 0x3ce08a10, 0x99a1186e, + 0x738fd81d, 0xd6ce4a63, 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, + 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, 0x9a6f10cf, 0x3f2e82b1, + 0xd50042c2, 0x7041d0bc, 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, + 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, 0x9557324b, 0x3016a035, + 0xda386046, 0x7f79f238, 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, + 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, 0x437ad51e, 
0xe63b4760, + 0x0c158713, 0xa954156d, 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, + 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8}; +static const uint32_t table3_[256] = { + 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, 0x7b2231f3, 0xa6679b4b, + 0xc4451272, 0x1900b8ca, 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, + 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, 0xe964b13d, 0x34211b85, + 0x560392bc, 0x8b463804, 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, + 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, 0x6402e328, 0xb9474990, + 0xdb65c0a9, 0x06206a11, 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, + 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, 0x2161776d, 0xfc24ddd5, + 0x9e0654ec, 0x4343fe54, 0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, + 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, 0x45639445, 0x98263efd, + 0xfa04b7c4, 0x27411d7c, 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, + 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, 0xaba65fe7, 0x76e3f55f, + 0x14c17c66, 0xc984d6de, 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, + 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, 0x26c00df2, 0xfb85a74a, + 0x99a72e73, 0x44e284cb, 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, + 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, 0xb4868d3c, 0x69c32784, + 0x0be1aebd, 0xd6a40405, 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, + 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, 0x07a17a9f, 0xdae4d027, + 0xb8c6591e, 0x6583f3a6, 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, + 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, 0x95e7fa51, 0x48a250e9, + 0x2a80d9d0, 0xf7c57368, 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, + 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, 0x1881a844, 0xc5c402fc, + 0xa7e68bc5, 0x7aa3217d, 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, + 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, 0xa4e4aad9, 0x79a10061, + 0x1b838958, 0xc6c623e0, 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, + 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, 0xc0e649f1, 0x1da3e349, + 0x7f816a70, 0xa2c4c0c8, 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, + 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, 0x8585ddb4, 0x58c0770c, + 0x3ae2fe35, 0xe7a7548d, 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, + 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, 0x08e38fa1, 0xd5a62519, + 0xb784ac20, 0x6ac10698, 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, + 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, 0x9aa50f6f, 0x47e0a5d7, + 0x25c22cee, 0xf8878656, 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, + 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, 0x8224a72b, 0x5f610d93, + 0x3d4384aa, 0xe0062e12, 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, + 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, 0x106227e5, 0xcd278d5d, + 0xaf050464, 0x7240aedc, 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, + 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, 0x9d0475f0, 0x4041df48, + 0x22635671, 0xff26fcc9, 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, + 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, 0xd867e1b5, 0x05224b0d, + 0x6700c234, 0xba45688c, 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, + 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, 0xbc65029d, 0x6120a825, + 0x0302211c, 0xde478ba4, 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, + 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842}; + +// Used to fetch a naturally-aligned 32-bit word in little endian byte-order +static inline uint32_t LE_LOAD32(const uint8_t* p) { + return DecodeFixed32(reinterpret_cast(p)); +} + +#if defined(HAVE_SSE42) && 
(defined(__LP64__) || defined(_WIN64))
+static inline uint64_t LE_LOAD64(const uint8_t* p) {
+  return DecodeFixed64(reinterpret_cast<const char*>(p));
+}
+#endif
+
+static inline void Slow_CRC32(uint64_t* l, uint8_t const** p) {
+  uint32_t c = static_cast<uint32_t>(*l ^ LE_LOAD32(*p));
+  *p += 4;
+  *l = table3_[c & 0xff] ^ table2_[(c >> 8) & 0xff] ^
+       table1_[(c >> 16) & 0xff] ^ table0_[c >> 24];
+  // DO it twice.
+  c = static_cast<uint32_t>(*l ^ LE_LOAD32(*p));
+  *p += 4;
+  *l = table3_[c & 0xff] ^ table2_[(c >> 8) & 0xff] ^
+       table1_[(c >> 16) & 0xff] ^ table0_[c >> 24];
+}
+
+#if (!(defined(HAVE_POWER8) && defined(HAS_ALTIVEC))) && \
+    (!defined(HAVE_ARM64_CRC)) || \
+    defined(NO_THREEWAY_CRC32C)
+static inline void Fast_CRC32(uint64_t* l, uint8_t const** p) {
+#ifndef HAVE_SSE42
+  Slow_CRC32(l, p);
+#elif defined(__LP64__) || defined(_WIN64)
+  *l = _mm_crc32_u64(*l, LE_LOAD64(*p));
+  *p += 8;
+#else
+  *l = _mm_crc32_u32(static_cast<uint32_t>(*l), LE_LOAD32(*p));
+  *p += 4;
+  *l = _mm_crc32_u32(static_cast<uint32_t>(*l), LE_LOAD32(*p));
+  *p += 4;
+#endif
+}
+#endif
+
+template <void (*CRC32)(uint64_t*, uint8_t const**)>
+uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) {
+  const uint8_t* p = reinterpret_cast<const uint8_t*>(buf);
+  const uint8_t* e = p + size;
+  uint64_t l = crc ^ 0xffffffffu;
+
+// Align n to (1 << m) byte boundary
+#define ALIGN(n, m) ((n + ((1 << m) - 1)) & ~((1 << m) - 1))
+
+#define STEP1 \
+  do { \
+    int c = (l & 0xff) ^ *p++; \
+    l = table0_[c] ^ (l >> 8); \
+  } while (0)
+
+  // Point x at first 16-byte aligned byte in string. This might be
+  // just past the end of the string.
+  const uintptr_t pval = reinterpret_cast<uintptr_t>(p);
+  const uint8_t* x = reinterpret_cast<const uint8_t*>(ALIGN(pval, 4));
+  if (x <= e) {
+    // Process bytes until finished or p is 16-byte aligned
+    while (p != x) {
+      STEP1;
+    }
+  }
+  // Process bytes 16 at a time
+  while ((e - p) >= 16) {
+    CRC32(&l, &p);
+    CRC32(&l, &p);
+  }
+  // Process bytes 8 at a time
+  while ((e - p) >= 8) {
+    CRC32(&l, &p);
+  }
+  // Process the last few bytes
+  while (p != e) {
+    STEP1;
+  }
+#undef STEP1
+#undef ALIGN
+  return static_cast<uint32_t>(l ^ 0xffffffffu);
+}
+
+// Detect if ARM64 CRC or not.
+#ifndef HAVE_ARM64_CRC
+// Detect if SSE42 or not.
+#ifndef HAVE_POWER8
+
+static bool isSSE42() {
+#ifndef HAVE_SSE42
+  return false;
+#elif defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE)
+  uint32_t c_;
+  __asm__("cpuid" : "=c"(c_) : "a"(1) : "ebx", "edx");
+  return c_ & (1U << 20);  // copied from CpuId.h in Folly.
Test SSE42 +#elif defined(_WIN64) + int info[4]; + __cpuidex(info, 0x00000001, 0); + return (info[2] & ((int)1 << 20)) != 0; +#else + return false; +#endif +} + +static bool isPCLMULQDQ() { +#ifndef HAVE_SSE42 + // in build_detect_platform we set this macro when both SSE42 and PCLMULQDQ + // are supported by compiler + return false; +#elif defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE) + uint32_t c_; + __asm__("cpuid" : "=c"(c_) : "a"(1) : "ebx", "edx"); + return c_ & (1U << 1); // PCLMULQDQ is in bit 1 (not bit 0) +#elif defined(_WIN64) + int info[4]; + __cpuidex(info, 0x00000001, 0); + return (info[2] & ((int)1 << 1)) != 0; +#else + return false; +#endif +} + +#endif // HAVE_POWER8 +#endif // HAVE_ARM64_CRC + +using Function = uint32_t (*)(uint32_t, const char*, size_t); + +#if defined(HAVE_POWER8) && defined(HAS_ALTIVEC) +uint32_t ExtendPPCImpl(uint32_t crc, const char* buf, size_t size) { + return crc32c_ppc(crc, (const unsigned char*)buf, size); +} + +#if __linux__ +static int arch_ppc_probe(void) { + arch_ppc_crc32 = 0; + +#if defined(__powerpc64__) && defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) + if (getauxval(AT_HWCAP2) & PPC_FEATURE2_VEC_CRYPTO) arch_ppc_crc32 = 1; +#endif /* __powerpc64__ */ + + return arch_ppc_crc32; +} +#elif __FreeBSD__ +static int arch_ppc_probe(void) { + unsigned long cpufeatures; + arch_ppc_crc32 = 0; + +#if defined(__powerpc64__) + elf_aux_info(AT_HWCAP2, &cpufeatures, sizeof(cpufeatures)); + if (cpufeatures & PPC_FEATURE2_HAS_VEC_CRYPTO) arch_ppc_crc32 = 1; +#endif /* __powerpc64__ */ + + return arch_ppc_crc32; +} +#endif // __linux__ + +static bool isAltiVec() { + if (arch_ppc_probe()) { + return true; + } else { + return false; + } +} +#endif + +#if defined(HAVE_ARM64_CRC) +uint32_t ExtendARMImpl(uint32_t crc, const char* buf, size_t size) { + return crc32c_arm64(crc, (const unsigned char*)buf, size); +} +#endif + +std::string IsFastCrc32Supported() { + bool has_fast_crc = false; + std::string fast_zero_msg; + std::string arch; +#ifdef HAVE_POWER8 +#ifdef HAS_ALTIVEC + if (arch_ppc_probe()) { + has_fast_crc = true; + arch = "PPC"; + } +#else + has_fast_crc = false; + arch = "PPC"; +#endif +#elif defined(HAVE_ARM64_CRC) + if (crc32c_runtime_check()) { + has_fast_crc = true; + arch = "Arm64"; + pmull_runtime_flag = crc32c_pmull_runtime_check(); + } else { + has_fast_crc = false; + arch = "Arm64"; + } +#else + has_fast_crc = isSSE42(); + arch = "x86"; +#endif + if (has_fast_crc) { + fast_zero_msg.append("Supported on " + arch); + } else { + fast_zero_msg.append("Not supported on " + arch); + } + return fast_zero_msg; +} + +/* + * Copyright 2016 Ferry Toth, Exalon Delft BV, The Netherlands + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the author be held liable for any damages + * arising from the use of this software. + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. 
+ * Ferry Toth + * ftoth@exalondelft.nl + * + * https://github.com/htot/crc32c + * + * Modified by Facebook + * + * Original intel whitepaper: + * "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction" + * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf + * + * This version is from the folly library, created by Dave Watson + * + * + */ +#if defined HAVE_SSE42 && defined HAVE_PCLMUL + +#define CRCtriplet(crc, buf, offset) \ + crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ + crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); \ + crc##2 = _mm_crc32_u64(crc##2, *(buf##2 + offset)); + +#define CRCduplet(crc, buf, offset) \ + crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ + crc##1 = _mm_crc32_u64(crc##1, *(buf##1 + offset)); + +#define CRCsinglet(crc, buf, offset) \ + crc = _mm_crc32_u64(crc, *(uint64_t*)(buf + offset)); + +// Numbers taken directly from intel whitepaper. +// clang-format off +const uint64_t clmul_constants[] = { + 0x14cd00bd6, 0x105ec76f0, 0x0ba4fc28e, 0x14cd00bd6, + 0x1d82c63da, 0x0f20c0dfe, 0x09e4addf8, 0x0ba4fc28e, + 0x039d3b296, 0x1384aa63a, 0x102f9b8a2, 0x1d82c63da, + 0x14237f5e6, 0x01c291d04, 0x00d3b6092, 0x09e4addf8, + 0x0c96cfdc0, 0x0740eef02, 0x18266e456, 0x039d3b296, + 0x0daece73e, 0x0083a6eec, 0x0ab7aff2a, 0x102f9b8a2, + 0x1248ea574, 0x1c1733996, 0x083348832, 0x14237f5e6, + 0x12c743124, 0x02ad91c30, 0x0b9e02b86, 0x00d3b6092, + 0x018b33a4e, 0x06992cea2, 0x1b331e26a, 0x0c96cfdc0, + 0x17d35ba46, 0x07e908048, 0x1bf2e8b8a, 0x18266e456, + 0x1a3e0968a, 0x11ed1f9d8, 0x0ce7f39f4, 0x0daece73e, + 0x061d82e56, 0x0f1d0f55e, 0x0d270f1a2, 0x0ab7aff2a, + 0x1c3f5f66c, 0x0a87ab8a8, 0x12ed0daac, 0x1248ea574, + 0x065863b64, 0x08462d800, 0x11eef4f8e, 0x083348832, + 0x1ee54f54c, 0x071d111a8, 0x0b3e32c28, 0x12c743124, + 0x0064f7f26, 0x0ffd852c6, 0x0dd7e3b0c, 0x0b9e02b86, + 0x0f285651c, 0x0dcb17aa4, 0x010746f3c, 0x018b33a4e, + 0x1c24afea4, 0x0f37c5aee, 0x0271d9844, 0x1b331e26a, + 0x08e766a0c, 0x06051d5a2, 0x093a5f730, 0x17d35ba46, + 0x06cb08e5c, 0x11d5ca20e, 0x06b749fb2, 0x1bf2e8b8a, + 0x1167f94f2, 0x021f3d99c, 0x0cec3662e, 0x1a3e0968a, + 0x19329634a, 0x08f158014, 0x0e6fc4e6a, 0x0ce7f39f4, + 0x08227bb8a, 0x1a5e82106, 0x0b0cd4768, 0x061d82e56, + 0x13c2b89c4, 0x188815ab2, 0x0d7a4825c, 0x0d270f1a2, + 0x10f5ff2ba, 0x105405f3e, 0x00167d312, 0x1c3f5f66c, + 0x0f6076544, 0x0e9adf796, 0x026f6a60a, 0x12ed0daac, + 0x1a2adb74e, 0x096638b34, 0x19d34af3a, 0x065863b64, + 0x049c3cc9c, 0x1e50585a0, 0x068bce87a, 0x11eef4f8e, + 0x1524fa6c6, 0x19f1c69dc, 0x16cba8aca, 0x1ee54f54c, + 0x042d98888, 0x12913343e, 0x1329d9f7e, 0x0b3e32c28, + 0x1b1c69528, 0x088f25a3a, 0x02178513a, 0x0064f7f26, + 0x0e0ac139e, 0x04e36f0b0, 0x0170076fa, 0x0dd7e3b0c, + 0x141a1a2e2, 0x0bd6f81f8, 0x16ad828b4, 0x0f285651c, + 0x041d17b64, 0x19425cbba, 0x1fae1cc66, 0x010746f3c, + 0x1a75b4b00, 0x18db37e8a, 0x0f872e54c, 0x1c24afea4, + 0x01e41e9fc, 0x04c144932, 0x086d8e4d2, 0x0271d9844, + 0x160f7af7a, 0x052148f02, 0x05bb8f1bc, 0x08e766a0c, + 0x0a90fd27a, 0x0a3c6f37a, 0x0b3af077a, 0x093a5f730, + 0x04984d782, 0x1d22c238e, 0x0ca6ef3ac, 0x06cb08e5c, + 0x0234e0b26, 0x063ded06a, 0x1d88abd4a, 0x06b749fb2, + 0x04597456a, 0x04d56973c, 0x0e9e28eb4, 0x1167f94f2, + 0x07b3ff57a, 0x19385bf2e, 0x0c9c8b782, 0x0cec3662e, + 0x13a9cba9e, 0x0e417f38a, 0x093e106a4, 0x19329634a, + 0x167001a9c, 0x14e727980, 0x1ddffc5d4, 0x0e6fc4e6a, + 0x00df04680, 0x0d104b8fc, 0x02342001e, 0x08227bb8a, + 0x00a2a8d7e, 0x05b397730, 0x168763fa6, 0x0b0cd4768, + 0x1ed5a407a, 0x0e78eb416, 
0x0d2c3ed1a, 0x13c2b89c4, + 0x0995a5724, 0x1641378f0, 0x19b1afbc4, 0x0d7a4825c, + 0x109ffedc0, 0x08d96551c, 0x0f2271e60, 0x10f5ff2ba, + 0x00b0bf8ca, 0x00bf80dd2, 0x123888b7a, 0x00167d312, + 0x1e888f7dc, 0x18dcddd1c, 0x002ee03b2, 0x0f6076544, + 0x183e8d8fe, 0x06a45d2b2, 0x133d7a042, 0x026f6a60a, + 0x116b0f50c, 0x1dd3e10e8, 0x05fabe670, 0x1a2adb74e, + 0x130004488, 0x0de87806c, 0x000bcf5f6, 0x19d34af3a, + 0x18f0c7078, 0x014338754, 0x017f27698, 0x049c3cc9c, + 0x058ca5f00, 0x15e3e77ee, 0x1af900c24, 0x068bce87a, + 0x0b5cfca28, 0x0dd07448e, 0x0ded288f8, 0x1524fa6c6, + 0x059f229bc, 0x1d8048348, 0x06d390dec, 0x16cba8aca, + 0x037170390, 0x0a3e3e02c, 0x06353c1cc, 0x042d98888, + 0x0c4584f5c, 0x0d73c7bea, 0x1f16a3418, 0x1329d9f7e, + 0x0531377e2, 0x185137662, 0x1d8d9ca7c, 0x1b1c69528, + 0x0b25b29f2, 0x18a08b5bc, 0x19fb2a8b0, 0x02178513a, + 0x1a08fe6ac, 0x1da758ae0, 0x045cddf4e, 0x0e0ac139e, + 0x1a91647f2, 0x169cf9eb0, 0x1a0f717c4, 0x0170076fa, +}; + +// Compute the crc32c value for buffer smaller than 8 +#ifdef ROCKSDB_UBSAN_RUN +#if defined(__clang__) +__attribute__((__no_sanitize__("alignment"))) +#elif defined(__GNUC__) +__attribute__((__no_sanitize_undefined__)) +#endif +#endif +inline void align_to_8( + size_t len, + uint64_t& crc0, // crc so far, updated on return + const unsigned char*& next) { // next data pointer, updated on return + uint32_t crc32bit = static_cast(crc0); + if (len & 0x04) { + crc32bit = _mm_crc32_u32(crc32bit, *(uint32_t*)next); + next += sizeof(uint32_t); + } + if (len & 0x02) { + crc32bit = _mm_crc32_u16(crc32bit, *(uint16_t*)next); + next += sizeof(uint16_t); + } + if (len & 0x01) { + crc32bit = _mm_crc32_u8(crc32bit, *(next)); + next++; + } + crc0 = crc32bit; +} + +// +// CombineCRC performs pclmulqdq multiplication of 2 partial CRC's and a well +// chosen constant and xor's these with the remaining CRC. +// +inline uint64_t CombineCRC( + size_t block_size, + uint64_t crc0, + uint64_t crc1, + uint64_t crc2, + const uint64_t* next2) { + const auto multiplier = + *(reinterpret_cast(clmul_constants) + block_size - 1); + const auto crc0_xmm = _mm_set_epi64x(0, crc0); + const auto res0 = _mm_clmulepi64_si128(crc0_xmm, multiplier, 0x00); + const auto crc1_xmm = _mm_set_epi64x(0, crc1); + const auto res1 = _mm_clmulepi64_si128(crc1_xmm, multiplier, 0x10); + const auto res = _mm_xor_si128(res0, res1); + crc0 = _mm_cvtsi128_si64(res); + crc0 = crc0 ^ *((uint64_t*)next2 - 1); + crc2 = _mm_crc32_u64(crc2, crc0); + return crc2; +} + +// Compute CRC-32C using the Intel hardware instruction. 
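The clmul constants and CombineCRC() above exist to stitch together three CRC streams that are computed in parallel. Below is a minimal sketch of the interleaving idea only, assuming an SSE4.2 target; the lane merge is deliberately left out, because merging is exactly what CombineCRC() does with a block-size-dependent carry-less-multiply constant.

    #include <nmmintrin.h>  // _mm_crc32_u64 (SSE4.2)
    #include <stddef.h>
    #include <stdint.h>

    struct ThreeLanes {
      uint64_t c0, c1, c2;
    };

    // Three independent CRC32C chains over three equal slices of one buffer.
    // Because the chains carry no data dependency on each other, the CPU can
    // overlap the multi-cycle latency of consecutive crc32 instructions.
    inline ThreeLanes ThreeWayLanes(const uint64_t* p, size_t words_per_lane) {
      ThreeLanes lanes{0xffffffffu, 0, 0};  // lane 0 carries the inverted seed
      const uint64_t* p0 = p;
      const uint64_t* p1 = p + words_per_lane;
      const uint64_t* p2 = p + 2 * words_per_lane;
      for (size_t i = 0; i < words_per_lane; ++i) {
        lanes.c0 = _mm_crc32_u64(lanes.c0, p0[i]);
        lanes.c1 = _mm_crc32_u64(lanes.c1, p1[i]);
        lanes.c2 = _mm_crc32_u64(lanes.c2, p2[i]);
      }
      return lanes;  // folding the lanes back together needs CombineCRC() above
    }

crc32c_3way() below is the production version of this loop, unrolled with a Duff's-device switch and folded back into a single value per block.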
+#ifdef ROCKSDB_UBSAN_RUN +#if defined(__clang__) +__attribute__((__no_sanitize__("alignment"))) +#elif defined(__GNUC__) +__attribute__((__no_sanitize_undefined__)) +#endif +#endif +uint32_t crc32c_3way(uint32_t crc, const char* buf, size_t len) { + const unsigned char* next = (const unsigned char*)buf; + uint64_t count; + uint64_t crc0, crc1, crc2; + crc0 = crc ^ 0xffffffffu; + + + if (len >= 8) { + // if len > 216 then align and use triplets + if (len > 216) { + { + // Work on the bytes (< 8) before the first 8-byte alignment addr starts + uint64_t align_bytes = (8 - (uintptr_t)next) & 7; + len -= align_bytes; + align_to_8(align_bytes, crc0, next); + } + + // Now work on the remaining blocks + count = len / 24; // number of triplets + len %= 24; // bytes remaining + uint64_t n = count >> 7; // #blocks = first block + full blocks + uint64_t block_size = count & 127; + if (block_size == 0) { + block_size = 128; + } else { + n++; + } + // points to the first byte of the next block + const uint64_t* next0 = (uint64_t*)next + block_size; + const uint64_t* next1 = next0 + block_size; + const uint64_t* next2 = next1 + block_size; + + crc1 = crc2 = 0; + // Use Duff's device, a for() loop inside a switch() + // statement. This needs to execute at least once, round len + // down to nearest triplet multiple + switch (block_size) { + case 128: + do { + // jumps here for a full block of len 128 + CRCtriplet(crc, next, -128); + FALLTHROUGH_INTENDED; + case 127: + // jumps here or below for the first block smaller + CRCtriplet(crc, next, -127); + FALLTHROUGH_INTENDED; + case 126: + CRCtriplet(crc, next, -126); // than 128 + FALLTHROUGH_INTENDED; + case 125: + CRCtriplet(crc, next, -125); + FALLTHROUGH_INTENDED; + case 124: + CRCtriplet(crc, next, -124); + FALLTHROUGH_INTENDED; + case 123: + CRCtriplet(crc, next, -123); + FALLTHROUGH_INTENDED; + case 122: + CRCtriplet(crc, next, -122); + FALLTHROUGH_INTENDED; + case 121: + CRCtriplet(crc, next, -121); + FALLTHROUGH_INTENDED; + case 120: + CRCtriplet(crc, next, -120); + FALLTHROUGH_INTENDED; + case 119: + CRCtriplet(crc, next, -119); + FALLTHROUGH_INTENDED; + case 118: + CRCtriplet(crc, next, -118); + FALLTHROUGH_INTENDED; + case 117: + CRCtriplet(crc, next, -117); + FALLTHROUGH_INTENDED; + case 116: + CRCtriplet(crc, next, -116); + FALLTHROUGH_INTENDED; + case 115: + CRCtriplet(crc, next, -115); + FALLTHROUGH_INTENDED; + case 114: + CRCtriplet(crc, next, -114); + FALLTHROUGH_INTENDED; + case 113: + CRCtriplet(crc, next, -113); + FALLTHROUGH_INTENDED; + case 112: + CRCtriplet(crc, next, -112); + FALLTHROUGH_INTENDED; + case 111: + CRCtriplet(crc, next, -111); + FALLTHROUGH_INTENDED; + case 110: + CRCtriplet(crc, next, -110); + FALLTHROUGH_INTENDED; + case 109: + CRCtriplet(crc, next, -109); + FALLTHROUGH_INTENDED; + case 108: + CRCtriplet(crc, next, -108); + FALLTHROUGH_INTENDED; + case 107: + CRCtriplet(crc, next, -107); + FALLTHROUGH_INTENDED; + case 106: + CRCtriplet(crc, next, -106); + FALLTHROUGH_INTENDED; + case 105: + CRCtriplet(crc, next, -105); + FALLTHROUGH_INTENDED; + case 104: + CRCtriplet(crc, next, -104); + FALLTHROUGH_INTENDED; + case 103: + CRCtriplet(crc, next, -103); + FALLTHROUGH_INTENDED; + case 102: + CRCtriplet(crc, next, -102); + FALLTHROUGH_INTENDED; + case 101: + CRCtriplet(crc, next, -101); + FALLTHROUGH_INTENDED; + case 100: + CRCtriplet(crc, next, -100); + FALLTHROUGH_INTENDED; + case 99: + CRCtriplet(crc, next, -99); + FALLTHROUGH_INTENDED; + case 98: + CRCtriplet(crc, next, -98); + FALLTHROUGH_INTENDED; + case 97: + 
CRCtriplet(crc, next, -97); + FALLTHROUGH_INTENDED; + case 96: + CRCtriplet(crc, next, -96); + FALLTHROUGH_INTENDED; + case 95: + CRCtriplet(crc, next, -95); + FALLTHROUGH_INTENDED; + case 94: + CRCtriplet(crc, next, -94); + FALLTHROUGH_INTENDED; + case 93: + CRCtriplet(crc, next, -93); + FALLTHROUGH_INTENDED; + case 92: + CRCtriplet(crc, next, -92); + FALLTHROUGH_INTENDED; + case 91: + CRCtriplet(crc, next, -91); + FALLTHROUGH_INTENDED; + case 90: + CRCtriplet(crc, next, -90); + FALLTHROUGH_INTENDED; + case 89: + CRCtriplet(crc, next, -89); + FALLTHROUGH_INTENDED; + case 88: + CRCtriplet(crc, next, -88); + FALLTHROUGH_INTENDED; + case 87: + CRCtriplet(crc, next, -87); + FALLTHROUGH_INTENDED; + case 86: + CRCtriplet(crc, next, -86); + FALLTHROUGH_INTENDED; + case 85: + CRCtriplet(crc, next, -85); + FALLTHROUGH_INTENDED; + case 84: + CRCtriplet(crc, next, -84); + FALLTHROUGH_INTENDED; + case 83: + CRCtriplet(crc, next, -83); + FALLTHROUGH_INTENDED; + case 82: + CRCtriplet(crc, next, -82); + FALLTHROUGH_INTENDED; + case 81: + CRCtriplet(crc, next, -81); + FALLTHROUGH_INTENDED; + case 80: + CRCtriplet(crc, next, -80); + FALLTHROUGH_INTENDED; + case 79: + CRCtriplet(crc, next, -79); + FALLTHROUGH_INTENDED; + case 78: + CRCtriplet(crc, next, -78); + FALLTHROUGH_INTENDED; + case 77: + CRCtriplet(crc, next, -77); + FALLTHROUGH_INTENDED; + case 76: + CRCtriplet(crc, next, -76); + FALLTHROUGH_INTENDED; + case 75: + CRCtriplet(crc, next, -75); + FALLTHROUGH_INTENDED; + case 74: + CRCtriplet(crc, next, -74); + FALLTHROUGH_INTENDED; + case 73: + CRCtriplet(crc, next, -73); + FALLTHROUGH_INTENDED; + case 72: + CRCtriplet(crc, next, -72); + FALLTHROUGH_INTENDED; + case 71: + CRCtriplet(crc, next, -71); + FALLTHROUGH_INTENDED; + case 70: + CRCtriplet(crc, next, -70); + FALLTHROUGH_INTENDED; + case 69: + CRCtriplet(crc, next, -69); + FALLTHROUGH_INTENDED; + case 68: + CRCtriplet(crc, next, -68); + FALLTHROUGH_INTENDED; + case 67: + CRCtriplet(crc, next, -67); + FALLTHROUGH_INTENDED; + case 66: + CRCtriplet(crc, next, -66); + FALLTHROUGH_INTENDED; + case 65: + CRCtriplet(crc, next, -65); + FALLTHROUGH_INTENDED; + case 64: + CRCtriplet(crc, next, -64); + FALLTHROUGH_INTENDED; + case 63: + CRCtriplet(crc, next, -63); + FALLTHROUGH_INTENDED; + case 62: + CRCtriplet(crc, next, -62); + FALLTHROUGH_INTENDED; + case 61: + CRCtriplet(crc, next, -61); + FALLTHROUGH_INTENDED; + case 60: + CRCtriplet(crc, next, -60); + FALLTHROUGH_INTENDED; + case 59: + CRCtriplet(crc, next, -59); + FALLTHROUGH_INTENDED; + case 58: + CRCtriplet(crc, next, -58); + FALLTHROUGH_INTENDED; + case 57: + CRCtriplet(crc, next, -57); + FALLTHROUGH_INTENDED; + case 56: + CRCtriplet(crc, next, -56); + FALLTHROUGH_INTENDED; + case 55: + CRCtriplet(crc, next, -55); + FALLTHROUGH_INTENDED; + case 54: + CRCtriplet(crc, next, -54); + FALLTHROUGH_INTENDED; + case 53: + CRCtriplet(crc, next, -53); + FALLTHROUGH_INTENDED; + case 52: + CRCtriplet(crc, next, -52); + FALLTHROUGH_INTENDED; + case 51: + CRCtriplet(crc, next, -51); + FALLTHROUGH_INTENDED; + case 50: + CRCtriplet(crc, next, -50); + FALLTHROUGH_INTENDED; + case 49: + CRCtriplet(crc, next, -49); + FALLTHROUGH_INTENDED; + case 48: + CRCtriplet(crc, next, -48); + FALLTHROUGH_INTENDED; + case 47: + CRCtriplet(crc, next, -47); + FALLTHROUGH_INTENDED; + case 46: + CRCtriplet(crc, next, -46); + FALLTHROUGH_INTENDED; + case 45: + CRCtriplet(crc, next, -45); + FALLTHROUGH_INTENDED; + case 44: + CRCtriplet(crc, next, -44); + FALLTHROUGH_INTENDED; + case 43: + CRCtriplet(crc, next, -43); + 
FALLTHROUGH_INTENDED; + case 42: + CRCtriplet(crc, next, -42); + FALLTHROUGH_INTENDED; + case 41: + CRCtriplet(crc, next, -41); + FALLTHROUGH_INTENDED; + case 40: + CRCtriplet(crc, next, -40); + FALLTHROUGH_INTENDED; + case 39: + CRCtriplet(crc, next, -39); + FALLTHROUGH_INTENDED; + case 38: + CRCtriplet(crc, next, -38); + FALLTHROUGH_INTENDED; + case 37: + CRCtriplet(crc, next, -37); + FALLTHROUGH_INTENDED; + case 36: + CRCtriplet(crc, next, -36); + FALLTHROUGH_INTENDED; + case 35: + CRCtriplet(crc, next, -35); + FALLTHROUGH_INTENDED; + case 34: + CRCtriplet(crc, next, -34); + FALLTHROUGH_INTENDED; + case 33: + CRCtriplet(crc, next, -33); + FALLTHROUGH_INTENDED; + case 32: + CRCtriplet(crc, next, -32); + FALLTHROUGH_INTENDED; + case 31: + CRCtriplet(crc, next, -31); + FALLTHROUGH_INTENDED; + case 30: + CRCtriplet(crc, next, -30); + FALLTHROUGH_INTENDED; + case 29: + CRCtriplet(crc, next, -29); + FALLTHROUGH_INTENDED; + case 28: + CRCtriplet(crc, next, -28); + FALLTHROUGH_INTENDED; + case 27: + CRCtriplet(crc, next, -27); + FALLTHROUGH_INTENDED; + case 26: + CRCtriplet(crc, next, -26); + FALLTHROUGH_INTENDED; + case 25: + CRCtriplet(crc, next, -25); + FALLTHROUGH_INTENDED; + case 24: + CRCtriplet(crc, next, -24); + FALLTHROUGH_INTENDED; + case 23: + CRCtriplet(crc, next, -23); + FALLTHROUGH_INTENDED; + case 22: + CRCtriplet(crc, next, -22); + FALLTHROUGH_INTENDED; + case 21: + CRCtriplet(crc, next, -21); + FALLTHROUGH_INTENDED; + case 20: + CRCtriplet(crc, next, -20); + FALLTHROUGH_INTENDED; + case 19: + CRCtriplet(crc, next, -19); + FALLTHROUGH_INTENDED; + case 18: + CRCtriplet(crc, next, -18); + FALLTHROUGH_INTENDED; + case 17: + CRCtriplet(crc, next, -17); + FALLTHROUGH_INTENDED; + case 16: + CRCtriplet(crc, next, -16); + FALLTHROUGH_INTENDED; + case 15: + CRCtriplet(crc, next, -15); + FALLTHROUGH_INTENDED; + case 14: + CRCtriplet(crc, next, -14); + FALLTHROUGH_INTENDED; + case 13: + CRCtriplet(crc, next, -13); + FALLTHROUGH_INTENDED; + case 12: + CRCtriplet(crc, next, -12); + FALLTHROUGH_INTENDED; + case 11: + CRCtriplet(crc, next, -11); + FALLTHROUGH_INTENDED; + case 10: + CRCtriplet(crc, next, -10); + FALLTHROUGH_INTENDED; + case 9: + CRCtriplet(crc, next, -9); + FALLTHROUGH_INTENDED; + case 8: + CRCtriplet(crc, next, -8); + FALLTHROUGH_INTENDED; + case 7: + CRCtriplet(crc, next, -7); + FALLTHROUGH_INTENDED; + case 6: + CRCtriplet(crc, next, -6); + FALLTHROUGH_INTENDED; + case 5: + CRCtriplet(crc, next, -5); + FALLTHROUGH_INTENDED; + case 4: + CRCtriplet(crc, next, -4); + FALLTHROUGH_INTENDED; + case 3: + CRCtriplet(crc, next, -3); + FALLTHROUGH_INTENDED; + case 2: + CRCtriplet(crc, next, -2); + FALLTHROUGH_INTENDED; + case 1: + CRCduplet(crc, next, -1); // the final triplet is actually only 2 + //{ CombineCRC(); } + crc0 = CombineCRC(block_size, crc0, crc1, crc2, next2); + if (--n > 0) { + crc1 = crc2 = 0; + block_size = 128; + // points to the first byte of the next block + next0 = next2 + 128; + next1 = next0 + 128; // from here on all blocks are 128 long + next2 = next1 + 128; + } + FALLTHROUGH_INTENDED; + case 0:; + } while (n > 0); + } + next = (const unsigned char*)next2; + } + uint64_t count2 = len >> 3; // 216 of less bytes is 27 or less singlets + len = len & 7; + next += (count2 * 8); + switch (count2) { + case 27: + CRCsinglet(crc0, next, -27 * 8); + FALLTHROUGH_INTENDED; + case 26: + CRCsinglet(crc0, next, -26 * 8); + FALLTHROUGH_INTENDED; + case 25: + CRCsinglet(crc0, next, -25 * 8); + FALLTHROUGH_INTENDED; + case 24: + CRCsinglet(crc0, next, -24 * 8); + 
FALLTHROUGH_INTENDED; + case 23: + CRCsinglet(crc0, next, -23 * 8); + FALLTHROUGH_INTENDED; + case 22: + CRCsinglet(crc0, next, -22 * 8); + FALLTHROUGH_INTENDED; + case 21: + CRCsinglet(crc0, next, -21 * 8); + FALLTHROUGH_INTENDED; + case 20: + CRCsinglet(crc0, next, -20 * 8); + FALLTHROUGH_INTENDED; + case 19: + CRCsinglet(crc0, next, -19 * 8); + FALLTHROUGH_INTENDED; + case 18: + CRCsinglet(crc0, next, -18 * 8); + FALLTHROUGH_INTENDED; + case 17: + CRCsinglet(crc0, next, -17 * 8); + FALLTHROUGH_INTENDED; + case 16: + CRCsinglet(crc0, next, -16 * 8); + FALLTHROUGH_INTENDED; + case 15: + CRCsinglet(crc0, next, -15 * 8); + FALLTHROUGH_INTENDED; + case 14: + CRCsinglet(crc0, next, -14 * 8); + FALLTHROUGH_INTENDED; + case 13: + CRCsinglet(crc0, next, -13 * 8); + FALLTHROUGH_INTENDED; + case 12: + CRCsinglet(crc0, next, -12 * 8); + FALLTHROUGH_INTENDED; + case 11: + CRCsinglet(crc0, next, -11 * 8); + FALLTHROUGH_INTENDED; + case 10: + CRCsinglet(crc0, next, -10 * 8); + FALLTHROUGH_INTENDED; + case 9: + CRCsinglet(crc0, next, -9 * 8); + FALLTHROUGH_INTENDED; + case 8: + CRCsinglet(crc0, next, -8 * 8); + FALLTHROUGH_INTENDED; + case 7: + CRCsinglet(crc0, next, -7 * 8); + FALLTHROUGH_INTENDED; + case 6: + CRCsinglet(crc0, next, -6 * 8); + FALLTHROUGH_INTENDED; + case 5: + CRCsinglet(crc0, next, -5 * 8); + FALLTHROUGH_INTENDED; + case 4: + CRCsinglet(crc0, next, -4 * 8); + FALLTHROUGH_INTENDED; + case 3: + CRCsinglet(crc0, next, -3 * 8); + FALLTHROUGH_INTENDED; + case 2: + CRCsinglet(crc0, next, -2 * 8); + FALLTHROUGH_INTENDED; + case 1: + CRCsinglet(crc0, next, -1 * 8); + FALLTHROUGH_INTENDED; + case 0:; + } + } + { + align_to_8(len, crc0, next); + return (uint32_t)crc0 ^ 0xffffffffu; + } +} + +#endif //HAVE_SSE42 && HAVE_PCLMUL + +static inline Function Choose_Extend() { +#ifdef HAVE_POWER8 + return isAltiVec() ? ExtendPPCImpl : ExtendImpl; +#elif defined(HAVE_ARM64_CRC) + if(crc32c_runtime_check()) { + pmull_runtime_flag = crc32c_pmull_runtime_check(); + return ExtendARMImpl; + } else { + return ExtendImpl; + } +#else + if (isSSE42()) { + if (isPCLMULQDQ()) { +#if (defined HAVE_SSE42 && defined HAVE_PCLMUL) && !defined NO_THREEWAY_CRC32C + return crc32c_3way; +#else + return ExtendImpl; // Fast_CRC32 will check HAVE_SSE42 itself +#endif + } + else { // no runtime PCLMULQDQ support but has SSE42 support + return ExtendImpl; + } + } // end of isSSE42() + else { + return ExtendImpl; + } +#endif +} + +static Function ChosenExtend = Choose_Extend(); +uint32_t Extend(uint32_t crc, const char* buf, size_t size) { + return ChosenExtend(crc, buf, size); +} + +// The code for crc32c combine, copied with permission from folly + +// Standard galois-field multiply. The only modification is that a, +// b, m, and p are all bit-reflected. +// +// https://en.wikipedia.org/wiki/Finite_field_arithmetic +static constexpr uint32_t gf_multiply_sw_1( + size_t i, uint32_t p, uint32_t a, uint32_t b, uint32_t m) { + // clang-format off + return i == 32 ? 
p : gf_multiply_sw_1(
+      /* i = */ i + 1,
+      /* p = */ p ^ ((0u-((b >> 31) & 1)) & a),
+      /* a = */ (a >> 1) ^ ((0u-(a & 1)) & m),
+      /* b = */ b << 1,
+      /* m = */ m);
+  // clang-format on
+}
+static constexpr uint32_t gf_multiply_sw(uint32_t a, uint32_t b, uint32_t m) {
+  return gf_multiply_sw_1(/* i = */ 0, /* p = */ 0, a, b, m);
+}
+
+static constexpr uint32_t gf_square_sw(uint32_t a, uint32_t m) {
+  return gf_multiply_sw(a, a, m);
+}
+
+template <size_t i, uint32_t m>
+struct gf_powers_memo {
+  static constexpr uint32_t value =
+      gf_square_sw(gf_powers_memo<i - 1, m>::value, m);
+};
+template <uint32_t m>
+struct gf_powers_memo<0, m> {
+  static constexpr uint32_t value = m;
+};
+
+template <typename T, T... Ints>
+struct integer_sequence {
+  using value_type = T;
+  static constexpr size_t size() { return sizeof...(Ints); }
+};
+
+template <typename T, size_t N, T... Is>
+struct make_integer_sequence : make_integer_sequence<T, N - 1, N - 1, Is...> {};
+
+template <typename T, T... Is>
+struct make_integer_sequence<T, 0, Is...> : integer_sequence<T, Is...> {};
+
+template <size_t N>
+using make_index_sequence = make_integer_sequence<size_t, N>;
+
+template <uint32_t m>
+struct gf_powers_make {
+  template <size_t... i>
+  using index_sequence = integer_sequence<size_t, i...>;
+  template <size_t... i>
+  constexpr std::array<uint32_t, sizeof...(i)> operator()(
+      index_sequence<i...>) const {
+    return std::array<uint32_t, sizeof...(i)>{{gf_powers_memo<i, m>::value...}};
+  }
+};
+
+static constexpr uint32_t crc32c_m = 0x82f63b78;
+
+static constexpr std::array<uint32_t, 62> const crc32c_powers =
+    gf_powers_make<crc32c_m>{}(make_index_sequence<62>{});
+
+// Expects a "pure" crc (see Crc32cCombine)
+static uint32_t Crc32AppendZeroes(
+    uint32_t crc, size_t len_over_4, uint32_t polynomial,
+    std::array<uint32_t, 62> const& powers_array) {
+  auto powers = powers_array.data();
+  // Append by multiplying by consecutive powers of two of the zeroes
+  // array
+  size_t len_bits = len_over_4;
+
+  while (len_bits) {
+    // Advance directly to next bit set.
+    auto r = CountTrailingZeroBits(len_bits);
+    len_bits >>= r;
+    powers += r;
+
+    crc = gf_multiply_sw(crc, *powers, polynomial);
+
+    len_bits >>= 1;
+    powers++;
+  }
+
+  return crc;
+}
+
+static inline uint32_t InvertedToPure(uint32_t crc) { return ~crc; }
+
+static inline uint32_t PureToInverted(uint32_t crc) { return ~crc; }
+
+static inline uint32_t PureExtend(uint32_t crc, const char* buf, size_t size) {
+  return InvertedToPure(Extend(PureToInverted(crc), buf, size));
+}
+
+// Background:
+// RocksDB uses two kinds of crc32c values: masked and unmasked. Neither is
+// a "pure" CRC because a pure CRC satisfies (^ for xor)
+// crc(a ^ b) = crc(a) ^ crc(b)
+// The unmasked is closest, and this function takes unmasked crc32c values.
+// The unmasked values are impure in two ways:
+// * The initial setting at the start of CRC computation is all 1 bits
+// (like -1) instead of zero.
+// * The result has all bits inverted.
+// Note that together, these result in the empty string having a crc32c of
+// zero. See
+// https://en.wikipedia.org/wiki/Computation_of_cyclic_redundancy_checks#CRC_variants
+//
+// Simplified version of strategy, using xor through pure CRCs (+ for concat):
+//
+// pure_crc(str1 + str2) = pure_crc(str1 + zeros(len(str2))) ^
+//                         pure_crc(zeros(len(str1)) + str2)
+//
+// because the xor of these two zero-padded strings is str1 + str2. For pure
+// CRC, leading zeros don't affect the result, so we only need
+//
+// pure_crc(str1 + str2) = pure_crc(str1 + zeros(len(str2))) ^
+//                         pure_crc(str2)
+//
+// Considering we aren't working with pure CRCs, what is actually in the input?
+// +// crc1 = PureToInverted(PureExtendCrc32c(-1, zeros, crc1len) ^ +// PureCrc32c(str1, crc1len)) +// crc2 = PureToInverted(PureExtendCrc32c(-1, zeros, crc2len) ^ +// PureCrc32c(str2, crc2len)) +// +// The result we want to compute is +// combined = PureToInverted(PureExtendCrc32c(PureExtendCrc32c(-1, zeros, +// crc1len) ^ +// PureCrc32c(str1, crc1len), +// zeros, crc2len) ^ +// PureCrc32c(str2, crc2len)) +// +// Thus, in addition to extending crc1 over the length of str2 in (virtual) +// zeros, we need to cancel out the -1 initializer that was used in computing +// crc2. To cancel it out, we also need to extend it over crc2len in zeros. +// To simplify, since the end of str1 and that -1 initializer for crc2 are at +// the same logical position, we can combine them before we extend over the +// zeros. +uint32_t Crc32cCombine(uint32_t crc1, uint32_t crc2, size_t crc2len) { + uint32_t pure_crc1_with_init = InvertedToPure(crc1); + uint32_t pure_crc2_with_init = InvertedToPure(crc2); + uint32_t pure_crc2_init = static_cast(-1); + + // Append up to 32 bits of zeroes in the normal way + char zeros[4] = {0, 0, 0, 0}; + auto len = crc2len & 3; + uint32_t tmp = pure_crc1_with_init ^ pure_crc2_init; + if (len) { + tmp = PureExtend(tmp, zeros, len); + } + return PureToInverted( + Crc32AppendZeroes(tmp, crc2len / 4, crc32c_m, crc32c_powers) ^ + pure_crc2_with_init); +} + +} // namespace crc32c +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/crc32c.h b/src/rocksdb/util/crc32c.h new file mode 100644 index 000000000..a08ad60af --- /dev/null +++ b/src/rocksdb/util/crc32c.h @@ -0,0 +1,56 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include +#include + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +namespace crc32c { + +extern std::string IsFastCrc32Supported(); + +// Return the crc32c of concat(A, data[0,n-1]) where init_crc is the +// crc32c of some string A. Extend() is often used to maintain the +// crc32c of a stream of data. +extern uint32_t Extend(uint32_t init_crc, const char* data, size_t n); + +// Takes two unmasked crc32c values, and the length of the string from +// which `crc2` was computed, and computes a crc32c value for the +// concatenation of the original two input strings. Running time is +// ~ log(crc2len). +extern uint32_t Crc32cCombine(uint32_t crc1, uint32_t crc2, size_t crc2len); + +// Return the crc32c of data[0,n-1] +inline uint32_t Value(const char* data, size_t n) { return Extend(0, data, n); } + +static const uint32_t kMaskDelta = 0xa282ead8ul; + +// Return a masked representation of crc. +// +// Motivation: it is problematic to compute the CRC of a string that +// contains embedded CRCs. Therefore we recommend that CRCs stored +// somewhere (e.g., in files) should be masked before being stored. +inline uint32_t Mask(uint32_t crc) { + // Rotate right by 15 bits and add a constant. + return ((crc >> 15) | (crc << 17)) + kMaskDelta; +} + +// Return the crc whose masked representation is masked_crc. 
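Mask() above and Unmask(), defined next, are meant to bracket any CRC that is persisted alongside the data it covers. A minimal sketch of that round trip, assuming only the declarations in this header; the record-layout helpers are hypothetical and ignore byte order for brevity, they are not RocksDB APIs.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <string>

    // Store: 4-byte masked crc32c of the payload, then the payload itself.
    inline void AppendChecksummedRecord(std::string* dst, const char* data,
                                        size_t n) {
      uint32_t masked = ROCKSDB_NAMESPACE::crc32c::Mask(
          ROCKSDB_NAMESPACE::crc32c::Value(data, n));
      dst->append(reinterpret_cast<const char*>(&masked), sizeof(masked));
      dst->append(data, n);
    }

    // Verify: unmask the stored value before comparing, so a CRC computed
    // over the whole record never "contains" a raw CRC of its own payload.
    inline bool VerifyChecksummedRecord(const char* rec, size_t payload_len) {
      uint32_t masked;
      std::memcpy(&masked, rec, sizeof(masked));
      uint32_t expected = ROCKSDB_NAMESPACE::crc32c::Unmask(masked);
      return expected == ROCKSDB_NAMESPACE::crc32c::Value(rec + sizeof(masked),
                                                          payload_len);
    }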
+inline uint32_t Unmask(uint32_t masked_crc) { + uint32_t rot = masked_crc - kMaskDelta; + return ((rot >> 17) | (rot << 15)); +} + +} // namespace crc32c +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/crc32c_arm64.cc b/src/rocksdb/util/crc32c_arm64.cc new file mode 100644 index 000000000..4885f4fe1 --- /dev/null +++ b/src/rocksdb/util/crc32c_arm64.cc @@ -0,0 +1,215 @@ +// Copyright (c) 2018, Arm Limited and affiliates. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/crc32c_arm64.h" + +#if defined(HAVE_ARM64_CRC) + +#if defined(__linux__) +#include +#endif +#ifdef ROCKSDB_AUXV_GETAUXVAL_PRESENT +#include +#endif +#ifndef HWCAP_CRC32 +#define HWCAP_CRC32 (1 << 7) +#endif +#ifndef HWCAP_PMULL +#define HWCAP_PMULL (1 << 4) +#endif +#if defined(__APPLE__) +#include +#endif +#if defined(__OpenBSD__) +#include +#include +#include +#include +#endif + +#ifdef HAVE_ARM64_CRYPTO +/* unfolding to compute 8 * 3 = 24 bytes parallelly */ +#define CRC32C24BYTES(ITR) \ + crc1 = crc32c_u64(crc1, *(buf64 + BLK_LENGTH + (ITR))); \ + crc2 = crc32c_u64(crc2, *(buf64 + BLK_LENGTH * 2 + (ITR))); \ + crc0 = crc32c_u64(crc0, *(buf64 + (ITR))); + +/* unfolding to compute 24 * 7 = 168 bytes parallelly */ +#define CRC32C7X24BYTES(ITR) \ + do { \ + CRC32C24BYTES((ITR)*7 + 0) \ + CRC32C24BYTES((ITR)*7 + 1) \ + CRC32C24BYTES((ITR)*7 + 2) \ + CRC32C24BYTES((ITR)*7 + 3) \ + CRC32C24BYTES((ITR)*7 + 4) \ + CRC32C24BYTES((ITR)*7 + 5) \ + CRC32C24BYTES((ITR)*7 + 6) \ + } while (0) +#endif + +extern bool pmull_runtime_flag; + +uint32_t crc32c_runtime_check(void) { +#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__FreeBSD__) + uint64_t auxv = 0; +#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) + auxv = getauxval(AT_HWCAP); +#elif defined(__FreeBSD__) + elf_aux_info(AT_HWCAP, &auxv, sizeof(auxv)); +#endif + return (auxv & HWCAP_CRC32) != 0; +#elif defined(__APPLE__) + int r; + size_t l = sizeof(r); + if (sysctlbyname("hw.optional.armv8_crc32", &r, &l, NULL, 0) == -1) return 0; + return r == 1; +#elif defined(__OpenBSD__) + int r = 0; + const int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 }; + uint64_t isar0; + size_t len = sizeof(isar0); + + if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) { + if (ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE) + r = 1; + } + return r; +#else + return 0; +#endif +} + +bool crc32c_pmull_runtime_check(void) { +#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) || defined(__FreeBSD__) + uint64_t auxv = 0; +#if defined(ROCKSDB_AUXV_GETAUXVAL_PRESENT) + auxv = getauxval(AT_HWCAP); +#elif defined(__FreeBSD__) + elf_aux_info(AT_HWCAP, &auxv, sizeof(auxv)); +#endif + return (auxv & HWCAP_PMULL) != 0; +#elif defined(__APPLE__) + return true; +#elif defined(__OpenBSD__) + bool r = false; + const int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 }; + uint64_t isar0; + size_t len = sizeof(isar0); + + if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) { + if (ID_AA64ISAR0_AES(isar0) >= ID_AA64ISAR0_AES_PMULL) + r = true; + } + return r; +#else + return false; +#endif +} + +#ifdef ROCKSDB_UBSAN_RUN +#if defined(__clang__) +__attribute__((__no_sanitize__("alignment"))) +#elif defined(__GNUC__) +__attribute__((__no_sanitize_undefined__)) +#endif +#endif +uint32_t +crc32c_arm64(uint32_t crc, unsigned char const *data, size_t len) { + const uint8_t *buf8; + const uint64_t *buf64 = (uint64_t *)data; + 
int length = (int)len; + crc ^= 0xffffffff; + + /* + * Pmull runtime check here. + * Raspberry Pi supports crc32 but doesn't support pmull. + * Skip Crc32c Parallel computation if no crypto extension available. + */ + if (pmull_runtime_flag) { +/* Macro (HAVE_ARM64_CRYPTO) is used for compiling check */ +#ifdef HAVE_ARM64_CRYPTO +/* Crc32c Parallel computation + * Algorithm comes from Intel whitepaper: + * crc-iscsi-polynomial-crc32-instruction-paper + * + * Input data is divided into three equal-sized blocks + * Three parallel blocks (crc0, crc1, crc2) for 1024 Bytes + * One Block: 42(BLK_LENGTH) * 8(step length: crc32c_u64) bytes + */ +#define BLK_LENGTH 42 + while (length >= 1024) { + uint64_t t0, t1; + uint32_t crc0 = 0, crc1 = 0, crc2 = 0; + + /* Parallel Param: + * k0 = CRC32(x ^ (42 * 8 * 8 * 2 - 1)); + * k1 = CRC32(x ^ (42 * 8 * 8 - 1)); + */ + uint32_t k0 = 0xe417f38a, k1 = 0x8f158014; + + /* Prefetch data for following block to avoid cache miss */ + PREF1KL1((uint8_t *)buf64, 1024); + + /* First 8 byte for better pipelining */ + crc0 = crc32c_u64(crc, *buf64++); + + /* 3 blocks crc32c parallel computation + * Macro unfolding to compute parallelly + * 168 * 6 = 1008 (bytes) + */ + CRC32C7X24BYTES(0); + CRC32C7X24BYTES(1); + CRC32C7X24BYTES(2); + CRC32C7X24BYTES(3); + CRC32C7X24BYTES(4); + CRC32C7X24BYTES(5); + buf64 += (BLK_LENGTH * 3); + + /* Last 8 bytes */ + crc = crc32c_u64(crc2, *buf64++); + + t0 = (uint64_t)vmull_p64(crc0, k0); + t1 = (uint64_t)vmull_p64(crc1, k1); + + /* Merge (crc0, crc1, crc2) -> crc */ + crc1 = crc32c_u64(0, t1); + crc ^= crc1; + crc0 = crc32c_u64(0, t0); + crc ^= crc0; + + length -= 1024; + } + + if (length == 0) return crc ^ (0xffffffffU); +#endif + } // if Pmull runtime check here + + buf8 = (const uint8_t *)buf64; + while (length >= 8) { + crc = crc32c_u64(crc, *(const uint64_t *)buf8); + buf8 += 8; + length -= 8; + } + + /* The following is more efficient than the straight loop */ + if (length >= 4) { + crc = crc32c_u32(crc, *(const uint32_t *)buf8); + buf8 += 4; + length -= 4; + } + + if (length >= 2) { + crc = crc32c_u16(crc, *(const uint16_t *)buf8); + buf8 += 2; + length -= 2; + } + + if (length >= 1) crc = crc32c_u8(crc, *buf8); + + crc ^= 0xffffffff; + return crc; +} + +#endif diff --git a/src/rocksdb/util/crc32c_arm64.h b/src/rocksdb/util/crc32c_arm64.h new file mode 100644 index 000000000..4b27fe871 --- /dev/null +++ b/src/rocksdb/util/crc32c_arm64.h @@ -0,0 +1,52 @@ +// Copyright (c) 2018, Arm Limited and affiliates. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
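The runtime checks implemented above are what crc32c.cc consults before routing Extend() to crc32c_arm64(). A minimal sketch of that dispatch, using only the symbols this header declares below; the wrapper name ChecksumWithRuntimeDispatch is hypothetical and the non-accelerated fallback is left as a placeholder.

    #include <cstddef>
    #include <cstdint>

    extern uint32_t crc32c_arm64(uint32_t crc, unsigned char const* data,
                                 size_t len);
    extern uint32_t crc32c_runtime_check(void);
    extern bool crc32c_pmull_runtime_check(void);
    extern bool pmull_runtime_flag;

    inline uint32_t ChecksumWithRuntimeDispatch(const unsigned char* data,
                                                size_t len) {
      static const bool have_crc = crc32c_runtime_check() != 0;
      if (have_crc) {
        // Lets crc32c_arm64() take its 1024-byte three-block pmull path when
        // the crypto extension is present.
        pmull_runtime_flag = crc32c_pmull_runtime_check();
        return crc32c_arm64(0, data, len);  // 0 seed, same convention as Value()
      }
      // Otherwise fall back to the portable table-driven ExtendImpl path.
      return 0;  // placeholder for the fallback in this sketch
    }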
+ +#ifndef UTIL_CRC32C_ARM64_H +#define UTIL_CRC32C_ARM64_H + +#include +#include + +#if defined(__aarch64__) || defined(__AARCH64__) + +#ifdef __ARM_FEATURE_CRC32 +#define HAVE_ARM64_CRC +#include +#define crc32c_u8(crc, v) __crc32cb(crc, v) +#define crc32c_u16(crc, v) __crc32ch(crc, v) +#define crc32c_u32(crc, v) __crc32cw(crc, v) +#define crc32c_u64(crc, v) __crc32cd(crc, v) +// clang-format off +#define PREF4X64L1(buffer, PREF_OFFSET, ITR) \ + __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]" ::[v] "r"(buffer), \ + [c] "I"((PREF_OFFSET) + ((ITR) + 0) * 64)); \ + __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]" ::[v] "r"(buffer), \ + [c] "I"((PREF_OFFSET) + ((ITR) + 1) * 64)); \ + __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]" ::[v] "r"(buffer), \ + [c] "I"((PREF_OFFSET) + ((ITR) + 2) * 64)); \ + __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]" ::[v] "r"(buffer), \ + [c] "I"((PREF_OFFSET) + ((ITR) + 3) * 64)); +// clang-format on + +#define PREF1KL1(buffer, PREF_OFFSET) \ + PREF4X64L1(buffer, (PREF_OFFSET), 0) \ + PREF4X64L1(buffer, (PREF_OFFSET), 4) \ + PREF4X64L1(buffer, (PREF_OFFSET), 8) \ + PREF4X64L1(buffer, (PREF_OFFSET), 12) + +extern uint32_t crc32c_arm64(uint32_t crc, unsigned char const *data, + size_t len); +extern uint32_t crc32c_runtime_check(void); +extern bool crc32c_pmull_runtime_check(void); + +#ifdef __ARM_FEATURE_CRYPTO +#define HAVE_ARM64_CRYPTO +#include +#endif // __ARM_FEATURE_CRYPTO +#endif // __ARM_FEATURE_CRC32 + +#endif // defined(__aarch64__) || defined(__AARCH64__) + +#endif diff --git a/src/rocksdb/util/crc32c_ppc.c b/src/rocksdb/util/crc32c_ppc.c new file mode 100644 index 000000000..b37dfb158 --- /dev/null +++ b/src/rocksdb/util/crc32c_ppc.c @@ -0,0 +1,94 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2017 International Business Machines Corp. +// All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#define CRC_TABLE +#include +#include +#include +#include "util/crc32c_ppc_constants.h" + +#define VMX_ALIGN 16 +#define VMX_ALIGN_MASK (VMX_ALIGN - 1) + +#ifdef REFLECT +static unsigned int crc32_align(unsigned int crc, unsigned char const *p, + unsigned long len) { + while (len--) crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8); + return crc; +} +#endif + +#ifdef HAVE_POWER8 +unsigned int __crc32_vpmsum(unsigned int crc, unsigned char const *p, + unsigned long len); + +static uint32_t crc32_vpmsum(uint32_t crc, unsigned char const *data, + size_t len) { + unsigned int prealign; + unsigned int tail; + +#ifdef CRC_XOR + crc ^= 0xffffffff; +#endif + + if (len < VMX_ALIGN + VMX_ALIGN_MASK) { + crc = crc32_align(crc, data, (unsigned long)len); + goto out; + } + + if ((unsigned long)data & VMX_ALIGN_MASK) { + prealign = VMX_ALIGN - ((unsigned long)data & VMX_ALIGN_MASK); + crc = crc32_align(crc, data, prealign); + len -= prealign; + data += prealign; + } + + crc = __crc32_vpmsum(crc, data, (unsigned long)len & ~VMX_ALIGN_MASK); + + tail = len & VMX_ALIGN_MASK; + if (tail) { + data += len & ~VMX_ALIGN_MASK; + crc = crc32_align(crc, data, tail); + } + +out: +#ifdef CRC_XOR + crc ^= 0xffffffff; +#endif + + return crc; +} + +/* This wrapper function works around the fact that crc32_vpmsum + * does not gracefully handle the case where the data pointer is NULL. There + * may be room for performance improvement here. 
+ */ +uint32_t crc32c_ppc(uint32_t crc, unsigned char const *data, size_t len) { + unsigned char *buf2; + + if (!data) { + buf2 = (unsigned char *)malloc(len); + bzero(buf2, len); + crc = crc32_vpmsum(crc, buf2, len); + free(buf2); + } else { + crc = crc32_vpmsum(crc, data, (unsigned long)len); + } + return crc; +} + +#else /* HAVE_POWER8 */ + +/* This symbol has to exist on non-ppc architectures (and on legacy + * ppc systems using power7 or below) in order to compile properly + * there, even though it won't be called. + */ +uint32_t crc32c_ppc(uint32_t crc, unsigned char const *data, size_t len) { + return 0; +} + +#endif /* HAVE_POWER8 */ diff --git a/src/rocksdb/util/crc32c_ppc.h b/src/rocksdb/util/crc32c_ppc.h new file mode 100644 index 000000000..f0b0b66d5 --- /dev/null +++ b/src/rocksdb/util/crc32c_ppc.h @@ -0,0 +1,22 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2017 International Business Machines Corp. +// All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern uint32_t crc32c_ppc(uint32_t crc, unsigned char const *buffer, + size_t len); + +#ifdef __cplusplus +} +#endif diff --git a/src/rocksdb/util/crc32c_ppc_asm.S b/src/rocksdb/util/crc32c_ppc_asm.S new file mode 100644 index 000000000..6959ba839 --- /dev/null +++ b/src/rocksdb/util/crc32c_ppc_asm.S @@ -0,0 +1,756 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2015 Anton Blanchard , IBM +// Copyright (c) 2017 International Business Machines Corp. +// All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
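crc32_vpmsum() above follows a common shape for vectorized checksums: a scalar routine for the unaligned head and the short tail, with the wide kernel reserved for the 16-byte-aligned middle. A generic sketch of that split is shown here; ScalarCrc/VectorCrc-style callbacks are stand-ins for crc32_align() and __crc32_vpmsum(), not real symbols.

    #include <cstddef>
    #include <cstdint>

    using CrcFn = uint32_t (*)(uint32_t, const unsigned char*, size_t);

    inline uint32_t AlignedSplitCrc(uint32_t crc, const unsigned char* p,
                                    size_t len, CrcFn scalar, CrcFn vector16) {
      constexpr size_t kAlign = 16;  // VMX_ALIGN in crc32c_ppc.c
      size_t head =
          (kAlign - reinterpret_cast<uintptr_t>(p) % kAlign) % kAlign;
      if (head > len) head = len;
      crc = scalar(crc, p, head);  // unaligned prefix, handled byte at a time
      p += head;
      len -= head;
      size_t bulk = len & ~(kAlign - 1);  // largest 16-byte multiple remaining
      if (bulk) crc = vector16(crc, p, bulk);  // the vpmsum-style kernel
      return scalar(crc, p + bulk, len - bulk);  // sub-16-byte tail
    }

The crc32c_ppc() wrapper above layers one more concern on top of this split: a zero-filled scratch buffer when the caller passes a null data pointer.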
+ +#if defined (__clang__) +#include "third-party/gcc/ppc-asm.h" +#else +#include +#endif +#include "ppc-opcode.h" + +#undef toc + +#ifndef r1 +#define r1 1 +#endif + +#ifndef r2 +#define r2 2 +#endif + + .section .rodata +.balign 16 + +.byteswap_constant: + /* byte reverse permute constant */ + .octa 0x0F0E0D0C0B0A09080706050403020100 + +#define __ASSEMBLY__ +#include "crc32c_ppc_constants.h" + + .text + +#if defined(__BIG_ENDIAN__) && defined(REFLECT) +#define BYTESWAP_DATA +#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) +#define BYTESWAP_DATA +#else +#undef BYTESWAP_DATA +#endif + +#define off16 r25 +#define off32 r26 +#define off48 r27 +#define off64 r28 +#define off80 r29 +#define off96 r30 +#define off112 r31 + +#define const1 v24 +#define const2 v25 + +#define byteswap v26 +#define mask_32bit v27 +#define mask_64bit v28 +#define zeroes v29 + +#ifdef BYTESWAP_DATA +#define VPERM(A, B, C, D) vperm A, B, C, D +#else +#define VPERM(A, B, C, D) +#endif + +/* unsigned int __crc32_vpmsum(unsigned int crc, void *p, unsigned long len) */ +FUNC_START(__crc32_vpmsum) + std r31,-8(r1) + std r30,-16(r1) + std r29,-24(r1) + std r28,-32(r1) + std r27,-40(r1) + std r26,-48(r1) + std r25,-56(r1) + + li off16,16 + li off32,32 + li off48,48 + li off64,64 + li off80,80 + li off96,96 + li off112,112 + li r0,0 + + /* Enough room for saving 10 non volatile VMX registers */ + subi r6,r1,56+10*16 + subi r7,r1,56+2*16 + + stvx v20,0,r6 + stvx v21,off16,r6 + stvx v22,off32,r6 + stvx v23,off48,r6 + stvx v24,off64,r6 + stvx v25,off80,r6 + stvx v26,off96,r6 + stvx v27,off112,r6 + stvx v28,0,r7 + stvx v29,off16,r7 + + mr r10,r3 + + vxor zeroes,zeroes,zeroes + vspltisw v0,-1 + + vsldoi mask_32bit,zeroes,v0,4 + vsldoi mask_64bit,zeroes,v0,8 + + /* Get the initial value into v8 */ + vxor v8,v8,v8 + MTVRD(v8, r3) +#ifdef REFLECT + vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */ +#else + vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */ +#endif + +#ifdef BYTESWAP_DATA + addis r3,r2,.byteswap_constant@toc@ha + addi r3,r3,.byteswap_constant@toc@l + + lvx byteswap,0,r3 + addi r3,r3,16 +#endif + + cmpdi r5,256 + blt .Lshort + + rldicr r6,r5,0,56 + + /* Checksum in blocks of MAX_SIZE */ +1: lis r7,MAX_SIZE@h + ori r7,r7,MAX_SIZE@l + mr r9,r7 + cmpd r6,r7 + bgt 2f + mr r7,r6 +2: subf r6,r7,r6 + + /* our main loop does 128 bytes at a time */ + srdi r7,r7,7 + + /* + * Work out the offset into the constants table to start at. Each + * constant is 16 bytes, and it is used against 128 bytes of input + * data - 128 / 16 = 8 + */ + sldi r8,r7,4 + srdi r9,r9,3 + subf r8,r8,r9 + + /* We reduce our final 128 bytes in a separate step */ + addi r7,r7,-1 + mtctr r7 + + addis r3,r2,.constants@toc@ha + addi r3,r3,.constants@toc@l + + /* Find the start of our constants */ + add r3,r3,r8 + + /* zero v0-v7 which will contain our checksums */ + vxor v0,v0,v0 + vxor v1,v1,v1 + vxor v2,v2,v2 + vxor v3,v3,v3 + vxor v4,v4,v4 + vxor v5,v5,v5 + vxor v6,v6,v6 + vxor v7,v7,v7 + + lvx const1,0,r3 + + /* + * If we are looping back to consume more data we use the values + * already in v16-v23. 
+ */ + cmpdi r0,1 + beq 2f + + /* First warm up pass */ + lvx v16,0,r4 + lvx v17,off16,r4 + VPERM(v16,v16,v16,byteswap) + VPERM(v17,v17,v17,byteswap) + lvx v18,off32,r4 + lvx v19,off48,r4 + VPERM(v18,v18,v18,byteswap) + VPERM(v19,v19,v19,byteswap) + lvx v20,off64,r4 + lvx v21,off80,r4 + VPERM(v20,v20,v20,byteswap) + VPERM(v21,v21,v21,byteswap) + lvx v22,off96,r4 + lvx v23,off112,r4 + VPERM(v22,v22,v22,byteswap) + VPERM(v23,v23,v23,byteswap) + addi r4,r4,8*16 + + /* xor in initial value */ + vxor v16,v16,v8 + +2: bdz .Lfirst_warm_up_done + + addi r3,r3,16 + lvx const2,0,r3 + + /* Second warm up pass */ + VPMSUMD(v8,v16,const1) + lvx v16,0,r4 + VPERM(v16,v16,v16,byteswap) + ori r2,r2,0 + + VPMSUMD(v9,v17,const1) + lvx v17,off16,r4 + VPERM(v17,v17,v17,byteswap) + ori r2,r2,0 + + VPMSUMD(v10,v18,const1) + lvx v18,off32,r4 + VPERM(v18,v18,v18,byteswap) + ori r2,r2,0 + + VPMSUMD(v11,v19,const1) + lvx v19,off48,r4 + VPERM(v19,v19,v19,byteswap) + ori r2,r2,0 + + VPMSUMD(v12,v20,const1) + lvx v20,off64,r4 + VPERM(v20,v20,v20,byteswap) + ori r2,r2,0 + + VPMSUMD(v13,v21,const1) + lvx v21,off80,r4 + VPERM(v21,v21,v21,byteswap) + ori r2,r2,0 + + VPMSUMD(v14,v22,const1) + lvx v22,off96,r4 + VPERM(v22,v22,v22,byteswap) + ori r2,r2,0 + + VPMSUMD(v15,v23,const1) + lvx v23,off112,r4 + VPERM(v23,v23,v23,byteswap) + + addi r4,r4,8*16 + + bdz .Lfirst_cool_down + + /* + * main loop. We modulo schedule it such that it takes three iterations + * to complete - first iteration load, second iteration vpmsum, third + * iteration xor. + */ + .balign 16 +4: lvx const1,0,r3 + addi r3,r3,16 + ori r2,r2,0 + + vxor v0,v0,v8 + VPMSUMD(v8,v16,const2) + lvx v16,0,r4 + VPERM(v16,v16,v16,byteswap) + ori r2,r2,0 + + vxor v1,v1,v9 + VPMSUMD(v9,v17,const2) + lvx v17,off16,r4 + VPERM(v17,v17,v17,byteswap) + ori r2,r2,0 + + vxor v2,v2,v10 + VPMSUMD(v10,v18,const2) + lvx v18,off32,r4 + VPERM(v18,v18,v18,byteswap) + ori r2,r2,0 + + vxor v3,v3,v11 + VPMSUMD(v11,v19,const2) + lvx v19,off48,r4 + VPERM(v19,v19,v19,byteswap) + lvx const2,0,r3 + ori r2,r2,0 + + vxor v4,v4,v12 + VPMSUMD(v12,v20,const1) + lvx v20,off64,r4 + VPERM(v20,v20,v20,byteswap) + ori r2,r2,0 + + vxor v5,v5,v13 + VPMSUMD(v13,v21,const1) + lvx v21,off80,r4 + VPERM(v21,v21,v21,byteswap) + ori r2,r2,0 + + vxor v6,v6,v14 + VPMSUMD(v14,v22,const1) + lvx v22,off96,r4 + VPERM(v22,v22,v22,byteswap) + ori r2,r2,0 + + vxor v7,v7,v15 + VPMSUMD(v15,v23,const1) + lvx v23,off112,r4 + VPERM(v23,v23,v23,byteswap) + + addi r4,r4,8*16 + + bdnz 4b + +.Lfirst_cool_down: + /* First cool down pass */ + lvx const1,0,r3 + addi r3,r3,16 + + vxor v0,v0,v8 + VPMSUMD(v8,v16,const1) + ori r2,r2,0 + + vxor v1,v1,v9 + VPMSUMD(v9,v17,const1) + ori r2,r2,0 + + vxor v2,v2,v10 + VPMSUMD(v10,v18,const1) + ori r2,r2,0 + + vxor v3,v3,v11 + VPMSUMD(v11,v19,const1) + ori r2,r2,0 + + vxor v4,v4,v12 + VPMSUMD(v12,v20,const1) + ori r2,r2,0 + + vxor v5,v5,v13 + VPMSUMD(v13,v21,const1) + ori r2,r2,0 + + vxor v6,v6,v14 + VPMSUMD(v14,v22,const1) + ori r2,r2,0 + + vxor v7,v7,v15 + VPMSUMD(v15,v23,const1) + ori r2,r2,0 + +.Lsecond_cool_down: + /* Second cool down pass */ + vxor v0,v0,v8 + vxor v1,v1,v9 + vxor v2,v2,v10 + vxor v3,v3,v11 + vxor v4,v4,v12 + vxor v5,v5,v13 + vxor v6,v6,v14 + vxor v7,v7,v15 + +#ifdef REFLECT + /* + * vpmsumd produces a 96 bit result in the least significant bits + * of the register. Since we are bit reflected we have to shift it + * left 32 bits so it occupies the least significant bits in the + * bit reflected domain. 
+ */ + vsldoi v0,v0,zeroes,4 + vsldoi v1,v1,zeroes,4 + vsldoi v2,v2,zeroes,4 + vsldoi v3,v3,zeroes,4 + vsldoi v4,v4,zeroes,4 + vsldoi v5,v5,zeroes,4 + vsldoi v6,v6,zeroes,4 + vsldoi v7,v7,zeroes,4 +#endif + + /* xor with last 1024 bits */ + lvx v8,0,r4 + lvx v9,off16,r4 + VPERM(v8,v8,v8,byteswap) + VPERM(v9,v9,v9,byteswap) + lvx v10,off32,r4 + lvx v11,off48,r4 + VPERM(v10,v10,v10,byteswap) + VPERM(v11,v11,v11,byteswap) + lvx v12,off64,r4 + lvx v13,off80,r4 + VPERM(v12,v12,v12,byteswap) + VPERM(v13,v13,v13,byteswap) + lvx v14,off96,r4 + lvx v15,off112,r4 + VPERM(v14,v14,v14,byteswap) + VPERM(v15,v15,v15,byteswap) + + addi r4,r4,8*16 + + vxor v16,v0,v8 + vxor v17,v1,v9 + vxor v18,v2,v10 + vxor v19,v3,v11 + vxor v20,v4,v12 + vxor v21,v5,v13 + vxor v22,v6,v14 + vxor v23,v7,v15 + + li r0,1 + cmpdi r6,0 + addi r6,r6,128 + bne 1b + + /* Work out how many bytes we have left */ + andi. r5,r5,127 + + /* Calculate where in the constant table we need to start */ + subfic r6,r5,128 + add r3,r3,r6 + + /* How many 16 byte chunks are in the tail */ + srdi r7,r5,4 + mtctr r7 + + /* + * Reduce the previously calculated 1024 bits to 64 bits, shifting + * 32 bits to include the trailing 32 bits of zeros + */ + lvx v0,0,r3 + lvx v1,off16,r3 + lvx v2,off32,r3 + lvx v3,off48,r3 + lvx v4,off64,r3 + lvx v5,off80,r3 + lvx v6,off96,r3 + lvx v7,off112,r3 + addi r3,r3,8*16 + + VPMSUMW(v0,v16,v0) + VPMSUMW(v1,v17,v1) + VPMSUMW(v2,v18,v2) + VPMSUMW(v3,v19,v3) + VPMSUMW(v4,v20,v4) + VPMSUMW(v5,v21,v5) + VPMSUMW(v6,v22,v6) + VPMSUMW(v7,v23,v7) + + /* Now reduce the tail (0 - 112 bytes) */ + cmpdi r7,0 + beq 1f + + lvx v16,0,r4 + lvx v17,0,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off16,r4 + lvx v17,off16,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off32,r4 + lvx v17,off32,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off48,r4 + lvx v17,off48,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off64,r4 + lvx v17,off64,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off80,r4 + lvx v17,off80,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off96,r4 + lvx v17,off96,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + + /* Now xor all the parallel chunks together */ +1: vxor v0,v0,v1 + vxor v2,v2,v3 + vxor v4,v4,v5 + vxor v6,v6,v7 + + vxor v0,v0,v2 + vxor v4,v4,v6 + + vxor v0,v0,v4 + +.Lbarrett_reduction: + /* Barrett constants */ + addis r3,r2,.barrett_constants@toc@ha + addi r3,r3,.barrett_constants@toc@l + + lvx const1,0,r3 + lvx const2,off16,r3 + + vsldoi v1,v0,v0,8 + vxor v0,v0,v1 /* xor two 64 bit results together */ + +#ifdef REFLECT + /* shift left one bit */ + vspltisb v1,1 + vsl v0,v0,v1 +#endif + + vand v0,v0,mask_64bit + +#ifndef REFLECT + /* + * Now for the Barrett reduction algorithm. The idea is to calculate q, + * the multiple of our polynomial that we need to subtract. By + * doing the computation 2x bits higher (ie 64 bits) and shifting the + * result back down 2x bits, we round down to the nearest multiple. + */ + VPMSUMD(v1,v0,const1) /* ma */ + vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */ + VPMSUMD(v1,v1,const2) /* qn */ + vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Get the result into r3. 
We need to shift it left 8 bytes: + * V0 [ 0 1 2 X ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */ +#else + /* + * The reflected version of Barrett reduction. Instead of bit + * reflecting our data (which is expensive to do), we bit reflect our + * constants and our algorithm, which means the intermediate data in + * our vector registers goes from 0-63 instead of 63-0. We can reflect + * the algorithm because we don't carry in mod 2 arithmetic. + */ + vand v1,v0,mask_32bit /* bottom 32 bits of a */ + VPMSUMD(v1,v1,const1) /* ma */ + vand v1,v1,mask_32bit /* bottom 32bits of ma */ + VPMSUMD(v1,v1,const2) /* qn */ + vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Since we are bit reflected, the result (ie the low 32 bits) is in + * the high 32 bits. We just need to shift it left 4 bytes + * V0 [ 0 1 X 3 ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */ +#endif + + /* Get it into r3 */ + MFVRD(r3, v0) + +.Lout: + subi r6,r1,56+10*16 + subi r7,r1,56+2*16 + + lvx v20,0,r6 + lvx v21,off16,r6 + lvx v22,off32,r6 + lvx v23,off48,r6 + lvx v24,off64,r6 + lvx v25,off80,r6 + lvx v26,off96,r6 + lvx v27,off112,r6 + lvx v28,0,r7 + lvx v29,off16,r7 + + ld r31,-8(r1) + ld r30,-16(r1) + ld r29,-24(r1) + ld r28,-32(r1) + ld r27,-40(r1) + ld r26,-48(r1) + ld r25,-56(r1) + + blr + +.Lfirst_warm_up_done: + lvx const1,0,r3 + addi r3,r3,16 + + VPMSUMD(v8,v16,const1) + VPMSUMD(v9,v17,const1) + VPMSUMD(v10,v18,const1) + VPMSUMD(v11,v19,const1) + VPMSUMD(v12,v20,const1) + VPMSUMD(v13,v21,const1) + VPMSUMD(v14,v22,const1) + VPMSUMD(v15,v23,const1) + + b .Lsecond_cool_down + +.Lshort: + cmpdi r5,0 + beq .Lzero + + addis r3,r2,.short_constants@toc@ha + addi r3,r3,.short_constants@toc@l + + /* Calculate where in the constant table we need to start */ + subfic r6,r5,256 + add r3,r3,r6 + + /* How many 16 byte chunks? 
*/ + srdi r7,r5,4 + mtctr r7 + + vxor v19,v19,v19 + vxor v20,v20,v20 + + lvx v0,0,r4 + lvx v16,0,r3 + VPERM(v0,v0,v16,byteswap) + vxor v0,v0,v8 /* xor in initial value */ + VPMSUMW(v0,v0,v16) + bdz .Lv0 + + lvx v1,off16,r4 + lvx v17,off16,r3 + VPERM(v1,v1,v17,byteswap) + VPMSUMW(v1,v1,v17) + bdz .Lv1 + + lvx v2,off32,r4 + lvx v16,off32,r3 + VPERM(v2,v2,v16,byteswap) + VPMSUMW(v2,v2,v16) + bdz .Lv2 + + lvx v3,off48,r4 + lvx v17,off48,r3 + VPERM(v3,v3,v17,byteswap) + VPMSUMW(v3,v3,v17) + bdz .Lv3 + + lvx v4,off64,r4 + lvx v16,off64,r3 + VPERM(v4,v4,v16,byteswap) + VPMSUMW(v4,v4,v16) + bdz .Lv4 + + lvx v5,off80,r4 + lvx v17,off80,r3 + VPERM(v5,v5,v17,byteswap) + VPMSUMW(v5,v5,v17) + bdz .Lv5 + + lvx v6,off96,r4 + lvx v16,off96,r3 + VPERM(v6,v6,v16,byteswap) + VPMSUMW(v6,v6,v16) + bdz .Lv6 + + lvx v7,off112,r4 + lvx v17,off112,r3 + VPERM(v7,v7,v17,byteswap) + VPMSUMW(v7,v7,v17) + bdz .Lv7 + + addi r3,r3,128 + addi r4,r4,128 + + lvx v8,0,r4 + lvx v16,0,r3 + VPERM(v8,v8,v16,byteswap) + VPMSUMW(v8,v8,v16) + bdz .Lv8 + + lvx v9,off16,r4 + lvx v17,off16,r3 + VPERM(v9,v9,v17,byteswap) + VPMSUMW(v9,v9,v17) + bdz .Lv9 + + lvx v10,off32,r4 + lvx v16,off32,r3 + VPERM(v10,v10,v16,byteswap) + VPMSUMW(v10,v10,v16) + bdz .Lv10 + + lvx v11,off48,r4 + lvx v17,off48,r3 + VPERM(v11,v11,v17,byteswap) + VPMSUMW(v11,v11,v17) + bdz .Lv11 + + lvx v12,off64,r4 + lvx v16,off64,r3 + VPERM(v12,v12,v16,byteswap) + VPMSUMW(v12,v12,v16) + bdz .Lv12 + + lvx v13,off80,r4 + lvx v17,off80,r3 + VPERM(v13,v13,v17,byteswap) + VPMSUMW(v13,v13,v17) + bdz .Lv13 + + lvx v14,off96,r4 + lvx v16,off96,r3 + VPERM(v14,v14,v16,byteswap) + VPMSUMW(v14,v14,v16) + bdz .Lv14 + + lvx v15,off112,r4 + lvx v17,off112,r3 + VPERM(v15,v15,v17,byteswap) + VPMSUMW(v15,v15,v17) + +.Lv15: vxor v19,v19,v15 +.Lv14: vxor v20,v20,v14 +.Lv13: vxor v19,v19,v13 +.Lv12: vxor v20,v20,v12 +.Lv11: vxor v19,v19,v11 +.Lv10: vxor v20,v20,v10 +.Lv9: vxor v19,v19,v9 +.Lv8: vxor v20,v20,v8 +.Lv7: vxor v19,v19,v7 +.Lv6: vxor v20,v20,v6 +.Lv5: vxor v19,v19,v5 +.Lv4: vxor v20,v20,v4 +.Lv3: vxor v19,v19,v3 +.Lv2: vxor v20,v20,v2 +.Lv1: vxor v19,v19,v1 +.Lv0: vxor v20,v20,v0 + + vxor v0,v19,v20 + + b .Lbarrett_reduction + +.Lzero: + mr r3,r10 + b .Lout + +FUNC_END(__crc32_vpmsum) diff --git a/src/rocksdb/util/crc32c_ppc_constants.h b/src/rocksdb/util/crc32c_ppc_constants.h new file mode 100644 index 000000000..f6494cd01 --- /dev/null +++ b/src/rocksdb/util/crc32c_ppc_constants.h @@ -0,0 +1,900 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (C) 2015, 2017 International Business Machines Corp. +// All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
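An aside on the Barrett reduction at the tail of __crc32_vpmsum above (this only restates the code comments, using the m/n names of the .barrett_constants defined at the end of this header): with carry-less multiplication in GF(2), the folded 64-bit remainder a is reduced to a 32-bit CRC by

q = \left\lfloor \frac{m \cdot a}{2^{64}} \right\rfloor, \qquad r = a \oplus (q \cdot n)

where n is the CRC-32C polynomial and m = \lfloor x^{64} / n(x) \rfloor; subtraction in GF(2) is XOR, which is why the final step in the code is a plain vxor.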
+ +#pragma once + +#define CRC 0x1edc6f41 +#define REFLECT +#define CRC_XOR + +#ifndef __ASSEMBLY__ +#ifdef CRC_TABLE +static const unsigned int crc_table[] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, + 0x26a1e7e8, 0xd4ca64eb, 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, 0x105ec76f, 0xe235446c, + 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, + 0xbc267848, 0x4e4dfb4b, 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, 0xaa64d611, 0x580f5512, + 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad, + 0x1642ae59, 0xe4292d5a, 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, 0x417b1dbc, 0xb3109ebf, + 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f, + 0xed03a29b, 0x1f682198, 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, 0xdbfc821c, 0x2997011f, + 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, + 0x4767748a, 0xb50cf789, 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, 0x7198540d, 0x83f3d70e, + 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, + 0xdde0eb2a, 0x2f8b6829, 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, 0x082f63b7, 0xfa44e0b4, + 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, + 0xb4091bff, 0x466298fc, 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, 0xa24bb5a6, 0x502036a5, + 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, + 0x0e330a81, 0xfc588982, 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, 0x38cc2a06, 0xcaa7a905, + 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8, + 0xe52cc12c, 0x1747422f, 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, 0xd3d3e1ab, 0x21b862a8, + 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, + 0x7fab5e8c, 0x8dc0dd8f, 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, 0x69e9f0d5, 0x9b8273d6, + 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, + 0xd5cf889d, 0x27a40b9e, 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351, +}; + +#endif + +#else +#define MAX_SIZE 32768 +.constants : + + /* Reduce 262144 kbits to 1024 bits */ + /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */ + .octa 0x00000000b6ca9e20000000009c37c408 + + /* x^260096 mod p(x)` << 1, 
x^260160 mod p(x)` << 1 */ + .octa 0x00000000350249a800000001b51df26c + + /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */ + .octa 0x00000001862dac54000000000724b9d0 + + /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */ + .octa 0x00000001d87fb48c00000001c00532fe + + /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */ + .octa 0x00000001f39b699e00000000f05a9362 + + /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */ + .octa 0x0000000101da11b400000001e1007970 + + /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */ + .octa 0x00000001cab571e000000000a57366ee + + /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */ + .octa 0x00000000c7020cfe0000000192011284 + + /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */ + .octa 0x00000000cdaed1ae0000000162716d9a + + /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */ + .octa 0x00000001e804effc00000000cd97ecde + + /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */ + .octa 0x0000000077c3ea3a0000000058812bc0 + + /* x^249856 mod p(x)` << 1, x^249920 mod p(x)` << 1 */ + .octa 0x0000000068df31b40000000088b8c12e + + /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */ + .octa 0x00000000b059b6c200000001230b234c + + /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */ + .octa 0x0000000145fb8ed800000001120b416e + + /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */ + .octa 0x00000000cbc0916800000001974aecb0 + + /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */ + .octa 0x000000005ceeedc2000000008ee3f226 + + /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */ + .octa 0x0000000047d74e8600000001089aba9a + + /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */ + .octa 0x00000001407e9e220000000065113872 + + /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */ + .octa 0x00000001da967bda000000005c07ec10 + + /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */ + .octa 0x000000006c8983680000000187590924 + + /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */ + .octa 0x00000000f2d14c9800000000e35da7c6 + + /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */ + .octa 0x00000001993c6ad4000000000415855a + + /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */ + .octa 0x000000014683d1ac0000000073617758 + + /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */ + .octa 0x00000001a7c93e6c0000000176021d28 + + /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */ + .octa 0x000000010211e90a00000001c358fd0a + + /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */ + .octa 0x000000001119403e00000001ff7a2c18 + + /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */ + .octa 0x000000001c3261aa00000000f2d9f7e4 + + /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */ + .octa 0x000000014e37a634000000016cf1f9c8 + + /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */ + .octa 0x0000000073786c0c000000010af9279a + + /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */ + .octa 0x000000011dc037f80000000004f101e8 + + /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */ + .octa 0x0000000031433dfc0000000070bcf184 + + /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */ + .octa 0x000000009cde8348000000000a8de642 + + /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */ + .octa 0x0000000038d3c2a60000000062ea130c + + /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */ + .octa 0x000000011b25f26000000001eb31cbb2 + + /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */ + .octa 0x000000001629e6f00000000170783448 + + /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */ + .octa 0x0000000160838b4c00000001a684b4c6 + + /* x^224256 
mod p(x)` << 1, x^224320 mod p(x)` << 1 */ + .octa 0x000000007a44011c00000000253ca5b4 + + /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */ + .octa 0x00000000226f417a0000000057b4b1e2 + + /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */ + .octa 0x0000000045eb2eb400000000b6bd084c + + /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */ + .octa 0x000000014459d70c0000000123c2d592 + + /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */ + .octa 0x00000001d406ed8200000000159dafce + + /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */ + .octa 0x0000000160c8e1a80000000127e1a64e + + /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */ + .octa 0x0000000027ba80980000000056860754 + + /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */ + .octa 0x000000006d92d01800000001e661aae8 + + /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */ + .octa 0x000000012ed7e3f200000000f82c6166 + + /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */ + .octa 0x000000002dc8778800000000c4f9c7ae + + /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */ + .octa 0x0000000018240bb80000000074203d20 + + /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */ + .octa 0x000000001ad381580000000198173052 + + /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */ + .octa 0x00000001396b78f200000001ce8aba54 + + /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */ + .octa 0x000000011a68133400000001850d5d94 + + /* x^209920 mod p(x)` << 1, x^209984 mod p(x)` << 1 */ + .octa 0x000000012104732e00000001d609239c + + /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */ + .octa 0x00000000a140d90c000000001595f048 + + /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */ + .octa 0x00000001b7215eda0000000042ccee08 + + /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */ + .octa 0x00000001aaf1df3c000000010a389d74 + + /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */ + .octa 0x0000000029d15b8a000000012a840da6 + + /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */ + .octa 0x00000000f1a96922000000001d181c0c + + /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */ + .octa 0x00000001ac80d03c0000000068b7d1f6 + + /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */ + .octa 0x000000000f11d56a000000005b0f14fc + + /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */ + .octa 0x00000001f1c022a20000000179e9e730 + + /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */ + .octa 0x0000000173d00ae200000001ce1368d6 + + /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */ + .octa 0x00000001d4ffe4ac0000000112c3a84c + + /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */ + .octa 0x000000016edc5ae400000000de940fee + + /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */ + .octa 0x00000001f1a0214000000000fe896b7e + + /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */ + .octa 0x00000000ca0b28a000000001f797431c + + /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */ + .octa 0x00000001928e30a20000000053e989ba + + /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */ + .octa 0x0000000097b1b002000000003920cd16 + + /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */ + .octa 0x00000000b15bf90600000001e6f579b8 + + /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */ + .octa 0x00000000411c5d52000000007493cb0a + + /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */ + .octa 0x00000001c36f330000000001bdd376d8 + + /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */ + .octa 0x00000001119227e0000000016badfee6 + + /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */ + .octa 0x00000000114d47020000000071de5c58 
+ + /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */ + .octa 0x00000000458b5b9800000000453f317c + + /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */ + .octa 0x000000012e31fb8e0000000121675cce + + /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */ + .octa 0x000000005cf619d800000001f409ee92 + + /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */ + .octa 0x0000000063f4d8b200000000f36b9c88 + + /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */ + .octa 0x000000004138dc8a0000000036b398f4 + + /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */ + .octa 0x00000001d29ee8e000000001748f9adc + + /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */ + .octa 0x000000006a08ace800000001be94ec00 + + /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */ + .octa 0x0000000127d4201000000000b74370d6 + + /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */ + .octa 0x0000000019d76b6200000001174d0b98 + + /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */ + .octa 0x00000001b1471f6e00000000befc06a4 + + /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */ + .octa 0x00000001f64c19cc00000001ae125288 + + /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */ + .octa 0x00000000003c0ea00000000095c19b34 + + /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */ + .octa 0x000000014d73abf600000001a78496f2 + + /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */ + .octa 0x00000001620eb84400000001ac5390a0 + + /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */ + .octa 0x0000000147655048000000002a80ed6e + + /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */ + .octa 0x0000000067b5077e00000001fa9b0128 + + /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */ + .octa 0x0000000010ffe20600000001ea94929e + + /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */ + .octa 0x000000000fee8f1e0000000125f4305c + + /* x^169984 mod p(x)` << 1, x^170048 mod p(x)` << 1 */ + .octa 0x00000001da26fbae00000001471e2002 + + /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */ + .octa 0x00000001b3a8bd880000000132d2253a + + /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */ + .octa 0x00000000e8f3898e00000000f26b3592 + + /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */ + .octa 0x00000000b0d0d28c00000000bc8b67b0 + + /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */ + .octa 0x0000000030f2a798000000013a826ef2 + + /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */ + .octa 0x000000000fba10020000000081482c84 + + /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */ + .octa 0x00000000bdb9bd7200000000e77307c2 + + /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */ + .octa 0x0000000075d3bf5a00000000d4a07ec8 + + /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */ + .octa 0x00000000ef1f98a00000000017102100 + + /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */ + .octa 0x00000000689c760200000000db406486 + + /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */ + .octa 0x000000016d5fa5fe0000000192db7f88 + + /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */ + .octa 0x00000001d0d2b9ca000000018bf67b1e + + /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */ + .octa 0x0000000041e7b470000000007c09163e + + /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */ + .octa 0x00000001cbb6495e000000000adac060 + + /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */ + .octa 0x000000010052a0b000000000bd8316ae + + /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */ + .octa 0x00000001d8effb5c000000019f09ab54 + + /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */ + .octa 
0x00000001d969853c0000000125155542 + + /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */ + .octa 0x00000000523ccce2000000018fdb5882 + + /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */ + .octa 0x000000001e2436bc00000000e794b3f4 + + /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */ + .octa 0x00000000ddd1c3a2000000016f9bb022 + + /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */ + .octa 0x0000000019fcfe3800000000290c9978 + + /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */ + .octa 0x00000001ce95db640000000083c0f350 + + /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */ + .octa 0x00000000af5828060000000173ea6628 + + /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */ + .octa 0x00000001006388f600000001c8b4e00a + + /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */ + .octa 0x0000000179eca00a00000000de95d6aa + + /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */ + .octa 0x0000000122410a6a000000010b7f7248 + + /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */ + .octa 0x000000004288e87c00000001326e3a06 + + /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */ + .octa 0x000000016c5490da00000000bb62c2e6 + + /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */ + .octa 0x00000000d1c71f6e0000000156a4b2c2 + + /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */ + .octa 0x00000001b4ce08a6000000011dfe763a + + /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */ + .octa 0x00000001466ba60c000000007bcca8e2 + + /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */ + .octa 0x00000001f6c488a40000000186118faa + + /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */ + .octa 0x000000013bfb06820000000111a65a88 + + /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */ + .octa 0x00000000690e9e54000000003565e1c4 + + /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */ + .octa 0x00000000281346b6000000012ed02a82 + + /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */ + .octa 0x000000015646402400000000c486ecfc + + /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */ + .octa 0x000000016063a8dc0000000001b951b2 + + /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */ + .octa 0x0000000116a663620000000048143916 + + /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */ + .octa 0x000000017e8aa4d200000001dc2ae124 + + /* x^130048 mod p(x)` << 1, x^130112 mod p(x)` << 1 */ + .octa 0x00000001728eb10c00000001416c58d6 + + /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */ + .octa 0x00000001b08fd7fa00000000a479744a + + /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */ + .octa 0x00000001092a16e80000000096ca3a26 + + /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */ + .octa 0x00000000a505637c00000000ff223d4e + + /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */ + .octa 0x00000000d94869b2000000010e84da42 + + /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */ + .octa 0x00000001c8b203ae00000001b61ba3d0 + + /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */ + .octa 0x000000005704aea000000000680f2de8 + + /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */ + .octa 0x000000012e295fa2000000008772a9a8 + + /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */ + .octa 0x000000011d0908bc0000000155f295bc + + /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */ + .octa 0x0000000193ed97ea00000000595f9282 + + /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */ + .octa 0x000000013a0f1c520000000164b1c25a + + /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */ + .octa 0x000000010c2c40c000000000fbd67c50 + + /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` 
<< 1 */ + .octa 0x00000000ff6fac3e0000000096076268 + + /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */ + .octa 0x000000017b3609c000000001d288e4cc + + /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */ + .octa 0x0000000088c8c92200000001eaac1bdc + + /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */ + .octa 0x00000001751baae600000001f1ea39e2 + + /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */ + .octa 0x000000010795297200000001eb6506fc + + /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */ + .octa 0x0000000162b00abe000000010f806ffe + + /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */ + .octa 0x000000000d7b404c000000010408481e + + /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */ + .octa 0x00000000763b13d40000000188260534 + + /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */ + .octa 0x00000000f6dc22d80000000058fc73e0 + + /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */ + .octa 0x000000007daae06000000000391c59b8 + + /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */ + .octa 0x000000013359ab7c000000018b638400 + + /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */ + .octa 0x000000008add438a000000011738f5c4 + + /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */ + .octa 0x00000001edbefdea000000008cf7c6da + + /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */ + .octa 0x000000004104e0f800000001ef97fb16 + + /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */ + .octa 0x00000000b48a82220000000102130e20 + + /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */ + .octa 0x00000001bcb4684400000000db968898 + + /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */ + .octa 0x000000013293ce0a00000000b5047b5e + + /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */ + .octa 0x00000001710d0844000000010b90fdb2 + + /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */ + .octa 0x0000000117907f6e000000004834a32e + + /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */ + .octa 0x0000000087ddf93e0000000059c8f2b0 + + /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */ + .octa 0x000000005970e9b00000000122cec508 + + /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */ + .octa 0x0000000185b2b7d0000000000a330cda + + /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */ + .octa 0x00000001dcee0efc000000014a47148c + + /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */ + .octa 0x0000000030da27220000000042c61cb8 + + /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */ + .octa 0x000000012f925a180000000012fe6960 + + /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */ + .octa 0x00000000dd2e357c00000000dbda2c20 + + /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */ + .octa 0x00000000071c80de000000011122410c + + /* x^90112 mod p(x)` << 1, x^90176 mod p(x)` << 1 */ + .octa 0x000000011513140a00000000977b2070 + + /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */ + .octa 0x00000001df876e8e000000014050438e + + /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */ + .octa 0x000000015f81d6ce0000000147c840e8 + + /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */ + .octa 0x000000019dd94dbe00000001cc7c88ce + + /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */ + .octa 0x00000001373d206e00000001476b35a4 + + /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */ + .octa 0x00000000668ccade000000013d52d508 + + /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */ + .octa 0x00000001b192d268000000008e4be32e + + /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */ + .octa 0x00000000e30f3a7800000000024120fe + + /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */ + .octa 
0x000000010ef1f7bc00000000ddecddb4 + + /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */ + .octa 0x00000001f5ac738000000000d4d403bc + + /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */ + .octa 0x000000011822ea7000000001734b89aa + + /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */ + .octa 0x00000000c3a33848000000010e7a58d6 + + /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */ + .octa 0x00000001bd151c2400000001f9f04e9c + + /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */ + .octa 0x0000000056002d7600000000b692225e + + /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */ + .octa 0x000000014657c4f4000000019b8d3f3e + + /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */ + .octa 0x0000000113742d7c00000001a874f11e + + /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */ + .octa 0x000000019c5920ba000000010d5a4254 + + /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */ + .octa 0x000000005216d2d600000000bbb2f5d6 + + /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */ + .octa 0x0000000136f5ad8a0000000179cc0e36 + + /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */ + .octa 0x000000018b07beb600000001dca1da4a + + /* x^69632 mod p(x)` << 1, x^69696 mod p(x)` << 1 */ + .octa 0x00000000db1e93b000000000feb1a192 + + /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */ + .octa 0x000000000b96fa3a00000000d1eeedd6 + + /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */ + .octa 0x00000001d9968af0000000008fad9bb4 + + /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */ + .octa 0x000000000e4a77a200000001884938e4 + + /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */ + .octa 0x00000000508c2ac800000001bc2e9bc0 + + /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */ + .octa 0x0000000021572a8000000001f9658a68 + + /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */ + .octa 0x00000001b859daf2000000001b9224fc + + /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */ + .octa 0x000000016f7884740000000055b2fb84 + + /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */ + .octa 0x00000001b438810e000000018b090348 + + /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */ + .octa 0x0000000095ddc6f2000000011ccbd5ea + + /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */ + .octa 0x00000001d977c20c0000000007ae47f8 + + /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */ + .octa 0x00000000ebedb99a0000000172acbec0 + + /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */ + .octa 0x00000001df9e9e9200000001c6e3ff20 + + /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */ + .octa 0x00000001a4a3f95200000000e1b38744 + + /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */ + .octa 0x00000000e2f5122000000000791585b2 + + /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */ + .octa 0x000000004aa01f3e00000000ac53b894 + + /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */ + .octa 0x00000000b3e90a5800000001ed5f2cf4 + + /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */ + .octa 0x000000000c9ca2aa00000001df48b2e0 + + /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */ + .octa 0x000000015168231600000000049c1c62 + + /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */ + .octa 0x0000000036fce78c000000017c460c12 + + /* x^49152 mod p(x)` << 1, x^49216 mod p(x)` << 1 */ + .octa 0x000000009037dc10000000015be4da7e + + /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */ + .octa 0x00000000d3298582000000010f38f668 + + /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */ + .octa 0x00000001b42e8ad60000000039f40a00 + + /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */ + .octa 0x00000000142a983800000000bd4c10c4 + + /* x^45056 mod 
p(x)` << 1, x^45120 mod p(x)` << 1 */ + .octa 0x0000000109c7f1900000000042db1d98 + + /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */ + .octa 0x0000000056ff931000000001c905bae6 + + /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */ + .octa 0x00000001594513aa00000000069d40ea + + /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */ + .octa 0x00000001e3b5b1e8000000008e4fbad0 + + /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */ + .octa 0x000000011dd5fc080000000047bedd46 + + /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */ + .octa 0x00000001675f0cc20000000026396bf8 + + /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */ + .octa 0x00000000d1c8dd4400000000379beb92 + + /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */ + .octa 0x0000000115ebd3d8000000000abae54a + + /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */ + .octa 0x00000001ecbd0dac0000000007e6a128 + + /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */ + .octa 0x00000000cdf67af2000000000ade29d2 + + /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */ + .octa 0x000000004c01ff4c00000000f974c45c + + /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */ + .octa 0x00000000f2d8657e00000000e77ac60a + + /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */ + .octa 0x000000006bae74c40000000145895816 + + /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */ + .octa 0x0000000152af8aa00000000038e362be + + /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */ + .octa 0x0000000004663802000000007f991a64 + + /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */ + .octa 0x00000001ab2f5afc00000000fa366d3a + + /* x^28672 mod p(x)` << 1, x^28736 mod p(x)` << 1 */ + .octa 0x0000000074a4ebd400000001a2bb34f0 + + /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */ + .octa 0x00000001d7ab3a4c0000000028a9981e + + /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */ + .octa 0x00000001a8da60c600000001dbc672be + + /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */ + .octa 0x000000013cf6382000000000b04d77f6 + + /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */ + .octa 0x00000000bec12e1e0000000124400d96 + + /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */ + .octa 0x00000001c6368010000000014ca4b414 + + /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */ + .octa 0x00000001e6e78758000000012fe2c938 + + /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */ + .octa 0x000000008d7f2b3c00000001faed01e6 + + /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */ + .octa 0x000000016b4a156e000000007e80ecfe + + /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */ + .octa 0x00000001c63cfeb60000000098daee94 + + /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */ + .octa 0x000000015f902670000000010a04edea + + /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */ + .octa 0x00000001cd5de11e00000001c00b4524 + + /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */ + .octa 0x000000001acaec540000000170296550 + + /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */ + .octa 0x000000002bd0ca780000000181afaa48 + + /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */ + .octa 0x0000000032d63d5c0000000185a31ffa + + /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */ + .octa 0x000000001c6d4e4c000000002469f608 + + /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */ + .octa 0x0000000106a60b92000000006980102a + + /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */ + .octa 0x00000000d3855e120000000111ea9ca8 + + /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */ + .octa 0x00000000e312563600000001bd1d29ce + + /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */ + .octa 
0x000000009e8f7ea400000001b34b9580 + + /* x^8192 mod p(x)` << 1, x^8256 mod p(x)` << 1 */ + .octa 0x00000001c82e562c000000003076054e + + /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */ + .octa 0x00000000ca9f09ce000000012a608ea4 + + /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */ + .octa 0x00000000c63764e600000000784d05fe + + /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */ + .octa 0x0000000168d2e49e000000016ef0d82a + + /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */ + .octa 0x00000000e986c1480000000075bda454 + + /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */ + .octa 0x00000000cfb65894000000003dc0a1c4 + + /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */ + .octa 0x0000000111cadee400000000e9a5d8be + + /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */ + .octa 0x0000000171fb63ce00000001609bc4b4 + + .short_constants : + + /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include + the trailing 32 bits of zeros */ + /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod + p(x)` */ + .octa 0x7fec2963e5bf80485cf015c388e56f72 + + /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod + p(x)` */ + .octa 0x38e888d4844752a9963a18920246e2e6 + + /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod + p(x)` */ + .octa 0x42316c00730206ad419a441956993a31 + + /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod + p(x)` */ + .octa 0x543d5c543e65ddf9924752ba2b830011 + + /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod + p(x)` */ + .octa 0x78e87aaf56767c9255bd7f9518e4a304 + + /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod + p(x)` */ + .octa 0x8f68fcec1903da7f6d76739fe0553f1e + + /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod + p(x)` */ + .octa 0x3f4840246791d588c133722b1fe0b5c3 + + /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod + p(x)` */ + .octa 0x34c96751b04de25a64b67ee0e55ef1f3 + + /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` + */ + .octa 0x156c8e180b4a395b069db049b8fdb1e7 + + /* x^800 mod p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */ + .octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e + + /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */ + .octa 0x041d37768cd75659817cdc5119b29a35 + + /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */ + .octa 0x3a0777818cfaa9651ce9d94b36c41f1c + + /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */ + .octa 0x0e148e8252377a554f256efcb82be955 + + /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */ + .octa 0x9c25531d19e65ddeec1631edb2dea967 + + /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */ + .octa 0x790606ff9957c0a65d27e147510ac59a + + /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */ + .octa 0x82f63b786ea2d55ca66805eb18b8ea18 + + .barrett_constants : + /* 33 bit reflected Barrett constant m - (4^32)/n */ + .octa 0x000000000000000000000000dea713f1 /* x^64 div p(x)` */ + /* 33 bit reflected Barrett constant n */ + .octa 0x00000000000000000000000105ec76f1 +#endif diff --git a/src/rocksdb/util/crc32c_test.cc b/src/rocksdb/util/crc32c_test.cc new file mode 100644 index 000000000..715d63e2d --- /dev/null +++ b/src/rocksdb/util/crc32c_test.cc @@ -0,0 +1,213 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "util/crc32c.h"
+
+#include "test_util/testharness.h"
+#include "util/coding.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace crc32c {
+
+class CRC {};
+
+// Tests for 3-way crc32c algorithm. We need these tests because it uses
+// different lookup tables than the original Fast_CRC32
+const unsigned int BUFFER_SIZE = 512 * 1024 * sizeof(uint64_t);
+char buffer[BUFFER_SIZE];
+
+struct ExpectedResult {
+  size_t offset;
+  size_t length;
+  uint32_t crc32c;
+};
+
+ExpectedResult expectedResults[] = {
+    // Zero-byte input
+    {0, 0, ~0U},
+    // Small aligned inputs to test special cases in SIMD implementations
+    {8, 1, 1543413366},
+    {8, 2, 523493126},
+    {8, 3, 1560427360},
+    {8, 4, 3422504776},
+    {8, 5, 447841138},
+    {8, 6, 3910050499},
+    {8, 7, 3346241981},
+    // Small unaligned inputs
+    {9, 1, 3855826643},
+    {10, 2, 560880875},
+    {11, 3, 1479707779},
+    {12, 4, 2237687071},
+    {13, 5, 4063855784},
+    {14, 6, 2553454047},
+    {15, 7, 1349220140},
+    // Larger inputs to test leftover chunks at the end of aligned blocks
+    {8, 8, 627613930},
+    {8, 9, 2105929409},
+    {8, 10, 2447068514},
+    {8, 11, 863807079},
+    {8, 12, 292050879},
+    {8, 13, 1411837737},
+    {8, 14, 2614515001},
+    {8, 15, 3579076296},
+    {8, 16, 2897079161},
+    {8, 17, 675168386},
+    // Much larger inputs
+    {0, BUFFER_SIZE, 2096790750},
+    {1, BUFFER_SIZE / 2, 3854797577},
+
+};
+
+TEST(CRC, StandardResults) {
+  // Original Fast_CRC32 tests.
+  // From rfc3720 section B.4.
+  char buf[32];
+
+  memset(buf, 0, sizeof(buf));
+  ASSERT_EQ(0x8a9136aaU, Value(buf, sizeof(buf)));
+
+  memset(buf, 0xff, sizeof(buf));
+  ASSERT_EQ(0x62a8ab43U, Value(buf, sizeof(buf)));
+
+  for (int i = 0; i < 32; i++) {
+    buf[i] = static_cast<char>(i);
+  }
+  ASSERT_EQ(0x46dd794eU, Value(buf, sizeof(buf)));
+
+  for (int i = 0; i < 32; i++) {
+    buf[i] = static_cast<char>(31 - i);
+  }
+  ASSERT_EQ(0x113fdb5cU, Value(buf, sizeof(buf)));
+
+  unsigned char data[48] = {
+      0x01, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+      0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00,
+      0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x18, 0x28, 0x00, 0x00, 0x00,
+      0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  };
+  ASSERT_EQ(0xd9963a56,
+            Value(reinterpret_cast<const char*>(data), sizeof(data)));
+
+  // 3-Way Crc32c tests ported from folly.
+  // Test 1: single computation
+  for (auto expected : expectedResults) {
+    uint32_t result = Value(buffer + expected.offset, expected.length);
+    EXPECT_EQ(~expected.crc32c, result);
+  }
+
+  // Test 2: stitching two computations
+  for (auto expected : expectedResults) {
+    size_t partialLength = expected.length / 2;
+    uint32_t partialChecksum = Value(buffer + expected.offset, partialLength);
+    uint32_t result =
+        Extend(partialChecksum, buffer + expected.offset + partialLength,
+               expected.length - partialLength);
+    EXPECT_EQ(~expected.crc32c, result);
+  }
+}
+
+TEST(CRC, Values) { ASSERT_NE(Value("a", 1), Value("foo", 3)); }
+
+TEST(CRC, Extend) {
+  ASSERT_EQ(Value("hello world", 11), Extend(Value("hello ", 6), "world", 5));
+}
+
+TEST(CRC, Mask) {
+  uint32_t crc = Value("foo", 3);
+  ASSERT_NE(crc, Mask(crc));
+  ASSERT_NE(crc, Mask(Mask(crc)));
+  ASSERT_EQ(crc, Unmask(Mask(crc)));
+  ASSERT_EQ(crc, Unmask(Unmask(Mask(Mask(crc)))));
+}
+
+TEST(CRC, Crc32cCombineBasicTest) {
+  uint32_t crc1 = Value("hello ", 6);
+  uint32_t crc2 = Value("world", 5);
+  uint32_t crc3 = Value("hello world", 11);
+  uint32_t crc1_2_combine = Crc32cCombine(crc1, crc2, 5);
+  ASSERT_EQ(crc3, crc1_2_combine);
+}
+
+TEST(CRC, Crc32cCombineOrderMattersTest) {
+  uint32_t crc1 = Value("hello ", 6);
+  uint32_t crc2 = Value("world", 5);
+  uint32_t crc3 = Value("hello world", 11);
+  uint32_t crc2_1_combine = Crc32cCombine(crc2, crc1, 6);
+  ASSERT_NE(crc3, crc2_1_combine);
+}
+
+TEST(CRC, Crc32cCombineFullCoverTest) {
+  int scale = 4 * 1024;
+  Random rnd(test::RandomSeed());
+  int size_1 = 1024 * 1024;
+  std::string s1 = rnd.RandomBinaryString(size_1);
+  uint32_t crc1 = Value(s1.data(), size_1);
+  for (int i = 0; i < scale; i++) {
+    int size_2 = i;
+    std::string s2 = rnd.RandomBinaryString(size_2);
+    uint32_t crc2 = Value(s2.data(), s2.size());
+    uint32_t crc1_2 = Extend(crc1, s2.data(), s2.size());
+    uint32_t crc1_2_combine = Crc32cCombine(crc1, crc2, size_2);
+    ASSERT_EQ(crc1_2, crc1_2_combine);
+  }
+}
+
+TEST(CRC, Crc32cCombineBigSizeTest) {
+  Random rnd(test::RandomSeed());
+  int size_1 = 1024 * 1024;
+  std::string s1 = rnd.RandomBinaryString(size_1);
+  uint32_t crc1 = Value(s1.data(), size_1);
+  int size_2 = 16 * 1024 * 1024 - 1;
+  std::string s2 = rnd.RandomBinaryString(size_2);
+  uint32_t crc2 = Value(s2.data(), s2.size());
+  uint32_t crc1_2 = Extend(crc1, s2.data(), s2.size());
+  uint32_t crc1_2_combine = Crc32cCombine(crc1, crc2, size_2);
+  ASSERT_EQ(crc1_2, crc1_2_combine);
+}
+
+}  // namespace crc32c
+}  // namespace ROCKSDB_NAMESPACE
+
+// copied from folly
+const uint64_t FNV_64_HASH_START = 14695981039346656037ULL;
+inline uint64_t fnv64_buf(const void* buf, size_t n,
+                          uint64_t hash = FNV_64_HASH_START) {
+  // forcing signed char, since other platforms can use unsigned
+  const signed char* char_buf = reinterpret_cast<const signed char*>(buf);
+
+  for (size_t i = 0; i < n; ++i) {
+    hash += (hash << 1) + (hash << 4) + (hash << 5) + (hash << 7) +
+            (hash << 8) + (hash << 40);
+    hash ^= char_buf[i];
+  }
+  return hash;
+}
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+
+  // Populate a buffer with a deterministic pattern
+  // on which to compute checksums
+
+  const uint8_t* src = (uint8_t*)ROCKSDB_NAMESPACE::crc32c::buffer;
+  uint64_t* dst = (uint64_t*)ROCKSDB_NAMESPACE::crc32c::buffer;
+  const uint64_t* end =
+      (const uint64_t*)(ROCKSDB_NAMESPACE::crc32c::buffer +
+                        ROCKSDB_NAMESPACE::crc32c::BUFFER_SIZE);
+  *dst++ = 0;
+  while (dst < end) {
+    ROCKSDB_NAMESPACE::EncodeFixed64(
+        reinterpret_cast<char*>(dst),
+        fnv64_buf((const char*)src, sizeof(uint64_t)));
+    dst++;
+    src += sizeof(uint64_t);
+  }
+
+  return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/util/defer.h b/src/rocksdb/util/defer.h
new file mode 100644
index 000000000..f71e67ba9
--- /dev/null
+++ b/src/rocksdb/util/defer.h
@@ -0,0 +1,82 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <functional>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Defers the execution of the provided function until the Defer
+// object goes out of scope.
+//
+// Usage example:
+//
+// Status DeferTest() {
+//   Status s;
+//   Defer defer([&s]() {
+//     if (!s.ok()) {
+//       // do cleanups ...
+//     }
+//   });
+//   // do something ...
+//   if (!s.ok()) return;
+//   // do some other things ...
+//   return s;
+// }
+//
+// The above code ensures that cleanups will always happen on returning.
+//
+// Without the help of Defer, you can
+// 1. every time when !s.ok(), do the cleanup;
+// 2. instead of returning when !s.ok(), continue the work only when s.ok(),
+//    but sometimes, this might lead to nested blocks of "if (s.ok()) {...}".
+//
+// With the help of Defer, you can centralize the cleanup logic inside the
+// lambda passed to Defer, and you can return immediately on failure when
+// necessary.
+class Defer final {
+ public:
+  explicit Defer(std::function<void()>&& fn) : fn_(std::move(fn)) {}
+  ~Defer() { fn_(); }
+
+  // Disallow copy.
+  Defer(const Defer&) = delete;
+  Defer& operator=(const Defer&) = delete;
+
+ private:
+  std::function<void()> fn_;
+};
+
+// An RAII utility object that saves the current value of an object so that
+// it can be overwritten, and restores it to the saved value when the
+// SaveAndRestore object goes out of scope.
+template <typename T>
+class SaveAndRestore {
+ public:
+  // obj is non-null pointer to value to be saved and later restored.
+  explicit SaveAndRestore(T* obj) : obj_(obj), saved_(*obj) {}
+  // new_value is stored in *obj
+  SaveAndRestore(T* obj, const T& new_value)
+      : obj_(obj), saved_(std::move(*obj)) {
+    *obj = new_value;
+  }
+  SaveAndRestore(T* obj, T&& new_value) : obj_(obj), saved_(std::move(*obj)) {
+    *obj = std::move(new_value);
+  }
+  ~SaveAndRestore() { *obj_ = std::move(saved_); }
+
+  // No copies
+  SaveAndRestore(const SaveAndRestore&) = delete;
+  SaveAndRestore& operator=(const SaveAndRestore&) = delete;
+
+ private:
+  T* const obj_;
+  T saved_;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/defer_test.cc b/src/rocksdb/util/defer_test.cc
new file mode 100644
index 000000000..0e98f68b6
--- /dev/null
+++ b/src/rocksdb/util/defer_test.cc
@@ -0,0 +1,51 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
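As a quick illustration before the tests (not part of the patch), the SaveAndRestore overload that installs a replacement value behaves like this; the snippet assumes only the header added above:

#include <cassert>

#include "util/defer.h"

void SaveAndRestoreExample() {
  int level = 3;
  {
    // Temporarily override `level`; the saved value (3) is moved back into
    // place when `guard` goes out of scope.
    ROCKSDB_NAMESPACE::SaveAndRestore<int> guard(&level, /*new_value=*/7);
    assert(level == 7);
  }
  assert(level == 3);
}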
+ +#include "util/defer.h" + +#include "port/port.h" +#include "port/stack_trace.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +class DeferTest {}; + +TEST(DeferTest, BlockScope) { + int v = 1; + { + Defer defer([&v]() { v *= 2; }); + } + ASSERT_EQ(2, v); +} + +TEST(DeferTest, FunctionScope) { + int v = 1; + auto f = [&v]() { + Defer defer([&v]() { v *= 2; }); + v = 2; + }; + f(); + ASSERT_EQ(4, v); +} + +TEST(SaveAndRestoreTest, BlockScope) { + int v = 1; + { + SaveAndRestore sr(&v); + ASSERT_EQ(v, 1); + v = 2; + ASSERT_EQ(v, 2); + } + ASSERT_EQ(v, 1); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/distributed_mutex.h b/src/rocksdb/util/distributed_mutex.h new file mode 100644 index 000000000..9675a1e2d --- /dev/null +++ b/src/rocksdb/util/distributed_mutex.h @@ -0,0 +1,48 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +// This file declares a wrapper around the efficient folly DistributedMutex +// that falls back on a standard mutex when not available. See +// https://github.com/facebook/folly/blob/main/folly/synchronization/DistributedMutex.h +// for benefits and limitations. + +// At the moment, only scoped locking is supported using DMutexLock +// RAII wrapper, because lock/unlock APIs will vary. + +#ifdef USE_FOLLY + +#include + +namespace ROCKSDB_NAMESPACE { + +class DMutex : public folly::DistributedMutex { + public: + static const char* kName() { return "folly::DistributedMutex"; } + + explicit DMutex(bool IGNORED_adaptive = false) { (void)IGNORED_adaptive; } + + // currently no-op + void AssertHeld() {} +}; +using DMutexLock = std::lock_guard; + +} // namespace ROCKSDB_NAMESPACE + +#else + +#include "port/port.h" + +namespace ROCKSDB_NAMESPACE { + +using DMutex = port::Mutex; +using DMutexLock = std::lock_guard; + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/util/duplicate_detector.h b/src/rocksdb/util/duplicate_detector.h new file mode 100644 index 000000000..d778622db --- /dev/null +++ b/src/rocksdb/util/duplicate_detector.h @@ -0,0 +1,71 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "db/db_impl/db_impl.h" +#include "logging/logging.h" +#include "util/set_comparator.h" + +namespace ROCKSDB_NAMESPACE { +// During recovery if the memtable is flushed we cannot rely on its help on +// duplicate key detection and as key insert will not be attempted. This class +// will be used as a emulator of memtable to tell if insertion of a key/seq +// would have resulted in duplication. 
+class DuplicateDetector {
+ public:
+  explicit DuplicateDetector(DBImpl* db) : db_(db) {}
+  bool IsDuplicateKeySeq(uint32_t cf, const Slice& key, SequenceNumber seq) {
+    assert(seq >= batch_seq_);
+    if (batch_seq_ != seq) {  // it is a new batch
+      keys_.clear();
+    }
+    batch_seq_ = seq;
+    CFKeys& cf_keys = keys_[cf];
+    if (cf_keys.size() == 0) {  // just inserted
+      InitWithComp(cf);
+    }
+    auto it = cf_keys.insert(key);
+    if (it.second == false) {  // second is false if an element already existed.
+      keys_.clear();
+      InitWithComp(cf);
+      keys_[cf].insert(key);
+      return true;
+    }
+    return false;
+  }
+
+ private:
+  SequenceNumber batch_seq_ = 0;
+  DBImpl* db_;
+  using CFKeys = std::set<Slice, SetComparator>;
+  std::map<uint32_t, CFKeys> keys_;
+  void InitWithComp(const uint32_t cf) {
+    auto h = db_->GetColumnFamilyHandle(cf);
+    if (!h) {
+      // TODO(myabandeh): This is not a concern in MyRocks as drop cf is not
+      // implemented yet. When it is, we should return a proper error instead
+      // of throwing an exception.
+      ROCKS_LOG_FATAL(
+          db_->immutable_db_options().info_log,
+          "Recovering an entry from the dropped column family %" PRIu32
+          ". WAL must have been emptied before dropping the column "
+          "family",
+          cf);
+#ifndef ROCKSDB_LITE
+      throw std::runtime_error(
+          "Recovering an entry from a dropped column family. "
+          "WAL must have been flushed before dropping the column "
+          "family");
+#endif
+      return;
+    }
+    auto cmp = h->GetComparator();
+    keys_[cf] = CFKeys(SetComparator(cmp));
+  }
+};
+}  // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/util/dynamic_bloom.cc b/src/rocksdb/util/dynamic_bloom.cc
new file mode 100644
index 000000000..0ff3b4a75
--- /dev/null
+++ b/src/rocksdb/util/dynamic_bloom.cc
@@ -0,0 +1,70 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "dynamic_bloom.h"
+
+#include <algorithm>
+
+#include "memory/allocator.h"
+#include "port/port.h"
+#include "rocksdb/slice.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+uint32_t roundUpToPow2(uint32_t x) {
+  uint32_t rv = 1;
+  while (rv < x) {
+    rv <<= 1;
+  }
+  return rv;
+}
+}  // namespace
+
+DynamicBloom::DynamicBloom(Allocator* allocator, uint32_t total_bits,
+                           uint32_t num_probes, size_t huge_page_tlb_size,
+                           Logger* logger)
+    // Round down, except round up with 1
+    : kNumDoubleProbes((num_probes + (num_probes == 1)) / 2) {
+  assert(num_probes % 2 == 0);  // limitation of current implementation
+  assert(num_probes <= 10);     // limitation of current implementation
+  assert(kNumDoubleProbes > 0);
+
+  // Determine how much to round off + align by so that x ^ i (that's xor) is
+  // a valid u64 index if x is a valid u64 index and 0 <= i < kNumDoubleProbes.
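+  // For example, with the default num_probes == 6, kNumDoubleProbes == 3,
+  // roundUpToPow2(3) == 4, block_bytes == 32 and block_bits == 256, so
+  // total_bits is effectively rounded up to a multiple of 256.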
+ uint32_t block_bytes = /*bytes/u64*/ 8 * + /*u64s*/ std::max(1U, roundUpToPow2(kNumDoubleProbes)); + uint32_t block_bits = block_bytes * 8; + uint32_t blocks = (total_bits + block_bits - 1) / block_bits; + uint32_t sz = blocks * block_bytes; + kLen = sz / /*bytes/u64*/ 8; + assert(kLen > 0); +#ifndef NDEBUG + for (uint32_t i = 0; i < kNumDoubleProbes; ++i) { + // Ensure probes starting at last word are in range + assert(((kLen - 1) ^ i) < kLen); + } +#endif + + // Padding to correct for allocation not originally aligned on block_bytes + // boundary + sz += block_bytes - 1; + assert(allocator); + + char* raw = allocator->AllocateAligned(sz, huge_page_tlb_size, logger); + memset(raw, 0, sz); + auto block_offset = reinterpret_cast(raw) % block_bytes; + if (block_offset > 0) { + // Align on block_bytes boundary + raw += block_bytes - block_offset; + } + static_assert(sizeof(std::atomic) == sizeof(uint64_t), + "Expecting zero-space-overhead atomic"); + data_ = reinterpret_cast*>(raw); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/dynamic_bloom.h b/src/rocksdb/util/dynamic_bloom.h new file mode 100644 index 000000000..40cd29404 --- /dev/null +++ b/src/rocksdb/util/dynamic_bloom.h @@ -0,0 +1,214 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include + +#include "port/port.h" +#include "rocksdb/slice.h" +#include "table/multiget_context.h" +#include "util/hash.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; +class Allocator; +class Logger; + +// A Bloom filter intended only to be used in memory, never serialized in a way +// that could lead to schema incompatibility. Supports opt-in lock-free +// concurrent access. +// +// This implementation is also intended for applications generally preferring +// speed vs. maximum accuracy: roughly 0.9x BF op latency for 1.1x FP rate. +// For 1% FP rate, that means that the latency of a look-up triggered by an FP +// should be less than roughly 100x the cost of a Bloom filter op. +// +// For simplicity and performance, the current implementation requires +// num_probes to be a multiple of two and <= 10. +// +class DynamicBloom { + public: + // allocator: pass allocator to bloom filter, hence trace the usage of memory + // total_bits: fixed total bits for the bloom + // num_probes: number of hash probes for a single key + // hash_func: customized hash function + // huge_page_tlb_size: if >0, try to allocate bloom bytes from huge page TLB + // within this page size. Need to reserve huge pages for + // it to be allocated, like: + // sysctl -w vm.nr_hugepages=20 + // See linux doc Documentation/vm/hugetlbpage.txt + explicit DynamicBloom(Allocator* allocator, uint32_t total_bits, + uint32_t num_probes = 6, size_t huge_page_tlb_size = 0, + Logger* logger = nullptr); + + ~DynamicBloom() {} + + // Assuming single threaded access to this function. + void Add(const Slice& key); + + // Like Add, but may be called concurrent with other functions. + void AddConcurrently(const Slice& key); + + // Assuming single threaded access to this function. + void AddHash(uint32_t hash); + + // Like AddHash, but may be called concurrent with other functions. 
+ void AddHashConcurrently(uint32_t hash); + + // Multithreaded access to this function is OK + bool MayContain(const Slice& key) const; + + void MayContain(int num_keys, Slice* keys, bool* may_match) const; + + // Multithreaded access to this function is OK + bool MayContainHash(uint32_t hash) const; + + void Prefetch(uint32_t h); + + private: + // Length of the structure, in 64-bit words. For this structure, "word" + // will always refer to 64-bit words. + uint32_t kLen; + // We make the k probes in pairs, two for each 64-bit read/write. Thus, + // this stores k/2, the number of words to double-probe. + const uint32_t kNumDoubleProbes; + + std::atomic* data_; + + // or_func(ptr, mask) should effect *ptr |= mask with the appropriate + // concurrency safety, working with bytes. + template + void AddHash(uint32_t hash, const OrFunc& or_func); + + bool DoubleProbe(uint32_t h32, size_t a) const; +}; + +inline void DynamicBloom::Add(const Slice& key) { AddHash(BloomHash(key)); } + +inline void DynamicBloom::AddConcurrently(const Slice& key) { + AddHashConcurrently(BloomHash(key)); +} + +inline void DynamicBloom::AddHash(uint32_t hash) { + AddHash(hash, [](std::atomic* ptr, uint64_t mask) { + ptr->store(ptr->load(std::memory_order_relaxed) | mask, + std::memory_order_relaxed); + }); +} + +inline void DynamicBloom::AddHashConcurrently(uint32_t hash) { + AddHash(hash, [](std::atomic* ptr, uint64_t mask) { + // Happens-before between AddHash and MaybeContains is handled by + // access to versions_->LastSequence(), so all we have to do here is + // avoid races (so we don't give the compiler a license to mess up + // our code) and not lose bits. std::memory_order_relaxed is enough + // for that. + if ((mask & ptr->load(std::memory_order_relaxed)) != mask) { + ptr->fetch_or(mask, std::memory_order_relaxed); + } + }); +} + +inline bool DynamicBloom::MayContain(const Slice& key) const { + return (MayContainHash(BloomHash(key))); +} + +inline void DynamicBloom::MayContain(int num_keys, Slice* keys, + bool* may_match) const { + std::array hashes; + std::array byte_offsets; + for (int i = 0; i < num_keys; ++i) { + hashes[i] = BloomHash(keys[i]); + size_t a = FastRange32(kLen, hashes[i]); + PREFETCH(data_ + a, 0, 3); + byte_offsets[i] = a; + } + + for (int i = 0; i < num_keys; i++) { + may_match[i] = DoubleProbe(hashes[i], byte_offsets[i]); + } +} + +#if defined(_MSC_VER) +#pragma warning(push) +// local variable is initialized but not referenced +#pragma warning(disable : 4189) +#endif +inline void DynamicBloom::Prefetch(uint32_t h32) { + size_t a = FastRange32(kLen, h32); + PREFETCH(data_ + a, 0, 3); +} +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + +// Speed hacks in this implementation: +// * Uses fastrange instead of % +// * Minimum logic to determine first (and all) probed memory addresses. +// (Uses constant bit-xor offsets from the starting probe address.) +// * (Major) Two probes per 64-bit memory fetch/write. +// Code simplification / optimization: only allow even number of probes. +// * Very fast and effective (murmur-like) hash expansion/re-mixing. (At +// least on recent CPUs, integer multiplication is very cheap. Each 64-bit +// remix provides five pairs of bit addresses within a uint64_t.) +// Code simplification / optimization: only allow up to 10 probes, from a +// single 64-bit remix. +// +// The FP rate penalty for this implementation, vs. standard Bloom filter, is +// roughly 1.12x on top of the 1.15x penalty for a 512-bit cache-local Bloom. 
+// This implementation does not explicitly use the cache line size, but is +// effectively cache-local (up to 16 probes) because of the bit-xor offsetting. +// +// NB: could easily be upgraded to support a 64-bit hash and +// total_bits > 2^32 (512MB). (The latter is a bad idea without the former, +// because of false positives.) + +inline bool DynamicBloom::MayContainHash(uint32_t h32) const { + size_t a = FastRange32(kLen, h32); + PREFETCH(data_ + a, 0, 3); + return DoubleProbe(h32, a); +} + +inline bool DynamicBloom::DoubleProbe(uint32_t h32, size_t byte_offset) const { + // Expand/remix with 64-bit golden ratio + uint64_t h = 0x9e3779b97f4a7c13ULL * h32; + for (unsigned i = 0;; ++i) { + // Two bit probes per uint64_t probe + uint64_t mask = + ((uint64_t)1 << (h & 63)) | ((uint64_t)1 << ((h >> 6) & 63)); + uint64_t val = data_[byte_offset ^ i].load(std::memory_order_relaxed); + if (i + 1 >= kNumDoubleProbes) { + return (val & mask) == mask; + } else if ((val & mask) != mask) { + return false; + } + h = (h >> 12) | (h << 52); + } +} + +template +inline void DynamicBloom::AddHash(uint32_t h32, const OrFunc& or_func) { + size_t a = FastRange32(kLen, h32); + PREFETCH(data_ + a, 0, 3); + // Expand/remix with 64-bit golden ratio + uint64_t h = 0x9e3779b97f4a7c13ULL * h32; + for (unsigned i = 0;; ++i) { + // Two bit probes per uint64_t probe + uint64_t mask = + ((uint64_t)1 << (h & 63)) | ((uint64_t)1 << ((h >> 6) & 63)); + or_func(&data_[a ^ i], mask); + if (i + 1 >= kNumDoubleProbes) { + return; + } + h = (h >> 12) | (h << 52); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/dynamic_bloom_test.cc b/src/rocksdb/util/dynamic_bloom_test.cc new file mode 100644 index 000000000..925c5479a --- /dev/null +++ b/src/rocksdb/util/dynamic_bloom_test.cc @@ -0,0 +1,325 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef GFLAGS +#include +int main() { + fprintf(stderr, "Please install gflags to run this test... Skipping...\n"); + return 0; +} +#else + +#include +#include +#include +#include +#include +#include +#include + +#include "dynamic_bloom.h" +#include "memory/arena.h" +#include "port/port.h" +#include "rocksdb/system_clock.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/gflags_compat.h" +#include "util/stop_watch.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; + +DEFINE_int32(bits_per_key, 10, ""); +DEFINE_int32(num_probes, 6, ""); +DEFINE_bool(enable_perf, false, ""); + +namespace ROCKSDB_NAMESPACE { + +struct KeyMaker { + uint64_t a; + uint64_t b; + + // Sequential, within a hash function block + inline Slice Seq(uint64_t i) { + a = i; + return Slice(reinterpret_cast(&a), sizeof(a)); + } + // Not quite sequential, varies across hash function blocks + inline Slice Nonseq(uint64_t i) { + a = i; + b = i * 123; + return Slice(reinterpret_cast(this), sizeof(*this)); + } + inline Slice Key(uint64_t i, bool nonseq) { + return nonseq ? 
Nonseq(i) : Seq(i); + } +}; + +class DynamicBloomTest : public testing::Test {}; + +TEST_F(DynamicBloomTest, EmptyFilter) { + Arena arena; + DynamicBloom bloom1(&arena, 100, 2); + ASSERT_TRUE(!bloom1.MayContain("hello")); + ASSERT_TRUE(!bloom1.MayContain("world")); + + DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 2); + ASSERT_TRUE(!bloom2.MayContain("hello")); + ASSERT_TRUE(!bloom2.MayContain("world")); +} + +TEST_F(DynamicBloomTest, Small) { + Arena arena; + DynamicBloom bloom1(&arena, 100, 2); + bloom1.Add("hello"); + bloom1.Add("world"); + ASSERT_TRUE(bloom1.MayContain("hello")); + ASSERT_TRUE(bloom1.MayContain("world")); + ASSERT_TRUE(!bloom1.MayContain("x")); + ASSERT_TRUE(!bloom1.MayContain("foo")); + + DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 2); + bloom2.Add("hello"); + bloom2.Add("world"); + ASSERT_TRUE(bloom2.MayContain("hello")); + ASSERT_TRUE(bloom2.MayContain("world")); + ASSERT_TRUE(!bloom2.MayContain("x")); + ASSERT_TRUE(!bloom2.MayContain("foo")); +} + +TEST_F(DynamicBloomTest, SmallConcurrentAdd) { + Arena arena; + DynamicBloom bloom1(&arena, 100, 2); + bloom1.AddConcurrently("hello"); + bloom1.AddConcurrently("world"); + ASSERT_TRUE(bloom1.MayContain("hello")); + ASSERT_TRUE(bloom1.MayContain("world")); + ASSERT_TRUE(!bloom1.MayContain("x")); + ASSERT_TRUE(!bloom1.MayContain("foo")); + + DynamicBloom bloom2(&arena, CACHE_LINE_SIZE * 8 * 2 - 1, 2); + bloom2.AddConcurrently("hello"); + bloom2.AddConcurrently("world"); + ASSERT_TRUE(bloom2.MayContain("hello")); + ASSERT_TRUE(bloom2.MayContain("world")); + ASSERT_TRUE(!bloom2.MayContain("x")); + ASSERT_TRUE(!bloom2.MayContain("foo")); +} + +static uint32_t NextNum(uint32_t num) { + if (num < 10) { + num += 1; + } else if (num < 100) { + num += 10; + } else if (num < 1000) { + num += 100; + } else { + num = num * 26 / 10; + } + return num; +} + +TEST_F(DynamicBloomTest, VaryingLengths) { + KeyMaker km; + + // Count number of filters that significantly exceed the false positive rate + int mediocre_filters = 0; + int good_filters = 0; + uint32_t num_probes = static_cast(FLAGS_num_probes); + + fprintf(stderr, "bits_per_key: %d num_probes: %d\n", FLAGS_bits_per_key, + num_probes); + + // NB: FP rate impact of 32-bit hash is noticeable starting around 10M keys. + // But that effect is hidden if using sequential keys (unique hashes). + for (bool nonseq : {false, true}) { + const uint32_t max_num = FLAGS_enable_perf ? 40000000 : 400000; + for (uint32_t num = 1; num <= max_num; num = NextNum(num)) { + uint32_t bloom_bits = 0; + Arena arena; + bloom_bits = num * FLAGS_bits_per_key; + DynamicBloom bloom(&arena, bloom_bits, num_probes); + for (uint64_t i = 0; i < num; i++) { + bloom.Add(km.Key(i, nonseq)); + ASSERT_TRUE(bloom.MayContain(km.Key(i, nonseq))); + } + + // All added keys must match + for (uint64_t i = 0; i < num; i++) { + ASSERT_TRUE(bloom.MayContain(km.Key(i, nonseq))); + } + + // Check false positive rate + int result = 0; + for (uint64_t i = 0; i < 30000; i++) { + if (bloom.MayContain(km.Key(i + 1000000000, nonseq))) { + result++; + } + } + double rate = result / 30000.0; + + fprintf(stderr, + "False positives (%s keys): " + "%5.2f%% @ num = %6u, bloom_bits = %6u\n", + nonseq ? 
"nonseq" : "seq", rate * 100.0, num, bloom_bits); + + if (rate > 0.0125) + mediocre_filters++; // Allowed, but not too often + else + good_filters++; + } + } + + fprintf(stderr, "Filters: %d good, %d mediocre\n", good_filters, + mediocre_filters); + ASSERT_LE(mediocre_filters, good_filters / 25); +} + +TEST_F(DynamicBloomTest, perf) { + KeyMaker km; + StopWatchNano timer(SystemClock::Default().get()); + uint32_t num_probes = static_cast(FLAGS_num_probes); + + if (!FLAGS_enable_perf) { + return; + } + + for (uint32_t m = 1; m <= 8; ++m) { + Arena arena; + const uint32_t num_keys = m * 8 * 1024 * 1024; + fprintf(stderr, "testing %" PRIu32 "M keys\n", m * 8); + + DynamicBloom std_bloom(&arena, num_keys * 10, num_probes); + + timer.Start(); + for (uint64_t i = 1; i <= num_keys; ++i) { + std_bloom.Add(km.Seq(i)); + } + + uint64_t elapsed = timer.ElapsedNanos(); + fprintf(stderr, "dynamic bloom, avg add latency %3g\n", + static_cast(elapsed) / num_keys); + + uint32_t count = 0; + timer.Start(); + for (uint64_t i = 1; i <= num_keys; ++i) { + if (std_bloom.MayContain(km.Seq(i))) { + ++count; + } + } + ASSERT_EQ(count, num_keys); + elapsed = timer.ElapsedNanos(); + assert(count > 0); + fprintf(stderr, "dynamic bloom, avg query latency %3g\n", + static_cast(elapsed) / count); + } +} + +TEST_F(DynamicBloomTest, concurrent_with_perf) { + uint32_t num_probes = static_cast(FLAGS_num_probes); + + uint32_t m_limit = FLAGS_enable_perf ? 8 : 1; + + uint32_t num_threads = 4; + std::vector threads; + + // NB: Uses sequential keys for speed, but that hides the FP rate + // impact of 32-bit hash, which is noticeable starting around 10M keys + // when they vary across hashing blocks. + for (uint32_t m = 1; m <= m_limit; ++m) { + Arena arena; + const uint32_t num_keys = m * 8 * 1024 * 1024; + fprintf(stderr, "testing %" PRIu32 "M keys\n", m * 8); + + DynamicBloom std_bloom(&arena, num_keys * 10, num_probes); + + std::atomic elapsed(0); + + std::function adder([&](size_t t) { + KeyMaker km; + StopWatchNano timer(SystemClock::Default().get()); + timer.Start(); + for (uint64_t i = 1 + t; i <= num_keys; i += num_threads) { + std_bloom.AddConcurrently(km.Seq(i)); + } + elapsed += timer.ElapsedNanos(); + }); + for (size_t t = 0; t < num_threads; ++t) { + threads.emplace_back(adder, t); + } + while (threads.size() > 0) { + threads.back().join(); + threads.pop_back(); + } + + fprintf(stderr, + "dynamic bloom, avg parallel add latency %3g" + " nanos/key\n", + static_cast(elapsed) / num_threads / num_keys); + + elapsed = 0; + std::function hitter([&](size_t t) { + KeyMaker km; + StopWatchNano timer(SystemClock::Default().get()); + timer.Start(); + for (uint64_t i = 1 + t; i <= num_keys; i += num_threads) { + bool f = std_bloom.MayContain(km.Seq(i)); + ASSERT_TRUE(f); + } + elapsed += timer.ElapsedNanos(); + }); + for (size_t t = 0; t < num_threads; ++t) { + threads.emplace_back(hitter, t); + } + while (threads.size() > 0) { + threads.back().join(); + threads.pop_back(); + } + + fprintf(stderr, + "dynamic bloom, avg parallel hit latency %3g" + " nanos/key\n", + static_cast(elapsed) / num_threads / num_keys); + + elapsed = 0; + std::atomic false_positives(0); + std::function misser([&](size_t t) { + KeyMaker km; + StopWatchNano timer(SystemClock::Default().get()); + timer.Start(); + for (uint64_t i = num_keys + 1 + t; i <= 2 * num_keys; i += num_threads) { + bool f = std_bloom.MayContain(km.Seq(i)); + if (f) { + ++false_positives; + } + } + elapsed += timer.ElapsedNanos(); + }); + for (size_t t = 0; t < num_threads; ++t) { + 
threads.emplace_back(misser, t); + } + while (threads.size() > 0) { + threads.back().join(); + threads.pop_back(); + } + + fprintf(stderr, + "dynamic bloom, avg parallel miss latency %3g" + " nanos/key, %f%% false positive rate\n", + static_cast(elapsed) / num_threads / num_keys, + false_positives.load() * 100.0 / num_keys); + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char **argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + ParseCommandLineFlags(&argc, &argv, true); + + return RUN_ALL_TESTS(); +} + +#endif // GFLAGS diff --git a/src/rocksdb/util/fastrange.h b/src/rocksdb/util/fastrange.h new file mode 100644 index 000000000..a70a980f6 --- /dev/null +++ b/src/rocksdb/util/fastrange.h @@ -0,0 +1,114 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// fastrange/FastRange: A faster alternative to % for mapping a hash value +// to an arbitrary range. See https://github.com/lemire/fastrange +// +// Generally recommended are FastRange32 for mapping results of 32-bit +// hash functions and FastRange64 for mapping results of 64-bit hash +// functions. FastRange is less forgiving than % if the input hashes are +// not well distributed over the full range of the type (32 or 64 bits). +// +// Also included is a templated implementation FastRangeGeneric for use +// in generic algorithms, but not otherwise recommended because of +// potential ambiguity. Unlike with %, it is critical to use the right +// FastRange variant for the output size of your hash function. + +#pragma once + +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +#ifdef TEST_UINT128_COMPAT +#undef HAVE_UINT128_EXTENSION +#endif + +namespace ROCKSDB_NAMESPACE { + +namespace detail { + +// Using a class template to support partial specialization +template +struct FastRangeGenericImpl { + // only reach this on no supported specialization +}; + +template +struct FastRangeGenericImpl { + static inline Range Fn(uint32_t hash, Range range) { + static_assert(std::is_unsigned::value, "must be unsigned"); + static_assert(sizeof(Range) <= sizeof(uint32_t), + "cannot be larger than hash (32 bits)"); + + uint64_t product = uint64_t{range} * hash; + return static_cast(product >> 32); + } +}; + +template +struct FastRangeGenericImpl { + static inline Range Fn(uint64_t hash, Range range) { + static_assert(std::is_unsigned::value, "must be unsigned"); + static_assert(sizeof(Range) <= sizeof(uint64_t), + "cannot be larger than hash (64 bits)"); + +#ifdef HAVE_UINT128_EXTENSION + // Can use compiler's 128-bit type. Trust it to do the right thing. + __uint128_t wide = __uint128_t{range} * hash; + return static_cast(wide >> 64); +#else + // Fall back: full decomposition. 
+ // NOTE: GCC seems to fully understand this code as 64-bit x 64-bit + // -> 128-bit multiplication and optimize it appropriately + uint64_t range64 = range; // ok to shift by 32, even if Range is 32-bit + uint64_t tmp = uint64_t{range64 & 0xffffFFFF} * uint64_t{hash & 0xffffFFFF}; + tmp >>= 32; + tmp += uint64_t{range64 & 0xffffFFFF} * uint64_t{hash >> 32}; + // Avoid overflow: first add lower 32 of tmp2, and later upper 32 + uint64_t tmp2 = uint64_t{range64 >> 32} * uint64_t{hash & 0xffffFFFF}; + tmp += static_cast(tmp2); + tmp >>= 32; + tmp += (tmp2 >> 32); + tmp += uint64_t{range64 >> 32} * uint64_t{hash >> 32}; + return static_cast(tmp); +#endif + } +}; + +} // namespace detail + +// Now an omnibus templated function (yay parameter inference). +// +// NOTICE: +// This templated version is not recommended for typical use because +// of the potential to mix a 64-bit FastRange with a 32-bit bit hash, +// most likely because you put your 32-bit hash in an "unsigned long" +// which is 64 bits on some platforms. That doesn't really matter for +// an operation like %, but 64-bit FastRange gives extremely bad results, +// mostly zero, on 32-bit hash values. And because good hashing is not +// generally required for correctness, this kind of mistake could go +// unnoticed with just unit tests. Plus it could vary by platform. +template +inline Range FastRangeGeneric(Hash hash, Range range) { + return detail::FastRangeGenericImpl::Fn(hash, range); +} + +// The most popular / convenient / recommended variants: + +// Map a quality 64-bit hash value down to an arbitrary size_t range. +// (size_t is standard for mapping to things in memory.) +inline size_t FastRange64(uint64_t hash, size_t range) { + return FastRangeGeneric(hash, range); +} + +// Map a quality 32-bit hash value down to an arbitrary uint32_t range. +inline uint32_t FastRange32(uint32_t hash, uint32_t range) { + return FastRangeGeneric(hash, range); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/file_checksum_helper.cc b/src/rocksdb/util/file_checksum_helper.cc new file mode 100644 index 000000000..a73920352 --- /dev/null +++ b/src/rocksdb/util/file_checksum_helper.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
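+// Illustrative usage sketch of the helpers defined in this file (the MANIFEST
+// path below is a placeholder, not a real file):
+//
+//   std::unique_ptr<FileChecksumList> list(NewFileChecksumList());
+//   Status s = GetFileChecksumsFromManifest(
+//       Env::Default(), "/path/to/MANIFEST-000001",
+//       std::numeric_limits<uint64_t>::max() /* use the whole file */,
+//       list.get());
+//   // On success, list holds one (file number, checksum, func name) entry
+//   // per tracked SST file.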
+ +#include "util/file_checksum_helper.h" + +#include + +#include "db/log_reader.h" +#include "db/version_edit.h" +#include "db/version_edit_handler.h" +#include "file/sequence_file_reader.h" +#include "rocksdb/utilities/customizable_util.h" + +namespace ROCKSDB_NAMESPACE { + +void FileChecksumListImpl::reset() { checksum_map_.clear(); } + +size_t FileChecksumListImpl::size() const { return checksum_map_.size(); } + +Status FileChecksumListImpl::GetAllFileChecksums( + std::vector* file_numbers, std::vector* checksums, + std::vector* checksum_func_names) { + if (file_numbers == nullptr || checksums == nullptr || + checksum_func_names == nullptr) { + return Status::InvalidArgument("Pointer has not been initiated"); + } + + for (auto i : checksum_map_) { + file_numbers->push_back(i.first); + checksums->push_back(i.second.first); + checksum_func_names->push_back(i.second.second); + } + return Status::OK(); +} + +Status FileChecksumListImpl::SearchOneFileChecksum( + uint64_t file_number, std::string* checksum, + std::string* checksum_func_name) { + if (checksum == nullptr || checksum_func_name == nullptr) { + return Status::InvalidArgument("Pointer has not been initiated"); + } + + auto it = checksum_map_.find(file_number); + if (it == checksum_map_.end()) { + return Status::NotFound(); + } else { + *checksum = it->second.first; + *checksum_func_name = it->second.second; + } + return Status::OK(); +} + +Status FileChecksumListImpl::InsertOneFileChecksum( + uint64_t file_number, const std::string& checksum, + const std::string& checksum_func_name) { + auto it = checksum_map_.find(file_number); + if (it == checksum_map_.end()) { + checksum_map_.insert(std::make_pair( + file_number, std::make_pair(checksum, checksum_func_name))); + } else { + it->second.first = checksum; + it->second.second = checksum_func_name; + } + return Status::OK(); +} + +Status FileChecksumListImpl::RemoveOneFileChecksum(uint64_t file_number) { + auto it = checksum_map_.find(file_number); + if (it == checksum_map_.end()) { + return Status::NotFound(); + } else { + checksum_map_.erase(it); + } + return Status::OK(); +} + +FileChecksumList* NewFileChecksumList() { + FileChecksumListImpl* checksum_list = new FileChecksumListImpl(); + return checksum_list; +} + +std::shared_ptr GetFileChecksumGenCrc32cFactory() { + static std::shared_ptr default_crc32c_gen_factory( + new FileChecksumGenCrc32cFactory()); + return default_crc32c_gen_factory; +} + +Status GetFileChecksumsFromManifest(Env* src_env, const std::string& abs_path, + uint64_t manifest_file_size, + FileChecksumList* checksum_list) { + if (checksum_list == nullptr) { + return Status::InvalidArgument("checksum_list is nullptr"); + } + assert(checksum_list); + checksum_list->reset(); + Status s; + + std::unique_ptr file_reader; + { + std::unique_ptr file; + const std::shared_ptr& fs = src_env->GetFileSystem(); + s = fs->NewSequentialFile(abs_path, + fs->OptimizeForManifestRead(FileOptions()), &file, + nullptr /* dbg */); + if (!s.ok()) { + return s; + } + file_reader.reset(new SequentialFileReader(std::move(file), abs_path)); + } + + struct LogReporter : public log::Reader::Reporter { + Status* status_ptr; + virtual void Corruption(size_t /*bytes*/, const Status& st) override { + if (status_ptr->ok()) { + *status_ptr = st; + } + } + } reporter; + reporter.status_ptr = &s; + log::Reader reader(nullptr, std::move(file_reader), &reporter, + true /* checksum */, 0 /* log_number */); + FileChecksumRetriever retriever(manifest_file_size, *checksum_list); + 
retriever.Iterate(reader, &s); + assert(!retriever.status().ok() || + manifest_file_size == std::numeric_limits::max() || + reader.LastRecordEnd() == manifest_file_size); + + return retriever.status(); +} + +#ifndef ROCKSDB_LITE +namespace { +static int RegisterFileChecksumGenFactories(ObjectLibrary& library, + const std::string& /*arg*/) { + library.AddFactory( + FileChecksumGenCrc32cFactory::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /* errmsg */) { + guard->reset(new FileChecksumGenCrc32cFactory()); + return guard->get(); + }); + return 1; +} +} // namespace +#endif // !ROCKSDB_LITE + +Status FileChecksumGenFactory::CreateFromString( + const ConfigOptions& options, const std::string& value, + std::shared_ptr* result) { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterFileChecksumGenFactories(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + if (value == FileChecksumGenCrc32cFactory::kClassName()) { + *result = GetFileChecksumGenCrc32cFactory(); + return Status::OK(); + } else { + Status s = LoadSharedObject(options, value, nullptr, + result); + return s; + } +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/file_checksum_helper.h b/src/rocksdb/util/file_checksum_helper.h new file mode 100644 index 000000000..d622e9bba --- /dev/null +++ b/src/rocksdb/util/file_checksum_helper.h @@ -0,0 +1,100 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include +#include + +#include "port/port.h" +#include "rocksdb/file_checksum.h" +#include "rocksdb/status.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/math.h" + +namespace ROCKSDB_NAMESPACE { + +// This is the class to generate the file checksum based on Crc32. 
It +// will be used as the default checksum method for SST file checksum +class FileChecksumGenCrc32c : public FileChecksumGenerator { + public: + FileChecksumGenCrc32c(const FileChecksumGenContext& /*context*/) { + checksum_ = 0; + } + + void Update(const char* data, size_t n) override { + checksum_ = crc32c::Extend(checksum_, data, n); + } + + void Finalize() override { + assert(checksum_str_.empty()); + // Store as big endian raw bytes + PutFixed32(&checksum_str_, EndianSwapValue(checksum_)); + } + + std::string GetChecksum() const override { + assert(!checksum_str_.empty()); + return checksum_str_; + } + + const char* Name() const override { return "FileChecksumCrc32c"; } + + private: + uint32_t checksum_; + std::string checksum_str_; +}; + +class FileChecksumGenCrc32cFactory : public FileChecksumGenFactory { + public: + std::unique_ptr CreateFileChecksumGenerator( + const FileChecksumGenContext& context) override { + if (context.requested_checksum_func_name.empty() || + context.requested_checksum_func_name == "FileChecksumCrc32c") { + return std::unique_ptr( + new FileChecksumGenCrc32c(context)); + } else { + return nullptr; + } + } + + static const char* kClassName() { return "FileChecksumGenCrc32cFactory"; } + const char* Name() const override { return kClassName(); } +}; + +// The default implementaion of FileChecksumList +class FileChecksumListImpl : public FileChecksumList { + public: + FileChecksumListImpl() {} + void reset() override; + + size_t size() const override; + + Status GetAllFileChecksums( + std::vector* file_numbers, std::vector* checksums, + std::vector* checksum_func_names) override; + + Status SearchOneFileChecksum(uint64_t file_number, std::string* checksum, + std::string* checksum_func_name) override; + + Status InsertOneFileChecksum(uint64_t file_number, + const std::string& checksum, + const std::string& checksum_func_name) override; + + Status RemoveOneFileChecksum(uint64_t file_number) override; + + private: + // Key is the file number, the first portion of the value is checksum, the + // second portion of the value is checksum function name. + std::unordered_map> + checksum_map_; +}; + +// If manifest_file_size < std::numeric_limits::max(), only use +// that length prefix of the manifest file. +Status GetFileChecksumsFromManifest(Env* src_env, const std::string& abs_path, + uint64_t manifest_file_size, + FileChecksumList* checksum_list); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/file_reader_writer_test.cc b/src/rocksdb/util/file_reader_writer_test.cc new file mode 100644 index 000000000..e778efc3c --- /dev/null +++ b/src/rocksdb/util/file_reader_writer_test.cc @@ -0,0 +1,1066 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
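+// Illustrative sketch of the typical writer lifecycle exercised by the tests
+// below (file name and payload are placeholders):
+//
+//   std::unique_ptr<WritableFileWriter> writer;
+//   Status s = WritableFileWriter::Create(FileSystem::Default(), "/tmp/f",
+//                                         FileOptions(), &writer,
+//                                         /*dbg=*/nullptr);
+//   if (s.ok()) s = writer->Append(Slice("payload"));
+//   if (s.ok()) s = writer->Flush();
+//   if (s.ok()) s = writer->Close();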
+// +#include +#include + +#include "db/db_test_util.h" +#include "env/mock_env.h" +#include "file/line_file_reader.h" +#include "file/random_access_file_reader.h" +#include "file/read_write_util.h" +#include "file/readahead_raf.h" +#include "file/sequence_file_reader.h" +#include "file/writable_file_writer.h" +#include "rocksdb/file_system.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/crc32c.h" +#include "util/random.h" +#include "utilities/fault_injection_fs.h" + +namespace ROCKSDB_NAMESPACE { + +class WritableFileWriterTest : public testing::Test {}; + +constexpr uint32_t kMb = static_cast(1) << 20; + +TEST_F(WritableFileWriterTest, RangeSync) { + class FakeWF : public FSWritableFile { + public: + explicit FakeWF() : size_(0), last_synced_(0) {} + ~FakeWF() override {} + + using FSWritableFile::Append; + IOStatus Append(const Slice& data, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + size_ += data.size(); + return IOStatus::OK(); + } + IOStatus Truncate(uint64_t /*size*/, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + EXPECT_GE(size_, last_synced_ + kMb); + EXPECT_LT(size_, last_synced_ + 2 * kMb); + // Make sure random writes generated enough writes. + EXPECT_GT(size_, 10 * kMb); + return IOStatus::OK(); + } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Sync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + void SetIOPriority(Env::IOPriority /*pri*/) override {} + uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return size_; + } + void GetPreallocationStatus(size_t* /*block_size*/, + size_t* /*last_allocated_block*/) override {} + size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const override { + return 0; + } + IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) override { + return IOStatus::OK(); + } + + protected: + IOStatus Allocate(uint64_t /*offset*/, uint64_t /*len*/, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus RangeSync(uint64_t offset, uint64_t nbytes, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + EXPECT_EQ(offset % 4096, 0u); + EXPECT_EQ(nbytes % 4096, 0u); + + EXPECT_EQ(offset, last_synced_); + last_synced_ = offset + nbytes; + EXPECT_GE(size_, last_synced_ + kMb); + if (size_ > 2 * kMb) { + EXPECT_LT(size_, last_synced_ + 2 * kMb); + } + return IOStatus::OK(); + } + + uint64_t size_; + uint64_t last_synced_; + }; + + EnvOptions env_options; + env_options.bytes_per_sync = kMb; + std::unique_ptr wf(new FakeWF); + std::unique_ptr writer( + new WritableFileWriter(std::move(wf), "" /* don't care */, env_options)); + Random r(301); + Status s; + std::unique_ptr large_buf(new char[10 * kMb]); + for (int i = 0; i < 1000; i++) { + int skew_limit = (i < 700) ? 10 : 15; + uint32_t num = r.Skewed(skew_limit) * 100 + r.Uniform(100); + s = writer->Append(Slice(large_buf.get(), num)); + ASSERT_OK(s); + + // Flush in a chance of 1/10. 
+ if (r.Uniform(10) == 0) { + s = writer->Flush(); + ASSERT_OK(s); + } + } + s = writer->Close(); + ASSERT_OK(s); +} + +TEST_F(WritableFileWriterTest, IncrementalBuffer) { + class FakeWF : public FSWritableFile { + public: + explicit FakeWF(std::string* _file_data, bool _use_direct_io, + bool _no_flush) + : file_data_(_file_data), + use_direct_io_(_use_direct_io), + no_flush_(_no_flush) {} + ~FakeWF() override {} + + using FSWritableFile::Append; + IOStatus Append(const Slice& data, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + file_data_->append(data.data(), data.size()); + size_ += data.size(); + return IOStatus::OK(); + } + using FSWritableFile::PositionedAppend; + IOStatus PositionedAppend(const Slice& data, uint64_t pos, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + EXPECT_TRUE(pos % 512 == 0); + EXPECT_TRUE(data.size() % 512 == 0); + file_data_->resize(pos); + file_data_->append(data.data(), data.size()); + size_ += data.size(); + return IOStatus::OK(); + } + + IOStatus Truncate(uint64_t size, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + file_data_->resize(size); + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Sync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Fsync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + void SetIOPriority(Env::IOPriority /*pri*/) override {} + uint64_t GetFileSize(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return size_; + } + void GetPreallocationStatus(size_t* /*block_size*/, + size_t* /*last_allocated_block*/) override {} + size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const override { + return 0; + } + IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) override { + return IOStatus::OK(); + } + bool use_direct_io() const override { return use_direct_io_; } + + std::string* file_data_; + bool use_direct_io_; + bool no_flush_; + size_t size_ = 0; + }; + + Random r(301); + const int kNumAttempts = 50; + for (int attempt = 0; attempt < kNumAttempts; attempt++) { + bool no_flush = (attempt % 3 == 0); + EnvOptions env_options; + env_options.writable_file_max_buffer_size = + (attempt < kNumAttempts / 2) ? 512 * 1024 : 700 * 1024; + std::string actual; + std::unique_ptr wf(new FakeWF(&actual, +#ifndef ROCKSDB_LITE + attempt % 2 == 1, +#else + false, +#endif + no_flush)); + std::unique_ptr writer(new WritableFileWriter( + std::move(wf), "" /* don't care */, env_options)); + + std::string target; + for (int i = 0; i < 20; i++) { + uint32_t num = r.Skewed(16) * 100 + r.Uniform(100); + std::string random_string = r.RandomString(num); + ASSERT_OK(writer->Append(Slice(random_string.c_str(), num))); + target.append(random_string.c_str(), num); + + // In some attempts, flush in a chance of 1/10. 
+ if (!no_flush && r.Uniform(10) == 0) { + ASSERT_OK(writer->Flush()); + } + } + ASSERT_OK(writer->Flush()); + ASSERT_OK(writer->Close()); + ASSERT_EQ(target.size(), actual.size()); + ASSERT_EQ(target, actual); + } +} + +TEST_F(WritableFileWriterTest, BufferWithZeroCapacityDirectIO) { + EnvOptions env_opts; + env_opts.use_direct_writes = true; + env_opts.writable_file_max_buffer_size = 0; + { + std::unique_ptr writer; + const Status s = + WritableFileWriter::Create(FileSystem::Default(), /*fname=*/"dont_care", + FileOptions(env_opts), &writer, + /*dbg=*/nullptr); + ASSERT_TRUE(s.IsInvalidArgument()); + } +} + +class DBWritableFileWriterTest : public DBTestBase { + public: + DBWritableFileWriterTest() + : DBTestBase("db_secondary_cache_test", /*env_do_fsync=*/true) { + fault_fs_.reset(new FaultInjectionTestFS(env_->GetFileSystem())); + fault_env_.reset(new CompositeEnvWrapper(env_, fault_fs_)); + } + + std::shared_ptr fault_fs_; + std::unique_ptr fault_env_; +}; + +TEST_F(DBWritableFileWriterTest, AppendWithChecksum) { + FileOptions file_options = FileOptions(); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); + std::string fname = dbname_ + "/test_file"; + std::unique_ptr writable_file_ptr; + ASSERT_OK(fault_fs_->NewWritableFile(fname, file_options, &writable_file_ptr, + /*dbg*/ nullptr)); + std::unique_ptr file; + file.reset(new TestFSWritableFile( + fname, file_options, std::move(writable_file_ptr), fault_fs_.get())); + std::unique_ptr file_writer; + ImmutableOptions ioptions(options); + file_writer.reset(new WritableFileWriter( + std::move(file), fname, file_options, SystemClock::Default().get(), + nullptr, ioptions.stats, ioptions.listeners, + ioptions.file_checksum_gen_factory.get(), true, true)); + + Random rnd(301); + std::string data = rnd.RandomString(1000); + uint32_t data_crc32c = crc32c::Value(data.c_str(), data.size()); + fault_fs_->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + + ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c)); + ASSERT_OK(file_writer->Flush()); + Random size_r(47); + for (int i = 0; i < 2000; i++) { + data = rnd.RandomString((static_cast(size_r.Next()) % 10000)); + data_crc32c = crc32c::Value(data.c_str(), data.size()); + ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c)); + + data = rnd.RandomString((static_cast(size_r.Next()) % 97)); + ASSERT_OK(file_writer->Append(Slice(data.c_str()))); + ASSERT_OK(file_writer->Flush()); + } + ASSERT_OK(file_writer->Close()); + Destroy(options); +} + +TEST_F(DBWritableFileWriterTest, AppendVerifyNoChecksum) { + FileOptions file_options = FileOptions(); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); + std::string fname = dbname_ + "/test_file"; + std::unique_ptr writable_file_ptr; + ASSERT_OK(fault_fs_->NewWritableFile(fname, file_options, &writable_file_ptr, + /*dbg*/ nullptr)); + std::unique_ptr file; + file.reset(new TestFSWritableFile( + fname, file_options, std::move(writable_file_ptr), fault_fs_.get())); + std::unique_ptr file_writer; + ImmutableOptions ioptions(options); + // Enable checksum handoff for this file, but do not enable buffer checksum. 
+ // So Append with checksum logic will not be triggered + file_writer.reset(new WritableFileWriter( + std::move(file), fname, file_options, SystemClock::Default().get(), + nullptr, ioptions.stats, ioptions.listeners, + ioptions.file_checksum_gen_factory.get(), true, false)); + + Random rnd(301); + std::string data = rnd.RandomString(1000); + uint32_t data_crc32c = crc32c::Value(data.c_str(), data.size()); + fault_fs_->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + + ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c)); + ASSERT_OK(file_writer->Flush()); + Random size_r(47); + for (int i = 0; i < 1000; i++) { + data = rnd.RandomString((static_cast(size_r.Next()) % 10000)); + data_crc32c = crc32c::Value(data.c_str(), data.size()); + ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c)); + + data = rnd.RandomString((static_cast(size_r.Next()) % 97)); + ASSERT_OK(file_writer->Append(Slice(data.c_str()))); + ASSERT_OK(file_writer->Flush()); + } + ASSERT_OK(file_writer->Close()); + Destroy(options); +} + +TEST_F(DBWritableFileWriterTest, AppendWithChecksumRateLimiter) { + FileOptions file_options = FileOptions(); + file_options.rate_limiter = nullptr; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); + std::string fname = dbname_ + "/test_file"; + std::unique_ptr writable_file_ptr; + ASSERT_OK(fault_fs_->NewWritableFile(fname, file_options, &writable_file_ptr, + /*dbg*/ nullptr)); + std::unique_ptr file; + file.reset(new TestFSWritableFile( + fname, file_options, std::move(writable_file_ptr), fault_fs_.get())); + std::unique_ptr file_writer; + ImmutableOptions ioptions(options); + // Enable checksum handoff for this file, but do not enable buffer checksum. + // So Append with checksum logic will not be triggered + file_writer.reset(new WritableFileWriter( + std::move(file), fname, file_options, SystemClock::Default().get(), + nullptr, ioptions.stats, ioptions.listeners, + ioptions.file_checksum_gen_factory.get(), true, true)); + fault_fs_->SetChecksumHandoffFuncType(ChecksumType::kCRC32c); + + Random rnd(301); + std::string data; + uint32_t data_crc32c; + uint64_t start = fault_env_->NowMicros(); + Random size_r(47); + uint64_t bytes_written = 0; + for (int i = 0; i < 100; i++) { + data = rnd.RandomString((static_cast(size_r.Next()) % 10000)); + data_crc32c = crc32c::Value(data.c_str(), data.size()); + ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c)); + bytes_written += static_cast(data.size()); + + data = rnd.RandomString((static_cast(size_r.Next()) % 97)); + ASSERT_OK(file_writer->Append(Slice(data.c_str()))); + ASSERT_OK(file_writer->Flush()); + bytes_written += static_cast(data.size()); + } + uint64_t elapsed = fault_env_->NowMicros() - start; + double raw_rate = bytes_written * 1000000.0 / elapsed; + ASSERT_OK(file_writer->Close()); + + // Set the rate-limiter + FileOptions file_options1 = FileOptions(); + file_options1.rate_limiter = + NewGenericRateLimiter(static_cast(0.5 * raw_rate)); + fname = dbname_ + "/test_file_1"; + std::unique_ptr writable_file_ptr1; + ASSERT_OK(fault_fs_->NewWritableFile(fname, file_options1, + &writable_file_ptr1, + /*dbg*/ nullptr)); + file.reset(new TestFSWritableFile( + fname, file_options1, std::move(writable_file_ptr1), fault_fs_.get())); + // Enable checksum handoff for this file, but do not enable buffer checksum. 
+ // So Append with checksum logic will not be triggered + file_writer.reset(new WritableFileWriter( + std::move(file), fname, file_options1, SystemClock::Default().get(), + nullptr, ioptions.stats, ioptions.listeners, + ioptions.file_checksum_gen_factory.get(), true, true)); + + for (int i = 0; i < 1000; i++) { + data = rnd.RandomString((static_cast(size_r.Next()) % 10000)); + data_crc32c = crc32c::Value(data.c_str(), data.size()); + ASSERT_OK(file_writer->Append(Slice(data.c_str()), data_crc32c)); + + data = rnd.RandomString((static_cast(size_r.Next()) % 97)); + ASSERT_OK(file_writer->Append(Slice(data.c_str()))); + ASSERT_OK(file_writer->Flush()); + } + ASSERT_OK(file_writer->Close()); + if (file_options1.rate_limiter != nullptr) { + delete file_options1.rate_limiter; + } + + Destroy(options); +} + +#ifndef ROCKSDB_LITE +TEST_F(WritableFileWriterTest, AppendStatusReturn) { + class FakeWF : public FSWritableFile { + public: + explicit FakeWF() : use_direct_io_(false), io_error_(false) {} + + bool use_direct_io() const override { return use_direct_io_; } + + using FSWritableFile::Append; + IOStatus Append(const Slice& /*data*/, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + if (io_error_) { + return IOStatus::IOError("Fake IO error"); + } + return IOStatus::OK(); + } + using FSWritableFile::PositionedAppend; + IOStatus PositionedAppend(const Slice& /*data*/, uint64_t, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + if (io_error_) { + return IOStatus::IOError("Fake IO error"); + } + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Sync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + void Setuse_direct_io(bool val) { use_direct_io_ = val; } + void SetIOError(bool val) { io_error_ = val; } + + protected: + bool use_direct_io_; + bool io_error_; + }; + std::unique_ptr wf(new FakeWF()); + wf->Setuse_direct_io(true); + std::unique_ptr writer( + new WritableFileWriter(std::move(wf), "" /* don't care */, EnvOptions())); + + ASSERT_OK(writer->Append(std::string(2 * kMb, 'a'))); + + // Next call to WritableFile::Append() should fail + FakeWF* fwf = static_cast(writer->writable_file()); + fwf->SetIOError(true); + ASSERT_NOK(writer->Append(std::string(2 * kMb, 'b'))); +} +#endif + +class ReadaheadRandomAccessFileTest + : public testing::Test, + public testing::WithParamInterface { + public: + static std::vector GetReadaheadSizeList() { + return {1lu << 12, 1lu << 16}; + } + void SetUp() override { + readahead_size_ = GetParam(); + scratch_.reset(new char[2 * readahead_size_]); + ResetSourceStr(); + } + ReadaheadRandomAccessFileTest() : control_contents_() {} + std::string Read(uint64_t offset, size_t n) { + Slice result; + Status s = test_read_holder_->Read(offset, n, IOOptions(), &result, + scratch_.get(), nullptr); + EXPECT_TRUE(s.ok() || s.IsInvalidArgument()); + return std::string(result.data(), result.size()); + } + void ResetSourceStr(const std::string& str = "") { + std::unique_ptr sink( + new test::StringSink(&control_contents_)); + std::unique_ptr write_holder(new WritableFileWriter( + std::move(sink), "" /* don't care */, FileOptions())); + Status s = write_holder->Append(Slice(str)); + EXPECT_OK(s); + s = write_holder->Flush(); + EXPECT_OK(s); + std::unique_ptr read_holder( + new 
test::StringSource(control_contents_)); + test_read_holder_ = + NewReadaheadRandomAccessFile(std::move(read_holder), readahead_size_); + } + size_t GetReadaheadSize() const { return readahead_size_; } + + private: + size_t readahead_size_; + Slice control_contents_; + std::unique_ptr test_read_holder_; + std::unique_ptr scratch_; +}; + +TEST_P(ReadaheadRandomAccessFileTest, EmptySourceStr) { + ASSERT_EQ("", Read(0, 1)); + ASSERT_EQ("", Read(0, 0)); + ASSERT_EQ("", Read(13, 13)); +} + +TEST_P(ReadaheadRandomAccessFileTest, SourceStrLenLessThanReadaheadSize) { + std::string str = "abcdefghijklmnopqrs"; + ResetSourceStr(str); + ASSERT_EQ(str.substr(3, 4), Read(3, 4)); + ASSERT_EQ(str.substr(0, 3), Read(0, 3)); + ASSERT_EQ(str, Read(0, str.size())); + ASSERT_EQ(str.substr(7, std::min(static_cast(str.size()) - 7, 30)), + Read(7, 30)); + ASSERT_EQ("", Read(100, 100)); +} + +TEST_P(ReadaheadRandomAccessFileTest, SourceStrLenGreaterThanReadaheadSize) { + Random rng(42); + for (int k = 0; k < 100; ++k) { + size_t strLen = k * GetReadaheadSize() + + rng.Uniform(static_cast(GetReadaheadSize())); + std::string str = rng.HumanReadableString(static_cast(strLen)); + ResetSourceStr(str); + for (int test = 1; test <= 100; ++test) { + size_t offset = rng.Uniform(static_cast(strLen)); + size_t n = rng.Uniform(static_cast(GetReadaheadSize())); + ASSERT_EQ(str.substr(offset, std::min(n, strLen - offset)), + Read(offset, n)); + } + } +} + +TEST_P(ReadaheadRandomAccessFileTest, ReadExceedsReadaheadSize) { + Random rng(7); + size_t strLen = 4 * GetReadaheadSize() + + rng.Uniform(static_cast(GetReadaheadSize())); + std::string str = rng.HumanReadableString(static_cast(strLen)); + ResetSourceStr(str); + for (int test = 1; test <= 100; ++test) { + size_t offset = rng.Uniform(static_cast(strLen)); + size_t n = + GetReadaheadSize() + rng.Uniform(static_cast(GetReadaheadSize())); + ASSERT_EQ(str.substr(offset, std::min(n, strLen - offset)), + Read(offset, n)); + } +} + +INSTANTIATE_TEST_CASE_P( + EmptySourceStr, ReadaheadRandomAccessFileTest, + ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList())); +INSTANTIATE_TEST_CASE_P( + SourceStrLenLessThanReadaheadSize, ReadaheadRandomAccessFileTest, + ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList())); +INSTANTIATE_TEST_CASE_P( + SourceStrLenGreaterThanReadaheadSize, ReadaheadRandomAccessFileTest, + ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList())); +INSTANTIATE_TEST_CASE_P( + ReadExceedsReadaheadSize, ReadaheadRandomAccessFileTest, + ::testing::ValuesIn(ReadaheadRandomAccessFileTest::GetReadaheadSizeList())); + +class ReadaheadSequentialFileTest : public testing::Test, + public testing::WithParamInterface { + public: + static std::vector GetReadaheadSizeList() { + return {1lu << 8, 1lu << 12, 1lu << 16, 1lu << 18}; + } + void SetUp() override { + readahead_size_ = GetParam(); + scratch_.reset(new char[2 * readahead_size_]); + ResetSourceStr(); + } + ReadaheadSequentialFileTest() {} + std::string Read(size_t n) { + Slice result; + Status s = test_read_holder_->Read( + n, &result, scratch_.get(), Env::IO_TOTAL /* rate_limiter_priority*/); + EXPECT_TRUE(s.ok() || s.IsInvalidArgument()); + return std::string(result.data(), result.size()); + } + void Skip(size_t n) { test_read_holder_->Skip(n); } + void ResetSourceStr(const std::string& str = "") { + auto read_holder = std::unique_ptr( + new test::SeqStringSource(str, &seq_read_count_)); + test_read_holder_.reset(new 
SequentialFileReader(std::move(read_holder), + "test", readahead_size_)); + } + size_t GetReadaheadSize() const { return readahead_size_; } + + private: + size_t readahead_size_; + std::unique_ptr test_read_holder_; + std::unique_ptr scratch_; + std::atomic seq_read_count_; +}; + +TEST_P(ReadaheadSequentialFileTest, EmptySourceStr) { + ASSERT_EQ("", Read(0)); + ASSERT_EQ("", Read(1)); + ASSERT_EQ("", Read(13)); +} + +TEST_P(ReadaheadSequentialFileTest, SourceStrLenLessThanReadaheadSize) { + std::string str = "abcdefghijklmnopqrs"; + ResetSourceStr(str); + ASSERT_EQ(str.substr(0, 3), Read(3)); + ASSERT_EQ(str.substr(3, 1), Read(1)); + ASSERT_EQ(str.substr(4), Read(str.size())); + ASSERT_EQ("", Read(100)); +} + +TEST_P(ReadaheadSequentialFileTest, SourceStrLenGreaterThanReadaheadSize) { + Random rng(42); + for (int s = 0; s < 1; ++s) { + for (int k = 0; k < 100; ++k) { + size_t strLen = k * GetReadaheadSize() + + rng.Uniform(static_cast(GetReadaheadSize())); + std::string str = rng.HumanReadableString(static_cast(strLen)); + ResetSourceStr(str); + size_t offset = 0; + for (int test = 1; test <= 100; ++test) { + size_t n = rng.Uniform(static_cast(GetReadaheadSize())); + if (s && test % 2) { + Skip(n); + } else { + ASSERT_EQ(str.substr(offset, std::min(n, strLen - offset)), Read(n)); + } + offset = std::min(offset + n, strLen); + } + } + } +} + +TEST_P(ReadaheadSequentialFileTest, ReadExceedsReadaheadSize) { + Random rng(42); + for (int s = 0; s < 1; ++s) { + for (int k = 0; k < 100; ++k) { + size_t strLen = k * GetReadaheadSize() + + rng.Uniform(static_cast(GetReadaheadSize())); + std::string str = rng.HumanReadableString(static_cast(strLen)); + ResetSourceStr(str); + size_t offset = 0; + for (int test = 1; test <= 100; ++test) { + size_t n = GetReadaheadSize() + + rng.Uniform(static_cast(GetReadaheadSize())); + if (s && test % 2) { + Skip(n); + } else { + ASSERT_EQ(str.substr(offset, std::min(n, strLen - offset)), Read(n)); + } + offset = std::min(offset + n, strLen); + } + } + } +} + +INSTANTIATE_TEST_CASE_P( + EmptySourceStr, ReadaheadSequentialFileTest, + ::testing::ValuesIn(ReadaheadSequentialFileTest::GetReadaheadSizeList())); +INSTANTIATE_TEST_CASE_P( + SourceStrLenLessThanReadaheadSize, ReadaheadSequentialFileTest, + ::testing::ValuesIn(ReadaheadSequentialFileTest::GetReadaheadSizeList())); +INSTANTIATE_TEST_CASE_P( + SourceStrLenGreaterThanReadaheadSize, ReadaheadSequentialFileTest, + ::testing::ValuesIn(ReadaheadSequentialFileTest::GetReadaheadSizeList())); +INSTANTIATE_TEST_CASE_P( + ReadExceedsReadaheadSize, ReadaheadSequentialFileTest, + ::testing::ValuesIn(ReadaheadSequentialFileTest::GetReadaheadSizeList())); + +namespace { +std::string GenerateLine(int n) { + std::string rv; + // Multiples of 17 characters per line, for likely bad buffer alignment + for (int i = 0; i < n; ++i) { + rv.push_back(static_cast('0' + (i % 10))); + rv.append("xxxxxxxxxxxxxxxx"); + } + return rv; +} +} // namespace + +TEST(LineFileReaderTest, LineFileReaderTest) { + const int nlines = 1000; + + std::unique_ptr mem_env(MockEnv::Create(Env::Default())); + std::shared_ptr fs = mem_env->GetFileSystem(); + // Create an input file + { + std::unique_ptr file; + ASSERT_OK( + fs->NewWritableFile("testfile", FileOptions(), &file, /*dbg*/ nullptr)); + + for (int i = 0; i < nlines; ++i) { + std::string line = GenerateLine(i); + line.push_back('\n'); + ASSERT_OK(file->Append(line, IOOptions(), /*dbg*/ nullptr)); + } + } + + // Verify with no I/O errors + { + std::unique_ptr reader; + 
ASSERT_OK(LineFileReader::Create(fs, "testfile", FileOptions(), &reader, + nullptr /* dbg */, + nullptr /* rate_limiter */)); + std::string line; + int count = 0; + while (reader->ReadLine(&line, Env::IO_TOTAL /* rate_limiter_priority */)) { + ASSERT_EQ(line, GenerateLine(count)); + ++count; + ASSERT_EQ(static_cast(reader->GetLineNumber()), count); + } + ASSERT_OK(reader->GetStatus()); + ASSERT_EQ(count, nlines); + ASSERT_EQ(static_cast(reader->GetLineNumber()), count); + // And still + ASSERT_FALSE( + reader->ReadLine(&line, Env::IO_TOTAL /* rate_limiter_priority */)); + ASSERT_OK(reader->GetStatus()); + ASSERT_EQ(static_cast(reader->GetLineNumber()), count); + } + + // Verify with injected I/O error + { + std::unique_ptr reader; + ASSERT_OK(LineFileReader::Create(fs, "testfile", FileOptions(), &reader, + nullptr /* dbg */, + nullptr /* rate_limiter */)); + std::string line; + int count = 0; + // Read part way through the file + while (count < nlines / 4) { + ASSERT_TRUE( + reader->ReadLine(&line, Env::IO_TOTAL /* rate_limiter_priority */)); + ASSERT_EQ(line, GenerateLine(count)); + ++count; + ASSERT_EQ(static_cast(reader->GetLineNumber()), count); + } + ASSERT_OK(reader->GetStatus()); + + // Inject error + int callback_count = 0; + SyncPoint::GetInstance()->SetCallBack( + "MemFile::Read:IOStatus", [&](void* arg) { + IOStatus* status = static_cast(arg); + *status = IOStatus::Corruption("test"); + ++callback_count; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + while (reader->ReadLine(&line, Env::IO_TOTAL /* rate_limiter_priority */)) { + ASSERT_EQ(line, GenerateLine(count)); + ++count; + ASSERT_EQ(static_cast(reader->GetLineNumber()), count); + } + ASSERT_TRUE(reader->GetStatus().IsCorruption()); + ASSERT_LT(count, nlines / 2); + ASSERT_EQ(callback_count, 1); + + // Still get error & no retry + ASSERT_FALSE( + reader->ReadLine(&line, Env::IO_TOTAL /* rate_limiter_priority */)); + ASSERT_TRUE(reader->GetStatus().IsCorruption()); + ASSERT_EQ(callback_count, 1); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + } +} + +#ifndef ROCKSDB_LITE +class IOErrorEventListener : public EventListener { + public: + IOErrorEventListener() { notify_error_.store(0); } + + void OnIOError(const IOErrorInfo& io_error_info) override { + notify_error_++; + EXPECT_FALSE(io_error_info.file_path.empty()); + EXPECT_FALSE(io_error_info.io_status.ok()); + } + + size_t NotifyErrorCount() { return notify_error_; } + + bool ShouldBeNotifiedOnFileIO() override { return true; } + + private: + std::atomic notify_error_; +}; + +TEST_F(DBWritableFileWriterTest, IOErrorNotification) { + class FakeWF : public FSWritableFile { + public: + explicit FakeWF() : io_error_(false) { + file_append_errors_.store(0); + file_flush_errors_.store(0); + } + + using FSWritableFile::Append; + IOStatus Append(const Slice& /*data*/, const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + if (io_error_) { + file_append_errors_++; + return IOStatus::IOError("Fake IO error"); + } + return IOStatus::OK(); + } + + using FSWritableFile::PositionedAppend; + IOStatus PositionedAppend(const Slice& /*data*/, uint64_t, + const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + if (io_error_) { + return IOStatus::IOError("Fake IO error"); + } + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + IOStatus Flush(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + 
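+      // Mirror Append above: count the simulated failure and return a fake
+      // I/O error so the writer's listener-notification path is exercised.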
if (io_error_) { + file_flush_errors_++; + return IOStatus::IOError("Fake IO error"); + } + return IOStatus::OK(); + } + IOStatus Sync(const IOOptions& /*options*/, + IODebugContext* /*dbg*/) override { + return IOStatus::OK(); + } + + void SetIOError(bool val) { io_error_ = val; } + + void CheckCounters(int file_append_errors, int file_flush_errors) { + ASSERT_EQ(file_append_errors, file_append_errors_); + ASSERT_EQ(file_flush_errors_, file_flush_errors); + } + + protected: + bool io_error_; + std::atomic file_append_errors_; + std::atomic file_flush_errors_; + }; + + FileOptions file_options = FileOptions(); + Options options = GetDefaultOptions(); + options.create_if_missing = true; + IOErrorEventListener* listener = new IOErrorEventListener(); + options.listeners.emplace_back(listener); + + DestroyAndReopen(options); + ImmutableOptions ioptions(options); + + std::string fname = dbname_ + "/test_file"; + std::unique_ptr writable_file_ptr(new FakeWF); + + std::unique_ptr file_writer; + writable_file_ptr->SetIOError(true); + + file_writer.reset(new WritableFileWriter( + std::move(writable_file_ptr), fname, file_options, + SystemClock::Default().get(), nullptr, ioptions.stats, ioptions.listeners, + ioptions.file_checksum_gen_factory.get(), true, true)); + + FakeWF* fwf = static_cast(file_writer->writable_file()); + + fwf->SetIOError(true); + ASSERT_NOK(file_writer->Append(std::string(2 * kMb, 'a'))); + fwf->CheckCounters(1, 0); + ASSERT_EQ(listener->NotifyErrorCount(), 1); + + file_writer->reset_seen_error(); + fwf->SetIOError(true); + ASSERT_NOK(file_writer->Flush()); + fwf->CheckCounters(1, 1); + ASSERT_EQ(listener->NotifyErrorCount(), 2); + + /* No error generation */ + file_writer->reset_seen_error(); + fwf->SetIOError(false); + ASSERT_OK(file_writer->Append(std::string(2 * kMb, 'b'))); + ASSERT_EQ(listener->NotifyErrorCount(), 2); + fwf->CheckCounters(1, 1); +} +#endif // ROCKSDB_LITE + +class WritableFileWriterIOPriorityTest : public testing::Test { + protected: + // This test is to check whether the rate limiter priority can be passed + // correctly from WritableFileWriter functions to FSWritableFile functions. + + void SetUp() override { + // When op_rate_limiter_priority parameter in WritableFileWriter functions + // is the default (Env::IO_TOTAL). 
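+    // Because FakeWF below is constructed with Env::IO_HIGH, every call that
+    // reaches FSWritableFile should see IOOptions::rate_limiter_priority ==
+    // Env::IO_HIGH even though the writer-level default stays Env::IO_TOTAL;
+    // FakeWF's overrides assert exactly that.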
+ std::unique_ptr wf{new FakeWF(Env::IO_HIGH)}; + FileOptions file_options; + writer_.reset(new WritableFileWriter(std::move(wf), "" /* don't care */, + file_options)); + } + + class FakeWF : public FSWritableFile { + public: + explicit FakeWF(Env::IOPriority io_priority) { SetIOPriority(io_priority); } + ~FakeWF() override {} + + IOStatus Append(const Slice& /*data*/, const IOOptions& options, + IODebugContext* /*dbg*/) override { + EXPECT_EQ(options.rate_limiter_priority, io_priority_); + return IOStatus::OK(); + } + IOStatus Append(const Slice& data, const IOOptions& options, + const DataVerificationInfo& /* verification_info */, + IODebugContext* dbg) override { + return Append(data, options, dbg); + } + IOStatus PositionedAppend(const Slice& /*data*/, uint64_t /*offset*/, + const IOOptions& options, + IODebugContext* /*dbg*/) override { + EXPECT_EQ(options.rate_limiter_priority, io_priority_); + return IOStatus::OK(); + } + IOStatus PositionedAppend( + const Slice& /* data */, uint64_t /* offset */, + const IOOptions& options, + const DataVerificationInfo& /* verification_info */, + IODebugContext* /*dbg*/) override { + EXPECT_EQ(options.rate_limiter_priority, io_priority_); + return IOStatus::OK(); + } + IOStatus Truncate(uint64_t /*size*/, const IOOptions& options, + IODebugContext* /*dbg*/) override { + EXPECT_EQ(options.rate_limiter_priority, io_priority_); + return IOStatus::OK(); + } + IOStatus Close(const IOOptions& options, IODebugContext* /*dbg*/) override { + EXPECT_EQ(options.rate_limiter_priority, io_priority_); + return IOStatus::OK(); + } + IOStatus Flush(const IOOptions& options, IODebugContext* /*dbg*/) override { + EXPECT_EQ(options.rate_limiter_priority, io_priority_); + return IOStatus::OK(); + } + IOStatus Sync(const IOOptions& options, IODebugContext* /*dbg*/) override { + EXPECT_EQ(options.rate_limiter_priority, io_priority_); + return IOStatus::OK(); + } + IOStatus Fsync(const IOOptions& options, IODebugContext* /*dbg*/) override { + EXPECT_EQ(options.rate_limiter_priority, io_priority_); + return IOStatus::OK(); + } + uint64_t GetFileSize(const IOOptions& options, + IODebugContext* /*dbg*/) override { + EXPECT_EQ(options.rate_limiter_priority, io_priority_); + return 0; + } + void GetPreallocationStatus(size_t* /*block_size*/, + size_t* /*last_allocated_block*/) override {} + size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const override { + return 0; + } + IOStatus InvalidateCache(size_t /*offset*/, size_t /*length*/) override { + return IOStatus::OK(); + } + + IOStatus Allocate(uint64_t /*offset*/, uint64_t /*len*/, + const IOOptions& options, + IODebugContext* /*dbg*/) override { + EXPECT_EQ(options.rate_limiter_priority, io_priority_); + return IOStatus::OK(); + } + IOStatus RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/, + const IOOptions& options, + IODebugContext* /*dbg*/) override { + EXPECT_EQ(options.rate_limiter_priority, io_priority_); + return IOStatus::OK(); + } + + void PrepareWrite(size_t /*offset*/, size_t /*len*/, + const IOOptions& options, + IODebugContext* /*dbg*/) override { + EXPECT_EQ(options.rate_limiter_priority, io_priority_); + } + + bool IsSyncThreadSafe() const override { return true; } + }; + + std::unique_ptr writer_; +}; + +TEST_F(WritableFileWriterIOPriorityTest, Append) { + ASSERT_OK(writer_->Append(Slice("abc"))); +} + +TEST_F(WritableFileWriterIOPriorityTest, Pad) { ASSERT_OK(writer_->Pad(500)); } + +TEST_F(WritableFileWriterIOPriorityTest, Flush) { ASSERT_OK(writer_->Flush()); } + 
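For readers skimming these tests, the pattern they verify can be reduced to a small self-contained sketch. The enum, struct, and EffectivePriority helper below are hypothetical stand-ins invented for illustration, not RocksDB's actual WritableFileWriter internals: when no per-operation priority is specified, the priority configured on the file itself is what ends up in the options passed down to the file system.

    #include <cassert>

    // Hypothetical stand-ins for Env::IOPriority and IOOptions.
    enum class IOPriority { kTotal /* "unspecified" */, kLow, kMid, kHigh };

    struct IOOptionsSketch {
      IOPriority rate_limiter_priority = IOPriority::kTotal;
    };

    // If the caller did not pick a per-operation priority (kTotal), fall back
    // to the priority configured on the file itself.
    IOPriority EffectivePriority(IOPriority file_priority,
                                 IOPriority op_priority) {
      return op_priority == IOPriority::kTotal ? file_priority : op_priority;
    }

    int main() {
      IOOptionsSketch opts;
      opts.rate_limiter_priority =
          EffectivePriority(/*file_priority=*/IOPriority::kHigh,
                            /*op_priority=*/IOPriority::kTotal);
      // Mirrors what the FakeWF EXPECT_EQ checks above assert: the file's
      // Env::IO_HIGH wins when the writer is called with the default priority.
      assert(opts.rate_limiter_priority == IOPriority::kHigh);
      return 0;
    }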
+TEST_F(WritableFileWriterIOPriorityTest, Close) { ASSERT_OK(writer_->Close()); } + +TEST_F(WritableFileWriterIOPriorityTest, Sync) { + ASSERT_OK(writer_->Sync(false)); + ASSERT_OK(writer_->Sync(true)); +} + +TEST_F(WritableFileWriterIOPriorityTest, SyncWithoutFlush) { + ASSERT_OK(writer_->SyncWithoutFlush(false)); + ASSERT_OK(writer_->SyncWithoutFlush(true)); +} + +TEST_F(WritableFileWriterIOPriorityTest, BasicOp) { + EnvOptions env_options; + env_options.bytes_per_sync = kMb; + std::unique_ptr wf(new FakeWF(Env::IO_HIGH)); + std::unique_ptr writer( + new WritableFileWriter(std::move(wf), "" /* don't care */, env_options)); + Random r(301); + Status s; + std::unique_ptr large_buf(new char[10 * kMb]); + for (int i = 0; i < 1000; i++) { + int skew_limit = (i < 700) ? 10 : 15; + uint32_t num = r.Skewed(skew_limit) * 100 + r.Uniform(100); + s = writer->Append(Slice(large_buf.get(), num)); + ASSERT_OK(s); + + // Flush in a chance of 1/10. + if (r.Uniform(10) == 0) { + s = writer->Flush(); + ASSERT_OK(s); + } + } + s = writer->Close(); + ASSERT_OK(s); +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/filelock_test.cc b/src/rocksdb/util/filelock_test.cc new file mode 100644 index 000000000..69947a732 --- /dev/null +++ b/src/rocksdb/util/filelock_test.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include + +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#ifdef __FreeBSD__ +#include +#include +#endif +#include + +#include "test_util/testharness.h" +#include "util/coding.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +class LockTest : public testing::Test { + public: + static LockTest* current_; + std::string file_; + ROCKSDB_NAMESPACE::Env* env_; + + LockTest() + : file_(test::PerThreadDBPath("db_testlock_file")), + env_(ROCKSDB_NAMESPACE::Env::Default()) { + current_ = this; + } + + ~LockTest() override {} + + Status LockFile(FileLock** db_lock) { return env_->LockFile(file_, db_lock); } + + Status UnlockFile(FileLock* db_lock) { return env_->UnlockFile(db_lock); } + + bool AssertFileIsLocked() { + return CheckFileLock(/* lock_expected = */ true); + } + + bool AssertFileIsNotLocked() { + return CheckFileLock(/* lock_expected = */ false); + } + + bool CheckFileLock(bool lock_expected) { + // We need to fork to check the fcntl lock as we need + // to open and close the file from a different process + // to avoid either releasing the lock on close, or not + // contending for it when requesting a lock. + +#ifdef OS_WIN + + // WaitForSingleObject and GetExitCodeProcess can do what waitpid does. 
+ // TODO - implement on Windows + return true; + +#else + + pid_t pid = fork(); + if (0 == pid) { + // child process + int exit_val = EXIT_FAILURE; + int fd = open(file_.c_str(), O_RDWR | O_CREAT, 0644); + if (fd < 0) { + // could not open file, could not check if it was locked + fprintf(stderr, "Open on on file %s failed.\n", file_.c_str()); + exit(exit_val); + } + + struct flock f; + memset(&f, 0, sizeof(f)); + f.l_type = (F_WRLCK); + f.l_whence = SEEK_SET; + f.l_start = 0; + f.l_len = 0; // Lock/unlock entire file + int value = fcntl(fd, F_SETLK, &f); + if (value == -1) { + if (lock_expected) { + exit_val = EXIT_SUCCESS; + } + } else { + if (!lock_expected) { + exit_val = EXIT_SUCCESS; + } + } + close(fd); // lock is released for child process + exit(exit_val); + } else if (pid > 0) { + // parent process + int status; + while (-1 == waitpid(pid, &status, 0)) + ; + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + // child process exited with non success status + return false; + } else { + return true; + } + } else { + fprintf(stderr, "Fork failed\n"); + return false; + } + return false; + +#endif + } +}; +LockTest* LockTest::current_; + +TEST_F(LockTest, LockBySameThread) { + FileLock* lock1; + FileLock* lock2; + + // acquire a lock on a file + ASSERT_OK(LockFile(&lock1)); + + // check the file is locked + ASSERT_TRUE(AssertFileIsLocked()); + + // re-acquire the lock on the same file. This should fail. + Status s = LockFile(&lock2); + ASSERT_TRUE(s.IsIOError()); +#ifndef OS_WIN + // Validate that error message contains current thread ID. + ASSERT_TRUE(s.ToString().find(std::to_string( + Env::Default()->GetThreadID())) != std::string::npos); +#endif + + // check the file is locked + ASSERT_TRUE(AssertFileIsLocked()); + + // release the lock + ASSERT_OK(UnlockFile(lock1)); + + // check the file is not locked + ASSERT_TRUE(AssertFileIsNotLocked()); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/filter_bench.cc b/src/rocksdb/util/filter_bench.cc new file mode 100644 index 000000000..93186cd08 --- /dev/null +++ b/src/rocksdb/util/filter_bench.cc @@ -0,0 +1,840 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
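Stepping back to LockTest::CheckFileLock above: the POSIX branch has to fork because fcntl() record locks are held per process, so a probe run inside the test process (which may already own the lock) would always succeed. A standalone sketch of that technique follows; the helper name is made up for illustration and a POSIX environment is assumed.

    #include <fcntl.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>

    #include <cstring>

    // Probe whether another process holds an fcntl write lock on `path`.
    bool HeldByAnotherProcess(const char* path) {
      pid_t pid = fork();
      if (pid < 0) {
        return false;  // fork failed; cannot tell
      }
      if (pid == 0) {
        // Child: try to take a whole-file write lock, report via exit status.
        int fd = open(path, O_RDWR | O_CREAT, 0644);
        if (fd < 0) {
          _exit(2);  // could not even open the file
        }
        struct flock f;
        memset(&f, 0, sizeof(f));
        f.l_type = F_WRLCK;
        f.l_whence = SEEK_SET;  // l_start == l_len == 0 covers the whole file
        _exit(fcntl(fd, F_SETLK, &f) == -1 ? 1 : 0);  // 1 => someone holds it
      }
      int status = 0;
      while (waitpid(pid, &status, 0) == -1) {
      }
      return WIFEXITED(status) && WEXITSTATUS(status) == 1;
    }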
+ +#if !defined(GFLAGS) || defined(ROCKSDB_LITE) +#include +int main() { + fprintf(stderr, "filter_bench requires gflags and !ROCKSDB_LITE\n"); + return 1; +} +#else + +#include +#include +#include +#include + +#include "memory/arena.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/table.h" +#include "table/block_based/filter_policy_internal.h" +#include "table/block_based/full_filter_block.h" +#include "table/block_based/mock_block_based_table.h" +#include "table/plain/plain_table_bloom.h" +#include "util/cast_util.h" +#include "util/gflags_compat.h" +#include "util/hash.h" +#include "util/random.h" +#include "util/stderr_logger.h" +#include "util/stop_watch.h" +#include "util/string_util.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +using GFLAGS_NAMESPACE::RegisterFlagValidator; +using GFLAGS_NAMESPACE::SetUsageMessage; + +DEFINE_uint32(seed, 0, "Seed for random number generators"); + +DEFINE_double(working_mem_size_mb, 200, + "MB of memory to get up to among all filters, unless " + "m_keys_total_max is specified."); + +DEFINE_uint32(average_keys_per_filter, 10000, + "Average number of keys per filter"); + +DEFINE_double(vary_key_count_ratio, 0.4, + "Vary number of keys by up to +/- vary_key_count_ratio * " + "average_keys_per_filter."); + +DEFINE_uint32(key_size, 24, "Average number of bytes for each key"); + +DEFINE_bool(vary_key_alignment, true, + "Whether to vary key alignment (default: at least 32-bit " + "alignment)"); + +DEFINE_uint32(vary_key_size_log2_interval, 5, + "Use same key size 2^n times, then change. Key size varies from " + "-2 to +2 bytes vs. average, unless n>=30 to fix key size."); + +DEFINE_uint32(batch_size, 8, "Number of keys to group in each batch"); + +DEFINE_double(bits_per_key, 10.0, "Bits per key setting for filters"); + +DEFINE_double(m_queries, 200, "Millions of queries for each test mode"); + +DEFINE_double(m_keys_total_max, 0, + "Maximum total keys added to filters, in millions. " + "0 (default) disables. Non-zero overrides working_mem_size_mb " + "option."); + +DEFINE_bool(use_full_block_reader, false, + "Use FullFilterBlockReader interface rather than FilterBitsReader"); + +DEFINE_bool(use_plain_table_bloom, false, + "Use PlainTableBloom structure and interface rather than " + "FilterBitsReader/FullFilterBlockReader"); + +DEFINE_bool(new_builder, false, + "Whether to create a new builder for each new filter"); + +DEFINE_uint32(impl, 0, + "Select filter implementation. Without -use_plain_table_bloom:" + "0 = legacy full Bloom filter, " + "1 = format_version 5 Bloom filter, 2 = Ribbon128 filter. With " + "-use_plain_table_bloom: 0 = no locality, 1 = locality."); + +DEFINE_bool(net_includes_hashing, false, + "Whether query net ns/op times should include hashing. 
" + "(if not, dry run will include hashing) " + "(build times always include hashing)"); + +DEFINE_bool(optimize_filters_for_memory, false, + "Setting for BlockBasedTableOptions::optimize_filters_for_memory"); + +DEFINE_bool(detect_filter_construct_corruption, false, + "Setting for " + "BlockBasedTableOptions::detect_filter_construct_corruption"); + +DEFINE_uint32(block_cache_capacity_MB, 8, + "Setting for " + "LRUCacheOptions::capacity"); + +DEFINE_bool(charge_filter_construction, false, + "Setting for " + "CacheEntryRoleOptions::charged of" + "CacheEntryRole::kFilterConstruction"); + +DEFINE_bool(strict_capacity_limit, false, + "Setting for " + "LRUCacheOptions::strict_capacity_limit"); + +DEFINE_bool(quick, false, "Run more limited set of tests, fewer queries"); + +DEFINE_bool(best_case, false, "Run limited tests only for best-case"); + +DEFINE_bool(allow_bad_fp_rate, false, "Continue even if FP rate is bad"); + +DEFINE_bool(legend, false, + "Print more information about interpreting results instead of " + "running tests"); + +DEFINE_uint32(runs, 1, "Number of times to rebuild and run benchmark tests"); + +void _always_assert_fail(int line, const char *file, const char *expr) { + fprintf(stderr, "%s: %d: Assertion %s failed\n", file, line, expr); + abort(); +} + +#define ALWAYS_ASSERT(cond) \ + ((cond) ? (void)0 : ::_always_assert_fail(__LINE__, __FILE__, #cond)) + +#ifndef NDEBUG +// This could affect build times enough that we should not include it for +// accurate speed tests +#define PREDICT_FP_RATE +#endif + +using ROCKSDB_NAMESPACE::Arena; +using ROCKSDB_NAMESPACE::BlockContents; +using ROCKSDB_NAMESPACE::BloomFilterPolicy; +using ROCKSDB_NAMESPACE::BloomHash; +using ROCKSDB_NAMESPACE::BloomLikeFilterPolicy; +using ROCKSDB_NAMESPACE::BuiltinFilterBitsBuilder; +using ROCKSDB_NAMESPACE::CachableEntry; +using ROCKSDB_NAMESPACE::Cache; +using ROCKSDB_NAMESPACE::CacheEntryRole; +using ROCKSDB_NAMESPACE::CacheEntryRoleOptions; +using ROCKSDB_NAMESPACE::EncodeFixed32; +using ROCKSDB_NAMESPACE::Env; +using ROCKSDB_NAMESPACE::FastRange32; +using ROCKSDB_NAMESPACE::FilterBitsReader; +using ROCKSDB_NAMESPACE::FilterBuildingContext; +using ROCKSDB_NAMESPACE::FilterPolicy; +using ROCKSDB_NAMESPACE::FullFilterBlockReader; +using ROCKSDB_NAMESPACE::GetSliceHash; +using ROCKSDB_NAMESPACE::GetSliceHash64; +using ROCKSDB_NAMESPACE::Lower32of64; +using ROCKSDB_NAMESPACE::LRUCacheOptions; +using ROCKSDB_NAMESPACE::ParsedFullFilterBlock; +using ROCKSDB_NAMESPACE::PlainTableBloomV1; +using ROCKSDB_NAMESPACE::Random32; +using ROCKSDB_NAMESPACE::Slice; +using ROCKSDB_NAMESPACE::static_cast_with_check; +using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::StderrLogger; +using ROCKSDB_NAMESPACE::mock::MockBlockBasedTableTester; + +struct KeyMaker { + KeyMaker(size_t avg_size) + : smallest_size_(avg_size - + (FLAGS_vary_key_size_log2_interval >= 30 ? 2 : 0)), + buf_size_(avg_size + 11), // pad to vary key size and alignment + buf_(new char[buf_size_]) { + memset(buf_.get(), 0, buf_size_); + assert(smallest_size_ > 8); + } + size_t smallest_size_; + size_t buf_size_; + std::unique_ptr buf_; + + // Returns a unique(-ish) key based on the given parameter values. Each + // call returns a Slice from the same buffer so previously returned + // Slices should be considered invalidated. + Slice Get(uint32_t filter_num, uint32_t val_num) { + size_t start = FLAGS_vary_key_alignment ? 
val_num % 4 : 0; + size_t len = smallest_size_; + if (FLAGS_vary_key_size_log2_interval < 30) { + // To get range [avg_size - 2, avg_size + 2] + // use range [smallest_size, smallest_size + 4] + len += FastRange32( + (val_num >> FLAGS_vary_key_size_log2_interval) * 1234567891, 5); + } + char *data = buf_.get() + start; + // Populate key data such that all data makes it into a key of at + // least 8 bytes. We also don't want all the within-filter key + // variance confined to a contiguous 32 bits, because then a 32 bit + // hash function can "cheat" the false positive rate by + // approximating a perfect hash. + EncodeFixed32(data, val_num); + EncodeFixed32(data + 4, filter_num + val_num); + // ensure clearing leftovers from different alignment + EncodeFixed32(data + 8, 0); + return Slice(data, len); + } +}; + +void PrintWarnings() { +#if defined(__GNUC__) && !defined(__OPTIMIZE__) + fprintf(stdout, + "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"); +#endif +#ifndef NDEBUG + fprintf(stdout, + "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); +#endif +} + +void PrintError(const char *error) { fprintf(stderr, "ERROR: %s\n", error); } + +struct FilterInfo { + uint32_t filter_id_ = 0; + std::unique_ptr owner_; + Slice filter_; + Status filter_construction_status = Status::OK(); + uint32_t keys_added_ = 0; + std::unique_ptr reader_; + std::unique_ptr full_block_reader_; + std::unique_ptr plain_table_bloom_; + uint64_t outside_queries_ = 0; + uint64_t false_positives_ = 0; +}; + +enum TestMode { + kSingleFilter, + kBatchPrepared, + kBatchUnprepared, + kFiftyOneFilter, + kEightyTwentyFilter, + kRandomFilter, +}; + +static const std::vector allTestModes = { + kSingleFilter, kBatchPrepared, kBatchUnprepared, + kFiftyOneFilter, kEightyTwentyFilter, kRandomFilter, +}; + +static const std::vector quickTestModes = { + kSingleFilter, + kRandomFilter, +}; + +static const std::vector bestCaseTestModes = { + kSingleFilter, +}; + +const char *TestModeToString(TestMode tm) { + switch (tm) { + case kSingleFilter: + return "Single filter"; + case kBatchPrepared: + return "Batched, prepared"; + case kBatchUnprepared: + return "Batched, unprepared"; + case kFiftyOneFilter: + return "Skewed 50% in 1%"; + case kEightyTwentyFilter: + return "Skewed 80% in 20%"; + case kRandomFilter: + return "Random filter"; + } + return "Bad TestMode"; +} + +// Do just enough to keep some data dependence for the +// compiler / CPU +static uint32_t DryRunNoHash(Slice &s) { + uint32_t sz = static_cast(s.size()); + if (sz >= 4) { + return sz + s.data()[3]; + } else { + return sz; + } +} + +static uint32_t DryRunHash32(Slice &s) { + // Same perf characteristics as GetSliceHash() + return BloomHash(s); +} + +static uint32_t DryRunHash64(Slice &s) { + return Lower32of64(GetSliceHash64(s)); +} + +const std::shared_ptr &GetPolicy() { + static std::shared_ptr policy; + if (!policy) { + policy = BloomLikeFilterPolicy::Create( + BloomLikeFilterPolicy::GetAllFixedImpls().at(FLAGS_impl), + FLAGS_bits_per_key); + } + return policy; +} + +struct FilterBench : public MockBlockBasedTableTester { + std::vector kms_; + std::vector infos_; + Random32 random_; + std::ostringstream fp_rate_report_; + Arena arena_; + double m_queries_; + StderrLogger stderr_logger_; + + FilterBench() + : MockBlockBasedTableTester(GetPolicy()), + random_(FLAGS_seed), + m_queries_(0) { + for (uint32_t i = 0; i < FLAGS_batch_size; ++i) { + kms_.emplace_back(FLAGS_key_size < 8 ? 
8 : FLAGS_key_size); + } + ioptions_.logger = &stderr_logger_; + table_options_.optimize_filters_for_memory = + FLAGS_optimize_filters_for_memory; + table_options_.detect_filter_construct_corruption = + FLAGS_detect_filter_construct_corruption; + table_options_.cache_usage_options.options_overrides.insert( + {CacheEntryRole::kFilterConstruction, + {/*.charged = */ FLAGS_charge_filter_construction + ? CacheEntryRoleOptions::Decision::kEnabled + : CacheEntryRoleOptions::Decision::kDisabled}}); + if (FLAGS_charge_filter_construction) { + table_options_.no_block_cache = false; + LRUCacheOptions lo; + lo.capacity = FLAGS_block_cache_capacity_MB * 1024 * 1024; + lo.num_shard_bits = 0; // 2^0 shard + lo.strict_capacity_limit = FLAGS_strict_capacity_limit; + std::shared_ptr cache(NewLRUCache(lo)); + table_options_.block_cache = cache; + } + } + + void Go(); + + double RandomQueryTest(uint32_t inside_threshold, bool dry_run, + TestMode mode); +}; + +void FilterBench::Go() { + if (FLAGS_use_plain_table_bloom && FLAGS_use_full_block_reader) { + throw std::runtime_error( + "Can't combine -use_plain_table_bloom and -use_full_block_reader"); + } + if (FLAGS_use_plain_table_bloom) { + if (FLAGS_impl > 1) { + throw std::runtime_error( + "-impl must currently be >= 0 and <= 1 for Plain table"); + } + } else { + if (FLAGS_impl > 2) { + throw std::runtime_error( + "-impl must currently be >= 0 and <= 2 for Block-based table"); + } + } + + if (FLAGS_vary_key_count_ratio < 0.0 || FLAGS_vary_key_count_ratio > 1.0) { + throw std::runtime_error("-vary_key_count_ratio must be >= 0.0 and <= 1.0"); + } + + // For example, average_keys_per_filter = 100, vary_key_count_ratio = 0.1. + // Varys up to +/- 10 keys. variance_range = 21 (generating value 0..20). + // variance_offset = 10, so value - offset average value is always 0. + const uint32_t variance_range = + 1 + 2 * static_cast(FLAGS_vary_key_count_ratio * + FLAGS_average_keys_per_filter); + const uint32_t variance_offset = variance_range / 2; + + const std::vector &testModes = FLAGS_best_case ? bestCaseTestModes + : FLAGS_quick ? quickTestModes + : allTestModes; + + m_queries_ = FLAGS_m_queries; + double working_mem_size_mb = FLAGS_working_mem_size_mb; + if (FLAGS_quick) { + m_queries_ /= 7.0; + } else if (FLAGS_best_case) { + m_queries_ /= 3.0; + working_mem_size_mb /= 10.0; + } + + std::cout << "Building..." 
<< std::endl; + + std::unique_ptr builder; + + size_t total_memory_used = 0; + size_t total_size = 0; + size_t total_keys_added = 0; +#ifdef PREDICT_FP_RATE + double weighted_predicted_fp_rate = 0.0; +#endif + size_t max_total_keys; + size_t max_mem; + if (FLAGS_m_keys_total_max > 0) { + max_total_keys = static_cast(1000000 * FLAGS_m_keys_total_max); + max_mem = SIZE_MAX; + } else { + max_total_keys = SIZE_MAX; + max_mem = static_cast(1024 * 1024 * working_mem_size_mb); + } + + ROCKSDB_NAMESPACE::StopWatchNano timer( + ROCKSDB_NAMESPACE::SystemClock::Default().get(), true); + + infos_.clear(); + while ((working_mem_size_mb == 0 || total_size < max_mem) && + total_keys_added < max_total_keys) { + uint32_t filter_id = random_.Next(); + uint32_t keys_to_add = FLAGS_average_keys_per_filter + + FastRange32(random_.Next(), variance_range) - + variance_offset; + if (max_total_keys - total_keys_added < keys_to_add) { + keys_to_add = static_cast(max_total_keys - total_keys_added); + } + infos_.emplace_back(); + FilterInfo &info = infos_.back(); + info.filter_id_ = filter_id; + info.keys_added_ = keys_to_add; + if (FLAGS_use_plain_table_bloom) { + info.plain_table_bloom_.reset(new PlainTableBloomV1()); + info.plain_table_bloom_->SetTotalBits( + &arena_, static_cast(keys_to_add * FLAGS_bits_per_key), + FLAGS_impl, 0 /*huge_page*/, nullptr /*logger*/); + for (uint32_t i = 0; i < keys_to_add; ++i) { + uint32_t hash = GetSliceHash(kms_[0].Get(filter_id, i)); + info.plain_table_bloom_->AddHash(hash); + } + info.filter_ = info.plain_table_bloom_->GetRawData(); + } else { + if (!builder) { + builder.reset( + static_cast_with_check(GetBuilder())); + } + for (uint32_t i = 0; i < keys_to_add; ++i) { + builder->AddKey(kms_[0].Get(filter_id, i)); + } + info.filter_ = + builder->Finish(&info.owner_, &info.filter_construction_status); + if (info.filter_construction_status.ok()) { + info.filter_construction_status = + builder->MaybePostVerify(info.filter_); + } + if (!info.filter_construction_status.ok()) { + PrintError(info.filter_construction_status.ToString().c_str()); + } +#ifdef PREDICT_FP_RATE + weighted_predicted_fp_rate += + keys_to_add * + builder->EstimatedFpRate(keys_to_add, info.filter_.size()); +#endif + if (FLAGS_new_builder) { + builder.reset(); + } + info.reader_.reset( + table_options_.filter_policy->GetFilterBitsReader(info.filter_)); + CachableEntry block( + new ParsedFullFilterBlock(table_options_.filter_policy.get(), + BlockContents(info.filter_)), + nullptr /* cache */, nullptr /* cache_handle */, + true /* own_value */); + info.full_block_reader_.reset( + new FullFilterBlockReader(table_.get(), std::move(block))); + } + total_size += info.filter_.size(); +#ifdef ROCKSDB_MALLOC_USABLE_SIZE + total_memory_used += + malloc_usable_size(const_cast(info.filter_.data())); +#endif // ROCKSDB_MALLOC_USABLE_SIZE + total_keys_added += keys_to_add; + } + + uint64_t elapsed_nanos = timer.ElapsedNanos(); + double ns = double(elapsed_nanos) / total_keys_added; + std::cout << "Build avg ns/key: " << ns << std::endl; + std::cout << "Number of filters: " << infos_.size() << std::endl; + std::cout << "Total size (MB): " << total_size / 1024.0 / 1024.0 << std::endl; + if (total_memory_used > 0) { + std::cout << "Reported total allocated memory (MB): " + << total_memory_used / 1024.0 / 1024.0 << std::endl; + std::cout << "Reported internal fragmentation: " + << (total_memory_used - total_size) * 100.0 / total_size << "%" + << std::endl; + } + + double bpk = total_size * 8.0 / total_keys_added; + std::cout << 
"Bits/key stored: " << bpk << std::endl; +#ifdef PREDICT_FP_RATE + std::cout << "Predicted FP rate %: " + << 100.0 * (weighted_predicted_fp_rate / total_keys_added) + << std::endl; +#endif + if (!FLAGS_quick && !FLAGS_best_case) { + double tolerable_rate = std::pow(2.0, -(bpk - 1.0) / (1.4 + bpk / 50.0)); + std::cout << "Best possible FP rate %: " << 100.0 * std::pow(2.0, -bpk) + << std::endl; + std::cout << "Tolerable FP rate %: " << 100.0 * tolerable_rate << std::endl; + + std::cout << "----------------------------" << std::endl; + std::cout << "Verifying..." << std::endl; + + uint32_t outside_q_per_f = + static_cast(m_queries_ * 1000000 / infos_.size()); + uint64_t fps = 0; + for (uint32_t i = 0; i < infos_.size(); ++i) { + FilterInfo &info = infos_[i]; + for (uint32_t j = 0; j < info.keys_added_; ++j) { + if (FLAGS_use_plain_table_bloom) { + uint32_t hash = GetSliceHash(kms_[0].Get(info.filter_id_, j)); + ALWAYS_ASSERT(info.plain_table_bloom_->MayContainHash(hash)); + } else { + ALWAYS_ASSERT( + info.reader_->MayMatch(kms_[0].Get(info.filter_id_, j))); + } + } + for (uint32_t j = 0; j < outside_q_per_f; ++j) { + if (FLAGS_use_plain_table_bloom) { + uint32_t hash = + GetSliceHash(kms_[0].Get(info.filter_id_, j | 0x80000000)); + fps += info.plain_table_bloom_->MayContainHash(hash); + } else { + fps += info.reader_->MayMatch( + kms_[0].Get(info.filter_id_, j | 0x80000000)); + } + } + } + std::cout << " No FNs :)" << std::endl; + double prelim_rate = double(fps) / outside_q_per_f / infos_.size(); + std::cout << " Prelim FP rate %: " << (100.0 * prelim_rate) << std::endl; + + if (!FLAGS_allow_bad_fp_rate) { + ALWAYS_ASSERT(prelim_rate < tolerable_rate); + } + } + + std::cout << "----------------------------" << std::endl; + std::cout << "Mixed inside/outside queries..." << std::endl; + // 50% each inside and outside + uint32_t inside_threshold = UINT32_MAX / 2; + for (TestMode tm : testModes) { + random_.Seed(FLAGS_seed + 1); + double f = RandomQueryTest(inside_threshold, /*dry_run*/ false, tm); + random_.Seed(FLAGS_seed + 1); + double d = RandomQueryTest(inside_threshold, /*dry_run*/ true, tm); + std::cout << " " << TestModeToString(tm) << " net ns/op: " << (f - d) + << std::endl; + } + + if (!FLAGS_quick) { + std::cout << "----------------------------" << std::endl; + std::cout << "Inside queries (mostly)..." << std::endl; + // Do about 95% inside queries rather than 100% so that branch predictor + // can't give itself an artifically crazy advantage. + inside_threshold = UINT32_MAX / 20 * 19; + for (TestMode tm : testModes) { + random_.Seed(FLAGS_seed + 1); + double f = RandomQueryTest(inside_threshold, /*dry_run*/ false, tm); + random_.Seed(FLAGS_seed + 1); + double d = RandomQueryTest(inside_threshold, /*dry_run*/ true, tm); + std::cout << " " << TestModeToString(tm) << " net ns/op: " << (f - d) + << std::endl; + } + + std::cout << "----------------------------" << std::endl; + std::cout << "Outside queries (mostly)..." << std::endl; + // Do about 95% outside queries rather than 100% so that branch predictor + // can't give itself an artifically crazy advantage. 
+ inside_threshold = UINT32_MAX / 20; + for (TestMode tm : testModes) { + random_.Seed(FLAGS_seed + 2); + double f = RandomQueryTest(inside_threshold, /*dry_run*/ false, tm); + random_.Seed(FLAGS_seed + 2); + double d = RandomQueryTest(inside_threshold, /*dry_run*/ true, tm); + std::cout << " " << TestModeToString(tm) << " net ns/op: " << (f - d) + << std::endl; + } + } + std::cout << fp_rate_report_.str(); + + std::cout << "----------------------------" << std::endl; + std::cout << "Done. (For more info, run with -legend or -help.)" << std::endl; +} + +double FilterBench::RandomQueryTest(uint32_t inside_threshold, bool dry_run, + TestMode mode) { + for (auto &info : infos_) { + info.outside_queries_ = 0; + info.false_positives_ = 0; + } + + auto dry_run_hash_fn = DryRunNoHash; + if (!FLAGS_net_includes_hashing) { + if (FLAGS_impl == 0 || FLAGS_use_plain_table_bloom) { + dry_run_hash_fn = DryRunHash32; + } else { + dry_run_hash_fn = DryRunHash64; + } + } + + uint32_t num_infos = static_cast(infos_.size()); + uint32_t dry_run_hash = 0; + uint64_t max_queries = static_cast(m_queries_ * 1000000 + 0.50); + // Some filters may be considered secondary in order to implement skewed + // queries. num_primary_filters is the number that are to be treated as + // equal, and any remainder will be treated as secondary. + uint32_t num_primary_filters = num_infos; + // The proportion (when divided by 2^32 - 1) of filter queries going to + // the primary filters (default = all). The remainder of queries are + // against secondary filters. + uint32_t primary_filter_threshold = 0xffffffff; + if (mode == kSingleFilter) { + // 100% of queries to 1 filter + num_primary_filters = 1; + } else if (mode == kFiftyOneFilter) { + if (num_infos < 50) { + return 0.0; // skip + } + // 50% of queries + primary_filter_threshold /= 2; + // to 1% of filters + num_primary_filters = (num_primary_filters + 99) / 100; + } else if (mode == kEightyTwentyFilter) { + if (num_infos < 5) { + return 0.0; // skip + } + // 80% of queries + primary_filter_threshold = primary_filter_threshold / 5 * 4; + // to 20% of filters + num_primary_filters = (num_primary_filters + 4) / 5; + } else if (mode == kRandomFilter) { + if (num_infos == 1) { + return 0.0; // skip + } + } + uint32_t batch_size = 1; + std::unique_ptr batch_slices; + std::unique_ptr batch_slice_ptrs; + std::unique_ptr batch_results; + if (mode == kBatchPrepared || mode == kBatchUnprepared) { + batch_size = static_cast(kms_.size()); + } + + batch_slices.reset(new Slice[batch_size]); + batch_slice_ptrs.reset(new Slice *[batch_size]); + batch_results.reset(new bool[batch_size]); + for (uint32_t i = 0; i < batch_size; ++i) { + batch_results[i] = false; + batch_slice_ptrs[i] = &batch_slices[i]; + } + + ROCKSDB_NAMESPACE::StopWatchNano timer( + ROCKSDB_NAMESPACE::SystemClock::Default().get(), true); + + for (uint64_t q = 0; q < max_queries; q += batch_size) { + bool inside_this_time = random_.Next() <= inside_threshold; + + uint32_t filter_index; + if (random_.Next() <= primary_filter_threshold) { + filter_index = random_.Uniformish(num_primary_filters); + } else { + // secondary + filter_index = num_primary_filters + + random_.Uniformish(num_infos - num_primary_filters); + } + FilterInfo &info = infos_[filter_index]; + for (uint32_t i = 0; i < batch_size; ++i) { + if (inside_this_time) { + batch_slices[i] = + kms_[i].Get(info.filter_id_, random_.Uniformish(info.keys_added_)); + } else { + batch_slices[i] = + kms_[i].Get(info.filter_id_, random_.Uniformish(info.keys_added_) | + 
uint32_t{0x80000000}); + info.outside_queries_++; + } + } + // TODO: implement batched interface to full block reader + // TODO: implement batched interface to plain table bloom + if (mode == kBatchPrepared && !FLAGS_use_full_block_reader && + !FLAGS_use_plain_table_bloom) { + for (uint32_t i = 0; i < batch_size; ++i) { + batch_results[i] = false; + } + if (dry_run) { + for (uint32_t i = 0; i < batch_size; ++i) { + batch_results[i] = true; + dry_run_hash += dry_run_hash_fn(batch_slices[i]); + } + } else { + info.reader_->MayMatch(batch_size, batch_slice_ptrs.get(), + batch_results.get()); + } + for (uint32_t i = 0; i < batch_size; ++i) { + if (inside_this_time) { + ALWAYS_ASSERT(batch_results[i]); + } else { + info.false_positives_ += batch_results[i]; + } + } + } else { + for (uint32_t i = 0; i < batch_size; ++i) { + bool may_match; + if (FLAGS_use_plain_table_bloom) { + if (dry_run) { + dry_run_hash += dry_run_hash_fn(batch_slices[i]); + may_match = true; + } else { + uint32_t hash = GetSliceHash(batch_slices[i]); + may_match = info.plain_table_bloom_->MayContainHash(hash); + } + } else if (FLAGS_use_full_block_reader) { + if (dry_run) { + dry_run_hash += dry_run_hash_fn(batch_slices[i]); + may_match = true; + } else { + may_match = info.full_block_reader_->KeyMayMatch( + batch_slices[i], + /*no_io=*/false, /*const_ikey_ptr=*/nullptr, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, Env::IO_TOTAL); + } + } else { + if (dry_run) { + dry_run_hash += dry_run_hash_fn(batch_slices[i]); + may_match = true; + } else { + may_match = info.reader_->MayMatch(batch_slices[i]); + } + } + if (inside_this_time) { + ALWAYS_ASSERT(may_match); + } else { + info.false_positives_ += may_match; + } + } + } + } + + uint64_t elapsed_nanos = timer.ElapsedNanos(); + double ns = double(elapsed_nanos) / max_queries; + + if (!FLAGS_quick) { + if (dry_run) { + // Printing part of hash prevents dry run components from being optimized + // away by compiler + std::cout << " Dry run (" << std::hex << (dry_run_hash & 0xfffff) + << std::dec << ") "; + } else { + std::cout << " Gross filter "; + } + std::cout << "ns/op: " << ns << std::endl; + } + + if (!dry_run) { + fp_rate_report_.str(""); + uint64_t q = 0; + uint64_t fp = 0; + double worst_fp_rate = 0.0; + double best_fp_rate = 1.0; + for (auto &info : infos_) { + q += info.outside_queries_; + fp += info.false_positives_; + if (info.outside_queries_ > 0) { + double fp_rate = double(info.false_positives_) / info.outside_queries_; + worst_fp_rate = std::max(worst_fp_rate, fp_rate); + best_fp_rate = std::min(best_fp_rate, fp_rate); + } + } + fp_rate_report_ << " Average FP rate %: " << 100.0 * fp / q << std::endl; + if (!FLAGS_quick && !FLAGS_best_case) { + fp_rate_report_ << " Worst FP rate %: " << 100.0 * worst_fp_rate + << std::endl; + fp_rate_report_ << " Best FP rate %: " << 100.0 * best_fp_rate + << std::endl; + fp_rate_report_ << " Best possible bits/key: " + << -std::log(double(fp) / q) / std::log(2.0) << std::endl; + } + } + return ns; +} + +int main(int argc, char **argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " [-quick] [OTHER OPTIONS]..."); + ParseCommandLineFlags(&argc, &argv, true); + + PrintWarnings(); + + if (FLAGS_legend) { + std::cout + << "Legend:" << std::endl + << " \"Inside\" - key that was added to filter" << std::endl + << " \"Outside\" - key that was not added to filter" << std::endl + << " \"FN\" - false negative query (must not happen)" << std::endl + 
<< " \"FP\" - false positive query (OK at low rate)" << std::endl + << " \"Dry run\" - cost of testing and hashing overhead." << std::endl + << " \"Gross filter\" - cost of filter queries including testing " + << "\n and hashing overhead." << std::endl + << " \"net\" - best estimate of time in filter operation, without " + << "\n testing and hashing overhead (gross filter - dry run)" + << std::endl + << " \"ns/op\" - nanoseconds per operation (key query or add)" + << std::endl + << " \"Single filter\" - essentially minimum cost, assuming filter" + << "\n fits easily in L1 CPU cache." << std::endl + << " \"Batched, prepared\" - several queries at once against a" + << "\n randomly chosen filter, using multi-query interface." + << std::endl + << " \"Batched, unprepared\" - similar, but using serial calls" + << "\n to single query interface." << std::endl + << " \"Random filter\" - a filter is chosen at random as target" + << "\n of each query." << std::endl + << " \"Skewed X% in Y%\" - like \"Random filter\" except Y% of" + << "\n the filters are designated as \"hot\" and receive X%" + << "\n of queries." << std::endl; + } else { + FilterBench b; + for (uint32_t i = 0; i < FLAGS_runs; ++i) { + b.Go(); + FLAGS_seed += 100; + b.random_.Seed(FLAGS_seed); + } + } + + return 0; +} + +#endif // !defined(GFLAGS) || defined(ROCKSDB_LITE) diff --git a/src/rocksdb/util/gflags_compat.h b/src/rocksdb/util/gflags_compat.h new file mode 100644 index 000000000..b6f88a5bc --- /dev/null +++ b/src/rocksdb/util/gflags_compat.h @@ -0,0 +1,30 @@ +// Copyright (c) 2017-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include + +#include + +#ifndef GFLAGS_NAMESPACE +// in case it's not defined in old versions, that's probably because it was +// still google by default. +#define GFLAGS_NAMESPACE google +#endif + +#ifndef DEFINE_uint32 +// DEFINE_uint32 does not appear in older versions of gflags. This should be +// a sane definition for those versions. +#include +#define DEFINE_uint32(name, val, txt) \ + namespace gflags_compat { \ + DEFINE_int32(name, val, txt); \ + } \ + std::reference_wrapper FLAGS_##name = \ + std::ref(*reinterpret_cast(&gflags_compat::FLAGS_##name)); + +#define DECLARE_uint32(name) \ + extern std::reference_wrapper FLAGS_##name; +#endif // !DEFINE_uint32 diff --git a/src/rocksdb/util/hash.cc b/src/rocksdb/util/hash.cc new file mode 100644 index 000000000..0f7f2edc1 --- /dev/null +++ b/src/rocksdb/util/hash.cc @@ -0,0 +1,201 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
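Regarding the gflags_compat.h shim above: on a gflags release that lacks DEFINE_uint32, the macro defines an int32 flag in a side namespace and re-exposes it as a std::reference_wrapper<uint32_t>, so call sites can keep treating the flag as a uint32_t. A minimal usage sketch, with a made-up flag name purely for illustration:

    #include "util/gflags_compat.h"

    // Hypothetical flag; real tools such as filter_bench define their own.
    DEFINE_uint32(example_budget, 64, "Illustrative flag, not part of RocksDB");

    int main(int argc, char** argv) {
      GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
      // Reads as a plain uint32_t whether the shim or a native DEFINE_uint32
      // was used.
      uint32_t budget = FLAGS_example_budget;
      return budget > 0 ? 0 : 1;
    }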
+ +#include "util/hash.h" + +#include + +#include "port/lang.h" +#include "util/coding.h" +#include "util/hash128.h" +#include "util/math128.h" +#include "util/xxhash.h" +#include "util/xxph3.h" + +namespace ROCKSDB_NAMESPACE { + +uint64_t (*kGetSliceNPHash64UnseededFnPtr)(const Slice&) = &GetSliceHash64; + +uint32_t Hash(const char* data, size_t n, uint32_t seed) { + // MurmurHash1 - fast but mediocre quality + // https://github.com/aappleby/smhasher/wiki/MurmurHash1 + // + const uint32_t m = 0xc6a4a793; + const uint32_t r = 24; + const char* limit = data + n; + uint32_t h = static_cast(seed ^ (n * m)); + + // Pick up four bytes at a time + while (data + 4 <= limit) { + uint32_t w = DecodeFixed32(data); + data += 4; + h += w; + h *= m; + h ^= (h >> 16); + } + + // Pick up remaining bytes + switch (limit - data) { + // Note: The original hash implementation used data[i] << shift, which + // promotes the char to int and then performs the shift. If the char is + // negative, the shift is undefined behavior in C++. The hash algorithm is + // part of the format definition, so we cannot change it; to obtain the same + // behavior in a legal way we just cast to uint32_t, which will do + // sign-extension. To guarantee compatibility with architectures where chars + // are unsigned we first cast the char to int8_t. + case 3: + h += static_cast(static_cast(data[2])) << 16; + FALLTHROUGH_INTENDED; + case 2: + h += static_cast(static_cast(data[1])) << 8; + FALLTHROUGH_INTENDED; + case 1: + h += static_cast(static_cast(data[0])); + h *= m; + h ^= (h >> r); + break; + } + return h; +} + +// We are standardizing on a preview release of XXH3, because that's +// the best available at time of standardizing. +// +// In testing (mostly Intel Skylake), this hash function is much more +// thorough than Hash32 and is almost universally faster. Hash() only +// seems faster when passing runtime-sized keys of the same small size +// (less than about 24 bytes) thousands of times in a row; this seems +// to allow the branch predictor to work some magic. XXH3's speed is +// much less dependent on branch prediction. +// +// Hashing with a prefix extractor is potentially a common case of +// hashing objects of small, predictable size. We could consider +// bundling hash functions specialized for particular lengths with +// the prefix extractors. +uint64_t Hash64(const char* data, size_t n, uint64_t seed) { + return XXPH3_64bits_withSeed(data, n, seed); +} + +uint64_t Hash64(const char* data, size_t n) { + // Same as seed = 0 + return XXPH3_64bits(data, n); +} + +uint64_t GetSlicePartsNPHash64(const SliceParts& data, uint64_t seed) { + // TODO(ajkr): use XXH3 streaming APIs to avoid the copy/allocation. 
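+  // Until then: materialize all parts into one contiguous buffer and hash the
+  // concatenation with NPHash64, so the result matches hashing the whole key.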
+ size_t concat_len = 0; + for (int i = 0; i < data.num_parts; ++i) { + concat_len += data.parts[i].size(); + } + std::string concat_data; + concat_data.reserve(concat_len); + for (int i = 0; i < data.num_parts; ++i) { + concat_data.append(data.parts[i].data(), data.parts[i].size()); + } + assert(concat_data.size() == concat_len); + return NPHash64(concat_data.data(), concat_len, seed); +} + +Unsigned128 Hash128(const char* data, size_t n, uint64_t seed) { + auto h = XXH3_128bits_withSeed(data, n, seed); + return (Unsigned128{h.high64} << 64) | (h.low64); +} + +Unsigned128 Hash128(const char* data, size_t n) { + // Same as seed = 0 + auto h = XXH3_128bits(data, n); + return (Unsigned128{h.high64} << 64) | (h.low64); +} + +void Hash2x64(const char* data, size_t n, uint64_t* high64, uint64_t* low64) { + // Same as seed = 0 + auto h = XXH3_128bits(data, n); + *high64 = h.high64; + *low64 = h.low64; +} + +void Hash2x64(const char* data, size_t n, uint64_t seed, uint64_t* high64, + uint64_t* low64) { + auto h = XXH3_128bits_withSeed(data, n, seed); + *high64 = h.high64; + *low64 = h.low64; +} + +namespace { + +inline uint64_t XXH3_avalanche(uint64_t h64) { + h64 ^= h64 >> 37; + h64 *= 0x165667919E3779F9U; + h64 ^= h64 >> 32; + return h64; +} + +inline uint64_t XXH3_unavalanche(uint64_t h64) { + h64 ^= h64 >> 32; + h64 *= 0x8da8ee41d6df849U; // inverse of 0x165667919E3779F9U + h64 ^= h64 >> 37; + return h64; +} + +} // namespace + +void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed, + uint64_t* out_high64, uint64_t* out_low64) { + // Adapted from XXH3_len_9to16_128b + const uint64_t bitflipl = /*secret part*/ 0x59973f0033362349U - seed; + const uint64_t bitfliph = /*secret part*/ 0xc202797692d63d58U + seed; + Unsigned128 tmp128 = + Multiply64to128(in_low64 ^ in_high64 ^ bitflipl, 0x9E3779B185EBCA87U); + uint64_t lo = Lower64of128(tmp128); + uint64_t hi = Upper64of128(tmp128); + lo += 0x3c0000000000000U; // (len - 1) << 54 + in_high64 ^= bitfliph; + hi += in_high64 + (Lower32of64(in_high64) * uint64_t{0x85EBCA76}); + lo ^= EndianSwapValue(hi); + tmp128 = Multiply64to128(lo, 0xC2B2AE3D27D4EB4FU); + lo = Lower64of128(tmp128); + hi = Upper64of128(tmp128) + (hi * 0xC2B2AE3D27D4EB4FU); + *out_low64 = XXH3_avalanche(lo); + *out_high64 = XXH3_avalanche(hi); +} + +void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed, + uint64_t* out_high64, uint64_t* out_low64) { + // Inverted above (also consulting XXH3_len_9to16_128b) + const uint64_t bitflipl = /*secret part*/ 0x59973f0033362349U - seed; + const uint64_t bitfliph = /*secret part*/ 0xc202797692d63d58U + seed; + uint64_t lo = XXH3_unavalanche(in_low64); + uint64_t hi = XXH3_unavalanche(in_high64); + lo *= 0xba79078168d4baf; // inverse of 0xC2B2AE3D27D4EB4FU + hi -= Upper64of128(Multiply64to128(lo, 0xC2B2AE3D27D4EB4FU)); + hi *= 0xba79078168d4baf; // inverse of 0xC2B2AE3D27D4EB4FU + lo ^= EndianSwapValue(hi); + lo -= 0x3c0000000000000U; + lo *= 0x887493432badb37U; // inverse of 0x9E3779B185EBCA87U + hi -= Upper64of128(Multiply64to128(lo, 0x9E3779B185EBCA87U)); + uint32_t tmp32 = Lower32of64(hi) * 0xb6c92f47; // inverse of 0x85EBCA77 + hi -= tmp32; + hi = (hi & 0xFFFFFFFF00000000U) - + ((tmp32 * uint64_t{0x85EBCA76}) & 0xFFFFFFFF00000000U) + tmp32; + hi ^= bitfliph; + lo ^= hi ^ bitflipl; + *out_high64 = hi; + *out_low64 = lo; +} + +void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64, + uint64_t* out_high64, uint64_t* out_low64) { + BijectiveHash2x64(in_high64, in_low64, /*seed*/ 0, out_high64, 
out_low64); +} + +void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, + uint64_t* out_high64, uint64_t* out_low64) { + BijectiveUnhash2x64(in_high64, in_low64, /*seed*/ 0, out_high64, out_low64); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/hash.h b/src/rocksdb/util/hash.h new file mode 100644 index 000000000..eafa47f34 --- /dev/null +++ b/src/rocksdb/util/hash.h @@ -0,0 +1,137 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Common hash functions with convenient interfaces. If hashing a +// statically-sized input in a performance-critical context, consider +// calling a specific hash implementation directly, such as +// XXH3_64bits from xxhash.h. +// +// Since this is a very common header, implementation details are kept +// out-of-line. Out-of-lining also aids in tracking the time spent in +// hashing functions. Inlining is of limited benefit for runtime-sized +// hash inputs. + +#pragma once + +#include +#include + +#include "rocksdb/slice.h" +#include "util/fastrange.h" + +namespace ROCKSDB_NAMESPACE { + +// Stable/persistent 64-bit hash. Higher quality and generally faster than +// Hash(), especially for inputs > 24 bytes. +// KNOWN FLAW: incrementing seed by 1 might not give sufficiently independent +// results from previous seed. Recommend incrementing by a large odd number. +extern uint64_t Hash64(const char* data, size_t n, uint64_t seed); + +// Specific optimization without seed (same as seed = 0) +extern uint64_t Hash64(const char* data, size_t n); + +// Non-persistent hash. Must only used for in-memory data structures. +// The hash results are thus subject to change between releases, +// architectures, build configuration, etc. (Thus, it rarely makes sense +// to specify a seed for this function, except for a "rolling" hash.) +// KNOWN FLAW: incrementing seed by 1 might not give sufficiently independent +// results from previous seed. Recommend incrementing by a large odd number. 
+inline uint64_t NPHash64(const char* data, size_t n, uint64_t seed) { +#ifdef ROCKSDB_MODIFY_NPHASH + // For testing "subject to change" + return Hash64(data, n, seed + 123456789); +#else + // Currently same as Hash64 + return Hash64(data, n, seed); +#endif +} + +// Specific optimization without seed (same as seed = 0) +inline uint64_t NPHash64(const char* data, size_t n) { +#ifdef ROCKSDB_MODIFY_NPHASH + // For testing "subject to change" + return Hash64(data, n, 123456789); +#else + // Currently same as Hash64 + return Hash64(data, n); +#endif +} + +// Convenient and equivalent version of Hash128 without depending on 128-bit +// scalars +void Hash2x64(const char* data, size_t n, uint64_t* high64, uint64_t* low64); +void Hash2x64(const char* data, size_t n, uint64_t seed, uint64_t* high64, + uint64_t* low64); + +// Hash 128 bits to 128 bits, guaranteed not to lose data (equivalent to +// Hash2x64 on 16 bytes little endian) +void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64, + uint64_t* out_high64, uint64_t* out_low64); +void BijectiveHash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed, + uint64_t* out_high64, uint64_t* out_low64); + +// Inverse of above (mostly for testing) +void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, + uint64_t* out_high64, uint64_t* out_low64); +void BijectiveUnhash2x64(uint64_t in_high64, uint64_t in_low64, uint64_t seed, + uint64_t* out_high64, uint64_t* out_low64); + +// Stable/persistent 32-bit hash. Moderate quality and high speed on +// small inputs. +// TODO: consider rename to Hash32 +// KNOWN FLAW: incrementing seed by 1 might not give sufficiently independent +// results from previous seed. Recommend pseudorandom or hashed seeds. +extern uint32_t Hash(const char* data, size_t n, uint32_t seed); + +// TODO: consider rename to LegacyBloomHash32 +inline uint32_t BloomHash(const Slice& key) { + return Hash(key.data(), key.size(), 0xbc9f1d34); +} + +inline uint64_t GetSliceHash64(const Slice& key) { + return Hash64(key.data(), key.size()); +} +// Provided for convenience for use with template argument deduction, where a +// specific overload needs to be used. +extern uint64_t (*kGetSliceNPHash64UnseededFnPtr)(const Slice&); + +inline uint64_t GetSliceNPHash64(const Slice& s) { + return NPHash64(s.data(), s.size()); +} + +inline uint64_t GetSliceNPHash64(const Slice& s, uint64_t seed) { + return NPHash64(s.data(), s.size(), seed); +} + +// Similar to `GetSliceNPHash64()` with `seed`, but input comes from +// concatenation of `Slice`s in `data`. +extern uint64_t GetSlicePartsNPHash64(const SliceParts& data, uint64_t seed); + +inline size_t GetSliceRangedNPHash(const Slice& s, size_t range) { + return FastRange64(NPHash64(s.data(), s.size()), range); +} + +// TODO: consider rename to GetSliceHash32 +inline uint32_t GetSliceHash(const Slice& s) { + return Hash(s.data(), s.size(), 397); +} + +// Useful for splitting up a 64-bit hash +inline uint32_t Upper32of64(uint64_t v) { + return static_cast(v >> 32); +} +inline uint32_t Lower32of64(uint64_t v) { return static_cast(v); } + +// std::hash compatible interface. +// TODO: consider rename to SliceHasher32 +struct SliceHasher { + uint32_t operator()(const Slice& s) const { return GetSliceHash(s); } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/hash128.h b/src/rocksdb/util/hash128.h new file mode 100644 index 000000000..305caa14a --- /dev/null +++ b/src/rocksdb/util/hash128.h @@ -0,0 +1,26 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +// 128-bit hash gets it own header so that more popular hash.h doesn't +// depend on math128.h + +#include "rocksdb/slice.h" +#include "util/math128.h" + +namespace ROCKSDB_NAMESPACE { + +// Stable/persistent 128-bit hash for non-cryptographic applications. +Unsigned128 Hash128(const char* data, size_t n, uint64_t seed); + +// Specific optimization without seed (same as seed = 0) +Unsigned128 Hash128(const char* data, size_t n); + +inline Unsigned128 GetSliceHash128(const Slice& key) { + return Hash128(key.data(), key.size()); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/hash_containers.h b/src/rocksdb/util/hash_containers.h new file mode 100644 index 000000000..52be3718c --- /dev/null +++ b/src/rocksdb/util/hash_containers.h @@ -0,0 +1,51 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// This header establishes compile-time pluggable implementations of hashed +// container structures, so that deployments have the option of minimal +// dependencies with ok performance (e.g. std::unordered_map) or more +// dependencies with optimized performance (e.g. folly::F14FastMap). + +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +#ifdef USE_FOLLY + +#include +#include + +namespace ROCKSDB_NAMESPACE { + +template +using UnorderedMap = folly::F14FastMap; + +template +using UnorderedMapH = folly::F14FastMap; + +template +using UnorderedSet = folly::F14FastSet; + +} // namespace ROCKSDB_NAMESPACE + +#else + +#include +#include + +namespace ROCKSDB_NAMESPACE { + +template +using UnorderedMap = std::unordered_map; + +template +using UnorderedMapH = std::unordered_map; + +template +using UnorderedSet = std::unordered_set; + +} // namespace ROCKSDB_NAMESPACE + +#endif diff --git a/src/rocksdb/util/hash_map.h b/src/rocksdb/util/hash_map.h new file mode 100644 index 000000000..e3ad2584f --- /dev/null +++ b/src/rocksdb/util/hash_map.h @@ -0,0 +1,67 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include +#include +#include + +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +// This is similar to std::unordered_map, except that it tries to avoid +// allocating or deallocating memory as much as possible. With +// std::unordered_map, an allocation/deallocation is made for every insertion +// or deletion because of the requirement that iterators remain valid even +// with insertions or deletions. This means that the hash chains will be +// implemented as linked lists. +// +// This implementation uses autovector as hash chains insteads. 
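+// A minimal usage sketch (assuming the template parameters are key type,
+// value type, and bucket count; the values here are made up):
+//
+//   HashMap<uint64_t, std::string, 16> m;
+//   m.Insert(42, "forty-two");
+//   if (m.Contains(42)) { m.Get(42) += "!"; }
+//   m.Delete(42);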
+// +template +class HashMap { + std::array, 1>, size> table_; + + public: + bool Contains(K key) { + auto& bucket = table_[key % size]; + auto it = std::find_if( + bucket.begin(), bucket.end(), + [key](const std::pair& p) { return p.first == key; }); + return it != bucket.end(); + } + + void Insert(K key, const V& value) { + auto& bucket = table_[key % size]; + bucket.push_back({key, value}); + } + + void Delete(K key) { + auto& bucket = table_[key % size]; + auto it = std::find_if( + bucket.begin(), bucket.end(), + [key](const std::pair& p) { return p.first == key; }); + if (it != bucket.end()) { + auto last = bucket.end() - 1; + if (it != last) { + *it = *last; + } + bucket.pop_back(); + } + } + + V& Get(K key) { + auto& bucket = table_[key % size]; + auto it = std::find_if( + bucket.begin(), bucket.end(), + [key](const std::pair& p) { return p.first == key; }); + return it->second; + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/hash_test.cc b/src/rocksdb/util/hash_test.cc new file mode 100644 index 000000000..72112b044 --- /dev/null +++ b/src/rocksdb/util/hash_test.cc @@ -0,0 +1,853 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/hash.h" + +#include +#include +#include + +#include "test_util/testharness.h" +#include "util/coding.h" +#include "util/coding_lean.h" +#include "util/hash128.h" +#include "util/math.h" +#include "util/math128.h" + +using ROCKSDB_NAMESPACE::BijectiveHash2x64; +using ROCKSDB_NAMESPACE::BijectiveUnhash2x64; +using ROCKSDB_NAMESPACE::DecodeFixed64; +using ROCKSDB_NAMESPACE::EncodeFixed32; +using ROCKSDB_NAMESPACE::EndianSwapValue; +using ROCKSDB_NAMESPACE::GetSliceHash64; +using ROCKSDB_NAMESPACE::Hash; +using ROCKSDB_NAMESPACE::Hash128; +using ROCKSDB_NAMESPACE::Hash2x64; +using ROCKSDB_NAMESPACE::Hash64; +using ROCKSDB_NAMESPACE::Lower32of64; +using ROCKSDB_NAMESPACE::Lower64of128; +using ROCKSDB_NAMESPACE::ReverseBits; +using ROCKSDB_NAMESPACE::Slice; +using ROCKSDB_NAMESPACE::Unsigned128; +using ROCKSDB_NAMESPACE::Upper32of64; +using ROCKSDB_NAMESPACE::Upper64of128; + +// The hash algorithm is part of the file format, for example for the Bloom +// filters. Test that the hash values are stable for a set of random strings of +// varying lengths. +TEST(HashTest, Values) { + constexpr uint32_t kSeed = 0xbc9f1d34; // Same as BloomHash. 
+ + EXPECT_EQ(Hash("", 0, kSeed), 3164544308u); + EXPECT_EQ(Hash("\x08", 1, kSeed), 422599524u); + EXPECT_EQ(Hash("\x17", 1, kSeed), 3168152998u); + EXPECT_EQ(Hash("\x9a", 1, kSeed), 3195034349u); + EXPECT_EQ(Hash("\x1c", 1, kSeed), 2651681383u); + EXPECT_EQ(Hash("\x4d\x76", 2, kSeed), 2447836956u); + EXPECT_EQ(Hash("\x52\xd5", 2, kSeed), 3854228105u); + EXPECT_EQ(Hash("\x91\xf7", 2, kSeed), 31066776u); + EXPECT_EQ(Hash("\xd6\x27", 2, kSeed), 1806091603u); + EXPECT_EQ(Hash("\x30\x46\x0b", 3, kSeed), 3808221797u); + EXPECT_EQ(Hash("\x56\xdc\xd6", 3, kSeed), 2157698265u); + EXPECT_EQ(Hash("\xd4\x52\x33", 3, kSeed), 1721992661u); + EXPECT_EQ(Hash("\x6a\xb5\xf4", 3, kSeed), 2469105222u); + EXPECT_EQ(Hash("\x67\x53\x81\x1c", 4, kSeed), 118283265u); + EXPECT_EQ(Hash("\x69\xb8\xc0\x88", 4, kSeed), 3416318611u); + EXPECT_EQ(Hash("\x1e\x84\xaf\x2d", 4, kSeed), 3315003572u); + EXPECT_EQ(Hash("\x46\xdc\x54\xbe", 4, kSeed), 447346355u); + EXPECT_EQ(Hash("\xd0\x7a\x6e\xea\x56", 5, kSeed), 4255445370u); + EXPECT_EQ(Hash("\x86\x83\xd5\xa4\xd8", 5, kSeed), 2390603402u); + EXPECT_EQ(Hash("\xb7\x46\xbb\x77\xce", 5, kSeed), 2048907743u); + EXPECT_EQ(Hash("\x6c\xa8\xbc\xe5\x99", 5, kSeed), 2177978500u); + EXPECT_EQ(Hash("\x5c\x5e\xe1\xa0\x73\x81", 6, kSeed), 1036846008u); + EXPECT_EQ(Hash("\x08\x5d\x73\x1c\xe5\x2e", 6, kSeed), 229980482u); + EXPECT_EQ(Hash("\x42\xfb\xf2\x52\xb4\x10", 6, kSeed), 3655585422u); + EXPECT_EQ(Hash("\x73\xe1\xff\x56\x9c\xce", 6, kSeed), 3502708029u); + EXPECT_EQ(Hash("\x5c\xbe\x97\x75\x54\x9a\x52", 7, kSeed), 815120748u); + EXPECT_EQ(Hash("\x16\x82\x39\x49\x88\x2b\x36", 7, kSeed), 3056033698u); + EXPECT_EQ(Hash("\x59\x77\xf0\xa7\x24\xf4\x78", 7, kSeed), 587205227u); + EXPECT_EQ(Hash("\xd3\xa5\x7c\x0e\xc0\x02\x07", 7, kSeed), 2030937252u); + EXPECT_EQ(Hash("\x31\x1b\x98\x75\x96\x22\xd3\x9a", 8, kSeed), 469635402u); + EXPECT_EQ(Hash("\x38\xd6\xf7\x28\x20\xb4\x8a\xe9", 8, kSeed), 3530274698u); + EXPECT_EQ(Hash("\xbb\x18\x5d\xf4\x12\x03\xf7\x99", 8, kSeed), 1974545809u); + EXPECT_EQ(Hash("\x80\xd4\x3b\x3b\xae\x22\xa2\x78", 8, kSeed), 3563570120u); + EXPECT_EQ(Hash("\x1a\xb5\xd0\xfe\xab\xc3\x61\xb2\x99", 9, kSeed), + 2706087434u); + EXPECT_EQ(Hash("\x8e\x4a\xc3\x18\x20\x2f\x06\xe6\x3c", 9, kSeed), + 1534654151u); + EXPECT_EQ(Hash("\xb6\xc0\xdd\x05\x3f\xc4\x86\x4c\xef", 9, kSeed), + 2355554696u); + EXPECT_EQ(Hash("\x9a\x5f\x78\x0d\xaf\x50\xe1\x1f\x55", 9, kSeed), + 1400800912u); + EXPECT_EQ(Hash("\x22\x6f\x39\x1f\xf8\xdd\x4f\x52\x17\x94", 10, kSeed), + 3420325137u); + EXPECT_EQ(Hash("\x32\x89\x2a\x75\x48\x3a\x4a\x02\x69\xdd", 10, kSeed), + 3427803584u); + EXPECT_EQ(Hash("\x06\x92\x5c\xf4\x88\x0e\x7e\x68\x38\x3e", 10, kSeed), + 1152407945u); + EXPECT_EQ(Hash("\xbd\x2c\x63\x38\xbf\xe9\x78\xb7\xbf\x15", 10, kSeed), + 3382479516u); +} + +// The hash algorithm is part of the file format, for example for the Bloom +// filters. 
+TEST(HashTest, Hash64Misc) { + constexpr uint32_t kSeed = 0; // Same as GetSliceHash64 + + for (char fill : {'\0', 'a', '1', '\xff'}) { + const size_t max_size = 1000; + const std::string str(max_size, fill); + + for (size_t size = 0; size <= max_size; ++size) { + uint64_t here = Hash64(str.data(), size, kSeed); + + // Must be same as unseeded Hash64 and GetSliceHash64 + EXPECT_EQ(here, Hash64(str.data(), size)); + EXPECT_EQ(here, GetSliceHash64(Slice(str.data(), size))); + + // Upper and Lower must reconstruct hash + EXPECT_EQ(here, (uint64_t{Upper32of64(here)} << 32) | Lower32of64(here)); + EXPECT_EQ(here, (uint64_t{Upper32of64(here)} << 32) + Lower32of64(here)); + EXPECT_EQ(here, (uint64_t{Upper32of64(here)} << 32) ^ Lower32of64(here)); + + // Seed changes hash value (with high probability) + for (uint64_t var_seed = 1; var_seed != 0; var_seed <<= 1) { + EXPECT_NE(here, Hash64(str.data(), size, var_seed)); + } + + // Size changes hash value (with high probability) + size_t max_smaller_by = std::min(size_t{30}, size); + for (size_t smaller_by = 1; smaller_by <= max_smaller_by; ++smaller_by) { + EXPECT_NE(here, Hash64(str.data(), size - smaller_by, kSeed)); + } + } + } +} + +// Test that hash values are "non-trivial" for "trivial" inputs +TEST(HashTest, Hash64Trivial) { + // Thorough test too slow for regression testing + constexpr bool thorough = false; + + // For various seeds, make sure hash of empty string is not zero. + constexpr uint64_t max_seed = thorough ? 0x1000000 : 0x10000; + for (uint64_t seed = 0; seed < max_seed; ++seed) { + uint64_t here = Hash64("", 0, seed); + EXPECT_NE(Lower32of64(here), 0u); + EXPECT_NE(Upper32of64(here), 0u); + } + + // For standard seed, make sure hash of small strings are not zero + constexpr uint32_t kSeed = 0; // Same as GetSliceHash64 + char input[4]; + constexpr int max_len = thorough ? 3 : 2; + for (int len = 1; len <= max_len; ++len) { + for (uint32_t i = 0; (i >> (len * 8)) == 0; ++i) { + EncodeFixed32(input, i); + uint64_t here = Hash64(input, len, kSeed); + EXPECT_NE(Lower32of64(here), 0u); + EXPECT_NE(Upper32of64(here), 0u); + } + } +} + +// Test that the hash values are stable for a set of random strings of +// varying small lengths. 
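// A self-contained sketch of the split/reconstruct identity exercised by
// Hash64Misc above. The local helpers mirror what Upper32of64/Lower32of64 are
// expected to do (take the upper/lower 32 bits); they are illustrative
// stand-ins, not the library definitions:
#include <cassert>
#include <cstdint>

namespace {
inline uint32_t SketchLower32(uint64_t v) { return static_cast<uint32_t>(v); }
inline uint32_t SketchUpper32(uint64_t v) {
  return static_cast<uint32_t>(v >> 32);
}

void SplitReconstructSketch(uint64_t h) {
  // The halves occupy disjoint bit ranges, so OR, ADD and XOR all reassemble
  // the original value, exactly as the EXPECT_EQs above assert.
  const uint64_t hi = uint64_t{SketchUpper32(h)} << 32;
  const uint64_t lo = SketchLower32(h);
  assert(h == (hi | lo));
  assert(h == (hi + lo));
  assert(h == (hi ^ lo));
}
}  // namespace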
+TEST(HashTest, Hash64SmallValueSchema) { + constexpr uint32_t kSeed = 0; // Same as GetSliceHash64 + + EXPECT_EQ(Hash64("", 0, kSeed), uint64_t{5999572062939766020u}); + EXPECT_EQ(Hash64("\x08", 1, kSeed), uint64_t{583283813901344696u}); + EXPECT_EQ(Hash64("\x17", 1, kSeed), uint64_t{16175549975585474943u}); + EXPECT_EQ(Hash64("\x9a", 1, kSeed), uint64_t{16322991629225003903u}); + EXPECT_EQ(Hash64("\x1c", 1, kSeed), uint64_t{13269285487706833447u}); + EXPECT_EQ(Hash64("\x4d\x76", 2, kSeed), uint64_t{6859542833406258115u}); + EXPECT_EQ(Hash64("\x52\xd5", 2, kSeed), uint64_t{4919611532550636959u}); + EXPECT_EQ(Hash64("\x91\xf7", 2, kSeed), uint64_t{14199427467559720719u}); + EXPECT_EQ(Hash64("\xd6\x27", 2, kSeed), uint64_t{12292689282614532691u}); + EXPECT_EQ(Hash64("\x30\x46\x0b", 3, kSeed), uint64_t{11404699285340020889u}); + EXPECT_EQ(Hash64("\x56\xdc\xd6", 3, kSeed), uint64_t{12404347133785524237u}); + EXPECT_EQ(Hash64("\xd4\x52\x33", 3, kSeed), uint64_t{15853805298481534034u}); + EXPECT_EQ(Hash64("\x6a\xb5\xf4", 3, kSeed), uint64_t{16863488758399383382u}); + EXPECT_EQ(Hash64("\x67\x53\x81\x1c", 4, kSeed), + uint64_t{9010661983527562386u}); + EXPECT_EQ(Hash64("\x69\xb8\xc0\x88", 4, kSeed), + uint64_t{6611781377647041447u}); + EXPECT_EQ(Hash64("\x1e\x84\xaf\x2d", 4, kSeed), + uint64_t{15290969111616346501u}); + EXPECT_EQ(Hash64("\x46\xdc\x54\xbe", 4, kSeed), + uint64_t{7063754590279313623u}); + EXPECT_EQ(Hash64("\xd0\x7a\x6e\xea\x56", 5, kSeed), + uint64_t{6384167718754869899u}); + EXPECT_EQ(Hash64("\x86\x83\xd5\xa4\xd8", 5, kSeed), + uint64_t{16874407254108011067u}); + EXPECT_EQ(Hash64("\xb7\x46\xbb\x77\xce", 5, kSeed), + uint64_t{16809880630149135206u}); + EXPECT_EQ(Hash64("\x6c\xa8\xbc\xe5\x99", 5, kSeed), + uint64_t{1249038833153141148u}); + EXPECT_EQ(Hash64("\x5c\x5e\xe1\xa0\x73\x81", 6, kSeed), + uint64_t{17358142495308219330u}); + EXPECT_EQ(Hash64("\x08\x5d\x73\x1c\xe5\x2e", 6, kSeed), + uint64_t{4237646583134806322u}); + EXPECT_EQ(Hash64("\x42\xfb\xf2\x52\xb4\x10", 6, kSeed), + uint64_t{4373664924115234051u}); + EXPECT_EQ(Hash64("\x73\xe1\xff\x56\x9c\xce", 6, kSeed), + uint64_t{12012981210634596029u}); + EXPECT_EQ(Hash64("\x5c\xbe\x97\x75\x54\x9a\x52", 7, kSeed), + uint64_t{5716522398211028826u}); + EXPECT_EQ(Hash64("\x16\x82\x39\x49\x88\x2b\x36", 7, kSeed), + uint64_t{15604531309862565013u}); + EXPECT_EQ(Hash64("\x59\x77\xf0\xa7\x24\xf4\x78", 7, kSeed), + uint64_t{8601330687345614172u}); + EXPECT_EQ(Hash64("\xd3\xa5\x7c\x0e\xc0\x02\x07", 7, kSeed), + uint64_t{8088079329364056942u}); + EXPECT_EQ(Hash64("\x31\x1b\x98\x75\x96\x22\xd3\x9a", 8, kSeed), + uint64_t{9844314944338447628u}); + EXPECT_EQ(Hash64("\x38\xd6\xf7\x28\x20\xb4\x8a\xe9", 8, kSeed), + uint64_t{10973293517982163143u}); + EXPECT_EQ(Hash64("\xbb\x18\x5d\xf4\x12\x03\xf7\x99", 8, kSeed), + uint64_t{9986007080564743219u}); + EXPECT_EQ(Hash64("\x80\xd4\x3b\x3b\xae\x22\xa2\x78", 8, kSeed), + uint64_t{1729303145008254458u}); + EXPECT_EQ(Hash64("\x1a\xb5\xd0\xfe\xab\xc3\x61\xb2\x99", 9, kSeed), + uint64_t{13253403748084181481u}); + EXPECT_EQ(Hash64("\x8e\x4a\xc3\x18\x20\x2f\x06\xe6\x3c", 9, kSeed), + uint64_t{7768754303876232188u}); + EXPECT_EQ(Hash64("\xb6\xc0\xdd\x05\x3f\xc4\x86\x4c\xef", 9, kSeed), + uint64_t{12439346786701492u}); + EXPECT_EQ(Hash64("\x9a\x5f\x78\x0d\xaf\x50\xe1\x1f\x55", 9, kSeed), + uint64_t{10841838338450144690u}); + EXPECT_EQ(Hash64("\x22\x6f\x39\x1f\xf8\xdd\x4f\x52\x17\x94", 10, kSeed), + uint64_t{12883919702069153152u}); + EXPECT_EQ(Hash64("\x32\x89\x2a\x75\x48\x3a\x4a\x02\x69\xdd", 10, kSeed), + 
uint64_t{12692903507676842188u}); + EXPECT_EQ(Hash64("\x06\x92\x5c\xf4\x88\x0e\x7e\x68\x38\x3e", 10, kSeed), + uint64_t{6540985900674032620u}); + EXPECT_EQ(Hash64("\xbd\x2c\x63\x38\xbf\xe9\x78\xb7\xbf\x15", 10, kSeed), + uint64_t{10551812464348219044u}); +} + +std::string Hash64TestDescriptor(const char *repeat, size_t limit) { + const char *mod61_encode = + "abcdefghijklmnopqrstuvwxyz123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + + std::string input; + while (input.size() < limit) { + input.append(repeat); + } + std::string rv; + for (size_t i = 0; i < limit; ++i) { + uint64_t h = GetSliceHash64(Slice(input.data(), i)); + rv.append(1, mod61_encode[static_cast(h % 61)]); + } + return rv; +} + +// XXPH3 changes its algorithm for various sizes up through 250 bytes, so +// we need to check the stability of larger sizes also. +TEST(HashTest, Hash64LargeValueSchema) { + // Each of these derives a "descriptor" from the hash values for all + // lengths up to 430. + // Note that "c" is common for the zero-length string. + EXPECT_EQ( + Hash64TestDescriptor("foo", 430), + "cRhyWsY67B6klRA1udmOuiYuX7IthyGBKqbeosz2hzVglWCmQx8nEdnpkvPfYX56Up2OWOTV" + "lTzfAoYwvtqKzjD8E9xttR2unelbXbIV67NUe6bOO23BxaSFRcA3njGu5cUWfgwOqNoTsszp" + "uPvKRP6qaUR5VdoBkJUCFIefd7edlNK5mv6JYWaGdwxehg65hTkTmjZoPKxTZo4PLyzbL9U4" + "xt12ITSfeP2MfBHuLI2z2pDlBb44UQKVMx27LEoAHsdLp3WfWfgH3sdRBRCHm33UxCM4QmE2" + "xJ7gqSvNwTeH7v9GlC8zWbGroyD3UVNeShMLx29O7tH1biemLULwAHyIw8zdtLMDpEJ8m2ic" + "l6Lb4fDuuFNAs1GCVUthjK8CV8SWI8Rsz5THSwn5CGhpqUwSZcFknjwWIl5rNCvDxXJqYr"); + // Note that "1EeRk" is common for "Rocks" + EXPECT_EQ( + Hash64TestDescriptor("Rocks", 430), + "c1EeRkrzgOYWLA8PuhJrwTePJewoB44WdXYDfhbk3ZxTqqg25WlPExDl7IKIQLJvnA6gJxxn" + "9TCSLkFGfJeXehaSS1GBqWSzfhEH4VXiXIUCuxJXxtKXcSC6FrNIQGTZbYDiUOLD6Y5inzrF" + "9etwQhXUBanw55xAUdNMFQAm2GjJ6UDWp2mISLiMMkLjANWMKLaZMqaFLX37qB4MRO1ooVRv" + "zSvaNRSCLxlggQCasQq8icWjzf3HjBlZtU6pd4rkaUxSzHqmo9oM5MghbU5Rtxg8wEfO7lVN" + "5wdMONYecslQTwjZUpO1K3LDf3K3XK6sUXM6ShQQ3RHmMn2acB4YtTZ3QQcHYJSOHn2DuWpa" + "Q8RqzX5lab92YmOLaCdOHq1BPsM7SIBzMdLgePNsJ1vvMALxAaoDUHPxoFLO2wx18IXnyX"); + EXPECT_EQ( + Hash64TestDescriptor("RocksDB", 430), + "c1EeRkukbkb28wLTahwD2sfUhZzaBEnF8SVrxnPVB6A7b8CaAl3UKsDZISF92GSq2wDCukOq" + "Jgrsp7A3KZhDiLW8dFXp8UPqPxMCRlMdZeVeJ2dJxrmA6cyt99zkQFj7ELbut6jAeVqARFnw" + "fnWVXOsaLrq7bDCbMcns2DKvTaaqTCLMYxI7nhtLpFN1jR755FRQFcOzrrDbh7QhypjdvlYw" + "cdAMSZgp9JMHxbM23wPSuH6BOFgxejz35PScZfhDPvTOxIy1jc3MZsWrMC3P324zNolO7JdW" + "CX2I5UDKjjaEJfxbgVgJIXxtQGlmj2xkO5sPpjULQV4X2HlY7FQleJ4QRaJIB4buhCA4vUTF" + "eMFlxCIYUpTCsal2qsmnGOWa8WCcefrohMjDj1fjzSvSaQwlpyR1GZHF2uPOoQagiCpHpm"); +} + +TEST(HashTest, Hash128Misc) { + constexpr uint32_t kSeed = 0; // Same as GetSliceHash128 + + for (char fill : {'\0', 'a', '1', '\xff', 'e'}) { + const size_t max_size = 1000; + std::string str(max_size, fill); + + if (fill == 'e') { + // Use different characters to check endianness handling + for (size_t i = 0; i < str.size(); ++i) { + str[i] += static_cast(i); + } + } + + for (size_t size = 0; size <= max_size; ++size) { + Unsigned128 here = Hash128(str.data(), size, kSeed); + + // Must be same as unseeded Hash128 and GetSliceHash128 + EXPECT_EQ(here, Hash128(str.data(), size)); + EXPECT_EQ(here, GetSliceHash128(Slice(str.data(), size))); + { + uint64_t hi, lo; + Hash2x64(str.data(), size, &hi, &lo); + EXPECT_EQ(Lower64of128(here), lo); + EXPECT_EQ(Upper64of128(here), hi); + } + if (size == 16) { + const uint64_t in_hi = DecodeFixed64(str.data() + 8); + const uint64_t in_lo = DecodeFixed64(str.data()); + uint64_t hi, lo; + 
BijectiveHash2x64(in_hi, in_lo, &hi, &lo); + EXPECT_EQ(Lower64of128(here), lo); + EXPECT_EQ(Upper64of128(here), hi); + uint64_t un_hi, un_lo; + BijectiveUnhash2x64(hi, lo, &un_hi, &un_lo); + EXPECT_EQ(in_lo, un_lo); + EXPECT_EQ(in_hi, un_hi); + } + + // Upper and Lower must reconstruct hash + EXPECT_EQ(here, + (Unsigned128{Upper64of128(here)} << 64) | Lower64of128(here)); + EXPECT_EQ(here, + (Unsigned128{Upper64of128(here)} << 64) ^ Lower64of128(here)); + + // Seed changes hash value (with high probability) + for (uint64_t var_seed = 1; var_seed != 0; var_seed <<= 1) { + Unsigned128 seeded = Hash128(str.data(), size, var_seed); + EXPECT_NE(here, seeded); + // Must match seeded Hash2x64 + { + uint64_t hi, lo; + Hash2x64(str.data(), size, var_seed, &hi, &lo); + EXPECT_EQ(Lower64of128(seeded), lo); + EXPECT_EQ(Upper64of128(seeded), hi); + } + if (size == 16) { + const uint64_t in_hi = DecodeFixed64(str.data() + 8); + const uint64_t in_lo = DecodeFixed64(str.data()); + uint64_t hi, lo; + BijectiveHash2x64(in_hi, in_lo, var_seed, &hi, &lo); + EXPECT_EQ(Lower64of128(seeded), lo); + EXPECT_EQ(Upper64of128(seeded), hi); + uint64_t un_hi, un_lo; + BijectiveUnhash2x64(hi, lo, var_seed, &un_hi, &un_lo); + EXPECT_EQ(in_lo, un_lo); + EXPECT_EQ(in_hi, un_hi); + } + } + + // Size changes hash value (with high probability) + size_t max_smaller_by = std::min(size_t{30}, size); + for (size_t smaller_by = 1; smaller_by <= max_smaller_by; ++smaller_by) { + EXPECT_NE(here, Hash128(str.data(), size - smaller_by, kSeed)); + } + } + } +} + +// Test that hash values are "non-trivial" for "trivial" inputs +TEST(HashTest, Hash128Trivial) { + // Thorough test too slow for regression testing + constexpr bool thorough = false; + + // For various seeds, make sure hash of empty string is not zero. + constexpr uint64_t max_seed = thorough ? 0x1000000 : 0x10000; + for (uint64_t seed = 0; seed < max_seed; ++seed) { + Unsigned128 here = Hash128("", 0, seed); + EXPECT_NE(Lower64of128(here), 0u); + EXPECT_NE(Upper64of128(here), 0u); + } + + // For standard seed, make sure hash of small strings are not zero + constexpr uint32_t kSeed = 0; // Same as GetSliceHash128 + char input[4]; + constexpr int max_len = thorough ? 3 : 2; + for (int len = 1; len <= max_len; ++len) { + for (uint32_t i = 0; (i >> (len * 8)) == 0; ++i) { + EncodeFixed32(input, i); + Unsigned128 here = Hash128(input, len, kSeed); + EXPECT_NE(Lower64of128(here), 0u); + EXPECT_NE(Upper64of128(here), 0u); + } + } +} + +std::string Hash128TestDescriptor(const char *repeat, size_t limit) { + const char *mod61_encode = + "abcdefghijklmnopqrstuvwxyz123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + + std::string input; + while (input.size() < limit) { + input.append(repeat); + } + std::string rv; + for (size_t i = 0; i < limit; ++i) { + auto h = GetSliceHash128(Slice(input.data(), i)); + uint64_t h2 = Upper64of128(h) + Lower64of128(h); + rv.append(1, mod61_encode[static_cast(h2 % 61)]); + } + return rv; +} + +// XXH3 changes its algorithm for various sizes up through 250 bytes, so +// we need to check the stability of larger sizes also. +TEST(HashTest, Hash128ValueSchema) { + // Each of these derives a "descriptor" from the hash values for all + // lengths up to 430. + // Note that "b" is common for the zero-length string. 
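// A sketch of the "descriptor" technique used by Hash64TestDescriptor and
// Hash128TestDescriptor above, re-expressed generically. `hash_fn` is a
// placeholder for any 64-bit hash of (data, length); the 61-character
// alphabet matches the tests:
#include <cstddef>
#include <cstdint>
#include <functional>
#include <string>

std::string DescriptorSketch(
    const std::string& repeat, size_t limit,
    const std::function<uint64_t(const char*, size_t)>& hash_fn) {
  static const char* kMod61 =
      "abcdefghijklmnopqrstuvwxyz123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  std::string input;
  while (input.size() < limit) {
    input.append(repeat);
  }
  std::string rv;
  for (size_t i = 0; i < limit; ++i) {
    // One character per prefix length: any change to the hash algorithm for
    // any input length shows up as a diff in the descriptor string.
    uint64_t h = hash_fn(input.data(), i);
    rv.push_back(kMod61[static_cast<size_t>(h % 61)]);
  }
  return rv;
}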
+ EXPECT_EQ( + Hash128TestDescriptor("foo", 430), + "bUMA3As8n9I4vNGhThXlEevxZlyMcbb6TYAlIKJ2f5ponsv99q962rYclQ7u3gfnRdCDQ5JI" + "2LrGUaCycbXrvLFe4SjgRb9RQwCfrnmNQ7VSEwSKMnkGCK3bDbXSrnIh5qLXdtvIZklbJpGH" + "Dqr93BlqF9ubTnOSYkSdx89XvQqflMIW8bjfQp9BPjQejWOeEQspnN1D3sfgVdFhpaQdHYA5" + "pI2XcPlCMFPxvrFuRr7joaDvjNe9IUZaunLPMewuXmC3EL95h52Ju3D7y9RNKhgYxMTrA84B" + "yJrMvyjdm3vlBxet4EN7v2GEyjbGuaZW9UL6lrX6PghJDg7ACfLGdxNbH3qXM4zaiG2RKnL5" + "S3WXKR78RBB5fRFQ8KDIEQjHFvSNsc3GrAEi6W8P2lv8JMTzjBODO2uN4wadVQFT9wpGfV"); + // Note that "35D2v" is common for "Rocks" + EXPECT_EQ( + Hash128TestDescriptor("Rocks", 430), + "b35D2vzvklFVDqJmyLRXyApwGGO3EAT3swhe8XJAN3mY2UVPglzdmydxcba6JI2tSvwO6zSu" + "ANpjSM7tc9G5iMhsa7R8GfyCXRO1TnLg7HvdWNdgGGBirxZR68BgT7TQsYJt6zyEyISeXI1n" + "MXA48Xo7dWfJeYN6Z4KWlqZY7TgFXGbks9AX4ehZNSGtIhdO5i58qlgVX1bEejeOVaCcjC79" + "67DrMfOKds7rUQzjBa77sMPcoPW1vu6ljGJPZH3XkRyDMZ1twxXKkNxN3tE8nR7JHwyqBAxE" + "fTcjbOWrLZ1irWxRSombD8sGDEmclgF11IxqEhe3Rt7gyofO3nExGckKkS9KfRqsCHbiUyva" + "JGkJwUHRXaZnh58b4i1Ei9aQKZjXlvIVDixoZrjcNaH5XJIJlRZce9Z9t82wYapTpckYSg"); + EXPECT_EQ( + Hash128TestDescriptor("RocksDB", 430), + "b35D2vFUst3XDZCRlSrhmYYakmqImV97LbBsV6EZlOEQpUPH1d1sD3xMKAPlA5UErHehg5O7" + "n966fZqhAf3hRc24kGCLfNAWjyUa7vSNOx3IcPoTyVRFZeFlcCtfl7t1QJumHOCpS33EBmBF" + "hvK13QjBbDWYWeHQhJhgV9Mqbx17TIcvUkEnYZxb8IzWNmjVsJG44Z7v52DjGj1ZzS62S2Vv" + "qWcDO7apvH5VHg68E9Wl6nXP21vlmUqEH9GeWRehfWVvY7mUpsAg5drHHQyDSdiMceiUuUxJ" + "XJqHFcDdzbbPk7xDvbLgWCKvH8k3MpQNWOmbSSRDdAP6nGlDjoTToYkcqVREHJzztSWAAq5h" + "GHSUNJ6OxsMHhf8EhXfHtKyUzRmPtjYyeckQcGmrQfFFLidc6cjMDKCdBG6c6HVBrS7H2R"); +} + +TEST(FastRange32Test, Values) { + using ROCKSDB_NAMESPACE::FastRange32; + // Zero range + EXPECT_EQ(FastRange32(0, 0), 0U); + EXPECT_EQ(FastRange32(123, 0), 0U); + EXPECT_EQ(FastRange32(0xffffffff, 0), 0U); + + // One range + EXPECT_EQ(FastRange32(0, 1), 0U); + EXPECT_EQ(FastRange32(123, 1), 0U); + EXPECT_EQ(FastRange32(0xffffffff, 1), 0U); + + // Two range + EXPECT_EQ(FastRange32(0, 2), 0U); + EXPECT_EQ(FastRange32(123, 2), 0U); + EXPECT_EQ(FastRange32(0x7fffffff, 2), 0U); + EXPECT_EQ(FastRange32(0x80000000, 2), 1U); + EXPECT_EQ(FastRange32(0xffffffff, 2), 1U); + + // Seven range + EXPECT_EQ(FastRange32(0, 7), 0U); + EXPECT_EQ(FastRange32(123, 7), 0U); + EXPECT_EQ(FastRange32(613566756, 7), 0U); + EXPECT_EQ(FastRange32(613566757, 7), 1U); + EXPECT_EQ(FastRange32(1227133513, 7), 1U); + EXPECT_EQ(FastRange32(1227133514, 7), 2U); + // etc. 
+ EXPECT_EQ(FastRange32(0xffffffff, 7), 6U); + + // Big + EXPECT_EQ(FastRange32(1, 0x80000000), 0U); + EXPECT_EQ(FastRange32(2, 0x80000000), 1U); + EXPECT_EQ(FastRange32(4, 0x7fffffff), 1U); + EXPECT_EQ(FastRange32(4, 0x80000000), 2U); + EXPECT_EQ(FastRange32(0xffffffff, 0x7fffffff), 0x7ffffffeU); + EXPECT_EQ(FastRange32(0xffffffff, 0x80000000), 0x7fffffffU); +} + +TEST(FastRange64Test, Values) { + using ROCKSDB_NAMESPACE::FastRange64; + // Zero range + EXPECT_EQ(FastRange64(0, 0), 0U); + EXPECT_EQ(FastRange64(123, 0), 0U); + EXPECT_EQ(FastRange64(0xffffFFFF, 0), 0U); + EXPECT_EQ(FastRange64(0xffffFFFFffffFFFF, 0), 0U); + + // One range + EXPECT_EQ(FastRange64(0, 1), 0U); + EXPECT_EQ(FastRange64(123, 1), 0U); + EXPECT_EQ(FastRange64(0xffffFFFF, 1), 0U); + EXPECT_EQ(FastRange64(0xffffFFFFffffFFFF, 1), 0U); + + // Two range + EXPECT_EQ(FastRange64(0, 2), 0U); + EXPECT_EQ(FastRange64(123, 2), 0U); + EXPECT_EQ(FastRange64(0xffffFFFF, 2), 0U); + EXPECT_EQ(FastRange64(0x7fffFFFFffffFFFF, 2), 0U); + EXPECT_EQ(FastRange64(0x8000000000000000, 2), 1U); + EXPECT_EQ(FastRange64(0xffffFFFFffffFFFF, 2), 1U); + + // Seven range + EXPECT_EQ(FastRange64(0, 7), 0U); + EXPECT_EQ(FastRange64(123, 7), 0U); + EXPECT_EQ(FastRange64(0xffffFFFF, 7), 0U); + EXPECT_EQ(FastRange64(2635249153387078802, 7), 0U); + EXPECT_EQ(FastRange64(2635249153387078803, 7), 1U); + EXPECT_EQ(FastRange64(5270498306774157604, 7), 1U); + EXPECT_EQ(FastRange64(5270498306774157605, 7), 2U); + EXPECT_EQ(FastRange64(0x7fffFFFFffffFFFF, 7), 3U); + EXPECT_EQ(FastRange64(0x8000000000000000, 7), 3U); + EXPECT_EQ(FastRange64(0xffffFFFFffffFFFF, 7), 6U); + + // Big but 32-bit range + EXPECT_EQ(FastRange64(0x100000000, 0x80000000), 0U); + EXPECT_EQ(FastRange64(0x200000000, 0x80000000), 1U); + EXPECT_EQ(FastRange64(0x400000000, 0x7fffFFFF), 1U); + EXPECT_EQ(FastRange64(0x400000000, 0x80000000), 2U); + EXPECT_EQ(FastRange64(0xffffFFFFffffFFFF, 0x7fffFFFF), 0x7fffFFFEU); + EXPECT_EQ(FastRange64(0xffffFFFFffffFFFF, 0x80000000), 0x7fffFFFFU); + + // Big, > 32-bit range +#if SIZE_MAX == UINT64_MAX + EXPECT_EQ(FastRange64(0x7fffFFFFffffFFFF, 0x4200000002), 0x2100000000U); + EXPECT_EQ(FastRange64(0x8000000000000000, 0x4200000002), 0x2100000001U); + + EXPECT_EQ(FastRange64(0x0000000000000000, 420000000002), 0U); + EXPECT_EQ(FastRange64(0x7fffFFFFffffFFFF, 420000000002), 210000000000U); + EXPECT_EQ(FastRange64(0x8000000000000000, 420000000002), 210000000001U); + EXPECT_EQ(FastRange64(0xffffFFFFffffFFFF, 420000000002), 420000000001U); + + EXPECT_EQ(FastRange64(0xffffFFFFffffFFFF, 0xffffFFFFffffFFFF), + 0xffffFFFFffffFFFEU); +#endif +} + +TEST(FastRangeGenericTest, Values) { + using ROCKSDB_NAMESPACE::FastRangeGeneric; + // Generic (including big and small) + // Note that FastRangeGeneric is also tested indirectly above via + // FastRange32 and FastRange64. 
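// The expectations above are consistent with the classic "fastrange"
// multiply-shift mapping, shown here as an illustrative standalone sketch
// (not a copy of util/fastrange.h): a uniformly distributed 32-bit hash is
// mapped onto [0, range) without a division, and FastRange64 is the
// analogous 64x64->128-bit version.
#include <cstdint>

inline uint32_t FastRange32Sketch(uint32_t hash, uint32_t range) {
  // e.g. FastRange32Sketch(0x80000000, 2) == (0x80000000ULL * 2) >> 32 == 1
  return static_cast<uint32_t>((uint64_t{hash} * uint64_t{range}) >> 32);
}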
+ EXPECT_EQ( + FastRangeGeneric(uint64_t{0x8000000000000000}, uint64_t{420000000002}), + uint64_t{210000000001}); + EXPECT_EQ(FastRangeGeneric(uint64_t{0x8000000000000000}, uint16_t{12468}), + uint16_t{6234}); + EXPECT_EQ(FastRangeGeneric(uint32_t{0x80000000}, uint16_t{12468}), + uint16_t{6234}); + // Not recommended for typical use because for example this could fail on + // some platforms and pass on others: + // EXPECT_EQ(FastRangeGeneric(static_cast(0x80000000), + // uint16_t{12468}), + // uint16_t{6234}); +} + +// for inspection of disassembly +uint32_t FastRange32(uint32_t hash, uint32_t range) { + return ROCKSDB_NAMESPACE::FastRange32(hash, range); +} + +// for inspection of disassembly +size_t FastRange64(uint64_t hash, size_t range) { + return ROCKSDB_NAMESPACE::FastRange64(hash, range); +} + +// Tests for math.h / math128.h (not worth a separate test binary) +using ROCKSDB_NAMESPACE::BitParity; +using ROCKSDB_NAMESPACE::BitsSetToOne; +using ROCKSDB_NAMESPACE::ConstexprFloorLog2; +using ROCKSDB_NAMESPACE::CountTrailingZeroBits; +using ROCKSDB_NAMESPACE::DecodeFixed128; +using ROCKSDB_NAMESPACE::DecodeFixedGeneric; +using ROCKSDB_NAMESPACE::DownwardInvolution; +using ROCKSDB_NAMESPACE::EncodeFixed128; +using ROCKSDB_NAMESPACE::EncodeFixedGeneric; +using ROCKSDB_NAMESPACE::FloorLog2; +using ROCKSDB_NAMESPACE::Lower64of128; +using ROCKSDB_NAMESPACE::Multiply64to128; +using ROCKSDB_NAMESPACE::Unsigned128; +using ROCKSDB_NAMESPACE::Upper64of128; + +int blah(int x) { return DownwardInvolution(x); } + +template +static void test_BitOps() { + // This complex code is to generalize to 128-bit values. Otherwise + // we could just use = static_cast(0x5555555555555555ULL); + T everyOtherBit = 0; + for (unsigned i = 0; i < sizeof(T); ++i) { + everyOtherBit = (everyOtherBit << 8) | T{0x55}; + } + + // This one built using bit operations, as our 128-bit layer + // might not implement arithmetic such as subtraction. 
+ T vm1 = 0; // "v minus one" + + for (int i = 0; i < int{8 * sizeof(T)}; ++i) { + T v = T{1} << i; + // If we could directly use arithmetic: + // T vm1 = static_cast(v - 1); + + // FloorLog2 + if (v > 0) { + EXPECT_EQ(FloorLog2(v), i); + EXPECT_EQ(ConstexprFloorLog2(v), i); + } + if (vm1 > 0) { + EXPECT_EQ(FloorLog2(vm1), i - 1); + EXPECT_EQ(ConstexprFloorLog2(vm1), i - 1); + EXPECT_EQ(FloorLog2(everyOtherBit & vm1), (i - 1) & ~1); + EXPECT_EQ(ConstexprFloorLog2(everyOtherBit & vm1), (i - 1) & ~1); + } + + // CountTrailingZeroBits + if (v != 0) { + EXPECT_EQ(CountTrailingZeroBits(v), i); + } + if (vm1 != 0) { + EXPECT_EQ(CountTrailingZeroBits(vm1), 0); + } + if (i < int{8 * sizeof(T)} - 1) { + EXPECT_EQ(CountTrailingZeroBits(~vm1 & everyOtherBit), (i + 1) & ~1); + } + + // BitsSetToOne + EXPECT_EQ(BitsSetToOne(v), 1); + EXPECT_EQ(BitsSetToOne(vm1), i); + EXPECT_EQ(BitsSetToOne(vm1 & everyOtherBit), (i + 1) / 2); + + // BitParity + EXPECT_EQ(BitParity(v), 1); + EXPECT_EQ(BitParity(vm1), i & 1); + EXPECT_EQ(BitParity(vm1 & everyOtherBit), ((i + 1) / 2) & 1); + + // EndianSwapValue + T ev = T{1} << (((sizeof(T) - 1 - (i / 8)) * 8) + i % 8); + EXPECT_EQ(EndianSwapValue(v), ev); + + // ReverseBits + EXPECT_EQ(ReverseBits(v), static_cast(T{1} << (8 * sizeof(T) - 1 - i))); +#ifdef HAVE_UINT128_EXTENSION // Uses multiplication + if (std::is_unsigned::value) { // Technical UB on signed type + T rv = T{1} << (8 * sizeof(T) - 1 - i); + EXPECT_EQ(ReverseBits(vm1), static_cast(rv * ~T{1})); + } +#endif + + // DownwardInvolution + { + T misc = static_cast(/*random*/ 0xc682cd153d0e3279U + + i * /*random*/ 0x9b3972f3bea0baa3U); + if constexpr (sizeof(T) > 8) { + misc = (misc << 64) | (/*random*/ 0x52af031a38ced62dU + + i * /*random*/ 0x936f803d9752ddc3U); + } + T misc_masked = misc & vm1; + EXPECT_LE(misc_masked, vm1); + T di_misc_masked = DownwardInvolution(misc_masked); + EXPECT_LE(di_misc_masked, vm1); + if (misc_masked > 0) { + // Highest-order 1 in same position + EXPECT_EQ(FloorLog2(misc_masked), FloorLog2(di_misc_masked)); + } + // Validate involution property on short value + EXPECT_EQ(DownwardInvolution(di_misc_masked), misc_masked); + + // Validate involution property on large value + T di_misc = DownwardInvolution(misc); + EXPECT_EQ(DownwardInvolution(di_misc), misc); + // Highest-order 1 in same position + if (misc > 0) { + EXPECT_EQ(FloorLog2(misc), FloorLog2(di_misc)); + } + + // Validate distributes over xor. + // static_casts to avoid numerical promotion effects. 
+ EXPECT_EQ(DownwardInvolution(static_cast(misc_masked ^ vm1)), + static_cast(di_misc_masked ^ DownwardInvolution(vm1))); + T misc2 = static_cast(misc >> 1); + EXPECT_EQ(DownwardInvolution(static_cast(misc ^ misc2)), + static_cast(di_misc ^ DownwardInvolution(misc2))); + + // Choose some small number of bits to pull off to test combined + // uniqueness guarantee + int in_bits = i % 7; + unsigned in_mask = (unsigned{1} << in_bits) - 1U; + // IMPLICIT: int out_bits = 8 - in_bits; + std::vector seen(256, false); + for (int j = 0; j < 255; ++j) { + T t_in = misc ^ static_cast(j); + unsigned in = static_cast(t_in); + unsigned out = static_cast(DownwardInvolution(t_in)); + unsigned val = ((out << in_bits) | (in & in_mask)) & 255U; + EXPECT_FALSE(seen[val]); + seen[val] = true; + } + + if (i + 8 < int{8 * sizeof(T)}) { + // Also test manipulating bits in the middle of input is + // bijective in bottom of output + seen = std::vector(256, false); + for (int j = 0; j < 255; ++j) { + T in = misc ^ (static_cast(j) << i); + unsigned val = static_cast(DownwardInvolution(in)) & 255U; + EXPECT_FALSE(seen[val]); + seen[val] = true; + } + } + } + + vm1 = (vm1 << 1) | 1; + } + + EXPECT_EQ(ConstexprFloorLog2(T{1}), 0); + EXPECT_EQ(ConstexprFloorLog2(T{2}), 1); + EXPECT_EQ(ConstexprFloorLog2(T{3}), 1); + EXPECT_EQ(ConstexprFloorLog2(T{42}), 5); +} + +TEST(MathTest, BitOps) { + test_BitOps(); + test_BitOps(); + test_BitOps(); + test_BitOps(); + test_BitOps(); + test_BitOps(); + test_BitOps(); + test_BitOps(); + test_BitOps(); + test_BitOps(); + test_BitOps(); + test_BitOps(); + test_BitOps(); + test_BitOps(); + test_BitOps(); + test_BitOps(); + test_BitOps(); + test_BitOps(); + test_BitOps(); + test_BitOps(); + test_BitOps(); +} + +TEST(MathTest, BitOps128) { test_BitOps(); } + +TEST(MathTest, Math128) { + const Unsigned128 sixteenHexOnes = 0x1111111111111111U; + const Unsigned128 thirtyHexOnes = (sixteenHexOnes << 56) | sixteenHexOnes; + const Unsigned128 sixteenHexTwos = 0x2222222222222222U; + const Unsigned128 thirtyHexTwos = (sixteenHexTwos << 56) | sixteenHexTwos; + + // v will slide from all hex ones to all hex twos + Unsigned128 v = thirtyHexOnes; + for (int i = 0; i <= 30; ++i) { + // Test bitwise operations + EXPECT_EQ(BitsSetToOne(v), 30); + EXPECT_EQ(BitsSetToOne(~v), 128 - 30); + EXPECT_EQ(BitsSetToOne(v & thirtyHexOnes), 30 - i); + EXPECT_EQ(BitsSetToOne(v | thirtyHexOnes), 30 + i); + EXPECT_EQ(BitsSetToOne(v ^ thirtyHexOnes), 2 * i); + EXPECT_EQ(BitsSetToOne(v & thirtyHexTwos), i); + EXPECT_EQ(BitsSetToOne(v | thirtyHexTwos), 60 - i); + EXPECT_EQ(BitsSetToOne(v ^ thirtyHexTwos), 60 - 2 * i); + + // Test comparisons + EXPECT_EQ(v == thirtyHexOnes, i == 0); + EXPECT_EQ(v == thirtyHexTwos, i == 30); + EXPECT_EQ(v > thirtyHexOnes, i > 0); + EXPECT_EQ(v > thirtyHexTwos, false); + EXPECT_EQ(v >= thirtyHexOnes, true); + EXPECT_EQ(v >= thirtyHexTwos, i == 30); + EXPECT_EQ(v < thirtyHexOnes, false); + EXPECT_EQ(v < thirtyHexTwos, i < 30); + EXPECT_EQ(v <= thirtyHexOnes, i == 0); + EXPECT_EQ(v <= thirtyHexTwos, true); + + // Update v, clearing upper-most byte + v = ((v << 12) >> 8) | 0x2; + } + + for (int i = 0; i < 128; ++i) { + // Test shifts + Unsigned128 sl = thirtyHexOnes << i; + Unsigned128 sr = thirtyHexOnes >> i; + EXPECT_EQ(BitsSetToOne(sl), std::min(30, 32 - i / 4)); + EXPECT_EQ(BitsSetToOne(sr), std::max(0, 30 - (i + 3) / 4)); + EXPECT_EQ(BitsSetToOne(sl & sr), i % 2 ? 
0 : std::max(0, 30 - i / 2)); + } + + // Test 64x64->128 multiply + Unsigned128 product = + Multiply64to128(0x1111111111111111U, 0x2222222222222222U); + EXPECT_EQ(Lower64of128(product), 2295594818061633090U); + EXPECT_EQ(Upper64of128(product), 163971058432973792U); +} + +TEST(MathTest, Coding128) { + const char *in = "_1234567890123456"; + // Note: in + 1 is likely unaligned + Unsigned128 decoded = DecodeFixed128(in + 1); + EXPECT_EQ(Lower64of128(decoded), 0x3837363534333231U); + EXPECT_EQ(Upper64of128(decoded), 0x3635343332313039U); + char out[18]; + out[0] = '_'; + EncodeFixed128(out + 1, decoded); + out[17] = '\0'; + EXPECT_EQ(std::string(in), std::string(out)); +} + +TEST(MathTest, CodingGeneric) { + const char *in = "_1234567890123456"; + // Decode + // Note: in + 1 is likely unaligned + Unsigned128 decoded128 = DecodeFixedGeneric(in + 1); + EXPECT_EQ(Lower64of128(decoded128), 0x3837363534333231U); + EXPECT_EQ(Upper64of128(decoded128), 0x3635343332313039U); + + uint64_t decoded64 = DecodeFixedGeneric(in + 1); + EXPECT_EQ(decoded64, 0x3837363534333231U); + + uint32_t decoded32 = DecodeFixedGeneric(in + 1); + EXPECT_EQ(decoded32, 0x34333231U); + + uint16_t decoded16 = DecodeFixedGeneric(in + 1); + EXPECT_EQ(decoded16, 0x3231U); + + // Encode + char out[18]; + out[0] = '_'; + memset(out + 1, '\0', 17); + EncodeFixedGeneric(out + 1, decoded128); + EXPECT_EQ(std::string(in), std::string(out)); + + memset(out + 1, '\0', 9); + EncodeFixedGeneric(out + 1, decoded64); + EXPECT_EQ(std::string("_12345678"), std::string(out)); + + memset(out + 1, '\0', 5); + EncodeFixedGeneric(out + 1, decoded32); + EXPECT_EQ(std::string("_1234"), std::string(out)); + + memset(out + 1, '\0', 3); + EncodeFixedGeneric(out + 1, decoded16); + EXPECT_EQ(std::string("_12"), std::string(out)); +} + +int main(int argc, char **argv) { + fprintf(stderr, "NPHash64 id: %x\n", + static_cast(ROCKSDB_NAMESPACE::GetSliceNPHash64("RocksDB"))); + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/heap.h b/src/rocksdb/util/heap.h new file mode 100644 index 000000000..f221fc732 --- /dev/null +++ b/src/rocksdb/util/heap.h @@ -0,0 +1,174 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +#include "port/port.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +// Binary heap implementation optimized for use in multi-way merge sort. +// Comparison to std::priority_queue: +// - In libstdc++, std::priority_queue::pop() usually performs just over logN +// comparisons but never fewer. +// - std::priority_queue does not have a replace-top operation, requiring a +// pop+push. If the replacement element is the new top, this requires +// around 2logN comparisons. +// - This heap's pop() uses a "schoolbook" downheap which requires up to ~2logN +// comparisons. +// - This heap provides a replace_top() operation which requires [1, 2logN] +// comparisons. When the replacement element is also the new top, this +// takes just 1 or 2 comparisons. +// +// The last property can yield an order-of-magnitude performance improvement +// when merge-sorting real-world non-random data. 
If the merge operation is +// likely to take chunks of elements from the same input stream, only 1 +// comparison per element is needed. In RocksDB-land, this happens when +// compacting a database where keys are not randomly distributed across L0 +// files but nearby keys are likely to be in the same L0 file. +// +// The container uses the same counterintuitive ordering as +// std::priority_queue: the comparison operator is expected to provide the +// less-than relation, but top() will return the maximum. + +template > +class BinaryHeap { + public: + BinaryHeap() {} + explicit BinaryHeap(Compare cmp) : cmp_(std::move(cmp)) {} + + void push(const T& value) { + data_.push_back(value); + upheap(data_.size() - 1); + } + + void push(T&& value) { + data_.push_back(std::move(value)); + upheap(data_.size() - 1); + } + + const T& top() const { + assert(!empty()); + return data_.front(); + } + + void replace_top(const T& value) { + assert(!empty()); + data_.front() = value; + downheap(get_root()); + } + + void replace_top(T&& value) { + assert(!empty()); + data_.front() = std::move(value); + downheap(get_root()); + } + + void pop() { + assert(!empty()); + if (data_.size() > 1) { + // Avoid self-move-assign, because it could cause problems with + // classes which are not prepared for this and it trips up the + // STL debugger when activated. + data_.front() = std::move(data_.back()); + } + data_.pop_back(); + if (!empty()) { + downheap(get_root()); + } else { + reset_root_cmp_cache(); + } + } + + void swap(BinaryHeap& other) { + std::swap(cmp_, other.cmp_); + data_.swap(other.data_); + std::swap(root_cmp_cache_, other.root_cmp_cache_); + } + + void clear() { + data_.clear(); + reset_root_cmp_cache(); + } + + bool empty() const { return data_.empty(); } + + size_t size() const { return data_.size(); } + + void reset_root_cmp_cache() { + root_cmp_cache_ = std::numeric_limits::max(); + } + + private: + static inline size_t get_root() { return 0; } + static inline size_t get_parent(size_t index) { return (index - 1) / 2; } + static inline size_t get_left(size_t index) { return 2 * index + 1; } + static inline size_t get_right(size_t index) { return 2 * index + 2; } + + void upheap(size_t index) { + T v = std::move(data_[index]); + while (index > get_root()) { + const size_t parent = get_parent(index); + if (!cmp_(data_[parent], v)) { + break; + } + data_[index] = std::move(data_[parent]); + index = parent; + } + data_[index] = std::move(v); + reset_root_cmp_cache(); + } + + void downheap(size_t index) { + T v = std::move(data_[index]); + + size_t picked_child = std::numeric_limits::max(); + while (1) { + const size_t left_child = get_left(index); + if (get_left(index) >= data_.size()) { + break; + } + const size_t right_child = left_child + 1; + assert(right_child == get_right(index)); + picked_child = left_child; + if (index == 0 && root_cmp_cache_ < data_.size()) { + picked_child = root_cmp_cache_; + } else if (right_child < data_.size() && + cmp_(data_[left_child], data_[right_child])) { + picked_child = right_child; + } + if (!cmp_(v, data_[picked_child])) { + break; + } + data_[index] = std::move(data_[picked_child]); + index = picked_child; + } + + if (index == 0) { + // We did not change anything in the tree except for the value + // of the root node, left and right child did not change, we can + // cache that `picked_child` is the smallest child + // so next time we compare againist it directly + root_cmp_cache_ = picked_child; + } else { + // the tree changed, reset cache + 
reset_root_cmp_cache(); + } + + data_[index] = std::move(v); + } + + Compare cmp_; + autovector data_; + // Used to reduce number of cmp_ calls in downheap() + size_t root_cmp_cache_ = std::numeric_limits::max(); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/heap_test.cc b/src/rocksdb/util/heap_test.cc new file mode 100644 index 000000000..bbb93324f --- /dev/null +++ b/src/rocksdb/util/heap_test.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/heap.h" + +#include + +#include +#include +#include +#include + +#include "port/stack_trace.h" + +#ifndef GFLAGS +const int64_t FLAGS_iters = 100000; +#else +#include "util/gflags_compat.h" +DEFINE_int64(iters, 100000, "number of pseudo-random operations in each test"); +#endif // GFLAGS + +/* + * Compares the custom heap implementation in util/heap.h against + * std::priority_queue on a pseudo-random sequence of operations. + */ + +namespace ROCKSDB_NAMESPACE { + +using HeapTestValue = uint64_t; +using Params = std::tuple; + +class HeapTest : public ::testing::TestWithParam {}; + +TEST_P(HeapTest, Test) { + // This test performs the same pseudorandom sequence of operations on a + // BinaryHeap and an std::priority_queue, comparing output. The three + // possible operations are insert, replace top and pop. + // + // Insert is chosen slightly more often than the others so that the size of + // the heap slowly grows. Once the size heats the MAX_HEAP_SIZE limit, we + // disallow inserting until the heap becomes empty, testing the "draining" + // scenario. 
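// A sketch of the k-way merge pattern that replace_top() in util/heap.h is
// designed for (assuming the usual <element, comparator> template parameters,
// as with std::priority_queue). When the next element from the same input run
// is still the smallest, replace_top() costs only 1-2 comparisons instead of
// a pop() plus push():
#include <cstddef>
#include <utility>
#include <vector>

#include "util/heap.h"

std::vector<int> MergeSortedRunsSketch(
    const std::vector<std::vector<int>>& runs) {
  using Entry = std::pair<int, size_t>;  // (value, index of source run)
  struct GreaterEntry {
    bool operator()(const Entry& a, const Entry& b) const {
      return a.first > b.first;  // inverted so top() yields the minimum
    }
  };
  ROCKSDB_NAMESPACE::BinaryHeap<Entry, GreaterEntry> heap;
  std::vector<size_t> pos(runs.size(), 0);
  for (size_t i = 0; i < runs.size(); ++i) {
    if (!runs[i].empty()) {
      heap.push(Entry(runs[i][0], i));
      pos[i] = 1;
    }
  }
  std::vector<int> out;
  while (!heap.empty()) {
    Entry top = heap.top();
    out.push_back(top.first);
    size_t run = top.second;
    if (pos[run] < runs[run].size()) {
      // Same run still has input: replace in place, often the cheap case.
      heap.replace_top(Entry(runs[run][pos[run]++], run));
    } else {
      heap.pop();
    }
  }
  return out;
}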
+ + const auto MAX_HEAP_SIZE = std::get<0>(GetParam()); + const auto MAX_VALUE = std::get<1>(GetParam()); + const auto RNG_SEED = std::get<2>(GetParam()); + + BinaryHeap heap; + std::priority_queue ref; + + std::mt19937 rng(static_cast(RNG_SEED)); + std::uniform_int_distribution value_dist(0, MAX_VALUE); + int ndrains = 0; + bool draining = false; // hit max size, draining until we empty the heap + size_t size = 0; + for (int64_t i = 0; i < FLAGS_iters; ++i) { + if (size == 0) { + draining = false; + } + + if (!draining && (size == 0 || std::bernoulli_distribution(0.4)(rng))) { + // insert + HeapTestValue val = value_dist(rng); + heap.push(val); + ref.push(val); + ++size; + if (size == MAX_HEAP_SIZE) { + draining = true; + ++ndrains; + } + } else if (std::bernoulli_distribution(0.5)(rng)) { + // replace top + HeapTestValue val = value_dist(rng); + heap.replace_top(val); + ref.pop(); + ref.push(val); + } else { + // pop + assert(size > 0); + heap.pop(); + ref.pop(); + --size; + } + + // After every operation, check that the public methods give the same + // results + assert((size == 0) == ref.empty()); + ASSERT_EQ(size == 0, heap.empty()); + if (size > 0) { + ASSERT_EQ(ref.top(), heap.top()); + } + } + + // Probabilities should be set up to occasionally hit the max heap size and + // drain it + assert(ndrains > 0); + + heap.clear(); + ASSERT_TRUE(heap.empty()); +} + +// Basic test, MAX_VALUE = 3*MAX_HEAP_SIZE (occasional duplicates) +INSTANTIATE_TEST_CASE_P(Basic, HeapTest, + ::testing::Values(Params(1000, 3000, + 0x1b575cf05b708945))); +// Mid-size heap with small values (many duplicates) +INSTANTIATE_TEST_CASE_P(SmallValues, HeapTest, + ::testing::Values(Params(100, 10, 0x5ae213f7bd5dccd0))); +// Small heap, large value range (no duplicates) +INSTANTIATE_TEST_CASE_P(SmallHeap, HeapTest, + ::testing::Values(Params(10, ULLONG_MAX, + 0x3e1fa8f4d01707cf))); +// Two-element heap +INSTANTIATE_TEST_CASE_P(TwoElementHeap, HeapTest, + ::testing::Values(Params(2, 5, 0x4b5e13ea988c6abc))); +// One-element heap +INSTANTIATE_TEST_CASE_P(OneElementHeap, HeapTest, + ::testing::Values(Params(1, 3, 0x176a1019ab0b612e))); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); +#ifdef GFLAGS + GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); +#endif // GFLAGS + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/kv_map.h b/src/rocksdb/util/kv_map.h new file mode 100644 index 000000000..62be6d18e --- /dev/null +++ b/src/rocksdb/util/kv_map.h @@ -0,0 +1,33 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+#pragma once + +#include +#include + +#include "rocksdb/comparator.h" +#include "rocksdb/slice.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { +namespace stl_wrappers { + +struct LessOfComparator { + explicit LessOfComparator(const Comparator* c = BytewiseComparator()) + : cmp(c) {} + + bool operator()(const std::string& a, const std::string& b) const { + return cmp->Compare(Slice(a), Slice(b)) < 0; + } + bool operator()(const Slice& a, const Slice& b) const { + return cmp->Compare(a, b) < 0; + } + + const Comparator* cmp; +}; + +using KVMap = std::map; +} // namespace stl_wrappers +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/log_write_bench.cc b/src/rocksdb/util/log_write_bench.cc new file mode 100644 index 000000000..c1637db15 --- /dev/null +++ b/src/rocksdb/util/log_write_bench.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifndef GFLAGS +#include +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else + +#include "file/writable_file_writer.h" +#include "monitoring/histogram.h" +#include "rocksdb/env.h" +#include "rocksdb/system_clock.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/gflags_compat.h" + +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +using GFLAGS_NAMESPACE::SetUsageMessage; + +// A simple benchmark to simulate transactional logs + +DEFINE_int32(num_records, 6000, "Number of records."); +DEFINE_int32(record_size, 249, "Size of each record."); +DEFINE_int32(record_interval, 10000, "Interval between records (microSec)"); +DEFINE_int32(bytes_per_sync, 0, "bytes_per_sync parameter in EnvOptions"); +DEFINE_bool(enable_sync, false, "sync after each write."); + +namespace ROCKSDB_NAMESPACE { +void RunBenchmark() { + std::string file_name = test::PerThreadDBPath("log_write_benchmark.log"); + DBOptions options; + Env* env = Env::Default(); + const auto& clock = env->GetSystemClock(); + EnvOptions env_options = env->OptimizeForLogWrite(EnvOptions(), options); + env_options.bytes_per_sync = FLAGS_bytes_per_sync; + std::unique_ptr file; + env->NewWritableFile(file_name, &file, env_options); + std::unique_ptr writer; + writer.reset(new WritableFileWriter(std::move(file), file_name, env_options, + clock, nullptr /* stats */, + options.listeners)); + + std::string record; + record.assign(FLAGS_record_size, 'X'); + + HistogramImpl hist; + + uint64_t start_time = clock->NowMicros(); + for (int i = 0; i < FLAGS_num_records; i++) { + uint64_t start_nanos = clock->NowNanos(); + writer->Append(record); + writer->Flush(); + if (FLAGS_enable_sync) { + writer->Sync(false); + } + hist.Add(clock->NowNanos() - start_nanos); + + if (i % 1000 == 1) { + fprintf(stderr, "Wrote %d records...\n", i); + } + + int time_to_sleep = + (i + 1) * FLAGS_record_interval - (clock->NowMicros() - start_time); + if (time_to_sleep > 0) { + clock->SleepForMicroseconds(time_to_sleep); + } + } + + fprintf(stderr, "Distribution of latency of append+flush: \n%s", + hist.ToString().c_str()); +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " [OPTIONS]..."); + ParseCommandLineFlags(&argc, &argv, true); + + ROCKSDB_NAMESPACE::RunBenchmark(); + return 0; +} + +#endif // GFLAGS diff 
--git a/src/rocksdb/util/math.h b/src/rocksdb/util/math.h new file mode 100644 index 000000000..da31b43ec --- /dev/null +++ b/src/rocksdb/util/math.h @@ -0,0 +1,294 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#ifdef _MSC_VER +#include +#endif + +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// Fast implementation of floor(log2(v)). Undefined for 0 or negative +// numbers (in case of signed type). +template +inline int FloorLog2(T v) { + static_assert(std::is_integral::value, "non-integral type"); + assert(v > 0); +#ifdef _MSC_VER + static_assert(sizeof(T) <= sizeof(uint64_t), "type too big"); + unsigned long idx = 0; + if (sizeof(T) <= sizeof(uint32_t)) { + _BitScanReverse(&idx, static_cast(v)); + } else { +#if defined(_M_X64) || defined(_M_ARM64) + _BitScanReverse64(&idx, static_cast(v)); +#else + const auto vh = static_cast(static_cast(v) >> 32); + if (vh != 0) { + _BitScanReverse(&idx, static_cast(vh)); + idx += 32; + } else { + _BitScanReverse(&idx, static_cast(v)); + } +#endif + } + return idx; +#else + static_assert(sizeof(T) <= sizeof(unsigned long long), "type too big"); + if (sizeof(T) <= sizeof(unsigned int)) { + int lz = __builtin_clz(static_cast(v)); + return int{sizeof(unsigned int)} * 8 - 1 - lz; + } else if (sizeof(T) <= sizeof(unsigned long)) { + int lz = __builtin_clzl(static_cast(v)); + return int{sizeof(unsigned long)} * 8 - 1 - lz; + } else { + int lz = __builtin_clzll(static_cast(v)); + return int{sizeof(unsigned long long)} * 8 - 1 - lz; + } +#endif +} + +// Constexpr version of FloorLog2 +template +constexpr int ConstexprFloorLog2(T v) { + int rv = 0; + while (v > T{1}) { + ++rv; + v >>= 1; + } + return rv; +} + +// Number of low-order zero bits before the first 1 bit. Undefined for 0. +template +inline int CountTrailingZeroBits(T v) { + static_assert(std::is_integral::value, "non-integral type"); + assert(v != 0); +#ifdef _MSC_VER + static_assert(sizeof(T) <= sizeof(uint64_t), "type too big"); + unsigned long tz = 0; + if (sizeof(T) <= sizeof(uint32_t)) { + _BitScanForward(&tz, static_cast(v)); + } else { +#if defined(_M_X64) || defined(_M_ARM64) + _BitScanForward64(&tz, static_cast(v)); +#else + _BitScanForward(&tz, static_cast(v)); + if (tz == 0) { + _BitScanForward(&tz, + static_cast(static_cast(v) >> 32)); + tz += 32; + } +#endif + } + return static_cast(tz); +#else + static_assert(sizeof(T) <= sizeof(unsigned long long), "type too big"); + if (sizeof(T) <= sizeof(unsigned int)) { + return __builtin_ctz(static_cast(v)); + } else if (sizeof(T) <= sizeof(unsigned long)) { + return __builtin_ctzl(static_cast(v)); + } else { + return __builtin_ctzll(static_cast(v)); + } +#endif +} + +// Not all MSVC compile settings will use `BitsSetToOneFallback()`. We include +// the following code at coarse granularity for simpler macros. It's important +// to exclude at least so our non-MSVC unit test coverage tool doesn't see it. +#ifdef _MSC_VER + +namespace detail { + +template +int BitsSetToOneFallback(T v) { + const int kBits = static_cast(sizeof(T)) * 8; + static_assert((kBits & (kBits - 1)) == 0, "must be power of two bits"); + // we static_cast these bit patterns in order to truncate them to the correct + // size. 
Warning C4309 dislikes this technique, so disable it here. +#pragma warning(disable : 4309) + v = static_cast(v - ((v >> 1) & static_cast(0x5555555555555555ull))); + v = static_cast((v & static_cast(0x3333333333333333ull)) + + ((v >> 2) & static_cast(0x3333333333333333ull))); + v = static_cast((v + (v >> 4)) & static_cast(0x0F0F0F0F0F0F0F0Full)); +#pragma warning(default : 4309) + for (int shift_bits = 8; shift_bits < kBits; shift_bits <<= 1) { + v += static_cast(v >> shift_bits); + } + // we want the bottom "slot" that's big enough to represent a value up to + // (and including) kBits. + return static_cast(v & static_cast(kBits | (kBits - 1))); +} + +} // namespace detail + +#endif // _MSC_VER + +// Number of bits set to 1. Also known as "population count". +template +inline int BitsSetToOne(T v) { + static_assert(std::is_integral::value, "non-integral type"); +#ifdef _MSC_VER + static_assert(sizeof(T) <= sizeof(uint64_t), "type too big"); + if (sizeof(T) < sizeof(uint32_t)) { + // This bit mask is to avoid a compiler warning on unused path + constexpr auto mm = 8 * sizeof(uint32_t) - 1; + // The bit mask is to neutralize sign extension on small signed types + constexpr uint32_t m = (uint32_t{1} << ((8 * sizeof(T)) & mm)) - 1; +#if defined(HAVE_SSE42) && (defined(_M_X64) || defined(_M_IX86)) + return static_cast(__popcnt(static_cast(v) & m)); +#else + return static_cast(detail::BitsSetToOneFallback(v) & m); +#endif + } else if (sizeof(T) == sizeof(uint32_t)) { +#if defined(HAVE_SSE42) && (defined(_M_X64) || defined(_M_IX86)) + return static_cast(__popcnt(static_cast(v))); +#else + return detail::BitsSetToOneFallback(static_cast(v)); +#endif + } else { +#if defined(HAVE_SSE42) && defined(_M_X64) + return static_cast(__popcnt64(static_cast(v))); +#elif defined(HAVE_SSE42) && defined(_M_IX86) + return static_cast( + __popcnt(static_cast(static_cast(v) >> 32) + + __popcnt(static_cast(v)))); +#else + return detail::BitsSetToOneFallback(static_cast(v)); +#endif + } +#else + static_assert(sizeof(T) <= sizeof(unsigned long long), "type too big"); + if (sizeof(T) < sizeof(unsigned int)) { + // This bit mask is to avoid a compiler warning on unused path + constexpr auto mm = 8 * sizeof(unsigned int) - 1; + // This bit mask is to neutralize sign extension on small signed types + constexpr unsigned int m = (1U << ((8 * sizeof(T)) & mm)) - 1; + return __builtin_popcount(static_cast(v) & m); + } else if (sizeof(T) == sizeof(unsigned int)) { + return __builtin_popcount(static_cast(v)); + } else if (sizeof(T) <= sizeof(unsigned long)) { + return __builtin_popcountl(static_cast(v)); + } else { + return __builtin_popcountll(static_cast(v)); + } +#endif +} + +template +inline int BitParity(T v) { + static_assert(std::is_integral::value, "non-integral type"); +#ifdef _MSC_VER + // bit parity == oddness of popcount + return BitsSetToOne(v) & 1; +#else + static_assert(sizeof(T) <= sizeof(unsigned long long), "type too big"); + if (sizeof(T) <= sizeof(unsigned int)) { + // On any sane systen, potential sign extension here won't change parity + return __builtin_parity(static_cast(v)); + } else if (sizeof(T) <= sizeof(unsigned long)) { + return __builtin_parityl(static_cast(v)); + } else { + return __builtin_parityll(static_cast(v)); + } +#endif +} + +// Swaps between big and little endian. Can be used in combination with the +// little-endian encoding/decoding functions in coding_lean.h and coding.h to +// encode/decode big endian. 
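// A minimal sketch of the combination described above (helper names are
// hypothetical): EncodeFixed32 from coding_lean.h writes little-endian, so
// swapping the value first yields a big-endian encoding, and the inverse
// order decodes it.
#include <cstdint>

#include "util/coding_lean.h"
#include "util/math.h"

inline void EncodeBigEndian32Sketch(char* dst, uint32_t v) {
  ROCKSDB_NAMESPACE::EncodeFixed32(dst, ROCKSDB_NAMESPACE::EndianSwapValue(v));
}

inline uint32_t DecodeBigEndian32Sketch(const char* src) {
  return ROCKSDB_NAMESPACE::EndianSwapValue(
      ROCKSDB_NAMESPACE::DecodeFixed32(src));
}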
+template +inline T EndianSwapValue(T v) { + static_assert(std::is_integral::value, "non-integral type"); + +#ifdef _MSC_VER + if (sizeof(T) == 2) { + return static_cast(_byteswap_ushort(static_cast(v))); + } else if (sizeof(T) == 4) { + return static_cast(_byteswap_ulong(static_cast(v))); + } else if (sizeof(T) == 8) { + return static_cast(_byteswap_uint64(static_cast(v))); + } +#else + if (sizeof(T) == 2) { + return static_cast(__builtin_bswap16(static_cast(v))); + } else if (sizeof(T) == 4) { + return static_cast(__builtin_bswap32(static_cast(v))); + } else if (sizeof(T) == 8) { + return static_cast(__builtin_bswap64(static_cast(v))); + } +#endif + // Recognized by clang as bswap, but not by gcc :( + T ret_val = 0; + for (std::size_t i = 0; i < sizeof(T); ++i) { + ret_val |= ((v >> (8 * i)) & 0xff) << (8 * (sizeof(T) - 1 - i)); + } + return ret_val; +} + +// Reverses the order of bits in an integral value +template +inline T ReverseBits(T v) { + T r = EndianSwapValue(v); + const T kHighestByte = T{1} << ((sizeof(T) - 1) * 8); + const T kEveryByte = kHighestByte | (kHighestByte / 255); + + r = ((r & (kEveryByte * 0x0f)) << 4) | ((r >> 4) & (kEveryByte * 0x0f)); + r = ((r & (kEveryByte * 0x33)) << 2) | ((r >> 2) & (kEveryByte * 0x33)); + r = ((r & (kEveryByte * 0x55)) << 1) | ((r >> 1) & (kEveryByte * 0x55)); + + return r; +} + +// Every output bit depends on many input bits in the same and higher +// positions, but not lower positions. Specifically, this function +// * Output highest bit set to 1 is same as input (same FloorLog2, or +// equivalently, same number of leading zeros) +// * Is its own inverse (an involution) +// * Guarantees that b bottom bits of v and c bottom bits of +// DownwardInvolution(v) uniquely identify b + c bottom bits of v +// (which is all of v if v < 2**(b + c)). +// ** A notable special case is that modifying c adjacent bits at +// some chosen position in the input is bijective with the bottom c +// output bits. +// * Distributes over xor, as in DI(a ^ b) == DI(a) ^ DI(b) +// +// This transformation is equivalent to a matrix*vector multiplication in +// GF(2) where the matrix is recursively defined by the pattern matrix +// P = | 1 1 | +// | 0 1 | +// and replacing 1's with P and 0's with 2x2 zero matices to some depth, +// e.g. depth of 6 for 64-bit T. An essential feature of this matrix +// is that all square sub-matrices that include the top row are invertible. +template +inline T DownwardInvolution(T v) { + static_assert(std::is_integral::value, "non-integral type"); + static_assert(sizeof(T) <= 8, "only supported up to 64 bits"); + + uint64_t r = static_cast(v); + if constexpr (sizeof(T) > 4) { + r ^= r >> 32; + } + if constexpr (sizeof(T) > 2) { + r ^= (r & 0xffff0000ffff0000U) >> 16; + } + if constexpr (sizeof(T) > 1) { + r ^= (r & 0xff00ff00ff00ff00U) >> 8; + } + r ^= (r & 0xf0f0f0f0f0f0f0f0U) >> 4; + r ^= (r & 0xccccccccccccccccU) >> 2; + r ^= (r & 0xaaaaaaaaaaaaaaaaU) >> 1; + return static_cast(r); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/math128.h b/src/rocksdb/util/math128.h new file mode 100644 index 000000000..ae490051a --- /dev/null +++ b/src/rocksdb/util/math128.h @@ -0,0 +1,316 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
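// Quick property check for DownwardInvolution() as documented above: it is
// its own inverse, it distributes over xor, and it preserves the position of
// the highest set bit (the input values here are arbitrary):
#include <cassert>
#include <cstdint>

#include "util/math.h"

void DownwardInvolutionSketch() {
  using ROCKSDB_NAMESPACE::DownwardInvolution;
  using ROCKSDB_NAMESPACE::FloorLog2;
  const uint64_t a = 0x9e3779b97f4a7c15U;
  const uint64_t b = 0xc2b2ae3d27d4eb4fU;
  assert(DownwardInvolution(DownwardInvolution(a)) == a);    // involution
  assert(DownwardInvolution(a ^ b) ==
         (DownwardInvolution(a) ^ DownwardInvolution(b)));   // xor-linear
  assert(FloorLog2(DownwardInvolution(a)) == FloorLog2(a));  // top bit kept
}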
+ +#pragma once + +#include "util/coding_lean.h" +#include "util/math.h" + +#ifdef TEST_UINT128_COMPAT +#undef HAVE_UINT128_EXTENSION +#endif + +namespace ROCKSDB_NAMESPACE { + +// Unsigned128 is a 128 bit value supporting (at least) bitwise operators, +// shifts, and comparisons. __uint128_t is not always available. + +#ifdef HAVE_UINT128_EXTENSION +using Unsigned128 = __uint128_t; +#else +struct Unsigned128 { + uint64_t lo; + uint64_t hi; + + inline Unsigned128() { + static_assert(sizeof(Unsigned128) == 2 * sizeof(uint64_t), + "unexpected overhead in representation"); + lo = 0; + hi = 0; + } + + inline Unsigned128(uint64_t lower) { + lo = lower; + hi = 0; + } + + inline Unsigned128(uint64_t lower, uint64_t upper) { + lo = lower; + hi = upper; + } + + explicit operator uint64_t() { return lo; } + + explicit operator uint32_t() { return static_cast(lo); } + + explicit operator uint16_t() { return static_cast(lo); } + + explicit operator uint8_t() { return static_cast(lo); } +}; + +inline Unsigned128 operator<<(const Unsigned128& lhs, unsigned shift) { + shift &= 127; + Unsigned128 rv; + if (shift >= 64) { + rv.lo = 0; + rv.hi = lhs.lo << (shift & 63); + } else { + uint64_t tmp = lhs.lo; + rv.lo = tmp << shift; + // Ensure shift==0 shifts away everything. (This avoids another + // conditional branch on shift == 0.) + tmp = tmp >> 1 >> (63 - shift); + rv.hi = tmp | (lhs.hi << shift); + } + return rv; +} + +inline Unsigned128& operator<<=(Unsigned128& lhs, unsigned shift) { + lhs = lhs << shift; + return lhs; +} + +inline Unsigned128 operator>>(const Unsigned128& lhs, unsigned shift) { + shift &= 127; + Unsigned128 rv; + if (shift >= 64) { + rv.hi = 0; + rv.lo = lhs.hi >> (shift & 63); + } else { + uint64_t tmp = lhs.hi; + rv.hi = tmp >> shift; + // Ensure shift==0 shifts away everything + tmp = tmp << 1 << (63 - shift); + rv.lo = tmp | (lhs.lo >> shift); + } + return rv; +} + +inline Unsigned128& operator>>=(Unsigned128& lhs, unsigned shift) { + lhs = lhs >> shift; + return lhs; +} + +inline Unsigned128 operator&(const Unsigned128& lhs, const Unsigned128& rhs) { + return Unsigned128(lhs.lo & rhs.lo, lhs.hi & rhs.hi); +} + +inline Unsigned128& operator&=(Unsigned128& lhs, const Unsigned128& rhs) { + lhs = lhs & rhs; + return lhs; +} + +inline Unsigned128 operator|(const Unsigned128& lhs, const Unsigned128& rhs) { + return Unsigned128(lhs.lo | rhs.lo, lhs.hi | rhs.hi); +} + +inline Unsigned128& operator|=(Unsigned128& lhs, const Unsigned128& rhs) { + lhs = lhs | rhs; + return lhs; +} + +inline Unsigned128 operator^(const Unsigned128& lhs, const Unsigned128& rhs) { + return Unsigned128(lhs.lo ^ rhs.lo, lhs.hi ^ rhs.hi); +} + +inline Unsigned128& operator^=(Unsigned128& lhs, const Unsigned128& rhs) { + lhs = lhs ^ rhs; + return lhs; +} + +inline Unsigned128 operator~(const Unsigned128& v) { + return Unsigned128(~v.lo, ~v.hi); +} + +inline bool operator==(const Unsigned128& lhs, const Unsigned128& rhs) { + return lhs.lo == rhs.lo && lhs.hi == rhs.hi; +} + +inline bool operator!=(const Unsigned128& lhs, const Unsigned128& rhs) { + return lhs.lo != rhs.lo || lhs.hi != rhs.hi; +} + +inline bool operator>(const Unsigned128& lhs, const Unsigned128& rhs) { + return lhs.hi > rhs.hi || (lhs.hi == rhs.hi && lhs.lo > rhs.lo); +} + +inline bool operator<(const Unsigned128& lhs, const Unsigned128& rhs) { + return lhs.hi < rhs.hi || (lhs.hi == rhs.hi && lhs.lo < rhs.lo); +} + +inline bool operator>=(const Unsigned128& lhs, const Unsigned128& rhs) { + return lhs.hi > rhs.hi || (lhs.hi == rhs.hi && lhs.lo >= 
rhs.lo); +} + +inline bool operator<=(const Unsigned128& lhs, const Unsigned128& rhs) { + return lhs.hi < rhs.hi || (lhs.hi == rhs.hi && lhs.lo <= rhs.lo); +} +#endif + +inline uint64_t Lower64of128(Unsigned128 v) { +#ifdef HAVE_UINT128_EXTENSION + return static_cast(v); +#else + return v.lo; +#endif +} + +inline uint64_t Upper64of128(Unsigned128 v) { +#ifdef HAVE_UINT128_EXTENSION + return static_cast(v >> 64); +#else + return v.hi; +#endif +} + +// This generally compiles down to a single fast instruction on 64-bit. +// This doesn't really make sense as operator* because it's not a +// general 128x128 multiply and provides more output than 64x64 multiply. +inline Unsigned128 Multiply64to128(uint64_t a, uint64_t b) { +#ifdef HAVE_UINT128_EXTENSION + return Unsigned128{a} * Unsigned128{b}; +#else + // Full decomposition + // NOTE: GCC seems to fully understand this code as 64-bit x 64-bit + // -> 128-bit multiplication and optimize it appropriately. + uint64_t tmp = uint64_t{b & 0xffffFFFF} * uint64_t{a & 0xffffFFFF}; + uint64_t lower = tmp & 0xffffFFFF; + tmp >>= 32; + tmp += uint64_t{b & 0xffffFFFF} * uint64_t{a >> 32}; + // Avoid overflow: first add lower 32 of tmp2, and later upper 32 + uint64_t tmp2 = uint64_t{b >> 32} * uint64_t{a & 0xffffFFFF}; + tmp += tmp2 & 0xffffFFFF; + lower |= tmp << 32; + tmp >>= 32; + tmp += tmp2 >> 32; + tmp += uint64_t{b >> 32} * uint64_t{a >> 32}; + return Unsigned128(lower, tmp); +#endif +} + +template <> +inline int FloorLog2(Unsigned128 v) { + if (Upper64of128(v) == 0) { + return FloorLog2(Lower64of128(v)); + } else { + return FloorLog2(Upper64of128(v)) + 64; + } +} + +template <> +inline int CountTrailingZeroBits(Unsigned128 v) { + if (Lower64of128(v) != 0) { + return CountTrailingZeroBits(Lower64of128(v)); + } else { + return CountTrailingZeroBits(Upper64of128(v)) + 64; + } +} + +template <> +inline int BitsSetToOne(Unsigned128 v) { + return BitsSetToOne(Lower64of128(v)) + BitsSetToOne(Upper64of128(v)); +} + +template <> +inline int BitParity(Unsigned128 v) { + return BitParity(Lower64of128(v) ^ Upper64of128(v)); +} + +template <> +inline Unsigned128 EndianSwapValue(Unsigned128 v) { + return (Unsigned128{EndianSwapValue(Lower64of128(v))} << 64) | + EndianSwapValue(Upper64of128(v)); +} + +template <> +inline Unsigned128 ReverseBits(Unsigned128 v) { + return (Unsigned128{ReverseBits(Lower64of128(v))} << 64) | + ReverseBits(Upper64of128(v)); +} + +template <> +inline Unsigned128 DownwardInvolution(Unsigned128 v) { + return (Unsigned128{DownwardInvolution(Upper64of128(v))} << 64) | + DownwardInvolution(Upper64of128(v) ^ Lower64of128(v)); +} + +template +struct IsUnsignedUpTo128 + : std::integral_constant::value || + std::is_same::value> {}; + +inline void EncodeFixed128(char* dst, Unsigned128 value) { + EncodeFixed64(dst, Lower64of128(value)); + EncodeFixed64(dst + 8, Upper64of128(value)); +} + +inline Unsigned128 DecodeFixed128(const char* ptr) { + Unsigned128 rv = DecodeFixed64(ptr + 8); + return (rv << 64) | DecodeFixed64(ptr); +} + +// A version of EncodeFixed* for generic algorithms. Likely to be used +// with Unsigned128, so lives here for now. 
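A brief caller-side sketch of the 128-bit helpers defined above (illustrative only; it assumes a build inside the RocksDB tree, where util/math128.h works with or without the __uint128_t extension):

#include <cassert>
#include <cstdint>

#include "util/math128.h"

int main() {
  using namespace ROCKSDB_NAMESPACE;

  // Full 64x64 -> 128-bit product; the upper half is what a plain uint64_t
  // multiplication would discard.
  Unsigned128 p = Multiply64to128(0xffffffffffffffffU, 0xffffffffffffffffU);
  assert(Upper64of128(p) == 0xfffffffffffffffeU);  // (2^64-1)^2 = 2^128 - 2^65 + 1
  assert(Lower64of128(p) == 1);

  // Round trip through the fixed-width little-endian encoding.
  char buf[16];
  EncodeFixed128(buf, p);
  assert(DecodeFixed128(buf) == p);
  return 0;
}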
+template +inline void EncodeFixedGeneric(char* /*dst*/, T /*value*/) { + // Unfortunately, GCC does not appear to optimize this simple code down + // to a trivial load on Intel: + // + // T ret_val = 0; + // for (size_t i = 0; i < sizeof(T); ++i) { + // ret_val |= (static_cast(static_cast(ptr[i])) << (8 * + // i)); + // } + // return ret_val; + // + // But does unroll the loop, and does optimize manually unrolled version + // for specific sizes down to a trivial load. I have no idea why it doesn't + // do both on this code. + + // So instead, we rely on specializations + static_assert(sizeof(T) == 0, "No specialization provided for this type"); +} + +template <> +inline void EncodeFixedGeneric(char* dst, uint16_t value) { + return EncodeFixed16(dst, value); +} +template <> +inline void EncodeFixedGeneric(char* dst, uint32_t value) { + return EncodeFixed32(dst, value); +} +template <> +inline void EncodeFixedGeneric(char* dst, uint64_t value) { + return EncodeFixed64(dst, value); +} +template <> +inline void EncodeFixedGeneric(char* dst, Unsigned128 value) { + return EncodeFixed128(dst, value); +} + +// A version of EncodeFixed* for generic algorithms. +template +inline T DecodeFixedGeneric(const char* /*dst*/) { + static_assert(sizeof(T) == 0, "No specialization provided for this type"); +} + +template <> +inline uint16_t DecodeFixedGeneric(const char* dst) { + return DecodeFixed16(dst); +} +template <> +inline uint32_t DecodeFixedGeneric(const char* dst) { + return DecodeFixed32(dst); +} +template <> +inline uint64_t DecodeFixedGeneric(const char* dst) { + return DecodeFixed64(dst); +} +template <> +inline Unsigned128 DecodeFixedGeneric(const char* dst) { + return DecodeFixed128(dst); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/murmurhash.cc b/src/rocksdb/util/murmurhash.cc new file mode 100644 index 000000000..a69f3918a --- /dev/null +++ b/src/rocksdb/util/murmurhash.cc @@ -0,0 +1,196 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +/* + Murmurhash from http://sites.google.com/site/murmurhash/ + + All code is released to the public domain. For business purposes, Murmurhash + is under the MIT license. +*/ +#include "murmurhash.h" + +#include "port/lang.h" + +#if defined(__x86_64__) + +// ------------------------------------------------------------------- +// +// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment +// and endian-ness issues if used across multiple platforms. 
+// +// 64-bit hash for 64-bit platforms + +#ifdef ROCKSDB_UBSAN_RUN +#if defined(__clang__) +__attribute__((__no_sanitize__("alignment"))) +#elif defined(__GNUC__) +__attribute__((__no_sanitize_undefined__)) +#endif +#endif +// clang-format off +uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed ) +{ + const uint64_t m = 0xc6a4a7935bd1e995; + const int r = 47; + + uint64_t h = seed ^ (len * m); + + const uint64_t * data = (const uint64_t *)key; + const uint64_t * end = data + (len/8); + + while(data != end) + { + uint64_t k = *data++; + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const unsigned char * data2 = (const unsigned char*)data; + + switch(len & 7) + { + case 7: h ^= ((uint64_t)data2[6]) << 48; FALLTHROUGH_INTENDED; + case 6: h ^= ((uint64_t)data2[5]) << 40; FALLTHROUGH_INTENDED; + case 5: h ^= ((uint64_t)data2[4]) << 32; FALLTHROUGH_INTENDED; + case 4: h ^= ((uint64_t)data2[3]) << 24; FALLTHROUGH_INTENDED; + case 3: h ^= ((uint64_t)data2[2]) << 16; FALLTHROUGH_INTENDED; + case 2: h ^= ((uint64_t)data2[1]) << 8; FALLTHROUGH_INTENDED; + case 1: h ^= ((uint64_t)data2[0]); + h *= m; + }; + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} +// clang-format on + +#elif defined(__i386__) + +// ------------------------------------------------------------------- +// +// Note - This code makes a few assumptions about how your machine behaves - +// +// 1. We can read a 4-byte value from any address without crashing +// 2. sizeof(int) == 4 +// +// And it has a few limitations - +// +// 1. It will not work incrementally. +// 2. It will not produce the same results on little-endian and big-endian +// machines. +// clang-format off +unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) +{ + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + + const unsigned int m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + + unsigned int h = seed ^ len; + + // Mix 4 bytes at a time into the hash + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + unsigned int k = *(unsigned int *)data; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + + switch(len) + { + case 3: h ^= data[2] << 16; FALLTHROUGH_INTENDED; + case 2: h ^= data[1] << 8; FALLTHROUGH_INTENDED; + case 1: h ^= data[0]; + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} +// clang-format on + +#else + +// ------------------------------------------------------------------- +// +// Same as MurmurHash2, but endian- and alignment-neutral. +// Half the speed though, alas. 
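As a usage sketch for the 64-bit variant above (it assumes an x86-64 build, where util/murmurhash.h declares MurmurHash64A and aliases MURMUR_HASH/MurmurHash to it):

#include <cstdint>
#include <string>

#include "util/murmurhash.h"

uint64_t HashKey(const std::string& key) {
  // Same calling convention that ROCKSDB_NAMESPACE::murmur_hash uses for a
  // Slice: (data pointer, length as int, seed).
  return MurmurHash64A(key.data(), static_cast<int>(key.size()), /*seed=*/0);
}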
+// clang-format off +unsigned int MurmurHashNeutral2 ( const void * key, int len, unsigned int seed ) +{ + const unsigned int m = 0x5bd1e995; + const int r = 24; + + unsigned int h = seed ^ len; + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + unsigned int k; + + k = data[0]; + k |= data[1] << 8; + k |= data[2] << 16; + k |= data[3] << 24; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + switch(len) + { + case 3: h ^= data[2] << 16; FALLTHROUGH_INTENDED; + case 2: h ^= data[1] << 8; FALLTHROUGH_INTENDED; + case 1: h ^= data[0]; + h *= m; + }; + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} +// clang-format on + +#endif diff --git a/src/rocksdb/util/murmurhash.h b/src/rocksdb/util/murmurhash.h new file mode 100644 index 000000000..7ef4cbbec --- /dev/null +++ b/src/rocksdb/util/murmurhash.h @@ -0,0 +1,43 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +/* + Murmurhash from http://sites.google.com/site/murmurhash/ + + All code is released to the public domain. For business purposes, Murmurhash + is under the MIT license. +*/ +#pragma once +#include + +#include "rocksdb/slice.h" + +#if defined(__x86_64__) +#define MURMUR_HASH MurmurHash64A +uint64_t MurmurHash64A(const void* key, int len, unsigned int seed); +#define MurmurHash MurmurHash64A +using murmur_t = uint64_t; + +#elif defined(__i386__) +#define MURMUR_HASH MurmurHash2 +unsigned int MurmurHash2(const void* key, int len, unsigned int seed); +#define MurmurHash MurmurHash2 +using murmur_t = unsigned int; + +#else +#define MURMUR_HASH MurmurHashNeutral2 +unsigned int MurmurHashNeutral2(const void* key, int len, unsigned int seed); +#define MurmurHash MurmurHashNeutral2 +using murmur_t = unsigned int; +#endif + +// Allow slice to be hashable by murmur hash. +namespace ROCKSDB_NAMESPACE { +struct murmur_hash { + size_t operator()(const Slice& slice) const { + return MurmurHash(slice.data(), static_cast(slice.size()), 0); + } +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/mutexlock.h b/src/rocksdb/util/mutexlock.h new file mode 100644 index 000000000..94066b29e --- /dev/null +++ b/src/rocksdb/util/mutexlock.h @@ -0,0 +1,180 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include + +#include +#include +#include + +#include "port/port.h" + +namespace ROCKSDB_NAMESPACE { + +// Helper class that locks a mutex on construction and unlocks the mutex when +// the destructor of the MutexLock object is invoked. +// +// Typical usage: +// +// void MyClass::MyMethod() { +// MutexLock l(&mu_); // mu_ is an instance variable +// ... some complex code, possibly with multiple return paths ... 
+// } + +class MutexLock { + public: + explicit MutexLock(port::Mutex *mu) : mu_(mu) { this->mu_->Lock(); } + // No copying allowed + MutexLock(const MutexLock &) = delete; + void operator=(const MutexLock &) = delete; + + ~MutexLock() { this->mu_->Unlock(); } + + private: + port::Mutex *const mu_; +}; + +// +// Acquire a ReadLock on the specified RWMutex. +// The Lock will be automatically released when the +// object goes out of scope. +// +class ReadLock { + public: + explicit ReadLock(port::RWMutex *mu) : mu_(mu) { this->mu_->ReadLock(); } + // No copying allowed + ReadLock(const ReadLock &) = delete; + void operator=(const ReadLock &) = delete; + + ~ReadLock() { this->mu_->ReadUnlock(); } + + private: + port::RWMutex *const mu_; +}; + +// +// Automatically unlock a locked mutex when the object is destroyed +// +class ReadUnlock { + public: + explicit ReadUnlock(port::RWMutex *mu) : mu_(mu) { mu->AssertHeld(); } + // No copying allowed + ReadUnlock(const ReadUnlock &) = delete; + ReadUnlock &operator=(const ReadUnlock &) = delete; + + ~ReadUnlock() { mu_->ReadUnlock(); } + + private: + port::RWMutex *const mu_; +}; + +// +// Acquire a WriteLock on the specified RWMutex. +// The Lock will be automatically released then the +// object goes out of scope. +// +class WriteLock { + public: + explicit WriteLock(port::RWMutex *mu) : mu_(mu) { this->mu_->WriteLock(); } + // No copying allowed + WriteLock(const WriteLock &) = delete; + void operator=(const WriteLock &) = delete; + + ~WriteLock() { this->mu_->WriteUnlock(); } + + private: + port::RWMutex *const mu_; +}; + +// +// SpinMutex has very low overhead for low-contention cases. Method names +// are chosen so you can use std::unique_lock or std::lock_guard with it. +// +class SpinMutex { + public: + SpinMutex() : locked_(false) {} + + bool try_lock() { + auto currently_locked = locked_.load(std::memory_order_relaxed); + return !currently_locked && + locked_.compare_exchange_weak(currently_locked, true, + std::memory_order_acquire, + std::memory_order_relaxed); + } + + void lock() { + for (size_t tries = 0;; ++tries) { + if (try_lock()) { + // success + break; + } + port::AsmVolatilePause(); + if (tries > 100) { + std::this_thread::yield(); + } + } + } + + void unlock() { locked_.store(false, std::memory_order_release); } + + private: + std::atomic locked_; +}; + +// We want to prevent false sharing +template +struct ALIGN_AS(CACHE_LINE_SIZE) LockData { + T lock_; +}; + +// +// Inspired by Guava: https://github.com/google/guava/wiki/StripedExplained +// A striped Lock. This offers the underlying lock striping similar +// to that of ConcurrentHashMap in a reusable form, and extends it for +// semaphores and read-write locks. Conceptually, lock striping is the technique +// of dividing a lock into many stripes, increasing the granularity of a +// single lock and allowing independent operations to lock different stripes and +// proceed concurrently, instead of creating contention for a single lock. 
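A minimal usage sketch for the helpers above (Counter and its members are hypothetical; the point is that SpinMutex's lock()/try_lock()/unlock() names let it work with the standard RAII wrappers):

#include <cstdint>
#include <mutex>  // std::lock_guard

#include "util/mutexlock.h"

class Counter {
 public:
  void Add(uint64_t n) {
    // std::lock_guard works because SpinMutex follows the standard Lockable
    // naming; port::Mutex users would use MutexLock instead.
    std::lock_guard<ROCKSDB_NAMESPACE::SpinMutex> guard(mu_);
    total_ += n;
  }

  uint64_t Total() {
    std::lock_guard<ROCKSDB_NAMESPACE::SpinMutex> guard(mu_);
    return total_;
  }

 private:
  ROCKSDB_NAMESPACE::SpinMutex mu_;
  uint64_t total_ = 0;
};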
+// +template +class Striped { + public: + Striped(size_t stripes, std::function hash) + : stripes_(stripes), hash_(hash) { + locks_ = reinterpret_cast *>( + port::cacheline_aligned_alloc(sizeof(LockData) * stripes)); + for (size_t i = 0; i < stripes; i++) { + new (&locks_[i]) LockData(); + } + } + + virtual ~Striped() { + if (locks_ != nullptr) { + assert(stripes_ > 0); + for (size_t i = 0; i < stripes_; i++) { + locks_[i].~LockData(); + } + port::cacheline_aligned_free(locks_); + } + } + + T *get(const P &key) { + uint64_t h = hash_(key); + size_t index = h % stripes_; + return &reinterpret_cast *>(&locks_[index])->lock_; + } + + private: + size_t stripes_; + LockData *locks_; + std::function hash_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/ppc-opcode.h b/src/rocksdb/util/ppc-opcode.h new file mode 100644 index 000000000..5cc5af0e3 --- /dev/null +++ b/src/rocksdb/util/ppc-opcode.h @@ -0,0 +1,27 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// Copyright (c) 2017 International Business Machines Corp. +// All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#define __PPC_RA(a) (((a)&0x1f) << 16) +#define __PPC_RB(b) (((b)&0x1f) << 11) +#define __PPC_XA(a) ((((a)&0x1f) << 16) | (((a)&0x20) >> 3)) +#define __PPC_XB(b) ((((b)&0x1f) << 11) | (((b)&0x20) >> 4)) +#define __PPC_XS(s) ((((s)&0x1f) << 21) | (((s)&0x20) >> 5)) +#define __PPC_XT(s) __PPC_XS(s) +#define VSX_XX3(t, a, b) (__PPC_XT(t) | __PPC_XA(a) | __PPC_XB(b)) +#define VSX_XX1(s, a, b) (__PPC_XS(s) | __PPC_RA(a) | __PPC_RB(b)) + +#define PPC_INST_VPMSUMW 0x10000488 +#define PPC_INST_VPMSUMD 0x100004c8 +#define PPC_INST_MFVSRD 0x7c000066 +#define PPC_INST_MTVSRD 0x7c000166 + +#define VPMSUMW(t, a, b) .long PPC_INST_VPMSUMW | VSX_XX3((t), a, b) +#define VPMSUMD(t, a, b) .long PPC_INST_VPMSUMD | VSX_XX3((t), a, b) +#define MFVRD(a, t) .long PPC_INST_MFVSRD | VSX_XX1((t) + 32, a, 0) +#define MTVRD(t, a) .long PPC_INST_MTVSRD | VSX_XX1((t) + 32, a, 0) diff --git a/src/rocksdb/util/random.cc b/src/rocksdb/util/random.cc new file mode 100644 index 000000000..c94c28dfb --- /dev/null +++ b/src/rocksdb/util/random.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#include "util/random.h" + +#include +#include + +#include +#include + +#include "port/likely.h" +#include "util/thread_local.h" + +#define STORAGE_DECL static thread_local + +namespace ROCKSDB_NAMESPACE { + +Random* Random::GetTLSInstance() { + STORAGE_DECL Random* tls_instance; + STORAGE_DECL std::aligned_storage::type tls_instance_bytes; + + auto rv = tls_instance; + if (UNLIKELY(rv == nullptr)) { + size_t seed = std::hash()(std::this_thread::get_id()); + rv = new (&tls_instance_bytes) Random((uint32_t)seed); + tls_instance = rv; + } + return rv; +} + +std::string Random::HumanReadableString(int len) { + std::string ret; + ret.resize(len); + for (int i = 0; i < len; ++i) { + ret[i] = static_cast('a' + Uniform(26)); + } + return ret; +} + +std::string Random::RandomString(int len) { + std::string ret; + ret.resize(len); + for (int i = 0; i < len; i++) { + ret[i] = static_cast(' ' + Uniform(95)); // ' ' .. 
'~' + } + return ret; +} + +std::string Random::RandomBinaryString(int len) { + std::string ret; + ret.resize(len); + for (int i = 0; i < len; i++) { + ret[i] = static_cast(Uniform(CHAR_MAX)); + } + return ret; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/random.h b/src/rocksdb/util/random.h new file mode 100644 index 000000000..8923bdc4f --- /dev/null +++ b/src/rocksdb/util/random.h @@ -0,0 +1,190 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include + +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// A very simple random number generator. Not especially good at +// generating truly random bits, but good enough for our needs in this +// package. +class Random { + private: + enum : uint32_t { + M = 2147483647L // 2^31-1 + }; + enum : uint64_t { + A = 16807 // bits 14, 8, 7, 5, 2, 1, 0 + }; + + uint32_t seed_; + + static uint32_t GoodSeed(uint32_t s) { return (s & M) != 0 ? (s & M) : 1; } + + public: + // This is the largest value that can be returned from Next() + enum : uint32_t { kMaxNext = M }; + + explicit Random(uint32_t s) : seed_(GoodSeed(s)) {} + + void Reset(uint32_t s) { seed_ = GoodSeed(s); } + + uint32_t Next() { + // We are computing + // seed_ = (seed_ * A) % M, where M = 2^31-1 + // + // seed_ must not be zero or M, or else all subsequent computed values + // will be zero or M respectively. For all other values, seed_ will end + // up cycling through every number in [1,M-1] + uint64_t product = seed_ * A; + + // Compute (product % M) using the fact that ((x << 31) % M) == x. + seed_ = static_cast((product >> 31) + (product & M)); + // The first reduction may overflow by 1 bit, so we may need to + // repeat. mod == M is not possible; using > allows the faster + // sign-bit-based test. + if (seed_ > M) { + seed_ -= M; + } + return seed_; + } + + uint64_t Next64() { return (uint64_t{Next()} << 32) | Next(); } + + // Returns a uniformly distributed value in the range [0..n-1] + // REQUIRES: n > 0 + uint32_t Uniform(int n) { return Next() % n; } + + // Randomly returns true ~"1/n" of the time, and false otherwise. + // REQUIRES: n > 0 + bool OneIn(int n) { return Uniform(n) == 0; } + + // "Optional" one-in-n, where 0 or negative always returns false + // (may or may not consume a random value) + bool OneInOpt(int n) { return n > 0 && OneIn(n); } + + // Returns random bool that is true for the given percentage of + // calls on average. Zero or less is always false and 100 or more + // is always true (may or may not consume a random value) + bool PercentTrue(int percentage) { + return static_cast(Uniform(100)) < percentage; + } + + // Skewed: pick "base" uniformly from range [0,max_log] and then + // return "base" random bits. The effect is to pick a number in the + // range [0,2^max_log-1] with exponential bias towards smaller numbers. 
+ uint32_t Skewed(int max_log) { return Uniform(1 << Uniform(max_log + 1)); } + + // Returns a random string of length "len" + std::string RandomString(int len); + + // Generates a random string of len bytes using human-readable characters + std::string HumanReadableString(int len); + + // Generates a random binary data + std::string RandomBinaryString(int len); + + // Returns a Random instance for use by the current thread without + // additional locking + static Random* GetTLSInstance(); +}; + +// A good 32-bit random number generator based on std::mt19937. +// This exists in part to avoid compiler variance in warning about coercing +// uint_fast32_t from mt19937 to uint32_t. +class Random32 { + private: + std::mt19937 generator_; + + public: + explicit Random32(uint32_t s) : generator_(s) {} + + // Generates the next random number + uint32_t Next() { return static_cast(generator_()); } + + // Returns a uniformly distributed value in the range [0..n-1] + // REQUIRES: n > 0 + uint32_t Uniform(uint32_t n) { + return static_cast( + std::uniform_int_distribution( + 0, n - 1)(generator_)); + } + + // Returns an *almost* uniformly distributed value in the range [0..n-1]. + // Much faster than Uniform(). + // REQUIRES: n > 0 + uint32_t Uniformish(uint32_t n) { + // fastrange (without the header) + return static_cast((uint64_t(generator_()) * uint64_t(n)) >> 32); + } + + // Randomly returns true ~"1/n" of the time, and false otherwise. + // REQUIRES: n > 0 + bool OneIn(uint32_t n) { return Uniform(n) == 0; } + + // Skewed: pick "base" uniformly from range [0,max_log] and then + // return "base" random bits. The effect is to pick a number in the + // range [0,2^max_log-1] with exponential bias towards smaller numbers. + uint32_t Skewed(int max_log) { + return Uniform(uint32_t{1} << Uniform(max_log + 1)); + } + + // Reset the seed of the generator to the given value + void Seed(uint32_t new_seed) { generator_.seed(new_seed); } +}; + +// A good 64-bit random number generator based on std::mt19937_64 +class Random64 { + private: + std::mt19937_64 generator_; + + public: + explicit Random64(uint64_t s) : generator_(s) {} + + // Generates the next random number + uint64_t Next() { return generator_(); } + + // Returns a uniformly distributed value in the range [0..n-1] + // REQUIRES: n > 0 + uint64_t Uniform(uint64_t n) { + return std::uniform_int_distribution(0, n - 1)(generator_); + } + + // Randomly returns true ~"1/n" of the time, and false otherwise. + // REQUIRES: n > 0 + bool OneIn(uint64_t n) { return Uniform(n) == 0; } + + // Skewed: pick "base" uniformly from range [0,max_log] and then + // return "base" random bits. The effect is to pick a number in the + // range [0,2^max_log-1] with exponential bias towards smaller numbers. + uint64_t Skewed(int max_log) { + return Uniform(uint64_t(1) << Uniform(max_log + 1)); + } +}; + +// A seeded replacement for removed std::random_shuffle +template +void RandomShuffle(RandomIt first, RandomIt last, uint32_t seed) { + std::mt19937 rng(seed); + std::shuffle(first, last, rng); +} + +// A replacement for removed std::random_shuffle +template +void RandomShuffle(RandomIt first, RandomIt last) { + RandomShuffle(first, last, std::random_device{}()); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/random_test.cc b/src/rocksdb/util/random_test.cc new file mode 100644 index 000000000..1aa62c5da --- /dev/null +++ b/src/rocksdb/util/random_test.cc @@ -0,0 +1,107 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
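A short caller-side sketch of the generators above (seeds and ranges are arbitrary):

#include <cstdint>
#include <string>

#include "util/random.h"

void Example() {
  ROCKSDB_NAMESPACE::Random rnd(301);
  uint32_t die = rnd.Uniform(6);               // uniform in [0..5]
  bool rare = rnd.OneIn(1000);                 // true ~0.1% of the time
  uint32_t skewed = rnd.Skewed(10);            // < 2^10, biased toward small values
  std::string name = rnd.HumanReadableString(8);

  // Random64 covers full 64-bit ranges.
  ROCKSDB_NAMESPACE::Random64 rnd64(42);
  uint64_t big = rnd64.Uniform(uint64_t{1} << 40);

  (void)die; (void)rare; (void)skewed; (void)name; (void)big;
}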
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/random.h" + +#include +#include + +#include "test_util/testharness.h" + +using ROCKSDB_NAMESPACE::Random; + +TEST(RandomTest, Uniform) { + const int average = 20; + for (uint32_t seed : {0, 1, 2, 37, 4096}) { + Random r(seed); + for (int range : {1, 2, 8, 12, 100}) { + std::vector counts(range, 0); + + for (int i = 0; i < range * average; ++i) { + ++counts.at(r.Uniform(range)); + } + int max_variance = static_cast(std::sqrt(range) * 2 + 4); + for (int i = 0; i < range; ++i) { + EXPECT_GE(counts[i], std::max(1, average - max_variance)); + EXPECT_LE(counts[i], average + max_variance + 1); + } + } + } +} + +TEST(RandomTest, OneIn) { + Random r(42); + for (int range : {1, 2, 8, 12, 100, 1234}) { + const int average = 100; + int count = 0; + for (int i = 0; i < average * range; ++i) { + if (r.OneIn(range)) { + ++count; + } + } + if (range == 1) { + EXPECT_EQ(count, average); + } else { + int max_variance = static_cast(std::sqrt(average) * 1.5); + EXPECT_GE(count, average - max_variance); + EXPECT_LE(count, average + max_variance); + } + } +} + +TEST(RandomTest, OneInOpt) { + Random r(42); + for (int range : {-12, 0, 1, 2, 8, 12, 100, 1234}) { + const int average = 100; + int count = 0; + for (int i = 0; i < average * range; ++i) { + if (r.OneInOpt(range)) { + ++count; + } + } + if (range < 1) { + EXPECT_EQ(count, 0); + } else if (range == 1) { + EXPECT_EQ(count, average); + } else { + int max_variance = static_cast(std::sqrt(average) * 1.5); + EXPECT_GE(count, average - max_variance); + EXPECT_LE(count, average + max_variance); + } + } +} + +TEST(RandomTest, PercentTrue) { + Random r(42); + for (int pct : {-12, 0, 1, 2, 10, 50, 90, 98, 99, 100, 1234}) { + const int samples = 10000; + + int count = 0; + for (int i = 0; i < samples; ++i) { + if (r.PercentTrue(pct)) { + ++count; + } + } + if (pct <= 0) { + EXPECT_EQ(count, 0); + } else if (pct >= 100) { + EXPECT_EQ(count, samples); + } else { + int est = (count * 100 + (samples / 2)) / samples; + EXPECT_EQ(est, pct); + } + } +} + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/rate_limiter.cc b/src/rocksdb/util/rate_limiter.cc new file mode 100644 index 000000000..6bbcabfae --- /dev/null +++ b/src/rocksdb/util/rate_limiter.cc @@ -0,0 +1,378 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
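A note on the tolerances used in the random tests above: for OneIn(range) sampled average * range times, the hit count is Binomial(N = average * range, p = 1/range), with mean average and standard deviation roughly sqrt(average), so the +-1.5 * sqrt(average) window is about a 1.5-sigma band, loose enough to pass with the fixed seeds used while still catching gross bias.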
+ +#include "util/rate_limiter.h" + +#include + +#include "monitoring/statistics.h" +#include "port/port.h" +#include "rocksdb/system_clock.h" +#include "test_util/sync_point.h" +#include "util/aligned_buffer.h" + +namespace ROCKSDB_NAMESPACE { +size_t RateLimiter::RequestToken(size_t bytes, size_t alignment, + Env::IOPriority io_priority, Statistics* stats, + RateLimiter::OpType op_type) { + if (io_priority < Env::IO_TOTAL && IsRateLimited(op_type)) { + bytes = std::min(bytes, static_cast(GetSingleBurstBytes())); + + if (alignment > 0) { + // Here we may actually require more than burst and block + // as we can not write/read less than one page at a time on direct I/O + // thus we do not want to be strictly constrained by burst + bytes = std::max(alignment, TruncateToPageBoundary(alignment, bytes)); + } + Request(bytes, io_priority, stats, op_type); + } + return bytes; +} + +// Pending request +struct GenericRateLimiter::Req { + explicit Req(int64_t _bytes, port::Mutex* _mu) + : request_bytes(_bytes), bytes(_bytes), cv(_mu), granted(false) {} + int64_t request_bytes; + int64_t bytes; + port::CondVar cv; + bool granted; +}; + +GenericRateLimiter::GenericRateLimiter( + int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness, + RateLimiter::Mode mode, const std::shared_ptr& clock, + bool auto_tuned) + : RateLimiter(mode), + refill_period_us_(refill_period_us), + rate_bytes_per_sec_(auto_tuned ? rate_bytes_per_sec / 2 + : rate_bytes_per_sec), + refill_bytes_per_period_( + CalculateRefillBytesPerPeriodLocked(rate_bytes_per_sec_)), + clock_(clock), + stop_(false), + exit_cv_(&request_mutex_), + requests_to_wait_(0), + available_bytes_(0), + next_refill_us_(NowMicrosMonotonicLocked()), + fairness_(fairness > 100 ? 100 : fairness), + rnd_((uint32_t)time(nullptr)), + wait_until_refill_pending_(false), + auto_tuned_(auto_tuned), + num_drains_(0), + max_bytes_per_sec_(rate_bytes_per_sec), + tuned_time_(NowMicrosMonotonicLocked()) { + for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) { + total_requests_[i] = 0; + total_bytes_through_[i] = 0; + } +} + +GenericRateLimiter::~GenericRateLimiter() { + MutexLock g(&request_mutex_); + stop_ = true; + std::deque::size_type queues_size_sum = 0; + for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) { + queues_size_sum += queue_[i].size(); + } + requests_to_wait_ = static_cast(queues_size_sum); + + for (int i = Env::IO_TOTAL - 1; i >= Env::IO_LOW; --i) { + std::deque queue = queue_[i]; + for (auto& r : queue) { + r->cv.Signal(); + } + } + + while (requests_to_wait_ > 0) { + exit_cv_.Wait(); + } +} + +// This API allows user to dynamically change rate limiter's bytes per second. 
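A worked check of the alignment clamp in RequestToken above (a sketch; it assumes 4 KiB alignment and that TruncateToPageBoundary(align, n), from util/aligned_buffer.h, rounds n down to a multiple of align):

#include <algorithm>
#include <cassert>
#include <cstddef>

#include "util/aligned_buffer.h"

int main() {
  using ROCKSDB_NAMESPACE::TruncateToPageBoundary;
  const size_t kAlign = 4096;
  // A 10000-byte, burst-limited request is trimmed down to two whole pages.
  assert(std::max(kAlign, TruncateToPageBoundary(kAlign, 10000)) == 8192);
  // A sub-page request is bumped up to one full page, which is why the result
  // may exceed GetSingleBurstBytes() under direct I/O.
  assert(std::max(kAlign, TruncateToPageBoundary(kAlign, 1000)) == 4096);
  return 0;
}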
+void GenericRateLimiter::SetBytesPerSecond(int64_t bytes_per_second) { + MutexLock g(&request_mutex_); + SetBytesPerSecondLocked(bytes_per_second); +} + +void GenericRateLimiter::SetBytesPerSecondLocked(int64_t bytes_per_second) { + assert(bytes_per_second > 0); + rate_bytes_per_sec_.store(bytes_per_second, std::memory_order_relaxed); + refill_bytes_per_period_.store( + CalculateRefillBytesPerPeriodLocked(bytes_per_second), + std::memory_order_relaxed); +} + +void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri, + Statistics* stats) { + assert(bytes <= refill_bytes_per_period_.load(std::memory_order_relaxed)); + bytes = std::max(static_cast(0), bytes); + TEST_SYNC_POINT("GenericRateLimiter::Request"); + TEST_SYNC_POINT_CALLBACK("GenericRateLimiter::Request:1", + &rate_bytes_per_sec_); + MutexLock g(&request_mutex_); + + if (auto_tuned_) { + static const int kRefillsPerTune = 100; + std::chrono::microseconds now(NowMicrosMonotonicLocked()); + if (now - tuned_time_ >= + kRefillsPerTune * std::chrono::microseconds(refill_period_us_)) { + Status s = TuneLocked(); + s.PermitUncheckedError(); //**TODO: What to do on error? + } + } + + if (stop_) { + // It is now in the clean-up of ~GenericRateLimiter(). + // Therefore any new incoming request will exit from here + // and not get satiesfied. + return; + } + + ++total_requests_[pri]; + + if (available_bytes_ >= bytes) { + // Refill thread assigns quota and notifies requests waiting on + // the queue under mutex. So if we get here, that means nobody + // is waiting? + available_bytes_ -= bytes; + total_bytes_through_[pri] += bytes; + return; + } + + // Request cannot be satisfied at this moment, enqueue + Req r(bytes, &request_mutex_); + queue_[pri].push_back(&r); + TEST_SYNC_POINT_CALLBACK("GenericRateLimiter::Request:PostEnqueueRequest", + &request_mutex_); + // A thread representing a queued request coordinates with other such threads. + // There are two main duties. + // + // (1) Waiting for the next refill time. + // (2) Refilling the bytes and granting requests. + do { + int64_t time_until_refill_us = next_refill_us_ - NowMicrosMonotonicLocked(); + if (time_until_refill_us > 0) { + if (wait_until_refill_pending_) { + // Somebody is performing (1). Trust we'll be woken up when our request + // is granted or we are needed for future duties. + r.cv.Wait(); + } else { + // Whichever thread reaches here first performs duty (1) as described + // above. + int64_t wait_until = clock_->NowMicros() + time_until_refill_us; + RecordTick(stats, NUMBER_RATE_LIMITER_DRAINS); + ++num_drains_; + wait_until_refill_pending_ = true; + r.cv.TimedWait(wait_until); + TEST_SYNC_POINT_CALLBACK("GenericRateLimiter::Request:PostTimedWait", + &time_until_refill_us); + wait_until_refill_pending_ = false; + } + } else { + // Whichever thread reaches here first performs duty (2) as described + // above. + RefillBytesAndGrantRequestsLocked(); + if (r.granted) { + // If there is any remaining requests, make sure there exists at least + // one candidate is awake for future duties by signaling a front request + // of a queue. + for (int i = Env::IO_TOTAL - 1; i >= Env::IO_LOW; --i) { + std::deque queue = queue_[i]; + if (!queue.empty()) { + queue.front()->cv.Signal(); + break; + } + } + } + } + // Invariant: non-granted request is always in one queue, and granted + // request is always in zero queues. 
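For context, a caller-side sketch of the public API whose internals are implemented above (rates and sizes are arbitrary; it uses only the factory and methods declared in rocksdb/rate_limiter.h, in the same way the tests further below do):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>

#include "rocksdb/env.h"
#include "rocksdb/rate_limiter.h"

void ThrottledWrite(const char* /*data*/, size_t len) {
  using ROCKSDB_NAMESPACE::Env;
  using ROCKSDB_NAMESPACE::RateLimiter;
  std::unique_ptr<RateLimiter> limiter(
      ROCKSDB_NAMESPACE::NewGenericRateLimiter(1 << 20 /* 1 MiB/s */));
  size_t written = 0;
  while (written < len) {
    // Never request more than one burst at a time; Request() blocks until the
    // bytes have been granted by the refill logic above.
    size_t chunk = std::min<size_t>(
        len - written, static_cast<size_t>(limiter->GetSingleBurstBytes()));
    limiter->Request(static_cast<int64_t>(chunk), Env::IO_HIGH,
                     nullptr /* stats */, RateLimiter::OpType::kWrite);
    // ... issue the actual chunk-sized write here ...
    written += chunk;
  }
}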
+#ifndef NDEBUG + int num_found = 0; + for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) { + if (std::find(queue_[i].begin(), queue_[i].end(), &r) != + queue_[i].end()) { + ++num_found; + } + } + if (r.granted) { + assert(num_found == 0); + } else { + assert(num_found == 1); + } +#endif // NDEBUG + } while (!stop_ && !r.granted); + + if (stop_) { + // It is now in the clean-up of ~GenericRateLimiter(). + // Therefore any woken-up request will have come out of the loop and then + // exit here. It might or might not have been satisfied. + --requests_to_wait_; + exit_cv_.Signal(); + } +} + +std::vector +GenericRateLimiter::GeneratePriorityIterationOrderLocked() { + std::vector pri_iteration_order(Env::IO_TOTAL /* 4 */); + // We make Env::IO_USER a superior priority by always iterating its queue + // first + pri_iteration_order[0] = Env::IO_USER; + + bool high_pri_iterated_after_mid_low_pri = rnd_.OneIn(fairness_); + TEST_SYNC_POINT_CALLBACK( + "GenericRateLimiter::GeneratePriorityIterationOrderLocked::" + "PostRandomOneInFairnessForHighPri", + &high_pri_iterated_after_mid_low_pri); + bool mid_pri_itereated_after_low_pri = rnd_.OneIn(fairness_); + TEST_SYNC_POINT_CALLBACK( + "GenericRateLimiter::GeneratePriorityIterationOrderLocked::" + "PostRandomOneInFairnessForMidPri", + &mid_pri_itereated_after_low_pri); + + if (high_pri_iterated_after_mid_low_pri) { + pri_iteration_order[3] = Env::IO_HIGH; + pri_iteration_order[2] = + mid_pri_itereated_after_low_pri ? Env::IO_MID : Env::IO_LOW; + pri_iteration_order[1] = + (pri_iteration_order[2] == Env::IO_MID) ? Env::IO_LOW : Env::IO_MID; + } else { + pri_iteration_order[1] = Env::IO_HIGH; + pri_iteration_order[3] = + mid_pri_itereated_after_low_pri ? Env::IO_MID : Env::IO_LOW; + pri_iteration_order[2] = + (pri_iteration_order[3] == Env::IO_MID) ? 
Env::IO_LOW : Env::IO_MID; + } + + TEST_SYNC_POINT_CALLBACK( + "GenericRateLimiter::GeneratePriorityIterationOrderLocked::" + "PreReturnPriIterationOrder", + &pri_iteration_order); + return pri_iteration_order; +} + +void GenericRateLimiter::RefillBytesAndGrantRequestsLocked() { + TEST_SYNC_POINT_CALLBACK( + "GenericRateLimiter::RefillBytesAndGrantRequestsLocked", &request_mutex_); + next_refill_us_ = NowMicrosMonotonicLocked() + refill_period_us_; + // Carry over the left over quota from the last period + auto refill_bytes_per_period = + refill_bytes_per_period_.load(std::memory_order_relaxed); + if (available_bytes_ < refill_bytes_per_period) { + available_bytes_ += refill_bytes_per_period; + } + + std::vector pri_iteration_order = + GeneratePriorityIterationOrderLocked(); + + for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) { + assert(!pri_iteration_order.empty()); + Env::IOPriority current_pri = pri_iteration_order[i]; + auto* queue = &queue_[current_pri]; + while (!queue->empty()) { + auto* next_req = queue->front(); + if (available_bytes_ < next_req->request_bytes) { + // Grant partial request_bytes to avoid starvation of requests + // that become asking for more bytes than available_bytes_ + // due to dynamically reduced rate limiter's bytes_per_second that + // leads to reduced refill_bytes_per_period hence available_bytes_ + next_req->request_bytes -= available_bytes_; + available_bytes_ = 0; + break; + } + available_bytes_ -= next_req->request_bytes; + next_req->request_bytes = 0; + total_bytes_through_[current_pri] += next_req->bytes; + queue->pop_front(); + + next_req->granted = true; + // Quota granted, signal the thread to exit + next_req->cv.Signal(); + } + } +} + +int64_t GenericRateLimiter::CalculateRefillBytesPerPeriodLocked( + int64_t rate_bytes_per_sec) { + if (std::numeric_limits::max() / rate_bytes_per_sec < + refill_period_us_) { + // Avoid unexpected result in the overflow case. The result now is still + // inaccurate but is a number that is large enough. + return std::numeric_limits::max() / 1000000; + } else { + return rate_bytes_per_sec * refill_period_us_ / 1000000; + } +} + +Status GenericRateLimiter::TuneLocked() { + const int kLowWatermarkPct = 50; + const int kHighWatermarkPct = 90; + const int kAdjustFactorPct = 5; + // computed rate limit will be in + // `[max_bytes_per_sec_ / kAllowedRangeFactor, max_bytes_per_sec_]`. + const int kAllowedRangeFactor = 20; + + std::chrono::microseconds prev_tuned_time = tuned_time_; + tuned_time_ = std::chrono::microseconds(NowMicrosMonotonicLocked()); + + int64_t elapsed_intervals = (tuned_time_ - prev_tuned_time + + std::chrono::microseconds(refill_period_us_) - + std::chrono::microseconds(1)) / + std::chrono::microseconds(refill_period_us_); + // We tune every kRefillsPerTune intervals, so the overflow and division-by- + // zero conditions should never happen. 
+ assert(num_drains_ <= std::numeric_limits::max() / 100); + assert(elapsed_intervals > 0); + int64_t drained_pct = num_drains_ * 100 / elapsed_intervals; + + int64_t prev_bytes_per_sec = GetBytesPerSecond(); + int64_t new_bytes_per_sec; + if (drained_pct == 0) { + new_bytes_per_sec = max_bytes_per_sec_ / kAllowedRangeFactor; + } else if (drained_pct < kLowWatermarkPct) { + // sanitize to prevent overflow + int64_t sanitized_prev_bytes_per_sec = + std::min(prev_bytes_per_sec, std::numeric_limits::max() / 100); + new_bytes_per_sec = + std::max(max_bytes_per_sec_ / kAllowedRangeFactor, + sanitized_prev_bytes_per_sec * 100 / (100 + kAdjustFactorPct)); + } else if (drained_pct > kHighWatermarkPct) { + // sanitize to prevent overflow + int64_t sanitized_prev_bytes_per_sec = + std::min(prev_bytes_per_sec, std::numeric_limits::max() / + (100 + kAdjustFactorPct)); + new_bytes_per_sec = + std::min(max_bytes_per_sec_, + sanitized_prev_bytes_per_sec * (100 + kAdjustFactorPct) / 100); + } else { + new_bytes_per_sec = prev_bytes_per_sec; + } + if (new_bytes_per_sec != prev_bytes_per_sec) { + SetBytesPerSecondLocked(new_bytes_per_sec); + } + num_drains_ = 0; + return Status::OK(); +} + +RateLimiter* NewGenericRateLimiter( + int64_t rate_bytes_per_sec, int64_t refill_period_us /* = 100 * 1000 */, + int32_t fairness /* = 10 */, + RateLimiter::Mode mode /* = RateLimiter::Mode::kWritesOnly */, + bool auto_tuned /* = false */) { + assert(rate_bytes_per_sec > 0); + assert(refill_period_us > 0); + assert(fairness > 0); + std::unique_ptr limiter( + new GenericRateLimiter(rate_bytes_per_sec, refill_period_us, fairness, + mode, SystemClock::Default(), auto_tuned)); + return limiter.release(); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/rate_limiter.h b/src/rocksdb/util/rate_limiter.h new file mode 100644 index 000000000..4c078f5a0 --- /dev/null +++ b/src/rocksdb/util/rate_limiter.h @@ -0,0 +1,146 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include +#include +#include + +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/rate_limiter.h" +#include "rocksdb/status.h" +#include "rocksdb/system_clock.h" +#include "util/mutexlock.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +class GenericRateLimiter : public RateLimiter { + public: + GenericRateLimiter(int64_t refill_bytes, int64_t refill_period_us, + int32_t fairness, RateLimiter::Mode mode, + const std::shared_ptr& clock, + bool auto_tuned); + + virtual ~GenericRateLimiter(); + + // This API allows user to dynamically change rate limiter's bytes per second. + virtual void SetBytesPerSecond(int64_t bytes_per_second) override; + + // Request for token to write bytes. If this request can not be satisfied, + // the call is blocked. Caller is responsible to make sure + // bytes <= GetSingleBurstBytes() and bytes >= 0. Negative bytes + // passed in will be rounded up to 0. 
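To make the tuning rule above concrete, a worked example with max_bytes_per_sec_ = 100 MB/s, a previous rate of 40 MB/s, and the constants in TuneLocked (kAllowedRangeFactor = 20, kAdjustFactorPct = 5, watermarks 50/90):

  drained_pct == 0          -> new rate = 100 / 20                  = 5 MB/s
  drained_pct == 30 (< 50)  -> new rate = max(5, 40 * 100 / 105)    ~= 38.1 MB/s
  50 <= drained_pct <= 90   -> new rate = 40 MB/s (unchanged)
  drained_pct == 95 (> 90)  -> new rate = min(100, 40 * 105 / 100)  = 42 MB/s

The numbers are illustrative only; the ratios are the same at any scale.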
+ using RateLimiter::Request; + virtual void Request(const int64_t bytes, const Env::IOPriority pri, + Statistics* stats) override; + + virtual int64_t GetSingleBurstBytes() const override { + return refill_bytes_per_period_.load(std::memory_order_relaxed); + } + + virtual int64_t GetTotalBytesThrough( + const Env::IOPriority pri = Env::IO_TOTAL) const override { + MutexLock g(&request_mutex_); + if (pri == Env::IO_TOTAL) { + int64_t total_bytes_through_sum = 0; + for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) { + total_bytes_through_sum += total_bytes_through_[i]; + } + return total_bytes_through_sum; + } + return total_bytes_through_[pri]; + } + + virtual int64_t GetTotalRequests( + const Env::IOPriority pri = Env::IO_TOTAL) const override { + MutexLock g(&request_mutex_); + if (pri == Env::IO_TOTAL) { + int64_t total_requests_sum = 0; + for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) { + total_requests_sum += total_requests_[i]; + } + return total_requests_sum; + } + return total_requests_[pri]; + } + + virtual Status GetTotalPendingRequests( + int64_t* total_pending_requests, + const Env::IOPriority pri = Env::IO_TOTAL) const override { + assert(total_pending_requests != nullptr); + MutexLock g(&request_mutex_); + if (pri == Env::IO_TOTAL) { + int64_t total_pending_requests_sum = 0; + for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) { + total_pending_requests_sum += static_cast(queue_[i].size()); + } + *total_pending_requests = total_pending_requests_sum; + } else { + *total_pending_requests = static_cast(queue_[pri].size()); + } + return Status::OK(); + } + + virtual int64_t GetBytesPerSecond() const override { + return rate_bytes_per_sec_.load(std::memory_order_relaxed); + } + + virtual void TEST_SetClock(std::shared_ptr clock) { + MutexLock g(&request_mutex_); + clock_ = std::move(clock); + next_refill_us_ = NowMicrosMonotonicLocked(); + } + + private: + void RefillBytesAndGrantRequestsLocked(); + std::vector GeneratePriorityIterationOrderLocked(); + int64_t CalculateRefillBytesPerPeriodLocked(int64_t rate_bytes_per_sec); + Status TuneLocked(); + void SetBytesPerSecondLocked(int64_t bytes_per_second); + + uint64_t NowMicrosMonotonicLocked() { + return clock_->NowNanos() / std::milli::den; + } + + // This mutex guard all internal states + mutable port::Mutex request_mutex_; + + const int64_t refill_period_us_; + + std::atomic rate_bytes_per_sec_; + std::atomic refill_bytes_per_period_; + std::shared_ptr clock_; + + bool stop_; + port::CondVar exit_cv_; + int32_t requests_to_wait_; + + int64_t total_requests_[Env::IO_TOTAL]; + int64_t total_bytes_through_[Env::IO_TOTAL]; + int64_t available_bytes_; + int64_t next_refill_us_; + + int32_t fairness_; + Random rnd_; + + struct Req; + std::deque queue_[Env::IO_TOTAL]; + bool wait_until_refill_pending_; + + bool auto_tuned_; + int64_t num_drains_; + const int64_t max_bytes_per_sec_; + std::chrono::microseconds tuned_time_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/rate_limiter_test.cc b/src/rocksdb/util/rate_limiter_test.cc new file mode 100644 index 000000000..cda134867 --- /dev/null +++ b/src/rocksdb/util/rate_limiter_test.cc @@ -0,0 +1,476 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/rate_limiter.h" + +#include +#include +#include +#include + +#include "db/db_test_util.h" +#include "port/port.h" +#include "rocksdb/system_clock.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +// TODO(yhchiang): the rate will not be accurate when we run test in parallel. +class RateLimiterTest : public testing::Test { + protected: + ~RateLimiterTest() override { + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + } +}; + +TEST_F(RateLimiterTest, OverflowRate) { + GenericRateLimiter limiter(std::numeric_limits::max(), 1000, 10, + RateLimiter::Mode::kWritesOnly, + SystemClock::Default(), false /* auto_tuned */); + ASSERT_GT(limiter.GetSingleBurstBytes(), 1000000000ll); +} + +TEST_F(RateLimiterTest, StartStop) { + std::unique_ptr limiter(NewGenericRateLimiter(100, 100, 10)); +} + +TEST_F(RateLimiterTest, GetTotalBytesThrough) { + std::unique_ptr limiter(NewGenericRateLimiter( + 200 /* rate_bytes_per_sec */, 1000 * 1000 /* refill_period_us */, + 10 /* fairness */)); + for (int i = Env::IO_LOW; i <= Env::IO_TOTAL; ++i) { + ASSERT_EQ(limiter->GetTotalBytesThrough(static_cast(i)), + 0); + } + + std::int64_t request_byte = 200; + std::int64_t request_byte_sum = 0; + for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) { + limiter->Request(request_byte, static_cast(i), + nullptr /* stats */, RateLimiter::OpType::kWrite); + request_byte_sum += request_byte; + } + + for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) { + EXPECT_EQ(limiter->GetTotalBytesThrough(static_cast(i)), + request_byte) + << "Failed to track total_bytes_through_ correctly when IOPriority = " + << static_cast(i); + } + EXPECT_EQ(limiter->GetTotalBytesThrough(Env::IO_TOTAL), request_byte_sum) + << "Failed to track total_bytes_through_ correctly when IOPriority = " + "Env::IO_TOTAL"; +} + +TEST_F(RateLimiterTest, GetTotalRequests) { + std::unique_ptr limiter(NewGenericRateLimiter( + 200 /* rate_bytes_per_sec */, 1000 * 1000 /* refill_period_us */, + 10 /* fairness */)); + for (int i = Env::IO_LOW; i <= Env::IO_TOTAL; ++i) { + ASSERT_EQ(limiter->GetTotalRequests(static_cast(i)), 0); + } + + std::int64_t total_requests_sum = 0; + for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) { + limiter->Request(200, static_cast(i), nullptr /* stats */, + RateLimiter::OpType::kWrite); + total_requests_sum += 1; + } + + for (int i = Env::IO_LOW; i < Env::IO_TOTAL; ++i) { + EXPECT_EQ(limiter->GetTotalRequests(static_cast(i)), 1) + << "Failed to track total_requests_ correctly when IOPriority = " + << static_cast(i); + } + EXPECT_EQ(limiter->GetTotalRequests(Env::IO_TOTAL), total_requests_sum) + << "Failed to track total_requests_ correctly when IOPriority = " + "Env::IO_TOTAL"; +} + +TEST_F(RateLimiterTest, GetTotalPendingRequests) { + std::unique_ptr limiter(NewGenericRateLimiter( + 200 /* rate_bytes_per_sec */, 1000 * 1000 /* refill_period_us */, + 10 /* fairness */)); + int64_t total_pending_requests = 0; + for (int i = Env::IO_LOW; i <= Env::IO_TOTAL; ++i) { + ASSERT_OK(limiter->GetTotalPendingRequests( + &total_pending_requests, static_cast(i))); + ASSERT_EQ(total_pending_requests, 0); + } + // This is a variable for making sure the following callback is called + // and the assertions in it are indeed excuted + bool 
nonzero_pending_requests_verified = false; + SyncPoint::GetInstance()->SetCallBack( + "GenericRateLimiter::Request:PostEnqueueRequest", [&](void* arg) { + port::Mutex* request_mutex = (port::Mutex*)arg; + // We temporarily unlock the mutex so that the following + // GetTotalPendingRequests() can acquire it + request_mutex->Unlock(); + for (int i = Env::IO_LOW; i <= Env::IO_TOTAL; ++i) { + EXPECT_OK(limiter->GetTotalPendingRequests( + &total_pending_requests, static_cast(i))) + << "Failed to return total pending requests for priority level = " + << static_cast(i); + if (i == Env::IO_USER || i == Env::IO_TOTAL) { + EXPECT_EQ(total_pending_requests, 1) + << "Failed to correctly return total pending requests for " + "priority level = " + << static_cast(i); + } else { + EXPECT_EQ(total_pending_requests, 0) + << "Failed to correctly return total pending requests for " + "priority level = " + << static_cast(i); + } + } + // We lock the mutex again so that the request thread can resume running + // with the mutex locked + request_mutex->Lock(); + nonzero_pending_requests_verified = true; + }); + + SyncPoint::GetInstance()->EnableProcessing(); + limiter->Request(200, Env::IO_USER, nullptr /* stats */, + RateLimiter::OpType::kWrite); + ASSERT_EQ(nonzero_pending_requests_verified, true); + for (int i = Env::IO_LOW; i <= Env::IO_TOTAL; ++i) { + EXPECT_OK(limiter->GetTotalPendingRequests(&total_pending_requests, + static_cast(i))) + << "Failed to return total pending requests for priority level = " + << static_cast(i); + EXPECT_EQ(total_pending_requests, 0) + << "Failed to correctly return total pending requests for priority " + "level = " + << static_cast(i); + } + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearCallBack( + "GenericRateLimiter::Request:PostEnqueueRequest"); +} + +TEST_F(RateLimiterTest, Modes) { + for (auto mode : {RateLimiter::Mode::kWritesOnly, + RateLimiter::Mode::kReadsOnly, RateLimiter::Mode::kAllIo}) { + GenericRateLimiter limiter(2000 /* rate_bytes_per_sec */, + 1000 * 1000 /* refill_period_us */, + 10 /* fairness */, mode, SystemClock::Default(), + false /* auto_tuned */); + limiter.Request(1000 /* bytes */, Env::IO_HIGH, nullptr /* stats */, + RateLimiter::OpType::kRead); + if (mode == RateLimiter::Mode::kWritesOnly) { + ASSERT_EQ(0, limiter.GetTotalBytesThrough(Env::IO_HIGH)); + } else { + ASSERT_EQ(1000, limiter.GetTotalBytesThrough(Env::IO_HIGH)); + } + + limiter.Request(1000 /* bytes */, Env::IO_HIGH, nullptr /* stats */, + RateLimiter::OpType::kWrite); + if (mode == RateLimiter::Mode::kAllIo) { + ASSERT_EQ(2000, limiter.GetTotalBytesThrough(Env::IO_HIGH)); + } else { + ASSERT_EQ(1000, limiter.GetTotalBytesThrough(Env::IO_HIGH)); + } + } +} + +TEST_F(RateLimiterTest, GeneratePriorityIterationOrder) { + std::unique_ptr limiter(NewGenericRateLimiter( + 200 /* rate_bytes_per_sec */, 1000 * 1000 /* refill_period_us */, + 10 /* fairness */)); + + bool possible_random_one_in_fairness_results_for_high_mid_pri[4][2] = { + {false, false}, {false, true}, {true, false}, {true, true}}; + std::vector possible_priority_iteration_orders[4] = { + {Env::IO_USER, Env::IO_HIGH, Env::IO_MID, Env::IO_LOW}, + {Env::IO_USER, Env::IO_HIGH, Env::IO_LOW, Env::IO_MID}, + {Env::IO_USER, Env::IO_MID, Env::IO_LOW, Env::IO_HIGH}, + {Env::IO_USER, Env::IO_LOW, Env::IO_MID, Env::IO_HIGH}}; + + for (int i = 0; i < 4; ++i) { + // These are variables for making sure the following callbacks are called + // and the assertion in the last callback is indeed excuted + bool 
high_pri_iterated_after_mid_low_pri_set = false; + bool mid_pri_itereated_after_low_pri_set = false; + bool pri_iteration_order_verified = false; + SyncPoint::GetInstance()->SetCallBack( + "GenericRateLimiter::GeneratePriorityIterationOrderLocked::" + "PostRandomOneInFairnessForHighPri", + [&](void* arg) { + bool* high_pri_iterated_after_mid_low_pri = (bool*)arg; + *high_pri_iterated_after_mid_low_pri = + possible_random_one_in_fairness_results_for_high_mid_pri[i][0]; + high_pri_iterated_after_mid_low_pri_set = true; + }); + + SyncPoint::GetInstance()->SetCallBack( + "GenericRateLimiter::GeneratePriorityIterationOrderLocked::" + "PostRandomOneInFairnessForMidPri", + [&](void* arg) { + bool* mid_pri_itereated_after_low_pri = (bool*)arg; + *mid_pri_itereated_after_low_pri = + possible_random_one_in_fairness_results_for_high_mid_pri[i][1]; + mid_pri_itereated_after_low_pri_set = true; + }); + + SyncPoint::GetInstance()->SetCallBack( + "GenericRateLimiter::GeneratePriorityIterationOrderLocked::" + "PreReturnPriIterationOrder", + [&](void* arg) { + std::vector* pri_iteration_order = + (std::vector*)arg; + EXPECT_EQ(*pri_iteration_order, possible_priority_iteration_orders[i]) + << "Failed to generate priority iteration order correctly when " + "high_pri_iterated_after_mid_low_pri = " + << possible_random_one_in_fairness_results_for_high_mid_pri[i][0] + << ", mid_pri_itereated_after_low_pri = " + << possible_random_one_in_fairness_results_for_high_mid_pri[i][1] + << std::endl; + pri_iteration_order_verified = true; + }); + + SyncPoint::GetInstance()->EnableProcessing(); + limiter->Request(200 /* request max bytes to drain so that refill and order + generation will be triggered every time + GenericRateLimiter::Request() is called */ + , + Env::IO_USER, nullptr /* stats */, + RateLimiter::OpType::kWrite); + ASSERT_EQ(high_pri_iterated_after_mid_low_pri_set, true); + ASSERT_EQ(mid_pri_itereated_after_low_pri_set, true); + ASSERT_EQ(pri_iteration_order_verified, true); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearCallBack( + "GenericRateLimiter::GeneratePriorityIterationOrderLocked::" + "PreReturnPriIterationOrder"); + SyncPoint::GetInstance()->ClearCallBack( + "GenericRateLimiter::GeneratePriorityIterationOrderLocked::" + "PostRandomOneInFairnessForMidPri"); + SyncPoint::GetInstance()->ClearCallBack( + "GenericRateLimiter::GeneratePriorityIterationOrderLocked::" + "PostRandomOneInFairnessForHighPri"); + } +} + +TEST_F(RateLimiterTest, Rate) { + auto* env = Env::Default(); + struct Arg { + Arg(int32_t _target_rate, int _burst) + : limiter(NewGenericRateLimiter(_target_rate /* rate_bytes_per_sec */, + 100 * 1000 /* refill_period_us */, + 10 /* fairness */)), + request_size(_target_rate / + 10 /* refill period here is 1/10 second */), + burst(_burst) {} + std::unique_ptr limiter; + int32_t request_size; + int burst; + }; + + auto writer = [](void* p) { + const auto& thread_clock = SystemClock::Default(); + auto* arg = static_cast(p); + // Test for 2 seconds + auto until = thread_clock->NowMicros() + 2 * 1000000; + Random r((uint32_t)(thread_clock->NowNanos() % + std::numeric_limits::max())); + while (thread_clock->NowMicros() < until) { + for (int i = 0; i < static_cast(r.Skewed(arg->burst * 2) + 1); ++i) { + arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1, + Env::IO_USER, nullptr /* stats */, + RateLimiter::OpType::kWrite); + } + + for (int i = 0; i < static_cast(r.Skewed(arg->burst) + 1); ++i) { + arg->limiter->Request(r.Uniform(arg->request_size - 1) + 
1, + Env::IO_HIGH, nullptr /* stats */, + RateLimiter::OpType::kWrite); + } + + for (int i = 0; i < static_cast(r.Skewed(arg->burst / 2 + 1) + 1); + ++i) { + arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1, Env::IO_MID, + nullptr /* stats */, RateLimiter::OpType::kWrite); + } + + arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1, Env::IO_LOW, + nullptr /* stats */, RateLimiter::OpType::kWrite); + } + }; + + int samples = 0; + int samples_at_minimum = 0; + + for (int i = 1; i <= 16; i *= 2) { + int32_t target = i * 1024 * 10; + Arg arg(target, i / 4 + 1); + int64_t old_total_bytes_through = 0; + for (int iter = 1; iter <= 2; ++iter) { + // second iteration changes the target dynamically + if (iter == 2) { + target *= 2; + arg.limiter->SetBytesPerSecond(target); + } + auto start = env->NowMicros(); + for (int t = 0; t < i; ++t) { + env->StartThread(writer, &arg); + } + env->WaitForJoin(); + + auto elapsed = env->NowMicros() - start; + double rate = + (arg.limiter->GetTotalBytesThrough() - old_total_bytes_through) * + 1000000.0 / elapsed; + old_total_bytes_through = arg.limiter->GetTotalBytesThrough(); + fprintf(stderr, + "request size [1 - %" PRIi32 "], limit %" PRIi32 + " KB/sec, actual rate: %lf KB/sec, elapsed %.2lf seconds\n", + arg.request_size - 1, target / 1024, rate / 1024, + elapsed / 1000000.0); + + ++samples; + if (rate / target >= 0.80) { + ++samples_at_minimum; + } + ASSERT_LE(rate / target, 1.25); + } + } + + // This can fail due to slow execution speed, like when using valgrind or in + // heavily loaded CI environments + bool skip_minimum_rate_check = +#if (defined(CIRCLECI) && defined(OS_MACOSX)) || defined(ROCKSDB_VALGRIND_RUN) + true; +#else + getenv("SANDCASTLE"); +#endif + if (skip_minimum_rate_check) { + fprintf(stderr, "Skipped minimum rate check (%d / %d passed)\n", + samples_at_minimum, samples); + } else { + ASSERT_EQ(samples_at_minimum, samples); + } +} + +TEST_F(RateLimiterTest, LimitChangeTest) { + // starvation test when limit changes to a smaller value + int64_t refill_period = 1000 * 1000; + auto* env = Env::Default(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + struct Arg { + Arg(int32_t _request_size, Env::IOPriority _pri, + std::shared_ptr _limiter) + : request_size(_request_size), pri(_pri), limiter(_limiter) {} + int32_t request_size; + Env::IOPriority pri; + std::shared_ptr limiter; + }; + + auto writer = [](void* p) { + auto* arg = static_cast(p); + arg->limiter->Request(arg->request_size, arg->pri, nullptr /* stats */, + RateLimiter::OpType::kWrite); + }; + + for (uint32_t i = 1; i <= 16; i <<= 1) { + int32_t target = i * 1024 * 10; + // refill per second + for (int iter = 0; iter < 2; iter++) { + std::shared_ptr limiter = + std::make_shared( + target, refill_period, 10, RateLimiter::Mode::kWritesOnly, + SystemClock::Default(), false /* auto_tuned */); + // After "GenericRateLimiter::Request:1" the mutex is held until the bytes + // are refilled. This test could be improved to change the limit when lock + // is released in `TimedWait()`. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"GenericRateLimiter::Request", + "RateLimiterTest::LimitChangeTest:changeLimitStart"}, + {"RateLimiterTest::LimitChangeTest:changeLimitEnd", + "GenericRateLimiter::Request:1"}}); + Arg arg(target, Env::IO_HIGH, limiter); + // The idea behind is to start a request first, then before it refills, + // update limit to a different value (2X/0.5X). 
No starvation should + // be guaranteed under any situation + // TODO(lightmark): more test cases are welcome. + env->StartThread(writer, &arg); + int32_t new_limit = (target << 1) >> (iter << 1); + TEST_SYNC_POINT("RateLimiterTest::LimitChangeTest:changeLimitStart"); + arg.limiter->SetBytesPerSecond(new_limit); + TEST_SYNC_POINT("RateLimiterTest::LimitChangeTest:changeLimitEnd"); + env->WaitForJoin(); + fprintf(stderr, + "[COMPLETE] request size %" PRIi32 " KB, new limit %" PRIi32 + "KB/sec, refill period %" PRIi64 " ms\n", + target / 1024, new_limit / 1024, refill_period / 1000); + } + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(RateLimiterTest, AutoTuneIncreaseWhenFull) { + const std::chrono::seconds kTimePerRefill(1); + const int kRefillsPerTune = 100; // needs to match util/rate_limiter.cc + + SpecialEnv special_env(Env::Default(), /*time_elapse_only_sleep*/ true); + + auto stats = CreateDBStatistics(); + std::unique_ptr rate_limiter(new GenericRateLimiter( + 1000 /* rate_bytes_per_sec */, + std::chrono::microseconds(kTimePerRefill).count(), 10 /* fairness */, + RateLimiter::Mode::kWritesOnly, special_env.GetSystemClock(), + true /* auto_tuned */)); + + // Rate limiter uses `CondVar::TimedWait()`, which does not have access to the + // `Env` to advance its time according to the fake wait duration. The + // workaround is to install a callback that advance the `Env`'s mock time. + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "GenericRateLimiter::Request:PostTimedWait", [&](void* arg) { + int64_t time_waited_us = *static_cast(arg); + special_env.SleepForMicroseconds(static_cast(time_waited_us)); + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // verify rate limit increases after a sequence of periods where rate limiter + // is always drained + int64_t orig_bytes_per_sec = rate_limiter->GetSingleBurstBytes(); + rate_limiter->Request(orig_bytes_per_sec, Env::IO_HIGH, stats.get(), + RateLimiter::OpType::kWrite); + while (std::chrono::microseconds(special_env.NowMicros()) <= + kRefillsPerTune * kTimePerRefill) { + rate_limiter->Request(orig_bytes_per_sec, Env::IO_HIGH, stats.get(), + RateLimiter::OpType::kWrite); + } + int64_t new_bytes_per_sec = rate_limiter->GetSingleBurstBytes(); + ASSERT_GT(new_bytes_per_sec, orig_bytes_per_sec); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "GenericRateLimiter::Request:PostTimedWait"); + + // decreases after a sequence of periods where rate limiter is not drained + orig_bytes_per_sec = new_bytes_per_sec; + special_env.SleepForMicroseconds(static_cast( + kRefillsPerTune * std::chrono::microseconds(kTimePerRefill).count())); + // make a request so tuner can be triggered + rate_limiter->Request(1 /* bytes */, Env::IO_HIGH, stats.get(), + RateLimiter::OpType::kWrite); + new_bytes_per_sec = rate_limiter->GetSingleBurstBytes(); + ASSERT_LT(new_bytes_per_sec, orig_bytes_per_sec); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/repeatable_thread.h b/src/rocksdb/util/repeatable_thread.h new file mode 100644 index 000000000..c75ad7c49 --- /dev/null +++ b/src/rocksdb/util/repeatable_thread.h @@ -0,0 +1,149 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "monitoring/instrumented_mutex.h" +#include "port/port.h" +#include "rocksdb/system_clock.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +// Simple wrapper around port::Thread that supports calling a callback every +// X seconds. If you pass in 0, then it will call your callback repeatedly +// without delay. +class RepeatableThread { + public: + RepeatableThread(std::function function, + const std::string& thread_name, SystemClock* clock, + uint64_t delay_us, uint64_t initial_delay_us = 0) + : function_(function), + thread_name_("rocksdb:" + thread_name), + clock_(clock), + delay_us_(delay_us), + initial_delay_us_(initial_delay_us), + mutex_(clock), + cond_var_(&mutex_), + running_(true), +#ifndef NDEBUG + waiting_(false), + run_count_(0), +#endif + thread_([this] { thread(); }) { + } + + void cancel() { + { + InstrumentedMutexLock l(&mutex_); + if (!running_) { + return; + } + running_ = false; + cond_var_.SignalAll(); + } + thread_.join(); + } + + bool IsRunning() { return running_; } + + ~RepeatableThread() { cancel(); } + +#ifndef NDEBUG + // Wait until RepeatableThread starting waiting, call the optional callback, + // then wait for one run of RepeatableThread. Tests can use provide a + // custom clock object to mock time, and use the callback here to bump current + // time and trigger RepeatableThread. See repeatable_thread_test for example. + // + // Note: only support one caller of this method. + void TEST_WaitForRun(std::function callback = nullptr) { + InstrumentedMutexLock l(&mutex_); + while (!waiting_) { + cond_var_.Wait(); + } + uint64_t prev_count = run_count_; + if (callback != nullptr) { + callback(); + } + cond_var_.SignalAll(); + while (!(run_count_ > prev_count)) { + cond_var_.Wait(); + } + } +#endif + + private: + bool wait(uint64_t delay) { + InstrumentedMutexLock l(&mutex_); + if (running_ && delay > 0) { + uint64_t wait_until = clock_->NowMicros() + delay; +#ifndef NDEBUG + waiting_ = true; + cond_var_.SignalAll(); +#endif + while (running_) { + cond_var_.TimedWait(wait_until); + if (clock_->NowMicros() >= wait_until) { + break; + } + } +#ifndef NDEBUG + waiting_ = false; +#endif + } + return running_; + } + + void thread() { +#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 12) + // Set thread name. + auto thread_handle = thread_.native_handle(); + int ret __attribute__((__unused__)) = + pthread_setname_np(thread_handle, thread_name_.c_str()); + assert(ret == 0); +#endif +#endif + + assert(delay_us_ > 0); + if (!wait(initial_delay_us_)) { + return; + } + do { + function_(); +#ifndef NDEBUG + { + InstrumentedMutexLock l(&mutex_); + run_count_++; + cond_var_.SignalAll(); + } +#endif + } while (wait(delay_us_)); + } + + const std::function function_; + const std::string thread_name_; + SystemClock* clock_; + const uint64_t delay_us_; + const uint64_t initial_delay_us_; + + // Mutex lock should be held when accessing running_, waiting_ + // and run_count_. + InstrumentedMutex mutex_; + InstrumentedCondVar cond_var_; + bool running_; +#ifndef NDEBUG + // RepeatableThread waiting for timeout. + bool waiting_; + // Times function_ had run. 
+ uint64_t run_count_; +#endif + port::Thread thread_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/repeatable_thread_test.cc b/src/rocksdb/util/repeatable_thread_test.cc new file mode 100644 index 000000000..0b3e95464 --- /dev/null +++ b/src/rocksdb/util/repeatable_thread_test.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/repeatable_thread.h" + +#include +#include + +#include "db/db_test_util.h" +#include "test_util/mock_time_env.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" + +class RepeatableThreadTest : public testing::Test { + public: + RepeatableThreadTest() + : mock_clock_(std::make_shared( + ROCKSDB_NAMESPACE::SystemClock::Default())) {} + + protected: + std::shared_ptr mock_clock_; +}; + +TEST_F(RepeatableThreadTest, TimedTest) { + constexpr uint64_t kSecond = 1000000; // 1s = 1000000us + constexpr int kIteration = 3; + const auto& clock = ROCKSDB_NAMESPACE::SystemClock::Default(); + ROCKSDB_NAMESPACE::port::Mutex mutex; + ROCKSDB_NAMESPACE::port::CondVar test_cv(&mutex); + int count = 0; + uint64_t prev_time = clock->NowMicros(); + ROCKSDB_NAMESPACE::RepeatableThread thread( + [&] { + ROCKSDB_NAMESPACE::MutexLock l(&mutex); + count++; + uint64_t now = clock->NowMicros(); + assert(count == 1 || prev_time + 1 * kSecond <= now); + prev_time = now; + if (count >= kIteration) { + test_cv.SignalAll(); + } + }, + "rt_test", clock.get(), 1 * kSecond); + // Wait for execution finish. + { + ROCKSDB_NAMESPACE::MutexLock l(&mutex); + while (count < kIteration) { + test_cv.Wait(); + } + } + + // Test cancel + thread.cancel(); +} + +TEST_F(RepeatableThreadTest, MockEnvTest) { + constexpr uint64_t kSecond = 1000000; // 1s = 1000000us + constexpr int kIteration = 3; + mock_clock_->SetCurrentTime(0); // in seconds + std::atomic count{0}; + +#if defined(OS_MACOSX) && !defined(NDEBUG) + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) { + // Obtain the current (real) time in seconds and add 1000 extra seconds + // to ensure that RepeatableThread::wait invokes TimedWait with a time + // greater than (real) current time. This is to prevent the TimedWait + // function from returning immediately without sleeping and releasing + // the mutex on certain platforms, e.g. OS X. If TimedWait returns + // immediately, the mutex will not be released, and + // RepeatableThread::TEST_WaitForRun never has a chance to execute the + // callback which, in this case, updates the result returned by + // mock_clock->NowMicros. Consequently, RepeatableThread::wait cannot + // break out of the loop, causing test to hang. The extra 1000 seconds + // is a best-effort approach because there seems no reliable and + // deterministic way to provide the aforementioned guarantee. By the + // time RepeatableThread::wait is called, it is no guarantee that the + // delay + mock_clock->NowMicros will be greater than the current real + // time. However, 1000 seconds should be sufficient in most cases. 
+ uint64_t time_us = *reinterpret_cast(arg); + if (time_us < mock_clock_->RealNowMicros()) { + *reinterpret_cast(arg) = + mock_clock_->RealNowMicros() + 1000; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); +#endif // OS_MACOSX && !NDEBUG + + ROCKSDB_NAMESPACE::RepeatableThread thread( + [&] { count++; }, "rt_test", mock_clock_.get(), 1 * kSecond, 1 * kSecond); + for (int i = 1; i <= kIteration; i++) { + // Bump current time + thread.TEST_WaitForRun([&] { mock_clock_->SetCurrentTime(i); }); + } + // Test function should be exectued exactly kIteraion times. + ASSERT_EQ(kIteration, count.load()); + + // Test cancel + thread.cancel(); +} + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/ribbon_alg.h b/src/rocksdb/util/ribbon_alg.h new file mode 100644 index 000000000..f9afefc23 --- /dev/null +++ b/src/rocksdb/util/ribbon_alg.h @@ -0,0 +1,1225 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "util/math128.h" + +namespace ROCKSDB_NAMESPACE { + +namespace ribbon { + +// RIBBON PHSF & RIBBON Filter (Rapid Incremental Boolean Banding ON-the-fly) +// +// ribbon_alg.h: generic versions of core algorithms. +// +// Ribbon is a Perfect Hash Static Function construction useful as a compact +// static Bloom filter alternative. It combines (a) a boolean (GF(2)) linear +// system construction that approximates a Band Matrix with hashing, +// (b) an incremental, on-the-fly Gaussian Elimination algorithm that is +// remarkably efficient and adaptable at constructing an upper-triangular +// band matrix from a set of band-approximating inputs from (a), and +// (c) a storage layout that is fast and adaptable as a filter. +// +// Footnotes: (a) "Efficient Gauss Elimination for Near-Quadratic Matrices +// with One Short Random Block per Row, with Applications" by Stefan +// Walzer and Martin Dietzfelbinger ("DW paper") +// (b) developed by Peter C. Dillinger, though not the first on-the-fly +// GE algorithm. See "On the fly Gaussian Elimination for LT codes" by +// Bioglio, Grangetto, Gaeta, and Sereno. +// (c) see "interleaved" solution storage below. +// +// See ribbon_impl.h for high-level behavioral summary. This file focuses +// on the core design details. +// +// ###################################################################### +// ################# PHSF -> static filter reduction #################### +// +// A Perfect Hash Static Function is a data structure representing a +// map from anything hashable (a "key") to values of some fixed size. +// Crucially, it is allowed to return garbage values for anything not in +// the original set of map keys, and it is a "static" structure: entries +// cannot be added or deleted after construction. PHSFs representing n +// mappings to b-bit values (assume uniformly distributed) require at least +// n * b bits to represent, or at least b bits per entry. We typically +// describe the compactness of a PHSF by typical bits per entry as some +// function of b. 
For example, the MWHC construction (k=3 "peeling") +// requires about 1.0222*b and a variant called Xor+ requires about +// 1.08*b + 0.5 bits per entry. +// +// With more hashing, a PHSF can over-approximate a set as a Bloom filter +// does, with no FN queries and predictable false positive (FP) query +// rate. Instead of the user providing a value to map each input key to, +// a hash function provides the value. Keys in the original set will +// return a positive membership query because the underlying PHSF returns +// the same value as hashing the key. When a key is not in the original set, +// the PHSF returns a "garbage" value, which is only equal to the key's +// hash with (false positive) probability 1 in 2^b. +// +// For a matching false positive rate, standard Bloom filters require +// 1.44*b bits per entry. Cache-local Bloom filters (like bloom_impl.h) +// require a bit more, around 1.5*b bits per entry. Thus, a Bloom +// alternative could save up to or nearly 1/3rd of memory and storage +// that RocksDB uses for SST (static) Bloom filters. (Memtable Bloom filter +// is dynamic.) +// +// Recommended reading: +// "Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters" +// by Graf and Lemire +// First three sections of "Fast Scalable Construction of (Minimal +// Perfect Hash) Functions" by Genuzio, Ottaviano, and Vigna +// +// ###################################################################### +// ################## PHSF vs. hash table vs. Bloom ##################### +// +// You can think of traditional hash tables and related filter variants +// such as Cuckoo filters as utilizing an "OR" construction: a hash +// function associates a key with some slots and the data is returned if +// the data is found in any one of those slots. The collision resolution +// is visible in the final data structure and requires extra information. +// For example, Cuckoo filter uses roughly 1.05b + 2 bits per entry, and +// Golomb-Rice code (aka "GCS") as little as b + 1.5. When the data +// structure associates each input key with data in one slot, the +// structure implicitly constructs a (near-)minimal (near-)perfect hash +// (MPH) of the keys, which requires at least 1.44 bits per key to +// represent. This is why approaches with visible collision resolution +// have a fixed + 1.5 or more in storage overhead per entry, often in +// addition to an overhead multiplier on b. +// +// By contrast Bloom filters utilize an "AND" construction: a query only +// returns true if all bit positions associated with a key are set to 1. +// There is no collision resolution, so Bloom filters do not suffer a +// fixed bits per entry overhead like the above structures. +// +// PHSFs typically use a bitwise XOR construction: the data you want is +// not in a single slot, but in a linear combination of several slots. +// For static data, this gives the best of "AND" and "OR" constructions: +// avoids the +1.44 or more fixed overhead by not approximating a MPH and +// can do much better than Bloom's 1.44 factor on b with collision +// resolution, which here is done ahead of time and invisible at query +// time. +// +// ###################################################################### +// ######################## PHSF construction ########################### +// +// For a typical PHSF, construction is solving a linear system of +// equations, typically in GF(2), which is to say that values are boolean +// and XOR serves both as addition and subtraction. 
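+// (A tiny, hypothetical illustration of that property, not part of this
+// file's API: for two equal-width bit vectors a and b,
+//   a ^= b;  // "adds" b to a in GF(2)
+//   a ^= b;  // "subtracts" it again, restoring the original a
+// so the same XOR row operation serves as both addition and subtraction
+// in the eliminations described below.)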
We can use matrices to +// represent the problem: +// +// C * S = R +// (n x m) (m x b) (n x b) +// where C = coefficients, S = solution, R = results +// and solving for S given C and R. +// +// Note that C and R each have n rows, one for each input entry for the +// PHSF. A row in C is given by a hash function on the PHSF input key, +// and the corresponding row in R is the b-bit value to associate with +// that input key. (In a filter, rows of R are given by another hash +// function on the input key.) +// +// On solving, the matrix S (solution) is the final PHSF data, as it +// maps any row from the original C to its corresponding desired result +// in R. We just have to hash our query inputs and compute a linear +// combination of rows in S. +// +// In theory, we could chose m = n and let a hash function associate +// each input key with random rows in C. A solution exists with high +// probability, and uses essentially minimum space, b bits per entry +// (because we set m = n) but this has terrible scaling, something +// like O(n^2) space and O(n^3) time during construction (Gaussian +// elimination) and O(n) query time. But computational efficiency is +// key, and the core of this is avoiding scanning all of S to answer +// each query. +// +// The traditional approach (MWHC, aka Xor filter) starts with setting +// only some small fixed number of columns (typically k=3) to 1 for each +// row of C, with remaining entries implicitly 0. This is implemented as +// three hash functions over [0,m), and S can be implemented as a vector +// of b-bit values. Now, a query only involves looking up k rows +// (values) in S and computing their bitwise XOR. Additionally, this +// construction can use a linear time algorithm called "peeling" for +// finding a solution in many cases of one existing, but peeling +// generally requires a larger space overhead factor in the solution +// (m/n) than is required with Gaussian elimination. +// +// Recommended reading: +// "Peeling Close to the Orientability Threshold - Spatial Coupling in +// Hashing-Based Data Structures" by Stefan Walzer +// +// ###################################################################### +// ##################### Ribbon PHSF construction ####################### +// +// Ribbon constructs coefficient rows essentially the same as in the +// Walzer/Dietzfelbinger paper cited above: for some chosen fixed width +// r (kCoeffBits in code), each key is hashed to a starting column in +// [0, m - r] (GetStart() in code) and an r-bit sequence of boolean +// coefficients (GetCoeffRow() in code). If you sort the rows by start, +// the C matrix would look something like this: +// +// [####00000000000000000000] +// [####00000000000000000000] +// [000####00000000000000000] +// [0000####0000000000000000] +// [0000000####0000000000000] +// [000000000####00000000000] +// [000000000####00000000000] +// [0000000000000####0000000] +// [0000000000000000####0000] +// [00000000000000000####000] +// [00000000000000000000####] +// +// where each # could be a 0 or 1, chosen uniformly by a hash function. +// (Except we typically set the start column value to 1.) 
This scheme +// uses hashing to approximate a band matrix, and it has a solution iff +// it reduces to an upper-triangular boolean r-band matrix, like this: +// +// [1###00000000000000000000] +// [01##00000000000000000000] +// [000000000000000000000000] +// [0001###00000000000000000] +// [000000000000000000000000] +// [000001##0000000000000000] +// [000000000000000000000000] +// [00000001###0000000000000] +// [000000001###000000000000] +// [0000000001##000000000000] +// ... +// [00000000000000000000001#] +// [000000000000000000000001] +// +// where we have expanded to an m x m matrix by filling with rows of +// all zeros as needed. As in Gaussian elimination, this form is ready for +// generating a solution through back-substitution. +// +// The awesome thing about the Ribbon construction (from the DW paper) is +// how row reductions keep each row representable as a start column and +// r coefficients, because row reductions are only needed when two rows +// have the same number of leading zero columns. Thus, the combination +// of those rows, the bitwise XOR of the r-bit coefficient rows, cancels +// out the leading 1s, so starts (at least) one column later and only +// needs (at most) r - 1 coefficients. +// +// ###################################################################### +// ###################### Ribbon PHSF scalability ####################### +// +// Although more practical detail is in ribbon_impl.h, it's worth +// understanding some of the overall benefits and limitations of the +// Ribbon PHSFs. +// +// High-end scalability is a primary issue for Ribbon PHSFs, because in +// a single Ribbon linear system with fixed r and fixed m/n ratio, the +// solution probability approaches zero as n approaches infinity. +// For a given n, solution probability improves with larger r and larger +// m/n. +// +// By contrast, peeling-based PHSFs have somewhat worse storage ratio +// or solution probability for small n (less than ~1000). This is +// especially true with spatial-coupling, where benefits are only +// notable for n on the order of 100k or 1m or more. +// +// To make best use of current hardware, r=128 seems to be closest to +// a "generally good" choice for Ribbon, at least in RocksDB where SST +// Bloom filters typically hold around 10-100k keys, and almost always +// less than 10m keys. r=128 ribbon has a high chance of encoding success +// (with first hash seed) when storage overhead is around 5% (m/n ~ 1.05) +// for roughly 10k - 10m keys in a single linear system. r=64 only scales +// up to about 10k keys with the same storage overhead. Construction and +// access times for r=128 are similar to r=64. r=128 tracks nearly +// twice as much data during construction, but in most cases we expect +// the scalability benefits of r=128 vs. r=64 to make it preferred. +// +// A natural approach to scaling Ribbon beyond ~10m keys is splitting +// (or "sharding") the inputs into multiple linear systems with their +// own hash seeds. This can also help to control peak memory consumption. +// TODO: much more to come +// +// ###################################################################### +// #################### Ribbon on-the-fly banding ####################### +// +// "Banding" is what we call the process of reducing the inputs to an +// upper-triangular r-band matrix ready for finishing a solution with +// back-substitution. 
Although the DW paper presents an algorithm for +// this ("SGauss"), the awesome properties of their construction enable +// an even simpler, faster, and more backtrackable algorithm. In simplest +// terms, the SGauss algorithm requires sorting the inputs by start +// columns, but it's possible to make Gaussian elimination resemble hash +// table insertion! +// +// The enhanced algorithm is based on these observations: +// - When processing a coefficient row with first 1 in column j, +// - If it's the first at column j to be processed, it can be part of +// the banding at row j. (And that decision never overwritten, with +// no loss of generality!) +// - Else, it can be combined with existing row j and re-processed, +// which will look for a later "empty" row or reach "no solution". +// +// We call our banding algorithm "incremental" and "on-the-fly" because +// (like hash table insertion) we are "finished" after each input +// processed, with respect to all inputs processed so far. Although the +// band matrix is an intermediate step to the solution structure, we have +// eliminated intermediate steps and unnecessary data tracking for +// banding. +// +// Building on "incremental" and "on-the-fly", the banding algorithm is +// easily backtrackable because no (non-empty) rows are overwritten in +// the banding. Thus, if we want to "try" adding an additional set of +// inputs to the banding, we only have to record which rows were written +// in order to efficiently backtrack to our state before considering +// the additional set. (TODO: how this can mitigate scalability and +// reach sub-1% overheads) +// +// Like in a linear-probed hash table, as the occupancy approaches and +// surpasses 90-95%, collision resolution dominates the construction +// time. (Ribbon doesn't usually pay at query time; see solution +// storage below.) This means that we can speed up construction time +// by using a higher m/n ratio, up to negative returns around 1.2. +// At m/n ~= 1.2, which still saves memory substantially vs. Bloom +// filter's 1.5, construction speed (including back-substitution) is not +// far from sorting speed, but still a few times slower than cache-local +// Bloom construction speed. +// +// Back-substitution from an upper-triangular boolean band matrix is +// especially fast and easy. All the memory accesses are sequential or at +// least local, no random. If the number of result bits (b) is a +// compile-time constant, the back-substitution state can even be tracked +// in CPU registers. Regardless of the solution representation, we prefer +// column-major representation for tracking back-substitution state, as +// r (the band width) will typically be much larger than b (result bits +// or columns), so better to handle r-bit values b times (per solution +// row) than b-bit values r times. +// +// ###################################################################### +// ##################### Ribbon solution storage ######################## +// +// Row-major layout is typical for boolean (bit) matrices, including for +// MWHC (Xor) filters where a query combines k b-bit values, and k is +// typically smaller than b. Even for k=4 and b=2, at least k=4 random +// look-ups are required regardless of layout. 
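+//
+// As a concrete, schematic point of reference, a row-major Xor-style filter
+// query with k=3 could be sketched as follows. This is illustrative only,
+// with hypothetical names, and not taken from any particular implementation;
+// it assumes a flat table of 8-bit (b=8) fingerprints:
+//
+//   uint8_t XorStyleQuery(uint64_t h0, uint64_t h1, uint64_t h2,
+//                         const uint8_t* table, size_t num_slots) {
+//     // k = 3 essentially random look-ups, one per hash, combined with XOR
+//     return table[h0 % num_slots] ^ table[h1 % num_slots] ^
+//            table[h2 % num_slots];
+//   }
+//
+// A query is judged positive iff the returned value equals the key's own
+// 8-bit fingerprint hash, for a false positive rate of roughly 1 in 2^8.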
+// +// Ribbon PHSFs are quite different, however, because +// (a) all of the solution rows relevant to a query are within a single +// range of r rows, and +// (b) the number of solution rows involved (r/2 on average, or r if +// avoiding conditional accesses) is typically much greater than +// b, the number of solution columns. +// +// Row-major for Ribbon PHSFs therefore tends to incur undue CPU overhead +// by processing (up to) r entries of b bits each, where b is typically +// less than 10 for filter applications. +// +// Column-major layout has poor locality because of accessing up to b +// memory locations in different pages (and obviously cache lines). Note +// that negative filter queries do not typically need to access all +// solution columns, as they can return when a mismatch is found in any +// result/solution column. This optimization doesn't always pay off on +// recent hardware, where the penalty for unpredictable conditional +// branching can exceed the penalty for unnecessary work, but the +// optimization is essentially unavailable with row-major layout. +// +// The best compromise seems to be interleaving column-major on the small +// scale with row-major on the large scale. For example, let a solution +// "block" be r rows column-major encoded as b r-bit values in sequence. +// Each query accesses (up to) 2 adjacent blocks, which will typically +// span 1-3 cache lines in adjacent memory. We get very close to the same +// locality as row-major, but with much faster reconstruction of each +// result column, at least for filter applications where b is relatively +// small and negative queries can return early. +// +// ###################################################################### +// ###################### Fractional result bits ######################## +// +// Bloom filters have great flexibility that alternatives mostly do not +// have. One of those flexibilities is in utilizing any ratio of data +// structure bits per key. With a typical memory allocator like jemalloc, +// this flexibility can save roughly 10% of the filters' footprint in +// DRAM by rounding up and down filter sizes to minimize memory internal +// fragmentation (see optimize_filters_for_memory RocksDB option). +// +// At first glance, PHSFs only offer a whole number of bits per "slot" +// (m rather than number of keys n), but coefficient locality in the +// Ribbon construction makes fractional bits/key quite possible and +// attractive for filter applications. This works by a prefix of the +// structure using b-1 solution columns and the rest using b solution +// columns. See InterleavedSolutionStorage below for more detail. +// +// Because false positive rates are non-linear in bits/key, this approach +// is not quite optimal in terms of information theory. In common cases, +// we see additional space overhead up to about 1.5% vs. theoretical +// optimal to achieve the same FP rate. We consider this a quite acceptable +// overhead for very efficiently utilizing space that might otherwise be +// wasted. +// +// This property of Ribbon even makes it "elastic." A Ribbon filter and +// its small metadata for answering queries can be adapted into another +// Ribbon filter filling any smaller multiple of r bits (plus small +// metadata), with a correspondingly higher FP rate. None of the data +// thrown away during construction needs to be recalled for this reduction. 
+// Similarly a single Ribbon construction can be separated (by solution +// column) into two or more structures (or "layers" or "levels") with +// independent filtering ability (no FP correlation, just as solution or +// result columns in a single structure) despite being constructed as part +// of a single linear system. (TODO: implement) +// See also "ElasticBF: Fine-grained and Elastic Bloom Filter Towards +// Efficient Read for LSM-tree-based KV Stores." +// + +// ###################################################################### +// ################### CODE: Ribbon core algorithms ##################### +// ###################################################################### +// +// These algorithms are templatized for genericity but near-maximum +// performance in a given application. The template parameters +// adhere to informal class/struct type concepts outlined below. (This +// code is written for C++11 so does not use formal C++ concepts.) + +// Rough architecture for these algorithms: +// +// +-----------+ +---+ +-----------------+ +// | AddInputs | --> | H | --> | BandingStorage | +// +-----------+ | a | +-----------------+ +// | s | | +// | h | Back substitution +// | e | V +// +-----------+ | r | +-----------------+ +// | Query Key | --> | | >+< | SolutionStorage | +// +-----------+ +---+ | +-----------------+ +// V +// Query result + +// Common to other concepts +// concept RibbonTypes { +// // An unsigned integer type for an r-bit subsequence of coefficients. +// // r (or kCoeffBits) is taken to be sizeof(CoeffRow) * 8, as it would +// // generally only hurt scalability to leave bits of CoeffRow unused. +// typename CoeffRow; +// // An unsigned integer type big enough to hold a result row (b bits, +// // or number of solution/result columns). +// // In many applications, especially filters, the number of result +// // columns is decided at run time, so ResultRow simply needs to be +// // big enough for the largest number of columns allowed. +// typename ResultRow; +// // An unsigned integer type sufficient for representing the number of +// // rows in the solution structure, and at least the arithmetic +// // promotion size (usually 32 bits). uint32_t recommended because a +// // single Ribbon construction doesn't really scale to billions of +// // entries. +// typename Index; +// }; + +// ###################################################################### +// ######################## Hashers and Banding ######################### + +// Hasher concepts abstract out hashing details. + +// concept PhsfQueryHasher extends RibbonTypes { +// // Type for a lookup key, which is hashable. +// typename Key; +// +// // Type for hashed summary of a Key. uint64_t is recommended. +// typename Hash; +// +// // Compute a hash value summarizing a Key +// Hash GetHash(const Key &) const; +// +// // Given a hash value and a number of columns that can start an +// // r-sequence of coefficients (== m - r + 1), return the start +// // column to associate with that hash value. (Starts can be chosen +// // uniformly or "smash" extra entries into the beginning and end for +// // better utilization at those extremes of the structure. Details in +// // ribbon.impl.h) +// Index GetStart(Hash, Index num_starts) const; +// +// // Given a hash value, return the r-bit sequence of coefficients to +// // associate with it. 
It's generally OK if +// // sizeof(CoeffRow) > sizeof(Hash) +// // as long as the hash itself is not too prone to collisions for the +// // applications and the CoeffRow is generated uniformly from +// // available hash data, but relatively independent of the start. +// // +// // Must be non-zero, because that's required for a solution to exist +// // when mapping to non-zero result row. (Note: BandingAdd could be +// // modified to allow 0 coeff row if that only occurs with 0 result +// // row, which really only makes sense for filter implementation, +// // where both values are hash-derived. Or BandingAdd could reject 0 +// // coeff row, forcing next seed, but that has potential problems with +// // generality/scalability.) +// CoeffRow GetCoeffRow(Hash) const; +// }; + +// concept FilterQueryHasher extends PhsfQueryHasher { +// // For building or querying a filter, this returns the expected +// // result row associated with a hashed input. For general PHSF, +// // this must return 0. +// // +// // Although not strictly required, there's a slightly better chance of +// // solver success if result row is masked down here to only the bits +// // actually needed. +// ResultRow GetResultRowFromHash(Hash) const; +// } + +// concept BandingHasher extends FilterQueryHasher { +// // For a filter, this will generally be the same as Key. +// // For a general PHSF, it must either +// // (a) include a key and a result it maps to (e.g. in a std::pair), or +// // (b) GetResultRowFromInput looks up the result somewhere rather than +// // extracting it. +// typename AddInput; +// +// // Instead of requiring a way to extract a Key from an +// // AddInput, we require getting the hash of the Key part +// // of an AddInput, which is trivial if AddInput == Key. +// Hash GetHash(const AddInput &) const; +// +// // For building a non-filter PHSF, this extracts or looks up the result +// // row to associate with an input. For filter PHSF, this must return 0. +// ResultRow GetResultRowFromInput(const AddInput &) const; +// +// // Whether the solver can assume the lowest bit of GetCoeffRow is +// // always 1. When true, it should improve solver efficiency slightly. +// static bool kFirstCoeffAlwaysOne; +// } + +// Abstract storage for the the result of "banding" the inputs (Gaussian +// elimination to an upper-triangular boolean band matrix). Because the +// banding is an incremental / on-the-fly algorithm, this also represents +// all the intermediate state between input entries. +// +// concept BandingStorage extends RibbonTypes { +// // Tells the banding algorithm to prefetch memory associated with +// // the next input before processing the current input. Generally +// // recommended iff the BandingStorage doesn't easily fit in CPU +// // cache. +// bool UsePrefetch() const; +// +// // Prefetches (e.g. __builtin_prefetch) memory associated with a +// // slot index i. +// void Prefetch(Index i) const; +// +// // Load or store CoeffRow and ResultRow for slot index i. +// // (Gaussian row operations involve both sides of the equation.) +// // Bool `for_back_subst` indicates that customizing values for +// // unconstrained solution rows (cr == 0) is allowed. +// void LoadRow(Index i, CoeffRow *cr, ResultRow *rr, bool for_back_subst) +// const; +// void StoreRow(Index i, CoeffRow cr, ResultRow rr); +// +// // Returns the number of columns that can start an r-sequence of +// // coefficients, which is the number of slots minus r (kCoeffBits) +// // plus one. 
(m - r + 1) +// Index GetNumStarts() const; +// }; + +// Optional storage for backtracking data in banding a set of input +// entries. It exposes an array structure which will generally be +// used as a stack. It must be able to accommodate as many entries +// as are passed in as inputs to `BandingAddRange`. +// +// concept BacktrackStorage extends RibbonTypes { +// // If false, backtracking support will be disabled in the algorithm. +// // This should preferably be an inline compile-time constant function. +// bool UseBacktrack() const; +// +// // Records `to_save` as the `i`th backtrack entry +// void BacktrackPut(Index i, Index to_save); +// +// // Recalls the `i`th backtrack entry +// Index BacktrackGet(Index i) const; +// } + +// Adds a single entry to BandingStorage (and optionally, BacktrackStorage), +// returning true if successful or false if solution is impossible with +// current hasher (and presumably its seed) and number of "slots" (solution +// or banding rows). (A solution is impossible when there is a linear +// dependence among the inputs that doesn't "cancel out".) +// +// Pre- and post-condition: the BandingStorage represents a band matrix +// ready for back substitution (row echelon form except for zero rows), +// augmented with result values such that back substitution would give a +// solution satisfying all the cr@start -> rr entries added. +template +bool BandingAdd(BandingStorage *bs, typename BandingStorage::Index start, + typename BandingStorage::ResultRow rr, + typename BandingStorage::CoeffRow cr, BacktrackStorage *bts, + typename BandingStorage::Index *backtrack_pos) { + using CoeffRow = typename BandingStorage::CoeffRow; + using ResultRow = typename BandingStorage::ResultRow; + using Index = typename BandingStorage::Index; + + Index i = start; + + if (!kFirstCoeffAlwaysOne) { + // Requires/asserts that cr != 0 + int tz = CountTrailingZeroBits(cr); + i += static_cast(tz); + cr >>= tz; + } + + for (;;) { + assert((cr & 1) == 1); + CoeffRow cr_at_i; + ResultRow rr_at_i; + bs->LoadRow(i, &cr_at_i, &rr_at_i, /* for_back_subst */ false); + if (cr_at_i == 0) { + bs->StoreRow(i, cr, rr); + bts->BacktrackPut(*backtrack_pos, i); + ++*backtrack_pos; + return true; + } + assert((cr_at_i & 1) == 1); + // Gaussian row reduction + cr ^= cr_at_i; + rr ^= rr_at_i; + if (cr == 0) { + // Inconsistency or (less likely) redundancy + break; + } + // Find relative offset of next non-zero coefficient. + int tz = CountTrailingZeroBits(cr); + i += static_cast(tz); + cr >>= tz; + } + + // Failed, unless result row == 0 because e.g. a duplicate input or a + // stock hash collision, with same result row. (For filter, stock hash + // collision implies same result row.) Or we could have a full equation + // equal to sum of other equations, which is very possible with + // small range of values for result row. + return rr == 0; +} + +// Adds a range of entries to BandingStorage returning true if successful +// or false if solution is impossible with current hasher (and presumably +// its seed) and number of "slots" (solution or banding rows). (A solution +// is impossible when there is a linear dependence among the inputs that +// doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs. +// +// If UseBacktrack in the BacktrackStorage, this function call rolls back +// to prior state on failure. If !UseBacktrack, some subset of the entries +// will have been added to the BandingStorage, so best considered to be in +// an indeterminate state. 
+// +template +bool BandingAddRange(BandingStorage *bs, BacktrackStorage *bts, + const BandingHasher &bh, InputIterator begin, + InputIterator end) { + using CoeffRow = typename BandingStorage::CoeffRow; + using Index = typename BandingStorage::Index; + using ResultRow = typename BandingStorage::ResultRow; + using Hash = typename BandingHasher::Hash; + + static_assert(IsUnsignedUpTo128::value, "must be unsigned"); + static_assert(IsUnsignedUpTo128::value, "must be unsigned"); + static_assert(IsUnsignedUpTo128::value, "must be unsigned"); + + constexpr bool kFCA1 = BandingHasher::kFirstCoeffAlwaysOne; + + if (begin == end) { + // trivial + return true; + } + + const Index num_starts = bs->GetNumStarts(); + + InputIterator cur = begin; + Index backtrack_pos = 0; + if (!bs->UsePrefetch()) { + // Simple version, no prefetch + for (;;) { + Hash h = bh.GetHash(*cur); + Index start = bh.GetStart(h, num_starts); + ResultRow rr = + bh.GetResultRowFromInput(*cur) | bh.GetResultRowFromHash(h); + CoeffRow cr = bh.GetCoeffRow(h); + + if (!BandingAdd(bs, start, rr, cr, bts, &backtrack_pos)) { + break; + } + if ((++cur) == end) { + return true; + } + } + } else { + // Pipelined w/prefetch + // Prime the pipeline + Hash h = bh.GetHash(*cur); + Index start = bh.GetStart(h, num_starts); + ResultRow rr = bh.GetResultRowFromInput(*cur); + bs->Prefetch(start); + + // Pipeline + for (;;) { + rr |= bh.GetResultRowFromHash(h); + CoeffRow cr = bh.GetCoeffRow(h); + if ((++cur) == end) { + if (!BandingAdd(bs, start, rr, cr, bts, &backtrack_pos)) { + break; + } + return true; + } + Hash next_h = bh.GetHash(*cur); + Index next_start = bh.GetStart(next_h, num_starts); + ResultRow next_rr = bh.GetResultRowFromInput(*cur); + bs->Prefetch(next_start); + if (!BandingAdd(bs, start, rr, cr, bts, &backtrack_pos)) { + break; + } + h = next_h; + start = next_start; + rr = next_rr; + } + } + // failed; backtrack (if implemented) + if (bts->UseBacktrack()) { + while (backtrack_pos > 0) { + --backtrack_pos; + Index i = bts->BacktrackGet(backtrack_pos); + // Clearing the ResultRow is not strictly required, but is required + // for good FP rate on inputs that might have been backtracked out. + // (We don't want anything we've backtracked on to leak into final + // result, as that might not be "harmless".) + bs->StoreRow(i, 0, 0); + } + } + return false; +} + +// Adds a range of entries to BandingStorage returning true if successful +// or false if solution is impossible with current hasher (and presumably +// its seed) and number of "slots" (solution or banding rows). (A solution +// is impossible when there is a linear dependence among the inputs that +// doesn't "cancel out".) Here "InputIterator" is an iterator over AddInputs. +// +// On failure, some subset of the entries will have been added to the +// BandingStorage, so best considered to be in an indeterminate state. 
+// +template +bool BandingAddRange(BandingStorage *bs, const BandingHasher &bh, + InputIterator begin, InputIterator end) { + using Index = typename BandingStorage::Index; + struct NoopBacktrackStorage { + bool UseBacktrack() { return false; } + void BacktrackPut(Index, Index) {} + Index BacktrackGet(Index) { + assert(false); + return 0; + } + } nbts; + return BandingAddRange(bs, &nbts, bh, begin, end); +} + +// ###################################################################### +// ######################### Solution Storage ########################### + +// Back-substitution and query algorithms unfortunately depend on some +// details of data layout in the final data structure ("solution"). Thus, +// there is no common SolutionStorage covering all the reasonable +// possibilities. + +// ###################### SimpleSolutionStorage ######################### + +// SimpleSolutionStorage is for a row-major storage, typically with no +// unused bits in each ResultRow. This is mostly for demonstration +// purposes as the simplest solution storage scheme. It is relatively slow +// for filter queries. + +// concept SimpleSolutionStorage extends RibbonTypes { +// // This is called at the beginning of back-substitution for the +// // solution storage to do any remaining configuration before data +// // is stored to it. If configuration is previously finalized, this +// // could be a simple assertion or even no-op. Ribbon algorithms +// // only call this from back-substitution, and only once per call, +// // before other functions here. +// void PrepareForNumStarts(Index num_starts) const; +// // Must return num_starts passed to PrepareForNumStarts, or the most +// // recent call to PrepareForNumStarts if this storage object can be +// // reused. Note that num_starts == num_slots - kCoeffBits + 1 because +// // there must be a run of kCoeffBits slots starting from each start. +// Index GetNumStarts() const; +// // Load the solution row (type ResultRow) for a slot +// ResultRow Load(Index slot_num) const; +// // Store the solution row (type ResultRow) for a slot +// void Store(Index slot_num, ResultRow data); +// }; + +// Back-substitution for generating a solution from BandingStorage to +// SimpleSolutionStorage. +template +void SimpleBackSubst(SimpleSolutionStorage *sss, const BandingStorage &bs) { + using CoeffRow = typename BandingStorage::CoeffRow; + using Index = typename BandingStorage::Index; + using ResultRow = typename BandingStorage::ResultRow; + + static_assert(sizeof(Index) == sizeof(typename SimpleSolutionStorage::Index), + "must be same"); + static_assert( + sizeof(CoeffRow) == sizeof(typename SimpleSolutionStorage::CoeffRow), + "must be same"); + static_assert( + sizeof(ResultRow) == sizeof(typename SimpleSolutionStorage::ResultRow), + "must be same"); + + constexpr auto kCoeffBits = static_cast(sizeof(CoeffRow) * 8U); + constexpr auto kResultBits = static_cast(sizeof(ResultRow) * 8U); + + // A column-major buffer of the solution matrix, containing enough + // recently-computed solution data to compute the next solution row + // (based also on banding data). 
+ std::array state; + state.fill(0); + + const Index num_starts = bs.GetNumStarts(); + sss->PrepareForNumStarts(num_starts); + const Index num_slots = num_starts + kCoeffBits - 1; + + for (Index i = num_slots; i > 0;) { + --i; + CoeffRow cr; + ResultRow rr; + bs.LoadRow(i, &cr, &rr, /* for_back_subst */ true); + // solution row + ResultRow sr = 0; + for (Index j = 0; j < kResultBits; ++j) { + // Compute next solution bit at row i, column j (see derivation below) + CoeffRow tmp = state[j] << 1; + bool bit = (BitParity(tmp & cr) ^ ((rr >> j) & 1)) != 0; + tmp |= bit ? CoeffRow{1} : CoeffRow{0}; + + // Now tmp is solution at column j from row i for next kCoeffBits + // more rows. Thus, for valid solution, the dot product of the + // solution column with the coefficient row has to equal the result + // at that column, + // BitParity(tmp & cr) == ((rr >> j) & 1) + + // Update state. + state[j] = tmp; + // add to solution row + sr |= (bit ? ResultRow{1} : ResultRow{0}) << j; + } + sss->Store(i, sr); + } +} + +// Common functionality for querying a key (already hashed) in +// SimpleSolutionStorage. +template +typename SimpleSolutionStorage::ResultRow SimpleQueryHelper( + typename SimpleSolutionStorage::Index start_slot, + typename SimpleSolutionStorage::CoeffRow cr, + const SimpleSolutionStorage &sss) { + using CoeffRow = typename SimpleSolutionStorage::CoeffRow; + using ResultRow = typename SimpleSolutionStorage::ResultRow; + + constexpr unsigned kCoeffBits = static_cast(sizeof(CoeffRow) * 8U); + + ResultRow result = 0; + for (unsigned i = 0; i < kCoeffBits; ++i) { + // Bit masking whole value is generally faster here than 'if' + result ^= sss.Load(start_slot + i) & + (ResultRow{0} - (static_cast(cr >> i) & ResultRow{1})); + } + return result; +} + +// General PHSF query a key from SimpleSolutionStorage. +template +typename SimpleSolutionStorage::ResultRow SimplePhsfQuery( + const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher, + const SimpleSolutionStorage &sss) { + const typename PhsfQueryHasher::Hash hash = hasher.GetHash(key); + + static_assert(sizeof(typename SimpleSolutionStorage::Index) == + sizeof(typename PhsfQueryHasher::Index), + "must be same"); + static_assert(sizeof(typename SimpleSolutionStorage::CoeffRow) == + sizeof(typename PhsfQueryHasher::CoeffRow), + "must be same"); + + return SimpleQueryHelper(hasher.GetStart(hash, sss.GetNumStarts()), + hasher.GetCoeffRow(hash), sss); +} + +// Filter query a key from SimpleSolutionStorage. 
+template +bool SimpleFilterQuery(const typename FilterQueryHasher::Key &key, + const FilterQueryHasher &hasher, + const SimpleSolutionStorage &sss) { + const typename FilterQueryHasher::Hash hash = hasher.GetHash(key); + const typename SimpleSolutionStorage::ResultRow expected = + hasher.GetResultRowFromHash(hash); + + static_assert(sizeof(typename SimpleSolutionStorage::Index) == + sizeof(typename FilterQueryHasher::Index), + "must be same"); + static_assert(sizeof(typename SimpleSolutionStorage::CoeffRow) == + sizeof(typename FilterQueryHasher::CoeffRow), + "must be same"); + static_assert(sizeof(typename SimpleSolutionStorage::ResultRow) == + sizeof(typename FilterQueryHasher::ResultRow), + "must be same"); + + return expected == + SimpleQueryHelper(hasher.GetStart(hash, sss.GetNumStarts()), + hasher.GetCoeffRow(hash), sss); +} + +// #################### InterleavedSolutionStorage ###################### + +// InterleavedSolutionStorage is row-major at a high level, for good +// locality, and column-major at a low level, for CPU efficiency +// especially in filter queries or relatively small number of result bits +// (== solution columns). The storage is a sequence of "blocks" where a +// block has one CoeffRow-sized segment for each solution column. Each +// query spans at most two blocks; the starting solution row is typically +// in the row-logical middle of a block and spans to the middle of the +// next block. (See diagram below.) +// +// InterleavedSolutionStorage supports choosing b (number of result or +// solution columns) at run time, and even supports mixing b and b-1 solution +// columns in a single linear system solution, for filters that can +// effectively utilize any size space (multiple of CoeffRow) for minimizing +// FP rate for any number of added keys. To simplify query implementation +// (with lower-index columns first), the b-bit portion comes after the b-1 +// portion of the structure. +// +// Diagram (=== marks logical block boundary; b=4; ### is data used by a +// query crossing the b-1 to b boundary, each Segment has type CoeffRow): +// ... +// +======================+ +// | S e g m e n t col=0 | +// +----------------------+ +// | S e g m e n t col=1 | +// +----------------------+ +// | S e g m e n t col=2 | +// +======================+ +// | S e g m e n #########| +// +----------------------+ +// | S e g m e n #########| +// +----------------------+ +// | S e g m e n #########| +// +======================+ Result/solution columns: above = 3, below = 4 +// |#############t col=0 | +// +----------------------+ +// |#############t col=1 | +// +----------------------+ +// |#############t col=2 | +// +----------------------+ +// | S e g m e n t col=3 | +// +======================+ +// | S e g m e n t col=0 | +// +----------------------+ +// | S e g m e n t col=1 | +// +----------------------+ +// | S e g m e n t col=2 | +// +----------------------+ +// | S e g m e n t col=3 | +// +======================+ +// ... +// +// InterleavedSolutionStorage will be adapted by the algorithms from +// simple array-like segment storage. That array-like storage is templatized +// in part so that an implementation may choose to handle byte ordering +// at access time. +// +// concept InterleavedSolutionStorage extends RibbonTypes { +// // This is called at the beginning of back-substitution for the +// // solution storage to do any remaining configuration before data +// // is stored to it. 
If configuration is previously finalized, this +// // could be a simple assertion or even no-op. Ribbon algorithms +// // only call this from back-substitution, and only once per call, +// // before other functions here. +// void PrepareForNumStarts(Index num_starts) const; +// // Must return num_starts passed to PrepareForNumStarts, or the most +// // recent call to PrepareForNumStarts if this storage object can be +// // reused. Note that num_starts == num_slots - kCoeffBits + 1 because +// // there must be a run of kCoeffBits slots starting from each start. +// Index GetNumStarts() const; +// // The larger number of solution columns used (called "b" above). +// Index GetUpperNumColumns() const; +// // If returns > 0, then block numbers below that use +// // GetUpperNumColumns() - 1 columns per solution row, and the rest +// // use GetUpperNumColumns(). A block represents kCoeffBits "slots", +// // where all but the last kCoeffBits - 1 slots are also starts. And +// // a block contains a segment for each solution column. +// // An implementation may only support uniform columns per solution +// // row and return constant 0 here. +// Index GetUpperStartBlock() const; +// +// // ### "Array of segments" portion of API ### +// // The number of values of type CoeffRow used in this solution +// // representation. (This value can be inferred from the previous +// // three functions, but is expected at least for sanity / assertion +// // checking.) +// Index GetNumSegments() const; +// // Load an entry from the logical array of segments +// CoeffRow LoadSegment(Index segment_num) const; +// // Store an entry to the logical array of segments +// void StoreSegment(Index segment_num, CoeffRow data); +// }; + +// A helper for InterleavedBackSubst. +template +inline void BackSubstBlock(typename BandingStorage::CoeffRow *state, + typename BandingStorage::Index num_columns, + const BandingStorage &bs, + typename BandingStorage::Index start_slot) { + using CoeffRow = typename BandingStorage::CoeffRow; + using Index = typename BandingStorage::Index; + using ResultRow = typename BandingStorage::ResultRow; + + constexpr auto kCoeffBits = static_cast(sizeof(CoeffRow) * 8U); + + for (Index i = start_slot + kCoeffBits; i > start_slot;) { + --i; + CoeffRow cr; + ResultRow rr; + bs.LoadRow(i, &cr, &rr, /* for_back_subst */ true); + for (Index j = 0; j < num_columns; ++j) { + // Compute next solution bit at row i, column j (see derivation below) + CoeffRow tmp = state[j] << 1; + int bit = BitParity(tmp & cr) ^ ((rr >> j) & 1); + tmp |= static_cast(bit); + + // Now tmp is solution at column j from row i for next kCoeffBits + // more rows. Thus, for valid solution, the dot product of the + // solution column with the coefficient row has to equal the result + // at that column, + // BitParity(tmp & cr) == ((rr >> j) & 1) + + // Update state. + state[j] = tmp; + } + } +} + +// Back-substitution for generating a solution from BandingStorage to +// InterleavedSolutionStorage. 
+template +void InterleavedBackSubst(InterleavedSolutionStorage *iss, + const BandingStorage &bs) { + using CoeffRow = typename BandingStorage::CoeffRow; + using Index = typename BandingStorage::Index; + + static_assert( + sizeof(Index) == sizeof(typename InterleavedSolutionStorage::Index), + "must be same"); + static_assert( + sizeof(CoeffRow) == sizeof(typename InterleavedSolutionStorage::CoeffRow), + "must be same"); + + constexpr auto kCoeffBits = static_cast(sizeof(CoeffRow) * 8U); + + const Index num_starts = bs.GetNumStarts(); + // Although it might be nice to have a filter that returns "always false" + // when no key is added, we aren't specifically supporting that here + // because it would require another condition branch in the query. + assert(num_starts > 0); + iss->PrepareForNumStarts(num_starts); + + const Index num_slots = num_starts + kCoeffBits - 1; + assert(num_slots % kCoeffBits == 0); + const Index num_blocks = num_slots / kCoeffBits; + const Index num_segments = iss->GetNumSegments(); + + // For now upper, then lower + Index num_columns = iss->GetUpperNumColumns(); + const Index upper_start_block = iss->GetUpperStartBlock(); + + if (num_columns == 0) { + // Nothing to do, presumably because there's not enough space for even + // a single segment. + assert(num_segments == 0); + // When num_columns == 0, a Ribbon filter query will always return true, + // or a PHSF query always 0. + return; + } + + // We should be utilizing all available segments + assert(num_segments == (upper_start_block * (num_columns - 1)) + + ((num_blocks - upper_start_block) * num_columns)); + + // TODO: consider fixed-column specializations with stack-allocated state + + // A column-major buffer of the solution matrix, containing enough + // recently-computed solution data to compute the next solution row + // (based also on banding data). + std::unique_ptr state{new CoeffRow[num_columns]()}; + + Index block = num_blocks; + Index segment_num = num_segments; + while (block > upper_start_block) { + --block; + BackSubstBlock(state.get(), num_columns, bs, block * kCoeffBits); + segment_num -= num_columns; + for (Index i = 0; i < num_columns; ++i) { + iss->StoreSegment(segment_num + i, state[i]); + } + } + // Now (if applicable), region using lower number of columns + // (This should be optimized away if GetUpperStartBlock() returns + // constant 0.) + --num_columns; + while (block > 0) { + --block; + BackSubstBlock(state.get(), num_columns, bs, block * kCoeffBits); + segment_num -= num_columns; + for (Index i = 0; i < num_columns; ++i) { + iss->StoreSegment(segment_num + i, state[i]); + } + } + // Verify everything processed + assert(block == 0); + assert(segment_num == 0); +} + +// Prefetch memory for a key in InterleavedSolutionStorage. 
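// A small sketch of the segment accounting asserted in InterleavedBackSubst
// above (Index assumed to be uint32_t): with b = GetUpperNumColumns(), the
// first upper_start_block blocks carry b - 1 segments each and the remaining
// blocks carry b segments each. For example, 10 blocks with
// upper_start_block = 4 and b = 7 gives 4*6 + 6*7 = 66 segments.
inline uint32_t TotalSegmentsSketch(uint32_t num_blocks,
                                    uint32_t upper_start_block,
                                    uint32_t upper_num_columns /* b */) {
  return upper_start_block * (upper_num_columns - 1) +
         (num_blocks - upper_start_block) * upper_num_columns;
}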
+template +inline void InterleavedPrepareQuery( + const typename PhsfQueryHasher::Key &key, const PhsfQueryHasher &hasher, + const InterleavedSolutionStorage &iss, + typename PhsfQueryHasher::Hash *saved_hash, + typename InterleavedSolutionStorage::Index *saved_segment_num, + typename InterleavedSolutionStorage::Index *saved_num_columns, + typename InterleavedSolutionStorage::Index *saved_start_bit) { + using Hash = typename PhsfQueryHasher::Hash; + using CoeffRow = typename InterleavedSolutionStorage::CoeffRow; + using Index = typename InterleavedSolutionStorage::Index; + + static_assert(sizeof(Index) == sizeof(typename PhsfQueryHasher::Index), + "must be same"); + + const Hash hash = hasher.GetHash(key); + const Index start_slot = hasher.GetStart(hash, iss.GetNumStarts()); + + constexpr auto kCoeffBits = static_cast(sizeof(CoeffRow) * 8U); + + const Index upper_start_block = iss.GetUpperStartBlock(); + Index num_columns = iss.GetUpperNumColumns(); + Index start_block_num = start_slot / kCoeffBits; + Index segment_num = start_block_num * num_columns - + std::min(start_block_num, upper_start_block); + // Change to lower num columns if applicable. + // (This should not compile to a conditional branch.) + num_columns -= (start_block_num < upper_start_block) ? 1 : 0; + + Index start_bit = start_slot % kCoeffBits; + + Index segment_count = num_columns + (start_bit == 0 ? 0 : num_columns); + + iss.PrefetchSegmentRange(segment_num, segment_num + segment_count); + + *saved_hash = hash; + *saved_segment_num = segment_num; + *saved_num_columns = num_columns; + *saved_start_bit = start_bit; +} + +// General PHSF query from InterleavedSolutionStorage, using data for +// the query key from InterleavedPrepareQuery +template +inline typename InterleavedSolutionStorage::ResultRow InterleavedPhsfQuery( + typename PhsfQueryHasher::Hash hash, + typename InterleavedSolutionStorage::Index segment_num, + typename InterleavedSolutionStorage::Index num_columns, + typename InterleavedSolutionStorage::Index start_bit, + const PhsfQueryHasher &hasher, const InterleavedSolutionStorage &iss) { + using CoeffRow = typename InterleavedSolutionStorage::CoeffRow; + using Index = typename InterleavedSolutionStorage::Index; + using ResultRow = typename InterleavedSolutionStorage::ResultRow; + + static_assert(sizeof(Index) == sizeof(typename PhsfQueryHasher::Index), + "must be same"); + static_assert(sizeof(CoeffRow) == sizeof(typename PhsfQueryHasher::CoeffRow), + "must be same"); + + constexpr auto kCoeffBits = static_cast(sizeof(CoeffRow) * 8U); + + const CoeffRow cr = hasher.GetCoeffRow(hash); + + ResultRow sr = 0; + const CoeffRow cr_left = cr << static_cast(start_bit); + for (Index i = 0; i < num_columns; ++i) { + sr ^= BitParity(iss.LoadSegment(segment_num + i) & cr_left) << i; + } + + if (start_bit > 0) { + segment_num += num_columns; + const CoeffRow cr_right = + cr >> static_cast(kCoeffBits - start_bit); + for (Index i = 0; i < num_columns; ++i) { + sr ^= BitParity(iss.LoadSegment(segment_num + i) & cr_right) << i; + } + } + + return sr; +} + +// Filter query a key from InterleavedFilterQuery. 
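// A usage sketch (names and batch size are illustrative assumptions) of the
// prepare / query split above: preparing several keys first issues their
// prefetches, so the subsequent InterleavedPhsfQuery calls tend to hit
// already-fetched cache lines. The filter-query variant below follows the
// same pattern with InterleavedFilterQuery.
template <typename InterleavedSolutionStorage, typename PhsfQueryHasher>
void PreparedBatchPhsfQuerySketch(
    const typename PhsfQueryHasher::Key *keys, int num_keys,
    const PhsfQueryHasher &hasher, const InterleavedSolutionStorage &iss,
    typename InterleavedSolutionStorage::ResultRow *results) {
  using Hash = typename PhsfQueryHasher::Hash;
  using Index = typename InterleavedSolutionStorage::Index;
  constexpr int kMaxBatch = 8;
  assert(num_keys <= kMaxBatch);
  Hash hashes[kMaxBatch];
  Index segment_nums[kMaxBatch];
  Index num_columns[kMaxBatch];
  Index start_bits[kMaxBatch];
  // Phase 1: hash each key and prefetch its solution segments.
  for (int i = 0; i < num_keys; ++i) {
    InterleavedPrepareQuery(keys[i], hasher, iss, &hashes[i], &segment_nums[i],
                            &num_columns[i], &start_bits[i]);
  }
  // Phase 2: resolve the queries against the (hopefully cached) segments.
  for (int i = 0; i < num_keys; ++i) {
    results[i] =
        InterleavedPhsfQuery(hashes[i], segment_nums[i], num_columns[i],
                             start_bits[i], hasher, iss);
  }
}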
+template +inline bool InterleavedFilterQuery( + typename FilterQueryHasher::Hash hash, + typename InterleavedSolutionStorage::Index segment_num, + typename InterleavedSolutionStorage::Index num_columns, + typename InterleavedSolutionStorage::Index start_bit, + const FilterQueryHasher &hasher, const InterleavedSolutionStorage &iss) { + using CoeffRow = typename InterleavedSolutionStorage::CoeffRow; + using Index = typename InterleavedSolutionStorage::Index; + using ResultRow = typename InterleavedSolutionStorage::ResultRow; + + static_assert(sizeof(Index) == sizeof(typename FilterQueryHasher::Index), + "must be same"); + static_assert( + sizeof(CoeffRow) == sizeof(typename FilterQueryHasher::CoeffRow), + "must be same"); + static_assert( + sizeof(ResultRow) == sizeof(typename FilterQueryHasher::ResultRow), + "must be same"); + + constexpr auto kCoeffBits = static_cast(sizeof(CoeffRow) * 8U); + + const CoeffRow cr = hasher.GetCoeffRow(hash); + const ResultRow expected = hasher.GetResultRowFromHash(hash); + + // TODO: consider optimizations such as + // * get rid of start_bit == 0 condition with careful fetching & shifting + if (start_bit == 0) { + for (Index i = 0; i < num_columns; ++i) { + if (BitParity(iss.LoadSegment(segment_num + i) & cr) != + (static_cast(expected >> i) & 1)) { + return false; + } + } + } else { + const CoeffRow cr_left = cr << static_cast(start_bit); + const CoeffRow cr_right = + cr >> static_cast(kCoeffBits - start_bit); + + for (Index i = 0; i < num_columns; ++i) { + CoeffRow soln_data = + (iss.LoadSegment(segment_num + i) & cr_left) ^ + (iss.LoadSegment(segment_num + num_columns + i) & cr_right); + if (BitParity(soln_data) != (static_cast(expected >> i) & 1)) { + return false; + } + } + } + // otherwise, all match + return true; +} + +// TODO: refactor Interleaved*Query so that queries can be "prepared" by +// prefetching memory, to hide memory latency for multiple queries in a +// single thread. + +} // namespace ribbon + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/ribbon_config.cc b/src/rocksdb/util/ribbon_config.cc new file mode 100644 index 000000000..c1046f4aa --- /dev/null +++ b/src/rocksdb/util/ribbon_config.cc @@ -0,0 +1,506 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/ribbon_config.h" + +namespace ROCKSDB_NAMESPACE { + +namespace ribbon { + +namespace detail { + +// Each instantiation of this struct is sufficiently unique for configuration +// purposes, and is only instantiated for settings where we support the +// configuration API. An application might only reference one instantiation, +// meaning the rest could be pruned at link time. +template +struct BandingConfigHelperData { + static constexpr size_t kKnownSize = 18U; + + // Because of complexity in the data, for smaller numbers of slots + // (powers of two up to 2^17), we record known numbers that can be added + // with kCfc chance of construction failure and settings in template + // parameters. Zero means "unsupported (too small) number of slots". + // (GetNumToAdd below will use interpolation for numbers of slots + // between powers of two; double rather than integer values here make + // that more accurate.) 
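  // Worked sketch of that interpolation, assuming the first table below is
  // the kOneIn2, 128-bit, no-smash specialization (as in the upstream
  // ordering): num_slots = 3072 lies halfway between 2^11 and 2^12, whose
  // known values are 2029.47 and 4060.43, so GetNumToAdd returns about
  //   0.5 * 2029.47 + 0.5 * 4060.43 = ~3045 keys,
  // i.e. roughly 1.01x space overhead at that size.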
+ static const std::array kKnownToAddByPow2; + + // For sufficiently large number of slots, doubling the number of + // slots will increase the expected overhead (slots over number added) + // by approximately this constant. + // (This is roughly constant regardless of ConstructionFailureChance and + // smash setting.) + // (Would be a constant if we had partial template specialization for + // static const members.) + static inline double GetFactorPerPow2() { + if (kCoeffBits == 128U) { + return 0.0038; + } else { + assert(kCoeffBits == 64U); + return 0.0083; + } + } + + // Overhead factor for 2^(kKnownSize-1) slots + // (Would be a constant if we had partial template specialization for + // static const members.) + static inline double GetFinalKnownFactor() { + return 1.0 * (uint32_t{1} << (kKnownSize - 1)) / + kKnownToAddByPow2[kKnownSize - 1]; + } + + // GetFinalKnownFactor() - (kKnownSize-1) * GetFactorPerPow2() + // (Would be a constant if we had partial template specialization for + // static const members.) + static inline double GetBaseFactor() { + return GetFinalKnownFactor() - (kKnownSize - 1) * GetFactorPerPow2(); + } + + // Get overhead factor (slots over number to add) for sufficiently large + // number of slots (by log base 2) + static inline double GetFactorForLarge(double log2_num_slots) { + return GetBaseFactor() + log2_num_slots * GetFactorPerPow2(); + } + + // For a given power of two number of slots (specified by whole number + // log base 2), implements GetNumToAdd for such limited case, returning + // double for better interpolation in GetNumToAdd and GetNumSlots. + static inline double GetNumToAddForPow2(uint32_t log2_num_slots) { + assert(log2_num_slots <= 32); // help clang-analyze + if (log2_num_slots < kKnownSize) { + return kKnownToAddByPow2[log2_num_slots]; + } else { + return 1.0 * (uint64_t{1} << log2_num_slots) / + GetFactorForLarge(1.0 * log2_num_slots); + } + } +}; + +// Based on data from FindOccupancy in ribbon_test +template <> +const std::array + BandingConfigHelperData::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 252.984, + 506.109, + 1013.71, + 2029.47, + 4060.43, + 8115.63, + 16202.2, + 32305.1, + 64383.5, + 128274, + }}; + +template <> +const std::array + BandingConfigHelperData::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 126.274, + 254.279, + 510.27, + 1022.24, + 2046.02, + 4091.99, + 8154.98, + 16244.3, + 32349.7, + 64426.6, + 128307, + }}; + +template <> +const std::array + BandingConfigHelperData::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 124.94, + 249.968, + 501.234, + 1004.06, + 2006.15, + 3997.89, + 7946.99, + 15778.4, + 31306.9, + 62115.3, + 123284, + }}; + +template <> +const std::array + BandingConfigHelperData::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 62.2683, + 126.259, + 254.268, + 509.975, + 1019.98, + 2026.16, + 4019.75, + 7969.8, + 15798.2, + 31330.3, + 62134.2, + 123255, + }}; + +template <> +const std::array + BandingConfigHelperData::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 248.851, + 499.532, + 1001.26, + 2003.97, + 4005.59, + 8000.39, + 15966.6, + 31828.1, + 63447.3, + 126506, + }}; + +template <> +const std::array + BandingConfigHelperData::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 122.637, + 250.651, + 506.625, + 1018.54, + 2036.43, + 4041.6, + 8039.25, + 16005, + 31869.6, + 63492.8, + 126537, + }}; + +template <> +const std::array + 
BandingConfigHelperData::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 120.659, + 243.346, + 488.168, + 976.373, + 1948.86, + 3875.85, + 7704.97, + 15312.4, + 30395.1, + 60321.8, + 119813, + }}; + +template <> +const std::array + BandingConfigHelperData::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 58.6016, + 122.619, + 250.641, + 503.595, + 994.165, + 1967.36, + 3898.17, + 7727.21, + 15331.5, + 30405.8, + 60376.2, + 119836, + }}; + +template <> +const std::array + BandingConfigHelperData::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 242.61, + 491.887, + 983.603, + 1968.21, + 3926.98, + 7833.99, + 15629, + 31199.9, + 62307.8, + 123870, + }}; + +template <> +const std::array BandingConfigHelperData< + kOneIn1000, 128U, /*smash*/ true>::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 117.19, + 245.105, + 500.748, + 1010.67, + 1993.4, + 3950.01, + 7863.31, + 15652, + 31262.1, + 62462.8, + 124095, +}}; + +template <> +const std::array + BandingConfigHelperData::kKnownToAddByPow2{{ + 0, + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 114, + 234.8, + 471.498, + 940.165, + 1874, + 3721.5, + 7387.5, + 14592, + 29160, + 57745, + 115082, + }}; + +template <> +const std::array + BandingConfigHelperData::kKnownToAddByPow2{ + { + 0, + 0, + 0, + 0, + 0, + 0, // unsupported + 53.0434, + 117, + 245.312, + 483.571, + 950.251, + 1878, + 3736.34, + 7387.97, + 14618, + 29142.9, + 57838.8, + 114932, + }}; + +// We hide these implementation details from the .h file with explicit +// instantiations below these partial specializations. + +template +uint32_t BandingConfigHelper1MaybeSupported< + kCfc, kCoeffBits, kUseSmash, kHomogeneous, + true /* kIsSupported */>::GetNumToAdd(uint32_t num_slots) { + using Data = detail::BandingConfigHelperData; + if (num_slots == 0) { + return 0; + } + uint32_t num_to_add; + double log2_num_slots = std::log(num_slots) * 1.4426950409; + uint32_t floor_log2 = static_cast(log2_num_slots); + if (floor_log2 + 1 < Data::kKnownSize) { + double ceil_portion = 1.0 * num_slots / (uint32_t{1} << floor_log2) - 1.0; + // Must be a supported number of slots + assert(Data::kKnownToAddByPow2[floor_log2] > 0.0); + // Weighted average of two nearest known data points + num_to_add = static_cast( + ceil_portion * Data::kKnownToAddByPow2[floor_log2 + 1] + + (1.0 - ceil_portion) * Data::kKnownToAddByPow2[floor_log2]); + } else { + // Use formula for large values + double factor = Data::GetFactorForLarge(log2_num_slots); + assert(factor >= 1.0); + num_to_add = static_cast(num_slots / factor); + } + if (kHomogeneous) { + // Even when standard filter construction would succeed, we might + // have loaded things up too much for Homogeneous filter. (Complete + // explanation not known but observed empirically.) This seems to + // correct for that, mostly affecting small filter configurations. 
+ if (num_to_add >= 8) { + num_to_add -= 8; + } else { + assert(false); + } + } + return num_to_add; +} + +template +uint32_t BandingConfigHelper1MaybeSupported< + kCfc, kCoeffBits, kUseSmash, kHomogeneous, + true /* kIsSupported */>::GetNumSlots(uint32_t num_to_add) { + using Data = detail::BandingConfigHelperData; + + if (num_to_add == 0) { + return 0; + } + if (kHomogeneous) { + // Reverse of above in GetNumToAdd + num_to_add += 8; + } + double log2_num_to_add = std::log(num_to_add) * 1.4426950409; + uint32_t approx_log2_slots = static_cast(log2_num_to_add + 0.5); + assert(approx_log2_slots <= 32); // help clang-analyze + + double lower_num_to_add = Data::GetNumToAddForPow2(approx_log2_slots); + double upper_num_to_add; + if (approx_log2_slots == 0 || lower_num_to_add == /* unsupported */ 0) { + // Return minimum non-zero slots in standard implementation + return kUseSmash ? kCoeffBits : 2 * kCoeffBits; + } else if (num_to_add < lower_num_to_add) { + upper_num_to_add = lower_num_to_add; + --approx_log2_slots; + lower_num_to_add = Data::GetNumToAddForPow2(approx_log2_slots); + } else { + upper_num_to_add = Data::GetNumToAddForPow2(approx_log2_slots + 1); + } + + assert(num_to_add >= lower_num_to_add); + assert(num_to_add < upper_num_to_add); + + double upper_portion = + (num_to_add - lower_num_to_add) / (upper_num_to_add - lower_num_to_add); + + double lower_num_slots = 1.0 * (uint64_t{1} << approx_log2_slots); + + // Interpolation, round up + return static_cast(upper_portion * lower_num_slots + + lower_num_slots + 0.999999999); +} + +// These explicit instantiations enable us to hide most of the +// implementation details from the .h file. (The .h file currently +// needs to determine whether settings are "supported" or not.) + +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; + +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported; + +template struct BandingConfigHelper1MaybeSupported< + kOneIn1000, 128U, /*sm*/ false, /*hm*/ false, /*sup*/ true>; +template struct BandingConfigHelper1MaybeSupported< + kOneIn1000, 128U, /*sm*/ true, /*hm*/ false, /*sup*/ true>; +template struct BandingConfigHelper1MaybeSupported< + kOneIn1000, 128U, /*sm*/ false, /*hm*/ true, /*sup*/ true>; +template struct BandingConfigHelper1MaybeSupported< + kOneIn1000, 128U, /*sm*/ true, /*hm*/ true, /*sup*/ true>; +template struct BandingConfigHelper1MaybeSupported< + kOneIn1000, 64U, /*sm*/ false, /*hm*/ false, /*sup*/ true>; +template struct BandingConfigHelper1MaybeSupported; +template struct BandingConfigHelper1MaybeSupported< + kOneIn1000, 64U, /*sm*/ false, /*hm*/ true, /*sup*/ true>; +template struct BandingConfigHelper1MaybeSupported; + +} // namespace detail + +} // namespace ribbon + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/ribbon_config.h 
b/src/rocksdb/util/ribbon_config.h new file mode 100644 index 000000000..0e3edf073 --- /dev/null +++ b/src/rocksdb/util/ribbon_config.h @@ -0,0 +1,182 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include +#include + +#include "port/lang.h" // for FALLTHROUGH_INTENDED +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +namespace ribbon { + +// RIBBON PHSF & RIBBON Filter (Rapid Incremental Boolean Banding ON-the-fly) +// +// ribbon_config.h: APIs for relating numbers of slots with numbers of +// additions for tolerable construction failure probabilities. This is +// separate from ribbon_impl.h because it might not be needed for +// some applications. +// +// This API assumes uint32_t for number of slots, as a single Ribbon +// linear system should not normally overflow that without big penalties. +// +// Template parameter kCoeffBits uses uint64_t for convenience in case it +// comes from size_t. +// +// Most of the complexity here is trying to optimize speed and +// compiled code size, using templates to minimize table look-ups and +// the compiled size of all linked look-up tables. Look-up tables are +// required because we don't have good formulas, and the data comes +// from running FindOccupancy in ribbon_test. + +// Represents a chosen chance of successful Ribbon construction for a single +// seed. Allowing higher chance of failed construction can reduce space +// overhead but takes extra time in construction. +enum ConstructionFailureChance { + kOneIn2, + kOneIn20, + // When using kHomogeneous==true, construction failure chance should + // not generally exceed target FP rate, so it unlikely useful to + // allow a higher "failure" chance. In some cases, even more overhead + // is appropriate. (TODO) + kOneIn1000, +}; + +namespace detail { + +// It is useful to compile ribbon_test linking to BandingConfigHelper with +// settings for which we do not have configuration data, as long as we don't +// run the code. This template hack supports that. +template +struct BandingConfigHelper1MaybeSupported { + public: + static uint32_t GetNumToAdd(uint32_t num_slots) { + // Unsupported + assert(num_slots == 0); + (void)num_slots; + return 0; + } + + static uint32_t GetNumSlots(uint32_t num_to_add) { + // Unsupported + assert(num_to_add == 0); + (void)num_to_add; + return 0; + } +}; + +// Base class for BandingConfigHelper1 and helper for BandingConfigHelper +// with core implementations built on above data +template +struct BandingConfigHelper1MaybeSupported< + kCfc, kCoeffBits, kUseSmash, kHomogeneous, true /* kIsSupported */> { + public: + // See BandingConfigHelper1. Implementation in ribbon_config.cc + static uint32_t GetNumToAdd(uint32_t num_slots); + + // See BandingConfigHelper1. Implementation in ribbon_config.cc + static uint32_t GetNumSlots(uint32_t num_to_add); +}; + +} // namespace detail + +template +struct BandingConfigHelper1 + : public detail::BandingConfigHelper1MaybeSupported< + kCfc, kCoeffBits, kUseSmash, kHomogeneous, + /* kIsSupported */ kCoeffBits == 64 || kCoeffBits == 128> { + public: + // Returns a number of entries that can be added to a given number of + // slots, with roughly kCfc chance of construction failure per seed, + // or better. 
Does NOT do rounding for InterleavedSoln; call + // RoundUpNumSlots for that. + // + // inherited: + // static uint32_t GetNumToAdd(uint32_t num_slots); + + // Returns a number of slots for a given number of entries to add + // that should have roughly kCfc chance of construction failure per + // seed, or better. Does NOT do rounding for InterleavedSoln; call + // RoundUpNumSlots for that. + // + // num_to_add should not exceed roughly 2/3rds of the maximum value + // of the uint32_t type to avoid overflow. + // + // inherited: + // static uint32_t GetNumSlots(uint32_t num_to_add); +}; + +// Configured using TypesAndSettings as in ribbon_impl.h +template +struct BandingConfigHelper1TS + : public BandingConfigHelper1< + kCfc, + /* kCoeffBits */ sizeof(typename TypesAndSettings::CoeffRow) * 8U, + TypesAndSettings::kUseSmash, TypesAndSettings::kHomogeneous> {}; + +// Like BandingConfigHelper1TS except failure chance can be a runtime rather +// than compile time value. +template +struct BandingConfigHelper { + public: + static constexpr ConstructionFailureChance kDefaultFailureChance = + TypesAndSettings::kHomogeneous ? kOneIn1000 : kOneIn20; + + static uint32_t GetNumToAdd( + uint32_t num_slots, + ConstructionFailureChance max_failure = kDefaultFailureChance) { + switch (max_failure) { + default: + assert(false); + FALLTHROUGH_INTENDED; + case kOneIn20: { + using H1 = BandingConfigHelper1TS; + return H1::GetNumToAdd(num_slots); + } + case kOneIn2: { + using H1 = BandingConfigHelper1TS; + return H1::GetNumToAdd(num_slots); + } + case kOneIn1000: { + using H1 = BandingConfigHelper1TS; + return H1::GetNumToAdd(num_slots); + } + } + } + + static uint32_t GetNumSlots( + uint32_t num_to_add, + ConstructionFailureChance max_failure = kDefaultFailureChance) { + switch (max_failure) { + default: + assert(false); + FALLTHROUGH_INTENDED; + case kOneIn20: { + using H1 = BandingConfigHelper1TS; + return H1::GetNumSlots(num_to_add); + } + case kOneIn2: { + using H1 = BandingConfigHelper1TS; + return H1::GetNumSlots(num_to_add); + } + case kOneIn1000: { + using H1 = BandingConfigHelper1TS; + return H1::GetNumSlots(num_to_add); + } + } + } +}; + +} // namespace ribbon + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/ribbon_impl.h b/src/rocksdb/util/ribbon_impl.h new file mode 100644 index 000000000..0afecc67d --- /dev/null +++ b/src/rocksdb/util/ribbon_impl.h @@ -0,0 +1,1137 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "port/port.h" // for PREFETCH +#include "util/fastrange.h" +#include "util/ribbon_alg.h" + +namespace ROCKSDB_NAMESPACE { + +namespace ribbon { + +// RIBBON PHSF & RIBBON Filter (Rapid Incremental Boolean Banding ON-the-fly) +// +// ribbon_impl.h: templated (parameterized) standard implementations +// +// Ribbon is a Perfect Hash Static Function construction useful as a compact +// static Bloom filter alternative. See ribbon_alg.h for core algorithms +// and core design details. +// +// TODO: more details on trade-offs and practical issues. 
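// A typical configuration call sequence, as a sketch (MySettings stands in
// for a TypesAndSettings as described below; helper names per
// ribbon_config.h, which this file assumes is available):
//
//   using Config = ribbon::BandingConfigHelper<MySettings>;
//   uint32_t num_slots = Config::GetNumSlots(num_keys, ribbon::kOneIn20);
//   // Round for the interleaved layout before banding:
//   num_slots =
//       SerializableInterleavedSolution<MySettings>::RoundUpNumSlots(
//           num_slots);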
+// +// APIs for configuring Ribbon are in ribbon_config.h + +// Ribbon implementations in this file take these parameters, which must be +// provided in a class/struct type with members expressed in this concept: + +// concept TypesAndSettings { +// // See RibbonTypes and *Hasher in ribbon_alg.h, except here we have +// // the added constraint that Hash be equivalent to either uint32_t or +// // uint64_t. +// typename Hash; +// typename CoeffRow; +// typename ResultRow; +// typename Index; +// typename Key; +// static constexpr bool kFirstCoeffAlwaysOne; +// +// // An unsigned integer type for identifying a hash seed, typically +// // uint32_t or uint64_t. Importantly, this is the amount of data +// // stored in memory for identifying a raw seed. See StandardHasher. +// typename Seed; +// +// // When true, the PHSF implements a static filter, expecting just +// // keys as inputs for construction. When false, implements a general +// // PHSF and expects std::pair as inputs for +// // construction. +// static constexpr bool kIsFilter; +// +// // When true, enables a special "homogeneous" filter implementation that +// // is slightly faster to construct, and never fails to construct though +// // FP rate can quickly explode in cases where corresponding +// // non-homogeneous filter would fail (or nearly fail?) to construct. +// // For smaller filters, you can configure with ConstructionFailureChance +// // smaller than desired FP rate to largely counteract this effect. +// // TODO: configuring Homogeneous Ribbon for arbitrarily large filters +// // based on data from OptimizeHomogAtScale +// static constexpr bool kHomogeneous; +// +// // When true, adds a tiny bit more hashing logic on queries and +// // construction to improve utilization at the beginning and end of +// // the structure. Recommended when CoeffRow is only 64 bits (or +// // less), so typical num_starts < 10k. Although this is compatible +// // with kHomogeneous, the competing space vs. time priorities might +// // not be useful. +// static constexpr bool kUseSmash; +// +// // When true, allows number of "starts" to be zero, for best support +// // of the "no keys to add" case by always returning false for filter +// // queries. (This is distinct from the "keys added but no space for +// // any data" case, in which a filter always returns true.) The cost +// // supporting this is a conditional branch (probably predictable) in +// // queries. +// static constexpr bool kAllowZeroStarts; +// +// // A seedable stock hash function on Keys. All bits of Hash must +// // be reasonably high quality. XXH functions recommended, but +// // Murmur, City, Farm, etc. also work. +// static Hash HashFn(const Key &, Seed raw_seed); +// }; + +// A bit of a hack to automatically construct the type for +// AddInput based on a constexpr bool. 
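// As a concrete reading of the TypesAndSettings concept above, a filter
// configuration might look like the following sketch (the struct name, type
// choices, and hash helper are illustrative assumptions, not definitions
// from this file):
//
//   struct ExampleFilterSettings {
//     using CoeffRow = uint64_t;   // 64 coefficients per key
//     using ResultRow = uint8_t;   // 8 solution columns -> ~0.4% FP rate
//     using Index = uint32_t;
//     using Hash = uint64_t;
//     using Seed = uint32_t;
//     using Key = Slice;           // assumed key type
//     static constexpr bool kIsFilter = true;
//     static constexpr bool kHomogeneous = false;
//     static constexpr bool kFirstCoeffAlwaysOne = true;
//     static constexpr bool kUseSmash = false;
//     static constexpr bool kAllowZeroStarts = true;
//     static Hash HashFn(const Key& key, Seed raw_seed) {
//       return SomeSeedable64BitHash(key, raw_seed);  // assumed helper
//     }
//   };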
+template +struct AddInputSelector { + // For general PHSF, not filter + using T = std::pair; +}; + +template +struct AddInputSelector { + // For Filter + using T = Key; +}; + +// To avoid writing 'typename' everywhere that we use types like 'Index' +#define IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings) \ + using CoeffRow = typename TypesAndSettings::CoeffRow; \ + using ResultRow = typename TypesAndSettings::ResultRow; \ + using Index = typename TypesAndSettings::Index; \ + using Hash = typename TypesAndSettings::Hash; \ + using Key = typename TypesAndSettings::Key; \ + using Seed = typename TypesAndSettings::Seed; \ + \ + /* Some more additions */ \ + using QueryInput = Key; \ + using AddInput = typename ROCKSDB_NAMESPACE::ribbon::AddInputSelector< \ + Key, ResultRow, TypesAndSettings::kIsFilter>::T; \ + static constexpr auto kCoeffBits = \ + static_cast(sizeof(CoeffRow) * 8U); \ + \ + /* Export to algorithm */ \ + static constexpr bool kFirstCoeffAlwaysOne = \ + TypesAndSettings::kFirstCoeffAlwaysOne; \ + \ + static_assert(sizeof(CoeffRow) + sizeof(ResultRow) + sizeof(Index) + \ + sizeof(Hash) + sizeof(Key) + sizeof(Seed) + \ + sizeof(QueryInput) + sizeof(AddInput) + kCoeffBits + \ + kFirstCoeffAlwaysOne > \ + 0, \ + "avoid unused warnings, semicolon expected after macro call") + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4309) // cast truncating constant +#pragma warning(disable : 4307) // arithmetic constant overflow +#endif + +// StandardHasher: A standard implementation of concepts RibbonTypes, +// PhsfQueryHasher, FilterQueryHasher, and BandingHasher from ribbon_alg.h. +// +// This implementation should be suitable for most all practical purposes +// as it "behaves" across a wide range of settings, with little room left +// for improvement. The key functionality in this hasher is generating +// CoeffRows, starts, and (for filters) ResultRows, which could be ~150 +// bits of data or more, from a modest hash of 64 or even just 32 bits, with +// enough uniformity and bitwise independence to be close to "the best you +// can do" with available hash information in terms of FP rate and +// compactness. (64 bits recommended and sufficient for PHSF practical +// purposes.) +// +// Another feature of this hasher is a minimal "premixing" of seeds before +// they are provided to TypesAndSettings::HashFn in case that function does +// not provide sufficiently independent hashes when iterating merely +// sequentially on seeds. (This for example works around a problem with the +// preview version 0.7.2 of XXH3 used in RocksDB, a.k.a. XXPH3 or Hash64, and +// MurmurHash1 used in RocksDB, a.k.a. Hash.) We say this pre-mixing step +// translates "ordinal seeds," which we iterate sequentially to find a +// solution, into "raw seeds," with many more bits changing for each +// iteration. The translation is an easily reversible lightweight mixing, +// not suitable for hashing on its own. An advantage of this approach is that +// StandardHasher can store just the raw seed (e.g. 64 bits) for fast query +// times, while from the application perspective, we can limit to a small +// number of ordinal keys (e.g. 64 in 6 bits) for saving in metadata. +// +// The default constructor initializes the seed to ordinal seed zero, which +// is equal to raw seed zero. 
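// A small usage sketch of the seed tracking described above (MySettings is
// an assumed TypesAndSettings):
//
//   StandardHasher<MySettings> hasher;
//   hasher.SetOrdinalSeed(3);              // small value, cheap to store in
//                                          // metadata (e.g. 6 bits)
//   auto raw = hasher.GetRawSeed();        // well-mixed value actually used
//                                          // for hashing and fast queries
//   assert(hasher.GetOrdinalSeed() == 3);  // the mixing is reversible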
+// +template +class StandardHasher { + public: + IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings); + + inline Hash GetHash(const Key& key) const { + return TypesAndSettings::HashFn(key, raw_seed_); + }; + // For when AddInput == pair (kIsFilter == false) + inline Hash GetHash(const std::pair& bi) const { + return GetHash(bi.first); + }; + inline Index GetStart(Hash h, Index num_starts) const { + // This is "critical path" code because it's required before memory + // lookup. + // + // FastRange gives us a fast and effective mapping from h to the + // appropriate range. This depends most, sometimes exclusively, on + // upper bits of h. + // + if (TypesAndSettings::kUseSmash) { + // Extra logic to "smash" entries at beginning and end, for + // better utilization. For example, without smash and with + // kFirstCoeffAlwaysOne, there's about a 30% chance that the + // first slot in the banding will be unused, and worse without + // kFirstCoeffAlwaysOne. The ending slots are even less utilized + // without smash. + // + // But since this only affects roughly kCoeffBits of the slots, + // it's usually small enough to be ignorable (less computation in + // this function) when number of slots is roughly 10k or larger. + // + // The best values for these smash weights might depend on how + // densely you're packing entries, and also kCoeffBits, but this + // seems to work well for roughly 95% success probability. + // + constexpr Index kFrontSmash = kCoeffBits / 4; + constexpr Index kBackSmash = kCoeffBits / 4; + Index start = FastRangeGeneric(h, num_starts + kFrontSmash + kBackSmash); + start = std::max(start, kFrontSmash); + start -= kFrontSmash; + start = std::min(start, num_starts - 1); + return start; + } else { + // For query speed, we allow small number of initial and final + // entries to be under-utilized. + // NOTE: This call statically enforces that Hash is equivalent to + // either uint32_t or uint64_t. + return FastRangeGeneric(h, num_starts); + } + } + inline CoeffRow GetCoeffRow(Hash h) const { + // This is not so much "critical path" code because it can be done in + // parallel (instruction level) with memory lookup. + // + // When we might have many entries squeezed into a single start, + // we need reasonably good remixing for CoeffRow. + if (TypesAndSettings::kUseSmash) { + // Reasonably good, reasonably fast, reasonably general. + // Probably not 1:1 but probably close enough. + Unsigned128 a = Multiply64to128(h, kAltCoeffFactor1); + Unsigned128 b = Multiply64to128(h, kAltCoeffFactor2); + auto cr = static_cast(b ^ (a << 64) ^ (a >> 64)); + + // Now ensure the value is non-zero + if (kFirstCoeffAlwaysOne) { + cr |= 1; + } else { + // Still have to ensure some bit is non-zero + cr |= (cr == 0) ? 1 : 0; + } + return cr; + } + // If not kUseSmash, we ensure we're not squeezing many entries into a + // single start, in part by ensuring num_starts > num_slots / 2. Thus, + // here we do not need good remixing for CoeffRow, but just enough that + // (a) every bit is reasonably independent from Start. + // (b) every Hash-length bit subsequence of the CoeffRow has full or + // nearly full entropy from h. + // (c) if nontrivial bit subsequences within are correlated, it needs to + // be more complicated than exact copy or bitwise not (at least without + // kFirstCoeffAlwaysOne), or else there seems to be a kind of + // correlated clustering effect. + // (d) the CoeffRow is not zero, so that no one input on its own can + // doom construction success. 
(Preferably a mix of 1's and 0's if + // satisfying above.) + + // First, establish sufficient bitwise independence from Start, with + // multiplication by a large random prime. + // Note that we cast to Hash because if we use product bits beyond + // original input size, that's going to correlate with Start (FastRange) + // even with a (likely) different multiplier here. + Hash a = h * kCoeffAndResultFactor; + + static_assert( + sizeof(Hash) == sizeof(uint64_t) || sizeof(Hash) == sizeof(uint32_t), + "Supported sizes"); + // If that's big enough, we're done. If not, we have to expand it, + // maybe up to 4x size. + uint64_t b; + if (sizeof(Hash) < sizeof(uint64_t)) { + // Almost-trivial hash expansion (OK - see above), favoring roughly + // equal number of 1's and 0's in result + b = (uint64_t{a} << 32) ^ (a ^ kCoeffXor32); + } else { + b = a; + } + static_assert(sizeof(CoeffRow) <= sizeof(Unsigned128), "Supported sizes"); + Unsigned128 c; + if (sizeof(uint64_t) < sizeof(CoeffRow)) { + // Almost-trivial hash expansion (OK - see above), favoring roughly + // equal number of 1's and 0's in result + c = (Unsigned128{b} << 64) ^ (b ^ kCoeffXor64); + } else { + c = b; + } + auto cr = static_cast(c); + + // Now ensure the value is non-zero + if (kFirstCoeffAlwaysOne) { + cr |= 1; + } else if (sizeof(CoeffRow) == sizeof(Hash)) { + // Still have to ensure some bit is non-zero + cr |= (cr == 0) ? 1 : 0; + } else { + // (We did trivial expansion with constant xor, which ensures some + // bits are non-zero.) + } + return cr; + } + inline ResultRow GetResultRowMask() const { + // TODO: will be used with InterleavedSolutionStorage? + // For now, all bits set (note: might be a small type so might need to + // narrow after promotion) + return static_cast(~ResultRow{0}); + } + inline ResultRow GetResultRowFromHash(Hash h) const { + if (TypesAndSettings::kIsFilter && !TypesAndSettings::kHomogeneous) { + // This is not so much "critical path" code because it can be done in + // parallel (instruction level) with memory lookup. + // + // ResultRow bits only needs to be independent from CoeffRow bits if + // many entries might have the same start location, where "many" is + // comparable to number of hash bits or kCoeffBits. If !kUseSmash + // and num_starts > kCoeffBits, it is safe and efficient to draw from + // the same bits computed for CoeffRow, which are reasonably + // independent from Start. (Inlining and common subexpression + // elimination with GetCoeffRow should make this + // a single shared multiplication in generated code when !kUseSmash.) + Hash a = h * kCoeffAndResultFactor; + + // The bits here that are *most* independent of Start are the highest + // order bits (as in Knuth multiplicative hash). To make those the + // most preferred for use in the result row, we do a bswap here. + auto rr = static_cast(EndianSwapValue(a)); + return rr & GetResultRowMask(); + } else { + // Must be zero + return 0; + } + } + // For when AddInput == Key (kIsFilter == true) + inline ResultRow GetResultRowFromInput(const Key&) const { + // Must be zero + return 0; + } + // For when AddInput == pair (kIsFilter == false) + inline ResultRow GetResultRowFromInput( + const std::pair& bi) const { + // Simple extraction + return bi.second; + } + + // Seed tracking APIs - see class comment + void SetRawSeed(Seed seed) { raw_seed_ = seed; } + Seed GetRawSeed() { return raw_seed_; } + void SetOrdinalSeed(Seed count) { + // A simple, reversible mixing of any size (whole bytes) up to 64 bits. 
+ // This allows casting the raw seed to any smaller size we use for + // ordinal seeds without risk of duplicate raw seeds for unique ordinal + // seeds. + + // Seed type might be smaller than numerical promotion size, but Hash + // should be at least that size, so we use Hash as intermediate type. + static_assert(sizeof(Seed) <= sizeof(Hash), + "Hash must be at least size of Seed"); + + // Multiply by a large random prime (one-to-one for any prefix of bits) + Hash tmp = count * kToRawSeedFactor; + // Within-byte one-to-one mixing + static_assert((kSeedMixMask & (kSeedMixMask >> kSeedMixShift)) == 0, + "Illegal mask+shift"); + tmp ^= (tmp & kSeedMixMask) >> kSeedMixShift; + raw_seed_ = static_cast(tmp); + // dynamic verification + assert(GetOrdinalSeed() == count); + } + Seed GetOrdinalSeed() { + Hash tmp = raw_seed_; + // Within-byte one-to-one mixing (its own inverse) + tmp ^= (tmp & kSeedMixMask) >> kSeedMixShift; + // Multiply by 64-bit multiplicative inverse + static_assert(kToRawSeedFactor * kFromRawSeedFactor == Hash{1}, + "Must be inverses"); + return static_cast(tmp * kFromRawSeedFactor); + } + + protected: + // For expanding hash: + // large random prime + static constexpr Hash kCoeffAndResultFactor = + static_cast(0xc28f82822b650bedULL); + static constexpr uint64_t kAltCoeffFactor1 = 0x876f170be4f1fcb9U; + static constexpr uint64_t kAltCoeffFactor2 = 0xf0433a4aecda4c5fU; + // random-ish data + static constexpr uint32_t kCoeffXor32 = 0xa6293635U; + static constexpr uint64_t kCoeffXor64 = 0xc367844a6e52731dU; + + // For pre-mixing seeds + static constexpr Hash kSeedMixMask = static_cast(0xf0f0f0f0f0f0f0f0ULL); + static constexpr unsigned kSeedMixShift = 4U; + static constexpr Hash kToRawSeedFactor = + static_cast(0xc78219a23eeadd03ULL); + static constexpr Hash kFromRawSeedFactor = + static_cast(0xfe1a137d14b475abULL); + + // See class description + Seed raw_seed_ = 0; +}; + +// StandardRehasher (and StandardRehasherAdapter): A variant of +// StandardHasher that uses the same type for keys as for hashes. +// This is primarily intended for building a Ribbon filter +// from existing hashes without going back to original inputs in +// order to apply a different seed. This hasher seeds a 1-to-1 mixing +// transformation to apply a seed to an existing hash. (Untested for +// hash-sized keys that are not already uniformly distributed.) This +// transformation builds on the seed pre-mixing done in StandardHasher. +// +// Testing suggests essentially no degradation of solution success rate +// vs. going back to original inputs when changing hash seeds. For example: +// Average re-seeds for solution with r=128, 1.02x overhead, and ~100k keys +// is about 1.10 for both StandardHasher and StandardRehasher. +// +// StandardRehasher is not really recommended for general PHSFs (not +// filters) because a collision in the original hash could prevent +// construction despite re-seeding the Rehasher. (Such collisions +// do not interfere with filter construction.) +// +// concept RehasherTypesAndSettings: like TypesAndSettings but +// does not require Key or HashFn. 
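// For example (a sketch; the settings names are assumptions): to build a
// Ribbon filter from previously stored 64-bit hashes rather than from the
// original keys, the adapter below can be plugged into the banding and
// hashing machinery:
//
//   // RehashSettings: like a TypesAndSettings, but without Key / HashFn
//   using RehashedSettings = StandardRehasherAdapter<RehashSettings>;
//   StandardBanding<RehashedSettings> banding;  // Key == Hash here
//   // ... add the stored hash values as if they were keys ...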
+template +class StandardRehasherAdapter : public RehasherTypesAndSettings { + public: + using Hash = typename RehasherTypesAndSettings::Hash; + using Key = Hash; + using Seed = typename RehasherTypesAndSettings::Seed; + + static Hash HashFn(const Hash& input, Seed raw_seed) { + // Note: raw_seed is already lightly pre-mixed, and this multiplication + // by a large prime is sufficient mixing (low-to-high bits) on top of + // that for good FastRange results, which depends primarily on highest + // bits. (The hashed CoeffRow and ResultRow are less sensitive to + // mixing than Start.) + // Also note: did consider adding ^ (input >> some) before the + // multiplication, but doesn't appear to be necessary. + return (input ^ raw_seed) * kRehashFactor; + } + + private: + static constexpr Hash kRehashFactor = + static_cast(0x6193d459236a3a0dULL); +}; + +// See comment on StandardRehasherAdapter +template +using StandardRehasher = + StandardHasher>; + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +// Especially with smaller hashes (e.g. 32 bit), there can be noticeable +// false positives due to collisions in the Hash returned by GetHash. +// This function returns the expected FP rate due to those collisions, +// which can be added to the expected FP rate from the underlying data +// structure. (Note: technically, a + b is only a good approximation of +// 1-(1-a)(1-b) == a + b - a*b, if a and b are much closer to 0 than to 1.) +// The number of entries added can be a double here in case it's an +// average. +template +double ExpectedCollisionFpRate(const Hasher& hasher, Numerical added) { + // Standardize on the 'double' specialization + return ExpectedCollisionFpRate(hasher, 1.0 * added); +} +template +double ExpectedCollisionFpRate(const Hasher& /*hasher*/, double added) { + // Technically, there could be overlap among the added, but ignoring that + // is typically close enough. + return added / std::pow(256.0, sizeof(typename Hasher::Hash)); +} + +// StandardBanding: a canonical implementation of BandingStorage and +// BacktrackStorage, with convenience API for banding (solving with on-the-fly +// Gaussian elimination) with and without backtracking. 
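// A worked instance of ExpectedCollisionFpRate above: with a 32-bit Hash and
// about one million added keys, the extra FP rate from hash collisions is
// roughly 1e6 / 2^32, i.e. about 0.00023 (0.023%), on top of the FP rate of
// the underlying structure; with a 64-bit Hash it is negligible (~5e-14).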
+template +class StandardBanding : public StandardHasher { + public: + IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings); + + StandardBanding(Index num_slots = 0, Index backtrack_size = 0) { + Reset(num_slots, backtrack_size); + } + + void Reset(Index num_slots, Index backtrack_size = 0) { + if (num_slots == 0) { + // Unusual (TypesAndSettings::kAllowZeroStarts) or "uninitialized" + num_starts_ = 0; + } else { + // Normal + assert(num_slots >= kCoeffBits); + if (num_slots > num_slots_allocated_) { + coeff_rows_.reset(new CoeffRow[num_slots]()); + if (!TypesAndSettings::kHomogeneous) { + // Note: don't strictly have to zero-init result_rows, + // except possible information leakage, etc ;) + result_rows_.reset(new ResultRow[num_slots]()); + } + num_slots_allocated_ = num_slots; + } else { + for (Index i = 0; i < num_slots; ++i) { + coeff_rows_[i] = 0; + if (!TypesAndSettings::kHomogeneous) { + // Note: don't strictly have to zero-init result_rows, + // except possible information leakage, etc ;) + result_rows_[i] = 0; + } + } + } + num_starts_ = num_slots - kCoeffBits + 1; + } + EnsureBacktrackSize(backtrack_size); + } + + void EnsureBacktrackSize(Index backtrack_size) { + if (backtrack_size > backtrack_size_) { + backtrack_.reset(new Index[backtrack_size]); + backtrack_size_ = backtrack_size; + } + } + + // ******************************************************************** + // From concept BandingStorage + + inline bool UsePrefetch() const { + // A rough guesstimate of when prefetching during construction pays off. + // TODO: verify/validate + return num_starts_ > 1500; + } + inline void Prefetch(Index i) const { + PREFETCH(&coeff_rows_[i], 1 /* rw */, 1 /* locality */); + if (!TypesAndSettings::kHomogeneous) { + PREFETCH(&result_rows_[i], 1 /* rw */, 1 /* locality */); + } + } + inline void LoadRow(Index i, CoeffRow* cr, ResultRow* rr, + bool for_back_subst) const { + *cr = coeff_rows_[i]; + if (TypesAndSettings::kHomogeneous) { + if (for_back_subst && *cr == 0) { + // Cheap pseudorandom data to fill unconstrained solution rows + *rr = static_cast(i * 0x9E3779B185EBCA87ULL); + } else { + *rr = 0; + } + } else { + *rr = result_rows_[i]; + } + } + inline void StoreRow(Index i, CoeffRow cr, ResultRow rr) { + coeff_rows_[i] = cr; + if (TypesAndSettings::kHomogeneous) { + assert(rr == 0); + } else { + result_rows_[i] = rr; + } + } + inline Index GetNumStarts() const { return num_starts_; } + + // from concept BacktrackStorage, for when backtracking is used + inline bool UseBacktrack() const { return true; } + inline void BacktrackPut(Index i, Index to_save) { backtrack_[i] = to_save; } + inline Index BacktrackGet(Index i) const { return backtrack_[i]; } + + // ******************************************************************** + // Some useful API, still somewhat low level. Here an input is + // a Key for filters, or std::pair for general PHSF. + + // Adds a range of inputs to the banding, returning true if successful. + // False means none or some may have been successfully added, so it's + // best to Reset this banding before any further use. + // + // Adding can fail even before all the "slots" are completely "full". + // + template + bool AddRange(InputIterator begin, InputIterator end) { + assert(num_starts_ > 0 || TypesAndSettings::kAllowZeroStarts); + if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) { + // Unusual. Can't add any in this case. 
+ return begin == end; + } + // Normal + return BandingAddRange(this, *this, begin, end); + } + + // Adds a range of inputs to the banding, returning true if successful, + // or if unsuccessful, rolls back to state before this call and returns + // false. Caller guarantees that the number of inputs in this batch + // does not exceed `backtrack_size` provided to Reset. + // + // Adding can fail even before all the "slots" are completely "full". + // + template + bool AddRangeOrRollBack(InputIterator begin, InputIterator end) { + assert(num_starts_ > 0 || TypesAndSettings::kAllowZeroStarts); + if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) { + // Unusual. Can't add any in this case. + return begin == end; + } + // else Normal + return BandingAddRange(this, this, *this, begin, end); + } + + // Adds a single input to the banding, returning true if successful. + // If unsuccessful, returns false and banding state is unchanged. + // + // Adding can fail even before all the "slots" are completely "full". + // + bool Add(const AddInput& input) { + // Pointer can act as iterator + return AddRange(&input, &input + 1); + } + + // Return the number of "occupied" rows (with non-zero coefficients stored). + Index GetOccupiedCount() const { + Index count = 0; + if (num_starts_ > 0) { + const Index num_slots = num_starts_ + kCoeffBits - 1; + for (Index i = 0; i < num_slots; ++i) { + if (coeff_rows_[i] != 0) { + ++count; + } + } + } + return count; + } + + // Returns whether a row is "occupied" in the banding (non-zero + // coefficients stored). (Only recommended for debug/test) + bool IsOccupied(Index i) { return coeff_rows_[i] != 0; } + + // ******************************************************************** + // High-level API + + // Iteratively (a) resets the structure for `num_slots`, (b) attempts + // to add the range of inputs, and (c) if unsuccessful, chooses next + // hash seed, until either successful or unsuccessful with all the + // allowed seeds. Returns true if successful. In that case, use + // GetOrdinalSeed() or GetRawSeed() to get the successful seed. + // + // The allowed sequence of hash seeds is determined by + // `starting_ordinal_seed,` the first ordinal seed to be attempted + // (see StandardHasher), and `ordinal_seed_mask,` a bit mask (power of + // two minus one) for the range of ordinal seeds to consider. The + // max number of seeds considered will be ordinal_seed_mask + 1. + // For filters we suggest `starting_ordinal_seed` be chosen randomly + // or round-robin, to minimize false positive correlations between keys. + // + // If unsuccessful, how best to continue is going to be application + // specific. It should be possible to choose parameters such that + // failure is extremely unlikely, using max_seed around 32 to 64. + // (TODO: APIs to help choose parameters) One option for fallback in + // constructing a filter is to construct a Bloom filter instead. + // Increasing num_slots is an option, but should not be used often + // unless construction maximum latency is a concern (rather than + // average running time of construction). Instead, choose parameters + // appropriately and trust that seeds are independent. (Also, + // increasing num_slots without changing hash seed would have a + // significant correlation in success, rather than independence.) 
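  // For example (a sketch; MySettings, keys, and soln are assumed, with soln
  // an InterleavedSolutionStorage such as
  // SerializableInterleavedSolution<MySettings>):
  //
  //   StandardBanding<MySettings> banding;
  //   if (banding.ResetAndFindSeedToSolve(num_slots, keys.begin(),
  //                                       keys.end())) {
  //     soln.BackSubstFrom(banding);
  //     // Persist banding.GetOrdinalSeed() with the filter so queries can
  //     // reconstruct the hasher.
  //   } else {
  //     // All allowed seeds failed; fall back, e.g. to a Bloom filter.
  //   }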
+ template + bool ResetAndFindSeedToSolve(Index num_slots, InputIterator begin, + InputIterator end, + Seed starting_ordinal_seed = 0U, + Seed ordinal_seed_mask = 63U) { + // power of 2 minus 1 + assert((ordinal_seed_mask & (ordinal_seed_mask + 1)) == 0); + // starting seed is within mask + assert((starting_ordinal_seed & ordinal_seed_mask) == + starting_ordinal_seed); + starting_ordinal_seed &= ordinal_seed_mask; // if not debug + + Seed cur_ordinal_seed = starting_ordinal_seed; + do { + StandardHasher::SetOrdinalSeed(cur_ordinal_seed); + Reset(num_slots); + bool success = AddRange(begin, end); + if (success) { + return true; + } + cur_ordinal_seed = (cur_ordinal_seed + 1) & ordinal_seed_mask; + } while (cur_ordinal_seed != starting_ordinal_seed); + // Reached limit by circling around + return false; + } + + static std::size_t EstimateMemoryUsage(uint32_t num_slots) { + std::size_t bytes_coeff_rows = num_slots * sizeof(CoeffRow); + std::size_t bytes_result_rows = num_slots * sizeof(ResultRow); + std::size_t bytes_backtrack = 0; + std::size_t bytes_banding = + bytes_coeff_rows + bytes_result_rows + bytes_backtrack; + + return bytes_banding; + } + + protected: + // TODO: explore combining in a struct + std::unique_ptr coeff_rows_; + std::unique_ptr result_rows_; + // We generally store "starts" instead of slots for speed of GetStart(), + // as in StandardHasher. + Index num_starts_ = 0; + Index num_slots_allocated_ = 0; + std::unique_ptr backtrack_; + Index backtrack_size_ = 0; +}; + +// Implements concept SimpleSolutionStorage, mostly for demonstration +// purposes. This is "in memory" only because it does not handle byte +// ordering issues for serialization. +template +class InMemSimpleSolution { + public: + IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings); + + void PrepareForNumStarts(Index num_starts) { + if (TypesAndSettings::kAllowZeroStarts && num_starts == 0) { + // Unusual + num_starts_ = 0; + } else { + // Normal + const Index num_slots = num_starts + kCoeffBits - 1; + assert(num_slots >= kCoeffBits); + if (num_slots > num_slots_allocated_) { + // Do not need to init the memory + solution_rows_.reset(new ResultRow[num_slots]); + num_slots_allocated_ = num_slots; + } + num_starts_ = num_starts; + } + } + + Index GetNumStarts() const { return num_starts_; } + + ResultRow Load(Index slot_num) const { return solution_rows_[slot_num]; } + + void Store(Index slot_num, ResultRow solution_row) { + solution_rows_[slot_num] = solution_row; + } + + // ******************************************************************** + // High-level API + + template + void BackSubstFrom(const BandingStorage& bs) { + if (TypesAndSettings::kAllowZeroStarts && bs.GetNumStarts() == 0) { + // Unusual + PrepareForNumStarts(0); + } else { + // Normal + SimpleBackSubst(this, bs); + } + } + + template + ResultRow PhsfQuery(const Key& input, const PhsfQueryHasher& hasher) const { + // assert(!TypesAndSettings::kIsFilter); Can be useful in testing + if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) { + // Unusual + return 0; + } else { + // Normal + return SimplePhsfQuery(input, hasher, *this); + } + } + + template + bool FilterQuery(const Key& input, const FilterQueryHasher& hasher) const { + assert(TypesAndSettings::kIsFilter); + if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) { + // Unusual. Zero starts presumes no keys added -> always false + return false; + } else { + // Normal, or upper_num_columns_ == 0 means "no space for data" and + // thus will always return true. 
+ return SimpleFilterQuery(input, hasher, *this); + } + } + + double ExpectedFpRate() const { + assert(TypesAndSettings::kIsFilter); + if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) { + // Unusual, but we don't have FPs if we always return false. + return 0.0; + } + // else Normal + + // Each result (solution) bit (column) cuts FP rate in half + return std::pow(0.5, 8U * sizeof(ResultRow)); + } + + // ******************************************************************** + // Static high-level API + + // Round up to a number of slots supported by this structure. Note that + // this needs to be must be taken into account for the banding if this + // solution layout/storage is to be used. + static Index RoundUpNumSlots(Index num_slots) { + // Must be at least kCoeffBits for at least one start + // Or if not smash, even more because hashing not equipped + // for stacking up so many entries on a single start location + auto min_slots = kCoeffBits * (TypesAndSettings::kUseSmash ? 1 : 2); + return std::max(num_slots, static_cast(min_slots)); + } + + protected: + // We generally store "starts" instead of slots for speed of GetStart(), + // as in StandardHasher. + Index num_starts_ = 0; + Index num_slots_allocated_ = 0; + std::unique_ptr solution_rows_; +}; + +// Implements concept InterleavedSolutionStorage always using little-endian +// byte order, so easy for serialization/deserialization. This implementation +// fully supports fractional bits per key, where any number of segments +// (number of bytes multiple of sizeof(CoeffRow)) can be used with any number +// of slots that is a multiple of kCoeffBits. +// +// The structure is passed an externally allocated/de-allocated byte buffer +// that is optionally pre-populated (from storage) for answering queries, +// or can be populated by BackSubstFrom. 
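// For example (a sketch; MySettings, num_slots, banding, and key are
// assumed, with num_slots already rounded via RoundUpNumSlots):
//
//   size_t bytes = SerializableInterleavedSolution<MySettings>::
//       GetBytesForFpRate(num_slots, /*desired_fp_rate=*/0.01,
//                         /*rounding_bias32=*/0);
//   std::unique_ptr<char[]> buf(new char[bytes]());
//   SerializableInterleavedSolution<MySettings> soln(buf.get(), bytes);
//   soln.BackSubstFrom(banding);   // populate from a solved banding, or
//                                  // reuse a buffer loaded from storage
//   bool may_match = soln.FilterQuery(key, /*hasher=*/banding);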
+// +template +class SerializableInterleavedSolution { + public: + IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings); + + // Does not take ownership of `data` but uses it (up to `data_len` bytes) + // throughout lifetime + SerializableInterleavedSolution(char* data, size_t data_len) + : data_(data), data_len_(data_len) {} + + void PrepareForNumStarts(Index num_starts) { + assert(num_starts == 0 || (num_starts % kCoeffBits == 1)); + num_starts_ = num_starts; + + InternalConfigure(); + } + + Index GetNumStarts() const { return num_starts_; } + + Index GetNumBlocks() const { + const Index num_slots = num_starts_ + kCoeffBits - 1; + return num_slots / kCoeffBits; + } + + Index GetUpperNumColumns() const { return upper_num_columns_; } + + Index GetUpperStartBlock() const { return upper_start_block_; } + + Index GetNumSegments() const { + return static_cast(data_len_ / sizeof(CoeffRow)); + } + + CoeffRow LoadSegment(Index segment_num) const { + assert(data_ != nullptr); // suppress clang analyzer report + return DecodeFixedGeneric(data_ + segment_num * sizeof(CoeffRow)); + } + void StoreSegment(Index segment_num, CoeffRow val) { + assert(data_ != nullptr); // suppress clang analyzer report + EncodeFixedGeneric(data_ + segment_num * sizeof(CoeffRow), val); + } + void PrefetchSegmentRange(Index begin_segment_num, + Index end_segment_num) const { + if (end_segment_num == begin_segment_num) { + // Nothing to do + return; + } + char* cur = data_ + begin_segment_num * sizeof(CoeffRow); + char* last = data_ + (end_segment_num - 1) * sizeof(CoeffRow); + while (cur < last) { + PREFETCH(cur, 0 /* rw */, 1 /* locality */); + cur += CACHE_LINE_SIZE; + } + PREFETCH(last, 0 /* rw */, 1 /* locality */); + } + + // ******************************************************************** + // High-level API + + void ConfigureForNumBlocks(Index num_blocks) { + if (num_blocks == 0) { + PrepareForNumStarts(0); + } else { + PrepareForNumStarts(num_blocks * kCoeffBits - kCoeffBits + 1); + } + } + + void ConfigureForNumSlots(Index num_slots) { + assert(num_slots % kCoeffBits == 0); + ConfigureForNumBlocks(num_slots / kCoeffBits); + } + + template + void BackSubstFrom(const BandingStorage& bs) { + if (TypesAndSettings::kAllowZeroStarts && bs.GetNumStarts() == 0) { + // Unusual + PrepareForNumStarts(0); + } else { + // Normal + InterleavedBackSubst(this, bs); + } + } + + template + ResultRow PhsfQuery(const Key& input, const PhsfQueryHasher& hasher) const { + // assert(!TypesAndSettings::kIsFilter); Can be useful in testing + if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) { + // Unusual + return 0; + } else { + // Normal + // NOTE: not using a struct to encourage compiler optimization + Hash hash; + Index segment_num; + Index num_columns; + Index start_bit; + InterleavedPrepareQuery(input, hasher, *this, &hash, &segment_num, + &num_columns, &start_bit); + return InterleavedPhsfQuery(hash, segment_num, num_columns, start_bit, + hasher, *this); + } + } + + template + bool FilterQuery(const Key& input, const FilterQueryHasher& hasher) const { + assert(TypesAndSettings::kIsFilter); + if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) { + // Unusual. Zero starts presumes no keys added -> always false + return false; + } else { + // Normal, or upper_num_columns_ == 0 means "no space for data" and + // thus will always return true. 
+      // NOTE: not using a struct to encourage compiler optimization
+      Hash hash;
+      Index segment_num;
+      Index num_columns;
+      Index start_bit;
+      InterleavedPrepareQuery(input, hasher, *this, &hash, &segment_num,
+                              &num_columns, &start_bit);
+      return InterleavedFilterQuery(hash, segment_num, num_columns, start_bit,
+                                    hasher, *this);
+    }
+  }
+
+  double ExpectedFpRate() const {
+    assert(TypesAndSettings::kIsFilter);
+    if (TypesAndSettings::kAllowZeroStarts && num_starts_ == 0) {
+      // Unusual. Zero starts presumes no keys added -> always false
+      return 0.0;
+    }
+    // else Normal
+
+    // Note: Ignoring smash setting; still close enough in that case
+    double lower_portion =
+        (upper_start_block_ * 1.0 * kCoeffBits) / num_starts_;
+
+    // Each result (solution) bit (column) cuts FP rate in half. Weight that
+    // for upper and lower number of bits (columns).
+    return lower_portion * std::pow(0.5, upper_num_columns_ - 1) +
+           (1.0 - lower_portion) * std::pow(0.5, upper_num_columns_);
+  }
+
+  // ********************************************************************
+  // Static high-level API
+
+  // Round up to a number of slots supported by this structure. Note that
+  // this must be taken into account for the banding if this
+  // solution layout/storage is to be used.
+  static Index RoundUpNumSlots(Index num_slots) {
+    // Must be multiple of kCoeffBits
+    Index corrected = (num_slots + kCoeffBits - 1) / kCoeffBits * kCoeffBits;
+
+    // Do not use num_starts==1 unless kUseSmash, because the hashing
+    // might not be equipped for stacking up so many entries on a
+    // single start location.
+    if (!TypesAndSettings::kUseSmash && corrected == kCoeffBits) {
+      corrected += kCoeffBits;
+    }
+    return corrected;
+  }
+
+  // Round down to a number of slots supported by this structure. Note that
+  // this must be taken into account for the banding if this
+  // solution layout/storage is to be used.
+  static Index RoundDownNumSlots(Index num_slots) {
+    // Must be multiple of kCoeffBits
+    Index corrected = num_slots / kCoeffBits * kCoeffBits;
+
+    // Do not use num_starts==1 unless kUseSmash, because the hashing
+    // might not be equipped for stacking up so many entries on a
+    // single start location.
+    if (!TypesAndSettings::kUseSmash && corrected == kCoeffBits) {
+      corrected = 0;
+    }
+    return corrected;
+  }
+
+  // Compute the number of bytes for a given number of slots and desired
+  // FP rate. Since desired FP rate might not be exactly achievable,
+  // rounding_bias32==0 means to always round toward lower FP rate
+  // than desired (more bytes); rounding_bias32==max uint32_t means always
+  // round toward higher FP rate than desired (fewer bytes); other values
+  // act as a proportional threshold or bias between the two.
+  static size_t GetBytesForFpRate(Index num_slots, double desired_fp_rate,
+                                  uint32_t rounding_bias32) {
+    return InternalGetBytesForFpRate(num_slots, desired_fp_rate,
+                                     1.0 / desired_fp_rate, rounding_bias32);
+  }
+
+  // The same, but specifying desired accuracy as 1.0 / FP rate, or
+  // one_in_fp_rate. E.g. desired_one_in_fp_rate=100 means 1% FP rate.
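As an aside before the sizing helpers continue below, a standalone sketch of the fractional-column weighting applied by the interleaved ExpectedFpRate above. It assumes num_starts is close to num_blocks * kCoeffBits, so fractions of blocks stand in for fractions of starts; the function and its names are illustrative, not part of the patch.

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Illustrative only: blocks before upper_start_block carry one fewer
    // solution column than the rest, and each column halves the FP rate.
    double InterleavedFpRateSketch(uint32_t num_blocks, uint32_t num_segments) {
      uint32_t upper_cols = (num_segments + num_blocks - 1) / num_blocks;
      uint32_t upper_start_block = upper_cols * num_blocks - num_segments;
      double lower_portion = 1.0 * upper_start_block / num_blocks;
      return lower_portion * std::pow(0.5, upper_cols - 1) +
             (1.0 - lower_portion) * std::pow(0.5, upper_cols);
    }

    int main() {
      // 100 blocks, 750 segments -> 7.5 columns per slot on average,
      // so the rate lands between 1/128 and 1/256 (about 1/171 here).
      std::printf("%g\n", InterleavedFpRateSketch(100, 750));
      return 0;
    }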
+ static size_t GetBytesForOneInFpRate(Index num_slots, + double desired_one_in_fp_rate, + uint32_t rounding_bias32) { + return InternalGetBytesForFpRate(num_slots, 1.0 / desired_one_in_fp_rate, + desired_one_in_fp_rate, rounding_bias32); + } + + protected: + static size_t InternalGetBytesForFpRate(Index num_slots, + double desired_fp_rate, + double desired_one_in_fp_rate, + uint32_t rounding_bias32) { + assert(TypesAndSettings::kIsFilter); + if (TypesAndSettings::kAllowZeroStarts) { + if (num_slots == 0) { + // Unusual. Zero starts presumes no keys added -> always false (no FPs) + return 0U; + } + } else { + assert(num_slots > 0); + } + // Must be rounded up already. + assert(RoundUpNumSlots(num_slots) == num_slots); + + if (desired_one_in_fp_rate > 1.0 && desired_fp_rate < 1.0) { + // Typical: less than 100% FP rate + if (desired_one_in_fp_rate <= static_cast(-1)) { + // Typical: Less than maximum result row entropy + ResultRow rounded = static_cast(desired_one_in_fp_rate); + int lower_columns = FloorLog2(rounded); + double lower_columns_fp_rate = std::pow(2.0, -lower_columns); + double upper_columns_fp_rate = std::pow(2.0, -(lower_columns + 1)); + // Floating point don't let me down! + assert(lower_columns_fp_rate >= desired_fp_rate); + assert(upper_columns_fp_rate <= desired_fp_rate); + + double lower_portion = (desired_fp_rate - upper_columns_fp_rate) / + (lower_columns_fp_rate - upper_columns_fp_rate); + // Floating point don't let me down! + assert(lower_portion >= 0.0); + assert(lower_portion <= 1.0); + + double rounding_bias = (rounding_bias32 + 0.5) / double{0x100000000}; + assert(rounding_bias > 0.0); + assert(rounding_bias < 1.0); + + // Note: Ignoring smash setting; still close enough in that case + Index num_starts = num_slots - kCoeffBits + 1; + // Lower upper_start_block means lower FP rate (higher accuracy) + Index upper_start_block = static_cast( + (lower_portion * num_starts + rounding_bias) / kCoeffBits); + Index num_blocks = num_slots / kCoeffBits; + assert(upper_start_block < num_blocks); + + // Start by assuming all blocks use lower number of columns + Index num_segments = num_blocks * static_cast(lower_columns); + // Correct by 1 each for blocks using upper number of columns + num_segments += (num_blocks - upper_start_block); + // Total bytes + return num_segments * sizeof(CoeffRow); + } else { + // one_in_fp_rate too big, thus requested FP rate is smaller than + // supported. Use max number of columns for minimum supported FP rate. + return num_slots * sizeof(ResultRow); + } + } else { + // Effectively asking for 100% FP rate, or NaN etc. 
+ if (TypesAndSettings::kAllowZeroStarts) { + // Zero segments + return 0U; + } else { + // One segment (minimum size, maximizing FP rate) + return sizeof(CoeffRow); + } + } + } + + void InternalConfigure() { + const Index num_blocks = GetNumBlocks(); + Index num_segments = GetNumSegments(); + + if (num_blocks == 0) { + // Exceptional + upper_num_columns_ = 0; + upper_start_block_ = 0; + } else { + // Normal + upper_num_columns_ = + (num_segments + /*round up*/ num_blocks - 1) / num_blocks; + upper_start_block_ = upper_num_columns_ * num_blocks - num_segments; + // Unless that's more columns than supported by ResultRow data type + if (upper_num_columns_ > 8U * sizeof(ResultRow)) { + // Use maximum columns (there will be space unused) + upper_num_columns_ = static_cast(8U * sizeof(ResultRow)); + upper_start_block_ = 0; + num_segments = num_blocks * upper_num_columns_; + } + } + // Update data_len_ for correct rounding and/or unused space + // NOTE: unused space stays gone if we PrepareForNumStarts again. + // We are prioritizing minimizing the number of fields over making + // the "unusued space" feature work well. + data_len_ = num_segments * sizeof(CoeffRow); + } + + char* const data_; + size_t data_len_; + Index num_starts_ = 0; + Index upper_num_columns_ = 0; + Index upper_start_block_ = 0; +}; + +} // namespace ribbon + +} // namespace ROCKSDB_NAMESPACE + +// For convenience working with templates +#define IMPORT_RIBBON_IMPL_TYPES(TypesAndSettings) \ + using Hasher = ROCKSDB_NAMESPACE::ribbon::StandardHasher; \ + using Banding = \ + ROCKSDB_NAMESPACE::ribbon::StandardBanding; \ + using SimpleSoln = \ + ROCKSDB_NAMESPACE::ribbon::InMemSimpleSolution; \ + using InterleavedSoln = \ + ROCKSDB_NAMESPACE::ribbon::SerializableInterleavedSolution< \ + TypesAndSettings>; \ + static_assert(sizeof(Hasher) + sizeof(Banding) + sizeof(SimpleSoln) + \ + sizeof(InterleavedSoln) > \ + 0, \ + "avoid unused warnings, semicolon expected after macro call") diff --git a/src/rocksdb/util/ribbon_test.cc b/src/rocksdb/util/ribbon_test.cc new file mode 100644 index 000000000..6519df3d5 --- /dev/null +++ b/src/rocksdb/util/ribbon_test.cc @@ -0,0 +1,1308 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "rocksdb/system_clock.h" +#include "test_util/testharness.h" +#include "util/bloom_impl.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/ribbon_config.h" +#include "util/ribbon_impl.h" +#include "util/stop_watch.h" +#include "util/string_util.h" + +#ifndef GFLAGS +uint32_t FLAGS_thoroughness = 5; +uint32_t FLAGS_max_add = 0; +uint32_t FLAGS_min_check = 4000; +uint32_t FLAGS_max_check = 100000; +bool FLAGS_verbose = false; + +bool FLAGS_find_occ = false; +bool FLAGS_find_slot_occ = false; +double FLAGS_find_next_factor = 1.618; +uint32_t FLAGS_find_iters = 10000; +uint32_t FLAGS_find_min_slots = 128; +uint32_t FLAGS_find_max_slots = 1000000; + +bool FLAGS_optimize_homog = false; +uint32_t FLAGS_optimize_homog_slots = 30000000; +uint32_t FLAGS_optimize_homog_check = 200000; +double FLAGS_optimize_homog_granularity = 0.002; +#else +#include "util/gflags_compat.h" +using GFLAGS_NAMESPACE::ParseCommandLineFlags; +// Using 500 is a good test when you have time to be thorough. +// Default is for general RocksDB regression test runs. 
+DEFINE_uint32(thoroughness, 5, "iterations per configuration"); +DEFINE_uint32(max_add, 0, + "Add up to this number of entries to a single filter in " + "CompactnessAndBacktrackAndFpRate; 0 == reasonable default"); +DEFINE_uint32(min_check, 4000, + "Minimum number of novel entries for testing FP rate"); +DEFINE_uint32(max_check, 10000, + "Maximum number of novel entries for testing FP rate"); +DEFINE_bool(verbose, false, "Print extra details"); + +// Options for FindOccupancy, which is more of a tool than a test. +DEFINE_bool(find_occ, false, "whether to run the FindOccupancy tool"); +DEFINE_bool(find_slot_occ, false, + "whether to show individual slot occupancies with " + "FindOccupancy tool"); +DEFINE_double(find_next_factor, 1.618, + "factor to next num_slots for FindOccupancy"); +DEFINE_uint32(find_iters, 10000, "number of samples for FindOccupancy"); +DEFINE_uint32(find_min_slots, 128, "number of slots for FindOccupancy"); +DEFINE_uint32(find_max_slots, 1000000, "number of slots for FindOccupancy"); + +// Options for OptimizeHomogAtScale, which is more of a tool than a test. +DEFINE_bool(optimize_homog, false, + "whether to run the OptimizeHomogAtScale tool"); +DEFINE_uint32(optimize_homog_slots, 30000000, + "number of slots for OptimizeHomogAtScale"); +DEFINE_uint32(optimize_homog_check, 200000, + "number of queries for checking FP rate in OptimizeHomogAtScale"); +DEFINE_double( + optimize_homog_granularity, 0.002, + "overhead change between FP rate checking in OptimizeHomogAtScale"); + +#endif // GFLAGS + +template +class RibbonTypeParamTest : public ::testing::Test {}; + +class RibbonTest : public ::testing::Test {}; + +namespace { + +// Different ways of generating keys for testing + +// Generate semi-sequential keys +struct StandardKeyGen { + StandardKeyGen(const std::string& prefix, uint64_t id) + : id_(id), str_(prefix) { + ROCKSDB_NAMESPACE::PutFixed64(&str_, /*placeholder*/ 0); + } + + // Prefix (only one required) + StandardKeyGen& operator++() { + ++id_; + return *this; + } + + StandardKeyGen& operator+=(uint64_t i) { + id_ += i; + return *this; + } + + const std::string& operator*() { + // Use multiplication to mix things up a little in the key + ROCKSDB_NAMESPACE::EncodeFixed64(&str_[str_.size() - 8], + id_ * uint64_t{0x1500000001}); + return str_; + } + + bool operator==(const StandardKeyGen& other) { + // Same prefix is assumed + return id_ == other.id_; + } + bool operator!=(const StandardKeyGen& other) { + // Same prefix is assumed + return id_ != other.id_; + } + + uint64_t id_; + std::string str_; +}; + +// Generate small sequential keys, that can misbehave with sequential seeds +// as in https://github.com/Cyan4973/xxHash/issues/469. +// These keys are only heuristically unique, but that's OK with 64 bits, +// for testing purposes. 
+struct SmallKeyGen { + SmallKeyGen(const std::string& prefix, uint64_t id) : id_(id) { + // Hash the prefix for a heuristically unique offset + id_ += ROCKSDB_NAMESPACE::GetSliceHash64(prefix); + ROCKSDB_NAMESPACE::PutFixed64(&str_, id_); + } + + // Prefix (only one required) + SmallKeyGen& operator++() { + ++id_; + return *this; + } + + SmallKeyGen& operator+=(uint64_t i) { + id_ += i; + return *this; + } + + const std::string& operator*() { + ROCKSDB_NAMESPACE::EncodeFixed64(&str_[str_.size() - 8], id_); + return str_; + } + + bool operator==(const SmallKeyGen& other) { return id_ == other.id_; } + bool operator!=(const SmallKeyGen& other) { return id_ != other.id_; } + + uint64_t id_; + std::string str_; +}; + +template +struct Hash32KeyGenWrapper : public KeyGen { + Hash32KeyGenWrapper(const std::string& prefix, uint64_t id) + : KeyGen(prefix, id) {} + uint32_t operator*() { + auto& key = *static_cast(*this); + // unseeded + return ROCKSDB_NAMESPACE::GetSliceHash(key); + } +}; + +template +struct Hash64KeyGenWrapper : public KeyGen { + Hash64KeyGenWrapper(const std::string& prefix, uint64_t id) + : KeyGen(prefix, id) {} + uint64_t operator*() { + auto& key = *static_cast(*this); + // unseeded + return ROCKSDB_NAMESPACE::GetSliceHash64(key); + } +}; + +using ROCKSDB_NAMESPACE::ribbon::ConstructionFailureChance; + +const std::vector kFailureOnly50Pct = { + ROCKSDB_NAMESPACE::ribbon::kOneIn2}; + +const std::vector kFailureOnlyRare = { + ROCKSDB_NAMESPACE::ribbon::kOneIn1000}; + +const std::vector kFailureAll = { + ROCKSDB_NAMESPACE::ribbon::kOneIn2, ROCKSDB_NAMESPACE::ribbon::kOneIn20, + ROCKSDB_NAMESPACE::ribbon::kOneIn1000}; + +} // namespace + +using ROCKSDB_NAMESPACE::ribbon::ExpectedCollisionFpRate; +using ROCKSDB_NAMESPACE::ribbon::StandardHasher; +using ROCKSDB_NAMESPACE::ribbon::StandardRehasherAdapter; + +struct DefaultTypesAndSettings { + using CoeffRow = ROCKSDB_NAMESPACE::Unsigned128; + using ResultRow = uint8_t; + using Index = uint32_t; + using Hash = uint64_t; + using Seed = uint32_t; + using Key = ROCKSDB_NAMESPACE::Slice; + static constexpr bool kIsFilter = true; + static constexpr bool kHomogeneous = false; + static constexpr bool kFirstCoeffAlwaysOne = true; + static constexpr bool kUseSmash = false; + static constexpr bool kAllowZeroStarts = false; + static Hash HashFn(const Key& key, uint64_t raw_seed) { + // This version 0.7.2 preview of XXH3 (a.k.a. XXPH3) function does + // not pass SmallKeyGen tests below without some seed premixing from + // StandardHasher. 
See https://github.com/Cyan4973/xxHash/issues/469 + return ROCKSDB_NAMESPACE::Hash64(key.data(), key.size(), raw_seed); + } + // For testing + using KeyGen = StandardKeyGen; + static const std::vector& FailureChanceToTest() { + return kFailureAll; + } +}; + +using TypesAndSettings_Coeff128 = DefaultTypesAndSettings; +struct TypesAndSettings_Coeff128Smash : public DefaultTypesAndSettings { + static constexpr bool kUseSmash = true; +}; +struct TypesAndSettings_Coeff64 : public DefaultTypesAndSettings { + using CoeffRow = uint64_t; +}; +struct TypesAndSettings_Coeff64Smash : public TypesAndSettings_Coeff64 { + static constexpr bool kUseSmash = true; +}; +struct TypesAndSettings_Coeff64Smash0 : public TypesAndSettings_Coeff64Smash { + static constexpr bool kFirstCoeffAlwaysOne = false; +}; + +// Homogeneous Ribbon configurations +struct TypesAndSettings_Coeff128_Homog : public DefaultTypesAndSettings { + static constexpr bool kHomogeneous = true; + // Since our best construction success setting still has 1/1000 failure + // rate, the best FP rate we test is 1/256 + using ResultRow = uint8_t; + // Homogeneous only makes sense with sufficient slots for equivalent of + // almost sure construction success + static const std::vector& FailureChanceToTest() { + return kFailureOnlyRare; + } +}; +struct TypesAndSettings_Coeff128Smash_Homog + : public TypesAndSettings_Coeff128_Homog { + // Smash (extra time to save space) + Homog (extra space to save time) + // doesn't make much sense in practice, but we minimally test it + static constexpr bool kUseSmash = true; +}; +struct TypesAndSettings_Coeff64_Homog : public TypesAndSettings_Coeff128_Homog { + using CoeffRow = uint64_t; +}; +struct TypesAndSettings_Coeff64Smash_Homog + : public TypesAndSettings_Coeff64_Homog { + // Smash (extra time to save space) + Homog (extra space to save time) + // doesn't make much sense in practice, but we minimally test it + static constexpr bool kUseSmash = true; +}; + +// Less exhaustive mix of coverage, but still covering the most stressful case +// (only 50% construction success) +struct AbridgedTypesAndSettings : public DefaultTypesAndSettings { + static const std::vector& FailureChanceToTest() { + return kFailureOnly50Pct; + } +}; +struct TypesAndSettings_Result16 : public AbridgedTypesAndSettings { + using ResultRow = uint16_t; +}; +struct TypesAndSettings_Result32 : public AbridgedTypesAndSettings { + using ResultRow = uint32_t; +}; +struct TypesAndSettings_IndexSizeT : public AbridgedTypesAndSettings { + using Index = size_t; +}; +struct TypesAndSettings_Hash32 : public AbridgedTypesAndSettings { + using Hash = uint32_t; + static Hash HashFn(const Key& key, Hash raw_seed) { + // This MurmurHash1 function does not pass tests below without the + // seed premixing from StandardHasher. In fact, it needs more than + // just a multiplication mixer on the ordinal seed. + return ROCKSDB_NAMESPACE::Hash(key.data(), key.size(), raw_seed); + } +}; +struct TypesAndSettings_Hash32_Result16 : public AbridgedTypesAndSettings { + using ResultRow = uint16_t; +}; +struct TypesAndSettings_KeyString : public AbridgedTypesAndSettings { + using Key = std::string; +}; +struct TypesAndSettings_Seed8 : public AbridgedTypesAndSettings { + // This is not a generally recommended configuration. With the configured + // hash function, it would fail with SmallKeyGen due to insufficient + // independence among the seeds. 
+ using Seed = uint8_t; +}; +struct TypesAndSettings_NoAlwaysOne : public AbridgedTypesAndSettings { + static constexpr bool kFirstCoeffAlwaysOne = false; +}; +struct TypesAndSettings_AllowZeroStarts : public AbridgedTypesAndSettings { + static constexpr bool kAllowZeroStarts = true; +}; +struct TypesAndSettings_Seed64 : public AbridgedTypesAndSettings { + using Seed = uint64_t; +}; +struct TypesAndSettings_Rehasher + : public StandardRehasherAdapter { + using KeyGen = Hash64KeyGenWrapper; +}; +struct TypesAndSettings_Rehasher_Result16 : public TypesAndSettings_Rehasher { + using ResultRow = uint16_t; +}; +struct TypesAndSettings_Rehasher_Result32 : public TypesAndSettings_Rehasher { + using ResultRow = uint32_t; +}; +struct TypesAndSettings_Rehasher_Seed64 + : public StandardRehasherAdapter { + using KeyGen = Hash64KeyGenWrapper; + // Note: 64-bit seed with Rehasher gives slightly better average reseeds +}; +struct TypesAndSettings_Rehasher32 + : public StandardRehasherAdapter { + using KeyGen = Hash32KeyGenWrapper; +}; +struct TypesAndSettings_Rehasher32_Coeff64 + : public TypesAndSettings_Rehasher32 { + using CoeffRow = uint64_t; +}; +struct TypesAndSettings_SmallKeyGen : public AbridgedTypesAndSettings { + // SmallKeyGen stresses the independence of different hash seeds + using KeyGen = SmallKeyGen; +}; +struct TypesAndSettings_Hash32_SmallKeyGen : public TypesAndSettings_Hash32 { + // SmallKeyGen stresses the independence of different hash seeds + using KeyGen = SmallKeyGen; +}; +struct TypesAndSettings_Coeff32 : public DefaultTypesAndSettings { + using CoeffRow = uint32_t; +}; +struct TypesAndSettings_Coeff32Smash : public TypesAndSettings_Coeff32 { + static constexpr bool kUseSmash = true; +}; +struct TypesAndSettings_Coeff16 : public DefaultTypesAndSettings { + using CoeffRow = uint16_t; +}; +struct TypesAndSettings_Coeff16Smash : public TypesAndSettings_Coeff16 { + static constexpr bool kUseSmash = true; +}; + +using TestTypesAndSettings = ::testing::Types< + TypesAndSettings_Coeff128, TypesAndSettings_Coeff128Smash, + TypesAndSettings_Coeff64, TypesAndSettings_Coeff64Smash, + TypesAndSettings_Coeff64Smash0, TypesAndSettings_Coeff128_Homog, + TypesAndSettings_Coeff128Smash_Homog, TypesAndSettings_Coeff64_Homog, + TypesAndSettings_Coeff64Smash_Homog, TypesAndSettings_Result16, + TypesAndSettings_Result32, TypesAndSettings_IndexSizeT, + TypesAndSettings_Hash32, TypesAndSettings_Hash32_Result16, + TypesAndSettings_KeyString, TypesAndSettings_Seed8, + TypesAndSettings_NoAlwaysOne, TypesAndSettings_AllowZeroStarts, + TypesAndSettings_Seed64, TypesAndSettings_Rehasher, + TypesAndSettings_Rehasher_Result16, TypesAndSettings_Rehasher_Result32, + TypesAndSettings_Rehasher_Seed64, TypesAndSettings_Rehasher32, + TypesAndSettings_Rehasher32_Coeff64, TypesAndSettings_SmallKeyGen, + TypesAndSettings_Hash32_SmallKeyGen, TypesAndSettings_Coeff32, + TypesAndSettings_Coeff32Smash, TypesAndSettings_Coeff16, + TypesAndSettings_Coeff16Smash>; +TYPED_TEST_CASE(RibbonTypeParamTest, TestTypesAndSettings); + +namespace { + +// For testing Poisson-distributed (or similar) statistics, get value for +// `stddevs_allowed` standard deviations above expected mean +// `expected_count`. +// (Poisson approximates Binomial only if probability of a trial being +// in the count is low.) 
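To make the tolerance described above concrete: with 100000 novel queries against an 8-bit ResultRow filter, about 390 false positives are expected, and a 5-standard-deviation Poisson window allows roughly 100 either way. A standalone worked example, illustrative only and merely mirroring the helpers that follow:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Illustrative only: bounds at `stddevs` standard deviations around a
    // Poisson mean, as used by the test statistics below.
    uint64_t UpperBound(double mean, double stddevs) {
      return static_cast<uint64_t>(mean + stddevs * std::sqrt(mean) + 1.0);
    }
    uint64_t LowerBound(double mean, double stddevs) {
      return static_cast<uint64_t>(std::max(0.0, mean - stddevs * std::sqrt(mean)));
    }

    int main() {
      // 100000 novel queries, 8-bit result rows -> FP rate 1/256
      double expected_fps = 100000 * std::pow(0.5, 8);
      std::printf("expected ~%g FPs, accept [%llu, %llu]\n", expected_fps,
                  (unsigned long long)LowerBound(expected_fps, 5.0),
                  (unsigned long long)UpperBound(expected_fps, 5.0));
      return 0;
    }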
+uint64_t PoissonUpperBound(double expected_count, double stddevs_allowed) { + return static_cast( + expected_count + stddevs_allowed * std::sqrt(expected_count) + 1.0); +} + +uint64_t PoissonLowerBound(double expected_count, double stddevs_allowed) { + return static_cast(std::max( + 0.0, expected_count - stddevs_allowed * std::sqrt(expected_count))); +} + +uint64_t FrequentPoissonUpperBound(double expected_count) { + // Allow up to 5.0 standard deviations for frequently checked statistics + return PoissonUpperBound(expected_count, 5.0); +} + +uint64_t FrequentPoissonLowerBound(double expected_count) { + return PoissonLowerBound(expected_count, 5.0); +} + +uint64_t InfrequentPoissonUpperBound(double expected_count) { + // Allow up to 3 standard deviations for infrequently checked statistics + return PoissonUpperBound(expected_count, 3.0); +} + +uint64_t InfrequentPoissonLowerBound(double expected_count) { + return PoissonLowerBound(expected_count, 3.0); +} + +} // namespace + +TYPED_TEST(RibbonTypeParamTest, CompactnessAndBacktrackAndFpRate) { + IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam); + IMPORT_RIBBON_IMPL_TYPES(TypeParam); + using KeyGen = typename TypeParam::KeyGen; + using ConfigHelper = + ROCKSDB_NAMESPACE::ribbon::BandingConfigHelper; + + if (sizeof(CoeffRow) < 8) { + ROCKSDB_GTEST_BYPASS("Not fully supported"); + return; + } + + const auto log2_thoroughness = + static_cast(ROCKSDB_NAMESPACE::FloorLog2(FLAGS_thoroughness)); + + // We are going to choose num_to_add using an exponential distribution, + // so that we have good representation of small-to-medium filters. + // Here we just pick some reasonable, practical upper bound based on + // kCoeffBits or option. + const double log_max_add = std::log( + FLAGS_max_add > 0 ? FLAGS_max_add + : static_cast(kCoeffBits * kCoeffBits) * + std::max(FLAGS_thoroughness, uint32_t{32})); + + // This needs to be enough below the minimum number of slots to get a + // reasonable number of samples with the minimum number of slots. 
+ const double log_min_add = std::log(0.66 * SimpleSoln::RoundUpNumSlots(1)); + + ASSERT_GT(log_max_add, log_min_add); + + const double diff_log_add = log_max_add - log_min_add; + + for (ConstructionFailureChance cs : TypeParam::FailureChanceToTest()) { + double expected_reseeds; + switch (cs) { + default: + assert(false); + FALLTHROUGH_INTENDED; + case ROCKSDB_NAMESPACE::ribbon::kOneIn2: + fprintf(stderr, "== Failure: 50 percent\n"); + expected_reseeds = 1.0; + break; + case ROCKSDB_NAMESPACE::ribbon::kOneIn20: + fprintf(stderr, "== Failure: 95 percent\n"); + expected_reseeds = 0.053; + break; + case ROCKSDB_NAMESPACE::ribbon::kOneIn1000: + fprintf(stderr, "== Failure: 1/1000\n"); + expected_reseeds = 0.001; + break; + } + + uint64_t total_reseeds = 0; + uint64_t total_singles = 0; + uint64_t total_single_failures = 0; + uint64_t total_batch = 0; + uint64_t total_batch_successes = 0; + uint64_t total_fp_count = 0; + uint64_t total_added = 0; + uint64_t total_expand_trials = 0; + uint64_t total_expand_failures = 0; + double total_expand_overhead = 0.0; + + uint64_t soln_query_nanos = 0; + uint64_t soln_query_count = 0; + uint64_t bloom_query_nanos = 0; + uint64_t isoln_query_nanos = 0; + uint64_t isoln_query_count = 0; + + // Take different samples if you change thoroughness + ROCKSDB_NAMESPACE::Random32 rnd(FLAGS_thoroughness); + + for (uint32_t i = 0; i < FLAGS_thoroughness; ++i) { + // We are going to choose num_to_add using an exponential distribution + // as noted above, but instead of randomly choosing them, we generate + // samples linearly using the golden ratio, which ensures a nice spread + // even for a small number of samples, and starting with the minimum + // number of slots to ensure it is tested. + double log_add = + std::fmod(0.6180339887498948482 * diff_log_add * i, diff_log_add) + + log_min_add; + uint32_t num_to_add = static_cast(std::exp(log_add)); + + // Most of the time, test the Interleaved solution storage, but when + // we do we have to make num_slots a multiple of kCoeffBits. So + // sometimes we want to test without that limitation. + bool test_interleaved = (i % 7) != 6; + + // Compute num_slots, and re-adjust num_to_add to get as close as possible + // to next num_slots, to stress that num_slots in terms of construction + // success. 
Ensure at least one iteration: + Index num_slots = Index{0} - 1; + --num_to_add; + for (;;) { + Index next_num_slots = SimpleSoln::RoundUpNumSlots( + ConfigHelper::GetNumSlots(num_to_add + 1, cs)); + if (test_interleaved) { + next_num_slots = InterleavedSoln::RoundUpNumSlots(next_num_slots); + // assert idempotent + EXPECT_EQ(next_num_slots, + InterleavedSoln::RoundUpNumSlots(next_num_slots)); + } + // assert idempotent with InterleavedSoln::RoundUpNumSlots + EXPECT_EQ(next_num_slots, SimpleSoln::RoundUpNumSlots(next_num_slots)); + + if (next_num_slots > num_slots) { + break; + } + num_slots = next_num_slots; + ++num_to_add; + } + assert(num_slots < Index{0} - 1); + + total_added += num_to_add; + + std::string prefix; + ROCKSDB_NAMESPACE::PutFixed32(&prefix, rnd.Next()); + + // Batch that must be added + std::string added_str = prefix + "added"; + KeyGen keys_begin(added_str, 0); + KeyGen keys_end(added_str, num_to_add); + + // A couple more that will probably be added + KeyGen one_more(prefix + "more", 1); + KeyGen two_more(prefix + "more", 2); + + // Batch that may or may not be added + uint32_t batch_size = + static_cast(2.0 * std::sqrt(num_slots - num_to_add)); + if (batch_size < 10U) { + batch_size = 0; + } + std::string batch_str = prefix + "batch"; + KeyGen batch_begin(batch_str, 0); + KeyGen batch_end(batch_str, batch_size); + + // Batch never (successfully) added, but used for querying FP rate + std::string not_str = prefix + "not"; + KeyGen other_keys_begin(not_str, 0); + KeyGen other_keys_end(not_str, FLAGS_max_check); + + double overhead_ratio = 1.0 * num_slots / num_to_add; + if (FLAGS_verbose) { + fprintf(stderr, "Adding(%s) %u / %u Overhead: %g Batch size: %u\n", + test_interleaved ? "i" : "s", (unsigned)num_to_add, + (unsigned)num_slots, overhead_ratio, (unsigned)batch_size); + } + + // Vary bytes for InterleavedSoln to use number of solution columns + // from 0 to max allowed by ResultRow type (and used by SimpleSoln). + // Specifically include 0 and max, and otherwise skew toward max. + uint32_t max_ibytes = + static_cast(sizeof(ResultRow) * num_slots); + size_t ibytes; + if (i == 0) { + ibytes = 0; + } else if (i == 1) { + ibytes = max_ibytes; + } else { + // Skewed + ibytes = + std::max(rnd.Uniformish(max_ibytes), rnd.Uniformish(max_ibytes)); + } + std::unique_ptr idata(new char[ibytes]); + InterleavedSoln isoln(idata.get(), ibytes); + + SimpleSoln soln; + Hasher hasher; + bool first_single; + bool second_single; + bool batch_success; + { + Banding banding; + // Traditional solve for a fixed set. + ASSERT_TRUE( + banding.ResetAndFindSeedToSolve(num_slots, keys_begin, keys_end)); + + Index occupied_count = banding.GetOccupiedCount(); + Index more_added = 0; + + if (TypeParam::kHomogeneous || overhead_ratio < 1.01 || + batch_size == 0) { + // Homogeneous not compatible with backtracking because add + // doesn't fail. Small overhead ratio too packed to expect more + first_single = false; + second_single = false; + batch_success = false; + } else { + // Now to test backtracking, starting with guaranteed fail. By using + // the keys that will be used to test FP rate, we are then doing an + // extra check that after backtracking there are no remnants (e.g. in + // result side of banding) of these entries. 
+ KeyGen other_keys_too_big_end = other_keys_begin; + other_keys_too_big_end += num_to_add; + banding.EnsureBacktrackSize(std::max(num_to_add, batch_size)); + EXPECT_FALSE(banding.AddRangeOrRollBack(other_keys_begin, + other_keys_too_big_end)); + EXPECT_EQ(occupied_count, banding.GetOccupiedCount()); + + // Check that we still have a good chance of adding a couple more + // individually + first_single = banding.Add(*one_more); + second_single = banding.Add(*two_more); + more_added += (first_single ? 1 : 0) + (second_single ? 1 : 0); + total_singles += 2U; + total_single_failures += 2U - more_added; + + // Or as a batch + batch_success = banding.AddRangeOrRollBack(batch_begin, batch_end); + ++total_batch; + if (batch_success) { + more_added += batch_size; + ++total_batch_successes; + } + EXPECT_LE(banding.GetOccupiedCount(), occupied_count + more_added); + } + + // Also verify that redundant adds are OK (no effect) + ASSERT_TRUE( + banding.AddRange(keys_begin, KeyGen(added_str, num_to_add / 8))); + EXPECT_LE(banding.GetOccupiedCount(), occupied_count + more_added); + + // Now back-substitution + soln.BackSubstFrom(banding); + if (test_interleaved) { + isoln.BackSubstFrom(banding); + } + + Seed reseeds = banding.GetOrdinalSeed(); + total_reseeds += reseeds; + + EXPECT_LE(reseeds, 8 + log2_thoroughness); + if (reseeds > log2_thoroughness + 1) { + fprintf( + stderr, "%s high reseeds at %u, %u/%u: %u\n", + reseeds > log2_thoroughness + 8 ? "ERROR Extremely" : "Somewhat", + static_cast(i), static_cast(num_to_add), + static_cast(num_slots), static_cast(reseeds)); + } + + if (reseeds > 0) { + // "Expand" test: given a failed construction, how likely is it to + // pass with same seed and more slots. At each step, we increase + // enough to ensure there is at least one shift within each coeff + // block. + ++total_expand_trials; + Index expand_count = 0; + Index ex_slots = num_slots; + banding.SetOrdinalSeed(0); + for (;; ++expand_count) { + ASSERT_LE(expand_count, log2_thoroughness); + ex_slots += ex_slots / kCoeffBits; + if (test_interleaved) { + ex_slots = InterleavedSoln::RoundUpNumSlots(ex_slots); + } + banding.Reset(ex_slots); + bool success = banding.AddRange(keys_begin, keys_end); + if (success) { + break; + } + } + total_expand_failures += expand_count; + total_expand_overhead += 1.0 * (ex_slots - num_slots) / num_slots; + } + + hasher.SetOrdinalSeed(reseeds); + } + // soln and hasher now independent of Banding object + + // Verify keys added + KeyGen cur = keys_begin; + while (cur != keys_end) { + ASSERT_TRUE(soln.FilterQuery(*cur, hasher)); + ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*cur, hasher)); + ++cur; + } + // We (maybe) snuck these in! + if (first_single) { + ASSERT_TRUE(soln.FilterQuery(*one_more, hasher)); + ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*one_more, hasher)); + } + if (second_single) { + ASSERT_TRUE(soln.FilterQuery(*two_more, hasher)); + ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*two_more, hasher)); + } + if (batch_success) { + cur = batch_begin; + while (cur != batch_end) { + ASSERT_TRUE(soln.FilterQuery(*cur, hasher)); + ASSERT_TRUE(!test_interleaved || isoln.FilterQuery(*cur, hasher)); + ++cur; + } + } + + // Check FP rate (depends only on number of result bits == solution + // columns) + Index fp_count = 0; + cur = other_keys_begin; + { + ROCKSDB_NAMESPACE::StopWatchNano timer( + ROCKSDB_NAMESPACE::SystemClock::Default().get(), true); + while (cur != other_keys_end) { + bool fp = soln.FilterQuery(*cur, hasher); + fp_count += fp ? 
1 : 0; + ++cur; + } + soln_query_nanos += timer.ElapsedNanos(); + soln_query_count += FLAGS_max_check; + } + { + double expected_fp_count = soln.ExpectedFpRate() * FLAGS_max_check; + // For expected FP rate, also include false positives due to collisions + // in Hash value. (Negligible for 64-bit, can matter for 32-bit.) + double correction = + FLAGS_max_check * ExpectedCollisionFpRate(hasher, num_to_add); + + // NOTE: rare violations expected with kHomogeneous + EXPECT_LE(fp_count, + FrequentPoissonUpperBound(expected_fp_count + correction)); + EXPECT_GE(fp_count, + FrequentPoissonLowerBound(expected_fp_count + correction)); + } + total_fp_count += fp_count; + + // And also check FP rate for isoln + if (test_interleaved) { + Index ifp_count = 0; + cur = other_keys_begin; + ROCKSDB_NAMESPACE::StopWatchNano timer( + ROCKSDB_NAMESPACE::SystemClock::Default().get(), true); + while (cur != other_keys_end) { + ifp_count += isoln.FilterQuery(*cur, hasher) ? 1 : 0; + ++cur; + } + isoln_query_nanos += timer.ElapsedNanos(); + isoln_query_count += FLAGS_max_check; + { + double expected_fp_count = isoln.ExpectedFpRate() * FLAGS_max_check; + // For expected FP rate, also include false positives due to + // collisions in Hash value. (Negligible for 64-bit, can matter for + // 32-bit.) + double correction = + FLAGS_max_check * ExpectedCollisionFpRate(hasher, num_to_add); + + // NOTE: rare violations expected with kHomogeneous + EXPECT_LE(ifp_count, + FrequentPoissonUpperBound(expected_fp_count + correction)); + + // FIXME: why sometimes can we slightly "beat the odds"? + // (0.95 factor should not be needed) + EXPECT_GE(ifp_count, FrequentPoissonLowerBound( + 0.95 * expected_fp_count + correction)); + } + // Since the bits used in isoln are a subset of the bits used in soln, + // it cannot have fewer FPs + EXPECT_GE(ifp_count, fp_count); + } + + // And compare to Bloom time, for fun + if (ibytes >= /* minimum Bloom impl bytes*/ 64) { + Index bfp_count = 0; + cur = other_keys_begin; + ROCKSDB_NAMESPACE::StopWatchNano timer( + ROCKSDB_NAMESPACE::SystemClock::Default().get(), true); + while (cur != other_keys_end) { + uint64_t h = hasher.GetHash(*cur); + uint32_t h1 = ROCKSDB_NAMESPACE::Lower32of64(h); + uint32_t h2 = sizeof(Hash) >= 8 ? ROCKSDB_NAMESPACE::Upper32of64(h) + : h1 * 0x9e3779b9; + bfp_count += + ROCKSDB_NAMESPACE::FastLocalBloomImpl::HashMayMatch( + h1, h2, static_cast(ibytes), 6, idata.get()) + ? 1 + : 0; + ++cur; + } + bloom_query_nanos += timer.ElapsedNanos(); + // ensure bfp_count is used + ASSERT_LT(bfp_count, FLAGS_max_check); + } + } + + // "outside" == key not in original set so either negative or false positive + fprintf(stderr, + "Simple outside query, hot, incl hashing, ns/key: %g\n", + 1.0 * soln_query_nanos / soln_query_count); + fprintf(stderr, + "Interleaved outside query, hot, incl hashing, ns/key: %g\n", + 1.0 * isoln_query_nanos / isoln_query_count); + fprintf(stderr, + "Bloom outside query, hot, incl hashing, ns/key: %g\n", + 1.0 * bloom_query_nanos / soln_query_count); + + if (TypeParam::kHomogeneous) { + EXPECT_EQ(total_reseeds, 0U); + } else { + double average_reseeds = 1.0 * total_reseeds / FLAGS_thoroughness; + fprintf(stderr, "Average re-seeds: %g\n", average_reseeds); + // Values above were chosen to target around 50% chance of encoding + // success rate (average of 1.0 re-seeds) or slightly better. But 1.15 is + // also close enough. 
+ EXPECT_LE(total_reseeds, + InfrequentPoissonUpperBound(1.15 * expected_reseeds * + FLAGS_thoroughness)); + // Would use 0.85 here instead of 0.75, but + // TypesAndSettings_Hash32_SmallKeyGen can "beat the odds" because of + // sequential keys with a small, cheap hash function. We accept that + // there are surely inputs that are somewhat bad for this setup, but + // these somewhat good inputs are probably more likely. + EXPECT_GE(total_reseeds, + InfrequentPoissonLowerBound(0.75 * expected_reseeds * + FLAGS_thoroughness)); + } + + if (total_expand_trials > 0) { + double average_expand_failures = + 1.0 * total_expand_failures / total_expand_trials; + fprintf(stderr, "Average expand failures, and overhead: %g, %g\n", + average_expand_failures, + total_expand_overhead / total_expand_trials); + // Seems to be a generous allowance + EXPECT_LE(total_expand_failures, + InfrequentPoissonUpperBound(1.0 * total_expand_trials)); + } else { + fprintf(stderr, "Average expand failures: N/A\n"); + } + + if (total_singles > 0) { + double single_failure_rate = 1.0 * total_single_failures / total_singles; + fprintf(stderr, "Add'l single, failure rate: %g\n", single_failure_rate); + // A rough bound (one sided) based on nothing in particular + double expected_single_failures = 1.0 * total_singles / + (sizeof(CoeffRow) == 16 ? 128 + : TypeParam::kUseSmash ? 64 + : 32); + EXPECT_LE(total_single_failures, + InfrequentPoissonUpperBound(expected_single_failures)); + } + + if (total_batch > 0) { + // Counting successes here for Poisson to approximate the Binomial + // distribution. + // A rough bound (one sided) based on nothing in particular. + double expected_batch_successes = 1.0 * total_batch / 2; + uint64_t lower_bound = + InfrequentPoissonLowerBound(expected_batch_successes); + fprintf(stderr, "Add'l batch, success rate: %g (>= %g)\n", + 1.0 * total_batch_successes / total_batch, + 1.0 * lower_bound / total_batch); + EXPECT_GE(total_batch_successes, lower_bound); + } + + { + uint64_t total_checked = uint64_t{FLAGS_max_check} * FLAGS_thoroughness; + double expected_total_fp_count = + total_checked * std::pow(0.5, 8U * sizeof(ResultRow)); + // For expected FP rate, also include false positives due to collisions + // in Hash value. (Negligible for 64-bit, can matter for 32-bit.) 
+ double average_added = 1.0 * total_added / FLAGS_thoroughness; + expected_total_fp_count += + total_checked * ExpectedCollisionFpRate(Hasher(), average_added); + + uint64_t upper_bound = + InfrequentPoissonUpperBound(expected_total_fp_count); + uint64_t lower_bound = + InfrequentPoissonLowerBound(expected_total_fp_count); + fprintf(stderr, "Average FP rate: %g (~= %g, <= %g, >= %g)\n", + 1.0 * total_fp_count / total_checked, + expected_total_fp_count / total_checked, + 1.0 * upper_bound / total_checked, + 1.0 * lower_bound / total_checked); + EXPECT_LE(total_fp_count, upper_bound); + EXPECT_GE(total_fp_count, lower_bound); + } + } +} + +TYPED_TEST(RibbonTypeParamTest, Extremes) { + IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam); + IMPORT_RIBBON_IMPL_TYPES(TypeParam); + using KeyGen = typename TypeParam::KeyGen; + + size_t bytes = 128 * 1024; + std::unique_ptr buf(new char[bytes]); + InterleavedSoln isoln(buf.get(), bytes); + SimpleSoln soln; + Hasher hasher; + Banding banding; + + // ######################################## + // Add zero keys to minimal number of slots + KeyGen begin_and_end("foo", 123); + ASSERT_TRUE(banding.ResetAndFindSeedToSolve( + /*slots*/ kCoeffBits, begin_and_end, begin_and_end, /*first seed*/ 0, + /* seed mask*/ 0)); + + soln.BackSubstFrom(banding); + isoln.BackSubstFrom(banding); + + // Because there's plenty of memory, we expect the interleaved solution to + // use maximum supported columns (same as simple solution) + ASSERT_EQ(isoln.GetUpperNumColumns(), 8U * sizeof(ResultRow)); + ASSERT_EQ(isoln.GetUpperStartBlock(), 0U); + + // Somewhat oddly, we expect same FP rate as if we had essentially filled + // up the slots. + KeyGen other_keys_begin("not", 0); + KeyGen other_keys_end("not", FLAGS_max_check); + + Index fp_count = 0; + KeyGen cur = other_keys_begin; + while (cur != other_keys_end) { + bool isoln_query_result = isoln.FilterQuery(*cur, hasher); + bool soln_query_result = soln.FilterQuery(*cur, hasher); + // Solutions are equivalent + ASSERT_EQ(isoln_query_result, soln_query_result); + if (!TypeParam::kHomogeneous) { + // And in fact we only expect an FP when ResultRow is 0 + // (except Homogeneous) + ASSERT_EQ(soln_query_result, hasher.GetResultRowFromHash( + hasher.GetHash(*cur)) == ResultRow{0}); + } + fp_count += soln_query_result ? 
1 : 0; + ++cur; + } + { + ASSERT_EQ(isoln.ExpectedFpRate(), soln.ExpectedFpRate()); + double expected_fp_count = isoln.ExpectedFpRate() * FLAGS_max_check; + EXPECT_LE(fp_count, InfrequentPoissonUpperBound(expected_fp_count)); + if (TypeParam::kHomogeneous) { + // Pseudorandom garbage in Homogeneous filter can "beat the odds" if + // nothing added + } else { + EXPECT_GE(fp_count, InfrequentPoissonLowerBound(expected_fp_count)); + } + } + + // ###################################################### + // Use zero bytes for interleaved solution (key(s) added) + + // Add one key + KeyGen key_begin("added", 0); + KeyGen key_end("added", 1); + ASSERT_TRUE(banding.ResetAndFindSeedToSolve( + /*slots*/ kCoeffBits, key_begin, key_end, /*first seed*/ 0, + /* seed mask*/ 0)); + + InterleavedSoln isoln2(nullptr, /*bytes*/ 0); + + isoln2.BackSubstFrom(banding); + + ASSERT_EQ(isoln2.GetUpperNumColumns(), 0U); + ASSERT_EQ(isoln2.GetUpperStartBlock(), 0U); + + // All queries return true + ASSERT_TRUE(isoln2.FilterQuery(*other_keys_begin, hasher)); + ASSERT_EQ(isoln2.ExpectedFpRate(), 1.0); +} + +TEST(RibbonTest, AllowZeroStarts) { + IMPORT_RIBBON_TYPES_AND_SETTINGS(TypesAndSettings_AllowZeroStarts); + IMPORT_RIBBON_IMPL_TYPES(TypesAndSettings_AllowZeroStarts); + using KeyGen = StandardKeyGen; + + InterleavedSoln isoln(nullptr, /*bytes*/ 0); + SimpleSoln soln; + Hasher hasher; + Banding banding; + + KeyGen begin("foo", 0); + KeyGen end("foo", 1); + // Can't add 1 entry + ASSERT_FALSE(banding.ResetAndFindSeedToSolve(/*slots*/ 0, begin, end)); + + KeyGen begin_and_end("foo", 123); + // Can add 0 entries + ASSERT_TRUE(banding.ResetAndFindSeedToSolve(/*slots*/ 0, begin_and_end, + begin_and_end)); + + Seed reseeds = banding.GetOrdinalSeed(); + ASSERT_EQ(reseeds, 0U); + hasher.SetOrdinalSeed(reseeds); + + // Can construct 0-slot solutions + isoln.BackSubstFrom(banding); + soln.BackSubstFrom(banding); + + // Should always return false + ASSERT_FALSE(isoln.FilterQuery(*begin, hasher)); + ASSERT_FALSE(soln.FilterQuery(*begin, hasher)); + + // And report that in FP rate + ASSERT_EQ(isoln.ExpectedFpRate(), 0.0); + ASSERT_EQ(soln.ExpectedFpRate(), 0.0); +} + +TEST(RibbonTest, RawAndOrdinalSeeds) { + StandardHasher hasher64; + StandardHasher hasher64_32; + StandardHasher hasher32; + StandardHasher hasher8; + + for (uint32_t limit : {0xffU, 0xffffU}) { + std::vector seen(limit + 1); + for (uint32_t i = 0; i < limit; ++i) { + hasher64.SetOrdinalSeed(i); + auto raw64 = hasher64.GetRawSeed(); + hasher32.SetOrdinalSeed(i); + auto raw32 = hasher32.GetRawSeed(); + hasher8.SetOrdinalSeed(static_cast(i)); + auto raw8 = hasher8.GetRawSeed(); + { + hasher64_32.SetOrdinalSeed(i); + auto raw64_32 = hasher64_32.GetRawSeed(); + ASSERT_EQ(raw64_32, raw32); // Same size seed + } + if (i == 0) { + // Documented that ordinal seed 0 == raw seed 0 + ASSERT_EQ(raw64, 0U); + ASSERT_EQ(raw32, 0U); + ASSERT_EQ(raw8, 0U); + } else { + // Extremely likely that upper bits are set + ASSERT_GT(raw64, raw32); + ASSERT_GT(raw32, raw8); + } + // Hashers agree on lower bits + ASSERT_EQ(static_cast(raw64), raw32); + ASSERT_EQ(static_cast(raw32), raw8); + + // The translation is one-to-one for this size prefix + uint32_t v = static_cast(raw32 & limit); + ASSERT_EQ(raw64 & limit, v); + ASSERT_FALSE(seen[v]); + seen[v] = true; + } + } +} + +namespace { + +struct PhsfInputGen { + PhsfInputGen(const std::string& prefix, uint64_t id) : id_(id) { + val_.first = prefix; + ROCKSDB_NAMESPACE::PutFixed64(&val_.first, /*placeholder*/ 0); + } + + // Prefix (only one 
required) + PhsfInputGen& operator++() { + ++id_; + return *this; + } + + const std::pair& operator*() { + // Use multiplication to mix things up a little in the key + ROCKSDB_NAMESPACE::EncodeFixed64(&val_.first[val_.first.size() - 8], + id_ * uint64_t{0x1500000001}); + // Occasionally repeat values etc. + val_.second = static_cast(id_ * 7 / 8); + return val_; + } + + const std::pair* operator->() { return &**this; } + + bool operator==(const PhsfInputGen& other) { + // Same prefix is assumed + return id_ == other.id_; + } + bool operator!=(const PhsfInputGen& other) { + // Same prefix is assumed + return id_ != other.id_; + } + + uint64_t id_; + std::pair val_; +}; + +struct PhsfTypesAndSettings : public DefaultTypesAndSettings { + static constexpr bool kIsFilter = false; +}; +} // namespace + +TEST(RibbonTest, PhsfBasic) { + IMPORT_RIBBON_TYPES_AND_SETTINGS(PhsfTypesAndSettings); + IMPORT_RIBBON_IMPL_TYPES(PhsfTypesAndSettings); + + Index num_slots = 12800; + Index num_to_add = static_cast(num_slots / 1.02); + + PhsfInputGen begin("in", 0); + PhsfInputGen end("in", num_to_add); + + std::unique_ptr idata(new char[/*bytes*/ num_slots]); + InterleavedSoln isoln(idata.get(), /*bytes*/ num_slots); + SimpleSoln soln; + Hasher hasher; + + { + Banding banding; + ASSERT_TRUE(banding.ResetAndFindSeedToSolve(num_slots, begin, end)); + + soln.BackSubstFrom(banding); + isoln.BackSubstFrom(banding); + + hasher.SetOrdinalSeed(banding.GetOrdinalSeed()); + } + + for (PhsfInputGen cur = begin; cur != end; ++cur) { + ASSERT_EQ(cur->second, soln.PhsfQuery(cur->first, hasher)); + ASSERT_EQ(cur->second, isoln.PhsfQuery(cur->first, hasher)); + } +} + +// Not a real test, but a tool used to build APIs in ribbon_config.h +TYPED_TEST(RibbonTypeParamTest, FindOccupancy) { + IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam); + IMPORT_RIBBON_IMPL_TYPES(TypeParam); + using KeyGen = typename TypeParam::KeyGen; + + if (!FLAGS_find_occ) { + ROCKSDB_GTEST_BYPASS("Tool disabled during unit test runs"); + return; + } + + KeyGen cur(std::to_string(testing::UnitTest::GetInstance()->random_seed()), + 0); + + Banding banding; + Index num_slots = InterleavedSoln::RoundUpNumSlots(FLAGS_find_min_slots); + Index max_slots = InterleavedSoln::RoundUpNumSlots(FLAGS_find_max_slots); + while (num_slots <= max_slots) { + std::map rem_histogram; + std::map slot_histogram; + if (FLAGS_find_slot_occ) { + for (Index i = 0; i < kCoeffBits; ++i) { + slot_histogram[i] = 0; + slot_histogram[num_slots - 1 - i] = 0; + slot_histogram[num_slots / 2 - kCoeffBits / 2 + i] = 0; + } + } + uint64_t total_added = 0; + for (uint32_t i = 0; i < FLAGS_find_iters; ++i) { + banding.Reset(num_slots); + uint32_t j = 0; + KeyGen end = cur; + end += num_slots + num_slots / 10; + for (; cur != end; ++cur) { + if (banding.Add(*cur)) { + ++j; + } else { + break; + } + } + total_added += j; + for (auto& slot : slot_histogram) { + slot.second += banding.IsOccupied(slot.first); + } + + int32_t bucket = + static_cast(num_slots) - static_cast(j); + rem_histogram[bucket]++; + if (FLAGS_verbose) { + fprintf(stderr, "num_slots: %u i: %u / %u avg_overhead: %g\r", + static_cast(num_slots), static_cast(i), + static_cast(FLAGS_find_iters), + 1.0 * (i + 1) * num_slots / total_added); + } + } + if (FLAGS_verbose) { + fprintf(stderr, "\n"); + } + + uint32_t cumulative = 0; + + double p50_rem = 0; + double p95_rem = 0; + double p99_9_rem = 0; + + for (auto& h : rem_histogram) { + double before = 1.0 * cumulative / FLAGS_find_iters; + double not_after = 1.0 * (cumulative + h.second) / 
FLAGS_find_iters; + if (FLAGS_verbose) { + fprintf(stderr, "overhead: %g before: %g not_after: %g\n", + 1.0 * num_slots / (num_slots - h.first), before, not_after); + } + cumulative += h.second; + if (before < 0.5 && 0.5 <= not_after) { + // fake it with linear interpolation + double portion = (0.5 - before) / (not_after - before); + p50_rem = h.first + portion; + } else if (before < 0.95 && 0.95 <= not_after) { + // fake it with linear interpolation + double portion = (0.95 - before) / (not_after - before); + p95_rem = h.first + portion; + } else if (before < 0.999 && 0.999 <= not_after) { + // fake it with linear interpolation + double portion = (0.999 - before) / (not_after - before); + p99_9_rem = h.first + portion; + } + } + for (auto& slot : slot_histogram) { + fprintf(stderr, "slot[%u] occupied: %g\n", (unsigned)slot.first, + 1.0 * slot.second / FLAGS_find_iters); + } + + double mean_rem = + (1.0 * FLAGS_find_iters * num_slots - total_added) / FLAGS_find_iters; + fprintf( + stderr, + "num_slots: %u iters: %u mean_ovr: %g p50_ovr: %g p95_ovr: %g " + "p99.9_ovr: %g mean_rem: %g p50_rem: %g p95_rem: %g p99.9_rem: %g\n", + static_cast(num_slots), + static_cast(FLAGS_find_iters), + 1.0 * num_slots / (num_slots - mean_rem), + 1.0 * num_slots / (num_slots - p50_rem), + 1.0 * num_slots / (num_slots - p95_rem), + 1.0 * num_slots / (num_slots - p99_9_rem), mean_rem, p50_rem, p95_rem, + p99_9_rem); + + num_slots = std::max( + num_slots + 1, static_cast(num_slots * FLAGS_find_next_factor)); + num_slots = InterleavedSoln::RoundUpNumSlots(num_slots); + } +} + +// Not a real test, but a tool to understand Homogeneous Ribbon +// behavior (TODO: configuration APIs & tests) +TYPED_TEST(RibbonTypeParamTest, OptimizeHomogAtScale) { + IMPORT_RIBBON_TYPES_AND_SETTINGS(TypeParam); + IMPORT_RIBBON_IMPL_TYPES(TypeParam); + using KeyGen = typename TypeParam::KeyGen; + + if (!FLAGS_optimize_homog) { + ROCKSDB_GTEST_BYPASS("Tool disabled during unit test runs"); + return; + } + + if (!TypeParam::kHomogeneous) { + ROCKSDB_GTEST_BYPASS("Only for Homogeneous Ribbon"); + return; + } + + KeyGen cur(std::to_string(testing::UnitTest::GetInstance()->random_seed()), + 0); + + Banding banding; + Index num_slots = SimpleSoln::RoundUpNumSlots(FLAGS_optimize_homog_slots); + banding.Reset(num_slots); + + // This and "band_ovr" is the "allocated overhead", or slots over added. + // It does not take into account FP rates. + double target_overhead = 1.20; + uint32_t num_added = 0; + + do { + do { + (void)banding.Add(*cur); + ++cur; + ++num_added; + } while (1.0 * num_slots / num_added > target_overhead); + + SimpleSoln soln; + soln.BackSubstFrom(banding); + + std::array fp_counts_by_cols; + fp_counts_by_cols.fill(0U); + for (uint32_t i = 0; i < FLAGS_optimize_homog_check; ++i) { + ResultRow r = soln.PhsfQuery(*cur, banding); + ++cur; + for (size_t j = 0; j < fp_counts_by_cols.size(); ++j) { + if ((r & 1) == 1) { + break; + } + fp_counts_by_cols[j]++; + r /= 2; + } + } + fprintf(stderr, "band_ovr: %g ", 1.0 * num_slots / num_added); + for (unsigned j = 0; j < fp_counts_by_cols.size(); ++j) { + double inv_fp_rate = + 1.0 * FLAGS_optimize_homog_check / fp_counts_by_cols[j]; + double equiv_cols = std::log(inv_fp_rate) * 1.4426950409; + // Overhead vs. 
information-theoretic minimum based on observed + // FP rate (subject to sampling error, especially for low FP rates) + double actual_overhead = + 1.0 * (j + 1) * num_slots / (equiv_cols * num_added); + fprintf(stderr, "ovr_%u: %g ", j + 1, actual_overhead); + } + fprintf(stderr, "\n"); + target_overhead -= FLAGS_optimize_homog_granularity; + } while (target_overhead > 1.0); +} + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); +#ifdef GFLAGS + ParseCommandLineFlags(&argc, &argv, true); +#endif // GFLAGS + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/set_comparator.h b/src/rocksdb/util/set_comparator.h new file mode 100644 index 000000000..e0e64436a --- /dev/null +++ b/src/rocksdb/util/set_comparator.h @@ -0,0 +1,24 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/comparator.h" + +namespace ROCKSDB_NAMESPACE { +// A comparator to be used in std::set +struct SetComparator { + explicit SetComparator() : user_comparator_(BytewiseComparator()) {} + explicit SetComparator(const Comparator* user_comparator) + : user_comparator_(user_comparator ? user_comparator + : BytewiseComparator()) {} + bool operator()(const Slice& lhs, const Slice& rhs) const { + return user_comparator_->Compare(lhs, rhs) < 0; + } + + private: + const Comparator* user_comparator_; +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/single_thread_executor.h b/src/rocksdb/util/single_thread_executor.h new file mode 100644 index 000000000..c69f2a292 --- /dev/null +++ b/src/rocksdb/util/single_thread_executor.h @@ -0,0 +1,56 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once + +#if USE_COROUTINES +#include + +#include "folly/CPortability.h" +#include "folly/CppAttributes.h" +#include "folly/Executor.h" +#include "util/async_file_reader.h" + +namespace ROCKSDB_NAMESPACE { +// Implements a simple executor that runs callback functions in the same +// thread, unlike CPUThreadExecutor which may schedule the callback on +// another thread. Runs in a tight loop calling the queued callbacks, +// and polls for async IO completions when idle. The completions will +// resume suspended coroutines and they get added to the queue, which +// will get picked up by this loop. +// Any possibility of deadlock is precluded because the file system +// guarantees that async IO completion callbacks will not be scheduled +// to run in this thread or this executor. 
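A rough standalone analogue of the loop just described, with folly and the async reader abstracted away, to show the drain-then-poll shape and the recursion guard. Names here are illustrative; this is not the class defined in the patch, which follows.

    #include <cstdio>
    #include <functional>
    #include <queue>
    #include <utility>

    // Illustrative only: run queued callbacks inline; when the queue drains,
    // poll for async completions, which may queue more work via Add().
    class InlineExecutor {
     public:
      explicit InlineExecutor(std::function<void()> wait_for_io)
          : wait_for_io_(std::move(wait_for_io)) {}

      void Add(std::function<void()> callback) {
        q_.push(std::move(callback));
        if (q_.size() == 1 && !busy_) {
          while (!q_.empty()) {
            q_.front()();  // run inline; may Add() more work
            q_.pop();
            if (q_.empty()) {
              busy_ = true;    // guard against re-entering this drain loop
              wait_for_io_();  // completions may Add() resumed work
              busy_ = false;
            }
          }
        }
      }

     private:
      std::queue<std::function<void()>> q_;
      std::function<void()> wait_for_io_;
      bool busy_ = false;
    };

    int main() {
      InlineExecutor ex([] { /* poll async IO; no-op in this sketch */ });
      ex.Add([] { std::puts("ran inline"); });
      return 0;
    }

Here wait_for_io stands in for the async reader's wait call: it is expected to return once any pending completions have been queued back through Add(), which is what precludes the deadlock discussed above.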
+class SingleThreadExecutor : public folly::Executor { + public: + explicit SingleThreadExecutor(AsyncFileReader& reader) + : reader_(reader), busy_(false) {} + + void add(folly::Func callback) override { + auto& q = q_; + q.push(std::move(callback)); + if (q.size() == 1 && !busy_) { + while (!q.empty()) { + q.front()(); + q.pop(); + + if (q.empty()) { + // Prevent recursion, as the Wait may queue resumed coroutines + busy_ = true; + reader_.Wait(); + busy_ = false; + } + } + } + } + + private: + std::queue q_; + AsyncFileReader& reader_; + bool busy_; +}; +} // namespace ROCKSDB_NAMESPACE +#endif // USE_COROUTINES diff --git a/src/rocksdb/util/slice.cc b/src/rocksdb/util/slice.cc new file mode 100644 index 000000000..1fa21afcb --- /dev/null +++ b/src/rocksdb/util/slice.cc @@ -0,0 +1,405 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/slice.h" + +#include + +#include + +#include "rocksdb/convenience.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { + +class FixedPrefixTransform : public SliceTransform { + private: + size_t prefix_len_; + std::string id_; + + public: + explicit FixedPrefixTransform(size_t prefix_len) : prefix_len_(prefix_len) { + id_ = std::string(kClassName()) + "." + std::to_string(prefix_len_); + } + + static const char* kClassName() { return "rocksdb.FixedPrefix"; } + static const char* kNickName() { return "fixed"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } + + bool IsInstanceOf(const std::string& name) const override { + if (name == id_) { + return true; + } else if (StartsWith(name, kNickName())) { + std::string alt_id = + std::string(kNickName()) + ":" + std::to_string(prefix_len_); + if (name == alt_id) { + return true; + } + } + return SliceTransform::IsInstanceOf(name); + } + + std::string GetId() const override { return id_; } + + Slice Transform(const Slice& src) const override { + assert(InDomain(src)); + return Slice(src.data(), prefix_len_); + } + + bool InDomain(const Slice& src) const override { + return (src.size() >= prefix_len_); + } + + bool InRange(const Slice& dst) const override { + return (dst.size() == prefix_len_); + } + + bool FullLengthEnabled(size_t* len) const override { + *len = prefix_len_; + return true; + } + + bool SameResultWhenAppended(const Slice& prefix) const override { + return InDomain(prefix); + } +}; + +class CappedPrefixTransform : public SliceTransform { + private: + size_t cap_len_; + std::string id_; + + public: + explicit CappedPrefixTransform(size_t cap_len) : cap_len_(cap_len) { + id_ = std::string(kClassName()) + "." 
+ std::to_string(cap_len_); + } + + static const char* kClassName() { return "rocksdb.CappedPrefix"; } + static const char* kNickName() { return "capped"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } + std::string GetId() const override { return id_; } + + bool IsInstanceOf(const std::string& name) const override { + if (name == id_) { + return true; + } else if (StartsWith(name, kNickName())) { + std::string alt_id = + std::string(kNickName()) + ":" + std::to_string(cap_len_); + if (name == alt_id) { + return true; + } + } + return SliceTransform::IsInstanceOf(name); + } + + Slice Transform(const Slice& src) const override { + assert(InDomain(src)); + return Slice(src.data(), std::min(cap_len_, src.size())); + } + + bool InDomain(const Slice& /*src*/) const override { return true; } + + bool InRange(const Slice& dst) const override { + return (dst.size() <= cap_len_); + } + + bool FullLengthEnabled(size_t* len) const override { + *len = cap_len_; + return true; + } + + bool SameResultWhenAppended(const Slice& prefix) const override { + return prefix.size() >= cap_len_; + } +}; + +class NoopTransform : public SliceTransform { + public: + explicit NoopTransform() {} + + static const char* kClassName() { return "rocksdb.Noop"; } + const char* Name() const override { return kClassName(); } + + Slice Transform(const Slice& src) const override { return src; } + + bool InDomain(const Slice& /*src*/) const override { return true; } + + bool InRange(const Slice& /*dst*/) const override { return true; } + + bool SameResultWhenAppended(const Slice& /*prefix*/) const override { + return false; + } +}; + +} // end namespace + +const SliceTransform* NewFixedPrefixTransform(size_t prefix_len) { + return new FixedPrefixTransform(prefix_len); +} + +const SliceTransform* NewCappedPrefixTransform(size_t cap_len) { + return new CappedPrefixTransform(cap_len); +} + +const SliceTransform* NewNoopTransform() { return new NoopTransform; } + +#ifndef ROCKSDB_LITE +static int RegisterBuiltinSliceTransform(ObjectLibrary& library, + const std::string& /*arg*/) { + // For the builtin transforms, the format is typically + // [Name].[0-9]+ or [NickName]:[0-9]+ + library.AddFactory( + NoopTransform::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(NewNoopTransform()); + return guard->get(); + }); + library.AddFactory( + ObjectLibrary::PatternEntry(FixedPrefixTransform::kNickName(), false) + .AddNumber(":"), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /*errmsg*/) { + auto colon = uri.find(":"); + auto len = ParseSizeT(uri.substr(colon + 1)); + guard->reset(NewFixedPrefixTransform(len)); + return guard->get(); + }); + library.AddFactory( + ObjectLibrary::PatternEntry(FixedPrefixTransform::kClassName(), false) + .AddNumber("."), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /*errmsg*/) { + auto len = ParseSizeT( + uri.substr(strlen(FixedPrefixTransform::kClassName()) + 1)); + guard->reset(NewFixedPrefixTransform(len)); + return guard->get(); + }); + library.AddFactory( + ObjectLibrary::PatternEntry(CappedPrefixTransform::kNickName(), false) + .AddNumber(":"), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /*errmsg*/) { + auto colon = uri.find(":"); + auto len = ParseSizeT(uri.substr(colon + 1)); + guard->reset(NewCappedPrefixTransform(len)); + return guard->get(); + }); + library.AddFactory( + 
ObjectLibrary::PatternEntry(CappedPrefixTransform::kClassName(), false) + .AddNumber("."), + [](const std::string& uri, std::unique_ptr* guard, + std::string* /*errmsg*/) { + auto len = ParseSizeT( + uri.substr(strlen(CappedPrefixTransform::kClassName()) + 1)); + guard->reset(NewCappedPrefixTransform(len)); + return guard->get(); + }); + size_t num_types; + return static_cast(library.GetFactoryCount(&num_types)); +} +#endif // ROCKSDB_LITE + +Status SliceTransform::CreateFromString( + const ConfigOptions& config_options, const std::string& value, + std::shared_ptr* result) { +#ifndef ROCKSDB_LITE + static std::once_flag once; + std::call_once(once, [&]() { + RegisterBuiltinSliceTransform(*(ObjectLibrary::Default().get()), ""); + }); +#endif // ROCKSDB_LITE + std::string id; + std::unordered_map opt_map; + Status status = Customizable::GetOptionsMap(config_options, result->get(), + value, &id, &opt_map); + if (!status.ok()) { // GetOptionsMap failed + return status; + } else if (id.empty() && opt_map.empty()) { + result->reset(); + } else { +#ifndef ROCKSDB_LITE + status = config_options.registry->NewSharedObject(id, result); +#else + auto Matches = [](const std::string& input, size_t size, + const char* pattern, char sep) { + auto plen = strlen(pattern); + return (size > plen + 2 && input[plen] == sep && + StartsWith(input, pattern)); + }; + + auto size = id.size(); + if (id == NoopTransform::kClassName()) { + result->reset(NewNoopTransform()); + } else if (Matches(id, size, FixedPrefixTransform::kNickName(), ':')) { + auto fixed = strlen(FixedPrefixTransform::kNickName()); + auto len = ParseSizeT(id.substr(fixed + 1)); + result->reset(NewFixedPrefixTransform(len)); + } else if (Matches(id, size, CappedPrefixTransform::kNickName(), ':')) { + auto capped = strlen(CappedPrefixTransform::kNickName()); + auto len = ParseSizeT(id.substr(capped + 1)); + result->reset(NewCappedPrefixTransform(len)); + } else if (Matches(id, size, CappedPrefixTransform::kClassName(), '.')) { + auto capped = strlen(CappedPrefixTransform::kClassName()); + auto len = ParseSizeT(id.substr(capped + 1)); + result->reset(NewCappedPrefixTransform(len)); + } else if (Matches(id, size, FixedPrefixTransform::kClassName(), '.')) { + auto fixed = strlen(FixedPrefixTransform::kClassName()); + auto len = ParseSizeT(id.substr(fixed + 1)); + result->reset(NewFixedPrefixTransform(len)); + } else { + status = Status::NotSupported("Cannot load object in LITE mode ", id); + } +#endif // ROCKSDB_LITE + if (config_options.ignore_unsupported_options && status.IsNotSupported()) { + return Status::OK(); + } else if (status.ok()) { + SliceTransform* transform = const_cast(result->get()); + status = + Customizable::ConfigureNewObject(config_options, transform, opt_map); + } + } + return status; +} + +std::string SliceTransform::AsString() const { +#ifndef ROCKSDB_LITE + if (HasRegisteredOptions()) { + ConfigOptions opts; + opts.delimiter = ";"; + return ToString(opts); + } +#endif // ROCKSDB_LITE + return GetId(); +} + +// 2 small internal utility functions, for efficient hex conversions +// and no need for snprintf, toupper etc... 
+// Originally from wdt/util/EncryptionUtils.cpp - for +// std::to_string(true)/DecodeHex: +char toHex(unsigned char v) { + if (v <= 9) { + return '0' + v; + } + return 'A' + v - 10; +} +// most of the code is for validation/error check +int fromHex(char c) { + // toupper: + if (c >= 'a' && c <= 'f') { + c -= ('a' - 'A'); // aka 0x20 + } + // validation + if (c < '0' || (c > '9' && (c < 'A' || c > 'F'))) { + return -1; // invalid not 0-9A-F hex char + } + if (c <= '9') { + return c - '0'; + } + return c - 'A' + 10; +} + +Slice::Slice(const SliceParts& parts, std::string* buf) { + size_t length = 0; + for (int i = 0; i < parts.num_parts; ++i) { + length += parts.parts[i].size(); + } + buf->reserve(length); + + for (int i = 0; i < parts.num_parts; ++i) { + buf->append(parts.parts[i].data(), parts.parts[i].size()); + } + data_ = buf->data(); + size_ = buf->size(); +} + +// Return a string that contains the copy of the referenced data. +std::string Slice::ToString(bool hex) const { + std::string result; // RVO/NRVO/move + if (hex) { + result.reserve(2 * size_); + for (size_t i = 0; i < size_; ++i) { + unsigned char c = data_[i]; + result.push_back(toHex(c >> 4)); + result.push_back(toHex(c & 0xf)); + } + return result; + } else { + result.assign(data_, size_); + return result; + } +} + +// Originally from rocksdb/utilities/ldb_cmd.h +bool Slice::DecodeHex(std::string* result) const { + std::string::size_type len = size_; + if (len % 2) { + // Hex string must be even number of hex digits to get complete bytes back + return false; + } + if (!result) { + return false; + } + result->clear(); + result->reserve(len / 2); + + for (size_t i = 0; i < len;) { + int h1 = fromHex(data_[i++]); + if (h1 < 0) { + return false; + } + int h2 = fromHex(data_[i++]); + if (h2 < 0) { + return false; + } + result->push_back(static_cast((h1 << 4) | h2)); + } + return true; +} + +PinnableSlice::PinnableSlice(PinnableSlice&& other) { + *this = std::move(other); +} + +PinnableSlice& PinnableSlice::operator=(PinnableSlice&& other) { + if (this != &other) { + Cleanable::Reset(); + Cleanable::operator=(std::move(other)); + size_ = other.size_; + pinned_ = other.pinned_; + if (pinned_) { + data_ = other.data_; + // When it's pinned, buf should no longer be of use. + } else { + if (other.buf_ == &other.self_space_) { + self_space_ = std::move(other.self_space_); + buf_ = &self_space_; + data_ = buf_->data(); + } else { + buf_ = other.buf_; + data_ = other.data_; + } + } + other.self_space_.clear(); + other.buf_ = &other.self_space_; + other.pinned_ = false; + other.PinSelf(); + } + return *this; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/slice_test.cc b/src/rocksdb/util/slice_test.cc new file mode 100644 index 000000000..e1c35d567 --- /dev/null +++ b/src/rocksdb/util/slice_test.cc @@ -0,0 +1,191 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
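// --- Editor's note (illustrative sketch, not part of the upstream patch):
// round-tripping the hex helpers defined in slice.cc above. The byte values
// are arbitrary examples.
#include <cassert>
#include <string>
#include "rocksdb/slice.h"

void SliceHexRoundTrip() {
  using ROCKSDB_NAMESPACE::Slice;
  const char bytes[] = {'\x00', '\x7f', '\xab'};
  Slice s(bytes, sizeof(bytes));
  std::string hex = s.ToString(/*hex=*/true);  // "007FAB" (uppercase digits)
  std::string raw;
  bool ok = Slice(hex).DecodeHex(&raw);  // decodes back to the original bytes
  assert(ok && raw == std::string(bytes, sizeof(bytes)));
  (void)ok;
}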
+ +#include "rocksdb/slice.h" + +#include + +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/data_structure.h" +#include "rocksdb/types.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { + +TEST(SliceTest, StringView) { + std::string s = "foo"; + std::string_view sv = s; + ASSERT_EQ(Slice(s), Slice(sv)); + ASSERT_EQ(Slice(s), Slice(std::move(sv))); +} + +// Use this to keep track of the cleanups that were actually performed +void Multiplier(void* arg1, void* arg2) { + int* res = reinterpret_cast(arg1); + int* num = reinterpret_cast(arg2); + *res *= *num; +} + +class PinnableSliceTest : public testing::Test { + public: + void AssertSameData(const std::string& expected, const PinnableSlice& slice) { + std::string got; + got.assign(slice.data(), slice.size()); + ASSERT_EQ(expected, got); + } +}; + +// Test that the external buffer is moved instead of being copied. +TEST_F(PinnableSliceTest, MoveExternalBuffer) { + Slice s("123"); + std::string buf; + PinnableSlice v1(&buf); + v1.PinSelf(s); + + PinnableSlice v2(std::move(v1)); + ASSERT_EQ(buf.data(), v2.data()); + ASSERT_EQ(&buf, v2.GetSelf()); + + PinnableSlice v3; + v3 = std::move(v2); + ASSERT_EQ(buf.data(), v3.data()); + ASSERT_EQ(&buf, v3.GetSelf()); +} + +TEST_F(PinnableSliceTest, Move) { + int n2 = 2; + int res = 1; + const std::string const_str1 = "123"; + const std::string const_str2 = "ABC"; + Slice slice1(const_str1); + Slice slice2(const_str2); + + { + // Test move constructor on a pinned slice. + res = 1; + PinnableSlice v1; + v1.PinSlice(slice1, Multiplier, &res, &n2); + PinnableSlice v2(std::move(v1)); + + // Since v1's Cleanable has been moved to v2, + // no cleanup should happen in Reset. + v1.Reset(); + ASSERT_EQ(1, res); + + AssertSameData(const_str1, v2); + } + // v2 is cleaned up. + ASSERT_EQ(2, res); + + { + // Test move constructor on an unpinned slice. + PinnableSlice v1; + v1.PinSelf(slice1); + PinnableSlice v2(std::move(v1)); + + AssertSameData(const_str1, v2); + } + + { + // Test move assignment from a pinned slice to + // another pinned slice. + res = 1; + PinnableSlice v1; + v1.PinSlice(slice1, Multiplier, &res, &n2); + PinnableSlice v2; + v2.PinSlice(slice2, Multiplier, &res, &n2); + v2 = std::move(v1); + + // v2's Cleanable will be Reset before moving + // anything from v1. + ASSERT_EQ(2, res); + // Since v1's Cleanable has been moved to v2, + // no cleanup should happen in Reset. + v1.Reset(); + ASSERT_EQ(2, res); + + AssertSameData(const_str1, v2); + } + // The Cleanable moved from v1 to v2 will be Reset. + ASSERT_EQ(4, res); + + { + // Test move assignment from a pinned slice to + // an unpinned slice. + res = 1; + PinnableSlice v1; + v1.PinSlice(slice1, Multiplier, &res, &n2); + PinnableSlice v2; + v2.PinSelf(slice2); + v2 = std::move(v1); + + // Since v1's Cleanable has been moved to v2, + // no cleanup should happen in Reset. + v1.Reset(); + ASSERT_EQ(1, res); + + AssertSameData(const_str1, v2); + } + // The Cleanable moved from v1 to v2 will be Reset. + ASSERT_EQ(2, res); + + { + // Test move assignment from an upinned slice to + // another unpinned slice. + PinnableSlice v1; + v1.PinSelf(slice1); + PinnableSlice v2; + v2.PinSelf(slice2); + v2 = std::move(v1); + + AssertSameData(const_str1, v2); + } + + { + // Test move assignment from an upinned slice to + // a pinned slice. 
+ res = 1; + PinnableSlice v1; + v1.PinSelf(slice1); + PinnableSlice v2; + v2.PinSlice(slice2, Multiplier, &res, &n2); + v2 = std::move(v1); + + // v2's Cleanable will be Reset before moving + // anything from v1. + ASSERT_EQ(2, res); + + AssertSameData(const_str1, v2); + } + // No Cleanable is moved from v1 to v2, so no more cleanup. + ASSERT_EQ(2, res); +} + +// ***************************************************************** // +// Unit test for SmallEnumSet +class SmallEnumSetTest : public testing::Test { + public: + SmallEnumSetTest() {} + ~SmallEnumSetTest() {} +}; + +TEST_F(SmallEnumSetTest, SmallSetTest) { + FileTypeSet fs; + ASSERT_TRUE(fs.Add(FileType::kIdentityFile)); + ASSERT_FALSE(fs.Add(FileType::kIdentityFile)); + ASSERT_TRUE(fs.Add(FileType::kInfoLogFile)); + ASSERT_TRUE(fs.Contains(FileType::kIdentityFile)); + ASSERT_FALSE(fs.Contains(FileType::kDBLockFile)); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/slice_transform_test.cc b/src/rocksdb/util/slice_transform_test.cc new file mode 100644 index 000000000..64ac8bb1f --- /dev/null +++ b/src/rocksdb/util/slice_transform_test.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/slice_transform.h" + +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" +#include "test_util/testharness.h" + +namespace ROCKSDB_NAMESPACE { + +class SliceTransformTest : public testing::Test {}; + +TEST_F(SliceTransformTest, CapPrefixTransform) { + std::string s; + s = "abcdefge"; + + std::unique_ptr transform; + + transform.reset(NewCappedPrefixTransform(6)); + ASSERT_EQ(transform->Transform(s).ToString(), "abcdef"); + ASSERT_TRUE(transform->SameResultWhenAppended("123456")); + ASSERT_TRUE(transform->SameResultWhenAppended("1234567")); + ASSERT_TRUE(!transform->SameResultWhenAppended("12345")); + + transform.reset(NewCappedPrefixTransform(8)); + ASSERT_EQ(transform->Transform(s).ToString(), "abcdefge"); + + transform.reset(NewCappedPrefixTransform(10)); + ASSERT_EQ(transform->Transform(s).ToString(), "abcdefge"); + + transform.reset(NewCappedPrefixTransform(0)); + ASSERT_EQ(transform->Transform(s).ToString(), ""); + + transform.reset(NewCappedPrefixTransform(0)); + ASSERT_EQ(transform->Transform("").ToString(), ""); +} + +class SliceTransformDBTest : public testing::Test { + private: + std::string dbname_; + Env* env_; + DB* db_; + + public: + SliceTransformDBTest() : env_(Env::Default()), db_(nullptr) { + dbname_ = test::PerThreadDBPath("slice_transform_db_test"); + EXPECT_OK(DestroyDB(dbname_, last_options_)); + } + + ~SliceTransformDBTest() override { + delete db_; + EXPECT_OK(DestroyDB(dbname_, last_options_)); + } + + DB* db() { return db_; } + + // Return the current option configuration. 
+ Options* GetOptions() { return &last_options_; } + + void DestroyAndReopen() { + // Destroy using last options + Destroy(); + ASSERT_OK(TryReopen()); + } + + void Destroy() { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, last_options_)); + } + + Status TryReopen() { + delete db_; + db_ = nullptr; + last_options_.create_if_missing = true; + + return DB::Open(last_options_, dbname_, &db_); + } + + Options last_options_; +}; + +namespace { +uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) { + return options.statistics->getTickerCount(ticker_type); +} +} // namespace + +TEST_F(SliceTransformDBTest, CapPrefix) { + last_options_.prefix_extractor.reset(NewCappedPrefixTransform(8)); + last_options_.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + last_options_.table_factory.reset(NewBlockBasedTableFactory(bbto)); + ASSERT_OK(TryReopen()); + + ReadOptions ro; + FlushOptions fo; + WriteOptions wo; + + ASSERT_OK(db()->Put(wo, "barbarbar", "foo")); + ASSERT_OK(db()->Put(wo, "barbarbar2", "foo2")); + ASSERT_OK(db()->Put(wo, "foo", "bar")); + ASSERT_OK(db()->Put(wo, "foo3", "bar3")); + ASSERT_OK(db()->Flush(fo)); + + std::unique_ptr iter(db()->NewIterator(ro)); + + iter->Seek("foo"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value().ToString(), "bar"); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 0U); + + iter->Seek("foo2"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 1U); + + iter->Seek("barbarbar"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value().ToString(), "foo"); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 1U); + + iter->Seek("barfoofoo"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 2U); + + iter->Seek("foobarbar"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 3U); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/status.cc b/src/rocksdb/util/status.cc new file mode 100644 index 000000000..72fdfdbcc --- /dev/null +++ b/src/rocksdb/util/status.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
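// --- Editor's note (illustrative sketch, not part of the upstream patch):
// creating a built-in prefix extractor from one of the short string IDs that
// RegisterBuiltinSliceTransform() in slice.cc above registers ("fixed:<N>",
// "capped:<N>", or the long "rocksdb.CappedPrefix.<N>" form). The
// std::shared_ptr<const SliceTransform> result type is assumed from the
// public SliceTransform::CreateFromString declaration.
#include <cassert>
#include <memory>
#include "rocksdb/convenience.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "rocksdb/status.h"

void CreatePrefixExtractorFromString() {
  using namespace ROCKSDB_NAMESPACE;
  ConfigOptions config_options;
  std::shared_ptr<const SliceTransform> transform;
  Status s =
      SliceTransform::CreateFromString(config_options, "capped:8", &transform);
  assert(s.ok());
  // A capped prefix of length 8 truncates longer keys to their first 8 bytes.
  assert(transform->Transform(Slice("abcdefghij")).ToString() == "abcdefgh");
}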
+ +#include "rocksdb/status.h" + +#include +#ifdef OS_WIN +#include +#endif +#include + +#include "port/port.h" + +namespace ROCKSDB_NAMESPACE { + +std::unique_ptr Status::CopyState(const char* s) { + const size_t cch = std::strlen(s) + 1; // +1 for the null terminator + char* rv = new char[cch]; + std::strncpy(rv, s, cch); + return std::unique_ptr(rv); +} + +static const char* msgs[static_cast(Status::kMaxSubCode)] = { + "", // kNone + "Timeout Acquiring Mutex", // kMutexTimeout + "Timeout waiting to lock key", // kLockTimeout + "Failed to acquire lock due to max_num_locks limit", // kLockLimit + "No space left on device", // kNoSpace + "Deadlock", // kDeadlock + "Stale file handle", // kStaleFile + "Memory limit reached", // kMemoryLimit + "Space limit reached", // kSpaceLimit + "No such file or directory", // kPathNotFound + // KMergeOperandsInsufficientCapacity + "Insufficient capacity for merge operands", + // kManualCompactionPaused + "Manual compaction paused", + " (overwritten)", // kOverwritten, subcode of OK + "Txn not prepared", // kTxnNotPrepared + "IO fenced off", // kIOFenced +}; + +Status::Status(Code _code, SubCode _subcode, const Slice& msg, + const Slice& msg2, Severity sev) + : code_(_code), + subcode_(_subcode), + sev_(sev), + retryable_(false), + data_loss_(false), + scope_(0) { + assert(subcode_ != kMaxSubCode); + const size_t len1 = msg.size(); + const size_t len2 = msg2.size(); + const size_t size = len1 + (len2 ? (2 + len2) : 0); + char* const result = new char[size + 1]; // +1 for null terminator + memcpy(result, msg.data(), len1); + if (len2) { + result[len1] = ':'; + result[len1 + 1] = ' '; + memcpy(result + len1 + 2, msg2.data(), len2); + } + result[size] = '\0'; // null terminator for C style string + state_.reset(result); +} + +std::string Status::ToString() const { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED + checked_ = true; +#endif // ROCKSDB_ASSERT_STATUS_CHECKED + const char* type = nullptr; + switch (code_) { + case kOk: + return "OK"; + case kNotFound: + type = "NotFound: "; + break; + case kCorruption: + type = "Corruption: "; + break; + case kNotSupported: + type = "Not implemented: "; + break; + case kInvalidArgument: + type = "Invalid argument: "; + break; + case kIOError: + type = "IO error: "; + break; + case kMergeInProgress: + type = "Merge in progress: "; + break; + case kIncomplete: + type = "Result incomplete: "; + break; + case kShutdownInProgress: + type = "Shutdown in progress: "; + break; + case kTimedOut: + type = "Operation timed out: "; + break; + case kAborted: + type = "Operation aborted: "; + break; + case kBusy: + type = "Resource busy: "; + break; + case kExpired: + type = "Operation expired: "; + break; + case kTryAgain: + type = "Operation failed. Try again.: "; + break; + case kCompactionTooLarge: + type = "Compaction too large: "; + break; + case kColumnFamilyDropped: + type = "Column family dropped: "; + break; + case kMaxCode: + assert(false); + break; + } + char tmp[30]; + if (type == nullptr) { + // This should not happen since `code_` should be a valid non-`kMaxCode` + // member of the `Code` enum. The above switch-statement should have had a + // case assigning `type` to a corresponding string. 
+ assert(false); + snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", static_cast(code())); + type = tmp; + } + std::string result(type); + if (subcode_ != kNone) { + uint32_t index = static_cast(subcode_); + assert(sizeof(msgs) / sizeof(msgs[0]) > index); + result.append(msgs[index]); + } + + if (state_ != nullptr) { + if (subcode_ != kNone) { + result.append(": "); + } + result.append(state_.get()); + } + return result; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/stderr_logger.cc b/src/rocksdb/util/stderr_logger.cc new file mode 100644 index 000000000..6044b8b93 --- /dev/null +++ b/src/rocksdb/util/stderr_logger.cc @@ -0,0 +1,30 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/stderr_logger.h" + +#include "port/sys_time.h" + +namespace ROCKSDB_NAMESPACE { +StderrLogger::~StderrLogger() {} + +void StderrLogger::Logv(const char* format, va_list ap) { + const uint64_t thread_id = Env::Default()->GetThreadID(); + + port::TimeVal now_tv; + port::GetTimeOfDay(&now_tv, nullptr); + const time_t seconds = now_tv.tv_sec; + struct tm t; + port::LocalTimeR(&seconds, &t); + fprintf(stderr, "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llx ", t.tm_year + 1900, + t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec, + static_cast(now_tv.tv_usec), + static_cast(thread_id)); + + vfprintf(stderr, format, ap); + fprintf(stderr, "\n"); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/stderr_logger.h b/src/rocksdb/util/stderr_logger.h new file mode 100644 index 000000000..c3b01210c --- /dev/null +++ b/src/rocksdb/util/stderr_logger.h @@ -0,0 +1,31 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include + +#include "rocksdb/env.h" + +namespace ROCKSDB_NAMESPACE { + +// Prints logs to stderr for faster debugging +class StderrLogger : public Logger { + public: + explicit StderrLogger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) + : Logger(log_level) {} + + ~StderrLogger() override; + + // Brings overloaded Logv()s into scope so they're not hidden when we override + // a subset of them. + using Logger::Logv; + + virtual void Logv(const char* format, va_list ap) override; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/stop_watch.h b/src/rocksdb/util/stop_watch.h new file mode 100644 index 000000000..e26380d97 --- /dev/null +++ b/src/rocksdb/util/stop_watch.h @@ -0,0 +1,118 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#pragma once +#include "monitoring/statistics.h" +#include "rocksdb/system_clock.h" + +namespace ROCKSDB_NAMESPACE { +// Auto-scoped. +// Records the measure time into the corresponding histogram if statistics +// is not nullptr. It is also saved into *elapsed if the pointer is not nullptr +// and overwrite is true, it will be added to *elapsed if overwrite is false. 
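// --- Editor's note (illustrative sketch, not part of the upstream patch):
// how Status::ToString() in status.cc above composes its output from the code
// string and the two message slices passed at construction. The message text
// is invented for illustration.
#include <cassert>
#include "rocksdb/status.h"

void StatusToStringSketch() {
  using ROCKSDB_NAMESPACE::Status;
  Status s = Status::NotFound("key42", "no such entry");
  assert(s.IsNotFound());
  // "NotFound: " (code) + "key42" + ": " + "no such entry" (state built by
  // the Status constructor from msg and msg2).
  assert(s.ToString() == "NotFound: key42: no such entry");
}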
+class StopWatch { + public: + StopWatch(SystemClock* clock, Statistics* statistics, + const uint32_t hist_type, uint64_t* elapsed = nullptr, + bool overwrite = true, bool delay_enabled = false) + : clock_(clock), + statistics_(statistics), + hist_type_(hist_type), + elapsed_(elapsed), + overwrite_(overwrite), + stats_enabled_(statistics && + statistics->get_stats_level() >= + StatsLevel::kExceptTimers && + statistics->HistEnabledForType(hist_type)), + delay_enabled_(delay_enabled), + total_delay_(0), + delay_start_time_(0), + start_time_((stats_enabled_ || elapsed != nullptr) ? clock->NowMicros() + : 0) {} + + ~StopWatch() { + if (elapsed_) { + if (overwrite_) { + *elapsed_ = clock_->NowMicros() - start_time_; + } else { + *elapsed_ += clock_->NowMicros() - start_time_; + } + } + if (elapsed_ && delay_enabled_) { + *elapsed_ -= total_delay_; + } + if (stats_enabled_) { + statistics_->reportTimeToHistogram( + hist_type_, (elapsed_ != nullptr) + ? *elapsed_ + : (clock_->NowMicros() - start_time_)); + } + } + + void DelayStart() { + // if delay_start_time_ is not 0, it means we are already tracking delay, + // so delay_start_time_ should not be overwritten + if (elapsed_ && delay_enabled_ && delay_start_time_ == 0) { + delay_start_time_ = clock_->NowMicros(); + } + } + + void DelayStop() { + if (elapsed_ && delay_enabled_ && delay_start_time_ != 0) { + total_delay_ += clock_->NowMicros() - delay_start_time_; + } + // reset to 0 means currently no delay is being tracked, so two consecutive + // calls to DelayStop will not increase total_delay_ + delay_start_time_ = 0; + } + + uint64_t GetDelay() const { return delay_enabled_ ? total_delay_ : 0; } + + uint64_t start_time() const { return start_time_; } + + private: + SystemClock* clock_; + Statistics* statistics_; + const uint32_t hist_type_; + uint64_t* elapsed_; + bool overwrite_; + bool stats_enabled_; + bool delay_enabled_; + uint64_t total_delay_; + uint64_t delay_start_time_; + const uint64_t start_time_; +}; + +// a nano second precision stopwatch +class StopWatchNano { + public: + explicit StopWatchNano(SystemClock* clock, bool auto_start = false) + : clock_(clock), start_(0) { + if (auto_start) { + Start(); + } + } + + void Start() { start_ = clock_->NowNanos(); } + + uint64_t ElapsedNanos(bool reset = false) { + auto now = clock_->NowNanos(); + auto elapsed = now - start_; + if (reset) { + start_ = now; + } + return elapsed; + } + + uint64_t ElapsedNanosSafe(bool reset = false) { + return (clock_ != nullptr) ? ElapsedNanos(reset) : 0U; + } + + private: + SystemClock* clock_; + uint64_t start_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/string_util.cc b/src/rocksdb/util/string_util.cc new file mode 100644 index 000000000..324482a4c --- /dev/null +++ b/src/rocksdb/util/string_util.cc @@ -0,0 +1,504 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
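// --- Editor's note (illustrative sketch, not part of the upstream patch):
// timing a scope with the StopWatch/StopWatchNano helpers from stop_watch.h
// above. Passing a null Statistics pointer means only *elapsed is recorded;
// the histogram id 0 is a placeholder since no histogram is reported.
#include <cstdint>
#include "rocksdb/system_clock.h"
#include "util/stop_watch.h"

uint64_t TimeSomeWork() {
  using namespace ROCKSDB_NAMESPACE;
  SystemClock* clock = SystemClock::Default().get();
  uint64_t elapsed_micros = 0;
  {
    // On destruction, elapsed_micros is set to the scope's duration in micros.
    StopWatch sw(clock, /*statistics=*/nullptr, /*hist_type=*/0,
                 &elapsed_micros);
    // ... the work being measured would go here ...
  }
  StopWatchNano nano(clock, /*auto_start=*/true);
  uint64_t nanos = nano.ElapsedNanos();  // nanoseconds since construction
  (void)nanos;
  return elapsed_micros;
}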
+// +#include "util/string_util.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "port/port.h" +#include "port/sys_time.h" +#include "rocksdb/slice.h" + +#ifndef __has_cpp_attribute +#define ROCKSDB_HAS_CPP_ATTRIBUTE(x) 0 +#else +#define ROCKSDB_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) +#endif + +#if ROCKSDB_HAS_CPP_ATTRIBUTE(maybe_unused) && __cplusplus >= 201703L +#define ROCKSDB_MAYBE_UNUSED [[maybe_unused]] +#elif ROCKSDB_HAS_CPP_ATTRIBUTE(gnu::unused) || __GNUC__ +#define ROCKSDB_MAYBE_UNUSED [[gnu::unused]] +#else +#define ROCKSDB_MAYBE_UNUSED +#endif + +namespace ROCKSDB_NAMESPACE { + +const std::string kNullptrString = "nullptr"; + +std::vector StringSplit(const std::string& arg, char delim) { + std::vector splits; + std::stringstream ss(arg); + std::string item; + while (std::getline(ss, item, delim)) { + splits.push_back(item); + } + return splits; +} + +// for micros < 10ms, print "XX us". +// for micros < 10sec, print "XX ms". +// for micros >= 10 sec, print "XX sec". +// for micros <= 1 hour, print Y:X M:S". +// for micros > 1 hour, print Z:Y:X H:M:S". +int AppendHumanMicros(uint64_t micros, char* output, int len, + bool fixed_format) { + if (micros < 10000 && !fixed_format) { + return snprintf(output, len, "%" PRIu64 " us", micros); + } else if (micros < 10000000 && !fixed_format) { + return snprintf(output, len, "%.3lf ms", + static_cast(micros) / 1000); + } else if (micros < 1000000l * 60 && !fixed_format) { + return snprintf(output, len, "%.3lf sec", + static_cast(micros) / 1000000); + } else if (micros < 1000000ll * 60 * 60 && !fixed_format) { + return snprintf(output, len, "%02" PRIu64 ":%05.3f M:S", + micros / 1000000 / 60, + static_cast(micros % 60000000) / 1000000); + } else { + return snprintf(output, len, "%02" PRIu64 ":%02" PRIu64 ":%05.3f H:M:S", + micros / 1000000 / 3600, (micros / 1000000 / 60) % 60, + static_cast(micros % 60000000) / 1000000); + } +} + +// for sizes >=10TB, print "XXTB" +// for sizes >=10GB, print "XXGB" +// etc. +// append file size summary to output and return the len +int AppendHumanBytes(uint64_t bytes, char* output, int len) { + const uint64_t ull10 = 10; + if (bytes >= ull10 << 40) { + return snprintf(output, len, "%" PRIu64 "TB", bytes >> 40); + } else if (bytes >= ull10 << 30) { + return snprintf(output, len, "%" PRIu64 "GB", bytes >> 30); + } else if (bytes >= ull10 << 20) { + return snprintf(output, len, "%" PRIu64 "MB", bytes >> 20); + } else if (bytes >= ull10 << 10) { + return snprintf(output, len, "%" PRIu64 "KB", bytes >> 10); + } else { + return snprintf(output, len, "%" PRIu64 "B", bytes); + } +} + +void AppendNumberTo(std::string* str, uint64_t num) { + char buf[30]; + snprintf(buf, sizeof(buf), "%" PRIu64, num); + str->append(buf); +} + +void AppendEscapedStringTo(std::string* str, const Slice& value) { + for (size_t i = 0; i < value.size(); i++) { + char c = value[i]; + if (c >= ' ' && c <= '~') { + str->push_back(c); + } else { + char buf[10]; + snprintf(buf, sizeof(buf), "\\x%02x", + static_cast(c) & 0xff); + str->append(buf); + } + } +} + +std::string NumberToHumanString(int64_t num) { + char buf[19]; + int64_t absnum = num < 0 ? 
-num : num; + if (absnum < 10000) { + snprintf(buf, sizeof(buf), "%" PRIi64, num); + } else if (absnum < 10000000) { + snprintf(buf, sizeof(buf), "%" PRIi64 "K", num / 1000); + } else if (absnum < 10000000000LL) { + snprintf(buf, sizeof(buf), "%" PRIi64 "M", num / 1000000); + } else { + snprintf(buf, sizeof(buf), "%" PRIi64 "G", num / 1000000000); + } + return std::string(buf); +} + +std::string BytesToHumanString(uint64_t bytes) { + const char* size_name[] = {"KB", "MB", "GB", "TB"}; + double final_size = static_cast(bytes); + size_t size_idx; + + // always start with KB + final_size /= 1024; + size_idx = 0; + + while (size_idx < 3 && final_size >= 1024) { + final_size /= 1024; + size_idx++; + } + + char buf[20]; + snprintf(buf, sizeof(buf), "%.2f %s", final_size, size_name[size_idx]); + return std::string(buf); +} + +std::string TimeToHumanString(int unixtime) { + char time_buffer[80]; + time_t rawtime = unixtime; + struct tm tInfo; + struct tm* timeinfo = port::LocalTimeR(&rawtime, &tInfo); + assert(timeinfo == &tInfo); + strftime(time_buffer, 80, "%c", timeinfo); + return std::string(time_buffer); +} + +std::string EscapeString(const Slice& value) { + std::string r; + AppendEscapedStringTo(&r, value); + return r; +} + +bool ConsumeDecimalNumber(Slice* in, uint64_t* val) { + uint64_t v = 0; + int digits = 0; + while (!in->empty()) { + char c = (*in)[0]; + if (c >= '0' && c <= '9') { + ++digits; + const unsigned int delta = (c - '0'); + static const uint64_t kMaxUint64 = ~static_cast(0); + if (v > kMaxUint64 / 10 || + (v == kMaxUint64 / 10 && delta > kMaxUint64 % 10)) { + // Overflow + return false; + } + v = (v * 10) + delta; + in->remove_prefix(1); + } else { + break; + } + } + *val = v; + return (digits > 0); +} + +bool isSpecialChar(const char c) { + if (c == '\\' || c == '#' || c == ':' || c == '\r' || c == '\n') { + return true; + } + return false; +} + +namespace { +using CharMap = std::pair; +} + +char UnescapeChar(const char c) { + static const CharMap convert_map[] = {{'r', '\r'}, {'n', '\n'}}; + + auto iter = std::find_if(std::begin(convert_map), std::end(convert_map), + [c](const CharMap& p) { return p.first == c; }); + + if (iter == std::end(convert_map)) { + return c; + } + return iter->second; +} + +char EscapeChar(const char c) { + static const CharMap convert_map[] = {{'\n', 'n'}, {'\r', 'r'}}; + + auto iter = std::find_if(std::begin(convert_map), std::end(convert_map), + [c](const CharMap& p) { return p.first == c; }); + + if (iter == std::end(convert_map)) { + return c; + } + return iter->second; +} + +std::string EscapeOptionString(const std::string& raw_string) { + std::string output; + for (auto c : raw_string) { + if (isSpecialChar(c)) { + output += '\\'; + output += EscapeChar(c); + } else { + output += c; + } + } + + return output; +} + +std::string UnescapeOptionString(const std::string& escaped_string) { + bool escaped = false; + std::string output; + + for (auto c : escaped_string) { + if (escaped) { + output += UnescapeChar(c); + escaped = false; + } else { + if (c == '\\') { + escaped = true; + continue; + } + output += c; + } + } + return output; +} + +std::string trim(const std::string& str) { + if (str.empty()) return std::string(); + size_t start = 0; + size_t end = str.size() - 1; + while (isspace(str[start]) != 0 && start < end) { + ++start; + } + while (isspace(str[end]) != 0 && start < end) { + --end; + } + if (start <= end) { + return str.substr(start, end - start + 1); + } + return std::string(); +} + +bool EndsWith(const std::string& string, const 
std::string& pattern) { + size_t plen = pattern.size(); + size_t slen = string.size(); + if (plen <= slen) { + return string.compare(slen - plen, plen, pattern) == 0; + } else { + return false; + } +} + +bool StartsWith(const std::string& string, const std::string& pattern) { + return string.compare(0, pattern.size(), pattern) == 0; +} + +#ifndef ROCKSDB_LITE + +bool ParseBoolean(const std::string& type, const std::string& value) { + if (value == "true" || value == "1") { + return true; + } else if (value == "false" || value == "0") { + return false; + } + throw std::invalid_argument(type); +} + +uint8_t ParseUint8(const std::string& value) { + uint64_t num = ParseUint64(value); + if ((num >> 8LL) == 0) { + return static_cast(num); + } else { + throw std::out_of_range(value); + } +} + +uint32_t ParseUint32(const std::string& value) { + uint64_t num = ParseUint64(value); + if ((num >> 32LL) == 0) { + return static_cast(num); + } else { + throw std::out_of_range(value); + } +} + +int32_t ParseInt32(const std::string& value) { + int64_t num = ParseInt64(value); + if (num <= std::numeric_limits::max() && + num >= std::numeric_limits::min()) { + return static_cast(num); + } else { + throw std::out_of_range(value); + } +} + +#endif + +uint64_t ParseUint64(const std::string& value) { + size_t endchar; +#ifndef CYGWIN + uint64_t num = std::stoull(value.c_str(), &endchar); +#else + char* endptr; + uint64_t num = std::strtoul(value.c_str(), &endptr, 0); + endchar = endptr - value.c_str(); +#endif + + if (endchar < value.length()) { + char c = value[endchar]; + if (c == 'k' || c == 'K') + num <<= 10LL; + else if (c == 'm' || c == 'M') + num <<= 20LL; + else if (c == 'g' || c == 'G') + num <<= 30LL; + else if (c == 't' || c == 'T') + num <<= 40LL; + } + + return num; +} + +int64_t ParseInt64(const std::string& value) { + size_t endchar; +#ifndef CYGWIN + int64_t num = std::stoll(value.c_str(), &endchar); +#else + char* endptr; + int64_t num = std::strtoll(value.c_str(), &endptr, 0); + endchar = endptr - value.c_str(); +#endif + + if (endchar < value.length()) { + char c = value[endchar]; + if (c == 'k' || c == 'K') + num <<= 10LL; + else if (c == 'm' || c == 'M') + num <<= 20LL; + else if (c == 'g' || c == 'G') + num <<= 30LL; + else if (c == 't' || c == 'T') + num <<= 40LL; + } + + return num; +} + +int ParseInt(const std::string& value) { + size_t endchar; +#ifndef CYGWIN + int num = std::stoi(value.c_str(), &endchar); +#else + char* endptr; + int num = std::strtoul(value.c_str(), &endptr, 0); + endchar = endptr - value.c_str(); +#endif + + if (endchar < value.length()) { + char c = value[endchar]; + if (c == 'k' || c == 'K') + num <<= 10; + else if (c == 'm' || c == 'M') + num <<= 20; + else if (c == 'g' || c == 'G') + num <<= 30; + } + + return num; +} + +double ParseDouble(const std::string& value) { +#ifndef CYGWIN + return std::stod(value); +#else + return std::strtod(value.c_str(), 0); +#endif +} + +size_t ParseSizeT(const std::string& value) { + return static_cast(ParseUint64(value)); +} + +std::vector ParseVectorInt(const std::string& value) { + std::vector result; + size_t start = 0; + while (start < value.size()) { + size_t end = value.find(':', start); + if (end == std::string::npos) { + result.push_back(ParseInt(value.substr(start))); + break; + } else { + result.push_back(ParseInt(value.substr(start, end - start))); + start = end + 1; + } + } + return result; +} + +bool SerializeIntVector(const std::vector& vec, std::string* value) { + *value = ""; + for (size_t i = 0; i < vec.size(); 
++i) { + if (i > 0) { + *value += ":"; + } + *value += std::to_string(vec[i]); + } + return true; +} + +// Copied from folly/string.cpp: +// https://github.com/facebook/folly/blob/0deef031cb8aab76dc7e736f8b7c22d701d5f36b/folly/String.cpp#L457 +// There are two variants of `strerror_r` function, one returns +// `int`, and another returns `char*`. Selecting proper version using +// preprocessor macros portably is extremely hard. +// +// For example, on Android function signature depends on `__USE_GNU` and +// `__ANDROID_API__` macros (https://git.io/fjBBE). +// +// So we are using C++ overloading trick: we pass a pointer of +// `strerror_r` to `invoke_strerror_r` function, and C++ compiler +// selects proper function. + +#if !(defined(_WIN32) && (defined(__MINGW32__) || defined(_MSC_VER))) +ROCKSDB_MAYBE_UNUSED +static std::string invoke_strerror_r(int (*strerror_r)(int, char*, size_t), + int err, char* buf, size_t buflen) { + // Using XSI-compatible strerror_r + int r = strerror_r(err, buf, buflen); + + // OSX/FreeBSD use EINVAL and Linux uses -1 so just check for non-zero + if (r != 0) { + snprintf(buf, buflen, "Unknown error %d (strerror_r failed with error %d)", + err, errno); + } + return buf; +} + +ROCKSDB_MAYBE_UNUSED +static std::string invoke_strerror_r(char* (*strerror_r)(int, char*, size_t), + int err, char* buf, size_t buflen) { + // Using GNU strerror_r + return strerror_r(err, buf, buflen); +} +#endif // !(defined(_WIN32) && (defined(__MINGW32__) || defined(_MSC_VER))) + +std::string errnoStr(int err) { + char buf[1024]; + buf[0] = '\0'; + + std::string result; + + // https://developer.apple.com/library/mac/documentation/Darwin/Reference/ManPages/man3/strerror_r.3.html + // http://www.kernel.org/doc/man-pages/online/pages/man3/strerror.3.html +#if defined(_WIN32) && (defined(__MINGW32__) || defined(_MSC_VER)) + // mingw64 has no strerror_r, but Windows has strerror_s, which C11 added + // as well. So maybe we should use this across all platforms (together + // with strerrorlen_s). Note strerror_r and _s have swapped args. + int r = strerror_s(buf, sizeof(buf), err); + if (r != 0) { + snprintf(buf, sizeof(buf), + "Unknown error %d (strerror_r failed with error %d)", err, errno); + } + result.assign(buf); +#else + // Using any strerror_r + result.assign(invoke_strerror_r(strerror_r, err, buf, sizeof(buf))); +#endif + + return result; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/string_util.h b/src/rocksdb/util/string_util.h new file mode 100644 index 000000000..11178fd1d --- /dev/null +++ b/src/rocksdb/util/string_util.h @@ -0,0 +1,177 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include +#include +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +class Slice; + +extern std::vector StringSplit(const std::string& arg, char delim); + +// Append a human-readable printout of "num" to *str +extern void AppendNumberTo(std::string* str, uint64_t num); + +// Append a human-readable printout of "value" to *str. +// Escapes any non-printable characters found in "value". +extern void AppendEscapedStringTo(std::string* str, const Slice& value); + +// Put n digits from v in base kBase to (*buf)[0] to (*buf)[n-1] and +// advance *buf to the position after what was written. 
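// --- Editor's note (illustrative sketch, not part of the upstream patch):
// a few of the helpers implemented in string_util.cc above, with the values
// they produce per that implementation. The inputs are arbitrary examples.
#include <cassert>
#include <string>
#include "util/string_util.h"

void StringUtilExamples() {
  using namespace ROCKSDB_NAMESPACE;
  // ParseUint64 treats trailing k/m/g/t as binary shifts.
  assert(ParseUint64("4k") == 4096u);
  // BytesToHumanString starts at KB and keeps two decimals; 1 MiB formats as
  // "1.00 MB" per the implementation above.
  assert(BytesToHumanString(1048576) == "1.00 MB");
  // NumberToHumanString switches to K/M/G only at 10^4/10^7/10^10.
  assert(NumberToHumanString(9999) == "9999");
  assert(NumberToHumanString(123456) == "123K");
  // EscapeOptionString/UnescapeOptionString round-trip special characters.
  std::string escaped = EscapeOptionString("a:b\nc");
  assert(UnescapeOptionString(escaped) == "a:b\nc");
}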
+template +inline void PutBaseChars(char** buf, size_t n, uint64_t v, bool uppercase) { + const char* digitChars = uppercase ? "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" + : "0123456789abcdefghijklmnopqrstuvwxyz"; + for (size_t i = n; i > 0; --i) { + (*buf)[i - 1] = digitChars[static_cast(v % kBase)]; + v /= kBase; + } + *buf += n; +} + +// Parse n digits from *buf in base kBase to *v and advance *buf to the +// position after what was read. On success, true is returned. On failure, +// false is returned, *buf is placed at the first bad character, and *v +// contains the partial parsed data. Overflow is not checked but the +// result is accurate mod 2^64. Requires the starting value of *v to be +// zero or previously accumulated parsed digits, i.e. +// ParseBaseChars(&b, n, &v); +// is equivalent to n calls to +// ParseBaseChars(&b, 1, &v); +template +inline bool ParseBaseChars(const char** buf, size_t n, uint64_t* v) { + while (n) { + char c = **buf; + *v *= static_cast(kBase); + if (c >= '0' && (kBase >= 10 ? c <= '9' : c < '0' + kBase)) { + *v += static_cast(c - '0'); + } else if (kBase > 10 && c >= 'A' && c < 'A' + kBase - 10) { + *v += static_cast(c - 'A' + 10); + } else if (kBase > 10 && c >= 'a' && c < 'a' + kBase - 10) { + *v += static_cast(c - 'a' + 10); + } else { + return false; + } + --n; + ++*buf; + } + return true; +} + +// Return a human-readable version of num. +// for num >= 10.000, prints "xxK" +// for num >= 10.000.000, prints "xxM" +// for num >= 10.000.000.000, prints "xxG" +extern std::string NumberToHumanString(int64_t num); + +// Return a human-readable version of bytes +// ex: 1048576 -> 1.00 GB +extern std::string BytesToHumanString(uint64_t bytes); + +// Return a human-readable version of unix time +// ex: 1562116015 -> "Tue Jul 2 18:06:55 2019" +extern std::string TimeToHumanString(int unixtime); + +// Append a human-readable time in micros. +int AppendHumanMicros(uint64_t micros, char* output, int len, + bool fixed_format); + +// Append a human-readable size in bytes +int AppendHumanBytes(uint64_t bytes, char* output, int len); + +// Return a human-readable version of "value". +// Escapes any non-printable characters found in "value". +extern std::string EscapeString(const Slice& value); + +// Parse a human-readable number from "*in" into *value. On success, +// advances "*in" past the consumed number and sets "*val" to the +// numeric value. Otherwise, returns false and leaves *in in an +// unspecified state. +extern bool ConsumeDecimalNumber(Slice* in, uint64_t* val); + +// Returns true if the input char "c" is considered as a special character +// that will be escaped when EscapeOptionString() is called. +// +// @param c the input char +// @return true if the input char "c" is considered as a special character. +// @see EscapeOptionString +bool isSpecialChar(const char c); + +// If the input char is an escaped char, it will return the its +// associated raw-char. Otherwise, the function will simply return +// the original input char. +char UnescapeChar(const char c); + +// If the input char is a control char, it will return the its +// associated escaped char. Otherwise, the function will simply return +// the original input char. +char EscapeChar(const char c); + +// Converts a raw string to an escaped string. Escaped-characters are +// defined via the isSpecialChar() function. When a char in the input +// string "raw_string" is classified as a special characters, then it +// will be prefixed by '\' in the output. 
+// +// It's inverse function is UnescapeOptionString(). +// @param raw_string the input string +// @return the '\' escaped string of the input "raw_string" +// @see isSpecialChar, UnescapeOptionString +std::string EscapeOptionString(const std::string& raw_string); + +// The inverse function of EscapeOptionString. It converts +// an '\' escaped string back to a raw string. +// +// @param escaped_string the input '\' escaped string +// @return the raw string of the input "escaped_string" +std::string UnescapeOptionString(const std::string& escaped_string); + +std::string trim(const std::string& str); + +// Returns true if "string" ends with "pattern" +bool EndsWith(const std::string& string, const std::string& pattern); + +// Returns true if "string" starts with "pattern" +bool StartsWith(const std::string& string, const std::string& pattern); + +#ifndef ROCKSDB_LITE +bool ParseBoolean(const std::string& type, const std::string& value); + +uint8_t ParseUint8(const std::string& value); + +uint32_t ParseUint32(const std::string& value); + +int32_t ParseInt32(const std::string& value); +#endif + +uint64_t ParseUint64(const std::string& value); + +int ParseInt(const std::string& value); + +int64_t ParseInt64(const std::string& value); + +double ParseDouble(const std::string& value); + +size_t ParseSizeT(const std::string& value); + +std::vector ParseVectorInt(const std::string& value); + +bool SerializeIntVector(const std::vector& vec, std::string* value); + +extern const std::string kNullptrString; + +// errnoStr() function returns a string that describes the error code passed in +// the argument err +extern std::string errnoStr(int err); + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/thread_guard.h b/src/rocksdb/util/thread_guard.h new file mode 100644 index 000000000..b2bb06a1b --- /dev/null +++ b/src/rocksdb/util/thread_guard.h @@ -0,0 +1,41 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "port/port.h" +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// Resource management object for threads that joins the thread upon +// destruction. Has unique ownership of the thread object, so copying it is not +// allowed, while moving it transfers ownership. +class ThreadGuard { + public: + ThreadGuard() = default; + + explicit ThreadGuard(port::Thread&& thread) : thread_(std::move(thread)) {} + + ThreadGuard(const ThreadGuard&) = delete; + ThreadGuard& operator=(const ThreadGuard&) = delete; + + ThreadGuard(ThreadGuard&&) noexcept = default; + ThreadGuard& operator=(ThreadGuard&&) noexcept = default; + + ~ThreadGuard() { + if (thread_.joinable()) { + thread_.join(); + } + } + + const port::Thread& GetThread() const { return thread_; } + port::Thread& GetThread() { return thread_; } + + private: + port::Thread thread_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/thread_list_test.cc b/src/rocksdb/util/thread_list_test.cc new file mode 100644 index 000000000..af4e62355 --- /dev/null +++ b/src/rocksdb/util/thread_list_test.cc @@ -0,0 +1,360 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include +#include + +#include "monitoring/thread_status_updater.h" +#include "rocksdb/db.h" +#include "test_util/testharness.h" + +#ifdef ROCKSDB_USING_THREAD_STATUS + +namespace ROCKSDB_NAMESPACE { + +class SimulatedBackgroundTask { + public: + SimulatedBackgroundTask( + const void* db_key, const std::string& db_name, const void* cf_key, + const std::string& cf_name, + const ThreadStatus::OperationType operation_type = + ThreadStatus::OP_UNKNOWN, + const ThreadStatus::StateType state_type = ThreadStatus::STATE_UNKNOWN) + : db_key_(db_key), + db_name_(db_name), + cf_key_(cf_key), + cf_name_(cf_name), + operation_type_(operation_type), + state_type_(state_type), + should_run_(true), + running_count_(0) { + Env::Default()->GetThreadStatusUpdater()->NewColumnFamilyInfo( + db_key_, db_name_, cf_key_, cf_name_); + } + + ~SimulatedBackgroundTask() { + Env::Default()->GetThreadStatusUpdater()->EraseDatabaseInfo(db_key_); + } + + void Run() { + std::unique_lock l(mutex_); + running_count_++; + bg_cv_.notify_all(); + Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(cf_key_); + Env::Default()->GetThreadStatusUpdater()->SetThreadOperation( + operation_type_); + Env::Default()->GetThreadStatusUpdater()->SetThreadState(state_type_); + while (should_run_) { + bg_cv_.wait(l); + } + Env::Default()->GetThreadStatusUpdater()->ClearThreadState(); + Env::Default()->GetThreadStatusUpdater()->ClearThreadOperation(); + Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(nullptr); + running_count_--; + bg_cv_.notify_all(); + } + + void FinishAllTasks() { + std::unique_lock l(mutex_); + should_run_ = false; + bg_cv_.notify_all(); + } + + void WaitUntilScheduled(int job_count) { + std::unique_lock l(mutex_); + while (running_count_ < job_count) { + bg_cv_.wait(l); + } + } + + void WaitUntilDone() { + std::unique_lock l(mutex_); + while (running_count_ > 0) { + bg_cv_.wait(l); + } + } + + static void DoSimulatedTask(void* arg) { + reinterpret_cast(arg)->Run(); + } + + private: + const void* db_key_; + const std::string db_name_; + const void* cf_key_; + const std::string cf_name_; + const ThreadStatus::OperationType operation_type_; + const ThreadStatus::StateType state_type_; + std::mutex mutex_; + std::condition_variable bg_cv_; + bool should_run_; + std::atomic running_count_; +}; + +class ThreadListTest : public testing::Test { + public: + ThreadListTest() {} +}; + +TEST_F(ThreadListTest, GlobalTables) { + // verify the global tables for operations and states are properly indexed. 
+ for (int type = 0; type != ThreadStatus::NUM_OP_TYPES; ++type) { + ASSERT_EQ(global_operation_table[type].type, type); + ASSERT_EQ( + global_operation_table[type].name, + ThreadStatus::GetOperationName(ThreadStatus::OperationType(type))); + } + + for (int type = 0; type != ThreadStatus::NUM_STATE_TYPES; ++type) { + ASSERT_EQ(global_state_table[type].type, type); + ASSERT_EQ(global_state_table[type].name, + ThreadStatus::GetStateName(ThreadStatus::StateType(type))); + } + + for (int stage = 0; stage != ThreadStatus::NUM_OP_STAGES; ++stage) { + ASSERT_EQ(global_op_stage_table[stage].stage, stage); + ASSERT_EQ(global_op_stage_table[stage].name, + ThreadStatus::GetOperationStageName( + ThreadStatus::OperationStage(stage))); + } +} + +TEST_F(ThreadListTest, SimpleColumnFamilyInfoTest) { + Env* env = Env::Default(); + const int kHighPriorityThreads = 3; + const int kLowPriorityThreads = 5; + const int kSimulatedHighPriThreads = kHighPriorityThreads - 1; + const int kSimulatedLowPriThreads = kLowPriorityThreads / 3; + const int kDelayMicros = 1000000; + env->SetBackgroundThreads(kHighPriorityThreads, Env::HIGH); + env->SetBackgroundThreads(kLowPriorityThreads, Env::LOW); + // Wait 1 second so that threads start + Env::Default()->SleepForMicroseconds(kDelayMicros); + SimulatedBackgroundTask running_task(reinterpret_cast(1234), "running", + reinterpret_cast(5678), + "pikachu"); + + for (int test = 0; test < kSimulatedHighPriThreads; ++test) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, &running_task, + Env::Priority::HIGH); + } + + for (int test = 0; test < kSimulatedLowPriThreads; ++test) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, &running_task, + Env::Priority::LOW); + } + running_task.WaitUntilScheduled(kSimulatedHighPriThreads + + kSimulatedLowPriThreads); + // We can only reserve limited number of waiting threads + ASSERT_EQ(kHighPriorityThreads - kSimulatedHighPriThreads, + env->ReserveThreads(kHighPriorityThreads, Env::Priority::HIGH)); + ASSERT_EQ(kLowPriorityThreads - kSimulatedLowPriThreads, + env->ReserveThreads(kLowPriorityThreads, Env::Priority::LOW)); + + // Reservation shall not affect the existing thread list + std::vector thread_list; + + // Verify the number of running threads in each pool. 
+ ASSERT_OK(env->GetThreadList(&thread_list)); + int running_count[ThreadStatus::NUM_THREAD_TYPES] = {0}; + for (auto thread_status : thread_list) { + if (thread_status.cf_name == "pikachu" && + thread_status.db_name == "running") { + running_count[thread_status.thread_type]++; + } + } + // Cannot reserve more threads + ASSERT_EQ(0, env->ReserveThreads(kHighPriorityThreads, Env::Priority::HIGH)); + ASSERT_EQ(0, env->ReserveThreads(kLowPriorityThreads, Env::Priority::LOW)); + + ASSERT_EQ(running_count[ThreadStatus::HIGH_PRIORITY], + kSimulatedHighPriThreads); + ASSERT_EQ(running_count[ThreadStatus::LOW_PRIORITY], kSimulatedLowPriThreads); + ASSERT_EQ(running_count[ThreadStatus::USER], 0); + + running_task.FinishAllTasks(); + running_task.WaitUntilDone(); + + ASSERT_EQ(kHighPriorityThreads - kSimulatedHighPriThreads, + env->ReleaseThreads(kHighPriorityThreads, Env::Priority::HIGH)); + ASSERT_EQ(kLowPriorityThreads - kSimulatedLowPriThreads, + env->ReleaseThreads(kLowPriorityThreads, Env::Priority::LOW)); + // Verify none of the threads are running + ASSERT_OK(env->GetThreadList(&thread_list)); + + for (int i = 0; i < ThreadStatus::NUM_THREAD_TYPES; ++i) { + running_count[i] = 0; + } + for (auto thread_status : thread_list) { + if (thread_status.cf_name == "pikachu" && + thread_status.db_name == "running") { + running_count[thread_status.thread_type]++; + } + } + + ASSERT_EQ(running_count[ThreadStatus::HIGH_PRIORITY], 0); + ASSERT_EQ(running_count[ThreadStatus::LOW_PRIORITY], 0); + ASSERT_EQ(running_count[ThreadStatus::USER], 0); +} + +namespace { +void UpdateStatusCounts(const std::vector& thread_list, + int operation_counts[], int state_counts[]) { + for (auto thread_status : thread_list) { + operation_counts[thread_status.operation_type]++; + state_counts[thread_status.state_type]++; + } +} + +void VerifyAndResetCounts(const int correct_counts[], int collected_counts[], + int size) { + for (int i = 0; i < size; ++i) { + ASSERT_EQ(collected_counts[i], correct_counts[i]); + collected_counts[i] = 0; + } +} + +void UpdateCount(int operation_counts[], int from_event, int to_event, + int amount) { + operation_counts[from_event] -= amount; + operation_counts[to_event] += amount; +} +} // namespace + +TEST_F(ThreadListTest, SimpleEventTest) { + Env* env = Env::Default(); + + // simulated tasks + const int kFlushWriteTasks = 3; + SimulatedBackgroundTask flush_write_task( + reinterpret_cast(1234), "running", reinterpret_cast(5678), + "pikachu", ThreadStatus::OP_FLUSH); + + const int kCompactionWriteTasks = 4; + SimulatedBackgroundTask compaction_write_task( + reinterpret_cast(1234), "running", reinterpret_cast(5678), + "pikachu", ThreadStatus::OP_COMPACTION); + + const int kCompactionReadTasks = 5; + SimulatedBackgroundTask compaction_read_task( + reinterpret_cast(1234), "running", reinterpret_cast(5678), + "pikachu", ThreadStatus::OP_COMPACTION); + + const int kCompactionWaitTasks = 6; + SimulatedBackgroundTask compaction_wait_task( + reinterpret_cast(1234), "running", reinterpret_cast(5678), + "pikachu", ThreadStatus::OP_COMPACTION); + + // setup right answers + int correct_operation_counts[ThreadStatus::NUM_OP_TYPES] = {0}; + correct_operation_counts[ThreadStatus::OP_FLUSH] = kFlushWriteTasks; + correct_operation_counts[ThreadStatus::OP_COMPACTION] = + kCompactionWriteTasks + kCompactionReadTasks + kCompactionWaitTasks; + + env->SetBackgroundThreads(correct_operation_counts[ThreadStatus::OP_FLUSH], + Env::HIGH); + env->SetBackgroundThreads( + correct_operation_counts[ThreadStatus::OP_COMPACTION], 
Env::LOW); + + // schedule the simulated tasks + for (int t = 0; t < kFlushWriteTasks; ++t) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, &flush_write_task, + Env::Priority::HIGH); + } + flush_write_task.WaitUntilScheduled(kFlushWriteTasks); + + for (int t = 0; t < kCompactionWriteTasks; ++t) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + &compaction_write_task, Env::Priority::LOW); + } + compaction_write_task.WaitUntilScheduled(kCompactionWriteTasks); + + for (int t = 0; t < kCompactionReadTasks; ++t) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + &compaction_read_task, Env::Priority::LOW); + } + compaction_read_task.WaitUntilScheduled(kCompactionReadTasks); + + for (int t = 0; t < kCompactionWaitTasks; ++t) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + &compaction_wait_task, Env::Priority::LOW); + } + compaction_wait_task.WaitUntilScheduled(kCompactionWaitTasks); + + // verify the thread-status + int operation_counts[ThreadStatus::NUM_OP_TYPES] = {0}; + int state_counts[ThreadStatus::NUM_STATE_TYPES] = {0}; + + std::vector thread_list; + ASSERT_OK(env->GetThreadList(&thread_list)); + UpdateStatusCounts(thread_list, operation_counts, state_counts); + VerifyAndResetCounts(correct_operation_counts, operation_counts, + ThreadStatus::NUM_OP_TYPES); + + // terminate compaction-wait tasks and see if the thread-status + // reflects this update + compaction_wait_task.FinishAllTasks(); + compaction_wait_task.WaitUntilDone(); + UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION, + ThreadStatus::OP_UNKNOWN, kCompactionWaitTasks); + + ASSERT_OK(env->GetThreadList(&thread_list)); + UpdateStatusCounts(thread_list, operation_counts, state_counts); + VerifyAndResetCounts(correct_operation_counts, operation_counts, + ThreadStatus::NUM_OP_TYPES); + + // terminate flush-write tasks and see if the thread-status + // reflects this update + flush_write_task.FinishAllTasks(); + flush_write_task.WaitUntilDone(); + UpdateCount(correct_operation_counts, ThreadStatus::OP_FLUSH, + ThreadStatus::OP_UNKNOWN, kFlushWriteTasks); + + ASSERT_OK(env->GetThreadList(&thread_list)); + UpdateStatusCounts(thread_list, operation_counts, state_counts); + VerifyAndResetCounts(correct_operation_counts, operation_counts, + ThreadStatus::NUM_OP_TYPES); + + // terminate compaction-write tasks and see if the thread-status + // reflects this update + compaction_write_task.FinishAllTasks(); + compaction_write_task.WaitUntilDone(); + UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION, + ThreadStatus::OP_UNKNOWN, kCompactionWriteTasks); + + ASSERT_OK(env->GetThreadList(&thread_list)); + UpdateStatusCounts(thread_list, operation_counts, state_counts); + VerifyAndResetCounts(correct_operation_counts, operation_counts, + ThreadStatus::NUM_OP_TYPES); + + // terminate compaction-write tasks and see if the thread-status + // reflects this update + compaction_read_task.FinishAllTasks(); + compaction_read_task.WaitUntilDone(); + UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION, + ThreadStatus::OP_UNKNOWN, kCompactionReadTasks); + + ASSERT_OK(env->GetThreadList(&thread_list)); + UpdateStatusCounts(thread_list, operation_counts, state_counts); + VerifyAndResetCounts(correct_operation_counts, operation_counts, + ThreadStatus::NUM_OP_TYPES); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + 
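// Illustrative sketch (editorial, not part of the test above): outside of tests,
// an application can poll Env::GetThreadList() the same way to see what the
// background threads are doing. Field and enum names follow
// rocksdb/thread_status.h; the summary printed here is only an example, and the
// call is only meaningful when thread status tracking is compiled in (see the
// #else branch just below).
#include <cstdio>
#include <vector>

#include "rocksdb/env.h"
#include "rocksdb/thread_status.h"

void PrintBackgroundOperationCounts() {
  std::vector<ROCKSDB_NAMESPACE::ThreadStatus> thread_list;
  ROCKSDB_NAMESPACE::Status s =
      ROCKSDB_NAMESPACE::Env::Default()->GetThreadList(&thread_list);
  if (!s.ok()) {
    return;  // e.g. not supported by this Env
  }
  int op_counts[ROCKSDB_NAMESPACE::ThreadStatus::NUM_OP_TYPES] = {0};
  for (const auto& ts : thread_list) {
    op_counts[ts.operation_type]++;
  }
  std::printf("flushing: %d, compacting: %d\n",
              op_counts[ROCKSDB_NAMESPACE::ThreadStatus::OP_FLUSH],
              op_counts[ROCKSDB_NAMESPACE::ThreadStatus::OP_COMPACTION]);
}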
+#else + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return 0; +} + +#endif // ROCKSDB_USING_THREAD_STATUS diff --git a/src/rocksdb/util/thread_local.cc b/src/rocksdb/util/thread_local.cc new file mode 100644 index 000000000..969639d9b --- /dev/null +++ b/src/rocksdb/util/thread_local.cc @@ -0,0 +1,521 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/thread_local.h" + +#include + +#include "port/likely.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +struct Entry { + Entry() : ptr(nullptr) {} + Entry(const Entry& e) : ptr(e.ptr.load(std::memory_order_relaxed)) {} + std::atomic ptr; +}; + +class StaticMeta; + +// This is the structure that is declared as "thread_local" storage. +// The vector keep list of atomic pointer for all instances for "current" +// thread. The vector is indexed by an Id that is unique in process and +// associated with one ThreadLocalPtr instance. The Id is assigned by a +// global StaticMeta singleton. So if we instantiated 3 ThreadLocalPtr +// instances, each thread will have a ThreadData with a vector of size 3: +// --------------------------------------------------- +// | | instance 1 | instance 2 | instance 3 | +// --------------------------------------------------- +// | thread 1 | void* | void* | void* | <- ThreadData +// --------------------------------------------------- +// | thread 2 | void* | void* | void* | <- ThreadData +// --------------------------------------------------- +// | thread 3 | void* | void* | void* | <- ThreadData +// --------------------------------------------------- +struct ThreadData { + explicit ThreadData(ThreadLocalPtr::StaticMeta* _inst) + : entries(), next(nullptr), prev(nullptr), inst(_inst) {} + std::vector entries; + ThreadData* next; + ThreadData* prev; + ThreadLocalPtr::StaticMeta* inst; +}; + +class ThreadLocalPtr::StaticMeta { + public: + StaticMeta(); + + // Return the next available Id + uint32_t GetId(); + // Return the next available Id without claiming it + uint32_t PeekId() const; + // Return the given Id back to the free pool. This also triggers + // UnrefHandler for associated pointer value (if not NULL) for all threads. + void ReclaimId(uint32_t id); + + // Return the pointer value for the given id for the current thread. + void* Get(uint32_t id) const; + // Reset the pointer value for the given id for the current thread. + void Reset(uint32_t id, void* ptr); + // Atomically swap the supplied ptr and return the previous value + void* Swap(uint32_t id, void* ptr); + // Atomically compare and swap the provided value only if it equals + // to expected value. + bool CompareAndSwap(uint32_t id, void* ptr, void*& expected); + // Reset all thread local data to replacement, and return non-nullptr + // data for all existing threads + void Scrape(uint32_t id, autovector* ptrs, void* const replacement); + // Update res by applying func on each thread-local value. 
Holds a lock that + // prevents unref handler from running during this call, but clients must + // still provide external synchronization since the owning thread can + // access the values without internal locking, e.g., via Get() and Reset(). + void Fold(uint32_t id, FoldFunc func, void* res); + + // Register the UnrefHandler for id + void SetHandler(uint32_t id, UnrefHandler handler); + + // protect inst, next_instance_id_, free_instance_ids_, head_, + // ThreadData.entries + // + // Note that here we prefer function static variable instead of the usual + // global static variable. The reason is that c++ destruction order of + // static variables in the reverse order of their construction order. + // However, C++ does not guarantee any construction order when global + // static variables are defined in different files, while the function + // static variables are initialized when their function are first called. + // As a result, the construction order of the function static variables + // can be controlled by properly invoke their first function calls in + // the right order. + // + // For instance, the following function contains a function static + // variable. We place a dummy function call of this inside + // Env::Default() to ensure the construction order of the construction + // order. + static port::Mutex* Mutex(); + + // Returns the member mutex of the current StaticMeta. In general, + // Mutex() should be used instead of this one. However, in case where + // the static variable inside Instance() goes out of scope, MemberMutex() + // should be used. One example is OnThreadExit() function. + port::Mutex* MemberMutex() { return &mutex_; } + + private: + // Get UnrefHandler for id with acquiring mutex + // REQUIRES: mutex locked + UnrefHandler GetHandler(uint32_t id); + + // Triggered before a thread terminates + static void OnThreadExit(void* ptr); + + // Add current thread's ThreadData to the global chain + // REQUIRES: mutex locked + void AddThreadData(ThreadData* d); + + // Remove current thread's ThreadData from the global chain + // REQUIRES: mutex locked + void RemoveThreadData(ThreadData* d); + + static ThreadData* GetThreadLocal(); + + uint32_t next_instance_id_; + // Used to recycle Ids in case ThreadLocalPtr is instantiated and destroyed + // frequently. This also prevents it from blowing up the vector space. + autovector free_instance_ids_; + // Chain all thread local structure together. This is necessary since + // when one ThreadLocalPtr gets destroyed, we need to loop over each + // thread's version of pointer corresponding to that instance and + // call UnrefHandler for it. + ThreadData head_; + + std::unordered_map handler_map_; + + // The private mutex. Developers should always use Mutex() instead of + // using this variable directly. + port::Mutex mutex_; + // Thread local storage + static thread_local ThreadData* tls_; + + // Used to make thread exit trigger possible if !defined(OS_MACOSX). + // Otherwise, used to retrieve thread data. + pthread_key_t pthread_key_; +}; + +thread_local ThreadData* ThreadLocalPtr::StaticMeta::tls_ = nullptr; + +// Windows doesn't support a per-thread destructor with its +// TLS primitives. So, we build it manually by inserting a +// function to be called on each thread's exit. +// See http://www.codeproject.com/Articles/8113/Thread-Local-Storage-The-C-Way +// and http://www.nynaeve.net/?p=183 +// +// really we do this to have clear conscience since using TLS with thread-pools +// is iffy +// although OK within a request. 
But otherwise, threads have no identity in its +// modern use. + +// This runs on windows only called from the System Loader +#ifdef OS_WIN + +// Windows cleanup routine is invoked from a System Loader with a different +// signature so we can not directly hookup the original OnThreadExit which is +// private member +// so we make StaticMeta class share with the us the address of the function so +// we can invoke it. +namespace wintlscleanup { + +// This is set to OnThreadExit in StaticMeta singleton constructor +UnrefHandler thread_local_inclass_routine = nullptr; +pthread_key_t thread_local_key = pthread_key_t(-1); + +// Static callback function to call with each thread termination. +void NTAPI WinOnThreadExit(PVOID module, DWORD reason, PVOID reserved) { + // We decided to punt on PROCESS_EXIT + if (DLL_THREAD_DETACH == reason) { + if (thread_local_key != pthread_key_t(-1) && + thread_local_inclass_routine != nullptr) { + void* tls = TlsGetValue(thread_local_key); + if (tls != nullptr) { + thread_local_inclass_routine(tls); + } + } + } +} + +} // namespace wintlscleanup + +// extern "C" suppresses C++ name mangling so we know the symbol name for the +// linker /INCLUDE:symbol pragma above. +extern "C" { + +#ifdef _MSC_VER +// The linker must not discard thread_callback_on_exit. (We force a reference +// to this variable with a linker /include:symbol pragma to ensure that.) If +// this variable is discarded, the OnThreadExit function will never be called. +#ifndef _X86_ + +// .CRT section is merged with .rdata on x64 so it must be constant data. +#pragma const_seg(".CRT$XLB") +// When defining a const variable, it must have external linkage to be sure the +// linker doesn't discard it. +extern const PIMAGE_TLS_CALLBACK p_thread_callback_on_exit; +const PIMAGE_TLS_CALLBACK p_thread_callback_on_exit = + wintlscleanup::WinOnThreadExit; +// Reset the default section. +#pragma const_seg() + +#pragma comment(linker, "/include:_tls_used") +#pragma comment(linker, "/include:p_thread_callback_on_exit") + +#else // _X86_ + +#pragma data_seg(".CRT$XLB") +PIMAGE_TLS_CALLBACK p_thread_callback_on_exit = wintlscleanup::WinOnThreadExit; +// Reset the default section. +#pragma data_seg() + +#pragma comment(linker, "/INCLUDE:__tls_used") +#pragma comment(linker, "/INCLUDE:_p_thread_callback_on_exit") + +#endif // _X86_ + +#else +// https://github.com/couchbase/gperftools/blob/master/src/windows/port.cc +BOOL WINAPI DllMain(HINSTANCE h, DWORD dwReason, PVOID pv) { + if (dwReason == DLL_THREAD_DETACH) + wintlscleanup::WinOnThreadExit(h, dwReason, pv); + return TRUE; +} +#endif +} // extern "C" + +#endif // OS_WIN + +void ThreadLocalPtr::InitSingletons() { ThreadLocalPtr::Instance(); } + +ThreadLocalPtr::StaticMeta* ThreadLocalPtr::Instance() { + // Here we prefer function static variable instead of global + // static variable as function static variable is initialized + // when the function is first call. As a result, we can properly + // control their construction order by properly preparing their + // first function call. + // + // Note that here we decide to make "inst" a static pointer w/o deleting + // it at the end instead of a static variable. This is to avoid the following + // destruction order disaster happens when a child thread using ThreadLocalPtr + // dies AFTER the main thread dies: When a child thread happens to use + // ThreadLocalPtr, it will try to delete its thread-local data on its + // OnThreadExit when the child thread dies. 
However, OnThreadExit depends + // on the following variable. As a result, if the main thread dies before any + // child thread happen to use ThreadLocalPtr dies, then the destruction of + // the following variable will go first, then OnThreadExit, therefore causing + // invalid access. + // + // The above problem can be solved by using thread_local to store tls_. + // thread_local supports dynamic construction and destruction of + // non-primitive typed variables. As a result, we can guarantee the + // destruction order even when the main thread dies before any child threads. + static ThreadLocalPtr::StaticMeta* inst = new ThreadLocalPtr::StaticMeta(); + return inst; +} + +port::Mutex* ThreadLocalPtr::StaticMeta::Mutex() { return &Instance()->mutex_; } + +void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) { + auto* tls = static_cast(ptr); + assert(tls != nullptr); + + // Use the cached StaticMeta::Instance() instead of directly calling + // the variable inside StaticMeta::Instance() might already go out of + // scope here in case this OnThreadExit is called after the main thread + // dies. + auto* inst = tls->inst; + pthread_setspecific(inst->pthread_key_, nullptr); + + MutexLock l(inst->MemberMutex()); + inst->RemoveThreadData(tls); + // Unref stored pointers of current thread from all instances + uint32_t id = 0; + for (auto& e : tls->entries) { + void* raw = e.ptr.load(); + if (raw != nullptr) { + auto unref = inst->GetHandler(id); + if (unref != nullptr) { + unref(raw); + } + } + ++id; + } + // Delete thread local structure no matter if it is Mac platform + delete tls; +} + +ThreadLocalPtr::StaticMeta::StaticMeta() + : next_instance_id_(0), head_(this), pthread_key_(0) { + if (pthread_key_create(&pthread_key_, &OnThreadExit) != 0) { + abort(); + } + + // OnThreadExit is not getting called on the main thread. + // Call through the static destructor mechanism to avoid memory leak. + // + // Caveats: ~A() will be invoked _after_ ~StaticMeta for the global + // singleton (destructors are invoked in reverse order of constructor + // _completion_); the latter must not mutate internal members. This + // cleanup mechanism inherently relies on use-after-release of the + // StaticMeta, and is brittle with respect to compiler-specific handling + // of memory backing destructed statically-scoped objects. Perhaps + // registering with atexit(3) would be more robust. + // +// This is not required on Windows. 
+#if !defined(OS_WIN) + static struct A { + ~A() { + if (tls_) { + OnThreadExit(tls_); + } + } + } a; +#endif // !defined(OS_WIN) + + head_.next = &head_; + head_.prev = &head_; + +#ifdef OS_WIN + // Share with Windows its cleanup routine and the key + wintlscleanup::thread_local_inclass_routine = OnThreadExit; + wintlscleanup::thread_local_key = pthread_key_; +#endif +} + +void ThreadLocalPtr::StaticMeta::AddThreadData(ThreadData* d) { + Mutex()->AssertHeld(); + d->next = &head_; + d->prev = head_.prev; + head_.prev->next = d; + head_.prev = d; +} + +void ThreadLocalPtr::StaticMeta::RemoveThreadData(ThreadData* d) { + Mutex()->AssertHeld(); + d->next->prev = d->prev; + d->prev->next = d->next; + d->next = d->prev = d; +} + +ThreadData* ThreadLocalPtr::StaticMeta::GetThreadLocal() { + if (UNLIKELY(tls_ == nullptr)) { + auto* inst = Instance(); + tls_ = new ThreadData(inst); + { + // Register it in the global chain, needs to be done before thread exit + // handler registration + MutexLock l(Mutex()); + inst->AddThreadData(tls_); + } + // Even it is not OS_MACOSX, need to register value for pthread_key_ so that + // its exit handler will be triggered. + if (pthread_setspecific(inst->pthread_key_, tls_) != 0) { + { + MutexLock l(Mutex()); + inst->RemoveThreadData(tls_); + } + delete tls_; + abort(); + } + } + return tls_; +} + +void* ThreadLocalPtr::StaticMeta::Get(uint32_t id) const { + auto* tls = GetThreadLocal(); + if (UNLIKELY(id >= tls->entries.size())) { + return nullptr; + } + return tls->entries[id].ptr.load(std::memory_order_acquire); +} + +void ThreadLocalPtr::StaticMeta::Reset(uint32_t id, void* ptr) { + auto* tls = GetThreadLocal(); + if (UNLIKELY(id >= tls->entries.size())) { + // Need mutex to protect entries access within ReclaimId + MutexLock l(Mutex()); + tls->entries.resize(id + 1); + } + tls->entries[id].ptr.store(ptr, std::memory_order_release); +} + +void* ThreadLocalPtr::StaticMeta::Swap(uint32_t id, void* ptr) { + auto* tls = GetThreadLocal(); + if (UNLIKELY(id >= tls->entries.size())) { + // Need mutex to protect entries access within ReclaimId + MutexLock l(Mutex()); + tls->entries.resize(id + 1); + } + return tls->entries[id].ptr.exchange(ptr, std::memory_order_acquire); +} + +bool ThreadLocalPtr::StaticMeta::CompareAndSwap(uint32_t id, void* ptr, + void*& expected) { + auto* tls = GetThreadLocal(); + if (UNLIKELY(id >= tls->entries.size())) { + // Need mutex to protect entries access within ReclaimId + MutexLock l(Mutex()); + tls->entries.resize(id + 1); + } + return tls->entries[id].ptr.compare_exchange_strong( + expected, ptr, std::memory_order_release, std::memory_order_relaxed); +} + +void ThreadLocalPtr::StaticMeta::Scrape(uint32_t id, autovector* ptrs, + void* const replacement) { + MutexLock l(Mutex()); + for (ThreadData* t = head_.next; t != &head_; t = t->next) { + if (id < t->entries.size()) { + void* ptr = + t->entries[id].ptr.exchange(replacement, std::memory_order_acquire); + if (ptr != nullptr) { + ptrs->push_back(ptr); + } + } + } +} + +void ThreadLocalPtr::StaticMeta::Fold(uint32_t id, FoldFunc func, void* res) { + MutexLock l(Mutex()); + for (ThreadData* t = head_.next; t != &head_; t = t->next) { + if (id < t->entries.size()) { + void* ptr = t->entries[id].ptr.load(); + if (ptr != nullptr) { + func(ptr, res); + } + } + } +} + +uint32_t ThreadLocalPtr::TEST_PeekId() { return Instance()->PeekId(); } + +void ThreadLocalPtr::StaticMeta::SetHandler(uint32_t id, UnrefHandler handler) { + MutexLock l(Mutex()); + handler_map_[id] = handler; +} + 
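// Illustrative sketch (editorial, not part of the RocksDB sources): the core of
// the scheme implemented above is "one process-wide id per instance, one lazily
// grown vector per thread, indexed by that id". The simplified class below
// (SimpleTls, a hypothetical name) shows just that skeleton; the real StaticMeta
// additionally stores std::atomic<void*> entries, links every per-thread
// structure into a global list protected by Mutex(), and runs UnrefHandlers on
// thread exit.
#include <atomic>
#include <cstdint>
#include <vector>

class SimpleTls {
 public:
  SimpleTls() : id_(NextId()) {}

  // Return the value this thread stored for this instance, if any.
  void* Get() const {
    auto& entries = Entries();
    return id_ < entries.size() ? entries[id_] : nullptr;
  }

  // Store a value visible only to the calling thread.
  void Reset(void* ptr) {
    auto& entries = Entries();
    if (id_ >= entries.size()) {
      entries.resize(id_ + 1, nullptr);
    }
    entries[id_] = ptr;
  }

 private:
  // Each instance claims the next id from a process-wide counter.
  static uint32_t NextId() {
    static std::atomic<uint32_t> next{0};
    return next.fetch_add(1, std::memory_order_relaxed);
  }

  // One vector per thread; slot i belongs to the instance whose id is i.
  static std::vector<void*>& Entries() {
    static thread_local std::vector<void*> entries;
    return entries;
  }

  const uint32_t id_;
};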
+UnrefHandler ThreadLocalPtr::StaticMeta::GetHandler(uint32_t id) { + Mutex()->AssertHeld(); + auto iter = handler_map_.find(id); + if (iter == handler_map_.end()) { + return nullptr; + } + return iter->second; +} + +uint32_t ThreadLocalPtr::StaticMeta::GetId() { + MutexLock l(Mutex()); + if (free_instance_ids_.empty()) { + return next_instance_id_++; + } + + uint32_t id = free_instance_ids_.back(); + free_instance_ids_.pop_back(); + return id; +} + +uint32_t ThreadLocalPtr::StaticMeta::PeekId() const { + MutexLock l(Mutex()); + if (!free_instance_ids_.empty()) { + return free_instance_ids_.back(); + } + return next_instance_id_; +} + +void ThreadLocalPtr::StaticMeta::ReclaimId(uint32_t id) { + // This id is not used, go through all thread local data and release + // corresponding value + MutexLock l(Mutex()); + auto unref = GetHandler(id); + for (ThreadData* t = head_.next; t != &head_; t = t->next) { + if (id < t->entries.size()) { + void* ptr = t->entries[id].ptr.exchange(nullptr); + if (ptr != nullptr && unref != nullptr) { + unref(ptr); + } + } + } + handler_map_[id] = nullptr; + free_instance_ids_.push_back(id); +} + +ThreadLocalPtr::ThreadLocalPtr(UnrefHandler handler) + : id_(Instance()->GetId()) { + if (handler != nullptr) { + Instance()->SetHandler(id_, handler); + } +} + +ThreadLocalPtr::~ThreadLocalPtr() { Instance()->ReclaimId(id_); } + +void* ThreadLocalPtr::Get() const { return Instance()->Get(id_); } + +void ThreadLocalPtr::Reset(void* ptr) { Instance()->Reset(id_, ptr); } + +void* ThreadLocalPtr::Swap(void* ptr) { return Instance()->Swap(id_, ptr); } + +bool ThreadLocalPtr::CompareAndSwap(void* ptr, void*& expected) { + return Instance()->CompareAndSwap(id_, ptr, expected); +} + +void ThreadLocalPtr::Scrape(autovector* ptrs, void* const replacement) { + Instance()->Scrape(id_, ptrs, replacement); +} + +void ThreadLocalPtr::Fold(FoldFunc func, void* res) { + Instance()->Fold(id_, func, res); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/thread_local.h b/src/rocksdb/util/thread_local.h new file mode 100644 index 000000000..fde68f86f --- /dev/null +++ b/src/rocksdb/util/thread_local.h @@ -0,0 +1,100 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include +#include +#include +#include + +#include "port/port.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +// Cleanup function that will be called for a stored thread local +// pointer (if not NULL) when one of the following happens: +// (1) a thread terminates +// (2) a ThreadLocalPtr is destroyed +// +// Warning: this function is called while holding a global mutex. The same mutex +// is used (at least in some cases) by most methods of ThreadLocalPtr, and it's +// shared across all instances of ThreadLocalPtr. Thereforere extra care +// is needed to avoid deadlocks. In particular, the handler shouldn't lock any +// mutexes and shouldn't call any methods of any ThreadLocalPtr instances, +// unless you know what you're doing. +using UnrefHandler = void (*)(void* ptr); + +// ThreadLocalPtr stores only values of pointer type. 
Different from +// the usual thread-local-storage, ThreadLocalPtr has the ability to +// distinguish data coming from different threads and different +// ThreadLocalPtr instances. For example, if a regular thread_local +// variable A is declared in DBImpl, two DBImpl objects would share +// the same A. However, a ThreadLocalPtr that is defined under the +// scope of DBImpl can avoid such confliction. As a result, its memory +// usage would be O(# of threads * # of ThreadLocalPtr instances). +class ThreadLocalPtr { + public: + explicit ThreadLocalPtr(UnrefHandler handler = nullptr); + + ThreadLocalPtr(const ThreadLocalPtr&) = delete; + ThreadLocalPtr& operator=(const ThreadLocalPtr&) = delete; + + ~ThreadLocalPtr(); + + // Return the current pointer stored in thread local + void* Get() const; + + // Set a new pointer value to the thread local storage. + void Reset(void* ptr); + + // Atomically swap the supplied ptr and return the previous value + void* Swap(void* ptr); + + // Atomically compare the stored value with expected. Set the new + // pointer value to thread local only if the comparison is true. + // Otherwise, expected returns the stored value. + // Return true on success, false on failure + bool CompareAndSwap(void* ptr, void*& expected); + + // Reset all thread local data to replacement, and return non-nullptr + // data for all existing threads + void Scrape(autovector* ptrs, void* const replacement); + + using FoldFunc = std::function; + // Update res by applying func on each thread-local value. Holds a lock that + // prevents unref handler from running during this call, but clients must + // still provide external synchronization since the owning thread can + // access the values without internal locking, e.g., via Get() and Reset(). + void Fold(FoldFunc func, void* res); + + // Add here for testing + // Return the next available Id without claiming it + static uint32_t TEST_PeekId(); + + // Initialize the static singletons of the ThreadLocalPtr. + // + // If this function is not called, then the singletons will be + // automatically initialized when they are used. + // + // Calling this function twice or after the singletons have been + // initialized will be no-op. + static void InitSingletons(); + + class StaticMeta; + + private: + static StaticMeta* Instance(); + + const uint32_t id_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/thread_local_test.cc b/src/rocksdb/util/thread_local_test.cc new file mode 100644 index 000000000..25ef5c0ee --- /dev/null +++ b/src/rocksdb/util/thread_local_test.cc @@ -0,0 +1,582 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
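// Illustrative usage sketch (editorial, not part of the tests below): the
// typical pattern for the ThreadLocalPtr interface declared in
// util/thread_local.h is a lazily allocated per-thread object, an UnrefHandler
// that frees it when a thread exits or the ThreadLocalPtr is destroyed, and
// Fold() to aggregate across all live threads. Helper names are hypothetical.
#include <atomic>
#include <cstdint>

#include "util/thread_local.h"

namespace {

ROCKSDB_NAMESPACE::ThreadLocalPtr per_thread_counter([](void* ptr) {
  delete static_cast<std::atomic<uint64_t>*>(ptr);  // UnrefHandler
});

// Increment this thread's private counter, allocating it on first use.
void BumpCounter() {
  auto* c = static_cast<std::atomic<uint64_t>*>(per_thread_counter.Get());
  if (c == nullptr) {
    c = new std::atomic<uint64_t>(0);
    per_thread_counter.Reset(c);  // stored only for the calling thread
  }
  c->fetch_add(1, std::memory_order_relaxed);
}

// Sum the counters of every thread that has touched per_thread_counter.
uint64_t TotalCount() {
  uint64_t sum = 0;
  per_thread_counter.Fold(
      [](void* ptr, void* res) {
        *static_cast<uint64_t*>(res) +=
            static_cast<std::atomic<uint64_t>*>(ptr)->load();
      },
      &sum);
  return sum;
}

}  // anonymous namespace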
+ +#include "util/thread_local.h" + +#include +#include +#include + +#include "port/port.h" +#include "rocksdb/env.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" +#include "util/autovector.h" + +namespace ROCKSDB_NAMESPACE { + +class ThreadLocalTest : public testing::Test { + public: + ThreadLocalTest() : env_(Env::Default()) {} + + Env* env_; +}; + +namespace { + +struct Params { + Params(port::Mutex* m, port::CondVar* c, int* u, int n, + UnrefHandler handler = nullptr) + : mu(m), + cv(c), + unref(u), + total(n), + started(0), + completed(0), + doWrite(false), + tls1(handler), + tls2(nullptr) {} + + port::Mutex* mu; + port::CondVar* cv; + int* unref; + int total; + int started; + int completed; + bool doWrite; + ThreadLocalPtr tls1; + ThreadLocalPtr* tls2; +}; + +class IDChecker : public ThreadLocalPtr { + public: + static uint32_t PeekId() { return TEST_PeekId(); } +}; + +} // anonymous namespace + +// Suppress false positive clang analyzer warnings. +#ifndef __clang_analyzer__ +TEST_F(ThreadLocalTest, UniqueIdTest) { + port::Mutex mu; + port::CondVar cv(&mu); + + uint32_t base_id = IDChecker::PeekId(); + // New ThreadLocal instance bumps id by 1 + { + // Id used 0 + Params p1(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), base_id + 1u); + // Id used 1 + Params p2(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), base_id + 2u); + // Id used 2 + Params p3(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), base_id + 3u); + // Id used 3 + Params p4(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), base_id + 4u); + } + // id 3, 2, 1, 0 are in the free queue in order + ASSERT_EQ(IDChecker::PeekId(), base_id + 0u); + + // pick up 0 + Params p1(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), base_id + 1u); + // pick up 1 + Params* p2 = new Params(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), base_id + 2u); + // pick up 2 + Params p3(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), base_id + 3u); + // return up 1 + delete p2; + ASSERT_EQ(IDChecker::PeekId(), base_id + 1u); + // Now we have 3, 1 in queue + // pick up 1 + Params p4(&mu, &cv, nullptr, 1u); + ASSERT_EQ(IDChecker::PeekId(), base_id + 3u); + // pick up 3 + Params p5(&mu, &cv, nullptr, 1u); + // next new id + ASSERT_EQ(IDChecker::PeekId(), base_id + 4u); + // After exit, id sequence in queue: + // 3, 1, 2, 0 +} +#endif // __clang_analyzer__ + +TEST_F(ThreadLocalTest, SequentialReadWriteTest) { + // global id list carries over 3, 1, 2, 0 + uint32_t base_id = IDChecker::PeekId(); + + port::Mutex mu; + port::CondVar cv(&mu); + Params p(&mu, &cv, nullptr, 1); + ThreadLocalPtr tls2; + p.tls2 = &tls2; + + ASSERT_GT(IDChecker::PeekId(), base_id); + base_id = IDChecker::PeekId(); + + auto func = [](Params* ptr) { + Params& params = *ptr; + ASSERT_TRUE(params.tls1.Get() == nullptr); + params.tls1.Reset(reinterpret_cast(1)); + ASSERT_TRUE(params.tls1.Get() == reinterpret_cast(1)); + params.tls1.Reset(reinterpret_cast(2)); + ASSERT_TRUE(params.tls1.Get() == reinterpret_cast(2)); + + ASSERT_TRUE(params.tls2->Get() == nullptr); + params.tls2->Reset(reinterpret_cast(1)); + ASSERT_TRUE(params.tls2->Get() == reinterpret_cast(1)); + params.tls2->Reset(reinterpret_cast(2)); + ASSERT_TRUE(params.tls2->Get() == reinterpret_cast(2)); + + params.mu->Lock(); + ++(params.completed); + params.cv->SignalAll(); + params.mu->Unlock(); + }; + + for (int iter = 0; iter < 1024; ++iter) { + ASSERT_EQ(IDChecker::PeekId(), base_id); + // Another new 
thread, read/write should not see value from previous thread + env_->StartThreadTyped(func, &p); + + mu.Lock(); + while (p.completed != iter + 1) { + cv.Wait(); + } + mu.Unlock(); + ASSERT_EQ(IDChecker::PeekId(), base_id); + } +} + +TEST_F(ThreadLocalTest, ConcurrentReadWriteTest) { + // global id list carries over 3, 1, 2, 0 + uint32_t base_id = IDChecker::PeekId(); + + ThreadLocalPtr tls2; + port::Mutex mu1; + port::CondVar cv1(&mu1); + Params p1(&mu1, &cv1, nullptr, 16); + p1.tls2 = &tls2; + + port::Mutex mu2; + port::CondVar cv2(&mu2); + Params p2(&mu2, &cv2, nullptr, 16); + p2.doWrite = true; + p2.tls2 = &tls2; + + auto func = [](void* ptr) { + auto& p = *static_cast(ptr); + + p.mu->Lock(); + // Size_T switches size along with the ptr size + // we want to cast to. + size_t own = ++(p.started); + p.cv->SignalAll(); + while (p.started != p.total) { + p.cv->Wait(); + } + p.mu->Unlock(); + + // Let write threads write a different value from the read threads + if (p.doWrite) { + own += 8192; + } + + ASSERT_TRUE(p.tls1.Get() == nullptr); + ASSERT_TRUE(p.tls2->Get() == nullptr); + + auto* env = Env::Default(); + auto start = env->NowMicros(); + + p.tls1.Reset(reinterpret_cast(own)); + p.tls2->Reset(reinterpret_cast(own + 1)); + // Loop for 1 second + while (env->NowMicros() - start < 1000 * 1000) { + for (int iter = 0; iter < 100000; ++iter) { + ASSERT_TRUE(p.tls1.Get() == reinterpret_cast(own)); + ASSERT_TRUE(p.tls2->Get() == reinterpret_cast(own + 1)); + if (p.doWrite) { + p.tls1.Reset(reinterpret_cast(own)); + p.tls2->Reset(reinterpret_cast(own + 1)); + } + } + } + + p.mu->Lock(); + ++(p.completed); + p.cv->SignalAll(); + p.mu->Unlock(); + }; + + // Initiate 2 instnaces: one keeps writing and one keeps reading. + // The read instance should not see data from the write instance. + // Each thread local copy of the value are also different from each + // other. 
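+  // Launch the reader group (p1) and the writer group (p2), then wait for
+  // each group to finish on its own mutex/condvar pair.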
+ for (int th = 0; th < p1.total; ++th) { + env_->StartThreadTyped(func, &p1); + } + for (int th = 0; th < p2.total; ++th) { + env_->StartThreadTyped(func, &p2); + } + + mu1.Lock(); + while (p1.completed != p1.total) { + cv1.Wait(); + } + mu1.Unlock(); + + mu2.Lock(); + while (p2.completed != p2.total) { + cv2.Wait(); + } + mu2.Unlock(); + + ASSERT_EQ(IDChecker::PeekId(), base_id + 3u); +} + +TEST_F(ThreadLocalTest, Unref) { + auto unref = [](void* ptr) { + auto& p = *static_cast(ptr); + p.mu->Lock(); + ++(*p.unref); + p.mu->Unlock(); + }; + + // Case 0: no unref triggered if ThreadLocalPtr is never accessed + auto func0 = [](Params* ptr) { + auto& p = *ptr; + p.mu->Lock(); + ++(p.started); + p.cv->SignalAll(); + while (p.started != p.total) { + p.cv->Wait(); + } + p.mu->Unlock(); + }; + + for (int th = 1; th <= 128; th += th) { + port::Mutex mu; + port::CondVar cv(&mu); + int unref_count = 0; + Params p(&mu, &cv, &unref_count, th, unref); + + for (int i = 0; i < p.total; ++i) { + env_->StartThreadTyped(func0, &p); + } + env_->WaitForJoin(); + ASSERT_EQ(unref_count, 0); + } + + // Case 1: unref triggered by thread exit + auto func1 = [](Params* ptr) { + auto& p = *ptr; + + p.mu->Lock(); + ++(p.started); + p.cv->SignalAll(); + while (p.started != p.total) { + p.cv->Wait(); + } + p.mu->Unlock(); + + ASSERT_TRUE(p.tls1.Get() == nullptr); + ASSERT_TRUE(p.tls2->Get() == nullptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + }; + + for (int th = 1; th <= 128; th += th) { + port::Mutex mu; + port::CondVar cv(&mu); + int unref_count = 0; + ThreadLocalPtr tls2(unref); + Params p(&mu, &cv, &unref_count, th, unref); + p.tls2 = &tls2; + + for (int i = 0; i < p.total; ++i) { + env_->StartThreadTyped(func1, &p); + } + + env_->WaitForJoin(); + + // N threads x 2 ThreadLocal instance cleanup on thread exit + ASSERT_EQ(unref_count, 2 * p.total); + } + + // Case 2: unref triggered by ThreadLocal instance destruction + auto func2 = [](Params* ptr) { + auto& p = *ptr; + + p.mu->Lock(); + ++(p.started); + p.cv->SignalAll(); + while (p.started != p.total) { + p.cv->Wait(); + } + p.mu->Unlock(); + + ASSERT_TRUE(p.tls1.Get() == nullptr); + ASSERT_TRUE(p.tls2->Get() == nullptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.mu->Lock(); + ++(p.completed); + p.cv->SignalAll(); + + // Waiting for instruction to exit thread + while (p.completed != 0) { + p.cv->Wait(); + } + p.mu->Unlock(); + }; + + for (int th = 1; th <= 128; th += th) { + port::Mutex mu; + port::CondVar cv(&mu); + int unref_count = 0; + Params p(&mu, &cv, &unref_count, th, unref); + p.tls2 = new ThreadLocalPtr(unref); + + for (int i = 0; i < p.total; ++i) { + env_->StartThreadTyped(func2, &p); + } + + // Wait for all threads to finish using Params + mu.Lock(); + while (p.completed != p.total) { + cv.Wait(); + } + mu.Unlock(); + + // Now destroy one ThreadLocal instance + delete p.tls2; + p.tls2 = nullptr; + // instance destroy for N threads + ASSERT_EQ(unref_count, p.total); + + // Signal to exit + mu.Lock(); + p.completed = 0; + cv.SignalAll(); + mu.Unlock(); + env_->WaitForJoin(); + // additional N threads exit unref for the left instance + ASSERT_EQ(unref_count, 2 * p.total); + } +} + +TEST_F(ThreadLocalTest, Swap) { + ThreadLocalPtr tls; + tls.Reset(reinterpret_cast(1)); + ASSERT_EQ(reinterpret_cast(tls.Swap(nullptr)), 1); + ASSERT_TRUE(tls.Swap(reinterpret_cast(2)) == nullptr); + ASSERT_EQ(reinterpret_cast(tls.Get()), 2); + 
ASSERT_EQ(reinterpret_cast(tls.Swap(reinterpret_cast(3))), 2); +} + +TEST_F(ThreadLocalTest, Scrape) { + auto unref = [](void* ptr) { + auto& p = *static_cast(ptr); + p.mu->Lock(); + ++(*p.unref); + p.mu->Unlock(); + }; + + auto func = [](void* ptr) { + auto& p = *static_cast(ptr); + + ASSERT_TRUE(p.tls1.Get() == nullptr); + ASSERT_TRUE(p.tls2->Get() == nullptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.tls1.Reset(ptr); + p.tls2->Reset(ptr); + + p.mu->Lock(); + ++(p.completed); + p.cv->SignalAll(); + + // Waiting for instruction to exit thread + while (p.completed != 0) { + p.cv->Wait(); + } + p.mu->Unlock(); + }; + + for (int th = 1; th <= 128; th += th) { + port::Mutex mu; + port::CondVar cv(&mu); + int unref_count = 0; + Params p(&mu, &cv, &unref_count, th, unref); + p.tls2 = new ThreadLocalPtr(unref); + + for (int i = 0; i < p.total; ++i) { + env_->StartThreadTyped(func, &p); + } + + // Wait for all threads to finish using Params + mu.Lock(); + while (p.completed != p.total) { + cv.Wait(); + } + mu.Unlock(); + + ASSERT_EQ(unref_count, 0); + + // Scrape all thread local data. No unref at thread + // exit or ThreadLocalPtr destruction + autovector ptrs; + p.tls1.Scrape(&ptrs, nullptr); + p.tls2->Scrape(&ptrs, nullptr); + delete p.tls2; + // Signal to exit + mu.Lock(); + p.completed = 0; + cv.SignalAll(); + mu.Unlock(); + env_->WaitForJoin(); + + ASSERT_EQ(unref_count, 0); + } +} + +TEST_F(ThreadLocalTest, Fold) { + auto unref = [](void* ptr) { + delete static_cast*>(ptr); + }; + static const int kNumThreads = 16; + static const int kItersPerThread = 10; + port::Mutex mu; + port::CondVar cv(&mu); + Params params(&mu, &cv, nullptr, kNumThreads, unref); + auto func = [](void* ptr) { + auto& p = *static_cast(ptr); + ASSERT_TRUE(p.tls1.Get() == nullptr); + p.tls1.Reset(new std::atomic(0)); + + for (int i = 0; i < kItersPerThread; ++i) { + static_cast*>(p.tls1.Get())->fetch_add(1); + } + + p.mu->Lock(); + ++(p.completed); + p.cv->SignalAll(); + + // Waiting for instruction to exit thread + while (p.completed != 0) { + p.cv->Wait(); + } + p.mu->Unlock(); + }; + + for (int th = 0; th < params.total; ++th) { + env_->StartThread(func, ¶ms); + } + + // Wait for all threads to finish using Params + mu.Lock(); + while (params.completed != params.total) { + cv.Wait(); + } + mu.Unlock(); + + // Verify Fold() behavior + int64_t sum = 0; + params.tls1.Fold( + [](void* ptr, void* res) { + auto sum_ptr = static_cast(res); + *sum_ptr += static_cast*>(ptr)->load(); + }, + &sum); + ASSERT_EQ(sum, kNumThreads * kItersPerThread); + + // Signal to exit + mu.Lock(); + params.completed = 0; + cv.SignalAll(); + mu.Unlock(); + env_->WaitForJoin(); +} + +TEST_F(ThreadLocalTest, CompareAndSwap) { + ThreadLocalPtr tls; + ASSERT_TRUE(tls.Swap(reinterpret_cast(1)) == nullptr); + void* expected = reinterpret_cast(1); + // Swap in 2 + ASSERT_TRUE(tls.CompareAndSwap(reinterpret_cast(2), expected)); + expected = reinterpret_cast(100); + // Fail Swap, still 2 + ASSERT_TRUE(!tls.CompareAndSwap(reinterpret_cast(2), expected)); + ASSERT_EQ(expected, reinterpret_cast(2)); + // Swap in 3 + expected = reinterpret_cast(2); + ASSERT_TRUE(tls.CompareAndSwap(reinterpret_cast(3), expected)); + ASSERT_EQ(tls.Get(), reinterpret_cast(3)); +} + +namespace { + +void* AccessThreadLocal(void* /*arg*/) { + TEST_SYNC_POINT("AccessThreadLocal:Start"); + ThreadLocalPtr tlp; + tlp.Reset(new std::string("hello RocksDB")); + TEST_SYNC_POINT("AccessThreadLocal:End"); + return nullptr; +} + +} // namespace + +// The following test is disabled 
as it requires manual steps to run it +// correctly. +// +// Currently we have no way to acess SyncPoint w/o ASAN error when the +// child thread dies after the main thread dies. So if you manually enable +// this test and only see an ASAN error on SyncPoint, it means you pass the +// test. +TEST_F(ThreadLocalTest, DISABLED_MainThreadDiesFirst) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"AccessThreadLocal:Start", "MainThreadDiesFirst:End"}, + {"PosixEnv::~PosixEnv():End", "AccessThreadLocal:End"}}); + + // Triggers the initialization of singletons. + Env::Default(); + +#ifndef ROCKSDB_LITE + try { +#endif // ROCKSDB_LITE + ROCKSDB_NAMESPACE::port::Thread th(&AccessThreadLocal, nullptr); + th.detach(); + TEST_SYNC_POINT("MainThreadDiesFirst:End"); +#ifndef ROCKSDB_LITE + } catch (const std::system_error& ex) { + std::cerr << "Start thread: " << ex.code() << std::endl; + FAIL(); + } +#endif // ROCKSDB_LITE +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/thread_operation.h b/src/rocksdb/util/thread_operation.h new file mode 100644 index 000000000..c24fccd5c --- /dev/null +++ b/src/rocksdb/util/thread_operation.h @@ -0,0 +1,112 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file defines the structures for thread operation and state. +// Thread operations are used to describe high level action of a +// thread such as doing compaction or flush, while thread state +// are used to describe lower-level action such as reading / +// writing a file or waiting for a mutex. Operations and states +// are designed to be independent. Typically, a thread usually involves +// in one operation and one state at any specific point in time. + +#pragma once + +#include + +#include "rocksdb/thread_status.h" + +namespace ROCKSDB_NAMESPACE { + +#ifdef ROCKSDB_USING_THREAD_STATUS + +// The structure that describes a major thread operation. +struct OperationInfo { + const ThreadStatus::OperationType type; + const std::string name; +}; + +// The global operation table. +// +// When updating a status of a thread, the pointer of the OperationInfo +// of the current ThreadStatusData will be pointing to one of the +// rows in this global table. +// +// Note that it's not designed to be constant as in the future we +// might consider adding global count to the OperationInfo. +static OperationInfo global_operation_table[] = { + {ThreadStatus::OP_UNKNOWN, ""}, + {ThreadStatus::OP_COMPACTION, "Compaction"}, + {ThreadStatus::OP_FLUSH, "Flush"}}; + +struct OperationStageInfo { + const ThreadStatus::OperationStage stage; + const std::string name; +}; + +// A table maintains the mapping from stage type to stage string. +// Note that the string must be changed accordingly when the +// associated function name changed. 
+static OperationStageInfo global_op_stage_table[] = { + {ThreadStatus::STAGE_UNKNOWN, ""}, + {ThreadStatus::STAGE_FLUSH_RUN, "FlushJob::Run"}, + {ThreadStatus::STAGE_FLUSH_WRITE_L0, "FlushJob::WriteLevel0Table"}, + {ThreadStatus::STAGE_COMPACTION_PREPARE, "CompactionJob::Prepare"}, + {ThreadStatus::STAGE_COMPACTION_RUN, "CompactionJob::Run"}, + {ThreadStatus::STAGE_COMPACTION_PROCESS_KV, + "CompactionJob::ProcessKeyValueCompaction"}, + {ThreadStatus::STAGE_COMPACTION_INSTALL, "CompactionJob::Install"}, + {ThreadStatus::STAGE_COMPACTION_SYNC_FILE, + "CompactionJob::FinishCompactionOutputFile"}, + {ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH, + "MemTableList::PickMemtablesToFlush"}, + {ThreadStatus::STAGE_MEMTABLE_ROLLBACK, + "MemTableList::RollbackMemtableFlush"}, + {ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS, + "MemTableList::TryInstallMemtableFlushResults"}, +}; + +// The structure that describes a state. +struct StateInfo { + const ThreadStatus::StateType type; + const std::string name; +}; + +// The global state table. +// +// When updating a status of a thread, the pointer of the StateInfo +// of the current ThreadStatusData will be pointing to one of the +// rows in this global table. +static StateInfo global_state_table[] = { + {ThreadStatus::STATE_UNKNOWN, ""}, + {ThreadStatus::STATE_MUTEX_WAIT, "Mutex Wait"}, +}; + +struct OperationProperty { + int code; + std::string name; +}; + +static OperationProperty compaction_operation_properties[] = { + {ThreadStatus::COMPACTION_JOB_ID, "JobID"}, + {ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL, "InputOutputLevel"}, + {ThreadStatus::COMPACTION_PROP_FLAGS, "Manual/Deletion/Trivial"}, + {ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES, "TotalInputBytes"}, + {ThreadStatus::COMPACTION_BYTES_READ, "BytesRead"}, + {ThreadStatus::COMPACTION_BYTES_WRITTEN, "BytesWritten"}, +}; + +static OperationProperty flush_operation_properties[] = { + {ThreadStatus::FLUSH_JOB_ID, "JobID"}, + {ThreadStatus::FLUSH_BYTES_MEMTABLES, "BytesMemtables"}, + {ThreadStatus::FLUSH_BYTES_WRITTEN, "BytesWritten"}}; + +#else + +struct OperationInfo {}; + +struct StateInfo {}; + +#endif // ROCKSDB_USING_THREAD_STATUS +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/threadpool_imp.cc b/src/rocksdb/util/threadpool_imp.cc new file mode 100644 index 000000000..09706cac5 --- /dev/null +++ b/src/rocksdb/util/threadpool_imp.cc @@ -0,0 +1,551 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
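// Illustrative sketch (editorial, relating to the tables defined in
// util/thread_operation.h above): callers are not expected to scan those static
// tables directly; the public ThreadStatus accessors expose the same strings.
// The helper below is hypothetical and only shows the intended lookup, assuming
// ThreadStatus::GetOperationName() as declared in rocksdb/thread_status.h.
#include <cinttypes>
#include <cstdio>

#include "rocksdb/thread_status.h"

namespace {

void PrintOperation(const ROCKSDB_NAMESPACE::ThreadStatus& ts) {
  // GetOperationName() maps the enum to the matching row of the operation
  // table, e.g. OP_COMPACTION -> "Compaction".
  std::printf("thread %" PRIu64 ": %s\n", ts.thread_id,
              ROCKSDB_NAMESPACE::ThreadStatus::GetOperationName(
                  ts.operation_type)
                  .c_str());
}

}  // anonymous namespace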
+ +#include "util/threadpool_imp.h" + +#ifndef OS_WIN +#include +#endif + +#ifdef OS_LINUX +#include +#include +#endif + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "monitoring/thread_status_util.h" +#include "port/port.h" +#include "test_util/sync_point.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +void ThreadPoolImpl::PthreadCall(const char* label, int result) { + if (result != 0) { + fprintf(stderr, "pthread %s: %s\n", label, errnoStr(result).c_str()); + abort(); + } +} + +struct ThreadPoolImpl::Impl { + Impl(); + ~Impl(); + + void JoinThreads(bool wait_for_jobs_to_complete); + + void SetBackgroundThreadsInternal(int num, bool allow_reduce); + int GetBackgroundThreads(); + + unsigned int GetQueueLen() const { + return queue_len_.load(std::memory_order_relaxed); + } + + void LowerIOPriority(); + + void LowerCPUPriority(CpuPriority pri); + + void WakeUpAllThreads() { bgsignal_.notify_all(); } + + void BGThread(size_t thread_id); + + void StartBGThreads(); + + void Submit(std::function&& schedule, + std::function&& unschedule, void* tag); + + int UnSchedule(void* arg); + + void SetHostEnv(Env* env) { env_ = env; } + + Env* GetHostEnv() const { return env_; } + + bool HasExcessiveThread() const { + return static_cast(bgthreads_.size()) > total_threads_limit_; + } + + // Return true iff the current thread is the excessive thread to terminate. + // Always terminate the running thread that is added last, even if there are + // more than one thread to terminate. + bool IsLastExcessiveThread(size_t thread_id) const { + return HasExcessiveThread() && thread_id == bgthreads_.size() - 1; + } + + bool IsExcessiveThread(size_t thread_id) const { + return static_cast(thread_id) >= total_threads_limit_; + } + + // Return the thread priority. + // This would allow its member-thread to know its priority. + Env::Priority GetThreadPriority() const { return priority_; } + + // Set the thread priority. + void SetThreadPriority(Env::Priority priority) { priority_ = priority; } + + int ReserveThreads(int threads_to_be_reserved) { + std::unique_lock lock(mu_); + // We can reserve at most num_waiting_threads_ in total so the number of + // threads that can be reserved might be fewer than the desired one. In + // rare cases, num_waiting_threads_ could be less than reserved_threads + // due to SetBackgroundThreadInternal or last excessive threads. If that + // happens, we cannot reserve any other threads. + int reserved_threads_in_success = + std::min(std::max(num_waiting_threads_ - reserved_threads_, 0), + threads_to_be_reserved); + reserved_threads_ += reserved_threads_in_success; + return reserved_threads_in_success; + } + + int ReleaseThreads(int threads_to_be_released) { + std::unique_lock lock(mu_); + // We cannot release more than reserved_threads_ + int released_threads_in_success = + std::min(reserved_threads_, threads_to_be_released); + reserved_threads_ -= released_threads_in_success; + WakeUpAllThreads(); + return released_threads_in_success; + } + + private: + static void BGThreadWrapper(void* arg); + + bool low_io_priority_; + CpuPriority cpu_priority_; + Env::Priority priority_; + Env* env_; + + int total_threads_limit_; + std::atomic_uint queue_len_; // Queue length. Used for stats reporting + // Number of reserved threads, managed by ReserveThreads(..) 
and + // ReleaseThreads(..), if num_waiting_threads_ is no larger than + // reserved_threads_, its thread will be blocked to ensure the reservation + // mechanism + int reserved_threads_; + // Number of waiting threads (Maximum number of threads that can be + // reserved), in rare cases, num_waiting_threads_ could be less than + // reserved_threads due to SetBackgroundThreadInternal or last + // excessive threads. + int num_waiting_threads_; + bool exit_all_threads_; + bool wait_for_jobs_to_complete_; + + // Entry per Schedule()/Submit() call + struct BGItem { + void* tag = nullptr; + std::function function; + std::function unschedFunction; + }; + + using BGQueue = std::deque; + BGQueue queue_; + + std::mutex mu_; + std::condition_variable bgsignal_; + std::vector bgthreads_; +}; + +inline ThreadPoolImpl::Impl::Impl() + : low_io_priority_(false), + cpu_priority_(CpuPriority::kNormal), + priority_(Env::LOW), + env_(nullptr), + total_threads_limit_(0), + queue_len_(), + reserved_threads_(0), + num_waiting_threads_(0), + exit_all_threads_(false), + wait_for_jobs_to_complete_(false), + queue_(), + mu_(), + bgsignal_(), + bgthreads_() {} + +inline ThreadPoolImpl::Impl::~Impl() { assert(bgthreads_.size() == 0U); } + +void ThreadPoolImpl::Impl::JoinThreads(bool wait_for_jobs_to_complete) { + std::unique_lock lock(mu_); + assert(!exit_all_threads_); + + wait_for_jobs_to_complete_ = wait_for_jobs_to_complete; + exit_all_threads_ = true; + // prevent threads from being recreated right after they're joined, in case + // the user is concurrently submitting jobs. + total_threads_limit_ = 0; + reserved_threads_ = 0; + num_waiting_threads_ = 0; + + lock.unlock(); + + bgsignal_.notify_all(); + + for (auto& th : bgthreads_) { + th.join(); + } + + bgthreads_.clear(); + + exit_all_threads_ = false; + wait_for_jobs_to_complete_ = false; +} + +inline void ThreadPoolImpl::Impl::LowerIOPriority() { + std::lock_guard lock(mu_); + low_io_priority_ = true; +} + +inline void ThreadPoolImpl::Impl::LowerCPUPriority(CpuPriority pri) { + std::lock_guard lock(mu_); + cpu_priority_ = pri; +} + +void ThreadPoolImpl::Impl::BGThread(size_t thread_id) { + bool low_io_priority = false; + CpuPriority current_cpu_priority = CpuPriority::kNormal; + + while (true) { + // Wait until there is an item that is ready to run + std::unique_lock lock(mu_); + // Stop waiting if the thread needs to do work or needs to terminate. + // Increase num_waiting_threads_ once this task has started waiting + num_waiting_threads_++; + + TEST_SYNC_POINT("ThreadPoolImpl::BGThread::WaitingThreadsInc"); + TEST_IDX_SYNC_POINT("ThreadPoolImpl::BGThread::Start:th", thread_id); + // When not exist_all_threads and the current thread id is not the last + // excessive thread, it may be blocked due to 3 reasons: 1) queue is empty + // 2) it is the excessive thread (not the last one) + // 3) the number of waiting threads is not greater than reserved threads + // (i.e, no available threads due to full reservation") + while (!exit_all_threads_ && !IsLastExcessiveThread(thread_id) && + (queue_.empty() || IsExcessiveThread(thread_id) || + num_waiting_threads_ <= reserved_threads_)) { + bgsignal_.wait(lock); + } + // Decrease num_waiting_threads_ once the thread is not waiting + num_waiting_threads_--; + + if (exit_all_threads_) { // mechanism to let BG threads exit safely + + if (!wait_for_jobs_to_complete_ || queue_.empty()) { + break; + } + } else if (IsLastExcessiveThread(thread_id)) { + // Current thread is the last generated one and is excessive. 
+ // We always terminate excessive thread in the reverse order of + // generation time. But not when `exit_all_threads_ == true`, + // otherwise `JoinThreads()` could try to `join()` a `detach()`ed + // thread. + auto& terminating_thread = bgthreads_.back(); + terminating_thread.detach(); + bgthreads_.pop_back(); + if (HasExcessiveThread()) { + // There is still at least more excessive thread to terminate. + WakeUpAllThreads(); + } + TEST_IDX_SYNC_POINT("ThreadPoolImpl::BGThread::Termination:th", + thread_id); + TEST_SYNC_POINT("ThreadPoolImpl::BGThread::Termination"); + break; + } + + auto func = std::move(queue_.front().function); + queue_.pop_front(); + + queue_len_.store(static_cast(queue_.size()), + std::memory_order_relaxed); + + bool decrease_io_priority = (low_io_priority != low_io_priority_); + CpuPriority cpu_priority = cpu_priority_; + lock.unlock(); + + if (cpu_priority < current_cpu_priority) { + TEST_SYNC_POINT_CALLBACK("ThreadPoolImpl::BGThread::BeforeSetCpuPriority", + ¤t_cpu_priority); + // 0 means current thread. + port::SetCpuPriority(0, cpu_priority); + current_cpu_priority = cpu_priority; + TEST_SYNC_POINT_CALLBACK("ThreadPoolImpl::BGThread::AfterSetCpuPriority", + ¤t_cpu_priority); + } + +#ifdef OS_LINUX + if (decrease_io_priority) { +#define IOPRIO_CLASS_SHIFT (13) +#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) + // Put schedule into IOPRIO_CLASS_IDLE class (lowest) + // These system calls only have an effect when used in conjunction + // with an I/O scheduler that supports I/O priorities. As at + // kernel 2.6.17 the only such scheduler is the Completely + // Fair Queuing (CFQ) I/O scheduler. + // To change scheduler: + // echo cfq > /sys/block//queue/schedule + // Tunables to consider: + // /sys/block//queue/slice_idle + // /sys/block//queue/slice_sync + syscall(SYS_ioprio_set, 1, // IOPRIO_WHO_PROCESS + 0, // current thread + IOPRIO_PRIO_VALUE(3, 0)); + low_io_priority = true; + } +#else + (void)decrease_io_priority; // avoid 'unused variable' error +#endif + + TEST_SYNC_POINT_CALLBACK("ThreadPoolImpl::Impl::BGThread:BeforeRun", + &priority_); + + func(); + } +} + +// Helper struct for passing arguments when creating threads. +struct BGThreadMetadata { + ThreadPoolImpl::Impl* thread_pool_; + size_t thread_id_; // Thread count in the thread. 
+ BGThreadMetadata(ThreadPoolImpl::Impl* thread_pool, size_t thread_id) + : thread_pool_(thread_pool), thread_id_(thread_id) {} +}; + +void ThreadPoolImpl::Impl::BGThreadWrapper(void* arg) { + BGThreadMetadata* meta = reinterpret_cast(arg); + size_t thread_id = meta->thread_id_; + ThreadPoolImpl::Impl* tp = meta->thread_pool_; +#ifdef ROCKSDB_USING_THREAD_STATUS + // initialize it because compiler isn't good enough to see we don't use it + // uninitialized + ThreadStatus::ThreadType thread_type = ThreadStatus::NUM_THREAD_TYPES; + switch (tp->GetThreadPriority()) { + case Env::Priority::HIGH: + thread_type = ThreadStatus::HIGH_PRIORITY; + break; + case Env::Priority::LOW: + thread_type = ThreadStatus::LOW_PRIORITY; + break; + case Env::Priority::BOTTOM: + thread_type = ThreadStatus::BOTTOM_PRIORITY; + break; + case Env::Priority::USER: + thread_type = ThreadStatus::USER; + break; + case Env::Priority::TOTAL: + assert(false); + return; + } + assert(thread_type != ThreadStatus::NUM_THREAD_TYPES); + ThreadStatusUtil::RegisterThread(tp->GetHostEnv(), thread_type); +#endif + delete meta; + tp->BGThread(thread_id); +#ifdef ROCKSDB_USING_THREAD_STATUS + ThreadStatusUtil::UnregisterThread(); +#endif + return; +} + +void ThreadPoolImpl::Impl::SetBackgroundThreadsInternal(int num, + bool allow_reduce) { + std::lock_guard lock(mu_); + if (exit_all_threads_) { + return; + } + if (num > total_threads_limit_ || + (num < total_threads_limit_ && allow_reduce)) { + total_threads_limit_ = std::max(0, num); + WakeUpAllThreads(); + StartBGThreads(); + } +} + +int ThreadPoolImpl::Impl::GetBackgroundThreads() { + std::unique_lock lock(mu_); + return total_threads_limit_; +} + +void ThreadPoolImpl::Impl::StartBGThreads() { + // Start background thread if necessary + while ((int)bgthreads_.size() < total_threads_limit_) { + port::Thread p_t(&BGThreadWrapper, + new BGThreadMetadata(this, bgthreads_.size())); + +// Set the thread name to aid debugging +#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 12) + auto th_handle = p_t.native_handle(); + std::string thread_priority = Env::PriorityToString(GetThreadPriority()); + std::ostringstream thread_name_stream; + thread_name_stream << "rocksdb:"; + for (char c : thread_priority) { + thread_name_stream << static_cast(tolower(c)); + } + pthread_setname_np(th_handle, thread_name_stream.str().c_str()); +#endif +#endif + bgthreads_.push_back(std::move(p_t)); + } +} + +void ThreadPoolImpl::Impl::Submit(std::function&& schedule, + std::function&& unschedule, + void* tag) { + std::lock_guard lock(mu_); + + if (exit_all_threads_) { + return; + } + + StartBGThreads(); + + // Add to priority queue + queue_.push_back(BGItem()); + TEST_SYNC_POINT("ThreadPoolImpl::Submit::Enqueue"); + auto& item = queue_.back(); + item.tag = tag; + item.function = std::move(schedule); + item.unschedFunction = std::move(unschedule); + + queue_len_.store(static_cast(queue_.size()), + std::memory_order_relaxed); + + if (!HasExcessiveThread()) { + // Wake up at least one waiting thread. + bgsignal_.notify_one(); + } else { + // Need to wake up all threads to make sure the one woken + // up is not the one to terminate. 
+ WakeUpAllThreads(); + } +} + +int ThreadPoolImpl::Impl::UnSchedule(void* arg) { + int count = 0; + + std::vector> candidates; + { + std::lock_guard lock(mu_); + + // Remove from priority queue + BGQueue::iterator it = queue_.begin(); + while (it != queue_.end()) { + if (arg == (*it).tag) { + if (it->unschedFunction) { + candidates.push_back(std::move(it->unschedFunction)); + } + it = queue_.erase(it); + count++; + } else { + ++it; + } + } + queue_len_.store(static_cast(queue_.size()), + std::memory_order_relaxed); + } + + // Run unschedule functions outside the mutex + for (auto& f : candidates) { + f(); + } + + return count; +} + +ThreadPoolImpl::ThreadPoolImpl() : impl_(new Impl()) {} + +ThreadPoolImpl::~ThreadPoolImpl() {} + +void ThreadPoolImpl::JoinAllThreads() { impl_->JoinThreads(false); } + +void ThreadPoolImpl::SetBackgroundThreads(int num) { + impl_->SetBackgroundThreadsInternal(num, true); +} + +int ThreadPoolImpl::GetBackgroundThreads() { + return impl_->GetBackgroundThreads(); +} + +unsigned int ThreadPoolImpl::GetQueueLen() const { + return impl_->GetQueueLen(); +} + +void ThreadPoolImpl::WaitForJobsAndJoinAllThreads() { + impl_->JoinThreads(true); +} + +void ThreadPoolImpl::LowerIOPriority() { impl_->LowerIOPriority(); } + +void ThreadPoolImpl::LowerCPUPriority(CpuPriority pri) { + impl_->LowerCPUPriority(pri); +} + +void ThreadPoolImpl::IncBackgroundThreadsIfNeeded(int num) { + impl_->SetBackgroundThreadsInternal(num, false); +} + +void ThreadPoolImpl::SubmitJob(const std::function& job) { + auto copy(job); + impl_->Submit(std::move(copy), std::function(), nullptr); +} + +void ThreadPoolImpl::SubmitJob(std::function&& job) { + impl_->Submit(std::move(job), std::function(), nullptr); +} + +void ThreadPoolImpl::Schedule(void (*function)(void* arg1), void* arg, + void* tag, void (*unschedFunction)(void* arg)) { + if (unschedFunction == nullptr) { + impl_->Submit(std::bind(function, arg), std::function(), tag); + } else { + impl_->Submit(std::bind(function, arg), std::bind(unschedFunction, arg), + tag); + } +} + +int ThreadPoolImpl::UnSchedule(void* arg) { return impl_->UnSchedule(arg); } + +void ThreadPoolImpl::SetHostEnv(Env* env) { impl_->SetHostEnv(env); } + +Env* ThreadPoolImpl::GetHostEnv() const { return impl_->GetHostEnv(); } + +// Return the thread priority. +// This would allow its member-thread to know its priority. +Env::Priority ThreadPoolImpl::GetThreadPriority() const { + return impl_->GetThreadPriority(); +} + +// Set the thread priority. +void ThreadPoolImpl::SetThreadPriority(Env::Priority priority) { + impl_->SetThreadPriority(priority); +} + +// Reserve a specific number of threads, prevent them from running other +// functions The number of reserved threads could be fewer than the desired one +int ThreadPoolImpl::ReserveThreads(int threads_to_be_reserved) { + return impl_->ReserveThreads(threads_to_be_reserved); +} + +// Release a specific number of threads +int ThreadPoolImpl::ReleaseThreads(int threads_to_be_released) { + return impl_->ReleaseThreads(threads_to_be_released); +} + +ThreadPool* NewThreadPool(int num_threads) { + ThreadPoolImpl* thread_pool = new ThreadPoolImpl(); + thread_pool->SetBackgroundThreads(num_threads); + return thread_pool; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/threadpool_imp.h b/src/rocksdb/util/threadpool_imp.h new file mode 100644 index 000000000..a5109e38f --- /dev/null +++ b/src/rocksdb/util/threadpool_imp.h @@ -0,0 +1,120 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <functional>
+#include <memory>
+
+#include "rocksdb/env.h"
+#include "rocksdb/threadpool.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ThreadPoolImpl : public ThreadPool {
+ public:
+  ThreadPoolImpl();
+  ~ThreadPoolImpl();
+
+  ThreadPoolImpl(ThreadPoolImpl&&) = delete;
+  ThreadPoolImpl& operator=(ThreadPoolImpl&&) = delete;
+
+  // Implement ThreadPool interfaces
+
+  // Wait for all threads to finish. Discards all jobs that have not started
+  // executing and waits for the running ones to complete.
+  void JoinAllThreads() override;
+
+  // Set the number of background threads that will be executing the
+  // scheduled jobs.
+  void SetBackgroundThreads(int num) override;
+  int GetBackgroundThreads() override;
+
+  // Get the number of jobs scheduled in the ThreadPool queue.
+  unsigned int GetQueueLen() const override;
+
+  // Waits for all jobs to complete: both those that have already started
+  // running and those that have not started yet.
+  void WaitForJobsAndJoinAllThreads() override;
+
+  // Make threads run at a lower kernel I/O priority.
+  // Currently only has an effect on Linux.
+  void LowerIOPriority();
+
+  // Make threads run at a lower kernel CPU priority.
+  // Currently only has an effect on Linux.
+  void LowerCPUPriority(CpuPriority pri);
+
+  // Ensure there are at least `num` threads in the pool,
+  // but do not kill threads if there are more.
+  void IncBackgroundThreadsIfNeeded(int num);
+
+  // Submit a fire-and-forget job. These jobs cannot be unscheduled.
+
+  // This overload allows submitting the same job multiple times.
+  void SubmitJob(const std::function<void()>&) override;
+  // This overload moves the function in for efficiency.
+  void SubmitJob(std::function<void()>&&) override;
+
+  // Schedule a job with a tag and an optional unschedule function.
+  // The tag can later be used to filter and unschedule jobs that are
+  // still in the queue and have not started running.
+  void Schedule(void (*function)(void* arg1), void* arg, void* tag,
+                void (*unschedFunction)(void* arg));
+
+  // Remove jobs that are still in the queue and match the given tag.
+  // For each removed job, execute its unschedule function if one was
+  // given at scheduling time.
+  int UnSchedule(void* tag);
+
+  void SetHostEnv(Env* env);
+
+  Env* GetHostEnv() const;
+
+  // Return the thread priority.
+  // This allows its member threads to know the pool's priority.
+  Env::Priority GetThreadPriority() const;
+
+  // Set the thread priority.
+  void SetThreadPriority(Env::Priority priority);
+
+  // Reserve a specific number of threads and prevent them from running
+  // other functions. The number of reserved threads may be fewer than
+  // the number requested.
+  int ReserveThreads(int threads_to_be_reserved) override;
+
+  // Release a specific number of threads.
+  int ReleaseThreads(int threads_to_be_released) override;
+
+  static void PthreadCall(const char* label, int result);
+
+  struct Impl;
+
+ private:
+  // The current public virtual interface does not provide usable
+  // functionality and thus cannot be used internally as a facade over
+  // different implementations.
+ // + // We propose a pimpl idiom in order to easily replace the thread pool impl + // w/o touching the header file but providing a different .cc potentially + // CMake option driven. + // + // Another option is to introduce a Env::MakeThreadPool() virtual interface + // and override the environment. This would require refactoring ThreadPool + // usage. + // + // We can also combine these two approaches + std::unique_ptr impl_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/timer.h b/src/rocksdb/util/timer.h new file mode 100644 index 000000000..db71cefaf --- /dev/null +++ b/src/rocksdb/util/timer.h @@ -0,0 +1,340 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "monitoring/instrumented_mutex.h" +#include "rocksdb/system_clock.h" +#include "test_util/sync_point.h" +#include "util/mutexlock.h" + +namespace ROCKSDB_NAMESPACE { + +// A Timer class to handle repeated work. +// +// `Start()` and `Shutdown()` are currently not thread-safe. The client must +// serialize calls to these two member functions. +// +// A single timer instance can handle multiple functions via a single thread. +// It is better to leave long running work to a dedicated thread pool. +// +// Timer can be started by calling `Start()`, and ended by calling `Shutdown()`. +// Work (in terms of a `void function`) can be scheduled by calling `Add` with +// a unique function name and de-scheduled by calling `Cancel`. +// Many functions can be added. +// +// Impl Details: +// A heap is used to keep track of when the next timer goes off. +// A map from a function name to the function keeps track of all the functions. +class Timer { + public: + explicit Timer(SystemClock* clock) + : clock_(clock), + mutex_(clock), + cond_var_(&mutex_), + running_(false), + executing_task_(false) {} + + ~Timer() { Shutdown(); } + + // Add a new function to run. + // fn_name has to be identical, otherwise it will fail to add and return false + // start_after_us is the initial delay. + // repeat_every_us is the interval between ending time of the last call and + // starting time of the next call. For example, repeat_every_us = 2000 and + // the function takes 1000us to run. If it starts at time [now]us, then it + // finishes at [now]+1000us, 2nd run starting time will be at [now]+3000us. + // repeat_every_us == 0 means do not repeat. + bool Add(std::function fn, const std::string& fn_name, + uint64_t start_after_us, uint64_t repeat_every_us) { + auto fn_info = std::make_unique(std::move(fn), fn_name, 0, + repeat_every_us); + InstrumentedMutexLock l(&mutex_); + // Assign time within mutex to make sure the next_run_time is larger than + // the current running one + fn_info->next_run_time_us = clock_->NowMicros() + start_after_us; + // the new task start time should never before the current task executing + // time, as the executing task can only be running if it's next_run_time_us + // is due (<= clock_->NowMicros()). 
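To make the Add() contract concrete: with start_after_us = 1 s and repeat_every_us = 2 s, a task that itself takes about 1 s runs at roughly t = 1 s, 4 s, 7 s, ..., because each interval is measured from the end of the previous run. A hedged usage sketch with illustrative names and intervals:

// Illustrative only: a periodic task on a Timer backed by the real clock.
#include <cstdio>
#include <memory>

#include "rocksdb/system_clock.h"
#include "util/timer.h"

int main() {
  using namespace ROCKSDB_NAMESPACE;
  std::shared_ptr<SystemClock> clock = SystemClock::Default();
  Timer timer(clock.get());

  // First run after ~1 s, then ~2 s after each completion.
  timer.Add([] { std::printf("tick\n"); }, "example_tick",
            /*start_after_us=*/1 * 1000 * 1000,
            /*repeat_every_us=*/2 * 1000 * 1000);

  timer.Start();
  clock->SleepForMicroseconds(5 * 1000 * 1000);  // let it fire a few times
  timer.Cancel("example_tick");
  timer.Shutdown();
  return 0;
}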
+ if (executing_task_ && + fn_info->next_run_time_us < heap_.top()->next_run_time_us) { + return false; + } + auto it = map_.find(fn_name); + if (it == map_.end()) { + heap_.push(fn_info.get()); + map_.try_emplace(fn_name, std::move(fn_info)); + } else { + // timer doesn't support duplicated function name + return false; + } + cond_var_.SignalAll(); + return true; + } + + void Cancel(const std::string& fn_name) { + InstrumentedMutexLock l(&mutex_); + + // Mark the function with fn_name as invalid so that it will not be + // requeued. + auto it = map_.find(fn_name); + if (it != map_.end() && it->second) { + it->second->Cancel(); + } + + // If the currently running function is fn_name, then we need to wait + // until it finishes before returning to caller. + while (!heap_.empty() && executing_task_) { + FunctionInfo* func_info = heap_.top(); + assert(func_info); + if (func_info->name == fn_name) { + WaitForTaskCompleteIfNecessary(); + } else { + break; + } + } + } + + void CancelAll() { + InstrumentedMutexLock l(&mutex_); + CancelAllWithLock(); + } + + // Start the Timer + bool Start() { + InstrumentedMutexLock l(&mutex_); + if (running_) { + return false; + } + + running_ = true; + thread_ = std::make_unique(&Timer::Run, this); + return true; + } + + // Shutdown the Timer + bool Shutdown() { + { + InstrumentedMutexLock l(&mutex_); + if (!running_) { + return false; + } + running_ = false; + CancelAllWithLock(); + cond_var_.SignalAll(); + } + + if (thread_) { + thread_->join(); + } + return true; + } + + bool HasPendingTask() const { + InstrumentedMutexLock l(&mutex_); + for (const auto& fn_info : map_) { + if (fn_info.second->IsValid()) { + return true; + } + } + return false; + } + +#ifndef NDEBUG + // Wait until Timer starting waiting, call the optional callback, then wait + // for Timer waiting again. + // Tests can provide a custom Clock object to mock time, and use the callback + // here to bump current time and trigger Timer. See timer_test for example. + // + // Note: only support one caller of this method. + void TEST_WaitForRun(const std::function& callback = nullptr) { + InstrumentedMutexLock l(&mutex_); + // It act as a spin lock + while (executing_task_ || + (!heap_.empty() && + heap_.top()->next_run_time_us <= clock_->NowMicros())) { + cond_var_.TimedWait(clock_->NowMicros() + 1000); + } + if (callback != nullptr) { + callback(); + } + cond_var_.SignalAll(); + do { + cond_var_.TimedWait(clock_->NowMicros() + 1000); + } while (executing_task_ || + (!heap_.empty() && + heap_.top()->next_run_time_us <= clock_->NowMicros())); + } + + size_t TEST_GetPendingTaskNum() const { + InstrumentedMutexLock l(&mutex_); + size_t ret = 0; + for (const auto& fn_info : map_) { + if (fn_info.second->IsValid()) { + ret++; + } + } + return ret; + } + + void TEST_OverrideTimer(SystemClock* clock) { + InstrumentedMutexLock l(&mutex_); + clock_ = clock; + } +#endif // NDEBUG + + private: + void Run() { + InstrumentedMutexLock l(&mutex_); + + while (running_) { + if (heap_.empty()) { + // wait + TEST_SYNC_POINT("Timer::Run::Waiting"); + cond_var_.Wait(); + continue; + } + + FunctionInfo* current_fn = heap_.top(); + assert(current_fn); + + if (!current_fn->IsValid()) { + heap_.pop(); + map_.erase(current_fn->name); + continue; + } + + if (current_fn->next_run_time_us <= clock_->NowMicros()) { + // make a copy of the function so it won't be changed after + // mutex_.unlock. 
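The copy-then-unlock step noted above is a common shape: move the std::function out while holding the mutex, release the mutex for the (possibly slow) user callback, then re-acquire it for bookkeeping. A minimal standalone sketch of that pattern, not RocksDB code:

// Minimal sketch: run queued callbacks without holding the lock during the call.
#include <functional>
#include <mutex>
#include <queue>

class CallbackRunner {
 public:
  void Post(std::function<void()> fn) {
    std::lock_guard<std::mutex> guard(mu_);
    pending_.push(std::move(fn));
  }

  void RunOne() {
    std::function<void()> fn;
    {
      std::lock_guard<std::mutex> guard(mu_);
      if (pending_.empty()) {
        return;
      }
      fn = std::move(pending_.front());  // take it out under the lock
      pending_.pop();
    }
    fn();  // user code runs with the lock released
  }

 private:
  std::mutex mu_;
  std::queue<std::function<void()>> pending_;
};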
+ std::function fn = current_fn->fn; + executing_task_ = true; + mutex_.Unlock(); + // Execute the work + fn(); + mutex_.Lock(); + executing_task_ = false; + cond_var_.SignalAll(); + + // Remove the work from the heap once it is done executing, make sure + // it's the same function after executing the work while mutex is + // released. + // Note that we are just removing the pointer from the heap. Its + // memory is still managed in the map (as it holds a unique ptr). + // So current_fn is still a valid ptr. + assert(heap_.top() == current_fn); + heap_.pop(); + + // current_fn may be cancelled already. + if (current_fn->IsValid() && current_fn->repeat_every_us > 0) { + assert(running_); + current_fn->next_run_time_us = + clock_->NowMicros() + current_fn->repeat_every_us; + + // Schedule new work into the heap with new time. + heap_.push(current_fn); + } else { + // if current_fn is cancelled or no need to repeat, remove it from the + // map to avoid leak. + map_.erase(current_fn->name); + } + } else { + cond_var_.TimedWait(current_fn->next_run_time_us); + } + } + } + + void CancelAllWithLock() { + mutex_.AssertHeld(); + if (map_.empty() && heap_.empty()) { + return; + } + + // With mutex_ held, set all tasks to invalid so that they will not be + // re-queued. + for (auto& elem : map_) { + auto& func_info = elem.second; + assert(func_info); + func_info->Cancel(); + } + + // WaitForTaskCompleteIfNecessary() may release mutex_ + WaitForTaskCompleteIfNecessary(); + + while (!heap_.empty()) { + heap_.pop(); + } + map_.clear(); + } + + // A wrapper around std::function to keep track when it should run next + // and at what frequency. + struct FunctionInfo { + // the actual work + std::function fn; + // name of the function + std::string name; + // when the function should run next + uint64_t next_run_time_us; + // repeat interval + uint64_t repeat_every_us; + // controls whether this function is valid. + // A function is valid upon construction and until someone explicitly + // calls `Cancel()`. + bool valid; + + FunctionInfo(std::function&& _fn, std::string _name, + const uint64_t _next_run_time_us, uint64_t _repeat_every_us) + : fn(std::move(_fn)), + name(std::move(_name)), + next_run_time_us(_next_run_time_us), + repeat_every_us(_repeat_every_us), + valid(true) {} + + void Cancel() { valid = false; } + + bool IsValid() const { return valid; } + }; + + void WaitForTaskCompleteIfNecessary() { + mutex_.AssertHeld(); + while (executing_task_) { + TEST_SYNC_POINT("Timer::WaitForTaskCompleteIfNecessary:TaskExecuting"); + cond_var_.Wait(); + } + } + + struct RunTimeOrder { + bool operator()(const FunctionInfo* f1, const FunctionInfo* f2) { + return f1->next_run_time_us > f2->next_run_time_us; + } + }; + + SystemClock* clock_; + // This mutex controls both the heap_ and the map_. It needs to be held for + // making any changes in them. + mutable InstrumentedMutex mutex_; + InstrumentedCondVar cond_var_; + std::unique_ptr thread_; + bool running_; + bool executing_task_; + + std::priority_queue, RunTimeOrder> + heap_; + + // In addition to providing a mapping from a function name to a function, + // it is also responsible for memory management. + std::unordered_map> map_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/timer_queue.h b/src/rocksdb/util/timer_queue.h new file mode 100644 index 000000000..36a1744ac --- /dev/null +++ b/src/rocksdb/util/timer_queue.h @@ -0,0 +1,231 @@ +// Portions Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Borrowed from +// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/ +// Timer Queue +// +// License +// +// The source code in this article is licensed under the CC0 license, so feel +// free to copy, modify, share, do whatever you want with it. +// No attribution is required, but Ill be happy if you do. +// CC0 license + +// The person who associated a work with this deed has dedicated the work to the +// public domain by waiving all of his or her rights to the work worldwide +// under copyright law, including all related and neighboring rights, to the +// extent allowed by law. You can copy, modify, distribute and perform the +// work, even for commercial purposes, all without asking permission. + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "port/port.h" +#include "test_util/sync_point.h" + +// Allows execution of handlers at a specified time in the future +// Guarantees: +// - All handlers are executed ONCE, even if cancelled (aborted parameter will +// be set to true) +// - If TimerQueue is destroyed, it will cancel all handlers. +// - Handlers are ALWAYS executed in the Timer Queue worker thread. +// - Handlers execution order is NOT guaranteed +// +//////////////////////////////////////////////////////////////////////////////// +// borrowed from +// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/ +class TimerQueue { + public: + TimerQueue() : m_th(&TimerQueue::run, this) {} + + ~TimerQueue() { shutdown(); } + + // This function is not thread-safe. + void shutdown() { + if (closed_) { + return; + } + cancelAll(); + // Abusing the timer queue to trigger the shutdown. + add(0, [this](bool) { + m_finish = true; + return std::make_pair(false, 0); + }); + m_th.join(); + closed_ = true; + } + + // Adds a new timer + // \return + // Returns the ID of the new timer. You can use this ID to cancel the + // timer + uint64_t add(int64_t milliseconds, + std::function(bool)> handler) { + WorkItem item; + Clock::time_point tp = Clock::now(); + item.end = tp + std::chrono::milliseconds(milliseconds); + TEST_SYNC_POINT_CALLBACK("TimeQueue::Add:item.end", &item.end); + item.period = milliseconds; + item.handler = std::move(handler); + + std::unique_lock lk(m_mtx); + uint64_t id = ++m_idcounter; + item.id = id; + m_items.push(std::move(item)); + + // Something changed, so wake up timer thread + m_checkWork.notify_one(); + return id; + } + + // Cancels the specified timer + // \return + // 1 if the timer was cancelled. + // 0 if you were too late to cancel (or the timer ID was never valid to + // start with) + size_t cancel(uint64_t id) { + // Instead of removing the item from the container (thus breaking the + // heap integrity), we set the item as having no handler, and put + // that handler on a new item at the top for immediate execution + // The timer thread will then ignore the original item, since it has no + // handler. 
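The add()/cancel() contract described here (every handler runs exactly once, with aborted == true if it was cancelled, and the returned pair<bool, int64_t> controlling rescheduling) can be exercised with a small hedged sketch; the 100 ms values are illustrative only.

// Illustrative TimerQueue usage: one one-shot handler and one cancelled handler.
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <utility>

#include "util/timer_queue.h"

int main() {
  TimerQueue q;

  // Fires once after ~100 ms; returning {false, 0} means "do not reschedule".
  q.add(100, [](bool aborted) {
    std::printf("one-shot, aborted=%d\n", aborted ? 1 : 0);
    return std::make_pair(false, int64_t{0});
  });

  // Would repeat every ~100 ms, but we cancel it; it still runs exactly once,
  // with aborted == true.
  uint64_t id = q.add(100, [](bool aborted) {
    std::printf("repeating, aborted=%d\n", aborted ? 1 : 0);
    return std::make_pair(!aborted, int64_t{100});
  });
  q.cancel(id);

  std::this_thread::sleep_for(std::chrono::milliseconds(300));
  return 0;  // ~TimerQueue() cancels leftovers and joins the worker thread
}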
+ std::unique_lock lk(m_mtx); + for (auto&& item : m_items.getContainer()) { + if (item.id == id && item.handler) { + WorkItem newItem; + // Zero time, so it stays at the top for immediate execution + newItem.end = Clock::time_point(); + newItem.id = 0; // Means it is a canceled item + // Move the handler from item to newitem (thus clearing item) + newItem.handler = std::move(item.handler); + m_items.push(std::move(newItem)); + + // Something changed, so wake up timer thread + m_checkWork.notify_one(); + return 1; + } + } + return 0; + } + + // Cancels all timers + // \return + // The number of timers cancelled + size_t cancelAll() { + // Setting all "end" to 0 (for immediate execution) is ok, + // since it maintains the heap integrity + std::unique_lock lk(m_mtx); + m_cancel = true; + for (auto&& item : m_items.getContainer()) { + if (item.id && item.handler) { + item.end = Clock::time_point(); + item.id = 0; + } + } + auto ret = m_items.size(); + + m_checkWork.notify_one(); + return ret; + } + + private: + using Clock = std::chrono::steady_clock; + TimerQueue(const TimerQueue&) = delete; + TimerQueue& operator=(const TimerQueue&) = delete; + + void run() { + std::unique_lock lk(m_mtx); + while (!m_finish) { + auto end = calcWaitTime_lock(); + if (end.first) { + // Timers found, so wait until it expires (or something else + // changes) + m_checkWork.wait_until(lk, end.second); + } else { + // No timers exist, so wait forever until something changes + m_checkWork.wait(lk); + } + + // Check and execute as much work as possible, such as, all expired + // timers + checkWork(&lk); + } + + // If we are shutting down, we should not have any items left, + // since the shutdown cancels all items + assert(m_items.size() == 0); + } + + std::pair calcWaitTime_lock() { + while (m_items.size()) { + if (m_items.top().handler) { + // Item present, so return the new wait time + return std::make_pair(true, m_items.top().end); + } else { + // Discard empty handlers (they were cancelled) + m_items.pop(); + } + } + + // No items found, so return no wait time (causes the thread to wait + // indefinitely) + return std::make_pair(false, Clock::time_point()); + } + + void checkWork(std::unique_lock* lk) { + while (m_items.size() && m_items.top().end <= Clock::now()) { + WorkItem item(m_items.top()); + m_items.pop(); + + if (item.handler) { + (*lk).unlock(); + auto reschedule_pair = item.handler(item.id == 0); + (*lk).lock(); + if (!m_cancel && reschedule_pair.first) { + int64_t new_period = (reschedule_pair.second == -1) + ? 
item.period + : reschedule_pair.second; + + item.period = new_period; + item.end = Clock::now() + std::chrono::milliseconds(new_period); + m_items.push(std::move(item)); + } + } + } + } + + bool m_finish = false; + bool m_cancel = false; + uint64_t m_idcounter = 0; + std::condition_variable m_checkWork; + + struct WorkItem { + Clock::time_point end; + int64_t period; + uint64_t id; // id==0 means it was cancelled + std::function(bool)> handler; + bool operator>(const WorkItem& other) const { return end > other.end; } + }; + + std::mutex m_mtx; + // Inheriting from priority_queue, so we can access the internal container + class Queue : public std::priority_queue, + std::greater> { + public: + std::vector& getContainer() { return this->c; } + } m_items; + ROCKSDB_NAMESPACE::port::Thread m_th; + bool closed_ = false; +}; diff --git a/src/rocksdb/util/timer_queue_test.cc b/src/rocksdb/util/timer_queue_test.cc new file mode 100644 index 000000000..b3c3768ec --- /dev/null +++ b/src/rocksdb/util/timer_queue_test.cc @@ -0,0 +1,73 @@ +// Portions Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// borrowed from +// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/ +// Timer Queue +// +// License +// +// The source code in this article is licensed under the CC0 license, so feel +// free +// to copy, modify, share, do whatever you want with it. +// No attribution is required, but Ill be happy if you do. +// CC0 license + +// The person who associated a work with this deed has dedicated the work to the +// public domain by waiving all of his or her rights to the work worldwide +// under copyright law, including all related and neighboring rights, to the +// extent allowed by law. You can copy, modify, distribute and perform the +// work, even for +// commercial purposes, all without asking permission. See Other Information +// below. +// + +#include "util/timer_queue.h" + +#include + +namespace Timing { + +using Clock = std::chrono::high_resolution_clock; +double now() { + static auto start = Clock::now(); + return std::chrono::duration(Clock::now() - start) + .count(); +} + +} // namespace Timing + +int main() { + TimerQueue q; + + double tnow = Timing::now(); + + q.add(10000, [tnow](bool aborted) mutable { + printf("T 1: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow); + return std::make_pair(false, 0); + }); + q.add(10001, [tnow](bool aborted) mutable { + printf("T 2: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow); + return std::make_pair(false, 0); + }); + + q.add(1000, [tnow](bool aborted) mutable { + printf("T 3: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow); + return std::make_pair(!aborted, 1000); + }); + + auto id = q.add(2000, [tnow](bool aborted) mutable { + printf("T 4: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow); + return std::make_pair(!aborted, 2000); + }); + + (void)id; + // auto ret = q.cancel(id); + // assert(ret == 1); + // q.cancelAll(); + + return 0; +} +////////////////////////////////////////// diff --git a/src/rocksdb/util/timer_test.cc b/src/rocksdb/util/timer_test.cc new file mode 100644 index 000000000..0ebfa9f3d --- /dev/null +++ b/src/rocksdb/util/timer_test.cc @@ -0,0 +1,402 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/timer.h" + +#include "db/db_test_util.h" +#include "rocksdb/file_system.h" +#include "test_util/mock_time_env.h" + +namespace ROCKSDB_NAMESPACE { + +class TimerTest : public testing::Test { + public: + TimerTest() + : mock_clock_(std::make_shared(SystemClock::Default())) { + } + + protected: + std::shared_ptr mock_clock_; + + void SetUp() override { mock_clock_->InstallTimedWaitFixCallback(); } + + const int kUsPerSec = 1000000; +}; + +TEST_F(TimerTest, SingleScheduleOnce) { + const int kInitDelayUs = 1 * kUsPerSec; + Timer timer(mock_clock_.get()); + + int count = 0; + timer.Add([&] { count++; }, "fn_sch_test", kInitDelayUs, 0); + + ASSERT_TRUE(timer.Start()); + + ASSERT_EQ(0, count); + // Wait for execution to finish + timer.TEST_WaitForRun( + [&] { mock_clock_->SleepForMicroseconds(kInitDelayUs); }); + ASSERT_EQ(1, count); + + ASSERT_TRUE(timer.Shutdown()); +} + +TEST_F(TimerTest, MultipleScheduleOnce) { + const int kInitDelay1Us = 1 * kUsPerSec; + const int kInitDelay2Us = 3 * kUsPerSec; + Timer timer(mock_clock_.get()); + + int count1 = 0; + timer.Add([&] { count1++; }, "fn_sch_test1", kInitDelay1Us, 0); + + int count2 = 0; + timer.Add([&] { count2++; }, "fn_sch_test2", kInitDelay2Us, 0); + + ASSERT_TRUE(timer.Start()); + ASSERT_EQ(0, count1); + ASSERT_EQ(0, count2); + + timer.TEST_WaitForRun( + [&] { mock_clock_->SleepForMicroseconds(kInitDelay1Us); }); + + ASSERT_EQ(1, count1); + ASSERT_EQ(0, count2); + + timer.TEST_WaitForRun([&] { + mock_clock_->SleepForMicroseconds(kInitDelay2Us - kInitDelay1Us); + }); + + ASSERT_EQ(1, count1); + ASSERT_EQ(1, count2); + + ASSERT_TRUE(timer.Shutdown()); +} + +TEST_F(TimerTest, SingleScheduleRepeatedly) { + const int kIterations = 5; + const int kInitDelayUs = 1 * kUsPerSec; + const int kRepeatUs = 1 * kUsPerSec; + + Timer timer(mock_clock_.get()); + int count = 0; + timer.Add([&] { count++; }, "fn_sch_test", kInitDelayUs, kRepeatUs); + + ASSERT_TRUE(timer.Start()); + ASSERT_EQ(0, count); + + timer.TEST_WaitForRun( + [&] { mock_clock_->SleepForMicroseconds(kInitDelayUs); }); + + ASSERT_EQ(1, count); + + // Wait for execution to finish + for (int i = 1; i < kIterations; i++) { + timer.TEST_WaitForRun( + [&] { mock_clock_->SleepForMicroseconds(kRepeatUs); }); + } + ASSERT_EQ(kIterations, count); + + ASSERT_TRUE(timer.Shutdown()); +} + +TEST_F(TimerTest, MultipleScheduleRepeatedly) { + const int kIterations = 5; + const int kInitDelay1Us = 0 * kUsPerSec; + const int kInitDelay2Us = 1 * kUsPerSec; + const int kInitDelay3Us = 0 * kUsPerSec; + const int kRepeatUs = 2 * kUsPerSec; + const int kLargeRepeatUs = 100 * kUsPerSec; + + Timer timer(mock_clock_.get()); + + int count1 = 0; + timer.Add([&] { count1++; }, "fn_sch_test1", kInitDelay1Us, kRepeatUs); + + int count2 = 0; + timer.Add([&] { count2++; }, "fn_sch_test2", kInitDelay2Us, kRepeatUs); + + // Add a function with relatively large repeat interval + int count3 = 0; + timer.Add([&] { count3++; }, "fn_sch_test3", kInitDelay3Us, kLargeRepeatUs); + + ASSERT_TRUE(timer.Start()); + + ASSERT_EQ(0, count2); + // Wait for execution to finish + for (int i = 1; i < kIterations * (kRepeatUs / kUsPerSec); i++) { + timer.TEST_WaitForRun( + [&] { mock_clock_->SleepForMicroseconds(1 * kUsPerSec); }); + ASSERT_EQ((i + 2) / (kRepeatUs / kUsPerSec), count1); + ASSERT_EQ((i + 1) / (kRepeatUs / kUsPerSec), 
count2); + + // large interval function should only run once (the first one). + ASSERT_EQ(1, count3); + } + + timer.Cancel("fn_sch_test1"); + + // Wait for execution to finish + timer.TEST_WaitForRun( + [&] { mock_clock_->SleepForMicroseconds(1 * kUsPerSec); }); + ASSERT_EQ(kIterations, count1); + ASSERT_EQ(kIterations, count2); + ASSERT_EQ(1, count3); + + timer.Cancel("fn_sch_test2"); + + ASSERT_EQ(kIterations, count1); + ASSERT_EQ(kIterations, count2); + + // execute the long interval one + timer.TEST_WaitForRun([&] { + mock_clock_->SleepForMicroseconds( + kLargeRepeatUs - static_cast(mock_clock_->NowMicros())); + }); + ASSERT_EQ(2, count3); + + ASSERT_TRUE(timer.Shutdown()); +} + +TEST_F(TimerTest, AddAfterStartTest) { + const int kIterations = 5; + const int kInitDelayUs = 1 * kUsPerSec; + const int kRepeatUs = 1 * kUsPerSec; + + // wait timer to run and then add a new job + SyncPoint::GetInstance()->LoadDependency( + {{"Timer::Run::Waiting", "TimerTest:AddAfterStartTest:1"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + Timer timer(mock_clock_.get()); + + ASSERT_TRUE(timer.Start()); + + TEST_SYNC_POINT("TimerTest:AddAfterStartTest:1"); + int count = 0; + timer.Add([&] { count++; }, "fn_sch_test", kInitDelayUs, kRepeatUs); + ASSERT_EQ(0, count); + // Wait for execution to finish + timer.TEST_WaitForRun( + [&] { mock_clock_->SleepForMicroseconds(kInitDelayUs); }); + ASSERT_EQ(1, count); + + for (int i = 1; i < kIterations; i++) { + timer.TEST_WaitForRun( + [&] { mock_clock_->SleepForMicroseconds(kRepeatUs); }); + } + ASSERT_EQ(kIterations, count); + + ASSERT_TRUE(timer.Shutdown()); +} + +TEST_F(TimerTest, CancelRunningTask) { + static constexpr char kTestFuncName[] = "test_func"; + const int kRepeatUs = 1 * kUsPerSec; + Timer timer(mock_clock_.get()); + ASSERT_TRUE(timer.Start()); + int* value = new int; + *value = 0; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({ + {"TimerTest::CancelRunningTask:test_func:0", + "TimerTest::CancelRunningTask:BeforeCancel"}, + {"Timer::WaitForTaskCompleteIfNecessary:TaskExecuting", + "TimerTest::CancelRunningTask:test_func:1"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + timer.Add( + [&]() { + *value = 1; + TEST_SYNC_POINT("TimerTest::CancelRunningTask:test_func:0"); + TEST_SYNC_POINT("TimerTest::CancelRunningTask:test_func:1"); + }, + kTestFuncName, 0, kRepeatUs); + port::Thread control_thr([&]() { + TEST_SYNC_POINT("TimerTest::CancelRunningTask:BeforeCancel"); + timer.Cancel(kTestFuncName); + // Verify that *value has been set to 1. 
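These tests rely on RocksDB's SyncPoint facility: LoadDependency({{"A", "B"}}) makes a thread that reaches TEST_SYNC_POINT("B") wait until some thread has passed TEST_SYNC_POINT("A"), which is how Cancel() is forced to overlap the running task here. A hedged minimal sketch of that pattern with made-up point names (sync points are active only in debug builds with processing enabled):

// Sketch: make one thread wait until another has reached a marker.
#include <thread>

#include "test_util/sync_point.h"

using ROCKSDB_NAMESPACE::SyncPoint;

void SyncPointExample() {
  SyncPoint::GetInstance()->LoadDependency(
      {{"Example:AfterWrite", "Example:BeforeRead"}});
  SyncPoint::GetInstance()->EnableProcessing();

  int shared = 0;
  std::thread writer([&] {
    shared = 42;
    TEST_SYNC_POINT("Example:AfterWrite");  // predecessor
  });
  std::thread reader([&] {
    TEST_SYNC_POINT("Example:BeforeRead");  // waits for AfterWrite
    // With sync points active, `shared` has already been written here.
  });
  writer.join();
  reader.join();
  SyncPoint::GetInstance()->DisableProcessing();
}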
+ ASSERT_EQ(1, *value); + delete value; + value = nullptr; + }); + mock_clock_->SleepForMicroseconds(kRepeatUs); + control_thr.join(); + ASSERT_TRUE(timer.Shutdown()); +} + +TEST_F(TimerTest, ShutdownRunningTask) { + const int kRepeatUs = 1 * kUsPerSec; + constexpr char kTestFunc1Name[] = "test_func1"; + constexpr char kTestFunc2Name[] = "test_func2"; + Timer timer(mock_clock_.get()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({ + {"TimerTest::ShutdownRunningTest:test_func:0", + "TimerTest::ShutdownRunningTest:BeforeShutdown"}, + {"Timer::WaitForTaskCompleteIfNecessary:TaskExecuting", + "TimerTest::ShutdownRunningTest:test_func:1"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_TRUE(timer.Start()); + + int* value = new int; + *value = 0; + timer.Add( + [&]() { + TEST_SYNC_POINT("TimerTest::ShutdownRunningTest:test_func:0"); + *value = 1; + TEST_SYNC_POINT("TimerTest::ShutdownRunningTest:test_func:1"); + }, + kTestFunc1Name, 0, kRepeatUs); + + timer.Add([&]() { ++(*value); }, kTestFunc2Name, 0, kRepeatUs); + + port::Thread control_thr([&]() { + TEST_SYNC_POINT("TimerTest::ShutdownRunningTest:BeforeShutdown"); + timer.Shutdown(); + }); + mock_clock_->SleepForMicroseconds(kRepeatUs); + control_thr.join(); + delete value; +} + +TEST_F(TimerTest, AddSameFuncName) { + const int kInitDelayUs = 1 * kUsPerSec; + const int kRepeat1Us = 5 * kUsPerSec; + const int kRepeat2Us = 4 * kUsPerSec; + + Timer timer(mock_clock_.get()); + ASSERT_TRUE(timer.Start()); + + int func_counter1 = 0; + ASSERT_TRUE(timer.Add([&] { func_counter1++; }, "duplicated_func", + kInitDelayUs, kRepeat1Us)); + + int func2_counter = 0; + ASSERT_TRUE( + timer.Add([&] { func2_counter++; }, "func2", kInitDelayUs, kRepeat2Us)); + + // New function with the same name should fail to add + int func_counter2 = 0; + ASSERT_FALSE(timer.Add([&] { func_counter2++; }, "duplicated_func", + kInitDelayUs, kRepeat1Us)); + + ASSERT_EQ(0, func_counter1); + ASSERT_EQ(0, func2_counter); + + timer.TEST_WaitForRun( + [&] { mock_clock_->SleepForMicroseconds(kInitDelayUs); }); + + ASSERT_EQ(1, func_counter1); + ASSERT_EQ(1, func2_counter); + + timer.TEST_WaitForRun([&] { mock_clock_->SleepForMicroseconds(kRepeat1Us); }); + + ASSERT_EQ(2, func_counter1); + ASSERT_EQ(2, func2_counter); + ASSERT_EQ(0, func_counter2); + + ASSERT_TRUE(timer.Shutdown()); +} + +TEST_F(TimerTest, RepeatIntervalWithFuncRunningTime) { + const int kInitDelayUs = 1 * kUsPerSec; + const int kRepeatUs = 5 * kUsPerSec; + const int kFuncRunningTimeUs = 1 * kUsPerSec; + + Timer timer(mock_clock_.get()); + ASSERT_TRUE(timer.Start()); + + int func_counter = 0; + timer.Add( + [&] { + mock_clock_->SleepForMicroseconds(kFuncRunningTimeUs); + func_counter++; + }, + "func", kInitDelayUs, kRepeatUs); + + ASSERT_EQ(0, func_counter); + timer.TEST_WaitForRun( + [&] { mock_clock_->SleepForMicroseconds(kInitDelayUs); }); + ASSERT_EQ(1, func_counter); + ASSERT_EQ(kInitDelayUs + kFuncRunningTimeUs, mock_clock_->NowMicros()); + + // After repeat interval time, the function is not executed, as running + // the function takes some time (`kFuncRunningTimeSec`). The repeat interval + // is the time between ending time of the last call and starting time of the + // next call. 
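Concretely, with the constants in this test (kInitDelayUs = 1 s, kRepeatUs = 5 s, kFuncRunningTimeUs = 1 s), the firing times follow "previous end + repeat interval". A small sketch of that arithmetic, reusing the constant names only for illustration:

// First run starts at kInitDelayUs and ends kFuncRunningTimeUs later; each
// later run starts kRepeatUs after the previous END, not the previous start:
//   start(0) = 1,000,000 us   end(0) = 2,000,000 us
//   start(1) = end(0) + 5,000,000 = 7,000,000 us   (7 s, not 6 s)
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t kInitDelayUs = 1000000;
  const uint64_t kRepeatUs = 5000000;
  const uint64_t kFuncRunningTimeUs = 1000000;
  uint64_t start = kInitDelayUs;
  for (int i = 0; i < 3; i++) {
    const uint64_t end = start + kFuncRunningTimeUs;
    std::printf("run %d: start=%llu us end=%llu us\n", i,
                static_cast<unsigned long long>(start),
                static_cast<unsigned long long>(end));
    start = end + kRepeatUs;  // interval measured from the end of the run
  }
  return 0;
}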
+ uint64_t next_abs_interval_time_us = kInitDelayUs + kRepeatUs; + timer.TEST_WaitForRun([&] { + mock_clock_->SetCurrentTime(next_abs_interval_time_us / kUsPerSec); + }); + ASSERT_EQ(1, func_counter); + + // After the function running time, it's executed again + timer.TEST_WaitForRun( + [&] { mock_clock_->SleepForMicroseconds(kFuncRunningTimeUs); }); + ASSERT_EQ(2, func_counter); + + ASSERT_TRUE(timer.Shutdown()); +} + +TEST_F(TimerTest, DestroyRunningTimer) { + const int kInitDelayUs = 1 * kUsPerSec; + const int kRepeatUs = 1 * kUsPerSec; + + auto timer_ptr = new Timer(mock_clock_.get()); + + int count = 0; + timer_ptr->Add([&] { count++; }, "fn_sch_test", kInitDelayUs, kRepeatUs); + ASSERT_TRUE(timer_ptr->Start()); + + timer_ptr->TEST_WaitForRun( + [&] { mock_clock_->SleepForMicroseconds(kInitDelayUs); }); + + // delete a running timer should not cause any exception + delete timer_ptr; +} + +TEST_F(TimerTest, DestroyTimerWithRunningFunc) { + const int kRepeatUs = 1 * kUsPerSec; + auto timer_ptr = new Timer(mock_clock_.get()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->LoadDependency({ + {"TimerTest::DestroyTimerWithRunningFunc:test_func:0", + "TimerTest::DestroyTimerWithRunningFunc:BeforeDelete"}, + {"Timer::WaitForTaskCompleteIfNecessary:TaskExecuting", + "TimerTest::DestroyTimerWithRunningFunc:test_func:1"}, + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_TRUE(timer_ptr->Start()); + + int count = 0; + timer_ptr->Add( + [&]() { + TEST_SYNC_POINT("TimerTest::DestroyTimerWithRunningFunc:test_func:0"); + count++; + TEST_SYNC_POINT("TimerTest::DestroyTimerWithRunningFunc:test_func:1"); + }, + "fn_running_test", 0, kRepeatUs); + + port::Thread control_thr([&] { + TEST_SYNC_POINT("TimerTest::DestroyTimerWithRunningFunc:BeforeDelete"); + delete timer_ptr; + }); + mock_clock_->SleepForMicroseconds(kRepeatUs); + control_thr.join(); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/user_comparator_wrapper.h b/src/rocksdb/util/user_comparator_wrapper.h new file mode 100644 index 000000000..59ebada12 --- /dev/null +++ b/src/rocksdb/util/user_comparator_wrapper.h @@ -0,0 +1,64 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "monitoring/perf_context_imp.h" +#include "rocksdb/comparator.h" + +namespace ROCKSDB_NAMESPACE { + +// Wrapper of user comparator, with auto increment to +// perf_context.user_key_comparison_count. +class UserComparatorWrapper { + public: + // `UserComparatorWrapper`s constructed with the default constructor are not + // usable and will segfault on any attempt to use them for comparisons. 
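As a hedged illustration of what the wrapper adds, the sketch below wraps the built-in bytewise comparator; the perf counter only moves when perf context collection is enabled in the build.

// Sketch: comparisons routed through the wrapper bump
// perf_context.user_key_comparison_count when perf collection is on.
#include <cassert>
#include <cstdint>

#include "rocksdb/comparator.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/slice.h"
#include "util/user_comparator_wrapper.h"

void WrapperExample() {
  using namespace ROCKSDB_NAMESPACE;
  UserComparatorWrapper wrapped(BytewiseComparator());
  const uint64_t before = get_perf_context()->user_key_comparison_count;
  assert(wrapped.Compare(Slice("a"), Slice("b")) < 0);
  assert(wrapped.Equal(Slice("a"), Slice("a")));
  assert(get_perf_context()->user_key_comparison_count >= before);
}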
+ UserComparatorWrapper() : user_comparator_(nullptr) {} + + explicit UserComparatorWrapper(const Comparator* const user_cmp) + : user_comparator_(user_cmp) {} + + ~UserComparatorWrapper() = default; + + const Comparator* user_comparator() const { return user_comparator_; } + + int Compare(const Slice& a, const Slice& b) const { + PERF_COUNTER_ADD(user_key_comparison_count, 1); + return user_comparator_->Compare(a, b); + } + + bool Equal(const Slice& a, const Slice& b) const { + PERF_COUNTER_ADD(user_key_comparison_count, 1); + return user_comparator_->Equal(a, b); + } + + int CompareTimestamp(const Slice& ts1, const Slice& ts2) const { + return user_comparator_->CompareTimestamp(ts1, ts2); + } + + int CompareWithoutTimestamp(const Slice& a, const Slice& b) const { + PERF_COUNTER_ADD(user_key_comparison_count, 1); + return user_comparator_->CompareWithoutTimestamp(a, b); + } + + int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b, + bool b_has_ts) const { + PERF_COUNTER_ADD(user_key_comparison_count, 1); + return user_comparator_->CompareWithoutTimestamp(a, a_has_ts, b, b_has_ts); + } + + bool EqualWithoutTimestamp(const Slice& a, const Slice& b) const { + return user_comparator_->EqualWithoutTimestamp(a, b); + } + + private: + const Comparator* user_comparator_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/vector_iterator.h b/src/rocksdb/util/vector_iterator.h new file mode 100644 index 000000000..c4cc01d56 --- /dev/null +++ b/src/rocksdb/util/vector_iterator.h @@ -0,0 +1,118 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#pragma once + +#include +#include +#include + +#include "db/dbformat.h" +#include "rocksdb/comparator.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice.h" +#include "table/internal_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +// Iterator over a vector of keys/values +class VectorIterator : public InternalIterator { + public: + VectorIterator(std::vector keys, std::vector values, + const CompareInterface* icmp = nullptr) + : keys_(std::move(keys)), + values_(std::move(values)), + current_(keys_.size()), + indexed_cmp_(icmp, &keys_) { + assert(keys_.size() == values_.size()); + + indices_.reserve(keys_.size()); + for (size_t i = 0; i < keys_.size(); i++) { + indices_.push_back(i); + } + if (icmp != nullptr) { + std::sort(indices_.begin(), indices_.end(), indexed_cmp_); + } + } + + virtual bool Valid() const override { + return !indices_.empty() && current_ < indices_.size(); + } + + virtual void SeekToFirst() override { current_ = 0; } + virtual void SeekToLast() override { current_ = indices_.size() - 1; } + + virtual void Seek(const Slice& target) override { + if (indexed_cmp_.cmp != nullptr) { + current_ = std::lower_bound(indices_.begin(), indices_.end(), target, + indexed_cmp_) - + indices_.begin(); + } else { + current_ = + std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) - + keys_.begin(); + } + } + + virtual void SeekForPrev(const Slice& target) override { + if (indexed_cmp_.cmp != nullptr) { + current_ = std::upper_bound(indices_.begin(), indices_.end(), target, + indexed_cmp_) - + indices_.begin(); + } else { + current_ = + std::upper_bound(keys_.begin(), keys_.end(), target.ToString()) - + keys_.begin(); + } + if (!Valid()) { + SeekToLast(); + } else { + Prev(); + } + } + + virtual void Next() override { current_++; } + virtual void Prev() override { current_--; } + + virtual Slice key() const override { + return Slice(keys_[indices_[current_]]); + } + 
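A hedged usage sketch of this iterator with made-up keys and values: passing a comparator sorts the index array so Seek() can binary-search, while pre-sorted input may omit it.

// Sketch: iterate a small in-memory key/value set with VectorIterator.
#include <string>
#include <utility>
#include <vector>

#include "rocksdb/comparator.h"
#include "util/vector_iterator.h"

void VectorIteratorExample() {
  using namespace ROCKSDB_NAMESPACE;
  std::vector<std::string> keys{"b", "a", "c"};
  std::vector<std::string> values{"2", "1", "3"};
  // BytewiseComparator() provides the CompareInterface, so the index array
  // gets sorted and Seek() can binary-search.
  VectorIterator iter(std::move(keys), std::move(values), BytewiseComparator());
  for (iter.Seek("a"); iter.Valid(); iter.Next()) {
    Slice k = iter.key();  // "a", "b", "c" in order
    Slice v = iter.value();
    (void)k;
    (void)v;
  }
}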
virtual Slice value() const override { + return Slice(values_[indices_[current_]]); + } + + virtual Status status() const override { return Status::OK(); } + + virtual bool IsKeyPinned() const override { return true; } + virtual bool IsValuePinned() const override { return true; } + + protected: + std::vector keys_; + std::vector values_; + size_t current_; + + private: + struct IndexedKeyComparator { + IndexedKeyComparator(const CompareInterface* c, + const std::vector* ks) + : cmp(c), keys(ks) {} + + bool operator()(size_t a, size_t b) const { + return cmp->Compare((*keys)[a], (*keys)[b]) < 0; + } + + bool operator()(size_t a, const Slice& b) const { + return cmp->Compare((*keys)[a], b) < 0; + } + + bool operator()(const Slice& a, size_t b) const { + return cmp->Compare(a, (*keys)[b]) < 0; + } + + const CompareInterface* cmp; + const std::vector* keys; + }; + + IndexedKeyComparator indexed_cmp_; + std::vector indices_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/work_queue.h b/src/rocksdb/util/work_queue.h new file mode 100644 index 000000000..94ece85d9 --- /dev/null +++ b/src/rocksdb/util/work_queue.h @@ -0,0 +1,150 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +/* + * Copyright (c) 2016-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +/// Unbounded thread-safe work queue. +// +// This file is an excerpt from Facebook's zstd repo at +// https://github.com/facebook/zstd/. The relevant file is +// contrib/pzstd/utils/WorkQueue.h. + +template +class WorkQueue { + // Protects all member variable access + std::mutex mutex_; + std::condition_variable readerCv_; + std::condition_variable writerCv_; + std::condition_variable finishCv_; + + std::queue queue_; + bool done_; + std::size_t maxSize_; + + // Must have lock to call this function + bool full() const { + if (maxSize_ == 0) { + return false; + } + return queue_.size() >= maxSize_; + } + + public: + /** + * Constructs an empty work queue with an optional max size. + * If `maxSize == 0` the queue size is unbounded. + * + * @param maxSize The maximum allowed size of the work queue. + */ + WorkQueue(std::size_t maxSize = 0) : done_(false), maxSize_(maxSize) {} + + /** + * Push an item onto the work queue. Notify a single thread that work is + * available. If `finish()` has been called, do nothing and return false. + * If `push()` returns false, then `item` has not been copied from. + * + * @param item Item to push onto the queue. + * @returns True upon success, false if `finish()` has been called. An + * item was pushed iff `push()` returns true. + */ + template + bool push(U&& item) { + { + std::unique_lock lock(mutex_); + while (full() && !done_) { + writerCv_.wait(lock); + } + if (done_) { + return false; + } + queue_.push(std::forward(item)); + } + readerCv_.notify_one(); + return true; + } + + /** + * Attempts to pop an item off the work queue. 
It will block until data is + * available or `finish()` has been called. + * + * @param[out] item If `pop` returns `true`, it contains the popped item. + * If `pop` returns `false`, it is unmodified. + * @returns True upon success. False if the queue is empty and + * `finish()` has been called. + */ + bool pop(T& item) { + { + std::unique_lock lock(mutex_); + while (queue_.empty() && !done_) { + readerCv_.wait(lock); + } + if (queue_.empty()) { + assert(done_); + return false; + } + item = queue_.front(); + queue_.pop(); + } + writerCv_.notify_one(); + return true; + } + + /** + * Sets the maximum queue size. If `maxSize == 0` then it is unbounded. + * + * @param maxSize The new maximum queue size. + */ + void setMaxSize(std::size_t maxSize) { + { + std::lock_guard lock(mutex_); + maxSize_ = maxSize; + } + writerCv_.notify_all(); + } + + /** + * Promise that `push()` won't be called again, so once the queue is empty + * there will never any more work. + */ + void finish() { + { + std::lock_guard lock(mutex_); + assert(!done_); + done_ = true; + } + readerCv_.notify_all(); + writerCv_.notify_all(); + finishCv_.notify_all(); + } + + /// Blocks until `finish()` has been called (but the queue may not be empty). + void waitUntilFinished() { + std::unique_lock lock(mutex_); + while (!done_) { + finishCv_.wait(lock); + } + } +}; +} // namespace ROCKSDB_NAMESPACE diff --git a/src/rocksdb/util/work_queue_test.cc b/src/rocksdb/util/work_queue_test.cc new file mode 100644 index 000000000..c23a51279 --- /dev/null +++ b/src/rocksdb/util/work_queue_test.cc @@ -0,0 +1,272 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +/* + * Copyright (c) 2016-present, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + */ +#include "util/work_queue.h" + +#include + +#include +#include +#include +#include +#include + +#include "port/stack_trace.h" + +namespace ROCKSDB_NAMESPACE { + +// Unit test for work_queue.h. +// +// This file is an excerpt from Facebook's zstd repo at +// https://github.com/facebook/zstd/. The relevant file is +// contrib/pzstd/utils/test/WorkQueueTest.cpp. 
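Before the tests, a compact hedged sketch of the WorkQueue contract they exercise: push() blocks while a bounded queue is full, pop() blocks until an item arrives or finish() is called, and pop() returning false is the drain signal.

// Sketch: single producer, single consumer over a bounded WorkQueue<int>.
#include <thread>

#include "util/work_queue.h"

void WorkQueueExample() {
  ROCKSDB_NAMESPACE::WorkQueue<int> queue(/*maxSize=*/4);  // bounded

  std::thread consumer([&queue] {
    int item;
    while (queue.pop(item)) {  // returns false once empty AND finished
      // ...process item...
    }
  });

  for (int i = 0; i < 100; ++i) {
    queue.push(i);  // blocks briefly whenever the bounded queue is full
  }
  queue.finish();  // no more pushes; lets the consumer drain and exit
  consumer.join();
}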
+ +struct Popper { + WorkQueue* queue; + int* results; + std::mutex* mutex; + + void operator()() { + int result; + while (queue->pop(result)) { + std::lock_guard lock(*mutex); + results[result] = result; + } + } +}; + +TEST(WorkQueue, SingleThreaded) { + WorkQueue queue; + int result; + + queue.push(5); + EXPECT_TRUE(queue.pop(result)); + EXPECT_EQ(5, result); + + queue.push(1); + queue.push(2); + EXPECT_TRUE(queue.pop(result)); + EXPECT_EQ(1, result); + EXPECT_TRUE(queue.pop(result)); + EXPECT_EQ(2, result); + + queue.push(1); + queue.push(2); + queue.finish(); + EXPECT_TRUE(queue.pop(result)); + EXPECT_EQ(1, result); + EXPECT_TRUE(queue.pop(result)); + EXPECT_EQ(2, result); + EXPECT_FALSE(queue.pop(result)); + + queue.waitUntilFinished(); +} + +TEST(WorkQueue, SPSC) { + WorkQueue queue; + const int max = 100; + + for (int i = 0; i < 10; ++i) { + queue.push(i); + } + + std::thread thread([&queue, max] { + int result; + for (int i = 0;; ++i) { + if (!queue.pop(result)) { + EXPECT_EQ(i, max); + break; + } + EXPECT_EQ(i, result); + } + }); + + std::this_thread::yield(); + for (int i = 10; i < max; ++i) { + queue.push(i); + } + queue.finish(); + + thread.join(); +} + +TEST(WorkQueue, SPMC) { + WorkQueue queue; + std::vector results(50, -1); + std::mutex mutex; + std::vector threads; + for (int i = 0; i < 5; ++i) { + threads.emplace_back(Popper{&queue, results.data(), &mutex}); + } + + for (int i = 0; i < 50; ++i) { + queue.push(i); + } + queue.finish(); + + for (auto& thread : threads) { + thread.join(); + } + + for (int i = 0; i < 50; ++i) { + EXPECT_EQ(i, results[i]); + } +} + +TEST(WorkQueue, MPMC) { + WorkQueue queue; + std::vector results(100, -1); + std::mutex mutex; + std::vector popperThreads; + for (int i = 0; i < 4; ++i) { + popperThreads.emplace_back(Popper{&queue, results.data(), &mutex}); + } + + std::vector pusherThreads; + for (int i = 0; i < 2; ++i) { + auto min = i * 50; + auto max = (i + 1) * 50; + pusherThreads.emplace_back([&queue, min, max] { + for (int j = min; j < max; ++j) { + queue.push(j); + } + }); + } + + for (auto& thread : pusherThreads) { + thread.join(); + } + queue.finish(); + + for (auto& thread : popperThreads) { + thread.join(); + } + + for (int i = 0; i < 100; ++i) { + EXPECT_EQ(i, results[i]); + } +} + +TEST(WorkQueue, BoundedSizeWorks) { + WorkQueue queue(1); + int result; + queue.push(5); + queue.pop(result); + queue.push(5); + queue.pop(result); + queue.push(5); + queue.finish(); + queue.pop(result); + EXPECT_EQ(5, result); +} + +TEST(WorkQueue, BoundedSizePushAfterFinish) { + WorkQueue queue(1); + int result; + queue.push(5); + std::thread pusher([&queue] { queue.push(6); }); + // Dirtily try and make sure that pusher has run. + std::this_thread::sleep_for(std::chrono::seconds(1)); + queue.finish(); + EXPECT_TRUE(queue.pop(result)); + EXPECT_EQ(5, result); + EXPECT_FALSE(queue.pop(result)); + + pusher.join(); +} + +TEST(WorkQueue, SetMaxSize) { + WorkQueue queue(2); + int result; + queue.push(5); + queue.push(6); + queue.setMaxSize(1); + std::thread pusher([&queue] { queue.push(7); }); + // Dirtily try and make sure that pusher has run. 
+ std::this_thread::sleep_for(std::chrono::seconds(1)); + queue.finish(); + EXPECT_TRUE(queue.pop(result)); + EXPECT_EQ(5, result); + EXPECT_TRUE(queue.pop(result)); + EXPECT_EQ(6, result); + EXPECT_FALSE(queue.pop(result)); + + pusher.join(); +} + +TEST(WorkQueue, BoundedSizeMPMC) { + WorkQueue queue(10); + std::vector results(200, -1); + std::mutex mutex; + std::cerr << "Creating popperThreads" << std::endl; + std::vector popperThreads; + for (int i = 0; i < 4; ++i) { + popperThreads.emplace_back(Popper{&queue, results.data(), &mutex}); + } + + std::cerr << "Creating pusherThreads" << std::endl; + std::vector pusherThreads; + for (int i = 0; i < 2; ++i) { + auto min = i * 100; + auto max = (i + 1) * 100; + pusherThreads.emplace_back([&queue, min, max] { + for (int j = min; j < max; ++j) { + queue.push(j); + } + }); + } + + std::cerr << "Joining pusherThreads" << std::endl; + for (auto& thread : pusherThreads) { + thread.join(); + } + std::cerr << "Finishing queue" << std::endl; + queue.finish(); + + std::cerr << "Joining popperThreads" << std::endl; + for (auto& thread : popperThreads) { + thread.join(); + } + + std::cerr << "Inspecting results" << std::endl; + for (int i = 0; i < 200; ++i) { + EXPECT_EQ(i, results[i]); + } +} + +TEST(WorkQueue, FailedPush) { + WorkQueue queue; + EXPECT_TRUE(queue.push(1)); + queue.finish(); + EXPECT_FALSE(queue.push(1)); +} + +TEST(WorkQueue, FailedPop) { + WorkQueue queue; + int x = 5; + EXPECT_TRUE(queue.push(x)); + queue.finish(); + x = 0; + EXPECT_TRUE(queue.pop(x)); + EXPECT_EQ(5, x); + EXPECT_FALSE(queue.pop(x)); + EXPECT_EQ(5, x); +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/rocksdb/util/xxhash.cc b/src/rocksdb/util/xxhash.cc new file mode 100644 index 000000000..88852c330 --- /dev/null +++ b/src/rocksdb/util/xxhash.cc @@ -0,0 +1,48 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +/* + * xxHash - Extremely Fast Hash algorithm + * Copyright (C) 2012-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/* + * xxhash.c instantiates functions defined in xxhash.h + */ +// clang-format off +#ifndef XXH_STATIC_LINKING_ONLY +#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ +#endif // !defined(XXH_STATIC_LINKING_ONLY) +#define XXH_IMPLEMENTATION /* access definitions */ + +#include "xxhash.h" diff --git a/src/rocksdb/util/xxhash.h b/src/rocksdb/util/xxhash.h new file mode 100644 index 000000000..195f06b39 --- /dev/null +++ b/src/rocksdb/util/xxhash.h @@ -0,0 +1,5346 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +/* BEGIN RocksDB customizations */ +#ifndef XXH_STATIC_LINKING_ONLY +// Using compiled xxhash.cc +#define XXH_STATIC_LINKING_ONLY 1 +#endif // !defined(XXH_STATIC_LINKING_ONLY) +#ifndef XXH_NAMESPACE +#define XXH_NAMESPACE ROCKSDB_ +#endif // !defined(XXH_NAMESPACE) + +// for FALLTHROUGH_INTENDED, inserted as appropriate +#include "port/lang.h" +/* END RocksDB customizations */ + +// clang-format off +/* + * xxHash - Extremely Fast Hash algorithm + * Header File + * Copyright (C) 2012-2020 Yann Collet + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ +/*! 
+ * @mainpage xxHash + * + * @file xxhash.h + * xxHash prototypes and implementation + */ +/* TODO: update */ +/* Notice extracted from xxHash homepage: + +xxHash is an extremely fast hash algorithm, running at RAM speed limits. +It also successfully passes all tests from the SMHasher suite. + +Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) + +Name Speed Q.Score Author +xxHash 5.4 GB/s 10 +CrapWow 3.2 GB/s 2 Andrew +MurmurHash 3a 2.7 GB/s 10 Austin Appleby +SpookyHash 2.0 GB/s 10 Bob Jenkins +SBox 1.4 GB/s 9 Bret Mulvey +Lookup3 1.2 GB/s 9 Bob Jenkins +SuperFastHash 1.2 GB/s 1 Paul Hsieh +CityHash64 1.05 GB/s 10 Pike & Alakuijala +FNV 0.55 GB/s 5 Fowler, Noll, Vo +CRC32 0.43 GB/s 9 +MD5-32 0.33 GB/s 10 Ronald L. Rivest +SHA1-32 0.28 GB/s 10 + +Q.Score is a measure of quality of the hash function. +It depends on successfully passing SMHasher test set. +10 is a perfect score. + +Note: SMHasher's CRC32 implementation is not the fastest one. +Other speed-oriented implementations can be faster, +especially in combination with PCLMUL instruction: +https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735 + +A 64-bit version, named XXH64, is available since r35. +It offers much better speed, but for 64-bit applications only. +Name Speed on 64 bits Speed on 32 bits +XXH64 13.8 GB/s 1.9 GB/s +XXH32 6.8 GB/s 6.0 GB/s +*/ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* **************************** + * INLINE mode + ******************************/ +/*! + * XXH_INLINE_ALL (and XXH_PRIVATE_API) + * Use these build macros to inline xxhash into the target unit. + * Inlining improves performance on small inputs, especially when the length is + * expressed as a compile-time constant: + * + * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html + * + * It also keeps xxHash symbols private to the unit, so they are not exported. + * + * Usage: + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * + * Do not compile and link xxhash.o as a separate object, as it is not useful. + */ +#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ + && !defined(XXH_INLINE_ALL_31684351384) + /* this section should be traversed only once */ +# define XXH_INLINE_ALL_31684351384 + /* give access to the advanced API, required to compile implementations */ +# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ +# define XXH_STATIC_LINKING_ONLY + /* make all functions private */ +# undef XXH_PUBLIC_API +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* note: this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif + + /* + * This part deals with the special case where a unit wants to inline xxHash, + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such + * as part of some previously included *.h header file. + * Without further action, the new include would just be ignored, + * and functions would effectively _not_ be inlined (silent failure). + * The following macros solve this situation by prefixing all inlined names, + * avoiding naming collision with previous inclusions. 
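As a concrete illustration of the inline mode described above, a consuming translation unit might look like the sketch below. This is an editorial example, not part of the patch, and it assumes the unmodified upstream xxhash.h: this RocksDB copy pre-defines XXH_NAMESPACE, which the header rejects in combination with XXH_INLINE_ALL (see the #error just below).

/* sketch only: inline all of xxHash into this one unit */
#define XXH_INLINE_ALL
#include "xxhash.h"
#include <stdio.h>

int main(void)
{
    const char msg[] = "hello";
    /* symbols stay private to this unit; nothing is exported */
    printf("%08x\n", (unsigned)XXH32(msg, sizeof(msg) - 1, 0));
    return 0;
}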
+ */ +# ifdef XXH_NAMESPACE +# error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported" + /* + * Note: Alternative: #undef all symbols (it's a pretty large list). + * Without #error: it compiles, but functions are actually not inlined. + */ +# endif +# define XXH_NAMESPACE XXH_INLINE_ + /* + * Some identifiers (enums, type names) are not symbols, but they must + * still be renamed to avoid redeclaration. + * Alternative solution: do not redeclare them. + * However, this requires some #ifdefs, and is a more dispersed action. + * Meanwhile, renaming can be achieved in a single block + */ +# define XXH_IPREF(Id) XXH_INLINE_ ## Id +# define XXH_OK XXH_IPREF(XXH_OK) +# define XXH_ERROR XXH_IPREF(XXH_ERROR) +# define XXH_errorcode XXH_IPREF(XXH_errorcode) +# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) +# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) +# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) +# define XXH32_state_s XXH_IPREF(XXH32_state_s) +# define XXH32_state_t XXH_IPREF(XXH32_state_t) +# define XXH64_state_s XXH_IPREF(XXH64_state_s) +# define XXH64_state_t XXH_IPREF(XXH64_state_t) +# define XXH3_state_s XXH_IPREF(XXH3_state_s) +# define XXH3_state_t XXH_IPREF(XXH3_state_t) +# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) + /* Ensure the header is parsed again, even if it was previously included */ +# undef XXHASH_H_5627135585666179 +# undef XXHASH_H_STATIC_13879238742 +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + + + +/* **************************************************************** + * Stable API + *****************************************************************/ +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + + +/*! + * @defgroup public Public API + * Contains details on the public xxHash functions. + * @{ + */ +/* specific declaration modes for Windows */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +#ifdef XXH_DOXYGEN +/*! + * @brief Emulate a namespace by transparently prefixing all symbols. + * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of XXH_NAMESPACE + * (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. 
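To make the namespacing mechanism concrete: this vendored copy defines XXH_NAMESPACE as ROCKSDB_ (see the customization block at the top of the header), so callers are written against the plain names while the linker sees prefixed symbols. A minimal sketch, with an illustrative include path:

#include "util/xxhash.h"   /* illustrative path to this vendored header */

XXH32_hash_t page_checksum(const void* page, size_t n)
{
    /* The preprocessor expands XXH32 to ROCKSDB_XXH32, so this cannot
     * collide with another xxHash linked into the same binary. */
    return XXH32(page, n, 0);
}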
+ */ +# define XXH_NAMESPACE /* YOUR NAME HERE */ +# undef XXH_NAMESPACE +#endif + +#ifdef XXH_NAMESPACE +# define XXH_CAT(A,B) A##B +# define XXH_NAME2(A,B) XXH_CAT(A,B) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +/* XXH32 */ +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) +# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) +# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +/* XXH64 */ +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) +# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +/* XXH3_64bits */ +# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) +# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) +# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) +# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) +# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) +# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) +# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) +# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) +# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) +# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) +# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) +# define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret) +/* XXH3_128bits */ +# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) +# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) +# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) +# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) +# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) +# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) +# define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) +# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) +# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) +# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) +# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) +# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) +# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) +#endif + + +/* ************************************* +* Version +***************************************/ +#define XXH_VERSION_MAJOR 0 
+#define XXH_VERSION_MINOR 8
+#define XXH_VERSION_RELEASE 1
+#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+
+/*!
+ * @brief Obtains the xxHash version.
+ *
+ * This is only useful when xxHash is compiled as a shared library, as it is
+ * independent of the version defined in the header.
+ *
+ * @return `XXH_VERSION_NUMBER` as of when the library was compiled.
+ */
+XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+* Definitions
+******************************/
+#include <stddef.h> /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/*-**********************************************************************
+* 32-bit hash
+************************************************************************/
+#if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */
+/*!
+ * @brief An unsigned 32-bit integer.
+ *
+ * Not necessarily defined to `uint32_t` but functionally equivalent.
+ */
+typedef uint32_t XXH32_hash_t;
+#elif !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+ typedef uint32_t XXH32_hash_t;
+#else
+# include <limits.h>
+# if UINT_MAX == 0xFFFFFFFFUL
+ typedef unsigned int XXH32_hash_t;
+# else
+# if ULONG_MAX == 0xFFFFFFFFUL
+ typedef unsigned long XXH32_hash_t;
+# else
+# error "unsupported platform: need a 32-bit type"
+# endif
+# endif
+#endif
+
+/*!
+ * @}
+ *
+ * @defgroup xxh32_family XXH32 family
+ * @ingroup public
+ * Contains functions used in the classic 32-bit xxHash algorithm.
+ *
+ * @note
+ * XXH32 is considered rather weak by today's standards.
+ * The @ref xxh3_family provides competitive speed for both 32-bit and 64-bit
+ * systems, and offers true 64/128 bit hash results. It provides a superior
+ * level of dispersion, and greatly reduces the risks of collisions.
+ *
+ * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families
+ * @see @ref xxh32_impl for implementation details
+ * @{
+ */
+
+/*!
+ * @brief Calculates the 32-bit hash of @p input using xxHash32.
+ *
+ * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 32-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ * The memory between @p input and @p input + @p length must be valid,
+ * readable, contiguous memory. However, if @p length is `0`, @p input may be
+ * `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 32-bit hash value.
+ *
+ * @see
+ * XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128():
+ * Direct equivalents for the other variants of xxHash.
+ * @see
+ * XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
+
+/*!
+ * Streaming functions generate the xxHash value from an incremental input.
+ * This method is slower than single-call functions, due to state management.
+ * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+ *
+ * An XXH state must first be allocated using `XXH*_createState()`.
+ *
+ * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
+ *
+ * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
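Before the streaming walkthrough continues below, a minimal one-shot call, the path the text above recommends for small inputs, looks like this sketch; the key string is illustrative, and 0xbaad5eed is simply the seed reused by the file's own streaming example:

#include <string.h>
#include "xxhash.h"

XXH32_hash_t hash_key(const char* key)
{
    /* one shot: no state to create, update, or free */
    return XXH32(key, strlen(key), 0xbaad5eed);
}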
+ *
+ * The function returns an error code, with 0 meaning OK, and any other value
+ * meaning there is an error.
+ *
+ * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
+ * This function returns the nn-bits hash as an int or long long.
+ *
+ * It's still possible to continue inserting input into the hash state after a
+ * digest, and generate new hash values later on by invoking `XXH*_digest()`.
+ *
+ * When done, release the state using `XXH*_freeState()`.
+ *
+ * Example code for incrementally hashing a file:
+ * @code{.c}
+ * #include <stdio.h>
+ * #include <assert.h>
+ * #define BUFFER_SIZE 256
+ *
+ * // Note: XXH64 and XXH3 use the same interface.
+ * XXH32_hash_t
+ * hashFile(FILE* stream)
+ * {
+ * XXH32_state_t* state;
+ * unsigned char buf[BUFFER_SIZE];
+ * size_t amt;
+ * XXH32_hash_t hash;
+ *
+ * state = XXH32_createState(); // Create a state
+ * assert(state != NULL); // Error check here
+ * XXH32_reset(state, 0xbaad5eed); // Reset state with our seed
+ * while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) {
+ * XXH32_update(state, buf, amt); // Hash the file in chunks
+ * }
+ * hash = XXH32_digest(state); // Finalize the hash
+ * XXH32_freeState(state); // Clean up
+ * return hash;
+ * }
+ * @endcode
+ */
+
+/*!
+ * @typedef struct XXH32_state_s XXH32_state_t
+ * @brief The opaque state struct for the XXH32 streaming API.
+ *
+ * @see XXH32_state_s for details.
+ */
+typedef struct XXH32_state_s XXH32_state_t;
+
+/*!
+ * @brief Allocates an @ref XXH32_state_t.
+ *
+ * Must be freed with XXH32_freeState().
+ * @return An allocated XXH32_state_t on success, `NULL` on failure.
+ */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
+/*!
+ * @brief Frees an @ref XXH32_state_t.
+ *
+ * Must be allocated with XXH32_createState().
+ * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState().
+ * @return XXH_OK.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr);
+/*!
+ * @brief Copies one @ref XXH32_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ * @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
+
+/*!
+ * @brief Resets an @ref XXH32_state_t to begin a new hash.
+ *
+ * This function resets and seeds a state. Call it before @ref XXH32_update().
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 32-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH32_state_t.
+ *
+ * Call this to incrementally consume blocks of data.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ * @pre
+ * The memory between @p input and @p input + @p length must be valid,
+ * readable, contiguous memory. However, if @p length is `0`, @p input may be
+ * `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated hash value from an @ref XXH32_state_t.
+ *
+ * @note
+ * Calling XXH32_digest() will not affect @p statePtr, so you can update,
+ * digest, and update again.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ *
+ * @return The calculated xxHash32 value from that state.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
+
+/******* Canonical representation *******/
+
+/*
+ * The default return values from XXH functions are unsigned 32 and 64 bit
+ * integers.
+ * This is the simplest and fastest format for further post-processing.
+ *
+ * However, this leaves open the question of what is the order on the byte level,
+ * since little and big endian conventions will store the same number differently.
+ *
+ * The canonical representation settles this issue by mandating big-endian
+ * convention, the same convention as human-readable numbers (large digits first).
+ *
+ * When writing hash values to storage, sending them over a network, or printing
+ * them, it's highly recommended to use the canonical representation to ensure
+ * portability across a wider range of systems, present and future.
+ *
+ * The following functions allow transformation of hash values to and from
+ * canonical format.
+ */
+
+/*!
+ * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
+ */
+typedef struct {
+ unsigned char digest[4]; /*!< Hash bytes, big endian */
+} XXH32_canonical_t;
+
+/*!
+ * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
+ *
+ * @param dst The @ref XXH32_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH32_hash_t to be converted.
+ *
+ * @pre
+ * @p dst must not be `NULL`.
+ */
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+
+/*!
+ * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
+ *
+ * @param src The @ref XXH32_canonical_t to convert.
+ *
+ * @pre
+ * @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ */
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+
+
+/*!
+ * @}
+ * @ingroup public
+ * @{
+ */
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+* 64-bit hash
+************************************************************************/
+#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */
+/*!
+ * @brief An unsigned 64-bit integer.
+ *
+ * Not necessarily defined to `uint64_t` but functionally equivalent.
+ */
+typedef uint64_t XXH64_hash_t;
+#elif !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+ typedef uint64_t XXH64_hash_t;
+#else
+# include <limits.h>
+# if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
+ /* LP64 ABI says uint64_t is unsigned long */
+ typedef unsigned long XXH64_hash_t;
+# else
+ /* the following type must have a width of 64-bit */
+ typedef unsigned long long XXH64_hash_t;
+# endif
+#endif
+
+/*!
+ * @}
+ *
+ * @defgroup xxh64_family XXH64 family
+ * @ingroup public
+ * @{
+ * Contains functions used in the classic 64-bit xxHash algorithm.
+ *
+ * @note
+ * XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ * and offers true 64/128 bit hash results.
It provides a superior level of + * dispersion, and greatly reduces the risks of collisions. + */ + + +/*! + * @brief Calculates the 64-bit hash of @p input using xxHash64. + * + * This function usually runs faster on 64-bit systems, but slower on 32-bit + * systems (see benchmark). + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 64-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit hash. + * + * @see + * XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): + * Direct equivalents for the other variants of xxHash. + * @see + * XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version. + */ +XXH_PUBLIC_API XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed); + +/******* Streaming *******/ +/*! + * @brief The opaque state struct for the XXH64 streaming API. + * + * @see XXH64_state_s for details. + */ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); + +/******* Canonical representation *******/ +typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t; +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); + +/*! + * @} + * ************************************************************************ + * @defgroup xxh3_family XXH3 family + * @ingroup public + * @{ + * + * XXH3 is a more recent hash algorithm featuring: + * - Improved speed for both small and large inputs + * - True 64-bit and 128-bit outputs + * - SIMD acceleration + * - Improved 32-bit viability + * + * Speed analysis methodology is explained here: + * + * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * + * Compared to XXH64, expect XXH3 to run approximately + * ~2x faster on large inputs and >3x faster on small ones, + * exact differences vary depending on platform. + * + * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic, + * but does not require it. + * Any 32-bit and 64-bit targets that can run XXH32 smoothly + * can run XXH3 at competitive speeds, even without vector support. + * Further details are explained in the implementation. + * + * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8, + * ZVector and scalar targets. This can be controlled via the XXH_VECTOR macro. + * + * XXH3 implementation is portable: + * it has a generic C90 formulation that can be compiled on any platform, + * all implementations generage exactly the same hash value on all platforms. + * Starting from v0.8.0, it's also labelled "stable", meaning that + * any future version will also generate the same hash value. 
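Tying together the XXH64 one-shot call and the canonical representation declared above, a small editorial sketch (the buffer and zero seed are illustrative):

#include <stdio.h>
#include "xxhash.h"

void store_hash64(const void* data, size_t len)
{
    XXH64_hash_t h = XXH64(data, len, 0);
    XXH64_canonical_t c;
    size_t i;
    XXH64_canonicalFromHash(&c, h);          /* fixed big-endian byte order */
    for (i = 0; i < sizeof(c.digest); i++) { /* safe to write to disk or wire */
        printf("%02x", c.digest[i]);
    }
    printf("\n");
}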
+ * + * XXH3 offers 2 variants, _64bits and _128bits. + * + * When only 64 bits are needed, prefer invoking the _64bits variant, as it + * reduces the amount of mixing, resulting in faster speed on small inputs. + * It's also generally simpler to manipulate a scalar return type than a struct. + * + * The API supports one-shot hashing, streaming mode, and custom secrets. + */ + +/*-********************************************************************** +* XXH3 64-bit variant +************************************************************************/ + +/* XXH3_64bits(): + * default 64-bit variant, using default secret and default seed of 0. + * It's the fastest variant. */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len); + +/* + * XXH3_64bits_withSeed(): + * This variant generates a custom secret on the fly + * based on default secret altered using the `seed` value. + * While this operation is decently fast, note that it's not completely free. + * Note: seed==0 produces the same results as XXH3_64bits(). + */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); + +/*! + * The bare minimum size for a custom secret. + * + * @see + * XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(), + * XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret(). + */ +#define XXH3_SECRET_SIZE_MIN 136 + +/* + * XXH3_64bits_withSecret(): + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional collision. + * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN). + * However, the quality of produced hash values depends on secret's entropy. + * Technically, the secret must look like a bunch of random bytes. + * Avoid "trivial" or structured data such as repeated sequences or a text document. + * Whenever unsure about the "randomness" of the blob of bytes, + * consider relabelling it as a "custom seed" instead, + * and employ "XXH3_generateSecret()" (see below) + * to generate a high entropy secret derived from the custom seed. + */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + + +/******* Streaming *******/ +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever applicable. + */ + +/*! + * @brief The state struct for the XXH3 streaming API. + * + * @see XXH3_state_s for details. + */ +typedef struct XXH3_state_s XXH3_state_t; +XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr); +XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state); + +/* + * XXH3_64bits_reset(): + * Initialize with default parameters. + * digest will be equivalent to `XXH3_64bits()`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr); +/* + * XXH3_64bits_reset_withSeed(): + * Generate a custom secret from `seed`, and store it into `statePtr`. + * digest will be equivalent to `XXH3_64bits_withSeed()`. + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); +/* + * XXH3_64bits_reset_withSecret(): + * `secret` is referenced, it _must outlive_ the hash streaming session. 
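A short sketch of the XXH3 64-bit one-shot entry points declared above; the data pointer, length, and seed are illustrative:

#include "xxhash.h"

XXH64_hash_t fast_hash(const void* data, size_t len)
{
    return XXH3_64bits(data, len);           /* default secret, seed of 0 */
}

XXH64_hash_t seeded_hash(const void* data, size_t len, XXH64_hash_t seed)
{
    return XXH3_64bits_withSeed(data, len, seed);
}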
+ * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`, + * and the quality of produced hash values depends on secret's entropy + * (secret's content should look like a bunch of random bytes). + * When in doubt about the randomness of a candidate `secret`, + * consider employing `XXH3_generateSecret()` instead (see below). + */ +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr); + +/* note : canonical representation of XXH3 is the same as XXH64 + * since they both produce XXH64_hash_t values */ + + +/*-********************************************************************** +* XXH3 128-bit variant +************************************************************************/ + +/*! + * @brief The return value from 128-bit hashes. + * + * Stored in little endian order, although the fields themselves are in native + * endianness. + */ +typedef struct { + XXH64_hash_t low64; /*!< `value & 0xFFFFFFFFFFFFFFFF` */ + XXH64_hash_t high64; /*!< `value >> 64` */ +} XXH128_hash_t; + +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + +/******* Streaming *******/ +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever applicable. + * + * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits(). + * Use already declared XXH3_createState() and XXH3_freeState(). + * + * All reset and streaming functions have same meaning as their 64-bit counterpart. + */ + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed); +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize); + +XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr); + +/* Following helper functions make it possible to compare XXH128_hast_t values. + * Since XXH128_hash_t is a structure, this capability is not offered by the language. + * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */ + +/*! + * XXH128_isEqual(): + * Return: 1 if `h1` and `h2` are equal, 0 if they are not. + */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2); + +/*! + * XXH128_cmp(): + * + * This comparator is compatible with stdlib's `qsort()`/`bsearch()`. 
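For the 128-bit helpers above, a sketch of equality testing and qsort-based ordering; the buffers and array are illustrative:

#include <stdlib.h>
#include "xxhash.h"

int same_content(const void* a, size_t la, const void* b, size_t lb)
{
    return XXH128_isEqual(XXH3_128bits(a, la), XXH3_128bits(b, lb));
}

void sort_hashes(XXH128_hash_t* hashes, size_t count)
{
    /* XXH128_cmp already has the (const void*, const void*) signature
     * that qsort() and bsearch() expect. */
    qsort(hashes, count, sizeof(XXH128_hash_t), XXH128_cmp);
}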
+ * + * return: >0 if *h128_1 > *h128_2 + * =0 if *h128_1 == *h128_2 + * <0 if *h128_1 < *h128_2 + */ +XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2); + + +/******* Canonical representation *******/ +typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t; +XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash); +XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src); + + +#endif /* XXH_NO_LONG_LONG */ + +/*! + * @} + */ +#endif /* XXHASH_H_5627135585666179 */ + + + +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) +#define XXHASH_H_STATIC_13879238742 +/* **************************************************************************** + * This section contains declarations which are not guaranteed to remain stable. + * They may change in future versions, becoming incompatible with a different + * version of the library. + * These declarations should only be used with static linking. + * Never use them in association with dynamic linking! + ***************************************************************************** */ + +/* + * These definitions are only present to allow static allocation + * of XXH states, on stack or in a struct, for example. + * Never **ever** access their members directly. + */ + +/*! + * @internal + * @brief Structure for XXH32 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH32_state_t. + * Do not access the members of this struct directly. + * @see XXH64_state_s, XXH3_state_s + */ +struct XXH32_state_s { + XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */ + XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */ + XXH32_hash_t v1; /*!< First accumulator lane */ + XXH32_hash_t v2; /*!< Second accumulator lane */ + XXH32_hash_t v3; /*!< Third accumulator lane */ + XXH32_hash_t v4; /*!< Fourth accumulator lane */ + XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */ + XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */ + XXH32_hash_t reserved; /*!< Reserved field. Do not read or write to it, it may be removed. */ +}; /* typedef'd to XXH32_state_t */ + + +#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ + +/*! + * @internal + * @brief Structure for XXH64 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH64_state_t. + * Do not access the members of this struct directly. + * @see XXH32_state_s, XXH3_state_s + */ +struct XXH64_state_s { + XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */ + XXH64_hash_t v1; /*!< First accumulator lane */ + XXH64_hash_t v2; /*!< Second accumulator lane */ + XXH64_hash_t v3; /*!< Third accumulator lane */ + XXH64_hash_t v4; /*!< Fourth accumulator lane */ + XXH64_hash_t mem64[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */ + XXH32_hash_t memsize; /*!< Amount of data in @ref mem64 */ + XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/ + XXH64_hash_t reserved64; /*!< Reserved field. 
Do not read or write to it, it may be removed. */
+}; /* typedef'd to XXH64_state_t */
+
+#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */
+# include <stdalign.h>
+# define XXH_ALIGN(n) alignas(n)
+#elif defined(__GNUC__)
+# define XXH_ALIGN(n) __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+# define XXH_ALIGN(n) __declspec(align(n))
+#else
+# define XXH_ALIGN(n) /* disabled */
+#endif
+
+/* Old GCC versions only accept the attribute after the type in structures. */
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \
+ && defined(__GNUC__)
+# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
+#else
+# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
+#endif
+
+/*!
+ * @brief The size of the internal XXH3 buffer.
+ *
+ * This is the optimal update size for incremental hashing.
+ *
+ * @see XXH3_64b_update(), XXH3_128b_update().
+ */
+#define XXH3_INTERNALBUFFER_SIZE 256
+
+/*!
+ * @brief Default size of the secret buffer (and @ref XXH3_kSecret).
+ *
+ * This is the size used in @ref XXH3_kSecret and the seeded functions.
+ *
+ * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
+ */
+#define XXH3_SECRET_DEFAULT_SIZE 192
+
+/*!
+ * @internal
+ * @brief Structure for XXH3 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * @note **This structure has a strict alignment requirement of 64 bytes.** Do
+ * not allocate this with `malloc()` or `new`, it will not be sufficiently
+ * aligned. Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack
+ * allocation.
+ *
+ * Typedef'd to @ref XXH3_state_t.
+ * Do not access the members of this struct directly.
+ *
+ * @see XXH3_INITSTATE() for stack initialization.
+ * @see XXH3_createState(), XXH3_freeState().
+ * @see XXH32_state_s, XXH64_state_s
+ */
+struct XXH3_state_s {
+ XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
+ /*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref XXH64_state_s */
+ XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+ /*!< Used to store a custom secret generated from a seed. */
+ XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
+ /*!< The internal buffer. @see XXH32_state_s::mem32 */
+ XXH32_hash_t bufferedSize;
+ /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
+ XXH32_hash_t reserved32;
+ /*!< Reserved field. Needed for padding on 64-bit. */
+ size_t nbStripesSoFar;
+ /*!< Number of stripes processed. */
+ XXH64_hash_t totalLen;
+ /*!< Total length hashed. 64-bit even on 32-bit targets. */
+ size_t nbStripesPerBlock;
+ /*!< Number of stripes per block. */
+ size_t secretLimit;
+ /*!< Size of @ref customSecret or @ref extSecret */
+ XXH64_hash_t seed;
+ /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */
+ XXH64_hash_t reserved64;
+ /*!< Reserved field. */
+ const unsigned char* extSecret;
+ /*!< Reference to an external secret for the _withSecret variants, NULL
+ * for other variants. */
+ /* note: there may be some padding at the end due to alignment on 64 bytes */
+}; /* typedef'd to XXH3_state_t */
+
+#undef XXH_ALIGN_MEMBER
+
+/*!
+ * @brief Initializes a stack-allocated `XXH3_state_s`.
+ * + * When the @ref XXH3_state_t structure is merely emplaced on stack, + * it should be initialized with XXH3_INITSTATE() or a memset() + * in case its first reset uses XXH3_NNbits_reset_withSeed(). + * This init can be omitted if the first reset uses default or _withSecret mode. + * This operation isn't necessary when the state is created with XXH3_createState(). + * Note that this doesn't prepare the state for a streaming operation, + * it's still necessary to use XXH3_NNbits_reset*() afterwards. + */ +#define XXH3_INITSTATE(XXH3_state_ptr) { (XXH3_state_ptr)->seed = 0; } + + +/* === Experimental API === */ +/* Symbols defined below must be considered tied to a specific library version. */ + +/* + * XXH3_generateSecret(): + * + * Derive a high-entropy secret from any user-defined content, named customSeed. + * The generated secret can be used in combination with `*_withSecret()` functions. + * The `_withSecret()` variants are useful to provide a higher level of protection than 64-bit seed, + * as it becomes much more difficult for an external actor to guess how to impact the calculation logic. + * + * The function accepts as input a custom seed of any length and any content, + * and derives from it a high-entropy secret of length XXH3_SECRET_DEFAULT_SIZE + * into an already allocated buffer secretBuffer. + * The generated secret is _always_ XXH_SECRET_DEFAULT_SIZE bytes long. + * + * The generated secret can then be used with any `*_withSecret()` variant. + * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`, + * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()` + * are part of this list. They all accept a `secret` parameter + * which must be very long for implementation reasons (>= XXH3_SECRET_SIZE_MIN) + * _and_ feature very high entropy (consist of random-looking bytes). + * These conditions can be a high bar to meet, so + * this function can be used to generate a secret of proper quality. + * + * customSeed can be anything. It can have any size, even small ones, + * and its content can be anything, even stupidly "low entropy" source such as a bunch of zeroes. + * The resulting `secret` will nonetheless provide all expected qualities. + * + * Supplying NULL as the customSeed copies the default secret into `secretBuffer`. + * When customSeedSize > 0, supplying NULL as customSeed is undefined behavior. + */ +XXH_PUBLIC_API void XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize); + + +/* simple short-cut to pre-selected XXH3_128bits variant */ +XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed); + + +#endif /* XXH_NO_LONG_LONG */ +#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) +# define XXH_IMPLEMENTATION +#endif + +#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */ + + +/* ======================================================================== */ +/* ======================================================================== */ +/* ======================================================================== */ + + +/*-********************************************************************** + * xxHash implementation + *-********************************************************************** + * xxHash's implementation used to be hosted inside xxhash.c. + * + * However, inlining requires implementation to be visible to the compiler, + * hence be included alongside the header. 
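Combining the two facilities documented above, XXH3_INITSTATE() for a stack-allocated state and XXH3_generateSecret() for deriving a usable secret, here is an editorial sketch. The custom seed string and input are illustrative; the struct definition is visible because this copy sets XXH_STATIC_LINKING_ONLY:

#include <string.h>
#include "xxhash.h"

XXH64_hash_t seeded_stream(const void* data, size_t len, XXH64_hash_t seed)
{
    XXH3_state_t state;              /* stack allocation, not malloc() */
    XXH3_INITSTATE(&state);          /* required before a _withSeed reset */
    XXH3_64bits_reset_withSeed(&state, seed);
    XXH3_64bits_update(&state, data, len);
    return XXH3_64bits_digest(&state);
}

XXH64_hash_t secret_hash(const void* data, size_t len)
{
    unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
    const char customSeed[] = "any bytes, even low entropy";
    XXH3_generateSecret(secret, customSeed, sizeof(customSeed) - 1);
    return XXH3_64bits_withSecret(data, len, secret, sizeof(secret));
}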
+ * Previously, implementation was hosted inside xxhash.c, + * which was then #included when inlining was activated. + * This construction created issues with a few build and install systems, + * as it required xxhash.c to be stored in /include directory. + * + * xxHash implementation is now directly integrated within xxhash.h. + * As a consequence, xxhash.c is no longer needed in /include. + * + * xxhash.c is still available and is still useful. + * In a "normal" setup, when xxhash is not inlined, + * xxhash.h only exposes the prototypes and public symbols, + * while xxhash.c can be built into an object file xxhash.o + * which can then be linked into the final binary. + ************************************************************************/ + +#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \ + || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387) +# define XXH_IMPLEM_13a8737387 + +/* ************************************* +* Tuning parameters +***************************************/ + +/*! + * @defgroup tuning Tuning parameters + * @{ + * + * Various macros to control xxHash's behavior. + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Define this to disable 64-bit code. + * + * Useful if only using the @ref xxh32_family and you have a strict C90 compiler. + */ +# define XXH_NO_LONG_LONG +# undef XXH_NO_LONG_LONG /* don't actually */ +/*! + * @brief Controls how unaligned memory is accessed. + * + * By default, access to unaligned memory is controlled by `memcpy()`, which is + * safe and portable. + * + * Unfortunately, on some target/compiler combinations, the generated assembly + * is sub-optimal. + * + * The below switch allow selection of a different access method + * in the search for improved performance. + * + * @par Possible options: + * + * - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy` + * @par + * Use `memcpy()`. Safe and portable. Note that most modern compilers will + * eliminate the function call and treat it as an unaligned access. + * + * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))` + * @par + * Depends on compiler extensions and is therefore not portable. + * This method is safe _if_ your compiler supports it, + * and *generally* as fast or faster than `memcpy`. + * + * - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast + * @par + * Casts directly and dereferences. This method doesn't depend on the + * compiler, but it violates the C standard as it directly dereferences an + * unaligned pointer. It can generate buggy code on targets which do not + * support unaligned memory accesses, but in some circumstances, it's the + * only known way to get the most performance. + * + * - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift + * @par + * Also portable. This can generate the best code on old compilers which don't + * inline small `memcpy()` calls, and it might also be faster on big-endian + * systems which lack a native byteswap instruction. However, some compilers + * will emit literal byteshifts even if the target supports unaligned access. + * . + * + * @warning + * Methods 1 and 2 rely on implementation-defined behavior. Use these with + * care, as what works on one compiler/platform/optimization level may cause + * another to read garbage data or even crash. + * + * See https://stackoverflow.com/a/32095106/646947 for details. + * + * Prefer these methods in priority order (0 > 3 > 1 > 2) + */ +# define XXH_FORCE_MEMORY_ACCESS 0 +/*! + * @def XXH_ACCEPT_NULL_INPUT_POINTER + * @brief Whether to add explicit `NULL` checks. 
+ * + * If the input pointer is `NULL` and the length is non-zero, xxHash's default + * behavior is to dereference it, triggering a segfault. + * + * When this macro is enabled, xxHash actively checks the input for a null pointer. + * If it is, the result for null input pointers is the same as a zero-length input. + */ +# define XXH_ACCEPT_NULL_INPUT_POINTER 0 +/*! + * @def XXH_FORCE_ALIGN_CHECK + * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32() + * and XXH64() only). + * + * This is an important performance trick for architectures without decent + * unaligned memory access performance. + * + * It checks for input alignment, and when conditions are met, uses a "fast + * path" employing direct 32-bit/64-bit reads, resulting in _dramatically + * faster_ read speed. + * + * The check costs one initial branch per hash, which is generally negligible, + * but not zero. + * + * Moreover, it's not useful to generate an additional code path if memory + * access uses the same instruction for both aligned and unaligned + * addresses (e.g. x86 and aarch64). + * + * In these cases, the alignment check can be removed by setting this macro to 0. + * Then the code will always use unaligned memory access. + * Align check is automatically disabled on x86, x64 & arm64, + * which are platforms known to offer good unaligned memory accesses performance. + * + * This option does not affect XXH3 (only XXH32 and XXH64). + */ +# define XXH_FORCE_ALIGN_CHECK 0 + +/*! + * @def XXH_NO_INLINE_HINTS + * @brief When non-zero, sets all functions to `static`. + * + * By default, xxHash tries to force the compiler to inline almost all internal + * functions. + * + * This can usually improve performance due to reduced jumping and improved + * constant folding, but significantly increases the size of the binary which + * might not be favorable. + * + * Additionally, sometimes the forced inlining can be detrimental to performance, + * depending on the architecture. + * + * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the + * compiler full control on whether to inline or not. + * + * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using + * -fno-inline with GCC or Clang, this will automatically be defined. + */ +# define XXH_NO_INLINE_HINTS 0 + +/*! + * @def XXH_REROLL + * @brief Whether to reroll `XXH32_finalize` and `XXH64_finalize`. + * + * For performance, `XXH32_finalize` and `XXH64_finalize` use an unrolled loop + * in the form of a switch statement. + * + * This is not always desirable, as it generates larger code, and depending on + * the architecture, may even be slower + * + * This is automatically defined with `-Os`/`-Oz` on GCC and Clang. + */ +# define XXH_REROLL 0 + +/*! + * @internal + * @brief Redefines old internal names. + * + * For compatibility with code that uses xxHash's internals before the names + * were changed to improve namespacing. There is no other reason to use this. + */ +# define XXH_OLD_NAMES +# undef XXH_OLD_NAMES /* don't actually use, it is ugly. */ +#endif /* XXH_DOXYGEN */ +/*! 
+ * @}
+ */
+
+#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
+ /* prefer __packed__ structures (method 1) for gcc on armv7 and armv8 */
+# if !defined(__clang__) && ( \
+ (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
+ (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)) )
+# define XXH_FORCE_MEMORY_ACCESS 1
+# endif
+#endif
+
+#ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */
+# define XXH_ACCEPT_NULL_INPUT_POINTER 0
+#endif
+
+#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
+# if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) \
+ || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) /* visual */
+# define XXH_FORCE_ALIGN_CHECK 0
+# else
+# define XXH_FORCE_ALIGN_CHECK 1
+# endif
+#endif
+
+#ifndef XXH_NO_INLINE_HINTS
+# if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
+ || defined(__NO_INLINE__) /* -O0, -fno-inline */
+# define XXH_NO_INLINE_HINTS 1
+# else
+# define XXH_NO_INLINE_HINTS 0
+# endif
+#endif
+
+#ifndef XXH_REROLL
+# if defined(__OPTIMIZE_SIZE__)
+# define XXH_REROLL 1
+# else
+# define XXH_REROLL 0
+# endif
+#endif
+
+/*!
+ * @defgroup impl Implementation
+ * @{
+ */
+
+
+/* *************************************
+* Includes & Memory related functions
+***************************************/
+/*
+ * Modify the local functions below should you wish to use
+ * different memory routines for malloc() and free()
+ */
+#include <stdlib.h>
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than malloc().
+ */
+static void* XXH_malloc(size_t s) { return malloc(s); }
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than free().
+ */
+static void XXH_free(void* p) { free(p); }
+
+#include <string.h>
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than memcpy().
+ */
+static void* XXH_memcpy(void* dest, const void* src, size_t size)
+{
+ return memcpy(dest,src,size);
+}
+
+#include <limits.h> /* ULLONG_MAX */
+
+
+/* *************************************
+* Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER /* Visual Studio warning fix */
+# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#endif
+
+#if XXH_NO_INLINE_HINTS /* disable inlining hints */
+# if defined(__GNUC__)
+# define XXH_FORCE_INLINE static __attribute__((unused))
+# else
+# define XXH_FORCE_INLINE static
+# endif
+# define XXH_NO_INLINE static
+/* enable inlining hints */
+#elif defined(_MSC_VER) /* Visual Studio */
+# define XXH_FORCE_INLINE static __forceinline
+# define XXH_NO_INLINE static __declspec(noinline)
+#elif defined(__GNUC__)
+# define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
+# define XXH_NO_INLINE static __attribute__((noinline))
+#elif defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */
+# define XXH_FORCE_INLINE static inline
+# define XXH_NO_INLINE static
+#else
+# define XXH_FORCE_INLINE static
+# define XXH_NO_INLINE static
+#endif
+
+
+
+/* *************************************
+* Debug
+***************************************/
+/*!
+ * @ingroup tuning
+ * @def XXH_DEBUGLEVEL
+ * @brief Sets the debugging level.
+ *
+ * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
+ * compiler's command line options. The value must be a number.
+ */
+#ifndef XXH_DEBUGLEVEL
+# ifdef DEBUGLEVEL /* backwards compat */
+# define XXH_DEBUGLEVEL DEBUGLEVEL
+# else
+# define XXH_DEBUGLEVEL 0
+# endif
+#endif
+
+#if (XXH_DEBUGLEVEL>=1)
+# include <assert.h> /* note: can still be disabled with NDEBUG */
+# define XXH_ASSERT(c) assert(c)
+#else
+# define XXH_ASSERT(c) ((void)0)
+#endif
+
+/* note: use after variable declarations */
+#define XXH_STATIC_ASSERT(c) do { enum { XXH_sa = 1/(int)(!!(c)) }; } while (0)
+
+/*!
+ * @internal
+ * @def XXH_COMPILER_GUARD(var)
+ * @brief Used to prevent unwanted optimizations for @p var.
+ *
+ * It uses an empty GCC inline assembly statement with a register constraint
+ * which forces @p var into a general purpose register (eg eax, ebx, ecx
+ * on x86) and marks it as modified.
+ *
+ * This is used in a few places to avoid unwanted autovectorization (e.g.
+ * XXH32_round()). All vectorization we want is explicit via intrinsics,
+ * and _usually_ isn't wanted elsewhere.
+ *
+ * We also use it to prevent unwanted constant folding for AArch64 in
+ * XXH3_initCustomSecret_scalar().
+ */
+#ifdef __GNUC__
+# define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var))
+#else
+# define XXH_COMPILER_GUARD(var) ((void)0)
+#endif
+
+/* *************************************
+* Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+ typedef uint8_t xxh_u8;
+#else
+ typedef unsigned char xxh_u8;
+#endif
+typedef XXH32_hash_t xxh_u32;
+
+#ifdef XXH_OLD_NAMES
+# define BYTE xxh_u8
+# define U8 xxh_u8
+# define U32 xxh_u32
+#endif
+
+/* *** Memory access *** */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_read32(const void* ptr)
+ * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit native endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readBE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit big endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)
+ * @brief Like @ref XXH_readLE32(), but has an option for aligned reads.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is
+ * always @ref XXH_alignment::XXH_unaligned.
+ *
+ * @param ptr The pointer to read from.
+ * @param align Whether @p ptr is aligned.
+ * @pre
+ * If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte
+ * aligned.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE32 and XXH_readBE32.
+ */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* + * Force direct memory access. Only works on CPU which support unaligned memory + * access in hardware. + */ +static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; } + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __pack instructions are safer but compiler specific, hence potentially + * problematic for some compilers. + * + * Currently only defined for GCC and ICC. + */ +#ifdef XXH_OLD_NAMES +typedef union { xxh_u32 u32; } __attribute__((packed)) unalign; +#endif +static xxh_u32 XXH_read32(const void* ptr) +{ + typedef union { xxh_u32 u32; } __attribute__((packed)) xxh_unalign; + return ((const xxh_unalign*)ptr)->u32; +} + +#else + +/* + * Portable and safe solution. Generally efficient. + * see: https://stackoverflow.com/a/32095106/646947 + */ +static xxh_u32 XXH_read32(const void* memPtr) +{ + xxh_u32 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + + +/* *** Endianness *** */ +/*! + * @ingroup tuning + * @def XXH_CPU_LITTLE_ENDIAN + * @brief Whether the target is little endian. + * + * Defined to 1 if the target is little endian, or 0 if it is big endian. + * It can be defined externally, for example on the compiler command line. + * + * If it is not defined, a runtime check (which is usually constant folded) + * is used instead. + * + * @note + * This is not necessarily defined to an integer constant. + * + * @see XXH_isLittleEndian() for the runtime check. + */ +#ifndef XXH_CPU_LITTLE_ENDIAN +/* + * Try to detect endianness automatically, to avoid the nonstandard behavior + * in `XXH_isLittleEndian()` + */ +# if defined(_WIN32) /* Windows is always little endian */ \ + || defined(__LITTLE_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 1 +# elif defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_CPU_LITTLE_ENDIAN 0 +# else +/*! + * @internal + * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN. + * + * Most compilers will constant fold this. + */ +static int XXH_isLittleEndian(void) +{ + /* + * Portable and well-defined behavior. + * Don't use static: it is detrimental to performance. + */ + const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +# endif +#endif + + + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifdef __has_builtin +# define XXH_HAS_BUILTIN(x) __has_builtin(x) +#else +# define XXH_HAS_BUILTIN(x) 0 +#endif + +/*! + * @internal + * @def XXH_rotl32(x,r) + * @brief 32-bit rotate left. + * + * @param x The 32-bit integer to be rotated. + * @param r The number of bits to rotate. + * @pre + * @p r > 0 && @p r < 32 + * @note + * @p x and @p r may be evaluated multiple times. + * @return The rotated result. 
+ */ +#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \ + && XXH_HAS_BUILTIN(__builtin_rotateleft64) +# define XXH_rotl32 __builtin_rotateleft32 +# define XXH_rotl64 __builtin_rotateleft64 +/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ +#elif defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) +#endif + +/*! + * @internal + * @fn xxh_u32 XXH_swap32(xxh_u32 x) + * @brief A 32-bit byteswap. + * + * @param x The 32-bit integer to byteswap. + * @return @p x, byteswapped. + */ +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* *************************** +* Memory reads +*****************************/ + +/*! + * @internal + * @brief Enum to indicate whether a pointer is aligned. + */ +typedef enum { + XXH_aligned, /*!< Aligned */ + XXH_unaligned /*!< Possibly unaligned */ +} XXH_alignment; + +/* + * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. + * + * This is ideal for older compilers which don't inline memcpy. + */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u32)bytePtr[1] << 8) + | ((xxh_u32)bytePtr[2] << 16) + | ((xxh_u32)bytePtr[3] << 24); +} + +XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[3] + | ((xxh_u32)bytePtr[2] << 8) + | ((xxh_u32)bytePtr[1] << 16) + | ((xxh_u32)bytePtr[0] << 24); +} + +#else +XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); +} + +static xxh_u32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u32 +XXH_readLE32_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) { + return XXH_readLE32(ptr); + } else { + return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr); + } +} + + +/* ************************************* +* Misc +***************************************/ +/*! @ingroup public */ +XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } + + +/* ******************************************************************* +* 32-bit hash functions +*********************************************************************/ +/*! 
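+ * (An illustrative sketch follows; the XXH32 implementation group
+ *  documentation resumes immediately after it.)
+ */
+
+/*
+ * Editor's note: the sketch below is NOT part of upstream xxHash; it only
+ * illustrates the contract of the read helpers defined above. XXH_readLE32()
+ * is a defined little-endian load on every host, XXH_readBE32() is its
+ * byte-swapped counterpart, and XXH_swap32() converts between the two.
+ * The function name XXH_example_readers_agree is hypothetical.
+ */
+XXH_FORCE_INLINE int XXH_example_readers_agree(void)
+{
+    const xxh_u8 bytes[4] = { 0x01, 0x02, 0x03, 0x04 };
+    /* A little-endian load yields 0x04030201 regardless of host endianness. */
+    return XXH_readLE32(bytes) == 0x04030201U
+        && XXH_readBE32(bytes) == 0x01020304U
+        && XXH_swap32(XXH_readLE32(bytes)) == XXH_readBE32(bytes);
+}
+
+/*!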
+ * @} + * @defgroup xxh32_impl XXH32 implementation + * @ingroup impl + * @{ + */ + /* #define instead of static const, to be used as initializers */ +#define XXH_PRIME32_1 0x9E3779B1U /*!< 0b10011110001101110111100110110001 */ +#define XXH_PRIME32_2 0x85EBCA77U /*!< 0b10000101111010111100101001110111 */ +#define XXH_PRIME32_3 0xC2B2AE3DU /*!< 0b11000010101100101010111000111101 */ +#define XXH_PRIME32_4 0x27D4EB2FU /*!< 0b00100111110101001110101100101111 */ +#define XXH_PRIME32_5 0x165667B1U /*!< 0b00010110010101100110011110110001 */ + +#ifdef XXH_OLD_NAMES +# define PRIME32_1 XXH_PRIME32_1 +# define PRIME32_2 XXH_PRIME32_2 +# define PRIME32_3 XXH_PRIME32_3 +# define PRIME32_4 XXH_PRIME32_4 +# define PRIME32_5 XXH_PRIME32_5 +#endif + +/*! + * @internal + * @brief Normal stripe processing routine. + * + * This shuffles the bits so that any bit from @p input impacts several bits in + * @p acc. + * + * @param acc The accumulator lane. + * @param input The stripe of input to mix. + * @return The mixed accumulator lane. + */ +static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input) +{ + acc += input * XXH_PRIME32_2; + acc = XXH_rotl32(acc, 13); + acc *= XXH_PRIME32_1; +#if (defined(__SSE4_1__) || defined(__aarch64__)) && !defined(XXH_ENABLE_AUTOVECTORIZE) + /* + * UGLY HACK: + * A compiler fence is the only thing that prevents GCC and Clang from + * autovectorizing the XXH32 loop (pragmas and attributes don't work for some + * reason) without globally disabling SSE4.1. + * + * The reason we want to avoid vectorization is because despite working on + * 4 integers at a time, there are multiple factors slowing XXH32 down on + * SSE4: + * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on + * newer chips!) making it slightly slower to multiply four integers at + * once compared to four integers independently. Even when pmulld was + * fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE + * just to multiply unless doing a long operation. + * + * - Four instructions are required to rotate, + * movqda tmp, v // not required with VEX encoding + * pslld tmp, 13 // tmp <<= 13 + * psrld v, 19 // x >>= 19 + * por v, tmp // x |= tmp + * compared to one for scalar: + * roll v, 13 // reliably fast across the board + * shldl v, v, 13 // Sandy Bridge and later prefer this for some reason + * + * - Instruction level parallelism is actually more beneficial here because + * the SIMD actually serializes this operation: While v1 is rotating, v2 + * can load data, while v3 can multiply. SSE forces them to operate + * together. + * + * This is also enabled on AArch64, as Clang autovectorizes it incorrectly + * and it is pointless writing a NEON implementation that is basically the + * same speed as scalar for XXH32. + */ + XXH_COMPILER_GUARD(acc); +#endif + return acc; +} + +/*! + * @internal + * @brief Mixes all bits to finalize the hash. + * + * The final mix ensures that all input bits have a chance to impact any bit in + * the output digest, resulting in an unbiased distribution. + * + * @param h32 The hash to avalanche. + * @return The avalanched hash. + */ +static xxh_u32 XXH32_avalanche(xxh_u32 h32) +{ + h32 ^= h32 >> 15; + h32 *= XXH_PRIME32_2; + h32 ^= h32 >> 13; + h32 *= XXH_PRIME32_3; + h32 ^= h32 >> 16; + return(h32); +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, align) + +/*! + * @internal + * @brief Processes the last 0-15 bytes of @p ptr. + * + * There may be up to 15 bytes remaining to consume from the input. 
+ * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param h32 The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 16. + * @param align Whether @p ptr is aligned. + * @return The finalized hash. + */ +static xxh_u32 +XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define XXH_PROCESS1 do { \ + h32 += (*ptr++) * XXH_PRIME32_5; \ + h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1; \ +} while (0) + +#define XXH_PROCESS4 do { \ + h32 += XXH_get32bits(ptr) * XXH_PRIME32_3; \ + ptr += 4; \ + h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \ +} while (0) + + /* Compact rerolled version */ + if (XXH_REROLL) { + len &= 15; + while (len >= 4) { + XXH_PROCESS4; + len -= 4; + } + while (len > 0) { + XXH_PROCESS1; + --len; + } + return XXH32_avalanche(h32); + } else { + switch(len&15) /* or switch(bEnd - p) */ { + case 12: XXH_PROCESS4; + FALLTHROUGH_INTENDED; + case 8: XXH_PROCESS4; + FALLTHROUGH_INTENDED; + case 4: XXH_PROCESS4; + return XXH32_avalanche(h32); + + case 13: XXH_PROCESS4; + FALLTHROUGH_INTENDED; + case 9: XXH_PROCESS4; + FALLTHROUGH_INTENDED; + case 5: XXH_PROCESS4; + XXH_PROCESS1; + return XXH32_avalanche(h32); + + case 14: XXH_PROCESS4; + FALLTHROUGH_INTENDED; + case 10: XXH_PROCESS4; + FALLTHROUGH_INTENDED; + case 6: XXH_PROCESS4; + XXH_PROCESS1; + XXH_PROCESS1; + return XXH32_avalanche(h32); + + case 15: XXH_PROCESS4; + FALLTHROUGH_INTENDED; + case 11: XXH_PROCESS4; + FALLTHROUGH_INTENDED; + case 7: XXH_PROCESS4; + FALLTHROUGH_INTENDED; + case 3: XXH_PROCESS1; + FALLTHROUGH_INTENDED; + case 2: XXH_PROCESS1; + FALLTHROUGH_INTENDED; + case 1: XXH_PROCESS1; + FALLTHROUGH_INTENDED; + case 0: return XXH32_avalanche(h32); + } + XXH_ASSERT(0); + return h32; /* reaching this point is deemed impossible */ + } +} + +#ifdef XXH_OLD_NAMES +# define PROCESS1 XXH_PROCESS1 +# define PROCESS4 XXH_PROCESS4 +#else +# undef XXH_PROCESS1 +# undef XXH_PROCESS4 +#endif + +/*! + * @internal + * @brief The implementation for @ref XXH32(). + * + * @param input, len, seed Directly passed from @ref XXH32(). + * @param align Whether @p input is aligned. + * @return The calculated hash. + */ +XXH_FORCE_INLINE xxh_u32 +XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) +{ + const xxh_u8* bEnd = input ? input + len : NULL; + xxh_u32 h32; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + len=0; + bEnd=input=(const xxh_u8*)(size_t)16; + } +#endif + + if (len>=16) { + const xxh_u8* const limit = bEnd - 15; + xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + xxh_u32 v2 = seed + XXH_PRIME32_2; + xxh_u32 v3 = seed + 0; + xxh_u32 v4 = seed - XXH_PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; + v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; + v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; + v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; + } while (input < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + XXH_PRIME32_5; + } + + h32 += (xxh_u32)len; + + return XXH32_finalize(h32, input, len&15, align); +} + +/*! 
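+ * (An illustrative usage sketch follows; the public XXH32() definition and
+ *  its @ingroup tag resume immediately after it.)
+ */
+
+/*
+ * Editor's note: illustrative sketch, not part of upstream xxHash. It shows
+ * the intended use of the one-shot entry point defined just below (and
+ * declared earlier in this header). The name XXH_example_hash_cstring is
+ * hypothetical.
+ */
+XXH_FORCE_INLINE XXH32_hash_t XXH_example_hash_cstring(const char* str, XXH32_hash_t seed)
+{
+    /* strlen() excludes the terminating NUL, so "abc" hashes only 3 bytes;
+     * any 32-bit seed selects a different but equally valid hash. */
+    return XXH32(str, strlen(str), seed);
+}
+
+/*!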
@ingroup xxh32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, (const xxh_u8*)input, len); + return XXH32_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); +#endif +} + + + +/******* Hash streaming *******/ +/*! + * @ingroup xxh32_family + */ +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +/*! @ingroup xxh32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup xxh32_family */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! @ingroup xxh32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) +{ + XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + state.v2 = seed + XXH_PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - XXH_PRIME32_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + + +/*! @ingroup xxh32_family */ +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); + state->memsize += (XXH32_hash_t)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const xxh_u32* p32 = state->mem32; + state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++; + state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++; + state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++; + state->v4 = XXH32_round(state->v4, XXH_readLE32(p32)); + } + p += 16-state->memsize; + state->memsize = 0; + } + + /* uintptr_t casts avoid UB or compiler warning on out-of-bounds + * pointer arithmetic */ + if ((uintptr_t)p <= (uintptr_t)bEnd - 16) { + const uintptr_t limit = (uintptr_t)bEnd - 16; + xxh_u32 v1 = state->v1; + xxh_u32 v2 = state->v2; + xxh_u32 v3 = state->v3; + xxh_u32 v4 = state->v4; + + do { + v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4; + v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4; + v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4; + v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4; + } while ((uintptr_t)p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + 
XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +/*! @ingroup xxh32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state) +{ + xxh_u32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v1, 1) + + XXH_rotl32(state->v2, 7) + + XXH_rotl32(state->v3, 12) + + XXH_rotl32(state->v4, 18); + } else { + h32 = state->v3 /* == seed */ + XXH_PRIME32_5; + } + + h32 += state->total_len_32; + + return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned); +} + + +/******* Canonical representation *******/ + +/*! + * @ingroup xxh32_family + * The default return values from XXH functions are unsigned 32 and 64 bit + * integers. + * + * The canonical representation uses big endian convention, the same convention + * as human-readable numbers (large digits first). + * + * This way, hash values can be written into a file or buffer, remaining + * comparable across different systems. + * + * The following functions allow transformation of hash values to and from their + * canonical format. + */ +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash); + memcpy(dst, &hash, sizeof(*dst)); +} +/*! @ingroup xxh32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src) +{ + return XXH_readBE32(src); +} + + +#ifndef XXH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ +/*! + * @} + * @ingroup impl + * @{ + */ +/******* Memory access *******/ + +typedef XXH64_hash_t xxh_u64; + +#ifdef XXH_OLD_NAMES +# define U64 xxh_u64 +#endif + +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) +/* + * Manual byteshift. Best for old compilers which don't inline memcpy. + * We actually directly use XXH_readLE64 and XXH_readBE64. + */ +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + return *(const xxh_u64*) memPtr; +} + +#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) + +/* + * __pack instructions are safer, but compiler specific, hence potentially + * problematic for some compilers. + * + * Currently only defined for GCC and ICC. + */ +#ifdef XXH_OLD_NAMES +typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; +#endif +static xxh_u64 XXH_read64(const void* ptr) +{ + typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) xxh_unalign64; + return ((const xxh_unalign64*)ptr)->u64; +} + +#else + +/* + * Portable and safe solution. Generally efficient. 
+ * see: https://stackoverflow.com/a/32095106/646947 + */ +static xxh_u64 XXH_read64(const void* memPtr) +{ + xxh_u64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap64 __builtin_bswap64 +#else +static xxh_u64 XXH_swap64(xxh_u64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + + +/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */ +#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3)) + +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[0] + | ((xxh_u64)bytePtr[1] << 8) + | ((xxh_u64)bytePtr[2] << 16) + | ((xxh_u64)bytePtr[3] << 24) + | ((xxh_u64)bytePtr[4] << 32) + | ((xxh_u64)bytePtr[5] << 40) + | ((xxh_u64)bytePtr[6] << 48) + | ((xxh_u64)bytePtr[7] << 56); +} + +XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr) +{ + const xxh_u8* bytePtr = (const xxh_u8 *)memPtr; + return bytePtr[7] + | ((xxh_u64)bytePtr[6] << 8) + | ((xxh_u64)bytePtr[5] << 16) + | ((xxh_u64)bytePtr[4] << 24) + | ((xxh_u64)bytePtr[3] << 32) + | ((xxh_u64)bytePtr[2] << 40) + | ((xxh_u64)bytePtr[1] << 48) + | ((xxh_u64)bytePtr[0] << 56); +} + +#else +XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); +} + +static xxh_u64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} +#endif + +XXH_FORCE_INLINE xxh_u64 +XXH_readLE64_align(const void* ptr, XXH_alignment align) +{ + if (align==XXH_unaligned) + return XXH_readLE64(ptr); + else + return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr); +} + + +/******* xxh64 *******/ +/*! 
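+ * (An illustrative sketch follows; the XXH64 implementation group
+ *  documentation resumes immediately after it.)
+ */
+
+/*
+ * Editor's note: illustrative sketch, not part of upstream xxHash. It ties
+ * together the XXH32 pieces defined above: feeding the input incrementally
+ * through the streaming state produces the same digest as the one-shot call,
+ * and the canonical (big-endian) form round-trips losslessly.
+ * The name XXH_example_xxh32_consistency is hypothetical.
+ */
+XXH_FORCE_INLINE int XXH_example_xxh32_consistency(const xxh_u8* data, size_t size,
+                                                   XXH32_hash_t seed)
+{
+    XXH32_hash_t const oneshot = XXH32(data, size, seed);
+    XXH32_state_t state;
+    XXH32_canonical_t canon;
+    XXH32_hash_t streamed;
+    size_t i;
+
+    /* Same bytes, same seed, delivered one byte at a time. */
+    XXH32_reset(&state, seed);
+    for (i = 0; i < size; i++) {
+        XXH32_update(&state, data + i, 1);
+    }
+    streamed = XXH32_digest(&state);
+
+    /* Canonical form: a portable big-endian serialization of the hash. */
+    XXH32_canonicalFromHash(&canon, oneshot);
+
+    return (streamed == oneshot)
+        && (XXH32_hashFromCanonical(&canon) == oneshot);
+}
+
+/*!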
+ * @} + * @defgroup xxh64_impl XXH64 implementation + * @ingroup impl + * @{ + */ +/* #define rather that static const, to be used as initializers */ +#define XXH_PRIME64_1 0x9E3779B185EBCA87ULL /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */ +#define XXH_PRIME64_2 0xC2B2AE3D27D4EB4FULL /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */ +#define XXH_PRIME64_3 0x165667B19E3779F9ULL /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */ +#define XXH_PRIME64_4 0x85EBCA77C2B2AE63ULL /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */ +#define XXH_PRIME64_5 0x27D4EB2F165667C5ULL /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */ + +#ifdef XXH_OLD_NAMES +# define PRIME64_1 XXH_PRIME64_1 +# define PRIME64_2 XXH_PRIME64_2 +# define PRIME64_3 XXH_PRIME64_3 +# define PRIME64_4 XXH_PRIME64_4 +# define PRIME64_5 XXH_PRIME64_5 +#endif + +static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) +{ + acc += input * XXH_PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= XXH_PRIME64_1; + return acc; +} + +static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4; + return acc; +} + +static xxh_u64 XXH64_avalanche(xxh_u64 h64) +{ + h64 ^= h64 >> 33; + h64 *= XXH_PRIME64_2; + h64 ^= h64 >> 29; + h64 *= XXH_PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, align) + +static xxh_u64 +XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ + len &= 31; + while (len >= 8) { + xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); + ptr += 8; + h64 ^= k1; + h64 = XXH_rotl64(h64,27) * XXH_PRIME64_1 + XXH_PRIME64_4; + len -= 8; + } + if (len >= 4) { + h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; + ptr += 4; + h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; + len -= 4; + } + while (len > 0) { + h64 ^= (*ptr++) * XXH_PRIME64_5; + h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1; + --len; + } + return XXH64_avalanche(h64); +} + +#ifdef XXH_OLD_NAMES +# define PROCESS1_64 XXH_PROCESS1_64 +# define PROCESS4_64 XXH_PROCESS4_64 +# define PROCESS8_64 XXH_PROCESS8_64 +#else +# undef XXH_PROCESS1_64 +# undef XXH_PROCESS4_64 +# undef XXH_PROCESS8_64 +#endif + +XXH_FORCE_INLINE xxh_u64 +XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) +{ + const xxh_u8* bEnd = input ? input + len : NULL; + xxh_u64 h64; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (input==NULL) { + len=0; + bEnd=input=(const xxh_u8*)(size_t)32; + } +#endif + + if (len>=32) { + const xxh_u8* const limit = bEnd - 32; + xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + xxh_u64 v2 = seed + XXH_PRIME64_2; + xxh_u64 v3 = seed + 0; + xxh_u64 v4 = seed - XXH_PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; + v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; + v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; + v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; + } while (input<=limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } else { + h64 = seed + XXH_PRIME64_5; + } + + h64 += (xxh_u64) len; + + return XXH64_finalize(h64, input, len, align); +} + + +/*! 
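+ * (An illustrative usage sketch follows; the public XXH64() definition and
+ *  its @ingroup tag resume immediately after it.)
+ */
+
+/*
+ * Editor's note: illustrative sketch, not part of upstream xxHash. As the
+ * accumulator setup above shows, the seed enters XXH64 through v1..v4, so
+ * the same bytes hashed under two different seeds give unrelated-looking
+ * results. The type XXH_example_record and the function name
+ * XXH_example_hash_record are hypothetical.
+ */
+typedef struct { xxh_u32 id; xxh_u64 timestamp; } XXH_example_record;
+
+XXH_FORCE_INLINE XXH64_hash_t XXH_example_hash_record(const XXH_example_record* rec,
+                                                      XXH64_hash_t seed)
+{
+    /* Hashing raw object bytes also hashes padding bytes, so zero-initialize
+     * such records (or hash the fields individually) for stable results. */
+    return XXH64(rec, sizeof(*rec), seed);
+}
+
+/*!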
@ingroup xxh64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, (const xxh_u8*)input, len); + return XXH64_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); + +#endif +} + +/******* Hash Streaming *******/ + +/*! @ingroup xxh64_family*/ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +/*! @ingroup xxh64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup xxh64_family */ +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! @ingroup xxh64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed) +{ + XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + state.v2 = seed + XXH_PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - XXH_PRIME64_1; + /* do not write into reserved64, might be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64)); + return XXH_OK; +} + +/*! @ingroup xxh64_family */ +XXH_PUBLIC_API XXH_errorcode +XXH64_update (XXH64_state_t* state, const void* input, size_t len) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); + state->memsize += (xxh_u32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0)); + state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1)); + state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2)); + state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3)); + p += 32 - state->memsize; + state->memsize = 0; + } + + /* uintptr_t casts avoid UB or compiler warning on out-of-bounds + * pointer arithmetic */ + if ((uintptr_t)p + 32 <= (uintptr_t)bEnd) { + const uintptr_t limit = (uintptr_t)bEnd - 32; + xxh_u64 v1 = state->v1; + xxh_u64 v2 = state->v2; + xxh_u64 v3 = state->v3; + xxh_u64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8; + v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8; + v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8; + v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8; + } while ((uintptr_t)p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +/*! 
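+ * (An illustrative usage sketch follows; the XXH64_digest() definition and
+ *  its @ingroup tag resume immediately after it.)
+ */
+
+/*
+ * Editor's note: illustrative sketch, not part of upstream xxHash. The
+ * streaming functions above buffer partial stripes in state->mem64, so input
+ * may arrive in chunks of any size and the digest equals the one-shot hash
+ * of the concatenated data. The name XXH_example_hash_two_parts is
+ * hypothetical.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH_example_hash_two_parts(const void* part1, size_t size1,
+                           const void* part2, size_t size2,
+                           XXH64_hash_t seed)
+{
+    XXH64_state_t state;
+    XXH64_reset(&state, seed);
+    XXH64_update(&state, part1, size1);
+    XXH64_update(&state, part2, size2);
+    /* Equals XXH64() applied to part1 followed by part2 with the same seed. */
+    return XXH64_digest(&state);
+}
+
+/*!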
@ingroup xxh64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state) +{ + xxh_u64 h64; + + if (state->total_len >= 32) { + xxh_u64 const v1 = state->v1; + xxh_u64 const v2 = state->v2; + xxh_u64 const v3 = state->v3; + xxh_u64 const v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + } else { + h64 = state->v3 /*seed*/ + XXH_PRIME64_5; + } + + h64 += (xxh_u64) state->total_len; + + return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); +} + + +/******* Canonical representation *******/ + +/*! @ingroup xxh64_family */ +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +/*! @ingroup xxh64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} + +#ifndef XXH_NO_XXH3 + +/* ********************************************************************* +* XXH3 +* New generation hash designed for speed on small keys and vectorization +************************************************************************ */ +/*! + * @} + * @defgroup xxh3_impl XXH3 implementation + * @ingroup impl + * @{ + */ + +/* === Compiler specifics === */ + +#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */ +# define XXH_RESTRICT /* disable */ +#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ +# define XXH_RESTRICT restrict +#else +/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */ +# define XXH_RESTRICT /* disable */ +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) \ + || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ + || defined(__clang__) +# define XXH_likely(x) __builtin_expect(x, 1) +# define XXH_unlikely(x) __builtin_expect(x, 0) +#else +# define XXH_likely(x) (x) +# define XXH_unlikely(x) (x) +#endif + +#if defined(__GNUC__) +# if defined(__AVX2__) +# include +# elif defined(__SSE2__) +# include +# elif defined(__ARM_NEON__) || defined(__ARM_NEON) +# define inline __inline__ /* circumvent a clang bug */ +# include +# undef inline +# endif +#elif defined(_MSC_VER) +# include +#endif + +/* + * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while + * remaining a true 64-bit/128-bit hash function. + * + * This is done by prioritizing a subset of 64-bit operations that can be + * emulated without too many steps on the average 32-bit machine. + * + * For example, these two lines seem similar, and run equally fast on 64-bit: + * + * xxh_u64 x; + * x ^= (x >> 47); // good + * x ^= (x >> 13); // bad + * + * However, to a 32-bit machine, there is a major difference. + * + * x ^= (x >> 47) looks like this: + * + * x.lo ^= (x.hi >> (47 - 32)); + * + * while x ^= (x >> 13) looks like this: + * + * // note: funnel shifts are not usually cheap. + * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); + * x.hi ^= (x.hi >> 13); + * + * The first one is significantly faster than the second, simply because the + * shift is larger than 32. 
This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the lower + * 32 bits in the shift. + * - The shift result will always fit in the lower 32 bits, and therefore, + * we can ignore the upper 32 bits in the xor. + * + * Thanks to this optimization, XXH3 only requires these features to be efficient: + * + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent byteswap helps short inputs. + * + * The first two are already required by XXH32, and almost all 32-bit and 64-bit + * platforms which can run XXH32 can run XXH3 efficiently. + * + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one + * notable exception. + * + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. + * + * Second of all, the 8 functional registers are just not enough. + * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need + * Lo registers, and this shuffling results in thousands more MOVs than A32. + * + * A32 and T32 don't have this limitation. They can access all 14 registers, + * do a 32->64 multiply with UMULL, and the flexible operand allowing free + * shifts is helpful, too. + * + * Therefore, we do a quick sanity check. + * + * If compiling Thumb-1 for a target which supports ARM instructions, we will + * emit a warning, as it is not a "sane" platform to compile for. + * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. + * + * Credit: large sections of the vectorial and asm source code paths + * have been contributed by @easyaspi314 + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ + +#ifdef XXH_DOXYGEN +/*! + * @ingroup tuning + * @brief Overrides the vectorization implementation chosen for XXH3. + * + * Can be defined to 0 to disable SIMD or any of the values mentioned in + * @ref XXH_VECTOR_TYPE. + * + * If this is not defined, it uses predefined macros to determine the best + * implementation. + */ +# define XXH_VECTOR XXH_SCALAR +/*! + * @ingroup tuning + * @brief Possible values for @ref XXH_VECTOR. + * + * Note that these are actually implemented as macros. + * + * If this is not defined, it is detected automatically. + * @ref XXH_X86DISPATCH overrides this. + */ +enum XXH_VECTOR_TYPE /* fake enum */ { + XXH_SCALAR = 0, /*!< Portable scalar version */ + XXH_SSE2 = 1, /*!< + * SSE2 for Pentium 4, Opteron, all x86_64. + * + * @note SSE2 is also guaranteed on Windows 10, macOS, and + * Android x86. + */ + XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */ + XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */ + XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */ + XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */ +}; +/*! + * @ingroup tuning + * @brief Selects the minimum alignment for XXH3's accumulators. + * + * When using SIMD, this should match the alignment reqired for said vector + * type, so, for example, 32 for AVX2. + * + * Default: Auto detected. 
+ */ +# define XXH_ACC_ALIGN 8 +#endif + +/* Actual definition */ +#ifndef XXH_DOXYGEN +# define XXH_SCALAR 0 +# define XXH_SSE2 1 +# define XXH_AVX2 2 +# define XXH_AVX512 3 +# define XXH_NEON 4 +# define XXH_VSX 5 +#endif + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif defined(__GNUC__) /* msvc support maybe later */ \ + && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \ + && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) +# define XXH_VECTOR XXH_NEON +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* + * Controls the alignment of the accumulator, + * for compatibility with aligned vector loads, which are usually faster. + */ +#ifndef XXH_ACC_ALIGN +# if defined(XXH_X86DISPATCH) +# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ +# elif XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \ + || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#else +# define XXH_SEC_ALIGN 8 +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. + */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + + +#if XXH_VECTOR == XXH_NEON +/* + * NEON's setup for vmlal_u32 is a little more complicated than it is on + * SSE2, AVX2, and VSX. + * + * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast. + * + * To do the same operation, the 128-bit 'Q' register needs to be split into + * two 64-bit 'D' registers, performing this operation:: + * + * [ a | b ] + * | '---------. .--------' | + * | x | + * | .---------' '--------. 
| + * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ] + * + * Due to significant changes in aarch64, the fastest method for aarch64 is + * completely different than the fastest method for ARMv7-A. + * + * ARMv7-A treats D registers as unions overlaying Q registers, so modifying + * D11 will modify the high half of Q5. This is similar to how modifying AH + * will only affect bits 8-15 of AX on x86. + * + * VZIP takes two registers, and puts even lanes in one register and odd lanes + * in the other. + * + * On ARMv7-A, this strangely modifies both parameters in place instead of + * taking the usual 3-operand form. + * + * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the + * lower and upper halves of the Q register to end up with the high and low + * halves where we want - all in one instruction. + * + * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] } + * + * Unfortunately we need inline assembly for this: Instructions modifying two + * registers at once is not possible in GCC or Clang's IR, and they have to + * create a copy. + * + * aarch64 requires a different approach. + * + * In order to make it easier to write a decent compiler for aarch64, many + * quirks were removed, such as conditional execution. + * + * NEON was also affected by this. + * + * aarch64 cannot access the high bits of a Q-form register, and writes to a + * D-form register zero the high bits, similar to how writes to W-form scalar + * registers (or DWORD registers on x86_64) work. + * + * The formerly free vget_high intrinsics now require a vext (with a few + * exceptions) + * + * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent + * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one + * operand. + * + * The equivalent of the VZIP.32 on the lower and upper halves would be this + * mess: + * + * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] } + * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] } + * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] } + * + * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN): + * + * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32); + * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF); + * + * This is available on ARMv7-A, but is less efficient than a single VZIP.32. + */ + +/*! + * Function-like macro: + * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi) + * { + * outLo = (uint32x2_t)(in & 0xFFFFFFFF); + * outHi = (uint32x2_t)(in >> 32); + * in = UNDEFINED; + * } + */ +# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \ + && defined(__GNUC__) \ + && !defined(__aarch64__) && !defined(__arm64__) +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \ + /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \ + /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \ + __asm__("vzip.32 %e0, %f0" : "+w" (in)); \ + (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \ + (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \ + } while (0) +# else +# define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \ + do { \ + (outLo) = vmovn_u64 (in); \ + (outHi) = vshrn_n_u64 ((in), 32); \ + } while (0) +# endif +#endif /* XXH_VECTOR == XXH_NEON */ + +/* + * VSX and Z Vector helpers. + * + * This is very messy, and any pull requests to clean this up are welcome. 
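+ * (A scalar reference sketch for the NEON lane split described above
+ *  follows; the VSX notes continue after it.)
+ */
+
+/*
+ * Editor's note: illustrative sketch, not part of upstream xxHash. It is the
+ * plain scalar equivalent of XXH_SPLIT_IN_PLACE(): each 64-bit lane is
+ * separated into its low and high 32-bit halves. The name
+ * XXH_example_split_lane is hypothetical.
+ */
+XXH_FORCE_INLINE void XXH_example_split_lane(xxh_u64 lane,
+                                             xxh_u32* outLo, xxh_u32* outHi)
+{
+    *outLo = (xxh_u32)(lane & 0xFFFFFFFFU);  /* like vmovn_u64 / XTN        */
+    *outHi = (xxh_u32)(lane >> 32);          /* like vshrn_n_u64(lane, 32)  */
+}
+
+/*
+ * (VSX and Z Vector notes, continued:)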
+ * + * There are a lot of problems with supporting VSX and s390x, due to + * inconsistent intrinsics, spotty coverage, and multiple endiannesses. + */ +#if XXH_VECTOR == XXH_VSX +# if defined(__s390x__) +# include +# else +/* gcc's altivec.h can have the unwanted consequence to unconditionally + * #define bool, vector, and pixel keywords, + * with bad consequences for programs already using these keywords for other purposes. + * The paragraph defining these macros is skipped when __APPLE_ALTIVEC__ is defined. + * __APPLE_ALTIVEC__ is _generally_ defined automatically by the compiler, + * but it seems that, in some cases, it isn't. + * Force the build macro to be defined, so that keywords are not altered. + */ +# if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__) +# define __APPLE_ALTIVEC__ +# endif +# include +# endif + +typedef __vector unsigned long long xxh_u64x2; +typedef __vector unsigned char xxh_u8x16; +typedef __vector unsigned xxh_u32x4; + +# ifndef XXH_VSX_BE +# if defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXH_VSX_BE 1 +# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ +# warning "-maltivec=be is not recommended. Please use native endianness." +# define XXH_VSX_BE 1 +# else +# define XXH_VSX_BE 0 +# endif +# endif /* !defined(XXH_VSX_BE) */ + +# if XXH_VSX_BE +# if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__)) +# define XXH_vec_revb vec_revb +# else +/*! + * A polyfill for POWER9's vec_revb(). + */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val) +{ + xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, + 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; + return vec_perm(val, val, vByteSwap); +} +# endif +# endif /* XXH_VSX_BE */ + +/*! + * Performs an unaligned vector load and byte swaps it on big endian. + */ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr) +{ + xxh_u64x2 ret; + memcpy(&ret, ptr, sizeof(xxh_u64x2)); +# if XXH_VSX_BE + ret = XXH_vec_revb(ret); +# endif + return ret; +} + +/* + * vec_mulo and vec_mule are very problematic intrinsics on PowerPC + * + * These intrinsics weren't added until GCC 8, despite existing for a while, + * and they are endian dependent. Also, their meaning swap depending on version. + * */ +# if defined(__s390x__) + /* s390x is always big endian, no issue on this platform */ +# define XXH_vec_mulo vec_mulo +# define XXH_vec_mule vec_mule +# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) +/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */ +# define XXH_vec_mulo __builtin_altivec_vmulouw +# define XXH_vec_mule __builtin_altivec_vmuleuw +# else +/* gcc needs inline assembly */ +/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. 
*/ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +# endif /* XXH_vec_mulo, XXH_vec_mule */ +#endif /* XXH_VECTOR == XXH_VSX */ + + +/* prefetch + * can be disabled, by declaring XXH_NO_PREFETCH build macro */ +#if defined(XXH_NO_PREFETCH) +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +#else +# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */ +# include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# else +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* XXH_NO_PREFETCH */ + + +/* ========================================== + * XXH3 default settings + * ========================================== */ + +#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ + +#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) +# error "default keyset is not large enough" +#endif + +/*! Pseudorandom secret taken directly from FARSH. */ +XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, + 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, +}; + + +#ifdef XXH_OLD_NAMES +# define kSecret XXH3_kSecret +#endif + +#ifdef XXH_DOXYGEN +/*! + * @brief Calculates a 32-bit to 64-bit long multiply. + * + * Implemented as a macro. + * + * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't + * need to (but it shouldn't need to anyways, it is about 7 instructions to do + * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we + * use that instead of the normal method. + * + * If you are compiling for platforms like Thumb-1 and don't have a better option, + * you may also want to write your own long multiply routine here. + * + * @param x, y Numbers to be multiplied + * @return 64-bit product of the low 32 bits of @p x and @p y. 
+ */ +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64(xxh_u64 x, xxh_u64 y) +{ + return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); +} +#elif defined(_MSC_VER) && defined(_M_IX86) +# include +# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) +#else +/* + * Downcast + upcast is usually better than masking on older compilers like + * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. + * + * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands + * and perform a full 64x64 multiply -- entirely redundant on 32-bit. + */ +# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) +#endif + +/*! + * @brief Calculates a 64->128-bit long multiply. + * + * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar + * version. + * + * @param lhs, rhs The 64-bit integers to be multiplied + * @return The 128-bit result represented in an @ref XXH128_hash_t. + */ +static XXH128_hash_t +XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) +{ + /* + * GCC/Clang __uint128_t method. + * + * On most 64-bit targets, GCC and Clang define a __uint128_t type. + * This is usually the best way as it usually uses a native long 64-bit + * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. + * + * Usually. + * + * Despite being a 32-bit platform, Clang (and emscripten) define this type + * despite not having the arithmetic for it. This results in a laggy + * compiler builtin call which calculates a full 128-bit multiply. + * In that case it is best to use the portable one. + * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 + */ +#if defined(__GNUC__) && !defined(__wasm__) \ + && defined(__SIZEOF_INT128__) \ + || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + + __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; + XXH128_hash_t r128; + r128.low64 = (xxh_u64)(product); + r128.high64 = (xxh_u64)(product >> 64); + return r128; + + /* + * MSVC for x64's _umul128 method. + * + * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); + * + * This compiles to single operand MUL on x64. + */ +#elif defined(_M_X64) || defined(_M_IA64) + +#ifndef _MSC_VER +# pragma intrinsic(_umul128) +#endif + xxh_u64 product_high; + xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); + XXH128_hash_t r128; + r128.low64 = product_low; + r128.high64 = product_high; + return r128; + +#else + /* + * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. + * + * This is a fast and simple grade school multiply, which is shown below + * with base 10 arithmetic instead of base 0x100000000. + * + * 9 3 // D2 lhs = 93 + * x 7 5 // D2 rhs = 75 + * ---------- + * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 + * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 + * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 + * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 + * --------- + * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 + * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 + * --------- + * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 + * + * The reasons for adding the products like this are: + * 1. It avoids manual carry tracking. Just like how + * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. + * This avoids a lot of complexity. + * + * 2. 
It hints for, and on Clang, compiles to, the powerful UMAAL + * instruction available in ARM's Digital Signal Processing extension + * in 32-bit ARMv6 and later, which is shown below: + * + * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) + * { + * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; + * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); + * *RdHi = (xxh_u32)(product >> 32); + * } + * + * This instruction was designed for efficient long multiplication, and + * allows this to be calculated in only 4 instructions at speeds + * comparable to some 64-bit ALUs. + * + * 3. It isn't terrible on other platforms. Usually this will be a couple + * of 32-bit ADD/ADCs. + */ + + /* First calculate all of the cross products. */ + xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); + xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); + xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); + xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); + + /* Now add the products together. These will never overflow. */ + xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; + xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; + xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); + + XXH128_hash_t r128; + r128.low64 = lower; + r128.high64 = upper; + return r128; +#endif +} + +/*! + * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it. + * + * The reason for the separate function is to prevent passing too many structs + * around by value. This will hopefully inline the multiply, but we don't force it. + * + * @param lhs, rhs The 64-bit integers to multiply + * @return The low 64 bits of the product XOR'd by the high 64 bits. + * @see XXH_mult64to128() + */ +static xxh_u64 +XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) +{ + XXH128_hash_t product = XXH_mult64to128(lhs, rhs); + return product.low64 ^ product.high64; +} + +/*! Seems to produce slightly better code on GCC for some reason. */ +XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) +{ + XXH_ASSERT(0 <= shift && shift < 64); + return v64 ^ (v64 >> shift); +} + +/* + * This is a fast avalanche stage, + * suitable when input bits are already partially mixed + */ +static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) +{ + h64 = XXH_xorshift64(h64, 37); + h64 *= 0x165667919E3779F9ULL; + h64 = XXH_xorshift64(h64, 32); + return h64; +} + +/* + * This is a stronger avalanche, + * inspired by Pelle Evensen's rrmxmx + * preferable when input has not been previously mixed + */ +static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) +{ + /* this mix is inspired by Pelle Evensen's rrmxmx */ + h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24); + h64 *= 0x9FB21C651E98DF25ULL; + h64 ^= (h64 >> 35) + len ; + h64 *= 0x9FB21C651E98DF25ULL; + return XXH_xorshift64(h64, 28); +} + + +/* ========================================== + * Short keys + * ========================================== + * One of the shortcomings of XXH32 and XXH64 was that their performance was + * sub-optimal on short lengths. It used an iterative algorithm which strongly + * favored lengths that were a multiple of 4 or 8. + * + * Instead of iterating over individual inputs, we use a set of single shot + * functions which piece together a range of lengths and operate in constant time. + * + * Additionally, the number of multiplies has been significantly reduced. This + * reduces latency, especially when emulating 64-bit multiplies on 32-bit. 
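+ * (An illustrative sketch follows; the short-keys overview continues after
+ *  it.)
+ */
+
+/*
+ * Editor's note: illustrative sketch, not part of upstream xxHash. It spells
+ * out what XXH_mult64to128() and XXH3_mul128_fold64() compute on a case that
+ * is easy to check by hand: 0xFFFFFFFFFFFFFFFF * 2 = 2^65 - 2, so the high
+ * word is 1, the low word is 0xFFFFFFFFFFFFFFFE, and the fold is the XOR of
+ * the two. The name XXH_example_check_mul128 is hypothetical.
+ */
+XXH_FORCE_INLINE int XXH_example_check_mul128(void)
+{
+    XXH128_hash_t const product = XXH_mult64to128(0xFFFFFFFFFFFFFFFFULL, 2);
+    return product.high64 == 1
+        && product.low64 == 0xFFFFFFFFFFFFFFFEULL
+        && XXH3_mul128_fold64(0xFFFFFFFFFFFFFFFFULL, 2)
+               == (product.low64 ^ product.high64);
+}
+
+/*
+ * (Short keys, continued:)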
+ * + * Depending on the platform, this may or may not be faster than XXH32, but it + * is almost guaranteed to be faster than XXH64. + */ + +/* + * At very short lengths, there isn't enough input to fully hide secrets, or use + * the entire secret. + * + * There is also only a limited amount of mixing we can do before significantly + * impacting performance. + * + * Therefore, we use different sections of the secret and always mix two secret + * samples with an XOR. This should have no effect on performance on the + * seedless or withSeed variants because everything _should_ be constant folded + * by modern compilers. + * + * The XOR mixing hides individual parts of the secret and increases entropy. + * + * This adds an extra layer of strength for custom secrets. + */ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combined = { input[0], 0x01, input[0], input[0] } + * len = 2: combined = { input[1], 0x02, input[0], input[1] } + * len = 3: combined = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; + return XXH64_avalanche(keyed); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input1 = XXH_readLE32(input); + xxh_u32 const input2 = XXH_readLE32(input + len - 4); + xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; + xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); + xxh_u64 const keyed = input64 ^ bitflip; + return XXH3_rrmxmx(keyed, len); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; + xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; + xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; + xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; + xxh_u64 const acc = len + + XXH_swap64(input_lo) + input_hi + + XXH3_mul128_fold64(input_lo, input_hi); + return XXH3_avalanche(acc); + } +} + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); + if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); + if (len) return XXH3_len_1to3_64b(input, len, secret, seed); + return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); + } +} + +/* + * DISCLAIMER: There are known *seed-dependent* multicollisions here due to + * multiplication by zero, affecting hashes of lengths 17 to 240. + * + * However, they are very unlikely. 
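+ * (An illustrative usage sketch follows; this disclaimer continues after
+ *  it.)
+ */
+
+/*
+ * Editor's note: illustrative sketch, not part of upstream xxHash. The
+ * short-length kernels above back the public XXH3 entry points declared
+ * earlier in this header; the seeded variant mixes the seed into the
+ * secret-derived bitflips, as seen in XXH3_len_4to8_64b() and friends.
+ * The name XXH_example_xxh3_short is hypothetical.
+ */
+XXH_FORCE_INLINE int XXH_example_xxh3_short(void)
+{
+    const char msg[] = "xxh3";   /* 4 bytes: handled by the 4to8 kernel */
+    XXH64_hash_t const unseeded = XXH3_64bits(msg, sizeof(msg) - 1);
+    XXH64_hash_t const seeded   = XXH3_64bits_withSeed(msg, sizeof(msg) - 1, 2022);
+    /* Both calls use the same kernel, but the results almost surely differ. */
+    return unseeded != seeded;
+}
+
+/*
+ * (Multicollision disclaimer, continued:)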
+ * + * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all + * unseeded non-cryptographic hashes, it does not attempt to defend itself + * against specially crafted inputs, only random inputs. + * + * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes + * cancelling out the secret is taken an arbitrary number of times (addressed + * in XXH3_accumulate_512), this collision is very unlikely with random inputs + * and/or proper seeding: + * + * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a + * function that is only called up to 16 times per hash with up to 240 bytes of + * input. + * + * This is not too bad for a non-cryptographic hash function, especially with + * only 64 bit outputs. + * + * The 128-bit variant (which trades some speed for strength) is NOT affected + * by this, although it is always a good idea to use a proper seed if you care + * about strength. + */ +XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64) +{ +#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__i386__) && defined(__SSE2__) /* x86 + SSE2 */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable like XXH32 hack */ + /* + * UGLY HACK: + * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in + * slower code. + * + * By forcing seed64 into a register, we disrupt the cost model and + * cause it to scalarize. See `XXH32_round()` + * + * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600, + * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on + * GCC 9.2, despite both emitting scalar code. + * + * GCC generates much better scalar code than Clang for the rest of XXH3, + * which is why finding a more optimal codepath is an interest. + */ + XXH_COMPILER_GUARD(seed64); +#endif + { xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 const input_hi = XXH_readLE64(input+8); + return XXH3_mul128_fold64( + input_lo ^ (XXH_readLE64(secret) + seed64), + input_hi ^ (XXH_readLE64(secret+8) - seed64) + ); + } +} + +/* For mid range keys, XXH3 uses a Mum-hash variant. 
*/ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { xxh_u64 acc = len * XXH_PRIME64_1; + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc += XXH3_mix16B(input+48, secret+96, seed); + acc += XXH3_mix16B(input+len-64, secret+112, seed); + } + acc += XXH3_mix16B(input+32, secret+64, seed); + acc += XXH3_mix16B(input+len-48, secret+80, seed); + } + acc += XXH3_mix16B(input+16, secret+32, seed); + acc += XXH3_mix16B(input+len-32, secret+48, seed); + } + acc += XXH3_mix16B(input+0, secret+0, seed); + acc += XXH3_mix16B(input+len-16, secret+16, seed); + + return XXH3_avalanche(acc); + } +} + +#define XXH3_MIDSIZE_MAX 240 + +XXH_NO_INLINE XXH64_hash_t +XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + #define XXH3_MIDSIZE_STARTOFFSET 3 + #define XXH3_MIDSIZE_LASTOFFSET 17 + + { xxh_u64 acc = len * XXH_PRIME64_1; + int const nbRounds = (int)len / 16; + int i; + for (i=0; i<8; i++) { + acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed); + } + acc = XXH3_avalanche(acc); + XXH_ASSERT(nbRounds >= 8); +#if defined(__clang__) /* Clang */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86. + * In everywhere else, it uses scalar code. + * + * For 64->128-bit multiplies, even if the NEON was 100% optimal, it + * would still be slower than UMAAL (see XXH_mult64to128). + * + * Unfortunately, Clang doesn't handle the long multiplies properly and + * converts them to the nonexistent "vmulq_u64" intrinsic, which is then + * scalarized into an ugly mess of VMOV.32 instructions. + * + * This mess is difficult to avoid without turning autovectorization + * off completely, but they are usually relatively minor and/or not + * worth it to fix. + * + * This loop is the easiest to fix, as unlike XXH32, this pragma + * _actually works_ because it is a loop vectorization instead of an + * SLP vectorization. + */ + #pragma clang loop vectorize(disable) +#endif + for (i=8 ; i < nbRounds; i++) { + acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed); + } + /* last bytes */ + acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed); + return XXH3_avalanche(acc); + } +} + + +/* ======= Long Keys ======= */ + +#define XXH_STRIPE_LEN 64 +#define XXH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ +#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64)) + +#ifdef XXH_OLD_NAMES +# define STRIPE_LEN XXH_STRIPE_LEN +# define ACC_NB XXH_ACC_NB +#endif + +XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64) +{ + if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64); + memcpy(dst, &v64, sizeof(v64)); +} + +/* Several intrinsic functions below are supposed to accept __int64 as argument, + * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ . + * However, several environments do not define __int64 type, + * requiring a workaround. 
+ */ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) + typedef int64_t xxh_i64; +#else + /* the following type must have a width of 64-bit */ + typedef long long xxh_i64; +#endif + +/* + * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized. + * + * It is a hardened version of UMAC, based off of FARSH's implementation. + * + * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD + * implementations, and it is ridiculously fast. + * + * We harden it by mixing the original input to the accumulators as well as the product. + * + * This means that in the (relatively likely) case of a multiply by zero, the + * original input is preserved. + * + * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve + * cross-pollination, as otherwise the upper and lower halves would be + * essentially independent. + * + * This doesn't matter on 64-bit hashes since they all get merged together in + * the end, so we skip the extra step. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. + */ + +#if (XXH_VECTOR == XXH_AVX512) \ + || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0) + +#ifndef XXH_TARGET_AVX512 +# define XXH_TARGET_AVX512 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc; + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + + { + /* data_vec = input[0]; */ + __m512i const data_vec = _mm512_loadu_si512 (input); + /* key_vec = secret[0]; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + /* data_key = data_vec ^ key_vec; */ + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + /* xacc[0] += swap(data_vec); */ + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } +} + +/* + * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. + * + * Multiplication isn't perfect, as explained by Google in HighwayHash: + * + * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + * // varying degrees. In descending order of goodness, bytes + * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + * // As expected, the upper and lower bytes are much worse. + * + * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 + * + * Since our algorithm uses a pseudorandom secret to add some variance into the + * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. + * + * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid + * extraction. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. 
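+ *
+ * (Editor's note: every implementation computes the same per-lane recurrence,
+ * spelled out in XXH3_scrambleAcc_scalar() further below:
+ *     acc = (acc ^ (acc >> 47) ^ key) * XXH_PRIME32_1;
+ * where acc is one 64-bit accumulator lane and key the matching 64-bit secret
+ * word: an xorshift, a keyed XOR, then a multiply by a 32-bit prime.)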
+ */ + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + { XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc; + const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); + + /* xacc[0] ^= (xacc[0] >> 47) */ + __m512i const acc_vec = *xacc; + __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); + __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted); + /* xacc[0] ^= secret; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + + /* xacc[0] *= XXH_PRIME32_1; */ + __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1)); + __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); + __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); + *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); + } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64); + XXH_ASSERT(((size_t)customSecret & 63) == 0); + (void)(&XXH_writeLE64); + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); + __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64)); + + XXH_ALIGN(64) const __m512i* const src = (const __m512i*) XXH3_kSecret; + XXH_ALIGN(64) __m512i* const dest = ( __m512i*) customSecret; + int i; + for (i=0; i < nbRounds; ++i) { + /* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void const*', + * this will warn "discards 'const' qualifier". */ + union { + XXH_ALIGN(64) const __m512i* cp; + XXH_ALIGN(64) void* p; + } remote_const_void; + remote_const_void.cp = src + i; + dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_AVX2) \ + || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0) + +#ifndef XXH_TARGET_AVX2 +# define XXH_TARGET_AVX2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xinput = (const __m256i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + /* data_vec = xinput[i]; */ + __m256i const data_vec = _mm256_loadu_si256 (xinput+i); + /* key_vec = xsecret[i]; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xsecret = (const __m256i *) secret; + const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1); + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); + __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); + /* xacc[i] ^= xsecret; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); + __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); + } + } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0); + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64); + (void)(&XXH_writeLE64); + XXH_PREFETCH(customSecret); + { __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64); + + XXH_ALIGN(64) const __m256i* const src = (const __m256i*) XXH3_kSecret; + XXH_ALIGN(64) __m256i* dest = ( __m256i*) customSecret; + +# if defined(__GNUC__) || defined(__clang__) + /* + * On GCC & Clang, marking 'dest' as modified will cause the compiler: + * - do not extract the secret from sse registers in the internal loop + * - use less common registers, and avoid pushing these reg into stack + */ + XXH_COMPILER_GUARD(dest); +# endif + + /* GCC -O2 need unroll loop manually */ + dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed); + dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed); + dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed); + dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed); + dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed); + dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed); 
+ } +} + +#endif + +/* x86dispatch always generates SSE2 */ +#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) + +#ifndef XXH_TARGET_SSE2 +# define XXH_TARGET_SSE2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* SSE2 is just a half-scale version of the AVX2 version. */ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xinput = (const __m128i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xsecret = (const __m128i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } } +} + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ + const __m128i* const xsecret = (const __m128i *) secret; + const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1); + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); + __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); + __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); + } + } +} + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + (void)(&XXH_writeLE64); + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i); + +# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 + // MSVC 32bit mode does not support _mm_set_epi64x before 2015 + XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) }; + __m128i const seed = _mm_load_si128((__m128i const*)seed64x2); +# else + __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64); +# endif + int i; + + XXH_ALIGN(64) const float* const src = (float const*) XXH3_kSecret; + XXH_ALIGN(XXH_SEC_ALIGN) __m128i* dest = (__m128i*) customSecret; +# if defined(__GNUC__) || defined(__clang__) + /* + * On GCC & Clang, marking 'dest' as modified will cause the compiler: + * - do not extract the secret from sse registers in the internal loop + * - use less common registers, and avoid pushing these reg into stack + */ + XXH_COMPILER_GUARD(dest); +# endif + + for (i=0; i < nbRounds; ++i) { + dest[i] = _mm_add_epi64(_mm_castps_si128(_mm_load_ps(src+i*4)), seed); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_NEON) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_neon( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { + XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc; + /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. 
*/ + uint8_t const* const xinput = (const uint8_t *) input; + uint8_t const* const xsecret = (const uint8_t *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) { + /* data_vec = xinput[i]; */ + uint8x16_t data_vec = vld1q_u8(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t data_key; + uint32x2_t data_key_lo, data_key_hi; + /* xacc[i] += swap(data_vec); */ + uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); + uint64x2_t const swapped = vextq_u64(data64, data64, 1); + xacc[i] = vaddq_u64 (xacc[i], swapped); + /* data_key = data_vec ^ key_vec; */ + data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); + /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (data_key >> 32); + * data_key = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); + + } + } +} + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { uint64x2_t* xacc = (uint64x2_t*) acc; + uint8_t const* xsecret = (uint8_t const*) secret; + uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1); + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(uint64x2_t); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + uint64x2_t acc_vec = xacc[i]; + uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47); + uint64x2_t data_vec = veorq_u64 (acc_vec, shifted); + + /* xacc[i] ^= xsecret[i]; */ + uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec)); + + /* xacc[i] *= XXH_PRIME32_1 */ + uint32x2_t data_key_lo, data_key_hi; + /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF); + * data_key_hi = (uint32x2_t) (xacc[i] >> 32); + * xacc[i] = UNDEFINED; */ + XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi); + { /* + * prod_hi = (data_key >> 32) * XXH_PRIME32_1; + * + * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will + * incorrectly "optimize" this: + * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b)); + * shifted = vshll_n_u32(tmp, 32); + * to this: + * tmp = "vmulq_u64"(a, b); // no such thing! + * shifted = vshlq_n_u64(tmp, 32); + * + * However, unlike SSE, Clang lacks a 64-bit multiply routine + * for NEON, and it scalarizes two 64-bit multiplies instead. + * + * vmull_u32 has the same timing as vmul_u32, and it avoids + * this bug completely. 
+ * See https://bugs.llvm.org/show_bug.cgi?id=39967 + */ + uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime); + /* xacc[i] = prod_hi << 32; */ + xacc[i] = vshlq_n_u64(prod_hi, 32); + /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */ + xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime); + } + } } +} + +#endif + +#if (XXH_VECTOR == XXH_VSX) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */ + xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */ + xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* data_vec = xinput[i]; */ + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i); + /* key_vec = xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + /* shuffled = (data_key << 32) | (data_key >> 32); */ + xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ + xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); + xacc[i] += product; + + /* swap high and low halves */ +#ifdef __s390x__ + xacc[i] += vec_permi(data_vec, data_vec, 2); +#else + xacc[i] += vec_xxpermdi(data_vec, data_vec, 2); +#endif + } +} + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_u64x2* const xacc = (xxh_u64x2*) acc; + const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret; + /* constants */ + xxh_u64x2 const v32 = { 32, 32 }; + xxh_u64x2 const v47 = { 47, 47 }; + xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + xxh_u64x2 const acc_vec = xacc[i]; + xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + + /* xacc[i] ^= xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + + /* xacc[i] *= XXH_PRIME32_1 */ + /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ + xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); + /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ + xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } } +} + +#endif + +/* scalar variants - universal */ + +XXH_FORCE_INLINE void +XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + for (i=0; i < XXH_ACC_NB; i++) { + xxh_u64 const data_val = XXH_readLE64(xinput + 8*i); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8); + xacc[i ^ 1] += data_val; /* swap adjacent lanes */ + xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); + } +} + +XXH_FORCE_INLINE 
void +XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0); + for (i=0; i < XXH_ACC_NB; i++) { + xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i); + xxh_u64 acc64 = xacc[i]; + acc64 = XXH_xorshift64(acc64, 47); + acc64 ^= key64; + acc64 *= XXH_PRIME32_1; + xacc[i] = acc64; + } +} + +XXH_FORCE_INLINE void +XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + /* + * We need a separate pointer for the hack below, + * which requires a non-const pointer. + * Any decent compiler will optimize this out otherwise. + */ + const xxh_u8* kSecretPtr = XXH3_kSecret; + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + +#if defined(__clang__) && defined(__aarch64__) + /* + * UGLY HACK: + * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are + * placed sequentially, in order, at the top of the unrolled loop. + * + * While MOVK is great for generating constants (2 cycles for a 64-bit + * constant compared to 4 cycles for LDR), long MOVK chains stall the + * integer pipelines: + * I L S + * MOVK + * MOVK + * MOVK + * MOVK + * ADD + * SUB STR + * STR + * By forcing loads from memory (as the asm line causes Clang to assume + * that XXH3_kSecretPtr has been changed), the pipelines are used more + * efficiently: + * I L S + * LDR + * ADD LDR + * SUB STR + * STR + * XXH3_64bits_withSeed, len == 256, Snapdragon 835 + * without hack: 2654.4 MB/s + * with hack: 3202.9 MB/s + */ + XXH_COMPILER_GUARD(kSecretPtr); +#endif + /* + * Note: in debug mode, this overrides the asm optimization + * and Clang will emit MOVK chains again. + */ + XXH_ASSERT(kSecretPtr == XXH3_kSecret); + + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16; + int i; + for (i=0; i < nbRounds; i++) { + /* + * The asm hack causes Clang to assume that kSecretPtr aliases with + * customSecret, and on aarch64, this prevented LDP from merging two + * loads together for free. Putting the loads together before the stores + * properly generates LDP. 
+ */ + xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; + xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; + XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo); + XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi); + } } +} + + +typedef void (*XXH3_f_accumulate_512)(void* XXH_RESTRICT, const void*, const void*); +typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*); +typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); + + +#if (XXH_VECTOR == XXH_AVX512) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx512 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 + +#elif (XXH_VECTOR == XXH_AVX2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 + +#elif (XXH_VECTOR == XXH_SSE2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_sse2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 + +#elif (XXH_VECTOR == XXH_NEON) + +#define XXH3_accumulate_512 XXH3_accumulate_512_neon +#define XXH3_scrambleAcc XXH3_scrambleAcc_neon +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_VSX) + +#define XXH3_accumulate_512 XXH3_accumulate_512_vsx +#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#else /* scalar */ + +#define XXH3_accumulate_512 XXH3_accumulate_512_scalar +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#endif + + + +#ifndef XXH_PREFETCH_DIST +# ifdef __clang__ +# define XXH_PREFETCH_DIST 320 +# else +# if (XXH_VECTOR == XXH_AVX512) +# define XXH_PREFETCH_DIST 512 +# else +# define XXH_PREFETCH_DIST 384 +# endif +# endif /* __clang__ */ +#endif /* XXH_PREFETCH_DIST */ + +/* + * XXH3_accumulate() + * Loops over XXH3_accumulate_512(). 
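+ * Stripe n reads the 64 input bytes at offset 64*n and the 64 secret bytes at
+ * offset 8*n (XXH_SECRET_CONSUME_RATE), so consecutive stripes use overlapping,
+ * gradually shifting slices of the secret.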
+ * Assumption: nbStripes will not overflow the secret size + */ +XXH_FORCE_INLINE void +XXH3_accumulate( xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, + size_t nbStripes, + XXH3_f_accumulate_512 f_acc512) +{ + size_t n; + for (n = 0; n < nbStripes; n++ ) { + const xxh_u8* const in = input + n*XXH_STRIPE_LEN; + XXH_PREFETCH(in + XXH_PREFETCH_DIST); + f_acc512(acc, + in, + secret + n*XXH_SECRET_CONSUME_RATE); + } +} + +XXH_FORCE_INLINE void +XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble) +{ + size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; + size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock; + size_t const nb_blocks = (len - 1) / block_len; + + size_t n; + + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + + for (n = 0; n < nb_blocks; n++) { + XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512); + f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); + } + + /* last partial block */ + XXH_ASSERT(len > XXH_STRIPE_LEN); + { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; + XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); + XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512); + + /* last stripe */ + { const xxh_u8* const p = input + len - XXH_STRIPE_LEN; +#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */ + f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START); + } } +} + +XXH_FORCE_INLINE xxh_u64 +XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret) +{ + return XXH3_mul128_fold64( + acc[0] ^ XXH_readLE64(secret), + acc[1] ^ XXH_readLE64(secret+8) ); +} + +static XXH64_hash_t +XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start) +{ + xxh_u64 result64 = start; + size_t i = 0; + + for (i = 0; i < 4; i++) { + result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i); +#if defined(__clang__) /* Clang */ \ + && (defined(__arm__) || defined(__thumb__)) /* ARMv7 */ \ + && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \ + && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */ + /* + * UGLY HACK: + * Prevent autovectorization on Clang ARMv7-a. Exact same problem as + * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b. 
+ * XXH3_64bits, len == 256, Snapdragon 835: + * without hack: 2063.7 MB/s + * with hack: 2560.7 MB/s + */ + XXH_COMPILER_GUARD(result64); +#endif + } + + return XXH3_avalanche(result64); +} + +#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \ + XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 } + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len, + const void* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc512, f_scramble); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + /* do not align on 8, so that the secret is different from the accumulator */ +#define XXH_SECRET_MERGEACCS_START 11 + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1); +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; + return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc); +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + * Since the function is not inlined, the compiler may not be able to understand that, + * in some scenarios, its `secret` argument is actually a compile time constant. + * This variant enforces that the compiler can detect that, + * and uses this opportunity to streamline the generated code for better performance. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc); +} + +/* + * XXH3_hashLong_64b_withSeed(): + * Generate a custom key based on alteration of default XXH3_kSecret with the seed, + * and then use this key for long mode hashing. + * + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + * + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len, + XXH64_hash_t seed, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) +{ + if (seed == 0) + return XXH3_hashLong_64b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc512, f_scramble); + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed); + return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret), + f_acc512, f_scramble); + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. 
+ */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed(const void* input, size_t len, + XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + return XXH3_hashLong_64b_withSeed_internal(input, len, seed, + XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret); +} + + +typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t, + XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t); + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong64_f f_hashLong) +{ + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secretLen` condition is not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + * Also, note that function signature doesn't offer room to return an error. + */ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen); +} + + +/* === Public entry point === */ + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len) +{ + return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default); +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +{ + return XXH3_64bits_internal(input, len, 0, secret, secretSize, XXH3_hashLong_64b_withSecret); +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed); +} + + +/* === XXH3 streaming === */ + +/* + * Malloc's a pointer that is always aligned to align. + * + * This must be freed with `XXH_alignedFree()`. + * + * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte + * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2 + * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON. + * + * This underalignment previously caused a rather obvious crash which went + * completely unnoticed due to XXH3_createState() not actually being tested. + * Credit to RedSpah for noticing this bug. + * + * The alignment is done manually: Functions like posix_memalign or _mm_malloc + * are avoided: To maintain portability, we would have to write a fallback + * like this anyways, and besides, testing for the existence of library + * functions without relying on external build tools is impossible. + * + * The method is simple: Overallocate, manually align, and store the offset + * to the original behind the returned pointer. + * + * Align must be a power of 2 and 8 <= align <= 128. 
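+ *
+ * Illustrative example (editor's note, hypothetical addresses): with align == 64
+ * and XXH_malloc() returning a base address ending in 0x28, offset == 64 - 0x28
+ * == 0x18, so the caller receives base + 0x18 (which is 64-byte aligned) and the
+ * byte just before it stores 0x18, which XXH_alignedFree() reads back to recover
+ * the original allocation.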
+ */ +static void* XXH_alignedMalloc(size_t s, size_t align) +{ + XXH_ASSERT(align <= 128 && align >= 8); /* range check */ + XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */ + XXH_ASSERT(s != 0 && s < (s + align)); /* empty/overflow */ + { /* Overallocate to make room for manual realignment and an offset byte */ + xxh_u8* base = (xxh_u8*)XXH_malloc(s + align); + if (base != NULL) { + /* + * Get the offset needed to align this pointer. + * + * Even if the returned pointer is aligned, there will always be + * at least one byte to store the offset to the original pointer. + */ + size_t offset = align - ((size_t)base & (align - 1)); /* base % align */ + /* Add the offset for the now-aligned pointer */ + xxh_u8* ptr = base + offset; + + XXH_ASSERT((size_t)ptr % align == 0); + + /* Store the offset immediately before the returned pointer. */ + ptr[-1] = (xxh_u8)offset; + return ptr; + } + return NULL; + } +} +/* + * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass + * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout. + */ +static void XXH_alignedFree(void* p) +{ + if (p != NULL) { + xxh_u8* ptr = (xxh_u8*)p; + /* Get the offset byte we added in XXH_malloc. */ + xxh_u8 offset = ptr[-1]; + /* Free the original malloc'd pointer */ + xxh_u8* base = ptr - offset; + XXH_free(base); + } +} +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void) +{ + XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64); + if (state==NULL) return NULL; + XXH3_INITSTATE(state); + return state; +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr) +{ + XXH_alignedFree(statePtr); + return XXH_OK; +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API void +XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state) +{ + memcpy(dst_state, src_state, sizeof(*dst_state)); +} + +static void +XXH3_reset_internal(XXH3_state_t* statePtr, + XXH64_hash_t seed, + const void* secret, size_t secretSize) +{ + size_t const initStart = offsetof(XXH3_state_t, bufferedSize); + size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart; + XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart); + XXH_ASSERT(statePtr != NULL); + /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */ + memset((char*)statePtr + initStart, 0, initLength); + statePtr->acc[0] = XXH_PRIME32_3; + statePtr->acc[1] = XXH_PRIME64_1; + statePtr->acc[2] = XXH_PRIME64_2; + statePtr->acc[3] = XXH_PRIME64_3; + statePtr->acc[4] = XXH_PRIME64_4; + statePtr->acc[5] = XXH_PRIME32_2; + statePtr->acc[6] = XXH_PRIME64_5; + statePtr->acc[7] = XXH_PRIME32_1; + statePtr->seed = seed; + statePtr->extSecret = (const unsigned char*)secret; + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + statePtr->secretLimit = secretSize - XXH_STRIPE_LEN; + statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE; +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset(XXH3_state_t* statePtr) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +/*! 
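+ * Editor's sketch (not upstream documentation; the buffers and length variables
+ * are hypothetical, and error codes are ignored for brevity). A typical
+ * streaming sequence with an external secret of at least XXH3_SECRET_SIZE_MIN
+ * bytes:
+ *
+ *     XXH3_state_t* const st = XXH3_createState();
+ *     XXH3_64bits_reset_withSecret(st, secret, secretSize);
+ *     XXH3_64bits_update(st, chunk1, chunk1Len);
+ *     XXH3_64bits_update(st, chunk2, chunk2Len);
+ *     XXH64_hash_t const h = XXH3_64bits_digest(st);
+ *     XXH3_freeState(st);
+ *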
@ingroup xxh3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_reset_internal(statePtr, 0, secret, secretSize); + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) return XXH_ERROR; + if (seed==0) return XXH3_64bits_reset(statePtr); + if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed); + XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +/* Note : when XXH3_consumeStripes() is invoked, + * there must be a guarantee that at least one more byte must be consumed from input + * so that the function can blindly consume all stripes using the "normal" secret segment */ +XXH_FORCE_INLINE void +XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc, + size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock, + const xxh_u8* XXH_RESTRICT input, size_t nbStripes, + const xxh_u8* XXH_RESTRICT secret, size_t secretLimit, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ASSERT(nbStripes <= nbStripesPerBlock); /* can handle max 1 scramble per invocation */ + XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock); + if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) { + /* need a scrambling operation */ + size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr; + size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock; + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock, f_acc512); + f_scramble(acc, secret + secretLimit); + XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512); + *nbStripesSoFarPtr = nbStripesAfterBlock; + } else { + XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512); + *nbStripesSoFarPtr += nbStripes; + } +} + +/* + * Both XXH3_64bits_update and XXH3_128bits_update use this routine. + */ +XXH_FORCE_INLINE XXH_errorcode +XXH3_update(XXH3_state_t* state, + const xxh_u8* input, size_t len, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const xxh_u8* const bEnd = input + len; + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; + + state->totalLen += len; + XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE); + + if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */ + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + /* total input is now > XXH3_INTERNALBUFFER_SIZE */ + + #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) + XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */ + + /* + * Internal buffer is partially filled (always, except at beginning) + * Complete it, then consume it. 
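+ *
+ * Worked example (editor's note, assuming the default 256-byte internal
+ * buffer): with bufferedSize == 100 and len == 300, loadSize == 156 completes
+ * the buffer, which is then consumed as XXH3_INTERNALBUFFER_STRIPES stripes;
+ * the remaining 144 bytes are fewer than a full buffer, so they are simply
+ * copied into state->buffer at the end.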
+ */
+ if (state->bufferedSize) {
+ size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
+ XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
+ input += loadSize;
+ XXH3_consumeStripes(state->acc,
+ &state->nbStripesSoFar, state->nbStripesPerBlock,
+ state->buffer, XXH3_INTERNALBUFFER_STRIPES,
+ secret, state->secretLimit,
+ f_acc512, f_scramble);
+ state->bufferedSize = 0;
+ }
+ XXH_ASSERT(input < bEnd);
+
+ /* Consume input by a multiple of internal buffer size */
+ if (input+XXH3_INTERNALBUFFER_SIZE < bEnd) {
+ const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
+ do {
+ XXH3_consumeStripes(state->acc,
+ &state->nbStripesSoFar, state->nbStripesPerBlock,
+ input, XXH3_INTERNALBUFFER_STRIPES,
+ secret, state->secretLimit,
+ f_acc512, f_scramble);
+ input += XXH3_INTERNALBUFFER_SIZE;
+ } while (input < limit);
+ /* for last partial stripe */
+ memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
+ }
+ XXH_ASSERT(input < bEnd);
+
+ /* Some remaining input (always) : buffer it */
+ XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
+ state->bufferedSize = (XXH32_hash_t)(bEnd-input);
+ }
+
+ return XXH_OK;
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
+{
+ return XXH3_update(state, (const xxh_u8*)input, len,
+ XXH3_accumulate_512, XXH3_scrambleAcc);
+}
+
+
+XXH_FORCE_INLINE void
+XXH3_digest_long (XXH64_hash_t* acc,
+ const XXH3_state_t* state,
+ const unsigned char* secret)
+{
+ /*
+ * Digest on a local copy. This way, the state remains unaltered, and it can
+ * continue ingesting more input afterwards.
+ */
+ memcpy(acc, state->acc, sizeof(state->acc));
+ if (state->bufferedSize >= XXH_STRIPE_LEN) {
+ size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
+ size_t nbStripesSoFar = state->nbStripesSoFar;
+ XXH3_consumeStripes(acc,
+ &nbStripesSoFar, state->nbStripesPerBlock,
+ state->buffer, nbStripes,
+ secret, state->secretLimit,
+ XXH3_accumulate_512, XXH3_scrambleAcc);
+ /* last stripe */
+ XXH3_accumulate_512(acc,
+ state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
+ secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+ } else { /* bufferedSize < XXH_STRIPE_LEN */
+ xxh_u8 lastStripe[XXH_STRIPE_LEN];
+ size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
+ XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */
+ memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
+ memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
+ XXH3_accumulate_512(acc,
+ lastStripe,
+ secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+ }
+}
+
+/*! @ingroup xxh3_family */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
+{
+ const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+ if (state->totalLen > XXH3_MIDSIZE_MAX) {
+ XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+ XXH3_digest_long(acc, state, secret);
+ return XXH3_mergeAccs(acc,
+ secret + XXH_SECRET_MERGEACCS_START,
+ (xxh_u64)state->totalLen * XXH_PRIME64_1);
+ }
+ /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
+ if (state->seed)
+ return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+ return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
+ secret, state->secretLimit + XXH_STRIPE_LEN);
+}
+
+
+#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
+
+/*!
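+ * Editor's sketch (not upstream documentation; `mySeedData`, `data` and the
+ * size variables are hypothetical): derive a full-size secret from arbitrary
+ * seed material, then hash with it:
+ *
+ *     unsigned char secret[XXH_SECRET_DEFAULT_SIZE];
+ *     XXH3_generateSecret(secret, mySeedData, mySeedDataSize);
+ *     XXH64_hash_t  const h64  = XXH3_64bits_withSecret (data, dataSize, secret, sizeof(secret));
+ *     XXH128_hash_t const h128 = XXH3_128bits_withSecret(data, dataSize, secret, sizeof(secret));
+ *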
@ingroup xxh3_family */ +XXH_PUBLIC_API void +XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize) +{ + XXH_ASSERT(secretBuffer != NULL); + if (customSeedSize == 0) { + memcpy(secretBuffer, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); + return; + } + XXH_ASSERT(customSeed != NULL); + + { size_t const segmentSize = sizeof(XXH128_hash_t); + size_t const nbSegments = XXH_SECRET_DEFAULT_SIZE / segmentSize; + XXH128_canonical_t scrambler; + XXH64_hash_t seeds[12]; + size_t segnb; + XXH_ASSERT(nbSegments == 12); + XXH_ASSERT(segmentSize * nbSegments == XXH_SECRET_DEFAULT_SIZE); /* exact multiple */ + XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0)); + + /* + * Copy customSeed to seeds[], truncating or repeating as necessary. + */ + { size_t toFill = XXH_MIN(customSeedSize, sizeof(seeds)); + size_t filled = toFill; + memcpy(seeds, customSeed, toFill); + while (filled < sizeof(seeds)) { + toFill = XXH_MIN(filled, sizeof(seeds) - filled); + memcpy((char*)seeds + filled, seeds, toFill); + filled += toFill; + } } + + /* generate secret */ + memcpy(secretBuffer, &scrambler, sizeof(scrambler)); + for (segnb=1; segnb < nbSegments; segnb++) { + size_t const segmentStart = segnb * segmentSize; + XXH128_canonical_t segment; + XXH128_canonicalFromHash(&segment, + XXH128(&scrambler, sizeof(scrambler), XXH_readLE64(seeds + segnb) + segnb) ); + memcpy((char*)secretBuffer + segmentStart, &segment, sizeof(segment)); + } } +} + + +/* ========================================== + * XXH3 128 bits (a.k.a XXH128) + * ========================================== + * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, + * even without counting the significantly larger output size. + * + * For example, extra steps are taken to avoid the seed-dependent collisions + * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). + * + * This strength naturally comes at the cost of some speed, especially on short + * lengths. Note that longer hashes are about as fast as the 64-bit version + * due to it using only a slight modification of the 64-bit loop. + * + * XXH128 is also more oriented towards 64-bit machines. It is still extremely + * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). + */ + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + /* A doubled version of 1to3_64b with different constants. 
*/ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combinedl = { input[0], 0x01, input[0], input[0] } + * len = 2: combinedl = { input[1], 0x02, input[0], input[1] } + * len = 3: combinedl = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13); + xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed; + xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl; + xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph; + XXH128_hash_t h128; + h128.low64 = XXH64_avalanche(keyed_lo); + h128.high64 = XXH64_avalanche(keyed_hi); + return h128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input_lo = XXH_readLE32(input); + xxh_u32 const input_hi = XXH_readLE32(input + len - 4); + xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32); + xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed; + xxh_u64 const keyed = input_64 ^ bitflip; + + /* Shift len to the left to ensure it is even, this avoids even multiplies. */ + XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2)); + + m128.high64 += (m128.low64 << 1); + m128.low64 ^= (m128.high64 >> 3); + + m128.low64 = XXH_xorshift64(m128.low64, 35); + m128.low64 *= 0x9FB21C651E98DF25ULL; + m128.low64 = XXH_xorshift64(m128.low64, 28); + m128.high64 = XXH3_avalanche(m128.high64); + return m128; + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed; + xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed; + xxh_u64 const input_lo = XXH_readLE64(input); + xxh_u64 input_hi = XXH_readLE64(input + len - 8); + XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1); + /* + * Put len in the middle of m128 to ensure that the length gets mixed to + * both the low and high bits in the 128x64 multiply below. + */ + m128.low64 += (xxh_u64)(len - 1) << 54; + input_hi ^= bitfliph; + /* + * Add the high 32 bits of input_hi to the high 32 bits of m128, then + * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to + * the high 64 bits of m128. + * + * The best approach to this operation is different on 32-bit and 64-bit. + */ + if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */ + /* + * 32-bit optimized version, which is more readable. + * + * On 32-bit, it removes an ADC and delays a dependency between the two + * halves of m128.high64, but it generates an extra mask on 64-bit. + */ + m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2); + } else { + /* + * 64-bit optimized (albeit more confusing) version. 
+ * + * Uses some properties of addition and multiplication to remove the mask: + * + * Let: + * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) + * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) + * c = XXH_PRIME32_2 + * + * a + (b * c) + * Inverse Property: x + y - x == y + * a + (b * (1 + c - 1)) + * Distributive Property: x * (y + z) == (x * y) + (x * z) + * a + (b * 1) + (b * (c - 1)) + * Identity Property: x * 1 == x + * a + b + (b * (c - 1)) + * + * Substitute a, b, and c: + * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) + * + * Since input_hi.hi + input_hi.lo == input_hi, we get this: + * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) + */ + m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1); + } + /* m128 ^= XXH_swap64(m128 >> 64); */ + m128.low64 ^= XXH_swap64(m128.high64); + + { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ + XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2); + h128.high64 += m128.high64 * XXH_PRIME64_2; + + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = XXH3_avalanche(h128.high64); + return h128; + } } +} + +/* + * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN + */ +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); + if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); + if (len) return XXH3_len_1to3_128b(input, len, secret, seed); + { XXH128_hash_t h128; + xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); + xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); + h128.low64 = XXH64_avalanche(seed ^ bitflipl); + h128.high64 = XXH64_avalanche( seed ^ bitfliph); + return h128; + } } +} + +/* + * A bit slower than XXH3_mix16B, but handles multiply by zero better. 
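+ *
+ * (Editor's note: as the code below shows, each accumulator half also absorbs
+ * the raw words of the opposite 16-byte input via the XXH_readLE64() additions,
+ * so even when one XXH3_mix16B() product collapses to zero, the 32-byte block
+ * still influences the result.)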
+ */ +XXH_FORCE_INLINE XXH128_hash_t +XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, + const xxh_u8* secret, XXH64_hash_t seed) +{ + acc.low64 += XXH3_mix16B (input_1, secret+0, seed); + acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); + acc.high64 += XXH3_mix16B (input_2, secret+16, seed); + acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); + return acc; +} + + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { XXH128_hash_t acc; + acc.low64 = len * XXH_PRIME64_1; + acc.high64 = 0; + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed); + } + acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed); + } + acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed); + } + acc = XXH128_mix32B(acc, input, input+len-16, secret, seed); + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + + (acc.high64 * XXH_PRIME64_4) + + ((len - seed) * XXH_PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_NO_INLINE XXH128_hash_t +XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX); + + { XXH128_hash_t acc; + int const nbRounds = (int)len / 32; + int i; + acc.low64 = len * XXH_PRIME64_1; + acc.high64 = 0; + for (i=0; i<4; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + (32 * i), + seed); + } + acc.low64 = XXH3_avalanche(acc.low64); + acc.high64 = XXH3_avalanche(acc.high64); + XXH_ASSERT(nbRounds >= 4); + for (i=4 ; i < nbRounds; i++) { + acc = XXH128_mix32B(acc, + input + (32 * i), + input + (32 * i) + 16, + secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)), + seed); + } + /* last bytes */ + acc = XXH128_mix32B(acc, + input + len - 16, + input + len - 32, + secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16, + 0ULL - seed); + + { XXH128_hash_t h128; + h128.low64 = acc.low64 + acc.high64; + h128.high64 = (acc.low64 * XXH_PRIME64_1) + + (acc.high64 * XXH_PRIME64_4) + + ((len - seed) * XXH_PRIME64_2); + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64); + return h128; + } + } +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble) +{ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC; + + XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramble); + + /* converge into final hash */ + XXH_STATIC_ASSERT(sizeof(acc) == 64); + XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)len * XXH_PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + secretSize + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + 
~((xxh_u64)len * XXH_PRIME64_2)); + return h128; + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_accumulate_512, XXH3_scrambleAcc); +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; + return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen, + XXH3_accumulate_512, XXH3_scrambleAcc); +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + XXH3_f_accumulate_512 f_acc512, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) +{ + if (seed64 == 0) + return XXH3_hashLong_128b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc512, f_scramble); + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed64); + return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret), + f_acc512, f_scramble); + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed(const void* input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + return XXH3_hashLong_128b_withSeed_internal(input, len, seed64, + XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret); +} + +typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t, + XXH64_hash_t, const void* XXH_RESTRICT, size_t); + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_128bits_internal(const void* input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong128_f f_hl128) +{ + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + */ + if (len <= 16) + return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); + if (len <= 128) + return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + return f_hl128(input, len, seed64, secret, secretLen); +} + + +/* === Public XXH128 API === */ + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len) +{ + return XXH3_128bits_internal(input, len, 0, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_default); +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +{ + return XXH3_128bits_internal(input, len, 0, + (const xxh_u8*)secret, secretSize, + XXH3_hashLong_128b_withSecret); +} + +/*! 
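+ * Editorial usage sketch for the XXH128 one-shot API (added by the editor,
+ * not upstream documentation; `buf` and `bufLen` are placeholders):
+ *
+ *     XXH128_hash_t h = XXH3_128bits_withSeed(buf, bufLen, 1234);
+ *     // h.low64 and h.high64 hold the two 64-bit halves of the result.
+ *     // Compare results with XXH128_isEqual()/XXH128_cmp(), and use
+ *     // XXH128_canonicalFromHash() when a portable byte order is needed.
+ *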
@ingroup xxh3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_internal(input, len, seed, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_withSeed); +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH128(const void* input, size_t len, XXH64_hash_t seed) +{ + return XXH3_128bits_withSeed(input, len, seed); +} + + +/* === XXH3 128-bit streaming === */ + +/* + * All the functions are actually the same as for 64-bit streaming variant. + * The only difference is the finalization routine. + */ + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset(XXH3_state_t* statePtr) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize) +{ + if (statePtr == NULL) return XXH_ERROR; + XXH3_reset_internal(statePtr, 0, secret, secretSize); + if (secret == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; + return XXH_OK; +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed) +{ + if (statePtr == NULL) return XXH_ERROR; + if (seed==0) return XXH3_128bits_reset(statePtr); + if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed); + XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE); + return XXH_OK; +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, + XXH3_accumulate_512, XXH3_scrambleAcc); +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state) +{ + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; + XXH3_digest_long(acc, state, secret); + XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START); + { XXH128_hash_t h128; + h128.low64 = XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * XXH_PRIME64_1); + h128.high64 = XXH3_mergeAccs(acc, + secret + state->secretLimit + XXH_STRIPE_LEN + - sizeof(acc) - XXH_SECRET_MERGEACCS_START, + ~((xxh_u64)state->totalLen * XXH_PRIME64_2)); + return h128; + } + } + /* len <= XXH3_MIDSIZE_MAX : short code */ + if (state->seed) + return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen), + secret, state->secretLimit + XXH_STRIPE_LEN); +} + +/* 128-bit utility functions */ + +#include /* memcmp, memcpy */ + +/* return : 1 is equal, 0 if different */ +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2) +{ + /* note : XXH128_hash_t is compact, it has no padding byte */ + return !(memcmp(&h1, &h2, sizeof(h1))); +} + +/* This prototype is compatible with stdlib's qsort(). + * return : >0 if *h128_1 > *h128_2 + * <0 if *h128_1 < *h128_2 + * =0 if *h128_1 == *h128_2 */ +/*! 
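+ * Editorial usage sketch (added by the editor, not upstream documentation):
+ * because the prototype matches qsort()'s comparator, an array of hashes can
+ * be sorted directly (requires <stdlib.h>; `hashes` and `n` are placeholders):
+ *
+ *     qsort(hashes, n, sizeof(XXH128_hash_t), XXH128_cmp);
+ *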
@ingroup xxh3_family */ +XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2) +{ + XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1; + XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2; + int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64); + /* note : bets that, in most cases, hash values are different */ + if (hcmp) return hcmp; + return (h1.low64 > h2.low64) - (h2.low64 > h1.low64); +} + + +/*====== Canonical representation ======*/ +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API void +XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) { + hash.high64 = XXH_swap64(hash.high64); + hash.low64 = XXH_swap64(hash.low64); + } + memcpy(dst, &hash.high64, sizeof(hash.high64)); + memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64)); +} + +/*! @ingroup xxh3_family */ +XXH_PUBLIC_API XXH128_hash_t +XXH128_hashFromCanonical(const XXH128_canonical_t* src) +{ + XXH128_hash_t h; + h.high64 = XXH_readBE64(src); + h.low64 = XXH_readBE64(src->digest + 8); + return h; +} + +/* Pop our optimization override from above */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */ +# pragma GCC pop_options +#endif + +#endif /* XXH_NO_LONG_LONG */ + +#endif /* XXH_NO_XXH3 */ + +/*! + * @} + */ +#endif /* XXH_IMPLEMENTATION */ + + +#if defined (__cplusplus) +} +#endif diff --git a/src/rocksdb/util/xxph3.h b/src/rocksdb/util/xxph3.h new file mode 100644 index 000000000..968000c3a --- /dev/null +++ b/src/rocksdb/util/xxph3.h @@ -0,0 +1,1764 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +/* + xxHash - Extremely Fast Hash algorithm + Header File + Copyright (C) 2012-2016, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + You can contact the author at : + - xxHash source repository : https://github.com/Cyan4973/xxHash +*/ + +// This is a fork of a preview version of xxHash, as RocksDB depends on +// this preview version of XXH3. To allow this to coexist with the +// standard xxHash, including in the "unity" build where all source files +// and headers go into a single translation unit, here "XXH" has been +// replaced with "XXPH" for XX Preview Hash. + +#ifndef XXPHASH_H_5627135585666179 +#define XXPHASH_H_5627135585666179 1 + +/* BEGIN RocksDB customizations */ +#ifndef XXPH_STATIC_LINKING_ONLY +// Access experimental APIs +#define XXPH_STATIC_LINKING_ONLY 1 +#endif +#define XXPH_NAMESPACE ROCKSDB_ +#define XXPH_INLINE_ALL +#include +/* END RocksDB customizations */ + +// clang-format off +#if defined (__cplusplus) +extern "C" { +#endif + + +/* **************************** +* Definitions +******************************/ +#include /* size_t */ +typedef enum { XXPH_OK=0, XXPH_ERROR } XXPH_errorcode; + + +/* **************************** + * API modifier + ******************************/ +/** XXPH_INLINE_ALL (and XXPH_PRIVATE_API) + * This build macro includes xxhash functions in `static` mode + * in order to inline them, and remove their symbol from the public list. + * Inlining offers great performance improvement on small keys, + * and dramatic ones when length is expressed as a compile-time constant. + * See https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html . + * Methodology : + * #define XXPH_INLINE_ALL + * #include "xxhash.h" + * `xxhash.c` is automatically included. + * It's not useful to compile and link it as a separate object. + */ +#if defined(XXPH_INLINE_ALL) || defined(XXPH_PRIVATE_API) +# ifndef XXPH_STATIC_LINKING_ONLY +# define XXPH_STATIC_LINKING_ONLY +# endif +# if defined(__GNUC__) +# define XXPH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXPH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXPH_PUBLIC_API static __inline +# else + /* this version may generate warnings for unused static functions */ +# define XXPH_PUBLIC_API static +# endif +#else +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXPH_IMPORT) || defined(XXPH_EXPORT)) +# ifdef XXPH_EXPORT +# define XXPH_PUBLIC_API __declspec(dllexport) +# elif XXPH_IMPORT +# define XXPH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXPH_PUBLIC_API /* do nothing */ +# endif +#endif /* XXPH_INLINE_ALL || XXPH_PRIVATE_API */ + +/*! XXPH_NAMESPACE, aka Namespace Emulation : + * + * If you want to include _and expose_ xxHash functions from within your own library, + * but also want to avoid symbol collisions with other libraries which may also include xxHash, + * + * you can use XXPH_NAMESPACE, to automatically prefix any public symbol from xxhash library + * with the value of XXPH_NAMESPACE (therefore, avoid NULL and numeric values). + * + * Note that no change is required within the calling program as long as it includes `xxhash.h` : + * regular symbol name will be automatically translated by this header. 
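+ *
+ * Editorial illustration (added by the editor, not upstream text): with the
+ * RocksDB customization `#define XXPH_NAMESPACE ROCKSDB_` above, the
+ * token-pasting macros below turn, for example,
+ *
+ *     XXPH_versionNumber  -->  XXPH_NAME2(ROCKSDB_, XXPH_versionNumber)
+ *                         -->  ROCKSDB_XXPH_versionNumber
+ *
+ * so callers keep writing the unprefixed name while the linker only ever
+ * sees the prefixed symbol.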
+ */ +#ifdef XXPH_NAMESPACE +# define XXPH_CAT(A,B) A##B +# define XXPH_NAME2(A,B) XXPH_CAT(A,B) +# define XXPH_versionNumber XXPH_NAME2(XXPH_NAMESPACE, XXPH_versionNumber) +#endif + + +/* ************************************* +* Version +***************************************/ +#define XXPH_VERSION_MAJOR 0 +#define XXPH_VERSION_MINOR 7 +#define XXPH_VERSION_RELEASE 2 +#define XXPH_VERSION_NUMBER (XXPH_VERSION_MAJOR *100*100 + XXPH_VERSION_MINOR *100 + XXPH_VERSION_RELEASE) +XXPH_PUBLIC_API unsigned XXPH_versionNumber (void); + + +/*-********************************************************************** +* 32-bit hash +************************************************************************/ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include + typedef uint32_t XXPH32_hash_t; +#else +# include +# if UINT_MAX == 0xFFFFFFFFUL + typedef unsigned int XXPH32_hash_t; +# else +# if ULONG_MAX == 0xFFFFFFFFUL + typedef unsigned long XXPH32_hash_t; +# else +# error "unsupported platform : need a 32-bit type" +# endif +# endif +#endif + +#ifndef XXPH_NO_LONG_LONG +/*-********************************************************************** +* 64-bit hash +************************************************************************/ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include + typedef uint64_t XXPH64_hash_t; +#else + /* the following type must have a width of 64-bit */ + typedef unsigned long long XXPH64_hash_t; +#endif + +#endif /* XXPH_NO_LONG_LONG */ + + + +#ifdef XXPH_STATIC_LINKING_ONLY + +/* ================================================================================================ + This section contains declarations which are not guaranteed to remain stable. + They may change in future versions, becoming incompatible with a different version of the library. + These declarations should only be used with static linking. + Never use them in association with dynamic linking ! +=================================================================================================== */ + + +/*-********************************************************************** +* XXPH3 +* New experimental hash +************************************************************************/ +#ifndef XXPH_NO_LONG_LONG + + +/* ============================================ + * XXPH3 is a new hash algorithm, + * featuring improved speed performance for both small and large inputs. + * See full speed analysis at : http://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * In general, expect XXPH3 to run about ~2x faster on large inputs, + * and >3x faster on small ones, though exact differences depend on platform. + * + * The algorithm is portable, will generate the same hash on all platforms. + * It benefits greatly from vectorization units, but does not require it. + * + * XXPH3 offers 2 variants, _64bits and _128bits. + * When only 64 bits are needed, prefer calling the _64bits variant : + * it reduces the amount of mixing, resulting in faster speed on small inputs. + * It's also generally simpler to manipulate a scalar return type than a struct. + * + * The XXPH3 algorithm is still considered experimental. + * Produced results can still change between versions. + * Results produced by v0.7.x are not comparable with results from v0.7.y . 
+ * It's nonetheless possible to use XXPH3 for ephemeral data (local sessions), + * but avoid storing values in long-term storage for later reads. + * + * The API supports one-shot hashing, streaming mode, and custom secrets. + * + * There are still a number of opened questions that community can influence during the experimental period. + * I'm trying to list a few of them below, though don't consider this list as complete. + * + * - 128-bits output type : currently defined as a structure of two 64-bits fields. + * That's because 128-bit values do not exist in C standard. + * Note that it means that, at byte level, result is not identical depending on endianess. + * However, at field level, they are identical on all platforms. + * The canonical representation solves the issue of identical byte-level representation across platforms, + * which is necessary for serialization. + * Q1 : Would there be a better representation for a 128-bit hash result ? + * Q2 : Are the names of the inner 64-bit fields important ? Should they be changed ? + * + * - Prototype XXPH128() : XXPH128() uses the same arguments as XXPH64(), for consistency. + * It means it maps to XXPH3_128bits_withSeed(). + * This variant is slightly slower than XXPH3_128bits(), + * because the seed is now part of the algorithm, and can't be simplified. + * Is that a good idea ? + * + * - Seed type for XXPH128() : currently, it's a single 64-bit value, like the 64-bit variant. + * It could be argued that it's more logical to offer a 128-bit seed input parameter for a 128-bit hash. + * But 128-bit seed is more difficult to use, since it requires to pass a structure instead of a scalar value. + * Such a variant could either replace current one, or become an additional one. + * Farmhash, for example, offers both variants (the 128-bits seed variant is called `doubleSeed`). + * Follow up question : if both 64-bit and 128-bit seeds are allowed, which variant should be called XXPH128 ? + * + * - Result for len==0 : Currently, the result of hashing a zero-length input is always `0`. + * It seems okay as a return value when using "default" secret and seed. + * But is it still fine to return `0` when secret or seed are non-default ? + * Are there use cases which could depend on generating a different hash result for zero-length input when the secret is different ? + * + * - Consistency (1) : Streaming XXPH128 uses an XXPH3 state, which is the same state as XXPH3_64bits(). + * It means a 128bit streaming loop must invoke the following symbols : + * XXPH3_createState(), XXPH3_128bits_reset(), XXPH3_128bits_update() (loop), XXPH3_128bits_digest(), XXPH3_freeState(). + * Is that consistent enough ? + * + * - Consistency (2) : The canonical representation of `XXPH3_64bits` is provided by existing functions + * XXPH64_canonicalFromHash(), and reverse operation XXPH64_hashFromCanonical(). + * As a mirror, canonical functions for XXPH128_hash_t results generated by `XXPH3_128bits` + * are XXPH128_canonicalFromHash() and XXPH128_hashFromCanonical(). + * Which means, `XXPH3` doesn't appear in the names, because canonical functions operate on a type, + * independently of which algorithm was used to generate that type. + * Is that consistent enough ? 
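+ *
+ * Editorial usage sketch (added by the editor, not upstream text), limited to
+ * the three one-shot entry points declared just below; the include path is an
+ * assumption based on this file's location in the tree:
+ *
+ *     #include "util/xxph3.h"
+ *     const char msg[] = "hello";
+ *     XXPH64_hash_t h0 = XXPH3_64bits(msg, sizeof(msg) - 1);
+ *     XXPH64_hash_t h1 = XXPH3_64bits_withSeed(msg, sizeof(msg) - 1, 42);
+ *     // h0 equals XXPH3_64bits_withSeed(msg, sizeof(msg) - 1, 0), per the
+ *     // seed==0 note below; a custom secret needs >= XXPH3_SECRET_SIZE_MIN
+ *     // (136) bytes of good-quality random data.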
+ */ + +#ifdef XXPH_NAMESPACE +# define XXPH3_64bits XXPH_NAME2(XXPH_NAMESPACE, XXPH3_64bits) +# define XXPH3_64bits_withSecret XXPH_NAME2(XXPH_NAMESPACE, XXPH3_64bits_withSecret) +# define XXPH3_64bits_withSeed XXPH_NAME2(XXPH_NAMESPACE, XXPH3_64bits_withSeed) +#endif + +/* XXPH3_64bits() : + * default 64-bit variant, using default secret and default seed of 0. + * It's the fastest variant. */ +XXPH_PUBLIC_API XXPH64_hash_t XXPH3_64bits(const void* data, size_t len); + +/* XXPH3_64bits_withSecret() : + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional collision. + * The secret *must* be large enough (>= XXPH3_SECRET_SIZE_MIN). + * It should consist of random bytes. + * Avoid repeating same character, or sequences of bytes, + * and especially avoid swathes of \0. + * Failure to respect these conditions will result in a poor quality hash. + */ +#define XXPH3_SECRET_SIZE_MIN 136 +XXPH_PUBLIC_API XXPH64_hash_t XXPH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize); + +/* XXPH3_64bits_withSeed() : + * This variant generates on the fly a custom secret, + * based on the default secret, altered using the `seed` value. + * While this operation is decently fast, note that it's not completely free. + * note : seed==0 produces same results as XXPH3_64bits() */ +XXPH_PUBLIC_API XXPH64_hash_t XXPH3_64bits_withSeed(const void* data, size_t len, XXPH64_hash_t seed); + +#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */ +# include +# define XXPH_ALIGN(n) alignas(n) +#elif defined(__GNUC__) +# define XXPH_ALIGN(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +# define XXPH_ALIGN(n) __declspec(align(n)) +#else +# define XXPH_ALIGN(n) /* disabled */ +#endif + +#define XXPH3_SECRET_DEFAULT_SIZE 192 /* minimum XXPH3_SECRET_SIZE_MIN */ + +#endif /* XXPH_NO_LONG_LONG */ + + +/*-********************************************************************** +* XXPH_INLINE_ALL +************************************************************************/ +#if defined(XXPH_INLINE_ALL) || defined(XXPH_PRIVATE_API) + +/* === RocksDB modification: was #include here but permanently inlining === */ + +typedef struct { + XXPH64_hash_t low64; + XXPH64_hash_t high64; +} XXPH128_hash_t; + +/* ************************************* +* Tuning parameters +***************************************/ +/*!XXPH_FORCE_MEMORY_ACCESS : + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. + * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The below switch allow to select different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). + * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method doesn't depend on compiler but violate C standard. + * It can generate buggy code on targets which do not support unaligned memory accesses. + * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) + * See http://stackoverflow.com/a/32095106/646947 for details. 
+ * Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef XXPH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if !defined(__clang__) && defined(__GNUC__) && defined(__ARM_FEATURE_UNALIGNED) && defined(__ARM_ARCH) && (__ARM_ARCH == 6) +# define XXPH_FORCE_MEMORY_ACCESS 2 +# elif !defined(__clang__) && ((defined(__INTEL_COMPILER) && !defined(_WIN32)) || \ + (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7))) +# define XXPH_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/*!XXPH_ACCEPT_NULL_INPUT_POINTER : + * If input pointer is NULL, xxHash default behavior is to dereference it, triggering a segfault. + * When this macro is enabled, xxHash actively checks input for null pointer. + * It it is, result for null input pointers is the same as a null-length input. + */ +#ifndef XXPH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */ +# define XXPH_ACCEPT_NULL_INPUT_POINTER 0 +#endif + +/*!XXPH_FORCE_ALIGN_CHECK : + * This is a minor performance trick, only useful with lots of very small keys. + * It means : check for aligned/unaligned input. + * The check costs one initial branch per hash; + * set it to 0 when the input is guaranteed to be aligned, + * or when alignment doesn't matter for performance. + */ +#ifndef XXPH_FORCE_ALIGN_CHECK /* can be defined externally */ +# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) +# define XXPH_FORCE_ALIGN_CHECK 0 +# else +# define XXPH_FORCE_ALIGN_CHECK 1 +# endif +#endif + +/*!XXPH_REROLL: + * Whether to reroll XXPH32_finalize, and XXPH64_finalize, + * instead of using an unrolled jump table/if statement loop. + * + * This is automatically defined on -Os/-Oz on GCC and Clang. */ +#ifndef XXPH_REROLL +# if defined(__OPTIMIZE_SIZE__) +# define XXPH_REROLL 1 +# else +# define XXPH_REROLL 0 +# endif +#endif + +#include /* ULLONG_MAX */ + +#ifndef XXPH_STATIC_LINKING_ONLY +#define XXPH_STATIC_LINKING_ONLY +#endif + +/* BEGIN RocksDB customizations */ +#include "port/lang.h" /* for FALLTHROUGH_INTENDED, inserted as appropriate */ +/* END RocksDB customizations */ + +/* ************************************* +* Compiler Specific Options +***************************************/ +#ifdef _MSC_VER /* Visual Studio */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# define XXPH_FORCE_INLINE static __forceinline +# define XXPH_NO_INLINE static __declspec(noinline) +#else +# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# ifdef __GNUC__ +# define XXPH_FORCE_INLINE static inline __attribute__((always_inline)) +# define XXPH_NO_INLINE static __attribute__((noinline)) +# else +# define XXPH_FORCE_INLINE static inline +# define XXPH_NO_INLINE static +# endif +# else +# define XXPH_FORCE_INLINE static +# define XXPH_NO_INLINE static +# endif /* __STDC_VERSION__ */ +#endif + + + +/* ************************************* +* Debug +***************************************/ +/* DEBUGLEVEL is expected to be defined externally, + * typically through compiler command line. + * Value must be a number. 
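+ *
+ * Editorial example (added by the editor, not upstream text): like the tuning
+ * macros above, this is normally set with a -D flag, e.g. a hypothetical
+ * invocation such as
+ *
+ *     cc -O2 -DDEBUGLEVEL=1 -DXXPH_FORCE_ALIGN_CHECK=0 -c some_unit.c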
*/ +#ifndef DEBUGLEVEL +# define DEBUGLEVEL 0 +#endif + +#if (DEBUGLEVEL>=1) +# include /* note : can still be disabled with NDEBUG */ +# define XXPH_ASSERT(c) assert(c) +#else +# define XXPH_ASSERT(c) ((void)0) +#endif + +/* note : use after variable declarations */ +#define XXPH_STATIC_ASSERT(c) { enum { XXPH_sa = 1/(int)(!!(c)) }; } + + +/* ************************************* +* Basic Types +***************************************/ +#if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include + typedef uint8_t xxh_u8; +#else + typedef unsigned char xxh_u8; +#endif +typedef XXPH32_hash_t xxh_u32; + + +/* === Memory access === */ + +#if (defined(XXPH_FORCE_MEMORY_ACCESS) && (XXPH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static xxh_u32 XXPH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; } + +#elif (defined(XXPH_FORCE_MEMORY_ACCESS) && (XXPH_FORCE_MEMORY_ACCESS==1)) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { xxh_u32 u32; } __attribute__((packed)) unalign; +static xxh_u32 XXPH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } + +#else + +/* portable and safe solution. Generally efficient. + * see : http://stackoverflow.com/a/32095106/646947 + */ +static xxh_u32 XXPH_read32(const void* memPtr) +{ + xxh_u32 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXPH_FORCE_DIRECT_MEMORY_ACCESS */ + + +/* === Endianess === */ + +/* XXPH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */ +#ifndef XXPH_CPU_LITTLE_ENDIAN +# if defined(_WIN32) /* Windows is always little endian */ \ + || defined(__LITTLE_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +# define XXPH_CPU_LITTLE_ENDIAN 1 +# elif defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXPH_CPU_LITTLE_ENDIAN 0 +# else +static int XXPH_isLittleEndian(void) +{ + const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} +# define XXPH_CPU_LITTLE_ENDIAN XXPH_isLittleEndian() +# endif +#endif + + + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXPH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifndef __has_builtin +# define __has_builtin(x) 0 +#endif + +#if !defined(NO_CLANG_BUILTIN) && __has_builtin(__builtin_rotateleft32) && __has_builtin(__builtin_rotateleft64) +# define XXPH_rotl32 __builtin_rotateleft32 +# define XXPH_rotl64 __builtin_rotateleft64 +/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */ +#elif defined(_MSC_VER) +# define XXPH_rotl32(x,r) _rotl(x,r) +# define XXPH_rotl64(x,r) _rotl64(x,r) +#else +# define XXPH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXPH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) +#endif + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXPH_swap32 _byteswap_ulong +#elif XXPH_GCC_VERSION >= 403 +# define XXPH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXPH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); 
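+    /* Editorial check (added by the editor): this fallback agrees with the
+     * builtin paths above, e.g. XXPH_swap32(0x12345678) == 0x78563412. */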
+} +#endif + + +/* *************************** +* Memory reads +*****************************/ +typedef enum { XXPH_aligned, XXPH_unaligned } XXPH_alignment; + +XXPH_FORCE_INLINE xxh_u32 XXPH_readLE32(const void* ptr) +{ + return XXPH_CPU_LITTLE_ENDIAN ? XXPH_read32(ptr) : XXPH_swap32(XXPH_read32(ptr)); +} + +XXPH_FORCE_INLINE xxh_u32 +XXPH_readLE32_align(const void* ptr, XXPH_alignment align) +{ + if (align==XXPH_unaligned) { + return XXPH_readLE32(ptr); + } else { + return XXPH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXPH_swap32(*(const xxh_u32*)ptr); + } +} + + +/* ************************************* +* Misc +***************************************/ +XXPH_PUBLIC_API unsigned XXPH_versionNumber (void) { return XXPH_VERSION_NUMBER; } + + +static const xxh_u32 PRIME32_1 = 0x9E3779B1U; /* 0b10011110001101110111100110110001 */ +static const xxh_u32 PRIME32_2 = 0x85EBCA77U; /* 0b10000101111010111100101001110111 */ +static const xxh_u32 PRIME32_3 = 0xC2B2AE3DU; /* 0b11000010101100101010111000111101 */ +static const xxh_u32 PRIME32_4 = 0x27D4EB2FU; /* 0b00100111110101001110101100101111 */ +static const xxh_u32 PRIME32_5 = 0x165667B1U; /* 0b00010110010101100110011110110001 */ + +#ifndef XXPH_NO_LONG_LONG + +/* ******************************************************************* +* 64-bit hash functions +*********************************************************************/ + +/*====== Memory access ======*/ + +typedef XXPH64_hash_t xxh_u64; + +#if (defined(XXPH_FORCE_MEMORY_ACCESS) && (XXPH_FORCE_MEMORY_ACCESS==2)) + +/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ +static xxh_u64 XXPH_read64(const void* memPtr) { return *(const xxh_u64*) memPtr; } + +#elif (defined(XXPH_FORCE_MEMORY_ACCESS) && (XXPH_FORCE_MEMORY_ACCESS==1)) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64; +static xxh_u64 XXPH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; } + +#else + +/* portable and safe solution. Generally efficient. + * see : http://stackoverflow.com/a/32095106/646947 + */ + +static xxh_u64 XXPH_read64(const void* memPtr) +{ + xxh_u64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXPH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXPH_swap64 _byteswap_uint64 +#elif XXPH_GCC_VERSION >= 403 +# define XXPH_swap64 __builtin_bswap64 +#else +static xxh_u64 XXPH_swap64 (xxh_u64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + +XXPH_FORCE_INLINE xxh_u64 XXPH_readLE64(const void* ptr) +{ + return XXPH_CPU_LITTLE_ENDIAN ? XXPH_read64(ptr) : XXPH_swap64(XXPH_read64(ptr)); +} + +XXPH_FORCE_INLINE xxh_u64 +XXPH_readLE64_align(const void* ptr, XXPH_alignment align) +{ + if (align==XXPH_unaligned) + return XXPH_readLE64(ptr); + else + return XXPH_CPU_LITTLE_ENDIAN ? 
*(const xxh_u64*)ptr : XXPH_swap64(*(const xxh_u64*)ptr); +} + + +/*====== xxh64 ======*/ + +static const xxh_u64 PRIME64_1 = 0x9E3779B185EBCA87ULL; /* 0b1001111000110111011110011011000110000101111010111100101010000111 */ +static const xxh_u64 PRIME64_2 = 0xC2B2AE3D27D4EB4FULL; /* 0b1100001010110010101011100011110100100111110101001110101101001111 */ +static const xxh_u64 PRIME64_3 = 0x165667B19E3779F9ULL; /* 0b0001011001010110011001111011000110011110001101110111100111111001 */ +static const xxh_u64 PRIME64_4 = 0x85EBCA77C2B2AE63ULL; /* 0b1000010111101011110010100111011111000010101100101010111001100011 */ +static const xxh_u64 PRIME64_5 = 0x27D4EB2F165667C5ULL; /* 0b0010011111010100111010110010111100010110010101100110011111000101 */ + + +/* ********************************************************************* +* XXPH3 +* New generation hash designed for speed on small keys and vectorization +************************************************************************ */ + +/*======== Was #include "xxh3.h", now inlined below ==========*/ + +/* + xxHash - Extremely Fast Hash algorithm + Development source file for `xxh3` + Copyright (C) 2019-present, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - xxHash source repository : https://github.com/Cyan4973/xxHash +*/ + +/* RocksDB Note: This file contains a preview release (xxhash repository + version 0.7.2) of XXPH3 that is unlikely to be compatible with the final + version of XXPH3. We have therefore renamed this XXPH3 ("preview"), for + clarity so that we can continue to use this version even after + integrating a newer incompatible version. 
+*/
+
+/* === Dependencies === */
+
+#undef XXPH_INLINE_ALL   /* in case it's already defined */
+#define XXPH_INLINE_ALL
+
+
+/* === Compiler specifics === */
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
+#  define XXPH_RESTRICT   restrict
+#else
+/* note : it might be useful to define __restrict or __restrict__ for some C++ compilers */
+#  define XXPH_RESTRICT   /* disable */
+#endif
+
+#if defined(__GNUC__)
+#  if defined(__AVX2__)
+#    include <immintrin.h>
+#  elif defined(__SSE2__)
+#    include <emmintrin.h>
+#  elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+#    define inline __inline__  /* clang bug */
+#    include <arm_neon.h>
+#    undef inline
+#  endif
+#elif defined(_MSC_VER)
+#  include <intrin.h>
+#endif
+
+/*
+ * Sanity check.
+ *
+ * XXPH3 only requires these features to be efficient:
+ *
+ *  - Usable unaligned access
+ *  - A 32-bit or 64-bit ALU
+ *  - If 32-bit, a decent ADC instruction
+ *  - A 32 or 64-bit multiply with a 64-bit result
+ *
+ * Almost all 32-bit and 64-bit targets meet this, except for Thumb-1, the
+ * classic 16-bit only subset of ARM's instruction set.
+ *
+ * First of all, Thumb-1 lacks support for the UMULL instruction which
+ * performs the important long multiply. This means numerous __aeabi_lmul
+ * calls.
+ *
+ * Second of all, the 8 functional registers are just not enough.
+ * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
+ * Lo registers, and this shuffling results in thousands more MOVs than A32.
+ *
+ * A32 and T32 don't have this limitation. They can access all 14 registers,
+ * do a 32->64 multiply with UMULL, and the flexible operand is helpful too.
+ *
+ * If compiling Thumb-1 for a target which supports ARM instructions, we
+ * will give a warning.
+ *
+ * Usually, if this happens, it is because of an accident and you probably
+ * need to specify -march, as you probably meant to compile for a newer
+ * architecture.
+ */
+#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
+#  warning "XXPH3 is highly inefficient without ARM or Thumb-2."
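+/* Editorial example (added by the editor, not upstream text): the warning
+ * above is normally resolved by targeting a Thumb-2-capable core, e.g. a
+ * hypothetical invocation such as
+ *     cc -O2 -march=armv7-a -mthumb -c some_unit.c
+ */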
+#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ +#define XXPH_SCALAR 0 +#define XXPH_SSE2 1 +#define XXPH_AVX2 2 +#define XXPH_NEON 3 +#define XXPH_VSX 4 + +#ifndef XXPH_VECTOR /* can be defined on command line */ +# if defined(__AVX2__) +# define XXPH_VECTOR XXPH_AVX2 +# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXPH_VECTOR XXPH_SSE2 +# elif defined(__GNUC__) /* msvc support maybe later */ \ + && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \ + && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) +# define XXPH_VECTOR XXPH_NEON +# elif defined(__PPC64__) && defined(__POWER8_VECTOR__) && defined(__GNUC__) +# define XXPH_VECTOR XXPH_VSX +# else +# define XXPH_VECTOR XXPH_SCALAR +# endif +#endif + +/* control alignment of accumulator, + * for compatibility with fast vector loads */ +#ifndef XXPH_ACC_ALIGN +# if XXPH_VECTOR == 0 /* scalar */ +# define XXPH_ACC_ALIGN 8 +# elif XXPH_VECTOR == 1 /* sse2 */ +# define XXPH_ACC_ALIGN 16 +# elif XXPH_VECTOR == 2 /* avx2 */ +# define XXPH_ACC_ALIGN 32 +# elif XXPH_VECTOR == 3 /* neon */ +# define XXPH_ACC_ALIGN 16 +# elif XXPH_VECTOR == 4 /* vsx */ +# define XXPH_ACC_ALIGN 16 +# endif +#endif + +/* xxh_u64 XXPH_mult32to64(xxh_u32 a, xxh_u64 b) { return (xxh_u64)a * (xxh_u64)b; } */ +#if defined(_MSC_VER) && defined(_M_IX86) +# include +# define XXPH_mult32to64(x, y) __emulu(x, y) +#else +# define XXPH_mult32to64(x, y) ((xxh_u64)((x) & 0xFFFFFFFF) * (xxh_u64)((y) & 0xFFFFFFFF)) +#endif + +/* VSX stuff. It's a lot because VSX support is mediocre across compilers and + * there is a lot of mischief with endianness. */ +#if XXPH_VECTOR == XXPH_VSX +# include +# undef vector +typedef __vector unsigned long long U64x2; +typedef __vector unsigned char U8x16; +typedef __vector unsigned U32x4; + +#ifndef XXPH_VSX_BE +# if defined(__BIG_ENDIAN__) \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define XXPH_VSX_BE 1 +# elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__ +# warning "-maltivec=be is not recommended. Please use native endianness." +# define XXPH_VSX_BE 1 +# else +# define XXPH_VSX_BE 0 +# endif +#endif + +/* We need some helpers for big endian mode. */ +#if XXPH_VSX_BE +/* A wrapper for POWER9's vec_revb. */ +# ifdef __POWER9_VECTOR__ +# define XXPH_vec_revb vec_revb +# else +XXPH_FORCE_INLINE U64x2 XXPH_vec_revb(U64x2 val) +{ + U8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00, + 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 }; + return vec_perm(val, val, vByteSwap); +} +# endif + +/* Power8 Crypto gives us vpermxor which is very handy for + * PPC64EB. + * + * U8x16 vpermxor(U8x16 a, U8x16 b, U8x16 mask) + * { + * U8x16 ret; + * for (int i = 0; i < 16; i++) { + * ret[i] = a[mask[i] & 0xF] ^ b[mask[i] >> 4]; + * } + * return ret; + * } + * + * Because both of the main loops load the key, swap, and xor it with input, + * we can combine the key swap into this instruction. + */ +# ifdef vec_permxor +# define XXPH_vec_permxor vec_permxor +# else +# define XXPH_vec_permxor __builtin_crypto_vpermxor +# endif +#endif /* XXPH_VSX_BE */ +/* + * Because we reinterpret the multiply, there are endian memes: vec_mulo actually becomes + * vec_mule. 
+ * + * Additionally, the intrinsic wasn't added until GCC 8, despite existing for a while. + * Clang has an easy way to control this, we can just use the builtin which doesn't swap. + * GCC needs inline assembly. */ +#if __has_builtin(__builtin_altivec_vmuleuw) +# define XXPH_vec_mulo __builtin_altivec_vmulouw +# define XXPH_vec_mule __builtin_altivec_vmuleuw +#else +/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */ +XXPH_FORCE_INLINE U64x2 XXPH_vec_mulo(U32x4 a, U32x4 b) { + U64x2 result; + __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +XXPH_FORCE_INLINE U64x2 XXPH_vec_mule(U32x4 a, U32x4 b) { + U64x2 result; + __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +#endif /* __has_builtin(__builtin_altivec_vmuleuw) */ +#endif /* XXPH_VECTOR == XXPH_VSX */ + +/* prefetch + * can be disabled, by declaring XXPH_NO_PREFETCH build macro */ +#if defined(XXPH_NO_PREFETCH) +# define XXPH_PREFETCH(ptr) (void)(ptr) /* disabled */ +#else +#if defined(_MSC_VER) && \ + (defined(_M_X64) || \ + defined(_M_IX86)) /* _mm_prefetch() is not defined outside of x86/x64 */ +# include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define XXPH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define XXPH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# else +# define XXPH_PREFETCH(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* XXPH_NO_PREFETCH */ + + +/* ========================================== + * XXPH3 default settings + * ========================================== */ + +#define XXPH_SECRET_DEFAULT_SIZE 192 /* minimum XXPH3_SECRET_SIZE_MIN */ + +#if (XXPH_SECRET_DEFAULT_SIZE < XXPH3_SECRET_SIZE_MIN) +# error "default keyset is not large enough" +#endif + +XXPH_ALIGN(64) static const xxh_u8 kSecret[XXPH_SECRET_DEFAULT_SIZE] = { + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, + 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, + + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, +}; + +/* + * GCC for x86 has a tendency to use SSE in this loop. While it + * successfully avoids swapping (as MUL overwrites EAX and EDX), it + * slows it down because instead of free register swap shifts, it + * must use pshufd and punpckl/hd. + * + * To prevent this, we use this attribute to shut off SSE. 
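+ *
+ * Editorial note (added by the editor): the `target("no-sse")` attribute below
+ * is applied per function, so only XXPH_mult64to128 (and, further down,
+ * XXPH3_mul128_fold64) lose SSE code generation; the rest of the file is
+ * unaffected.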
+ */ +#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__) +__attribute__((__target__("no-sse"))) +#endif +static XXPH128_hash_t +XXPH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) +{ + /* + * GCC/Clang __uint128_t method. + * + * On most 64-bit targets, GCC and Clang define a __uint128_t type. + * This is usually the best way as it usually uses a native long 64-bit + * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. + * + * Usually. + * + * Despite being a 32-bit platform, Clang (and emscripten) define this + * type despite not having the arithmetic for it. This results in a + * laggy compiler builtin call which calculates a full 128-bit multiply. + * In that case it is best to use the portable one. + * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 + */ +#if defined(__GNUC__) && !defined(__wasm__) \ + && defined(__SIZEOF_INT128__) \ + || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + + __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs; + XXPH128_hash_t const r128 = { (xxh_u64)(product), (xxh_u64)(product >> 64) }; + return r128; + + /* + * MSVC for x64's _umul128 method. + * + * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); + * + * This compiles to single operand MUL on x64. + */ +#elif defined(_M_X64) || defined(_M_IA64) + +#ifndef _MSC_VER +# pragma intrinsic(_umul128) +#endif + xxh_u64 product_high; + xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); + XXPH128_hash_t const r128 = { product_low, product_high }; + return r128; + +#else + /* + * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. + * + * This is a fast and simple grade school multiply, which is shown + * below with base 10 arithmetic instead of base 0x100000000. + * + * 9 3 // D2 lhs = 93 + * x 7 5 // D2 rhs = 75 + * ---------- + * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) + * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) + * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) + * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) + * --------- + * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 + * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 + * --------- + * 6 9 7 5 + * + * The reasons for adding the products like this are: + * 1. It avoids manual carry tracking. Just like how + * (9 * 9) + 9 + 9 = 99, the same applies with this for + * UINT64_MAX. This avoids a lot of complexity. + * + * 2. It hints for, and on Clang, compiles to, the powerful UMAAL + * instruction available in ARMv6+ A32/T32, which is shown below: + * + * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) + * { + * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; + * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); + * *RdHi = (xxh_u32)(product >> 32); + * } + * + * This instruction was designed for efficient long multiplication, + * and allows this to be calculated in only 4 instructions which + * is comparable to some 64-bit ALUs. + * + * 3. It isn't terrible on other platforms. Usually this will be + * a couple of 32-bit ADD/ADCs. + */ + + /* First calculate all of the cross products. */ + xxh_u64 const lo_lo = XXPH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); + xxh_u64 const hi_lo = XXPH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); + xxh_u64 const lo_hi = XXPH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); + xxh_u64 const hi_hi = XXPH_mult32to64(lhs >> 32, rhs >> 32); + + /* Now add the products together. These will never overflow. 
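+     *
+     * Editorial justification (added by the editor): with B = 0xFFFFFFFF,
+     * `cross` is at most B + B + B*B = 2^64 - 1, and `upper` is bounded the
+     * same way, so the 64-bit additions below cannot wrap.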
*/ + xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; + xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; + xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); + + XXPH128_hash_t r128 = { lower, upper }; + return r128; +#endif +} + +/* + * We want to keep the attribute here because a target switch + * disables inlining. + * + * Does a 64-bit to 128-bit multiply, then XOR folds it. + * The reason for the separate function is to prevent passing + * too many structs around by value. This will hopefully inline + * the multiply, but we don't force it. + */ +#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__) +__attribute__((__target__("no-sse"))) +#endif +static xxh_u64 +XXPH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) +{ + XXPH128_hash_t product = XXPH_mult64to128(lhs, rhs); + return product.low64 ^ product.high64; +} + + +static XXPH64_hash_t XXPH3_avalanche(xxh_u64 h64) +{ + h64 ^= h64 >> 37; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + + +/* ========================================== + * Short keys + * ========================================== */ + +XXPH_FORCE_INLINE XXPH64_hash_t +XXPH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXPH64_hash_t seed) +{ + XXPH_ASSERT(input != NULL); + XXPH_ASSERT(1 <= len && len <= 3); + XXPH_ASSERT(secret != NULL); + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combined = ((xxh_u32)c1) | (((xxh_u32)c2) << 8) | (((xxh_u32)c3) << 16) | (((xxh_u32)len) << 24); + xxh_u64 const keyed = (xxh_u64)combined ^ (XXPH_readLE32(secret) + seed); + xxh_u64 const mixed = keyed * PRIME64_1; + return XXPH3_avalanche(mixed); + } +} + +XXPH_FORCE_INLINE XXPH64_hash_t +XXPH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXPH64_hash_t seed) +{ + XXPH_ASSERT(input != NULL); + XXPH_ASSERT(secret != NULL); + XXPH_ASSERT(4 <= len && len <= 8); + { xxh_u32 const input_lo = XXPH_readLE32(input); + xxh_u32 const input_hi = XXPH_readLE32(input + len - 4); + xxh_u64 const input_64 = input_lo | ((xxh_u64)input_hi << 32); + xxh_u64 const keyed = input_64 ^ (XXPH_readLE64(secret) + seed); + xxh_u64 const mix64 = len + ((keyed ^ (keyed >> 51)) * PRIME32_1); + return XXPH3_avalanche((mix64 ^ (mix64 >> 47)) * PRIME64_2); + } +} + +XXPH_FORCE_INLINE XXPH64_hash_t +XXPH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXPH64_hash_t seed) +{ + XXPH_ASSERT(input != NULL); + XXPH_ASSERT(secret != NULL); + XXPH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const input_lo = XXPH_readLE64(input) ^ (XXPH_readLE64(secret) + seed); + xxh_u64 const input_hi = XXPH_readLE64(input + len - 8) ^ (XXPH_readLE64(secret + 8) - seed); + xxh_u64 const acc = len + (input_lo + input_hi) + XXPH3_mul128_fold64(input_lo, input_hi); + return XXPH3_avalanche(acc); + } +} + +XXPH_FORCE_INLINE XXPH64_hash_t +XXPH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXPH64_hash_t seed) +{ + XXPH_ASSERT(len <= 16); + { if (len > 8) return XXPH3_len_9to16_64b(input, len, secret, seed); + if (len >= 4) return XXPH3_len_4to8_64b(input, len, secret, seed); + if (len) return XXPH3_len_1to3_64b(input, len, secret, seed); + /* + * RocksDB modification from XXPH3 preview: zero result for empty + * string can be problematic for multiplication-based algorithms. + * Return a hash of the seed instead. 
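+     *
+     * Editorial note (added by the editor): as a consequence, a zero-length
+     * input no longer hashes to the constant 0; the result depends on the
+     * seed and on the first 8 bytes of the secret, so different seeds will
+     * (with overwhelming probability) give different empty-input hashes.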
+ */ + return XXPH3_mul128_fold64(seed + XXPH_readLE64(secret), PRIME64_2); + } +} + + +/* === Long Keys === */ + +#define STRIPE_LEN 64 +#define XXPH_SECRET_CONSUME_RATE 8 /* nb of secret bytes consumed at each accumulation */ +#define ACC_NB (STRIPE_LEN / sizeof(xxh_u64)) + +typedef enum { XXPH3_acc_64bits, XXPH3_acc_128bits } XXPH3_accWidth_e; + +XXPH_FORCE_INLINE void +XXPH3_accumulate_512( void* XXPH_RESTRICT acc, + const void* XXPH_RESTRICT input, + const void* XXPH_RESTRICT secret, + XXPH3_accWidth_e accWidth) +{ +#if (XXPH_VECTOR == XXPH_AVX2) + + XXPH_ASSERT((((size_t)acc) & 31) == 0); + { XXPH_ALIGN(32) __m256i* const xacc = (__m256i *) acc; + const __m256i* const xinput = (const __m256i *) input; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */ + const __m256i* const xsecret = (const __m256i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this type */ + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + __m256i const data_vec = _mm256_loadu_si256 (xinput+i); + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); /* uint32 dk[8] = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */ + __m256i const product = _mm256_mul_epu32 (data_key, _mm256_shuffle_epi32 (data_key, 0x31)); /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */ + if (accWidth == XXPH3_acc_128bits) { + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + xacc[i] = _mm256_add_epi64(product, sum); + } else { /* XXPH3_acc_64bits */ + __m256i const sum = _mm256_add_epi64(xacc[i], data_vec); + xacc[i] = _mm256_add_epi64(product, sum); + } + } } + +#elif (XXPH_VECTOR == XXPH_SSE2) + + XXPH_ASSERT((((size_t)acc) & 15) == 0); + { XXPH_ALIGN(16) __m128i* const xacc = (__m128i *) acc; + const __m128i* const xinput = (const __m128i *) input; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */ + const __m128i* const xsecret = (const __m128i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this type */ + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); /* uint32 dk[8] = {d0+k0, d1+k1, d2+k2, d3+k3, ...} */ + __m128i const product = _mm_mul_epu32 (data_key, _mm_shuffle_epi32 (data_key, 0x31)); /* uint64 mul[4] = {dk0*dk1, dk2*dk3, ...} */ + if (accWidth == XXPH3_acc_128bits) { + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + xacc[i] = _mm_add_epi64(product, sum); + } else { /* XXPH3_acc_64bits */ + __m128i const sum = _mm_add_epi64(xacc[i], data_vec); + xacc[i] = _mm_add_epi64(product, sum); + } + } } + +#elif (XXPH_VECTOR == XXPH_NEON) + + XXPH_ASSERT((((size_t)acc) & 15) == 0); + { + XXPH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc; + /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. 
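+     *
+     * Editorial note (added by the editor): every SIMD path in this function
+     * implements the same per-64-bit-lane update, spelled out by the scalar
+     * variant at the end of the function:
+     *
+     *     dk        = data ^ key;
+     *     acc[i]   += (dk & 0xFFFFFFFF) * (dk >> 32);   // 32x32->64 multiply
+     *     acc[i]   += data;       // 64-bit mode
+     *     acc[i^1] += data;       // 128-bit mode adds the swapped lane instead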
*/ + uint8_t const* const xinput = (const uint8_t *) input; + uint8_t const* const xsecret = (const uint8_t *) secret; + + size_t i; + for (i=0; i < STRIPE_LEN / sizeof(uint64x2_t); i++) { +#if !defined(__aarch64__) && !defined(__arm64__) && defined(__GNUC__) /* ARM32-specific hack */ + /* vzip on ARMv7 Clang generates a lot of vmovs (technically vorrs) without this. + * vzip on 32-bit ARM NEON will overwrite the original register, and I think that Clang + * assumes I don't want to destroy it and tries to make a copy. This slows down the code + * a lot. + * aarch64 not only uses an entirely different syntax, but it requires three + * instructions... + * ext v1.16B, v0.16B, #8 // select high bits because aarch64 can't address them directly + * zip1 v3.2s, v0.2s, v1.2s // first zip + * zip2 v2.2s, v0.2s, v1.2s // second zip + * ...to do what ARM does in one: + * vzip.32 d0, d1 // Interleave high and low bits and overwrite. */ + + /* data_vec = xsecret[i]; */ + uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16)); + /* data_key = data_vec ^ key_vec; */ + uint32x4_t data_key; + + if (accWidth == XXPH3_acc_64bits) { + /* Add first to prevent register swaps */ + /* xacc[i] += data_vec; */ + xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec)); + } else { /* XXPH3_acc_128bits */ + /* xacc[i] += swap(data_vec); */ + /* can probably be optimized better */ + uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); + uint64x2_t const swapped= vextq_u64(data64, data64, 1); + xacc[i] = vaddq_u64 (xacc[i], swapped); + } + + data_key = vreinterpretq_u32_u8(veorq_u8(data_vec, key_vec)); + + /* Here's the magic. We use the quirkiness of vzip to shuffle data_key in place. + * shuffle: data_key[0, 1, 2, 3] = data_key[0, 2, 1, 3] */ + __asm__("vzip.32 %e0, %f0" : "+w" (data_key)); + /* xacc[i] += (uint64x2_t) data_key[0, 1] * (uint64x2_t) data_key[2, 3]; */ + xacc[i] = vmlal_u32(xacc[i], vget_low_u32(data_key), vget_high_u32(data_key)); + +#else + /* On aarch64, vshrn/vmovn seems to be equivalent to, if not faster than, the vzip method. 
*/ + + /* data_vec = xsecret[i]; */ + uint8x16_t const data_vec = vld1q_u8(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint8x16_t const key_vec = vld1q_u8(xsecret + (i * 16)); + /* data_key = data_vec ^ key_vec; */ + uint64x2_t const data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec)); + /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF); */ + uint32x2_t const data_key_lo = vmovn_u64 (data_key); + /* data_key_hi = (uint32x2_t) (data_key >> 32); */ + uint32x2_t const data_key_hi = vshrn_n_u64 (data_key, 32); + if (accWidth == XXPH3_acc_64bits) { + /* xacc[i] += data_vec; */ + xacc[i] = vaddq_u64 (xacc[i], vreinterpretq_u64_u8(data_vec)); + } else { /* XXPH3_acc_128bits */ + /* xacc[i] += swap(data_vec); */ + uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec); + uint64x2_t const swapped= vextq_u64(data64, data64, 1); + xacc[i] = vaddq_u64 (xacc[i], swapped); + } + /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */ + xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi); + +#endif + } + } + +#elif (XXPH_VECTOR == XXPH_VSX) && /* work around a compiler bug */ (__GNUC__ > 5) + U64x2* const xacc = (U64x2*) acc; /* presumed aligned */ + U64x2 const* const xinput = (U64x2 const*) input; /* no alignment restriction */ + U64x2 const* const xsecret = (U64x2 const*) secret; /* no alignment restriction */ + U64x2 const v32 = { 32, 32 }; +#if XXPH_VSX_BE + U8x16 const vXorSwap = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70, + 0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 }; +#endif + size_t i; + for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) { + /* data_vec = xinput[i]; */ + /* key_vec = xsecret[i]; */ +#if XXPH_VSX_BE + /* byteswap */ + U64x2 const data_vec = XXPH_vec_revb(vec_vsx_ld(0, xinput + i)); + U64x2 const key_raw = vec_vsx_ld(0, xsecret + i); + /* See comment above. 
data_key = data_vec ^ swap(xsecret[i]); */ + U64x2 const data_key = (U64x2)XXPH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap); +#else + U64x2 const data_vec = vec_vsx_ld(0, xinput + i); + U64x2 const key_vec = vec_vsx_ld(0, xsecret + i); + U64x2 const data_key = data_vec ^ key_vec; +#endif + /* shuffled = (data_key << 32) | (data_key >> 32); */ + U32x4 const shuffled = (U32x4)vec_rl(data_key, v32); + /* product = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)shuffled & 0xFFFFFFFF); */ + U64x2 const product = XXPH_vec_mulo((U32x4)data_key, shuffled); + xacc[i] += product; + + if (accWidth == XXPH3_acc_64bits) { + xacc[i] += data_vec; + } else { /* XXPH3_acc_128bits */ + /* swap high and low halves */ + U64x2 const data_swapped = vec_xxpermdi(data_vec, data_vec, 2); + xacc[i] += data_swapped; + } + } + +#else /* scalar variant of Accumulator - universal */ + + XXPH_ALIGN(XXPH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */ + const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXPH_ASSERT(((size_t)acc & (XXPH_ACC_ALIGN-1)) == 0); + for (i=0; i < ACC_NB; i++) { + xxh_u64 const data_val = XXPH_readLE64(xinput + 8*i); + xxh_u64 const data_key = data_val ^ XXPH_readLE64(xsecret + i*8); + + if (accWidth == XXPH3_acc_64bits) { + xacc[i] += data_val; + } else { + xacc[i ^ 1] += data_val; /* swap adjacent lanes */ + } + xacc[i] += XXPH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32); + } +#endif +} + +XXPH_FORCE_INLINE void +XXPH3_scrambleAcc(void* XXPH_RESTRICT acc, const void* XXPH_RESTRICT secret) +{ +#if (XXPH_VECTOR == XXPH_AVX2) + + XXPH_ASSERT((((size_t)acc) & 31) == 0); + { XXPH_ALIGN(32) __m256i* const xacc = (__m256i*) acc; + const __m256i* const xsecret = (const __m256i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm256_loadu_si256() requires this argument type */ + const __m256i prime32 = _mm256_set1_epi32((int)PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); + __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); + /* xacc[i] ^= xsecret; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + + /* xacc[i] *= PRIME32_1; */ + __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, 0x31); + __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); + __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); + } + } + +#elif (XXPH_VECTOR == XXPH_SSE2) + + XXPH_ASSERT((((size_t)acc) & 15) == 0); + { XXPH_ALIGN(16) __m128i* const xacc = (__m128i*) acc; + const __m128i* const xsecret = (const __m128i *) secret; /* not really aligned, just for ptr arithmetic, and because _mm_loadu_si128() requires this argument type */ + const __m128i prime32 = _mm_set1_epi32((int)PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); + __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); + /* xacc[i] ^= xsecret; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = 
_mm_xor_si128 (data_vec, key_vec); + + /* xacc[i] *= PRIME32_1; */ + __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, 0x31); + __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); + __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); + } + } + +#elif (XXPH_VECTOR == XXPH_NEON) + + XXPH_ASSERT((((size_t)acc) & 15) == 0); + + { uint64x2_t* const xacc = (uint64x2_t*) acc; + uint8_t const* const xsecret = (uint8_t const*) secret; + uint32x2_t const prime = vdup_n_u32 (PRIME32_1); + + size_t i; + for (i=0; i < STRIPE_LEN/sizeof(uint64x2_t); i++) { + /* data_vec = xacc[i] ^ (xacc[i] >> 47); */ + uint64x2_t const acc_vec = xacc[i]; + uint64x2_t const shifted = vshrq_n_u64 (acc_vec, 47); + uint64x2_t const data_vec = veorq_u64 (acc_vec, shifted); + + /* key_vec = xsecret[i]; */ + uint32x4_t const key_vec = vreinterpretq_u32_u8(vld1q_u8(xsecret + (i * 16))); + /* data_key = data_vec ^ key_vec; */ + uint32x4_t const data_key = veorq_u32 (vreinterpretq_u32_u64(data_vec), key_vec); + /* shuffled = { data_key[0, 2], data_key[1, 3] }; */ + uint32x2x2_t const shuffled = vzip_u32 (vget_low_u32(data_key), vget_high_u32(data_key)); + + /* data_key *= PRIME32_1 */ + + /* prod_hi = (data_key >> 32) * PRIME32_1; */ + uint64x2_t const prod_hi = vmull_u32 (shuffled.val[1], prime); + /* xacc[i] = prod_hi << 32; */ + xacc[i] = vshlq_n_u64(prod_hi, 32); + /* xacc[i] += (prod_hi & 0xFFFFFFFF) * PRIME32_1; */ + xacc[i] = vmlal_u32(xacc[i], shuffled.val[0], prime); + } } + +#elif (XXPH_VECTOR == XXPH_VSX) && /* work around a compiler bug */ (__GNUC__ > 5) + + U64x2* const xacc = (U64x2*) acc; + const U64x2* const xsecret = (const U64x2*) secret; + /* constants */ + U64x2 const v32 = { 32, 32 }; + U64x2 const v47 = { 47, 47 }; + U32x4 const prime = { PRIME32_1, PRIME32_1, PRIME32_1, PRIME32_1 }; + size_t i; +#if XXPH_VSX_BE + /* endian swap */ + U8x16 const vXorSwap = { 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70, + 0x8F, 0x9E, 0xAD, 0xBC, 0xCB, 0xDA, 0xE9, 0xF8 }; +#endif + for (i = 0; i < STRIPE_LEN / sizeof(U64x2); i++) { + U64x2 const acc_vec = xacc[i]; + U64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + /* key_vec = xsecret[i]; */ +#if XXPH_VSX_BE + /* swap bytes words */ + U64x2 const key_raw = vec_vsx_ld(0, xsecret + i); + U64x2 const data_key = (U64x2)XXPH_vec_permxor((U8x16)data_vec, (U8x16)key_raw, vXorSwap); +#else + U64x2 const key_vec = vec_vsx_ld(0, xsecret + i); + U64x2 const data_key = data_vec ^ key_vec; +#endif + + /* data_key *= PRIME32_1 */ + + /* prod_lo = ((U64x2)data_key & 0xFFFFFFFF) * ((U64x2)prime & 0xFFFFFFFF); */ + U64x2 const prod_even = XXPH_vec_mule((U32x4)data_key, prime); + /* prod_hi = ((U64x2)data_key >> 32) * ((U64x2)prime >> 32); */ + U64x2 const prod_odd = XXPH_vec_mulo((U32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } + +#else /* scalar variant of Scrambler - universal */ + + XXPH_ALIGN(XXPH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned on 32-bytes boundaries, little hint for the auto-vectorizer */ + const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */ + size_t i; + XXPH_ASSERT((((size_t)acc) & (XXPH_ACC_ALIGN-1)) == 0); + for (i=0; i < ACC_NB; i++) { + xxh_u64 const key64 = XXPH_readLE64(xsecret + 8*i); + xxh_u64 acc64 = xacc[i]; + acc64 ^= acc64 >> 47; + acc64 ^= key64; + acc64 *= PRIME32_1; + xacc[i] = acc64; + } + +#endif +} + +#define XXPH_PREFETCH_DIST 384 + +/* assumption : nbStripes will not overflow 
secret size */ +XXPH_FORCE_INLINE void +XXPH3_accumulate( xxh_u64* XXPH_RESTRICT acc, + const xxh_u8* XXPH_RESTRICT input, + const xxh_u8* XXPH_RESTRICT secret, + size_t nbStripes, + XXPH3_accWidth_e accWidth) +{ + size_t n; + for (n = 0; n < nbStripes; n++ ) { + const xxh_u8* const in = input + n*STRIPE_LEN; + XXPH_PREFETCH(in + XXPH_PREFETCH_DIST); + XXPH3_accumulate_512(acc, + in, + secret + n*XXPH_SECRET_CONSUME_RATE, + accWidth); + } +} + +/* note : clang auto-vectorizes well in SS2 mode _if_ this function is `static`, + * and doesn't auto-vectorize it at all if it is `FORCE_INLINE`. + * However, it auto-vectorizes better AVX2 if it is `FORCE_INLINE` + * Pretty much every other modes and compilers prefer `FORCE_INLINE`. + */ + +#if defined(__clang__) && (XXPH_VECTOR==0) && !defined(__AVX2__) && !defined(__arm__) && !defined(__thumb__) +static void +#else +XXPH_FORCE_INLINE void +#endif +XXPH3_hashLong_internal_loop( xxh_u64* XXPH_RESTRICT acc, + const xxh_u8* XXPH_RESTRICT input, size_t len, + const xxh_u8* XXPH_RESTRICT secret, size_t secretSize, + XXPH3_accWidth_e accWidth) +{ + size_t const nb_rounds = (secretSize - STRIPE_LEN) / XXPH_SECRET_CONSUME_RATE; + size_t const block_len = STRIPE_LEN * nb_rounds; + size_t const nb_blocks = len / block_len; + + size_t n; + + XXPH_ASSERT(secretSize >= XXPH3_SECRET_SIZE_MIN); + + for (n = 0; n < nb_blocks; n++) { + XXPH3_accumulate(acc, input + n*block_len, secret, nb_rounds, accWidth); + XXPH3_scrambleAcc(acc, secret + secretSize - STRIPE_LEN); + } + + /* last partial block */ + XXPH_ASSERT(len > STRIPE_LEN); + { size_t const nbStripes = (len - (block_len * nb_blocks)) / STRIPE_LEN; + XXPH_ASSERT(nbStripes <= (secretSize / XXPH_SECRET_CONSUME_RATE)); + XXPH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, accWidth); + + /* last stripe */ + if (len & (STRIPE_LEN - 1)) { + const xxh_u8* const p = input + len - STRIPE_LEN; +#define XXPH_SECRET_LASTACC_START 7 /* do not align on 8, so that secret is different from scrambler */ + XXPH3_accumulate_512(acc, p, secret + secretSize - STRIPE_LEN - XXPH_SECRET_LASTACC_START, accWidth); + } } +} + +XXPH_FORCE_INLINE xxh_u64 +XXPH3_mix2Accs(const xxh_u64* XXPH_RESTRICT acc, const xxh_u8* XXPH_RESTRICT secret) +{ + return XXPH3_mul128_fold64( + acc[0] ^ XXPH_readLE64(secret), + acc[1] ^ XXPH_readLE64(secret+8) ); +} + +static XXPH64_hash_t +XXPH3_mergeAccs(const xxh_u64* XXPH_RESTRICT acc, const xxh_u8* XXPH_RESTRICT secret, xxh_u64 start) +{ + xxh_u64 result64 = start; + + result64 += XXPH3_mix2Accs(acc+0, secret + 0); + result64 += XXPH3_mix2Accs(acc+2, secret + 16); + result64 += XXPH3_mix2Accs(acc+4, secret + 32); + result64 += XXPH3_mix2Accs(acc+6, secret + 48); + + return XXPH3_avalanche(result64); +} + +#define XXPH3_INIT_ACC { PRIME32_3, PRIME64_1, PRIME64_2, PRIME64_3, \ + PRIME64_4, PRIME32_2, PRIME64_5, PRIME32_1 }; + +XXPH_FORCE_INLINE XXPH64_hash_t +XXPH3_hashLong_internal(const xxh_u8* XXPH_RESTRICT input, size_t len, + const xxh_u8* XXPH_RESTRICT secret, size_t secretSize) +{ + XXPH_ALIGN(XXPH_ACC_ALIGN) xxh_u64 acc[ACC_NB] = XXPH3_INIT_ACC; + + XXPH3_hashLong_internal_loop(acc, input, len, secret, secretSize, XXPH3_acc_64bits); + + /* converge into final hash */ + XXPH_STATIC_ASSERT(sizeof(acc) == 64); +#define XXPH_SECRET_MERGEACCS_START 11 /* do not align on 8, so that secret is different from accumulator */ + XXPH_ASSERT(secretSize >= sizeof(acc) + XXPH_SECRET_MERGEACCS_START); + return XXPH3_mergeAccs(acc, secret + XXPH_SECRET_MERGEACCS_START, (xxh_u64)len * PRIME64_1); 
+} + + +XXPH_NO_INLINE XXPH64_hash_t /* It's important for performance that XXPH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */ +XXPH3_hashLong_64b_defaultSecret(const xxh_u8* XXPH_RESTRICT input, size_t len) +{ + return XXPH3_hashLong_internal(input, len, kSecret, sizeof(kSecret)); +} + +XXPH_NO_INLINE XXPH64_hash_t /* It's important for performance that XXPH3_hashLong is not inlined. Not sure why (uop cache maybe ?), but difference is large and easily measurable */ +XXPH3_hashLong_64b_withSecret(const xxh_u8* XXPH_RESTRICT input, size_t len, + const xxh_u8* XXPH_RESTRICT secret, size_t secretSize) +{ + return XXPH3_hashLong_internal(input, len, secret, secretSize); +} + + +XXPH_FORCE_INLINE void XXPH_writeLE64(void* dst, xxh_u64 v64) +{ + if (!XXPH_CPU_LITTLE_ENDIAN) v64 = XXPH_swap64(v64); + memcpy(dst, &v64, sizeof(v64)); +} + +/* XXPH3_initCustomSecret() : + * destination `customSecret` is presumed allocated and same size as `kSecret`. + */ +XXPH_FORCE_INLINE void XXPH3_initCustomSecret(xxh_u8* customSecret, xxh_u64 seed64) +{ + int const nbRounds = XXPH_SECRET_DEFAULT_SIZE / 16; + int i; + + XXPH_STATIC_ASSERT((XXPH_SECRET_DEFAULT_SIZE & 15) == 0); + + for (i=0; i < nbRounds; i++) { + XXPH_writeLE64(customSecret + 16*i, XXPH_readLE64(kSecret + 16*i) + seed64); + XXPH_writeLE64(customSecret + 16*i + 8, XXPH_readLE64(kSecret + 16*i + 8) - seed64); + } +} + + +/* XXPH3_hashLong_64b_withSeed() : + * Generate a custom key, + * based on alteration of default kSecret with the seed, + * and then use this key for long mode hashing. + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + */ +XXPH_NO_INLINE XXPH64_hash_t /* It's important for performance that XXPH3_hashLong is not inlined. 
Not sure why (uop cache maybe ?), but difference is large and easily measurable */ +XXPH3_hashLong_64b_withSeed(const xxh_u8* input, size_t len, XXPH64_hash_t seed) +{ + XXPH_ALIGN(8) xxh_u8 secret[XXPH_SECRET_DEFAULT_SIZE]; + if (seed==0) return XXPH3_hashLong_64b_defaultSecret(input, len); + XXPH3_initCustomSecret(secret, seed); + return XXPH3_hashLong_internal(input, len, secret, sizeof(secret)); +} + + +XXPH_FORCE_INLINE xxh_u64 XXPH3_mix16B(const xxh_u8* XXPH_RESTRICT input, + const xxh_u8* XXPH_RESTRICT secret, xxh_u64 seed64) +{ + xxh_u64 const input_lo = XXPH_readLE64(input); + xxh_u64 const input_hi = XXPH_readLE64(input+8); + return XXPH3_mul128_fold64( + input_lo ^ (XXPH_readLE64(secret) + seed64), + input_hi ^ (XXPH_readLE64(secret+8) - seed64) ); +} + + +XXPH_FORCE_INLINE XXPH64_hash_t +XXPH3_len_17to128_64b(const xxh_u8* XXPH_RESTRICT input, size_t len, + const xxh_u8* XXPH_RESTRICT secret, size_t secretSize, + XXPH64_hash_t seed) +{ + XXPH_ASSERT(secretSize >= XXPH3_SECRET_SIZE_MIN); (void)secretSize; + XXPH_ASSERT(16 < len && len <= 128); + + { xxh_u64 acc = len * PRIME64_1; + if (len > 32) { + if (len > 64) { + if (len > 96) { + acc += XXPH3_mix16B(input+48, secret+96, seed); + acc += XXPH3_mix16B(input+len-64, secret+112, seed); + } + acc += XXPH3_mix16B(input+32, secret+64, seed); + acc += XXPH3_mix16B(input+len-48, secret+80, seed); + } + acc += XXPH3_mix16B(input+16, secret+32, seed); + acc += XXPH3_mix16B(input+len-32, secret+48, seed); + } + acc += XXPH3_mix16B(input+0, secret+0, seed); + acc += XXPH3_mix16B(input+len-16, secret+16, seed); + + return XXPH3_avalanche(acc); + } +} + +#define XXPH3_MIDSIZE_MAX 240 + +XXPH_NO_INLINE XXPH64_hash_t +XXPH3_len_129to240_64b(const xxh_u8* XXPH_RESTRICT input, size_t len, + const xxh_u8* XXPH_RESTRICT secret, size_t secretSize, + XXPH64_hash_t seed) +{ + XXPH_ASSERT(secretSize >= XXPH3_SECRET_SIZE_MIN); (void)secretSize; + XXPH_ASSERT(128 < len && len <= XXPH3_MIDSIZE_MAX); + + #define XXPH3_MIDSIZE_STARTOFFSET 3 + #define XXPH3_MIDSIZE_LASTOFFSET 17 + + { xxh_u64 acc = len * PRIME64_1; + int const nbRounds = (int)len / 16; + int i; + for (i=0; i<8; i++) { + acc += XXPH3_mix16B(input+(16*i), secret+(16*i), seed); + } + acc = XXPH3_avalanche(acc); + XXPH_ASSERT(nbRounds >= 8); + for (i=8 ; i < nbRounds; i++) { + acc += XXPH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXPH3_MIDSIZE_STARTOFFSET, seed); + } + /* last bytes */ + acc += XXPH3_mix16B(input + len - 16, secret + XXPH3_SECRET_SIZE_MIN - XXPH3_MIDSIZE_LASTOFFSET, seed); + return XXPH3_avalanche(acc); + } +} + +/* === Public entry point === */ + +XXPH_PUBLIC_API XXPH64_hash_t XXPH3_64bits(const void* input, size_t len) +{ + if (len <= 16) return XXPH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, 0); + if (len <= 128) return XXPH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + if (len <= XXPH3_MIDSIZE_MAX) return XXPH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), 0); + return XXPH3_hashLong_64b_defaultSecret((const xxh_u8*)input, len); +} + +XXPH_PUBLIC_API XXPH64_hash_t +XXPH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize) +{ + XXPH_ASSERT(secretSize >= XXPH3_SECRET_SIZE_MIN); + /* if an action must be taken should `secret` conditions not be respected, + * it should be done here. + * For now, it's a contract pre-condition. 
+ * Adding a check and a branch here would cost performance at every hash */ + if (len <= 16) return XXPH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, 0); + if (len <= 128) return XXPH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + if (len <= XXPH3_MIDSIZE_MAX) return XXPH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, 0); + return XXPH3_hashLong_64b_withSecret((const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize); +} + +XXPH_PUBLIC_API XXPH64_hash_t +XXPH3_64bits_withSeed(const void* input, size_t len, XXPH64_hash_t seed) +{ + if (len <= 16) return XXPH3_len_0to16_64b((const xxh_u8*)input, len, kSecret, seed); + if (len <= 128) return XXPH3_len_17to128_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + if (len <= XXPH3_MIDSIZE_MAX) return XXPH3_len_129to240_64b((const xxh_u8*)input, len, kSecret, sizeof(kSecret), seed); + return XXPH3_hashLong_64b_withSeed((const xxh_u8*)input, len, seed); +} + +/* === XXPH3 streaming === */ + +/* RocksDB Note: unused & removed due to bug in preview version */ + +/*======== END #include "xxh3.h", now inlined above ==========*/ + +#endif /* XXPH_NO_LONG_LONG */ + +/* === END RocksDB modification of permanently inlining === */ + +#endif /* defined(XXPH_INLINE_ALL) || defined(XXPH_PRIVATE_API) */ + +#endif /* XXPH_STATIC_LINKING_ONLY */ + +#if defined (__cplusplus) +} +#endif + +#endif /* XXPHASH_H_5627135585666179 */ -- cgit v1.2.3
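For reference, a minimal caller sketch for the 64-bit entry points added above, assuming the header is included as "util/xxph3.h" and consumed with XXPH_INLINE_ALL (both assumptions; adjust to the actual build). As the comment on XXPH3_hashLong_64b_withSeed notes, deriving a custom secret costs a little time, so seed==0 keeps long inputs on the default-secret fast path.

/* Illustrative caller sketch (not part of the upstream file).
 * Assumes the header path and XXPH_INLINE_ALL consumption model. */
#define XXPH_INLINE_ALL
#include "util/xxph3.h"   /* assumed include path */

#include <stdio.h>
#include <string.h>

int main(void) {
  const char msg[] = "rocksdb xxph3 demo";
  size_t len = strlen(msg);

  /* Default secret, seed 0: dispatches on length
   * (0-16, 17-128, 129-240, then the long-input path). */
  XXPH64_hash_t h_default = XXPH3_64bits(msg, len);

  /* Non-zero seed: short inputs mix the seed directly; inputs longer than
   * XXPH3_MIDSIZE_MAX first derive a custom secret from kSecret. */
  XXPH64_hash_t h_seeded = XXPH3_64bits_withSeed(msg, len, 42);

  printf("%016llx\n%016llx\n",
         (unsigned long long)h_default,
         (unsigned long long)h_seeded);
  return 0;
}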